/*
  tre-python.c - TRE Python language bindings

  This software is released under a BSD-style license.
  See the file LICENSE for details and copyright.

  The original version of this code was contributed by
  Nikolai Saoukh.

*/

#include "Python.h"
#include "structmember.h"

#include <tre/tre.h>

#define TRE_MODULE	"tre"

/* Compiled pattern object: wraps a TRE regex_t. */
typedef struct {
  PyObject_HEAD
  regex_t rgx;		/* only meaningful while `compiled` is non-zero */
  int flags;		/* cflags the pattern was compiled with */
  int nsub;		/* int copy of rgx.re_nsub, exposed as member "nsub" */
  int compiled;		/* non-zero once rgx was compiled successfully */
} TrePatternObject;

/* Approximate matching parameters, exposed field by field to Python. */
typedef struct {
  PyObject_HEAD
  regaparams_t ap;
} TreFuzzynessObject;

/* Result of a successful search. */
typedef struct {
  PyObject_HEAD
  regamatch_t am;
  PyObject *targ;	  /* string we matched against */
  TreFuzzynessObject *fz; /* fuzzyness used during match */
} TreMatchObject;

/* Exception type published as tre.Error; created in inittre(). */
static PyObject *ErrorObject;

/*
  Raise tre.Error with the TRE message text for error code `rc`.

  The value returned by tre_regerror() is deliberately not used as an
  index or length: it is not bounded by sizeof(emsg).  The buffer is
  force-terminated here so that the "s" conversion is safe whatever
  tre_regerror() wrote.
*/
static void
_set_tre_err(int rc, regex_t *rgx)
{
  PyObject *errval;
  char emsg[256];

  emsg[0] = '\0';
  tre_regerror(rc, rgx, emsg, sizeof(emsg));
  emsg[sizeof(emsg) - 1] = '\0';

  errval = Py_BuildValue("s", emsg);
  if (errval == NULL)
    return;		/* Py_BuildValue has already set an exception */
  PyErr_SetObject(ErrorObject, errval);
  Py_DECREF(errval);
}

/*
  tp_new for tre.Fuzzyness.

  Starts from tre_regaparams_default() and then overrides whichever of
  the optional keyword/positional integers the caller supplied.
  Returns a new reference, or NULL with an exception set.
*/
static PyObject *
TreFuzzyness_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
  static char *kwlist[] = {
    "delcost", "inscost", "maxcost", "subcost",
    "maxdel", "maxerr", "maxins", "maxsub",
    NULL
  };

  TreFuzzynessObject *self;

  self = (TreFuzzynessObject*)type->tp_alloc(type, 0);
  if (self == NULL)
    return NULL;
  tre_regaparams_default(&self->ap);
  if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iiiiiiii", kwlist,
				   &self->ap.cost_del, &self->ap.cost_ins,
				   &self->ap.max_cost, &self->ap.cost_subst,
				   &self->ap.max_del, &self->ap.max_err,
				   &self->ap.max_ins, &self->ap.max_subst))
    {
      Py_DECREF(self);
      return NULL;
    }
  return (PyObject*)self;
}

/* tp_repr for tre.Fuzzyness: "<type name>(delcost=..,...,maxsub=..)". */
static PyObject *
TreFuzzyness_repr(PyObject *obj)
{
  TreFuzzynessObject *self = (TreFuzzynessObject*)obj;
  PyObject *o;

  o = PyString_FromFormat("%s(delcost=%d,inscost=%d,maxcost=%d,subcost=%d,"
			  "maxdel=%d,maxerr=%d,maxins=%d,maxsub=%d)",
			  self->ob_type->tp_name, self->ap.cost_del,
			  self->ap.cost_ins, self->ap.max_cost,
			  self->ap.cost_subst, self->ap.max_del,
			  self->ap.max_err, self->ap.max_ins,
			  self->ap.max_subst);
  return o;
}

/* Writable int attributes of tre.Fuzzyness, mapped onto regaparams_t. */
static PyMemberDef TreFuzzyness_members[] = {
  { "delcost", T_INT, offsetof(TreFuzzynessObject, ap.cost_del), 0,
    "The cost of a deleted character" },
  { "inscost", T_INT, offsetof(TreFuzzynessObject, ap.cost_ins), 0,
    "The cost of an inserted character" },
  { "maxcost", T_INT, offsetof(TreFuzzynessObject, ap.max_cost), 0,
    "The maximum allowed cost of a match. If this is set to zero, an exact "
    "match is searched for" },
  { "subcost", T_INT, offsetof(TreFuzzynessObject, ap.cost_subst), 0,
    "The cost of a substituted character" },
  { "maxdel", T_INT, offsetof(TreFuzzynessObject, ap.max_del), 0,
    "Maximum allowed number of deleted characters" },
  { "maxerr", T_INT, offsetof(TreFuzzynessObject, ap.max_err), 0,
    "Maximum allowed number of errors (inserts + deletes + substitutes)" },
  { "maxins", T_INT, offsetof(TreFuzzynessObject, ap.max_ins), 0,
    "Maximum allowed number of inserted characters" },
  { "maxsub", T_INT, offsetof(TreFuzzynessObject, ap.max_subst), 0,
    "Maximum allowed number of substituted characters" },
  { NULL }
};

static PyTypeObject TreFuzzynessType = {
  PyObject_HEAD_INIT(NULL)
  0,			        /* ob_size */
  TRE_MODULE ".Fuzzyness",	/* tp_name */
  sizeof(TreFuzzynessObject),	/* tp_basicsize */
  0,			        /* tp_itemsize */
  /* methods */
  0,				/* tp_dealloc */
  0,				/* tp_print */
  0,				/* tp_getattr */
  0,				/* tp_setattr */
  0,				/* tp_compare */
  TreFuzzyness_repr,		/* tp_repr */
  0,				/* tp_as_number */
  0,				/* tp_as_sequence */
  0,				/* tp_as_mapping */
  0,				/* tp_hash */
  0,				/* tp_call */
  0,				/* tp_str */
  0,				/* tp_getattro */
  0,				/* tp_setattro */
  0,				/* tp_as_buffer */
  Py_TPFLAGS_DEFAULT,		/* tp_flags */
  /* tp_doc */
  TRE_MODULE ".fuzzyness object holds approximation parameters for match",
  0,				/* tp_traverse */
  0,				/* tp_clear */
  0,				/* tp_richcompare */
  0,				/* tp_weaklistoffset */
  0,				/* tp_iter */
  0,				/* tp_iternext */
  0,				/* tp_methods */
  TreFuzzyness_members,		/* tp_members */
  0,				/* tp_getset */
  0,				/* tp_base */
  0,				/* tp_dict */
  0,				/* tp_descr_get */
  0,				/* tp_descr_set */
  0,				/* tp_dictoffset */
  0,				/* tp_init */
  0,				/* tp_alloc */
  TreFuzzyness_new		/* tp_new */
};

/*
  Match.groups(): tuple with one entry per pmatch slot.  Each entry is a
  (start, end) tuple, or None when both offsets of the slot are -1.
  Returns None when the match has no slots at all, NULL on failure.
*/
static PyObject *
PyTreMatch_groups(TreMatchObject *self, PyObject *dummy)
{
  PyObject *result;
  size_t i;

  if (self->am.nmatch < 1)
    {
      Py_INCREF(Py_None);
      return Py_None;
    }
  result = PyTuple_New(self->am.nmatch);
  if (result == NULL)
    return NULL;
  for (i = 0; i < self->am.nmatch; i++)
    {
      PyObject *range;
      regmatch_t *rm = &self->am.pmatch[i];

      if (rm->rm_so == (-1) && rm->rm_eo == (-1))
	{
	  Py_INCREF(Py_None);
	  range = Py_None;
	}
      else
	{
	  /* "i" consumes an int, so make the argument type explicit. */
	  range = Py_BuildValue("(ii)", (int)rm->rm_so, (int)rm->rm_eo);
	  if (range == NULL)
	    {
	      Py_DECREF(result);
	      return NULL;
	    }
	}
      /* PyTuple_SetItem takes over the reference to `range`. */
      PyTuple_SetItem(result, i, range);
    }
  return (PyObject*)result;
}

/*
  sq_item slot and worker for Match.group(): slice of the target string
  covered by group `gn`, or None when both offsets of that group are -1.
  Raises ValueError("out of bounds") for an index outside [0, nmatch).

  The index is Py_ssize_t so the function matches the ssizeargfunc
  signature it is installed under.
*/
static PyObject *
PyTreMatch_groupi(PyObject *obj, Py_ssize_t gn)
{
  TreMatchObject *self = (TreMatchObject*)obj;
  PyObject *result;
  regmatch_t *rm;

  /* ">=" rather than "> nmatch - 1": the latter wraps when nmatch is 0. */
  if (gn < 0 || (size_t)gn >= self->am.nmatch)
    {
      PyErr_SetString(PyExc_ValueError, "out of bounds");
      return NULL;
    }
  rm = &self->am.pmatch[gn];
  if (rm->rm_so == (-1) && rm->rm_eo == (-1))
    {
      Py_INCREF(Py_None);
      return Py_None;
    }
  result = PySequence_GetSlice(self->targ, rm->rm_so, rm->rm_eo);
  return result;
}

/* Match.group(n): converts `grpno` to an integer and defers to groupi. */
static PyObject *
PyTreMatch_group(TreMatchObject *self, PyObject *grpno)
{
  PyObject *result;
  long gn;

  gn = PyInt_AsLong(grpno);
  if (gn == -1 && PyErr_Occurred())
    return NULL;

  result = PyTreMatch_groupi((PyObject*)self, (Py_ssize_t)gn);
  return result;
}

static PyMethodDef TreMatch_methods[] = {
  {"group", (PyCFunction)PyTreMatch_group, METH_O,
   "return submatched string or None if a parenthesized subexpression did "
   "not participate in a match"},
  {"groups", (PyCFunction)PyTreMatch_groups, METH_NOARGS,
   "return the tuple of slice tuples for all parenthesized subexpressions "
   "(None for not participated)"},
  {NULL, NULL}
};

/* Read-only attributes of tre.Match, mapped onto regamatch_t and fz. */
static PyMemberDef TreMatch_members[] = {
  { "cost", T_INT, offsetof(TreMatchObject, am.cost), READONLY,
    "Cost of the match" },
  { "numdel", T_INT, offsetof(TreMatchObject, am.num_del), READONLY,
    "Number of deletes in the match" },
  { "numins", T_INT, offsetof(TreMatchObject, am.num_ins), READONLY,
    "Number of inserts in the match" },
  { "numsub", T_INT, offsetof(TreMatchObject, am.num_subst), READONLY,
    "Number of substitutes in the match" },
  { "fuzzyness", T_OBJECT, offsetof(TreMatchObject, fz), READONLY,
    "Fuzzyness used during match" },
  { NULL }
};

/* tp_dealloc for tre.Match: drops both references and the pmatch array. */
static void
PyTreMatch_dealloc(TreMatchObject *self)
{
  Py_XDECREF(self->targ);
  Py_XDECREF(self->fz);
  if (self->am.pmatch != NULL)
    PyMem_Del(self->am.pmatch);
  PyObject_Del(self);
}

static PySequenceMethods TreMatch_as_sequence_methods = {
  0,			/* sq_length */
  0,			/* sq_concat */
  0,			/* sq_repeat */
  PyTreMatch_groupi,	/* sq_item */
  0,			/* sq_slice */
  0,			/* sq_ass_item */
  0,			/* sq_ass_slice */
  0,			/* sq_contains */
  0,			/* sq_inplace_concat */
  0			/* sq_inplace_repeat */
};

static PyTypeObject TreMatchType = {
  PyObject_HEAD_INIT(NULL)
  0,			        /* ob_size */
  TRE_MODULE ".Match",		/* tp_name */
  sizeof(TreMatchObject),	/* tp_basicsize */
  0,			        /* tp_itemsize */
  /* methods */
  (destructor)PyTreMatch_dealloc, /* tp_dealloc */
  0,			        /* tp_print */
  0,				/* tp_getattr */
  0,				/* tp_setattr */
  0,				/* tp_compare */
  0,				/* tp_repr */
  0,				/* tp_as_number */
  &TreMatch_as_sequence_methods,	/* tp_as_sequence */
  0,				/* tp_as_mapping */
  0,				/* tp_hash */
  0,				/* tp_call */
  0,				/* tp_str */
  0,				/* tp_getattro */
  0,				/* tp_setattro */
  0,				/* tp_as_buffer */
  Py_TPFLAGS_DEFAULT,		/* tp_flags */
  TRE_MODULE ".match object holds result of successful match",	/* tp_doc */
  0,				/* tp_traverse */
  0,				/* tp_clear */
  0,				/* tp_richcompare */
  0,				/* tp_weaklistoffset */
  0,				/* tp_iter */
  0,				/* tp_iternext */
  TreMatch_methods,		/* tp_methods */
  TreMatch_members		/* tp_members */
};

/*
  Allocate a tre.Match with a zeroed regamatch_t and no target/fuzzyness
  attached, so that it can be deallocated safely at any later point.
*/
static TreMatchObject *
newTreMatchObject(void)
{
  TreMatchObject *self;

  self = PyObject_New(TreMatchObject, &TreMatchType);
  if (self == NULL)
    return NULL;
  memset(&self->am, '\0', sizeof(self->am));
  self->targ = NULL;
  self->fz = NULL;
  return self;
}

/*
  Pattern.search(string, fuzzyness[, eflags])

  Runs an approximate match of the compiled pattern against `string`
  (byte string or unicode) using the parameters held by `fuzzyness`.
  Returns a new tre.Match on REG_OK, None on REG_NOMATCH, and NULL with
  an exception set otherwise (tre.Error for other TRE return codes).
*/
static PyObject *
PyTrePattern_search(TrePatternObject *self, PyObject *args)
{
  PyObject *pstring;
  int eflags = 0;
  TreMatchObject *mo;
  TreFuzzynessObject *fz;
  size_t nsub;
  int rc;
  regmatch_t *pm;

  /* Choose the parse format from the type of the first argument. */
  if (PyTuple_Size(args) > 0 && PyUnicode_Check(PyTuple_GetItem(args, 0)))
    {
      if (!PyArg_ParseTuple(args, "UO!|i:search", &pstring,
			    &TreFuzzynessType, &fz, &eflags))
	return NULL;
    }
  else
    {
      if (!PyArg_ParseTuple(args, "SO!|i:search", &pstring,
			    &TreFuzzynessType, &fz, &eflags))
	return NULL;
    }

  mo = newTreMatchObject();
  if (mo == NULL)
    return NULL;

  /* One slot for the whole match plus one per subexpression. */
  nsub = self->rgx.re_nsub + 1;
  pm = PyMem_New(regmatch_t, nsub);
  if (!pm)
    {
      Py_DECREF(mo);
      return PyErr_NoMemory();
    }

  /* From here on the match object owns pm and frees it in dealloc. */
  mo->am.nmatch = nsub;
  mo->am.pmatch = pm;

  if (PyUnicode_Check(pstring))
    {
      Py_ssize_t len = PyUnicode_GetSize(pstring);
      Py_ssize_t copied;
      wchar_t *buf;

      if (len < 0)
	{
	  Py_DECREF(mo);
	  return NULL;
	}
      /* len + 1 elements: calloc() with a zero size may legitimately
	 return NULL, which must not be reported as out-of-memory for an
	 empty string. */
      buf = calloc((size_t)len + 1, sizeof(wchar_t));
      if (!buf)
	{
	  Py_DECREF(mo);
	  return PyErr_NoMemory();
	}
      copied = PyUnicode_AsWideChar((PyUnicodeObject*)pstring, buf, len);
      if (copied < 0)
	{
	  free(buf);
	  Py_DECREF(mo);
	  return NULL;
	}
      rc = tre_regawnexec(&self->rgx, buf, (size_t)copied, &mo->am,
			  fz->ap, eflags);
      free(buf);
    }
  else
    {
      char *targ = PyString_AsString(pstring);
      size_t tlen = PyString_Size(pstring);

      rc = tre_reganexec(&self->rgx, targ, tlen, &mo->am, fz->ap, eflags);
    }
  if (PyErr_Occurred())
    {
      Py_DECREF(mo);
      return NULL;
    }

  if (rc == REG_OK)
    {
      /* Keep the target and the parameters alive with the match. */
      Py_INCREF(pstring);
      mo->targ = pstring;
      Py_INCREF(fz);
      mo->fz = fz;
      return (PyObject*)mo;
    }

  if (rc == REG_NOMATCH)
    {
      Py_DECREF(mo);
      Py_INCREF(Py_None);
      return Py_None;
    }
  _set_tre_err(rc, &self->rgx);
  Py_DECREF(mo);
  return NULL;
}

static PyMethodDef TrePattern_methods[] = {
  { "search", (PyCFunction)PyTrePattern_search, METH_VARARGS,
    "try to search in the given string, returning " TRE_MODULE ".match object "
    "or None on failure" },
  {NULL, NULL}
};

/*
  "nsub" reads the int copy kept in the object rather than rgx.re_nsub
  itself: T_INT must point at storage that really has type int.
*/
static PyMemberDef TrePattern_members[] = {
  { "nsub", T_INT, offsetof(TrePatternObject, nsub), READONLY,
    "Number of parenthesized subexpressions in regex" },
  { NULL }
};

/*
  tp_dealloc for tre.Pattern.  The regex is released only if it was
  compiled successfully; POSIX leaves regfree() on anything else
  undefined, and tre_regfree() is assumed to share that contract.
*/
static void
PyTrePattern_dealloc(TrePatternObject *self)
{
  if (self->compiled)
    tre_regfree(&self->rgx);
  PyObject_Del(self);
}

static PyTypeObject TrePatternType = {
  PyObject_HEAD_INIT(NULL)
  0,			        /* ob_size */
  TRE_MODULE ".Pattern",	/* tp_name */
  sizeof(TrePatternObject),	/* tp_basicsize */
  0,			        /* tp_itemsize */
  /* methods */
  (destructor)PyTrePattern_dealloc, /*tp_dealloc*/
  0,				/* tp_print */
  0,				/* tp_getattr */
  0,				/* tp_setattr */
  0,				/* tp_compare */
  0,				/* tp_repr */
  0,				/* tp_as_number */
  0,				/* tp_as_sequence */
  0,				/* tp_as_mapping */
  0,				/* tp_hash */
  0,				/* tp_call */
  0,				/* tp_str */
  0,				/* tp_getattro */
  0,				/* tp_setattro */
  0,				/* tp_as_buffer */
  Py_TPFLAGS_DEFAULT,		/* tp_flags */
  TRE_MODULE ".pattern object holds compiled tre regex",	/* tp_doc */
  0,				/* tp_traverse */
  0,				/* tp_clear */
  0,				/* tp_richcompare */
  0,				/* tp_weaklistoffset */
  0,				/* tp_iter */
  0,				/* tp_iternext */
  TrePattern_methods,		/* tp_methods */
  TrePattern_members		/* tp_members */
};

/*
  Allocate a tre.Pattern in the "nothing compiled yet" state.  The regex
  storage is zeroed so no field is ever read uninitialised.
*/
static TrePatternObject *
newTrePatternObject(void)
{
  TrePatternObject *self;

  self = PyObject_New(TrePatternObject, &TrePatternType);
  if (self == NULL)
    return NULL;
  memset(&self->rgx, '\0', sizeof(self->rgx));
  self->flags = 0;
  self->nsub = 0;
  self->compiled = 0;
  return self;
}

/*
  tre.compile(pattern[, cflags])

  Compiles a byte-string or unicode pattern and returns a new
  tre.Pattern.  Returns NULL with an exception set on failure; TRE
  compile errors are reported as tre.Error.
*/
static PyObject *
PyTre_ncompile(PyObject *self, PyObject *args)
{
  TrePatternObject *rv;
  PyObject *upattern = NULL;	/* the "U" format stores a PyObject* */
  char *pattern = NULL;
  int pattlen;
  int cflags = 0;
  int rc;

  /* Choose the parse format from the type of the first argument. */
  if (PyTuple_Size(args) > 0 && PyUnicode_Check(PyTuple_GetItem(args, 0)))
    {
      if (!PyArg_ParseTuple(args, "U|i:compile", &upattern, &cflags))
	return NULL;
    }
  else
    {
      if (!PyArg_ParseTuple(args, "s#|i:compile", &pattern, &pattlen, &cflags))
	return NULL;
    }

  rv = newTrePatternObject();
  if (rv == NULL)
    return NULL;

  if (upattern != NULL)
    {
      Py_ssize_t len = PyUnicode_GetSize(upattern);
      Py_ssize_t copied;
      wchar_t *buf;

      if (len < 0)
	{
	  Py_DECREF(rv);
	  return NULL;
	}
      /* len + 1 elements so that an empty pattern never asks calloc()
	 for zero bytes (see the same note in the search method). */
      buf = calloc((size_t)len + 1, sizeof(wchar_t));
      if (!buf)
	{
	  Py_DECREF(rv);
	  return PyErr_NoMemory();
	}
      copied = PyUnicode_AsWideChar((PyUnicodeObject*)upattern, buf, len);
      if (copied < 0)
	{
	  free(buf);
	  Py_DECREF(rv);
	  return NULL;
	}
      rc = tre_regwncomp(&rv->rgx, buf, (size_t)copied, cflags);
      free(buf);
    }
  else
    rc = tre_regncomp(&rv->rgx, (char*)pattern, pattlen, cflags);

  if (rc != REG_OK)
    {
      if (!PyErr_Occurred())
	_set_tre_err(rc, &rv->rgx);
      /* rv->compiled is still 0, so dealloc will not touch rv->rgx. */
      Py_DECREF(rv);
      return NULL;
    }
  rv->compiled = 1;
  rv->flags = cflags;
  rv->nsub = (int)rv->rgx.re_nsub;
  return (PyObject*)rv;
}

static PyMethodDef tre_methods[] = {
  { "compile", PyTre_ncompile, METH_VARARGS,
    "Compile a regular expression pattern, returning a "
    TRE_MODULE ".pattern object" },
  { NULL, NULL }
};

static char *tre_doc =
"Python module for TRE library\n\nModule exports "
"the only function: compile";

/* Integer constants published as module attributes by inittre(). */
static struct _tre_flags {
  char *name;
  int val;
} tre_flags[] = {
  { "EXTENDED", REG_EXTENDED },
  { "ICASE", REG_ICASE },
  { "NEWLINE", REG_NEWLINE },
  { "NOSUB", REG_NOSUB },
  { "LITERAL", REG_LITERAL },

  { "NOTBOL", REG_NOTBOL },
  { "NOTEOL", REG_NOTEOL },
  { NULL, 0 }
};

/*
  Module initialisation: readies the three types, creates the module,
  and publishes Fuzzyness, Match, Pattern, Error and the flag constants.
  Returns early, leaving the pending exception in place, on any failure.
*/
PyMODINIT_FUNC
inittre(void)
{
  PyObject *m;
  struct _tre_flags *fp;

  if (PyType_Ready(&TreFuzzynessType) < 0)
    return;
  if (PyType_Ready(&TreMatchType) < 0)
    return;
  if (PyType_Ready(&TrePatternType) < 0)
    return;

  /* Create the module and add the functions */
  m = Py_InitModule3(TRE_MODULE, tre_methods, tre_doc);
  if (m == NULL)
    return;

  Py_INCREF(&TreFuzzynessType);
  if (PyModule_AddObject(m, "Fuzzyness", (PyObject*)&TreFuzzynessType) < 0)
    return;
  Py_INCREF(&TreMatchType);
  if (PyModule_AddObject(m, "Match", (PyObject*)&TreMatchType) < 0)
    return;
  Py_INCREF(&TrePatternType);
  if (PyModule_AddObject(m, "Pattern", (PyObject*)&TrePatternType) < 0)
    return;
  ErrorObject = PyErr_NewException(TRE_MODULE ".Error", NULL, NULL);
  if (ErrorObject == NULL)
    return;
  /* One reference stays with the static, one is handed to the module. */
  Py_INCREF(ErrorObject);
  if (PyModule_AddObject(m, "Error", ErrorObject) < 0)
    return;

  /* Insert the flags */
  for (fp = tre_flags; fp->name != NULL; fp++)
    if (PyModule_AddIntConstant(m, fp->name, fp->val) < 0)
      return;
}