From 94ca4e578de5fcdbf98d67b6310cecafe32177c0 Mon Sep 17 00:00:00 2001 From: David Reiss Date: Tue, 16 Mar 2010 11:48:56 -0700 Subject: [PATCH] Initial commit of pyre2 --- .gitignore | 2 + LICENSE | 25 ++ README | 1 + README.rst | 51 ++++ _re2.cc | 732 +++++++++++++++++++++++++++++++++++++++++++++++++++++ re2.py | 9 + setup.py | 15 ++ 7 files changed, 835 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 120000 README create mode 100644 README.rst create mode 100644 _re2.cc create mode 100644 re2.py create mode 100755 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..2247d5f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/build +/dist diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..bde830be --- /dev/null +++ b/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2010, David Reiss and Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Facebook nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README b/README new file mode 120000 index 00000000..92cacd28 --- /dev/null +++ b/README @@ -0,0 +1 @@ +README.rst \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 00000000..5ed9d0c4 --- /dev/null +++ b/README.rst @@ -0,0 +1,51 @@ +===== +pyre2 +===== + +.. contents:: + +Summary +======= + +pyre2 is a Python extension that wraps +`Google's RE2 regular expression library +<http://code.google.com/p/re2/>`_. +It implements many of the features of Python's built-in +``re`` module with compatible interfaces. + + +New Features +============ + +* ``Regexp`` objects have a ``fullmatch`` method that works like ``match``, + but anchors the match at both the start and the end. +* ``Regexp`` objects have + ``test_search``, ``test_match``, and ``test_fullmatch`` + methods that work like ``search``, ``match``, and ``fullmatch``, + but only return ``True`` or ``False`` to indicate + whether the match was successful. + These methods should be faster than the full versions, + especially for patterns with capturing groups. + + +Missing Features +================ + +* No substitution methods. +* No flags. +* No ``split``, ``findall``, or ``finditer``. +* No top-level convenience functions like ``search`` and ``match``. + (Just use compile.) +* No compile cache. + (If you care enough about performance to use RE2, + you probably care enough to cache your own patterns.) +* No ``lastindex`` or ``lastgroup`` on ``Match`` objects. 
+ + +Current Status +============== + +pyre2 has only received basic testing, +and I am by no means a Python extension expert, +so it is quite possible that it contains bugs. +I'd guess the most likely are reference leaks in error cases. diff --git a/_re2.cc b/_re2.cc new file mode 100644 index 00000000..15799b1e --- /dev/null +++ b/_re2.cc @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2010, David Reiss and Facebook, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Facebook nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#define PY_SSIZE_T_CLEAN +#include + +#include + +#include +#include +using std::nothrow; + +#include "re2/re2.h" +using re2::RE2; +using re2::StringPiece; + + +typedef struct { + PyObject_HEAD + // __dict__. Simpler than implementing getattr and possibly faster. + PyObject* attr_dict; + RE2* re2_obj; +} RegexpObject2; + +typedef struct { + PyObject_HEAD + // __dict__. Simpler than implementing getattr and possibly faster. + PyObject* attr_dict; + // Cache of __dict__["re"] and __dict__["string", which are used for group() + // calls. These fields do *not* own their own references. They piggyback on + // the references in attr_dict. + PyObject* re; + PyObject* string; + // There are several possible approaches to storing the matched groups: + // 1. Fully materialize the groups tuple at match time. + // 2. Cache allocate PyString objects when groups are requested. + // 3. Always allocate new PyStrings on demand. + // I've chosen to go with #3. It's the simplest, and I'm pretty sure it's + // optimal in all cases where no group is fetched more than once. + StringPiece* groups; +} MatchObject2; + + +// Imported from re2. +static PyObject* error_class; + + +// Forward declarations of methods, creators, and destructors. 
+static void regexp_dealloc(RegexpObject2* self); +static PyObject* create_regexp(PyObject* pattern); +static PyObject* regexp_search(RegexpObject2* self, PyObject* args, PyObject* kwds); +static PyObject* regexp_match(RegexpObject2* self, PyObject* args, PyObject* kwds); +static PyObject* regexp_fullmatch(RegexpObject2* self, PyObject* args, PyObject* kwds); +static PyObject* regexp_test_search(RegexpObject2* self, PyObject* args, PyObject* kwds); +static PyObject* regexp_test_match(RegexpObject2* self, PyObject* args, PyObject* kwds); +static PyObject* regexp_test_fullmatch(RegexpObject2* self, PyObject* args, PyObject* kwds); +static void match_dealloc(MatchObject2* self); +static PyObject* create_match(PyObject* re, PyObject* string, long pos, long endpos, StringPiece* groups); +static PyObject* match_group(MatchObject2* self, PyObject* args); +static PyObject* match_groups(MatchObject2* self, PyObject* args, PyObject* kwds); +static PyObject* match_groupdict(MatchObject2* self, PyObject* args, PyObject* kwds); +static PyObject* match_start(MatchObject2* self, PyObject* args); +static PyObject* match_end(MatchObject2* self, PyObject* args); +static PyObject* match_span(MatchObject2* self, PyObject* args); + + +static PyMethodDef regexp_methods[] = { + {"search", (PyCFunction)regexp_search, METH_VARARGS | METH_KEYWORDS, + "search(string[, pos[, endpos]]) --> match object or None.\n" + " Scan through string looking for a match, and return a corresponding\n" + " MatchObject instance. Return None if no position in the string matches." 
+ }, + {"match", (PyCFunction)regexp_match, METH_VARARGS | METH_KEYWORDS, + "match(string[, pos[, endpos]]) --> match object or None.\n" + " Matches zero or more characters at the beginning of the string" + }, + {"fullmatch", (PyCFunction)regexp_fullmatch, METH_VARARGS | METH_KEYWORDS, + "fullmatch(string[, pos[, endpos]]) --> match object or None.\n" + " Matches the entire string" + }, + {"test_search", (PyCFunction)regexp_test_search, METH_VARARGS | METH_KEYWORDS, + "test_search(string[, pos[, endpos]]) --> bool.\n" + " Like 'search', but only returns whether a match was found." + }, + {"test_match", (PyCFunction)regexp_test_match, METH_VARARGS | METH_KEYWORDS, + "test_match(string[, pos[, endpos]]) --> match object or None.\n" + " Like 'match', but only returns whether a match was found." + }, + {"test_fullmatch", (PyCFunction)regexp_test_fullmatch, METH_VARARGS | METH_KEYWORDS, + "test_fullmatch(string[, pos[, endpos]]) --> match object or None.\n" + " Like 'fullmatch', but only returns whether a match was found." + }, + {NULL} /* Sentinel */ +}; + +static PyMethodDef match_methods[] = { + {"group", (PyCFunction)match_group, METH_VARARGS, + NULL + }, + {"groups", (PyCFunction)match_groups, METH_VARARGS | METH_KEYWORDS, + NULL + }, + {"groupdict", (PyCFunction)match_groupdict, METH_VARARGS | METH_KEYWORDS, + NULL + }, + {"start", (PyCFunction)match_start, METH_VARARGS, + NULL + }, + {"end", (PyCFunction)match_end, METH_VARARGS, + NULL + }, + {"span", (PyCFunction)match_span, METH_VARARGS, + NULL + }, + {NULL} /* Sentinel */ +}; + + +// Simple method to block setattr. 
+static int +_no_setattr(PyObject* obj, PyObject* name, PyObject* v) { + (void)name; + (void)v; + PyErr_Format(PyExc_AttributeError, + "'%s' object attributes are read-only", + obj->ob_type->tp_name); + return -1; +} + + +static PyTypeObject Regexp_Type2 = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "_re2.RE2_Regexp", /*tp_name*/ + sizeof(RegexpObject2), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)regexp_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + _no_setattr, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + "RE2 regexp objects", /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + regexp_methods, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + offsetof(RegexpObject2, attr_dict), /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + 0, /*tp_new*/ +}; + +static PyTypeObject Match_Type2 = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size*/ + "_re2.RE2_Match", /*tp_name*/ + sizeof(MatchObject2), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)match_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + _no_setattr, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + "RE2 match objects", /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + match_methods, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, 
/*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + offsetof(MatchObject2, attr_dict), /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + 0, /*tp_new*/ +}; + + +static void +regexp_dealloc(RegexpObject2* self) +{ + delete self->re2_obj; + Py_XDECREF(self->attr_dict); + PyObject_Del(self); +} + +static PyObject* +create_regexp(PyObject* pattern) +{ + RegexpObject2* regexp = PyObject_New(RegexpObject2, &Regexp_Type2); + if (regexp == NULL) { + return NULL; + } + regexp->re2_obj = NULL; + + const char* raw_pattern = PyString_AS_STRING(pattern); + Py_ssize_t len_pattern = PyString_GET_SIZE(pattern); + + regexp->re2_obj = new(nothrow) RE2(StringPiece(raw_pattern, len_pattern)); + + if (regexp->re2_obj == NULL) { + PyErr_NoMemory(); + Py_DECREF(regexp); + return NULL; + } + + if (!regexp->re2_obj->ok()) { + long code = (long)regexp->re2_obj->error_code(); + const std::string& msg = regexp->re2_obj->error(); + PyObject* value = Py_BuildValue("ls#", code, msg.data(), msg.length()); + if (value == NULL) { + Py_DECREF(regexp); + return NULL; + } + PyErr_SetObject(error_class, value); + Py_DECREF(regexp); + return NULL; + } + + PyObject* groupindex = PyDict_New(); + if (groupindex == NULL) { + Py_DECREF(regexp); + return NULL; + } + + // Build up the attr_dict early so regexp can take ownership of our reference + // to groupindex. 
+ regexp->attr_dict = Py_BuildValue("{sisNsO}", + "groups", regexp->re2_obj->NumberOfCapturingGroups(), + "groupindex", groupindex, + "pattern", pattern); + if (regexp->attr_dict == NULL) { + Py_DECREF(regexp); + return NULL; + } + + const std::map& name_map = regexp->re2_obj->NamedCapturingGroups(); + for (std::map::const_iterator it = name_map.begin(); it != name_map.end(); ++it) { + PyObject* index = PyInt_FromLong(it->second); + if (index == NULL) { + Py_DECREF(regexp); + return NULL; + } + int res = PyDict_SetItemString(groupindex, it->first.c_str(), index); + Py_DECREF(index); + if (res < 0) { + Py_DECREF(regexp); + return NULL; + } + } + + return (PyObject*)regexp; +} + +static PyObject* +_do_search(RegexpObject2* self, PyObject* args, PyObject* kwds, RE2::Anchor anchor, bool return_match) +{ + PyObject* string; + const char* subject; + Py_ssize_t slen; + long pos = 0; + long endpos = LONG_MAX; + + static const char* kwlist[] = { + "string", + "pos", + "endpos", + NULL}; + + // Using O! instead of s# here, because we want to stash the original + // PyObject* in the match object on a successful match. + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!|ll", (char**)kwlist, + &PyString_Type, &string, + &pos, &endpos)) { + return NULL; + } + + subject = PyString_AS_STRING(string); + slen = PyString_GET_SIZE(string); + if (pos < 0) pos = 0; + if (pos > slen) pos = slen; + if (endpos < pos) endpos = pos; + if (endpos > slen) endpos = slen; + + // Don't bother allocating these if we are just doing a test. + int n_groups = 0; + StringPiece* groups = NULL; + if (return_match) { + n_groups = self->re2_obj->NumberOfCapturingGroups() + 1; + groups = new(nothrow) StringPiece[n_groups]; + + if (groups == NULL) { + PyErr_NoMemory(); + return NULL; + } + } + + bool matched = self->re2_obj->Match( + StringPiece(subject+pos, endpos-pos), + 0, // Not sure why this arg exists. 
+ anchor, + groups, + n_groups); + + if (!return_match) { + if (matched) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; + } + + if (!matched) { + delete[] groups; + Py_RETURN_NONE; + } + + // create_match is going to Py_BuildValue the pos and endpos into + // PyObjects. We could optimize the case where pos and/or endpos were + // explicitly passed in by forwarding the existing PyObjects. + // That requires much more intricate code, though. + return create_match((PyObject*)self, string, pos, endpos, groups); +} + +static PyObject* +regexp_search(RegexpObject2* self, PyObject* args, PyObject* kwds) +{ + return _do_search(self, args, kwds, RE2::UNANCHORED, true); +} + +static PyObject* +regexp_match(RegexpObject2* self, PyObject* args, PyObject* kwds) +{ + return _do_search(self, args, kwds, RE2::ANCHOR_START, true); +} + +static PyObject* +regexp_fullmatch(RegexpObject2* self, PyObject* args, PyObject* kwds) +{ + return _do_search(self, args, kwds, RE2::ANCHOR_BOTH, true); +} + +static PyObject* +regexp_test_search(RegexpObject2* self, PyObject* args, PyObject* kwds) +{ + return _do_search(self, args, kwds, RE2::UNANCHORED, false); +} + +static PyObject* +regexp_test_match(RegexpObject2* self, PyObject* args, PyObject* kwds) +{ + return _do_search(self, args, kwds, RE2::ANCHOR_START, false); +} + +static PyObject* +regexp_test_fullmatch(RegexpObject2* self, PyObject* args, PyObject* kwds) +{ + return _do_search(self, args, kwds, RE2::ANCHOR_BOTH, false); +} + + +static void +match_dealloc(MatchObject2* self) +{ + delete[] self->groups; + Py_XDECREF(self->attr_dict); + PyObject_Del(self); +} + +static PyObject* +create_match(PyObject* re, PyObject* string, + long pos, long endpos, + StringPiece* groups) +{ + MatchObject2* match = PyObject_New(MatchObject2, &Match_Type2); + if (match == NULL) { + delete[] groups; + return NULL; + } + match->groups = groups; + match->re = re; + match->string = string; + + match->attr_dict = Py_BuildValue("{sOsOslsl}", + "re", re, + 
"string", string, + "pos", pos, + "endpos", endpos); + if (match->attr_dict == NULL) { + Py_DECREF(match); + return NULL; + } + + return (PyObject*)match; +} + +/** + * Attempt to convert an untrusted group index (PyObject* group) into + * a trusted one (*idx_p). Return false on failure (exception). + */ +static bool +_group_idx(MatchObject2* self, PyObject* group, long* idx_p) +{ + if (group == NULL) { + return false; + } + PyErr_Clear(); // Is this necessary? + long idx = PyInt_AsLong(group); + if (idx == -1 && PyErr_Occurred() != NULL) { + return false; + } + // TODO: Consider caching NumberOfCapturingGroups. + if (idx < 0 || idx > ((RegexpObject2*)self->re)->re2_obj->NumberOfCapturingGroups()) { + PyErr_SetString(PyExc_IndexError, "no such group"); + return false; + } + *idx_p = idx; + return true; +} + +/** + * Extract the start and end indexes of a pre-checked group number. + * Sets both to -1 if it did not participate in the match. + */ +static bool +_group_span(MatchObject2* self, long idx, Py_ssize_t* o_start, Py_ssize_t* o_end) +{ + // "idx" is expected to be verified. + StringPiece& piece = self->groups[idx]; + if (piece.data() == NULL) { + *o_start = -1; + *o_end = -1; + return false; + } + Py_ssize_t start = piece.data() - PyString_AS_STRING(self->string); + *o_start = start; + *o_end = start + piece.length(); + return true; +} + +/** + * Return a pre-checked group number as a string, or default_obj + * if it didn't participate in the match. + */ +static PyObject* +_group_get_i(MatchObject2* self, long idx, PyObject* default_obj) +{ + Py_ssize_t start; + Py_ssize_t end; + if (!_group_span(self, idx, &start, &end)) { + Py_INCREF(default_obj); + return default_obj; + } + return PySequence_GetSlice(self->string, start, end); +} + +/** + * Return n un-checked group number as a string. 
+ */ +static PyObject* +_group_get_o(MatchObject2* self, PyObject* group) +{ + long idx; + if (!_group_idx(self, group, &idx)) { + return NULL; + } + return _group_get_i(self, idx, Py_None); +} + + +static PyObject* +match_group(MatchObject2* self, PyObject* args) +{ + long idx = 0; + Py_ssize_t nargs = PyTuple_GET_SIZE(args); + switch (nargs) { + case 1: + if (!_group_idx(self, PyTuple_GET_ITEM(args, 0), &idx)) { + return NULL; + } + // Fall through. + case 0: + return _group_get_i(self, idx, Py_None); + default: + PyObject* ret = PyTuple_New(nargs); + if (ret == NULL) { + return NULL; + } + + for (int i = 0; i < nargs; i++) { + PyObject* group = _group_get_o(self, PyTuple_GET_ITEM(args, i)); + if (group == NULL) { + Py_DECREF(ret); + return NULL; + } + PyTuple_SET_ITEM(ret, i, group); + } + return ret; + } +} + +static PyObject* +match_groups(MatchObject2* self, PyObject* args, PyObject* kwds) +{ + static const char* kwlist[] = { + "default", + NULL}; + + PyObject* default_obj = Py_None; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", (char**)kwlist, + &default_obj)) { + return NULL; + } + + int ngroups = ((RegexpObject2*)self->re)->re2_obj->NumberOfCapturingGroups(); + + PyObject* ret = PyTuple_New(ngroups); + if (ret == NULL) { + return NULL; + } + + for (int i = 1; i <= ngroups; i++) { + PyObject* group = _group_get_i(self, i, default_obj); + if (group == NULL) { + Py_DECREF(ret); + return NULL; + } + PyTuple_SET_ITEM(ret, i-1, group); + } + + return ret; +} + +static PyObject* +match_groupdict(MatchObject2* self, PyObject* args, PyObject* kwds) +{ + static const char* kwlist[] = { + "default", + NULL}; + + PyObject* default_obj = Py_None; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", (char**)kwlist, + &default_obj)) { + return NULL; + } + + PyObject* ret = PyDict_New(); + if (ret == NULL) { + return NULL; + } + + const std::map& name_map = ((RegexpObject2*)self->re)->re2_obj->NamedCapturingGroups(); + for (std::map::const_iterator it = 
name_map.begin(); it != name_map.end(); ++it) { + PyObject* group = _group_get_i(self, it->second, default_obj); + if (group == NULL) { + Py_DECREF(ret); + return NULL; + } + // TODO: Group names with embedded zeroes? + int res = PyDict_SetItemString(ret, it->first.data(), group); + Py_DECREF(group); + if (res < 0) { + Py_DECREF(ret); + return NULL; + } + } + + return ret; +} + +enum span_mode_t { START, END, SPAN }; + +static PyObject* +_do_span(MatchObject2* self, PyObject* args, const char* name, span_mode_t mode) +{ + long idx = 0; + PyObject* group = NULL; + if (!PyArg_UnpackTuple(args, name, 0, 1, + &group)) { + return NULL; + } + if (group != NULL) { + if (!_group_idx(self, group, &idx)) { + return NULL; + } + } + + Py_ssize_t start = - 1; + Py_ssize_t end = - 1; + + (void)_group_span(self, idx, &start, &end); + switch (mode) { + case START : return Py_BuildValue("n", start ); + case END : return Py_BuildValue("n", end ); + case SPAN: + return Py_BuildValue("nn", start, end); + } + + // Make gcc happy. 
+ return NULL; +} + +static PyObject* +match_start(MatchObject2* self, PyObject* args) +{ + return _do_span(self, args, "start", START); +} + +static PyObject* +match_end(MatchObject2* self, PyObject* args) +{ + return _do_span(self, args, "end", END); +} + +static PyObject* +match_span(MatchObject2* self, PyObject* args) +{ + return _do_span(self, args, "span", SPAN); +} + + +static PyObject* +_compile(RegexpObject2* self, PyObject* args, PyObject* kwds) +{ + static const char* kwlist[] = { + "pattern", + NULL}; + + PyObject* pattern; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O", (char**)kwlist, + &pattern)) { + return NULL; + } + + return create_regexp(pattern); +} + +static PyMethodDef methods[] = { + {"_compile", (PyCFunction)_compile, METH_VARARGS | METH_KEYWORDS, + NULL + }, + {NULL} /* Sentinel */ +}; + +PyMODINIT_FUNC +init_re2(void) +{ + Regexp_Type2.tp_new = PyType_GenericNew; + if (PyType_Ready(&Regexp_Type2) < 0) { + return; + } + + Match_Type2.tp_new = PyType_GenericNew; + if (PyType_Ready(&Match_Type2) < 0) { + return; + } + + PyObject* re2_mod = PyImport_ImportModuleNoBlock("re2"); + if (re2_mod == NULL) { + return; + } + /* static global */ error_class = PyObject_GetAttrString(re2_mod, "error"); + if (error_class == NULL) { + return; + } + + PyObject* mod = Py_InitModule("_re2", methods); + (void)mod; +} diff --git a/re2.py b/re2.py new file mode 100644 index 00000000..587af0be --- /dev/null +++ b/re2.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python + +# Define this first, since it is referenced by _re2. 
+class error(Exception): + pass + +import _re2 + +compile = _re2._compile diff --git a/setup.py b/setup.py new file mode 100755 index 00000000..3e933c7e --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python + +from distutils.core import setup, Extension + +setup( + name="re2", + version="0.1.0", + description="Python wrapper for Google's RE2", + author="David Reiss", + py_modules = ["re2"], + ext_modules = [Extension("_re2", + sources = ["_re2.cc"], + libraries = ["re2"], + )], + )