From cfc6f2abec098d38e6758347cd1b60bfcdbe72fc Mon Sep 17 00:00:00 2001
From: Andreas van Cranenburgh
Date: Thu, 30 Apr 2020 19:41:41 +0200
Subject: [PATCH] Add contains() method

- contains() works like search() but returns a bool to avoid creating a
  Match object. see #12
- add wrapper for re.Pattern so that contains() and count() methods are
  also available when falling back to re.
---
NOTE(review): line breaks and the indentation of context lines were
reconstructed from a whitespace-collapsed copy of this patch; verify them
against the tree (or apply with --ignore-whitespace). Added lines were
edited in place (hunk sizes unchanged), so the blob ids on the "index"
lines no longer describe the resulting files.
Review fixes folded in: Pattern.contains docstring quoting;
PythonRePattern flags default (None -> 0), contains() pos/endpos support,
and __reduce__ returning the class instead of the instance.

 src/compile.pxi |  4 +--
 src/pattern.pxi | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/re2.pyx     |  9 ++++-
 tests/count.txt | 13 +++++---
 4 files changed, 108 insertions(+), 7 deletions(-)

diff --git a/src/compile.pxi b/src/compile.pxi
index f56af557..1e53f602 100644
--- a/src/compile.pxi
+++ b/src/compile.pxi
@@ -20,7 +20,7 @@ def _compile(object pattern, int flags=0, int max_mem=8388608):
         elif current_notification == FALLBACK_WARNING:
             warnings.warn("WARNING: Using re module. Reason: %s" % error_msg)
         try:
-            result = re.compile(pattern, flags)
+            result = PythonRePattern(pattern, flags)
         except re.error as err:
             raise RegexError(*err.args)
         return result
@@ -93,7 +93,7 @@ def _compile(object pattern, int flags=0, int max_mem=8388608):
             raise RegexError(error_msg)
         elif current_notification == FALLBACK_WARNING:
             warnings.warn("WARNING: Using re module. Reason: %s" % error_msg)
-        return re.compile(original_pattern, flags)
+        return PythonRePattern(original_pattern, flags)
 
     cdef Pattern pypattern = Pattern()
     cdef map[cpp_string, int] named_groups = re_pattern.NamedCapturingGroups()
diff --git a/src/pattern.pxi b/src/pattern.pxi
index 5c75de7b..0950db2b 100644
--- a/src/pattern.pxi
+++ b/src/pattern.pxi
@@ -78,6 +78,45 @@ cdef class Pattern:
             release_cstring(&buf)
         return m
 
+    def contains(self, object string, int pos=0, int endpos=-1):
+        """contains(string[, pos[, endpos]]) --> bool.
+
+        Scan through string looking for a match, and return True or False."""
+        cdef char * cstring
+        cdef Py_ssize_t size
+        cdef Py_buffer buf
+        cdef int retval
+        cdef int encoded = 0
+        cdef StringPiece * sp
+
+        if 0 <= endpos <= pos:
+            return False
+
+        bytestr = unicode_to_bytes(string, &encoded, self.encoded)
+        if pystring_to_cstring(bytestr, &cstring, &size, &buf) == -1:
+            raise TypeError('expected string or buffer')
+        try:
+            if encoded == 2 and (pos or endpos != -1):
+                utf8indices(cstring, size, &pos, &endpos)
+            if pos > size:
+                return False
+            if 0 <= endpos < size:
+                size = endpos
+
+            sp = new StringPiece(cstring, size)
+            with nogil:
+                retval = self.re_pattern.Match(
+                        sp[0],
+                        pos,
+                        size,
+                        UNANCHORED,
+                        NULL,
+                        0)
+            del sp
+        finally:
+            release_cstring(&buf)
+        return retval != 0
+
     def count(self, object string, int pos=0, int endpos=-1):
         """Return number of non-overlapping matches of pattern in string."""
         cdef char * cstring
@@ -547,3 +586,53 @@ cdef class Pattern:
 
     def __dealloc__(self):
         del self.re_pattern
+
+
+class PythonRePattern:
+    """A wrapper for re.Pattern to support the extra methods defined by re2
+    (contains, count)."""
+    def __init__(self, pattern, flags=0):
+        self._pattern = re.compile(pattern, flags)
+        self.pattern = pattern
+        self.flags = flags
+        self.groupindex = self._pattern.groupindex
+        self.groups = self._pattern.groups
+
+    def contains(self, string, pos=0, endpos=9223372036854775807):
+        return bool(self._pattern.search(string, pos, endpos))
+
+    def count(self, string, pos=0, endpos=9223372036854775807):
+        return len(self._pattern.findall(string, pos, endpos))
+
+    def findall(self, string, pos=0, endpos=9223372036854775807):
+        return self._pattern.findall(string, pos, endpos)
+
+    def finditer(self, string, pos=0, endpos=9223372036854775807):
+        return self._pattern.finditer(string, pos, endpos)
+
+    def fullmatch(self, string, pos=0, endpos=9223372036854775807):
+        return self._pattern.fullmatch(string, pos, endpos)
+
+    def match(self, string, pos=0, endpos=9223372036854775807):
+        return self._pattern.match(string, pos, endpos)
+
+    def scanner(self, string, pos=0, endpos=9223372036854775807):
+        return self._pattern.scanner(string, pos, endpos)
+
+    def search(self, string, pos=0, endpos=9223372036854775807):
+        return self._pattern.search(string, pos, endpos)
+
+    def split(self, string, maxsplit=0):
+        return self._pattern.split(string, maxsplit)
+
+    def sub(self, repl, string, count=0):
+        return self._pattern.sub(repl, string, count)
+
+    def subn(self, repl, string, count=0):
+        return self._pattern.subn(repl, string, count)
+
+    def __repr__(self):
+        return repr(self._pattern)
+
+    def __reduce__(self):
+        return (type(self), (self.pattern, self.flags))
diff --git a/src/re2.pyx b/src/re2.pyx
index 36fe86b0..6638f5fb 100644
--- a/src/re2.pyx
+++ b/src/re2.pyx
@@ -72,7 +72,8 @@ This module exports the following functions::
     count     Count all occurrences of a pattern in a string.
     match     Match a regular expression pattern to the beginning of a string.
     fullmatch Match a regular expression pattern to all of a string.
-    search    Search a string for the presence of a pattern.
+    search    Search a string for a pattern and return Match object.
+    contains  Same as search, but only return bool.
     sub       Substitute occurrences of a pattern found in a string.
     subn      Same as sub, but also return the number of substitutions made.
     split     Split a string by the occurrences of a pattern.
@@ -170,6 +171,12 @@ def fullmatch(pattern, string, int flags=0):
     return compile(pattern, flags).fullmatch(string)
 
 
+def contains(pattern, string, int flags=0):
+    """Scan through string looking for a match to the pattern, returning
+    True or False."""
+    return compile(pattern, flags).contains(string)
+
+
 def finditer(pattern, string, int flags=0):
     """Yield all non-overlapping matches in the string.
 
diff --git a/tests/count.txt b/tests/count.txt
index f5ab6ced..3c848fb7 100644
--- a/tests/count.txt
+++ b/tests/count.txt
@@ -9,13 +9,10 @@ This one is from http://docs.python.org/library/re.html?#finding-all-adverbs:
     >>> re2.count(r"\w+ly", "He was carefully disguised but captured quickly by police.")
     2
 
-This one makes sure all groups are found:
+Groups should not affect count():
 
     >>> re2.count(r"(\w+)=(\d+)", "foo=1,foo=2")
     2
-
-When there's only one matched group, it should not be returned in a tuple:
-
     >>> re2.count(r"(\w)\w", "fx")
     1
 
@@ -31,3 +28,11 @@ A pattern matching an empty string:
 
     >>> re2.count("", "foo")
     4
+
+contains tests
+==============
+
+    >>> re2.contains('a', 'bbabb')
+    True
+    >>> re2.contains('a', 'bbbbb')
+    False