Skip to content

Commit

Permalink
add ASCII-only option, to mimic default RE2 behaviour
Browse files Browse the repository at this point in the history
This is a workaround, motivated by the difference in how Oniguruma handles
invalid UTF-8 bytes, compared to Go's default RE2.

See src-d/enry#225 (comment)

Summary of changes:
 - c: prevent `NewOnigRegex()` from hard-coding UTF8
 - c: `NewOnigRegex()` now properly calls `onig_initialize()` [1]
 - go: expose new `MustCompileASCII()`, whose default `\w` character class matches only ASCII
 - go: `MustCompile()` refactored, `initRegexp()` extracted for common UTF8/ASCII logic

Encoding was intentionally not exposed at the Go API level, for simplicity,
in order to avoid introducing a complex struct type [2] to the API surface.

 1. https://github.com/kkos/oniguruma/blob/83572e983928243d741f61ac290fc057d69fefc3/doc/API#L6
 2. https://github.com/kkos/oniguruma/blob/83572e983928243d741f61ac290fc057d69fefc3/src/oniguruma.h#L121

Signed-off-by: Alexander Bezzubov <[email protected]>
  • Loading branch information
bzz committed May 7, 2019
1 parent 7883039 commit 8b90ffc
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
3 changes: 2 additions & 1 deletion chelper.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ int NewOnigRegex( char *pattern, int pattern_length, int option,
*error_info = (OnigErrorInfo *) malloc(sizeof(OnigErrorInfo));
memset(*error_info, 0, sizeof(OnigErrorInfo));

*encoding = (void*)ONIG_ENCODING_UTF8;
OnigEncoding use_encs[] = { *encoding };
onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0]));

*error_buffer = (char*) malloc(ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char));

Expand Down
26 changes: 22 additions & 4 deletions regex.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,22 @@ type Regexp struct {
}

// NewRegexp creates a Regexp for pattern whose encoding field is set to
// C.ONIG_ENCODING_UTF8 and passes it, together with option, to initRegexp,
// returning whatever initRegexp returns.
//
// NOTE(review): the first three statements below (the bare &Regexp literal and
// the CString/free pair) look like the pre-change side of the diff this text
// was taken from; confirm they are not present in the actual file.
func NewRegexp(pattern string, option int) (re *Regexp, err error) {
re = &Regexp{pattern: pattern}
patternCharPtr := C.CString(pattern)
defer C.free(unsafe.Pointer(patternCharPtr))
re = &Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}
return initRegexp(re, option)
}

// NewRegexpASCII creates a Regexp for pattern whose encoding field is set to
// C.ONIG_ENCODING_ASCII instead of UTF-8, then delegates compilation to
// initRegexp with the supplied option flags. The Regexp and error returned by
// initRegexp are passed back to the caller unchanged.
func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) {
	asciiRe := &Regexp{
		pattern:  pattern,
		encoding: C.ONIG_ENCODING_ASCII,
	}
	return initRegexp(asciiRe, option)
}

func initRegexp(re *Regexp, option int) (*Regexp, error) {
var err error
patternCharPtr := C.CString(re.pattern)
defer C.free(unsafe.Pointer(patternCharPtr))
mutex.Lock()
defer mutex.Unlock()
error_code := C.NewOnigRegex(patternCharPtr, C.int(len(pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf)
error_code := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf)
if error_code != C.ONIG_NORMAL {
err = errors.New(C.GoString(re.errorBuf))
} else {
Expand Down Expand Up @@ -95,6 +104,15 @@ func MustCompileWithOption(str string, option int) *Regexp {
return regexp
}

// MustCompileASCII is the equivalent of MustCompile, except that the pattern
// is built through NewRegexpASCII (ASCII encoding) with ONIG_OPTION_DEFAULT.
// It panics with a message that includes str and the underlying error text if
// the pattern cannot be compiled.
func MustCompileASCII(str string) *Regexp {
	re, err := NewRegexpASCII(str, ONIG_OPTION_DEFAULT)
	if err == nil {
		return re
	}
	panic("regexp: compiling " + str + ": " + err.Error())
}

func (re *Regexp) Free() {
mutex.Lock()
if re.regex != nil {
Expand Down

0 comments on commit 8b90ffc

Please sign in to comment.