Skip to content

Commit

Permalink
Fix tokenising when using more than just a-zA-Z
Browse files Browse the repository at this point in the history
Previously: `Händler` would be tokenized as `ndler` or `ändler` depending on Python version,
rather than the expected `händler`.

Solution: use `regex` rather than `re`.
This gives us the ability to use Unicode character classes such as `[[:upper:]]` and `[[:lower:]]`.

Fixes myint#35
  • Loading branch information
robotdana committed Nov 30, 2018
1 parent 70307ba commit 0816f8e
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 13 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
/MANIFEST
__pycache__/
*.pyc
test.cram.err
20 changes: 10 additions & 10 deletions scspell/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

import argparse
import os
import re
import regex
import sys
import shutil
import uuid
Expand Down Expand Up @@ -78,22 +78,22 @@
# Treat anything alphanumeric as a token of interest, as long as it is not
# immediately preceded by a single backslash. (The string "\ntext" should
# match on "text" rather than "ntext".)
C_ESCAPE_TOKEN_REGEX = re.compile(r'(?<![^\\]\\)\w+')
C_ESCAPE_TOKEN_REGEX = regex.compile(r'(?<![^\\]\\)\w+')

# \ is not a character escape in e.g. LaTeX
TOKEN_REGEX = re.compile(r'\w+')
TOKEN_REGEX = regex.compile(r'\w+')

# Hex digits will be treated as a special case, because they can look like
# word-like even though they are actually numeric
HEX_REGEX = re.compile(r'0x[0-9a-fA-F]+')
HEX_REGEX = regex.compile(r'0x[0-9a-fA-F]+')

# We assume that tokens will be split using either underscores,
# digits, or camelCase conventions (or both)
US_REGEX = re.compile(r'[_\d]+')
CAMEL_WORD_REGEX = re.compile(r'([A-Z][a-z]*)')
US_REGEX = regex.compile(r'[_\d]+')
CAMEL_WORD_REGEX = regex.compile(r'([[:upper:]][[:lower:]]*)')

# File-id specifiers take this form
FILE_ID_REGEX = re.compile(r'scspell-id:[ \t]*([a-zA-Z0-9_\-]+)')
FILE_ID_REGEX = regex.compile(r'scspell-id:[ \t]*([a-zA-Z0-9_\-]+)')


class MatchDescriptor(object):
Expand Down Expand Up @@ -384,7 +384,7 @@ def handle_failed_check_interactively(
print("%s:%u: Unmatched '%s' --> {%s}" %
(filename, match_desc.get_line_num(), token,
', '.join([st for st in unmatched_subtokens])))
MATCH_REGEX = re.compile(re.escape(match_desc.get_token()))
MATCH_REGEX = regex.compile(regex.escape(match_desc.get_token()))
while True:
print("""\
(i)gnore, (I)gnore all, (r)eplace, (R)eplace all, (a)dd to dictionary, or
Expand All @@ -405,7 +405,7 @@ def handle_failed_check_interactively(
(Canceled.)\n""")
else:
ignores.add(replacement.lower())
tail = re.sub(
tail = regex.sub(
MATCH_REGEX, replacement, match_desc.get_remainder(),
1 if ch == 'r' else 0)
print()
Expand Down Expand Up @@ -771,7 +771,7 @@ def add_to_dict(dictionary_type, word, files=[],
dicts.add_by_file_id(word, file_id)

elif dictionary_type[0] == 'p':
ext = re.sub(r'.*\.', '.', '.{}'.format(files[0].lower()))
ext = regex.sub(r'.*\.', '.', '.{}'.format(files[0].lower()))
if not dicts.add_by_extension(word, ext):
print("Dictionary for file extension '{}' not found."
.format(ext), file=sys.stderr)
Expand Down
4 changes: 2 additions & 2 deletions scspell/_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import io
import json
import os
import re
import regex
import sys
from bisect import bisect_left
from . import _util
Expand All @@ -41,7 +41,7 @@


# Valid file ID strings take this form
FILE_ID_REGEX = re.compile(r'[a-zA-Z0-9_\-]+')
FILE_ID_REGEX = regex.compile(r'[a-zA-Z0-9_\-]+')


MATCH_NATURAL = 0x1
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,6 @@ def get_version():
'Topic :: Software Development',
'Topic :: Text Processing :: Linguistic',
'Topic :: Utilities'],
platforms=['any']
platforms=['any'],
install_requires=['regex']
)
8 changes: 8 additions & 0 deletions test.cram
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ Test spelling mistake.
bad.txt:1: 'blabbb' not found in dictionary (from token 'blabbb')
[1]

Test spelling mistake with diacritics.

$ $SCSPELL $TESTDIR/tests/basedicts/unicode-testfile
/Users/dana/M/scspell/tests/basedicts/unicode-testfile:1: 'b\xc3\xa4dly' not found in dictionary (from token 'B\xc3\xa4dly') (esc)
/Users/dana/M/scspell/tests/basedicts/unicode-testfile:1: '\xc3\xa1lmost' not found in dictionary (from token '\xc3\x81lmost') (esc)
/Users/dana/M/scspell/tests/basedicts/unicode-testfile:1: '\xc3\xa7\xc3\xa5m\xc3\xa9l', '\xc3\xa7\xc3\xa4se' were not found in the dictionary (from token '\xc3\x87\xc3\xa5m\xc3\xa9l\xc3\x87\xc3\xa4se') (esc)
[1]

Test okay file.

$ echo 'This is okay.' > good.txt
Expand Down
1 change: 1 addition & 0 deletions tests/basedicts/unicode-testfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Bädly Álmost ÇåmélÇäse

0 comments on commit 0816f8e

Please sign in to comment.