From 7f12d2526c2048a4a795e10f1b3d7865f8e4458d Mon Sep 17 00:00:00 2001 From: Thomas Bird Date: Fri, 1 Sep 2023 16:08:45 +0100 Subject: [PATCH 1/4] try to fix issue with building cchardet --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 6a8210a..56ffb76 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -65,7 +65,7 @@ jobs: export LD_LIBRARY_PATH=${{ env.LD_LIBRARY_PATH }} export C_INCLUDE_PATH=${{ env.C_INCLUDE_PATH }} export CPP_INCLUDE_PATH=${{ env.CPP_INCLUDE_PATH }} - python -m pip install --upgrade pip wheel setuptools + python -m pip install --upgrade pip wheel setuptools Cython pip install -r requirements/python-dev - name: Cache restore nltk data From 328c1b81f1679872551774976c70885a18a551d3 Mon Sep 17 00:00:00 2001 From: Thomas Bird Date: Fri, 1 Sep 2023 17:02:06 +0100 Subject: [PATCH 2/4] fix mypy and flake8 issues --- scrubadub/comparison.py | 6 +++--- scrubadub/detectors/base.py | 14 +++++++------- scrubadub/detectors/date_of_birth.py | 4 ++++ scrubadub/filth/base.py | 8 ++++---- scrubadub/filth/phone.py | 3 +-- scrubadub/post_processors/filth_replacer.py | 10 ++++++---- scrubadub/scrubbers.py | 5 +++-- 7 files changed, 28 insertions(+), 22 deletions(-) diff --git a/scrubadub/comparison.py b/scrubadub/comparison.py index 70d2920..d14e3b3 100644 --- a/scrubadub/comparison.py +++ b/scrubadub/comparison.py @@ -9,7 +9,7 @@ from .filth import Filth from .detectors.tagged import KnownFilthItem -from typing import List, Dict, Union, Optional, Tuple, Callable, Iterable, Type, Set +from typing import List, Dict, Union, Optional, Tuple, Callable, Iterable, Type import numpy as np import pandas as pd import sklearn.metrics @@ -27,8 +27,8 @@ class TextPosition(ToStringMixin): def __init__(self, filth: Filth, grouping_function: GroupingFunction): self.beg = filth.beg self.end = filth.end - self.detected = 
set() # type: Set[Tuple[str, ...]] - self.tagged = set() # type: Set[Tuple[str, ...]] + self.detected: set[Tuple[str, ...]] = set() + self.tagged: set[Tuple[str, ...]] = set() self.document_name = str(filth.document_name or '') # type: str if isinstance(filth, filth_module.TaggedEvaluationFilth): diff --git a/scrubadub/detectors/base.py b/scrubadub/detectors/base.py index 628c543..0755d4d 100644 --- a/scrubadub/detectors/base.py +++ b/scrubadub/detectors/base.py @@ -32,9 +32,9 @@ class Detector(object): ```Detector.supported_local()``` function. """ - filth_cls = Filth # type: ClassVar[Type[Filth]] - name = 'detector' # type: str - autoload = False # type: bool + filth_cls: ClassVar[Type[Filth]] = Filth + name: str = 'detector' + autoload: bool = False def __init__(self, name: Optional[str] = None, locale: str = 'en_US'): """Initialise the ``Detector``. @@ -46,7 +46,7 @@ def __init__(self, name: Optional[str] = None, locale: str = 'en_US'): :type locale: str, optional """ if getattr(self, 'name', 'detector') == 'detector' and getattr(self, 'filth_cls', None) is not None: - if getattr(self.filth_cls, 'type', None) is not None and type(self) != Detector: + if getattr(self.filth_cls, 'type', None) is not None and type(self) is not Detector: self.name = self.filth_cls.type warnings.warn( "Setting the detector name from the filth_cls.type is depreciated, please declare an explicit name" @@ -111,8 +111,8 @@ class RegexDetector(Detector): 'This url will be found {{URL}}' """ - regex = None # type: Optional[Pattern[str]] - filth_cls = Filth # type: ClassVar[Type[Filth]] + regex: Optional[Pattern[str]] = None + filth_cls: ClassVar[Type[Filth]] = Filth def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: """Yields discovered filth in the provided ``text``. 
@@ -145,7 +145,7 @@ def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generato class RegionLocalisedRegexDetector(RegexDetector): """Detector to detect ``Filth`` localised using regular expressions localised by the region""" - region_regex = {} # type: Dict[str, Pattern] + region_regex: Dict[str, Pattern] = {} def __init__(self, **kwargs): """Initialise the ``Detector``. diff --git a/scrubadub/detectors/date_of_birth.py b/scrubadub/detectors/date_of_birth.py index 6aa4eaa..ef80ea5 100644 --- a/scrubadub/detectors/date_of_birth.py +++ b/scrubadub/detectors/date_of_birth.py @@ -78,6 +78,8 @@ def __init__(self, context_before: int = 2, context_after: int = 1, require_cont self.context_after = context_after self.require_context = require_context + if self.language is None: + raise ValueError("Langauge is not set.") try: self.context_words = self.context_words_language_map[self.language] except KeyError: @@ -100,6 +102,8 @@ def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generato """ # using the dateparser lib - locale can be set here + if self.language is None: + raise ValueError("Langauge is not set.") try: date_picker = search_dates(text, languages=[self.language]) except RecursionError: diff --git a/scrubadub/filth/base.py b/scrubadub/filth/base.py index 004c6b6..615949d 100644 --- a/scrubadub/filth/base.py +++ b/scrubadub/filth/base.py @@ -13,18 +13,18 @@ class Filth(object): # this allows people to customize the output, especially for placeholder # text and identifier replacements - prefix = u'{{' # type: ClassVar[str] - suffix = u'}}' # type: ClassVar[str] + prefix: ClassVar[str] = u'{{' + suffix: ClassVar[str] = u'}}' # the `type` is used when filths are merged to come up with a sane label - type = 'unknown' # type: ClassVar[str] + type: ClassVar[str] = 'unknown' # the `lookup` is used to keep track of all of the different types of filth # that are encountered across all `Filth` types. 
lookup = utils.Lookup() # For backwards compatibility, but this is deprecated. - regex = None # type: Optional[Pattern[str]] + regex: Optional[Pattern[str]] = None def __init__(self, beg: Optional[int] = None, end: Optional[int] = None, text: Optional[str] = None, match: Optional[Match] = None, detector_name: Optional[str] = None, diff --git a/scrubadub/filth/phone.py b/scrubadub/filth/phone.py index 869ed8e..5f20409 100644 --- a/scrubadub/filth/phone.py +++ b/scrubadub/filth/phone.py @@ -2,7 +2,6 @@ import phonenumbers from faker import Faker -from typing import List from .base import Filth from .. import utils @@ -22,7 +21,7 @@ def generate(faker: Faker) -> str: """ phone_number = '' language, region = utils.locale_split(faker._locales[0]) - results = [] # type: List[phonenumbers.PhoneNumberMatch] + results = [] # type: list[phonenumbers.PhoneNumberMatch] # Here I'm filtering for numbers that pass validation by the phonenumbers package while len(results) < 1: # Faker generates random numbers of the right format eg (###)###-#### diff --git a/scrubadub/post_processors/filth_replacer.py b/scrubadub/post_processors/filth_replacer.py index 75f15e3..90322cd 100644 --- a/scrubadub/post_processors/filth_replacer.py +++ b/scrubadub/post_processors/filth_replacer.py @@ -3,7 +3,7 @@ import math import hashlib -from typing import Sequence, Optional, Union, Dict +from typing import Sequence, Optional, Union from collections import defaultdict from scrubadub.filth import Filth, MergedFilth, TaggedEvaluationFilth @@ -43,7 +43,7 @@ class FilthReplacer(PostProcessor): # NOTE: this is not an efficient way to store this in memory. 
could # alternatively hash the type and text and do away with the overhead # bits of storing the tuple in the lookup - typed_lookup = defaultdict(lambda: utils.Lookup(), {}) # type: Dict[str, utils.Lookup] + typed_lookup = defaultdict(lambda: utils.Lookup(), {}) # type: dict[str, utils.Lookup] def __init__(self, include_type: bool = True, include_count: bool = False, include_hash: bool = False, uppercase: bool = True, separator: Optional[str] = None, hash_length: Optional[int] = None, @@ -101,9 +101,11 @@ def filth_label(self, filth: Filth) -> str: replacement_pieces = [] if self.include_type: - filth_type = getattr(f, 'type', None) - if filth_type is None: + filth_type_check: Optional[str] = getattr(f, 'type', None) + if filth_type_check is None: continue + else: + filth_type: str = filth_type_check if filth_type == TaggedEvaluationFilth.type: filth_comparison_type = getattr(f, 'comparison_type', None) if filth_comparison_type is not None: diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index ca17523..c1529a8 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -152,7 +152,8 @@ def _check_and_add_detector(self, detector: Detector, warn: bool = False): ) % locals()) self._detectors[name] = detector - def add_post_processor(self, post_processor: Union[PostProcessor, Type[PostProcessor], str], index: int = None): + def add_post_processor(self, post_processor: Union[PostProcessor, Type[PostProcessor], str], + index: Optional[int] = None): """Add a ``PostProcessor`` to a Scrubber You can add a post-processor to a ``Scrubber`` by passing one of three objects to this function: @@ -215,7 +216,7 @@ def remove_post_processor(self, post_processor: Union[PostProcessor, Type[PostPr elif isinstance(post_processor, str): self._post_processors = [x for x in self._post_processors if x.name != post_processor] - def _check_and_add_post_processor(self, post_processor: PostProcessor, index: int = None): + def _check_and_add_post_processor(self, 
post_processor: PostProcessor, index: Optional[int] = None): """Check the types and add the PostProcessor to the scrubber""" if not isinstance(post_processor, PostProcessor): raise TypeError(( From 678aa81cbc022ede03f7613354dd00fef962011e Mon Sep 17 00:00:00 2001 From: Thomas Bird Date: Fri, 1 Sep 2023 17:35:04 +0100 Subject: [PATCH 3/4] move to better maintained packages --- requirements/python-dev | 4 ++-- tests/benchmark_accuracy_real_data.py | 4 +--- tests/run.py | 5 ++--- tests/test_comparison_classes.py | 6 +++--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/requirements/python-dev b/requirements/python-dev index 84491b1..a1e807f 100644 --- a/requirements/python-dev +++ b/requirements/python-dev @@ -4,7 +4,7 @@ # needed to run the tests flake8 coveralls -nose +pytest mypy tox @@ -13,7 +13,7 @@ sphinx>=3 sphinx_rtd_theme>=0.5 # This is for the tests/benchmark_accuracy_real_data.py script -cchardet +chardet pandas click python-magic diff --git a/tests/benchmark_accuracy_real_data.py b/tests/benchmark_accuracy_real_data.py index 04159d6..2d1632d 100755 --- a/tests/benchmark_accuracy_real_data.py +++ b/tests/benchmark_accuracy_real_data.py @@ -7,9 +7,7 @@ import click import magic import dotenv -# import chardet -# try a new chardet package, its a drop in replacement based on a mozilla project. 
-import cchardet as chardet +import chardet import logging import posixpath import azure.storage.blob diff --git a/tests/run.py b/tests/run.py index 74feac6..67179a9 100755 --- a/tests/run.py +++ b/tests/run.py @@ -9,11 +9,10 @@ tests = [ "mypy --config-file setup.cfg scrubadub/", "flake8 --config setup.cfg scrubadub/", - # If py3.5 then examples with spacy don't work so disable doctests - 'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub/ ./docs/ ; else nosetests ; fi', + 'pytest --doctest-glob="*.rst" ./tests/ ./scrubadub/ ./docs/', "python3 ./tests/benchmark_accuracy.py --fast", "python3 ./tests/benchmark_time.py", - 'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then cd docs && make html && cd - ; fi', + 'cd docs && make html && cd -', ] diff --git a/tests/test_comparison_classes.py b/tests/test_comparison_classes.py index ee4b40b..b62069a 100644 --- a/tests/test_comparison_classes.py +++ b/tests/test_comparison_classes.py @@ -246,11 +246,11 @@ def test_filth_grouper(self): self.assertEqual(['filth', 'detector', 'locale'], df.columns.names) self.assertEqual( [ - ('name', 'tagged', 'en_US'), ('phone', 'phone', 'en_GB'), - ('phone', 'phone', 'en_US'), ('phone', 'tagged', 'en_GB'), - ('phone', 'tagged', 'en_US') + ('phone', 'phone', 'en_US'), + ('phone', 'tagged', 'en_US'), + ('name', 'tagged', 'en_US'), ], df.columns.values.tolist(), ) From 02a2893c49247ea128f81f07e1cb8fb38acace25 Mon Sep 17 00:00:00 2001 From: Thomas Bird Date: Fri, 1 Sep 2023 17:35:31 +0100 Subject: [PATCH 4/4] fix issue with credit card detector not detecting numbers at the start of a string --- scrubadub/detectors/credit_card.py | 3 ++- tests/test_detector_credit_card.py | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/scrubadub/detectors/credit_card.py b/scrubadub/detectors/credit_card.py index 2e2b3ab..6537686 100644 --- a/scrubadub/detectors/credit_card.py +++ 
b/scrubadub/detectors/credit_card.py @@ -29,7 +29,7 @@ class CreditCardDetector(RegexDetector): # TODO: regex doesn't match if the credit card number has spaces/dashes in regex = re.compile(( - r"(?<=\s)" + r"\b" r"(?:4[0-9]{12}(?:[0-9]{3})?" # Visa r"|(?:5[1-5][0-9]{2}" # MasterCard r"|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}" @@ -37,4 +37,5 @@ class CreditCardDetector(RegexDetector): r"|3(?:0[0-5]|[68][0-9])[0-9]{11}" # Diners Club r"|6(?:011|5[0-9]{2})[0-9]{12}" # Discover r"|(?:2131|1800|35\d{3})\d{11})" # JCB + r"\b" ), re.VERBOSE) diff --git a/tests/test_detector_credit_card.py b/tests/test_detector_credit_card.py index 2d8399e..3fa1288 100644 --- a/tests/test_detector_credit_card.py +++ b/tests/test_detector_credit_card.py @@ -99,3 +99,10 @@ def test_visa2(self): AFTER: My credit card is {{CREDIT_CARD}}. """ self.compare_before_after() + + def test_start_of_string(self): + """ + BEFORE: 4012888888881881. + AFTER: {{CREDIT_CARD}}. + """ + self.compare_before_after()