LeapBeyond · thomasbird · Sep 1, 2023 · Sep 1, 2023 · Sep 1, 2023 · Sep 1, 2023
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
@@ -65,7 +65,7 @@ jobs:
         export LD_LIBRARY_PATH=${{ env.LD_LIBRARY_PATH }}
         export C_INCLUDE_PATH=${{ env.C_INCLUDE_PATH }}
         export CPP_INCLUDE_PATH=${{ env.CPP_INCLUDE_PATH }}
-        python -m pip install --upgrade pip wheel setuptools
+        python -m pip install --upgrade pip wheel setuptools Cython
         pip install -r requirements/python-dev
 
     - name: Cache restore nltk data

diff --git a/requirements/python-dev b/requirements/python-dev
@@ -4,7 +4,7 @@
 # needed to run the tests
 flake8
 coveralls
-nose
+pytest
 mypy
 tox
 
@@ -13,7 +13,7 @@ sphinx>=3
 sphinx_rtd_theme>=0.5
 
 # This is for the tests/benchmark_accuracy_real_data.py script
-cchardet
+chardet
 pandas
 click
 python-magic

diff --git a/scrubadub/comparison.py b/scrubadub/comparison.py
@@ -9,7 +9,7 @@
 from .filth import Filth
 from .detectors.tagged import KnownFilthItem
 
-from typing import List, Dict, Union, Optional, Tuple, Callable, Iterable, Type, Set
+from typing import List, Dict, Union, Optional, Tuple, Callable, Iterable, Type
 import numpy as np
 import pandas as pd
 import sklearn.metrics
@@ -27,8 +27,8 @@ class TextPosition(ToStringMixin):
     def __init__(self, filth: Filth, grouping_function: GroupingFunction):
         self.beg = filth.beg
         self.end = filth.end
-        self.detected = set()  # type: Set[Tuple[str, ...]]
-        self.tagged = set()  # type: Set[Tuple[str, ...]]
+        self.detected: set[Tuple[str, ...]] = set()
+        self.tagged: set[Tuple[str, ...]] = set()
         self.document_name = str(filth.document_name or '')  # type: str
 
         if isinstance(filth, filth_module.TaggedEvaluationFilth):

diff --git a/scrubadub/detectors/base.py b/scrubadub/detectors/base.py
@@ -32,9 +32,9 @@ class Detector(object):
     ```Detector.supported_local()``` function.
     """
 
-    filth_cls = Filth  # type: ClassVar[Type[Filth]]
-    name = 'detector'  # type: str
-    autoload = False  # type: bool
+    filth_cls: ClassVar[Type[Filth]] = Filth
+    name: str = 'detector'
+    autoload: bool = False
 
     def __init__(self, name: Optional[str] = None, locale: str = 'en_US'):
         """Initialise the ``Detector``.
@@ -46,7 +46,7 @@ def __init__(self, name: Optional[str] = None, locale: str = 'en_US'):
         :type locale: str, optional
         """
         if getattr(self, 'name', 'detector') == 'detector' and getattr(self, 'filth_cls', None) is not None:
-            if getattr(self.filth_cls, 'type', None) is not None and type(self) != Detector:
+            if getattr(self.filth_cls, 'type', None) is not None and type(self) is not Detector:
                 self.name = self.filth_cls.type
                 warnings.warn(
                     "Setting the detector name from the filth_cls.type is depreciated, please declare an explicit name"
@@ -111,8 +111,8 @@ class RegexDetector(Detector):
         'This url will be found {{URL}}'
     """
 
-    regex = None  # type: Optional[Pattern[str]]
-    filth_cls = Filth  # type: ClassVar[Type[Filth]]
+    regex: Optional[Pattern[str]] = None
+    filth_cls: ClassVar[Type[Filth]] = Filth
 
     def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
         """Yields discovered filth in the provided ``text``.
@@ -145,7 +145,7 @@ def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generato
 
 class RegionLocalisedRegexDetector(RegexDetector):
     """Detector to detect ``Filth`` localised using regular expressions localised by the region"""
-    region_regex = {}  # type: Dict[str, Pattern]
+    region_regex: Dict[str, Pattern] = {}
 
     def __init__(self, **kwargs):
         """Initialise the ``Detector``.

diff --git a/scrubadub/detectors/credit_card.py b/scrubadub/detectors/credit_card.py
@@ -29,12 +29,13 @@ class CreditCardDetector(RegexDetector):
     # TODO: regex doesn't match if the credit card number has spaces/dashes in
 
     regex = re.compile((
-        r"(?<=\s)"
+        r"\b"
         r"(?:4[0-9]{12}(?:[0-9]{3})?"  		# Visa
         r"|(?:5[1-5][0-9]{2}"          		# MasterCard
         r"|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}"
         r"|3[47][0-9]{13}"             		# American Express
         r"|3(?:0[0-5]|[68][0-9])[0-9]{11}"   	# Diners Club
         r"|6(?:011|5[0-9]{2})[0-9]{12}"      	# Discover
         r"|(?:2131|1800|35\d{3})\d{11})"      	# JCB
+        r"\b"
     ), re.VERBOSE)
diff --git a/scrubadub/detectors/date_of_birth.py b/scrubadub/detectors/date_of_birth.py
@@ -78,6 +78,8 @@ def __init__(self, context_before: int = 2, context_after: int = 1, require_cont
         self.context_after = context_after
         self.require_context = require_context
 
+        if self.language is None:
+            raise ValueError("Language is not set.")
         try:
             self.context_words = self.context_words_language_map[self.language]
         except KeyError:
@@ -100,6 +102,8 @@ def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generato
         """
 
         # using the dateparser lib - locale can be set here
+        if self.language is None:
+            raise ValueError("Language is not set.")
         try:
             date_picker = search_dates(text, languages=[self.language])
         except RecursionError:

diff --git a/scrubadub/filth/base.py b/scrubadub/filth/base.py
@@ -13,18 +13,18 @@ class Filth(object):
 
     # this allows people to customize the output, especially for placeholder
     # text and identifier replacements
-    prefix = u'{{'  # type: ClassVar[str]
-    suffix = u'}}'  # type: ClassVar[str]
+    prefix: ClassVar[str] = u'{{'
+    suffix: ClassVar[str] = u'}}'
 
     # the `type` is used when filths are merged to come up with a sane label
-    type = 'unknown'  # type: ClassVar[str]
+    type: ClassVar[str] = 'unknown'
 
     # the `lookup` is used to keep track of all of the different types of filth
     # that are encountered across all `Filth` types.
     lookup = utils.Lookup()
 
     # For backwards compatibility, but this is deprecated.
-    regex = None  # type: Optional[Pattern[str]]
+    regex: Optional[Pattern[str]] = None
 
     def __init__(self, beg: Optional[int] = None, end: Optional[int] = None, text: Optional[str] = None,
                  match: Optional[Match] = None, detector_name: Optional[str] = None,

diff --git a/scrubadub/filth/phone.py b/scrubadub/filth/phone.py
@@ -2,7 +2,6 @@
 import phonenumbers
 
 from faker import Faker
-from typing import List
 
 from .base import Filth
 from .. import utils
@@ -22,7 +21,7 @@ def generate(faker: Faker) -> str:
         """
         phone_number = ''
         language, region = utils.locale_split(faker._locales[0])
-        results = []  # type: List[phonenumbers.PhoneNumberMatch]
+        results = []  # type: list[phonenumbers.PhoneNumberMatch]
         # Here I'm filtering for numbers that pass validation by the phonenumbers package
         while len(results) < 1:
             # Faker generates random numbers of the right format eg (###)###-####

diff --git a/scrubadub/post_processors/filth_replacer.py b/scrubadub/post_processors/filth_replacer.py
@@ -3,7 +3,7 @@
 import math
 import hashlib
 
-from typing import Sequence, Optional, Union, Dict
+from typing import Sequence, Optional, Union
 from collections import defaultdict
 
 from scrubadub.filth import Filth, MergedFilth, TaggedEvaluationFilth
@@ -43,7 +43,7 @@ class FilthReplacer(PostProcessor):
     # NOTE: this is not an efficient way to store this in memory. could
     # alternatively hash the type and text and do away with the overhead
     # bits of storing the tuple in the lookup
-    typed_lookup = defaultdict(lambda: utils.Lookup(), {})  # type: Dict[str, utils.Lookup]
+    typed_lookup = defaultdict(lambda: utils.Lookup(), {})  # type: dict[str, utils.Lookup]
 
     def __init__(self, include_type: bool = True, include_count: bool = False, include_hash: bool = False,
                  uppercase: bool = True, separator: Optional[str] = None, hash_length: Optional[int] = None,
@@ -101,9 +101,11 @@ def filth_label(self, filth: Filth) -> str:
             replacement_pieces = []
 
             if self.include_type:
-                filth_type = getattr(f, 'type', None)
-                if filth_type is None:
+                filth_type_check: Optional[str] = getattr(f, 'type', None)
+                if filth_type_check is None:
                     continue
+                else:
+                    filth_type: str = filth_type_check
                 if filth_type == TaggedEvaluationFilth.type:
                     filth_comparison_type = getattr(f, 'comparison_type', None)
                     if filth_comparison_type is not None:

diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
@@ -152,7 +152,8 @@ def _check_and_add_detector(self, detector: Detector, warn: bool = False):
             ) % locals())
         self._detectors[name] = detector
 
-    def add_post_processor(self, post_processor: Union[PostProcessor, Type[PostProcessor], str], index: int = None):
+    def add_post_processor(self, post_processor: Union[PostProcessor, Type[PostProcessor], str],
+                           index: Optional[int] = None):
         """Add a ``PostProcessor`` to a Scrubber
 
         You can add a post-processor to a ``Scrubber`` by passing one of three objects to this function:
@@ -215,7 +216,7 @@ def remove_post_processor(self, post_processor: Union[PostProcessor, Type[PostPr
         elif isinstance(post_processor, str):
             self._post_processors = [x for x in self._post_processors if x.name != post_processor]
 
-    def _check_and_add_post_processor(self, post_processor: PostProcessor, index: int = None):
+    def _check_and_add_post_processor(self, post_processor: PostProcessor, index: Optional[int] = None):
         """Check the types and add the PostProcessor to the scrubber"""
         if not isinstance(post_processor, PostProcessor):
             raise TypeError((

diff --git a/tests/benchmark_accuracy_real_data.py b/tests/benchmark_accuracy_real_data.py
@@ -7,9 +7,7 @@
 import click
 import magic
 import dotenv
-# import chardet
-# try a new chardet package, its a drop in replacement based on a mozilla project.
-import cchardet as chardet
+import chardet
 import logging
 import posixpath
 import azure.storage.blob

diff --git a/tests/run.py b/tests/run.py
@@ -9,11 +9,10 @@
 tests = [
     "mypy --config-file setup.cfg scrubadub/",
     "flake8  --config setup.cfg scrubadub/",
-    # If py3.5 then examples with spacy don't work so disable doctests
-    'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub/ ./docs/ ; else nosetests ; fi',
+    'pytest --doctest-glob="*.rst" ./tests/ ./scrubadub/ ./docs/',
     "python3 ./tests/benchmark_accuracy.py --fast",
     "python3 ./tests/benchmark_time.py",
-    'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then cd docs && make html && cd - ; fi',
+    'cd docs && make html && cd -',
 ]
 
 

diff --git a/tests/test_comparison_classes.py b/tests/test_comparison_classes.py
@@ -246,11 +246,11 @@ def test_filth_grouper(self):
         self.assertEqual(['filth', 'detector', 'locale'], df.columns.names)
         self.assertEqual(
             [
-                ('name', 'tagged', 'en_US'),
                 ('phone', 'phone', 'en_GB'),
-                ('phone', 'phone', 'en_US'),
                 ('phone', 'tagged', 'en_GB'),
-                ('phone', 'tagged', 'en_US')
+                ('phone', 'phone', 'en_US'),
+                ('phone', 'tagged', 'en_US'),
+                ('name', 'tagged', 'en_US'),
             ],
             df.columns.values.tolist(),
         )

diff --git a/tests/test_detector_credit_card.py b/tests/test_detector_credit_card.py
@@ -99,3 +99,10 @@ def test_visa2(self):
         AFTER:  My credit card is {{CREDIT_CARD}}.
         """
         self.compare_before_after()
+
+    def test_start_of_string(self):
+        """
+        BEFORE: 4012888888881881.
+        AFTER:  {{CREDIT_CARD}}.
+        """
+        self.compare_before_after()