Skip to content

Commit

Permalink
add parameter: remove trailing slashes (#52)
Browse files Browse the repository at this point in the history
* experimental normalization: systematically remove trailing slashes

* trailing slash removal is now optional
  • Loading branch information
adbar authored Jan 29, 2024
1 parent bc2e64c commit 6175dd7
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 16 deletions.
1 change: 0 additions & 1 deletion courlan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
coURLan: Clean, filter, normalize, and sample URLs
"""


# meta
__title__ = "courlan"
__author__ = "Adrien Barbaresi"
Expand Down
8 changes: 8 additions & 0 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ def normalize_url(
parsed_url: Union[SplitResult, str],
strict: bool = False,
language: Optional[str] = None,
trailing_slash: bool = True,
) -> str:
"Takes a URL string or a parsed URL and returns a normalized URL string"
parsed_url = _parse(parsed_url)
Expand All @@ -200,6 +201,13 @@ def normalize_url(
newquery = clean_query(parsed_url.query, strict, language) or ""
if newquery and newpath == "":
newpath = "/"
elif (
not trailing_slash
and not newquery
and len(newpath) > 1
and newpath.endswith("/")
):
newpath = newpath.rstrip("/")
# fragment
newfragment = "" if strict else normalize_fragment(parsed_url.fragment, language)
# rebuild
Expand Down
11 changes: 9 additions & 2 deletions courlan/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def check_url(
with_redirects: bool = False,
language: Optional[str] = None,
with_nav: bool = False,
trailing_slash: bool = True,
) -> Optional[Tuple[str, str]]:
"""Check links for appropriateness and sanity
Args:
Expand Down Expand Up @@ -85,7 +86,10 @@ def check_url(
raise ValueError

# internationalization and language heuristics in URL
if language is not None and lang_filter(url, language, strict) is False:
if (
language is not None
and lang_filter(url, language, strict, trailing_slash) is False
):
LOGGER.debug("rejected, lang filter: %s", url)
raise ValueError

Expand All @@ -111,7 +115,7 @@ def check_url(
raise ValueError

# normalize
url = normalize_url(parsed_url, strict, language)
url = normalize_url(parsed_url, strict, language, trailing_slash)

# domain info: use blacklist in strict mode only
if strict:
Expand All @@ -138,6 +142,7 @@ def extract_links(
no_filter: bool = False,
language: Optional[str] = None,
strict: bool = True,
trailing_slash: bool = True,
with_nav: bool = False,
redirects: bool = False,
reference: Optional[str] = None,
Expand All @@ -152,6 +157,7 @@ def extract_links(
no_filter: override settings and bypass checks to return all possible URLs
language: set target language (ISO 639-1 codes)
strict: set to True for stricter filtering
trailing_slash: set to False to trim trailing slashes
with_nav: set to True to include navigation pages instead of discarding them
redirects: set to True for redirection test (per HTTP HEAD request)
reference: provide a host reference for external/internal evaluation
Expand Down Expand Up @@ -197,6 +203,7 @@ def extract_links(
checked = check_url(
link,
strict=strict,
trailing_slash=trailing_slash,
with_nav=with_nav,
with_redirects=redirects,
language=language,
Expand Down
18 changes: 14 additions & 4 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Bundles functions needed to target text content and validate the input.
"""


## This file is available from https://github.com/adbar/courlan
## under GNU GPL v3 license

Expand Down Expand Up @@ -82,9 +81,12 @@

# language filter
PATH_LANG_FILTER = re.compile(
r"(?:https?://[^/]+/)([a-z]{2})([_-][a-z]{2,3})?(?:/)", re.IGNORECASE
r"(?:https?://[^/]+/)([a-z]{2})([_-][a-z]{2,3})?(?:/|$)", re.IGNORECASE
)
ALL_PATH_LANGS = re.compile(r"(?:/)([a-z]{2})([_-][a-z]{2})?(?:/)", re.IGNORECASE)
ALL_PATH_LANGS_NO_TRAILING = re.compile(
r"(?:/)([a-z]{2})([_-][a-z]{2})?(?:/|$)", re.IGNORECASE
)
HOST_LANG_FILTER = re.compile(
r"https?://([a-z]{2})\.(?:[^.]{4,})\.(?:[^.]+)(?:\.[^.]+)?/", re.IGNORECASE
)
Expand Down Expand Up @@ -202,7 +204,12 @@ def langcodes_score(language: str, segment: str, score: int) -> int:
return score


def lang_filter(url: str, language: Optional[str] = None, strict: bool = False) -> bool:
def lang_filter(
url: str,
language: Optional[str] = None,
strict: bool = False,
trailing_slash: bool = True,
) -> bool:
"""Heuristics targeting internationalization and linguistic elements.
Based on a score."""
# sanity check
Expand All @@ -214,7 +221,10 @@ def lang_filter(url: str, language: Optional[str] = None, strict: bool = False)
match = PATH_LANG_FILTER.match(url)
if match:
# look for other occurrences
occurrences = ALL_PATH_LANGS.findall(url)
if trailing_slash:
occurrences = ALL_PATH_LANGS.findall(url)
else:
occurrences = ALL_PATH_LANGS_NO_TRAILING.findall(url)
if len(occurrences) == 1:
score = langcodes_score(language, match[1], score)
elif len(occurrences) == 2:
Expand Down
1 change: 0 additions & 1 deletion courlan/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Utilities dedicated to URL sampling
"""


import logging

# from functools import cmp_to_key
Expand Down
22 changes: 19 additions & 3 deletions courlan/urlstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,29 @@ def __init__(self, urlpath: str, visited: bool) -> None:

class UrlStore:
"Defines a class to store domain-classified URLs and perform checks against it."
__slots__ = ("compressed", "done", "language", "strict", "urldict", "_lock")
__slots__ = (
"compressed",
"done",
"language",
"strict",
"trailing_slash",
"urldict",
"_lock",
)

def __init__(
self,
compressed: bool = False,
language: Optional[str] = None,
strict: bool = False,
trailing: bool = True,
verbose: bool = False,
) -> None:
self.compressed: bool = compressed
self.done: bool = False
self.language: Optional[str] = language
self.strict: bool = strict
self.trailing_slash: bool = trailing
self.urldict: DefaultDict[str, DomainEntry] = defaultdict(DomainEntry)
self._lock: Lock = Lock()

Expand Down Expand Up @@ -112,12 +122,18 @@ def _buffer_urls(
# filter
if (
self.language is not None
and lang_filter(url, self.language, self.strict) is False
and lang_filter(
url, self.language, self.strict, self.trailing_slash
)
is False
):
LOGGER.debug("Wrong language: %s", url)
raise ValueError
parsed_url = normalize_url(
parsed_url, strict=self.strict, language=self.language
parsed_url,
strict=self.strict,
language=self.language,
trailing_slash=self.trailing_slash,
)
hostinfo, urlpath = get_host_and_path(parsed_url)
inputdict[hostinfo].append(UrlPathTuple(urlpath, visited))
Expand Down
18 changes: 17 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,9 @@ def test_path_filter():


def test_lang_filter():
assert lang_filter("http://test.com/az", "de", trailing_slash=False) is False
assert lang_filter("http://test.com/az/", "de") is False
assert lang_filter("http://test.com/de", "de", trailing_slash=False) is True
assert lang_filter("http://test.com/de/", "de") is True
assert (
lang_filter(
Expand Down Expand Up @@ -945,9 +947,23 @@ def test_extraction():
external_bool=False,
strict=True,
with_nav=True,
trailing_slash=True,
)
assert sorted(links) == [
"https://example.org/page/", # parameter stripped by strict filtering
"https://example.org/page/",
"https://example.org/page/10",
]
links = extract_links(
pagecontent,
"https://example.org",
external_bool=False,
strict=True,
trailing_slash=False,
with_nav=True,
)
print(links)
assert sorted(links) == [
"https://example.org/page", # parameter stripped by strict filtering
"https://example.org/page/10",
]
links = extract_links(
Expand Down
10 changes: 6 additions & 4 deletions tests/urlstore_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,19 +182,20 @@ def test_urlstore():
my_urls.add_urls(extension_urls)
assert len(my_urls._load_urls(example_domain)) == len(example_urls) + 10
# test extension + deduplication
my_urls.trailing_slash = False
extension_urls = [f"{example_domain}/1/{str(a)}/" for a in range(11)]
my_urls.add_urls(appendleft=extension_urls)
url_tuples = my_urls._load_urls(example_domain)
assert len(url_tuples) == len(example_urls) + 11
assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10/"
assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"

# duplicates
my_urls.add_urls(extension_urls)
my_urls.add_urls(appendleft=extension_urls)
assert len(my_urls._load_urls(example_domain)) == len(example_urls) + len(
extension_urls
)
assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10/"
assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"

# get_url
assert my_urls.urldict[example_domain].timestamp is None
Expand All @@ -204,7 +205,7 @@ def test_urlstore():
timestamp = my_urls.urldict[example_domain].timestamp
sleep(0.1)
url2 = my_urls.get_url(example_domain)
assert url1 != url2 and url1 == "https://www.example.org/1/10/"
assert url1 != url2 and url1 == "https://www.example.org/1/10"
assert my_urls.urldict[example_domain].count == 2
assert timestamp != my_urls.urldict[example_domain].timestamp
assert url2 not in set(my_urls.find_unvisited_urls(example_domain))
Expand Down Expand Up @@ -371,6 +372,7 @@ def test_dbdump(capsys):
def test_from_html():
"Test link extraction procedures."
url_store = UrlStore()
url_store.trailing_slash = False
base_url = "https://example.org"
htmlstring = '<html><body><a href="https://example.com/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
# 1 internal link in total
Expand Down Expand Up @@ -398,7 +400,7 @@ def test_from_html():
url_store.add_from_html(htmlstring, base_url, lang="en")
todo = url_store.find_unvisited_urls(base_url)
known_links = url_store.find_known_urls(base_url)
assert "https://example.org/en/page1/" in todo and len(known_links) == 4
assert "https://example.org/en/page1" in todo and len(known_links) == 4
# wrong language
htmlstring = '<html><body><a href="https://example.org/en/page2"/></body></html>'
url_store.add_from_html(htmlstring, base_url, lang="de")
Expand Down

0 comments on commit 6175dd7

Please sign in to comment.