Skip to content

Commit

Permalink
add parameter: remove trailing slashes (#52)
Browse files Browse the repository at this point in the history
* experimental normalization: systematically remove trailing slashes

* trailing slash removal is now optional
  • Loading branch information
adbar authored Jan 29, 2024
1 parent bc2e64c commit 6175dd7
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 16 deletions.
1 change: 0 additions & 1 deletion courlan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
coURLan: Clean, filter, normalize, and sample URLs
"""


# meta
__title__ = "courlan"
__author__ = "Adrien Barbaresi"
Expand Down
8 changes: 8 additions & 0 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ def normalize_url(
parsed_url: Union[SplitResult, str],
strict: bool = False,
language: Optional[str] = None,
trailing_slash: bool = True,
) -> str:
"Takes a URL string or a parsed URL and returns a normalized URL string"
parsed_url = _parse(parsed_url)
Expand All @@ -200,6 +201,13 @@ def normalize_url(
newquery = clean_query(parsed_url.query, strict, language) or ""
if newquery and newpath == "":
newpath = "/"
elif (
not trailing_slash
and not newquery
and len(newpath) > 1
and newpath.endswith("/")
):
newpath = newpath.rstrip("/")
# fragment
newfragment = "" if strict else normalize_fragment(parsed_url.fragment, language)
# rebuild
Expand Down
11 changes: 9 additions & 2 deletions courlan/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def check_url(
with_redirects: bool = False,
language: Optional[str] = None,
with_nav: bool = False,
trailing_slash: bool = True,
) -> Optional[Tuple[str, str]]:
"""Check links for appropriateness and sanity
Args:
Expand Down Expand Up @@ -85,7 +86,10 @@ def check_url(
raise ValueError

# internationalization and language heuristics in URL
if language is not None and lang_filter(url, language, strict) is False:
if (
language is not None
and lang_filter(url, language, strict, trailing_slash) is False
):
LOGGER.debug("rejected, lang filter: %s", url)
raise ValueError

Expand All @@ -111,7 +115,7 @@ def check_url(
raise ValueError

# normalize
url = normalize_url(parsed_url, strict, language)
url = normalize_url(parsed_url, strict, language, trailing_slash)

# domain info: use blacklist in strict mode only
if strict:
Expand All @@ -138,6 +142,7 @@ def extract_links(
no_filter: bool = False,
language: Optional[str] = None,
strict: bool = True,
trailing_slash: bool = True,
with_nav: bool = False,
redirects: bool = False,
reference: Optional[str] = None,
Expand All @@ -152,6 +157,7 @@ def extract_links(
no_filter: override settings and bypass checks to return all possible URLs
language: set target language (ISO 639-1 codes)
strict: set to True for stricter filtering
trailing_slash: set to False to trim trailing slashes
with_nav: set to True to include navigation pages instead of discarding them
redirects: set to True for redirection test (per HTTP HEAD request)
reference: provide a host reference for external/internal evaluation
Expand Down Expand Up @@ -197,6 +203,7 @@ def extract_links(
checked = check_url(
link,
strict=strict,
trailing_slash=trailing_slash,
with_nav=with_nav,
with_redirects=redirects,
language=language,
Expand Down
18 changes: 14 additions & 4 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Bundles functions needed to target text content and validate the input.
"""


## This file is available from https://github.com/adbar/courlan
## under GNU GPL v3 license

Expand Down Expand Up @@ -82,9 +81,12 @@

# language filter
PATH_LANG_FILTER = re.compile(
r"(?:https?://[^/]+/)([a-z]{2})([_-][a-z]{2,3})?(?:/)", re.IGNORECASE
r"(?:https?://[^/]+/)([a-z]{2})([_-][a-z]{2,3})?(?:/|$)", re.IGNORECASE
)
ALL_PATH_LANGS = re.compile(r"(?:/)([a-z]{2})([_-][a-z]{2})?(?:/)", re.IGNORECASE)
ALL_PATH_LANGS_NO_TRAILING = re.compile(
r"(?:/)([a-z]{2})([_-][a-z]{2})?(?:/|$)", re.IGNORECASE
)
HOST_LANG_FILTER = re.compile(
r"https?://([a-z]{2})\.(?:[^.]{4,})\.(?:[^.]+)(?:\.[^.]+)?/", re.IGNORECASE
)
Expand Down Expand Up @@ -202,7 +204,12 @@ def langcodes_score(language: str, segment: str, score: int) -> int:
return score


def lang_filter(url: str, language: Optional[str] = None, strict: bool = False) -> bool:
def lang_filter(
url: str,
language: Optional[str] = None,
strict: bool = False,
trailing_slash: bool = True,
) -> bool:
"""Heuristics targeting internationalization and linguistic elements.
Based on a score."""
# sanity check
Expand All @@ -214,7 +221,10 @@ def lang_filter(url: str, language: Optional[str] = None, strict: bool = False)
match = PATH_LANG_FILTER.match(url)
if match:
# look for other occurrences
occurrences = ALL_PATH_LANGS.findall(url)
if trailing_slash:
occurrences = ALL_PATH_LANGS.findall(url)
else:
occurrences = ALL_PATH_LANGS_NO_TRAILING.findall(url)
if len(occurrences) == 1:
score = langcodes_score(language, match[1], score)
elif len(occurrences) == 2:
Expand Down
1 change: 0 additions & 1 deletion courlan/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
Utilities dedicated to URL sampling
"""


import logging

# from functools import cmp_to_key
Expand Down
22 changes: 19 additions & 3 deletions courlan/urlstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,29 @@ def __init__(self, urlpath: str, visited: bool) -> None:

class UrlStore:
"Defines a class to store domain-classified URLs and perform checks against it."
__slots__ = ("compressed", "done", "language", "strict", "urldict", "_lock")
__slots__ = (
"compressed",
"done",
"language",
"strict",
"trailing_slash",
"urldict",
"_lock",
)

def __init__(
self,
compressed: bool = False,
language: Optional[str] = None,
strict: bool = False,
trailing: bool = True,
verbose: bool = False,
) -> None:
self.compressed: bool = compressed
self.done: bool = False
self.language: Optional[str] = language
self.strict: bool = strict
self.trailing_slash: bool = trailing
self.urldict: DefaultDict[str, DomainEntry] = defaultdict(DomainEntry)
self._lock: Lock = Lock()

Expand Down Expand Up @@ -112,12 +122,18 @@ def _buffer_urls(
# filter
if (
self.language is not None
and lang_filter(url, self.language, self.strict) is False
and lang_filter(
url, self.language, self.strict, self.trailing_slash
)
is False
):
LOGGER.debug("Wrong language: %s", url)
raise ValueError
parsed_url = normalize_url(
parsed_url, strict=self.strict, language=self.language
parsed_url,
strict=self.strict,
language=self.language,
trailing_slash=self.trailing_slash,
)
hostinfo, urlpath = get_host_and_path(parsed_url)
inputdict[hostinfo].append(UrlPathTuple(urlpath, visited))
Expand Down
18 changes: 17 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,9 @@ def test_path_filter():


def test_lang_filter():
assert lang_filter("http://test.com/az", "de", trailing_slash=False) is False
assert lang_filter("http://test.com/az/", "de") is False
assert lang_filter("http://test.com/de", "de", trailing_slash=False) is True
assert lang_filter("http://test.com/de/", "de") is True
assert (
lang_filter(
Expand Down Expand Up @@ -945,9 +947,23 @@ def test_extraction():
external_bool=False,
strict=True,
with_nav=True,
trailing_slash=True,
)
assert sorted(links) == [
"https://example.org/page/", # parameter stripped by strict filtering
"https://example.org/page/",
"https://example.org/page/10",
]
links = extract_links(
pagecontent,
"https://example.org",
external_bool=False,
strict=True,
trailing_slash=False,
with_nav=True,
)
print(links)
assert sorted(links) == [
"https://example.org/page", # parameter stripped by strict filtering
"https://example.org/page/10",
]
links = extract_links(
Expand Down
10 changes: 6 additions & 4 deletions tests/urlstore_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,19 +182,20 @@ def test_urlstore():
my_urls.add_urls(extension_urls)
assert len(my_urls._load_urls(example_domain)) == len(example_urls) + 10
# test extension + deduplication
my_urls.trailing_slash = False
extension_urls = [f"{example_domain}/1/{str(a)}/" for a in range(11)]
my_urls.add_urls(appendleft=extension_urls)
url_tuples = my_urls._load_urls(example_domain)
assert len(url_tuples) == len(example_urls) + 11
assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10/"
assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"

# duplicates
my_urls.add_urls(extension_urls)
my_urls.add_urls(appendleft=extension_urls)
assert len(my_urls._load_urls(example_domain)) == len(example_urls) + len(
extension_urls
)
assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10/"
assert url_tuples[-1].urlpath == "/1/9" and url_tuples[0].urlpath == "/1/10"

# get_url
assert my_urls.urldict[example_domain].timestamp is None
Expand All @@ -204,7 +205,7 @@ def test_urlstore():
timestamp = my_urls.urldict[example_domain].timestamp
sleep(0.1)
url2 = my_urls.get_url(example_domain)
assert url1 != url2 and url1 == "https://www.example.org/1/10/"
assert url1 != url2 and url1 == "https://www.example.org/1/10"
assert my_urls.urldict[example_domain].count == 2
assert timestamp != my_urls.urldict[example_domain].timestamp
assert url2 not in set(my_urls.find_unvisited_urls(example_domain))
Expand Down Expand Up @@ -371,6 +372,7 @@ def test_dbdump(capsys):
def test_from_html():
"Test link extraction procedures."
url_store = UrlStore()
url_store.trailing_slash = False
base_url = "https://example.org"
htmlstring = '<html><body><a href="https://example.com/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'
# 1 internal link in total
Expand Down Expand Up @@ -398,7 +400,7 @@ def test_from_html():
url_store.add_from_html(htmlstring, base_url, lang="en")
todo = url_store.find_unvisited_urls(base_url)
known_links = url_store.find_known_urls(base_url)
assert "https://example.org/en/page1/" in todo and len(known_links) == 4
assert "https://example.org/en/page1" in todo and len(known_links) == 4
# wrong language
htmlstring = '<html><body><a href="https://example.org/en/page2"/></body></html>'
url_store.add_from_html(htmlstring, base_url, lang="de")
Expand Down

0 comments on commit 6175dd7

Please sign in to comment.