Sourcery refactored master branch #78

Closed · wants to merge 1 commit
courlan/clean.py: 20 changes (9 additions, 11 deletions)

@@ -91,14 +91,10 @@ def scrub_url(url: str) -> str:
     if match and is_valid_url(match[1]):
         url = match[1]
         LOGGER.debug("taking url: %s", url)
-    # too long and garbled URLs e.g. due to quotes URLs
-    # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
-    # if len(url) > 500:  # arbitrary choice
-    match = TRAILING_PARTS.match(url)
-    if match:
+    if match := TRAILING_PARTS.match(url):
         url = match[1]
     if len(url) > 500:
-        LOGGER.debug("invalid-looking link %s of length %d", url[:50] + "…", len(url))
+        LOGGER.debug("invalid-looking link %s of length %d", f"{url[:50]}…", len(url))
Comment on lines -94 to +97
Function scrub_url refactored with the following changes:

This removes the following comments ( why? ):

# too long and garbled URLs e.g. due to quotes URLs
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# if len(url) > 500:  # arbitrary choice


# trailing slashes in URLs without path or in embedded URLs
if url.count("/") == 3 or url.count("://") > 1:
@@ -118,11 +114,13 @@ def clean_query(
         for qelem in sorted(qdict):
             teststr = qelem.lower()
             # control param
-            if strict:
-                if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
-                    continue
-            # get rid of trackers
-            elif TRACKERS_RE.search(teststr):
+            if (
+                strict
+                and teststr not in ALLOWED_PARAMS
+                and teststr not in CONTROL_PARAMS
+                or not strict
+                and TRACKERS_RE.search(teststr)
+            ):
Comment on lines -121 to +123
Function clean_query refactored with the following changes:

This removes the following comments ( why? ):

# get rid of trackers

continue
# control language
if language is not None and teststr in CONTROL_PARAMS:
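
In clean_query the nested if/elif is folded into a single boolean condition guarding one continue. Since and binds tighter than or, the unparenthesised form reads as (strict and not allowed and not control) or (not strict and tracker), which preserves the original behaviour: in strict mode only the allow-list matters, otherwise only the tracker check runs. A self-contained sketch with stand-in values for ALLOWED_PARAMS, CONTROL_PARAMS and TRACKERS_RE:

def should_drop(param: str, strict: bool) -> bool:
    allowed, control = {"id", "page"}, {"lang"}   # illustrative stand-ins
    is_tracker = param.startswith("utm_")
    # (strict and unknown param) or (not strict and tracker)
    return (
        strict
        and param not in allowed
        and param not in control
        or not strict
        and is_tracker
    )

print(should_drop("utm_source", strict=False))  # True  (tracker)
print(should_drop("utm_source", strict=True))   # True  (not on the allow-list)
print(should_drop("page", strict=True))         # False
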

courlan/cli.py: 73 changes (36 additions, 37 deletions)

@@ -107,10 +107,10 @@ def _batch_lines(inputfile: str) -> Iterator[List[str]]:
"Read input line in batches"
with open(inputfile, "r", encoding="utf-8", errors="ignore") as inputfh:
while True:
batch = [line.strip() for line in islice(inputfh, 10**5)]
if not batch:
if batch := [line.strip() for line in islice(inputfh, 10**5)]:
yield batch
else:
return
yield batch
Comment on lines -110 to -113
Function _batch_lines refactored with the following changes:



def _cli_sample(args: Any) -> None:
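
_batch_lines now assigns and tests the batch in one step. The same idea can be pushed into the while condition itself, which is a common shape for chunked readers; a sketch using a hypothetical path argument rather than the CLI plumbing above:

from itertools import islice
from typing import Iterator, List

def batched_lines(path: str, size: int = 10**5) -> Iterator[List[str]]:
    # yield stripped lines in chunks of `size`; stop when islice returns nothing
    with open(path, "r", encoding="utf-8", errors="ignore") as infh:
        while batch := [line.strip() for line in islice(infh, size)]:
            yield batch
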
@@ -138,45 +138,44 @@ def _cli_sample(args: Any) -> None:

 def _cli_process(args: Any) -> None:
     "Read input file bit by bit and process URLs in batches."
-    with ProcessPoolExecutor(max_workers=args.parallel) as executor, open(
-        args.outputfile, "w", encoding="utf-8"
-    ) as outputfh, open(
-        args.inputfile, "r", encoding="utf-8", errors="ignore"
-    ) as inputfh:
+    with (ProcessPoolExecutor(max_workers=args.parallel) as executor, open(
+        args.outputfile, "w", encoding="utf-8"
+    ) as outputfh, open(
+        args.inputfile, "r", encoding="utf-8", errors="ignore"
+    ) as inputfh):
         while True:
             batches = []  # type: List[List[str]]
             while len(batches) < 1000:
-                line_batch = list(islice(inputfh, 1000))
-                if not line_batch:
-                    break
-                batches.append(line_batch)
-
-            if batches:
-                futures = (
-                    executor.submit(
-                        _cli_check_urls,
-                        batch,
-                        strict=args.strict,
-                        with_redirects=args.redirects,
-                        language=args.language,
-                    )
-                    for batch in batches
-                )
-
-                for future in as_completed(futures):
-                    for valid, url in future.result():
-                        if valid:
-                            outputfh.write(url + "\n")
-                        # proceed with discarded URLs. to be rewritten
-                        elif args.discardedfile is not None:
-                            with open(
-                                args.discardedfile, "a", encoding="utf-8"
-                            ) as discardfh:
-                                discardfh.write(url)
-
-                batches = []
-            else:
-                break
+                if line_batch := list(islice(inputfh, 1000)):
+                    batches.append(line_batch)
+                else:
+                    break
+            if not batches:
+                break
+            futures = (
+                executor.submit(
+                    _cli_check_urls,
+                    batch,
+                    strict=args.strict,
+                    with_redirects=args.redirects,
+                    language=args.language,
+                )
+                for batch in batches
+            )
+
+            for future in as_completed(futures):
+                for valid, url in future.result():
+                    if valid:
+                        outputfh.write(url + "\n")
+                    # proceed with discarded URLs. to be rewritten
+                    elif args.discardedfile is not None:
+                        with open(
+                            args.discardedfile, "a", encoding="utf-8"
+                        ) as discardfh:
+                            discardfh.write(url)
+
+            batches = []
Comment on lines -141 to +178
Function _cli_process refactored with the following changes:



def process_args(args: Any) -> None:
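
Besides flattening the batching loop, the _cli_process hunk wraps the three context managers of the with statement in parentheses. That syntax is only officially supported from Python 3.10 (it happens to parse on CPython 3.9's new PEG parser), so whether this particular rewrite is safe depends on the minimum Python version courlan targets; the original multi-line form relied on the open() calls' own parentheses instead. A minimal illustration of the 3.10+ form with placeholder file names:

# Python 3.10+: parentheses let the context managers span several lines cleanly
with (
    open("input.txt", "r", encoding="utf-8") as infh,    # placeholder names
    open("output.txt", "w", encoding="utf-8") as outfh,
):
    for line in infh:
        outfh.write(line.upper())
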

courlan/core.py: 14 changes (5 additions, 9 deletions)

@@ -174,26 +174,22 @@ def extract_links(
if "rel" in link and "nofollow" in link:
continue
# https://en.wikipedia.org/wiki/Hreflang
if no_filter is False and language is not None and "hreflang" in link:
if not no_filter and language is not None and "hreflang" in link:
langmatch = HREFLANG_REGEX.search(link)
if langmatch and (
langmatch[1].startswith(language) or langmatch[1] == "x-default"
):
linkmatch = LINK_REGEX.search(link)
if linkmatch:
if linkmatch := LINK_REGEX.search(link):
candidates.add(linkmatch[1])
# default
else:
linkmatch = LINK_REGEX.search(link)
if linkmatch:
candidates.add(linkmatch[1])
elif linkmatch := LINK_REGEX.search(link):
candidates.add(linkmatch[1])
# filter candidates
for link in candidates:
# repair using base
if not link.startswith("http"):
link = fix_relative_urls(url, link)
# check
if no_filter is False:
if not no_filter:
Comment on lines -177 to +192
Function extract_links refactored with the following changes:

This removes the following comments ( why? ):

# default

checked = check_url(
link,
strict=strict,

courlan/filters.py: 13 changes (3 additions, 10 deletions)

@@ -174,10 +174,7 @@ def domain_filter(domain: str) -> bool:
         return False
 
     extension_match = EXTENSION_REGEX.search(domain)
-    if extension_match and extension_match[0] in WHITELISTED_EXTENSIONS:
-        return False
-
-    return True
+    return not extension_match or extension_match[0] not in WHITELISTED_EXTENSIONS
Function domain_filter refactored with the following changes:



def extension_filter(urlpath: str) -> bool:
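
domain_filter's early return False followed by return True collapses into a single boolean expression; by De Morgan's law, not (match and match[0] in whitelist) is the same as not match or match[0] not in whitelist. A small self-contained check with made-up stand-ins for EXTENSION_REGEX and WHITELISTED_EXTENSIONS:

import re

EXT_RE = re.compile(r"\.[a-z]{2,4}$")   # illustrative, not courlan's regex
SUFFIXES = {".jpg", ".pdf"}             # illustrative, not the real whitelist

def domain_ok(domain: str) -> bool:
    match = EXT_RE.search(domain)
    # equivalent to: not (match and match[0] in SUFFIXES)
    return not match or match[0] not in SUFFIXES

print(domain_ok("example.org"))      # True
print(domain_ok("img.example.jpg"))  # False
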
@@ -215,21 +212,17 @@ def lang_filter(url: str, language: Optional[str] = None, strict: bool = False)
         return True
     # init score
     score = 0
-    # first test: internationalization in URL path
-    match = PATH_LANG_FILTER.match(url)
-    if match:
+    if match := PATH_LANG_FILTER.match(url):
Comment on lines -218 to +215
Function lang_filter refactored with the following changes:

This removes the following comments ( why? ):

# don't perform the test if there are too many candidates: > 2
# first test: internationalization in URL path

         # look for other occurrences
         occurrences = ALL_PATH_LANGS.findall(url)
         if len(occurrences) == 1:
             score = langcodes_score(language, match[1], score)
         elif len(occurrences) == 2:
             for occurrence in occurrences:
                 score = langcodes_score(language, occurrence, score)
-    # don't perform the test if there are too many candidates: > 2
     # second test: prepended language cues
     if strict and language in LANGUAGE_MAPPINGS:
-        match = HOST_LANG_FILTER.match(url)
-        if match:
+        if match := HOST_LANG_FILTER.match(url):
             candidate = match[1].lower()
             LOGGER.debug("candidate lang %s found in URL", candidate)
             if candidate in LANGUAGE_MAPPINGS[language]:

courlan/urlstore.py: 4 changes (2 additions, 2 deletions)

@@ -146,12 +146,12 @@ def _store_urls(
     ) -> None:
         # http/https switch
         if domain.startswith("http://"):
-            candidate = "https" + domain[4:]
+            candidate = f"https{domain[4:]}"
             # switch
             if candidate in self.urldict:
                 domain = candidate
         elif domain.startswith("https://"):
-            candidate = "http" + domain[5:]
+            candidate = f"http{domain[5:]}"
Comment on lines -149 to +154
Function UrlStore._store_urls refactored with the following changes:

# replace entry
if candidate in self.urldict:
self.urldict[domain] = self.urldict[candidate]
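
The _store_urls hunk swaps string concatenation for f-strings when toggling between the http and https variants of a stored domain. A standalone sketch of the same slice-and-format trick:

def swap_scheme(domain: str) -> str:
    # "http://example.org" -> "https://example.org" and vice versa
    if domain.startswith("http://"):
        return f"https{domain[4:]}"   # domain[4:] keeps "://example.org"
    if domain.startswith("https://"):
        return f"http{domain[5:]}"
    return domain

print(swap_scheme("http://example.org"))   # https://example.org
print(swap_scheme("https://example.org"))  # http://example.org

f-strings read more directly than chained + concatenation, which is the convention Sourcery standardises on across this PR.
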

courlan/urlutils.py: 21 changes (5 additions, 16 deletions)

@@ -35,12 +35,9 @@ def get_tldinfo(
     if not url or not isinstance(url, str):
         return None, None
     if fast:
-        # try with regexes
-        domain_match = DOMAIN_REGEX.match(url)
-        if domain_match:
+        if domain_match := DOMAIN_REGEX.match(url):
             full_domain = STRIP_DOMAIN_REGEX.sub("", domain_match[1])
-            clean_match = NO_EXTENSION_REGEX.match(full_domain)
-            if clean_match:
+            if clean_match := NO_EXTENSION_REGEX.match(full_domain):
Comment on lines -38 to +40
Function get_tldinfo refactored with the following changes:

This removes the following comments ( why? ):

# try with regexes

return clean_match[0], full_domain
# fallback
tldinfo = get_tld(url, as_object=True, fail_silently=True)
@@ -62,10 +59,7 @@ def extract_domain(
     if full_domain is None:
         return None
     # blacklisting
-    if domain in blacklist or full_domain in blacklist:
-        return None
-    # return domain
-    return full_domain
+    return None if domain in blacklist or full_domain in blacklist else full_domain
Comment on lines -65 to +62
Function extract_domain refactored with the following changes:

This removes the following comments ( why? ):

# return domain
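
extract_domain now returns through a single conditional expression instead of an early return. A sketch of the same shape with hypothetical arguments (courlan's real function derives domain and full_domain via get_tldinfo):

from typing import Optional, Set

def resolve(full_domain: Optional[str], domain: str, blacklist: Set[str]) -> Optional[str]:
    # before:
    #     if domain in blacklist or full_domain in blacklist:
    #         return None
    #     return full_domain
    return None if domain in blacklist or full_domain in blacklist else full_domain

print(resolve("www.example.org", "example.org", {"spam.org"}))  # www.example.org
print(resolve("www.spam.org", "spam.org", {"spam.org"}))        # None

Whether the one-liner reads better than the explicit early return is a style call; the behaviour is identical.
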



def _parse(url: Any) -> SplitResult:
@@ -83,10 +77,7 @@ def get_base_url(url: Any) -> str:
"""Strip URL of some of its parts to get base URL.
Accepts strings and urllib.parse ParseResult objects."""
parsed_url = _parse(url)
if parsed_url.scheme:
scheme = parsed_url.scheme + "://"
else:
scheme = ""
scheme = f"{parsed_url.scheme}://" if parsed_url.scheme else ""
Comment on lines -86 to +80
Function get_base_url refactored with the following changes:

return scheme + parsed_url.netloc


@@ -144,9 +135,7 @@ def is_external(url: str, reference: str, ignore_suffix: bool = True) -> bool:
     stripped_ref, ref = get_tldinfo(reference, fast=True)
     stripped_domain, domain = get_tldinfo(url, fast=True)
     # comparison
-    if ignore_suffix:
-        return stripped_domain != stripped_ref
-    return domain != ref
+    return stripped_domain != stripped_ref if ignore_suffix else domain != ref
Comment on lines -147 to +138
Function is_external refactored with the following changes:



def is_known_link(link: str, known_links: Set[str]) -> bool:

helpers/common-crawl-normalize.py: 9 changes (3 additions, 6 deletions)

@@ -38,9 +38,7 @@
         # normalize URL problem
         line = re.sub(r'\./', '/', line, 1)
 
-        # not elegant, but will do the trick
-        match = re.match(r'(.+?)/', line)
-        if match:
+        if match := re.match(r'(.+?)/', line):
Lines 41-63 refactored with the following changes:

This removes the following comments ( why? ):

# not elegant, but will do the trick

core = match.group(1)
else:
print ('ERROR: Curious one (1):', lastseen)
@@ -49,8 +47,7 @@
             print ('ERROR: Curious one (2):', lastseen)
             continue
         elements = re.findall(r'.+?\.', core)
-        match = re.search(r'\.([^\.]+)$', core)
-        if match:
+        if match := re.search(r'\.([^\.]+)$', core):
             lastone = match.group(1)
         else:
             print ('ERROR: Curious one (3):', lastseen)
@@ -60,7 +57,7 @@
         core = ''.join(reversed(elements))
         core = re.sub(r'\.$', '', core)
         line = re.sub(r'^.+?/', '/', line)
-        line = 'http://' + lastone + '.' + core + line
+        line = f'http://{lastone}.{core}{line}'
         outputfh.write(line + '\n')



helpers/find-wordpress-urls.py: 37 changes (18 additions, 19 deletions)

@@ -35,36 +35,33 @@ def find_target(url):

     # wordpress.com
     if re.match(r'https?://.+?\.wordpress\.[a-z]{2,3}', url): # [a-z]+?
-        # files.wordpress.com hack
-        url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url)
-        # wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)
-        wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url)
-        if wphost:
-            return wphost.group(1).rstrip('/') + '/'
-
-    # K.O. victory
-    ko_string = re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url)
-    if ko_string:
+        # files.wordpress.com hack
+        url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url)
+        if wphost := re.match(r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url):
+            return wphost.group(1).rstrip('/') + '/'
+
+    if ko_string := re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url):
         return ko_string.group(1).rstrip('/') + '/'
 
-    # /tag/ /category/
-    tagcat = re.match(r'(.+?)(/tag/|/category/|\?cat=)', url)
-    if tagcat:
+    if tagcat := re.match(r'(.+?)(/tag/|/category/|\?cat=)', url):
         return tagcat.group(1).rstrip('/') + '/'
 
     # query parameters
     if re.search(r'/\?p=|\?page_id=|\?paged=/', url):
-        mquery = re.match(r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url)
-        if mquery:
+        if mquery := re.match(
+            r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url
+        ):
Comment on lines -38 to +53
Function find_target refactored with the following changes:

This removes the following comments ( why? ):

# wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)
# K.O. victory
# /tag/ /category/

             if mquery.group(2) and mquery.group(3):
                 return mquery.group(1) + mquery.group(2)
             else:
                 return mquery.group(1).rstrip('/') + '/'
 
     # URL types
     if re.search(r'/20[0-9]{2}/[0-9]{2}/|/archives/', url):
-        url_types = re.search(r'(https?://.+?/)(blog/|weblog/)?(20[0-9]{2}/[0-9]{2}/|/archives/)', url)
-        if url_types:
+        if url_types := re.search(
+            r'(https?://.+?/)(blog/|weblog/)?(20[0-9]{2}/[0-9]{2}/|/archives/)',
+            url,
+        ):
             # print (url)
             if url_types.group(2) and url_types.group(3):
                 return url_types.group(1) + url_types.group(2)

@@ -81,8 +78,10 @@ def find_target(url):
     if args.lax is True and re.search(
         r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url
     ):
-        url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
-        if url_lax:
+        if url_lax := re.search(
+            r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)',
+            url,
+        ):
             if url_lax.group(2) and url_lax.group(3):
                 return url_lax.group(1) + url_lax.group(2)
             else:

tests/urlstore_tests.py: 6 changes (3 additions, 3 deletions)

@@ -158,7 +158,7 @@ def test_urlstore():
     assert my_urls.total_url_number() == len(urls)
     assert my_urls.get_all_counts() == [0, 0]
 
-    if my_urls.compressed is False:
+    if not my_urls.compressed:
Function test_urlstore refactored with the following changes:

assert sum(len(v.tuples) for _, v in my_urls.urldict.items()) == len(urls)
my_urls.add_urls(["https://visited.com/visited"], visited=True)
assert my_urls.urldict["https://visited.com"].tuples[0].visited is True
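
Both this hunk and the extract_links one replace comparisons such as x is False with not x. The two are equivalent only when the value is guaranteed to be an actual bool, as it is here for my_urls.compressed and the no_filter flag; for other falsy values they diverge, which is worth keeping in mind when applying the same rewrite elsewhere:

compressed = False
print(not compressed, compressed is False)  # True True

compressed = None
print(not compressed, compressed is False)  # True False  (falsy, but not the False singleton)
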
@@ -269,8 +269,8 @@ def test_urlstore():
     assert my_urls.has_been_visited(f"{example_domain}/999") is False
     candidates = [url1, f"{example_domain}/this", f"{example_domain}/999"]
     assert my_urls.filter_unvisited_urls(candidates) == [
-        example_domain + "/this",
-        example_domain + "/999",
+        f"{example_domain}/this",
+        f"{example_domain}/999",
     ]
assert (
len(my_urls.find_known_urls(example_domain))