Sourcery refactored master branch #78

Closed · wants to merge 1 commit
courlan/clean.py: 20 changes (9 additions, 11 deletions)

@@ -91,14 +91,10 @@ def scrub_url(url: str) -> str:
     if match and is_valid_url(match[1]):
         url = match[1]
         LOGGER.debug("taking url: %s", url)
-    # too long and garbled URLs e.g. due to quotes URLs
-    # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
-    # if len(url) > 500:  # arbitrary choice
-    match = TRAILING_PARTS.match(url)
-    if match:
+    if match := TRAILING_PARTS.match(url):
         url = match[1]
     if len(url) > 500:
-        LOGGER.debug("invalid-looking link %s of length %d", url[:50] + "…", len(url))
+        LOGGER.debug("invalid-looking link %s of length %d", f"{url[:50]}…", len(url))
Comment on lines -94 to +97
Function scrub_url refactored with the following changes:

This removes the following comments ( why? ):

# too long and garbled URLs e.g. due to quotes URLs
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# if len(url) > 500:  # arbitrary choice


# trailing slashes in URLs without path or in embedded URLs
if url.count("/") == 3 or url.count("://") > 1:
@@ -118,11 +114,13 @@ def clean_query(
         for qelem in sorted(qdict):
             teststr = qelem.lower()
             # control param
-            if strict:
-                if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
-                    continue
-            # get rid of trackers
-            elif TRACKERS_RE.search(teststr):
+            if (
+                strict
+                and teststr not in ALLOWED_PARAMS
+                and teststr not in CONTROL_PARAMS
+                or not strict
+                and TRACKERS_RE.search(teststr)
+            ):
Comment on lines -121 to +123
Function clean_query refactored with the following changes:

This removes the following comments ( why? ):

# get rid of trackers

continue
# control language
if language is not None and teststr in CONTROL_PARAMS:
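
In clean_query the nested if/elif is folded into a single boolean condition guarding one continue. Since and binds tighter than or, the unparenthesised form reads as (strict and not allowed and not control) or (not strict and tracker), which preserves the original behaviour: in strict mode only the allow-list matters, otherwise only the tracker check runs. A self-contained sketch with stand-in values for ALLOWED_PARAMS, CONTROL_PARAMS and TRACKERS_RE:

def should_drop(param: str, strict: bool) -> bool:
    allowed, control = {"id", "page"}, {"lang"}   # illustrative stand-ins
    is_tracker = param.startswith("utm_")
    # (strict and unknown param) or (not strict and tracker)
    return (
        strict
        and param not in allowed
        and param not in control
        or not strict
        and is_tracker
    )

print(should_drop("utm_source", strict=False))  # True  (tracker)
print(should_drop("utm_source", strict=True))   # True  (not on the allow-list)
print(should_drop("page", strict=True))         # False
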

courlan/cli.py: 73 changes (36 additions, 37 deletions)

@@ -107,10 +107,10 @@ def _batch_lines(inputfile: str) -> Iterator[List[str]]:
"Read input line in batches"
with open(inputfile, "r", encoding="utf-8", errors="ignore") as inputfh:
while True:
batch = [line.strip() for line in islice(inputfh, 10**5)]
if not batch:
if batch := [line.strip() for line in islice(inputfh, 10**5)]:
yield batch
else:
return
yield batch
Comment on lines -110 to -113
Function _batch_lines refactored with the following changes:



def _cli_sample(args: Any) -> None:
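
_batch_lines now assigns and tests the batch in one step. The same idea can be pushed into the while condition itself, which is a common shape for chunked readers; a sketch using a hypothetical path argument rather than the CLI plumbing above:

from itertools import islice
from typing import Iterator, List

def batched_lines(path: str, size: int = 10**5) -> Iterator[List[str]]:
    # yield stripped lines in chunks of `size`; stop when islice returns nothing
    with open(path, "r", encoding="utf-8", errors="ignore") as infh:
        while batch := [line.strip() for line in islice(infh, size)]:
            yield batch
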
@@ -138,45 +138,44 @@ def _cli_sample(args: Any) -> None:

 def _cli_process(args: Any) -> None:
     "Read input file bit by bit and process URLs in batches."
-    with ProcessPoolExecutor(max_workers=args.parallel) as executor, open(
-        args.outputfile, "w", encoding="utf-8"
-    ) as outputfh, open(
-        args.inputfile, "r", encoding="utf-8", errors="ignore"
-    ) as inputfh:
+    with (ProcessPoolExecutor(max_workers=args.parallel) as executor, open(
+        args.outputfile, "w", encoding="utf-8"
+    ) as outputfh, open(
+        args.inputfile, "r", encoding="utf-8", errors="ignore"
+    ) as inputfh):
         while True:
             batches = []  # type: List[List[str]]
             while len(batches) < 1000:
-                line_batch = list(islice(inputfh, 1000))
-                if not line_batch:
-                    break
-                batches.append(line_batch)
-
-            if batches:
-                futures = (
-                    executor.submit(
-                        _cli_check_urls,
-                        batch,
-                        strict=args.strict,
-                        with_redirects=args.redirects,
-                        language=args.language,
-                    )
-                    for batch in batches
-                )
-
-                for future in as_completed(futures):
-                    for valid, url in future.result():
-                        if valid:
-                            outputfh.write(url + "\n")
-                        # proceed with discarded URLs. to be rewritten
-                        elif args.discardedfile is not None:
-                            with open(
-                                args.discardedfile, "a", encoding="utf-8"
-                            ) as discardfh:
-                                discardfh.write(url)
-
-                batches = []
-            else:
-                break
+                if line_batch := list(islice(inputfh, 1000)):
+                    batches.append(line_batch)
+                else:
+                    break
+            if not batches:
+                break
+            futures = (
+                executor.submit(
+                    _cli_check_urls,
+                    batch,
+                    strict=args.strict,
+                    with_redirects=args.redirects,
+                    language=args.language,
+                )
+                for batch in batches
+            )
+
+            for future in as_completed(futures):
+                for valid, url in future.result():
+                    if valid:
+                        outputfh.write(url + "\n")
+                    # proceed with discarded URLs. to be rewritten
+                    elif args.discardedfile is not None:
+                        with open(
+                            args.discardedfile, "a", encoding="utf-8"
+                        ) as discardfh:
+                            discardfh.write(url)
+
+            batches = []
Comment on lines -141 to +178
Function _cli_process refactored with the following changes:



def process_args(args: Any) -> None:
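
Besides flattening the batching loop, the _cli_process hunk wraps the three context managers of the with statement in parentheses. That syntax is only officially supported from Python 3.10 (it happens to parse on CPython 3.9's new PEG parser), so whether this particular rewrite is safe depends on the minimum Python version courlan targets; the original multi-line form relied on the open() calls' own parentheses instead. A minimal illustration of the 3.10+ form with placeholder file names:

# Python 3.10+: parentheses let the context managers span several lines cleanly
with (
    open("input.txt", "r", encoding="utf-8") as infh,    # placeholder names
    open("output.txt", "w", encoding="utf-8") as outfh,
):
    for line in infh:
        outfh.write(line.upper())
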

courlan/core.py: 14 changes (5 additions, 9 deletions)

@@ -174,26 +174,22 @@ def extract_links(
if "rel" in link and "nofollow" in link:
continue
# https://en.wikipedia.org/wiki/Hreflang
if no_filter is False and language is not None and "hreflang" in link:
if not no_filter and language is not None and "hreflang" in link:
langmatch = HREFLANG_REGEX.search(link)
if langmatch and (
langmatch[1].startswith(language) or langmatch[1] == "x-default"
):
linkmatch = LINK_REGEX.search(link)
if linkmatch:
if linkmatch := LINK_REGEX.search(link):
candidates.add(linkmatch[1])
# default
else:
linkmatch = LINK_REGEX.search(link)
if linkmatch:
candidates.add(linkmatch[1])
elif linkmatch := LINK_REGEX.search(link):
candidates.add(linkmatch[1])
# filter candidates
for link in candidates:
# repair using base
if not link.startswith("http"):
link = fix_relative_urls(url, link)
# check
if no_filter is False:
if not no_filter:
Comment on lines -177 to +192
Function extract_links refactored with the following changes:

This removes the following comments ( why? ):

# default

checked = check_url(
link,
strict=strict,

courlan/filters.py: 13 changes (3 additions, 10 deletions)

@@ -174,10 +174,7 @@ def domain_filter(domain: str) -> bool:
         return False
 
     extension_match = EXTENSION_REGEX.search(domain)
-    if extension_match and extension_match[0] in WHITELISTED_EXTENSIONS:
-        return False
-
-    return True
+    return not extension_match or extension_match[0] not in WHITELISTED_EXTENSIONS
Function domain_filter refactored with the following changes:



def extension_filter(urlpath: str) -> bool:
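
domain_filter's early return False followed by return True collapses into a single boolean expression; by De Morgan's law, not (match and match[0] in whitelist) is the same as not match or match[0] not in whitelist. A small self-contained check with made-up stand-ins for EXTENSION_REGEX and WHITELISTED_EXTENSIONS:

import re

EXT_RE = re.compile(r"\.[a-z]{2,4}$")   # illustrative, not courlan's regex
SUFFIXES = {".jpg", ".pdf"}             # illustrative, not the real whitelist

def domain_ok(domain: str) -> bool:
    match = EXT_RE.search(domain)
    # equivalent to: not (match and match[0] in SUFFIXES)
    return not match or match[0] not in SUFFIXES

print(domain_ok("example.org"))      # True
print(domain_ok("img.example.jpg"))  # False
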
@@ -215,21 +212,17 @@ def lang_filter(url: str, language: Optional[str] = None, strict: bool = False)
         return True
     # init score
     score = 0
-    # first test: internationalization in URL path
-    match = PATH_LANG_FILTER.match(url)
-    if match:
+    if match := PATH_LANG_FILTER.match(url):
Comment on lines -218 to +215
Function lang_filter refactored with the following changes:

This removes the following comments ( why? ):

# don't perform the test if there are too many candidates: > 2
# first test: internationalization in URL path

         # look for other occurrences
         occurrences = ALL_PATH_LANGS.findall(url)
         if len(occurrences) == 1:
             score = langcodes_score(language, match[1], score)
         elif len(occurrences) == 2:
             for occurrence in occurrences:
                 score = langcodes_score(language, occurrence, score)
-    # don't perform the test if there are too many candidates: > 2
     # second test: prepended language cues
     if strict and language in LANGUAGE_MAPPINGS:
-        match = HOST_LANG_FILTER.match(url)
-        if match:
+        if match := HOST_LANG_FILTER.match(url):
             candidate = match[1].lower()
             LOGGER.debug("candidate lang %s found in URL", candidate)
             if candidate in LANGUAGE_MAPPINGS[language]:

courlan/urlstore.py: 4 changes (2 additions, 2 deletions)

@@ -146,12 +146,12 @@ def _store_urls(
     ) -> None:
         # http/https switch
         if domain.startswith("http://"):
-            candidate = "https" + domain[4:]
+            candidate = f"https{domain[4:]}"
             # switch
             if candidate in self.urldict:
                 domain = candidate
         elif domain.startswith("https://"):
-            candidate = "http" + domain[5:]
+            candidate = f"http{domain[5:]}"
Comment on lines -149 to +154
Function UrlStore._store_urls refactored with the following changes:

# replace entry
if candidate in self.urldict:
self.urldict[domain] = self.urldict[candidate]
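
The _store_urls hunk swaps string concatenation for f-strings when toggling between the http and https variants of a stored domain. A standalone sketch of the same slice-and-format trick:

def swap_scheme(domain: str) -> str:
    # "http://example.org" -> "https://example.org" and vice versa
    if domain.startswith("http://"):
        return f"https{domain[4:]}"   # domain[4:] keeps "://example.org"
    if domain.startswith("https://"):
        return f"http{domain[5:]}"
    return domain

print(swap_scheme("http://example.org"))   # https://example.org
print(swap_scheme("https://example.org"))  # http://example.org

f-strings read more directly than chained + concatenation, which is the convention Sourcery standardises on across this PR.
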

courlan/urlutils.py: 21 changes (5 additions, 16 deletions)

@@ -35,12 +35,9 @@ def get_tldinfo(
     if not url or not isinstance(url, str):
         return None, None
     if fast:
-        # try with regexes
-        domain_match = DOMAIN_REGEX.match(url)
-        if domain_match:
+        if domain_match := DOMAIN_REGEX.match(url):
             full_domain = STRIP_DOMAIN_REGEX.sub("", domain_match[1])
-            clean_match = NO_EXTENSION_REGEX.match(full_domain)
-            if clean_match:
+            if clean_match := NO_EXTENSION_REGEX.match(full_domain):
Comment on lines -38 to +40
Function get_tldinfo refactored with the following changes:

This removes the following comments ( why? ):

# try with regexes

return clean_match[0], full_domain
# fallback
tldinfo = get_tld(url, as_object=True, fail_silently=True)
@@ -62,10 +59,7 @@ def extract_domain(
     if full_domain is None:
         return None
     # blacklisting
-    if domain in blacklist or full_domain in blacklist:
-        return None
-    # return domain
-    return full_domain
+    return None if domain in blacklist or full_domain in blacklist else full_domain
Comment on lines -65 to +62
Function extract_domain refactored with the following changes:

This removes the following comments ( why? ):

# return domain
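
extract_domain now returns through a single conditional expression instead of an early return. A sketch of the same shape with hypothetical arguments (courlan's real function derives domain and full_domain via get_tldinfo):

from typing import Optional, Set

def resolve(full_domain: Optional[str], domain: str, blacklist: Set[str]) -> Optional[str]:
    # before:
    #     if domain in blacklist or full_domain in blacklist:
    #         return None
    #     return full_domain
    return None if domain in blacklist or full_domain in blacklist else full_domain

print(resolve("www.example.org", "example.org", {"spam.org"}))  # www.example.org
print(resolve("www.spam.org", "spam.org", {"spam.org"}))        # None

Whether the one-liner reads better than the explicit early return is a style call; the behaviour is identical.
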



def _parse(url: Any) -> SplitResult:
@@ -83,10 +77,7 @@ def get_base_url(url: Any) -> str:
"""Strip URL of some of its parts to get base URL.
Accepts strings and urllib.parse ParseResult objects."""
parsed_url = _parse(url)
if parsed_url.scheme:
scheme = parsed_url.scheme + "://"
else:
scheme = ""
scheme = f"{parsed_url.scheme}://" if parsed_url.scheme else ""
Comment on lines -86 to +80
Function get_base_url refactored with the following changes:

return scheme + parsed_url.netloc


@@ -144,9 +135,7 @@ def is_external(url: str, reference: str, ignore_suffix: bool = True) -> bool:
     stripped_ref, ref = get_tldinfo(reference, fast=True)
     stripped_domain, domain = get_tldinfo(url, fast=True)
     # comparison
-    if ignore_suffix:
-        return stripped_domain != stripped_ref
-    return domain != ref
+    return stripped_domain != stripped_ref if ignore_suffix else domain != ref
Comment on lines -147 to +138
Function is_external refactored with the following changes:



def is_known_link(link: str, known_links: Set[str]) -> bool:

helpers/common-crawl-normalize.py: 9 changes (3 additions, 6 deletions)

@@ -38,9 +38,7 @@
         # normalize URL problem
         line = re.sub(r'\./', '/', line, 1)
 
-        # not elegant, but will do the trick
-        match = re.match(r'(.+?)/', line)
-        if match:
+        if match := re.match(r'(.+?)/', line):
Lines 41-63 refactored with the following changes:

This removes the following comments ( why? ):

# not elegant, but will do the trick

core = match.group(1)
else:
print ('ERROR: Curious one (1):', lastseen)
@@ -49,8 +47,7 @@
             print ('ERROR: Curious one (2):', lastseen)
             continue
         elements = re.findall(r'.+?\.', core)
-        match = re.search(r'\.([^\.]+)$', core)
-        if match:
+        if match := re.search(r'\.([^\.]+)$', core):
             lastone = match.group(1)
         else:
             print ('ERROR: Curious one (3):', lastseen)
@@ -60,7 +57,7 @@
         core = ''.join(reversed(elements))
         core = re.sub(r'\.$', '', core)
         line = re.sub(r'^.+?/', '/', line)
-        line = 'http://' + lastone + '.' + core + line
+        line = f'http://{lastone}.{core}{line}'
         outputfh.write(line + '\n')



helpers/find-wordpress-urls.py: 37 changes (18 additions, 19 deletions)

@@ -35,36 +35,33 @@ def find_target(url):

     # wordpress.com
     if re.match(r'https?://.+?\.wordpress\.[a-z]{2,3}', url): # [a-z]+?
-        # files.wordpress.com hack
-        url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url)
-        # wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)
-        wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url)
-        if wphost:
-            return wphost.group(1).rstrip('/') + '/'
-
-    # K.O. victory
-    ko_string = re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url)
-    if ko_string:
+        # files.wordpress.com hack
+        url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url)
+        if wphost := re.match(r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url):
+            return wphost.group(1).rstrip('/') + '/'
+
+    if ko_string := re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url):
         return ko_string.group(1).rstrip('/') + '/'
 
-    # /tag/ /category/
-    tagcat = re.match(r'(.+?)(/tag/|/category/|\?cat=)', url)
-    if tagcat:
+    if tagcat := re.match(r'(.+?)(/tag/|/category/|\?cat=)', url):
         return tagcat.group(1).rstrip('/') + '/'
 
     # query parameters
     if re.search(r'/\?p=|\?page_id=|\?paged=/', url):
-        mquery = re.match(r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url)
-        if mquery:
+        if mquery := re.match(
+            r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url
+        ):
Comment on lines -38 to +53
Function find_target refactored with the following changes:

This removes the following comments ( why? ):

# wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)
# K.O. victory
# /tag/ /category/

             if mquery.group(2) and mquery.group(3):
                 return mquery.group(1) + mquery.group(2)
             else:
                 return mquery.group(1).rstrip('/') + '/'
 
     # URL types
     if re.search(r'/20[0-9]{2}/[0-9]{2}/|/archives/', url):
-        url_types = re.search(r'(https?://.+?/)(blog/|weblog/)?(20[0-9]{2}/[0-9]{2}/|/archives/)', url)
-        if url_types:
+        if url_types := re.search(
+            r'(https?://.+?/)(blog/|weblog/)?(20[0-9]{2}/[0-9]{2}/|/archives/)',
+            url,
+        ):
             # print (url)
             if url_types.group(2) and url_types.group(3):
                 return url_types.group(1) + url_types.group(2)

@@ -81,8 +78,10 @@ def find_target(url):
     if args.lax is True and re.search(
         r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url
     ):
-        url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
-        if url_lax:
+        if url_lax := re.search(
+            r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)',
+            url,
+        ):
             if url_lax.group(2) and url_lax.group(3):
                 return url_lax.group(1) + url_lax.group(2)
             else:

tests/urlstore_tests.py: 6 changes (3 additions, 3 deletions)

@@ -158,7 +158,7 @@ def test_urlstore():
     assert my_urls.total_url_number() == len(urls)
     assert my_urls.get_all_counts() == [0, 0]
 
-    if my_urls.compressed is False:
+    if not my_urls.compressed:
Function test_urlstore refactored with the following changes:

assert sum(len(v.tuples) for _, v in my_urls.urldict.items()) == len(urls)
my_urls.add_urls(["https://visited.com/visited"], visited=True)
assert my_urls.urldict["https://visited.com"].tuples[0].visited is True
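
Both this hunk and the extract_links one replace comparisons such as x is False with not x. The two are equivalent only when the value is guaranteed to be an actual bool, as it is here for my_urls.compressed and the no_filter flag; for other falsy values they diverge, which is worth keeping in mind when applying the same rewrite elsewhere:

compressed = False
print(not compressed, compressed is False)  # True True

compressed = None
print(not compressed, compressed is False)  # True False  (falsy, but not the False singleton)
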
@@ -269,8 +269,8 @@ def test_urlstore():
     assert my_urls.has_been_visited(f"{example_domain}/999") is False
     candidates = [url1, f"{example_domain}/this", f"{example_domain}/999"]
     assert my_urls.filter_unvisited_urls(candidates) == [
-        example_domain + "/this",
-        example_domain + "/999",
+        f"{example_domain}/this",
+        f"{example_domain}/999",
     ]
assert (
len(my_urls.find_known_urls(example_domain))