Sourcery refactored master branch #78
@@ -91,14 +91,10 @@ def scrub_url(url: str) -> str:
     if match and is_valid_url(match[1]):
         url = match[1]
         LOGGER.debug("taking url: %s", url)
-    # too long and garbled URLs e.g. due to quotes URLs
-    # https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
-    # if len(url) > 500:  # arbitrary choice
-    match = TRAILING_PARTS.match(url)
-    if match:
+    if match := TRAILING_PARTS.match(url):
         url = match[1]
     if len(url) > 500:
-        LOGGER.debug("invalid-looking link %s of length %d", url[:50] + "…", len(url))
+        LOGGER.debug("invalid-looking link %s of length %d", f"{url[:50]}…", len(url))

     # trailing slashes in URLs without path or in embedded URLs
     if url.count("/") == 3 or url.count("://") > 1:

sourcery-ai commented: Function `scrub_url` refactored with the following changes: `use-named-expression`, `use-fstring-for-concatenation`. This removes the following comments (why?): `# too long and garbled URLs e.g. due to quotes URLs`, the cocrawler reference link, and `# if len(url) > 500: # arbitrary choice`.
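For readers new to the `:=` pattern Sourcery applies throughout this PR: a named expression (PEP 572, Python 3.8+) binds the match object and tests it in one step. A minimal before/after sketch, using a stand-in regex (the real `TRAILING_PARTS` pattern lives in the module):

```python
import re

TRAILING_PARTS = re.compile(r"(.+?)[<>\s]")  # stand-in pattern, for illustration only

url = "https://example.org/page <rest"

# before: assign, then test
match = TRAILING_PARTS.match(url)
if match:
    url = match[1]

# after: the named expression binds and tests in the if header
if match := TRAILING_PARTS.match(url):
    url = match[1]
```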
@@ -118,11 +114,13 @@ def clean_query(
     for qelem in sorted(qdict):
         teststr = qelem.lower()
         # control param
-        if strict:
-            if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS:
-                continue
-        # get rid of trackers
-        elif TRACKERS_RE.search(teststr):
+        if (
+            strict
+            and teststr not in ALLOWED_PARAMS
+            and teststr not in CONTROL_PARAMS
+            or not strict
+            and TRACKERS_RE.search(teststr)
+        ):
             continue
         # control language
         if language is not None and teststr in CONTROL_PARAMS:

sourcery-ai commented on lines -121 to +123: Function `clean_query` refactored. This removes the following comments (why?): `# get rid of trackers`.
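One thing worth double-checking in this merged condition: `and` binds tighter than `or`, so the single test groups as `(strict and not allowed and not control) or (not strict and tracker)`, which preserves the original if/elif semantics. A quick truth-table check, with the two membership tests collapsed into one placeholder flag:

```python
from itertools import product

def before(strict, allowed, tracker):
    if strict:
        if not allowed:
            return "skip"
    elif tracker:
        return "skip"
    return "keep"

def after(strict, allowed, tracker):
    # `and` binds tighter than `or`: (strict and not allowed) or (not strict and tracker)
    if strict and not allowed or not strict and tracker:
        return "skip"
    return "keep"

assert all(
    before(*combo) == after(*combo) for combo in product([True, False], repeat=3)
)
```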
@@ -107,10 +107,10 @@ def _batch_lines(inputfile: str) -> Iterator[List[str]]:
     "Read input line in batches"
     with open(inputfile, "r", encoding="utf-8", errors="ignore") as inputfh:
         while True:
-            batch = [line.strip() for line in islice(inputfh, 10**5)]
-            if not batch:
+            if batch := [line.strip() for line in islice(inputfh, 10**5)]:
+                yield batch
+            else:
                 return
-            yield batch


 def _cli_sample(args: Any) -> None:

sourcery-ai commented on lines -110 to -113: Function `_batch_lines` refactored.
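The `if/else/return` form Sourcery produced can be tightened one step further, since the named expression also works directly in a `while` header. A sketch of the equivalent generator, using the same `islice` batching as the diff:

```python
from itertools import islice
from typing import Iterator, List

def batch_lines(inputfile: str, size: int = 10**5) -> Iterator[List[str]]:
    """Yield batches of stripped lines until the file is exhausted."""
    with open(inputfile, "r", encoding="utf-8", errors="ignore") as inputfh:
        # the loop ends as soon as islice yields an empty (falsy) list
        while batch := [line.strip() for line in islice(inputfh, size)]:
            yield batch
```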
@@ -138,45 +138,44 @@ def _cli_sample(args: Any) -> None:

 def _cli_process(args: Any) -> None:
     "Read input file bit by bit and process URLs in batches."
-    with ProcessPoolExecutor(max_workers=args.parallel) as executor, open(
-        args.outputfile, "w", encoding="utf-8"
-    ) as outputfh, open(
-        args.inputfile, "r", encoding="utf-8", errors="ignore"
-    ) as inputfh:
+    with (ProcessPoolExecutor(max_workers=args.parallel) as executor, open(
+        args.outputfile, "w", encoding="utf-8"
+    ) as outputfh, open(
+        args.inputfile, "r", encoding="utf-8", errors="ignore"
+    ) as inputfh):
         while True:
             batches = []  # type: List[List[str]]
             while len(batches) < 1000:
-                line_batch = list(islice(inputfh, 1000))
-                if not line_batch:
-                    break
-                batches.append(line_batch)
-
-            if batches:
-                futures = (
-                    executor.submit(
-                        _cli_check_urls,
-                        batch,
-                        strict=args.strict,
-                        with_redirects=args.redirects,
-                        language=args.language,
-                    )
-                    for batch in batches
-                )
-
-                for future in as_completed(futures):
-                    for valid, url in future.result():
-                        if valid:
-                            outputfh.write(url + "\n")
-                        # proceed with discarded URLs. to be rewritten
-                        elif args.discardedfile is not None:
-                            with open(
-                                args.discardedfile, "a", encoding="utf-8"
-                            ) as discardfh:
-                                discardfh.write(url)
-
-                batches = []
-            else:
-                break
+                if line_batch := list(islice(inputfh, 1000)):
+                    batches.append(line_batch)
+                else:
+                    break
+            if not batches:
+                break
+            futures = (
+                executor.submit(
+                    _cli_check_urls,
+                    batch,
+                    strict=args.strict,
+                    with_redirects=args.redirects,
+                    language=args.language,
+                )
+                for batch in batches
+            )

+            for future in as_completed(futures):
+                for valid, url in future.result():
+                    if valid:
+                        outputfh.write(url + "\n")
+                    # proceed with discarded URLs. to be rewritten
+                    elif args.discardedfile is not None:
+                        with open(
+                            args.discardedfile, "a", encoding="utf-8"
+                        ) as discardfh:
+                            discardfh.write(url)
+
+            batches = []


 def process_args(args: Any) -> None:

sourcery-ai commented on lines -141 to +178: Function `_cli_process` refactored.
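Review note on the `with (...)` change above: parenthesized groups of context managers only became official syntax in Python 3.10 (they happen to parse on CPython 3.9's new parser), so this hunk quietly raises the syntax floor. On older interpreters, `contextlib.ExitStack` expresses the same grouping; a sketch with hypothetical file names standing in for `args.inputfile` and `args.outputfile`:

```python
from contextlib import ExitStack

# equivalent grouping that parses on any supported Python version;
# "in.txt" / "out.txt" are made-up stand-ins for the CLI arguments
with ExitStack() as stack:
    inputfh = stack.enter_context(open("in.txt", "r", encoding="utf-8", errors="ignore"))
    outputfh = stack.enter_context(open("out.txt", "w", encoding="utf-8"))
    for line in inputfh:
        outputfh.write(line)
```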
@@ -174,26 +174,22 @@ def extract_links(
         if "rel" in link and "nofollow" in link:
             continue
         # https://en.wikipedia.org/wiki/Hreflang
-        if no_filter is False and language is not None and "hreflang" in link:
+        if not no_filter and language is not None and "hreflang" in link:
             langmatch = HREFLANG_REGEX.search(link)
             if langmatch and (
                 langmatch[1].startswith(language) or langmatch[1] == "x-default"
             ):
-                linkmatch = LINK_REGEX.search(link)
-                if linkmatch:
+                if linkmatch := LINK_REGEX.search(link):
                     candidates.add(linkmatch[1])
-        # default
-        else:
-            linkmatch = LINK_REGEX.search(link)
-            if linkmatch:
-                candidates.add(linkmatch[1])
+        elif linkmatch := LINK_REGEX.search(link):
+            candidates.add(linkmatch[1])
     # filter candidates
     for link in candidates:
         # repair using base
         if not link.startswith("http"):
             link = fix_relative_urls(url, link)
         # check
-        if no_filter is False:
+        if not no_filter:
             checked = check_url(
                 link,
                 strict=strict,

sourcery-ai commented on lines -177 to +192: Function `extract_links` refactored. This removes the following comments (why?): `# default`.
@@ -174,10 +174,7 @@ def domain_filter(domain: str) -> bool:
         return False

     extension_match = EXTENSION_REGEX.search(domain)
-    if extension_match and extension_match[0] in WHITELISTED_EXTENSIONS:
-        return False
-
-    return True
+    return not extension_match or extension_match[0] not in WHITELISTED_EXTENSIONS


 def extension_filter(urlpath: str) -> bool:

sourcery-ai commented: Function `domain_filter` refactored.
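The collapsed return is De Morgan's dual of the removed guard: `not (match and ext in whitelist)` equals `not match or ext not in whitelist`. A sketch with stand-in regex and whitelist values:

```python
import re

EXTENSION_REGEX = re.compile(r"\.[a-z]{2,}$")  # stand-in for the real pattern
WHITELISTED_EXTENSIONS = {".com", ".org"}      # stand-in values

def domain_filter(domain: str) -> bool:
    extension_match = EXTENSION_REGEX.search(domain)
    # De Morgan: not (match and whitelisted) == not match or not whitelisted
    return not extension_match or extension_match[0] not in WHITELISTED_EXTENSIONS

assert domain_filter("example.com") is False  # whitelisted -> False, as in the original guard
assert domain_filter("example.xyz") is True   # unknown extension -> True
```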
@@ -215,21 +212,17 @@ def lang_filter(url: str, language: Optional[str] = None, strict: bool = False)
         return True
     # init score
     score = 0
-    # first test: internationalization in URL path
-    match = PATH_LANG_FILTER.match(url)
-    if match:
+    if match := PATH_LANG_FILTER.match(url):
         # look for other occurrences
         occurrences = ALL_PATH_LANGS.findall(url)
         if len(occurrences) == 1:
             score = langcodes_score(language, match[1], score)
         elif len(occurrences) == 2:
             for occurrence in occurrences:
                 score = langcodes_score(language, occurrence, score)
         # don't perform the test if there are too many candidates: > 2
-    # second test: prepended language cues
     if strict and language in LANGUAGE_MAPPINGS:
-        match = HOST_LANG_FILTER.match(url)
-        if match:
+        if match := HOST_LANG_FILTER.match(url):
             candidate = match[1].lower()
             LOGGER.debug("candidate lang %s found in URL", candidate)
             if candidate in LANGUAGE_MAPPINGS[language]:

sourcery-ai commented on lines -218 to +215: Function `lang_filter` refactored. This removes the following comments (why?): `# first test: internationalization in URL path`, `# second test: prepended language cues`.
@@ -146,12 +146,12 @@ def _store_urls(
     ) -> None:
         # http/https switch
         if domain.startswith("http://"):
-            candidate = "https" + domain[4:]
+            candidate = f"https{domain[4:]}"
             # switch
             if candidate in self.urldict:
                 domain = candidate
         elif domain.startswith("https://"):
-            candidate = "http" + domain[5:]
+            candidate = f"http{domain[5:]}"
             # replace entry
             if candidate in self.urldict:
                 self.urldict[domain] = self.urldict[candidate]

sourcery-ai commented on lines -149 to +154: Function `_store_urls` refactored.
@@ -35,12 +35,9 @@ def get_tldinfo(
     if not url or not isinstance(url, str):
         return None, None
     if fast:
-        # try with regexes
-        domain_match = DOMAIN_REGEX.match(url)
-        if domain_match:
+        if domain_match := DOMAIN_REGEX.match(url):
             full_domain = STRIP_DOMAIN_REGEX.sub("", domain_match[1])
-            clean_match = NO_EXTENSION_REGEX.match(full_domain)
-            if clean_match:
+            if clean_match := NO_EXTENSION_REGEX.match(full_domain):
                 return clean_match[0], full_domain
     # fallback
     tldinfo = get_tld(url, as_object=True, fail_silently=True)

sourcery-ai commented on lines -38 to +40: Function `get_tldinfo` refactored. This removes the following comments (why?): `# try with regexes`.
@@ -62,10 +59,7 @@ def extract_domain(
     if full_domain is None:
         return None
     # blacklisting
-    if domain in blacklist or full_domain in blacklist:
-        return None
-    # return domain
-    return full_domain
+    return None if domain in blacklist or full_domain in blacklist else full_domain


 def _parse(url: Any) -> SplitResult:

sourcery-ai commented on lines -65 to +62: Function `extract_domain` refactored. This removes the following comments (why?): `# return domain`.
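Inlining the guard into a conditional expression keeps behavior identical; only the shape changes. A self-contained sketch (the blacklist contents are made up):

```python
from typing import Optional

blacklist = {"spam.example", "www.spam.example"}  # made-up entries

def resolve(domain: str, full_domain: str) -> Optional[str]:
    # single conditional expression instead of an early return
    return None if domain in blacklist or full_domain in blacklist else full_domain

assert resolve("spam.example", "www.spam.example") is None
assert resolve("ok.example", "www.ok.example") == "www.ok.example"
```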
@@ -83,10 +77,7 @@ def get_base_url(url: Any) -> str:
     """Strip URL of some of its parts to get base URL.
     Accepts strings and urllib.parse ParseResult objects."""
     parsed_url = _parse(url)
-    if parsed_url.scheme:
-        scheme = parsed_url.scheme + "://"
-    else:
-        scheme = ""
+    scheme = f"{parsed_url.scheme}://" if parsed_url.scheme else ""
     return scheme + parsed_url.netloc

sourcery-ai commented on lines -86 to +80: Function `get_base_url` refactored.
@@ -144,9 +135,7 @@ def is_external(url: str, reference: str, ignore_suffix: bool = True) -> bool:
     stripped_ref, ref = get_tldinfo(reference, fast=True)
     stripped_domain, domain = get_tldinfo(url, fast=True)
     # comparison
-    if ignore_suffix:
-        return stripped_domain != stripped_ref
-    return domain != ref
+    return stripped_domain != stripped_ref if ignore_suffix else domain != ref


 def is_known_link(link: str, known_links: Set[str]) -> bool:

sourcery-ai commented on lines -147 to +138: Function `is_external` refactored.
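Same idea in `is_external`, with one parsing detail worth knowing: the conditional expression has lower precedence than `!=`, so the one-liner groups as `(a != b) if cond else (c != d)`:

```python
stripped_domain, stripped_ref = "example", "example"
domain, ref = "example.org", "example.com"

for ignore_suffix in (True, False):
    one_liner = stripped_domain != stripped_ref if ignore_suffix else domain != ref
    explicit = (stripped_domain != stripped_ref) if ignore_suffix else (domain != ref)
    assert one_liner == explicit
```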
@@ -38,9 +38,7 @@
     # normalize URL problem
     line = re.sub(r'\./', '/', line, 1)

-    # not elegant, but will do the trick
-    match = re.match(r'(.+?)/', line)
-    if match:
+    if match := re.match(r'(.+?)/', line):
         core = match.group(1)
     else:
         print ('ERROR: Curious one (1):', lastseen)

sourcery-ai commented: Lines refactored. This removes the following comments (why?): `# not elegant, but will do the trick`.
@@ -49,8 +47,7 @@
         print ('ERROR: Curious one (2):', lastseen)
         continue
     elements = re.findall(r'.+?\.', core)
-    match = re.search(r'\.([^\.]+)$', core)
-    if match:
+    if match := re.search(r'\.([^\.]+)$', core):
         lastone = match.group(1)
     else:
         print ('ERROR: Curious one (3):', lastseen)
@@ -60,7 +57,7 @@
     core = ''.join(reversed(elements))
     core = re.sub(r'\.$', '', core)
     line = re.sub(r'^.+?/', '/', line)
-    line = 'http://' + lastone + '.' + core + line
+    line = f'http://{lastone}.{core}{line}'
     outputfh.write(line + '\n')
@@ -35,36 +35,33 @@ def find_target(url):

     # wordpress.com
     if re.match(r'https?://.+?\.wordpress\.[a-z]{2,3}', url): # [a-z]+?
-        # files.wordpress.com hack
-        url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url)
-        # wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)
-        wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url)
-        if wphost:
+        # files.wordpress.com hack
+        url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url)
+        if wphost := re.match(r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url):
             return wphost.group(1).rstrip('/') + '/'

-    # K.O. victory
-    ko_string = re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url)
-    if ko_string:
+    if ko_string := re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url):
         return ko_string.group(1).rstrip('/') + '/'

     # /tag/ /category/
-    tagcat = re.match(r'(.+?)(/tag/|/category/|\?cat=)', url)
-    if tagcat:
+    if tagcat := re.match(r'(.+?)(/tag/|/category/|\?cat=)', url):
         return tagcat.group(1).rstrip('/') + '/'

     # query parameters
     if re.search(r'/\?p=|\?page_id=|\?paged=/', url):
-        mquery = re.match(r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url)
-        if mquery:
+        if mquery := re.match(
+            r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url
+        ):
             if mquery.group(2) and mquery.group(3):
                 return mquery.group(1) + mquery.group(2)
             else:
                 return mquery.group(1).rstrip('/') + '/'

     # URL types
     if re.search(r'/20[0-9]{2}/[0-9]{2}/|/archives/', url):
-        url_types = re.search(r'(https?://.+?/)(blog/|weblog/)?(20[0-9]{2}/[0-9]{2}/|/archives/)', url)
-        if url_types:
+        if url_types := re.search(
+            r'(https?://.+?/)(blog/|weblog/)?(20[0-9]{2}/[0-9]{2}/|/archives/)',
+            url,
+        ):
             # print (url)
             if url_types.group(2) and url_types.group(3):
                 return url_types.group(1) + url_types.group(2)

sourcery-ai commented on lines -38 to +53: Function `find_target` refactored. This removes the following comments (why?): `# wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)`, `# K.O. victory`.
@@ -81,8 +78,10 @@ def find_target(url):
     if args.lax is True and re.search(
         r'/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/', url
     ):
-        url_lax = re.search(r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)', url)
-        if url_lax:
+        if url_lax := re.search(
+            r'(https?://.+?/)(blog/|weblog/)?(/[a-z]+-[a-z]+-[a-z]+|/20[0-9]{2}/)',
+            url,
+        ):
             if url_lax.group(2) and url_lax.group(3):
                 return url_lax.group(1) + url_lax.group(2)
             else:
@@ -158,7 +158,7 @@ def test_urlstore():
     assert my_urls.total_url_number() == len(urls)
     assert my_urls.get_all_counts() == [0, 0]

-    if my_urls.compressed is False:
+    if not my_urls.compressed:
         assert sum(len(v.tuples) for _, v in my_urls.urldict.items()) == len(urls)
     my_urls.add_urls(["https://visited.com/visited"], visited=True)
     assert my_urls.urldict["https://visited.com"].tuples[0].visited is True

sourcery-ai commented: Function `test_urlstore` refactored.
@@ -269,8 +269,8 @@ def test_urlstore():
     assert my_urls.has_been_visited(f"{example_domain}/999") is False
     candidates = [url1, f"{example_domain}/this", f"{example_domain}/999"]
     assert my_urls.filter_unvisited_urls(candidates) == [
-        example_domain + "/this",
-        example_domain + "/999",
+        f"{example_domain}/this",
+        f"{example_domain}/999",
     ]
     assert (
         len(my_urls.find_known_urls(example_domain))
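A reviewer-level caveat on the `is False` rewrite above: `x is False` and `not x` agree only when `x` is a genuine bool, which holds for the `compressed` attribute here; for falsy non-bool values the two tests diverge, worth keeping in mind when accepting this rule elsewhere:

```python
for value in (False, 0, "", None):
    # identity with the False singleton vs. general falsiness
    print(f"{value!r}: is False -> {value is False}, not -> {not value}")
# only the real bool False satisfies both tests
```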