-
-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sourcery refactored master branch #78
Conversation
# too long and garbled URLs e.g. due to quotes URLs | ||
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py | ||
# if len(url) > 500: # arbitrary choice | ||
match = TRAILING_PARTS.match(url) | ||
if match: | ||
if match := TRAILING_PARTS.match(url): | ||
url = match[1] | ||
if len(url) > 500: | ||
LOGGER.debug("invalid-looking link %s of length %d", url[:50] + "…", len(url)) | ||
LOGGER.debug("invalid-looking link %s of length %d", f"{url[:50]}…", len(url)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function scrub_url
refactored with the following changes:
- Use named expression to simplify assignment and conditional (
use-named-expression
) - Use f-string instead of string concatenation (
use-fstring-for-concatenation
)
This removes the following comments ( why? ):
# too long and garbled URLs e.g. due to quotes URLs
# https://github.com/cocrawler/cocrawler/blob/main/cocrawler/urls.py
# if len(url) > 500: # arbitrary choice
if strict: | ||
if teststr not in ALLOWED_PARAMS and teststr not in CONTROL_PARAMS: | ||
continue | ||
# get rid of trackers | ||
elif TRACKERS_RE.search(teststr): | ||
if ( | ||
strict | ||
and teststr not in ALLOWED_PARAMS | ||
and teststr not in CONTROL_PARAMS | ||
or not strict | ||
and TRACKERS_RE.search(teststr) | ||
): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function clean_query
refactored with the following changes:
- Merge duplicate blocks in conditional (
merge-duplicate-blocks
)
This removes the following comments ( why? ):
# get rid of trackers
batch = [line.strip() for line in islice(inputfh, 10**5)] | ||
if not batch: | ||
if batch := [line.strip() for line in islice(inputfh, 10**5)]: | ||
yield batch | ||
else: | ||
return | ||
yield batch |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function _batch_lines
refactored with the following changes:
- Use named expression to simplify assignment and conditional (
use-named-expression
) - Lift code into else after jump in control flow (
reintroduce-else
) - Swap if/else branches (
swap-if-else-branches
)
with ProcessPoolExecutor(max_workers=args.parallel) as executor, open( | ||
args.outputfile, "w", encoding="utf-8" | ||
) as outputfh, open( | ||
args.inputfile, "r", encoding="utf-8", errors="ignore" | ||
) as inputfh: | ||
with (ProcessPoolExecutor(max_workers=args.parallel) as executor, open( | ||
args.outputfile, "w", encoding="utf-8" | ||
) as outputfh, open( | ||
args.inputfile, "r", encoding="utf-8", errors="ignore" | ||
) as inputfh): | ||
while True: | ||
batches = [] # type: List[List[str]] | ||
while len(batches) < 1000: | ||
line_batch = list(islice(inputfh, 1000)) | ||
if not line_batch: | ||
break | ||
batches.append(line_batch) | ||
|
||
if batches: | ||
futures = ( | ||
executor.submit( | ||
_cli_check_urls, | ||
batch, | ||
strict=args.strict, | ||
with_redirects=args.redirects, | ||
language=args.language, | ||
) | ||
for batch in batches | ||
) | ||
if line_batch := list(islice(inputfh, 1000)): | ||
batches.append(line_batch) | ||
|
||
for future in as_completed(futures): | ||
for valid, url in future.result(): | ||
if valid: | ||
outputfh.write(url + "\n") | ||
# proceed with discarded URLs. to be rewritten | ||
elif args.discardedfile is not None: | ||
with open( | ||
args.discardedfile, "a", encoding="utf-8" | ||
) as discardfh: | ||
discardfh.write(url) | ||
|
||
batches = [] | ||
else: | ||
else: | ||
break | ||
if not batches: | ||
break | ||
futures = ( | ||
executor.submit( | ||
_cli_check_urls, | ||
batch, | ||
strict=args.strict, | ||
with_redirects=args.redirects, | ||
language=args.language, | ||
) | ||
for batch in batches | ||
) | ||
|
||
for future in as_completed(futures): | ||
for valid, url in future.result(): | ||
if valid: | ||
outputfh.write(url + "\n") | ||
# proceed with discarded URLs. to be rewritten | ||
elif args.discardedfile is not None: | ||
with open( | ||
args.discardedfile, "a", encoding="utf-8" | ||
) as discardfh: | ||
discardfh.write(url) | ||
|
||
batches = [] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function _cli_process
refactored with the following changes:
- Swap if/else branches [×2] (
swap-if-else-branches
) - Remove unnecessary else after guard condition (
remove-unnecessary-else
) - Lift code into else after jump in control flow (
reintroduce-else
) - Use named expression to simplify assignment and conditional (
use-named-expression
)
if no_filter is False and language is not None and "hreflang" in link: | ||
if not no_filter and language is not None and "hreflang" in link: | ||
langmatch = HREFLANG_REGEX.search(link) | ||
if langmatch and ( | ||
langmatch[1].startswith(language) or langmatch[1] == "x-default" | ||
): | ||
linkmatch = LINK_REGEX.search(link) | ||
if linkmatch: | ||
if linkmatch := LINK_REGEX.search(link): | ||
candidates.add(linkmatch[1]) | ||
# default | ||
else: | ||
linkmatch = LINK_REGEX.search(link) | ||
if linkmatch: | ||
candidates.add(linkmatch[1]) | ||
elif linkmatch := LINK_REGEX.search(link): | ||
candidates.add(linkmatch[1]) | ||
# filter candidates | ||
for link in candidates: | ||
# repair using base | ||
if not link.startswith("http"): | ||
link = fix_relative_urls(url, link) | ||
# check | ||
if no_filter is False: | ||
if not no_filter: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function extract_links
refactored with the following changes:
- Merge else clause's nested if statement into elif (
merge-else-if-into-elif
) - Simplify comparison to boolean [×2] (
simplify-boolean-comparison
) - Use named expression to simplify assignment and conditional [×2] (
use-named-expression
)
This removes the following comments ( why? ):
# default
if parsed_url.scheme: | ||
scheme = parsed_url.scheme + "://" | ||
else: | ||
scheme = "" | ||
scheme = f"{parsed_url.scheme}://" if parsed_url.scheme else "" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function get_base_url
refactored with the following changes:
- Replace if statement with if expression (
assign-if-exp
) - Use f-string instead of string concatenation (
use-fstring-for-concatenation
)
if ignore_suffix: | ||
return stripped_domain != stripped_ref | ||
return domain != ref | ||
return stripped_domain != stripped_ref if ignore_suffix else domain != ref |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function is_external
refactored with the following changes:
- Lift code into else after jump in control flow (
reintroduce-else
) - Replace if statement with if expression (
assign-if-exp
)
# not elegant, but will do the trick | ||
match = re.match(r'(.+?)/', line) | ||
if match: | ||
if match := re.match(r'(.+?)/', line): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lines 41-63
refactored with the following changes:
- Use named expression to simplify assignment and conditional [×2] (
use-named-expression
) - Use f-string instead of string concatenation [×4] (
use-fstring-for-concatenation
)
This removes the following comments ( why? ):
# not elegant, but will do the trick
# files.wordpress.com hack | ||
url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url) | ||
# wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url) | ||
wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url) | ||
if wphost: | ||
return wphost.group(1).rstrip('/') + '/' | ||
|
||
# K.O. victory | ||
ko_string = re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url) | ||
if ko_string: | ||
# files.wordpress.com hack | ||
url = re.sub(r'\.files\.wordpress\.', '.wordpress.', url) | ||
if wphost := re.match(r'(htt.+?\.wordpress\.[a-z]{2,3})/?', url): | ||
return wphost.group(1).rstrip('/') + '/' | ||
|
||
if ko_string := re.match(r'(.+?)(/wp/|/wordpress/|/wp-content/)', url): | ||
return ko_string.group(1).rstrip('/') + '/' | ||
|
||
# /tag/ /category/ | ||
tagcat = re.match(r'(.+?)(/tag/|/category/|\?cat=)', url) | ||
if tagcat: | ||
if tagcat := re.match(r'(.+?)(/tag/|/category/|\?cat=)', url): | ||
return tagcat.group(1).rstrip('/') + '/' | ||
|
||
# query parameters | ||
if re.search(r'/\?p=|\?page_id=|\?paged=/', url): | ||
mquery = re.match(r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url) | ||
if mquery: | ||
if mquery := re.match( | ||
r'(https?://.+?/)(blog/|weblog/)?(\?p=|\?page_id=|\?paged=)', url | ||
): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function find_target
refactored with the following changes:
- Use named expression to simplify assignment and conditional [×6] (
use-named-expression
)
This removes the following comments ( why? ):
# wphost = re.match (r'(htt.+?\.wordpress\.[a-z]{2,3}/)', url)
# K.O. victory
# /tag/ /category/
if my_urls.compressed is False: | ||
if not my_urls.compressed: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function test_urlstore
refactored with the following changes:
- Simplify comparison to boolean (
simplify-boolean-comparison
) - Use f-string instead of string concatenation [×2] (
use-fstring-for-concatenation
)
Branch
master
refactored by Sourcery.If you're happy with these changes, merge this Pull Request using the Squash and merge strategy.
See our documentation here.
Run Sourcery locally
Reduce the feedback loop during development by using the Sourcery editor plugin:
Review changes via command line
To manually merge these changes, make sure you're on the
master
branch, then run:Help us improve this pull request!