Core adjustments (typing, misc fixes, replacing os.path with pathlib.Path) #2293

Merged
merged 4 commits on Mar 29, 2024
33 changes: 15 additions & 18 deletions lncrawl/core/app.py
@@ -1,7 +1,7 @@
import atexit
import logging
import os
import shutil
from pathlib import Path
from threading import Thread
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse
@@ -148,15 +148,13 @@ def get_novel_info(self):
)

source_name = slugify(urlparse(self.crawler.home_url).netloc)
self.output_path = os.path.join(
C.DEFAULT_OUTPUT_PATH, source_name, self.good_file_name
)
self.output_path = Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name

# ----------------------------------------------------------------------- #

def start_download(self):
"""Requires: crawler, chapters, output_path"""
if not self.output_path or not os.path.isdir(self.output_path):
if not self.output_path or not Path(self.output_path).is_dir():
raise LNException("Output path is not defined")

assert self.crawler
@@ -167,8 +165,8 @@ def start_download(self):
fetch_chapter_images(self)
save_metadata(self, True)

if not self.output_formats.get("json", False):
shutil.rmtree(os.path.join(self.output_path, "json"), ignore_errors=True)
if not self.output_formats.get(OutputFormat.json.value, False):
shutil.rmtree(Path(self.output_path) / "json", ignore_errors=True)

if self.can_do("logout"):
self.crawler.logout()
@@ -208,39 +206,38 @@ def compress_books(self, archive_singles=False):
logger.info("Compressing output...")

# Get which paths to be archived with their base names
path_to_process = []
path_to_process: list[tuple[Path, str]] = []
for fmt in available_formats:
root_dir = os.path.join(self.output_path, fmt)
if os.path.isdir(root_dir):
root_dir: Path = Path(self.output_path) / fmt
if root_dir.is_dir():
path_to_process.append(
[root_dir, self.good_file_name + " (" + fmt + ")"]
(root_dir, self.good_file_name + " (" + fmt + ")")
)

# Archive files
self.archived_outputs = []
for root_dir, output_name in path_to_process:
file_list = os.listdir(root_dir)
file_list = list(root_dir.glob("*"))
if len(file_list) == 0:
logger.info("It has no files: %s", root_dir)
continue

archived_file = None
if (
len(file_list) == 1
and not archive_singles
and not os.path.isdir(os.path.join(root_dir, file_list[0]))
and not (root_dir / file_list[0]).is_dir()
):
logger.info("Not archiving single file inside %s" % root_dir)
archived_file = os.path.join(root_dir, file_list[0])
archived_file = (root_dir / file_list[0]).as_posix()
else:
base_path = os.path.join(self.output_path, output_name)
base_path = Path(self.output_path) / output_name
logger.info("Compressing %s to %s" % (root_dir, base_path))
archived_file = shutil.make_archive(
base_path,
base_path.as_posix(),
format="zip",
root_dir=root_dir,
)
logger.info("Compressed: %s", os.path.basename(archived_file))
logger.info("Compressed: %s", Path(archived_file).name)

if archived_file:
self.archived_outputs.append(archived_file)
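
The app.py changes above all follow one pattern: each os.path call is replaced with the equivalent pathlib.Path operation. A minimal standalone sketch of that mapping, using made-up paths rather than the project's real constants:

```python
import shutil
from pathlib import Path

# Hypothetical paths for illustration; the real values come from constants
# and crawler state in lncrawl.
output_path = Path("Lightnovels") / "example-source" / "example-novel"

# os.path.isdir(p)        ->  Path(p).is_dir()
if output_path.is_dir():
    # os.path.join(a, b)  ->  Path(a) / b
    shutil.rmtree(output_path / "json", ignore_errors=True)

# os.listdir(root)        ->  list(root.glob("*")), yielding Path objects
files = list(output_path.glob("*"))

# os.path.basename(p)     ->  Path(p).name
print(Path("out/archive.zip").name)  # -> archive.zip
```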
229 changes: 107 additions & 122 deletions lncrawl/core/cleaner.py
@@ -32,96 +32,86 @@ def __init__(self) -> None:
# the tag will be removed if the text inside contains the pattern
}

self.bad_tags: Set[str] = set(
[
# tag names to remove
"address",
"amp-auto-ads",
"audio",
"button",
"figcaption",
"footer",
"form",
"header",
"iframe",
"input",
"ins",
"map",
"nav",
"noscript",
"object",
"output",
"pirate",
"script",
"select",
"source",
"style",
"textarea",
"tfoot",
"video",
]
)
self.bad_css: Set[str] = set(
[
# css selector to select and remove tags
".adblock-service",
".sharedaddy",
".saboxplugin-wrap",
".adbox",
".ads-middle",
".ads",
".adsbygoogle",
".adsense-code",
".cb_p6_patreon_button",
".code-block",
".ezoic-ad-adaptive",
".ezoic-ad",
".ezoic-adpicker-ad",
".googlepublisherads",
".inline-ad-slot",
".jp-relatedposts",
".sharedaddy",
".wp-post-navigation",
"a[href*='patreon.com']",
"a[href*='paypal.me']",
]
)
self.p_block_tags: Set[str] = set(
[
# tags that can be used as paragraph break
"article",
"aside",
"div",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"main",
"p",
"section",
]
)
self.unchanged_tags: Set[str] = set(
[
# tags to keep unchanged with text and attributes
"canvas",
"img",
"pre",
]
)
self.plain_text_tags: Set[str] = set(
[
# tags that will be joined together in a paragraph
"a",
"abbr",
"acronym",
"label",
"span",
"time",
]
)
self.bad_tags: Set[str] = {
# tag names to remove
"address",
"amp-auto-ads",
"audio",
"button",
"figcaption",
"footer",
"form",
"header",
"iframe",
"input",
"ins",
"map",
"nav",
"noscript",
"object",
"output",
"pirate",
"script",
"select",
"source",
"style",
"textarea",
"tfoot",
"video",
}
self.bad_css: Set[str] = {
# css selector to select and remove tags
".adblock-service",
".sharedaddy",
".saboxplugin-wrap",
".adbox",
".ads-middle",
".ads",
".adsbygoogle",
".adsense-code",
".cb_p6_patreon_button",
".code-block",
".ezoic-ad-adaptive",
".ezoic-ad",
".ezoic-adpicker-ad",
".googlepublisherads",
".inline-ad-slot",
".jp-relatedposts",
".sharedaddy",
".wp-post-navigation",
"a[href*='patreon.com']",
"a[href*='paypal.me']",
}
self.p_block_tags: Set[str] = {
# tags that can be used as paragraph break
"article",
"aside",
"div",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"main",
"p",
"section",
}
self.unchanged_tags: Set[str] = {
# tags to keep unchanged with text and attributes
"canvas",
"img",
"pre",
}
self.plain_text_tags: Set[str] = {
# tags that will be joined together in a paragraph
"a",
"abbr",
"acronym",
"label",
"span",
"time",
}
self.substitutions: Dict[str, str] = {
# replace one string with another one
# "&": "&",
@@ -133,34 +123,28 @@ def __init__(self) -> None:
# "“s": "'s",
# "”s": "'s",
}
self.whitelist_attributes: Set[str] = set(
[
# the attributes to keep while cleaning a tag
"src",
"style",
# table and table children attributes
"colspan",
"rowspan",
"headers",
"scope",
"axis",
"id", # id required for headers ref
]
)
self.whitelist_css_property: Set[str] = set(
[
# the css styles to keep while cleaning style tag
"font-style",
"font-weight",
]
)
self.image_src_attributes: Set[str] = set(
[
"data-lazy-src",
"data-src",
"src",
]
)
self.whitelist_attributes: Set[str] = {
# the attributes to keep while cleaning a tag
"src",
"style",
# table and table children attributes
"colspan",
"rowspan",
"headers",
"scope",
"axis",
"id", # id required for headers ref
}
self.whitelist_css_property: Set[str] = {
# the css styles to keep while cleaning style tag
"font-style",
"font-weight",
}
self.image_src_attributes: Set[str] = {
"data-lazy-src",
"data-src",
"src",
}

def extract_contents(self, tag) -> str:
self.clean_contents(tag)
@@ -190,7 +174,8 @@ def clean_contents(self, div):
for tag in div.find_all(True):
if isinstance(tag, Comment):
tag.extract() # Remove comments
elif not isinstance(tag, Tag):
continue
if not isinstance(tag, Tag):
continue # Skip elements that are not a Tag
if tag.name in self.bad_tags:
tag.extract() # Remove bad tags
@@ -224,7 +209,7 @@ def extract_on_duplicate_sibling(self, tag: Tag):
if next_tag.name == tag.name:
tag.extract()

def clean_attributes(self, tag: Tag) -> dict:
def clean_attributes(self, tag: Tag):
attrs = {}
for name, value in tag.attrs.items():
if name not in self.whitelist_attributes:
@@ -246,7 +231,7 @@ def tag_contains_bad_text(self, tag: Tag) -> bool:
if not isinstance(pattern, re.Pattern):
pattern = re.compile(pattern, re.M)
self.bad_tag_text_pairs[tag.name] = pattern
return pattern.search(tag.text)
return bool(pattern.search(tag.text))

def clean_image(self, tag: Tag):
src = None
@@ -326,4 +311,4 @@ def contains_bad_texts(self, text: str) -> bool:
if not hasattr(self, "__blacklist__"):
pattern = re.compile("|".join(["(%s)" % p for p in self.bad_text_regex]))
self.__blacklist__ = pattern
return self.__blacklist__.search(text)
return bool(self.__blacklist__.search(text))
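
Two idioms recur in the cleaner.py diff above: set literals in place of set([...]), and bool() around re.search so functions annotated -> bool actually return a bool rather than Optional[Match]. A small self-contained sketch of both (the tag and regex values are illustrative excerpts, not the full lists):

```python
import re
from typing import Set

# A short excerpt, not the full list from cleaner.py.
bad_tags: Set[str] = {"script", "style", "iframe"}   # was: set(["script", ...])

def contains_bad_texts(text: str, bad_text_regex: Set[str]) -> bool:
    pattern = re.compile("|".join("(%s)" % p for p in bad_text_regex))
    # re.search returns Optional[re.Match]; bool() keeps the -> bool contract
    return bool(pattern.search(text))

print("script" in bad_tags)                                                # True
print(contains_bad_texts("support us on patreon.com", {r"patreon\.com"}))  # True
```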
6 changes: 4 additions & 2 deletions lncrawl/core/crawler.py
@@ -155,15 +155,17 @@ def download_chapters(
unit="item",
fail_fast=fail_fast,
)
chapter = None
for (index, future) in futures.items():
try:
chapter = chapters[index]
chapter.body = future.result()
self.extract_chapter_images(chapter)
chapter.success = True
except Exception as e:
chapter.body = ""
chapter.success = False
if isinstance(chapter, Chapter):
chapter.body = ""
chapter.success = False
if isinstance(e, KeyboardInterrupt):
break
finally:
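
The crawler.py change pre-initializes chapter and checks isinstance(chapter, Chapter) in the except block, so a failure before the first successful lookup cannot trip over an unbound local name inside the handler. A rough standalone sketch of the same pattern, with a stand-in Chapter dataclass and plain callables in place of futures:

```python
from dataclasses import dataclass
from typing import Callable, Dict, List

@dataclass
class Chapter:          # stand-in for lncrawl's Chapter model
    body: str = ""
    success: bool = False

def collect_results(chapters: List[Chapter], futures: Dict[int, Callable[[], str]]) -> None:
    chapter = None  # defined before the loop so the except block can test it
    for index, future in futures.items():
        try:
            chapter = chapters[index]   # may raise IndexError
            chapter.body = future()     # stand-in for future.result()
            chapter.success = True
        except Exception as e:
            # Without `chapter = None` above, a failure on the very first
            # lookup would hit an unbound local name here.
            if isinstance(chapter, Chapter):
                chapter.body = ""
                chapter.success = False
            if isinstance(e, KeyboardInterrupt):
                break

chapters = [Chapter()]
futures = {5: lambda: "bad index first", 0: lambda: "chapter body"}
collect_results(chapters, futures)
print(chapters[0])  # Chapter(body='chapter body', success=True)
```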