From 522978b0dc707b8aabcbdf6a728b7f25a2baa4aa Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 3 Mar 2024 13:28:45 +0100
Subject: [PATCH 1/4] cleaner: use set literals, improve typing

---
 lncrawl/core/cleaner.py | 234 +++++++++++++++++++---------------------
 1 file changed, 110 insertions(+), 124 deletions(-)

diff --git a/lncrawl/core/cleaner.py b/lncrawl/core/cleaner.py
index eebf1df87..e1b1eef48 100644
--- a/lncrawl/core/cleaner.py
+++ b/lncrawl/core/cleaner.py
@@ -4,7 +4,7 @@
 import unicodedata
 from typing import AnyStr, Dict, List, Set, Union
 
-from bs4 import Comment, Tag
+from bs4 import Comment, Tag, PageElement
 
 LINE_SEP = "<br>"
@@ -16,6 +16,8 @@
 NONPRINTABLE = itertools.chain(range(0x00, 0x20), range(0x7F, 0xA0), INVISIBLE_CHARS)
 NONPRINTABLE_MAPPING = {character: None for character in NONPRINTABLE}
 
+TAG_LIKE = Union[Comment, PageElement, Tag]
+
 
 class TextCleaner:
     def __init__(self) -> None:
@@ -32,96 +34,86 @@ def __init__(self) -> None:
             # the tag will be removed if the text inside contains the pattern
         }
 
-        self.bad_tags: Set[str] = set(
-            [
-                # tag names to remove
-                "address",
-                "amp-auto-ads",
-                "audio",
-                "button",
-                "figcaption",
-                "footer",
-                "form",
-                "header",
-                "iframe",
-                "input",
-                "ins",
-                "map",
-                "nav",
-                "noscript",
-                "object",
-                "output",
-                "pirate",
-                "script",
-                "select",
-                "source",
-                "style",
-                "textarea",
-                "tfoot",
-                "video",
-            ]
-        )
-        self.bad_css: Set[str] = set(
-            [
-                # css selector to select and remove tags
-                ".adblock-service",
-                ".sharedaddy",
-                ".saboxplugin-wrap",
-                ".adbox",
-                ".ads-middle",
-                ".ads",
-                ".adsbygoogle",
-                ".adsense-code",
-                ".cb_p6_patreon_button",
-                ".code-block",
-                ".ezoic-ad-adaptive",
-                ".ezoic-ad",
-                ".ezoic-adpicker-ad",
-                ".googlepublisherads",
-                ".inline-ad-slot",
-                ".jp-relatedposts",
-                ".sharedaddy",
-                ".wp-post-navigation",
-                "a[href*='patreon.com']",
-                "a[href*='paypal.me']",
-            ]
-        )
-        self.p_block_tags: Set[str] = set(
-            [
-                # tags that can be used as paragraph break
-                "article",
-                "aside",
-                "div",
-                "h1",
-                "h2",
-                "h3",
-                "h4",
-                "h5",
-                "h6",
-                "main",
-                "p",
-                "section",
-            ]
-        )
-        self.unchanged_tags: Set[str] = set(
-            [
-                # tags to keep unchanged with text and attributes
-                "canvas",
-                "img",
-                "pre",
-            ]
-        )
-        self.plain_text_tags: Set[str] = set(
-            [
-                # tags that will be joined together in a paragraph
-                "a",
-                "abbr",
-                "acronym",
-                "label",
-                "span",
-                "time",
-            ]
-        )
+        self.bad_tags: Set[str] = {
+            # tag names to remove
+            "address",
+            "amp-auto-ads",
+            "audio",
+            "button",
+            "figcaption",
+            "footer",
+            "form",
+            "header",
+            "iframe",
+            "input",
+            "ins",
+            "map",
+            "nav",
+            "noscript",
+            "object",
+            "output",
+            "pirate",
+            "script",
+            "select",
+            "source",
+            "style",
+            "textarea",
+            "tfoot",
+            "video",
+        }
+        self.bad_css: Set[str] = {
+            # css selector to select and remove tags
+            ".adblock-service",
+            ".sharedaddy",
+            ".saboxplugin-wrap",
+            ".adbox",
+            ".ads-middle",
+            ".ads",
+            ".adsbygoogle",
+            ".adsense-code",
+            ".cb_p6_patreon_button",
+            ".code-block",
+            ".ezoic-ad-adaptive",
+            ".ezoic-ad",
+            ".ezoic-adpicker-ad",
+            ".googlepublisherads",
+            ".inline-ad-slot",
+            ".jp-relatedposts",
+            ".sharedaddy",
+            ".wp-post-navigation",
+            "a[href*='patreon.com']",
+            "a[href*='paypal.me']",
+        }
+        self.p_block_tags: Set[str] = {
+            # tags that can be used as paragraph break
+            "article",
+            "aside",
+            "div",
+            "h1",
+            "h2",
+            "h3",
+            "h4",
+            "h5",
+            "h6",
+            "main",
+            "p",
+            "section",
+        }
+        self.unchanged_tags: Set[str] = {
+            # tags to keep unchanged with text and attributes
+            "canvas",
+            "img",
+            "pre",
+        }
+        self.plain_text_tags: Set[str] = {
+            # tags that will be joined together in a paragraph
+            "a",
+            "abbr",
+            "acronym",
+            "label",
+            "span",
+            "time",
+        }
         self.substitutions: Dict[str, str] = {
             # replace one string with another one
             # "&": "&amp;",
             # "“s": "'s",
             # "”s": "'s",
         }
-        self.whitelist_attributes: Set[str] = set(
-            [
-                # the attributes to keep while cleaning a tag
-                "src",
-                "style",
-                # table and table children attributes
-                "colspan",
-                "rowspan",
-                "headers",
-                "scope",
-                "axis",
-                "id",  # id required for headers ref
-            ]
-        )
-        self.whitelist_css_property: Set[str] = set(
-            [
-                # the css styles to keep while cleaning style tag
-                "font-style",
-                "font-weight",
-            ]
-        )
-        self.image_src_attributes: Set[str] = set(
-            [
-                "data-lazy-src",
-                "data-src",
-                "src",
-            ]
-        )
+        self.whitelist_attributes: Set[str] = {
+            # the attributes to keep while cleaning a tag
+            "src",
+            "style",
+            # table and table children attributes
+            "colspan",
+            "rowspan",
+            "headers",
+            "scope",
+            "axis",
+            "id",  # id required for headers ref
+        }
+        self.whitelist_css_property: Set[str] = {
+            # the css styles to keep while cleaning style tag
+            "font-style",
+            "font-weight",
+        }
+        self.image_src_attributes: Set[str] = {
+            "data-lazy-src",
+            "data-src",
+            "src",
+        }
 
     def extract_contents(self, tag) -> str:
         self.clean_contents(tag)
@@ -217,14 +203,14 @@ def clean_text(self, text) -> str:
         )
         return text
 
-    def extract_on_duplicate_sibling(self, tag: Tag):
+    def extract_on_duplicate_sibling(self, tag: TAG_LIKE):
         next_tag = tag.next_sibling
         if not isinstance(next_tag, Tag):
             return
         if next_tag.name == tag.name:
             tag.extract()
 
-    def clean_attributes(self, tag: Tag) -> dict:
+    def clean_attributes(self, tag: TAG_LIKE):
         attrs = {}
         for name, value in tag.attrs.items():
             if name not in self.whitelist_attributes:
@@ -246,9 +232,9 @@ def tag_contains_bad_text(self, tag: Tag) -> bool:
             if not isinstance(pattern, re.Pattern):
                 pattern = re.compile(pattern, re.M)
                 self.bad_tag_text_pairs[tag.name] = pattern
-        return pattern.search(tag.text)
+        return bool(pattern.search(tag.text))
 
-    def clean_image(self, tag: Tag):
+    def clean_image(self, tag: TAG_LIKE):
         src = None
         for name in self.image_src_attributes:
             src = tag.get(name)
@@ -326,4 +312,4 @@ def contains_bad_texts(self, text: str) -> bool:
         if not hasattr(self, "__blacklist__"):
             pattern = re.compile("|".join(["(%s)" % p for p in self.bad_text_regex]))
             self.__blacklist__ = pattern
-        return self.__blacklist__.search(text)
+        return bool(self.__blacklist__.search(text))

From 02f44f4b2b3a9a10e8a6a103c0bc62b95dd39260 Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 3 Mar 2024 13:35:22 +0100
Subject: [PATCH 2/4] core: improve typing, fix some type assignments, docstrings

use pathlib in some places
---
 lncrawl/core/app.py        | 5 +++--
 lncrawl/core/crawler.py    | 6 ++++--
 lncrawl/core/downloader.py | 4 ++--
 lncrawl/core/logconfig.py  | 4 ++--
 lncrawl/core/novel_info.py | 3 ++-
 lncrawl/core/proxy.py      | 4 ++--
 lncrawl/core/taskman.py    | 3 ++-
 7 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/lncrawl/core/app.py b/lncrawl/core/app.py
index 83f92f8e0..809004725 100644
--- a/lncrawl/core/app.py
+++ b/lncrawl/core/app.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import shutil
+from pathlib import Path
 from threading import Thread
 from typing import Dict, List, Optional, Tuple
 from urllib.parse import urlparse
@@ -167,7 +168,7 @@ def start_download(self):
         fetch_chapter_images(self)
         save_metadata(self, True)
 
-        if not self.output_formats.get("json", False):
+        if not self.output_formats.get(OutputFormat.json.value, False):
             shutil.rmtree(os.path.join(self.output_path, "json"), ignore_errors=True)
 
         if self.can_do("logout"):
             self.crawler.logout()
@@ -233,7 +234,7 @@ def compress_books(self, archive_singles=False):
                 logger.info("Not archiving single file inside %s" % root_dir)
                 archived_file = os.path.join(root_dir, file_list[0])
             else:
-                base_path = os.path.join(self.output_path, output_name)
+                base_path = Path(self.output_path) / output_name
                 logger.info("Compressing %s to %s" % (root_dir, base_path))
                 archived_file = shutil.make_archive(
                     base_path,
diff --git a/lncrawl/core/crawler.py b/lncrawl/core/crawler.py
index bafde7435..20c89b00b 100644
--- a/lncrawl/core/crawler.py
+++ b/lncrawl/core/crawler.py
@@ -155,6 +155,7 @@ def download_chapters(
             unit="item",
             fail_fast=fail_fast,
         )
+        chapter = None
         for (index, future) in futures.items():
             try:
                 chapter = chapters[index]
@@ -162,8 +163,9 @@
                 self.extract_chapter_images(chapter)
                 chapter.success = True
             except Exception as e:
-                chapter.body = ""
-                chapter.success = False
+                if isinstance(chapter, Chapter):
+                    chapter.body = ""
+                    chapter.success = False
                 if isinstance(e, KeyboardInterrupt):
                     break
             finally:
diff --git a/lncrawl/core/downloader.py b/lncrawl/core/downloader.py
index 57d836a58..39bcf18a6 100644
--- a/lncrawl/core/downloader.py
+++ b/lncrawl/core/downloader.py
@@ -80,9 +80,9 @@ def fetch_chapter_body(app):
                 old_chapter = json.load(file)
                 chapter.update(**old_chapter)
             except FileNotFoundError:
-                logger.info("Missing File: %s Retrieved!" % (file_name))
+                logger.info("Missing File: %s Retrieved!" % file_name)
             except json.JSONDecodeError:
-                logger.info("Unable to decode JSON from the file: %s" % (file_name))
+                logger.info("Unable to decode JSON from the file: %s" % file_name)
             except Exception as e:
                 logger.exception("An error occurred while reading the file:", e)
diff --git a/lncrawl/core/logconfig.py b/lncrawl/core/logconfig.py
index 660b6cd50..898676c3f 100644
--- a/lncrawl/core/logconfig.py
+++ b/lncrawl/core/logconfig.py
@@ -65,8 +65,8 @@ def configure_logging():
     }
     if not log_file:
         del config["handlers"]["file"]
-        config["root"]["level"] = level
+        config["root"]["level"] = logging.getLevelName(level)
         config["root"]["handlers"] = ["console"]
-        config["handlers"]["console"]["level"] = level
+        config["handlers"]["console"]["level"] = logging.getLevelName(level)
 
     logging.config.dictConfig(config)
diff --git a/lncrawl/core/novel_info.py b/lncrawl/core/novel_info.py
index 77d487ef0..ef2c542cb 100644
--- a/lncrawl/core/novel_info.py
+++ b/lncrawl/core/novel_info.py
@@ -1,6 +1,7 @@
 import math
 import os
 import re
+from pathlib import Path
 from typing import Dict
 
 from .. import constants as C
@@ -109,5 +110,5 @@ def save_metadata(app, completed=False):
     )
 
     os.makedirs(app.output_path, exist_ok=True)
-    file_name = os.path.join(app.output_path, C.META_FILE_NAME)
+    file_name = Path(app.output_path) / C.META_FILE_NAME
     novel.to_json(file_name, encoding="utf-8", indent=2)
diff --git a/lncrawl/core/proxy.py b/lncrawl/core/proxy.py
index ed7dc2822..e2247ba40 100644
--- a/lncrawl/core/proxy.py
+++ b/lncrawl/core/proxy.py
@@ -156,7 +156,7 @@ def __find_proxies():
                     __proxy_list.setdefault(scheme, [])
                     if __proxy_visited_at.get(url, 0) + __proxy_ttl < time.time():
                         __validate_and_add(scheme, ip, url)
-                    __proxy_visited_at[url] = time.time()
+                    __proxy_visited_at[url] = int(time.time())
 
         wait_times = 3 * 60
         while wait_times and not __has_exit:
@@ -178,6 +178,6 @@ def start_proxy_fetcher():
     Thread(target=__find_proxies, daemon=False).start()
 
 
-def stop_proxy_fetcher():
+def stop_proxy_fetcher(*args, **kwargs):
     global __has_exit
     __has_exit = True
diff --git a/lncrawl/core/taskman.py b/lncrawl/core/taskman.py
index e9b186d1b..7fccb7f38 100644
--- a/lncrawl/core/taskman.py
+++ b/lncrawl/core/taskman.py
@@ -136,7 +136,7 @@ def domain_gate(self, hostname: str = ""):
         """Limit number of entry per hostname.
 
         Args:
-            url: A fully qualified url.
+            hostname: A fully qualified url.
 
         Returns:
             A semaphore object to wait.
@@ -179,6 +179,7 @@ def resolve_futures(
             disable_bar: Hides the progress bar if True.
             desc: The progress bar description
             unit: The progress unit name
+            fail_fast: Fail on first error
         """
         if not futures:
             return

From cf8b6126b7eb1468e7293e1ce8cb61e2cfdaa266 Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 3 Mar 2024 14:05:34 +0100
Subject: [PATCH 3/4] core: app, downloader, novel_info, sources: replace os
 with pathlib where possible

---
 lncrawl/core/app.py        | 28 ++++++++++++----------------
 lncrawl/core/downloader.py | 34 +++++++++++++++++-----------------
 lncrawl/core/novel_info.py |  3 +--
 lncrawl/core/sources.py    |  8 ++++----
 4 files changed, 34 insertions(+), 39 deletions(-)

diff --git a/lncrawl/core/app.py b/lncrawl/core/app.py
index 809004725..46a1744a8 100644
--- a/lncrawl/core/app.py
+++ b/lncrawl/core/app.py
@@ -1,6 +1,5 @@
 import atexit
 import logging
-import os
 import shutil
 from pathlib import Path
 from threading import Thread
@@ -149,15 +148,13 @@ def get_novel_info(self):
         )
 
         source_name = slugify(urlparse(self.crawler.home_url).netloc)
-        self.output_path = os.path.join(
-            C.DEFAULT_OUTPUT_PATH, source_name, self.good_file_name
-        )
+        self.output_path = Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name
 
     # ----------------------------------------------------------------------- #
 
     def start_download(self):
         """Requires: crawler, chapters, output_path"""
-        if not self.output_path or not os.path.isdir(self.output_path):
+        if not self.output_path or not Path(self.output_path).is_dir():
             raise LNException("Output path is not defined")
 
         assert self.crawler
@@ -169,7 +166,7 @@ def start_download(self):
         fetch_chapter_images(self)
         save_metadata(self, True)
 
         if not self.output_formats.get(OutputFormat.json.value, False):
-            shutil.rmtree(os.path.join(self.output_path, "json"), ignore_errors=True)
+            shutil.rmtree(Path(self.output_path) / "json", ignore_errors=True)
 
         if self.can_do("logout"):
             self.crawler.logout()
@@ -209,39 +206,38 @@ def compress_books(self, archive_singles=False):
         logger.info("Compressing output...")
 
         # Get which paths to be archived with their base names
-        path_to_process = []
+        path_to_process: list[tuple[Path, str]] = []
         for fmt in available_formats:
-            root_dir = os.path.join(self.output_path, fmt)
-            if os.path.isdir(root_dir):
+            root_dir: Path = Path(self.output_path) / fmt
+            if root_dir.is_dir():
                 path_to_process.append(
-                    [root_dir, self.good_file_name + " (" + fmt + ")"]
+                    (root_dir, self.good_file_name + " (" + fmt + ")")
                 )
 
         # Archive files
         self.archived_outputs = []
         for root_dir, output_name in path_to_process:
-            file_list = os.listdir(root_dir)
+            file_list = list(root_dir.glob("*"))
            if len(file_list) == 0:
                 logger.info("It has no files: %s", root_dir)
                 continue
 
-            archived_file = None
             if (
                 len(file_list) == 1
                 and not archive_singles
-                and not os.path.isdir(os.path.join(root_dir, file_list[0]))
+                and not (root_dir / file_list[0]).is_dir()
             ):
                 logger.info("Not archiving single file inside %s" % root_dir)
-                archived_file = os.path.join(root_dir, file_list[0])
+                archived_file = (root_dir / file_list[0]).as_posix()
             else:
                 base_path = Path(self.output_path) / output_name
                 logger.info("Compressing %s to %s" % (root_dir, base_path))
                 archived_file = shutil.make_archive(
-                    base_path,
+                    base_path.as_posix(),
                     format="zip",
                     root_dir=root_dir,
                 )
 
-            logger.info("Compressed: %s", os.path.basename(archived_file))
+            logger.info("Compressed: %s", Path(archived_file).name)
             if archived_file:
                 self.archived_outputs.append(archived_file)
diff --git a/lncrawl/core/downloader.py b/lncrawl/core/downloader.py
index 39bcf18a6..09bea39db 100644
--- a/lncrawl/core/downloader.py
+++ b/lncrawl/core/downloader.py
@@ -3,7 +3,7 @@
 """
 import json
 import logging
-import os
+from pathlib import Path
 
 from ..models.chapter import Chapter
 from ..utils.imgen import generate_cover_image
@@ -17,13 +17,13 @@ def _chapter_file(
     output_path: str,
     pack_by_volume: bool,
 ):
-    dir_name = os.path.join(output_path, "json")
+    dir_name = Path(output_path) / "json"
     if pack_by_volume:
         vol_name = "Volume " + str(chapter.volume).rjust(2, "0")
-        dir_name = os.path.join(dir_name, vol_name)
+        dir_name = dir_name / vol_name
 
     chapter_name = str(chapter.id).rjust(5, "0")
-    json_file = os.path.join(dir_name, chapter_name + ".json")
+    json_file = dir_name / (chapter_name + ".json")
     return json_file
@@ -54,8 +54,8 @@ def _save_chapter(app, chapter: Chapter):
         output_path=app.output_path,
         pack_by_volume=app.pack_by_volume,
     )
-    os.makedirs(os.path.dirname(file_name), exist_ok=True)
-    with open(file_name, "w", encoding="utf-8") as fp:
+    file_name.parent.mkdir(parents=True, exist_ok=True)
+    with file_name.open("w", encoding="utf-8") as fp:
         json.dump(chapter, fp, ensure_ascii=False)
@@ -100,22 +100,22 @@ def fetch_chapter_body(app):
     logger.info(f"Processed {len(app.chapters)} chapters [{app.progress} fetched]")
 
 
-def _fetch_content_image(app, url, image_file):
+def _fetch_content_image(app, url, image_file: Path):
     from .app import App
 
     assert isinstance(app, App)
-    if url and not os.path.isfile(image_file):
+    if url and not (image_file.exists() and image_file.is_file()):
         try:
             img = app.crawler.download_image(url)
-            os.makedirs(os.path.dirname(image_file), exist_ok=True)
+            image_file.parent.mkdir(parents=True, exist_ok=True)
             if img.mode not in ("L", "RGB", "YCbCr", "RGBX"):
                 if img.mode == "RGBa":
                     #RGBa -> RGB isn't supported so we go through RGBA first
                     img.convert("RGBA").convert("RGB")
                 else:
                     img = img.convert("RGB")
-            img.save(image_file, "JPEG", optimized=True)
+            img.save(image_file.as_posix(), "JPEG", optimized=True)
             img.close()
             logger.debug("Saved image: %s", image_file)
         finally:
@@ -129,7 +129,7 @@ def _fetch_cover_image(app):
     assert app.crawler is not None
 
     filename = "cover.jpg"
-    cover_file = os.path.join(app.output_path, filename)
+    cover_file = Path(app.output_path) / filename
     if app.crawler.novel_cover:
         try:
             _fetch_content_image(
@@ -141,12 +141,12 @@ def _fetch_cover_image(app):
             if logger.isEnabledFor(logging.DEBUG):
                 logger.exception("Failed to download cover", e)
 
-    if not os.path.isfile(cover_file):
-        generate_cover_image(cover_file)
+    if not cover_file.exists() and cover_file.is_file():
+        generate_cover_image(cover_file.as_posix())
     app.progress += 1
     app.book_cover = cover_file
-    assert os.path.isfile(app.book_cover), "Failed to download or generate cover image"
+    assert Path(app.book_cover).is_file(), "Failed to download or generate cover image"
 
 
 def _discard_failed_images(app, chapter, failed):
@@ -191,7 +191,7 @@ def fetch_chapter_images(app):
     ]
 
     # download content images
-    image_folder = os.path.join(app.output_path, "images")
+    image_folder = Path(app.output_path) / "images"
     images_to_download = set(
         [
             (filename, url)
@@ -204,7 +204,7 @@ def fetch_chapter_images(app):
             _fetch_content_image,
             app,
             url,
-            os.path.join(image_folder, filename),
+            image_folder / filename,
         )
         for filename, url in images_to_download
     ]
@@ -215,7 +215,7 @@ def fetch_chapter_images(app):
         failed = [
             filename
             for filename, url in images_to_download
-            if not os.path.isfile(os.path.join(image_folder, filename))
+            if not (image_folder / filename).is_file()
         ]
     finally:
         logger.info("Processed %d images [%d failed]" % (app.progress, len(failed)))
diff --git a/lncrawl/core/novel_info.py b/lncrawl/core/novel_info.py
index ef2c542cb..a993ed2d3 100644
--- a/lncrawl/core/novel_info.py
+++ b/lncrawl/core/novel_info.py
@@ -1,5 +1,4 @@
 import math
-import os
 import re
 from pathlib import Path
 from typing import Dict
@@ -109,6 +108,6 @@ def save_metadata(app, completed=False):
         ),
     )
 
-    os.makedirs(app.output_path, exist_ok=True)
+    Path(app.output_path).mkdir(parents=True, exist_ok=True)
     file_name = Path(app.output_path) / C.META_FILE_NAME
     novel.to_json(file_name, encoding="utf-8", indent=2)
diff --git a/lncrawl/core/sources.py b/lncrawl/core/sources.py
index 79c88d8c3..9c4ea6c9e 100644
--- a/lncrawl/core/sources.py
+++ b/lncrawl/core/sources.py
@@ -83,7 +83,7 @@ def __download_data(url: str):
 
 __index_fetch_internval_in_seconds = 30 * 60
 __master_index_file_url = "https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/master/sources/_index.json"
-__user_data_path = Path(os.path.expanduser("~")) / ".lncrawl"
+__user_data_path = Path("~").expanduser() / ".lncrawl"
 __local_data_path = Path(__file__).parent.parent.absolute()
 if not (__local_data_path / "sources").is_dir():
     __local_data_path = __local_data_path.parent
@@ -110,7 +110,7 @@ def __load_current_index():
 
 def __save_current_index():
     index_file = __user_data_path / "sources" / "_index.json"
-    os.makedirs(index_file.parent, exist_ok=True)
+    index_file.parent.mkdir(parents=True, exist_ok=True)
 
     logger.debug("Saving current index data to %s", index_file)
     with open(index_file, "w", encoding="utf8") as fp:
@@ -170,12 +170,12 @@ def __save_source_data(source_id, data):
     dst_dir = dst_file.parent
     temp_file = dst_dir / ("." + dst_file.name)
 
-    os.makedirs(dst_dir, exist_ok=True)
+    dst_dir.mkdir(parents=True, exist_ok=True)
     with open(temp_file, "wb") as fp:
         fp.write(data)
 
     if dst_file.exists():
-        os.remove(dst_file)
+        dst_file.unlink()
     temp_file.rename(dst_file)
 
     global __current_index

From f6b35642c2e16ff434df4b8518903a2105232777 Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 10 Mar 2024 12:03:39 +0100
Subject: [PATCH 4/4] core cleaner: update typing

---
 lncrawl/core/cleaner.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/lncrawl/core/cleaner.py b/lncrawl/core/cleaner.py
index e1b1eef48..3856c4ac3 100644
--- a/lncrawl/core/cleaner.py
+++ b/lncrawl/core/cleaner.py
@@ -4,7 +4,7 @@
 import unicodedata
 from typing import AnyStr, Dict, List, Set, Union
 
-from bs4 import Comment, Tag, PageElement
+from bs4 import Comment, Tag
 
 LINE_SEP = "<br>"
@@ -16,8 +16,6 @@
 NONPRINTABLE = itertools.chain(range(0x00, 0x20), range(0x7F, 0xA0), INVISIBLE_CHARS)
 NONPRINTABLE_MAPPING = {character: None for character in NONPRINTABLE}
 
-TAG_LIKE = Union[Comment, PageElement, Tag]
-
 
 class TextCleaner:
     def __init__(self) -> None:
@@ -176,7 +174,8 @@ def clean_contents(self, div):
         for tag in div.find_all(True):
             if isinstance(tag, Comment):
                 tag.extract()  # Remove comments
-            elif not isinstance(tag, Tag):
+                continue
+            if not isinstance(tag, Tag):
                 continue  # Skip elements that are not a Tag
             if tag.name in self.bad_tags:
                 tag.extract()  # Remove bad tags
@@ -203,14 +202,14 @@ def clean_text(self, text) -> str:
         )
         return text
 
-    def extract_on_duplicate_sibling(self, tag: TAG_LIKE):
+    def extract_on_duplicate_sibling(self, tag: Tag):
         next_tag = tag.next_sibling
         if not isinstance(next_tag, Tag):
             return
         if next_tag.name == tag.name:
             tag.extract()
 
-    def clean_attributes(self, tag: TAG_LIKE):
+    def clean_attributes(self, tag: Tag):
         attrs = {}
         for name, value in tag.attrs.items():
             if name not in self.whitelist_attributes:
@@ -234,7 +233,7 @@ def tag_contains_bad_text(self, tag: Tag) -> bool:
                 self.bad_tag_text_pairs[tag.name] = pattern
         return bool(pattern.search(tag.text))
 
-    def clean_image(self, tag: TAG_LIKE):
+    def clean_image(self, tag: Tag):
         src = None
         for name in self.image_src_attributes:
             src = tag.get(name)