Merge pull request #2293 from camp00000/core-adjustments-extensive
Core adjustments (typing, misc fixes, replacing os.path with pathlib.Path)
dipu-bd authored Mar 29, 2024
2 parents 1b8c640 + f6b3564 commit b009bbd
Showing 9 changed files with 158 additions and 173 deletions.
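The recurring pattern across these diffs is replacing os.path and os calls with their pathlib.Path equivalents. A minimal sketch of the before/after mapping, using a hypothetical output directory rather than any path from the codebase:

import shutil
from pathlib import Path

# Hypothetical directory names, for illustration only
output_dir = Path("output") / "example-source" / "example-book"   # was: os.path.join("output", source, book)

if output_dir.is_dir():                                            # was: os.path.isdir(output_dir)
    for item in output_dir.glob("*"):                              # was: os.listdir(output_dir); glob() yields Path objects
        print(item.name)                                           # was: os.path.basename(item)
    shutil.rmtree(output_dir / "json", ignore_errors=True)         # was: shutil.rmtree(os.path.join(output_dir, "json"), ignore_errors=True)
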
33 changes: 15 additions & 18 deletions lncrawl/core/app.py
@@ -1,7 +1,7 @@
import atexit
import logging
import os
import shutil
from pathlib import Path
from threading import Thread
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse
@@ -148,15 +148,13 @@ def get_novel_info(self):
)

source_name = slugify(urlparse(self.crawler.home_url).netloc)
self.output_path = os.path.join(
C.DEFAULT_OUTPUT_PATH, source_name, self.good_file_name
)
self.output_path = Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name

# ----------------------------------------------------------------------- #

def start_download(self):
"""Requires: crawler, chapters, output_path"""
if not self.output_path or not os.path.isdir(self.output_path):
if not self.output_path or not Path(self.output_path).is_dir():
raise LNException("Output path is not defined")

assert self.crawler
@@ -167,8 +165,8 @@ def start_download(self):
fetch_chapter_images(self)
save_metadata(self, True)

if not self.output_formats.get("json", False):
shutil.rmtree(os.path.join(self.output_path, "json"), ignore_errors=True)
if not self.output_formats.get(OutputFormat.json.value, False):
shutil.rmtree(Path(self.output_path) / "json", ignore_errors=True)

if self.can_do("logout"):
self.crawler.logout()
@@ -208,39 +206,38 @@ def compress_books(self, archive_singles=False):
logger.info("Compressing output...")

# Get which paths to be archived with their base names
path_to_process = []
path_to_process: list[tuple[Path, str]] = []
for fmt in available_formats:
root_dir = os.path.join(self.output_path, fmt)
if os.path.isdir(root_dir):
root_dir: Path = Path(self.output_path) / fmt
if root_dir.is_dir():
path_to_process.append(
[root_dir, self.good_file_name + " (" + fmt + ")"]
(root_dir, self.good_file_name + " (" + fmt + ")")
)

# Archive files
self.archived_outputs = []
for root_dir, output_name in path_to_process:
file_list = os.listdir(root_dir)
file_list = list(root_dir.glob("*"))
if len(file_list) == 0:
logger.info("It has no files: %s", root_dir)
continue

archived_file = None
if (
len(file_list) == 1
and not archive_singles
and not os.path.isdir(os.path.join(root_dir, file_list[0]))
and not (root_dir / file_list[0]).is_dir()
):
logger.info("Not archiving single file inside %s" % root_dir)
archived_file = os.path.join(root_dir, file_list[0])
archived_file = (root_dir / file_list[0]).as_posix()
else:
base_path = os.path.join(self.output_path, output_name)
base_path = Path(self.output_path) / output_name
logger.info("Compressing %s to %s" % (root_dir, base_path))
archived_file = shutil.make_archive(
base_path,
base_path.as_posix(),
format="zip",
root_dir=root_dir,
)
logger.info("Compressed: %s", os.path.basename(archived_file))
logger.info("Compressed: %s", Path(archived_file).name)

if archived_file:
self.archived_outputs.append(archived_file)
229 changes: 107 additions & 122 deletions lncrawl/core/cleaner.py
@@ -32,96 +32,86 @@ def __init__(self) -> None:
# the tag will be removed if the text inside contains the pattern
}

self.bad_tags: Set[str] = set(
[
# tag names to remove
"address",
"amp-auto-ads",
"audio",
"button",
"figcaption",
"footer",
"form",
"header",
"iframe",
"input",
"ins",
"map",
"nav",
"noscript",
"object",
"output",
"pirate",
"script",
"select",
"source",
"style",
"textarea",
"tfoot",
"video",
]
)
self.bad_css: Set[str] = set(
[
# css selector to select and remove tags
".adblock-service",
".sharedaddy",
".saboxplugin-wrap",
".adbox",
".ads-middle",
".ads",
".adsbygoogle",
".adsense-code",
".cb_p6_patreon_button",
".code-block",
".ezoic-ad-adaptive",
".ezoic-ad",
".ezoic-adpicker-ad",
".googlepublisherads",
".inline-ad-slot",
".jp-relatedposts",
".sharedaddy",
".wp-post-navigation",
"a[href*='patreon.com']",
"a[href*='paypal.me']",
]
)
self.p_block_tags: Set[str] = set(
[
# tags that can be used as paragraph break
"article",
"aside",
"div",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"main",
"p",
"section",
]
)
self.unchanged_tags: Set[str] = set(
[
# tags to keep unchanged with text and attributes
"canvas",
"img",
"pre",
]
)
self.plain_text_tags: Set[str] = set(
[
# tags that will be joined together in a paragraph
"a",
"abbr",
"acronym",
"label",
"span",
"time",
]
)
self.bad_tags: Set[str] = {
# tag names to remove
"address",
"amp-auto-ads",
"audio",
"button",
"figcaption",
"footer",
"form",
"header",
"iframe",
"input",
"ins",
"map",
"nav",
"noscript",
"object",
"output",
"pirate",
"script",
"select",
"source",
"style",
"textarea",
"tfoot",
"video",
}
self.bad_css: Set[str] = {
# css selector to select and remove tags
".adblock-service",
".sharedaddy",
".saboxplugin-wrap",
".adbox",
".ads-middle",
".ads",
".adsbygoogle",
".adsense-code",
".cb_p6_patreon_button",
".code-block",
".ezoic-ad-adaptive",
".ezoic-ad",
".ezoic-adpicker-ad",
".googlepublisherads",
".inline-ad-slot",
".jp-relatedposts",
".sharedaddy",
".wp-post-navigation",
"a[href*='patreon.com']",
"a[href*='paypal.me']",
}
self.p_block_tags: Set[str] = {
# tags that can be used as paragraph break
"article",
"aside",
"div",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"main",
"p",
"section",
}
self.unchanged_tags: Set[str] = {
# tags to keep unchanged with text and attributes
"canvas",
"img",
"pre",
}
self.plain_text_tags: Set[str] = {
# tags that will be joined together in a paragraph
"a",
"abbr",
"acronym",
"label",
"span",
"time",
}
self.substitutions: Dict[str, str] = {
# replace one string with another one
# "&": "&",
@@ -133,34 +123,28 @@ def __init__(self) -> None:
# "“s": "'s",
# "”s": "'s",
}
self.whitelist_attributes: Set[str] = set(
[
# the attributes to keep while cleaning a tag
"src",
"style",
# table and table children attributes
"colspan",
"rowspan",
"headers",
"scope",
"axis",
"id", # id required for headers ref
]
)
self.whitelist_css_property: Set[str] = set(
[
# the css styles to keep while cleaning style tag
"font-style",
"font-weight",
]
)
self.image_src_attributes: Set[str] = set(
[
"data-lazy-src",
"data-src",
"src",
]
)
self.whitelist_attributes: Set[str] = {
# the attributes to keep while cleaning a tag
"src",
"style",
# table and table children attributes
"colspan",
"rowspan",
"headers",
"scope",
"axis",
"id", # id required for headers ref
}
self.whitelist_css_property: Set[str] = {
# the css styles to keep while cleaning style tag
"font-style",
"font-weight",
}
self.image_src_attributes: Set[str] = {
"data-lazy-src",
"data-src",
"src",
}

def extract_contents(self, tag) -> str:
self.clean_contents(tag)
@@ -190,7 +174,8 @@ def clean_contents(self, div):
for tag in div.find_all(True):
if isinstance(tag, Comment):
tag.extract() # Remove comments
elif not isinstance(tag, Tag):
continue
if not isinstance(tag, Tag):
continue # Skip elements that are not a Tag
if tag.name in self.bad_tags:
tag.extract() # Remove bad tags
@@ -224,7 +209,7 @@ def extract_on_duplicate_sibling(self, tag: Tag):
if next_tag.name == tag.name:
tag.extract()

def clean_attributes(self, tag: Tag) -> dict:
def clean_attributes(self, tag: Tag):
attrs = {}
for name, value in tag.attrs.items():
if name not in self.whitelist_attributes:
@@ -246,7 +231,7 @@ def tag_contains_bad_text(self, tag: Tag) -> bool:
if not isinstance(pattern, re.Pattern):
pattern = re.compile(pattern, re.M)
self.bad_tag_text_pairs[tag.name] = pattern
return pattern.search(tag.text)
return bool(pattern.search(tag.text))

def clean_image(self, tag: Tag):
src = None
@@ -326,4 +311,4 @@ def contains_bad_texts(self, text: str) -> bool:
if not hasattr(self, "__blacklist__"):
pattern = re.compile("|".join(["(%s)" % p for p in self.bad_text_regex]))
self.__blacklist__ = pattern
return self.__blacklist__.search(text)
return bool(self.__blacklist__.search(text))
6 changes: 4 additions & 2 deletions lncrawl/core/crawler.py
@@ -155,15 +155,17 @@ def download_chapters(
unit="item",
fail_fast=fail_fast,
)
chapter = None
for (index, future) in futures.items():
try:
chapter = chapters[index]
chapter.body = future.result()
self.extract_chapter_images(chapter)
chapter.success = True
except Exception as e:
chapter.body = ""
chapter.success = False
if isinstance(chapter, Chapter):
chapter.body = ""
chapter.success = False
if isinstance(e, KeyboardInterrupt):
break
finally:

