From 522978b0dc707b8aabcbdf6a728b7f25a2baa4aa Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 3 Mar 2024 13:28:45 +0100
Subject: [PATCH 1/4] cleaner: use set literals, improve typing

---
 lncrawl/core/cleaner.py | 234 +++++++++++++++++++---------------------
 1 file changed, 110 insertions(+), 124 deletions(-)

diff --git a/lncrawl/core/cleaner.py b/lncrawl/core/cleaner.py
index eebf1df87..e1b1eef48 100644
--- a/lncrawl/core/cleaner.py
+++ b/lncrawl/core/cleaner.py
@@ -4,7 +4,7 @@
 import unicodedata
 from typing import AnyStr, Dict, List, Set, Union
 
-from bs4 import Comment, Tag
+from bs4 import Comment, Tag, PageElement
 
 LINE_SEP = "<br>"
@@ -16,6 +16,8 @@
 NONPRINTABLE = itertools.chain(range(0x00, 0x20), range(0x7F, 0xA0), INVISIBLE_CHARS)
 NONPRINTABLE_MAPPING = {character: None for character in NONPRINTABLE}
 
+TAG_LIKE = Union[Comment, PageElement, Tag]
+
 
 class TextCleaner:
     def __init__(self) -> None:
@@ -32,96 +34,86 @@ def __init__(self) -> None:
             # the tag will be removed if the text inside contains the pattern
         }
 
-        self.bad_tags: Set[str] = set(
-            [
-                # tag names to remove
-                "address",
-                "amp-auto-ads",
-                "audio",
-                "button",
-                "figcaption",
-                "footer",
-                "form",
-                "header",
-                "iframe",
-                "input",
-                "ins",
-                "map",
-                "nav",
-                "noscript",
-                "object",
-                "output",
-                "pirate",
-                "script",
-                "select",
-                "source",
-                "style",
-                "textarea",
-                "tfoot",
-                "video",
-            ]
-        )
-        self.bad_css: Set[str] = set(
-            [
-                # css selector to select and remove tags
-                ".adblock-service",
-                ".sharedaddy",
-                ".saboxplugin-wrap",
-                ".adbox",
-                ".ads-middle",
-                ".ads",
-                ".adsbygoogle",
-                ".adsense-code",
-                ".cb_p6_patreon_button",
-                ".code-block",
-                ".ezoic-ad-adaptive",
-                ".ezoic-ad",
-                ".ezoic-adpicker-ad",
-                ".googlepublisherads",
-                ".inline-ad-slot",
-                ".jp-relatedposts",
-                ".sharedaddy",
-                ".wp-post-navigation",
-                "a[href*='patreon.com']",
-                "a[href*='paypal.me']",
-            ]
-        )
-        self.p_block_tags: Set[str] = set(
-            [
-                # tags that can be used as paragraph break
-                "article",
-                "aside",
-                "div",
-                "h1",
-                "h2",
-                "h3",
-                "h4",
-                "h5",
-                "h6",
-                "main",
-                "p",
-                "section",
-            ]
-        )
-        self.unchanged_tags: Set[str] = set(
-            [
-                # tags to keep unchanged with text and attributes
-                "canvas",
-                "img",
-                "pre",
-            ]
-        )
-        self.plain_text_tags: Set[str] = set(
-            [
-                # tags that will be joined together in a paragraph
-                "a",
-                "abbr",
-                "acronym",
-                "label",
-                "span",
-                "time",
-            ]
-        )
+        self.bad_tags: Set[str] = {
+            # tag names to remove
+            "address",
+            "amp-auto-ads",
+            "audio",
+            "button",
+            "figcaption",
+            "footer",
+            "form",
+            "header",
+            "iframe",
+            "input",
+            "ins",
+            "map",
+            "nav",
+            "noscript",
+            "object",
+            "output",
+            "pirate",
+            "script",
+            "select",
+            "source",
+            "style",
+            "textarea",
+            "tfoot",
+            "video",
+        }
+        self.bad_css: Set[str] = {
+            # css selector to select and remove tags
+            ".adblock-service",
+            ".sharedaddy",
+            ".saboxplugin-wrap",
+            ".adbox",
+            ".ads-middle",
+            ".ads",
+            ".adsbygoogle",
+            ".adsense-code",
+            ".cb_p6_patreon_button",
+            ".code-block",
+            ".ezoic-ad-adaptive",
+            ".ezoic-ad",
+            ".ezoic-adpicker-ad",
+            ".googlepublisherads",
+            ".inline-ad-slot",
+            ".jp-relatedposts",
+            ".sharedaddy",
+            ".wp-post-navigation",
+            "a[href*='patreon.com']",
+            "a[href*='paypal.me']",
+        }
+        self.p_block_tags: Set[str] = {
+            # tags that can be used as paragraph break
+            "article",
+            "aside",
+            "div",
+            "h1",
+            "h2",
+            "h3",
+            "h4",
+            "h5",
+            "h6",
+            "main",
+            "p",
+            "section",
+        }
+        self.unchanged_tags: Set[str] = {
+            # tags to keep unchanged with text and attributes
+            "canvas",
+            "img",
+            "pre",
+        }
+        self.plain_text_tags: Set[str] = {
+            # tags that will be joined together in a paragraph
+            "a",
+            "abbr",
+            "acronym",
+            "label",
+            "span",
+            "time",
+        }
         self.substitutions: Dict[str, str] = {
             # replace one string with another one
             # "&": "&amp;",
             # "“s": "'s",
             # "”s": "'s",
         }
-        self.whitelist_attributes: Set[str] = set(
-            [
-                # the attributes to keep while cleaning a tag
-                "src",
-                "style",
-                # table and table children attributes
-                "colspan",
-                "rowspan",
-                "headers",
-                "scope",
-                "axis",
-                "id",  # id required for headers ref
-            ]
-        )
-        self.whitelist_css_property: Set[str] = set(
-            [
-                # the css styles to keep while cleaning style tag
-                "font-style",
-                "font-weight",
-            ]
-        )
-        self.image_src_attributes: Set[str] = set(
-            [
-                "data-lazy-src",
-                "data-src",
-                "src",
-            ]
-        )
+        self.whitelist_attributes: Set[str] = {
+            # the attributes to keep while cleaning a tag
+            "src",
+            "style",
+            # table and table children attributes
+            "colspan",
+            "rowspan",
+            "headers",
+            "scope",
+            "axis",
+            "id",  # id required for headers ref
+        }
+        self.whitelist_css_property: Set[str] = {
+            # the css styles to keep while cleaning style tag
+            "font-style",
+            "font-weight",
+        }
+        self.image_src_attributes: Set[str] = {
+            "data-lazy-src",
+            "data-src",
+            "src",
+        }
 
     def extract_contents(self, tag) -> str:
         self.clean_contents(tag)
@@ -217,14 +203,14 @@ def clean_text(self, text) -> str:
         )
         return text
 
-    def extract_on_duplicate_sibling(self, tag: Tag):
+    def extract_on_duplicate_sibling(self, tag: TAG_LIKE):
         next_tag = tag.next_sibling
         if not isinstance(next_tag, Tag):
             return
         if next_tag.name == tag.name:
             tag.extract()
 
-    def clean_attributes(self, tag: Tag) -> dict:
+    def clean_attributes(self, tag: TAG_LIKE):
         attrs = {}
         for name, value in tag.attrs.items():
             if name not in self.whitelist_attributes:
@@ -246,9 +232,9 @@ def tag_contains_bad_text(self, tag: Tag) -> bool:
             if not isinstance(pattern, re.Pattern):
                 pattern = re.compile(pattern, re.M)
                 self.bad_tag_text_pairs[tag.name] = pattern
-        return pattern.search(tag.text)
+        return bool(pattern.search(tag.text))
 
-    def clean_image(self, tag: Tag):
+    def clean_image(self, tag: TAG_LIKE):
         src = None
         for name in self.image_src_attributes:
             src = tag.get(name)
@@ -326,4 +312,4 @@ def contains_bad_texts(self, text: str) -> bool:
         if not hasattr(self, "__blacklist__"):
             pattern = re.compile("|".join(["(%s)" % p for p in self.bad_text_regex]))
             self.__blacklist__ = pattern
-        return self.__blacklist__.search(text)
+        return bool(self.__blacklist__.search(text))

From 02f44f4b2b3a9a10e8a6a103c0bc62b95dd39260 Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 3 Mar 2024 13:35:22 +0100
Subject: [PATCH 2/4] core: improve typing, fix some type assignments, docstrings

use pathlib in some places
---
 lncrawl/core/app.py        | 5 +++--
 lncrawl/core/crawler.py    | 6 ++++--
 lncrawl/core/downloader.py | 4 ++--
 lncrawl/core/logconfig.py  | 4 ++--
 lncrawl/core/novel_info.py | 3 ++-
 lncrawl/core/proxy.py      | 4 ++--
 lncrawl/core/taskman.py    | 3 ++-
 7 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/lncrawl/core/app.py b/lncrawl/core/app.py
index 83f92f8e0..809004725 100644
--- a/lncrawl/core/app.py
+++ b/lncrawl/core/app.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import shutil
+from pathlib import Path
 from threading import Thread
 from typing import Dict, List, Optional, Tuple
 from urllib.parse import urlparse
@@ -167,7 +168,7 @@ def start_download(self):
         fetch_chapter_images(self)
         save_metadata(self, True)
 
-        if not self.output_formats.get("json", False):
+        if not self.output_formats.get(OutputFormat.json.value, False):
             shutil.rmtree(os.path.join(self.output_path, "json"), ignore_errors=True)
 
         if self.can_do("logout"):
             self.crawler.logout()
@@ -233,7 +234,7 @@ def compress_books(self, archive_singles=False):
                 logger.info("Not archiving single file inside %s" % root_dir)
                 archived_file = os.path.join(root_dir, file_list[0])
             else:
-                base_path = os.path.join(self.output_path, output_name)
+                base_path = Path(self.output_path) / output_name
                 logger.info("Compressing %s to %s" % (root_dir, base_path))
                 archived_file = shutil.make_archive(
                     base_path,
diff --git a/lncrawl/core/crawler.py b/lncrawl/core/crawler.py
index bafde7435..20c89b00b 100644
--- a/lncrawl/core/crawler.py
+++ b/lncrawl/core/crawler.py
@@ -155,6 +155,7 @@ def download_chapters(
             unit="item",
             fail_fast=fail_fast,
         )
+        chapter = None
         for (index, future) in futures.items():
             try:
                 chapter = chapters[index]
@@ -162,8 +163,9 @@
                 self.extract_chapter_images(chapter)
                 chapter.success = True
             except Exception as e:
-                chapter.body = ""
-                chapter.success = False
+                if isinstance(chapter, Chapter):
+                    chapter.body = ""
+                    chapter.success = False
                 if isinstance(e, KeyboardInterrupt):
                     break
             finally:
diff --git a/lncrawl/core/downloader.py b/lncrawl/core/downloader.py
index 57d836a58..39bcf18a6 100644
--- a/lncrawl/core/downloader.py
+++ b/lncrawl/core/downloader.py
@@ -80,9 +80,9 @@ def fetch_chapter_body(app):
                 old_chapter = json.load(file)
                 chapter.update(**old_chapter)
             except FileNotFoundError:
-                logger.info("Missing File: %s Retrieved!" % (file_name))
+                logger.info("Missing File: %s Retrieved!" % file_name)
             except json.JSONDecodeError:
-                logger.info("Unable to decode JSON from the file: %s" % (file_name))
+                logger.info("Unable to decode JSON from the file: %s" % file_name)
             except Exception as e:
                 logger.exception("An error occurred while reading the file:", e)
diff --git a/lncrawl/core/logconfig.py b/lncrawl/core/logconfig.py
index 660b6cd50..898676c3f 100644
--- a/lncrawl/core/logconfig.py
+++ b/lncrawl/core/logconfig.py
@@ -65,8 +65,8 @@ def configure_logging():
     }
     if not log_file:
         del config["handlers"]["file"]
-        config["root"]["level"] = level
+        config["root"]["level"] = logging.getLevelName(level)
         config["root"]["handlers"] = ["console"]
-        config["handlers"]["console"]["level"] = level
+        config["handlers"]["console"]["level"] = logging.getLevelName(level)
 
     logging.config.dictConfig(config)
diff --git a/lncrawl/core/novel_info.py b/lncrawl/core/novel_info.py
index 77d487ef0..ef2c542cb 100644
--- a/lncrawl/core/novel_info.py
+++ b/lncrawl/core/novel_info.py
@@ -1,6 +1,7 @@
 import math
 import os
 import re
+from pathlib import Path
 from typing import Dict
 
 from .. import constants as C
@@ -109,5 +110,5 @@ def save_metadata(app, completed=False):
     )
 
     os.makedirs(app.output_path, exist_ok=True)
-    file_name = os.path.join(app.output_path, C.META_FILE_NAME)
+    file_name = Path(app.output_path) / C.META_FILE_NAME
     novel.to_json(file_name, encoding="utf-8", indent=2)
diff --git a/lncrawl/core/proxy.py b/lncrawl/core/proxy.py
index ed7dc2822..e2247ba40 100644
--- a/lncrawl/core/proxy.py
+++ b/lncrawl/core/proxy.py
@@ -156,7 +156,7 @@ def __find_proxies():
                     __proxy_list.setdefault(scheme, [])
                     if __proxy_visited_at.get(url, 0) + __proxy_ttl < time.time():
                         __validate_and_add(scheme, ip, url)
-                    __proxy_visited_at[url] = time.time()
+                    __proxy_visited_at[url] = int(time.time())
 
         wait_times = 3 * 60
         while wait_times and not __has_exit:
@@ -178,6 +178,6 @@ def start_proxy_fetcher():
     Thread(target=__find_proxies, daemon=False).start()
 
 
-def stop_proxy_fetcher():
+def stop_proxy_fetcher(*args, **kwargs):
     global __has_exit
     __has_exit = True
diff --git a/lncrawl/core/taskman.py b/lncrawl/core/taskman.py
index e9b186d1b..7fccb7f38 100644
--- a/lncrawl/core/taskman.py
+++ b/lncrawl/core/taskman.py
@@ -136,7 +136,7 @@ def domain_gate(self, hostname: str = ""):
         """Limit number of entry per hostname.
 
         Args:
-            url: A fully qualified url.
+            hostname: A fully qualified url.
 
         Returns:
             A semaphore object to wait.
@@ -179,6 +179,7 @@ def resolve_futures(
             disable_bar: Hides the progress bar if True.
             desc: The progress bar description
             unit: The progress unit name
+            fail_fast: Fail on first error
         """
         if not futures:
             return

From cf8b6126b7eb1468e7293e1ce8cb61e2cfdaa266 Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 3 Mar 2024 14:05:34 +0100
Subject: [PATCH 3/4] core: app, downloader, novel_info, sources: replace os
 with pathlib where possible

---
 lncrawl/core/app.py        | 28 ++++++++++++----------------
 lncrawl/core/downloader.py | 34 +++++++++++++++++-----------------
 lncrawl/core/novel_info.py |  3 +--
 lncrawl/core/sources.py    |  8 ++++----
 4 files changed, 34 insertions(+), 39 deletions(-)

diff --git a/lncrawl/core/app.py b/lncrawl/core/app.py
index 809004725..46a1744a8 100644
--- a/lncrawl/core/app.py
+++ b/lncrawl/core/app.py
@@ -1,6 +1,5 @@
 import atexit
 import logging
-import os
 import shutil
 from pathlib import Path
 from threading import Thread
@@ -149,15 +148,13 @@ def get_novel_info(self):
         )
 
         source_name = slugify(urlparse(self.crawler.home_url).netloc)
-        self.output_path = os.path.join(
-            C.DEFAULT_OUTPUT_PATH, source_name, self.good_file_name
-        )
+        self.output_path = Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name
 
     # ----------------------------------------------------------------------- #
 
     def start_download(self):
         """Requires: crawler, chapters, output_path"""
-        if not self.output_path or not os.path.isdir(self.output_path):
+        if not self.output_path or not Path(self.output_path).is_dir():
             raise LNException("Output path is not defined")
 
         assert self.crawler
@@ -169,7 +166,7 @@ def start_download(self):
         fetch_chapter_images(self)
         save_metadata(self, True)
 
         if not self.output_formats.get(OutputFormat.json.value, False):
-            shutil.rmtree(os.path.join(self.output_path, "json"), ignore_errors=True)
+            shutil.rmtree(Path(self.output_path) / "json", ignore_errors=True)
 
         if self.can_do("logout"):
             self.crawler.logout()
@@ -209,39 +206,38 @@ def compress_books(self, archive_singles=False):
         logger.info("Compressing output...")
 
         # Get which paths to be archived with their base names
-        path_to_process = []
+        path_to_process: list[tuple[Path, str]] = []
         for fmt in available_formats:
-            root_dir = os.path.join(self.output_path, fmt)
-            if os.path.isdir(root_dir):
+            root_dir: Path = Path(self.output_path) / fmt
+            if root_dir.is_dir():
                 path_to_process.append(
-                    [root_dir, self.good_file_name + " (" + fmt + ")"]
+                    (root_dir, self.good_file_name + " (" + fmt + ")")
                 )
 
         # Archive files
         self.archived_outputs = []
         for root_dir, output_name in path_to_process:
-            file_list = os.listdir(root_dir)
+            file_list = list(root_dir.glob("*"))
            if len(file_list) == 0:
                 logger.info("It has no files: %s", root_dir)
                 continue
 
-            archived_file = None
             if (
                 len(file_list) == 1
                 and not archive_singles
-                and not os.path.isdir(os.path.join(root_dir, file_list[0]))
+                and not (root_dir / file_list[0]).is_dir()
             ):
                 logger.info("Not archiving single file inside %s" % root_dir)
-                archived_file = os.path.join(root_dir, file_list[0])
+                archived_file = (root_dir / file_list[0]).as_posix()
             else:
                 base_path = Path(self.output_path) / output_name
                 logger.info("Compressing %s to %s" % (root_dir, base_path))
                 archived_file = shutil.make_archive(
-                    base_path,
+                    base_path.as_posix(),
                     format="zip",
                     root_dir=root_dir,
                 )
 
-            logger.info("Compressed: %s", os.path.basename(archived_file))
+            logger.info("Compressed: %s", Path(archived_file).name)
             if archived_file:
                 self.archived_outputs.append(archived_file)
diff --git a/lncrawl/core/downloader.py b/lncrawl/core/downloader.py
index 39bcf18a6..09bea39db 100644
--- a/lncrawl/core/downloader.py
+++ b/lncrawl/core/downloader.py
@@ -3,7 +3,7 @@
 """
 import json
 import logging
-import os
+from pathlib import Path
 
 from ..models.chapter import Chapter
 from ..utils.imgen import generate_cover_image
@@ -17,13 +17,13 @@ def _chapter_file(
     output_path: str,
     pack_by_volume: bool,
 ):
-    dir_name = os.path.join(output_path, "json")
+    dir_name = Path(output_path) / "json"
     if pack_by_volume:
         vol_name = "Volume " + str(chapter.volume).rjust(2, "0")
-        dir_name = os.path.join(dir_name, vol_name)
+        dir_name = dir_name / vol_name
 
     chapter_name = str(chapter.id).rjust(5, "0")
-    json_file = os.path.join(dir_name, chapter_name + ".json")
+    json_file = dir_name / (chapter_name + ".json")
     return json_file
@@ -54,8 +54,8 @@ def _save_chapter(app, chapter: Chapter):
         output_path=app.output_path,
         pack_by_volume=app.pack_by_volume,
     )
-    os.makedirs(os.path.dirname(file_name), exist_ok=True)
-    with open(file_name, "w", encoding="utf-8") as fp:
+    file_name.parent.mkdir(parents=True, exist_ok=True)
+    with file_name.open("w", encoding="utf-8") as fp:
         json.dump(chapter, fp, ensure_ascii=False)
@@ -100,22 +100,22 @@ def fetch_chapter_body(app):
     logger.info(f"Processed {len(app.chapters)} chapters [{app.progress} fetched]")
 
 
-def _fetch_content_image(app, url, image_file):
+def _fetch_content_image(app, url, image_file: Path):
     from .app import App
 
     assert isinstance(app, App)
-    if url and not os.path.isfile(image_file):
+    if url and not (image_file.exists() and image_file.is_file()):
         try:
             img = app.crawler.download_image(url)
-            os.makedirs(os.path.dirname(image_file), exist_ok=True)
+            image_file.parent.mkdir(parents=True, exist_ok=True)
             if img.mode not in ("L", "RGB", "YCbCr", "RGBX"):
                 if img.mode == "RGBa":
                     #RGBa -> RGB isn't supported so we go through RGBA first
                     img.convert("RGBA").convert("RGB")
                 else:
                     img = img.convert("RGB")
-            img.save(image_file, "JPEG", optimized=True)
+            img.save(image_file.as_posix(), "JPEG", optimized=True)
             img.close()
             logger.debug("Saved image: %s", image_file)
         finally:
@@ -129,7 +129,7 @@ def _fetch_cover_image(app):
     assert app.crawler is not None
 
     filename = "cover.jpg"
-    cover_file = os.path.join(app.output_path, filename)
+    cover_file = Path(app.output_path) / filename
     if app.crawler.novel_cover:
         try:
             _fetch_content_image(
@@ -141,12 +141,12 @@ def _fetch_cover_image(app):
             if logger.isEnabledFor(logging.DEBUG):
                 logger.exception("Failed to download cover", e)
 
-    if not os.path.isfile(cover_file):
-        generate_cover_image(cover_file)
+    if not cover_file.exists() and cover_file.is_file():
+        generate_cover_image(cover_file.as_posix())
     app.progress += 1
     app.book_cover = cover_file
-    assert os.path.isfile(app.book_cover), "Failed to download or generate cover image"
+    assert Path(app.book_cover).is_file(), "Failed to download or generate cover image"
 
 
 def _discard_failed_images(app, chapter, failed):
@@ -191,7 +191,7 @@ def fetch_chapter_images(app):
     ]
 
     # download content images
-    image_folder = os.path.join(app.output_path, "images")
+    image_folder = Path(app.output_path) / "images"
     images_to_download = set(
         [
             (filename, url)
@@ -204,7 +204,7 @@ def fetch_chapter_images(app):
             _fetch_content_image,
             app,
             url,
-            os.path.join(image_folder, filename),
+            image_folder / filename,
         )
         for filename, url in images_to_download
     ]
@@ -215,7 +215,7 @@ def fetch_chapter_images(app):
         failed = [
             filename
             for filename, url in images_to_download
-            if not os.path.isfile(os.path.join(image_folder, filename))
+            if not (image_folder / filename).is_file()
         ]
     finally:
         logger.info("Processed %d images [%d failed]" % (app.progress, len(failed)))
diff --git a/lncrawl/core/novel_info.py b/lncrawl/core/novel_info.py
index ef2c542cb..a993ed2d3 100644
--- a/lncrawl/core/novel_info.py
+++ b/lncrawl/core/novel_info.py
@@ -1,5 +1,4 @@
 import math
-import os
 import re
 from pathlib import Path
 from typing import Dict
@@ -109,6 +108,6 @@ def save_metadata(app, completed=False):
         ),
     )
 
-    os.makedirs(app.output_path, exist_ok=True)
+    Path(app.output_path).mkdir(parents=True, exist_ok=True)
     file_name = Path(app.output_path) / C.META_FILE_NAME
     novel.to_json(file_name, encoding="utf-8", indent=2)
diff --git a/lncrawl/core/sources.py b/lncrawl/core/sources.py
index 79c88d8c3..9c4ea6c9e 100644
--- a/lncrawl/core/sources.py
+++ b/lncrawl/core/sources.py
@@ -83,7 +83,7 @@ def __download_data(url: str):
 
 __index_fetch_internval_in_seconds = 30 * 60
 __master_index_file_url = "https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/master/sources/_index.json"
-__user_data_path = Path(os.path.expanduser("~")) / ".lncrawl"
+__user_data_path = Path("~").expanduser() / ".lncrawl"
 __local_data_path = Path(__file__).parent.parent.absolute()
 if not (__local_data_path / "sources").is_dir():
     __local_data_path = __local_data_path.parent
@@ -110,7 +110,7 @@ def __load_current_index():
 
 def __save_current_index():
     index_file = __user_data_path / "sources" / "_index.json"
-    os.makedirs(index_file.parent, exist_ok=True)
+    index_file.parent.mkdir(parents=True, exist_ok=True)
 
     logger.debug("Saving current index data to %s", index_file)
     with open(index_file, "w", encoding="utf8") as fp:
@@ -170,12 +170,12 @@ def __save_source_data(source_id, data):
     dst_dir = dst_file.parent
     temp_file = dst_dir / ("." + dst_file.name)
 
-    os.makedirs(dst_dir, exist_ok=True)
+    dst_dir.mkdir(parents=True, exist_ok=True)
     with open(temp_file, "wb") as fp:
         fp.write(data)
 
     if dst_file.exists():
-        os.remove(dst_file)
+        dst_file.unlink()
     temp_file.rename(dst_file)
 
     global __current_index

From f6b35642c2e16ff434df4b8518903a2105232777 Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 10 Mar 2024 12:03:39 +0100
Subject: [PATCH 4/4] core cleaner: update typing

---
 lncrawl/core/cleaner.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/lncrawl/core/cleaner.py b/lncrawl/core/cleaner.py
index e1b1eef48..3856c4ac3 100644
--- a/lncrawl/core/cleaner.py
+++ b/lncrawl/core/cleaner.py
@@ -4,7 +4,7 @@
 import unicodedata
 from typing import AnyStr, Dict, List, Set, Union
 
-from bs4 import Comment, Tag, PageElement
+from bs4 import Comment, Tag
 
 LINE_SEP = "<br>"
@@ -16,8 +16,6 @@
 NONPRINTABLE = itertools.chain(range(0x00, 0x20), range(0x7F, 0xA0), INVISIBLE_CHARS)
 NONPRINTABLE_MAPPING = {character: None for character in NONPRINTABLE}
 
-TAG_LIKE = Union[Comment, PageElement, Tag]
-
 
 class TextCleaner:
     def __init__(self) -> None:
@@ -176,7 +174,8 @@ def clean_contents(self, div):
         for tag in div.find_all(True):
             if isinstance(tag, Comment):
                 tag.extract()  # Remove comments
-            elif not isinstance(tag, Tag):
+                continue
+            if not isinstance(tag, Tag):
                 continue  # Skip elements that are not a Tag
             if tag.name in self.bad_tags:
                 tag.extract()  # Remove bad tags
@@ -203,14 +202,14 @@ def clean_text(self, text) -> str:
         )
         return text
 
-    def extract_on_duplicate_sibling(self, tag: TAG_LIKE):
+    def extract_on_duplicate_sibling(self, tag: Tag):
         next_tag = tag.next_sibling
         if not isinstance(next_tag, Tag):
             return
         if next_tag.name == tag.name:
             tag.extract()
 
-    def clean_attributes(self, tag: TAG_LIKE):
+    def clean_attributes(self, tag: Tag):
         attrs = {}
         for name, value in tag.attrs.items():
             if name not in self.whitelist_attributes:
@@ -234,7 +233,7 @@ def tag_contains_bad_text(self, tag: Tag) -> bool:
                 self.bad_tag_text_pairs[tag.name] = pattern
         return bool(pattern.search(tag.text))
 
-    def clean_image(self, tag: TAG_LIKE):
+    def clean_image(self, tag: Tag):
         src = None
         for name in self.image_src_attributes:
             src = tag.get(name)