Merge pull request #2293 from camp00000/core-adjustments-extensive
Core adjustments (typing, misc fixes, replacing os.path with pathlib.Path)
dipu-bd authored Mar 29, 2024
2 parents 1b8c640 + f6b3564 commit b009bbd
Showing 9 changed files with 158 additions and 173 deletions.
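The recurring pattern across these diffs is replacing os.path and os calls with their pathlib.Path equivalents. A minimal sketch of the before/after mapping, using a hypothetical output directory rather than any path from the codebase:

import shutil
from pathlib import Path

# Hypothetical directory names, for illustration only
output_dir = Path("output") / "example-source" / "example-book"   # was: os.path.join("output", source, book)

if output_dir.is_dir():                                            # was: os.path.isdir(output_dir)
    for item in output_dir.glob("*"):                              # was: os.listdir(output_dir); glob() yields Path objects
        print(item.name)                                           # was: os.path.basename(item)
    shutil.rmtree(output_dir / "json", ignore_errors=True)         # was: shutil.rmtree(os.path.join(output_dir, "json"), ignore_errors=True)
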
33 changes: 15 additions & 18 deletions lncrawl/core/app.py
@@ -1,7 +1,7 @@
import atexit
import logging
import os
import shutil
from pathlib import Path
from threading import Thread
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse
@@ -148,15 +148,13 @@ def get_novel_info(self):
)

source_name = slugify(urlparse(self.crawler.home_url).netloc)
self.output_path = os.path.join(
C.DEFAULT_OUTPUT_PATH, source_name, self.good_file_name
)
self.output_path = Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name

# ----------------------------------------------------------------------- #

def start_download(self):
"""Requires: crawler, chapters, output_path"""
if not self.output_path or not os.path.isdir(self.output_path):
if not self.output_path or not Path(self.output_path).is_dir():
raise LNException("Output path is not defined")

assert self.crawler
@@ -167,8 +165,8 @@ def start_download(self):
fetch_chapter_images(self)
save_metadata(self, True)

if not self.output_formats.get("json", False):
shutil.rmtree(os.path.join(self.output_path, "json"), ignore_errors=True)
if not self.output_formats.get(OutputFormat.json.value, False):
shutil.rmtree(Path(self.output_path) / "json", ignore_errors=True)

if self.can_do("logout"):
self.crawler.logout()
@@ -208,39 +206,38 @@ def compress_books(self, archive_singles=False):
logger.info("Compressing output...")

# Get which paths to be archived with their base names
path_to_process = []
path_to_process: list[tuple[Path, str]] = []
for fmt in available_formats:
root_dir = os.path.join(self.output_path, fmt)
if os.path.isdir(root_dir):
root_dir: Path = Path(self.output_path) / fmt
if root_dir.is_dir():
path_to_process.append(
[root_dir, self.good_file_name + " (" + fmt + ")"]
(root_dir, self.good_file_name + " (" + fmt + ")")
)

# Archive files
self.archived_outputs = []
for root_dir, output_name in path_to_process:
file_list = os.listdir(root_dir)
file_list = list(root_dir.glob("*"))
if len(file_list) == 0:
logger.info("It has no files: %s", root_dir)
continue

archived_file = None
if (
len(file_list) == 1
and not archive_singles
and not os.path.isdir(os.path.join(root_dir, file_list[0]))
and not (root_dir / file_list[0]).is_dir()
):
logger.info("Not archiving single file inside %s" % root_dir)
archived_file = os.path.join(root_dir, file_list[0])
archived_file = (root_dir / file_list[0]).as_posix()
else:
base_path = os.path.join(self.output_path, output_name)
base_path = Path(self.output_path) / output_name
logger.info("Compressing %s to %s" % (root_dir, base_path))
archived_file = shutil.make_archive(
base_path,
base_path.as_posix(),
format="zip",
root_dir=root_dir,
)
logger.info("Compressed: %s", os.path.basename(archived_file))
logger.info("Compressed: %s", Path(archived_file).name)

if archived_file:
self.archived_outputs.append(archived_file)
229 changes: 107 additions & 122 deletions lncrawl/core/cleaner.py
@@ -32,96 +32,86 @@ def __init__(self) -> None:
# the tag will be removed if the text inside contains the pattern
}

self.bad_tags: Set[str] = set(
[
# tag names to remove
"address",
"amp-auto-ads",
"audio",
"button",
"figcaption",
"footer",
"form",
"header",
"iframe",
"input",
"ins",
"map",
"nav",
"noscript",
"object",
"output",
"pirate",
"script",
"select",
"source",
"style",
"textarea",
"tfoot",
"video",
]
)
self.bad_css: Set[str] = set(
[
# css selector to select and remove tags
".adblock-service",
".sharedaddy",
".saboxplugin-wrap",
".adbox",
".ads-middle",
".ads",
".adsbygoogle",
".adsense-code",
".cb_p6_patreon_button",
".code-block",
".ezoic-ad-adaptive",
".ezoic-ad",
".ezoic-adpicker-ad",
".googlepublisherads",
".inline-ad-slot",
".jp-relatedposts",
".sharedaddy",
".wp-post-navigation",
"a[href*='patreon.com']",
"a[href*='paypal.me']",
]
)
self.p_block_tags: Set[str] = set(
[
# tags that can be used as paragraph break
"article",
"aside",
"div",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"main",
"p",
"section",
]
)
self.unchanged_tags: Set[str] = set(
[
# tags to keep unchanged with text and attributes
"canvas",
"img",
"pre",
]
)
self.plain_text_tags: Set[str] = set(
[
# tags that will be joined together in a paragraph
"a",
"abbr",
"acronym",
"label",
"span",
"time",
]
)
self.bad_tags: Set[str] = {
# tag names to remove
"address",
"amp-auto-ads",
"audio",
"button",
"figcaption",
"footer",
"form",
"header",
"iframe",
"input",
"ins",
"map",
"nav",
"noscript",
"object",
"output",
"pirate",
"script",
"select",
"source",
"style",
"textarea",
"tfoot",
"video",
}
self.bad_css: Set[str] = {
# css selector to select and remove tags
".adblock-service",
".sharedaddy",
".saboxplugin-wrap",
".adbox",
".ads-middle",
".ads",
".adsbygoogle",
".adsense-code",
".cb_p6_patreon_button",
".code-block",
".ezoic-ad-adaptive",
".ezoic-ad",
".ezoic-adpicker-ad",
".googlepublisherads",
".inline-ad-slot",
".jp-relatedposts",
".sharedaddy",
".wp-post-navigation",
"a[href*='patreon.com']",
"a[href*='paypal.me']",
}
self.p_block_tags: Set[str] = {
# tags that can be used as paragraph break
"article",
"aside",
"div",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"main",
"p",
"section",
}
self.unchanged_tags: Set[str] = {
# tags to keep unchanged with text and attributes
"canvas",
"img",
"pre",
}
self.plain_text_tags: Set[str] = {
# tags that will be joined together in a paragraph
"a",
"abbr",
"acronym",
"label",
"span",
"time",
}
self.substitutions: Dict[str, str] = {
# replace one string with another one
# "&": "&",
@@ -133,34 +123,28 @@ def __init__(self) -> None:
# "“s": "'s",
# "”s": "'s",
}
self.whitelist_attributes: Set[str] = set(
[
# the attributes to keep while cleaning a tag
"src",
"style",
# table and table children attributes
"colspan",
"rowspan",
"headers",
"scope",
"axis",
"id", # id required for headers ref
]
)
self.whitelist_css_property: Set[str] = set(
[
# the css styles to keep while cleaning style tag
"font-style",
"font-weight",
]
)
self.image_src_attributes: Set[str] = set(
[
"data-lazy-src",
"data-src",
"src",
]
)
self.whitelist_attributes: Set[str] = {
# the attributes to keep while cleaning a tag
"src",
"style",
# table and table children attributes
"colspan",
"rowspan",
"headers",
"scope",
"axis",
"id", # id required for headers ref
}
self.whitelist_css_property: Set[str] = {
# the css styles to keep while cleaning style tag
"font-style",
"font-weight",
}
self.image_src_attributes: Set[str] = {
"data-lazy-src",
"data-src",
"src",
}

def extract_contents(self, tag) -> str:
self.clean_contents(tag)
@@ -190,7 +174,8 @@ def clean_contents(self, div):
for tag in div.find_all(True):
if isinstance(tag, Comment):
tag.extract() # Remove comments
elif not isinstance(tag, Tag):
continue
if not isinstance(tag, Tag):
continue # Skip elements that are not a Tag
if tag.name in self.bad_tags:
tag.extract() # Remove bad tags
@@ -224,7 +209,7 @@ def extract_on_duplicate_sibling(self, tag: Tag):
if next_tag.name == tag.name:
tag.extract()

def clean_attributes(self, tag: Tag) -> dict:
def clean_attributes(self, tag: Tag):
attrs = {}
for name, value in tag.attrs.items():
if name not in self.whitelist_attributes:
@@ -246,7 +231,7 @@ def tag_contains_bad_text(self, tag: Tag) -> bool:
if not isinstance(pattern, re.Pattern):
pattern = re.compile(pattern, re.M)
self.bad_tag_text_pairs[tag.name] = pattern
return pattern.search(tag.text)
return bool(pattern.search(tag.text))

def clean_image(self, tag: Tag):
src = None
@@ -326,4 +311,4 @@ def contains_bad_texts(self, text: str) -> bool:
if not hasattr(self, "__blacklist__"):
pattern = re.compile("|".join(["(%s)" % p for p in self.bad_text_regex]))
self.__blacklist__ = pattern
return self.__blacklist__.search(text)
return bool(self.__blacklist__.search(text))
6 changes: 4 additions & 2 deletions lncrawl/core/crawler.py
@@ -155,15 +155,17 @@ def download_chapters(
unit="item",
fail_fast=fail_fast,
)
chapter = None
for (index, future) in futures.items():
try:
chapter = chapters[index]
chapter.body = future.result()
self.extract_chapter_images(chapter)
chapter.success = True
except Exception as e:
chapter.body = ""
chapter.success = False
if isinstance(chapter, Chapter):
chapter.body = ""
chapter.success = False
if isinstance(e, KeyboardInterrupt):
break
finally:

