From 42a8a7540441ffa2b1d43e7546b09e77b8c11fea Mon Sep 17 00:00:00 2001 From: ACA Date: Sun, 21 Jan 2024 22:56:36 +0100 Subject: [PATCH 1/4] add faqwiki.py source (https://faqwiki.us/) --- sources/en/f/faqwiki.py | 135 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 sources/en/f/faqwiki.py diff --git a/sources/en/f/faqwiki.py b/sources/en/f/faqwiki.py new file mode 100644 index 000000000..5b1185cf5 --- /dev/null +++ b/sources/en/f/faqwiki.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +import logging + +from bs4.element import Tag +from lncrawl.core.crawler import Crawler +from lncrawl.models import Volume, Chapter, SearchResult + +logger = logging.getLogger(__name__) + + +class FaqWiki(Crawler): + base_url = ["https://faqwiki.us/"] + has_manga = False + has_mtl = True + + def initialize(self) -> None: + # There's about 4+ ads as img tags within each chapter. + # Have not yet seen an img be part of any chapter, worst case we'll miss out on it. + self.cleaner.bad_tags.add("img") + + def read_novel_info(self): + soup = self.get_soup(self.novel_url) + + content = soup.select_one(".entry-content") + + entry_title = soup.select_one("h1.entry-title") + assert isinstance(entry_title, Tag) # this must be here, is part of normal site structure/framework + self.novel_title = entry_title.text.strip() + # remove suffix from completed novels' title + if self.novel_title.endswith(" – All Chapters"): + self.novel_title = self.novel_title[0:self.novel_title.find(" – All Chapters")] + self.novel_author = "FaqWiki" + cover = content.select_one('img[importance="high"]') + self.novel_cover = self.absolute_url(cover["src"]) + + # remove any optimized image size GET args from novel cover URL + if "?" in self.novel_cover: + self.novel_cover = self.novel_cover[0:self.novel_cover.find("?")] + + metadata_container = soup.select_one("div.book-review-block__meta-item-value") + keywords = { + "desc": "Description:", + "alt_name": "Alternate Names:", + "genre": "Genre:", + "author": "Author(s):", + "status": "Status:", + "original_pub": "Original Publisher:" + } + + if metadata_container: + metadata = metadata_container.text # doesn't have line breaks anyway so not splitting here + pos_dict = {} + for key, sep in keywords.items(): + pos_dict[key + "_start"] = metadata.find(sep) + pos_dict[key] = metadata.find(sep) + len(sep) + + self.novel_synopsis = metadata[pos_dict["desc"]:pos_dict["alt_name_start"]].strip() + self.novel_tags = metadata[pos_dict["genre"]:pos_dict["author_start"]].strip().split(" ") + self.novel_author = metadata[pos_dict["author"]:pos_dict["status_start"]].strip() + + logger.info("Novel title: %s", self.novel_title) + logger.info("Novel synopsis: %s", self.novel_synopsis) + logger.info("Novel tags: %s", ",".join(self.novel_tags)) + logger.info("Novel author: %s", self.novel_author) + logger.info("Novel cover: %s", self.novel_cover) + + chap_list = soup.select_one('#lcp_instance_0').select("li>a") + chap_list.reverse() # since newest chapter is first child, reverse + for idx, a in enumerate(chap_list): + if "chapter" not in a.text.strip().lower(): + continue + chap_id = 1 + idx + vol_id = 1 + len(self.chapters) // 100 + vol_title = f"Volume {vol_id}" + if chap_id % 100 == 1: + self.volumes.append( + Volume( + id=vol_id, + title=vol_title + )) + + # chapter name is only (sometimes) present in chapter page, not in overview + entry_title = f"Chapter {chap_id}" + + self.chapters.append( + Chapter( + id=chap_id, + url=self.absolute_url(a["href"]), + title=entry_title, + volume=vol_id, + volume_title=vol_title + ), + ) + + def download_chapter_body(self, chapter): + soup = self.get_soup(chapter.url) + + contents_html = soup.select_one("div.entry-content") + contents_html = self.cleaner.clean_contents(contents_html) + contents_str = self.cleaner.extract_contents(contents_html) + + return contents_str + + def search_novel(self, query: str): + novel_selector = "article > div > header > h3.entry-title > a" + next_selector = "div.nav-links > a.next" + + soup = self.get_soup(f"https://faqwiki.us/?s={query.replace(' ','+')}&post_type=page") + empty = "nothing found" in soup.select_one("h1.page-title").text.strip().lower() + if empty: + return [] + + novels = soup.select(novel_selector) + + # loop over all pages via next button and get all novels + next_page = soup.select_one(next_selector) + while next_page: + page_soup = self.get_soup(self.absolute_url(next_page["href"])) + novels += page_soup.select(novel_selector) + next_page = page_soup.select_one(next_selector) + + results = [] + for novel in novels: + # filter out "fake" novels (links to All, completed & ongoing pages) + if "novels" in novel.text.lower(): + pass + # simple but at least won't taint results + if query.lower() in novel.text.lower(): + results.append( + SearchResult( + title=novel.text, + url=novel["href"] + ) + ) + return results From f8424985b4fb6a2cddc735eb8cf41b9aba8217c2 Mon Sep 17 00:00:00 2001 From: ACA Date: Thu, 1 Feb 2024 23:09:44 +0100 Subject: [PATCH 2/4] faqwiki: fix downloads for novels with missing cover img --- sources/en/f/faqwiki.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sources/en/f/faqwiki.py b/sources/en/f/faqwiki.py index 5b1185cf5..a82a0892e 100644 --- a/sources/en/f/faqwiki.py +++ b/sources/en/f/faqwiki.py @@ -31,10 +31,11 @@ def read_novel_info(self): self.novel_title = self.novel_title[0:self.novel_title.find(" – All Chapters")] self.novel_author = "FaqWiki" cover = content.select_one('img[importance="high"]') - self.novel_cover = self.absolute_url(cover["src"]) - + # is missing in some rarer cases + if cover: + self.novel_cover = self.absolute_url(cover["src"]) # remove any optimized image size GET args from novel cover URL - if "?" in self.novel_cover: + if self.novel_cover and "?" in self.novel_cover: self.novel_cover = self.novel_cover[0:self.novel_cover.find("?")] metadata_container = soup.select_one("div.book-review-block__meta-item-value") From c457136d496eb3da6279098d69b2fd7d2994fd30 Mon Sep 17 00:00:00 2001 From: ACA Date: Sun, 4 Feb 2024 16:18:15 +0100 Subject: [PATCH 3/4] faqwiki: fix downloads for novels with chapters in chronological order --- sources/en/f/faqwiki.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sources/en/f/faqwiki.py b/sources/en/f/faqwiki.py index a82a0892e..e58e9f27c 100644 --- a/sources/en/f/faqwiki.py +++ b/sources/en/f/faqwiki.py @@ -66,7 +66,10 @@ def read_novel_info(self): logger.info("Novel cover: %s", self.novel_cover) chap_list = soup.select_one('#lcp_instance_0').select("li>a") - chap_list.reverse() # since newest chapter is first child, reverse + + # in rare cases the chapter list is newest to oldest + if not chap_list[0].text.lower().replace(' ', '').endswith(("chapter0", "chapter1")): + chap_list.reverse() for idx, a in enumerate(chap_list): if "chapter" not in a.text.strip().lower(): continue From 6c474c06e8a2538e6a2b8eb947c6ecc75d58d920 Mon Sep 17 00:00:00 2001 From: ACA Date: Tue, 6 Feb 2024 22:40:46 +0100 Subject: [PATCH 4/4] faqwiki: remove conditional chapter reversal & improve cover image download --- sources/en/f/faqwiki.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sources/en/f/faqwiki.py b/sources/en/f/faqwiki.py index e58e9f27c..6194fa828 100644 --- a/sources/en/f/faqwiki.py +++ b/sources/en/f/faqwiki.py @@ -30,10 +30,17 @@ def read_novel_info(self): if self.novel_title.endswith(" – All Chapters"): self.novel_title = self.novel_title[0:self.novel_title.find(" – All Chapters")] self.novel_author = "FaqWiki" - cover = content.select_one('img[importance="high"]') + cover = content.select_one('.wp-block-image img') # is missing in some rarer cases if cover: - self.novel_cover = self.absolute_url(cover["src"]) + src = str(cover['src']) + # may be replaced with JS after load, in such case try and get the real img hidden in data-values + if src.startswith("data:"): + try: + src = cover["data-ezsrc"] + except KeyError: + pass + self.novel_cover = self.absolute_url(src) # remove any optimized image size GET args from novel cover URL if self.novel_cover and "?" in self.novel_cover: self.novel_cover = self.novel_cover[0:self.novel_cover.find("?")] @@ -67,11 +74,8 @@ def read_novel_info(self): chap_list = soup.select_one('#lcp_instance_0').select("li>a") - # in rare cases the chapter list is newest to oldest - if not chap_list[0].text.lower().replace(' ', '').endswith(("chapter0", "chapter1")): - chap_list.reverse() for idx, a in enumerate(chap_list): - if "chapter" not in a.text.strip().lower(): + if "chapter" not in a.text.lower(): continue chap_id = 1 + idx vol_id = 1 + len(self.chapters) // 100