New source: Noveldeglace #2275

Closed
wants to merge 69 commits
Commits
0f01815
- there is no webp package
pmosko-fp Jan 31, 2024
6144e33
Merge pull request #2248 from pmosko/update_termux_doc
dipu-bd Feb 9, 2024
2195337
Generate source index
dipu-bd Dec 27, 2023
85f1196
Update VERSION
dipu-bd Dec 27, 2023
0501131
Generate source index
dipu-bd Dec 27, 2023
2a76b73
new domain for 69shuba
nd2024 Jan 1, 2024
68fb6cf
Update 69shuba.py
dipu-bd Jan 1, 2024
f431000
Generate source index
dipu-bd Jan 1, 2024
28be9f5
Fix source "Coffeemanga"
CryZFix Jan 11, 2024
ce0ae3f
changed links
CryZFix Jan 11, 2024
0301e60
Fixed Bato and etc
CryZFix Jan 11, 2024
0715cab
Generate source index
dipu-bd Jan 14, 2024
06432e8
add tigertranslations.py as new english source
Campiotti Oct 23, 2022
4df5842
update tigertranslations.py: improve metadata, remove annoying texts,…
Jan 16, 2024
712b055
Generate source index
dipu-bd Jan 21, 2024
8971930
update lightnovelreader.py URL for readlightnovel.app to readlitenove…
Jan 21, 2024
d1229cc
Generate source index
dipu-bd Feb 9, 2024
3b5ab67
add faqwiki.py source (https://faqwiki.us/)
Jan 21, 2024
160a099
faqwiki: fix downloads for novels with missing cover img
Feb 1, 2024
cde1d91
faqwiki: fix downloads for novels with chapters in chronological order
Feb 4, 2024
e44b931
faqwiki: remove conditional chapter reversal & improve cover image do…
Feb 6, 2024
c9de252
Generate source index
dipu-bd Feb 9, 2024
95f5fad
add webfic multilingual source (https://www.webfic.com)
Jan 22, 2024
9d1c9cd
define crawler language dynamically
Jan 28, 2024
bf3bb20
Generate source index
dipu-bd Feb 9, 2024
73c6eff
fixed updated
SirGryphin Feb 1, 2024
6b2f226
removed commented code
SirGryphin Feb 1, 2024
275f606
Generate source index
dipu-bd Feb 9, 2024
0c51e4f
Update royalroad.py
needKVAS Feb 2, 2024
9afdc7f
Update royalroad.py
needKVAS Feb 3, 2024
d088355
Generate source index
dipu-bd Feb 9, 2024
81132de
fix tw.m.ixdzs.com & www.aixdzs.com sources (now redirect to new domain)
Feb 4, 2024
fdafc98
Generate source index
dipu-bd Feb 9, 2024
e103fae
cleanup and fix 69shuba / 69shu / 69xinshu
Feb 5, 2024
ca9f56f
69shuba: auto-fix chapter indexing, fix issue with getting > 4.3k cha…
Feb 5, 2024
b0dfcc8
Update 69shuba.py
dipu-bd Feb 9, 2024
a6d8c32
bato: fix empty chapters
Feb 5, 2024
9b1bde5
add luminarynovels as new source (based on MadaraTemplate)
Feb 5, 2024
d95a5d7
fix mangabuddy chapter downloading
Feb 6, 2024
0650775
Generate source index
dipu-bd Feb 9, 2024
505cf49
fix logging call missing string template in msg in app.py
Feb 10, 2024
834059b
replace all deprecated logger.warn calls with logger.warning
Feb 10, 2024
dc3c5c9
Generate source index
dipu-bd Feb 12, 2024
5052306
fix syosetu's new pages
NilanEkanayake Jan 26, 2024
5641aec
fix fanstrans
NilanEkanayake Jan 26, 2024
f98adc6
removed niche use-case
NilanEkanayake Jan 26, 2024
eca156c
Update syosetu.py
dipu-bd Feb 9, 2024
4ab7086
Fix lint errors
dipu-bd Feb 9, 2024
a13a346
Fix lint errors
dipu-bd Feb 9, 2024
e8c6bf7
Fix duplicate URL
NilanEkanayake Feb 9, 2024
3379035
Generate source index
dipu-bd Feb 12, 2024
8cdd268
add wtrlab multilingual source (https://wtr-lab.com/)
Jan 28, 2024
f28f785
move wtrlab into multilingual sources, add dynamic language assignment
Jan 28, 2024
4e0f56b
cleanup wtrlab
Feb 9, 2024
419451f
Generate source index
dipu-bd Feb 12, 2024
0097b7d
UukanshuOnline: fix URL & rename file
Feb 9, 2024
64d485f
UukanshuOnline: add support for www and tw subdomains (traditional & …
Feb 9, 2024
9ba5d83
Generate source index
dipu-bd Feb 12, 2024
c307213
add nyxtranslation as a new en source
Feb 12, 2024
e22e590
Generate source index
dipu-bd Feb 16, 2024
9925bdb
First non working WIP version of NDG scraper
Vuizur Feb 21, 2024
65a8a31
Try to extract novel meta details
Vuizur Feb 21, 2024
8662b58
Better approach (WIP)
Vuizur Feb 21, 2024
20527e9
Working prototype
Vuizur Feb 21, 2024
03049ae
Remove unneeded stuff + fix title and author
Vuizur Feb 22, 2024
cff6e7f
Fix tomes split by arcs
Vuizur Feb 22, 2024
1fd1af9
Delete unneeded comments
Vuizur Feb 22, 2024
dbd0980
Fix crash with future chapters and unneeded titles
Vuizur Feb 22, 2024
1fc2d8b
Remove mistake caption
Vuizur Feb 22, 2024
12 changes: 11 additions & 1 deletion .github/contribs.json
@@ -89,5 +89,15 @@
"Neory Dominise": null,
"[email protected]": null,
"HeliosLHC": "HeliosLHC",
"[email protected]": "HeliosLHC"
"[email protected]": "HeliosLHC",
"alzamer2": "alzamer2",
"[email protected]": "alzamer2",
"Unknown404": null,
"[email protected]": null,
"ACA": null,
"[email protected]": null,
"Campiotti": null,
"[email protected]": null,
"Nilan Ekanayake": null,
"[email protected]": null
}
866 changes: 463 additions & 403 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/VERSION
@@ -1 +1 @@
3.4.1
3.4.2
2 changes: 1 addition & 1 deletion lncrawl/bots/telegram/__init__.py
@@ -131,7 +131,7 @@ def start(self):

async def error_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
"""Log Errors caused by Updates."""
logger.warn("Error: %s\nCaused by: %s", context.error, update)
logger.warning("Error: %s\nCaused by: %s", context.error, update)

async def show_help(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
await update.message.reply_text("Send /start to create new session.\n")
2 changes: 1 addition & 1 deletion lncrawl/core/app.py
@@ -240,7 +240,7 @@ def compress_books(self, archive_singles=False):
format="zip",
root_dir=root_dir,
)
logger.info("Compressed:", os.path.basename(archived_file))
logger.info("Compressed: %s", os.path.basename(archived_file))

if archived_file:
self.archived_outputs.append(archived_file)
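
The logging fixes above follow two rules of the standard logging module: messages use lazy %-style placeholders so arguments are only interpolated when a record is actually emitted, and logger.warning() replaces the deprecated logger.warn() alias. A minimal standalone sketch, not part of the diff, with an illustrative path:

    import logging
    import os

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    archived_file = "/tmp/example.zip"  # hypothetical path, for illustration only

    # Broken: no placeholder, so the extra argument is never interpolated and
    # the logging module prints a "--- Logging error ---" traceback instead.
    logger.info("Compressed:", os.path.basename(archived_file))

    # Fixed: the %s placeholder is filled in lazily when the record is emitted.
    logger.info("Compressed: %s", os.path.basename(archived_file))

    # logger.warn() is a deprecated alias; logger.warning() is the supported name.
    logger.warning("Could not download latest index. Error: %s", "timeout")
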
10 changes: 5 additions & 5 deletions lncrawl/core/sources.py
@@ -137,7 +137,7 @@ def __load_latest_index():
except Exception as e:
if "crawlers" not in __current_index:
raise LNException("Could not fetch sources index")
logger.warn("Could not download latest index. Error: %s", e)
logger.warning("Could not download latest index. Error: %s", e)
__latest_index = __current_index


@@ -223,7 +223,7 @@ def __download_sources():
try:
__save_source_data(sid, data)
except Exception as e:
logger.warn("Failed to save source file. Error: %s", e)
logger.warning("Failed to save source file. Error: %s", e)


# --------------------------------------------------------------------------- #
@@ -248,7 +248,7 @@ def __import_crawlers(file_path: Path) -> List[Type[Crawler]]:
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
except Exception as e:
logger.warn("Module load failed: %s | %s", file_path, e)
logger.warning("Module load failed: %s | %s", file_path, e)
return []

language_code = ""
@@ -296,7 +296,7 @@ def __add_crawlers_from_path(path: Path):
return

if not path.exists():
logger.warn("Path does not exists: %s", path)
logger.warning("Path does not exists: %s", path)
return

if path.is_dir():
@@ -312,7 +312,7 @@ def __add_crawlers_from_path(path: Path):
for url in getattr(crawler, "base_url"):
crawler_list[url] = crawler
except Exception as e:
logger.warn("Could not load crawlers from %s. Error: %s", path, e)
logger.warning("Could not load crawlers from %s. Error: %s", path, e)


# --------------------------------------------------------------------------- #
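
The __import_crawlers code touched above loads each downloaded source file as a Python module at runtime. A hedged sketch of that importlib pattern, using a placeholder path rather than lncrawl's real source directory:

    import importlib.util
    from pathlib import Path

    file_path = Path("sources/en/f/faqwiki.py")  # placeholder path for illustration

    # Build a module spec from the file location and execute it, as sources.py does.
    spec = importlib.util.spec_from_file_location(file_path.stem, str(file_path))
    module = importlib.util.module_from_spec(spec)
    try:
        spec.loader.exec_module(module)
    except Exception as e:
        # Mirrors the logger.warning("Module load failed: ...") call in the diff.
        print("Module load failed: %s | %s" % (file_path, e))
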
8 changes: 4 additions & 4 deletions lncrawl/templates/browser/general.py
@@ -25,13 +25,13 @@ def read_novel_info_in_scraper(self) -> None:
try:
self.novel_cover = self.parse_cover(soup)
except Exception as e:
logger.warn("Failed to parse novel cover | %s", e)
logger.warning("Failed to parse novel cover | %s", e)

try:
authors = set(list(self.parse_authors(soup)))
self.novel_author = ", ".join(authors)
except Exception as e:
logger.warn("Failed to parse novel authors | %s", e)
logger.warning("Failed to parse novel authors | %s", e)

for item in self.parse_chapter_list(soup):
if isinstance(item, Chapter):
@@ -51,13 +51,13 @@ def read_novel_info_in_browser(self) -> None:
try:
self.novel_cover = self.parse_cover_in_browser()
except Exception as e:
logger.warn("Failed to parse novel cover | %s", e)
logger.warning("Failed to parse novel cover | %s", e)

try:
authors = set(list(self.parse_authors_in_browser()))
self.novel_author = ", ".join(authors)
except Exception as e:
logger.warn("Failed to parse novel authors | %s", e)
logger.warning("Failed to parse novel authors | %s", e)

for item in self.parse_chapter_list_in_browser():
if isinstance(item, Chapter):
4 changes: 2 additions & 2 deletions lncrawl/templates/soup/general.py
@@ -23,13 +23,13 @@ def read_novel_info(self) -> None:
try:
self.novel_cover = self.parse_cover(soup)
except Exception as e:
logger.warn("Failed to parse novel cover | %s", e)
logger.warning("Failed to parse novel cover | %s", e)

try:
authors = set(list(self.parse_authors(soup)))
self.novel_author = ", ".join(authors)
except Exception as e:
logger.warn("Failed to parse novel authors | %s", e)
logger.warning("Failed to parse novel authors | %s", e)

for item in self.parse_chapter_list(soup):
if isinstance(item, Chapter):
2 changes: 1 addition & 1 deletion lncrawl/utils/pbincli.py
@@ -23,7 +23,7 @@ class PBinCLIException(Exception):


def PBinCLIError(message):
logger.warn("PBinCLI Error: {}".format(message))
logger.warning("PBinCLI Error: {}".format(message))


def path_leaf(path):
2 changes: 1 addition & 1 deletion sources/_index.json

Large diffs are not rendered by default.

17 changes: 12 additions & 5 deletions sources/en/b/bato.py
@@ -139,10 +139,10 @@ def read_novel_info(self):

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])
soup = soup.find("script", string=re.compile(r"const imgHttpLis = \["))
soup = soup.find("script", string=re.compile(r"const imgHttps = \["))

img_list = json.loads(
re.search(r"const imgHttpLis = (.*);", soup.text).group(1)
re.search(r"const imgHttps = (.*);", soup.text).group(1)
)

bato_pass = decode_pass(
@@ -151,10 +151,17 @@ def download_chapter_body(self, chapter):

bato_word = re.search(r"const batoWord = (.*);", soup.text).group(1).strip('"')

# looks like some kind of "access" GET args that may be necessary, not always though
query_args = json.loads(decrypt(bato_word, bato_pass).decode())

image_urls = [
f'<img src="{img}?{args}">' for img, args in zip(img_list, query_args)
]
# so if it ends up empty or mismatches, just ignore it and return the img list instead
if len(query_args) != len(img_list):
image_urls = [
f'<img src="{img}" alt="img">' for img in img_list
]
else:
image_urls = [
f'<img src="{img}?{args}">' for img, args in zip(img_list, query_args)
]

return "<p>" + "</p><p>".join(image_urls) + "</p>"
8 changes: 4 additions & 4 deletions sources/en/c/coffeemanga.py
@@ -3,13 +3,12 @@
from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)
search_url = "https://coffeemanga.com/?s=%s&post_type=wp-manga"
chapter_list_url = "https://coffeemanga.com/wp-admin/admin-ajax.php"
search_url = "https://coffeemanga.io/?s=%s&post_type=wp-manga"


class CoffeeManga(Crawler):
has_manga = True
base_url = "https://coffeemanga.com/"
base_url = ["https://coffeemanga.io/"]

def search_novel(self, query):
query = query.lower().replace(" ", "+")
@@ -53,7 +52,8 @@ def read_novel_info(self):
)
logger.info("%s", self.novel_author)

for a in reversed(soup.select("ul.main li.wp-manga-chapter a")):
soup = self.post_soup(f"{self.novel_url}ajax/chapters/")
for a in reversed(soup.select("li.wp-manga-chapter a")):
chap_id = len(self.chapters) + 1
vol_id = len(self.chapters) // 100 + 1
if len(self.chapters) % 100 == 0:
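
The Coffeemanga fix fetches the chapter list from the Madara-style ajax/chapters/ endpoint instead of parsing it from the novel page. A rough standalone sketch of that request using requests and BeautifulSoup rather than lncrawl's get_soup/post_soup helpers; the novel URL below is a placeholder:

    import requests
    from bs4 import BeautifulSoup

    novel_url = "https://coffeemanga.io/manga/example-title/"  # placeholder URL

    # The theme returns the chapter list when POSTing to <novel_url>ajax/chapters/.
    response = requests.post(novel_url + "ajax/chapters/", timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # Chapters are listed newest-first, so reverse them into reading order.
    for index, a in enumerate(reversed(soup.select("li.wp-manga-chapter a")), start=1):
        print(index, a.text.strip(), a["href"])
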
2 changes: 1 addition & 1 deletion sources/en/d/dobelyuwai.py
@@ -32,7 +32,7 @@ def read_novel_info(self):
# try:
# self.novel_author = soup.select_one('div.entry-content > p:nth-child(2)').text.strip()
# except Exception as e:
# logger.warn('Failed to get novel auth. Error: %s', e)
# logger.warning('Failed to get novel auth. Error: %s', e)
# logger.info('%s', self.novel_author)

# Removes none TOC links from bottom of page.
3 changes: 3 additions & 0 deletions sources/en/f/fanstrans.py
@@ -26,6 +26,8 @@ def initialize(self) -> None:
r"^Get on Patreon",
r"^Check out other novels on Fan’s Translation~",
r"^to get Notification for latest Chapter Releases",
r"^Can’t wait to read more? Want to show your support? Click",
r"^to be a sponsor and get additional chapters ahead of time!",
]
)
self.cleaner.bad_tags.update(["a"])
@@ -36,6 +38,7 @@ class FansTranslations(Crawler):

def initialize(self) -> None:
self.cleaner.bad_tags.update(["h3"])
self.init_executor(4)

def search_novel(self, query):
query = query.lower().replace(" ", "+")
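
The Fan's Translation changes extend a blacklist of regular expressions that the cleaner uses to strip promotional lines from chapter text. A simplified sketch of that filtering idea, independent of lncrawl's cleaner API and with made-up paragraph content:

    import re

    # Two of the patterns added in the diff above.
    bad_text_patterns = [
        re.compile(r"^Get on Patreon"),
        re.compile(r"^to be a sponsor and get additional chapters ahead of time!"),
    ]

    paragraphs = [
        "Chapter text goes here.",            # hypothetical chapter content
        "Get on Patreon for early access!",   # promotional line to be dropped
    ]
    kept = [p for p in paragraphs if not any(rx.search(p) for rx in bad_text_patterns)]
    print(kept)  # -> ['Chapter text goes here.']
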
143 changes: 143 additions & 0 deletions sources/en/f/faqwiki.py
@@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-
import logging

from bs4.element import Tag
from lncrawl.core.crawler import Crawler
from lncrawl.models import Volume, Chapter, SearchResult

logger = logging.getLogger(__name__)


class FaqWiki(Crawler):
base_url = ["https://faqwiki.us/"]
has_manga = False
has_mtl = True

def initialize(self) -> None:
# There's about 4+ ads as img tags within each chapter.
# Have not yet seen an img be part of any chapter, worst case we'll miss out on it.
self.cleaner.bad_tags.add("img")

def read_novel_info(self):
soup = self.get_soup(self.novel_url)

content = soup.select_one(".entry-content")

entry_title = soup.select_one("h1.entry-title")
assert isinstance(entry_title, Tag) # this must be here, is part of normal site structure/framework
self.novel_title = entry_title.text.strip()
# remove suffix from completed novels' title
if self.novel_title.endswith(" – All Chapters"):
self.novel_title = self.novel_title[0:self.novel_title.find(" – All Chapters")]
self.novel_author = "FaqWiki"
cover = content.select_one('.wp-block-image img')
# is missing in some rarer cases
if cover:
src = str(cover['src'])
# may be replaced with JS after load, in such case try and get the real img hidden in data-values
if src.startswith("data:"):
try:
src = cover["data-ezsrc"]
except KeyError:
pass
self.novel_cover = self.absolute_url(src)
# remove any optimized image size GET args from novel cover URL
if self.novel_cover and "?" in self.novel_cover:
self.novel_cover = self.novel_cover[0:self.novel_cover.find("?")]

metadata_container = soup.select_one("div.book-review-block__meta-item-value")
keywords = {
"desc": "Description:",
"alt_name": "Alternate Names:",
"genre": "Genre:",
"author": "Author(s):",
"status": "Status:",
"original_pub": "Original Publisher:"
}

if metadata_container:
metadata = metadata_container.text # doesn't have line breaks anyway so not splitting here
pos_dict = {}
for key, sep in keywords.items():
pos_dict[key + "_start"] = metadata.find(sep)
pos_dict[key] = metadata.find(sep) + len(sep)

self.novel_synopsis = metadata[pos_dict["desc"]:pos_dict["alt_name_start"]].strip()
self.novel_tags = metadata[pos_dict["genre"]:pos_dict["author_start"]].strip().split(" ")
self.novel_author = metadata[pos_dict["author"]:pos_dict["status_start"]].strip()

logger.info("Novel title: %s", self.novel_title)
logger.info("Novel synopsis: %s", self.novel_synopsis)
logger.info("Novel tags: %s", ",".join(self.novel_tags))
logger.info("Novel author: %s", self.novel_author)
logger.info("Novel cover: %s", self.novel_cover)

chap_list = soup.select_one('#lcp_instance_0').select("li>a")

for idx, a in enumerate(chap_list):
if "chapter" not in a.text.lower():
continue
chap_id = 1 + idx
vol_id = 1 + len(self.chapters) // 100
vol_title = f"Volume {vol_id}"
if chap_id % 100 == 1:
self.volumes.append(
Volume(
id=vol_id,
title=vol_title
))

# chapter name is only (sometimes) present in chapter page, not in overview
entry_title = f"Chapter {chap_id}"

self.chapters.append(
Chapter(
id=chap_id,
url=self.absolute_url(a["href"]),
title=entry_title,
volume=vol_id,
volume_title=vol_title
),
)

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter.url)

contents_html = soup.select_one("div.entry-content")
contents_html = self.cleaner.clean_contents(contents_html)
contents_str = self.cleaner.extract_contents(contents_html)

return contents_str

def search_novel(self, query: str):
novel_selector = "article > div > header > h3.entry-title > a"
next_selector = "div.nav-links > a.next"

soup = self.get_soup(f"https://faqwiki.us/?s={query.replace(' ','+')}&post_type=page")
empty = "nothing found" in soup.select_one("h1.page-title").text.strip().lower()
if empty:
return []

novels = soup.select(novel_selector)

# loop over all pages via next button and get all novels
next_page = soup.select_one(next_selector)
while next_page:
page_soup = self.get_soup(self.absolute_url(next_page["href"]))
novels += page_soup.select(novel_selector)
next_page = page_soup.select_one(next_selector)

results = []
for novel in novels:
# filter out "fake" novels (links to All, completed & ongoing pages)
if "novels" in novel.text.lower():
pass
# simple but at least won't taint results
if query.lower() in novel.text.lower():
results.append(
SearchResult(
title=novel.text,
url=novel["href"]
)
)
return results
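
FaqWiki's metadata block is a single unbroken run of text, so read_novel_info above slices it by the offsets of each label. A self-contained sketch of that approach, with a made-up metadata string standing in for the scraped block:

    # Hypothetical metadata text, shaped like FaqWiki's single-line metadata block.
    metadata = (
        "Description: A fallen noble rebuilds his house. "
        "Alternate Names: Example Novel "
        "Genre: Action Fantasy "
        "Author(s): Example Author "
        "Status: Ongoing "
        "Original Publisher: Example Press"
    )
    keywords = {
        "desc": "Description:",
        "alt_name": "Alternate Names:",
        "genre": "Genre:",
        "author": "Author(s):",
        "status": "Status:",
        "original_pub": "Original Publisher:",
    }

    pos = {}
    for key, label in keywords.items():
        pos[key + "_start"] = metadata.find(label)    # where the label itself begins
        pos[key] = metadata.find(label) + len(label)  # where the value begins

    synopsis = metadata[pos["desc"]:pos["alt_name_start"]].strip()
    tags = metadata[pos["genre"]:pos["author_start"]].strip().split(" ")
    author = metadata[pos["author"]:pos["status_start"]].strip()
    print(synopsis)  # -> A fallen noble rebuilds his house.
    print(tags)      # -> ['Action', 'Fantasy']
    print(author)    # -> Example Author
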
7 changes: 3 additions & 4 deletions sources/en/i/isotls.py
@@ -26,7 +26,7 @@ def read_novel_info(self):
if possible_novel_author:
self.novel_author = possible_novel_author['content']

for a in soup.select('main section div:nth-child(2) ul li a'):
for a in soup.select('main section:nth-child(3) nav ul li a'):
chap_id = len(self.chapters) + 1
vol_id = len(self.chapters) // 100 + 1
if len(self.chapters) % 100 == 0:
@@ -41,6 +41,5 @@

def download_chapter_body(self, chapter):
soup = self.get_soup(chapter['url'])
contents = soup.select('article p')
body = [str(p) for p in contents if p.text.strip()]
return '<p>' + '</p><p>'.join(body) + '</p>'
contents = soup.select_one("div.content")
return self.cleaner.extract_contents(contents)
2 changes: 1 addition & 1 deletion sources/en/l/lightnovelreader.py
@@ -19,7 +19,7 @@ class LightnovelReader(Crawler):
"https://lnreader.org/",
"https://www.lnreader.org/",
"http://readlightnovel.online/",
"https://readlightnovel.app/",
"https://readlitenovel.com/",
]

def initialize(self) -> None: