From 8e7af301d36aac5ccd7275a8984c7f11fbdc18c4 Mon Sep 17 00:00:00 2001
From: kuwoyuki
Date: Fri, 23 Feb 2024 00:14:40 +0600
Subject: [PATCH 1/7] fix: reaperscans

---
 lncrawl/core/scraper.py     |  12 ++--
 sources/en/r/reaperscans.py | 120 ++++++++++++++++++++++--------------
 2 files changed, 80 insertions(+), 52 deletions(-)

diff --git a/lncrawl/core/scraper.py b/lncrawl/core/scraper.py
index ccedc0767..689a2fae9 100644
--- a/lncrawl/core/scraper.py
+++ b/lncrawl/core/scraper.py
@@ -223,9 +223,11 @@ def submit_form(
         headers = CaseInsensitiveDict(headers)
         headers.setdefault(
             "Content-Type",
-            "multipart/form-data"
-            if multipart
-            else "application/x-www-form-urlencoded; charset=UTF-8",
+            (
+                "multipart/form-data"
+                if multipart
+                else "application/x-www-form-urlencoded; charset=UTF-8"
+            ),
         )
         return self.post_response(url, data=data, headers=headers, **kwargs)
 
@@ -269,7 +271,7 @@ def get_json(self, url, headers={}, **kwargs) -> Any:
         response = self.get_response(url, headers=headers, **kwargs)
         return response.json()
 
-    def post_json(self, url, data={}, headers={}) -> Any:
+    def post_json(self, url, data={}, headers={}, **kwargs) -> Any:
         """Make a POST request and return the content as JSON object"""
         headers = CaseInsensitiveDict(headers)
         headers.setdefault("Content-Type", "application/json")
@@ -277,7 +279,7 @@ def post_json(self, url, data={}, headers={}) -> Any:
             "Accept",
             "application/json,text/plain,*/*",
         )
-        response = self.post_response(url, data=data, headers=headers)
+        response = self.post_response(url, data=data, headers=headers, **kwargs)
         return response.json()
 
     def submit_form_json(
diff --git a/sources/en/r/reaperscans.py b/sources/en/r/reaperscans.py
index 03332b82f..33560b68c 100644
--- a/sources/en/r/reaperscans.py
+++ b/sources/en/r/reaperscans.py
@@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
+import json
 import logging
+import time
 
 from bs4 import Tag
 
 from lncrawl.core.crawler import Crawler
 
 logger = logging.getLogger(__name__)
-search_url = "https://reaperscans.com/?s=%s&post_type=wp-manga"
 
 
 class Reaperscans(Crawler):
@@ -22,72 +23,97 @@ def initialize(self):
             "https://discord.gg/MaRegMFhRb",
             "https://discord.gg/reapercomics",
             "https://discord.gg/reapercomic",
+            "https://discord.gg/sb2jqkv",
             "____",
-            "Join our Discord for updates on releases!",
+            "Join our Discord",
             ]
         )
+        self.init_executor(ratelimit=0.9)
 
-    def search_novel(self, query):
-        query = query.lower().replace(" ", "+")
-        soup = self.get_soup(search_url % query)
+    def get_chapters_from_page(self, page, body, token):
+        url = self.absolute_url("/livewire/message/" + body["fingerprint"]["name"])
+        body["updates"] = [
+            {
+                "type": "callMethod",
+                "payload": {
+                    "id": "00000",
+                    "method": "gotoPage",
+                    "params": [page, "page"],
+                },
+            }
+        ]
 
-        results = []
-        for tab in soup.select(".c-tabs-item__content"):
-            a = tab.select_one(".post-title h3 a")
-            latest = tab.select_one(".latest-chap .chapter a").text
-            votes = tab.select_one(".rating .total_votes").text
-            results.append(
+        response = self.post_json(url=url, data=json.dumps(body), timeout=10)
+        return self.make_soup(response["effects"]["html"])
+
+    def get_chapters_from_doc(self, dom):
+        return [
+            {
+                "title": a.select_one("p").text.strip(),
+                "url": self.absolute_url(a["href"]),
+            }
+            for a in dom.select("div[wire\\3A id] ul[role] li a")
+        ]
+
+    def insert_chapters(self, total_count, chapter_list):
+        for ch in chapter_list:
+            self.chapters.insert(
+                0,
                 {
-                    "title": a.text.strip(),
-                    "url": self.absolute_url(a["href"]),
-                    "info": "%s | Rating: %s" % (latest, votes),
-                }
+                    "id": total_count - len(self.chapters),
+                    "title": ch["title"],
+                    "url": ch["url"],
+                },
             )
-        return results
-
     def read_novel_info(self):
         logger.debug("Visiting %s", self.novel_url)
         soup = self.get_soup(self.novel_url)
 
-        possible_title = soup.select_one(".post-title h1")
+        possible_title = soup.select_one("h1")
         assert isinstance(possible_title, Tag)
-        for span in possible_title.select("span"):
-            span.extract()
         self.novel_title = possible_title.text.strip()
         logger.info("Novel title: %s", self.novel_title)
 
-        possible_image = soup.select_one(".summary_image a img")
+        possible_image = soup.select_one(".h-full .w-full img")
         if isinstance(possible_image, Tag):
-            self.novel_cover = self.absolute_url(possible_image["data-src"])
+            self.novel_cover = self.absolute_url(possible_image["src"])
             logger.info("Novel cover: %s", self.novel_cover)
 
-        self.novel_author = " ".join(
-            [a.text.strip() for a in soup.select('.author-content a[href*="author"]')]
+        # prolly not even needed, didn't check
+        csrf = soup.select_one('meta[name="csrf-token"]')["content"]
+        # livewire container
+        container = soup.select_one("main div[wire\\:id][wire\\:initial-data]")
+        # first page ssr json
+        data = container["wire:initial-data"]
+        body = json.loads(data)
+        # del the dom effects attr
+        body.pop("effects")
+
+        page_count = int(
+            container.select_one(
+                'span[wire\\:key^="paginator-page"]:nth-last-child(2)'
+            ).text.strip()
         )
-        logger.info("%s", self.novel_author)
-
-        chapter_list_url = self.absolute_url("ajax/chapters", self.novel_url)
-        soup = self.post_soup(chapter_list_url, headers={"accept": "*/*"})
-        for a in reversed(
-            soup.select('.wp-manga-chapter:not(.premium-block) a[href*="/chapter"]')
-        ):  # This stops it from trying to download locked premium chapters.
-            for span in a.findAll("span"):  # Removes time and date from chapter title.
-                span.extract()
-            chap_id = len(self.chapters) + 1
-            vol_id = 1 + len(self.chapters) // 100
-            if chap_id % 100 == 1:
-                self.volumes.append({"id": vol_id})
-            self.chapters.append(
-                {
-                    "id": chap_id,
-                    "volume": vol_id,
-                    "title": a.text.strip(),
-                    "url": self.absolute_url(a["href"]),
-                }
-            )
+        # meh but i can't find a better selector
+        chapter_count = int(
+            container.select_one(
+                "nav > div:last-child > div:first-child > p > span:nth-last-child(2)"
+            ).text.strip()
+        )
+
+        chaps = self.get_chapters_from_doc(container)
+        self.insert_chapters(chapter_count, chaps)
+
+        for k in range(2, page_count + 1):
+            dom = self.get_chapters_from_page(k, body, csrf)
+            chaps = self.get_chapters_from_doc(dom)
+            self.insert_chapters(chapter_count, chaps)
+            # 429 otherwise, could use executor here tho maybe
+            time.sleep(1)
 
     def download_chapter_body(self, chapter):
-        soup = self.get_soup(chapter["url"])
-        contents = soup.select_one("div.text-left")
+        # TODO: better retry/timeout settings
+        soup = self.get_soup(chapter["url"], retry=3, timeout=10)
+        contents = soup.select_one("article")
         return self.cleaner.extract_contents(contents)

From 1b4e8c6dc0aa4fa5fd0cb5894b3b9399d68050df Mon Sep 17 00:00:00 2001
From: kuwoyuki
Date: Fri, 23 Feb 2024 01:07:18 +0600
Subject: [PATCH 2/7] chore: _

---
 sources/en/r/reaperscans.py | 38 +++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/sources/en/r/reaperscans.py b/sources/en/r/reaperscans.py
index 33560b68c..7551c6462 100644
--- a/sources/en/r/reaperscans.py
+++ b/sources/en/r/reaperscans.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import time
+import re
 
 from bs4 import Tag
 
 from lncrawl.core.crawler import Crawler
@@ -30,7 +31,7 @@ def initialize(self):
         )
         self.init_executor(ratelimit=0.9)
 
-    def get_chapters_from_page(self, page, body, token):
+    def get_chapters_from_page(self, page, body):
         url = self.absolute_url("/livewire/message/" + body["fingerprint"]["name"])
         body["updates"] = [
             {
@@ -70,9 +71,7 @@ def read_novel_info(self):
         logger.debug("Visiting %s", self.novel_url)
         soup = self.get_soup(self.novel_url)
 
-        possible_title = soup.select_one("h1")
-        assert isinstance(possible_title, Tag)
-        self.novel_title = possible_title.text.strip()
+        self.novel_title = soup.select_one("h1").text.strip()
         logger.info("Novel title: %s", self.novel_title)
 
         possible_image = soup.select_one(".h-full .w-full img")
@@ -80,33 +79,36 @@ def read_novel_info(self):
             self.novel_cover = self.absolute_url(possible_image["src"])
             logger.info("Novel cover: %s", self.novel_cover)
 
-        # prolly not even needed, didn't check
-        csrf = soup.select_one('meta[name="csrf-token"]')["content"]
         # livewire container
         container = soup.select_one("main div[wire\\:id][wire\\:initial-data]")
         # first page ssr json
-        data = container["wire:initial-data"]
-        body = json.loads(data)
-        # del the dom effects attr
+        body = json.loads(container["wire:initial-data"])
         body.pop("effects")
 
-        page_count = int(
-            container.select_one(
-                'span[wire\\:key^="paginator-page"]:nth-last-child(2)'
-            ).text.strip()
-        )
         # meh but i can't find a better selector
         chapter_count = int(
-            container.select_one(
-                "nav > div:last-child > div:first-child > p > span:nth-last-child(2)"
-            ).text.strip()
+            re.search(
+                r"\d+",
+                soup.find(
+                    lambda tag: tag.name == "h1" and "Chapters" in tag.text
+                ).text.strip(),
+            )[0]
         )
 
         chaps = self.get_chapters_from_doc(container)
         self.insert_chapters(chapter_count, chaps)
 
+        page_count = 1
+        last_page = container.select_one(
+            'span[wire\\:key^="paginator-page"]:nth-last-child(2)'
+        )
+        if isinstance(last_page, Tag):
+            page_count = int(last_page.text.strip())
+        if page_count != 1:
+            return
+
         for k in range(2, page_count + 1):
-            dom = self.get_chapters_from_page(k, body, csrf)
+            dom = self.get_chapters_from_page(k, body)
             chaps = self.get_chapters_from_doc(dom)
             self.insert_chapters(chapter_count, chaps)
             # 429 otherwise, could use executor here tho maybe
             time.sleep(1)

From 6ed913f84878161b059a3fbed656d334d5242f61 Mon Sep 17 00:00:00 2001
From: kuwoyuki
Date: Fri, 23 Feb 2024 01:10:15 +0600
Subject: [PATCH 3/7] chore: _

---
 sources/en/r/reaperscans.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sources/en/r/reaperscans.py b/sources/en/r/reaperscans.py
index 7551c6462..dc9df9bc2 100644
--- a/sources/en/r/reaperscans.py
+++ b/sources/en/r/reaperscans.py
@@ -104,7 +104,8 @@ def read_novel_info(self):
         )
         if isinstance(last_page, Tag):
             page_count = int(last_page.text.strip())
-        if page_count != 1:
+        else:
+            # if we don't have the pagination el
             return
 
         for k in range(2, page_count + 1):

From 7396a442d9a154cd193bb31fa1947593b1cc55ba Mon Sep 17 00:00:00 2001
From: kuwoyuki
Date: Fri, 23 Feb 2024 22:40:26 +0600
Subject: [PATCH 4/7] fix: reverse, use executor for toc

---
 sources/en/r/reaperscans.py | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/sources/en/r/reaperscans.py b/sources/en/r/reaperscans.py
index dc9df9bc2..b78fc810b 100644
--- a/sources/en/r/reaperscans.py
+++ b/sources/en/r/reaperscans.py
@@ -56,16 +56,15 @@ def get_chapters_from_doc(self, dom):
             for a in dom.select("div[wire\\3A id] ul[role] li a")
         ]
 
-    def insert_chapters(self, total_count, chapter_list):
-        for ch in chapter_list:
-            self.chapters.insert(
-                0,
-                {
-                    "id": total_count - len(self.chapters),
-                    "title": ch["title"],
-                    "url": ch["url"],
-                },
-            )
+    def insert_chapters(self, chapters):
+        self.chapters = [
+            {
+                "id": i + 1,
+                "title": x["title"],
+                "url": x["url"],
+            }
+            for i, x in enumerate(reversed(chapters))
+        ]
 
     def read_novel_info(self):
         logger.debug("Visiting %s", self.novel_url)
@@ -95,25 +94,29 @@ def read_novel_info(self):
             )[0]
         )
 
-        chaps = self.get_chapters_from_doc(container)
-        self.insert_chapters(chapter_count, chaps)
-
+        chapters = self.get_chapters_from_doc(container)
         page_count = 1
         last_page = container.select_one(
             'span[wire\\:key^="paginator-page"]:nth-last-child(2)'
         )
+
         if isinstance(last_page, Tag):
             page_count = int(last_page.text.strip())
         else:
+            self.insert_chapters(chapters)
             # if we don't have the pagination el
             return
 
-        for k in range(2, page_count + 1):
-            dom = self.get_chapters_from_page(k, body)
-            chaps = self.get_chapters_from_doc(dom)
-            self.insert_chapters(chapter_count, chaps)
-            # 429 otherwise, could use executor here tho maybe
-            time.sleep(1)
+        toc_futures = [
+            self.executor.submit(self.get_chapters_from_page, k, body)
+            for k in range(2, page_count + 1)
+        ]
+        self.resolve_futures(toc_futures, desc="TOC", unit="page")
+        for f in toc_futures:
+            dom = f.result()
+            chapters.extend(self.get_chapters_from_doc(dom))
+
+        self.insert_chapters(chapters)
 
     def download_chapter_body(self, chapter):
         # TODO: better retry/timeout settings

From bbedbf6d44508cad286058b2884a0b5eaf2a8fc3 Mon Sep 17 00:00:00 2001
From: kuwoyuki
Date: Fri, 23 Feb 2024 22:42:38 +0600
Subject: [PATCH 5/7] fix: lint

---
 sources/en/r/reaperscans.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/sources/en/r/reaperscans.py b/sources/en/r/reaperscans.py
index b78fc810b..a0fdbbf9e 100644
--- a/sources/en/r/reaperscans.py
+++ b/sources/en/r/reaperscans.py
@@ -83,17 +83,7 @@ def read_novel_info(self):
         # first page ssr json
         body = json.loads(container["wire:initial-data"])
         body.pop("effects")
-
-        # meh but i can't find a better selector
-        chapter_count = int(
-            re.search(
-                r"\d+",
-                soup.find(
-                    lambda tag: tag.name == "h1" and "Chapters" in tag.text
-                ).text.strip(),
-            )[0]
-        )
-
+        # initial chapters from soup
         chapters = self.get_chapters_from_doc(container)
         page_count = 1
         last_page = container.select_one(

From e8da097f4dd97e67f209953bb1e5ad2e28f75a76 Mon Sep 17 00:00:00 2001
From: kuwoyuki
Date: Fri, 23 Feb 2024 22:44:55 +0600
Subject: [PATCH 6/7] chore: _

---
 sources/en/r/reaperscans.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sources/en/r/reaperscans.py b/sources/en/r/reaperscans.py
index a0fdbbf9e..8519e892b 100644
--- a/sources/en/r/reaperscans.py
+++ b/sources/en/r/reaperscans.py
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 import json
 import logging
-import time
-import re
 
 from bs4 import Tag
 
 from lncrawl.core.crawler import Crawler

From 51fb83c941109205426f3f50cb588a58706a242a Mon Sep 17 00:00:00 2001
From: kuwoyuki
Date: Fri, 23 Feb 2024 22:47:24 +0600
Subject: [PATCH 7/7] chore: _

---
 sources/en/r/reaperscans.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sources/en/r/reaperscans.py b/sources/en/r/reaperscans.py
index 8519e892b..10b6d0491 100644
--- a/sources/en/r/reaperscans.py
+++ b/sources/en/r/reaperscans.py
@@ -101,8 +101,7 @@ def read_novel_info(self):
         ]
         self.resolve_futures(toc_futures, desc="TOC", unit="page")
         for f in toc_futures:
-            dom = f.result()
-            chapters.extend(self.get_chapters_from_doc(dom))
+            chapters.extend(self.get_chapters_from_doc(f.result()))
 
         self.insert_chapters(chapters)
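
Note on the mechanism (not part of the patch series itself): the new TOC code drives the site's Livewire component directly instead of scraping a WordPress AJAX endpoint. A minimal standalone sketch of the same handshake, assuming a Livewire v2 message endpoint and that the decoded `wire:initial-data` payload carries a `serverMemo` alongside the `fingerprint` the patches already read (the crawler simply forwards the whole decoded object minus `effects`); the function name `fetch_toc_page` is illustrative, not part of the PR:

    import json

    import requests


    def fetch_toc_page(session: requests.Session, initial_data: dict, page: int) -> str:
        """Ask the Livewire component to render one paginator page; return its HTML."""
        component = initial_data["fingerprint"]["name"]
        payload = {
            # echo the component state captured from wire:initial-data
            "fingerprint": initial_data["fingerprint"],
            "serverMemo": initial_data["serverMemo"],
            # same update shape get_chapters_from_page() builds
            "updates": [
                {
                    "type": "callMethod",
                    "payload": {
                        "id": "00000",
                        "method": "gotoPage",
                        "params": [page, "page"],
                    },
                }
            ],
        }
        res = session.post(
            "https://reaperscans.com/livewire/message/" + component,
            data=json.dumps(payload),
            headers={"Content-Type": "application/json"},
            timeout=10,
        )
        res.raise_for_status()
        # Livewire v2 responses carry the re-rendered DOM fragment under effects.html
        return res.json()["effects"]["html"]

This is also why PATCH 1 threads `**kwargs` through `post_json` in lncrawl/core/scraper.py: `get_chapters_from_page()` needs to pass `timeout` down to `post_response` for exactly this request.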