fix: reaperscans #2277

Merged · 7 commits · Feb 25, 2024
12 changes: 7 additions & 5 deletions lncrawl/core/scraper.py
@@ -223,9 +223,11 @@ def submit_form(
headers = CaseInsensitiveDict(headers)
headers.setdefault(
"Content-Type",
"multipart/form-data"
if multipart
else "application/x-www-form-urlencoded; charset=UTF-8",
(
"multipart/form-data"
if multipart
else "application/x-www-form-urlencoded; charset=UTF-8"
),
)
return self.post_response(url, data=data, headers=headers, **kwargs)

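The hunk above only re-wraps the conditional default for the Content-Type header in Black-style parentheses; behaviour is unchanged. As a self-contained illustration of that defaulting rule (the helper function below is invented for the example; only CaseInsensitiveDict comes from scraper.py's imports):

```python
from requests.structures import CaseInsensitiveDict


def default_content_type(headers: dict, multipart: bool) -> CaseInsensitiveDict:
    """Mimic submit_form's defaulting: set Content-Type only if the caller did not."""
    headers = CaseInsensitiveDict(headers)
    headers.setdefault(
        "Content-Type",
        (
            "multipart/form-data"
            if multipart
            else "application/x-www-form-urlencoded; charset=UTF-8"
        ),
    )
    return headers


print(default_content_type({}, multipart=True)["Content-Type"])
# multipart/form-data
print(default_content_type({"content-type": "text/plain"}, multipart=False)["Content-Type"])
# text/plain  (the caller's header wins; setdefault never overwrites, lookup is case-insensitive)
```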
@@ -269,15 +271,15 @@ def get_json(self, url, headers={}, **kwargs) -> Any:
response = self.get_response(url, headers=headers, **kwargs)
return response.json()

def post_json(self, url, data={}, headers={}) -> Any:
def post_json(self, url, data={}, headers={}, **kwargs) -> Any:
"""Make a POST request and return the content as JSON object"""
headers = CaseInsensitiveDict(headers)
headers.setdefault("Content-Type", "application/json")
headers.setdefault(
"Accept",
"application/json,text/plain,*/*",
)
response = self.post_response(url, data=data, headers=headers)
response = self.post_response(url, data=data, headers=headers, **kwargs)
return response.json()

def submit_form_json(
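The functional change in scraper.py is the `**kwargs` pass-through on post_json, so per-request options now reach post_response instead of being silently dropped; the Reaper Scans crawler below relies on it to send `timeout=10`. A reduced sketch of the pattern, with a stand-in post_response (MiniScraper is illustrative, not the real Scraper class):

```python
from typing import Any


class MiniScraper:
    def post_response(self, url: str, data: Any = None, headers=None, **kwargs) -> dict:
        # Stand-in for the real HTTP POST; options such as timeout now arrive here.
        print(f"POST {url} extra options: {kwargs}")
        return {"effects": {"html": "<ul></ul>"}}

    def post_json(self, url: str, data: Any = None, headers=None, **kwargs) -> dict:
        # Forwarding **kwargs is the whole fix: previously timeout/retry never reached post_response.
        return self.post_response(url, data=data, headers=headers or {}, **kwargs)


MiniScraper().post_json("https://example.com/livewire/message/x", data="{}", timeout=10)
# POST https://example.com/livewire/message/x extra options: {'timeout': 10}
```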
123 changes: 71 additions & 52 deletions sources/en/r/reaperscans.py
@@ -1,11 +1,11 @@
# -*- coding: utf-8 -*-
import json
import logging
from bs4 import Tag

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)
search_url = "https://reaperscans.com/?s=%s&post_type=wp-manga"


class Reaperscans(Crawler):
@@ -22,72 +22,91 @@ def initialize(self):
"https://discord.gg/MaRegMFhRb",
"https://discord.gg/reapercomics",
"h ttps://discord.gg/reapercomic",
"https://discord.gg/sb2jqkv",
"____",
"Join our Discord for updates on releases!",
"Join our Discord",
]
)
self.init_executor(ratelimit=0.9)

def search_novel(self, query):
query = query.lower().replace(" ", "+")
soup = self.get_soup(search_url % query)

results = []
for tab in soup.select(".c-tabs-item__content"):
a = tab.select_one(".post-title h3 a")
latest = tab.select_one(".latest-chap .chapter a").text
votes = tab.select_one(".rating .total_votes").text
results.append(
{
"title": a.text.strip(),
"url": self.absolute_url(a["href"]),
"info": "%s | Rating: %s" % (latest, votes),
}
)

return results
def get_chapters_from_page(self, page, body):
url = self.absolute_url("/livewire/message/" + body["fingerprint"]["name"])
body["updates"] = [
{
"type": "callMethod",
"payload": {
"id": "00000",
"method": "gotoPage",
"params": [page, "page"],
},
}
]

response = self.post_json(url=url, data=json.dumps(body), timeout=10)
return self.make_soup(response["effects"]["html"])

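get_chapters_from_page drives the site's Livewire-powered chapter list: it takes the component snapshot scraped from the page, appends a `callMethod`/`gotoPage` update, and POSTs it to `/livewire/message/<component name>`; the server answers with a re-rendered HTML fragment. Roughly, the JSON body it sends looks like the sketch below (the fingerprint and serverMemo values are invented; the real ones come from the page's `wire:initial-data` attribute):

```python
import json

# Snapshot lifted from the novel page's wire:initial-data attribute (values made up here).
body = {
    "fingerprint": {"id": "abc123", "name": "frontend.novel-chapters-list", "method": "GET"},
    "serverMemo": {"htmlHash": "deadbeef", "data": {"page": 1}, "checksum": "..."},
}

# The same update get_chapters_from_page appends before POSTing to /livewire/message/<name>.
body["updates"] = [
    {
        "type": "callMethod",
        "payload": {"id": "00000", "method": "gotoPage", "params": [3, "page"]},
    }
]

print(json.dumps(body, indent=2))
# The response JSON carries the re-rendered chapter list in response["effects"]["html"],
# which the crawler feeds to make_soup() and then to get_chapters_from_doc().
```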
def get_chapters_from_doc(self, dom):
return [
{
"title": a.select_one("p").text.strip(),
"url": self.absolute_url(a["href"]),
}
for a in dom.select("div[wire\\3A id] ul[role] li a")
]

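The selector escaping here matters: a bare `:` inside `wire:id` or `wire:key` would be read as a pseudo-class, so the code uses the CSS hex escape `\3A` (written `wire\\3A id` in Python source) or a backslash-escaped colon (`wire\\:id`). A quick check of the idea with BeautifulSoup/soupsieve on invented markup:

```python
from bs4 import BeautifulSoup

html = """
<main>
  <div wire:id="abc123" wire:initial-data='{"fingerprint": {}}'>
    <ul role="list">
      <li><a href="/chapters/1"><p>Chapter 1</p></a></li>
    </ul>
  </div>
</main>
"""
soup = BeautifulSoup(html, "html.parser")

# Both escape forms select the same element; an unescaped "div[wire:id]" would not parse.
assert soup.select_one("div[wire\\3A id]") is soup.select_one("div[wire\\:id]")
print(soup.select("div[wire\\3A id] ul[role] li a")[0]["href"])  # /chapters/1
```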
def insert_chapters(self, chapters):
self.chapters = [
{
"id": i + 1,
"title": x["title"],
"url": x["url"],
}
for i, x in enumerate(reversed(chapters))
]

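insert_chapters numbers chapters only after reversing the scraped list, presumably because the site serves newest chapters first while lncrawl expects ids to run oldest-first. A tiny worked example of the renumbering (the chapter data is made up):

```python
scraped = [  # order as scraped from the site: newest first
    {"title": "Chapter 3", "url": "https://example.com/chapters/3"},
    {"title": "Chapter 2", "url": "https://example.com/chapters/2"},
    {"title": "Chapter 1", "url": "https://example.com/chapters/1"},
]

chapters = [
    {"id": i + 1, "title": x["title"], "url": x["url"]}
    for i, x in enumerate(reversed(scraped))
]
print([(c["id"], c["title"]) for c in chapters])
# [(1, 'Chapter 1'), (2, 'Chapter 2'), (3, 'Chapter 3')]
```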
def read_novel_info(self):
logger.debug("Visiting %s", self.novel_url)
soup = self.get_soup(self.novel_url)

possible_title = soup.select_one(".post-title h1")
assert isinstance(possible_title, Tag)
for span in possible_title.select("span"):
span.extract()
self.novel_title = possible_title.text.strip()
self.novel_title = soup.select_one("h1").text.strip()
logger.info("Novel title: %s", self.novel_title)

possible_image = soup.select_one(".summary_image a img")
possible_image = soup.select_one(".h-full .w-full img")
if isinstance(possible_image, Tag):
self.novel_cover = self.absolute_url(possible_image["data-src"])
self.novel_cover = self.absolute_url(possible_image["src"])
logger.info("Novel cover: %s", self.novel_cover)

self.novel_author = " ".join(
[a.text.strip() for a in soup.select('.author-content a[href*="author"]')]
# livewire container
container = soup.select_one("main div[wire\\:id][wire\\:initial-data]")
# first page ssr json
body = json.loads(container["wire:initial-data"])
body.pop("effects")
# initial chapters from soup
chapters = self.get_chapters_from_doc(container)
page_count = 1
last_page = container.select_one(
'span[wire\\:key^="paginator-page"]:nth-last-child(2)'
)
logger.info("%s", self.novel_author)

chapter_list_url = self.absolute_url("ajax/chapters", self.novel_url)
soup = self.post_soup(chapter_list_url, headers={"accept": "*/*"})
for a in reversed(
soup.select('.wp-manga-chapter:not(.premium-block) a[href*="/chapter"]')
): # This stops it from trying to download locked premium chapters.
for span in a.findAll("span"): # Removes time and date from chapter title.
span.extract()
chap_id = len(self.chapters) + 1
vol_id = 1 + len(self.chapters) // 100
if chap_id % 100 == 1:
self.volumes.append({"id": vol_id})
self.chapters.append(
{
"id": chap_id,
"volume": vol_id,
"title": a.text.strip(),
"url": self.absolute_url(a["href"]),
}
)

if isinstance(last_page, Tag):
page_count = int(last_page.text.strip())
else:
self.insert_chapters(chapters)
# no pagination element found; only a single page of chapters
return

toc_futures = [
self.executor.submit(self.get_chapters_from_page, k, body)
for k in range(2, page_count + 1)
]
self.resolve_futures(toc_futures, desc="TOC", unit="page")
for f in toc_futures:
chapters.extend(self.get_chapters_from_doc(f.result()))

self.insert_chapters(chapters)

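Taken together, read_novel_info renders page 1 of the TOC from the Livewire container already present in the HTML, reads the page count from the paginator's second-to-last `wire:key` span, fetches pages 2..N concurrently via gotoPage, and only then renumbers everything. A condensed sketch of that control flow (helper names mirror the diff, but the executor handling is simplified and this is not the real method):

```python
from concurrent.futures import ThreadPoolExecutor


def collect_chapters(crawler, container, body):
    """Sketch of the TOC pagination in read_novel_info (not the real implementation)."""
    chapters = crawler.get_chapters_from_doc(container)  # page 1 is already rendered

    last_page = container.select_one('span[wire\\:key^="paginator-page"]:nth-last-child(2)')
    if last_page is None:
        return chapters  # no paginator, so there is only one page of chapters

    page_count = int(last_page.text.strip())
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [
            pool.submit(crawler.get_chapters_from_page, page, body)
            for page in range(2, page_count + 1)
        ]
        for future in futures:
            chapters.extend(crawler.get_chapters_from_doc(future.result()))
    return chapters
```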
def download_chapter_body(self, chapter):
soup = self.get_soup(chapter["url"])
contents = soup.select_one("div.text-left")
# TODO: better retry/timeout settings
soup = self.get_soup(chapter["url"], retry=3, timeout=10)
contents = soup.select_one("article")
return self.cleaner.extract_contents(contents)
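download_chapter_body now asks get_soup for `retry=3, timeout=10`, which are presumably forwarded to the underlying request much like the post_json kwargs above; the TODO marks those numbers as provisional. For experimenting with the backoff outside the crawler, a generic retry helper along these lines could be used (this function is not part of lncrawl):

```python
import time

import requests


def get_with_retry(url: str, retries: int = 3, timeout: float = 10.0, backoff: float = 1.5):
    """Fetch url, retrying transient failures with a growing delay between attempts."""
    last_error = None
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as err:  # connection errors, timeouts, HTTP errors
            last_error = err
            time.sleep(backoff * (attempt + 1))
    raise last_error
```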