Skip to content

Commit

Permalink
69shuba: auto-fix chapter indexing, fix issue with getting > 4.3k chapters, remove debug
Browse files Browse the repository at this point in the history
  • Loading branch information
ACA committed Feb 5, 2024
1 parent 36d89ab commit 80622fa
Showing 1 changed file with 17 additions and 4 deletions.
21 changes: 17 additions & 4 deletions sources/zh/69shuba.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ class sixnineshu(Crawler):
"https://www.69xinshu.com/",
]

def initialize(self):
    """Select BeautifulSoup's pure-Python ``html.parser`` backend for this crawler.

    Called once during crawler setup (presumably by the Crawler base class —
    TODO confirm against the framework's lifecycle).
    """
    # the default lxml parser cannot handle the huge gbk encoded sites (fails after 4.3k chapters)
    self.init_parser("html.parser")

def search_novel(self, query):
query = urllib.parse.quote(query.encode("gbk"))
data = f"searchkey={query}&submit=Search"
Expand Down Expand Up @@ -94,17 +98,26 @@ def read_novel_info(self):
# https://www.69shuba.com/txt/A43616.htm -> https://www.69shuba.com/A43616/
soup = self.get_soup(self.novel_url.replace("/txt/", "/").replace(".htm", "/"), encoding="gbk")

for li in soup.select("div.catalog ul li"):
# manually correct their false chapter identifiers if need be
correction = 0
for idx, li in enumerate(soup.select("div#catalog ul li")):
chap_id = int(li["data-num"])
if chap_id == 7:
print(str(li.select_one("a")["href"]))
if idx == 0:
# 1-2 = -1; 1-1 = 0; 1 - 0 = +1
correction = 1 - chap_id
chap_id += correction
vol_id = len(self.chapters) // 100 + 1
if len(self.chapters) % 100 == 0:
self.volumes.append(Volume(vol_id))
a = li.select_one("a")
if not a:
# this should not occur with html.parser, if it does, likely due to parser/encoding issue
logger.warning("Failed to get Chapter %d! Missing Link", chap_id)
continue
self.chapters.append(
Chapter(
chap_id,
url=self.absolute_url(li.select_one("a")["href"]),
url=a["href"],
title=li.text.strip(),
volume=vol_id
)
Expand Down

0 comments on commit 80622fa

Please sign in to comment.