From 6b9677dafdcad3871c6b228ee3f69fb9efc57c06 Mon Sep 17 00:00:00 2001 From: wdpm <1137299673@qq.com> Date: Sun, 3 Nov 2024 00:37:50 +0800 Subject: [PATCH] fix: add if judgement for not intact text content in linovelib mobile traditional version --- .../mobile-version-text-not-intact.html | 155 ++++++++++++++++++ pyproject.toml | 5 +- src/linovelib2epub/exceptions.py | 18 +- src/linovelib2epub/spider/linovelib_spider.py | 99 +++++++---- 4 files changed, 242 insertions(+), 35 deletions(-) create mode 100644 analyze/linovelib-mobile/mobile-version-text-not-intact.html diff --git a/analyze/linovelib-mobile/mobile-version-text-not-intact.html b/analyze/linovelib-mobile/mobile-version-text-not-intact.html new file mode 100644 index 0000000..0e1ae5e --- /dev/null +++ b/analyze/linovelib-mobile/mobile-version-text-not-intact.html @@ -0,0 +1,155 @@ + + + + + 女主角? 聖女? 不,我是全業女僕(自豪)! 第1章 序章 路多帕克家的大小姐以及萬能女僕 _ 嗶哩輕小說 + + + + + + + + + + + + + + + + + + + + +
+
+
+
+

序章 路多帕克家的大小姐以及萬能女僕

+

第1章

+
+

「歡迎光臨」

+
+

面對來訪的兩個少女,黑髮嬌小的女僕彬彬有禮地鞠躬。

+

然而,少女們並未注意到女僕,而是看著所謂的宅邸的玄關,露出驚愕的表情。

+
+

「這是露西亞娜家的宅邸?」

+

「咦?真的嗎?因為,之前來訪的時候更……」

+
+

兩個少女感到驚訝也是無可厚非的。

+

這是她們第二次拜訪這座宅邸。但是,那時候更……

+
+

「更骯髒破舊嗎?」

+

「「露西亞娜!」」

+
+

從宅邸深處走出一名少女。穿著一件嫩草色的洋裝,金髮碧眼的她,以優雅的舉止走到了門廳等待的少女們身旁。

+

黑髮的女僕輕盈地退到一旁,靜靜地站在宅邸主人的身後。

+
+

「歡迎,貝亞朵莉絲,蜜莉亞莉雅。謝謝你們今天到訪。」

+
+

面對微笑著的少女,被邀請的兩位少女,貝亞朵莉絲和蜜莉亞莉雅,露出了更加驚訝的表情。

+
+

「露西亞娜,你……」

+

「……你怎麼變得如此美麗,露西亞娜。」

+
+

+ 兩個少女無法掩飾自己的驚訝。因為站在她們面前的人,露西亞娜・路多帕克伯爵千金的容貌,站姿,都是如此美麗。

+
+

+ 她原本就擁有美麗的外貌。然而,雖然是伯爵家,但她的家族卻是聞名的沒落貴族。連傭人都沒錢僱用,更不用說維持宅邸和讓她穿著華麗了,只是個名存實亡的貧窮貴族。

+

+ 儘管如此,出現在眼前的少女美麗得難以形容。相隔兩周再次見面的露西亞娜,從頭到腳都展現出真正的伯爵千金的風采。

+
+

+ 原本素材美麗的未得到妥善的照顧的頭髮,現在,每一根都得到了精心的呵護,光彩奪目。腰間長長的金色頭髮在後腦勺部分編織起來,走路時波浪狀的髮絲輕輕地跳舞著圓舞曲。

+
+

+ 曾經因為沒有得到足夠照顧而變得粗糙的白皙肌膚,現在已變得與年齡相稱的潤澤美麗。她那白皙的肌膚、淡淡的薄桃色雙唇,在清純中透露著少女的嫵媚。

+
+

有著一對碧眼的露西亞娜,在貝亞朵莉絲和蜜莉亞莉雅面前,以優美的姿態輕輕拾起裙擺,慢慢地彎膝鞠躬。

+
+

「歡迎至我家來訪,非常感謝。」

+
+

這是非常優雅的行禮。

+

貝亞朵莉絲和蜜莉亞莉雅對她優雅的動作感嘆不已。

+

很難相信她們同齡。從未想過她變得如此美麗。

+
+

「梅洛蒂,我們要去露台,請為我們準備茶點。」

+

「遵命,大小姐。」

+
+

當露西亞娜微笑著對黑髮的女僕梅洛蒂提出要求時,她優雅地行禮微笑著離開。

+
+

「來,我們走吧。」

+
+

在露西亞娜的帶領下,貝亞朵莉絲和蜜莉亞莉雅前往露台。在此期間,她們都被露西亞娜和宅邸的美麗所吸引。

+

再次看著宅邸,毫無疑問這裡正是她們之前拜訪過的地方。身為伯爵家的宅邸規模較小,擺設並不豪華。……(內容加載失敗!請重載或更換瀏覽器)

+

【手機版頁面由於相容性問題暫不支持電腦端閱讀,請使用手機閱讀。】
+
+ + + +
+
+
+
+ + + +
+ + + + + + + + + + + + + \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a62a1cb..76b6e09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ classifiers = [ # alternatives: use hatch plugin to read requirement.txt # now is manual work dependencies = [ + 'setuptools', 'bs4>=0.0.1', 'demjson3>=3.0.5', 'EbookLib>=0.17.1', @@ -46,10 +47,10 @@ dependencies = [ 'pillow>=11.0.0', 'inquirer>=3.1.2', 'aiofiles>=23.1.0', - 'aiohttp>=3.8.4', + 'aiohttp>=3.10.2', 'dynaconf>=3.2.3', 'brotli>=1.1.0', - 'lxml>=4.9.2', + 'lxml>=5.3.0', 'tabulate>=0.9.0', 'DrissionPage>=4.0.4.5', 'selenium>=4.17.2', diff --git a/src/linovelib2epub/exceptions.py b/src/linovelib2epub/exceptions.py index 0b4ce4b..c927bee 100644 --- a/src/linovelib2epub/exceptions.py +++ b/src/linovelib2epub/exceptions.py @@ -5,6 +5,18 @@ class LinovelibException(Exception): pass class PageContentAbnormalException(LinovelibException): - def __init__(self, message="Page content is abnormal."): - self.message = message - super().__init__(self.message) + def __init__(self, message="Page content is abnormal"): + super().__init__(message) + + +class EmptyTitleError(LinovelibException): + def __init__(self, message = "The book title is empty"): + super().__init__(message) + +class EmptyArticleError(LinovelibException): + def __init__(self, message = "The article tag can't be found"): + super().__init__(message) + +class NotIntactTextError(LinovelibException): + def __init__(self, message="The text content is not intact"): + super().__init__(message) \ No newline at end of file diff --git a/src/linovelib2epub/spider/linovelib_spider.py b/src/linovelib2epub/spider/linovelib_spider.py index c4b2fa7..eb6c00e 100644 --- a/src/linovelib2epub/spider/linovelib_spider.py +++ b/src/linovelib2epub/spider/linovelib_spider.py @@ -14,12 +14,13 @@ import requests from DrissionPage import ChromiumOptions, WebPage from PIL import Image -from bs4 import (BeautifulSoup) +from bs4 import (BeautifulSoup, PageElement) from . import BaseNovelWebsiteSpider from .linovelib_mobile_rule import LinovelibMobileRuleParser from .linovelib_pc_rule import LinovelibPCRuleParser -from ..exceptions import LinovelibException, PageContentAbnormalException +from ..exceptions import LinovelibException, PageContentAbnormalException, EmptyTitleError, NotIntactTextError, \ + EmptyArticleError from ..models import LightNovel, LightNovelChapter, LightNovelVolume, LightNovelImage, CatalogLinovelibChapter, \ CatalogLinovelibVolume from ..utils import (cookiedict_from_str, create_folder_if_not_exists, @@ -363,6 +364,7 @@ def _extract_image_list(image_dict=None): return image_url_list + class LinovelibSpiderMobile(BaseLinovelibSpider): def fetch(self): @@ -435,7 +437,7 @@ def _anti_js_obfuscation(html): res = html.translate(table) return res - def _sanitize_html(html: BeautifulSoup) -> str: + def _sanitize_html(html: PageElement) -> str: """ Strip useless script on body tag by reg or soup library method. e.g. @@ -454,6 +456,48 @@ def _sanitize_html(html: BeautifulSoup) -> str: return re.sub(r'', '', str(html_copy), flags=re.DOTALL) + def _require_not_empty_article(soup, page_link): + article_soup = soup.find(id=self._html_content_id) + if article_soup is None: + hints = """ + This can happen for the following reasons: + - The html structure of bilinovel website has changed. => You can submit a github issue to remind the maintainer. + - You are on a network outside of Chinese mainland, and want to request the traditional Chinese version of the website + without specifying the target_site parameter. => Refer README document and set the target_site parameter. + """ + dedent_hints = textwrap.dedent(hints) + self.logger.fatal( + f'The content of {page_link} is Empty and content_id ={self._html_content_id}.' + f'{dedent_hints}') + + raise EmptyArticleError() + + def _require_not_empty_title(soup: BeautifulSoup): + new_title = soup.find(id='atitle') + if new_title is None: + raise EmptyTitleError() + + return new_title + + def _require_intact_text_content(soup: BeautifulSoup): + article_page_element = soup.find(id=self._html_content_id) + article_text = article_page_element.text + error_keywords = [ + "內容加載失敗", + "請重載", + "更換瀏覽器", + "相容性問題", + "不支持電腦端閱讀", + "請使用手機閱讀" + ] + + threshold = 2 + error_count = sum(1 for keyword in error_keywords if keyword in article_text) + if error_count >= threshold: + raise NotIntactTextError() + + return article_page_element + book_catalog_rs = None try: book_catalog_rs = requests_get_with_retry(self._session, @@ -504,21 +548,33 @@ def _sanitize_html(html: BeautifulSoup) -> str: for page_link in catalog_chapter.chapter_urls: self._apply_crawl_delay('page_crawl_delay') - # retry until get the correct title & content body + soup = None + new_title = None + article_page_element = None while True: try: page_resp = self._fetch_page(page_link, max_retries=self.spider_settings['http_retries']) - except (Exception,): - continue - # double check if the title exists - page_resp = page_resp or '' - soup = BeautifulSoup(page_resp, 'lxml') - new_title = soup.find(id='atitle') - if new_title is not None: - self.logger.debug(f'page({page_link}) size={len(page_resp)}') + page_resp = page_resp or '' + soup = BeautifulSoup(page_resp, 'lxml') + + new_title = _require_not_empty_title(soup) + _require_not_empty_article(soup, page_link) + article_page_element = _require_intact_text_content(soup) + + # happy path break + except (EmptyTitleError, NotIntactTextError) as ex: + self.logger.error(f'url: {page_link}; {ex}') + continue + except EmptyArticleError as ex: + self.logger.error(f'url: {page_link}; {ex}; Exit eagerly.') + # very sad path + sys.exit(-1) + except (Exception,) as ex: + self.logger.error(f'url: {page_link}; {ex}') + continue # 分页判断过滤 if not new_title.text.startswith(light_novel_chapter.title): @@ -530,24 +586,7 @@ def _sanitize_html(html: BeautifulSoup) -> str: light_novel_chapter.title = new_title.text images = soup.find_all('img') - - # solve dynamic id - #
- article_soup = soup.find(id=self._html_content_id) - if not article_soup: - hints = """ - This can happen for the following reasons: - - The html structure of bilinovel website has changed. => You can submit a github issue to remind the maintainer. - - You are on a network outside of Chinese mainland, and want to request the traditional Chinese version of the website - without specifying the target_site parameter. => Refer README document and set the target_site parameter. - """ - dedent_hints = textwrap.dedent(hints) - self.logger.fatal( - f'The content of {page_link} is Empty and content_id ={self._html_content_id}.' - f'{dedent_hints}') - sys.exit(1) - - article = _sanitize_html(article_soup) + article = _sanitize_html(article_page_element) for _, image in enumerate(images): # #