🐛 Bug: Fix the bug where web search fails to retrieve the correct format.
yym68686 · Nov 5, 2024 · df839aa · df839aa
1 parent fe1dba6
commit df839aa
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 7 deletions.
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name="modelmerge",
-    version="0.11.59",
+    version="0.11.60",
     description="modelmerge is a multi-large language model API aggregator.",
     long_description=Path.open(Path("README.md"), encoding="utf-8").read(),
     long_description_content_type="text/markdown",

diff --git a/src/ModelMerge/plugins/websearch.py b/src/ModelMerge/plugins/websearch.py
@@ -31,10 +31,18 @@ def get_body(url):
             text = httpx.get(url, verify=False, timeout=5).text
             if text == "":
                 return "抱歉，目前无法访问该网页。"
-            body = lxml.html.fromstring(text).xpath('//body')
+            # body = lxml.html.fromstring(text).xpath('//body')
+
+            doc = lxml.html.fromstring(text)
+            # 检查是否是GitHub raw文件格式（body > pre）
+            if doc.xpath('//body/pre'):
+                return text  # 直接返回原始文本，保留格式
+
+            body = doc.xpath('//body')
             if body == [] and text != "":
                 body = text
-                return body
+                return f'<pre>{body}</pre>'
+                # return body
             else:
                 body = body[0]
                 body = Cleaner(javascript=True, style=True).clean_html(body)

diff --git a/test/test_Web_crawler.py b/test/test_Web_crawler.py
@@ -82,10 +82,18 @@ def get_body(url):
             text = httpx.get(url, verify=False, timeout=5).text
             if text == "":
                 return "抱歉，目前无法访问该网页。"
-            body = lxml.html.fromstring(text).xpath('//body')
+            # body = lxml.html.fromstring(text).xpath('//body')
+
+            doc = lxml.html.fromstring(text)
+            # 检查是否是GitHub raw文件格式（body > pre）
+            if doc.xpath('//body/pre'):
+                return text  # 直接返回原始文本，保留格式
+
+            body = doc.xpath('//body')
             if body == [] and text != "":
                 body = text
-                return body
+                return f'<pre>{body}</pre>'
+                # return body
             else:
                 body = body[0]
                 body = Cleaner(javascript=True, style=True).clean_html(body)
@@ -237,8 +245,8 @@ def score_content(content):
 # for url in ['https://www.airuniversity.af.edu/JIPA/Display/Article/3111127/the-uschina-trade-war-vietnam-emerges-as-the-greatest-winner/']:
 # for url in ['https://zhuanlan.zhihu.com/p/646786536']:
 # for url in ['https://zh.wikipedia.org/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD']:
-# for url in ['https://raw.githubusercontent.com/yym68686/ChatGPT-Telegram-Bot/main/README.md']:
-for url in ['https://raw.githubusercontent.com/openai/openai-python/main/src/openai/api_requestor.py']:
+for url in ['https://raw.githubusercontent.com/yym68686/ChatGPT-Telegram-Bot/main/README.md']:
+# for url in ['https://raw.githubusercontent.com/openai/openai-python/main/src/openai/api_requestor.py']:
 # for url in ['https://stock.finance.sina.com.cn/usstock/quotes/aapl.html']:
     # Web_crawler(url)
     # print(get_body(url))