Skip to content

Commit

Permalink
🐛 Bug: Fix the bug where web search fails to retrieve the correct format.
Browse files Browse the repository at this point in the history
  • Loading branch information
yym68686 committed Nov 5, 2024
1 parent fe1dba6 commit df839aa
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 7 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(
name="modelmerge",
version="0.11.59",
version="0.11.60",
description="modelmerge is a multi-large language model API aggregator.",
long_description=Path.open(Path("README.md"), encoding="utf-8").read(),
long_description_content_type="text/markdown",
Expand Down
12 changes: 10 additions & 2 deletions src/ModelMerge/plugins/websearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,18 @@ def get_body(url):
text = httpx.get(url, verify=False, timeout=5).text
if text == "":
return "抱歉,目前无法访问该网页。"
body = lxml.html.fromstring(text).xpath('//body')
# body = lxml.html.fromstring(text).xpath('//body')

doc = lxml.html.fromstring(text)
# 检查是否是GitHub raw文件格式(body > pre)
if doc.xpath('//body/pre'):
return text # 直接返回原始文本,保留格式

body = doc.xpath('//body')
if body == [] and text != "":
body = text
return body
return f'<pre>{body}</pre>'
# return body
else:
body = body[0]
body = Cleaner(javascript=True, style=True).clean_html(body)
Expand Down
16 changes: 12 additions & 4 deletions test/test_Web_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,18 @@ def get_body(url):
text = httpx.get(url, verify=False, timeout=5).text
if text == "":
return "抱歉,目前无法访问该网页。"
body = lxml.html.fromstring(text).xpath('//body')
# body = lxml.html.fromstring(text).xpath('//body')

doc = lxml.html.fromstring(text)
# 检查是否是GitHub raw文件格式(body > pre)
if doc.xpath('//body/pre'):
return text # 直接返回原始文本,保留格式

body = doc.xpath('//body')
if body == [] and text != "":
body = text
return body
return f'<pre>{body}</pre>'
# return body
else:
body = body[0]
body = Cleaner(javascript=True, style=True).clean_html(body)
Expand Down Expand Up @@ -237,8 +245,8 @@ def score_content(content):
# for url in ['https://www.airuniversity.af.edu/JIPA/Display/Article/3111127/the-uschina-trade-war-vietnam-emerges-as-the-greatest-winner/']:
# for url in ['https://zhuanlan.zhihu.com/p/646786536']:
# for url in ['https://zh.wikipedia.org/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD']:
# for url in ['https://raw.githubusercontent.com/yym68686/ChatGPT-Telegram-Bot/main/README.md']:
for url in ['https://raw.githubusercontent.com/openai/openai-python/main/src/openai/api_requestor.py']:
for url in ['https://raw.githubusercontent.com/yym68686/ChatGPT-Telegram-Bot/main/README.md']:
# for url in ['https://raw.githubusercontent.com/openai/openai-python/main/src/openai/api_requestor.py']:
# for url in ['https://stock.finance.sina.com.cn/usstock/quotes/aapl.html']:
# Web_crawler(url)
# print(get_body(url))
Expand Down

0 comments on commit df839aa

Please sign in to comment.