From e0d442bc48a1eb7e744b43a5cd5f5e2d3d303b67 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Fri, 15 Nov 2024 15:24:33 +0800 Subject: [PATCH] Enlarge the term weight difference --- api/apps/document_app.py | 13 ++++++++++++- rag/nlp/query.py | 2 +- rag/nlp/term_weight.py | 2 ++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index d1636b5de9..6a2362a902 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License # +import os.path import pathlib import re @@ -36,7 +37,7 @@ from api.settings import RetCode, docStoreConn from api.utils.api_utils import get_json_result from rag.utils.storage_factory import STORAGE_IMPL -from api.utils.file_utils import filename_type, thumbnail +from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory from api.utils.web_utils import html2pdf, is_valid_url from api.constants import IMG_BASE64_PREFIX @@ -529,15 +530,25 @@ def parse(): if not is_valid_url(url): return get_json_result( data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR) + download_path = os.path.join(get_project_base_directory(), "logs/downloads") + os.makedirs(download_path, exist_ok=True) from selenium.webdriver import Chrome, ChromeOptions options = ChromeOptions() options.add_argument('--headless') options.add_argument('--disable-gpu') options.add_argument('--no-sandbox') options.add_argument('--disable-dev-shm-usage') + options.add_experimental_option('prefs', { + 'download.default_directory': download_path, + 'download.prompt_for_download': False, + 'download.directory_upgrade': True, + 'safebrowsing.enabled': True + }) driver = Chrome(options=options) driver.get(url) + print(driver.get_downloadable_files()) sections = RAGFlowHtmlParser().parser_txt(driver.page_source) + driver.close() return get_json_result(data="\n".join(sections)) if 'file' not in request.files: diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 9a6ecec2da..3da59d31be 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -66,7 +66,7 @@ def rmWWW(txt): def question(self, txt, tbl="qa", min_match:float=0.6): txt = re.sub( - r"[ :\r\n\t,,。??/`!!&\^%%()^]+", + r"[ :\r\n\t,,。??/`!!&\^%%()^\[\]]+", " ", rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())), ).strip() diff --git a/rag/nlp/term_weight.py b/rag/nlp/term_weight.py index afd409a159..8e1e598369 100644 --- a/rag/nlp/term_weight.py +++ b/rag/nlp/term_weight.py @@ -228,6 +228,7 @@ def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5))) idf2 = np.array([idf(df(t), 1000000000) for t in tks]) wts = (0.3 * idf1 + 0.7 * idf2) * \ np.array([ner(t) * postag(t) for t in tks]) + wts = [math.exp(s) for s in wts] tw = list(zip(tks, wts)) else: for tk in tks: @@ -236,6 +237,7 @@ def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5))) idf2 = np.array([idf(df(t), 1000000000) for t in tt]) wts = (0.3 * idf1 + 0.7 * idf2) * \ np.array([ner(t) * postag(t) for t in tt]) + wts = [math.exp(s) for s in wts] tw.extend(zip(tt, wts)) S = np.sum([s for _, s in tw])