Skip to content

Commit

Permalink
Enlarge the term weight difference
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinHuSh committed Nov 15, 2024
1 parent 6d451db commit e0d442b
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 2 deletions.
13 changes: 12 additions & 1 deletion api/apps/document_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
#
import os.path
import pathlib
import re

Expand All @@ -36,7 +37,7 @@
from api.settings import RetCode, docStoreConn
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
from api.utils.web_utils import html2pdf, is_valid_url
from api.constants import IMG_BASE64_PREFIX

Expand Down Expand Up @@ -529,15 +530,25 @@ def parse():
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
os.makedirs(download_path, exist_ok=True)
from selenium.webdriver import Chrome, ChromeOptions
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option('prefs', {
'download.default_directory': download_path,
'download.prompt_for_download': False,
'download.directory_upgrade': True,
'safebrowsing.enabled': True
})
driver = Chrome(options=options)
driver.get(url)
print(driver.get_downloadable_files())
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
driver.close()
return get_json_result(data="\n".join(sections))

if 'file' not in request.files:
Expand Down
2 changes: 1 addition & 1 deletion rag/nlp/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def rmWWW(txt):

def question(self, txt, tbl="qa", min_match:float=0.6):
txt = re.sub(
r"[ :\r\n\t,,。??/`!!&\^%%()^]+",
r"[ :\r\n\t,,。??/`!!&\^%%()^\[\]]+",
" ",
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
).strip()
Expand Down
2 changes: 2 additions & 0 deletions rag/nlp/term_weight.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
wts = (0.3 * idf1 + 0.7 * idf2) * \
np.array([ner(t) * postag(t) for t in tks])
wts = [math.exp(s) for s in wts]
tw = list(zip(tks, wts))
else:
for tk in tks:
Expand All @@ -236,6 +237,7 @@ def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
wts = (0.3 * idf1 + 0.7 * idf2) * \
np.array([ner(t) * postag(t) for t in tt])
wts = [math.exp(s) for s in wts]
tw.extend(zip(tt, wts))

S = np.sum([s for _, s in tw])
Expand Down

0 comments on commit e0d442b

Please sign in to comment.