-
Notifications
You must be signed in to change notification settings - Fork 8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
21 changed files
with
849 additions
and
294 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
|
||
from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError | ||
from request_llms.bridge_all import predict_no_ui_long_connection | ||
def get_code_block(reply):
    """Extract the single fenced code block from *reply*.

    Returns the block with its triple-backtick fences re-attached.
    Raises RuntimeError unless exactly one fenced block is present.
    """
    import re
    fenced = re.findall(r"```([\s\S]*?)```", reply)  # capture everything between ``` fences
    if len(fenced) != 1:
        raise RuntimeError("GPT is not generating proper code.")
    return "```" + fenced[0] + "```"
|
||
def is_same_thing(a, b, llm_kwargs):
    """Ask the LLM whether objects *a* and *b* refer to the same thing.

    Two-pass query: first a free-form chain-of-thought answer, then the same
    question with JSON format instructions, parsed via GptJsonIO.

    Args:
        a: description of the target object.
        b: description of the user-input object.
        llm_kwargs: configuration forwarded to the LLM bridge.

    Returns:
        bool: True if the model judged them the same; False otherwise, or
        when the JSON reply could not be auto-repaired.
    """
    from pydantic import BaseModel, Field

    class IsSameThing(BaseModel):
        is_same_thing: bool = Field(description="determine whether two objects are same thing.", default=False)

    def run_gpt_fn(inputs, sys_prompt, history=None):
        # history=None instead of a mutable default list (original bug: history=[]).
        return predict_no_ui_long_connection(
            inputs=inputs, llm_kwargs=llm_kwargs,
            history=history if history is not None else [],
            sys_prompt=sys_prompt, observe_window=[]
        )

    gpt_json_io = GptJsonIO(IsSameThing)
    # First pass: unconstrained reasoning. ("Identify" fixes the original "Identity" typo.)
    inputs_01 = "Identify whether the user input and the target is the same thing: \n target object: {a} \n user input object: {b} \n\n\n".format(a=a, b=b)
    inputs_01 += "\n\n\n Note that the user may describe the target object with a different language, e.g. cat and 猫 are the same thing."
    analyze_res_cot_01 = run_gpt_fn(inputs_01, "", [])

    # Second pass: same question plus JSON formatting instructions, feeding the
    # first exchange back in as conversation history.
    inputs_02 = inputs_01 + gpt_json_io.format_instructions
    analyze_res = run_gpt_fn(inputs_02, "", [inputs_01, analyze_res_cot_01])

    try:
        res = gpt_json_io.generate_output_auto_repair(analyze_res, run_gpt_fn)
        return res.is_same_thing
    except JsonStringError:
        # The model never produced repairable JSON — conservatively report "not same".
        return False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# From project chatglm-langchain | ||
|
||
|
||
from langchain.document_loaders import UnstructuredFileLoader | ||
from langchain.text_splitter import CharacterTextSplitter | ||
import re | ||
from typing import List | ||
|
||
class ChineseTextSplitter(CharacterTextSplitter):
    """Sentence-level splitter for Chinese text (from project chatglm-langchain).

    Splits on Chinese/Western sentence terminators, then recursively breaks
    sentences longer than ``sentence_size`` on progressively weaker separators
    (commas, whitespace runs, single spaces).
    """

    def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
        """pdf: pre-clean PDF-extraction line breaks; sentence_size: max sentence length."""
        super().__init__(**kwargs)
        self.pdf = pdf
        self.sentence_size = sentence_size

    def split_text1(self, text: str) -> List[str]:
        """Coarse split: a single pass on sentence-ending punctuation."""
        if self.pdf:
            # Undo PDF extraction artifacts: collapse blank-line runs and whitespace.
            text = re.sub(r"\n{3,}", "\n", text)
            text = re.sub(r'\s', ' ', text)  # raw string: '\s' alone is an invalid escape
            text = text.replace("\n\n", "")
        sent_sep_pattern = re.compile(r'([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del :;
        sent_list = []
        for ele in sent_sep_pattern.split(text):
            if sent_sep_pattern.match(ele) and sent_list:
                # Separator fragment: glue it back onto the preceding sentence.
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list

    def split_text(self, text: str) -> List[str]:  # NOTE: logic could be further optimized
        """Fine-grained split: sentence split, then recursively re-split any
        piece longer than ``sentence_size`` on weaker punctuation."""
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            text = re.sub(r'\s', " ", text)  # raw string fixes invalid '\s' escape
            text = re.sub("\n\n", "", text)

        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)  # single-char sentence terminators
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # English ellipsis
        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)  # Chinese ellipsis
        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
        # A closing quote ends the sentence only when preceded by a terminator, so
        # the break (\n) goes after the quote; the rules above carefully keep quotes.
        text = text.rstrip()  # drop any trailing \n at the end of the paragraph
        # Many rule sets also break on the semicolon; it is deliberately ignored here,
        # as are dashes and English double quotes — adjust if needed.
        ls = [i for i in text.split("\n") if i]
        for ele in ls:
            if len(ele) > self.sentence_size:
                # Level 1: re-split on commas / periods.
                ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
                ele1_ls = ele1.split("\n")
                for ele_ele1 in ele1_ls:
                    if len(ele_ele1) > self.sentence_size:
                        # Level 2: re-split on newline runs / double-space runs.
                        ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
                        ele2_ls = ele_ele2.split("\n")
                        for ele_ele2 in ele2_ls:
                            if len(ele_ele2) > self.sentence_size:
                                # Level 3: last resort — re-split on single spaces.
                                ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
                                ele2_id = ele2_ls.index(ele_ele2)
                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
                                                                                                       ele2_id + 1:]
                        ele_id = ele1_ls.index(ele_ele1)
                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]

                idx = ls.index(ele)  # renamed from `id` — don't shadow the builtin
                ls = ls[:idx] + [i for i in ele1_ls if i] + ls[idx + 1:]
        return ls
|
||
def load_file(filepath, sentence_size):
    """Load *filepath* as unstructured elements and split it into
    sentence-sized chunks with ChineseTextSplitter.

    Returns the list of split langchain documents.
    """
    splitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
    file_loader = UnstructuredFileLoader(filepath, mode="elements")
    return file_loader.load_and_split(text_splitter=splitter)
|
Oops, something went wrong.