Skip to content

Commit

Permalink
feat(ChatDB): ChatDB uses fine-tuned model
Browse files Browse the repository at this point in the history
1. Compatible with community models that output pure SQL
  • Loading branch information
yhjun1026 committed Nov 17, 2023
1 parent 343d646 commit ea363a4
Show file tree
Hide file tree
Showing 26 changed files with 105 additions and 77 deletions.
58 changes: 33 additions & 25 deletions pilot/base_modules/agent/commands/command_mange.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def check_last_plugin_call_ready(self, all_context):
return False

def __deal_error_md_tags(self, all_context, api_context, include_end: bool = True):
error_md_tags = ["```", "```python", "```xml", "```json", "```markdown"]
error_md_tags = ["```", "```python", "```xml", "```json", "```markdown", "```sql"]
if include_end == False:
md_tag_end = ""
else:
Expand All @@ -261,7 +261,6 @@ def __deal_error_md_tags(self, all_context, api_context, include_end: bool = Tru
return all_context

def api_view_context(self, all_context: str, display_mode: bool = False):
error_mk_tags = ["```", "```python", "```xml"]
call_context_map = extract_content_open_ending(
all_context, self.agent_prefix, self.agent_end, True
)
Expand Down Expand Up @@ -294,8 +293,10 @@ def api_view_context(self, all_context: str, display_mode: bool = False):
now_time = datetime.now().timestamp() * 1000
cost = (now_time - self.start_time) / 1000
cost_str = "{:.2f}".format(cost)
for tag in error_mk_tags:
all_context = all_context.replace(tag + api_context, api_context)
all_context = self.__deal_error_md_tags(
all_context, api_context
)

all_context = all_context.replace(
api_context,
f'\n<span style="color:green">Waiting...{cost_str}S</span>\n',
Expand Down Expand Up @@ -444,29 +445,36 @@ def display_sql_llmvis(self, llm_text, sql_run_func):
Returns:
ChartView protocol text
"""
if self.__is_need_wait_plugin_call(llm_text):
# wait api call generate complete
if self.check_last_plugin_call_ready(llm_text):
self.update_from_context(llm_text)
for key, value in self.plugin_status_map.items():
if value.status == Status.TODO.value:
value.status = Status.RUNNING.value
logging.info(f"sql展示执行:{value.name},{value.args}")
try:
sql = value.args["sql"]
if sql is not None and len(sql) > 0:
data_df = sql_run_func(sql)
value.df = data_df
value.api_result = json.loads(data_df.to_json(orient='records', date_format='iso', date_unit='s'))
value.status = Status.COMPLETED.value
else:
try:
if self.__is_need_wait_plugin_call(llm_text):
# wait api call generate complete
if self.check_last_plugin_call_ready(llm_text):
self.update_from_context(llm_text)
for key, value in self.plugin_status_map.items():
if value.status == Status.TODO.value:
value.status = Status.RUNNING.value
logging.info(f"sql展示执行:{value.name},{value.args}")
try:
sql = value.args["sql"]
if sql is not None and len(sql) > 0:
data_df = sql_run_func(sql)
value.df = data_df
value.api_result = json.loads(
data_df.to_json(orient='records', date_format='iso', date_unit='s'))
value.status = Status.COMPLETED.value
else:
value.status = Status.FAILED.value
value.err_msg = "No executable sql!"

except Exception as e:
value.status = Status.FAILED.value
value.err_msg = "No executable sql!"
value.err_msg = str(e)
value.end_time = datetime.now().timestamp() * 1000
except Exception as e:
logging.error("Api parsing exception", e)
value.status = Status.FAILED.value
value.err_msg = "Api parsing exception," + str(e)

except Exception as e:
value.status = Status.FAILED.value
value.err_msg = str(e)
value.end_time = datetime.now().timestamp() * 1000
return self.api_view_context(llm_text, True)


1 change: 1 addition & 0 deletions pilot/out_parser/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ def parse_prompt_response(self, model_out_text) -> T:
.replace("\\n", " ")
.replace("\n", " ")
.replace("\\", " ")
.replace("\_", "_")
)
cleaned_output = self.__illegal_json_ends(cleaned_output)
return cleaned_output
Expand Down
6 changes: 6 additions & 0 deletions pilot/scene/base_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ async def generate_input_values(self) -> Dict:
def do_action(self, prompt_response):
return prompt_response


def message_adjust(self):
pass

def get_llm_speak(self, prompt_define_response):
if hasattr(prompt_define_response, "thoughts"):
if isinstance(prompt_define_response.thoughts, dict):
Expand Down Expand Up @@ -294,6 +298,8 @@ async def nostream_call(self):

view_message = view_message.replace("\n", "\\n")
self.current_message.add_view_message(view_message)
self.message_adjust()

span.end()
except Exception as e:
print(traceback.format_exc())
Expand Down
2 changes: 1 addition & 1 deletion pilot/scene/chat_agent/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ async def generate_input_values(self) -> Dict[str, str]:
return input_values

def stream_plugin_call(self, text):
text = text.replace("\n", " ")
text = text.replace("\\n", " ").replace("\n", " ").replace("\_", "_").replace("\\", " ")
with root_tracer.start_span(
"ChatAgent.stream_plugin_call.api_call", metadata={"text": text}
):
Expand Down
3 changes: 2 additions & 1 deletion pilot/scene/chat_agent/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@
3.根据上面约束的方式生成每个工具的调用,对于工具使用的提示文本,需要在工具使用前生成
4.如果用户目标无法理解和意图不明确,优先使用搜索引擎工具
5.参数内容可能需要根据用户的目标推理得到,不仅仅是从文本提取
6.约束条件和工具信息作为推理过程的辅助信息,不要表达在给用户的输出内容中
6.约束条件和工具信息作为推理过程的辅助信息,对应内容不要表达在给用户的输出内容中
7.不要把<api-call></api-call>部分内容放在markdown标签里
{expand_constraints}
工具列表:
Expand Down
2 changes: 1 addition & 1 deletion pilot/scene/chat_data/chat_excel/excel_analyze/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ async def prepare(self):
return result

def stream_plugin_call(self, text):
text = text.replace("\n", " ")
text = text.replace("\\n", " ").replace("\n", " ").replace("\_", "_").replace("\\", " ")
with root_tracer.start_span(
"ChatExcel.stream_plugin_call.run_display_sql", metadata={"text": text}
):
Expand Down
8 changes: 4 additions & 4 deletions pilot/scene/chat_data/chat_excel/excel_analyze/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
_PROMPT_SCENE_DEFINE_EN = "You are a data analysis expert. "

_DEFAULT_TEMPLATE_EN = """
Please use the data structure information in the above historical dialogue and combine it with data analysis to answer the user's questions while satisfying the constraints.
Please use the data structure column analysis information generated in the above historical dialogue to answer the user's questions through duckdb sql data analysis under the following constraints..
Constraint:
1.Please fully understand the user's problem and use duckdb sql for analysis. The analysis content is returned in the output format required below. Please output the sql in the corresponding sql parameter.
Expand All @@ -30,14 +30,14 @@

_PROMPT_SCENE_DEFINE_ZH = """你是一个数据分析专家!"""
_DEFAULT_TEMPLATE_ZH = """
请使用上述历史对话中生成的数据结构信息,在满足下面约束条件下通过duckdb sql数据分析回答用户的问题。
请使用历史对话中的数据结构信息,在满足下面约束条件下通过duckdb sql数据分析回答用户的问题。
约束条件:
1.请充分理解用户的问题,使用duckdb sql的方式进行分析, 分析内容按下面要求的输出格式返回,sql请输出在对应的sql参数中
2.请从如下给出的展示方式种选择最优的一种用以进行数据渲染,将类型名称放入返回要求格式的name参数值种,如果找不到最合适的则使用'Table'作为展示方式,可用数据展示方式如下: {disply_type}
3.SQL中需要使用的表名是: {table_name},请检查你生成的sql,不要使用没在数据结构中的列名,。
4.优先使用数据分析的方式回答,如果用户问题不涉及数据分析内容,你可以按你的理解进行回答
5.要求的输出格式中<api-call></api-call>部分需要被代码解析执行,请确保这部分内容按要求输出,不要参考历史信息的返回格式,请按下面要求返回
请确保你的输出格式如下:
请确保你的输出内容格式如下:
对用户说的想法摘要.<api-call><name>[数据展示方式]</name><args><sql>[正确的duckdb数据分析sql]</sql></args></api-call>
用户问题:{user_input}
Expand All @@ -59,7 +59,7 @@
# Temperature is a configuration hyperparameter that controls the randomness of language model output.
# A high temperature produces more unpredictable and creative results, while a low temperature produces more common and conservative output.
# For example, if you adjust the temperature to 0.5, the model will usually generate text that is more predictable and less creative than if you set the temperature to 1.0.
PROMPT_TEMPERATURE = 0.8
PROMPT_TEMPERATURE = 0.3

prompt = PromptTemplate(
template_scene=ChatScene.ChatExcel.value(),
Expand Down
12 changes: 12 additions & 0 deletions pilot/scene/chat_data/chat_excel/excel_learning/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pilot.scene.base_message import (
HumanMessage,
ViewMessage,
AIMessage
)
from pilot.scene.base_chat import BaseChat
from pilot.scene.base import ChatScene
Expand Down Expand Up @@ -59,3 +60,14 @@ async def generate_input_values(self) -> Dict:
"file_name": self.excel_reader.excel_file_name
}
return input_values

def message_adjust(self):
### adjust learning result in messages
view_message = ""
for message in self.current_message.messages:
if message.type == ViewMessage.type:
view_message = message.content

for message in self.current_message.messages:
if message.type == AIMessage.type:
message.content = view_message
55 changes: 28 additions & 27 deletions pilot/scene/chat_data/chat_excel/excel_learning/out_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,54 +21,55 @@ def __init__(self, sep: str, is_stream_out: bool):
super().__init__(sep=sep, is_stream_out=is_stream_out)
self.is_downgraded = False


def parse_prompt_response(self, model_out_text):
try:
clean_str = super().parse_prompt_response(model_out_text)
logger.info(f"parse_prompt_response:{model_out_text},{model_out_text}")
response = json.loads(clean_str)
for key in sorted(response):
if key.strip() == "DataAnalysis":
desciption = response[key]
desciption = response[key]
if key.strip() == "ColumnAnalysis":
clounms = response[key]
if key.strip() == "AnalysisProgram":
plans = response[key]
return ExcelResponse(desciption=desciption, clounms=clounms, plans=plans)
except Exception as e:
logger.error(f"parse_prompt_response Faild!{str(e)}")
self.is_downgraded = True
return ExcelResponse(desciption=model_out_text, clounms=self.data_schema, plans=None)
clounms = []
for name in self.data_schema:
clounms.append({name: "-"})
return ExcelResponse(desciption=model_out_text, clounms=clounms, plans=None)

def __build_colunms_html(self, clounms_data):
html_colunms = f"### **Data Structure**\n"
column_index = 0
for item in clounms_data:
column_index += 1
keys = item.keys()
for key in keys:
html_colunms = (
html_colunms + f"- **{column_index}.[{key}]** _{item[key]}_\n"
)
return html_colunms


def __build_plans_html(self, plans_data):
html_plans = f"### **Analysis plans**\n"
index = 0
if plans_data:
for item in plans_data:
index += 1
html_plans = html_plans + f"{item} \n"
return html_plans

def parse_view_response(self, speak, data, prompt_response) -> str:
if data and not isinstance(data, str):
### tool out data to table view
html_title = f"### **Data Summary**\n{data.desciption} "
html_colunms = f"### **Data Structure**\n"
if self.is_downgraded:
column_index = 0
for item in data.clounms:
column_index += 1
html_colunms = (
html_colunms + f"- **{column_index}.[{item}]** _未知_\n"
)
else:
column_index = 0
for item in data.clounms:
column_index += 1
keys = item.keys()
for key in keys:
html_colunms = (
html_colunms + f"- **{column_index}.[{key}]** _{item[key]}_\n"
)
html_colunms = self.__build_colunms_html(data.clounms)
html_plans = self.__build_plans_html(data.plans)

html_plans = f"### **Recommended analysis plan**\n"
index = 0
if data.plans:
for item in data.plans:
index += 1
html_plans = html_plans + f"{item} \n"
html = f"""{html_title}\n{html_colunms}\n{html_plans}"""
return html
else:
Expand Down
11 changes: 5 additions & 6 deletions pilot/scene/chat_data/chat_excel/excel_learning/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,11 @@
下面是用户文件{file_name}的一部分数据,请学习理解该数据的结构和内容,按要求输出解析结果:
{data_example}
分析各列数据的含义和作用,并对专业术语进行简单明了的解释, 如果是时间类型请给出时间格式类似:yyyy-MM-dd HH:MM:ss.
将列名作为key,分析解释作为value,生成json数组如[\\{{"列名1": "分析解释内容1"\\}},\\{{"列名2":"分析解释2"\\}}],并输出在返回json内容的ColumnAnalysis属性中.
请不要修改或者翻译列名,确保和给出数据列名一致
将列名作为属性名,分析解释作为属性值,组成json数组,并输出在返回json内容的ColumnAnalysis属性中.
请不要修改或者翻译列名,确保和给出数据列名一致.
针对数据从不同维度提供一些有用的分析思路给用户。
提供一些分析方案思路,请一步一步思考。
请以确保只以JSON格式回答,格式如下:
请一步一步思考,确保只以JSON格式回答,具体格式如下:
{response}
"""

Expand Down Expand Up @@ -67,7 +66,7 @@
# Temperature is a configuration hyperparameter that controls the randomness of language model output.
# A high temperature produces more unpredictable and creative results, while a low temperature produces more common and conservative output.
# For example, if you adjust the temperature to 0.5, the model will usually generate text that is more predictable and less creative than if you set the temperature to 1.0.
PROMPT_TEMPERATURE = 0.5
PROMPT_TEMPERATURE = 0.8

prompt = PromptTemplate(
template_scene=ChatScene.ExcelLearning.value(),
Expand Down
2 changes: 1 addition & 1 deletion pilot/server/static/404.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/404/index.html

Large diffs are not rendered by default.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pilot/server/static/agent/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/chat/[scene]/[id]/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/chat/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/database/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/knowledge/chunk/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/knowledge/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/models/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pilot/server/static/prompt/index.html

Large diffs are not rendered by default.

0 comments on commit ea363a4

Please sign in to comment.