feat(ChatDB): ChatDB uses fine-tuned model

1. Compatible with the community model that outputs pure SQL
eosphoros-ai · Nov 17, 2023 · ea363a4 · ea363a4
1 parent 343d646
commit ea363a4
Show file tree

Hide file tree

Showing 26 changed files with 105 additions and 77 deletions.
diff --git a/pilot/base_modules/agent/commands/command_mange.py b/pilot/base_modules/agent/commands/command_mange.py
@@ -242,7 +242,7 @@ def check_last_plugin_call_ready(self, all_context):
         return False
 
     def __deal_error_md_tags(self, all_context, api_context, include_end: bool = True):
-        error_md_tags = ["```", "```python", "```xml", "```json", "```markdown"]
+        error_md_tags = ["```", "```python", "```xml", "```json", "```markdown", "```sql"]
         if include_end == False:
             md_tag_end = ""
         else:
@@ -261,7 +261,6 @@ def __deal_error_md_tags(self, all_context, api_context, include_end: bool = Tru
         return all_context
 
     def api_view_context(self, all_context: str, display_mode: bool = False):
-        error_mk_tags = ["```", "```python", "```xml"]
         call_context_map = extract_content_open_ending(
             all_context, self.agent_prefix, self.agent_end, True
         )
@@ -294,8 +293,10 @@ def api_view_context(self, all_context: str, display_mode: bool = False):
                 now_time = datetime.now().timestamp() * 1000
                 cost = (now_time - self.start_time) / 1000
                 cost_str = "{:.2f}".format(cost)
-                for tag in error_mk_tags:
-                    all_context = all_context.replace(tag + api_context, api_context)
+                all_context = self.__deal_error_md_tags(
+                    all_context, api_context
+                )
+
                 all_context = all_context.replace(
                     api_context,
                     f'\n<span style="color:green">Waiting...{cost_str}S</span>\n',
@@ -444,29 +445,36 @@ def display_sql_llmvis(self, llm_text, sql_run_func):
         Returns:
            ChartView protocol text
         """
-        if self.__is_need_wait_plugin_call(llm_text):
-            # wait api call generate complete
-            if self.check_last_plugin_call_ready(llm_text):
-                self.update_from_context(llm_text)
-                for key, value in self.plugin_status_map.items():
-                    if value.status == Status.TODO.value:
-                        value.status = Status.RUNNING.value
-                        logging.info(f"sql展示执行:{value.name},{value.args}")
-                        try:
-                            sql = value.args["sql"]
-                            if sql is not None and len(sql) > 0:
-                                data_df = sql_run_func(sql)
-                                value.df = data_df
-                                value.api_result = json.loads(data_df.to_json(orient='records', date_format='iso', date_unit='s'))
-                                value.status = Status.COMPLETED.value
-                            else:
+        try:
+            if self.__is_need_wait_plugin_call(llm_text):
+                # wait api call generate complete
+                if self.check_last_plugin_call_ready(llm_text):
+                    self.update_from_context(llm_text)
+                    for key, value in self.plugin_status_map.items():
+                        if value.status == Status.TODO.value:
+                            value.status = Status.RUNNING.value
+                            logging.info(f"sql展示执行:{value.name},{value.args}")
+                            try:
+                                sql = value.args["sql"]
+                                if sql is not None and len(sql) > 0:
+                                    data_df = sql_run_func(sql)
+                                    value.df = data_df
+                                    value.api_result = json.loads(
+                                        data_df.to_json(orient='records', date_format='iso', date_unit='s'))
+                                    value.status = Status.COMPLETED.value
+                                else:
+                                    value.status = Status.FAILED.value
+                                    value.err_msg = "No executable sql！"
+
+                            except Exception as e:
                                 value.status = Status.FAILED.value
-                                value.err_msg = "No executable sql！"
+                                value.err_msg = str(e)
+                            value.end_time = datetime.now().timestamp() * 1000
+        except Exception as e:
+            logging.error("Api parsing exception", e)
+            value.status = Status.FAILED.value
+            value.err_msg = "Api parsing exception," + str(e)
 
-                        except Exception as e:
-                            value.status = Status.FAILED.value
-                            value.err_msg = str(e)
-                        value.end_time = datetime.now().timestamp() * 1000
         return self.api_view_context(llm_text, True)
 
 
diff --git a/pilot/out_parser/base.py b/pilot/out_parser/base.py
@@ -215,6 +215,7 @@ def parse_prompt_response(self, model_out_text) -> T:
             .replace("\\n", " ")
             .replace("\n", " ")
             .replace("\\", " ")
+            .replace("\_", "_")
         )
         cleaned_output = self.__illegal_json_ends(cleaned_output)
         return cleaned_output

diff --git a/pilot/scene/base_chat.py b/pilot/scene/base_chat.py
@@ -111,6 +111,10 @@ async def generate_input_values(self) -> Dict:
     def do_action(self, prompt_response):
         return prompt_response
 
+
+    def message_adjust(self):
+        pass
+
     def get_llm_speak(self, prompt_define_response):
         if hasattr(prompt_define_response, "thoughts"):
             if isinstance(prompt_define_response.thoughts, dict):
@@ -294,6 +298,8 @@ async def nostream_call(self):
 
             view_message = view_message.replace("\n", "\\n")
             self.current_message.add_view_message(view_message)
+            self.message_adjust()
+
             span.end()
         except Exception as e:
             print(traceback.format_exc())

diff --git a/pilot/scene/chat_agent/chat.py b/pilot/scene/chat_agent/chat.py
@@ -64,7 +64,7 @@ async def generate_input_values(self) -> Dict[str, str]:
         return input_values
 
     def stream_plugin_call(self, text):
-        text = text.replace("\n", " ")
+        text = text.replace("\\n", " ").replace("\n", " ").replace("\_", "_").replace("\\", " ")
         with root_tracer.start_span(
             "ChatAgent.stream_plugin_call.api_call", metadata={"text": text}
         ):

diff --git a/pilot/scene/chat_agent/prompt.py b/pilot/scene/chat_agent/prompt.py
@@ -42,7 +42,8 @@
     3.根据上面约束的方式生成每个工具的调用，对于工具使用的提示文本，需要在工具使用前生成
     4.如果用户目标无法理解和意图不明确，优先使用搜索引擎工具
     5.参数内容可能需要根据用户的目标推理得到，不仅仅是从文本提取
-    6.约束条件和工具信息作为推理过程的辅助信息，不要表达在给用户的输出内容中
+    6.约束条件和工具信息作为推理过程的辅助信息，对应内容不要表达在给用户的输出内容中
+    7.不要把<api-call></api-call>部分内容放在markdown标签里
     {expand_constraints}
 
 工具列表:

diff --git a/pilot/scene/chat_data/chat_excel/excel_analyze/chat.py b/pilot/scene/chat_data/chat_excel/excel_analyze/chat.py
@@ -100,7 +100,7 @@ async def prepare(self):
         return result
 
     def stream_plugin_call(self, text):
-        text = text.replace("\n", " ")
+        text = text.replace("\\n", " ").replace("\n", " ").replace("\_", "_").replace("\\", " ")
         with root_tracer.start_span(
             "ChatExcel.stream_plugin_call.run_display_sql", metadata={"text": text}
         ):

diff --git a/pilot/scene/chat_data/chat_excel/excel_analyze/prompt.py b/pilot/scene/chat_data/chat_excel/excel_analyze/prompt.py
@@ -12,7 +12,7 @@
 _PROMPT_SCENE_DEFINE_EN = "You are a data analysis expert. "
 
 _DEFAULT_TEMPLATE_EN = """
-Please use the data structure information in the above historical dialogue and combine it with data analysis to answer the user's questions while satisfying the constraints.
+Please use the data structure column analysis information generated in the above historical dialogue to answer the user's questions through duckdb sql data analysis under the following constraints..
 
 Constraint:
     1.Please fully understand the user's problem and use duckdb sql for analysis. The analysis content is returned in the output format required below. Please output the sql in the corresponding sql parameter.
@@ -30,14 +30,14 @@
 
 _PROMPT_SCENE_DEFINE_ZH = """你是一个数据分析专家！"""
 _DEFAULT_TEMPLATE_ZH = """
-请使用上述历史对话中生成的数据结构信息，在满足下面约束条件下通过duckdb sql数据分析回答用户的问题。
+请使用历史对话中的数据结构信息，在满足下面约束条件下通过duckdb sql数据分析回答用户的问题。
 约束条件:
 	1.请充分理解用户的问题，使用duckdb sql的方式进行分析， 分析内容按下面要求的输出格式返回，sql请输出在对应的sql参数中
 	2.请从如下给出的展示方式种选择最优的一种用以进行数据渲染，将类型名称放入返回要求格式的name参数值种，如果找不到最合适的则使用'Table'作为展示方式，可用数据展示方式如下: {disply_type}
 	3.SQL中需要使用的表名是: {table_name},请检查你生成的sql，不要使用没在数据结构中的列名，。
 	4.优先使用数据分析的方式回答，如果用户问题不涉及数据分析内容，你可以按你的理解进行回答
 	5.要求的输出格式中<api-call></api-call>部分需要被代码解析执行，请确保这部分内容按要求输出，不要参考历史信息的返回格式，请按下面要求返回
-请确保你的输出格式如下:
+请确保你的输出内容格式如下:
     对用户说的想法摘要.<api-call><name>[数据展示方式]</name><args><sql>[正确的duckdb数据分析sql]</sql></args></api-call>
 
 用户问题：{user_input}
@@ -59,7 +59,7 @@
 # Temperature is a configuration hyperparameter that controls the randomness of language model output.
 # A high temperature produces more unpredictable and creative results, while a low temperature produces more common and conservative output.
 # For example, if you adjust the temperature to 0.5, the model will usually generate text that is more predictable and less creative than if you set the temperature to 1.0.
-PROMPT_TEMPERATURE = 0.8
+PROMPT_TEMPERATURE = 0.3
 
 prompt = PromptTemplate(
     template_scene=ChatScene.ChatExcel.value(),

diff --git a/pilot/scene/chat_data/chat_excel/excel_learning/chat.py b/pilot/scene/chat_data/chat_excel/excel_learning/chat.py
@@ -4,6 +4,7 @@
 from pilot.scene.base_message import (
     HumanMessage,
     ViewMessage,
+    AIMessage
 )
 from pilot.scene.base_chat import BaseChat
 from pilot.scene.base import ChatScene
@@ -59,3 +60,14 @@ async def generate_input_values(self) -> Dict:
             "file_name": self.excel_reader.excel_file_name
         }
         return input_values
+
+    def message_adjust(self):
+        ### adjust learning result in messages
+        view_message = ""
+        for message in self.current_message.messages:
+            if message.type == ViewMessage.type:
+                view_message = message.content
+
+        for message in self.current_message.messages:
+            if message.type == AIMessage.type:
+                message.content = view_message
diff --git a/pilot/scene/chat_data/chat_excel/excel_learning/out_parser.py b/pilot/scene/chat_data/chat_excel/excel_learning/out_parser.py
@@ -21,54 +21,55 @@ def __init__(self, sep: str, is_stream_out: bool):
         super().__init__(sep=sep, is_stream_out=is_stream_out)
         self.is_downgraded = False
 
-
     def parse_prompt_response(self, model_out_text):
         try:
             clean_str = super().parse_prompt_response(model_out_text)
             logger.info(f"parse_prompt_response:{model_out_text},{model_out_text}")
             response = json.loads(clean_str)
             for key in sorted(response):
                 if key.strip() == "DataAnalysis":
-                    desciption = response[key]
+                    desciption =  response[key]
                 if key.strip() == "ColumnAnalysis":
                     clounms = response[key]
                 if key.strip() == "AnalysisProgram":
                     plans = response[key]
             return ExcelResponse(desciption=desciption, clounms=clounms, plans=plans)
         except Exception as e:
             logger.error(f"parse_prompt_response Faild!{str(e)}")
-            self.is_downgraded = True
-            return ExcelResponse(desciption=model_out_text, clounms=self.data_schema, plans=None)
+            clounms = []
+            for name in self.data_schema:
+                clounms.append({name: "-"})
+            return ExcelResponse(desciption=model_out_text, clounms=clounms, plans=None)
+
+    def __build_colunms_html(self, clounms_data):
+        html_colunms = f"### **Data Structure**\n"
+        column_index = 0
+        for item in clounms_data:
+            column_index += 1
+            keys = item.keys()
+            for key in keys:
+                html_colunms = (
+                        html_colunms + f"- **{column_index}.[{key}]**   _{item[key]}_\n"
+                )
+        return  html_colunms
+
 
+    def __build_plans_html(self, plans_data):
+        html_plans = f"### **Analysis plans**\n"
+        index = 0
+        if plans_data:
+            for item in plans_data:
+                index += 1
+                html_plans = html_plans + f"{item} \n"
+        return html_plans
 
     def parse_view_response(self, speak, data, prompt_response) -> str:
         if data and not isinstance(data, str):
             ### tool out data to table view
             html_title = f"### **Data Summary**\n{data.desciption} "
-            html_colunms = f"### **Data Structure**\n"
-            if self.is_downgraded:
-                column_index = 0
-                for item in data.clounms:
-                    column_index += 1
-                    html_colunms = (
-                        html_colunms + f"- **{column_index}.[{item}]**   _未知_\n"
-                    )
-            else:
-                column_index = 0
-                for item in data.clounms:
-                    column_index += 1
-                    keys = item.keys()
-                    for key in keys:
-                        html_colunms = (
-                            html_colunms + f"- **{column_index}.[{key}]**   _{item[key]}_\n"
-                        )
+            html_colunms = self.__build_colunms_html(data.clounms)
+            html_plans = self.__build_plans_html(data.plans)
 
-            html_plans = f"### **Recommended analysis plan**\n"
-            index = 0
-            if data.plans:
-                for item in data.plans:
-                    index += 1
-                    html_plans = html_plans + f"{item} \n"
             html = f"""{html_title}\n{html_colunms}\n{html_plans}"""
             return html
         else:

diff --git a/pilot/scene/chat_data/chat_excel/excel_learning/prompt.py b/pilot/scene/chat_data/chat_excel/excel_learning/prompt.py
@@ -28,12 +28,11 @@
 下面是用户文件{file_name}的一部分数据，请学习理解该数据的结构和内容，按要求输出解析结果:
     {data_example}
 分析各列数据的含义和作用，并对专业术语进行简单明了的解释, 如果是时间类型请给出时间格式类似:yyyy-MM-dd HH:MM:ss.
-将列名作为key，分析解释作为value，生成json数组如[\\{{"列名1": "分析解释内容1"\\}},\\{{"列名2":"分析解释2"\\}}]，并输出在返回json内容的ColumnAnalysis属性中.
-请不要修改或者翻译列名，确保和给出数据列名一致
+将列名作为属性名，分析解释作为属性值,组成json数组，并输出在返回json内容的ColumnAnalysis属性中.
+请不要修改或者翻译列名，确保和给出数据列名一致.
+针对数据从不同维度提供一些有用的分析思路给用户。
 
-提供一些分析方案思路，请一步一步思考。
-
-请以确保只以JSON格式回答，格式如下：
+请一步一步思考,确保只以JSON格式回答，具体格式如下：
     {response}
 """
 
@@ -67,7 +66,7 @@
 # Temperature is a configuration hyperparameter that controls the randomness of language model output.
 # A high temperature produces more unpredictable and creative results, while a low temperature produces more common and conservative output.
 # For example, if you adjust the temperature to 0.5, the model will usually generate text that is more predictable and less creative than if you set the temperature to 1.0.
-PROMPT_TEMPERATURE = 0.5
+PROMPT_TEMPERATURE = 0.8
 
 prompt = PromptTemplate(
     template_scene=ChatScene.ExcelLearning.value(),

diff --git a/pilot/server/static/404.html b/pilot/server/static/404.html
diff --git a/pilot/server/static/404/index.html b/pilot/server/static/404/index.html
diff --git a/...c/j9lfvm6UVgUuLg2BMjk1w/_buildManifest.js → ...c/8DLUDtzfrUUv_4HYlGW9p/_buildManifest.js b/...c/j9lfvm6UVgUuLg2BMjk1w/_buildManifest.js → ...c/8DLUDtzfrUUv_4HYlGW9p/_buildManifest.js
diff --git a/...tic/j9lfvm6UVgUuLg2BMjk1w/_ssgManifest.js → ...tic/8DLUDtzfrUUv_4HYlGW9p/_ssgManifest.js b/...tic/j9lfvm6UVgUuLg2BMjk1w/_ssgManifest.js → ...tic/8DLUDtzfrUUv_4HYlGW9p/_ssgManifest.js
diff --git a/...tic/chunks/pages/_app-120ec20e0ef913c0.js → ...tic/chunks/pages/_app-5048853e4f571e60.js b/...tic/chunks/pages/_app-120ec20e0ef913c0.js → ...tic/chunks/pages/_app-5048853e4f571e60.js
diff --git a/...ic/chunks/pages/index-e5fd29b9e2d6bb59.js → ...ic/chunks/pages/index-b1c8f59fe7e5d7df.js b/...ic/chunks/pages/index-e5fd29b9e2d6bb59.js → ...ic/chunks/pages/index-b1c8f59fe7e5d7df.js
diff --git a/...static/chunks/webpack-1af2ff4dbb3f386d.js → ...static/chunks/webpack-6db517c886de77b1.js b/...static/chunks/webpack-1af2ff4dbb3f386d.js → ...static/chunks/webpack-6db517c886de77b1.js
diff --git a/pilot/server/static/agent/index.html b/pilot/server/static/agent/index.html
diff --git a/pilot/server/static/chat/[scene]/[id]/index.html b/pilot/server/static/chat/[scene]/[id]/index.html
diff --git a/pilot/server/static/chat/index.html b/pilot/server/static/chat/index.html
diff --git a/pilot/server/static/database/index.html b/pilot/server/static/database/index.html
diff --git a/pilot/server/static/index.html b/pilot/server/static/index.html
diff --git a/pilot/server/static/knowledge/chunk/index.html b/pilot/server/static/knowledge/chunk/index.html
diff --git a/pilot/server/static/knowledge/index.html b/pilot/server/static/knowledge/index.html
diff --git a/pilot/server/static/models/index.html b/pilot/server/static/models/index.html
diff --git a/pilot/server/static/prompt/index.html b/pilot/server/static/prompt/index.html