From b42a77465b8f5d5f36a92e82d834c539e732cb9c Mon Sep 17 00:00:00 2001 From: yhjun1026 <460342015@qq.com> Date: Thu, 19 Oct 2023 19:19:41 +0800 Subject: [PATCH 1/6] feat(ChatExcel): ChatExcel example file add example excel file use to test ChatExcel --- pilot/base_modules/agent/db/my_plugin_db.py | 7 +++++-- pilot/base_modules/agent/db/plugin_hub_db.py | 14 +++++++++----- pilot/memory/chat_history/chat_history_db.py | 15 +++++++++------ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pilot/base_modules/agent/db/my_plugin_db.py b/pilot/base_modules/agent/db/my_plugin_db.py index b021088db..acfb70e23 100644 --- a/pilot/base_modules/agent/db/my_plugin_db.py +++ b/pilot/base_modules/agent/db/my_plugin_db.py @@ -9,7 +9,10 @@ class MyPluginEntity(Base): __tablename__ = "my_plugin" - + __table_args__ = { + "mysql_charset": "utf8mb4", + "mysql_collate": "utf8mb4_unicode_ci", + } id = Column(Integer, primary_key=True, comment="autoincrement id") tenant = Column(String(255), nullable=True, comment="user's tenant") user_code = Column(String(255), nullable=False, comment="user code") @@ -27,7 +30,7 @@ class MyPluginEntity(Base): created_at = Column( DateTime, default=datetime.utcnow, comment="plugin install time" ) - __table_args__ = (UniqueConstraint("user_code", "name", name="uk_name"),) + UniqueConstraint("user_code", "name", name="uk_name") class MyPluginDao(BaseDao[MyPluginEntity]): diff --git a/pilot/base_modules/agent/db/plugin_hub_db.py b/pilot/base_modules/agent/db/plugin_hub_db.py index 8507bcc8e..872324e33 100644 --- a/pilot/base_modules/agent/db/plugin_hub_db.py +++ b/pilot/base_modules/agent/db/plugin_hub_db.py @@ -1,7 +1,7 @@ from datetime import datetime import pytz from typing import List -from sqlalchemy import Column, Integer, String, Index, DateTime, func, Boolean +from sqlalchemy import Column, Integer, String, Index, DateTime, func, Boolean, DDL from sqlalchemy import UniqueConstraint from pilot.base_modules.meta_data.meta_data import Base 
@@ -9,8 +9,13 @@ from pilot.base_modules.meta_data.meta_data import Base, engine, session +char_set_sql = DDL("ALTER TABLE plugin_hub CONVERT TO CHARACTER SET utf8mb4") class PluginHubEntity(Base): __tablename__ = "plugin_hub" + __table_args__ = { + "mysql_charset": "utf8mb4", + "mysql_collate": "utf8mb4_unicode_ci", + } id = Column( Integer, primary_key=True, autoincrement=True, comment="autoincrement id" ) @@ -26,10 +31,9 @@ class PluginHubEntity(Base): created_at = Column(DateTime, default=datetime.utcnow, comment="plugin upload time") installed = Column(Integer, default=False, comment="plugin already installed count") - __table_args__ = ( - UniqueConstraint("name", name="uk_name"), - Index("idx_q_type", "type"), - ) + UniqueConstraint("name", name="uk_name") + Index("idx_q_type", "type") + class PluginHubDao(BaseDao[PluginHubEntity]): diff --git a/pilot/memory/chat_history/chat_history_db.py b/pilot/memory/chat_history/chat_history_db.py index 2b1a57c28..6d66d5d4c 100644 --- a/pilot/memory/chat_history/chat_history_db.py +++ b/pilot/memory/chat_history/chat_history_db.py @@ -10,6 +10,10 @@ class ChatHistoryEntity(Base): id = Column( Integer, primary_key=True, autoincrement=True, comment="autoincrement id" ) + __table_args__ = { + "mysql_charset": "utf8mb4", + "mysql_collate": "utf8mb4_unicode_ci", + } conv_uid = Column( String(255), unique=False, @@ -21,12 +25,11 @@ class ChatHistoryEntity(Base): user_name = Column(String(255), nullable=True, comment="interlocutor") messages = Column(Text, nullable=True, comment="Conversation details") - __table_args__ = ( - UniqueConstraint("conv_uid", name="uk_conversation"), - Index("idx_q_user", "user_name"), - Index("idx_q_mode", "chat_mode"), - Index("idx_q_conv", "summary"), - ) + UniqueConstraint("conv_uid", name="uk_conversation") + Index("idx_q_user", "user_name") + Index("idx_q_mode", "chat_mode") + Index("idx_q_conv", "summary") + class ChatHistoryDao(BaseDao[ChatHistoryEntity]): From 
e98786eb69a62c0648c56c17d31d59a293984358 Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Thu, 19 Oct 2023 19:34:41 +0800 Subject: [PATCH 2/6] fix:meta data schema charset bug --- docs/getting_started/faq/deploy/deploy_faq.md | 8 ++- .../getting_started/faq/deploy/deploy_faq.po | 53 +++++++++++-------- pilot/base_modules/agent/db/plugin_hub_db.py | 3 +- pilot/embedding_engine/source_embedding.py | 9 +++- pilot/memory/chat_history/chat_history_db.py | 1 - pilot/openapi/api_v1/feedback/feed_back_db.py | 4 ++ pilot/server/knowledge/chunk_db.py | 4 ++ pilot/server/knowledge/document_db.py | 6 +++ pilot/server/knowledge/space_db.py | 4 ++ pilot/server/prompt/prompt_manage_db.py | 4 ++ pilot/vector_store/connector.py | 26 ++++++--- 11 files changed, 91 insertions(+), 31 deletions(-) diff --git a/docs/getting_started/faq/deploy/deploy_faq.md b/docs/getting_started/faq/deploy/deploy_faq.md index 1262395fc..c787337ed 100644 --- a/docs/getting_started/faq/deploy/deploy_faq.md +++ b/docs/getting_started/faq/deploy/deploy_faq.md @@ -97,4 +97,10 @@ pip install langchain>=0.0.286 ```commandline pip install --use-pep517 fschat -``` \ No newline at end of file +``` + +##### Q9: alembic.util.exc.CommandError: Target database is not up to date. +delete files in `DB-GPT/pilot/meta_data/alembic/versions/` and reboot. 
+```commandline +rm -rf DB-GPT/pilot/meta_data/alembic/versions/* +``` diff --git a/docs/locales/zh_CN/LC_MESSAGES/getting_started/faq/deploy/deploy_faq.po b/docs/locales/zh_CN/LC_MESSAGES/getting_started/faq/deploy/deploy_faq.po index 41f58c015..e8a085390 100644 --- a/docs/locales/zh_CN/LC_MESSAGES/getting_started/faq/deploy/deploy_faq.po +++ b/docs/locales/zh_CN/LC_MESSAGES/getting_started/faq/deploy/deploy_faq.po @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: DB-GPT 👏👏 0.3.5\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-09-26 17:47+0800\n" +"POT-Creation-Date: 2023-10-19 19:31+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: zh_CN\n" @@ -20,12 +20,12 @@ msgstr "" "Generated-By: Babel 2.12.1\n" #: ../../getting_started/faq/deploy/deploy_faq.md:1 -#: ca823e9d6d1d433db7ed15c8273e1b00 +#: fb640f7c38744cbf996dcf7f73f325f6 msgid "Installation FAQ" msgstr "Installation FAQ" #: ../../getting_started/faq/deploy/deploy_faq.md:5 -#: 3803d098c534434f9f513b3a62de54a4 +#: 79fd80e469d14d608554d53a0e0ed2e3 #, fuzzy msgid "" "Q1: execute `pip install -e .` error, found some package cannot find " @@ -35,18 +35,18 @@ msgstr "" "cannot find correct version." #: ../../getting_started/faq/deploy/deploy_faq.md:6 -#: b785864f47e643df9a4669d8da6167d6 +#: f1f6e3291d1446b5bbcf744cd4c4e89a msgid "change the pip source." msgstr "替换pip源." 
#: ../../getting_started/faq/deploy/deploy_faq.md:13 #: ../../getting_started/faq/deploy/deploy_faq.md:20 -#: c41f026fb1464c71a45d0746c224ecce f70fb69b568d4fc4ad4c4731b2032eaf +#: 68e1b39a08774a81b9061cc5205e4c1c dd34901f446749e998cd34ec5b6c44f4 msgid "or" msgstr "或者" #: ../../getting_started/faq/deploy/deploy_faq.md:27 -#: d179e3d695764f838dc354eb0d978bb3 +#: 0899f0e28dae443b8f912d96c797b79c msgid "" "Q2: sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) unable to" " open database file" @@ -55,86 +55,97 @@ msgstr "" " open database file" #: ../../getting_started/faq/deploy/deploy_faq.md:29 -#: 55174e8d247a414e8c6c8861d4707a55 +#: 3e60d8190e49436b8c40b34a67b7bfb3 msgid "make sure you pull latest code or create directory with mkdir pilot/data" msgstr "make sure you pull latest code or create directory with mkdir pilot/data" #: ../../getting_started/faq/deploy/deploy_faq.md:31 -#: dbce9e9cae734a5083a6f0fc28bce7cd +#: baeaae20238842d3b8e4ae5b337198e5 msgid "Q3: The model keeps getting killed." msgstr "Q3: The model keeps getting killed." #: ../../getting_started/faq/deploy/deploy_faq.md:33 -#: 2de5648d2e7546bf85f20f4162003298 +#: eb3936307ad64b19b73483ff9ae126f2 msgid "" "your GPU VRAM size is not enough, try replace your hardware or replace " "other llms." msgstr "GPU显存不够, 增加显存或者换一个显存小的模型" #: ../../getting_started/faq/deploy/deploy_faq.md:35 -#: 47810771cd364964b9b5b8fd85bca4ee +#: f6dba770717041699c73b4cd00d48aad msgid "Q4: How to access website on the public network" msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:37 -#: e8c5bac6680648509d528ea6aaf5994e +#: 447d9e9374de44bab6d8a03f2c936676 msgid "" "You can try to use gradio's [network](https://github.com/gradio-" "app/gradio/blob/main/gradio/networking.py) to achieve." msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:48 -#: bb75ec127f574c00a09d92d5206e9357 +#: 5e34dd4dfcf34feeb1815dfa974041d0 msgid "Open `url` with your browser to see the website." 
msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:50 -#: 5fdb87b84bd94385a1a93dab8d41ebe8 +#: aaef774ce6124021a3862bc0a25d465f msgid "Q5: (Windows) execute `pip install -e .` error" msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:52 -#: 31eef51e044044f29f3ad08defa9c305 +#: ec3945df451c4ec2b32ebb476f45c82b msgid "The error log like the following:" msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:71 -#: aaba0c3060b443e4b9877f70d78321ce +#: 1df09f6d9f9b4c1a8a32d6e271e5ee39 msgid "" "Download and install `Microsoft C++ Build Tools` from [visual-cpp-build-" "tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)" msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:75 -#: 4c8137546e5c4240884f7ea6d9d922bf +#: 251f47bfa5694242a1c9d81a2022b7a0 msgid "Q6: `Torch not compiled with CUDA enabled`" msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:82 -#: 01daf14f8c494219b1d9a5af4449951e +#: bc9dfdfc47924a0e8d3ec535e23bf923 msgid "Install [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive)" msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:83 -#: c75e6371911e4d5ca6859e51501c9679 +#: b5a632baa42745bdbee5d6ba516d8d8b msgid "" "Reinstall PyTorch [start-locally](https://pytorch.org/get-started/locally" "/#start-locally) with CUDA support." msgstr "" #: ../../getting_started/faq/deploy/deploy_faq.md:85 -#: 7cfb9003e505445ebb9ed3d015e184e2 +#: 0092fb91642749f5a55b629017c0de6a msgid "Q7: ImportError: cannot import name 'PersistentClient' from 'chromadb'." msgstr "Q7: ImportError: cannot import name 'PersistentClient' from 'chromadb'." 
#: ../../getting_started/faq/deploy/deploy_faq.md:91 -#: e1d5d5d85ddc480d8d81f7b550848cbf +#: 4aa87418f2a54c138bf3b7ff28a7e776 msgid "" "Q8: pydantic.error_wrappers.ValidationError:1 validation error for " "HuggingFaceEmbeddings.model_kwargs extra not permitted" -msgstr "Q8: pydantic.error_wrappers.ValidationError:1 validation error for " +msgstr "" +"Q8: pydantic.error_wrappers.ValidationError:1 validation error for " "HuggingFaceEmbeddings.model_kwargs extra not permitted" +#: ../../getting_started/faq/deploy/deploy_faq.md:102 +#: 6b690ab272af44f6b126cfe5ce1435ef +msgid "Q9: alembic.util.exc.CommandError: Target database is not up to date." +msgstr "" + +#: ../../getting_started/faq/deploy/deploy_faq.md:103 +#: 223026d3b9124363b695937922d8f8d5 +msgid "delete files in `DB-GPT/pilot/meta_data/alembic/versions/` and reboot." +msgstr "删除`DB-GPT/pilot/meta_data/alembic/versions/`目录下文件" + #~ msgid "" #~ "Q2: When use Mysql, Access denied " #~ "for user 'root@localhost'(using password :NO)" diff --git a/pilot/base_modules/agent/db/plugin_hub_db.py b/pilot/base_modules/agent/db/plugin_hub_db.py index 872324e33..89ec314c7 100644 --- a/pilot/base_modules/agent/db/plugin_hub_db.py +++ b/pilot/base_modules/agent/db/plugin_hub_db.py @@ -10,6 +10,8 @@ char_set_sql = DDL("ALTER TABLE plugin_hub CONVERT TO CHARACTER SET utf8mb4") + + class PluginHubEntity(Base): __tablename__ = "plugin_hub" __table_args__ = { @@ -35,7 +37,6 @@ class PluginHubEntity(Base): Index("idx_q_type", "type") - class PluginHubDao(BaseDao[PluginHubEntity]): def __init__(self): super().__init__( diff --git a/pilot/embedding_engine/source_embedding.py b/pilot/embedding_engine/source_embedding.py index 24bae97b2..f8f9458bc 100644 --- a/pilot/embedding_engine/source_embedding.py +++ b/pilot/embedding_engine/source_embedding.py @@ -29,7 +29,14 @@ def __init__( text_splitter: Optional[TextSplitter] = None, embedding_args: Optional[Dict] = None, ): - """Initialize with Loader url, model_name, 
vector_store_config""" + """Initialize with Loader url, model_name, vector_store_config + Args: + - file_path: data source path + - vector_store_config: vector store config params. + - source_reader: Optional[] + - text_splitter: Optional[TextSplitter] + - embedding_args: Optional + """ self.file_path = file_path self.vector_store_config = vector_store_config self.source_reader = source_reader or None diff --git a/pilot/memory/chat_history/chat_history_db.py b/pilot/memory/chat_history/chat_history_db.py index 6d66d5d4c..2f9b2999c 100644 --- a/pilot/memory/chat_history/chat_history_db.py +++ b/pilot/memory/chat_history/chat_history_db.py @@ -31,7 +31,6 @@ class ChatHistoryEntity(Base): Index("idx_q_conv", "summary") - class ChatHistoryDao(BaseDao[ChatHistoryEntity]): def __init__(self): super().__init__( diff --git a/pilot/openapi/api_v1/feedback/feed_back_db.py b/pilot/openapi/api_v1/feedback/feed_back_db.py index 64b99b8de..3d697263b 100644 --- a/pilot/openapi/api_v1/feedback/feed_back_db.py +++ b/pilot/openapi/api_v1/feedback/feed_back_db.py @@ -9,6 +9,10 @@ class ChatFeedBackEntity(Base): __tablename__ = "chat_feed_back" + __table_args__ = { + "mysql_charset": "utf8mb4", + "mysql_collate": "utf8mb4_unicode_ci", + } id = Column(Integer, primary_key=True) conv_uid = Column(String(128)) conv_index = Column(Integer) diff --git a/pilot/server/knowledge/chunk_db.py b/pilot/server/knowledge/chunk_db.py index 205b77044..f1e792377 100644 --- a/pilot/server/knowledge/chunk_db.py +++ b/pilot/server/knowledge/chunk_db.py @@ -12,6 +12,10 @@ class DocumentChunkEntity(Base): __tablename__ = "document_chunk" + __table_args__ = { + "mysql_charset": "utf8mb4", + "mysql_collate": "utf8mb4_unicode_ci", + } id = Column(Integer, primary_key=True) document_id = Column(Integer) doc_name = Column(String(100)) diff --git a/pilot/server/knowledge/document_db.py b/pilot/server/knowledge/document_db.py index 5f7b47add..b2c54aa64 100644 --- a/pilot/server/knowledge/document_db.py +++ 
b/pilot/server/knowledge/document_db.py @@ -11,6 +11,10 @@ class KnowledgeDocumentEntity(Base): __tablename__ = "knowledge_document" + __table_args__ = { + "mysql_charset": "utf8mb4", + "mysql_collate": "utf8mb4_unicode_ci", + } id = Column(Integer, primary_key=True) doc_name = Column(String(100)) doc_type = Column(String(100)) @@ -24,6 +28,8 @@ class KnowledgeDocumentEntity(Base): gmt_created = Column(DateTime) gmt_modified = Column(DateTime) + __table_args__ = {"mysql_charset": "utf8mb4"} + def __repr__(self): return f"KnowledgeDocumentEntity(id={self.id}, doc_name='{self.doc_name}', doc_type='{self.doc_type}', chunk_size='{self.chunk_size}', status='{self.status}', last_sync='{self.last_sync}', content='{self.content}', result='{self.result}', gmt_created='{self.gmt_created}', gmt_modified='{self.gmt_modified}')" diff --git a/pilot/server/knowledge/space_db.py b/pilot/server/knowledge/space_db.py index 491fe303b..b833906da 100644 --- a/pilot/server/knowledge/space_db.py +++ b/pilot/server/knowledge/space_db.py @@ -12,6 +12,10 @@ class KnowledgeSpaceEntity(Base): __tablename__ = "knowledge_space" + __table_args__ = { + "mysql_charset": "utf8mb4", + "mysql_collate": "utf8mb4_unicode_ci", + } id = Column(Integer, primary_key=True) name = Column(String(100)) vector_type = Column(String(100)) diff --git a/pilot/server/prompt/prompt_manage_db.py b/pilot/server/prompt/prompt_manage_db.py index 56bbac20d..ea482a3bb 100644 --- a/pilot/server/prompt/prompt_manage_db.py +++ b/pilot/server/prompt/prompt_manage_db.py @@ -13,6 +13,10 @@ class PromptManageEntity(Base): __tablename__ = "prompt_manage" + __table_args__ = { + "mysql_charset": "utf8mb4", + "mysql_collate": "utf8mb4_unicode_ci", + } id = Column(Integer, primary_key=True) chat_scene = Column(String(100)) sub_chat_scene = Column(String(100)) diff --git a/pilot/vector_store/connector.py b/pilot/vector_store/connector.py index fd2198c0f..b174d7289 100644 --- a/pilot/vector_store/connector.py +++ 
b/pilot/vector_store/connector.py @@ -14,7 +14,11 @@ class VectorStoreConnector: """ def __init__(self, vector_store_type, ctx: {}) -> None: - """initialize vector store connector.""" + """initialize vector store connector. + Args: + - vector_store_type: vector store type Milvus, Chroma, Weaviate + - ctx: vector store config params. + """ self.ctx = ctx self._register() @@ -30,20 +34,30 @@ def load_document(self, docs): """load document in vector database.""" return self.client.load_document(docs) - def similar_search(self, docs, topk): - """similar search in vector database.""" - return self.client.similar_search(docs, topk) + def similar_search(self, query: str, topk: int): + """similar search in vector database. + Args: + - query: query text + - topk: topk + """ + return self.client.similar_search(query, topk) def vector_name_exists(self): """is vector store name exist.""" return self.client.vector_name_exists() def delete_vector_name(self, vector_name): - """vector store delete""" + """vector store delete + Args: + - vector_name: vector store name + """ return self.client.delete_vector_name(vector_name) def delete_by_ids(self, ids): - """vector store delete by ids.""" + """vector store delete by ids. 
+ Args: + - ids: vector ids + """ return self.client.delete_by_ids(ids=ids) def _match(self, vector_store_type) -> bool: From 01eae42554dc3188bb126a39d179a2d463acaea5 Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Thu, 19 Oct 2023 19:39:36 +0800 Subject: [PATCH 3/6] style:fmt --- pilot/embedding_engine/source_embedding.py | 26 +++++++++++++++++----- pilot/vector_store/connector.py | 6 ++--- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pilot/embedding_engine/source_embedding.py b/pilot/embedding_engine/source_embedding.py index f8f9458bc..64e422079 100644 --- a/pilot/embedding_engine/source_embedding.py +++ b/pilot/embedding_engine/source_embedding.py @@ -51,21 +51,33 @@ def read(self) -> List[ABC]: @register def data_process(self, text): - """pre process data.""" + """pre process data. + Args: + - text: raw text + """ @register def text_splitter(self, text_splitter: TextSplitter): - """add text split chunk""" + """add text split chunk + Args: + - text_splitter: TextSplitter + """ pass @register def text_to_vector(self, docs): - """transform vector""" + """transform vector + Args: + - docs: List[Document] + """ pass @register def index_to_store(self, docs): - """index to vector store""" + """index to vector store + Args: + - docs: List[Document] + """ self.vector_client = VectorStoreConnector( self.vector_store_config["vector_store_type"], self.vector_store_config ) @@ -73,7 +85,10 @@ def index_to_store(self, docs): @register def similar_search(self, doc, topk): - """vector store similarity_search""" + """vector store similarity_search + Args: + - query: query + """ self.vector_client = VectorStoreConnector( self.vector_store_config["vector_store_type"], self.vector_store_config ) @@ -89,6 +104,7 @@ def vector_name_exist(self): return self.vector_client.vector_name_exists() def source_embedding(self): + """read()->data_process()->text_split()->index_to_store()""" if "read" in registered_methods: text = self.read() if 
"data_process" in registered_methods: diff --git a/pilot/vector_store/connector.py b/pilot/vector_store/connector.py index b174d7289..1bf86082f 100644 --- a/pilot/vector_store/connector.py +++ b/pilot/vector_store/connector.py @@ -34,13 +34,13 @@ def load_document(self, docs): """load document in vector database.""" return self.client.load_document(docs) - def similar_search(self, query: str, topk: int): + def similar_search(self, doc: str, topk: int): """similar search in vector database. Args: - - query: query text + - doc: query text - topk: topk """ - return self.client.similar_search(query, topk) + return self.client.similar_search(doc, topk) def vector_name_exists(self): """is vector store name exist.""" From d11ec46ee59f97ad2de66f44bb45653e6733336b Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Thu, 19 Oct 2023 20:16:38 +0800 Subject: [PATCH 4/6] style:fmt --- pilot/embedding_engine/csv_embedding.py | 8 +- pilot/embedding_engine/embedding_engine.py | 21 +++++- pilot/embedding_engine/pdf_embedding.py | 8 +- pilot/embedding_engine/ppt_embedding.py | 8 +- pilot/embedding_engine/source_embedding.py | 6 +- pilot/embedding_engine/string_embedding.py | 8 +- pilot/embedding_engine/url_embedding.py | 8 +- pilot/embedding_engine/word_embedding.py | 8 +- pilot/server/knowledge/service.py | 87 ++++++++++++++++------ 9 files changed, 131 insertions(+), 31 deletions(-) diff --git a/pilot/embedding_engine/csv_embedding.py b/pilot/embedding_engine/csv_embedding.py index 5f58fac9d..efd6b905e 100644 --- a/pilot/embedding_engine/csv_embedding.py +++ b/pilot/embedding_engine/csv_embedding.py @@ -21,7 +21,13 @@ def __init__( source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): - """Initialize with csv path.""" + """Initialize with csv path. + Args: + - file_path: data source path + - vector_store_config: vector store config params. 
+ - source_reader: Optional[BaseLoader] + - text_splitter: Optional[TextSplitter] + """ super().__init__( file_path, vector_store_config, source_reader=None, text_splitter=None ) diff --git a/pilot/embedding_engine/embedding_engine.py b/pilot/embedding_engine/embedding_engine.py index 77d1a6667..ce739c3bb 100644 --- a/pilot/embedding_engine/embedding_engine.py +++ b/pilot/embedding_engine/embedding_engine.py @@ -28,7 +28,16 @@ def __init__( text_splitter: Optional[TextSplitter] = None, embedding_factory: EmbeddingFactory = None, ): - """Initialize with knowledge embedding client, model_name, vector_store_config, knowledge_type, knowledge_source""" + """Initialize with knowledge embedding client, model_name, vector_store_config, knowledge_type, knowledge_source + Args: + - model_name: model_name + - vector_store_config: vector store config: Dict + - knowledge_type: Optional[KnowledgeType] + - knowledge_source: Optional[str] + - source_reader: Optional[BaseLoader] + - text_splitter: Optional[TextSplitter] + - embedding_factory: EmbeddingFactory + """ self.knowledge_source = knowledge_source self.model_name = model_name self.vector_store_config = vector_store_config @@ -65,6 +74,11 @@ def init_knowledge_embedding(self): ) def similar_search(self, text, topk): + """vector db similar search + Args: + - text: query text + - topk: top k + """ vector_client = VectorStoreConnector( self.vector_store_config["vector_store_type"], self.vector_store_config ) @@ -75,12 +89,17 @@ def similar_search(self, text, topk): return ans def vector_exist(self): + """vector db is exist""" vector_client = VectorStoreConnector( self.vector_store_config["vector_store_type"], self.vector_store_config ) return vector_client.vector_name_exists() def delete_by_ids(self, ids): + """delete vector db by ids + Args: + - ids: vector ids + """ vector_client = VectorStoreConnector( self.vector_store_config["vector_store_type"], self.vector_store_config ) diff --git 
a/pilot/embedding_engine/pdf_embedding.py b/pilot/embedding_engine/pdf_embedding.py index bb62cb708..1f603d284 100644 --- a/pilot/embedding_engine/pdf_embedding.py +++ b/pilot/embedding_engine/pdf_embedding.py @@ -23,7 +23,13 @@ def __init__( source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): - """Initialize pdf word path.""" + """Initialize pdf word path. + Args: + - file_path: data source path + - vector_store_config: vector store config params. + - source_reader: Optional[BaseLoader] + - text_splitter: Optional[TextSplitter] + """ super().__init__( file_path, vector_store_config, source_reader=None, text_splitter=None ) diff --git a/pilot/embedding_engine/ppt_embedding.py b/pilot/embedding_engine/ppt_embedding.py index 9058c092f..42137d91d 100644 --- a/pilot/embedding_engine/ppt_embedding.py +++ b/pilot/embedding_engine/ppt_embedding.py @@ -23,7 +23,13 @@ def __init__( source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): - """Initialize ppt word path.""" + """Initialize ppt word path. + Args: + - file_path: data source path + - vector_store_config: vector store config params. + - source_reader: Optional[BaseLoader] + - text_splitter: Optional[TextSplitter] + """ super().__init__( file_path, vector_store_config, source_reader=None, text_splitter=None ) diff --git a/pilot/embedding_engine/source_embedding.py b/pilot/embedding_engine/source_embedding.py index 64e422079..950c84ded 100644 --- a/pilot/embedding_engine/source_embedding.py +++ b/pilot/embedding_engine/source_embedding.py @@ -33,7 +33,7 @@ def __init__( Args: - file_path: data source path - vector_store_config: vector store config params. - - source_reader: Optional[] + - source_reader: Optional[BaseLoader] - text_splitter: Optional[TextSplitter] - embedding_args: Optional """ @@ -52,8 +52,8 @@ def read(self) -> List[ABC]: @register def data_process(self, text): """pre process data. 
- Args: - - text: raw text + Args: + - text: raw text """ @register diff --git a/pilot/embedding_engine/string_embedding.py b/pilot/embedding_engine/string_embedding.py index 95e7ba6d1..fafdb27f2 100644 --- a/pilot/embedding_engine/string_embedding.py +++ b/pilot/embedding_engine/string_embedding.py @@ -20,7 +20,13 @@ def __init__( source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): - """Initialize raw text word path.""" + """Initialize raw text word path. + Args: + - file_path: data source path + - vector_store_config: vector store config params. + - source_reader: Optional[BaseLoader] + - text_splitter: Optional[TextSplitter] + """ super().__init__( file_path=file_path, vector_store_config=vector_store_config, diff --git a/pilot/embedding_engine/url_embedding.py b/pilot/embedding_engine/url_embedding.py index e00cf84e2..71842c2f1 100644 --- a/pilot/embedding_engine/url_embedding.py +++ b/pilot/embedding_engine/url_embedding.py @@ -22,7 +22,13 @@ def __init__( source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): - """Initialize url word path.""" + """Initialize url word path. + Args: + - file_path: data source path + - vector_store_config: vector store config params. + - source_reader: Optional[BaseLoader] + - text_splitter: Optional[TextSplitter] + """ super().__init__( file_path, vector_store_config, source_reader=None, text_splitter=None ) diff --git a/pilot/embedding_engine/word_embedding.py b/pilot/embedding_engine/word_embedding.py index aba50fe24..5bfe7ec88 100644 --- a/pilot/embedding_engine/word_embedding.py +++ b/pilot/embedding_engine/word_embedding.py @@ -23,7 +23,13 @@ def __init__( source_reader: Optional = None, text_splitter: Optional[TextSplitter] = None, ): - """Initialize with word path.""" + """Initialize with word path. + Args: + - file_path: data source path + - vector_store_config: vector store config params. 
+ - source_reader: Optional[BaseLoader] + - text_splitter: Optional[TextSplitter] + """ super().__init__( file_path, vector_store_config, source_reader=None, text_splitter=None ) diff --git a/pilot/server/knowledge/service.py b/pilot/server/knowledge/service.py index c11fc3b46..701246195 100644 --- a/pilot/server/knowledge/service.py +++ b/pilot/server/knowledge/service.py @@ -57,12 +57,21 @@ class SyncStatus(Enum): # @singleton class KnowledgeService: + """KnowledgeService + Knowledge Management Service: + -knowledge_space management + -knowledge_document management + -embedding management + """ + def __init__(self): pass - """create knowledge space""" - def create_knowledge_space(self, request: KnowledgeSpaceRequest): + """create knowledge space + Args: + - request: KnowledgeSpaceRequest + """ query = KnowledgeSpaceEntity( name=request.name, ) @@ -72,9 +81,11 @@ def create_knowledge_space(self, request: KnowledgeSpaceRequest): knowledge_space_dao.create_knowledge_space(request) return True - """create knowledge document""" - def create_knowledge_document(self, space, request: KnowledgeDocumentRequest): + """create knowledge document + Args: + - request: KnowledgeDocumentRequest + """ query = KnowledgeDocumentEntity(doc_name=request.doc_name, space=space) documents = knowledge_document_dao.get_knowledge_documents(query) if len(documents) > 0: @@ -91,9 +102,11 @@ def create_knowledge_document(self, space, request: KnowledgeDocumentRequest): ) return knowledge_document_dao.create_knowledge_document(document) - """get knowledge space""" - def get_knowledge_space(self, request: KnowledgeSpaceRequest): + """get knowledge space + Args: + - request: KnowledgeSpaceRequest + """ query = KnowledgeSpaceEntity( name=request.name, vector_type=request.vector_type, owner=request.owner ) @@ -116,6 +129,10 @@ def get_knowledge_space(self, request: KnowledgeSpaceRequest): return responses def arguments(self, space_name): + """show knowledge space arguments + Args: + - space_name: 
Knowledge Space Name + """ query = KnowledgeSpaceEntity(name=space_name) spaces = knowledge_space_dao.get_knowledge_space(query) if len(spaces) != 1: @@ -128,6 +145,11 @@ def arguments(self, space_name): return json.loads(context) def argument_save(self, space_name, argument_request: SpaceArgumentRequest): + """save argument + Args: + - space_name: Knowledge Space Name + - argument_request: SpaceArgumentRequest + """ query = KnowledgeSpaceEntity(name=space_name) spaces = knowledge_space_dao.get_knowledge_space(query) if len(spaces) != 1: @@ -136,9 +158,12 @@ def argument_save(self, space_name, argument_request: SpaceArgumentRequest): space.context = argument_request.argument return knowledge_space_dao.update_knowledge_space(space) - """get knowledge get_knowledge_documents""" - def get_knowledge_documents(self, space, request: DocumentQueryRequest): + """get knowledge documents + Args: + - space: Knowledge Space Name + - request: DocumentQueryRequest + """ query = KnowledgeDocumentEntity( doc_name=request.doc_name, doc_type=request.doc_type, @@ -153,9 +178,12 @@ def get_knowledge_documents(self, space, request: DocumentQueryRequest): res.page = request.page return res - """sync knowledge document chunk into vector store""" - def sync_knowledge_document(self, space_name, sync_request: DocumentSyncRequest): + """sync knowledge document chunk into vector store + Args: + - space: Knowledge Space Name + - sync_request: DocumentSyncRequest + """ from pilot.embedding_engine.embedding_engine import EmbeddingEngine from pilot.embedding_engine.embedding_factory import EmbeddingFactory from pilot.embedding_engine.pre_text_splitter import PreTextSplitter @@ -249,11 +277,6 @@ def sync_knowledge_document(self, space_name, sync_request: DocumentSyncRequest) doc.chunk_size = len(chunk_docs) doc.gmt_modified = datetime.now() knowledge_document_dao.update_knowledge_document(doc) - # async doc embeddings - # thread = threading.Thread( - # target=self.async_doc_embedding, 
args=(client, chunk_docs, doc) - # ) - # thread.start() executor = CFG.SYSTEM_APP.get_component( ComponentType.EXECUTOR_DEFAULT, ExecutorFactory ).create() @@ -277,16 +300,21 @@ def sync_knowledge_document(self, space_name, sync_request: DocumentSyncRequest) return True - """update knowledge space""" - def update_knowledge_space( self, space_id: int, space_request: KnowledgeSpaceRequest ): + """update knowledge space + Args: + - space_id: space id + - space_request: KnowledgeSpaceRequest + """ knowledge_space_dao.update_knowledge_space(space_id, space_request) - """delete knowledge space""" - def delete_space(self, space_name: str): + """delete knowledge space + Args: + - space_name: knowledge space name + """ query = KnowledgeSpaceEntity(name=space_name) spaces = knowledge_space_dao.get_knowledge_space(query) if len(spaces) == 0: @@ -312,6 +340,11 @@ def delete_space(self, space_name: str): return knowledge_space_dao.delete_knowledge_space(space) def delete_document(self, space_name: str, doc_name: str): + """delete document + Args: + - space_name: knowledge space name + - doc_name: document name + """ document_query = KnowledgeDocumentEntity(doc_name=doc_name, space=space_name) documents = knowledge_document_dao.get_documents(document_query) if len(documents) != 1: @@ -332,9 +365,11 @@ def delete_document(self, space_name: str, doc_name: str): # delete document return knowledge_document_dao.delete(document_query) - """get document chunks""" - def get_document_chunks(self, request: ChunkQueryRequest): + """get document chunks + Args: + - request: ChunkQueryRequest + """ query = DocumentChunkEntity( id=request.id, document_id=request.document_id, @@ -350,6 +385,12 @@ def get_document_chunks(self, request: ChunkQueryRequest): return res def async_doc_embedding(self, client, chunk_docs, doc): + """async document embedding into vector db + Args: + - client: EmbeddingEngine Client + - chunk_docs: List[Document] + - doc: doc + """ logger.info( f"async_doc_embedding, 
doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin embedding to vector store-{CFG.VECTOR_STORE_TYPE}" ) @@ -391,6 +432,10 @@ def _build_default_context(self): return context_template_string def get_space_context(self, space_name): + """get space context + Args: + - space_name: space name + """ request = KnowledgeSpaceRequest() request.name = space_name spaces = self.get_knowledge_space(request) From 517737d730085621000cbe18187b91963da1e8e9 Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Thu, 19 Oct 2023 20:55:52 +0800 Subject: [PATCH 5/6] style:fmt --- pilot/connections/rdbms/base.py | 22 ++++++++++++++++++- pilot/scene/base_chat.py | 14 ++++++++++++ pilot/scene/chat_agent/chat.py | 10 +++++++++ pilot/scene/chat_dashboard/chat.py | 11 ++++++++-- .../chat_excel/excel_analyze/chat.py | 10 +++++++++ pilot/scene/chat_db/auto_execute/chat.py | 11 ++++++++++ pilot/scene/chat_db/professional_qa/chat.py | 11 ++++++++-- pilot/scene/chat_execution/chat.py | 10 +++++++++ pilot/scene/chat_knowledge/v1/chat.py | 11 ++++++++-- 9 files changed, 103 insertions(+), 7 deletions(-) diff --git a/pilot/connections/rdbms/base.py b/pilot/connections/rdbms/base.py index b590294ac..43b6a3993 100644 --- a/pilot/connections/rdbms/base.py +++ b/pilot/connections/rdbms/base.py @@ -52,7 +52,18 @@ def __init__( custom_table_info: Optional[dict] = None, view_support: bool = False, ): - """Create engine from database URI.""" + """Create engine from database URI. + Args: + - engine: Engine sqlalchemy.engine + - schema: Optional[str]. 
+ - metadata: Optional[MetaData] + - ignore_tables: Optional[List[str]] + - include_tables: Optional[List[str]] + - sample_rows_in_table_info: int default:3, + - indexes_in_table_info: bool = False, + - custom_table_info: Optional[dict] = None, + - view_support: bool = False, + """ self._engine = engine self._schema = schema if include_tables and ignore_tables: @@ -92,6 +103,15 @@ def from_uri_db( engine_args: Optional[dict] = None, **kwargs: Any, ) -> RDBMSDatabase: + """Construct a SQLAlchemy engine from uri database. + Args: + host (str): database host. + port (int): database port. + user (str): database user. + pwd (str): database password. + db_name (str): database name. + engine_args (Optional[dict]):other engine_args. + """ db_url: str = ( cls.driver + "://" diff --git a/pilot/scene/base_chat.py b/pilot/scene/base_chat.py index 243760090..5d685573e 100644 --- a/pilot/scene/base_chat.py +++ b/pilot/scene/base_chat.py @@ -21,6 +21,12 @@ class BaseChat(ABC): + """DB-GPT Chat Service Base Module + Include: + stream_call():scene + prompt -> stream response + nostream_call():scene + prompt -> nostream response + """ + chat_scene: str = None llm_model: Any = None # By default, keep the last two rounds of conversation records as the context @@ -32,6 +38,14 @@ class Config: arbitrary_types_allowed = True def __init__(self, chat_param: Dict): + """Chat Module Initialization + Args: + - chat_param: Dict + - chat_session_id: (str) chat session_id + - current_user_input: (str) current user input + - model_name:(str) llm model name + - select_param:(str) select param + """ self.chat_session_id = chat_param["chat_session_id"] self.chat_mode = chat_param["chat_mode"] self.current_user_input: str = chat_param["current_user_input"] diff --git a/pilot/scene/chat_agent/chat.py b/pilot/scene/chat_agent/chat.py index 4734c8106..ef37f8448 100644 --- a/pilot/scene/chat_agent/chat.py +++ b/pilot/scene/chat_agent/chat.py @@ -18,10 +18,20 @@ class ChatAgent(BaseChat): + """Chat With 
Agent through plugin""" + chat_scene: str = ChatScene.ChatAgent.value() chat_retention_rounds = 0 def __init__(self, chat_param: Dict): + """Chat Agent Module Initialization + Args: + - chat_param: Dict + - chat_session_id: (str) chat session_id + - current_user_input: (str) current user input + - model_name:(str) llm model name + - select_param:(str) agent plugin + """ if not chat_param["select_param"]: raise ValueError("Please select a Plugin!") self.select_plugins = chat_param["select_param"].split(",") diff --git a/pilot/scene/chat_dashboard/chat.py b/pilot/scene/chat_dashboard/chat.py index ba5bdd8ce..7e4433670 100644 --- a/pilot/scene/chat_dashboard/chat.py +++ b/pilot/scene/chat_dashboard/chat.py @@ -19,10 +19,17 @@ class ChatDashboard(BaseChat): chat_scene: str = ChatScene.ChatDashboard.value() report_name: str - """Number of results to return from the query""" + """Chat Dashboard to generate dashboard chart""" def __init__(self, chat_param: Dict): - """ """ + """Chat Dashboard Module Initialization + Args: + - chat_param: Dict + - chat_session_id: (str) chat session_id + - current_user_input: (str) current user input + - model_name:(str) llm model name + - select_param:(str) dbname + """ self.db_name = chat_param["select_param"] chat_param["chat_mode"] = ChatScene.ChatDashboard super().__init__(chat_param=chat_param) diff --git a/pilot/scene/chat_data/chat_excel/excel_analyze/chat.py b/pilot/scene/chat_data/chat_excel/excel_analyze/chat.py index 9599c1402..343afc4bd 100644 --- a/pilot/scene/chat_data/chat_excel/excel_analyze/chat.py +++ b/pilot/scene/chat_data/chat_excel/excel_analyze/chat.py @@ -19,10 +19,20 @@ class ChatExcel(BaseChat): + """an Excel analyzer to analyze Excel Data""" + chat_scene: str = ChatScene.ChatExcel.value() chat_retention_rounds = 1 def __init__(self, chat_param: Dict): + """Chat Excel Module Initialization + Args: + - chat_param: Dict + - chat_session_id: (str) chat session_id + - current_user_input: (str) current user input + - 
model_name:(str) llm model name + - select_param:(str) file path + """ chat_mode = ChatScene.ChatExcel self.select_param = chat_param["select_param"] diff --git a/pilot/scene/chat_db/auto_execute/chat.py b/pilot/scene/chat_db/auto_execute/chat.py index e8e1e2ebd..f92df7a3a 100644 --- a/pilot/scene/chat_db/auto_execute/chat.py +++ b/pilot/scene/chat_db/auto_execute/chat.py @@ -15,6 +15,14 @@ class ChatWithDbAutoExecute(BaseChat): """Number of results to return from the query""" def __init__(self, chat_param: Dict): + """Chat Data Module Initialization + Args: + - chat_param: Dict + - chat_session_id: (str) chat session_id + - current_user_input: (str) current user input + - model_name:(str) llm model name + - select_param:(str) dbname + """ chat_mode = ChatScene.ChatWithDbExecute self.db_name = chat_param["select_param"] chat_param["chat_mode"] = chat_mode @@ -31,6 +39,9 @@ def __init__(self, chat_param: Dict): self.top_k: int = 200 def generate_input_values(self): + """ + generate input values + """ try: from pilot.summary.db_summary_client import DBSummaryClient except ImportError: diff --git a/pilot/scene/chat_db/professional_qa/chat.py b/pilot/scene/chat_db/professional_qa/chat.py index 39f4052a6..abdfd9f00 100644 --- a/pilot/scene/chat_db/professional_qa/chat.py +++ b/pilot/scene/chat_db/professional_qa/chat.py @@ -12,10 +12,17 @@ class ChatWithDbQA(BaseChat): chat_scene: str = ChatScene.ChatWithDbQA.value() - """Number of results to return from the query""" + """As a DBA, Chat DB Module, chat with combine DB meta schema """ def __init__(self, chat_param: Dict): - """ """ + """Chat DB Module Initialization + Args: + - chat_param: Dict + - chat_session_id: (str) chat session_id + - current_user_input: (str) current user input + - model_name:(str) llm model name + - select_param:(str) dbname + """ self.db_name = chat_param["select_param"] chat_param["chat_mode"] = ChatScene.ChatWithDbQA super().__init__(chat_param=chat_param) diff --git 
a/pilot/scene/chat_execution/chat.py b/pilot/scene/chat_execution/chat.py index e4c5175a6..c6d7bbe2f 100644 --- a/pilot/scene/chat_execution/chat.py +++ b/pilot/scene/chat_execution/chat.py @@ -11,11 +11,21 @@ class ChatWithPlugin(BaseChat): + """Chat With Plugin""" + chat_scene: str = ChatScene.ChatExecution.value() plugins_prompt_generator: PluginPromptGenerator select_plugin: str = None def __init__(self, chat_param: Dict): + """Chat Plugin Module Initialization + Args: + - chat_param: Dict + - chat_session_id: (str) chat session_id + - current_user_input: (str) current user input + - model_name:(str) llm model name + - select_param:(str) plugin selector + """ self.plugin_selector = chat_param["select_param"] chat_param["chat_mode"] = ChatScene.ChatExecution super().__init__(chat_param=chat_param) diff --git a/pilot/scene/chat_knowledge/v1/chat.py b/pilot/scene/chat_knowledge/v1/chat.py index 672669a1b..9681f13c6 100644 --- a/pilot/scene/chat_knowledge/v1/chat.py +++ b/pilot/scene/chat_knowledge/v1/chat.py @@ -19,10 +19,17 @@ class ChatKnowledge(BaseChat): chat_scene: str = ChatScene.ChatKnowledge.value() - """Number of results to return from the query""" + """KBQA Chat Module""" def __init__(self, chat_param: Dict): - """ """ + """Chat Knowledge Module Initialization + Args: + - chat_param: Dict + - chat_session_id: (str) chat session_id + - current_user_input: (str) current user input + - model_name:(str) llm model name + - select_param:(str) space name + """ from pilot.embedding_engine.embedding_engine import EmbeddingEngine from pilot.embedding_engine.embedding_factory import EmbeddingFactory From 8acc2dc8613d7acbacbc345deb1929f20ed36cab Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Thu, 19 Oct 2023 21:10:33 +0800 Subject: [PATCH 6/6] style:fmt --- pilot/server/base.py | 1 + pilot/server/dbgpt_server.py | 3 +++ pilot/server/llmserver.py | 3 +++ 3 files changed, 7 insertions(+) diff --git a/pilot/server/base.py b/pilot/server/base.py 
index 75e16ab85..e66efcf43 100644 --- a/pilot/server/base.py +++ b/pilot/server/base.py @@ -21,6 +21,7 @@ def signal_handler(sig, frame): def async_db_summary(system_app: SystemApp): + """async db schema into vector db""" from pilot.summary.db_summary_client import DBSummaryClient client = DBSummaryClient(system_app=system_app) diff --git a/pilot/server/dbgpt_server.py b/pilot/server/dbgpt_server.py index 5b57d18bb..c6e084a93 100644 --- a/pilot/server/dbgpt_server.py +++ b/pilot/server/dbgpt_server.py @@ -115,6 +115,9 @@ def _get_webserver_params(args: List[str] = None): def initialize_app(param: WebWerverParameters = None, args: List[str] = None): """Initialize app If you use gunicorn as a process manager, initialize_app can be invoke in `on_starting` hook. + Args: + param:WebWerverParameters + args:List[str] """ if not param: param = _get_webserver_params(args) diff --git a/pilot/server/llmserver.py b/pilot/server/llmserver.py index 1a2dd49ce..9526f38b5 100644 --- a/pilot/server/llmserver.py +++ b/pilot/server/llmserver.py @@ -16,6 +16,9 @@ model_path = LLM_MODEL_CONFIG.get(CFG.LLM_MODEL) if __name__ == "__main__": + """run llm server including controller, manager worker + If you use gunicorn as a process manager, initialize_app can be invoke in `on_starting` hook. + """ run_worker_manager( model_name=CFG.LLM_MODEL, model_path=model_path,