support oceanbase as an optional vector database (#1435)

Signed-off-by: shanhaikang.shk <[email protected]>
eosphoros-ai · Apr 24, 2024 · 6520367 · 6520367
1 parent 91c1371
commit 6520367
Show file tree

Hide file tree

Showing 10 changed files with 975 additions and 12 deletions.
diff --git a/dbgpt/_private/config.py b/dbgpt/_private/config.py
@@ -238,6 +238,18 @@ def __init__(self) -> None:
         self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)
         self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None)
 
+        ## OceanBase Configuration
+        self.OB_HOST = os.getenv("OB_HOST", "127.0.0.1")
+        self.OB_PORT = int(os.getenv("OB_PORT", "2881"))
+        self.OB_USER = os.getenv("OB_USER", "root")
+        self.OB_PASSWORD = os.getenv("OB_PASSWORD", "")
+        self.OB_DATABASE = os.getenv("OB_DATABASE", "test")
+        self.OB_SQL_DBG_LOG_PATH = os.getenv("OB_SQL_DBG_LOG_PATH", "")
+        self.OB_ENABLE_NORMALIZE_VECTOR = bool(
+            os.getenv("OB_ENABLE_NORMALIZE_VECTOR", "")
+        )
+        self.OB_ENABLE_INDEX = bool(os.getenv("OB_ENABLE_INDEX", ""))
+
         # QLoRA
         self.QLoRA = os.getenv("QUANTIZE_QLORA", "True")
         self.IS_LOAD_8BIT = os.getenv("QUANTIZE_8bit", "True").lower() == "true"

diff --git a/dbgpt/rag/text_splitter/tests/test_splitters.py b/dbgpt/rag/text_splitter/tests/test_splitters.py
@@ -25,11 +25,11 @@ def test_md_header_text_splitter() -> None:
     output = markdown_splitter.split_text(markdown_document)
     expected_output = [
         Chunk(
-            content="{'Header 1': 'dbgpt', 'Header 2': 'description'}, my name is dbgpt",
+            content='"dbgpt-description": my name is dbgpt',
             metadata={"Header 1": "dbgpt", "Header 2": "description"},
         ),
         Chunk(
-            content="{'Header 1': 'dbgpt', 'Header 2': 'content'}, my name is aries",
+            content='"dbgpt-content": my name is aries',
             metadata={"Header 1": "dbgpt", "Header 2": "content"},
         ),
     ]

diff --git a/dbgpt/rag/text_splitter/text_splitter.py b/dbgpt/rag/text_splitter/text_splitter.py
@@ -515,7 +515,8 @@ def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Chunk]:
                 aggregated_chunks[-1]["content"] += "  \n" + line["content"]
             else:
                 # Otherwise, append the current line to the aggregated list
-                line["content"] = f"{line['metadata']}, " + line["content"]
+                subtitles = "-".join((list(line["metadata"].values())))
+                line["content"] = f'"{subtitles}": ' + line["content"]
                 aggregated_chunks.append(line)
 
         return [
@@ -557,16 +558,28 @@ def split_text(  # type: ignore
         # header_stack: List[Dict[str, Union[int, str]]] = []
         header_stack: List[HeaderType] = []
         initial_metadata: Dict[str, str] = {}
+        # Determine whether a line is within a markdown code block.
+        in_code_block = False
         for line in lines:
             stripped_line = line.strip()
+            # A code frame starts with "```"
+            with_code_frame = stripped_line.startswith("```") and (
+                stripped_line != "```"
+            )
+            if (not in_code_block) and with_code_frame:
+                in_code_block = True
             # Check each line against each of the header types (e.g., #, ##)
             for sep, name in self.headers_to_split_on:
                 # Check if line starts with a header that we intend to split on
-                if stripped_line.startswith(sep) and (
-                    # Header with no text OR header is followed by space
-                    # Both are valid conditions that sep is being used a header
-                    len(stripped_line) == len(sep)
-                    or stripped_line[len(sep)] == " "
+                if (
+                    (not in_code_block)
+                    and stripped_line.startswith(sep)
+                    and (
+                        # Header with no text OR header is followed by space
+                        # Both are valid conditions that sep is being used as a header
+                        len(stripped_line) == len(sep)
+                        or stripped_line[len(sep)] == " "
+                    )
                 ):
                     # Ensure we are tracking the header as metadata
                     if name is not None:
@@ -620,6 +633,10 @@ def split_text(  # type: ignore
                     )
                     current_content.clear()
 
+            # Code block ends
+            if in_code_block and stripped_line == "```":
+                in_code_block = False
+
             current_metadata = initial_metadata.copy()
         if current_content:
             lines_with_metadata.append(

diff --git a/dbgpt/storage/vector_store/__init__.py b/dbgpt/storage/vector_store/__init__.py
@@ -26,6 +26,12 @@ def _import_weaviate() -> Any:
     return WeaviateStore
 
 
+def _import_oceanbase() -> Any:
+    from dbgpt.storage.vector_store.oceanbase_store import OceanBaseStore
+
+    return OceanBaseStore
+
+
 def __getattr__(name: str) -> Any:
     if name == "Chroma":
         return _import_chroma()
@@ -35,8 +41,10 @@ def __getattr__(name: str) -> Any:
         return _import_weaviate()
     elif name == "PGVector":
         return _import_pgvector()
+    elif name == "OceanBase":
+        return _import_oceanbase()
     else:
         raise AttributeError(f"Could not find: {name}")
 
 
-__all__ = ["Chroma", "Milvus", "Weaviate", "PGVector"]
+__all__ = ["Chroma", "Milvus", "Weaviate", "OceanBase", "PGVector"]