Skip to content

Commit

Permalink
support oceanbase as an optional vector database (#1435)
Browse files Browse the repository at this point in the history
Signed-off-by: shanhaikang.shk <[email protected]>
  • Loading branch information
GITHUBear authored Apr 24, 2024
1 parent 91c1371 commit 6520367
Show file tree
Hide file tree
Showing 10 changed files with 975 additions and 12 deletions.
12 changes: 12 additions & 0 deletions dbgpt/_private/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,18 @@ def __init__(self) -> None:
self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)
self.MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", None)

## OceanBase Configuration
self.OB_HOST = os.getenv("OB_HOST", "127.0.0.1")
self.OB_PORT = int(os.getenv("OB_PORT", "2881"))
self.OB_USER = os.getenv("OB_USER", "root")
self.OB_PASSWORD = os.getenv("OB_PASSWORD", "")
self.OB_DATABASE = os.getenv("OB_DATABASE", "test")
self.OB_SQL_DBG_LOG_PATH = os.getenv("OB_SQL_DBG_LOG_PATH", "")
self.OB_ENABLE_NORMALIZE_VECTOR = bool(
os.getenv("OB_ENABLE_NORMALIZE_VECTOR", "")
)
self.OB_ENABLE_INDEX = bool(os.getenv("OB_ENABLE_INDEX", ""))

# QLoRA
self.QLoRA = os.getenv("QUANTIZE_QLORA", "True")
self.IS_LOAD_8BIT = os.getenv("QUANTIZE_8bit", "True").lower() == "true"
Expand Down
4 changes: 2 additions & 2 deletions dbgpt/rag/text_splitter/tests/test_splitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ def test_md_header_text_splitter() -> None:
output = markdown_splitter.split_text(markdown_document)
expected_output = [
Chunk(
content="{'Header 1': 'dbgpt', 'Header 2': 'description'}, my name is dbgpt",
content='"dbgpt-description": my name is dbgpt',
metadata={"Header 1": "dbgpt", "Header 2": "description"},
),
Chunk(
content="{'Header 1': 'dbgpt', 'Header 2': 'content'}, my name is aries",
content='"dbgpt-content": my name is aries',
metadata={"Header 1": "dbgpt", "Header 2": "content"},
),
]
Expand Down
29 changes: 23 additions & 6 deletions dbgpt/rag/text_splitter/text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,8 @@ def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Chunk]:
aggregated_chunks[-1]["content"] += " \n" + line["content"]
else:
# Otherwise, append the current line to the aggregated list
line["content"] = f"{line['metadata']}, " + line["content"]
subtitles = "-".join((list(line["metadata"].values())))
line["content"] = f'"{subtitles}": ' + line["content"]
aggregated_chunks.append(line)

return [
Expand Down Expand Up @@ -557,16 +558,28 @@ def split_text( # type: ignore
# header_stack: List[Dict[str, Union[int, str]]] = []
header_stack: List[HeaderType] = []
initial_metadata: Dict[str, str] = {}
# Determine whether a line is within a markdown code block.
in_code_block = False
for line in lines:
stripped_line = line.strip()
# A code frame starts with "```"
with_code_frame = stripped_line.startswith("```") and (
stripped_line != "```"
)
if (not in_code_block) and with_code_frame:
in_code_block = True
# Check each line against each of the header types (e.g., #, ##)
for sep, name in self.headers_to_split_on:
# Check if line starts with a header that we intend to split on
if stripped_line.startswith(sep) and (
# Header with no text OR header is followed by space
# Both are valid conditions that sep is being used a header
len(stripped_line) == len(sep)
or stripped_line[len(sep)] == " "
if (
(not in_code_block)
and stripped_line.startswith(sep)
and (
# Header with no text OR header is followed by space
# Both are valid conditions that sep is being used a header
len(stripped_line) == len(sep)
or stripped_line[len(sep)] == " "
)
):
# Ensure we are tracking the header as metadata
if name is not None:
Expand Down Expand Up @@ -620,6 +633,10 @@ def split_text( # type: ignore
)
current_content.clear()

# Code block ends
if in_code_block and stripped_line == "```":
in_code_block = False

current_metadata = initial_metadata.copy()
if current_content:
lines_with_metadata.append(
Expand Down
10 changes: 9 additions & 1 deletion dbgpt/storage/vector_store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ def _import_weaviate() -> Any:
return WeaviateStore


def _import_oceanbase() -> Any:
    """Import the OceanBase vector store on demand and return its class.

    The import statement lives inside the function body, so the
    ``oceanbase_store`` module is only loaded when this function is called
    rather than when the enclosing package is imported.

    Returns:
        The ``OceanBaseStore`` class object (not an instance).
    """
    from dbgpt.storage.vector_store.oceanbase_store import (
        OceanBaseStore as store_cls,
    )

    return store_cls


def __getattr__(name: str) -> Any:
if name == "Chroma":
return _import_chroma()
Expand All @@ -35,8 +41,10 @@ def __getattr__(name: str) -> Any:
return _import_weaviate()
elif name == "PGVector":
return _import_pgvector()
elif name == "OceanBase":
return _import_oceanbase()
else:
raise AttributeError(f"Could not find: {name}")


__all__ = ["Chroma", "Milvus", "Weaviate", "PGVector"]
__all__ = ["Chroma", "Milvus", "Weaviate", "OceanBase", "PGVector"]
Loading

0 comments on commit 6520367

Please sign in to comment.