Skip to content

Commit

Permalink
refactor: the document structure of GraphRAG (#2084)
Browse files Browse the repository at this point in the history
  • Loading branch information
Appointat authored Oct 21, 2024
1 parent 584f090 commit 6d66678
Show file tree
Hide file tree
Showing 15 changed files with 401 additions and 284 deletions.
8 changes: 5 additions & 3 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,11 @@ KNOWLEDGE_GRAPH_EXTRACT_SEARCH_RECALL_SCORE=0.3
KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_TOP_SIZE=20
KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_RECALL_SCORE=0.0

ENABLE_GRAPH_COMMUNITY_SUMMARY=True # enable the graph community summary
ENABLE_TRIPLET_GRAPH=True # enable the graph search for triplets
ENABLE_DOCUMENT_GRAPH=True # enable the graph search for documents and chunks
GRAPH_COMMUNITY_SUMMARY_ENABLED=True # enable the graph community summary
TRIPLET_GRAPH_ENABLED=True # enable the graph search for triplets
DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks

KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the top size of knowledge graph search for chunks

### Chroma vector db config
#CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data
Expand Down
4 changes: 2 additions & 2 deletions dbgpt/_private/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,8 @@ def __init__(self) -> None:

# Vector Store Configuration
self.VECTOR_STORE_TYPE = os.getenv("VECTOR_STORE_TYPE", "Chroma")
self.ENABLE_GRAPH_COMMUNITY_SUMMARY = (
os.getenv("ENABLE_GRAPH_COMMUNITY_SUMMARY", "").lower() == "true"
self.GRAPH_COMMUNITY_SUMMARY_ENABLED = (
os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
)
self.MILVUS_URL = os.getenv("MILVUS_URL", "127.0.0.1")
self.MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
Expand Down
1 change: 1 addition & 0 deletions dbgpt/core/interface/knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class Chunk(Document):
chunk_id: str = Field(
default_factory=lambda: str(uuid.uuid4()), description="unique id for the chunk"
)
chunk_name: str = Field(default="", description="chunk name")
content: str = Field(default="", description="chunk text content")

metadata: Dict[str, Any] = Field(
Expand Down
2 changes: 1 addition & 1 deletion dbgpt/serve/rag/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def __init__(

def __rewrite_index_store_type(self, index_store_type):
# Rewrite Knowledge Graph Type
if CFG.ENABLE_GRAPH_COMMUNITY_SUMMARY:
if CFG.GRAPH_COMMUNITY_SUMMARY_ENABLED:
if index_store_type == "KnowledgeGraph":
return "CommunitySummaryKnowledgeGraph"
return index_store_type
Expand Down
12 changes: 2 additions & 10 deletions dbgpt/storage/graph_store/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ class GraphStoreConfig(BaseModel):
default=False,
description="Enable graph community summary or not.",
)
enable_document_graph: bool = Field(
document_graph_enabled: bool = Field(
default=True,
description="Enable document graph search or not.",
)
enable_triplet_graph: bool = Field(
triplet_graph_enabled: bool = Field(
default=True,
description="Enable knowledge graph search or not.",
)
Expand All @@ -48,11 +48,3 @@ def __init__(self, config: GraphStoreConfig):
@abstractmethod
def get_config(self) -> GraphStoreConfig:
"""Get the graph store config."""

@abstractmethod
def _escape_quotes(self, text: str) -> str:
"""Escape single and double quotes in a string for queries."""

# @abstractmethod
# def _paser(self, entities: List[Vertex]) -> str:
# """Parse entities to string."""
34 changes: 11 additions & 23 deletions dbgpt/storage/graph_store/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ class GraphElemType(Enum):

DOCUMENT = "document"
CHUNK = "chunk"
ENTITY = "entity" # view as general vertex in the general case
RELATION = "relation" # view as general edge in the general case
ENTITY = "entity" # default vertex type in knowledge graph
RELATION = "relation" # default edge type in knowledge graph
INCLUDE = "include"
NEXT = "next"

Expand All @@ -39,7 +39,15 @@ def is_vertex(self) -> bool:

def is_edge(self) -> bool:
"""Check if the element is an edge."""
return not self.is_vertex()
return self in [
GraphElemType.RELATION,
GraphElemType.INCLUDE,
GraphElemType.NEXT,
GraphElemType.DOCUMENT_INCLUDE_CHUNK,
GraphElemType.CHUNK_INCLUDE_CHUNK,
GraphElemType.CHUNK_INCLUDE_ENTITY,
GraphElemType.CHUNK_NEXT_CHUNK,
]


class Direction(Enum):
Expand Down Expand Up @@ -335,26 +343,6 @@ def append_edge(self, edge: Edge) -> bool:
self._edge_count += 1
return True

def upsert_vertex_and_edge(
self,
src_vid: str,
src_name: str,
src_props: Dict[str, Any],
dst_vid: str,
dst_name: str,
dst_props: Dict[str, Any],
edge_name: str,
edge_type: str,
):
"""Uperst src and dst vertex, and edge."""
src_vertex = Vertex(src_vid, src_name, **src_props)
dst_vertex = Vertex(dst_vid, dst_name, **dst_props)
edge = Edge(src_vid, dst_vid, edge_name, **{"edge_type": edge_type})

self.upsert_vertex(src_vertex)
self.upsert_vertex(dst_vertex)
self.append_edge(edge)

def upsert_graph(self, graph: "MemoryGraph"):
"""Upsert a graph."""
for vertex in graph.vertices():
Expand Down
6 changes: 0 additions & 6 deletions dbgpt/storage/graph_store/memgraph_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,3 @@ def __init__(self, graph_store_config: MemoryGraphStoreConfig):
def get_config(self):
"""Get the graph store config."""
return self._graph_store_config

def _escape_quotes(self, text: str) -> str:
"""Escape single and double quotes in a string for queries."""
raise NotImplementedError(
"_escape_quotes is not implemented by MemoryGraphStore"
)
15 changes: 5 additions & 10 deletions dbgpt/storage/graph_store/tugraph_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,16 @@ def __init__(self, config: TuGraphStoreConfig) -> None:
self._username = os.getenv("TUGRAPH_USERNAME", config.username)
self._password = os.getenv("TUGRAPH_PASSWORD", config.password)
self._enable_summary = (
os.getenv("ENABLE_GRAPH_COMMUNITY_SUMMARY", "").lower() == "true"
os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
or config.enable_summary
)
self._enable_document_graph = (
os.getenv("ENABLE_DOCUMENT_GRAPH", "").lower() == "true"
or config.enable_document_graph
os.getenv("DOCUMENT_GRAPH_ENABLED", "").lower() == "true"
or config.document_graph_enabled
)
self._enable_triplet_graph = (
os.getenv("ENABLE_TRIPLET_GRAPH", "").lower() == "true"
or config.enable_triplet_graph
os.getenv("TRIPLET_GRAPH_ENABLED", "").lower() == "true"
or config.triplet_graph_enabled
)
self._plugin_names = (
os.getenv("TUGRAPH_PLUGIN_NAMES", "leiden").split(",")
Expand Down Expand Up @@ -159,8 +159,3 @@ def _upload_plugin(self):
f"'{name} Plugin', false, 'v1')"
)
self.conn.run(gql)

def _escape_quotes(self, value: str) -> str:
"""Escape single and double quotes in a string for queries."""
if value is not None:
return value.replace("'", "").replace('"', "")
14 changes: 14 additions & 0 deletions dbgpt/storage/knowledge_graph/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
from abc import ABC, abstractmethod
from typing import List, Optional

from pydantic import Field

from dbgpt._private.pydantic import ConfigDict
from dbgpt.core import Chunk
from dbgpt.rag.index.base import IndexStoreBase, IndexStoreConfig
from dbgpt.storage.graph_store.graph import Graph

Expand All @@ -31,3 +34,14 @@ def query_graph(self, limit: Optional[int] = None) -> Graph:
@abstractmethod
def delete_by_ids(self, ids: str) -> List[str]:
"""Delete document by ids."""


class ParagraphChunk(Chunk):
    """Loaded chunk, used in GraphRAG.

    Extends ``Chunk`` with fields describing the chunk's parent. The parent
    may itself be a chunk or a document; ``parent_is_document`` tells which.
    """

    # The three parent fields default to None, so they are annotated as
    # Optional[str] instead of plain str to keep the declared type honest
    # (and to allow None to be passed explicitly).
    chunk_parent_id: Optional[str] = Field(
        default=None, description="id of parent chunk"
    )
    chunk_parent_name: Optional[str] = Field(
        default=None, description="parent chunk name"
    )
    parent_content: Optional[str] = Field(
        default=None, description="parent chunk text content"
    )
    parent_is_document: bool = Field(
        default=False, description="is parent chunk a document"
    )
43 changes: 36 additions & 7 deletions dbgpt/storage/knowledge_graph/community/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import AsyncGenerator, Iterator, List, Optional
from typing import AsyncGenerator, Iterator, List, Optional, Union

from dbgpt.storage.graph_store.base import GraphStoreBase
from dbgpt.storage.graph_store.graph import (
Expand All @@ -14,6 +14,7 @@
MemoryGraph,
Vertex,
)
from dbgpt.storage.knowledge_graph.base import ParagraphChunk

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -91,17 +92,17 @@ def upsert_edge(
"""Upsert edge."""

@abstractmethod
def upsert_chunks(self, chunk: Iterator[Vertex]) -> None:
def upsert_chunks(
self, chunks: Union[Iterator[Vertex], Iterator[ParagraphChunk]]
) -> None:
"""Upsert chunk."""

@abstractmethod
def upsert_documents(self, documents: Iterator[Vertex]) -> None:
def upsert_documents(
self, documents: Union[Iterator[Vertex], Iterator[ParagraphChunk]]
) -> None:
"""Upsert documents."""

@abstractmethod
def upsert_relations(self, relations: Iterator[Edge]) -> None:
"""Upsert relations."""

@abstractmethod
def insert_triplet(self, sub: str, rel: str, obj: str) -> None:
"""Insert triplet."""
Expand All @@ -110,6 +111,34 @@ def insert_triplet(self, sub: str, rel: str, obj: str) -> None:
def upsert_graph(self, graph: Graph) -> None:
"""Insert graph."""

@abstractmethod
def upsert_doc_include_chunk(
    self,
    chunk: ParagraphChunk,
) -> None:
    """Convert chunk to document include chunk.

    Abstract hook; each concrete graph store adapter provides the behavior.

    Args:
        chunk: The paragraph chunk to process. Presumably its parent fields
            identify the owning document -- TODO confirm against the
            concrete adapters.
    """

@abstractmethod
def upsert_chunk_include_chunk(
    self,
    chunk: ParagraphChunk,
) -> None:
    """Convert chunk to chunk include chunk.

    Abstract hook; each concrete graph store adapter provides the behavior.

    Args:
        chunk: The paragraph chunk to process. Presumably its parent fields
            identify the enclosing chunk -- TODO confirm against the
            concrete adapters.
    """

@abstractmethod
def upsert_chunk_next_chunk(
    self,
    chunk: ParagraphChunk,
    next_chunk: ParagraphChunk,
) -> None:
    """Upsert the vertices and the edge in chunk_next_chunk.

    Abstract hook; each concrete graph store adapter provides the behavior.

    Args:
        chunk: The paragraph chunk the edge starts from (presumably the
            earlier chunk in reading order -- TODO confirm).
        next_chunk: The paragraph chunk that follows ``chunk``.
    """

@abstractmethod
def upsert_chunk_include_entity(
    self, chunk: ParagraphChunk, entity: Vertex
) -> None:
    """Convert chunk to chunk include entity.

    Abstract hook; each concrete graph store adapter provides the behavior.

    Args:
        chunk: The paragraph chunk on the "include" side of the relation.
        entity: The entity vertex associated with ``chunk``.
    """

@abstractmethod
def delete_document(self, chunk_id: str) -> None:
"""Delete document in graph store."""
Expand Down
41 changes: 36 additions & 5 deletions dbgpt/storage/knowledge_graph/community/memgraph_store_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
import logging
from typing import AsyncGenerator, Iterator, List, Optional, Tuple
from typing import AsyncGenerator, Iterator, List, Optional, Tuple, Union

from dbgpt.storage.graph_store.graph import (
Direction,
Expand All @@ -16,6 +16,7 @@
MemoryGraphStore,
MemoryGraphStoreConfig,
)
from dbgpt.storage.knowledge_graph.base import ParagraphChunk
from dbgpt.storage.knowledge_graph.community.base import Community, GraphStoreAdapter

logger = logging.getLogger(__name__)
Expand All @@ -38,11 +39,11 @@ def __init__(self, enable_summary: bool = False):

async def discover_communities(self, **kwargs) -> List[str]:
    """Run community discovery with leiden.

    This adapter performs no discovery and always reports no communities.

    Args:
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        An empty list.
    """
    # A bare ``[]`` expression statement is evaluated and discarded, which
    # would make the coroutine return None despite the List[str] annotation;
    # the empty list has to be returned explicitly.
    return []

async def get_community(self, community_id: str) -> Community:
"""Get community."""
pass
raise NotImplementedError("Memory graph store does not have community")

def get_graph_config(self):
"""Get the graph store config."""
Expand Down Expand Up @@ -96,18 +97,48 @@ def upsert_edge(
"""Upsert edges."""
pass

def upsert_chunks(self, chunks: Iterator[Vertex]) -> None:
def upsert_chunks(
self, chunks: Union[Iterator[Vertex], Iterator[ParagraphChunk]]
) -> None:
"""Upsert chunks."""
pass

def upsert_documents(self, documents: Iterator[Vertex]) -> None:
def upsert_documents(
self, documents: Union[Iterator[Vertex], Iterator[ParagraphChunk]]
) -> None:
"""Upsert documents."""
pass

def upsert_relations(self, relations: Iterator[Edge]) -> None:
"""Upsert relations."""
pass

def upsert_doc_include_chunk(
    self,
    chunk: ParagraphChunk,
) -> None:
    """Convert chunk to document include chunk.

    No-op in this adapter: the chunk is accepted and nothing is stored.

    Args:
        chunk: The paragraph chunk; unused here.
    """
    pass

def upsert_chunk_include_chunk(
    self,
    chunk: ParagraphChunk,
) -> None:
    """Convert chunk to chunk include chunk.

    No-op in this adapter: the chunk is accepted and nothing is stored.

    Args:
        chunk: The paragraph chunk; unused here.
    """
    pass

def upsert_chunk_next_chunk(
    self, chunk: ParagraphChunk, next_chunk: ParagraphChunk
) -> None:
    """Upsert the vertices and the edge in chunk_next_chunk.

    No-op in this adapter: both chunks are accepted and nothing is stored.

    Args:
        chunk: The paragraph chunk the edge would start from; unused here.
        next_chunk: The paragraph chunk that follows ``chunk``; unused here.
    """
    pass

def upsert_chunk_include_entity(
    self, chunk: ParagraphChunk, entity: Vertex
) -> None:
    """Convert chunk to chunk include entity.

    No-op in this adapter: the arguments are accepted and nothing is stored.

    Args:
        chunk: The paragraph chunk; unused here.
        entity: The entity vertex; unused here.
    """
    pass

def insert_triplet(self, subj: str, rel: str, obj: str) -> None:
"""Add triplet."""
self._graph_store._graph.append_edge(Edge(subj, obj, rel))
Expand Down
Loading

0 comments on commit 6d66678

Please sign in to comment.