diff --git a/.env.template b/.env.template
index cb96c9dcb..27688c6db 100644
--- a/.env.template
+++ b/.env.template
@@ -159,17 +159,20 @@ VECTOR_STORE_TYPE=Chroma
 GRAPH_STORE_TYPE=TuGraph
 KNOWLEDGE_GRAPH_EXTRACT_SEARCH_TOP_SIZE=5
 KNOWLEDGE_GRAPH_EXTRACT_SEARCH_RECALL_SCORE=0.3
+KNOWLEDGE_GRAPH_SIMILARITY_SEARCH_TOP_SIZE=5
+KNOWLEDGE_GRAPH_SIMILARITY_SEARCH_RECALL_SCORE=0.3
 KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_TOP_SIZE=20
 KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_RECALL_SCORE=0.0
 GRAPH_COMMUNITY_SUMMARY_ENABLED=True # enable the graph community summary
 TRIPLET_GRAPH_ENABLED=True # enable the graph search for triplets
 DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks
+SIMILARITY_SEARCH_ENABLED=False # enable the similarity search for entities and chunks
 KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the top size of knowledge graph search for chunks
 KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text
 COMMUNITY_SUMMARY_BATCH_SIZE=20 # the batch size of parallel community summary process
-
+KNOWLEDGE_GRAPH_EMBEDDING_BATCH_SIZE=20 # the batch size of embedding from the text
 
 ### Chroma vector db config
 #CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data
diff --git a/dbgpt/rag/transformer/base.py b/dbgpt/rag/transformer/base.py
index 35b3e022d..627c68507 100644
--- a/dbgpt/rag/transformer/base.py
+++ b/dbgpt/rag/transformer/base.py
@@ -29,8 +29,9 @@ async def embed(self, text: str) -> List[float]:
     @abstractmethod
     async def batch_embed(
         self,
-        texts: List[str],
-    ) -> List[List[float]]:
+        graphs_list: List[List],
+        batch_size: int = 1,
+    ) -> List[List]:
-        """Batch embed vectors from texts."""
+        """Batch embed the given graphs."""
diff --git a/dbgpt/rag/transformer/graph_embedder.py b/dbgpt/rag/transformer/graph_embedder.py
index 6865832a6..3f58fd687 100644
--- a/dbgpt/rag/transformer/graph_embedder.py
+++ b/dbgpt/rag/transformer/graph_embedder.py
@@ -1,51 +1,90 @@
 """GraphEmbedder class."""
+import asyncio
 import logging
 from typing import List
 
-from dbgpt.rag.transformer.text2vector import Text2Vector
+from tenacity import retry, stop_after_attempt, wait_fixed
+
+from dbgpt.core.interface.embeddings import Embeddings
+from dbgpt.rag.transformer.base import EmbedderBase
 from dbgpt.storage.graph_store.graph import Graph, GraphElemType
 
 logger = logging.getLogger(__name__)
 
 
-class GraphEmbedder(Text2Vector):
+class GraphEmbedder(EmbedderBase):
     """GraphEmbedder class."""
 
-    def __init__(self):
-        """Initialize the GraphEmbedder"""
+    def __init__(self, embedding_fn: Embeddings):
+        """Initialize the GraphEmbedder."""
+        self.embedding_fn = embedding_fn
         super().__init__()
-
+
     async def embed(
         self,
         text: str,
     ) -> List[float]:
-        """Embed"""
-        return await super()._embed(text)
-
+        """Embed."""
+        return await self.embedding_fn.aembed_query(text)
+
     async def batch_embed(
         self,
         graphs_list: List[List[Graph]],
+        batch_size: int = 1,
     ) -> List[List[Graph]]:
-        """Embed graphs from graphs in batches"""
-
+        """Embed graphs in batches."""
         for graphs in graphs_list:
             for graph in graphs:
+
+                texts = []
+                vectors = []
+
+                # Get the texts from the graph
                 for vertex in graph.vertices():
                     if vertex.get_prop("vertex_type") == GraphElemType.CHUNK.value:
-                        text = vertex.get_prop("content")
-                        vector = await self._embed(text)
-                        vertex.set_prop("embedding", vector)
+                        texts.append(vertex.get_prop("content"))
                     elif vertex.get_prop("vertex_type") == GraphElemType.ENTITY.value:
-                        vector = await self._embed(vertex.vid)
-                        vertex.set_prop("embedding", vector)
+                        texts.append(vertex.vid)
                     else:
-                        text = ""
+                        # Keep a placeholder so positions stay aligned with vertices
+                        texts.append(" ")
+
+                n_texts = len(texts)
+
+                # Batch embedding
+                for batch_idx in range(0, n_texts, batch_size):
+                    start_idx = batch_idx
+                    end_idx = min(start_idx + batch_size, n_texts)
+                    batch_texts = texts[start_idx:end_idx]
+
+                    # Create the embedding tasks
+                    embedding_tasks = [self._embed(text) for text in batch_texts]
+
+                    # Process the embeddings in parallel
+                    batch_results = await asyncio.gather(
+                        *embedding_tasks, return_exceptions=True
+                    )
+
+                    # Collect the results, raising on any failure
+                    for idx, vector in enumerate(batch_results):
+                        if isinstance(vector, Exception):
+                            raise RuntimeError(
+                                f"Failed to embed text {start_idx + idx}"
+                            )
+                        else:
+                            vectors.append(vector)
+
+                # Push the vectors back into the graph
+                for vertex, vector in zip(graph.vertices(), vectors):
+                    vertex.set_prop("_embedding", vector)
 
         return graphs_list
-
+
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
+    async def _embed(self, text: str) -> List[float]:
+        """Inner embed."""
+        return await self.embedding_fn.aembed_query(text)
+
     def truncate(self):
-        """"""
+        """Do nothing by default."""
 
     def drop(self):
-        """"""
+        """Do nothing by default."""
diff --git a/dbgpt/rag/transformer/text2vector.py b/dbgpt/rag/transformer/text2vector.py
index a81336509..ea383ee3b 100644
--- a/dbgpt/rag/transformer/text2vector.py
+++ b/dbgpt/rag/transformer/text2vector.py
@@ -1,47 +1,32 @@
 """Text2Vector class."""
+import asyncio
 import logging
-from abc import ABC
-from http import HTTPStatus
 from typing import List
 
-import dashscope
-
+from dbgpt.core.interface.embeddings import Embeddings
 from dbgpt.rag.transformer.base import EmbedderBase
 
 logger = logging.getLogger(__name__)
 
 
-class Text2Vector(EmbedderBase, ABC):
+class Text2Vector(EmbedderBase):
     """Text2Vector class."""
 
-    def __init__(self):
-        """Initialize the Embedder"""
+    def __init__(self, embedding_fn: Embeddings):
+        """Initialize the Embedder."""
+        self.embedding_fn = embedding_fn
+        super().__init__()
 
     async def embed(self, text: str) -> List[float]:
         """Embed vector from text."""
-        return await self._embed(text)
+        return await self.embedding_fn.aembed_query(text)
 
     async def batch_embed(
         self,
-        texts: List[str],
+        text_list: List[str],
+        batch_size: int = 1,
     ) -> List[List[float]]:
-        """Batch embed vectors from texts."""
-        results = []
-        for text in texts:
-            vector = await self._embed(text)
-            results.extend(vector)
-        return results
-
-    async def _embed(self, text: str) -> List[float]:
-        """Embed vector from text."""
-        resp = dashscope.TextEmbedding.call(
-            model = dashscope.TextEmbedding.Models.text_embedding_v3,
-            input = text,
-            dimension = 512)
-        embeddings = resp.output['embeddings']
-        embedding = embeddings[0]['embedding']
-        return list(embedding)
+        """Batch embed vectors from texts."""
+        results: List[List[float]] = []
+        # Embed each batch of texts concurrently
+        for batch_idx in range(0, len(text_list), batch_size):
+            batch_texts = text_list[batch_idx : batch_idx + batch_size]
+            vectors = await asyncio.gather(
+                *(self.embed(text) for text in batch_texts)
+            )
+            results.extend(vectors)
+        return results
 
     def truncate(self):
         """Do nothing by default."""
diff --git a/dbgpt/storage/graph_store/tugraph_store.py b/dbgpt/storage/graph_store/tugraph_store.py
index 4f8437245..cc2471cae 100644
--- a/dbgpt/storage/graph_store/tugraph_store.py
+++ b/dbgpt/storage/graph_store/tugraph_store.py
@@ -67,6 +67,18 @@ class TuGraphStoreConfig(GraphStoreConfig):
             "/dbgpt-tugraph-plugins/tree/master/cpp"
         ),
     )
+    similarity_search_enabled: bool = Field(
+        default=False,
+        description="Enable the similarity search",
+    )
+    similarity_search_topk: int = Field(
+        default=5,
+        description="Top k of the similarity search",
+    )
+    similarity_search_score_threshold: float = Field(
+        default=0.3,
+        description="Recall score threshold of the similarity search",
+    )
 
 
 class TuGraphStore(GraphStoreBase):
@@ -83,6 +95,11 @@ def __init__(self, config: TuGraphStoreConfig) -> None:
os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true" or config.enable_summary ) + self._similarity_search_enabled = ( + os.environ["SIMILARITY_SEARCH_ENABLED"].lower() == "true" + if "SIMILARITY_SEARCH_ENABLED" in os.environ + else config.similarity_search_enabled + ) self._plugin_names = ( os.getenv("TUGRAPH_PLUGIN_NAMES", "leiden").split(",") or config.plugin_names @@ -98,6 +115,19 @@ def __init__(self, config: TuGraphStoreConfig) -> None: db_name=config.name, ) + self._similarity_search_topk = int( + os.getenv( + "KNOWLEDGE_GRAPH_SIMILARITY_SEARCH_TOP_SIZE", + config.similarity_search_topk, + ) + ) + self._similarity_search_score_threshold = float( + os.getenv( + "KNOWLEDGE_GRAPH_SIMILARITY_SEARCH_RECALL_SCORE", + config.similarity_search_score_threshold, + ) + ) + def get_config(self) -> TuGraphStoreConfig: """Get the TuGraph store config.""" return self._config diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index 611f6708b..f52d9121a 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -42,10 +42,6 @@ def __init__(self, graph_store: TuGraphStore): # Create the graph self.create_graph(self.graph_store.get_config().name) - # vector index create control - self._chunk_vector_index = False - self._entity_vector_index = False - async def discover_communities(self, **kwargs) -> List[str]: """Run community discovery with leiden.""" mg = self.query( @@ -149,7 +145,11 @@ def upsert_entities(self, entities: Iterator[Vertex]) -> None: "_document_id": "0", "_chunk_id": "0", "_community_id": "0", - "_embedding": entity.get_prop("embedding"), + **( + {"_embedding": entity.get_prop("_embedding")} + if self.graph_store._similarity_search_enabled + else {} + ), } for entity in entities ] @@ -158,16 +158,32 @@ def upsert_entities(self, entities: Iterator[Vertex]) -> None: f'"{GraphElemType.ENTITY.value}", ' f"[{self._convert_dict_to_str(entity_list)}])" ) - create_vector_index_query = ( - f"CALL db.addVertexVectorIndex(" - f'"{GraphElemType.ENTITY.value}", "_embedding", ' - "{dimension: 512})" - ) - self.graph_store.conn.run(query=entity_query) - if not self._entity_vector_index : - self.graph_store.conn.run(query=create_vector_index_query) - self._entity_vector_index = True + # If similarity search enabled, then ready to create vector index + if self.graph_store._similarity_search_enabled: + # Check wheather the vector index exist + check_entity_vector_query = ( + f"CALL db.showVertexVectorIndex() " + "YIELD label_name, field_name " + f"WHERE label_name = '{GraphElemType.ENTITY.value}' " + f"AND field_name = '_embedding' " + "RETURN label_name" + ) + # If not exist, then create vector index + if not self.query(check_entity_vector_query).vertex_count: + # Get the dimension + dimension = len(entity_list[0].get("_embedding")) + # Then create index + create_vector_index_query = ( + f"CALL db.addVertexVectorIndex(" + f'"{GraphElemType.ENTITY.value}", "_embedding", ' + "{dimension: " + f"{dimension}" + "})" + ) + self.graph_store.conn.run(query=create_vector_index_query) + + self.graph_store.conn.run(query=entity_query) def upsert_edge( self, edges: Iterator[Edge], edge_type: str, src_type: str, dst_type: str @@ -203,7 +219,11 @@ def upsert_chunks(self, chunks: Iterator[Union[Vertex, ParagraphChunk]]) -> None "id": self._escape_quotes(chunk.vid), "name": self._escape_quotes(chunk.name), "content": 
self._escape_quotes(chunk.get_prop("content")), - "_embedding": chunk.get_prop("embedding"), + **( + {"_embedding": chunk.get_prop("_embedding")} + if self.graph_store._similarity_search_enabled + else {} + ), } for chunk in chunks ] @@ -213,16 +233,33 @@ def upsert_chunks(self, chunks: Iterator[Union[Vertex, ParagraphChunk]]) -> None f'"{GraphElemType.CHUNK.value}", ' f"[{self._convert_dict_to_str(chunk_list)}])" ) - create_vector_index_query = ( - f"CALL db.addVertexVectorIndex(" - f'"{GraphElemType.CHUNK.value}", "_embedding", ' - "{dimension: 512})" - ) + + if not all(isinstance(chunk, ParagraphChunk) for chunk in chunks): + # If similarity search enabled, then ready to create vector index + if self.graph_store._similarity_search_enabled: + # Check wheather the vector index exist + check_chunk_vector_query = ( + f"CALL db.showVertexVectorIndex() " + "YIELD label_name, field_name " + f"WHERE label_name = '{GraphElemType.CHUNK.value}' " + f"AND field_name = '_embedding' " + "RETURN label_name" + ) + # If not exist, then create vector index + if not self.query(check_chunk_vector_query).vertex_count: + # Get the dimension + dimension = len(chunk_list[0].get("_embedding")) + # Then create index + create_vector_index_query = ( + f"CALL db.addVertexVectorIndex(" + f'"{GraphElemType.CHUNK.value}", "_embedding", ' + "{dimension: " + f"{dimension}" + "})" + ) + self.graph_store.conn.run(query=create_vector_index_query) + self.graph_store.conn.run(query=chunk_query) - - if not self._chunk_vector_index : - self.graph_store.conn.run(query=create_vector_index_query) - self._chunk_vector_index = True def upsert_documents( self, documents: Iterator[Union[Vertex, ParagraphChunk]] @@ -586,28 +623,41 @@ def explore( rel = f"<-[r:{GraphElemType.RELATION.value}*{depth_string}]-" else: rel = f"-[r:{GraphElemType.RELATION.value}*{depth_string}]-" - - if all(isinstance(item, str) for item in subs): - header = f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " + + if not self.graph_store._similarity_search_enabled: + conditional_statement = ( + f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " + ) else: - final_list = [] + # If enable similarity search, using knn-search to get the id + similar_entities = [] + # Get the vector from vectors + # Then do knn-search for each vectors by using TuGraph for sub in subs: - vector = str(sub); - similarity_search = ( + vector = str(sub) + similarity_retrieval_query = ( f"CALL db.vertexVectorKnnSearch(" f"'{GraphElemType.ENTITY.value}','_embedding', {vector}, " - "{top_k:2, hnsw_ef_search:10})" - "YIELD node RETURN node.id AS id;" + "{top_k:" + f"{self.graph_store._similarity_search_topk}" + "}) YIELD node " + "WHERE node.distance < " + f"{self.graph_store._similarity_search_score_threshold} " + "RETURN node.id AS id;" ) - result_list = self.graph_store.conn.run(query=similarity_search) - final_list.extend(result_list) - id_list = [(record["id"]) for record in final_list] - header = f"WHERE n.id IN {id_list} " + result_list = self.graph_store.conn.run( + query=similarity_retrieval_query + ) + # Merge the result for each knn-search result + similar_entities.extend(result_list) + # Get the id from result + id_list = [(record["id"]) for record in similar_entities] + conditional_statement = f"WHERE n.id IN {id_list} " query = ( f"MATCH p=(n:{GraphElemType.ENTITY.value})" f"{rel}(m:{GraphElemType.ENTITY.value}) " - f"{header}" + f"{conditional_statement}" f"RETURN p {limit_string}" ) return self.query(query=query, white_list=["description"]) @@ -628,51 
@@ -628,51 +678,24 @@ def explore(
         # Check if the entities exist in the graph
-        if all(isinstance(item, str) for item in subs):
-            header = f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} "
-        else:
-            final_list = []
-            for sub in subs:
-                vector = str(sub);
-                similarity_search = (
-                    f"CALL db.vertexVectorKnnSearch("
-                    f"'{GraphElemType.ENTITY.value}','_embedding', {vector}, "
-                    "{top_k:2, hnsw_ef_search:10})"
-                    "YIELD node RETURN node.id AS id"
-                )
-                result_list = self.graph_store.conn.run(query=similarity_search)
-                final_list.extend(result_list)
-            id_list = [(record["id"]) for record in final_list]
-            header = f"WHERE n.id IN {id_list} "
+        conditional_statement = (
+            f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} "
+        )
         check_entity_query = (
             f"MATCH (n:{GraphElemType.ENTITY.value}) "
-            f"{header}"
+            f"{conditional_statement}"
             "RETURN n"
         )
-        if self.query(check_entity_query):
+        if self.query(check_entity_query).vertex_count != 0:
             # Query the leaf chunks in the chain from documents to chunks
-            if all(isinstance(item, str) for item in subs):
-                header = f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} "
-            else:
-                final_list = []
-                for sub in subs:
-                    vector = str(sub);
-                    similarity_search = (
-                        f"CALL db.vertexVectorKnnSearch("
-                        f"'{GraphElemType.ENTITY.value}','_embedding', {vector}, "
-                        "{top_k:2, hnsw_ef_search:10})"
-                        "YIELD node RETURN node.name AS name"
-                    )
-                    result_list = self.graph_store.conn.run(query=similarity_search)
-                    final_list.extend(result_list)
-                name_list = [(record["name"]) for record in final_list]
-                header = f"WHERE n.name IN {name_list} "
-
+            conditional_statement = (
+                f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} "
+            )
             leaf_chunk_query = (
                 f"MATCH p=(n:{GraphElemType.CHUNK.value})-"
                 f"[r:{GraphElemType.INCLUDE.value}]->"
                 f"(m:{GraphElemType.ENTITY.value})"
-                f"{header} "
+                f"{conditional_statement} "
                 f"RETURN n"
             )
             graph_of_leaf_chunks = self.query(
@@ -706,25 +729,34 @@ def explore(
                 )
             )
         else:
-            if all(isinstance(item, str) for item in subs):
+            if not self.graph_store._similarity_search_enabled:
                 _subs_condition = " OR ".join(
-                    [f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs]
+                    [
+                        f"m.content CONTAINS '{self._escape_quotes(sub)}'"
+                        for sub in subs
+                    ]
                 )
             else:
-                final_list = []
+                similar_chunks = []
                 for sub in subs:
-                    vector = str(sub);
-                    similarity_search = (
+                    vector = str(sub)
+                    similarity_retrieval_query = (
                         f"CALL db.vertexVectorKnnSearch("
                         f"'{GraphElemType.CHUNK.value}','_embedding', {vector}, "
-                        "{top_k:2, hnsw_ef_search:10})"
-                        "YIELD node RETURN node.name AS name"
+                        "{top_k:"
+                        f"{self.graph_store._similarity_search_topk}"
+                        "}) YIELD node "
+                        "WHERE node.distance < "
+                        f"{self.graph_store._similarity_search_score_threshold} "
+                        "RETURN node.name AS name"
+                    )
+                    result_list = self.graph_store.conn.run(
+                        query=similarity_retrieval_query
                     )
-                    result_list = self.graph_store.conn.run(query=similarity_search)
-                    final_list.extend(result_list)
-                name_list = [(record["name"]) for record in final_list]
+                    similar_chunks.extend(result_list)
+                name_list = [record["name"] for record in similar_chunks]
                 _subs_condition = f"n.name IN {name_list} "
-
+
             # Query the chain from documents to chunks,
             # document -> chunk -> chunk -> chunk -> ... -> chunk
             chain_query = (
diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py
index f8e40a1d4..f8108da1b 100644
--- a/dbgpt/storage/knowledge_graph/community_summary.py
+++ b/dbgpt/storage/knowledge_graph/community_summary.py
@@ -10,6 +10,7 @@
 from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer
 from dbgpt.rag.transformer.graph_embedder import GraphEmbedder
 from dbgpt.rag.transformer.graph_extractor import GraphExtractor
+from dbgpt.rag.transformer.text2vector import Text2Vector
 from dbgpt.storage.knowledge_graph.base import ParagraphChunk
 from dbgpt.storage.knowledge_graph.community.community_store import CommunityStore
 from dbgpt.storage.knowledge_graph.knowledge_graph import (
@@ -66,7 +67,10 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig):
         default=True,
         description="Enable the graph search for documents and chunks",
     )
-
+    similarity_search_enabled: bool = Field(
+        default=False,
+        description="Enable the similarity search",
+    )
     knowledge_graph_chunk_search_top_size: int = Field(
         default=5,
         description="Top size of knowledge graph chunk search",
@@ -79,9 +83,9 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig):
         default=20,
         description="Batch size of parallel community building process",
     )
-    similar_search_enabled: bool = Field(
-        default=False,
-        description="Enable the similarity search",
+    knowledge_graph_embedding_batch_size: int = Field(
+        default=20,
+        description="Batch size of triplet embedding from the text",
     )
 
 
@@ -138,16 +142,22 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig):
                 config.knowledge_graph_extraction_batch_size,
             )
         )
+        self._triplet_embedding_batch_size = int(
+            os.getenv(
+                "KNOWLEDGE_GRAPH_EMBEDDING_BATCH_SIZE",
+                config.knowledge_graph_embedding_batch_size,
+            )
+        )
         self._community_summary_batch_size = int(
             os.getenv(
                 "COMMUNITY_SUMMARY_BATCH_SIZE",
                 config.community_summary_batch_size,
             )
        )
-        self._similar_search_enabled = (
-            os.environ["SIMILAR_SEARCH_ENABLED"].lower() == "true"
-            if "SIMILAR_SEARCH_ENABLED" in os.environ
-            else config.similar_search_enabled
+        self._similarity_search_enabled = (
+            os.environ["SIMILARITY_SEARCH_ENABLED"].lower() == "true"
+            if "SIMILARITY_SEARCH_ENABLED" in os.environ
+            else config.similarity_search_enabled
         )
 
     def extractor_configure(name: str, cfg: VectorStoreConfig):
@@ -170,7 +180,9 @@ def extractor_configure(name: str, cfg: VectorStoreConfig):
             ),
         )
 
-        self._garph_embedder = GraphEmbedder()
+        self._graph_embedder = GraphEmbedder(self._config.embedding_fn)
+
+        self._text_embedder = Text2Vector(self._config.embedding_fn)
 
     def community_store_configure(name: str, cfg: VectorStoreConfig):
         cfg.name = name
@@ -256,7 +268,14 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None:
         if not graphs_list:
             raise ValueError("No graphs extracted from the chunks")
 
-        graphs_list = await self._garph_embedder.batch_embed(graphs_list)
+        similarity_search_enabled = self._similarity_search_enabled
+
+        # If similarity search is enabled, add the embeddings to the graphs
+        if similarity_search_enabled:
+            graphs_list = await self._graph_embedder.batch_embed(
+                graphs_list,
+                batch_size=self._triplet_embedding_batch_size,
+            )
 
         # Upsert the graphs into the graph store
         for idx, graphs in enumerate(graphs_list):
@@ -348,15 +367,8 @@ async def asimilar_search_with_scores(
         context = "\n".join(summaries) if summaries else ""
 
         # Vector similarity search
-        similar_search_enabled = self._similar_search_enabled
+        similarity_search_enabled = self._similarity_search_enabled
 
-        if similar_search_enabled:
-            keywords: List[List[float]] = []
-            vector = await self._garph_embedder.embed(text)
-            keywords.append(vector)
-        else:
-            keywords: List[str] = await self._keyword_extractor.extract(text)
-
         subgraph = None
         subgraph_for_doc = None
 
@@ -364,41 +376,73 @@ async def asimilar_search_with_scores(
         triplet_graph_enabled = self._triplet_graph_enabled
         document_graph_enabled = self._document_graph_enabled
 
-        if triplet_graph_enabled:
-            subgraph = self._graph_store_apdater.explore(
-                subs=keywords, limit=topk, search_scope="knowledge_graph"
-            )
-
-            if document_graph_enabled:
-                keywords_for_document_graph = keywords
-                if similar_search_enabled:
-                    for vertex in subgraph.vertices():
-                        vector = await self._garph_embedder.embed(vertex.name)
-                        keywords_for_document_graph.append(vector)
-                else:
+        keywords: List[str] = await self._keyword_extractor.extract(text)
+        # If similarity search is enabled, use vectors to search
+        if similarity_search_enabled:
+            vectors: List[List[float]] = []
+            vector = await self._text_embedder.embed(text)
+            vectors.append(vector)
+
+            if triplet_graph_enabled:
+                # Use similarity search to get entity.vid as keywords
+                subgraph = self._graph_store_apdater.explore(
+                    subs=vectors, limit=topk, search_scope="knowledge_graph"
+                )
+                # Append the keywords
+                if document_graph_enabled:
+                    keywords_for_document_graph = keywords
                     for vertex in subgraph.vertices():
                         keywords_for_document_graph.append(vertex.name)
-                subgraph_for_doc = self._graph_store_apdater.explore(
-                    subs=keywords_for_document_graph,
-                    limit=self._knowledge_graph_chunk_search_top_size,
-                    search_scope="document_graph",
-                )
+                    # Use the keywords to get the chunks and docs
+                    subgraph_for_doc = self._graph_store_apdater.explore(
+                        subs=keywords_for_document_graph,
+                        limit=self._knowledge_graph_chunk_search_top_size,
+                        search_scope="document_graph",
+                    )
+            else:
+                if document_graph_enabled:
+                    # Use similarity search to get the chunks and docs
+                    subgraph_for_doc = self._graph_store_apdater.explore(
+                        subs=vectors,
+                        limit=self._knowledge_graph_chunk_search_top_size,
+                        search_scope="document_graph",
+                    )
+            logger.info(f"Search subgraph from {len(vectors)} vectors")
+
+        # Otherwise, use keywords
         else:
-            if document_graph_enabled:
-                subgraph_for_doc = self._graph_store_apdater.explore(
-                    subs=keywords,
-                    limit=self._knowledge_graph_chunk_search_top_size,
-                    search_scope="document_graph",
+            if triplet_graph_enabled:
+                subgraph = self._graph_store_apdater.explore(
+                    subs=keywords, limit=topk, search_scope="knowledge_graph"
                 )
+                if document_graph_enabled:
+                    keywords_for_document_graph = keywords
+                    for vertex in subgraph.vertices():
+                        keywords_for_document_graph.append(vertex.name)
+
+                    subgraph_for_doc = self._graph_store_apdater.explore(
+                        subs=keywords_for_document_graph,
+                        limit=self._knowledge_graph_chunk_search_top_size,
+                        search_scope="document_graph",
+                    )
+            else:
+                if document_graph_enabled:
+                    subgraph_for_doc = self._graph_store_apdater.explore(
+                        subs=keywords,
+                        limit=self._knowledge_graph_chunk_search_top_size,
+                        search_scope="document_graph",
+                    )
+            logger.info(f"Search subgraph from {len(keywords)} keywords")
+
         knowledge_graph_str = subgraph.format() if subgraph else ""
         knowledge_graph_for_doc_str = (
             subgraph_for_doc.format() if subgraph_for_doc else ""
         )
 
-        logger.info(f"Search subgraph from the following keywords:\n{len(keywords)}")
-
         if not (summaries or knowledge_graph_str or knowledge_graph_for_doc_str):
             return []
 
@@ -420,7 +464,9 @@ def truncate(self) -> List[str]:
         logger.info("Truncate triplet extractor")
         self._graph_extractor.truncate()
 
         logger.info("Truncate graph embedder")
-        self._garph_embedder.truncate()
+        self._graph_embedder.truncate()
+
+        logger.info("Truncate text embedder")
+        self._text_embedder.truncate()
 
         return [self._config.name]
 
     def delete_vector_name(self, index_name: str):
@@ -435,7 +481,10 @@ def delete_vector_name(self, index_name: str):
         self._graph_extractor.drop()
 
         logger.info("Drop graph embedder")
-        self._garph_embedder.drop()
+        self._graph_embedder.drop()
+
+        logger.info("Drop text embedder")
+        self._text_embedder.drop()
 
 
 HYBRID_SEARCH_PT = """
diff --git a/docs/docs/cookbook/rag/graph_rag_app_develop.md b/docs/docs/cookbook/rag/graph_rag_app_develop.md
index 46c328bd5..ab888c354 100644
--- a/docs/docs/cookbook/rag/graph_rag_app_develop.md
+++ b/docs/docs/cookbook/rag/graph_rag_app_develop.md
@@ -360,7 +360,7 @@ Comparison of the DB-GPT community and the TuGraph community
 Overall, the DB-GPT community and the TuGraph community each have their own strengths in community contributions, ecosystem, and developer engagement. The DB-GPT community focuses more on the diversity of AI applications and cross-organization collaboration, while the TuGraph community concentrates on the efficient management and analysis of graph data. Both emphasize the importance of open source and community collaboration, driving technical progress and application development in their respective fields.
 ```
 
-### Latest Updates
+### Retrieval Of Document Structure
 
 In version 0.6.1 of DB-GPT, we have added a new feature:
 - Retrieval of triplets with the **retrieval of document structure**
@@ -389,4 +389,51 @@ We decompose standard format files (currently best support for Markdown files) i
 
 What's next?
 
-We aim to construct a more complex Graph that covers more comprehensive information to support more sophisticated retrieval algorithms in our GraphRAG.
\ No newline at end of file
+We aim to construct a more complex Graph that covers more comprehensive information to support more sophisticated retrieval algorithms in our GraphRAG.
+
+
+### Similarity Search for GraphRAG
+
+In the new version of DB-GPT, we have added a new feature:
+- **Similarity search** for retrieval in GraphRAG
+
+#### How to use?
+
+Set the variables below in the `.env` file to let DB-GPT know you want to use similarity search.
+
+```
+SIMILARITY_SEARCH_ENABLED=True # enable the similarity search for entities and chunks
+KNOWLEDGE_GRAPH_EMBEDDING_BATCH_SIZE=20 # the batch size of embedding from the text
+KNOWLEDGE_GRAPH_SIMILARITY_SEARCH_TOP_SIZE=5 # set the top k of the vector similarity search
+KNOWLEDGE_GRAPH_SIMILARITY_SEARCH_RECALL_SCORE=0.3 # set the recall score of the vector similarity search
+```
+
+You also need to choose an embedding model in the `.env` file:
+
+```
+## Openai embedding model, See dbgpt/model/parameter.py
+# EMBEDDING_MODEL=proxy_openai
+# proxy_openai_proxy_server_url=https://api.openai.com/v1
+# proxy_openai_proxy_api_key={your-openai-sk}
+# proxy_openai_proxy_backend=text-embedding-ada-002
+
+
+## qwen embedding model, See dbgpt/model/parameter.py
+# EMBEDDING_MODEL=proxy_tongyi
+# proxy_tongyi_proxy_backend=text-embedding-v1
+# proxy_tongyi_proxy_api_key={your-api-key}
+
+## qianfan embedding model, See dbgpt/model/parameter.py
+#EMBEDDING_MODEL=proxy_qianfan
+#proxy_qianfan_proxy_backend=bge-large-zh
+#proxy_qianfan_proxy_api_key={your-api-key}
+#proxy_qianfan_proxy_api_secret={your-secret-key}
+```
+
+#### Why use it?
+
+TuGraph now supports vector storage, vector indexes, and vector similarity search. GraphRAG can therefore make good use of this capability to obtain better retrieval quality than keyword-based retrieval.
+
+To take advantage of this feature, we add the `_embedding` field to entity and chunk vertices to store the embedding data.
+
+The vector index is created on the `_embedding` field using TuGraph's vector index capability, and similarity search over that index is used to retrieve the results most similar to the question.
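+
+To see what this looks like in practice, here is a minimal sketch of the knn retrieval query that the TuGraph store adapter issues under the hood (assuming the entity label is `entity`; the toy 4-dimensional vector is illustrative, real vectors match your embedding model's dimension, and the `top_k` and distance threshold shown come from the default configuration above):
+
+```
+CALL db.vertexVectorKnnSearch(
+    'entity', '_embedding', [0.12, 0.34, 0.56, 0.78],
+    {top_k: 5})
+YIELD node
+WHERE node.distance < 0.3
+RETURN node.id AS id;
+```
+
+The returned ids are then used to filter the `MATCH` clause that explores the knowledge graph, so only entities whose embeddings are close to the question's embedding take part in graph exploration.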