From 25d47ce3432545a1be58c9799e27181ac26106af Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 5 Nov 2024 14:01:18 +0800 Subject: [PATCH] feat: Enhance the triplets extraction in the knowledge graph by the batch size (#2091) --- .env.template | 1 + dbgpt/rag/transformer/base.py | 10 + dbgpt/rag/transformer/graph_extractor.py | 98 +++++- dbgpt/rag/transformer/llm_extractor.py | 28 ++ dbgpt/rag/transformer/triplet_extractor.py | 3 +- dbgpt/storage/graph_store/base.py | 8 - dbgpt/storage/graph_store/tugraph_store.py | 8 - .../community/tugraph_store_adapter.py | 128 +++++-- .../knowledge_graph/community_summary.py | 321 ++++++++---------- .../cookbook/rag/graph_rag_app_develop.md | 1 + 10 files changed, 362 insertions(+), 244 deletions(-) diff --git a/.env.template b/.env.template index 45e075d67..453e2c6d3 100644 --- a/.env.template +++ b/.env.template @@ -167,6 +167,7 @@ TRIPLET_GRAPH_ENABLED=True # enable the graph search for triplets DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the top size of knowledge graph search for chunks +KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text ### Chroma vector db config #CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data diff --git a/dbgpt/rag/transformer/base.py b/dbgpt/rag/transformer/base.py index 289f02887..a71c2da14 100644 --- a/dbgpt/rag/transformer/base.py +++ b/dbgpt/rag/transformer/base.py @@ -1,4 +1,5 @@ """Transformer base class.""" + import logging from abc import ABC, abstractmethod from typing import List, Optional @@ -37,6 +38,15 @@ class ExtractorBase(TransformerBase, ABC): async def extract(self, text: str, limit: Optional[int] = None) -> List: """Extract results from text.""" + @abstractmethod + async def batch_extract( + self, + texts: List[str], + batch_size: int = 1, + limit: Optional[int] = None, + ) -> List: + """Batch extract results from texts.""" + class TranslatorBase(TransformerBase, 
ABC): """Translator base class.""" diff --git a/dbgpt/rag/transformer/graph_extractor.py b/dbgpt/rag/transformer/graph_extractor.py index 12751e89f..7a02f74d1 100644 --- a/dbgpt/rag/transformer/graph_extractor.py +++ b/dbgpt/rag/transformer/graph_extractor.py @@ -1,8 +1,9 @@ """GraphExtractor class.""" +import asyncio import logging import re -from typing import List, Optional +from typing import Dict, List, Optional from dbgpt.core import Chunk, LLMClient from dbgpt.rag.transformer.llm_extractor import LLMExtractor @@ -23,35 +24,96 @@ def __init__( self._chunk_history = chunk_history config = self._chunk_history.get_config() + self._vector_space = config.name self._max_chunks_once_load = config.max_chunks_once_load self._max_threads = config.max_threads self._topk = config.topk self._score_threshold = config.score_threshold - async def extract(self, text: str, limit: Optional[int] = None) -> List: - """Load similar chunks.""" - # load similar chunks - chunks = await self._chunk_history.asimilar_search_with_scores( - text, self._topk, self._score_threshold - ) - history = [ - f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks) - ] - context = "\n".join(history) if history else "" - - try: - # extract with chunk history - return await super()._extract(text, context, limit) - - finally: - # save chunk to history + async def aload_chunk_context(self, texts: List[str]) -> Dict[str, str]: + """Load chunk context.""" + text_context_map: Dict[str, str] = {} + + for text in texts: + # Load similar chunks + chunks = await self._chunk_history.asimilar_search_with_scores( + text, self._topk, self._score_threshold + ) + history = [ + f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks) + ] + + # Save chunk to history await self._chunk_history.aload_document_with_limit( [Chunk(content=text, metadata={"relevant_cnt": len(history)})], self._max_chunks_once_load, self._max_threads, ) + # Save chunk context to map + context = "\n".join(history) 
if history else "" + text_context_map[text] = context + return text_context_map + + async def extract(self, text: str, limit: Optional[int] = None) -> List: + """Extract graphs from text. + + Suggestion: to extract triplets in batches, call `batch_extract`. + """ + # Load similar chunks + text_context_map = await self.aload_chunk_context([text]) + context = text_context_map[text] + + # Extract with chunk history + return await super()._extract(text, context, limit) + + async def batch_extract( + self, + texts: List[str], + batch_size: int = 1, + limit: Optional[int] = None, + ) -> List[List[Graph]]: + """Extract graphs from chunks in batches. + + Returns list of graphs in same order as input texts (text <-> graphs). + """ + if batch_size < 1: + raise ValueError("batch_size >= 1") + + # 1. Load chunk context + text_context_map = await self.aload_chunk_context(texts) + + # Pre-allocate results list to maintain order + graphs_list: List[List[Graph]] = [None] * len(texts) + total_batches = (len(texts) + batch_size - 1) // batch_size + + for batch_idx in range(total_batches): + start_idx = batch_idx * batch_size + end_idx = min((batch_idx + 1) * batch_size, len(texts)) + batch_texts = texts[start_idx:end_idx] + + # 2. Create tasks with their original indices + extraction_tasks = [ + ( + idx, + self._extract(text, text_context_map[text], limit), + ) + for idx, text in enumerate(batch_texts, start=start_idx) + ] + + # 3. Process extraction in parallel while keeping track of indices + batch_results = await asyncio.gather( + *(task for _, task in extraction_tasks) + ) + + # 4. 
Place results in the correct positions + for (idx, _), graphs in zip(extraction_tasks, batch_results): + graphs_list[idx] = graphs + + assert all(x is not None for x in graphs_list), "All positions should be filled" + return graphs_list + def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]: graph = MemoryGraph() edge_count = 0 diff --git a/dbgpt/rag/transformer/llm_extractor.py b/dbgpt/rag/transformer/llm_extractor.py index 494096d51..049611bc7 100644 --- a/dbgpt/rag/transformer/llm_extractor.py +++ b/dbgpt/rag/transformer/llm_extractor.py @@ -1,4 +1,6 @@ """TripletExtractor class.""" + +import asyncio import logging from abc import ABC, abstractmethod from typing import List, Optional @@ -22,6 +24,32 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: """Extract by LLM.""" return await self._extract(text, None, limit) + async def batch_extract( + self, + texts: List[str], + batch_size: int = 1, + limit: Optional[int] = None, + ) -> List: + """Batch extract by LLM.""" + if batch_size < 1: + raise ValueError("batch_size >= 1") + + results = [] + + for i in range(0, len(texts), batch_size): + batch_texts = texts[i : i + batch_size] + + # Create tasks for current batch + extraction_tasks = [ + self._extract(text, None, limit) for text in batch_texts + ] + + # Execute batch concurrently and wait for all to complete + batch_results = await asyncio.gather(*extraction_tasks) + results.extend(batch_results) + + return results + async def _extract( self, text: str, history: str = None, limit: Optional[int] = None ) -> List: diff --git a/dbgpt/rag/transformer/triplet_extractor.py b/dbgpt/rag/transformer/triplet_extractor.py index 7a591560f..60b5346f3 100644 --- a/dbgpt/rag/transformer/triplet_extractor.py +++ b/dbgpt/rag/transformer/triplet_extractor.py @@ -1,4 +1,5 @@ """TripletExtractor class.""" + import logging import re from typing import Any, List, Optional, Tuple @@ -12,7 +13,7 @@ "Some text is provided below. 
Given the text, " "extract up to knowledge triplets as more as possible " "in the form of (subject, predicate, object).\n" - "Avoid stopwords.\n" + "Avoid stopwords. The subject, predicate, object can not be none.\n" "---------------------\n" "Example:\n" "Text: Alice is Bob's mother.\n" diff --git a/dbgpt/storage/graph_store/base.py b/dbgpt/storage/graph_store/base.py index 5c2112578..a3344eeea 100644 --- a/dbgpt/storage/graph_store/base.py +++ b/dbgpt/storage/graph_store/base.py @@ -27,14 +27,6 @@ class GraphStoreConfig(BaseModel): default=False, description="Enable graph community summary or not.", ) - document_graph_enabled: bool = Field( - default=True, - description="Enable document graph search or not.", - ) - triplet_graph_enabled: bool = Field( - default=True, - description="Enable knowledge graph search or not.", - ) class GraphStoreBase(ABC): diff --git a/dbgpt/storage/graph_store/tugraph_store.py b/dbgpt/storage/graph_store/tugraph_store.py index c20965947..4f8437245 100644 --- a/dbgpt/storage/graph_store/tugraph_store.py +++ b/dbgpt/storage/graph_store/tugraph_store.py @@ -83,14 +83,6 @@ def __init__(self, config: TuGraphStoreConfig) -> None: os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true" or config.enable_summary ) - self._enable_document_graph = ( - os.getenv("DOCUMENT_GRAPH_ENABLED", "").lower() == "true" - or config.document_graph_enabled - ) - self._enable_triplet_graph = ( - os.getenv("TRIPLET_GRAPH_ENABLED", "").lower() == "true" - or config.triplet_graph_enabled - ) self._plugin_names = ( os.getenv("TUGRAPH_PLUGIN_NAMES", "leiden").split(",") or config.plugin_names diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index d65969d76..fa107a28b 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -544,7 +544,7 @@ def explore( if not subs: 
return MemoryGraph() - if depth < 0: + if depth <= 0: depth = 3 depth_string = f"1..{depth}" @@ -566,23 +566,95 @@ def explore( f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " f"RETURN p {limit_string}" ) - return self.query(query) + return self.query(query=query, white_list=["description"]) else: + # If there exists the entities in the graph, return the graph that + # includes the leaf chunks that connect to the entities, the chains from + # documents to the leaf chunks, and the chain from documents to chunks; + # document -> chunk -> chunk -> ... -> leaf chunk -> (entity) + # + # If not, return the graph that includes the chains from documents to chunks + # that contain the subs (keywords). + # document -> chunk -> chunk -> ... -> leaf chunk (that contains the subs) + # + # And only the leaf chunks contain the content, and the other chunks do not + # contain any properties except the id, name. + graph = MemoryGraph() - for sub in subs: - query = ( + # Check if the entities exist in the graph + check_entity_query = ( + f"MATCH (n:{GraphElemType.ENTITY.value}) " + f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " + "RETURN n" + ) + if self.query(check_entity_query): + # Query the leaf chunks in the chain from documents to chunks + leaf_chunk_query = ( + f"MATCH p=(n:{GraphElemType.CHUNK.value})-" + f"[r:{GraphElemType.INCLUDE.value}]->" + f"(m:{GraphElemType.ENTITY.value})" + f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} " + f"RETURN n" + ) + graph_of_leaf_chunks = self.query( + query=leaf_chunk_query, white_list=["content"] + ) + + # Query the chain from documents to chunks, + # document -> chunk -> ... 
-> leaf_chunks + chunk_names = [ + self._escape_quotes(vertex.name) + for vertex in graph_of_leaf_chunks.vertices() + ] + chain_query = ( + f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" + f"[:{GraphElemType.INCLUDE.value}*{depth_string}]->" + f"(m:{GraphElemType.CHUNK.value})" + f"WHERE m.name IN {chunk_names} " + "RETURN p" + ) + # Filter all the properties by white_list + graph.upsert_graph(self.query(query=chain_query, white_list=[""])) + + # The number of leaf chunks compared to the `limit` + if not limit or len(chunk_names) <= limit: + graph.upsert_graph(graph_of_leaf_chunks) + else: + limited_leaf_chunk_query = leaf_chunk_query + f" {limit_string}" + graph.upsert_graph( + self.query( + query=limited_leaf_chunk_query, white_list=["content"] + ) + ) + else: + _subs_condition = " OR ".join( + [f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs] + ) + + # Query the chain from documents to chunks, + # document -> chunk -> chunk -> chunk -> ... -> chunk + chain_query = ( + f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" + f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" + f"(m:{GraphElemType.CHUNK.value})" + f"WHERE {_subs_condition}" + "RETURN p" + ) + # Filter all the properties by white_list + graph.upsert_graph(self.query(query=chain_query, white_list=[""])) + + # Query the leaf chunks in the chain from documents to chunks + leaf_chunk_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" - f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]-" - f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " - f"'{self._escape_quotes(sub)}' " - f"RETURN p {limit_string}" - ) # if it contains the subjects - result = self.query(query) - for vertex in result.vertices(): - graph.upsert_vertex(vertex) - for edge in result.edges(): - graph.append_edge(edge) + f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" + f"(m:{GraphElemType.CHUNK.value})" + f"WHERE {_subs_condition}" + f"RETURN m {limit_string}" + ) + graph.upsert_graph( +
self.query(query=leaf_chunk_query, white_list=["content"]) + ) return graph @@ -607,6 +679,7 @@ def query(self, query: str, **kwargs) -> MemoryGraph: vertices, edges = self._get_nodes_edges_from_queried_data( query_result, white_list ) + mg = MemoryGraph() for vertex in vertices: mg.upsert_vertex(vertex) @@ -714,7 +787,7 @@ def _get_nodes_edges_from_queried_data( from neo4j import graph def filter_properties( - properties: dict[str, Any], white_list: List[str] + properties: dict[str, Any], white_list: Optional[List[str]] = None ) -> Dict[str, Any]: """Filter the properties. @@ -723,13 +796,26 @@ def filter_properties( entity_properties = ["id", "name", "description", "_document_id", "_chunk_id", "_community_id"] edge_properties = ["id", "name", "description", "_chunk_id"] + Args: + properties: Dictionary of properties to filter + white_list: List of properties to keep + - If None: Keep default properties (those not starting with '_' + and not in ['id', 'name']) + - If [""]: Remove all properties (return empty dict) + - If list of strings: Keep only properties in white_list """ - return { - key: value - for key, value in properties.items() - if (not key.startswith("_") and key not in ["id", "name"]) - or key in white_list - } + return ( + {} + if white_list == [""] + else { + key: value + for key, value in properties.items() + if ( + (not key.startswith("_") and key not in ["id", "name"]) + or (white_list is not None and key in white_list) + ) + } + ) # Parse the data to nodes and relationships for record in data: diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 904b0beba..62e3e4c13 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -9,7 +9,6 @@ from dbgpt.core import Chunk from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer from dbgpt.rag.transformer.graph_extractor import GraphExtractor -from 
dbgpt.storage.graph_store.graph import MemoryGraph from dbgpt.storage.knowledge_graph.base import ParagraphChunk from dbgpt.storage.knowledge_graph.community.community_store import CommunityStore from dbgpt.storage.knowledge_graph.knowledge_graph import ( @@ -59,10 +58,23 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig): default=0.0, description="Recall score of community search in knowledge graph", ) + triplet_graph_enabled: bool = Field( + default=True, + description="Enable the graph search for triplets", + ) + document_graph_enabled: bool = Field( + default=True, + description="Enable the graph search for documents and chunks", + ) + knowledge_graph_chunk_search_top_size: int = Field( default=5, description="Top size of knowledge graph chunk search", ) + knowledge_graph_extraction_batch_size: int = Field( + default=20, + description="Batch size of triplets extraction from the text", + ) class CommunitySummaryKnowledgeGraph(BuiltinKnowledgeGraph): @@ -96,6 +108,28 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): config.community_score_threshold, ) ) + self._document_graph_enabled = ( + os.environ["DOCUMENT_GRAPH_ENABLED"].lower() == "true" + if "DOCUMENT_GRAPH_ENABLED" in os.environ + else config.document_graph_enabled + ) + self._triplet_graph_enabled = ( + os.environ["TRIPLET_GRAPH_ENABLED"].lower() == "true" + if "TRIPLET_GRAPH_ENABLED" in os.environ + else config.triplet_graph_enabled + ) + self._knowledge_graph_chunk_search_top_size = int( + os.getenv( + "KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE", + config.knowledge_graph_chunk_search_top_size, + ) + ) + self._triplet_extraction_batch_size = int( + os.getenv( + "KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE", + config.knowledge_graph_extraction_batch_size, + ) + ) def extractor_configure(name: str, cfg: VectorStoreConfig): cfg.name = name @@ -154,7 +188,7 @@ async def _aload_document_graph(self, chunks: List[Chunk]) -> None: The chunks include the doc structure. 
""" - if not self._graph_store.get_config().document_graph_enabled: + if not self._document_graph_enabled: return _chunks: List[ParagraphChunk] = [ @@ -185,33 +219,35 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: The chunks include the doc structure. """ - if not self._graph_store.get_config().triplet_graph_enabled: + if not self._triplet_graph_enabled: return - document_graph_enabled = self._graph_store.get_config().document_graph_enabled - for chunk in chunks: - # TODO: Use asyncio to extract graph to accelerate the process - # (attention to the CAP of the graph db) + document_graph_enabled = self._document_graph_enabled - graphs: List[MemoryGraph] = await self._graph_extractor.extract( - chunk.content - ) + # Extract the triplets from the chunks, and return the list of graphs + # in the same order as the input texts + graphs_list = await self._graph_extractor.batch_extract( + [chunk.content for chunk in chunks], + batch_size=self._triplet_extraction_batch_size, + ) + # Upsert the graphs into the graph store + for idx, graphs in enumerate(graphs_list): for graph in graphs: if document_graph_enabled: - # append the chunk id to the edge + # Append the chunk id to the edge for edge in graph.edges(): - edge.set_prop("_chunk_id", chunk.chunk_id) + edge.set_prop("_chunk_id", chunks[idx].chunk_id) graph.append_edge(edge=edge) - # upsert the graph + # Upsert the graph self._graph_store_apdater.upsert_graph(graph) # chunk -> include -> entity if document_graph_enabled: for vertex in graph.vertices(): self._graph_store_apdater.upsert_chunk_include_entity( - chunk=chunk, entity=vertex + chunk=chunks[idx], entity=vertex ) def _load_chunks( @@ -285,13 +321,15 @@ async def asimilar_search_with_scores( context = "\n".join(summaries) if summaries else "" keywords: List[str] = await self._keyword_extractor.extract(text) + subgraph = None + subgraph_for_doc = None # Local search: extract keywords and explore subgraph - triplet_graph_enabled = 
self._graph_store.get_config().triplet_graph_enabled - document_graph_enabled = self._graph_store.get_config().document_graph_enabled + triplet_graph_enabled = self._triplet_graph_enabled + document_graph_enabled = self._document_graph_enabled if triplet_graph_enabled: - subgraph: MemoryGraph = self._graph_store_apdater.explore( + subgraph = self._graph_store_apdater.explore( subs=keywords, limit=topk, search_scope="knowledge_graph" ) @@ -302,14 +340,14 @@ async def asimilar_search_with_scores( subgraph_for_doc = self._graph_store_apdater.explore( subs=keywords_for_document_graph, - limit=self._config.knowledge_graph_chunk_search_top_size, + limit=self._knowledge_graph_chunk_search_top_size, search_scope="document_graph", ) else: if document_graph_enabled: subgraph_for_doc = self._graph_store_apdater.explore( subs=keywords, - limit=self._config.knowledge_graph_chunk_search_top_size, + limit=self._knowledge_graph_chunk_search_top_size, search_scope="document_graph", ) knowledge_graph_str = subgraph.format() if subgraph else "" @@ -323,7 +361,7 @@ async def asimilar_search_with_scores( return [] # merge search results into context - content = HYBRID_SEARCH_PT_CN.format( + content = HYBRID_SEARCH_PT.format( context=context, knowledge_graph=knowledge_graph_str, knowledge_graph_for_doc=knowledge_graph_for_doc_str, @@ -353,179 +391,86 @@ def delete_vector_name(self, index_name: str): self._graph_extractor.drop() -HYBRID_SEARCH_PT_CN = """## 角色 -你非常擅长结合提示词模板提供的[上下文]信息与[知识图谱]信息, -准确恰当地回答用户的问题,并保证不会输出与上下文和知识图谱无关的信息。 - -## 技能 -### 技能 1: 上下文理解 -- 准确地理解[上下文]提供的信息,上下文信息可能被拆分为多个章节。 -- 上下文的每个章节内容都会以[Section]开始,并按需进行了编号。 -- 上下文信息提供了与用户问题相关度最高的总结性描述,请合理使用它们。 -### 技能 2: 知识图谱理解 -- 准确地识别[知识图谱]中提供的[Entities:]章节中的实体信息和[Relationships:]章节中的关系信息,实体和关系信息的一般格式为: -``` -* 实体信息格式: -- (实体名) -- (实体名:实体描述) -- (实体名:实体属性表) -- (文本块ID:文档块内容) -- (目录ID:目录名) -- (文档ID:文档名称) - -* 关系信息的格式: -- (来源实体名)-[关系名]->(目标实体名) -- (来源实体名)-[关系名:关系描述]->(目标实体名) -- (来源实体名)-[关系名:关系属性表]->(目标实体名) -- (文本块实体)-[包含]->(实体名) -- 
(目录ID)-[包含]->(文本块实体) -- (目录ID)-[包含]->(子目录ID) -- (文档ID)-[包含]->(文本块实体) -- (文档ID)-[包含]->(目录ID) -``` -- 正确地将关系信息中的实体名/ID与实体信息关联,还原出图结构。 -- 将图结构所表达的信息作为用户提问的明细上下文,辅助生成更好的答案。 - - -## 约束条件 -- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。 -- 若[知识图谱]或者[知识库原文]没有提供信息,此时应根据[上下文]提供的信息回答问题。 -- 确保以第三人称书写,从客观角度结合[上下文]、[知识图谱]和[知识库原文]表达的信息回答问题。 -- 若提供的信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。 -- 避免使用停用词和过于常见的词汇。 - -## 参考案例 -``` -[上下文]: -Section 1: -菲尔・贾伯的大儿子叫雅各布・贾伯。 -Section 2: -菲尔・贾伯的小儿子叫比尔・贾伯。 - -[知识图谱]: -Entities: -(菲尔・贾伯#菲尔兹咖啡创始人) -(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌) -(雅各布・贾伯#菲尔・贾伯的儿子) -(美国多地#菲尔兹咖啡的扩展地区) - -Relationships: -(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立) -(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点) -(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子) -(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官) -(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围) - -[知识库原文]: -... -``` - ----- - -接下来的[上下文]、[知识图谱]和[知识库原文]的信息,可以帮助你回答更好地用户的问题。 - -[上下文]: -{context} - -[知识图谱]: -{knowledge_graph} - -[知识库原文] -{knowledge_graph_for_doc} -""" # noqa: E501 - -HYBRID_SEARCH_PT_EN = """## Role -You excel at combining the information provided in the [Context] with -information from the [KnowledgeGraph] to accurately and appropriately -answer user questions, ensuring that you do not output information -unrelated to the context and knowledge graph. - -## Skills -### Skill 1: Context Understanding -- Accurately understand the information provided in the [Context], -which may be divided into several sections. -- Each section in the context will start with [Section] -and may be numbered as needed. -- The context provides a summary description most relevant to the user's -question, and it should be used wisely. -### Skill 2: Knowledge Graph Understanding -- Accurately identify entity information in the [Entities:] section and -relationship information in the [Relationships:] section -of the [KnowledgeGraph]. 
The general format for entity -and relationship information is: -``` -* Entity Information Format: -- (entity_name) -- (entity_name: entity_description) -- (entity_name: entity_property_map) -- (chunk_id: chunk_content) -- (catalog_id: catalog_name) -- (document_id: document_name) - -* Relationship Information Format: -- (source_entity_name)-[relationship_name]->(target_entity_name) -- (source_entity_name)-[relationship_name: relationship_description]->(target_entity_name) -- (source_entity_name)-[relationship_name: relationship_property_map]->(target_entity_name) -- (chunk_id)-[Contains]->(entity_name) -- (catalog_id)-[Contains]->(chunk_id) -- (catalog_id)-[Contains]->(sub_catalog_id) -- (document_id)-[Contains]->(chunk_id) -- (document_id)-[Contains]->(catalog_id) -``` -- Correctly associate entity names/IDs in the relationship information -with entity information to restore the graph structure. -- Use the information expressed by the graph structure as detailed -context for the user's query to assist in generating better answers. - -## Constraints -- Don't describe your thought process in the answer, provide the answer -to the user's question directly without generating irrelevant information. -- If the [KnowledgeGraph] or [Knowledge base original text] does not provide information, you should answer -the question based on the information provided in the [Context]. -- Ensure to write in the third person, responding to questions from -an objective perspective based on the information combined from the -[Context], the [KnowledgeGraph] and the [Knowledge base original text]. -- If the provided information is contradictory, resolve the -contradictions and provide a single, coherent description. -- Avoid using stop words and overly common vocabulary. - -## Reference Example -``` -[Context]: -Section 1: -Phil Schiller's eldest son is Jacob Schiller. -Section 2: -Phil Schiller's youngest son is Bill Schiller. 
- -[KnowledgeGraph]: -Entities: -(Phil Jaber#Founder of Philz Coffee) -(Philz Coffee#Coffee brand founded in Berkeley, California) -(Jacob Jaber#Son of Phil Jaber) -(Multiple locations in the USA#Expansion regions of Philz Coffee) - -Relationships: -(Phil Jaber#Created#Philz Coffee#Founded in Berkeley, California in 1978) -(Philz Coffee#Located in#Berkeley, California#Founding location of Philz Coffee) -(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber) -(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005) -(Philz Coffee#Expanded to#Multiple locations in the USA#Expansion regions of Philz Coffee) - -[Knowledge base original text] -... -``` - ----- - -The following information from the [Context], [KnowledgeGraph] and [Knowledge base original text] -can help you better answer user questions. +HYBRID_SEARCH_PT = """ +===== +The following information from [Context], [Knowledge Graph], and [Original Text From RAG] can help you answer user questions better. [Context]: {context} -[KnowledgeGraph]: +[Knowledge Graph]: {knowledge_graph} -[Knowledge base original text] +[Original Text From RAG] {knowledge_graph_for_doc} +===== + +You are very good at combining the [Context] information provided by the prompt word template with the [Knowledge Graph] information, +answering the user's questions accurately and appropriately, and ensuring that no information irrelevant to the context and knowledge graph is output. + +## Role: GraphRAG Assistant + +### Core Capabilities +0. Make sure DO NOT answer irrelevant questions from the user. + +1. Information Processing +- Process contextual information across multiple sections ([Section] markers) +- Interpret knowledge graph relationships ((entity)-[relationship]->(entity)) +- Synthesize information from both structured and unstructured sources + +2. 
Response Generation +- Provide nuanced, multi-perspective answers +- Balance technical accuracy with conversational engagement +- Connect related concepts across different information sources +- Highlight uncertainties and limitations when appropriate + +3. Interaction Style +- Maintain a natural, engaging conversation flow +- Ask clarifying questions when needed +- Provide examples and analogies to illustrate complex points +- Adapt explanation depth based on user's apparent expertise + +4. Knowledge Integration +- Seamlessly blend information from: + * Context sections + * Knowledge graph relationships + * Background knowledge (when appropriate) +- Prioritize relevance over comprehensiveness +- Acknowledge information gaps explicitly + +5. Quality Assurance +- Verify logical consistency across sources +- Cross-reference relationships for validation +- Flag potential contradictions or ambiguities +- Provide confidence levels when appropriate + +### Information Sources Handling +1. Context Processing [Context] +- Parse information from numbered sections systematically +- Identify key concepts and relationships within each section +- Track section dependencies and cross-references +- Prioritize recent/relevant sections for the query + +2. Knowledge Graph Integration [Knowledge Graph] +- Parse Entities and Relationships sections separately +- Map entity-relationship-entity triples accurately +- Understand relationship directionality +- Use graph structure to find connected information + +3. Original Text Reference [Original Text From RAG] +- The GraphRAG document directory is stored as an edge in relationships to show the hierarchy of the current source text in the entire document. +- Use as authoritative source for detailed information +- Cross-reference with Context and Knowledge Graph +- Extract supporting evidence and examples +- Resolve conflicts between sources using this as primary reference + +### Output Format +1. 
Answer Structure +- Lead with synthesized core information +- Support with specific references to sources +- Include relevant entity-relationship pairs +- Conclude with confidence assessment +- Use the markdown format of the "quote" to highlight the original text (in details) from "GraphRAG" + +===== """ # noqa: E501 diff --git a/docs/docs/cookbook/rag/graph_rag_app_develop.md b/docs/docs/cookbook/rag/graph_rag_app_develop.md index c63a66e6e..0cdca75d4 100644 --- a/docs/docs/cookbook/rag/graph_rag_app_develop.md +++ b/docs/docs/cookbook/rag/graph_rag_app_develop.md @@ -116,6 +116,7 @@ GRAPH_COMMUNITY_SUMMARY_ENABLED=True # enable the graph community summary TRIPLET_GRAPH_ENABLED=True # enable the graph search for the triplets DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the number of the searched triplets in a retrieval +KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text ```