✨ feat(GraphRAG): enhance GraphRAG by graph community summary (#1801)

Co-authored-by: Florian <fanzhidongyzby@163.com> Co-authored-by: KingSkyLi <15566300566@163.com> Co-authored-by: aries_ckt <916701291@qq.com> Co-authored-by: Fangyin Cheng <staneyffer@gmail.com> Co-authored-by: yvonneyx <zhuyuxin0627@gmail.com>
2025-09-07 03:50:42 +00:00 · 2024-08-30 21:59:44 +08:00
parent 471689ba20
commit 759f7d99cc
59 changed files with 29316 additions and 411 deletions
--- a/dbgpt/rag/transformer/graph_extractor.py
+++ b/dbgpt/rag/transformer/graph_extractor.py
@@ -0,0 +1,304 @@
+"""GraphExtractor class."""
+
+import logging
+import re
+from typing import List, Optional
+
+from dbgpt.core import Chunk, LLMClient
+from dbgpt.rag.transformer.llm_extractor import LLMExtractor
+from dbgpt.storage.graph_store.graph import Edge, Graph, MemoryGraph, Vertex
+from dbgpt.storage.vector_store.base import VectorStoreBase
+
+logger = logging.getLogger(__name__)
+
+
+class GraphExtractor(LLMExtractor):
+    """Extract a graph of entities and relationships from text with an LLM.
+
+    Texts that were already processed are stored in a vector store (the
+    "chunk history"). For every extraction the most similar stored chunks are
+    looked up and handed to the LLM as context.
+    """
+
+    # Line formats expected in the LLM response:
+    #   (entity_name#entity_summary)
+    #   (source#relation_name#target#relation_summary)
+    # The last group is greedy on purpose: a summary may itself contain
+    # parentheses, e.g. "(GPT#Large language model (LLM))", and a non-greedy
+    # group would cut it off at the first ")".
+    _ENTITY_PATTERN = re.compile(r"\((.*?)#(.*)\)")
+    _RELATION_PATTERN = re.compile(r"\((.*?)#(.*?)#(.*?)#(.*)\)")
+
+    def __init__(
+        self, llm_client: LLMClient, model_name: str, chunk_history: VectorStoreBase
+    ):
+        """Initialize the GraphExtractor.
+
+        Args:
+            llm_client: LLM client forwarded to ``LLMExtractor``.
+            model_name: Model name forwarded to ``LLMExtractor``.
+            chunk_history: Vector store holding previously extracted chunks.
+                Its config supplies the search and load parameters used here.
+        """
+        super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN)
+        self._chunk_history = chunk_history
+
+        config = self._chunk_history.get_config()
+        self._vector_space = config.name
+        self._max_chunks_once_load = config.max_chunks_once_load
+        self._max_threads = config.max_threads
+        self._topk = config.topk
+        self._score_threshold = config.score_threshold
+
+    async def extract(self, text: str, limit: Optional[int] = None) -> List:
+        """Extract graph data from text, using similar past chunks as context.
+
+        Args:
+            text: The text to extract entities and relationships from.
+            limit: Optional limit forwarded to the underlying extraction.
+
+        Returns:
+            Whatever ``LLMExtractor._extract`` returns for this text.
+        """
+        # Look up similar chunks that were processed earlier.
+        chunks = await self._chunk_history.asimilar_search_with_scores(
+            text, self._topk, self._score_threshold
+        )
+        history = [
+            f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks)
+        ]
+        # Joining an empty list already yields "", so no special case is needed.
+        context = "\n".join(history)
+
+        try:
+            # Extract with the chunk history as context.
+            return await super()._extract(text, context, limit)
+
+        finally:
+            # Store the text in the history whether or not extraction succeeded.
+            await self._chunk_history.aload_document_with_limit(
+                [Chunk(content=text, metadata={"relevant_cnt": len(history)})],
+                self._max_chunks_once_load,
+                self._max_threads,
+            )
+
+    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]:
+        """Parse the LLM response into a single in-memory graph.
+
+        Lines are only interpreted after an ``Entities:`` or ``Relationships:``
+        header line; lines that do not match the expected tuple format are
+        ignored.
+
+        Args:
+            text: Raw LLM response.
+            limit: If truthy, stop parsing once this many edges were read.
+
+        Returns:
+            A one-element list containing the parsed graph.
+        """
+        graph = MemoryGraph()
+        edge_count = 0
+        current_section = None
+        for line in text.split("\n"):
+            line = line.strip()
+            if line in ("Entities:", "Relationships:"):
+                # Drop the trailing colon to get the section name.
+                current_section = line[:-1]
+            elif line and current_section:
+                if current_section == "Entities":
+                    match = self._ENTITY_PATTERN.match(line)
+                    if match:
+                        name, summary = [part.strip() for part in match.groups()]
+                        graph.upsert_vertex(Vertex(name, description=summary))
+                elif current_section == "Relationships":
+                    match = self._RELATION_PATTERN.match(line)
+                    if match:
+                        source, name, target, summary = [
+                            part.strip() for part in match.groups()
+                        ]
+                        edge_count += 1
+                        graph.append_edge(
+                            Edge(source, target, name, description=summary)
+                        )
+
+            if limit and edge_count >= limit:
+                break
+
+        return [graph]
+
+    def truncate(self):
+        """Truncate chunk history."""
+        self._chunk_history.truncate()
+
+    def drop(self):
+        """Drop chunk history."""
+        self._chunk_history.delete_vector_name(self._vector_space)
+
+
+# Chinese prompt template for graph extraction; GraphExtractor.__init__ passes
+# it to LLMExtractor. It contains the `{history}` and `{text}` placeholders.
+GRAPH_EXTRACT_PT_CN = (
+    "## 角色\n"
+    "你是一个知识图谱工程专家，非常擅长从文本中精确抽取知识图谱的实体"
+    "（主体、客体）和关系，并能对实体和关系的含义做出恰当的总结性描述。\n"
+    "\n"
+    "## 技能\n"
+    "### 技能 1: 实体抽取\n"
+    "--请按照如下步骤抽取实体--\n"
+    "1. 准确地识别文本中的实体信息，一般是名词、代词等。\n"
+    "2. 准确地识别实体的修饰性描述，一般作为定语对实体特征做补充。\n"
+    "3. 对相同概念的实体（同义词、别称、代指），请合并为单一简洁的实体名，"
+    "并合并它们的描述信息。\n"
+    "4. 对合并后的实体描述信息做简洁、恰当、连贯的总结。\n"
+    "\n"
+    "### 技能 2: 关系抽取\n"
+    "--请按照如下步骤抽取关系--\n"
+    "1. 准确地识别文本中实体之间的关联信息，一般是动词、代词等。\n"
+    "2. 准确地识别关系的修饰性描述，一般作为状语对关系特征做补充。\n"
+    "3. 对相同概念的关系（同义词、别称、代指），请合并为单一简洁的关系名，"
+    "并合并它们的描述信息。\n"
+    "4. 对合并后的关系描述信息做简洁、恰当、连贯的总结。\n"
+    "\n"
+    "### 技能 3: 关联上下文\n"
+    "- 关联上下文来自与当前待抽取文本相关的前置段落内容，"
+    "可以为知识抽取提供信息补充。\n"
+    "- 合理利用提供的上下文信息，知识抽取过程中出现的内容引用可能来自关联上下文。\n"
+    "- 不要对关联上下文的内容做知识抽取，而仅作为关联信息参考。\n"
+    "- 关联上下文是可选信息，可能为空。\n"
+    "\n"
+    "## 约束条件\n"
+    "- 如果文本已提供了图结构格式的数据，直接转换为输出格式返回，"
+    "不要修改实体或ID名称。\n"
+    "- 尽可能多的生成文本中提及的实体和关系信息，但不要随意创造不存在的实体和关系。\n"
+    "- 确保以第三人称书写，从客观角度描述实体名称、关系名称，以及他们的总结性描述。\n"
+    "- 尽可能多地使用关联上下文中的信息丰富实体和关系的内容，这非常重要。\n"
+    "- 如果实体或关系的总结描述为空，不提供总结描述信息，不要生成无关的描述信息。\n"
+    "- 如果提供的描述信息相互矛盾，请解决矛盾并提供一个单一、连贯的描述。\n"
+    "- 实体和关系的名称或者描述文本出现#和:字符时，使用_字符替换，其他字符不要修改。\n"
+    "- 避免使用停用词和过于常见的词汇。\n"
+    "\n"
+    "## 输出格式\n"
+    "Entities:\n"
+    "(实体名#实体总结)\n"
+    "...\n\n"
+    "Relationships:\n"
+    "(来源实体名#关系名#目标实体名#关系总结)\n"
+    "...\n"
+    "\n"
+    "## 参考案例\n"
+    "--案例仅帮助你理解提示词的输入和输出格式，请不要在答案中使用它们。--\n"
+    "输入:\n"
+    "```\n"
+    "[上下文]:\n"
+    "Section 1:\n"
+    "菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
+    "Section 2:\n"
+    "菲尔・贾伯的小儿子叫比尔・贾伯。\n"
+    "..."
+    "\n"
+    "[文本]:\n"
+    "菲尔兹咖啡由菲尔・贾伯于1978年在加利福尼亚州伯克利创立。"
+    "因其独特的混合咖啡而闻名，菲尔兹已扩展到美国多地。"
+    "他的大儿子于2005年成为首席执行官，并带领公司实现了显著增长。\n"
+    "```\n"
+    "\n"
+    "输出:\n"
+    "```\n"
+    "Entities:\n"
+    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
+    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
+    "(雅各布・贾伯#菲尔・贾伯的大儿子)\n"
+    "(美国多地#菲尔兹咖啡的扩展地区)\n"
+    "\n"
+    "Relationships:\n"
+    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
+    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
+    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的大儿子)\n"
+    "(雅各布・贾伯#管理#菲尔兹咖啡#在2005年担任首席执行官)\n"
+    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
+    "```\n"
+    "\n"
+    "----\n"
+    "\n"
+    "请根据接下来[上下文]提供的信息，按照上述要求，抽取[文本]中的实体和关系数据。\n"
+    "\n"
+    "[上下文]:\n"
+    "{history}\n"
+    "\n"
+    "[文本]:\n"
+    "{text}\n"
+    "\n"
+    "[结果]:\n"
+    "\n"
+)
+
+# English prompt template for graph extraction, with the same `{history}` and
+# `{text}` placeholders as the Chinese template. It is not referenced by the
+# code visible in this file.
+GRAPH_EXTRACT_PT_EN = (
+    "## Role\n"
+    "You are an expert in Knowledge Graph Engineering, skilled at extracting "
+    "entities (subjects, objects) and relations from text, and summarizing "
+    "their meanings effectively.\n"
+    "\n"
+    "## Skills\n"
+    "### Skill 1: Entity Extraction\n"
+    "--Please follow these steps to extract entities--\n"
+    "1. Accurately identify entity information in the text, "
+    "usually nouns, pronouns, etc.\n"
+    "2. Accurately identify descriptive information, "
+    "usually as adjectives, that supplements entity features.\n"
+    "3. Merge synonymous, alias, or reference entities into "
+    "a single concise entity name, and merge their descriptive information.\n"
+    "4. Provide a concise, appropriate, and coherent summary "
+    "of the combined entity descriptions.\n"
+    "\n"
+    "### Skill 2: Relation Extraction\n"
+    "--Please follow these steps to extract relations--\n"
+    "1. Accurately identify relation information between entities in the text, "
+    "usually verbs, pronouns, etc.\n"
+    "2. Accurately identify descriptive information, usually as adverbs, "
+    "that supplements relation features.\n"
+    "3. Merge synonymous, alias, or reference relations into "
+    "a single concise relation name, and merge their descriptive information.\n"
+    "4. Provide a concise, appropriate, and coherent summary "
+    "of the combined relation descriptions.\n"
+    "\n"
+    "### Skill 3: Contextual Association\n"
+    "- Context comes from preceding paragraphs related to the current "
+    "extraction text and can provide supplementary information.\n"
+    "- Appropriately use contextual information, content references "
+    "during extraction may come from this context.\n"
+    "- Do not extract knowledge from contextual content, "
+    "use it only as a reference.\n"
+    "- Context is optional and may be empty.\n"
+    "\n"
+    "## Constraints\n"
+    "- If the text has provided data that is similar to or the same as the "
+    "output format, please format the output directly according to the "
+    "output format requirements.\n"
+    "- Generate as much entity and relation information mentioned in the text "
+    "as possible, but do not create nonexistent entities or relations.\n"
+    "- Ensure the writing is in the third person, describing entity names, "
+    "relation names, and their summaries objectively.\n"
+    "- Use as much contextual information as possible to enrich the content "
+    "of entities and relations, this is very important.\n"
+    "- If a summary of an entity or relation is empty, do not provide "
+    "summary information, and do not generate irrelevant descriptions.\n"
+    "- If provided descriptions are contradictory, resolve the conflict "
+    "and provide a single, coherent description.\n"
+    "- Replace any # or : characters in entity's and relation's "
+    "names or descriptions with an _ character.\n"
+    "- Avoid using stop words and overly common terms.\n"
+    "\n"
+    "## Output Format\n"
+    "Entities:\n"
+    "(entity_name#entity_summary)\n"
+    "...\n\n"
+    "Relationships:\n"
+    "(source_entity_name#relation_name#target_entity_name#relation_summary)\n"
+    "...\n"
+    "\n"
+    "## Reference Example\n"
+    "--The case is only to help you understand the input and output format of "
+    "the prompt, please do not use it in your answer.--\n"
+    "Input:\n"
+    "```\n"
+    "[Context]:\n"
+    "Section 1:\n"
+    "Phil Jabber's eldest son is named Jacob Jabber.\n"
+    "Section 2:\n"
+    "Phil Jabber's youngest son is named Bill Jabber.\n"
+    "..."
+    "\n"
+    "[Text]:\n"
+    "Philz Coffee was founded by Phil Jabber in 1978 in Berkeley, California. "
+    "Known for its distinctive blend coffee, Philz has expanded to multiple "
+    "locations in the USA. His eldest son became CEO in 2005, "
+    "leading significant growth for the company.\n"
+    "```\n"
+    "\n"
+    "Output:\n"
+    "```\n"
+    "Entities:\n"
+    "(Phil Jabber#Founder of Philz Coffee)\n"
+    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
+    "(Jacob Jabber#Phil Jabber's eldest son)\n"
+    "(Multiple locations in the USA#Philz Coffee's expansion area)\n"
+    "\n"
+    "Relationships:\n"
+    "(Phil Jabber#Founded#Philz Coffee"
+    "#Founded in 1978 in Berkeley, California)\n"
+    "(Philz Coffee#Located in#Berkeley, California"
+    "#Philz Coffee's founding location)\n"
+    "(Phil Jabber#Has#Jacob Jabber#Phil Jabber's eldest son)\n"
+    "(Jacob Jabber#Manage#Philz Coffee#Serve as CEO in 2005)\n"
+    "(Philz Coffee#Expanded to#Multiple locations in the USA"
+    "#Philz Coffee's expansion area)\n"
+    "```\n"
+    "\n"
+    "----\n"
+    "\n"
+    "Please extract the entities and relationships data from the [Text] "
+    "according to the above requirements, using the provided [Context].\n"
+    "\n"
+    "[Context]:\n"
+    "{history}\n"
+    "\n"
+    "[Text]:\n"
+    "{text}\n"
+    "\n"
+    "[Results]:\n"
+    "\n"
+)