feat(GraphRAG): enhance GraphRAG by graph community summary (#1801)

Co-authored-by: Florian <fanzhidongyzby@163.com>
Co-authored-by: KingSkyLi <15566300566@163.com>
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
Co-authored-by: yvonneyx <zhuyuxin0627@gmail.com>
This commit is contained in:
M1n9X
2024-08-30 21:59:44 +08:00
committed by GitHub
parent 471689ba20
commit 759f7d99cc
59 changed files with 29316 additions and 411 deletions

View File

@@ -0,0 +1,304 @@
"""GraphExtractor class."""
import logging
import re
from typing import List, Optional
from dbgpt.core import Chunk, LLMClient
from dbgpt.rag.transformer.llm_extractor import LLMExtractor
from dbgpt.storage.graph_store.graph import Edge, Graph, MemoryGraph, Vertex
from dbgpt.storage.vector_store.base import VectorStoreBase
logger = logging.getLogger(__name__)
class GraphExtractor(LLMExtractor):
    """Extract a graph of entities and relationships from text with an LLM.

    A vector store is used as "chunk history": before each extraction the most
    similar previously seen chunks are looked up and handed to the LLM as
    context, and afterwards the current text is stored in that history.
    """

    # Line formats expected in the LLM answer. Only the LAST field is matched
    # greedily, so a summary that itself contains ")" is kept whole instead of
    # being cut at the first closing parenthesis.
    _ENTITY_PATTERN = re.compile(r"\((.*?)#(.*)\)")
    _RELATION_PATTERN = re.compile(r"\((.*?)#(.*?)#(.*?)#(.*)\)")

    def __init__(
        self, llm_client: LLMClient, model_name: str, chunk_history: VectorStoreBase
    ):
        """Initialize the GraphExtractor.

        Args:
            llm_client: LLM client forwarded to the base extractor.
            model_name: Model name forwarded to the base extractor.
            chunk_history: Vector store holding previously processed chunks;
                its config supplies the search and load settings used here.
        """
        super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN)
        self._chunk_history = chunk_history

        # Cache the settings read from the chunk-history store configuration.
        config = self._chunk_history.get_config()
        self._vector_space = config.name
        self._max_chunks_once_load = config.max_chunks_once_load
        self._max_threads = config.max_threads
        self._topk = config.topk
        self._score_threshold = config.score_threshold

    async def extract(self, text: str, limit: Optional[int] = None) -> List:
        """Extract graph data from ``text`` using similar chunks as context.

        Args:
            text: The text to extract entities and relationships from.
            limit: Optional limit forwarded to the base extraction and to
                response parsing.

        Returns:
            Whatever the base class ``_extract`` returns for this text.
        """
        # Look up previously stored chunks that are similar to the text.
        chunks = await self._chunk_history.asimilar_search_with_scores(
            text, self._topk, self._score_threshold
        )
        history = [
            f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks)
        ]
        # Joining an empty list already yields "", so no special case is needed.
        context = "\n".join(history)

        try:
            # Extract with the chunk history as context.
            return await super()._extract(text, context, limit)
        finally:
            # Store the chunk in the history whether or not extraction succeeded.
            await self._chunk_history.aload_document_with_limit(
                [Chunk(content=text, metadata={"relevant_cnt": len(history)})],
                self._max_chunks_once_load,
                self._max_threads,
            )

    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]:
        """Parse the LLM answer into a single in-memory graph.

        Lines after an ``Entities:`` header are read as ``(name#summary)`` and
        lines after a ``Relationships:`` header as
        ``(source#relation#target#summary)``. Lines before the first header,
        blank lines and lines that do not match the format are ignored.

        Args:
            text: Raw LLM response.
            limit: Maximum number of edges to collect; a falsy value means no
                limit.

        Returns:
            A one-element list containing the populated ``MemoryGraph``.
        """
        graph = MemoryGraph()
        edge_count = 0
        current_section = None

        for line in text.split("\n"):
            line = line.strip()
            if line in ["Entities:", "Relationships:"]:
                # Drop the trailing ":" to get the section name.
                current_section = line[:-1]
            elif line and current_section:
                if current_section == "Entities":
                    match = self._ENTITY_PATTERN.match(line)
                    if match:
                        name, summary = [part.strip() for part in match.groups()]
                        graph.upsert_vertex(Vertex(name, description=summary))
                elif current_section == "Relationships":
                    match = self._RELATION_PATTERN.match(line)
                    if match:
                        source, name, target, summary = [
                            part.strip() for part in match.groups()
                        ]
                        edge_count += 1
                        graph.append_edge(
                            Edge(source, target, name, description=summary)
                        )

            # Stop as soon as the requested number of edges has been collected.
            if limit and edge_count >= limit:
                break

        return [graph]

    def truncate(self):
        """Truncate chunk history."""
        self._chunk_history.truncate()

    def drop(self):
        """Drop chunk history."""
        self._chunk_history.delete_vector_name(self._vector_space)
# Chinese prompt template for graph extraction. It contains two placeholders:
# `{history}` (the related context sections) and `{text}` (the text to extract
# from). Every bullet and heading must end with "\n" so that the rendered
# prompt keeps one item per line.
GRAPH_EXTRACT_PT_CN = (
    "## 角色\n"
    "你是一个知识图谱工程专家,非常擅长从文本中精确抽取知识图谱的实体"
    "(主体、客体)和关系,并能对实体和关系的含义做出恰当的总结性描述。\n"
    "\n"
    "## 技能\n"
    "### 技能 1: 实体抽取\n"
    "--请按照如下步骤抽取实体--\n"
    "1. 准确地识别文本中的实体信息,一般是名词、代词等。\n"
    "2. 准确地识别实体的修饰性描述,一般作为定语对实体特征做补充。\n"
    "3. 对相同概念的实体(同义词、别称、代指),请合并为单一简洁的实体名,"
    "并合并它们的描述信息。\n"
    "4. 对合并后的实体描述信息做简洁、恰当、连贯的总结。\n"
    "\n"
    "### 技能 2: 关系抽取\n"
    "--请按照如下步骤抽取关系--\n"
    "1. 准确地识别文本中实体之间的关联信息,一般是动词、代词等。\n"
    "2. 准确地识别关系的修饰性描述,一般作为状语对关系特征做补充。\n"
    "3. 对相同概念的关系(同义词、别称、代指),请合并为单一简洁的关系名,"
    "并合并它们的描述信息。\n"
    "4. 对合并后的关系描述信息做简洁、恰当、连贯的总结。\n"
    "\n"
    "### 技能 3: 关联上下文\n"
    "- 关联上下文来自与当前待抽取文本相关的前置段落内容,"
    "可以为知识抽取提供信息补充。\n"
    "- 合理利用提供的上下文信息,知识抽取过程中出现的内容引用可能来自关联上下文。\n"
    "- 不要对关联上下文的内容做知识抽取,而仅作为关联信息参考。\n"
    "- 关联上下文是可选信息,可能为空。\n"
    "\n"
    "## 约束条件\n"
    "- 如果文本已提供了图结构格式的数据,直接转换为输出格式返回,"
    "不要修改实体或ID名称。\n"
    "- 尽可能多的生成文本中提及的实体和关系信息,但不要随意创造不存在的实体和关系。\n"
    "- 确保以第三人称书写,从客观角度描述实体名称、关系名称,以及他们的总结性描述。\n"
    "- 尽可能多地使用关联上下文中的信息丰富实体和关系的内容,这非常重要。\n"
    "- 如果实体或关系的总结描述为空,不提供总结描述信息,不要生成无关的描述信息。\n"
    "- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
    "- 实体和关系的名称或者描述文本出现#和:字符时使用_字符替换其他字符不要修改。\n"
    "- 避免使用停用词和过于常见的词汇。\n"
    "\n"
    "## 输出格式\n"
    "Entities:\n"
    "(实体名#实体总结)\n"
    "...\n\n"
    "Relationships:\n"
    "(来源实体名#关系名#目标实体名#关系总结)\n"
    "...\n"
    "\n"
    "## 参考案例\n"
    "--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
    "输入:\n"
    "```\n"
    "[上下文]:\n"
    "Section 1:\n"
    "菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
    "Section 2:\n"
    "菲尔・贾伯的小儿子叫比尔・贾伯。\n"
    "..."
    "\n"
    "[文本]:\n"
    "菲尔兹咖啡由菲尔・贾伯于1978年在加利福尼亚州伯克利创立。"
    "因其独特的混合咖啡而闻名,菲尔兹已扩展到美国多地。"
    "他的大儿子于2005年成为首席执行官并带领公司实现了显著增长。\n"
    "```\n"
    "\n"
    "输出:\n"
    "```\n"
    "Entities:\n"
    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
    "(雅各布・贾伯#菲尔・贾伯的大儿子)\n"
    "(美国多地#菲尔兹咖啡的扩展地区)\n"
    "\n"
    "Relationships:\n"
    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的大儿子)\n"
    "(雅各布・贾伯#管理#菲尔兹咖啡#在2005年担任首席执行官)\n"
    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "请根据接下来[上下文]提供的信息,按照上述要求,抽取[文本]中的实体和关系数据。\n"
    "\n"
    "[上下文]:\n"
    "{history}\n"
    "\n"
    "[文本]:\n"
    "{text}\n"
    "\n"
    "[结果]:\n"
    "\n"
)
# English prompt template for graph extraction. It contains two placeholders:
# `{history}` (the related context sections) and `{text}` (the text to extract
# from). Every bullet and heading must end with "\n" so that the rendered
# prompt keeps one item per line.
GRAPH_EXTRACT_PT_EN = (
    "## Role\n"
    "You are an expert in Knowledge Graph Engineering, skilled at extracting "
    "entities (subjects, objects) and relations from text, and summarizing "
    "their meanings effectively.\n"
    "\n"
    "## Skills\n"
    "### Skill 1: Entity Extraction\n"
    "--Please follow these steps to extract entities--\n"
    "1. Accurately identify entity information in the text, "
    "usually nouns, pronouns, etc.\n"
    "2. Accurately identify descriptive information, "
    "usually as adjectives, that supplements entity features.\n"
    "3. Merge synonymous, alias, or reference entities into "
    "a single concise entity name, and merge their descriptive information.\n"
    "4. Provide a concise, appropriate, and coherent summary "
    "of the combined entity descriptions.\n"
    "\n"
    "### Skill 2: Relation Extraction\n"
    "--Please follow these steps to extract relations--\n"
    "1. Accurately identify relation information between entities in the text, "
    "usually verbs, pronouns, etc.\n"
    "2. Accurately identify descriptive information, usually as adverbs, "
    "that supplements relation features.\n"
    "3. Merge synonymous, alias, or reference relations into "
    "a single concise relation name, and merge their descriptive information.\n"
    "4. Provide a concise, appropriate, and coherent summary "
    "of the combined relation descriptions.\n"
    "\n"
    "### Skill 3: Contextual Association\n"
    "- Context comes from preceding paragraphs related to the current "
    "extraction text and can provide supplementary information.\n"
    "- Appropriately use contextual information, content references "
    "during extraction may come from this context.\n"
    "- Do not extract knowledge from contextual content, "
    "use it only as a reference.\n"
    "- Context is optional and may be empty.\n"
    "\n"
    "## Constraints\n"
    "- If the text has provided data that is similar to or the same as the "
    "output format, please format the output directly according to the "
    "output format requirements.\n"
    "- Generate as much entity and relation information mentioned in the text "
    "as possible, but do not create nonexistent entities or relations.\n"
    "- Ensure the writing is in the third person, describing entity names, "
    "relation names, and their summaries objectively.\n"
    "- Use as much contextual information as possible to enrich the content "
    "of entities and relations, this is very important.\n"
    "- If a summary of an entity or relation is empty, do not provide "
    "summary information, and do not generate irrelevant descriptions.\n"
    "- If provided descriptions are contradictory, resolve the conflict "
    "and provide a single, coherent description.\n"
    "- Replace any # or : characters in entity's and relation's "
    "names or descriptions with an _ character.\n"
    "- Avoid using stop words and overly common terms.\n"
    "\n"
    "## Output Format\n"
    "Entities:\n"
    "(entity_name#entity_summary)\n"
    "...\n\n"
    "Relationships:\n"
    "(source_entity_name#relation_name#target_entity_name#relation_summary)\n"
    "...\n"
    "\n"
    "## Reference Example\n"
    "--The case is only to help you understand the input and output format of "
    "the prompt, please do not use it in your answer.--\n"
    "Input:\n"
    "```\n"
    "[Context]:\n"
    "Section 1:\n"
    "Phil Jabber's eldest son is named Jacob Jabber.\n"
    "Section 2:\n"
    "Phil Jabber's youngest son is named Bill Jabber.\n"
    "..."
    "\n"
    "[Text]:\n"
    "Philz Coffee was founded by Phil Jabber in 1978 in Berkeley, California. "
    "Known for its distinctive blend coffee, Philz has expanded to multiple "
    "locations in the USA. His eldest son became CEO in 2005, "
    "leading significant growth for the company.\n"
    "```\n"
    "\n"
    "Output:\n"
    "```\n"
    "Entities:\n"
    "(Phil Jabber#Founder of Philz Coffee)\n"
    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
    "(Jacob Jabber#Phil Jabber's eldest son)\n"
    "(Multiple locations in the USA#Philz Coffee's expansion area)\n"
    "\n"
    "Relationships:\n"
    "(Phil Jabber#Founded#Philz Coffee"
    "#Founded in 1978 in Berkeley, California)\n"
    "(Philz Coffee#Located in#Berkeley, California"
    "#Philz Coffee's founding location)\n"
    "(Phil Jabber#Has#Jacob Jabber#Phil Jabber's eldest son)\n"
    "(Jacob Jabber#Manage#Philz Coffee#Serve as CEO in 2005)\n"
    "(Philz Coffee#Expanded to#Multiple locations in the USA"
    "#Philz Coffee's expansion area)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "Please extract the entities and relationships data from the [Text] "
    "according to the above requirements, using the provided [Context].\n"
    "\n"
    "[Context]:\n"
    "{history}\n"
    "\n"
    "[Text]:\n"
    "{text}\n"
    "\n"
    "[Results]:\n"
    "\n"
)