DB-GPT/dbgpt/rag/transformer/graph_extractor.py
"""GraphExtractor class."""
import logging
import re
from typing import List, Optional
from dbgpt.core import Chunk, LLMClient
from dbgpt.rag.transformer.llm_extractor import LLMExtractor
from dbgpt.storage.graph_store.graph import Edge, Graph, MemoryGraph, Vertex
from dbgpt.storage.vector_store.base import VectorStoreBase
logger = logging.getLogger(__name__)


class GraphExtractor(LLMExtractor):
    """GraphExtractor class."""

    def __init__(
        self, llm_client: LLMClient, model_name: str, chunk_history: VectorStoreBase
    ):
        """Initialize the GraphExtractor."""
        super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN)
        self._chunk_history = chunk_history

        config = self._chunk_history.get_config()
        self._vector_space = config.name
        self._max_chunks_once_load = config.max_chunks_once_load
        self._max_threads = config.max_threads
        self._topk = config.topk
        self._score_threshold = config.score_threshold

    async def extract(self, text: str, limit: Optional[int] = None) -> List:
        """Extract a graph from text, using similar chunk history as context."""
        # load similar chunks as context for the extraction prompt
        chunks = await self._chunk_history.asimilar_search_with_scores(
            text, self._topk, self._score_threshold
        )
        history = [
            f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks)
        ]
        context = "\n".join(history) if history else ""

        try:
            # extract with chunk history
            return await super()._extract(text, context, limit)
        finally:
            # save the current chunk to history for future lookups
            await self._chunk_history.aload_document_with_limit(
                [Chunk(content=text, metadata={"relevant_cnt": len(history)})],
                self._max_chunks_once_load,
                self._max_threads,
            )

    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]:
        """Parse the LLM response into a single in-memory graph.

        Entity lines look like "(name#summary)" and relationship lines look
        like "(source#relation#target#summary)", matching the prompt's
        output format.
        """
        graph = MemoryGraph()
        edge_count = 0
        current_section = None
        for line in text.split("\n"):
            line = line.strip()
            if line in ["Entities:", "Relationships:"]:
                # strip the trailing colon to get the section name
                current_section = line[:-1]
            elif line and current_section:
                if current_section == "Entities":
                    match = re.match(r"\((.*?)#(.*?)\)", line)
                    if match:
                        name, summary = [part.strip() for part in match.groups()]
                        graph.upsert_vertex(Vertex(name, description=summary))
                elif current_section == "Relationships":
                    match = re.match(r"\((.*?)#(.*?)#(.*?)#(.*?)\)", line)
                    if match:
                        source, name, target, summary = [
                            part.strip() for part in match.groups()
                        ]
                        edge_count += 1
                        graph.append_edge(
                            Edge(source, target, name, description=summary)
                        )
            if limit and edge_count >= limit:
                break
        return [graph]

    def truncate(self):
        """Truncate chunk history."""
        self._chunk_history.truncate()

    def drop(self):
        """Drop chunk history."""
        self._chunk_history.delete_vector_name(self._vector_space)
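

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the upstream module): how the extractor is
# typically wired together. The model name below is a placeholder, and the
# llm_client / chunk_history arguments stand for whatever concrete LLMClient
# and VectorStoreBase implementations your deployment provides.
async def _example_extract_graph(
    llm_client: LLMClient, chunk_history: VectorStoreBase, text: str
) -> List[Graph]:
    extractor = GraphExtractor(llm_client, "your-model-name", chunk_history)
    # extract() first retrieves similar chunks as context, then prompts the
    # LLM with the GRAPH_EXTRACT_PT_CN template defined below, and finally
    # writes the current text back into the chunk history for future lookups.
    return await extractor.extract(text)
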
GRAPH_EXTRACT_PT_CN = (
"## 角色\n"
"你是一个知识图谱工程专家,非常擅长从文本中精确抽取知识图谱的实体"
"(主体、客体)和关系,并能对实体和关系的含义做出恰当的总结性描述。\n"
"\n"
"## 技能\n"
"### 技能 1: 实体抽取\n"
"--请按照如下步骤抽取实体--\n"
"1. 准确地识别文本中的实体信息,一般是名词、代词等。\n"
"2. 准确地识别实体的修饰性描述,一般作为定语对实体特征做补充。\n"
"3. 对相同概念的实体(同义词、别称、代指),请合并为单一简洁的实体名,"
"并合并它们的描述信息。\n"
"4. 对合并后的实体描述信息做简洁、恰当、连贯的总结。\n"
"\n"
"### 技能 2: 关系抽取\n"
"--请按照如下步骤抽取关系--\n"
"1. 准确地识别文本中实体之间的关联信息,一般是动词、代词等。\n"
"2. 准确地识别关系的修饰性描述,一般作为状语对关系特征做补充。\n"
"3. 对相同概念的关系(同义词、别称、代指),请合并为单一简洁的关系名,"
"并合并它们的描述信息。\n"
"4. 对合并后的关系描述信息做简洁、恰当、连贯的总结。\n"
"\n"
"### 技能 3: 关联上下文\n"
"- 关联上下文来自与当前待抽取文本相关的前置段落内容,"
"可以为知识抽取提供信息补充。\n"
"- 合理利用提供的上下文信息,知识抽取过程中出现的内容引用可能来自关联上下文。\n"
"- 不要对关联上下文的内容做知识抽取,而仅作为关联信息参考。\n"
"- 关联上下文是可选信息,可能为空。\n"
"\n"
"## 约束条件\n"
"- 如果文本已提供了图结构格式的数据,直接转换为输出格式返回,"
"不要修改实体或ID名称。"
"- 尽可能多的生成文本中提及的实体和关系信息,但不要随意创造不存在的实体和关系。\n"
"- 确保以第三人称书写,从客观角度描述实体名称、关系名称,以及他们的总结性描述。\n"
"- 尽可能多地使用关联上下文中的信息丰富实体和关系的内容,这非常重要。\n"
"- 如果实体或关系的总结描述为空,不提供总结描述信息,不要生成无关的描述信息。\n"
"- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
"- 实体和关系的名称或者描述文本出现#和:字符时使用_字符替换其他字符不要修改。"
"- 避免使用停用词和过于常见的词汇。\n"
"\n"
"## 输出格式\n"
"Entities:\n"
"(实体名#实体总结)\n"
"...\n\n"
"Relationships:\n"
"(来源实体名#关系名#目标实体名#关系总结)\n"
"...\n"
"\n"
"## 参考案例"
"--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
"输入:\n"
"```\n"
"[上下文]:\n"
"Section 1:\n"
"菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
"Section 2:\n"
"菲尔・贾伯的小儿子叫比尔・贾伯。\n"
"..."
"\n"
"[文本]:\n"
"菲尔兹咖啡由菲尔・贾伯于1978年在加利福尼亚州伯克利创立。"
"因其独特的混合咖啡而闻名,菲尔兹已扩展到美国多地。"
"他的大儿子于2005年成为首席执行官并带领公司实现了显著增长。\n"
"```\n"
"\n"
"输出:\n"
"```\n"
"Entities:\n"
"(菲尔・贾伯#菲尔兹咖啡创始人)\n"
"(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
"(雅各布・贾伯#菲尔・贾伯的大儿子)\n"
"(美国多地#菲尔兹咖啡的扩展地区)\n"
"\n"
"Relationships:\n"
"(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
"(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
"(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的大儿子)\n"
"(雅各布・贾伯#管理#菲尔兹咖啡#在2005年担任首席执行官)\n"
"(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
"```\n"
"\n"
"----\n"
"\n"
"请根据接下来[上下文]提供的信息,按照上述要求,抽取[文本]中的实体和关系数据。\n"
"\n"
"[上下文]:\n"
"{history}\n"
"\n"
"[文本]:\n"
"{text}\n"
"\n"
"[结果]:\n"
"\n"
)
GRAPH_EXTRACT_PT_EN = (
"## Role\n"
"You are an expert in Knowledge Graph Engineering, skilled at extracting "
"entities (subjects, objects) and relations from text, and summarizing "
"their meanings effectively.\n"
"\n"
"## Skills\n"
"### Skill 1: Entity Extraction\n"
"--Please follow these steps to extract entities--\n"
"1. Accurately identify entity information in the text, "
"usually nouns, pronouns, etc.\n"
"2. Accurately identify descriptive information, "
"usually as adjectives, that supplements entity features.\n"
"3. Merge synonymous, alias, or reference entities into "
"a single concise entity name, and merge their descriptive information.\n"
"4. Provide a concise, appropriate, and coherent summary "
"of the combined entity descriptions.\n"
"\n"
"### Skill 2: Relation Extraction\n"
"--Please follow these steps to extract relations--\n"
"1. Accurately identify relation information between entities in the text, "
"usually verbs, pronouns, etc.\n"
"2. Accurately identify descriptive information, usually as adverbs, "
"that supplements relation features.\n"
"3. Merge synonymous, alias, or reference relations into "
"a single concise relation name, and merge their descriptive information.\n"
"4. Provide a concise, appropriate, and coherent summary "
"of the combined relation descriptions.\n"
"\n"
"### Skill 3: Contextual Association\n"
"- Context comes from preceding paragraphs related to the current "
"extraction text and can provide supplementary information.\n"
"- Appropriately use contextual information, content references "
"during extraction may come from this context.\n"
"- Do not extract knowledge from contextual content, "
"use it only as a reference.\n"
"- Context is optional and may be empty.\n"
"\n"
"## Constraints\n"
"- If the text has provided data that is similar to or the same as the "
"output format, please format the output directly according to the "
"output format requirements."
"- Generate as much entity and relation information mentioned in the text "
"as possible, but do not create nonexistent entities or relations.\n"
"- Ensure the writing is in the third person, describing entity names, "
"relation names, and their summaries objectively.\n"
"- Use as much contextual information as possible to enrich the content "
"of entities and relations, this is very important.\n"
"- If a summary of an entity or relation is empty, do not provide "
"summary information, and do not generate irrelevant descriptions.\n"
"- If provided descriptions are contradictory, resolve the conflict "
"and provide a single, coherent description.\n"
"- Replace any # or : characters in entity's and relation's "
"names or descriptions with an _ character.\n"
"- Avoid using stop words and overly common terms.\n"
"\n"
"## Output Format\n"
"Entities:\n"
"(entity_name#entity_summary)\n"
"...\n\n"
"Relationships:\n"
"(source_entity_name#relation_name#target_entity_name#relation_summary)\n"
"...\n"
"\n"
"## Reference Example\n"
"--The case is only to help you understand the input and output format of "
"the prompt, please do not use it in your answer.--\n"
"Input:\n"
"```\n"
"[Context]:\n"
"Section 1:\n"
"Phil Jabber's eldest son is named Jacob Jabber.\n"
"Section 2:\n"
"Phil Jabber's youngest son is named Bill Jabber.\n"
"..."
"\n"
"[Text]:\n"
"Philz Coffee was founded by Phil Jabber in 1978 in Berkeley, California. "
"Known for its distinctive blend coffee, Philz has expanded to multiple "
"locations in the USA. His eldest son became CEO in 2005, "
"leading significant growth for the company.\n"
"```\n"
"\n"
"Output:\n"
"```\n"
"Entities:\n"
"(Phil Jabber#Founder of Philz Coffee)\n"
"(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
"(Jacob Jabber#Phil Jabber's eldest son)\n"
"(Multiple locations in the USA#Philz Coffee's expansion area)\n"
"\n"
"Relationships:\n"
"(Phil Jabber#Founded#Philz Coffee"
"#Founded in 1978 in Berkeley, California)\n"
"(Philz Coffee#Located in#Berkeley, California"
"#Philz Coffee's founding location)\n"
"(Phil Jabber#Has#Jacob Jabber#Phil Jabber's eldest son)\n"
"(Jacob Jabber#Manage#Philz Coffee#Serve as CEO in 2005)\n"
"(Philz Coffee#Expanded to#Multiple locations in the USA"
"#Philz Coffee's expansion area)\n"
"```\n"
"\n"
"----\n"
"\n"
"Please extract the entities and relationships data from the [Text] "
"according to the above requirements, using the provided [Context].\n"
"\n"
"[Context]:\n"
"{history}\n"
"\n"
"[Text]:\n"
"{text}\n"
"\n"
"[Results]:\n"
"\n"
)
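

# ---------------------------------------------------------------------------
# Minimal self-check sketch (not part of the upstream module): it exercises
# only the response format that _parse_response expects, i.e. "(name#summary)"
# entity tuples and "(source#relation#target#summary)" relationship tuples.
# No LLM client or vector store is required for this check.
if __name__ == "__main__":
    _sample_response = (
        "Entities:\n"
        "(Phil Jabber#Founder of Philz Coffee)\n"
        "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
        "\n"
        "Relationships:\n"
        "(Phil Jabber#Founded#Philz Coffee#Founded in 1978 in Berkeley)\n"
    )
    # _parse_response does not touch instance state, so it can be called
    # unbound here for a quick format check without building a full extractor.
    for _graph in GraphExtractor._parse_response(None, _sample_response):
        print(_graph)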