mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-09 21:08:59 +00:00
✨ feat(GraphRAG): enhance GraphRAG by graph community summary (#1801)
Co-authored-by: Florian <fanzhidongyzby@163.com> Co-authored-by: KingSkyLi <15566300566@163.com> Co-authored-by: aries_ckt <916701291@qq.com> Co-authored-by: Fangyin Cheng <staneyffer@gmail.com> Co-authored-by: yvonneyx <zhuyuxin0627@gmail.com>
This commit is contained in:
@@ -9,11 +9,27 @@ logger = logging.getLogger(__name__)
|
||||
class TransformerBase(ABC):
    """Transformer base class.

    Declares the maintenance hooks every transformer must provide.
    Deriving from ``ABC`` makes the ``@abstractmethod`` markers below
    effective on this class as well, so it cannot be instantiated directly
    and concrete subclasses must implement both hooks.
    """

    @abstractmethod
    def truncate(self):
        """Truncate operation.

        What gets truncated is defined by the implementing class.
        """

    @abstractmethod
    def drop(self):
        """Clean operation.

        What gets dropped is defined by the implementing class.
        """
|
||||
|
||||
|
||||
class EmbedderBase(TransformerBase, ABC):
    """Abstract base for embedder transformers.

    Adds no members of its own; it only marks the embedder branch of the
    transformer hierarchy.
    """
|
||||
|
||||
|
||||
class SummarizerBase(TransformerBase, ABC):
    """Abstract base for summarizer transformers."""

    @abstractmethod
    async def summarize(self, **args) -> str:
        """Produce a summary from the supplied keyword arguments.

        Args:
            **args: Implementation-defined inputs for the summary.

        Returns:
            The summary text.
        """
|
||||
|
||||
|
||||
class ExtractorBase(TransformerBase, ABC):
|
||||
"""Extractor base class."""
|
||||
|
||||
|
208
dbgpt/rag/transformer/community_summarizer.py
Normal file
208
dbgpt/rag/transformer/community_summarizer.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""CommunitySummarizer class."""
|
||||
|
||||
import logging
|
||||
|
||||
from dbgpt.core import LLMClient
|
||||
from dbgpt.rag.transformer.llm_summarizer import LLMSummarizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CommunitySummarizer(LLMSummarizer):
    """LLM summarizer preconfigured with the community summary prompt."""

    def __init__(self, llm_client: LLMClient, model_name: str):
        """Initialize the CommunitySummarizer.

        Args:
            llm_client: Client used to send the summary request.
            model_name: Name of the model to use.
        """
        # The Chinese prompt is the template used by this summarizer.
        super().__init__(
            llm_client, model_name, prompt_template=COMMUNITY_SUMMARY_PT_CN
        )
|
||||
|
||||
|
||||
# Chinese prompt template for summarizing a knowledge-graph community.
# The only placeholder is ``{graph}``: the textual form of the subgraph.
# Every list item ends with "\n" so that adjacent items are never fused.
COMMUNITY_SUMMARY_PT_CN = (
    "## 角色\n"
    "你非常擅长知识图谱的信息总结,能根据给定的知识图谱中的实体和关系的名称以及描述"
    "信息,全面、恰当地对知识图谱子图信息做出总结性描述,并且不会丢失关键的信息。\n"
    "\n"
    "## 技能\n"
    "### 技能 1: 实体识别\n"
    "- 准确地识别[Entities:]章节中的实体信息,包括实体名、实体描述信息。\n"
    "- 实体信息的一般格式有:\n"
    "(实体名)\n"
    "(实体名:实体描述)\n"
    "(实体名:实体属性表)\n"
    "(文本块ID:文档块内容)\n"
    "(目录ID:目录名)\n"
    "(文档ID:文档名称)\n"
    "\n"
    "### 技能 2: 关系识别\n"
    "- 准确地识别[Relationships:]章节中的关系信息,包括来源实体名、关系名、"
    "目标实体名、关系描述信息,实体名也可能是文档ID、目录ID、文本块ID。\n"
    "- 关系信息的一般格式有:\n"
    "(来源实体名)-[关系名]->(目标实体名)\n"
    "(来源实体名)-[关系名:关系描述]->(目标实体名)\n"
    "(来源实体名)-[关系名:关系属性表]->(目标实体名)\n"
    "(文本块ID)-[包含]->(实体名)\n"
    "(目录ID)-[包含]->(文本块实体)\n"
    "(目录ID)-[包含]->(子目录ID)\n"
    "(文档ID)-[包含]->(文本块实体)\n"
    "(文档ID)-[包含]->(目录ID)\n"
    "\n"
    "### 技能 3: 图结构理解\n"
    "--请按照如下步骤理解图结构--\n"
    "1. 正确地将关系信息中的来源实体名与实体信息关联。\n"
    "2. 正确地将关系信息中的目标实体名与实体信息关联。\n"
    "3. 根据提供的关系信息还原出图结构。\n"
    "\n"
    "### 技能 4: 知识图谱总结\n"
    "--请按照如下步骤总结知识图谱--\n"
    "1. 确定知识图谱表达的主题或话题,突出关键实体和关系。\n"
    "2. 使用准确、恰当、简洁的语言总结图结构表达的信息,不要生成与图结构中无关的信息。\n"
    "\n"
    "## 约束条件\n"
    "- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。\n"
    "- 确保以第三人称书写,从客观角度对知识图谱表达的信息进行总结性描述。\n"
    "- 如果实体或关系的描述信息为空,对最终的总结信息没有贡献,不要生成无关信息。\n"
    "- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
    "- 避免使用停用词和过于常见的词汇。\n"
    "\n"
    "## 参考案例\n"
    "--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
    "输入:\n"
    "```\n"
    "Entities:\n"
    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
    "(雅各布・贾伯#菲尔・贾伯的儿子)\n"
    "(美国多地#菲尔兹咖啡的扩展地区)\n"
    "\n"
    "Relationships:\n"
    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子)\n"
    "(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官)\n"
    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
    "```\n"
    "\n"
    "输出:\n"
    "```\n"
    "菲尔兹咖啡是由菲尔・贾伯在1978年于加利福尼亚州伯克利创立的咖啡品牌。"
    "菲尔・贾伯的儿子雅各布・贾伯在2005年接任首席执行官,领导公司扩展到了美国多地,"
    "进一步巩固了菲尔兹咖啡作为加利福尼亚州伯克利创立的咖啡品牌的市场地位。\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "请根据接下来[知识图谱]提供的信息,按照上述要求,总结知识图谱表达的信息。\n"
    "\n"
    "[知识图谱]:\n"
    "{graph}\n"
    "\n"
    "[总结]:\n"
    "\n"
)
|
||||
|
||||
# English prompt template for summarizing a knowledge-graph community.
# The only placeholder is ``{graph}``: the textual form of the subgraph.
# Every list item ends with "\n" so that adjacent items are never fused.
COMMUNITY_SUMMARY_PT_EN = (
    "## Role\n"
    "You are highly skilled in summarizing information from knowledge graphs. "
    "Based on the names and descriptions of entities and relationships in a "
    "given knowledge graph, you can comprehensively and appropriately summarize"
    " the information of the subgraph without losing critical details.\n"
    "\n"
    "## Skills\n"
    "### Skill 1: Entity Recognition\n"
    "- Accurately recognize entity information in the [Entities:] section, "
    "including entity names and descriptions.\n"
    "- The general formats for entity information are:\n"
    "(entity_name)\n"
    "(entity_name: entity_description)\n"
    "(entity_name: entity_property_map)\n"
    "(chunk_id: chunk_content)\n"
    "(catalog_id: catalog_name)\n"
    "(document_id: document_name)\n"
    "\n"
    "### Skill 2: Relationship Recognition\n"
    "- Accurately recognize relationship information in the [Relationships:] "
    "section, including source_entity_name, relationship_name, "
    "target_entity_name, and relationship_description, The entity_name may "
    "also be the document_id, catalog_id, or chunk_id.\n"
    "- The general formats for relationship information are:\n"
    "(source_entity_name)-[relationship_name]->(target_entity_name)\n"
    "(source_entity_name)-[relationship_name: relationship_description]->"
    "(target_entity_name)\n"
    "(source_entity_name)-[relationship_name: relationship_property_map]->"
    "(target_entity_name)\n"
    "(chunk_id)-[Contains]->(entity_name)\n"
    "(catalog_id)-[Contains]->(chunk_id)\n"
    "(catalog_id)-[Contains]->(sub_catalog_id)\n"
    "(document_id)-[Contains]->(chunk_id)\n"
    "(document_id)-[Contains]->(catalog_id)\n"
    "\n"
    "### Skill 3: Graph Structure Understanding\n"
    "--Follow these steps to understand the graph structure--\n"
    "1. Correctly associate the source entity name in the "
    "relationship information with the entity information.\n"
    "2. Correctly associate the target entity name in the "
    "relationship information with the entity information.\n"
    "3. Reconstruct the graph structure based on the provided "
    "relationship information.\n"
    "\n"
    "### Skill 4: Knowledge Graph Summarization\n"
    "--Follow these steps to summarize the knowledge graph--\n"
    "1. Determine the theme or topic expressed by the knowledge graph, "
    "highlighting key entities and relationships.\n"
    "2. Use accurate, appropriate, and concise language to summarize "
    "the information expressed by the graph "
    "without generating irrelevant information.\n"
    "\n"
    "## Constraints\n"
    "- Don't describe your thought process in the answer, provide the answer "
    "to the user's question directly without generating irrelevant information.\n"
    "- Ensure the summary is written in the third person and objectively "
    "reflects the information conveyed by the knowledge graph.\n"
    "- If the descriptions of entities or relationships are empty and "
    "contribute nothing to the final summary, "
    "do not generate unrelated information.\n"
    "- If the provided descriptions are contradictory, resolve the conflicts "
    "and provide a single, coherent description.\n"
    "- Avoid using stop words and overly common words.\n"
    "\n"
    "## Reference Example\n"
    "--The case is only to help you understand the input and output format of "
    "the prompt, please do not use it in your answer.--\n"
    "Input:\n"
    "```\n"
    "Entities:\n"
    "(Phil Jaber#Founder of Philz Coffee)\n"
    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
    "(Jacob Jaber#Son of Phil Jaber)\n"
    "(Multiple locations in the USA#Expansion regions of Philz Coffee)\n"
    "\n"
    "Relationships:\n"
    "(Phil Jaber#Created#Philz Coffee"
    "#Founded in Berkeley, California in 1978)\n"
    "(Philz Coffee#Located in#Berkeley, California"
    "#Founding location of Philz Coffee)\n"
    "(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber)\n"
    "(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005)\n"
    "(Philz Coffee#Expanded to#Multiple locations in the USA"
    "#Expansion regions of Philz Coffee)\n"
    "```\n"
    "\n"
    "Output:\n"
    "```\n"
    "Philz Coffee is a coffee brand founded by Phil Jaber in 1978 in "
    "Berkeley, California. Phil Jaber's son, Jacob Jaber, took over as CEO in "
    "2005, leading the company to expand to multiple locations in the USA, "
    "further solidifying Philz Coffee's market position as a coffee brand "
    "founded in Berkeley, California.\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "Please summarize the information expressed by the [KnowledgeGraph] "
    "provided in the following section according to the above requirements.\n"
    "\n"
    "[KnowledgeGraph]:\n"
    "{graph}\n"
    "\n"
    "[Summary]:\n"
    "\n"
)
|
304
dbgpt/rag/transformer/graph_extractor.py
Normal file
304
dbgpt/rag/transformer/graph_extractor.py
Normal file
@@ -0,0 +1,304 @@
|
||||
"""GraphExtractor class."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Optional
|
||||
|
||||
from dbgpt.core import Chunk, LLMClient
|
||||
from dbgpt.rag.transformer.llm_extractor import LLMExtractor
|
||||
from dbgpt.storage.graph_store.graph import Edge, Graph, MemoryGraph, Vertex
|
||||
from dbgpt.storage.vector_store.base import VectorStoreBase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GraphExtractor(LLMExtractor):
    """Extract entities and relationships from text with an LLM.

    Every text passed to ``extract`` is stored in ``chunk_history`` (a vector
    store); chunks similar to the current text are looked up there and handed
    to the prompt as context.
    """

    # Line formats of the LLM answer. The last group is greedy so that a ")"
    # inside a description does not end the match early; the match still has
    # to finish on a closing parenthesis.
    _ENTITY_PATTERN = re.compile(r"\((.*?)#(.*)\)")
    _RELATIONSHIP_PATTERN = re.compile(r"\((.*?)#(.*?)#(.*?)#(.*)\)")

    def __init__(
        self, llm_client: LLMClient, model_name: str, chunk_history: VectorStoreBase
    ):
        """Initialize the GraphExtractor.

        Args:
            llm_client: Client used to send the extraction request.
            model_name: Name of the model to use.
            chunk_history: Vector store holding previously extracted texts;
                its config supplies the search and load settings.
        """
        super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN)
        self._chunk_history = chunk_history

        config = self._chunk_history.get_config()
        self._vector_space = config.name
        self._max_chunks_once_load = config.max_chunks_once_load
        self._max_threads = config.max_threads
        self._topk = config.topk
        self._score_threshold = config.score_threshold

    async def extract(self, text: str, limit: Optional[int] = None) -> List:
        """Extract graphs from text, using similar history chunks as context.

        Args:
            text: The text to extract entities and relationships from.
            limit: Optional maximum number of relationships to keep.

        Returns:
            The result of the inherited ``_extract`` call.
        """
        # load similar chunks
        chunks = await self._chunk_history.asimilar_search_with_scores(
            text, self._topk, self._score_threshold
        )
        history = [
            f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks)
        ]
        # Joining an empty list already yields "", i.e. an empty context.
        context = "\n".join(history)

        try:
            # extract with chunk history
            return await super()._extract(text, context, limit)
        finally:
            # The text is saved to the history whether or not extraction
            # succeeded, so later calls can use it as context.
            await self._chunk_history.aload_document_with_limit(
                [Chunk(content=text, metadata={"relevant_cnt": len(history)})],
                self._max_chunks_once_load,
                self._max_threads,
            )

    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]:
        """Parse the LLM answer into a single in-memory graph.

        Lines are only interpreted after an ``Entities:`` or
        ``Relationships:`` header; lines that do not match the expected
        ``(a#b)`` / ``(a#b#c#d)`` shape are ignored.

        Args:
            text: Raw LLM response.
            limit: Optional maximum number of relationships to parse.

        Returns:
            A one-element list containing the parsed graph.
        """
        graph = MemoryGraph()
        edge_count = 0
        current_section = None
        for line in text.split("\n"):
            line = line.strip()
            if line in ["Entities:", "Relationships:"]:
                # Drop the trailing colon to get the section name.
                current_section = line[:-1]
            elif line and current_section:
                if current_section == "Entities":
                    match = self._ENTITY_PATTERN.match(line)
                    if match:
                        name, summary = [part.strip() for part in match.groups()]
                        graph.upsert_vertex(Vertex(name, description=summary))
                elif current_section == "Relationships":
                    match = self._RELATIONSHIP_PATTERN.match(line)
                    if match:
                        source, name, target, summary = [
                            part.strip() for part in match.groups()
                        ]
                        edge_count += 1
                        graph.append_edge(
                            Edge(source, target, name, description=summary)
                        )

            # Stop as soon as the requested number of relationships is reached.
            if limit and edge_count >= limit:
                break

        return [graph]

    def truncate(self):
        """Truncate chunk history."""
        self._chunk_history.truncate()

    def drop(self):
        """Drop chunk history."""
        self._chunk_history.delete_vector_name(self._vector_space)
|
||||
|
||||
|
||||
# Chinese prompt template for extracting entities and relationships.
# Placeholders: ``{history}`` (context from related earlier text, may be
# empty) and ``{text}`` (the text to extract from).
# Every list item and heading ends with "\n" so that lines are never fused.
GRAPH_EXTRACT_PT_CN = (
    "## 角色\n"
    "你是一个知识图谱工程专家,非常擅长从文本中精确抽取知识图谱的实体"
    "(主体、客体)和关系,并能对实体和关系的含义做出恰当的总结性描述。\n"
    "\n"
    "## 技能\n"
    "### 技能 1: 实体抽取\n"
    "--请按照如下步骤抽取实体--\n"
    "1. 准确地识别文本中的实体信息,一般是名词、代词等。\n"
    "2. 准确地识别实体的修饰性描述,一般作为定语对实体特征做补充。\n"
    "3. 对相同概念的实体(同义词、别称、代指),请合并为单一简洁的实体名,"
    "并合并它们的描述信息。\n"
    "4. 对合并后的实体描述信息做简洁、恰当、连贯的总结。\n"
    "\n"
    "### 技能 2: 关系抽取\n"
    "--请按照如下步骤抽取关系--\n"
    "1. 准确地识别文本中实体之间的关联信息,一般是动词、代词等。\n"
    "2. 准确地识别关系的修饰性描述,一般作为状语对关系特征做补充。\n"
    "3. 对相同概念的关系(同义词、别称、代指),请合并为单一简洁的关系名,"
    "并合并它们的描述信息。\n"
    "4. 对合并后的关系描述信息做简洁、恰当、连贯的总结。\n"
    "\n"
    "### 技能 3: 关联上下文\n"
    "- 关联上下文来自与当前待抽取文本相关的前置段落内容,"
    "可以为知识抽取提供信息补充。\n"
    "- 合理利用提供的上下文信息,知识抽取过程中出现的内容引用可能来自关联上下文。\n"
    "- 不要对关联上下文的内容做知识抽取,而仅作为关联信息参考。\n"
    "- 关联上下文是可选信息,可能为空。\n"
    "\n"
    "## 约束条件\n"
    "- 如果文本已提供了图结构格式的数据,直接转换为输出格式返回,"
    "不要修改实体或ID名称。\n"
    "- 尽可能多的生成文本中提及的实体和关系信息,但不要随意创造不存在的实体和关系。\n"
    "- 确保以第三人称书写,从客观角度描述实体名称、关系名称,以及他们的总结性描述。\n"
    "- 尽可能多地使用关联上下文中的信息丰富实体和关系的内容,这非常重要。\n"
    "- 如果实体或关系的总结描述为空,不提供总结描述信息,不要生成无关的描述信息。\n"
    "- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
    "- 实体和关系的名称或者描述文本出现#和:字符时,使用_字符替换,其他字符不要修改。\n"
    "- 避免使用停用词和过于常见的词汇。\n"
    "\n"
    "## 输出格式\n"
    "Entities:\n"
    "(实体名#实体总结)\n"
    "...\n\n"
    "Relationships:\n"
    "(来源实体名#关系名#目标实体名#关系总结)\n"
    "...\n"
    "\n"
    "## 参考案例\n"
    "--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
    "输入:\n"
    "```\n"
    "[上下文]:\n"
    "Section 1:\n"
    "菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
    "Section 2:\n"
    "菲尔・贾伯的小儿子叫比尔・贾伯。\n"
    "..."
    "\n"
    "[文本]:\n"
    "菲尔兹咖啡由菲尔・贾伯于1978年在加利福尼亚州伯克利创立。"
    "因其独特的混合咖啡而闻名,菲尔兹已扩展到美国多地。"
    "他的大儿子于2005年成为首席执行官,并带领公司实现了显著增长。\n"
    "```\n"
    "\n"
    "输出:\n"
    "```\n"
    "Entities:\n"
    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
    "(雅各布・贾伯#菲尔・贾伯的大儿子)\n"
    "(美国多地#菲尔兹咖啡的扩展地区)\n"
    "\n"
    "Relationships:\n"
    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的大儿子)\n"
    "(雅各布・贾伯#管理#菲尔兹咖啡#在2005年担任首席执行官)\n"
    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "请根据接下来[上下文]提供的信息,按照上述要求,抽取[文本]中的实体和关系数据。\n"
    "\n"
    "[上下文]:\n"
    "{history}\n"
    "\n"
    "[文本]:\n"
    "{text}\n"
    "\n"
    "[结果]:\n"
    "\n"
)
|
||||
|
||||
# English prompt template for extracting entities and relationships.
# Placeholders: ``{history}`` (context from related earlier text, may be
# empty) and ``{text}`` (the text to extract from).
# Every list item ends with "\n" so that adjacent items are never fused.
GRAPH_EXTRACT_PT_EN = (
    "## Role\n"
    "You are an expert in Knowledge Graph Engineering, skilled at extracting "
    "entities (subjects, objects) and relations from text, and summarizing "
    "their meanings effectively.\n"
    "\n"
    "## Skills\n"
    "### Skill 1: Entity Extraction\n"
    "--Please follow these steps to extract entities--\n"
    "1. Accurately identify entity information in the text, "
    "usually nouns, pronouns, etc.\n"
    "2. Accurately identify descriptive information, "
    "usually as adjectives, that supplements entity features.\n"
    "3. Merge synonymous, alias, or reference entities into "
    "a single concise entity name, and merge their descriptive information.\n"
    "4. Provide a concise, appropriate, and coherent summary "
    "of the combined entity descriptions.\n"
    "\n"
    "### Skill 2: Relation Extraction\n"
    "--Please follow these steps to extract relations--\n"
    "1. Accurately identify relation information between entities in the text, "
    "usually verbs, pronouns, etc.\n"
    "2. Accurately identify descriptive information, usually as adverbs, "
    "that supplements relation features.\n"
    "3. Merge synonymous, alias, or reference relations into "
    "a single concise relation name, and merge their descriptive information.\n"
    "4. Provide a concise, appropriate, and coherent summary "
    "of the combined relation descriptions.\n"
    "\n"
    "### Skill 3: Contextual Association\n"
    "- Context comes from preceding paragraphs related to the current "
    "extraction text and can provide supplementary information.\n"
    "- Appropriately use contextual information, content references "
    "during extraction may come from this context.\n"
    "- Do not extract knowledge from contextual content, "
    "use it only as a reference.\n"
    "- Context is optional and may be empty.\n"
    "\n"
    "## Constraints\n"
    "- If the text has provided data that is similar to or the same as the "
    "output format, please format the output directly according to the "
    "output format requirements.\n"
    "- Generate as much entity and relation information mentioned in the text "
    "as possible, but do not create nonexistent entities or relations.\n"
    "- Ensure the writing is in the third person, describing entity names, "
    "relation names, and their summaries objectively.\n"
    "- Use as much contextual information as possible to enrich the content "
    "of entities and relations, this is very important.\n"
    "- If a summary of an entity or relation is empty, do not provide "
    "summary information, and do not generate irrelevant descriptions.\n"
    "- If provided descriptions are contradictory, resolve the conflict "
    "and provide a single, coherent description.\n"
    "- Replace any # or : characters in entity's and relation's "
    "names or descriptions with an _ character.\n"
    "- Avoid using stop words and overly common terms.\n"
    "\n"
    "## Output Format\n"
    "Entities:\n"
    "(entity_name#entity_summary)\n"
    "...\n\n"
    "Relationships:\n"
    "(source_entity_name#relation_name#target_entity_name#relation_summary)\n"
    "...\n"
    "\n"
    "## Reference Example\n"
    "--The case is only to help you understand the input and output format of "
    "the prompt, please do not use it in your answer.--\n"
    "Input:\n"
    "```\n"
    "[Context]:\n"
    "Section 1:\n"
    "Phil Jabber's eldest son is named Jacob Jabber.\n"
    "Section 2:\n"
    "Phil Jabber's youngest son is named Bill Jabber.\n"
    "..."
    "\n"
    "[Text]:\n"
    "Philz Coffee was founded by Phil Jabber in 1978 in Berkeley, California. "
    "Known for its distinctive blend coffee, Philz has expanded to multiple "
    "locations in the USA. His eldest son became CEO in 2005, "
    "leading significant growth for the company.\n"
    "```\n"
    "\n"
    "Output:\n"
    "```\n"
    "Entities:\n"
    "(Phil Jabber#Founder of Philz Coffee)\n"
    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
    "(Jacob Jabber#Phil Jabber's eldest son)\n"
    "(Multiple locations in the USA#Philz Coffee's expansion area)\n"
    "\n"
    "Relationships:\n"
    "(Phil Jabber#Founded#Philz Coffee"
    "#Founded in 1978 in Berkeley, California)\n"
    "(Philz Coffee#Located in#Berkeley, California"
    "#Philz Coffee's founding location)\n"
    "(Phil Jabber#Has#Jacob Jabber#Phil Jabber's eldest son)\n"
    "(Jacob Jabber#Manage#Philz Coffee#Serve as CEO in 2005)\n"
    "(Philz Coffee#Expanded to#Multiple locations in the USA"
    "#Philz Coffee's expansion area)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "Please extract the entities and relationships data from the [Text] "
    "according to the above requirements, using the provided [Context].\n"
    "\n"
    "[Context]:\n"
    "{history}\n"
    "\n"
    "[Text]:\n"
    "{text}\n"
    "\n"
    "[Results]:\n"
    "\n"
)
|
@@ -19,9 +19,20 @@ class LLMExtractor(ExtractorBase, ABC):
|
||||
self._prompt_template = prompt_template
|
||||
|
||||
async def extract(self, text: str, limit: Optional[int] = None) -> List:
|
||||
"""Extract by LLm."""
|
||||
"""Extract by LLM."""
|
||||
return await self._extract(text, None, limit)
|
||||
|
||||
async def _extract(
|
||||
self, text: str, history: str = None, limit: Optional[int] = None
|
||||
) -> List:
|
||||
"""Inner extract by LLM."""
|
||||
template = HumanPromptTemplate.from_template(self._prompt_template)
|
||||
messages = template.format_messages(text=text)
|
||||
|
||||
messages = (
|
||||
template.format_messages(text=text, history=history)
|
||||
if history is not None
|
||||
else template.format_messages(text=text)
|
||||
)
|
||||
|
||||
# use default model if needed
|
||||
if not self._model_name:
|
||||
@@ -45,6 +56,12 @@ class LLMExtractor(ExtractorBase, ABC):
|
||||
ValueError("optional argument limit >= 1")
|
||||
return self._parse_response(response.text, limit)
|
||||
|
||||
def truncate(self):
|
||||
"""Do nothing by default."""
|
||||
|
||||
def drop(self):
|
||||
"""Do nothing by default."""
|
||||
|
||||
@abstractmethod
|
||||
def _parse_response(self, text: str, limit: Optional[int] = None) -> List:
|
||||
"""Parse llm response."""
|
||||
|
48
dbgpt/rag/transformer/llm_summarizer.py
Normal file
48
dbgpt/rag/transformer/llm_summarizer.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""LLMSummarizer class."""
|
||||
import logging
|
||||
from abc import ABC
|
||||
|
||||
from dbgpt.core import HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest
|
||||
from dbgpt.rag.transformer.base import SummarizerBase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LLMSummarizer(SummarizerBase, ABC):
    """Summarizer that fills a prompt template and sends it to an LLM."""

    def __init__(self, llm_client: LLMClient, model_name: str, prompt_template: str):
        """Initialize the LLMSummarizer.

        Args:
            llm_client: Client used to list models and generate responses.
            model_name: Model to use; when empty, the first model reported
                by the client is chosen on the first ``summarize`` call.
            prompt_template: Template text formatted with the keyword
                arguments passed to ``summarize``.
        """
        self._llm_client = llm_client
        self._model_name = model_name
        self._prompt_template = prompt_template

    async def summarize(self, **args) -> str:
        """Summarize by LLM.

        Args:
            **args: Values for the placeholders of the prompt template.

        Returns:
            The response text. When the request fails the failure is logged
            and the response text is still returned unchanged.

        Raises:
            Exception: If no model name is set and the client reports no
                available models.
        """
        template = HumanPromptTemplate.from_template(self._prompt_template)
        messages = template.format_messages(**args)

        # use default model if needed
        if not self._model_name:
            models = await self._llm_client.models()
            if not models:
                raise Exception("No models available")
            # Remember the choice so later calls skip the model lookup.
            self._model_name = models[0].model
            logger.info("Using model %s to summarize", self._model_name)

        model_messages = ModelMessage.from_base_messages(messages)
        request = ModelRequest(model=self._model_name, messages=model_messages)
        response = await self._llm_client.generate(request=request)

        if not response.success:
            code = str(response.error_code)
            reason = response.text
            logger.error("request llm failed (%s) %s", code, reason)

        return response.text

    def truncate(self):
        """Do nothing by default."""

    def drop(self):
        """Do nothing by default."""
|
Reference in New Issue
Block a user