"""GraphExtractor class."""

import logging
import re
from typing import List, Optional

from dbgpt.core import Chunk, LLMClient
from dbgpt.rag.transformer.llm_extractor import LLMExtractor
from dbgpt.storage.graph_store.graph import Edge, Graph, MemoryGraph, Vertex
from dbgpt.storage.vector_store.base import VectorStoreBase

logger = logging.getLogger(__name__)

# Section headers that the prompt's output format asks the model to emit.
_SECTION_HEADERS = ("Entities:", "Relationships:")

# One entity per line: "(name#summary)".
# One relationship per line: "(source#relation#target#summary)".
# Only the LAST group is greedy, so a summary that itself contains ")" is
# captured up to the final ")" on the line instead of being cut at the first.
_ENTITY_PATTERN = re.compile(r"\((.*?)#(.*)\)")
_RELATION_PATTERN = re.compile(r"\((.*?)#(.*?)#(.*?)#(.*)\)")


class GraphExtractor(LLMExtractor):
    """Extract entities and relationships from text with an LLM.

    Previously processed texts are kept in a vector store ("chunk history").
    Chunks similar to the text being extracted are retrieved from it and passed
    along as context for the extraction.
    """

    def __init__(
        self, llm_client: LLMClient, model_name: str, chunk_history: VectorStoreBase
    ):
        """Initialize the GraphExtractor.

        Args:
            llm_client: LLM client forwarded to ``LLMExtractor``.
            model_name: Model name forwarded to ``LLMExtractor``.
            chunk_history: Vector store used as chunk history. Its config supplies
                ``name``, ``max_chunks_once_load``, ``max_threads``, ``topk`` and
                ``score_threshold``.
        """
        # NOTE: the Chinese prompt template is always used here;
        # GRAPH_EXTRACT_PT_EN is defined in this module but not referenced.
        super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN)
        self._chunk_history = chunk_history

        config = self._chunk_history.get_config()
        self._vector_space = config.name
        self._max_chunks_once_load = config.max_chunks_once_load
        self._max_threads = config.max_threads
        self._topk = config.topk
        self._score_threshold = config.score_threshold

    async def extract(self, text: str, limit: Optional[int] = None) -> List:
        """Extract from ``text`` using similar chunks from history as context.

        Steps:
            1. Search the chunk history for chunks similar to ``text``.
            2. Join their contents into ``Section N:`` blocks as context.
            3. Delegate to ``LLMExtractor._extract(text, context, limit)``.
            4. Store ``text`` in the chunk history, whether or not step 3
               succeeded (it runs in a ``finally`` clause).

        Args:
            text: The text to extract from.
            limit: Forwarded to ``_extract``; in ``_parse_response`` it caps the
                number of relationships.

        Returns:
            Whatever ``LLMExtractor._extract`` returns.
        """
        # load similar chunks
        chunks = await self._chunk_history.asimilar_search_with_scores(
            text, self._topk, self._score_threshold
        )
        history = [
            f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks)
        ]
        context = "\n".join(history) if history else ""

        try:
            # extract with chunk history
            return await super()._extract(text, context, limit)

        finally:
            # save chunk to history; "relevant_cnt" records how many similar
            # chunks were found for this text
            await self._chunk_history.aload_document_with_limit(
                [Chunk(content=text, metadata={"relevant_cnt": len(history)})],
                self._max_chunks_once_load,
                self._max_threads,
            )

    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]:
        """Parse the LLM response text into a single in-memory graph.

        Lines are interpreted according to the most recent section header
        (``Entities:`` or ``Relationships:``). Lines before the first header,
        blank lines and lines that do not match the expected tuple syntax are
        ignored.

        Args:
            text: Raw response text.
            limit: If truthy, stop parsing once this many relationships have
                been added. ``None`` or ``0`` means no limit.

        Returns:
            A one-element list containing the populated ``MemoryGraph``.
        """
        graph = MemoryGraph()
        edge_count = 0
        current_section = None
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if line in _SECTION_HEADERS:
                # drop the trailing ":" to get the section name
                current_section = line[:-1]
            elif line and current_section == "Entities":
                match = _ENTITY_PATTERN.match(line)
                if match:
                    name, summary = (part.strip() for part in match.groups())
                    graph.upsert_vertex(Vertex(name, description=summary))
            elif line and current_section == "Relationships":
                match = _RELATION_PATTERN.match(line)
                if match:
                    source, name, target, summary = (
                        part.strip() for part in match.groups()
                    )
                    edge_count += 1
                    graph.append_edge(Edge(source, target, name, description=summary))

            if limit and edge_count >= limit:
                break

        return [graph]

    def truncate(self):
        """Truncate chunk history."""
        self._chunk_history.truncate()

    def drop(self):
        """Drop chunk history."""
        self._chunk_history.delete_vector_name(self._vector_space)


# Chinese prompt template. It contains the placeholders "{history}" (context
# sections) and "{text}" (text to extract from); presumably they are filled in
# by LLMExtractor -- confirm against that class.
GRAPH_EXTRACT_PT_CN = (
    "## 角色\n"
    "你是一个知识图谱工程专家,非常擅长从文本中精确抽取知识图谱的实体"
    "(主体、客体)和关系,并能对实体和关系的含义做出恰当的总结性描述。\n"
    "\n"
    "## 技能\n"
    "### 技能 1: 实体抽取\n"
    "--请按照如下步骤抽取实体--\n"
    "1. 准确地识别文本中的实体信息,一般是名词、代词等。\n"
    "2. 准确地识别实体的修饰性描述,一般作为定语对实体特征做补充。\n"
    "3. 对相同概念的实体(同义词、别称、代指),请合并为单一简洁的实体名,"
    "并合并它们的描述信息。\n"
    "4. 对合并后的实体描述信息做简洁、恰当、连贯的总结。\n"
    "\n"
    "### 技能 2: 关系抽取\n"
    "--请按照如下步骤抽取关系--\n"
    "1. 准确地识别文本中实体之间的关联信息,一般是动词、代词等。\n"
    "2. 准确地识别关系的修饰性描述,一般作为状语对关系特征做补充。\n"
    "3. 对相同概念的关系(同义词、别称、代指),请合并为单一简洁的关系名,"
    "并合并它们的描述信息。\n"
    "4. 对合并后的关系描述信息做简洁、恰当、连贯的总结。\n"
    "\n"
    "### 技能 3: 关联上下文\n"
    "- 关联上下文来自与当前待抽取文本相关的前置段落内容,"
    "可以为知识抽取提供信息补充。\n"
    "- 合理利用提供的上下文信息,知识抽取过程中出现的内容引用可能来自关联上下文。\n"
    "- 不要对关联上下文的内容做知识抽取,而仅作为关联信息参考。\n"
    "- 关联上下文是可选信息,可能为空。\n"
    "\n"
    "## 约束条件\n"
    "- 如果文本已提供了图结构格式的数据,直接转换为输出格式返回,"
    "不要修改实体或ID名称。\n"
    "- 尽可能多的生成文本中提及的实体和关系信息,但不要随意创造不存在的实体和关系。\n"
    "- 确保以第三人称书写,从客观角度描述实体名称、关系名称,以及他们的总结性描述。\n"
    "- 尽可能多地使用关联上下文中的信息丰富实体和关系的内容,这非常重要。\n"
    "- 如果实体或关系的总结描述为空,不提供总结描述信息,不要生成无关的描述信息。\n"
    "- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
    "- 实体和关系的名称或者描述文本出现#和:字符时,使用_字符替换,其他字符不要修改。\n"
    "- 避免使用停用词和过于常见的词汇。\n"
    "\n"
    "## 输出格式\n"
    "Entities:\n"
    "(实体名#实体总结)\n"
    "...\n\n"
    "Relationships:\n"
    "(来源实体名#关系名#目标实体名#关系总结)\n"
    "...\n"
    "\n"
    "## 参考案例\n"
    "--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
    "输入:\n"
    "```\n"
    "[上下文]:\n"
    "Section 1:\n"
    "菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
    "Section 2:\n"
    "菲尔・贾伯的小儿子叫比尔・贾伯。\n"
    "..."
    "\n"
    "[文本]:\n"
    "菲尔兹咖啡由菲尔・贾伯于1978年在加利福尼亚州伯克利创立。"
    "因其独特的混合咖啡而闻名,菲尔兹已扩展到美国多地。"
    "他的大儿子于2005年成为首席执行官,并带领公司实现了显著增长。\n"
    "```\n"
    "\n"
    "输出:\n"
    "```\n"
    "Entities:\n"
    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
    "(雅各布・贾伯#菲尔・贾伯的大儿子)\n"
    "(美国多地#菲尔兹咖啡的扩展地区)\n"
    "\n"
    "Relationships:\n"
    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的大儿子)\n"
    "(雅各布・贾伯#管理#菲尔兹咖啡#在2005年担任首席执行官)\n"
    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "请根据接下来[上下文]提供的信息,按照上述要求,抽取[文本]中的实体和关系数据。\n"
    "\n"
    "[上下文]:\n"
    "{history}\n"
    "\n"
    "[文本]:\n"
    "{text}\n"
    "\n"
    "[结果]:\n"
    "\n"
)

# English prompt template with the same "{history}" and "{text}" placeholders.
# Not referenced by GraphExtractor in this module.
GRAPH_EXTRACT_PT_EN = (
    "## Role\n"
    "You are an expert in Knowledge Graph Engineering, skilled at extracting "
    "entities (subjects, objects) and relations from text, and summarizing "
    "their meanings effectively.\n"
    "\n"
    "## Skills\n"
    "### Skill 1: Entity Extraction\n"
    "--Please follow these steps to extract entities--\n"
    "1. Accurately identify entity information in the text, "
    "usually nouns, pronouns, etc.\n"
    "2. Accurately identify descriptive information, "
    "usually as adjectives, that supplements entity features.\n"
    "3. Merge synonymous, alias, or reference entities into "
    "a single concise entity name, and merge their descriptive information.\n"
    "4. Provide a concise, appropriate, and coherent summary "
    "of the combined entity descriptions.\n"
    "\n"
    "### Skill 2: Relation Extraction\n"
    "--Please follow these steps to extract relations--\n"
    "1. Accurately identify relation information between entities in the text, "
    "usually verbs, pronouns, etc.\n"
    "2. Accurately identify descriptive information, usually as adverbs, "
    "that supplements relation features.\n"
    "3. Merge synonymous, alias, or reference relations into "
    "a single concise relation name, and merge their descriptive information.\n"
    "4. Provide a concise, appropriate, and coherent summary "
    "of the combined relation descriptions.\n"
    "\n"
    "### Skill 3: Contextual Association\n"
    "- Context comes from preceding paragraphs related to the current "
    "extraction text and can provide supplementary information.\n"
    "- Appropriately use contextual information, content references "
    "during extraction may come from this context.\n"
    "- Do not extract knowledge from contextual content, "
    "use it only as a reference.\n"
    "- Context is optional and may be empty.\n"
    "\n"
    "## Constraints\n"
    "- If the text has provided data that is similar to or the same as the "
    "output format, please format the output directly according to the "
    "output format requirements.\n"
    "- Generate as much entity and relation information mentioned in the text "
    "as possible, but do not create nonexistent entities or relations.\n"
    "- Ensure the writing is in the third person, describing entity names, "
    "relation names, and their summaries objectively.\n"
    "- Use as much contextual information as possible to enrich the content "
    "of entities and relations, this is very important.\n"
    "- If a summary of an entity or relation is empty, do not provide "
    "summary information, and do not generate irrelevant descriptions.\n"
    "- If provided descriptions are contradictory, resolve the conflict "
    "and provide a single, coherent description.\n"
    "- Replace any # or : characters in entity's and relation's "
    "names or descriptions with an _ character.\n"
    "- Avoid using stop words and overly common terms.\n"
    "\n"
    "## Output Format\n"
    "Entities:\n"
    "(entity_name#entity_summary)\n"
    "...\n\n"
    "Relationships:\n"
    "(source_entity_name#relation_name#target_entity_name#relation_summary)\n"
    "...\n"
    "\n"
    "## Reference Example\n"
    "--The case is only to help you understand the input and output format of "
    "the prompt, please do not use it in your answer.--\n"
    "Input:\n"
    "```\n"
    "[Context]:\n"
    "Section 1:\n"
    "Phil Jabber's eldest son is named Jacob Jabber.\n"
    "Section 2:\n"
    "Phil Jabber's youngest son is named Bill Jabber.\n"
    "..."
    "\n"
    "[Text]:\n"
    "Philz Coffee was founded by Phil Jabber in 1978 in Berkeley, California. "
    "Known for its distinctive blend coffee, Philz has expanded to multiple "
    "locations in the USA. His eldest son became CEO in 2005, "
    "leading significant growth for the company.\n"
    "```\n"
    "\n"
    "Output:\n"
    "```\n"
    "Entities:\n"
    "(Phil Jabber#Founder of Philz Coffee)\n"
    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
    "(Jacob Jabber#Phil Jabber's eldest son)\n"
    "(Multiple locations in the USA#Philz Coffee's expansion area)\n"
    "\n"
    "Relationships:\n"
    "(Phil Jabber#Founded#Philz Coffee"
    "#Founded in 1978 in Berkeley, California)\n"
    "(Philz Coffee#Located in#Berkeley, California"
    "#Philz Coffee's founding location)\n"
    "(Phil Jabber#Has#Jacob Jabber#Phil Jabber's eldest son)\n"
    "(Jacob Jabber#Manage#Philz Coffee#Serve as CEO in 2005)\n"
    "(Philz Coffee#Expanded to#Multiple locations in the USA"
    "#Philz Coffee's expansion area)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "Please extract the entities and relationships data from the [Text] "
    "according to the above requirements, using the provided [Context].\n"
    "\n"
    "[Context]:\n"
    "{history}\n"
    "\n"
    "[Text]:\n"
    "{text}\n"
    "\n"
    "[Results]:\n"
    "\n"
)