mirror of
				https://github.com/csunny/DB-GPT.git
				synced 2025-11-03 17:00:25 +00:00 
			
		
		
		
	Co-authored-by: Florian <fanzhidongyzby@163.com> Co-authored-by: KingSkyLi <15566300566@163.com> Co-authored-by: aries_ckt <916701291@qq.com> Co-authored-by: Fangyin Cheng <staneyffer@gmail.com> Co-authored-by: yvonneyx <zhuyuxin0627@gmail.com>
		
			
				
	
	
		
			374 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			374 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""Define the CommunitySummaryKnowledgeGraph."""
 | 
						||
 | 
						||
import logging
 | 
						||
import os
 | 
						||
from typing import List, Optional
 | 
						||
 | 
						||
from dbgpt._private.pydantic import ConfigDict, Field
 | 
						||
from dbgpt.core import Chunk
 | 
						||
from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer
 | 
						||
from dbgpt.rag.transformer.graph_extractor import GraphExtractor
 | 
						||
from dbgpt.storage.knowledge_graph.community.community_store import CommunityStore
 | 
						||
from dbgpt.storage.knowledge_graph.community.factory import CommunityStoreAdapterFactory
 | 
						||
from dbgpt.storage.knowledge_graph.knowledge_graph import (
 | 
						||
    BuiltinKnowledgeGraph,
 | 
						||
    BuiltinKnowledgeGraphConfig,
 | 
						||
)
 | 
						||
from dbgpt.storage.vector_store.base import VectorStoreConfig
 | 
						||
from dbgpt.storage.vector_store.factory import VectorStoreFactory
 | 
						||
from dbgpt.storage.vector_store.filters import MetadataFilters
 | 
						||
 | 
						||
logger = logging.getLogger(__name__)
 | 
						||
 | 
						||
 | 
						||
class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig):
    """Settings for the community-summary knowledge graph.

    On top of the builtin knowledge graph settings this adds the vector store
    backend type, the credentials handed to that backend, and the top-k /
    score-threshold pairs used for the graph-extraction store and for the
    community-summary store.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Vector store backend.
    vector_store_type: str = Field(
        description="The type of vector store.",
        default="Chroma",
    )

    # Credentials copied onto each vector store config; both are optional.
    user: Optional[str] = Field(
        description="The user of vector store, if not set, will use the default user.",
        default=None,
    )
    password: Optional[str] = Field(
        description=(
            "The password of vector store, if not set, will use the default password."
        ),
        default=None,
    )

    # Retrieval limits applied to the graph extractor's vector store.
    extract_topk: int = Field(
        description="Topk of knowledge graph extract",
        default=5,
    )
    extract_score_threshold: float = Field(
        description="Recall score of knowledge graph extract",
        default=0.3,
    )

    # Retrieval limits applied to the community summary vector store.
    community_topk: int = Field(
        description="Topk of community search in knowledge graph",
        default=50,
    )
    community_score_threshold: float = Field(
        description="Recall score of community search in knowledge graph",
        default=0.0,
    )
 | 
						||
 | 
						||
 | 
						||
class CommunitySummaryKnowledgeGraph(BuiltinKnowledgeGraph):
    """Knowledge graph combining graph extraction with community summaries.

    Documents are turned into graphs by a ``GraphExtractor`` and stored in the
    graph store; a ``CommunityStore`` then builds and indexes community
    summaries. Search merges the matching community summaries with a subgraph
    explored from keywords extracted out of the query.
    """

    def __init__(self, config: CommunitySummaryKnowledgeGraphConfig):
        """Create the graph extractor and community store from ``config``.

        Each tunable is read from an environment variable first and falls
        back to the corresponding config field when the variable is unset.

        ``_llm_client``, ``_model_name``, ``_graph_store`` and
        ``_keyword_extractor`` are read by this class but never assigned
        here, so they are expected to be set by ``super().__init__``.

        Args:
            config: Settings for this knowledge graph.
        """
        super().__init__(config)
        self._config = config

        env = os.getenv
        self._vector_store_type = env("VECTOR_STORE_TYPE", config.vector_store_type)
        self._extract_topk = int(
            env("KNOWLEDGE_GRAPH_EXTRACT_SEARCH_TOP_SIZE", config.extract_topk)
        )
        self._extract_score_threshold = float(
            env(
                "KNOWLEDGE_GRAPH_EXTRACT_SEARCH_RECALL_SCORE",
                config.extract_score_threshold,
            )
        )
        self._community_topk = int(
            env("KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_TOP_SIZE", config.community_topk)
        )
        self._community_score_threshold = float(
            env(
                "KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_RECALL_SCORE",
                config.community_score_threshold,
            )
        )

        def apply_shared_settings(name: str, cfg: VectorStoreConfig):
            # Settings that both vector stores take over from the graph config.
            cfg.name = name
            cfg.embedding_fn = config.embedding_fn
            cfg.max_chunks_once_load = config.max_chunks_once_load
            cfg.max_threads = config.max_threads
            cfg.user = config.user
            cfg.password = config.password

        def extractor_configure(name: str, cfg: VectorStoreConfig):
            apply_shared_settings(name, cfg)
            cfg.topk = self._extract_topk
            cfg.score_threshold = self._extract_score_threshold

        def community_store_configure(name: str, cfg: VectorStoreConfig):
            apply_shared_settings(name, cfg)
            cfg.topk = self._community_topk
            cfg.score_threshold = self._community_score_threshold

        # Vector store handed to the graph extractor.
        chunk_history_store = VectorStoreFactory.create(
            self._vector_store_type,
            config.name + "_CHUNK_HISTORY",
            extractor_configure,
        )
        self._graph_extractor = GraphExtractor(
            self._llm_client,
            self._model_name,
            chunk_history_store,
        )

        # Community store: graph adapter + summarizer + summary vector store.
        store_adapter = CommunityStoreAdapterFactory.create(self._graph_store)
        summarizer = CommunitySummarizer(self._llm_client, self._model_name)
        summary_store = VectorStoreFactory.create(
            self._vector_store_type,
            config.name + "_COMMUNITY_SUMMARY",
            community_store_configure,
        )
        self._community_store = CommunityStore(
            store_adapter, summarizer, summary_store
        )

    def get_config(self) -> BuiltinKnowledgeGraphConfig:
        """Return the config object this knowledge graph was built with."""
        return self._config

    async def aload_document(self, chunks: List[Chunk]) -> List[str]:
        """Extract a graph from every chunk, persist it, then build communities.

        Args:
            chunks: Chunks whose ``content`` is fed to the graph extractor.

        Returns:
            The ``chunk_id`` of every input chunk, in input order.
        """
        # TODO: add doc node
        for chunk in chunks:
            # TODO: add chunk node
            # TODO: add relation doc-chunk
            extracted = await self._graph_extractor.extract(chunk.content)
            for graph in extracted:
                self._graph_store.insert_graph(graph)

        # Communities are rebuilt once, after all chunks have been inserted.
        await self._community_store.build_communities()

        return [chunk.chunk_id for chunk in chunks]

    async def asimilar_search_with_scores(
        self,
        text,
        topk,
        score_threshold: float,
        filters: Optional[MetadataFilters] = None,
    ) -> List[Chunk]:
        """Answer a query with community summaries plus a keyword subgraph.

        Args:
            text: Query text, used for community search and keyword extraction.
            topk: Passed as ``limit`` when exploring the graph store.
            score_threshold: Not used by this implementation.
            filters: Not used by this implementation.

        Returns:
            An empty list when neither search yields anything, otherwise a
            single chunk whose content is the hybrid search prompt filled with
            the summaries and the formatted subgraph.
        """
        # Global search: one numbered section per matching community summary.
        communities = await self._community_store.search_communities(text)
        summaries = []
        for number, community in enumerate(communities, start=1):
            summaries.append(f"Section {number}:\n{community.summary}")
        context = "\n".join(summaries)

        # Local search: explore the graph around keywords found in the query.
        keywords = await self._keyword_extractor.extract(text)
        explored = self._graph_store.explore(keywords, limit=topk)
        subgraph = explored.format()
        logger.info(f"Search subgraph from {len(keywords)} keywords")

        if summaries or subgraph:
            # Merge both result sets into one prompt.
            content = HYBRID_SEARCH_PT_CN.format(context=context, graph=subgraph)
            return [Chunk(content=content)]
        return []

    def truncate(self) -> List[str]:
        """Truncate the community store and both extractors.

        Returns:
            A one-element list holding the configured graph name.
        """
        logger.info("Truncate community store")
        self._community_store.truncate()

        logger.info("Truncate keyword extractor")
        self._keyword_extractor.truncate()

        logger.info("Truncate triplet extractor")
        self._graph_extractor.truncate()

        return [self._config.name]

    def delete_vector_name(self, index_name: str):
        """Drop the community store and both extractors.

        Args:
            index_name: Not used by this implementation.
        """
        logger.info("Drop community store")
        self._community_store.drop()

        logger.info("Drop keyword extractor")
        self._keyword_extractor.drop()

        logger.info("Drop triplet extractor")
        self._graph_extractor.drop()
 | 
						||
 | 
						||
 | 
						||
# Chinese prompt template for hybrid (community summary + subgraph) search.
# It is rendered with ``str.format`` and expects exactly two placeholders:
# ``{context}`` (numbered community summaries) and ``{graph}`` (formatted
# subgraph). Every fragment that ends a markdown line must end with "\n":
# adjacent literals are concatenated as-is, so a missing terminator glues a
# bullet, heading or code fence onto the following line.
HYBRID_SEARCH_PT_CN = (
    "## 角色\n"
    "你非常擅长结合提示词模板提供的[上下文]信息与[知识图谱]信息,"
    "准确恰当地回答用户的问题,并保证不会输出与上下文和知识图谱无关的信息。\n"
    "\n"
    "## 技能\n"
    "### 技能 1: 上下文理解\n"
    "- 准确地理解[上下文]提供的信息,上下文信息可能被拆分为多个章节。\n"
    "- 上下文的每个章节内容都会以[Section]开始,并按需进行了编号。\n"
    "- 上下文信息提供了与用户问题相关度最高的总结性描述,请合理使用它们。\n"
    "### 技能 2: 知识图谱理解\n"
    "- 准确地识别[知识图谱]中提供的[Entities:]章节中的实体信息"
    "和[Relationships:]章节中的关系信息,实体和关系信息的一般格式为:\n"
    "```\n"
    "* 实体信息格式:\n"
    "- (实体名)\n"
    "- (实体名:实体描述)\n"
    "- (实体名:实体属性表)\n"
    "- (文本块ID:文档块内容)\n"
    "- (目录ID:目录名)\n"
    "- (文档ID:文档名称)\n"
    "\n"
    "* 关系信息的格式:\n"
    "- (来源实体名)-[关系名]->(目标实体名)\n"
    "- (来源实体名)-[关系名:关系描述]->(目标实体名)\n"
    "- (来源实体名)-[关系名:关系属性表]->(目标实体名)\n"
    "- (文本块实体)-[包含]->(实体名)\n"
    "- (目录ID)-[包含]->(文本块实体)\n"
    "- (目录ID)-[包含]->(子目录ID)\n"
    "- (文档ID)-[包含]->(文本块实体)\n"
    "- (文档ID)-[包含]->(目录ID)\n"
    "```\n"
    "- 正确地将关系信息中的实体名/ID与实体信息关联,还原出图结构。\n"
    "- 将图结构所表达的信息作为用户提问的明细上下文,辅助生成更好的答案。\n"
    "\n"
    "## 约束条件\n"
    "- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。\n"
    "- 若[知识图谱]没有提供信息,此时应根据[上下文]提供的信息回答问题。\n"
    "- 确保以第三人称书写,从客观角度结合[上下文]和[知识图谱]表达的信息回答问题。\n"
    "- 若提供的信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
    "- 避免使用停用词和过于常见的词汇。\n"
    "\n"
    "## 参考案例\n"
    "```\n"
    "[上下文]:\n"
    "Section 1:\n"
    "菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
    "Section 2:\n"
    "菲尔・贾伯的小儿子叫比尔・贾伯。\n"
    "[知识图谱]:\n"
    "Entities:\n"
    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
    "(雅各布・贾伯#菲尔・贾伯的儿子)\n"
    "(美国多地#菲尔兹咖啡的扩展地区)\n"
    "\n"
    "Relationships:\n"
    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子)\n"
    "(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官)\n"
    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "接下来的[上下文]和[知识图谱]的信息,可以帮助你更好地回答用户的问题。\n"
    "\n"
    "[上下文]:\n"
    "{context}\n"
    "\n"
    "[知识图谱]:\n"
    "{graph}\n"
    "\n"
)
 | 
						||
 | 
						||
# English prompt template for hybrid (community summary + subgraph) search.
# It is rendered with ``str.format`` and expects exactly two placeholders:
# ``{context}`` (numbered community summaries) and ``{graph}`` (formatted
# subgraph). Every fragment that ends a markdown line must end with "\n":
# adjacent literals are concatenated as-is, so a missing terminator glues a
# bullet, heading or code fence onto the following line.
HYBRID_SEARCH_PT_EN = (
    "## Role\n"
    "You excel at combining the information provided in the [Context] with "
    "information from the [KnowledgeGraph] to accurately and appropriately "
    "answer user questions, ensuring that you do not output information "
    "unrelated to the context and knowledge graph.\n"
    "\n"
    "## Skills\n"
    "### Skill 1: Context Understanding\n"
    "- Accurately understand the information provided in the [Context], "
    "which may be divided into several sections.\n"
    "- Each section in the context will start with [Section] "
    "and may be numbered as needed.\n"
    "- The context provides a summary description most relevant to the user’s "
    "question, and it should be used wisely.\n"
    "### Skill 2: Knowledge Graph Understanding\n"
    "- Accurately identify entity information in the [Entities:] section and "
    "relationship information in the [Relationships:] section "
    "of the [KnowledgeGraph]. The general format for entity "
    "and relationship information is:\n"
    "```\n"
    "* Entity Information Format:\n"
    "- (entity_name)\n"
    "- (entity_name: entity_description)\n"
    "- (entity_name: entity_property_map)\n"
    "- (chunk_id: chunk_content)\n"
    "- (catalog_id: catalog_name)\n"
    "- (document_id: document_name)\n"
    "\n"
    "* Relationship Information Format:\n"
    "- (source_entity_name)-[relationship_name]->(target_entity_name)\n"
    "- (source_entity_name)-[relationship_name: relationship_description]->"
    "(target_entity_name)\n"
    "- (source_entity_name)-[relationship_name: relationship_property_map]->"
    "(target_entity_name)\n"
    "- (chunk_id)-[Contains]->(entity_name)\n"
    "- (catalog_id)-[Contains]->(chunk_id)\n"
    "- (catalog_id)-[Contains]->(sub_catalog_id)\n"
    "- (document_id)-[Contains]->(chunk_id)\n"
    "- (document_id)-[Contains]->(catalog_id)\n"
    "```\n"
    "- Correctly associate entity names/IDs in the relationship information "
    "with entity information to restore the graph structure.\n"
    "- Use the information expressed by the graph structure as detailed "
    "context for the user's query to assist in generating better answers.\n"
    "\n"
    "## Constraints\n"
    "- Don't describe your thought process in the answer, provide the answer "
    "to the user's question directly without generating irrelevant information.\n"
    "- If the [KnowledgeGraph] does not provide information, you should answer "
    "the question based on the information provided in the [Context].\n"
    "- Ensure to write in the third person, responding to questions from "
    "an objective perspective based on the information combined from the "
    "[Context] and the [KnowledgeGraph].\n"
    "- If the provided information is contradictory, resolve the "
    "contradictions and provide a single, coherent description.\n"
    "- Avoid using stop words and overly common vocabulary.\n"
    "\n"
    "## Reference Example\n"
    "```\n"
    "[Context]:\n"
    "Section 1:\n"
    # The example's context must name the same people as its knowledge graph.
    "Phil Jaber's eldest son is Jacob Jaber.\n"
    "Section 2:\n"
    "Phil Jaber's youngest son is Bill Jaber.\n"
    "[KnowledgeGraph]:\n"
    "Entities:\n"
    "(Phil Jaber#Founder of Philz Coffee)\n"
    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
    "(Jacob Jaber#Son of Phil Jaber)\n"
    "(Multiple locations in the USA#Expansion regions of Philz Coffee)\n"
    "\n"
    "Relationships:\n"
    "(Phil Jaber#Created#Philz Coffee"
    "#Founded in Berkeley, California in 1978)\n"
    "(Philz Coffee#Located in#Berkeley, California"
    "#Founding location of Philz Coffee)\n"
    "(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber)\n"
    "(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005)\n"
    "(Philz Coffee#Expanded to#Multiple locations in the USA"
    "#Expansion regions of Philz Coffee)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "The following information from the [Context] and [KnowledgeGraph] can "
    "help you better answer user questions.\n"
    "\n"
    "[Context]:\n"
    "{context}\n"
    "\n"
    "[KnowledgeGraph]:\n"
    "{graph}\n"
    "\n"
)
 |