Files
DB-GPT/dbgpt/storage/knowledge_graph/community_summary.py
Florian d9e20426fe fix: fix unit test error (#2085)
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: Appointat <kuda.czk@antgroup.com>
2024-10-22 09:35:51 +08:00

546 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Define the CommunitySummaryKnowledgeGraph."""
import logging
import os
import uuid
from typing import List, Optional, Tuple
from dbgpt._private.pydantic import ConfigDict, Field
from dbgpt.core import Chunk
from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer
from dbgpt.rag.transformer.graph_extractor import GraphExtractor
from dbgpt.storage.graph_store.graph import MemoryGraph
from dbgpt.storage.knowledge_graph.base import ParagraphChunk
from dbgpt.storage.knowledge_graph.community.community_store import CommunityStore
from dbgpt.storage.knowledge_graph.knowledge_graph import (
BuiltinKnowledgeGraph,
BuiltinKnowledgeGraphConfig,
)
from dbgpt.storage.vector_store.base import VectorStoreConfig
from dbgpt.storage.vector_store.factory import VectorStoreFactory
from dbgpt.storage.vector_store.filters import MetadataFilters
logger = logging.getLogger(__name__)
class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig):
"""Community summary knowledge graph config."""
model_config = ConfigDict(arbitrary_types_allowed=True)
vector_store_type: str = Field(
default="Chroma",
description="The type of vector store.",
)
user: Optional[str] = Field(
default=None,
description="The user of vector store, if not set, will use the default user.",
)
password: Optional[str] = Field(
default=None,
description=(
"The password of vector store, "
"if not set, will use the default password."
),
)
extract_topk: int = Field(
default=5,
description="Topk of knowledge graph extract",
)
extract_score_threshold: float = Field(
default=0.3,
description="Recall score of knowledge graph extract",
)
community_topk: int = Field(
default=50,
description="Topk of community search in knowledge graph",
)
community_score_threshold: float = Field(
default=0.0,
description="Recall score of community search in knowledge graph",
)
knowledge_graph_chunk_search_top_size: int = Field(
default=5,
description="Top size of knowledge graph chunk search",
)
class CommunitySummaryKnowledgeGraph(BuiltinKnowledgeGraph):
"""Community summary knowledge graph class."""
def __init__(self, config: CommunitySummaryKnowledgeGraphConfig):
"""Initialize community summary knowledge graph class."""
super().__init__(config)
self._config = config
self._vector_store_type = os.getenv(
"VECTOR_STORE_TYPE", config.vector_store_type
)
self._extract_topk = int(
os.getenv("KNOWLEDGE_GRAPH_EXTRACT_SEARCH_TOP_SIZE", config.extract_topk)
)
self._extract_score_threshold = float(
os.getenv(
"KNOWLEDGE_GRAPH_EXTRACT_SEARCH_RECALL_SCORE",
config.extract_score_threshold,
)
)
self._community_topk = int(
os.getenv(
"KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_TOP_SIZE", config.community_topk
)
)
self._community_score_threshold = float(
os.getenv(
"KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_RECALL_SCORE",
config.community_score_threshold,
)
)
def extractor_configure(name: str, cfg: VectorStoreConfig):
cfg.name = name
cfg.embedding_fn = config.embedding_fn
cfg.max_chunks_once_load = config.max_chunks_once_load
cfg.max_threads = config.max_threads
cfg.user = config.user
cfg.password = config.password
cfg.topk = self._extract_topk
cfg.score_threshold = self._extract_score_threshold
self._graph_extractor = GraphExtractor(
self._llm_client,
self._model_name,
VectorStoreFactory.create(
self._vector_store_type,
config.name + "_CHUNK_HISTORY",
extractor_configure,
),
)
def community_store_configure(name: str, cfg: VectorStoreConfig):
cfg.name = name
cfg.embedding_fn = config.embedding_fn
cfg.max_chunks_once_load = config.max_chunks_once_load
cfg.max_threads = config.max_threads
cfg.user = config.user
cfg.password = config.password
cfg.topk = self._community_topk
cfg.score_threshold = self._community_score_threshold
self._community_store = CommunityStore(
self._graph_store_apdater,
CommunitySummarizer(self._llm_client, self._model_name),
VectorStoreFactory.create(
self._vector_store_type,
config.name + "_COMMUNITY_SUMMARY",
community_store_configure,
),
)
def get_config(self) -> BuiltinKnowledgeGraphConfig:
"""Get the knowledge graph config."""
return self._config
async def aload_document(self, chunks: List[Chunk]) -> List[str]:
"""Extract and persist graph from the document file."""
await self._aload_document_graph(chunks)
await self._aload_triplet_graph(chunks)
await self._community_store.build_communities()
return [chunk.chunk_id for chunk in chunks]
async def _aload_document_graph(self, chunks: List[Chunk]) -> None:
"""Load the knowledge graph from the chunks.
The chunks include the doc structure.
"""
if not self._graph_store.get_config().document_graph_enabled:
return
_chunks: List[ParagraphChunk] = [
ParagraphChunk.model_validate(chunk.model_dump()) for chunk in chunks
]
documment_chunk, paragraph_chunks = self._load_chunks(_chunks)
# upsert the document and chunks vertices
self._graph_store_apdater.upsert_documents(iter([documment_chunk]))
self._graph_store_apdater.upsert_chunks(iter(paragraph_chunks))
# upsert the document structure
for chunk_index, chunk in enumerate(paragraph_chunks):
# document -> include -> chunk
if chunk.parent_is_document:
self._graph_store_apdater.upsert_doc_include_chunk(chunk=chunk)
else: # chunk -> include -> chunk
self._graph_store_apdater.upsert_chunk_include_chunk(chunk=chunk)
# chunk -> next -> chunk
if chunk_index >= 1:
self._graph_store_apdater.upsert_chunk_next_chunk(
chunk=paragraph_chunks[chunk_index - 1], next_chunk=chunk
)
async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None:
"""Load the knowledge graph from the chunks.
The chunks include the doc structure.
"""
if not self._graph_store.get_config().triplet_graph_enabled:
return
document_graph_enabled = self._graph_store.get_config().document_graph_enabled
for chunk in chunks:
# TODO: Use asyncio to extract graph to accelerate the process
# (attention to the CAP of the graph db)
graphs: List[MemoryGraph] = await self._graph_extractor.extract(
chunk.content
)
for graph in graphs:
if document_graph_enabled:
# append the chunk id to the edge
for edge in graph.edges():
edge.set_prop("_chunk_id", chunk.chunk_id)
graph.append_edge(edge=edge)
# upsert the graph
self._graph_store_apdater.upsert_graph(graph)
# chunk -> include -> entity
if document_graph_enabled:
for vertex in graph.vertices():
self._graph_store_apdater.upsert_chunk_include_entity(
chunk=chunk, entity=vertex
)
def _load_chunks(
self, chunks: List[ParagraphChunk]
) -> Tuple[ParagraphChunk, List[ParagraphChunk]]:
"""Load the chunks, and add the parent-child relationship within chunks."""
# init default document
doc_id = str(uuid.uuid4())
doc_name = os.path.basename(chunks[0].metadata["source"] or "Text_Node")
doc_chunk = ParagraphChunk(
chunk_id=doc_id,
chunk_name=doc_name,
)
# chunk.metadata = {"Header0": "title", "Header1": "title", ..., "source": "source_path"} # noqa: E501
for chunk_index, chunk in enumerate(chunks):
parent = None
directory_keys = list(chunk.metadata.keys())[
:-1
] # ex: ['Header0', 'Header1', 'Header2', ...]
parent_level = directory_keys[-2] if len(directory_keys) > 1 else None
current_level = directory_keys[-1] if directory_keys else "Header0"
chunk.chunk_name = chunk.metadata.get(current_level, "none_header_chunk")
# Find the parent chunk for every chunk
# parent chunk -> chunk
if parent_level:
for parent_direct in reversed(directory_keys[:-1]):
parent_titile = chunk.metadata.get(parent_direct, None)
for n in reversed(range(chunk_index)):
metadata = chunks[n].metadata
keys = list(metadata.keys())[:-1]
if (
metadata
and parent_direct == keys[-1]
and parent_titile == metadata.get(parent_direct)
):
parent = chunks[n]
chunk.chunk_parent_id = parent.chunk_id
chunk.chunk_parent_name = parent_titile
chunk.parent_content = parent.content
break
if chunk_index - n > len(directory_keys):
break
if chunk.chunk_parent_id:
break
if not chunk.chunk_parent_id:
chunk.chunk_parent_id = doc_id
chunk.chunk_parent_name = doc_name
chunk.parent_content = ""
chunk.parent_is_document = True
return doc_chunk, chunks
def similar_search(
self, text: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Similar search in index database.
Args:
text(str): The query text.
topk(int): The number of similar documents to return.
filters(Optional[MetadataFilters]): metadata filters.
Return:
List[Chunk]: The similar documents.
"""
return []
async def asimilar_search_with_scores(
self,
text,
topk,
score_threshold: float,
filters: Optional[MetadataFilters] = None,
) -> List[Chunk]:
"""Retrieve relevant community summaries."""
# Global search: retrieve relevant community summaries
communities = await self._community_store.search_communities(text)
summaries = [
f"Section {i + 1}:\n{community.summary}"
for i, community in enumerate(communities)
]
context = "\n".join(summaries) if summaries else ""
keywords: List[str] = await self._keyword_extractor.extract(text)
# Local search: extract keywords and explore subgraph
triplet_graph_enabled = self._graph_store.get_config().triplet_graph_enabled
document_graph_enabled = self._graph_store.get_config().document_graph_enabled
if triplet_graph_enabled:
subgraph: MemoryGraph = self._graph_store_apdater.explore(
subs=keywords, limit=topk, search_scope="knowledge_graph"
)
if document_graph_enabled:
keywords_for_document_graph = keywords
for vertex in subgraph.vertices():
keywords_for_document_graph.append(vertex.name)
subgraph_for_doc = self._graph_store_apdater.explore(
subs=keywords_for_document_graph,
limit=self._config.knowledge_graph_chunk_search_top_size,
search_scope="document_graph",
)
else:
if document_graph_enabled:
subgraph_for_doc = self._graph_store_apdater.explore(
subs=keywords,
limit=self._config.knowledge_graph_chunk_search_top_size,
search_scope="document_graph",
)
knowledge_graph_str = subgraph.format() if subgraph else ""
knowledge_graph_for_doc_str = (
subgraph_for_doc.format() if subgraph_for_doc else ""
)
logger.info(f"Search subgraph from the following keywords:\n{len(keywords)}")
if not (summaries or knowledge_graph_str or knowledge_graph_for_doc_str):
return []
# merge search results into context
content = HYBRID_SEARCH_PT_CN.format(
context=context,
knowledge_graph=knowledge_graph_str,
knowledge_graph_for_doc=knowledge_graph_for_doc_str,
)
logger.info(f"Final GraphRAG queried prompt:\n{content}")
return [Chunk(content=content)]
def truncate(self) -> List[str]:
"""Truncate knowledge graph."""
logger.info("Truncate community store")
self._community_store.truncate()
logger.info("Truncate keyword extractor")
self._keyword_extractor.truncate()
logger.info("Truncate triplet extractor")
self._graph_extractor.truncate()
return [self._config.name]
def delete_vector_name(self, index_name: str):
"""Delete knowledge graph."""
logger.info("Drop community store")
self._community_store.drop()
logger.info("Drop keyword extractor")
self._keyword_extractor.drop()
logger.info("Drop triplet extractor")
self._graph_extractor.drop()
HYBRID_SEARCH_PT_CN = """## 角色
你非常擅长结合提示词模板提供的[上下文]信息与[知识图谱]信息,
准确恰当地回答用户的问题,并保证不会输出与上下文和知识图谱无关的信息。
## 技能
### 技能 1: 上下文理解
- 准确地理解[上下文]提供的信息,上下文信息可能被拆分为多个章节。
- 上下文的每个章节内容都会以[Section]开始,并按需进行了编号。
- 上下文信息提供了与用户问题相关度最高的总结性描述,请合理使用它们。
### 技能 2: 知识图谱理解
- 准确地识别[知识图谱]中提供的[Entities:]章节中的实体信息和[Relationships:]章节中的关系信息,实体和关系信息的一般格式为:
```
* 实体信息格式:
- (实体名)
- (实体名:实体描述)
- (实体名:实体属性表)
- (文本块ID:文档块内容)
- (目录ID:目录名)
- (文档ID:文档名称)
* 关系信息的格式:
- (来源实体名)-[关系名]->(目标实体名)
- (来源实体名)-[关系名:关系描述]->(目标实体名)
- (来源实体名)-[关系名:关系属性表]->(目标实体名)
- (文本块实体)-[包含]->(实体名)
- (目录ID)-[包含]->(文本块实体)
- (目录ID)-[包含]->(子目录ID)
- (文档ID)-[包含]->(文本块实体)
- (文档ID)-[包含]->(目录ID)
```
- 正确地将关系信息中的实体名/ID与实体信息关联还原出图结构。
- 将图结构所表达的信息作为用户提问的明细上下文,辅助生成更好的答案。
## 约束条件
- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。
- 若[知识图谱]或者[知识库原文]没有提供信息,此时应根据[上下文]提供的信息回答问题。
- 确保以第三人称书写,从客观角度结合[上下文]、[知识图谱]和[知识库原文]表达的信息回答问题。
- 若提供的信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。
- 避免使用停用词和过于常见的词汇。
## 参考案例
```
[上下文]:
Section 1:
菲尔・贾伯的大儿子叫雅各布・贾伯。
Section 2:
菲尔・贾伯的小儿子叫比尔・贾伯。
[知识图谱]:
Entities:
(菲尔・贾伯#菲尔兹咖啡创始人)
(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)
(雅各布・贾伯#菲尔・贾伯的儿子)
(美国多地#菲尔兹咖啡的扩展地区)
Relationships:
(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)
(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)
(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子)
(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官)
(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)
[知识库原文]:
...
```
----
接下来的[上下文]、[知识图谱]和[知识库原文]的信息,可以帮助你回答更好地用户的问题。
[上下文]:
{context}
[知识图谱]:
{knowledge_graph}
[知识库原文]
{knowledge_graph_for_doc}
""" # noqa: E501
HYBRID_SEARCH_PT_EN = """## Role
You excel at combining the information provided in the [Context] with
information from the [KnowledgeGraph] to accurately and appropriately
answer user questions, ensuring that you do not output information
unrelated to the context and knowledge graph.
## Skills
### Skill 1: Context Understanding
- Accurately understand the information provided in the [Context],
which may be divided into several sections.
- Each section in the context will start with [Section]
and may be numbered as needed.
- The context provides a summary description most relevant to the user's
question, and it should be used wisely.
### Skill 2: Knowledge Graph Understanding
- Accurately identify entity information in the [Entities:] section and
relationship information in the [Relationships:] section
of the [KnowledgeGraph]. The general format for entity
and relationship information is:
```
* Entity Information Format:
- (entity_name)
- (entity_name: entity_description)
- (entity_name: entity_property_map)
- (chunk_id: chunk_content)
- (catalog_id: catalog_name)
- (document_id: document_name)
* Relationship Information Format:
- (source_entity_name)-[relationship_name]->(target_entity_name)
- (source_entity_name)-[relationship_name: relationship_description]->(target_entity_name)
- (source_entity_name)-[relationship_name: relationship_property_map]->(target_entity_name)
- (chunk_id)-[Contains]->(entity_name)
- (catalog_id)-[Contains]->(chunk_id)
- (catalog_id)-[Contains]->(sub_catalog_id)
- (document_id)-[Contains]->(chunk_id)
- (document_id)-[Contains]->(catalog_id)
```
- Correctly associate entity names/IDs in the relationship information
with entity information to restore the graph structure.
- Use the information expressed by the graph structure as detailed
context for the user's query to assist in generating better answers.
## Constraints
- Don't describe your thought process in the answer, provide the answer
to the user's question directly without generating irrelevant information.
- If the [KnowledgeGraph] or [Knowledge base original text] does not provide information, you should answer
the question based on the information provided in the [Context].
- Ensure to write in the third person, responding to questions from
an objective perspective based on the information combined from the
[Context], the [KnowledgeGraph] and the [Knowledge base original text].
- If the provided information is contradictory, resolve the
contradictions and provide a single, coherent description.
- Avoid using stop words and overly common vocabulary.
## Reference Example
```
[Context]:
Section 1:
Phil Schiller's eldest son is Jacob Schiller.
Section 2:
Phil Schiller's youngest son is Bill Schiller.
[KnowledgeGraph]:
Entities:
(Phil Jaber#Founder of Philz Coffee)
(Philz Coffee#Coffee brand founded in Berkeley, California)
(Jacob Jaber#Son of Phil Jaber)
(Multiple locations in the USA#Expansion regions of Philz Coffee)
Relationships:
(Phil Jaber#Created#Philz Coffee#Founded in Berkeley, California in 1978)
(Philz Coffee#Located in#Berkeley, California#Founding location of Philz Coffee)
(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber)
(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005)
(Philz Coffee#Expanded to#Multiple locations in the USA#Expansion regions of Philz Coffee)
[Knowledge base original text]
...
```
----
The following information from the [Context], [KnowledgeGraph] and [Knowledge base original text]
can help you better answer user questions.
[Context]:
{context}
[KnowledgeGraph]:
{knowledge_graph}
[Knowledge base original text]
{knowledge_graph_for_doc}
""" # noqa: E501