Mirror of https://github.com/csunny/DB-GPT.git (synced 2025-09-04 10:34:30 +00:00)

✨ feat(GraphRAG): enhance GraphRAG by graph community summary (#1801)

Co-authored-by: Florian <fanzhidongyzby@163.com>
Co-authored-by: KingSkyLi <15566300566@163.com>
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
Co-authored-by: yvonneyx <zhuyuxin0627@gmail.com>
@@ -71,7 +71,7 @@ EMBEDDING_MODEL=text2vec
 #EMBEDDING_MODEL=bge-large-zh
 KNOWLEDGE_CHUNK_SIZE=500
 KNOWLEDGE_SEARCH_TOP_SIZE=5
-KNOWLEDGE_GRAPH_SEARCH_TOP_SIZE=50
+KNOWLEDGE_GRAPH_SEARCH_TOP_SIZE=200
 ## Maximum number of chunks to load at once, if your single document is too large,
 ## you can set this value to a higher value for better performance.
 ## if out of memory when load large document, you can set this value to a lower value.
@@ -157,6 +157,11 @@ EXECUTE_LOCAL_COMMANDS=False
 #*******************************************************************#
 VECTOR_STORE_TYPE=Chroma
 GRAPH_STORE_TYPE=TuGraph
+GRAPH_COMMUNITY_SUMMARY_ENABLED=True
+KNOWLEDGE_GRAPH_EXTRACT_SEARCH_TOP_SIZE=5
+KNOWLEDGE_GRAPH_EXTRACT_SEARCH_RECALL_SCORE=0.3
+KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_TOP_SIZE=20
+KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_RECALL_SCORE=0.0

 ### Chroma vector db config
 #CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data
@@ -187,7 +192,7 @@ ElasticSearch_PASSWORD={your_password}
 #TUGRAPH_PASSWORD=73@TuGraph
 #TUGRAPH_VERTEX_TYPE=entity
 #TUGRAPH_EDGE_TYPE=relation
-#TUGRAPH_EDGE_NAME_KEY=label
+#TUGRAPH_PLUGIN_NAMES=leiden

 #*******************************************************************#
 #** WebServer Language Support **#
.gitignore (vendored, 3 lines changed)
@@ -4,7 +4,6 @@ __pycache__/
 *$py.class

 # C extensions
-*.so
 message/
 dbgpt/util/extensions/
 .env*
@@ -185,4 +184,4 @@ thirdparty
 /examples/**/*.gv
 /examples/**/*.gv.pdf
 /i18n/locales/**/**/*_ai_translated.po
-/i18n/locales/**/**/*~
+/i18n/locales/**/**/*~
@@ -213,6 +213,9 @@ class Config(metaclass=Singleton):

         # Vector Store Configuration
         self.VECTOR_STORE_TYPE = os.getenv("VECTOR_STORE_TYPE", "Chroma")
+        self.GRAPH_COMMUNITY_SUMMARY_ENABLED = (
+            os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
+        )
         self.MILVUS_URL = os.getenv("MILVUS_URL", "127.0.0.1")
         self.MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
         self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)
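A note on the flag parsing added above: the feature turns on only when the variable is exactly the word "true" in any casing. A minimal standalone check (the value set here is illustrative):

import os

os.environ["GRAPH_COMMUNITY_SUMMARY_ENABLED"] = "True"
enabled = os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
print(enabled)  # True; values like "1" or "yes" would leave the feature off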
@@ -112,13 +112,15 @@ def arguments(space_id: str):


 @router.post("/knowledge/{space_name}/recall_test")
-def recall_test(
+async def recall_test(
     space_name: str,
     request: DocumentRecallTestRequest,
 ):
     print(f"/knowledge/{space_name}/recall_test params:")
     try:
-        return Result.succ(knowledge_space_service.recall_test(space_name, request))
+        return Result.succ(
+            await knowledge_space_service.recall_test(space_name, request)
+        )
     except Exception as e:
         return Result.failed(code="E000X", msg=f"{space_name} recall_test error {e}")
@@ -309,7 +309,7 @@ class KnowledgeService:
         """
         return knowledge_space_dao.get_knowledge_space_by_ids(ids)

-    def recall_test(
+    async def recall_test(
         self, space_name, doc_recall_test_request: DocumentRecallTestRequest
     ):
         logger.info(f"recall_test {space_name}, {doc_recall_test_request}")
@@ -338,7 +338,7 @@ class KnowledgeService:
         knowledge_space_retriever = KnowledgeSpaceRetriever(
             space_id=space.id, top_k=top_k
         )
-        chunks = knowledge_space_retriever.retrieve_with_scores(
+        chunks = await knowledge_space_retriever.aretrieve_with_scores(
             question, score_threshold
         )
         retrievers_end_time = timeit.default_timer()
@@ -646,13 +646,16 @@ class KnowledgeService:
         graph = vector_store_connector.client.query_graph(limit=limit)
         res = {"nodes": [], "edges": []}
         for node in graph.vertices():
-            res["nodes"].append({"vid": node.vid})
-        for edge in graph.edges():
-            res["edges"].append(
+            res["nodes"].append(
                 {
-                    "src": edge.sid,
-                    "dst": edge.tid,
-                    "label": edge.props[graph.edge_label],
+                    "id": node.vid,
+                    "communityId": node.get_prop("_community_id"),
+                    "name": node.vid,
+                    "type": "",
                 }
             )
+        for edge in graph.edges():
+            res["edges"].append(
+                {"source": edge.sid, "target": edge.tid, "name": edge.name, "type": ""}
+            )
         return res
@@ -1,7 +1,7 @@
 """TuGraph Connector."""

 import json
-from typing import Dict, List, cast
+from typing import Dict, Generator, List, cast

 from .base import BaseConnector

@@ -23,11 +23,16 @@ class TuGraphConnector(BaseConnector):
     def create_graph(self, graph_name: str) -> None:
         """Create a new graph."""
         # run the query to get vertex labels
-        with self._driver.session(database="default") as session:
-            graph_list = session.run("CALL dbms.graph.listGraphs()").data()
-            exists = any(item["graph_name"] == graph_name for item in graph_list)
-            if not exists:
-                session.run(f"CALL dbms.graph.createGraph('{graph_name}', '', 2048)")
+        try:
+            with self._driver.session(database="default") as session:
+                graph_list = session.run("CALL dbms.graph.listGraphs()").data()
+                exists = any(item["graph_name"] == graph_name for item in graph_list)
+                if not exists:
+                    session.run(
+                        f"CALL dbms.graph.createGraph('{graph_name}', '', 2048)"
+                    )
+        except Exception as e:
+            raise Exception(f"Failed to create graph '{graph_name}': {str(e)}")

     def delete_graph(self, graph_name: str) -> None:
         """Delete a graph."""
@@ -89,10 +94,19 @@ class TuGraphConnector(BaseConnector):
         self._driver.close()

     def run(self, query: str, fetch: str = "all") -> List:
         """Run query."""
         with self._driver.session(database=self._graph) as session:
+            try:
+                result = session.run(query)
+                return list(result)
+            except Exception as e:
+                raise Exception(f"Query execution failed: {e}")
+
+    def run_stream(self, query: str) -> Generator:
+        """Run GQL."""
+        with self._driver.session(database=self._graph) as session:
             result = session.run(query)
-            return list(result)
+            yield from result

     def get_columns(self, table_name: str, table_type: str = "vertex") -> List[Dict]:
         """Get fields about specified graph.
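A brief usage sketch of the two query paths (hedged: `connector` stands for an already-constructed TuGraphConnector and a reachable TuGraph server is assumed):

# `connector` is an already-initialized TuGraphConnector (hypothetical instance).
# run() materializes the whole result as a list; run_stream() is a generator,
# so the underlying session opens lazily and stays open until iteration ends.
graphs = connector.run("CALL dbms.graph.listGraphs()")
for record in connector.run_stream("MATCH (n) RETURN n LIMIT 10"):
    print(record)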
@@ -20,19 +20,19 @@ from .embeddings import ( # noqa: F401
 from .rerank import CrossEncoderRerankEmbeddings, OpenAPIRerankEmbeddings # noqa: F401

 __ALL__ = [
+    "CrossEncoderRerankEmbeddings",
+    "DefaultEmbeddingFactory",
+    "EmbeddingFactory",
     "Embeddings",
     "HuggingFaceBgeEmbeddings",
     "HuggingFaceEmbeddings",
     "HuggingFaceInferenceAPIEmbeddings",
     "HuggingFaceInstructEmbeddings",
     "JinaEmbeddings",
-    "OpenAPIEmbeddings",
     "OllamaEmbeddings",
-    "DefaultEmbeddingFactory",
-    "EmbeddingFactory",
-    "WrappedEmbeddingFactory",
-    "TongYiEmbeddings",
-    "CrossEncoderRerankEmbeddings",
+    "OpenAPIEmbeddings",
     "OpenAPIRerankEmbeddings",
     "QianFanEmbeddings",
+    "TongYiEmbeddings",
+    "WrappedEmbeddingFactory",
 ]
@@ -54,6 +54,10 @@ class IndexStoreBase(ABC):
         """Init index store."""
         self._executor = executor or ThreadPoolExecutor()

+    @abstractmethod
+    def get_config(self) -> IndexStoreConfig:
+        """Get the index store config."""
+
     @abstractmethod
     def load_document(self, chunks: List[Chunk]) -> List[str]:
         """Load document in index database.
@@ -104,6 +108,10 @@ class IndexStoreBase(ABC):
             ids(str): The vector ids to delete, separated by comma.
         """

+    @abstractmethod
+    def truncate(self) -> List[str]:
+        """Truncate data by name."""
+
     @abstractmethod
     def delete_vector_name(self, index_name: str):
         """Delete index by name.
@@ -188,7 +196,7 @@ class IndexStoreBase(ABC):
         Return:
             List[Chunk]: The similar documents.
         """
-        return self.similar_search_with_scores(text, topk, 1.0, filters)
+        return self.similar_search_with_scores(text, topk, 0.0, filters)

     async def asimilar_search_with_scores(
         self,
@@ -9,11 +9,27 @@ logger = logging.getLogger(__name__)
 class TransformerBase:
     """Transformer base class."""

+    @abstractmethod
+    def truncate(self):
+        """Truncate operation."""
+
+    @abstractmethod
+    def drop(self):
+        """Clean operation."""
+

 class EmbedderBase(TransformerBase, ABC):
     """Embedder base class."""


+class SummarizerBase(TransformerBase, ABC):
+    """Summarizer base class."""
+
+    @abstractmethod
+    async def summarize(self, **args) -> str:
+        """Summarize result."""
+
+
 class ExtractorBase(TransformerBase, ABC):
     """Extractor base class."""
dbgpt/rag/transformer/community_summarizer.py (new file, 208 lines)
@@ -0,0 +1,208 @@
"""CommunitySummarizer class."""

import logging

from dbgpt.core import LLMClient
from dbgpt.rag.transformer.llm_summarizer import LLMSummarizer

logger = logging.getLogger(__name__)


class CommunitySummarizer(LLMSummarizer):
    """CommunitySummarizer class."""

    def __init__(self, llm_client: LLMClient, model_name: str):
        """Initialize the CommunitySummaryExtractor."""
        super().__init__(llm_client, model_name, COMMUNITY_SUMMARY_PT_CN)


COMMUNITY_SUMMARY_PT_CN = (
    "## 角色\n"
    "你非常擅长知识图谱的信息总结,能根据给定的知识图谱中的实体和关系的名称以及描述"
    "信息,全面、恰当地对知识图谱子图信息做出总结性描述,并且不会丢失关键的信息。\n"
    "\n"
    "## 技能\n"
    "### 技能 1: 实体识别\n"
    "- 准确地识别[Entities:]章节中的实体信息,包括实体名、实体描述信息。\n"
    "- 实体信息的一般格式有:\n"
    "(实体名)\n"
    "(实体名:实体描述)\n"
    "(实体名:实体属性表)\n"
    "(文本块ID:文档块内容)\n"
    "(目录ID:目录名)\n"
    "(文档ID:文档名称)\n"
    "\n"
    "### 技能 2: 关系识别\n"
    "- 准确地识别[Relationships:]章节中的关系信息,包括来源实体名、关系名、"
    "目标实体名、关系描述信息,实体名也可能是文档ID、目录ID、文本块ID。\n"
    "- 关系信息的一般格式有:\n"
    "(来源实体名)-[关系名]->(目标实体名)\n"
    "(来源实体名)-[关系名:关系描述]->(目标实体名)\n"
    "(来源实体名)-[关系名:关系属性表]->(目标实体名)\n"
    "(文本块ID)-[包含]->(实体名)\n"
    "(目录ID)-[包含]->(文本块实体)\n"
    "(目录ID)-[包含]->(子目录ID)\n"
    "(文档ID)-[包含]->(文本块实体)\n"
    "(文档ID)-[包含]->(目录ID)\n"
    "\n"
    "### 技能 3: 图结构理解\n"
    "--请按照如下步骤理解图结构--\n"
    "1. 正确地将关系信息中的来源实体名与实体信息关联。\n"
    "2. 正确地将关系信息中的目标实体名与实体信息关联。\n"
    "3. 根据提供的关系信息还原出图结构。"
    "\n"
    "### 技能 4: 知识图谱总结\n"
    "--请按照如下步骤总结知识图谱--\n"
    "1. 确定知识图谱表达的主题或话题,突出关键实体和关系。"
    "2. 使用准确、恰当、简洁的语言总结图结构表达的信息,不要生成与图结构中无关的信息。"
    "\n"
    "## 约束条件\n"
    "- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。\n"
    "- 确保以第三人称书写,从客观角度对知识图谱表达的信息进行总结性描述。\n"
    "- 如果实体或关系的描述信息为空,对最终的总结信息没有贡献,不要生成无关信息。\n"
    "- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
    "- 避免使用停用词和过于常见的词汇。\n"
    "\n"
    "## 参考案例\n"
    "--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
    "输入:\n"
    "```\n"
    "Entities:\n"
    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
    "(雅各布・贾伯#菲尔・贾伯的儿子)\n"
    "(美国多地#菲尔兹咖啡的扩展地区)\n"
    "\n"
    "Relationships:\n"
    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子)\n"
    "(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官)\n"
    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
    "```\n"
    "\n"
    "输出:\n"
    "```\n"
    "菲尔兹咖啡是由菲尔・贾伯在1978年于加利福尼亚州伯克利创立的咖啡品牌。"
    "菲尔・贾伯的儿子雅各布・贾伯在2005年接任首席执行官,领导公司扩展到了美国多地,"
    "进一步巩固了菲尔兹咖啡作为加利福尼亚州伯克利创立的咖啡品牌的市场地位。\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "请根据接下来[知识图谱]提供的信息,按照上述要求,总结知识图谱表达的信息。\n"
    "\n"
    "[知识图谱]:\n"
    "{graph}\n"
    "\n"
    "[总结]:\n"
    "\n"
)

COMMUNITY_SUMMARY_PT_EN = (
    "## Role\n"
    "You are highly skilled in summarizing information from knowledge graphs. "
    "Based on the names and descriptions of entities and relationships in a "
    "given knowledge graph, you can comprehensively and appropriately summarize"
    " the information of the subgraph without losing critical details.\n"
    "\n"
    "## Skills\n"
    "### Skill 1: Entity Recognition\n"
    "- Accurately recognize entity information in the [Entities:] section, "
    "including entity names and descriptions.\n"
    "- The general formats for entity information are:\n"
    "(entity_name)\n"
    "(entity_name: entity_description)\n"
    "(entity_name: entity_property_map)\n"
    "(chunk_id: chunk_content)\n"
    "(catalog_id: catalog_name)\n"
    "(document_id: document_name)\n"
    "\n"
    "### Skill 2: Relationship Recognition\n"
    "- Accurately recognize relationship information in the [Relationships:] "
    "section, including source_entity_name, relationship_name, "
    "target_entity_name, and relationship_description, The entity_name may "
    "also be the document_id, catalog_id, or chunk_id.\n"
    "- The general formats for relationship information are:\n"
    "(source_entity_name)-[relationship_name]->(target_entity_name)\n"
    "(source_entity_name)-[relationship_name: relationship_description]->"
    "(target_entity_name)\n"
    "(source_entity_name)-[relationship_name: relationship_property_map]->"
    "(target_entity_name)\n"
    "(chunk_id)-[Contains]->(entity_name)\n"
    "(catalog_id)-[Contains]->(chunk_id)\n"
    "(catalog_id)-[Contains]->(sub_catalog_id)\n"
    "(document_id)-[Contains]->(chunk_id)\n"
    "(document_id)-[Contains]->(catalog_id)\n"
    "\n"
    "### Skill 3: Graph Structure Understanding\n"
    "--Follow these steps to understand the graph structure--\n"
    "1. Correctly associate the source entity name in the "
    "relationship information with the entity information.\n"
    "2. Correctly associate the target entity name in the "
    "relationship information with the entity information.\n"
    "3. Reconstruct the graph structure based on the provided "
    "relationship information."
    "\n"
    "### Skill 4: Knowledge Graph Summarization\n"
    "--Follow these steps to summarize the knowledge graph--\n"
    "1. Determine the theme or topic expressed by the knowledge graph, "
    "highlighting key entities and relationships."
    "2. Use accurate, appropriate, and concise language to summarize "
    "the information expressed by the graph "
    "without generating irrelevant information."
    "\n"
    "## Constraints\n"
    "- Don't describe your thought process in the answer, provide the answer "
    "to the user's question directly without generating irrelevant information."
    "- Ensure the summary is written in the third person and objectively "
    "reflects the information conveyed by the knowledge graph.\n"
    "- If the descriptions of entities or relationships are empty and "
    "contribute nothing to the final summary, "
    "do not generate unrelated information.\n"
    "- If the provided descriptions are contradictory, resolve the conflicts "
    "and provide a single, coherent description.\n"
    "- Avoid using stop words and overly common words.\n"
    "\n"
    "## Reference Example\n"
    "--The case is only to help you understand the input and output format of "
    "the prompt, please do not use it in your answer.--\n"
    "Input:\n"
    "```\n"
    "Entities:\n"
    "(Phil Jaber#Founder of Philz Coffee)\n"
    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
    "(Jacob Jaber#Son of Phil Jaber)\n"
    "(Multiple locations in the USA#Expansion regions of Philz Coffee)\n"
    "\n"
    "Relationships:\n"
    "(Phil Jaber#Created#Philz Coffee"
    "#Founded in Berkeley, California in 1978)\n"
    "(Philz Coffee#Located in#Berkeley, California"
    "#Founding location of Philz Coffee)\n"
    "(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber)\n"
    "(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005)\n"
    "(Philz Coffee#Expanded to#Multiple locations in the USA"
    "#Expansion regions of Philz Coffee)\n"
    "```\n"
    "\n"
    "Output:\n"
    "```\n"
    "Philz Coffee is a coffee brand founded by Phil Jaber in 1978 in "
    "Berkeley, California. Phil Jaber's son, Jacob Jaber, took over as CEO in "
    "2005, leading the company to expand to multiple locations in the USA, "
    "further solidifying Philz Coffee's market position as a coffee brand "
    "founded in Berkeley, California.\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "Please summarize the information expressed by the [KnowledgeGraph] "
    "provided in the following section according to the above requirements.\n"
    "\n"
    "[KnowledgeGraph]:\n"
    "{graph}\n"
    "\n"
    "[Summary]:\n"
    "\n"
)
dbgpt/rag/transformer/graph_extractor.py (new file, 304 lines)
@@ -0,0 +1,304 @@
"""GraphExtractor class."""

import logging
import re
from typing import List, Optional

from dbgpt.core import Chunk, LLMClient
from dbgpt.rag.transformer.llm_extractor import LLMExtractor
from dbgpt.storage.graph_store.graph import Edge, Graph, MemoryGraph, Vertex
from dbgpt.storage.vector_store.base import VectorStoreBase

logger = logging.getLogger(__name__)


class GraphExtractor(LLMExtractor):
    """GraphExtractor class."""

    def __init__(
        self, llm_client: LLMClient, model_name: str, chunk_history: VectorStoreBase
    ):
        """Initialize the GraphExtractor."""
        super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN)
        self._chunk_history = chunk_history

        config = self._chunk_history.get_config()
        self._vector_space = config.name
        self._max_chunks_once_load = config.max_chunks_once_load
        self._max_threads = config.max_threads
        self._topk = config.topk
        self._score_threshold = config.score_threshold

    async def extract(self, text: str, limit: Optional[int] = None) -> List:
        """Load similar chunks."""
        # load similar chunks
        chunks = await self._chunk_history.asimilar_search_with_scores(
            text, self._topk, self._score_threshold
        )
        history = [
            f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks)
        ]
        context = "\n".join(history) if history else ""

        try:
            # extract with chunk history
            return await super()._extract(text, context, limit)

        finally:
            # save chunk to history
            await self._chunk_history.aload_document_with_limit(
                [Chunk(content=text, metadata={"relevant_cnt": len(history)})],
                self._max_chunks_once_load,
                self._max_threads,
            )

    def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]:
        graph = MemoryGraph()
        edge_count = 0
        current_section = None
        for line in text.split("\n"):
            line = line.strip()
            if line in ["Entities:", "Relationships:"]:
                current_section = line[:-1]
            elif line and current_section:
                if current_section == "Entities":
                    match = re.match(r"\((.*?)#(.*?)\)", line)
                    if match:
                        name, summary = [part.strip() for part in match.groups()]
                        graph.upsert_vertex(Vertex(name, description=summary))
                elif current_section == "Relationships":
                    match = re.match(r"\((.*?)#(.*?)#(.*?)#(.*?)\)", line)
                    if match:
                        source, name, target, summary = [
                            part.strip() for part in match.groups()
                        ]
                        edge_count += 1
                        graph.append_edge(
                            Edge(source, target, name, description=summary)
                        )

            if limit and edge_count >= limit:
                break

        return [graph]

    def truncate(self):
        """Truncate chunk history."""
        self._chunk_history.truncate()

    def drop(self):
        """Drop chunk history."""
        self._chunk_history.delete_vector_name(self._vector_space)


GRAPH_EXTRACT_PT_CN = (
    "## 角色\n"
    "你是一个知识图谱工程专家,非常擅长从文本中精确抽取知识图谱的实体"
    "(主体、客体)和关系,并能对实体和关系的含义做出恰当的总结性描述。\n"
    "\n"
    "## 技能\n"
    "### 技能 1: 实体抽取\n"
    "--请按照如下步骤抽取实体--\n"
    "1. 准确地识别文本中的实体信息,一般是名词、代词等。\n"
    "2. 准确地识别实体的修饰性描述,一般作为定语对实体特征做补充。\n"
    "3. 对相同概念的实体(同义词、别称、代指),请合并为单一简洁的实体名,"
    "并合并它们的描述信息。\n"
    "4. 对合并后的实体描述信息做简洁、恰当、连贯的总结。\n"
    "\n"
    "### 技能 2: 关系抽取\n"
    "--请按照如下步骤抽取关系--\n"
    "1. 准确地识别文本中实体之间的关联信息,一般是动词、代词等。\n"
    "2. 准确地识别关系的修饰性描述,一般作为状语对关系特征做补充。\n"
    "3. 对相同概念的关系(同义词、别称、代指),请合并为单一简洁的关系名,"
    "并合并它们的描述信息。\n"
    "4. 对合并后的关系描述信息做简洁、恰当、连贯的总结。\n"
    "\n"
    "### 技能 3: 关联上下文\n"
    "- 关联上下文来自与当前待抽取文本相关的前置段落内容,"
    "可以为知识抽取提供信息补充。\n"
    "- 合理利用提供的上下文信息,知识抽取过程中出现的内容引用可能来自关联上下文。\n"
    "- 不要对关联上下文的内容做知识抽取,而仅作为关联信息参考。\n"
    "- 关联上下文是可选信息,可能为空。\n"
    "\n"
    "## 约束条件\n"
    "- 如果文本已提供了图结构格式的数据,直接转换为输出格式返回,"
    "不要修改实体或ID名称。"
    "- 尽可能多的生成文本中提及的实体和关系信息,但不要随意创造不存在的实体和关系。\n"
    "- 确保以第三人称书写,从客观角度描述实体名称、关系名称,以及他们的总结性描述。\n"
    "- 尽可能多地使用关联上下文中的信息丰富实体和关系的内容,这非常重要。\n"
    "- 如果实体或关系的总结描述为空,不提供总结描述信息,不要生成无关的描述信息。\n"
    "- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
    "- 实体和关系的名称或者描述文本出现#和:字符时,使用_字符替换,其他字符不要修改。"
    "- 避免使用停用词和过于常见的词汇。\n"
    "\n"
    "## 输出格式\n"
    "Entities:\n"
    "(实体名#实体总结)\n"
    "...\n\n"
    "Relationships:\n"
    "(来源实体名#关系名#目标实体名#关系总结)\n"
    "...\n"
    "\n"
    "## 参考案例"
    "--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
    "输入:\n"
    "```\n"
    "[上下文]:\n"
    "Section 1:\n"
    "菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
    "Section 2:\n"
    "菲尔・贾伯的小儿子叫比尔・贾伯。\n"
    "..."
    "\n"
    "[文本]:\n"
    "菲尔兹咖啡由菲尔・贾伯于1978年在加利福尼亚州伯克利创立。"
    "因其独特的混合咖啡而闻名,菲尔兹已扩展到美国多地。"
    "他的大儿子于2005年成为首席执行官,并带领公司实现了显著增长。\n"
    "```\n"
    "\n"
    "输出:\n"
    "```\n"
    "Entities:\n"
    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
    "(雅各布・贾伯#菲尔・贾伯的大儿子)\n"
    "(美国多地#菲尔兹咖啡的扩展地区)\n"
    "\n"
    "Relationships:\n"
    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的大儿子)\n"
    "(雅各布・贾伯#管理#菲尔兹咖啡#在2005年担任首席执行官)\n"
    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "请根据接下来[上下文]提供的信息,按照上述要求,抽取[文本]中的实体和关系数据。\n"
    "\n"
    "[上下文]:\n"
    "{history}\n"
    "\n"
    "[文本]:\n"
    "{text}\n"
    "\n"
    "[结果]:\n"
    "\n"
)

GRAPH_EXTRACT_PT_EN = (
    "## Role\n"
    "You are an expert in Knowledge Graph Engineering, skilled at extracting "
    "entities (subjects, objects) and relations from text, and summarizing "
    "their meanings effectively.\n"
    "\n"
    "## Skills\n"
    "### Skill 1: Entity Extraction\n"
    "--Please follow these steps to extract entities--\n"
    "1. Accurately identify entity information in the text, "
    "usually nouns, pronouns, etc.\n"
    "2. Accurately identify descriptive information, "
    "usually as adjectives, that supplements entity features.\n"
    "3. Merge synonymous, alias, or reference entities into "
    "a single concise entity name, and merge their descriptive information.\n"
    "4. Provide a concise, appropriate, and coherent summary "
    "of the combined entity descriptions.\n"
    "\n"
    "### Skill 2: Relation Extraction\n"
    "--Please follow these steps to extract relations--\n"
    "1. Accurately identify relation information between entities in the text, "
    "usually verbs, pronouns, etc.\n"
    "2. Accurately identify descriptive information, usually as adverbs, "
    "that supplements relation features.\n"
    "3. Merge synonymous, alias, or reference relations into "
    "a single concise relation name, and merge their descriptive information.\n"
    "4. Provide a concise, appropriate, and coherent summary "
    "of the combined relation descriptions.\n"
    "\n"
    "### Skill 3: Contextual Association\n"
    "- Context comes from preceding paragraphs related to the current "
    "extraction text and can provide supplementary information.\n"
    "- Appropriately use contextual information, content references "
    "during extraction may come from this context.\n"
    "- Do not extract knowledge from contextual content, "
    "use it only as a reference.\n"
    "- Context is optional and may be empty.\n"
    "\n"
    "## Constraints\n"
    "- If the text has provided data that is similar to or the same as the "
    "output format, please format the output directly according to the "
    "output format requirements."
    "- Generate as much entity and relation information mentioned in the text "
    "as possible, but do not create nonexistent entities or relations.\n"
    "- Ensure the writing is in the third person, describing entity names, "
    "relation names, and their summaries objectively.\n"
    "- Use as much contextual information as possible to enrich the content "
    "of entities and relations, this is very important.\n"
    "- If a summary of an entity or relation is empty, do not provide "
    "summary information, and do not generate irrelevant descriptions.\n"
    "- If provided descriptions are contradictory, resolve the conflict "
    "and provide a single, coherent description.\n"
    "- Replace any # or : characters in entity's and relation's "
    "names or descriptions with an _ character.\n"
    "- Avoid using stop words and overly common terms.\n"
    "\n"
    "## Output Format\n"
    "Entities:\n"
    "(entity_name#entity_summary)\n"
    "...\n\n"
    "Relationships:\n"
    "(source_entity_name#relation_name#target_entity_name#relation_summary)\n"
    "...\n"
    "\n"
    "## Reference Example\n"
    "--The case is only to help you understand the input and output format of "
    "the prompt, please do not use it in your answer.--\n"
    "Input:\n"
    "```\n"
    "[Context]:\n"
    "Section 1:\n"
    "Phil Jabber's eldest son is named Jacob Jabber.\n"
    "Section 2:\n"
    "Phil Jabber's youngest son is named Bill Jabber.\n"
    "..."
    "\n"
    "[Text]:\n"
    "Philz Coffee was founded by Phil Jabber in 1978 in Berkeley, California. "
    "Known for its distinctive blend coffee, Philz has expanded to multiple "
    "locations in the USA. His eldest son became CEO in 2005, "
    "leading significant growth for the company.\n"
    "```\n"
    "\n"
    "Output:\n"
    "```\n"
    "Entities:\n"
    "(Phil Jabber#Founder of Philz Coffee)\n"
    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
    "(Jacob Jabber#Phil Jabber's eldest son)\n"
    "(Multiple locations in the USA#Philz Coffee's expansion area)\n"
    "\n"
    "Relationships:\n"
    "(Phil Jabber#Founded#Philz Coffee"
    "#Founded in 1978 in Berkeley, California)\n"
    "(Philz Coffee#Located in#Berkeley, California"
    "#Philz Coffee's founding location)\n"
    "(Phil Jabber#Has#Jacob Jabber#Phil Jabber's eldest son)\n"
    "(Jacob Jabber#Manage#Philz Coffee#Serve as CEO in 2005)\n"
    "(Philz Coffee#Expanded to#Multiple locations in the USA"
    "#Philz Coffee's expansion area)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "Please extract the entities and relationships data from the [Text] "
    "according to the above requirements, using the provided [Context].\n"
    "\n"
    "[Context]:\n"
    "{history}\n"
    "\n"
    "[Text]:\n"
    "{text}\n"
    "\n"
    "[Results]:\n"
    "\n"
)
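To make the parsing contract concrete, here is a self-contained sketch of the response format that _parse_response consumes; the regexes are copied from the file above, and the sample text reuses the prompt's reference example:

import re

sample = """Entities:
(Phil Jabber#Founder of Philz Coffee)

Relationships:
(Phil Jabber#Founded#Philz Coffee#Founded in 1978 in Berkeley, California)
"""

current = None
for line in sample.split("\n"):
    line = line.strip()
    if line in ["Entities:", "Relationships:"]:
        current = line[:-1]
    elif line and current == "Entities":
        match = re.match(r"\((.*?)#(.*?)\)", line)
        if match:
            print("vertex:", [part.strip() for part in match.groups()])
    elif line and current == "Relationships":
        match = re.match(r"\((.*?)#(.*?)#(.*?)#(.*?)\)", line)
        if match:
            print("edge:", [part.strip() for part in match.groups()])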
@@ -19,9 +19,20 @@ class LLMExtractor(ExtractorBase, ABC):
         self._prompt_template = prompt_template

     async def extract(self, text: str, limit: Optional[int] = None) -> List:
-        """Extract by LLm."""
+        """Extract by LLM."""
+        return await self._extract(text, None, limit)
+
+    async def _extract(
+        self, text: str, history: str = None, limit: Optional[int] = None
+    ) -> List:
+        """Inner extract by LLM."""
         template = HumanPromptTemplate.from_template(self._prompt_template)
-        messages = template.format_messages(text=text)
+
+        messages = (
+            template.format_messages(text=text, history=history)
+            if history is not None
+            else template.format_messages(text=text)
+        )

         # use default model if needed
         if not self._model_name:
@@ -45,6 +56,12 @@ class LLMExtractor(ExtractorBase, ABC):
             ValueError("optional argument limit >= 1")
         return self._parse_response(response.text, limit)

+    def truncate(self):
+        """Do nothing by default."""
+
+    def drop(self):
+        """Do nothing by default."""
+
     @abstractmethod
     def _parse_response(self, text: str, limit: Optional[int] = None) -> List:
         """Parse llm response."""
dbgpt/rag/transformer/llm_summarizer.py (new file, 48 lines)
@@ -0,0 +1,48 @@
"""LLMSummarizer class."""
import logging
from abc import ABC

from dbgpt.core import HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest
from dbgpt.rag.transformer.base import SummarizerBase

logger = logging.getLogger(__name__)


class LLMSummarizer(SummarizerBase, ABC):
    """LLMSummarizer class."""

    def __init__(self, llm_client: LLMClient, model_name: str, prompt_template: str):
        """Initialize the LLMSummarizer."""
        self._llm_client = llm_client
        self._model_name = model_name
        self._prompt_template = prompt_template

    async def summarize(self, **args) -> str:
        """Summarize by LLM."""
        template = HumanPromptTemplate.from_template(self._prompt_template)
        messages = template.format_messages(**args)

        # use default model if needed
        if not self._model_name:
            models = await self._llm_client.models()
            if not models:
                raise Exception("No models available")
            self._model_name = models[0].model
            logger.info(f"Using model {self._model_name} to extract")

        model_messages = ModelMessage.from_base_messages(messages)
        request = ModelRequest(model=self._model_name, messages=model_messages)
        response = await self._llm_client.generate(request=request)

        if not response.success:
            code = str(response.error_code)
            reason = response.text
            logger.error(f"request llm failed ({code}) {reason}")

        return response.text

    def truncate(self):
        """Do nothing by default."""

    def drop(self):
        """Do nothing by default."""
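A hedged sketch of how the summarizer is driven (the client and graph setup are assumed, not part of this diff; the `graph` keyword matches the `{graph}` placeholder in the prompt templates above):

import asyncio

from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer

async def summarize_community(llm_client, graph):
    # llm_client: an LLMClient; graph: a Graph whose format() yields the
    # "Entities:/Relationships:" text the prompt template expects.
    summarizer = CommunitySummarizer(llm_client, model_name="")
    return await summarizer.summarize(graph=graph.format())

# asyncio.run(summarize_community(llm_client, graph))  # with real objects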
@@ -6,6 +6,7 @@ import os
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Type, cast

+from dbgpt.app.component_configs import CFG
 from dbgpt.core import Chunk, Embeddings
 from dbgpt.core.awel.flow import (
     FunctionDynamicOptions,
@@ -95,6 +96,7 @@ class VectorStoreConnector:
         self._index_store_config = vector_store_config
         self._register()

+        vector_store_type = self.__rewrite_index_store_type(vector_store_type)
         if self._match(vector_store_type):
             self.connector_class, self.config_class = connector[vector_store_type]
         else:
@@ -124,6 +126,13 @@ class VectorStoreConnector:
             logger.error("connect vector store failed: %s", e)
             raise e

+    def __rewrite_index_store_type(self, index_store_type):
+        # Rewrite Knowledge Graph Type
+        if CFG.GRAPH_COMMUNITY_SUMMARY_ENABLED:
+            if index_store_type == "KnowledgeGraph":
+                return "CommunitySummaryKnowledgeGraph"
+        return index_store_type
+
     @classmethod
     def from_default(
         cls,
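The rewrite above is the dispatch switch for the whole feature: with the flag on, a "KnowledgeGraph" space is transparently served by the community-summary implementation. A standalone restatement (CFG replaced with a plain boolean for illustration):

def rewrite_index_store_type(index_store_type: str, summary_enabled: bool) -> str:
    # Mirrors VectorStoreConnector.__rewrite_index_store_type above.
    if summary_enabled and index_store_type == "KnowledgeGraph":
        return "CommunitySummaryKnowledgeGraph"
    return index_store_type

assert rewrite_index_store_type("KnowledgeGraph", True) == "CommunitySummaryKnowledgeGraph"
assert rewrite_index_store_type("KnowledgeGraph", False) == "KnowledgeGraph"
assert rewrite_index_store_type("Chroma", True) == "Chroma"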
@@ -270,6 +279,10 @@ class VectorStoreConnector:
         """
         return self.client.delete_by_ids(ids=ids)

+    def truncate(self):
+        """Truncate data."""
+        return self.client.truncate()
+
     @property
     def current_embeddings(self) -> Optional[Embeddings]:
         """Return the current embeddings."""
@@ -5,7 +5,7 @@ from concurrent.futures import Executor, ThreadPoolExecutor
 from typing import List, Optional

 from dbgpt.core import Chunk
-from dbgpt.rag.index.base import logger
+from dbgpt.rag.index.base import IndexStoreConfig, logger
 from dbgpt.storage.full_text.base import FullTextStoreBase
 from dbgpt.storage.vector_store.elastic_store import ElasticsearchVectorConfig
 from dbgpt.storage.vector_store.filters import MetadataFilters
@@ -35,6 +35,7 @@ class ElasticDocumentStore(FullTextStoreBase):
         This similarity has the following options:
         """
         super().__init__()
+        self._es_config = es_config
         from elasticsearch import Elasticsearch

         self._es_config = es_config
@@ -94,6 +95,10 @@ class ElasticDocumentStore(FullTextStoreBase):
         )
         self._executor = executor or ThreadPoolExecutor()

+    def get_config(self) -> IndexStoreConfig:
+        """Get the es store config."""
+        return self._es_config
+
     def load_document(self, chunks: List[Chunk]) -> List[str]:
         """Load document in elasticsearch.
@@ -2,11 +2,11 @@
 from typing import List, Optional

 from dbgpt.core import Chunk
-from dbgpt.rag.index.base import IndexStoreBase
+from dbgpt.storage.full_text.base import FullTextStoreBase
 from dbgpt.storage.vector_store.filters import MetadataFilters


-class OpenSearch(IndexStoreBase):
+class OpenSearch(FullTextStoreBase):
     """OpenSearch index store."""

     def load_document(self, chunks: List[Chunk]) -> List[str]:
@@ -1,7 +1,7 @@
 """Graph store base class."""
 import logging
 from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple
+from typing import Generator, List, Optional, Tuple

 from dbgpt._private.pydantic import BaseModel, ConfigDict, Field
 from dbgpt.core import Embeddings
@@ -23,15 +23,35 @@ class GraphStoreConfig(BaseModel):
         default=None,
         description="The embedding function of graph store, optional.",
     )
+    summary_enabled: bool = Field(
+        default=False,
+        description="Enable graph community summary or not.",
+    )


 class GraphStoreBase(ABC):
     """Graph store base class."""

+    @abstractmethod
+    def get_config(self) -> GraphStoreConfig:
+        """Get the graph store config."""
+
+    @abstractmethod
+    def get_vertex_type(self) -> str:
+        """Get the vertex type."""
+
+    @abstractmethod
+    def get_edge_type(self) -> str:
+        """Get the edge type."""
+
     @abstractmethod
     def insert_triplet(self, sub: str, rel: str, obj: str):
         """Add triplet."""

+    @abstractmethod
+    def insert_graph(self, graph: Graph):
+        """Add graph."""
+
     @abstractmethod
     def get_triplets(self, sub: str) -> List[Tuple[str, str]]:
         """Get triplets."""
@@ -40,6 +60,10 @@ class GraphStoreBase(ABC):
     def delete_triplet(self, sub: str, rel: str, obj: str):
         """Delete triplet."""

+    @abstractmethod
+    def truncate(self):
+        """Truncate Graph."""
+
     @abstractmethod
     def drop(self):
         """Drop graph."""
@@ -66,3 +90,11 @@ class GraphStoreBase(ABC):
     @abstractmethod
     def query(self, query: str, **args) -> Graph:
         """Execute a query."""
+
+    def aquery(self, query: str, **args) -> Graph:
+        """Async execute a query."""
+        return self.query(query, **args)
+
+    @abstractmethod
+    def stream_query(self, query: str) -> Generator[Graph, None, None]:
+        """Execute stream query."""
@@ -1,4 +1,4 @@
-"""Connector for vector store."""
+"""Graph store factory."""
 import logging
 from typing import Tuple, Type

@@ -1,4 +1,4 @@
-"""Graph store base class."""
+"""Graph definition."""
 import itertools
 import json
 import logging
@@ -24,9 +24,15 @@ class Direction(Enum):
 class Elem(ABC):
     """Elem class."""

-    def __init__(self):
+    def __init__(self, name: Optional[str] = None):
         """Initialize Elem."""
-        self._props = {}
+        self._name = name
+        self._props: Dict[str, Any] = {}

+    @property
+    def name(self) -> str:
+        """Return the edge label."""
+        return self._name or ""
+
     @property
     def props(self) -> Dict[str, Any]:
@@ -46,14 +52,17 @@ class Elem(ABC):
         self._props.pop(key, None)

     def has_props(self, **props):
-        """Check if the element has the specified properties with the given values."""
+        """Check all key-value pairs exist."""
         return all(self._props.get(k) == v for k, v in props.items())

     @abstractmethod
-    def format(self, label_key: Optional[str] = None):
+    def format(self) -> str:
         """Format properties into a string."""
+        if len(self._props) == 1:
+            return str(next(iter(self._props.values())))
+
         formatted_props = [
-            f"{k}:{json.dumps(v)}" for k, v in self._props.items() if k != label_key
+            f"{k}:{json.dumps(v, ensure_ascii=False)}" for k, v in self._props.items()
         ]
         return f"{{{';'.join(formatted_props)}}}"

@@ -61,9 +70,9 @@ class Elem(ABC):
 class Vertex(Elem):
     """Vertex class."""

-    def __init__(self, vid: str, **props):
+    def __init__(self, vid: str, name: Optional[str] = None, **props):
         """Initialize Vertex."""
-        super().__init__()
+        super().__init__(name)
         self._vid = vid
         for k, v in props.items():
             self.set_prop(k, v)
@@ -73,26 +82,43 @@ class Vertex(Elem):
         """Return the vertex ID."""
         return self._vid

-    def format(self, label_key: Optional[str] = None):
-        """Format vertex properties into a string."""
-        label = self.get_prop(label_key) if label_key else self._vid
-        props_str = super().format(label_key)
-        if props_str == "{}":
-            return f"({label})"
+    @property
+    def name(self) -> str:
+        """Return the vertex name."""
+        return super().name or self._vid
+
+    def format(self, concise: bool = False):
+        """Format vertex into a string."""
+        name = self._name or self._vid
+        if concise:
+            return f"({name})"
+
+        if self._props:
+            return f"({name}:{super().format()})"
         else:
-            return f"({label}:{props_str})"
+            return f"({name})"

     def __str__(self):
         """Return the vertex ID as its string representation."""
         return f"({self._vid})"


+class IdVertex(Vertex):
+    """IdVertex class."""
+
+    def __init__(self, vid: str):
+        """Initialize Idvertex."""
+        super().__init__(vid)
+
+
 class Edge(Elem):
     """Edge class."""

-    def __init__(self, sid: str, tid: str, **props):
+    def __init__(self, sid: str, tid: str, name: str, **props):
         """Initialize Edge."""
-        super().__init__()
+        assert name, "Edge name is required"
+
+        super().__init__(name)
         self._sid = sid
         self._tid = tid
         for k, v in props.items():
@@ -117,23 +143,20 @@ class Edge(Elem):
         else:
             raise ValueError(f"Get nid of {vid} on {self} failed")

-    def format(self, label_key: Optional[str] = None):
+    def format(self):
         """Format the edge properties into a string."""
-        label = self.get_prop(label_key) if label_key else ""
-        props_str = super().format(label_key)
-        if props_str == "{}":
-            return f"-[{label}]->" if label else "->"
+        if self._props:
+            return f"-[{self._name}:{super().format()}]->"
         else:
-            return f"-[{label}:{props_str}]->" if label else f"-[{props_str}]->"
+            return f"-[{self._name}]->"

-    def triplet(self, label_key: str) -> Tuple[str, str, str]:
+    def triplet(self) -> Tuple[str, str, str]:
         """Return a triplet."""
-        assert label_key, "label key is needed"
-        return self._sid, str(self.get_prop(label_key)), self._tid
+        return self.sid, self.name, self.tid

     def __str__(self):
         """Return the edge '(sid)->(tid)'."""
-        return f"({self._sid})->({self._tid})"
+        return f"({self._sid})-[{self._name}]->({self._tid})"


 class Graph(ABC):
@@ -177,8 +200,8 @@ class Graph(ABC):
         """Delete vertices and their neighbor edges."""

     @abstractmethod
-    def del_edges(self, sid: str, tid: str, **props):
-        """Delete edges(sid -> tid) matches props."""
+    def del_edges(self, sid: str, tid: str, name: str, **props):
+        """Delete edges(sid -[name]-> tid) matches props."""

     @abstractmethod
     def del_neighbor_edges(self, vid: str, direction: Direction = Direction.OUT):
@@ -203,19 +226,19 @@ class Graph(ABC):
     def format(self) -> str:
         """Format graph data to string."""

+    @abstractmethod
+    def truncate(self):
+        """Truncate graph."""
+

 class MemoryGraph(Graph):
     """Graph class."""

-    def __init__(self, vertex_label: Optional[str] = None, edge_label: str = "label"):
+    def __init__(self):
         """Initialize MemoryGraph with vertex label and edge label."""
-        assert edge_label, "Edge label is needed"
-
         # metadata
-        self._vertex_label = vertex_label
-        self._edge_label = edge_label
-        self._vertex_prop_keys = {vertex_label} if vertex_label else set()
-        self._edge_prop_keys = {edge_label}
+        self._vertex_prop_keys = set()
+        self._edge_prop_keys = set()
        self._edge_count = 0

         # init vertices, out edges, in edges index
@@ -223,26 +246,6 @@ class MemoryGraph(Graph):
         self._oes: Any = defaultdict(lambda: defaultdict(set))
         self._ies: Any = defaultdict(lambda: defaultdict(set))

-    @property
-    def vertex_label(self):
-        """Return the label for vertices."""
-        return self._vertex_label
-
-    @property
-    def edge_label(self):
-        """Return the label for edges."""
-        return self._edge_label
-
-    @property
-    def vertex_prop_keys(self):
-        """Return a set of property keys for vertices."""
-        return self._vertex_prop_keys
-
-    @property
-    def edge_prop_keys(self):
-        """Return a set of property keys for edges."""
-        return self._edge_prop_keys
-
     @property
     def vertex_count(self):
         """Return the number of vertices in the graph."""
@@ -256,7 +259,10 @@ class MemoryGraph(Graph):
     def upsert_vertex(self, vertex: Vertex):
         """Insert or update a vertex based on its ID."""
         if vertex.vid in self._vs:
-            self._vs[vertex.vid].props.update(vertex.props)
+            if isinstance(self._vs[vertex.vid], IdVertex):
+                self._vs[vertex.vid] = vertex
+            else:
+                self._vs[vertex.vid].props.update(vertex.props)
         else:
             self._vs[vertex.vid] = vertex

@@ -265,9 +271,6 @@ class MemoryGraph(Graph):

     def append_edge(self, edge: Edge):
         """Append an edge if it doesn't exist; requires edge label."""
-        if self.edge_label not in edge.props.keys():
-            raise ValueError(f"Edge prop '{self.edge_label}' is needed")
-
         sid = edge.sid
         tid = edge.tid

@@ -275,8 +278,8 @@ class MemoryGraph(Graph):
             return False

         # init vertex index
-        self._vs.setdefault(sid, Vertex(sid))
-        self._vs.setdefault(tid, Vertex(tid))
+        self._vs.setdefault(sid, IdVertex(sid))
+        self._vs.setdefault(tid, IdVertex(tid))

         # update edge index
         self._oes[sid][tid].add(edge)
@@ -346,18 +349,19 @@ class MemoryGraph(Graph):
         self.del_neighbor_edges(vid, Direction.BOTH)
         self._vs.pop(vid, None)

-    def del_edges(self, sid: str, tid: str, **props):
+    def del_edges(self, sid: str, tid: str, name: str, **props):
         """Delete edges."""
         old_edge_cnt = len(self._oes[sid][tid])

-        if not props:
-            self._edge_count -= old_edge_cnt
-            self._oes[sid].pop(tid, None)
-            self._ies[tid].pop(sid, None)
-            return
-
         def remove_matches(es):
-            return set(filter(lambda e: not e.has_props(**props), es))
+            return set(
+                filter(
+                    lambda e: not (
+                        (name == e.name if name else True) and e.has_props(**props)
+                    ),
+                    es,
+                )
+            )

         self._oes[sid][tid] = remove_matches(self._oes[sid][tid])
         self._ies[tid][sid] = remove_matches(self._ies[tid][sid])
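The new matching rule can be restated in isolation: an edge is dropped when its name matches `name` (if a name is given) and it carries all of the given props. A minimal standalone check of that predicate:

def should_remove(edge_name, edge_props, name, **props):
    # Mirrors the predicate inside remove_matches above.
    return (name == edge_name if name else True) and all(
        edge_props.get(k) == v for k, v in props.items()
    )

assert should_remove("Created", {"year": 1978}, "Created", year=1978)
assert should_remove("Created", {}, "Created")         # no props: name decides
assert not should_remove("Created", {}, "Located in")  # name mismatch keeps edge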
@@ -439,12 +443,10 @@ class MemoryGraph(Graph):
             "schema": [
                 {
                     "type": "VERTEX",
-                    "label": f"{self._vertex_label}",
                     "properties": [{"name": k} for k in self._vertex_prop_keys],
                 },
                 {
                     "type": "EDGE",
-                    "label": f"{self._edge_label}",
                     "properties": [{"name": k} for k in self._edge_prop_keys],
                 },
             ]
@@ -452,14 +454,30 @@ class MemoryGraph(Graph):

     def format(self) -> str:
         """Format graph to string."""
-        vs_str = "\n".join(v.format(self.vertex_label) for v in self.vertices())
+        vs_str = "\n".join(v.format() for v in self.vertices())
         es_str = "\n".join(
-            f"{self.get_vertex(e.sid).format(self.vertex_label)}"
-            f"{e.format(self.edge_label)}"
-            f"{self.get_vertex(e.tid).format(self.vertex_label)}"
+            f"{self.get_vertex(e.sid).format(concise=True)}"
+            f"{e.format()}"
+            f"{self.get_vertex(e.tid).format(concise=True)}"
             for e in self.edges()
         )
-        return f"Vertices:\n{vs_str}\n\nEdges:\n{es_str}"
+        return (
+            f"Entities:\n{vs_str}\n\n" f"Relationships:\n{es_str}"
+            if (vs_str or es_str)
+            else ""
+        )
+
+    def truncate(self):
+        """Truncate graph."""
+        # clean metadata
+        self._vertex_prop_keys.clear()
+        self._edge_prop_keys.clear()
+        self._edge_count = 0
+
+        # clean data and index
+        self._vs.clear()
+        self._oes.clear()
+        self._ies.clear()

     def graphviz(self, name="g"):
         """View graphviz graph: https://dreampuf.github.io/GraphvizOnline."""
@@ -468,7 +486,7 @@ class MemoryGraph(Graph):
             g.add_node(vertex.vid)

         for edge in self.edges():
-            triplet = edge.triplet(self.edge_label)
+            triplet = edge.triplet()
             g.add_edge(triplet[0], triplet[2], label=triplet[1])

         digraph = nx.nx_agraph.to_agraph(g).to_string()
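A small end-to-end sketch of the reworked primitives (import path as in this diff); the printed strings follow directly from the format() implementations above:

from dbgpt.storage.graph_store.graph import Edge, MemoryGraph, Vertex

g = MemoryGraph()
g.upsert_vertex(Vertex("Phil Jaber", description="Founder of Philz Coffee"))
g.upsert_vertex(Vertex("Philz Coffee", description="Coffee brand"))
g.append_edge(Edge("Phil Jaber", "Philz Coffee", "Created", description="Founded in 1978"))
print(g.format())
# Entities:
# (Phil Jaber:Founder of Philz Coffee)
# (Philz Coffee:Coffee brand)
#
# Relationships:
# (Phil Jaber)-[Created:Founded in 1978]->(Philz Coffee)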
@@ -1,9 +1,9 @@
-"""Graph store base class."""
+"""Memory graph store."""
 import json
 import logging
-from typing import List, Optional, Tuple
+from typing import Generator, List, Optional, Tuple

-from dbgpt._private.pydantic import ConfigDict, Field
+from dbgpt._private.pydantic import ConfigDict
 from dbgpt.storage.graph_store.base import GraphStoreBase, GraphStoreConfig
 from dbgpt.storage.graph_store.graph import Direction, Edge, Graph, MemoryGraph

@@ -15,32 +15,51 @@ class MemoryGraphStoreConfig(GraphStoreConfig):

     model_config = ConfigDict(arbitrary_types_allowed=True)

-    edge_name_key: str = Field(
-        default="label",
-        description="The label of edge name, `label` by default.",
-    )
-

 class MemoryGraphStore(GraphStoreBase):
     """Memory graph store."""

     def __init__(self, graph_store_config: MemoryGraphStoreConfig):
         """Initialize MemoryGraphStore with a memory graph."""
-        self._edge_name_key = graph_store_config.edge_name_key
-        self._graph = MemoryGraph(edge_label=self._edge_name_key)
+        self._graph_store_config = graph_store_config
+        self._graph = MemoryGraph()
+
+    def get_config(self):
+        """Get the graph store config."""
+        return self._graph_store_config
+
+    def get_edge_type(self) -> str:
+        """Get the edge type."""
+        raise NotImplementedError("Memory graph store does not have edge type")
+
+    def get_vertex_type(self) -> str:
+        """Get the vertex type."""
+        raise NotImplementedError("Memory graph store does not have vertex type")

     def insert_triplet(self, sub: str, rel: str, obj: str):
         """Insert a triplet into the graph."""
-        self._graph.append_edge(Edge(sub, obj, **{self._edge_name_key: rel}))
+        self._graph.append_edge(Edge(sub, obj, rel))
+
+    def insert_graph(self, graph: Graph):
+        """Add graph."""
+        for vertex in graph.vertices():
+            self._graph.upsert_vertex(vertex)
+
+        for edge in graph.edges():
+            self._graph.append_edge(edge)

     def get_triplets(self, sub: str) -> List[Tuple[str, str]]:
         """Retrieve triplets originating from a subject."""
         subgraph = self.explore([sub], direct=Direction.OUT, depth=1)
-        return [(e.get_prop(self._edge_name_key), e.tid) for e in subgraph.edges()]
+        return [(e.name, e.tid) for e in subgraph.edges()]

     def delete_triplet(self, sub: str, rel: str, obj: str):
         """Delete a specific triplet from the graph."""
-        self._graph.del_edges(sub, obj, **{self._edge_name_key: rel})
+        self._graph.del_edges(sub, obj, rel)
+
+    def truncate(self):
+        """Truncate graph."""
+        self._graph.truncate()

     def drop(self):
         """Drop graph."""
@@ -50,7 +69,7 @@ class MemoryGraphStore(GraphStoreBase):
         """Return the graph schema as a JSON string."""
         return json.dumps(self._graph.schema())

-    def get_full_graph(self, limit: Optional[int] = None) -> MemoryGraph:
+    def get_full_graph(self, limit: Optional[int] = None) -> Graph:
         """Return self."""
         if not limit:
             return self._graph
@@ -79,3 +98,7 @@ class MemoryGraphStore(GraphStoreBase):
     def query(self, query: str, **args) -> Graph:
         """Execute a query on graph."""
         raise NotImplementedError("Query memory graph not allowed")
+
+    def stream_query(self, query: str) -> Generator[Graph, None, None]:
+        """Execute stream query."""
+        raise NotImplementedError("Stream query memory graph not allowed")
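A hedged round-trip through the triplet API of the in-memory store (constructor details are assumed; the module path for MemoryGraphStore is not shown in this diff, and default config fields are presumed sufficient):

store = MemoryGraphStore(MemoryGraphStoreConfig())
store.insert_triplet("Phil Jaber", "Created", "Philz Coffee")
print(store.get_triplets("Phil Jaber"))  # [("Created", "Philz Coffee")]
store.delete_triplet("Phil Jaber", "Created", "Philz Coffee")
print(store.get_triplets("Phil Jaber"))  # []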
@@ -1,10 +1,8 @@
-"""Neo4j vector store."""
+"""Neo4j store."""
 import logging
-from typing import List, Optional, Tuple

 from dbgpt._private.pydantic import ConfigDict
 from dbgpt.storage.graph_store.base import GraphStoreBase, GraphStoreConfig
-from dbgpt.storage.graph_store.graph import Direction, Graph, MemoryGraph

 logger = logging.getLogger(__name__)

@@ -19,46 +17,3 @@ class Neo4jStore(GraphStoreBase):
     """Neo4j graph store."""

     # todo: add neo4j implementation
-
-    def __init__(self, graph_store_config: Neo4jStoreConfig):
-        """Initialize the Neo4jStore with connection details."""
-        pass
-
-    def insert_triplet(self, sub: str, rel: str, obj: str):
-        """Insert triplets."""
-        pass
-
-    def get_triplets(self, sub: str) -> List[Tuple[str, str]]:
-        """Get triplets."""
-        return []
-
-    def delete_triplet(self, sub: str, rel: str, obj: str):
-        """Delete triplets."""
-        pass
-
-    def drop(self):
-        """Drop graph."""
-        pass
-
-    def get_schema(self, refresh: bool = False) -> str:
-        """Get schema."""
-        return ""
-
-    def get_full_graph(self, limit: Optional[int] = None) -> Graph:
-        """Get full graph."""
-        return MemoryGraph()
-
-    def explore(
-        self,
-        subs: List[str],
-        direct: Direction = Direction.BOTH,
-        depth: Optional[int] = None,
-        fan: Optional[int] = None,
-        limit: Optional[int] = None,
-    ) -> Graph:
-        """Explore the graph from given subjects up to a depth."""
-        return MemoryGraph()
-
-    def query(self, query: str, **args) -> Graph:
-        """Execute a query on graph."""
-        return MemoryGraph()
@@ -1,12 +1,14 @@
-"""TuGraph vector store."""
+"""TuGraph store."""
+import base64
 import json
 import logging
 import os
-from typing import List, Optional, Tuple
+from typing import Any, Generator, Iterator, List, Optional, Tuple

 from dbgpt._private.pydantic import ConfigDict, Field
 from dbgpt.datasource.conn_tugraph import TuGraphConnector
 from dbgpt.storage.graph_store.base import GraphStoreBase, GraphStoreConfig
-from dbgpt.storage.graph_store.graph import Direction, Edge, MemoryGraph, Vertex
+from dbgpt.storage.graph_store.graph import Direction, Edge, Graph, MemoryGraph, Vertex

 logger = logging.getLogger(__name__)

@@ -29,20 +31,24 @@ class TuGraphStoreConfig(GraphStoreConfig):
         description="login username",
     )
     password: str = Field(
-        default="123456",
+        default="73@TuGraph",
         description="login password",
     )
     vertex_type: str = Field(
         default="entity",
-        description="The type of graph vertex, `entity` by default.",
+        description="The type of vertex, `entity` by default.",
     )
     edge_type: str = Field(
         default="relation",
-        description="The type of graph edge, `relation` by default.",
+        description="The type of edge, `relation` by default.",
     )
-    edge_name_key: str = Field(
-        default="label",
-        description="The label of edge name, `label` by default.",
+    plugin_names: List[str] = Field(
+        default=["leiden"],
+        description=(
+            "Plugins need to be loaded when initialize TuGraph, "
+            "code: https://github.com/TuGraph-family"
+            "/dbgpt-tugraph-plugins/tree/master/cpp"
+        ),
     )

@@ -51,20 +57,23 @@ class TuGraphStore(GraphStoreBase):

     def __init__(self, config: TuGraphStoreConfig) -> None:
         """Initialize the TuGraphStore with connection details."""
-        self._host = os.getenv("TUGRAPH_HOST", "127.0.0.1") or config.host
-        self._port = int(os.getenv("TUGRAPH_PORT", 7687)) or config.port
-        self._username = os.getenv("TUGRAPH_USERNAME", "admin") or config.username
-        self._password = os.getenv("TUGRAPH_PASSWORD", "73@TuGraph") or config.password
-        self._node_label = (
-            os.getenv("TUGRAPH_VERTEX_TYPE", "entity") or config.vertex_type
+        self._config = config
+        self._host = os.getenv("TUGRAPH_HOST", config.host)
+        self._port = int(os.getenv("TUGRAPH_PORT", config.port))
+        self._username = os.getenv("TUGRAPH_USERNAME", config.username)
+        self._password = os.getenv("TUGRAPH_PASSWORD", config.password)
+        self._summary_enabled = (
+            os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
+            or config.summary_enabled
         )
-        self._edge_label = (
-            os.getenv("TUGRAPH_EDGE_TYPE", "relation") or config.edge_type
-        )
-        self.edge_name_key = (
-            os.getenv("TUGRAPH_EDGE_NAME_KEY", "label") or config.edge_name_key
+        self._plugin_names = (
+            os.getenv("TUGRAPH_PLUGIN_NAMES", "leiden").split(",")
+            or config.plugin_names
         )
+        self._graph_name = config.name
+        self._vertex_type = os.getenv("TUGRAPH_VERTEX_TYPE", config.vertex_type)
+        self._edge_type = os.getenv("TUGRAPH_EDGE_TYPE", config.edge_type)

         self.conn = TuGraphConnector.from_uri_db(
             host=self._host,
             port=self._port,
@@ -72,35 +81,197 @@ class TuGraphStore(GraphStoreBase):
|
||||
pwd=self._password,
|
||||
db_name=config.name,
|
||||
)
|
||||
self.conn.create_graph(graph_name=config.name)
|
||||
|
||||
self._create_graph(config.name)
|
||||
|
||||
def get_vertex_type(self) -> str:
|
||||
"""Get the vertex type."""
|
||||
return self._vertex_type
|
||||
|
||||
def get_edge_type(self) -> str:
|
||||
"""Get the edge type."""
|
||||
return self._edge_type
|
||||
|
||||
def _create_graph(self, graph_name: str):
|
||||
self.conn.create_graph(graph_name=graph_name)
|
||||
self._create_schema()
|
||||
if self._summary_enabled:
|
||||
self._upload_plugin()
|
||||
|
||||
def _check_label(self, elem_type: str):
|
||||
result = self.conn.get_table_names()
|
||||
if elem_type == "vertex":
|
||||
return self._node_label in result["vertex_tables"]
|
||||
return self._vertex_type in result["vertex_tables"]
|
||||
if elem_type == "edge":
|
||||
return self._edge_label in result["edge_tables"]
|
||||
return self._edge_type in result["edge_tables"]
|
||||
|
||||
def _add_vertex_index(self, field_name):
|
||||
gql = f"CALL db.addIndex('{self._vertex_type}', '{field_name}', false)"
|
||||
self.conn.run(gql)
|
||||
|
||||
+    def _upload_plugin(self):
+        gql = "CALL db.plugin.listPlugin('CPP','v1')"
+        result = self.conn.run(gql)
+        result_names = [
+            json.loads(record["plugin_description"])["name"] for record in result
+        ]
+        missing_plugins = [
+            name for name in self._plugin_names if name not in result_names
+        ]
+
+        if len(missing_plugins):
+            for name in missing_plugins:
+                try:
+                    from dbgpt_tugraph_plugins import (  # type: ignore # noqa
+                        get_plugin_binary_path,
+                    )
+                except ImportError:
+                    logger.error(
+                        "dbgpt-tugraph-plugins is not installed, "
+                        "pip install dbgpt-tugraph-plugins==0.1.0rc1 -U -i "
+                        "https://pypi.org/simple"
+                    )
+                    raise
+                plugin_path = get_plugin_binary_path("leiden")
+                with open(plugin_path, "rb") as f:
+                    content = f.read()
+                content = base64.b64encode(content).decode()
+                gql = (
+                    f"CALL db.plugin.loadPlugin('CPP', '{name}', '{content}', "
+                    f"'SO', '{name} Plugin', false, 'v1')"
+                )
+                self.conn.run(gql)
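
The loader above only runs when community summaries are switched on. A minimal wiring sketch, assuming a reachable TuGraph instance and that summary/plugin settings are read from env and config as shown in this diff (the graph name is hypothetical):

import os
from dbgpt.storage.graph_store.tugraph_store import TuGraphStore, TuGraphStoreConfig

os.environ["GRAPH_COMMUNITY_SUMMARY_ENABLED"] = "True"  # enables _upload_plugin
os.environ["TUGRAPH_PLUGIN_NAMES"] = "leiden"

config = TuGraphStoreConfig(name="dbgpt_kg")  # hypothetical graph name
store = TuGraphStore(config)  # creates schema and uploads missing plugins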

    def _create_schema(self):
        if not self._check_label("vertex"):
-            create_vertex_gql = (
-                f"CALL db.createLabel("
-                f"'vertex', '{self._node_label}', "
-                f"'id', ['id',string,false])"
-            )
-            self.conn.run(create_vertex_gql)
+            if self._summary_enabled:
+                create_vertex_gql = (
+                    f"CALL db.createLabel("
+                    f"'vertex', '{self._vertex_type}', "
+                    f"'id', ['id',string,false],"
+                    f"['name',string,false],"
+                    f"['_document_id',string,true],"
+                    f"['_chunk_id',string,true],"
+                    f"['_community_id',string,true],"
+                    f"['description',string,true])"
+                )
+                self.conn.run(create_vertex_gql)
+                self._add_vertex_index("_community_id")
+            else:
+                create_vertex_gql = (
+                    f"CALL db.createLabel("
+                    f"'vertex', '{self._vertex_type}', "
+                    f"'id', ['id',string,false],"
+                    f"['name',string,false])"
+                )
+                self.conn.run(create_vertex_gql)

        if not self._check_label("edge"):
-            create_edge_gql = f"""CALL db.createLabel(
-                'edge', '{self._edge_label}', '[["{self._node_label}",
-                "{self._node_label}"]]', ["id",STRING,false])"""
+            create_edge_gql = f"""CALL db.createLabel(
+                'edge', '{self._edge_type}',
+                '[["{self._vertex_type}",
+                "{self._vertex_type}"]]',
+                ["id",STRING,false],
+                ["name",STRING,false])"""
+            if self._summary_enabled:
+                create_edge_gql = f"""CALL db.createLabel(
+                    'edge', '{self._edge_type}',
+                    '[["{self._vertex_type}",
+                    "{self._vertex_type}"]]',
+                    ["id",STRING,false],
+                    ["name",STRING,false],
+                    ["description",STRING,true])"""
            self.conn.run(create_edge_gql)
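
For reference, a sketch of the vertex-label GQL that the summary-enabled branch assembles for the default entity type (concatenation approximated from the f-strings above):

# Approximate rendering of create_vertex_gql for vertex_type="entity":
gql = (
    "CALL db.createLabel('vertex', 'entity', 'id', "
    "['id',string,false],['name',string,false],"
    "['_document_id',string,true],['_chunk_id',string,true],"
    "['_community_id',string,true],['description',string,true])"
)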
+    def _format_query_data(self, data, white_prop_list: List[str]):
+        nodes_list = []
+        rels_list: List[Any] = []
+        _white_list = white_prop_list
+        from neo4j import graph
+
+        def get_filtered_properties(properties, white_list):
+            return {
+                key: value
+                for key, value in properties.items()
+                if (not key.startswith("_") and key not in ["id", "name"])
+                or key in white_list
+            }
+
+        def process_node(node: graph.Node):
+            node_id = node._properties.get("id")
+            node_name = node._properties.get("name")
+            node_properties = get_filtered_properties(node._properties, _white_list)
+            nodes_list.append(
+                {"id": node_id, "name": node_name, "properties": node_properties}
+            )
+
+        def process_relationship(rel: graph.Relationship):
+            name = rel._properties.get("name", "")
+            rel_nodes = rel.nodes
+            src_id = rel_nodes[0]._properties.get("id")
+            dst_id = rel_nodes[1]._properties.get("id")
+            for node in rel_nodes:
+                process_node(node)
+            edge_properties = get_filtered_properties(rel._properties, _white_list)
+            if not any(
+                existing_edge.get("name") == name
+                and existing_edge.get("src_id") == src_id
+                and existing_edge.get("dst_id") == dst_id
+                for existing_edge in rels_list
+            ):
+                rels_list.append(
+                    {
+                        "src_id": src_id,
+                        "dst_id": dst_id,
+                        "name": name,
+                        "properties": edge_properties,
+                    }
+                )
+
+        def process_path(path: graph.Path):
+            for rel in path.relationships:
+                process_relationship(rel)
+
+        def process_other(value):
+            if not any(
+                existing_node.get("id") == "json_node" for existing_node in nodes_list
+            ):
+                nodes_list.append(
+                    {
+                        "id": "json_node",
+                        "name": "json_node",
+                        "properties": {"description": value},
+                    }
+                )
+
+        for record in data:
+            for key in record.keys():
+                value = record[key]
+                if isinstance(value, graph.Node):
+                    process_node(value)
+                elif isinstance(value, graph.Relationship):
+                    process_relationship(value)
+                elif isinstance(value, graph.Path):
+                    process_path(value)
+                else:
+                    process_other(value)
+        nodes = [
+            Vertex(node["id"], node["name"], **node["properties"])
+            for node in nodes_list
+        ]
+        rels = [
+            Edge(edge["src_id"], edge["dst_id"], edge["name"], **edge["properties"])
+            for edge in rels_list
+        ]
+        return {"nodes": nodes, "edges": rels}
+    def get_config(self):
+        """Get the graph store config."""
+        return self._config
+
    def get_triplets(self, subj: str) -> List[Tuple[str, str]]:
        """Get triplets."""
        query = (
-            f"MATCH (n1:{self._node_label})-[r]->(n2:{self._node_label}) "
+            f"MATCH (n1:{self._vertex_type})-[r]->(n2:{self._vertex_type}) "
            f'WHERE n1.id = "{subj}" RETURN r.id as rel, n2.id as obj;'
        )
        data = self.conn.run(query)
@@ -117,16 +288,83 @@ class TuGraphStore(GraphStoreBase):
        rel_escaped = escape_quotes(rel)
        obj_escaped = escape_quotes(obj)

-        subj_query = f"MERGE (n1:{self._node_label} {{id:'{subj_escaped}'}})"
-        obj_query = f"MERGE (n1:{self._node_label} {{id:'{obj_escaped}'}})"
-        rel_query = (
-            f"MERGE (n1:{self._node_label} {{id:'{subj_escaped}'}})"
-            f"-[r:{self._edge_label} {{id:'{rel_escaped}'}}]->"
-            f"(n2:{self._node_label} {{id:'{obj_escaped}'}})"
-        )
-        self.conn.run(query=subj_query)
-        self.conn.run(query=obj_query)
-        self.conn.run(query=rel_query)
+        node_query = f"""CALL db.upsertVertex(
+            '{self._vertex_type}',
+            [{{id:'{subj_escaped}',name:'{subj_escaped}'}},
+            {{id:'{obj_escaped}',name:'{obj_escaped}'}}])"""
+        edge_query = f"""CALL db.upsertEdge(
+            '{self._edge_type}',
+            {{type:"{self._vertex_type}",key:"sid"}},
+            {{type:"{self._vertex_type}", key:"tid"}},
+            [{{sid:"{subj_escaped}",
+            tid: "{obj_escaped}",
+            id:"{rel_escaped}",
+            name: "{rel_escaped}"}}])"""
+        self.conn.run(query=node_query)
+        self.conn.run(query=edge_query)
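
A sketch of the calls a triplet insert produces, with the default entity/relation types (whitespace follows the triple-quoted templates; store is an assumed TuGraphStore instance):

store.insert_triplet("alice", "knows", "bob")
# roughly equivalent to running:
#   CALL db.upsertVertex('entity',
#       [{id:'alice',name:'alice'}, {id:'bob',name:'bob'}])
#   CALL db.upsertEdge('relation',
#       {type:"entity",key:"sid"}, {type:"entity", key:"tid"},
#       [{sid:"alice", tid: "bob", id:"knows", name: "knows"}])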
+    def insert_graph(self, graph: Graph) -> None:
+        """Add graph."""
+
+        def escape_quotes(value: str) -> str:
+            """Escape single and double quotes in a string for queries."""
+            if value is not None:
+                return value.replace("'", "").replace('"', "")
+
+        nodes: Iterator[Vertex] = graph.vertices()
+        edges: Iterator[Edge] = graph.edges()
+        node_list = []
+        edge_list = []
+
+        def parser(node_list):
+            formatted_nodes = [
+                "{"
+                + ", ".join(
+                    f'{k}: "{v}"' if isinstance(v, str) else f"{k}: {v}"
+                    for k, v in node.items()
+                )
+                + "}"
+                for node in node_list
+            ]
+            return f"""{', '.join(formatted_nodes)}"""
+
+        for node in nodes:
+            node_list.append(
+                {
+                    "id": escape_quotes(node.vid),
+                    "name": escape_quotes(node.name),
+                    "description": escape_quotes(node.get_prop("description")) or "",
+                    "_document_id": "0",
+                    "_chunk_id": "0",
+                    "_community_id": "0",
+                }
+            )
+        node_query = (
+            f"""CALL db.upsertVertex("{self._vertex_type}", [{parser(node_list)}])"""
+        )
+        for edge in edges:
+            edge_list.append(
+                {
+                    "sid": escape_quotes(edge.sid),
+                    "tid": escape_quotes(edge.tid),
+                    "id": escape_quotes(edge.name),
+                    "name": escape_quotes(edge.name),
+                    "description": escape_quotes(edge.get_prop("description")),
+                }
+            )
+
+        edge_query = f"""CALL db.upsertEdge(
+            "{self._edge_type}",
+            {{type:"{self._vertex_type}", key:"sid"}},
+            {{type:"{self._vertex_type}", key:"tid"}},
+            [{parser(edge_list)}])"""
+        self.conn.run(query=node_query)
+        self.conn.run(query=edge_query)
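
A usage sketch, assuming the Vertex/Edge keyword arguments visible elsewhere in this diff:

from dbgpt.storage.graph_store.graph import Edge, MemoryGraph, Vertex

g = MemoryGraph()
g.upsert_vertex(Vertex("dbgpt", name="dbgpt", description="AI framework"))
g.upsert_vertex(Vertex("tugraph", name="tugraph", description="graph database"))
g.append_edge(Edge("dbgpt", "tugraph", name="uses", description="storage backend"))
store.insert_graph(g)  # store: an assumed TuGraphStore instance; one upsert per label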
+    def truncate(self):
+        """Truncate Graph."""
+        gql = "MATCH (n) DELETE n"
+        self.conn.run(gql)
+
    def drop(self):
        """Delete Graph."""
@@ -135,9 +373,9 @@ class TuGraphStore(GraphStoreBase):
    def delete_triplet(self, sub: str, rel: str, obj: str) -> None:
        """Delete triplet."""
        del_query = (
-            f"MATCH (n1:{self._node_label} {{id:'{sub}'}})"
-            f"-[r:{self._edge_label} {{id:'{rel}'}}]->"
-            f"(n2:{self._node_label} {{id:'{obj}'}}) DELETE n1,n2,r"
+            f"MATCH (n1:{self._vertex_type} {{id:'{sub}'}})"
+            f"-[r:{self._edge_type} {{id:'{rel}'}}]->"
+            f"(n2:{self._vertex_type} {{id:'{obj}'}}) DELETE n1,n2,r"
        )
        self.conn.run(query=del_query)

@@ -148,11 +386,20 @@ class TuGraphStore(GraphStoreBase):
        schema = data[0]["schema"]
        return schema

-    def get_full_graph(self, limit: Optional[int] = None) -> MemoryGraph:
+    def get_full_graph(self, limit: Optional[int] = None) -> Graph:
        """Get full graph."""
        if not limit:
            raise Exception("limit must be set")
-        return self.query(f"MATCH (n)-[r]-(m) RETURN n,m,r LIMIT {limit}")
+        graph_result = self.query(
+            f"MATCH (n)-[r]-(m) RETURN n,r,m LIMIT {limit}",
+            white_list=["_community_id"],
+        )
+        all_graph = MemoryGraph()
+        for vertex in graph_result.vertices():
+            all_graph.upsert_vertex(vertex)
+        for edge in graph_result.edges():
+            all_graph.append_edge(edge)
+        return all_graph
    def explore(
        self,
@@ -161,8 +408,11 @@ class TuGraphStore(GraphStoreBase):
        depth: Optional[int] = None,
        fan: Optional[int] = None,
        limit: Optional[int] = None,
-    ) -> MemoryGraph:
+    ) -> Graph:
        """Explore the graph from given subjects up to a depth."""
+        if not subs:
+            return MemoryGraph()
+
        if fan is not None:
            raise ValueError("Fan functionality is not supported at this time.")
        else:
@@ -173,67 +423,88 @@ class TuGraphStore(GraphStoreBase):
            limit_string = f"LIMIT {limit}"
            if limit is None:
                limit_string = ""

+            if direct.name == "OUT":
+                rel = f"-[r:{self._edge_type}*{depth_string}]->"
+            elif direct.name == "IN":
+                rel = f"<-[r:{self._edge_type}*{depth_string}]-"
+            else:
+                rel = f"-[r:{self._edge_type}*{depth_string}]-"
            query = (
-                f"MATCH p=(n:{self._node_label})"
-                f"-[r:{self._edge_label}*{depth_string}]-(m:{self._node_label}) "
+                f"MATCH p=(n:{self._vertex_type})"
+                f"{rel}(m:{self._vertex_type}) "
                f"WHERE n.id IN {subs} RETURN p {limit_string}"
            )
            return self.query(query)
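
For example, explore(["alice"], direct=Direction.OUT, depth=2, limit=10) should produce a query shaped like the comment below; the exact depth_string format comes from context not shown in this hunk, so it is an assumption:

from dbgpt.storage.graph_store.graph import Direction

subgraph = store.explore(["alice"], direct=Direction.OUT, depth=2, limit=10)
# assumed rendering:
# MATCH p=(n:entity)-[r:relation*1..2]->(m:entity) WHERE n.id IN ['alice'] RETURN p LIMIT 10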
    def query(self, query: str, **args) -> MemoryGraph:
        """Execute a query on graph."""
-
-        def _format_paths(paths):
-            formatted_paths = []
-            for path in paths:
-                formatted_path = []
-                nodes = list(path["p"].nodes)
-                rels = list(path["p"].relationships)
-                for i in range(len(nodes)):
-                    formatted_path.append(nodes[i]._properties["id"])
-                    if i < len(rels):
-                        formatted_path.append(rels[i]._properties["id"])
-                formatted_paths.append(formatted_path)
-            return formatted_paths
-
-        def _format_query_data(data):
-            node_ids_set = set()
-            rels_set = set()
-            from neo4j import graph
-
-            for record in data:
-                for key in record.keys():
-                    value = record[key]
-                    if isinstance(value, graph.Node):
-                        node_id = value._properties["id"]
-                        node_ids_set.add(node_id)
-                    elif isinstance(value, graph.Relationship):
-                        rel_nodes = value.nodes
-                        prop_id = value._properties["id"]
-                        src_id = rel_nodes[0]._properties["id"]
-                        dst_id = rel_nodes[1]._properties["id"]
-                        rels_set.add((src_id, dst_id, prop_id))
-                    elif isinstance(value, graph.Path):
-                        formatted_paths = _format_paths(data)
-                        for path in formatted_paths:
-                            for i in range(0, len(path), 2):
-                                node_ids_set.add(path[i])
-                                if i + 2 < len(path):
-                                    rels_set.add((path[i], path[i + 2], path[i + 1]))
-
-            nodes = [Vertex(node_id) for node_id in node_ids_set]
-            rels = [
-                Edge(src_id, dst_id, label=prop_id)
-                for (src_id, dst_id, prop_id) in rels_set
-            ]
-            return {"nodes": nodes, "edges": rels}
-
        result = self.conn.run(query=query)
-        graph = _format_query_data(result)
+        white_list = args.get("white_list", [])
+        graph = self._format_query_data(result, white_list)
        mg = MemoryGraph()
        for vertex in graph["nodes"]:
            mg.upsert_vertex(vertex)
        for edge in graph["edges"]:
            mg.append_edge(edge)
        return mg
+    def stream_query(self, query: str) -> Generator[Graph, None, None]:
+        """Execute a stream query."""
+        from neo4j import graph
+
+        for record in self.conn.run_stream(query):
+            mg = MemoryGraph()
+            for key in record.keys():
+                value = record[key]
+                if isinstance(value, graph.Node):
+                    node_id = value._properties["id"]
+                    description = value._properties["description"]
+                    vertex = Vertex(node_id, name=node_id, description=description)
+                    mg.upsert_vertex(vertex)
+                elif isinstance(value, graph.Relationship):
+                    rel_nodes = value.nodes
+                    prop_id = value._properties["id"]
+                    src_id = rel_nodes[0]._properties["id"]
+                    dst_id = rel_nodes[1]._properties["id"]
+                    description = value._properties["description"]
+                    edge = Edge(src_id, dst_id, name=prop_id, description=description)
+                    mg.append_edge(edge)
+                elif isinstance(value, graph.Path):
+                    nodes = list(record["p"].nodes)
+                    rels = list(record["p"].relationships)
+                    formatted_path = []
+                    for i in range(len(nodes)):
+                        formatted_path.append(
+                            {
+                                "id": nodes[i]._properties["id"],
+                                "description": nodes[i]._properties["description"],
+                            }
+                        )
+                        if i < len(rels):
+                            formatted_path.append(
+                                {
+                                    "id": rels[i]._properties["id"],
+                                    "description": rels[i]._properties["description"],
+                                }
+                            )
+                    for i in range(0, len(formatted_path), 2):
+                        mg.upsert_vertex(
+                            Vertex(
+                                formatted_path[i]["id"],
+                                name=formatted_path[i]["id"],
+                                description=formatted_path[i]["description"],
+                            )
+                        )
+                        if i + 2 < len(formatted_path):
+                            mg.append_edge(
+                                Edge(
+                                    formatted_path[i]["id"],
+                                    formatted_path[i + 2]["id"],
+                                    name=formatted_path[i + 1]["id"],
+                                    description=formatted_path[i + 1]["description"],
+                                )
+                            )
+                else:
+                    vertex = Vertex("json_node", name="json_node", description=value)
+                    mg.upsert_vertex(vertex)
+            yield mg
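
A consumption sketch for the generator, assuming conn.run_stream yields neo4j-driver records as handled above:

for partial in store.stream_query("MATCH p=(n)-[r]-(m) RETURN p LIMIT 100"):
    # each yielded MemoryGraph carries the vertices/edges of one record
    print(partial.format())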

@@ -19,6 +19,10 @@ class KnowledgeGraphConfig(IndexStoreConfig):
class KnowledgeGraphBase(IndexStoreBase, ABC):
    """Knowledge graph base class."""

+    @abstractmethod
+    def get_config(self) -> KnowledgeGraphConfig:
+        """Get the knowledge graph config."""
+
    @abstractmethod
    def query_graph(self, limit: Optional[int] = None) -> Graph:
        """Get graph data."""

dbgpt/storage/knowledge_graph/community/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
"""Community Module."""

dbgpt/storage/knowledge_graph/community/base.py (new file, 73 lines)
@@ -0,0 +1,73 @@
"""Define Classes about Community."""
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
from dbgpt.storage.graph_store.base import GraphStoreBase
|
||||
from dbgpt.storage.graph_store.graph import Graph
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Community:
|
||||
"""Community class."""
|
||||
|
||||
id: str
|
||||
data: Optional[Graph] = None
|
||||
summary: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class CommunityTree:
|
||||
"""Represents a community tree."""
|
||||
|
||||
|
||||
class CommunityStoreAdapter(ABC):
|
||||
"""Community Store Adapter."""
|
||||
|
||||
def __init__(self, graph_store: GraphStoreBase):
|
||||
"""Initialize Community Store Adapter."""
|
||||
self._graph_store = graph_store
|
||||
|
||||
@property
|
||||
def graph_store(self) -> GraphStoreBase:
|
||||
"""Get graph store."""
|
||||
return self._graph_store
|
||||
|
||||
@abstractmethod
|
||||
async def discover_communities(self, **kwargs) -> List[str]:
|
||||
"""Run community discovery."""
|
||||
|
||||
@abstractmethod
|
||||
async def get_community(self, community_id: str) -> Community:
|
||||
"""Get community."""
|
||||
|
||||
|
||||
class CommunityMetastore(ABC):
|
||||
"""Community metastore class."""
|
||||
|
||||
@abstractmethod
|
||||
def get(self, community_id: str) -> Community:
|
||||
"""Get community."""
|
||||
|
||||
@abstractmethod
|
||||
def list(self) -> List[Community]:
|
||||
"""Get all communities."""
|
||||
|
||||
@abstractmethod
|
||||
async def search(self, query: str) -> List[Community]:
|
||||
"""Search communities relevant to query."""
|
||||
|
||||
@abstractmethod
|
||||
async def save(self, communities: List[Community]):
|
||||
"""Save communities."""
|
||||
|
||||
@abstractmethod
|
||||
async def truncate(self):
|
||||
"""Truncate all communities."""
|
||||
|
||||
@abstractmethod
|
||||
def drop(self):
|
||||
"""Drop community metastore."""

dbgpt/storage/knowledge_graph/community/community_metastore.py (new file, 63 lines)
@@ -0,0 +1,63 @@
"""Builtin Community metastore."""
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
|
||||
from dbgpt.core import Chunk
|
||||
from dbgpt.datasource.rdbms.base import RDBMSConnector
|
||||
from dbgpt.storage.knowledge_graph.community.base import Community, CommunityMetastore
|
||||
from dbgpt.storage.vector_store.base import VectorStoreBase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BuiltinCommunityMetastore(CommunityMetastore):
|
||||
"""Builtin Community metastore."""
|
||||
|
||||
def __init__(
|
||||
self, vector_store: VectorStoreBase, rdb_store: Optional[RDBMSConnector] = None
|
||||
):
|
||||
"""Initialize Community metastore."""
|
||||
self._vector_store = vector_store
|
||||
self._rdb_store = rdb_store
|
||||
|
||||
config = self._vector_store.get_config()
|
||||
self._vector_space = config.name
|
||||
self._max_chunks_once_load = config.max_chunks_once_load
|
||||
self._max_threads = config.max_threads
|
||||
self._topk = config.topk
|
||||
self._score_threshold = config.score_threshold
|
||||
|
||||
def get(self, community_id: str) -> Community:
|
||||
"""Get community."""
|
||||
raise NotImplementedError("Get community not allowed")
|
||||
|
||||
def list(self) -> List[Community]:
|
||||
"""Get all communities."""
|
||||
raise NotImplementedError("List communities not allowed")
|
||||
|
||||
async def search(self, query: str) -> List[Community]:
|
||||
"""Search communities relevant to query."""
|
||||
chunks = await self._vector_store.asimilar_search_with_scores(
|
||||
query, self._topk, self._score_threshold
|
||||
)
|
||||
return [Community(id=chunk.chunk_id, summary=chunk.content) for chunk in chunks]
|
||||
|
||||
async def save(self, communities: List[Community]):
|
||||
"""Save communities."""
|
||||
chunks = [
|
||||
Chunk(id=c.id, content=c.summary, metadata={"total": len(communities)})
|
||||
for c in communities
|
||||
]
|
||||
await self._vector_store.aload_document_with_limit(
|
||||
chunks, self._max_chunks_once_load, self._max_threads
|
||||
)
|
||||
logger.info(f"Save {len(communities)} communities")
|
||||
|
||||
async def truncate(self):
|
||||
"""Truncate community metastore."""
|
||||
self._vector_store.truncate()
|
||||
|
||||
def drop(self):
|
||||
"""Drop community metastore."""
|
||||
if self._vector_store.vector_name_exists():
|
||||
self._vector_store.delete_vector_name(self._vector_space)
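
A save-then-search round trip, assuming an initialized vector store carrying the topk/score_threshold config this class reads:

import asyncio

async def demo(metastore: BuiltinCommunityMetastore):
    await metastore.save([Community(id="42", summary="TuGraph plugin community")])
    for c in await metastore.search("graph plugins"):
        print(c.id, c.summary)

# asyncio.run(demo(BuiltinCommunityMetastore(vector_store)))  # vector_store assumed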

dbgpt/storage/knowledge_graph/community/community_store.py (new file, 83 lines)
@@ -0,0 +1,83 @@
"""Define the CommunityStore class."""
|
||||
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer
|
||||
from dbgpt.storage.knowledge_graph.community.base import (
|
||||
Community,
|
||||
CommunityStoreAdapter,
|
||||
)
|
||||
from dbgpt.storage.knowledge_graph.community.community_metastore import (
|
||||
BuiltinCommunityMetastore,
|
||||
)
|
||||
from dbgpt.storage.vector_store.base import VectorStoreBase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CommunityStore:
|
||||
"""CommunityStore Class."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
community_store_adapter: CommunityStoreAdapter,
|
||||
community_summarizer: CommunitySummarizer,
|
||||
vector_store: VectorStoreBase,
|
||||
):
|
||||
"""Initialize the CommunityStore class."""
|
||||
self._community_store_adapter = community_store_adapter
|
||||
self._community_summarizer = community_summarizer
|
||||
self._meta_store = BuiltinCommunityMetastore(vector_store)
|
||||
|
||||
async def build_communities(self):
|
||||
"""Discover communities."""
|
||||
community_ids = await (self._community_store_adapter.discover_communities())
|
||||
|
||||
# summarize communities
|
||||
communities = []
|
||||
for community_id in community_ids:
|
||||
community = await (
|
||||
self._community_store_adapter.get_community(community_id)
|
||||
)
|
||||
graph = community.data.format()
|
||||
if not graph:
|
||||
break
|
||||
|
||||
community.summary = await (
|
||||
self._community_summarizer.summarize(graph=graph)
|
||||
)
|
||||
communities.append(community)
|
||||
logger.info(
|
||||
f"Summarize community {community_id}: " f"{community.summary[:50]}..."
|
||||
)
|
||||
|
||||
# truncate then save new summaries
|
||||
await self._meta_store.truncate()
|
||||
await self._meta_store.save(communities)
|
||||
|
||||
async def search_communities(self, query: str) -> List[Community]:
|
||||
"""Search communities."""
|
||||
return await self._meta_store.search(query)
|
||||
|
||||
def truncate(self):
|
||||
"""Truncate community store."""
|
||||
logger.info("Truncate community metastore")
|
||||
self._meta_store.truncate()
|
||||
|
||||
logger.info("Truncate community summarizer")
|
||||
self._community_summarizer.truncate()
|
||||
|
||||
logger.info("Truncate graph")
|
||||
self._community_store_adapter.graph_store.truncate()
|
||||
|
||||
def drop(self):
|
||||
"""Drop community store."""
|
||||
logger.info("Remove community metastore")
|
||||
self._meta_store.drop()
|
||||
|
||||
logger.info("Remove community summarizer")
|
||||
self._community_summarizer.drop()
|
||||
|
||||
logger.info("Remove graph")
|
||||
self._community_store_adapter.graph_store.drop()
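
End to end, the store composes an adapter, a summarizer, and a vector-backed metastore; a build-and-query sketch, with all three dependencies assumed initialized:

async def refresh_and_query(store: CommunityStore) -> list:
    await store.build_communities()  # leiden discovery + LLM summaries
    hits = await store.search_communities("graph community detection")
    return [c.summary for c in hits]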

dbgpt/storage/knowledge_graph/community/factory.py (new file, 30 lines)
@@ -0,0 +1,30 @@
"""CommunityStoreAdapter factory."""
|
||||
import logging
|
||||
|
||||
from dbgpt.storage.graph_store.base import GraphStoreBase
|
||||
from dbgpt.storage.graph_store.tugraph_store import TuGraphStore
|
||||
from dbgpt.storage.knowledge_graph.community.base import CommunityStoreAdapter
|
||||
from dbgpt.storage.knowledge_graph.community.tugraph_adapter import (
|
||||
TuGraphCommunityStoreAdapter,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CommunityStoreAdapterFactory:
|
||||
"""Factory for community store adapter."""
|
||||
|
||||
@staticmethod
|
||||
def create(graph_store: GraphStoreBase) -> CommunityStoreAdapter:
|
||||
"""Create a CommunityStoreAdapter instance.
|
||||
|
||||
Args:
|
||||
- graph_store_type: graph store type Memory, TuGraph, Neo4j
|
||||
"""
|
||||
if isinstance(graph_store, TuGraphStore):
|
||||
return TuGraphCommunityStoreAdapter(graph_store)
|
||||
else:
|
||||
raise Exception(
|
||||
"create community store adapter for %s failed",
|
||||
graph_store.__class__.__name__,
|
||||
)

dbgpt/storage/knowledge_graph/community/tugraph_adapter.py (new file, 52 lines)
@@ -0,0 +1,52 @@
"""TuGraph Community Store Adapter."""
|
||||
import json
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
from dbgpt.storage.graph_store.graph import MemoryGraph
|
||||
from dbgpt.storage.knowledge_graph.community.base import (
|
||||
Community,
|
||||
CommunityStoreAdapter,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TuGraphCommunityStoreAdapter(CommunityStoreAdapter):
|
||||
"""TuGraph Community Store Adapter."""
|
||||
|
||||
MAX_HIERARCHY_LEVEL = 3
|
||||
|
||||
async def discover_communities(self, **kwargs) -> List[str]:
|
||||
"""Run community discovery with leiden."""
|
||||
mg = self._graph_store.query(
|
||||
"CALL db.plugin.callPlugin"
|
||||
"('CPP','leiden','{\"leiden_val\":\"_community_id\"}',60.00,false)"
|
||||
)
|
||||
result = mg.get_vertex("json_node").get_prop("description")
|
||||
community_ids = json.loads(result)["community_id_list"]
|
||||
logger.info(f"Discovered {len(community_ids)} communities.")
|
||||
return community_ids
|
||||
|
||||
async def get_community(self, community_id: str) -> Community:
|
||||
"""Get community."""
|
||||
query = (
|
||||
f"MATCH (n:{self._graph_store.get_vertex_type()})"
|
||||
f"WHERE n._community_id = '{community_id}' RETURN n"
|
||||
)
|
||||
edge_query = (
|
||||
f"MATCH (n:{self._graph_store.get_vertex_type()})-"
|
||||
f"[r:{self._graph_store.get_edge_type()}]-"
|
||||
f"(m:{self._graph_store.get_vertex_type()})"
|
||||
f"WHERE n._community_id = '{community_id}' RETURN n,r,m"
|
||||
)
|
||||
|
||||
all_vertex_graph = self._graph_store.aquery(query)
|
||||
all_edge_graph = self._graph_store.aquery(edge_query)
|
||||
all_graph = MemoryGraph()
|
||||
for vertex in all_vertex_graph.vertices():
|
||||
all_graph.upsert_vertex(vertex)
|
||||
for edge in all_edge_graph.edges():
|
||||
all_graph.append_edge(edge)
|
||||
|
||||
return Community(id=community_id, data=all_graph)
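
discover_communities relies on two conventions from the TuGraph store: the leiden plugin tags every vertex with _community_id, and non-graph plugin output comes back as the description of the synthetic json_node vertex. The expected response shape (assumed from the parsing above):

import json

plugin_response = '{"community_id_list": ["0", "1", "2"]}'
print(json.loads(plugin_response)["community_id_list"])  # ['0', '1', '2']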

dbgpt/storage/knowledge_graph/community_summary.py (new file, 373 lines)
@@ -0,0 +1,373 @@
"""Define the CommunitySummaryKnowledgeGraph."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from dbgpt._private.pydantic import ConfigDict, Field
|
||||
from dbgpt.core import Chunk
|
||||
from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer
|
||||
from dbgpt.rag.transformer.graph_extractor import GraphExtractor
|
||||
from dbgpt.storage.knowledge_graph.community.community_store import CommunityStore
|
||||
from dbgpt.storage.knowledge_graph.community.factory import CommunityStoreAdapterFactory
|
||||
from dbgpt.storage.knowledge_graph.knowledge_graph import (
|
||||
BuiltinKnowledgeGraph,
|
||||
BuiltinKnowledgeGraphConfig,
|
||||
)
|
||||
from dbgpt.storage.vector_store.base import VectorStoreConfig
|
||||
from dbgpt.storage.vector_store.factory import VectorStoreFactory
|
||||
from dbgpt.storage.vector_store.filters import MetadataFilters
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig):
|
||||
"""Community summary knowledge graph config."""
|
||||
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
vector_store_type: str = Field(
|
||||
default="Chroma", description="The type of vector store."
|
||||
)
|
||||
user: Optional[str] = Field(
|
||||
default=None,
|
||||
description="The user of vector store, if not set, will use the default user.",
|
||||
)
|
||||
password: Optional[str] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"The password of vector store, if not set, will use the default password."
|
||||
),
|
||||
)
|
||||
extract_topk: int = Field(
|
||||
default=5,
|
||||
description="Topk of knowledge graph extract",
|
||||
)
|
||||
extract_score_threshold: float = Field(
|
||||
default=0.3,
|
||||
description="Recall score of knowledge graph extract",
|
||||
)
|
||||
community_topk: int = Field(
|
||||
default=50,
|
||||
description="Topk of community search in knowledge graph",
|
||||
)
|
||||
community_score_threshold: float = Field(
|
||||
default=0.0,
|
||||
description="Recall score of community search in knowledge graph",
|
||||
)
|
||||
|
||||
|
||||
class CommunitySummaryKnowledgeGraph(BuiltinKnowledgeGraph):
|
||||
"""Community summary knowledge graph class."""
|
||||
|
||||
    def __init__(self, config: CommunitySummaryKnowledgeGraphConfig):
        """Initialize community summary knowledge graph class."""
        super().__init__(config)
        self._config = config

        self._vector_store_type = os.getenv(
            "VECTOR_STORE_TYPE", config.vector_store_type
        )
        self._extract_topk = int(
            os.getenv("KNOWLEDGE_GRAPH_EXTRACT_SEARCH_TOP_SIZE", config.extract_topk)
        )
        self._extract_score_threshold = float(
            os.getenv(
                "KNOWLEDGE_GRAPH_EXTRACT_SEARCH_RECALL_SCORE",
                config.extract_score_threshold,
            )
        )
        self._community_topk = int(
            os.getenv(
                "KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_TOP_SIZE", config.community_topk
            )
        )
        self._community_score_threshold = float(
            os.getenv(
                "KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_RECALL_SCORE",
                config.community_score_threshold,
            )
        )

        def extractor_configure(name: str, cfg: VectorStoreConfig):
            cfg.name = name
            cfg.embedding_fn = config.embedding_fn
            cfg.max_chunks_once_load = config.max_chunks_once_load
            cfg.max_threads = config.max_threads
            cfg.user = config.user
            cfg.password = config.password
            cfg.topk = self._extract_topk
            cfg.score_threshold = self._extract_score_threshold

        self._graph_extractor = GraphExtractor(
            self._llm_client,
            self._model_name,
            VectorStoreFactory.create(
                self._vector_store_type,
                config.name + "_CHUNK_HISTORY",
                extractor_configure,
            ),
        )

        def community_store_configure(name: str, cfg: VectorStoreConfig):
            cfg.name = name
            cfg.embedding_fn = config.embedding_fn
            cfg.max_chunks_once_load = config.max_chunks_once_load
            cfg.max_threads = config.max_threads
            cfg.user = config.user
            cfg.password = config.password
            cfg.topk = self._community_topk
            cfg.score_threshold = self._community_score_threshold

        self._community_store = CommunityStore(
            CommunityStoreAdapterFactory.create(self._graph_store),
            CommunitySummarizer(self._llm_client, self._model_name),
            VectorStoreFactory.create(
                self._vector_store_type,
                config.name + "_COMMUNITY_SUMMARY",
                community_store_configure,
            ),
        )

    def get_config(self) -> BuiltinKnowledgeGraphConfig:
        """Get the knowledge graph config."""
        return self._config
    async def aload_document(self, chunks: List[Chunk]) -> List[str]:
        """Extract and persist graph."""
        # todo add doc node
        for chunk in chunks:
            # todo add chunk node
            # todo add relation doc-chunk

            # extract graphs and save
            graphs = await self._graph_extractor.extract(chunk.content)
            for graph in graphs:
                self._graph_store.insert_graph(graph)

        # build communities and save
        await self._community_store.build_communities()

        return [chunk.chunk_id for chunk in chunks]
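
Loading is a two-phase pipeline: per-chunk graph extraction, then a single community-building pass. A minimal driver sketch, assuming an initialized CommunitySummaryKnowledgeGraph:

from dbgpt.core import Chunk

async def index(kg: CommunitySummaryKnowledgeGraph) -> list:
    chunks = [Chunk(content="TuGraph is a graph database developed by Ant Group.")]
    return await kg.aload_document(chunks)  # extract -> insert_graph -> build_communities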
    async def asimilar_search_with_scores(
        self,
        text,
        topk,
        score_threshold: float,
        filters: Optional[MetadataFilters] = None,
    ) -> List[Chunk]:
        """Retrieve relevant community summaries."""
        # global search: retrieve relevant community summaries
        communities = await self._community_store.search_communities(text)
        summaries = [
            f"Section {i + 1}:\n{community.summary}"
            for i, community in enumerate(communities)
        ]
        context = "\n".join(summaries) if summaries else ""

        # local search: extract keywords and explore subgraph
        keywords = await self._keyword_extractor.extract(text)
        subgraph = self._graph_store.explore(keywords, limit=topk).format()
        logger.info(f"Search subgraph from {len(keywords)} keywords")

        if not summaries and not subgraph:
            return []

        # merge search results into context
        content = HYBRID_SEARCH_PT_CN.format(context=context, graph=subgraph)
        return [Chunk(content=content)]
    def truncate(self) -> List[str]:
        """Truncate knowledge graph."""
        logger.info("Truncate community store")
        self._community_store.truncate()
        logger.info("Truncate keyword extractor")
        self._keyword_extractor.truncate()
        logger.info("Truncate triplet extractor")
        self._graph_extractor.truncate()
        return [self._config.name]

    def delete_vector_name(self, index_name: str):
        """Delete knowledge graph."""
        logger.info("Drop community store")
        self._community_store.drop()

        logger.info("Drop keyword extractor")
        self._keyword_extractor.drop()

        logger.info("Drop triplet extractor")
        self._graph_extractor.drop()

HYBRID_SEARCH_PT_CN = (
    "## 角色\n"
    "你非常擅长结合提示词模板提供的[上下文]信息与[知识图谱]信息,"
    "准确恰当地回答用户的问题,并保证不会输出与上下文和知识图谱无关的信息。"
    "\n"
    "## 技能\n"
    "### 技能 1: 上下文理解\n"
    "- 准确地理解[上下文]提供的信息,上下文信息可能被拆分为多个章节。\n"
    "- 上下文的每个章节内容都会以[Section]开始,并按需进行了编号。\n"
    "- 上下文信息提供了与用户问题相关度最高的总结性描述,请合理使用它们。"
    "### 技能 2: 知识图谱理解\n"
    "- 准确地识别[知识图谱]中提供的[Entities:]章节中的实体信息"
    "和[Relationships:]章节中的关系信息,实体和关系信息的一般格式为:\n"
    "```"
    "* 实体信息格式:\n"
    "- (实体名)\n"
    "- (实体名:实体描述)\n"
    "- (实体名:实体属性表)\n"
    "- (文本块ID:文档块内容)\n"
    "- (目录ID:目录名)\n"
    "- (文档ID:文档名称)\n"
    "\n"
    "* 关系信息的格式:\n"
    "- (来源实体名)-[关系名]->(目标实体名)\n"
    "- (来源实体名)-[关系名:关系描述]->(目标实体名)\n"
    "- (来源实体名)-[关系名:关系属性表]->(目标实体名)\n"
    "- (文本块实体)-[包含]->(实体名)\n"
    "- (目录ID)-[包含]->(文本块实体)\n"
    "- (目录ID)-[包含]->(子目录ID)\n"
    "- (文档ID)-[包含]->(文本块实体)\n"
    "- (文档ID)-[包含]->(目录ID)\n"
    "```"
    "- 正确地将关系信息中的实体名/ID与实体信息关联,还原出图结构。"
    "- 将图结构所表达的信息作为用户提问的明细上下文,辅助生成更好的答案。\n"
    "\n"
    "## 约束条件\n"
    "- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。\n"
    "- 若[知识图谱]没有提供信息,此时应根据[上下文]提供的信息回答问题。"
    "- 确保以第三人称书写,从客观角度结合[上下文]和[知识图谱]表达的信息回答问题。\n"
    "- 若提供的信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
    "- 避免使用停用词和过于常见的词汇。\n"
    "\n"
    "## 参考案例\n"
    "```\n"
    "[上下文]:\n"
    "Section 1:\n"
    "菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
    "Section 2:\n"
    "菲尔・贾伯的小儿子叫比尔・贾伯。\n"
    "[知识图谱]:\n"
    "Entities:\n"
    "(菲尔・贾伯#菲尔兹咖啡创始人)\n"
    "(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
    "(雅各布・贾伯#菲尔・贾伯的儿子)\n"
    "(美国多地#菲尔兹咖啡的扩展地区)\n"
    "\n"
    "Relationships:\n"
    "(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
    "(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
    "(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子)\n"
    "(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官)\n"
    "(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "接下来的[上下文]和[知识图谱]的信息,可以帮助你更好地回答用户的问题。\n"
    "\n"
    "[上下文]:\n"
    "{context}\n"
    "\n"
    "[知识图谱]:\n"
    "{graph}\n"
    "\n"
)

HYBRID_SEARCH_PT_EN = (
    "## Role\n"
    "You excel at combining the information provided in the [Context] with "
    "information from the [KnowledgeGraph] to accurately and appropriately "
    "answer user questions, ensuring that you do not output information "
    "unrelated to the context and knowledge graph.\n"
    "\n"
    "## Skills\n"
    "### Skill 1: Context Understanding\n"
    "- Accurately understand the information provided in the [Context], "
    "which may be divided into several sections.\n"
    "- Each section in the context will start with [Section] "
    "and may be numbered as needed.\n"
    "- The context provides a summary description most relevant to the user’s "
    "question, and it should be used wisely."
    "### Skill 2: Knowledge Graph Understanding\n"
    "- Accurately identify entity information in the [Entities:] section and "
    "relationship information in the [Relationships:] section "
    "of the [KnowledgeGraph]. The general format for entity "
    "and relationship information is:\n"
    "```"
    "* Entity Information Format:\n"
    "- (entity_name)\n"
    "- (entity_name: entity_description)\n"
    "- (entity_name: entity_property_map)\n"
    "- (chunk_id: chunk_content)\n"
    "- (catalog_id: catalog_name)\n"
    "- (document_id: document_name)\n"
    "\n"
    "* Relationship Information Format:\n"
    "- (source_entity_name)-[relationship_name]->(target_entity_name)\n"
    "- (source_entity_name)-[relationship_name: relationship_description]->"
    "(target_entity_name)\n"
    "- (source_entity_name)-[relationship_name: relationship_property_map]->"
    "(target_entity_name)\n"
    "- (chunk_id)-[Contains]->(entity_name)\n"
    "- (catalog_id)-[Contains]->(chunk_id)\n"
    "- (catalog_id)-[Contains]->(sub_catalog_id)\n"
    "- (document_id)-[Contains]->(chunk_id)\n"
    "- (document_id)-[Contains]->(catalog_id)\n"
    "```"
    "- Correctly associate entity names/IDs in the relationship information "
    "with entity information to restore the graph structure."
    "- Use the information expressed by the graph structure as detailed "
    "context for the user's query to assist in generating better answers.\n"
    "\n"
    "## Constraints\n"
    "- Don't describe your thought process in the answer, provide the answer "
    "to the user's question directly without generating irrelevant information."
    "- If the [KnowledgeGraph] does not provide information, you should answer "
    "the question based on the information provided in the [Context]."
    "- Ensure to write in the third person, responding to questions from "
    "an objective perspective based on the information combined from the "
    "[Context] and the [KnowledgeGraph].\n"
    "- If the provided information is contradictory, resolve the "
    "contradictions and provide a single, coherent description.\n"
    "- Avoid using stop words and overly common vocabulary.\n"
    "\n"
    "## Reference Example\n"
    "```\n"
    "[Context]:\n"
    "Section 1:\n"
    "Phil Schiller's eldest son is Jacob Schiller.\n"
    "Section 2:\n"
    "Phil Schiller's youngest son is Bill Schiller.\n"
    "[KnowledgeGraph]:\n"
    "Entities:\n"
    "(Phil Jaber#Founder of Philz Coffee)\n"
    "(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
    "(Jacob Jaber#Son of Phil Jaber)\n"
    "(Multiple locations in the USA#Expansion regions of Philz Coffee)\n"
    "\n"
    "Relationships:\n"
    "(Phil Jaber#Created#Philz Coffee"
    "#Founded in Berkeley, California in 1978)\n"
    "(Philz Coffee#Located in#Berkeley, California"
    "#Founding location of Philz Coffee)\n"
    "(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber)\n"
    "(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005)\n"
    "(Philz Coffee#Expanded to#Multiple locations in the USA"
    "#Expansion regions of Philz Coffee)\n"
    "```\n"
    "\n"
    "----\n"
    "\n"
    "The following information from the [Context] and [KnowledgeGraph] can "
    "help you better answer user questions.\n"
    "\n"
    "[Context]:\n"
    "{context}\n"
    "\n"
    "[KnowledgeGraph]:\n"
    "{graph}\n"
    "\n"
)
@@ -36,8 +36,9 @@ class BuiltinKnowledgeGraph(KnowledgeGraphBase):

    def __init__(self, config: BuiltinKnowledgeGraphConfig):
        """Create builtin knowledge graph instance."""
-        self._config = config
+        super().__init__()
+        self._config = config
+
        self._llm_client = config.llm_client
        if not self._llm_client:
            raise ValueError("No llm client provided.")
@@ -45,17 +46,19 @@ class BuiltinKnowledgeGraph(KnowledgeGraphBase):
        self._model_name = config.model_name
        self._triplet_extractor = TripletExtractor(self._llm_client, self._model_name)
        self._keyword_extractor = KeywordExtractor(self._llm_client, self._model_name)
-        self._graph_store_type = (
-            os.getenv("GRAPH_STORE_TYPE", "TuGraph") or config.graph_store_type
-        )
+        self._graph_store = self.__init_graph_store(config)

-        def configure(cfg: GraphStoreConfig):
-            cfg.name = self._config.name
-            cfg.embedding_fn = self._config.embedding_fn
+    def __init_graph_store(self, config) -> GraphStoreBase:
+        def configure(cfg: GraphStoreConfig):
+            cfg.name = config.name
+            cfg.embedding_fn = config.embedding_fn

-        self._graph_store: GraphStoreBase = GraphStoreFactory.create(
-            self._graph_store_type, configure
-        )
+        graph_store_type = os.getenv("GRAPH_STORE_TYPE") or config.graph_store_type
+        return GraphStoreFactory.create(graph_store_type, configure)

    def get_config(self) -> BuiltinKnowledgeGraphConfig:
        """Get the knowledge graph config."""
        return self._config
    def load_document(self, chunks: List[Chunk]) -> List[str]:
        """Extract and persist triplets to graph store."""
@@ -113,35 +116,59 @@ class BuiltinKnowledgeGraph(KnowledgeGraphBase):

        # extract keywords and explore graph store
        keywords = await self._keyword_extractor.extract(text)
-        subgraph = self._graph_store.explore(keywords, limit=topk)
+        subgraph = self._graph_store.explore(keywords, limit=topk).format()
        logger.info(f"Search subgraph from {len(keywords)} keywords")

        if not subgraph:
            return []

        content = (
-            "The following vertices and edges data after [Subgraph Data] "
-            "are retrieved from the knowledge graph based on the keywords:\n"
-            f"Keywords:\n{','.join(keywords)}\n"
+            "The following entities and relationships provided after "
+            "[Subgraph] are retrieved from the knowledge graph "
+            "based on the keywords:\n"
+            f"\"{','.join(keywords)}\".\n"
            "---------------------\n"
-            "You can refer to the sample vertices and edges to understand "
-            "the real knowledge graph data provided by [Subgraph Data].\n"
-            "Sample vertices:\n"
+            "The following examples after [Entities] and [Relationships] "
+            "can help you understand the data format of the knowledge graph, "
+            "but do not use them in the answer.\n"
+            "[Entities]:\n"
            "(alice)\n"
            "(bob:{age:28})\n"
            '(carry:{age:18;role:"teacher"})\n\n'
-            "Sample edges:\n"
+            "[Relationships]:\n"
            "(alice)-[reward]->(alice)\n"
            '(alice)-[notify:{method:"email"}]->'
            '(carry:{age:18;role:"teacher"})\n'
            '(bob:{age:28})-[teach:{course:"math";hour:180}]->(alice)\n'
            "---------------------\n"
-            f"Subgraph Data:\n{subgraph.format()}\n"
+            f"[Subgraph]:\n{subgraph}\n"
        )
-        return [Chunk(content=content, metadata=subgraph.schema())]
+        return [Chunk(content=content)]

    def query_graph(self, limit: Optional[int] = None) -> Graph:
        """Query graph."""
        return self._graph_store.get_full_graph(limit)

+    def truncate(self) -> List[str]:
+        """Truncate knowledge graph."""
+        logger.info(f"Truncate graph {self._config.name}")
+        self._graph_store.truncate()
+
+        logger.info("Truncate keyword extractor")
+        self._keyword_extractor.truncate()
+
+        logger.info("Truncate triplet extractor")
+        self._triplet_extractor.truncate()
+
+        return [self._config.name]
+
    def delete_vector_name(self, index_name: str):
        """Delete vector name."""
-        logger.info(f"Remove graph index {index_name}")
+        logger.info(f"Drop graph {index_name}")
        self._graph_store.drop()
+
+        logger.info("Drop keyword extractor")
+        self._keyword_extractor.drop()
+
+        logger.info("Drop triplet extractor")
+        self._triplet_extractor.drop()
@@ -1,12 +1,8 @@
"""OpenSPG class."""
import logging
-from typing import List, Optional

from dbgpt._private.pydantic import ConfigDict
-from dbgpt.core import Chunk
-from dbgpt.storage.graph_store.graph import Graph, MemoryGraph
from dbgpt.storage.knowledge_graph.base import KnowledgeGraphBase, KnowledgeGraphConfig
-from dbgpt.storage.vector_store.filters import MetadataFilters

logger = logging.getLogger(__name__)

@@ -21,29 +17,3 @@ class OpenSPG(KnowledgeGraphBase):
    """OpenSPG class."""

    # todo: add OpenSPG implementation
-
-    def __init__(self, config: OpenSPGConfig):
-        """Initialize the OpenSPG with config details."""
-        pass
-
-    def load_document(self, chunks: List[Chunk]) -> List[str]:
-        """Load document."""
-        return []
-
-    def similar_search_with_scores(
-        self,
-        text,
-        topk,
-        score_threshold: float,
-        filters: Optional[MetadataFilters] = None,
-    ) -> List[Chunk]:
-        """Similar with scores."""
-        return []
-
-    def query_graph(self, limit: Optional[int] = None) -> Graph:
-        """Query graph."""
-        return MemoryGraph()
-
-    def delete_vector_name(self, index_name: str):
-        """Delete vector name."""
-        pass
@@ -56,6 +56,15 @@ def _import_builtin_knowledge_graph() -> Tuple[Type, Type]:
    return BuiltinKnowledgeGraph, BuiltinKnowledgeGraphConfig


+def _import_community_summary_knowledge_graph() -> Tuple[Type, Type]:
+    from dbgpt.storage.knowledge_graph.community_summary import (
+        CommunitySummaryKnowledgeGraph,
+        CommunitySummaryKnowledgeGraphConfig,
+    )
+
+    return CommunitySummaryKnowledgeGraph, CommunitySummaryKnowledgeGraphConfig
+
+
def _import_openspg() -> Tuple[Type, Type]:
    from dbgpt.storage.knowledge_graph.open_spg import OpenSPG, OpenSPGConfig

@@ -86,6 +95,8 @@ def __getattr__(name: str) -> Tuple[Type, Type]:
        return _import_elastic()
    elif name == "KnowledgeGraph":
        return _import_builtin_knowledge_graph()
+    elif name == "CommunitySummaryKnowledgeGraph":
+        return _import_community_summary_knowledge_graph()
    elif name == "OpenSPG":
        return _import_openspg()
    elif name == "FullText":
@@ -103,7 +114,7 @@ __vector_store__ = [
    "ElasticSearch",
]

-__knowledge_graph__ = ["KnowledgeGraph", "OpenSPG"]
+__knowledge_graph__ = ["KnowledgeGraph", "CommunitySummaryKnowledgeGraph", "OpenSPG"]

__document_store__ = ["FullText"]
@@ -99,6 +99,14 @@ class VectorStoreConfig(IndexStoreConfig):
            "The password of vector store, if not set, will use the default password."
        ),
    )
+    topk: int = Field(
+        default=5,
+        description="Topk of vector search",
+    )
+    score_threshold: float = Field(
+        default=0.3,
+        description="Recall score of vector search",
+    )


class VectorStoreBase(IndexStoreBase, ABC):
@@ -108,6 +116,10 @@ class VectorStoreBase(IndexStoreBase, ABC):
        """Initialize vector store."""
        super().__init__(executor)

+    @abstractmethod
+    def get_config(self) -> VectorStoreConfig:
+        """Get the vector store config."""
+
    def filter_by_score_threshold(
        self, chunks: List[Chunk], score_threshold: float
    ) -> List[Chunk]:
@@ -126,7 +138,7 @@ class VectorStoreBase(IndexStoreBase, ABC):
                metadata=chunk.metadata,
                content=chunk.content,
                score=chunk.score,
-                chunk_id=str(id),
+                chunk_id=chunk.chunk_id,
            )
            for chunk in chunks
            if chunk.score >= score_threshold
@@ -63,6 +63,8 @@ class ChromaStore(VectorStoreBase):
            vector_store_config(ChromaVectorConfig): vector store config.
        """
+        super().__init__()
+        self._vector_store_config = vector_store_config

        chroma_vector_config = vector_store_config.to_dict(exclude_none=True)
        chroma_path = chroma_vector_config.get(
            "persist_path", os.path.join(PILOT_PATH, "data")
@@ -89,6 +91,10 @@ class ChromaStore(VectorStoreBase):
            metadata=collection_metadata,
        )

+    def get_config(self) -> ChromaVectorConfig:
+        """Get the vector store config."""
+        return self._vector_store_config
+
    def similar_search(
        self, text, topk, filters: Optional[MetadataFilters] = None
    ) -> List[Chunk]:
@@ -100,10 +106,16 @@ class ChromaStore(VectorStoreBase):
            filters=filters,
        )
        return [
-            Chunk(content=chroma_result[0], metadata=chroma_result[1] or {}, score=0.0)
+            Chunk(
+                content=chroma_result[0],
+                metadata=chroma_result[1] or {},
+                score=0.0,
+                chunk_id=chroma_result[2],
+            )
            for chroma_result in zip(
                chroma_results["documents"][0],
                chroma_results["metadatas"][0],
+                chroma_results["ids"][0],
            )
        ]
@@ -134,12 +146,14 @@ class ChromaStore(VectorStoreBase):
                    content=chroma_result[0],
                    metadata=chroma_result[1] or {},
                    score=(1 - chroma_result[2]),
+                    chunk_id=chroma_result[3],
                )
            )
            for chroma_result in zip(
                chroma_results["documents"][0],
                chroma_results["metadatas"][0],
                chroma_results["distances"][0],
+                chroma_results["ids"][0],
            )
        ]
        return self.filter_by_score_threshold(chunks, score_threshold)
@@ -181,6 +195,20 @@ class ChromaStore(VectorStoreBase):
        if len(ids) > 0:
            self._collection.delete(ids=ids)

+    def truncate(self) -> List[str]:
+        """Truncate data index_name."""
+        logger.info(f"begin truncate chroma collection:{self._collection.name}")
+        results = self._collection.get()
+        ids = results.get("ids")
+        if ids:
+            self._collection.delete(ids=ids)
+            logger.info(
+                f"truncate chroma collection {self._collection.name} "
+                f"{len(ids)} chunks success"
+            )
+            return ids
+        return []
+
    def convert_metadata_filters(
        self,
        filters: MetadataFilters,
@@ -126,6 +126,8 @@ class ElasticStore(VectorStoreBase):
            vector_store_config (ElasticsearchVectorConfig): ElasticsearchStore config.
        """
+        super().__init__()
+        self._vector_store_config = vector_store_config

        connect_kwargs = {}
        elasticsearch_vector_config = vector_store_config.dict()
        self.uri = elasticsearch_vector_config.get("uri") or os.getenv(
@@ -234,6 +236,10 @@ class ElasticStore(VectorStoreBase):
        except Exception as e:
            logger.error(f"ElasticSearch connection failed: {e}")

+    def get_config(self) -> ElasticsearchVectorConfig:
+        """Get the vector store config."""
+        return self._vector_store_config
+
    def load_document(
        self,
        chunks: List[Chunk],
dbgpt/storage/vector_store/factory.py (new file, 44 lines)
@@ -0,0 +1,44 @@
"""Vector store factory."""
import logging
from typing import Tuple, Type

from dbgpt.storage import vector_store
from dbgpt.storage.vector_store.base import VectorStoreBase, VectorStoreConfig

logger = logging.getLogger(__name__)


class VectorStoreFactory:
    """Factory for vector store."""

    @staticmethod
    def create(
        vector_store_type: str, vector_space_name: str, vector_store_configure=None
    ) -> VectorStoreBase:
        """Create a VectorStore instance.

        Args:
            - vector_store_type: vector store type Chroma, Milvus, etc.
            - vector_store_configure: callable that configures the vector store config
        """
        store_cls, cfg_cls = VectorStoreFactory.__find_type(vector_store_type)

        try:
            config = cfg_cls()
            if vector_store_configure:
                vector_store_configure(vector_space_name, config)
            return store_cls(config)
        except Exception as e:
            logger.error("create vector store failed: %s", e)
            raise e

    @staticmethod
    def __find_type(vector_store_type: str) -> Tuple[Type, Type]:
        for t in vector_store.__vector_store__:
            if t.lower() == vector_store_type.lower():
                store_cls, cfg_cls = getattr(vector_store, t)
                if issubclass(store_cls, VectorStoreBase) and issubclass(
                    cfg_cls, VectorStoreConfig
                ):
                    return store_cls, cfg_cls
        raise Exception(f"Vector store {vector_store_type} not supported")
@@ -150,6 +150,8 @@ class MilvusStore(VectorStoreBase):
            refer to https://milvus.io/docs/v2.0.x/manage_connection.md
        """
+        super().__init__()
+        self._vector_store_config = vector_store_config

        try:
            from pymilvus import connections
        except ImportError:
@@ -363,6 +365,10 @@ class MilvusStore(VectorStoreBase):

        return res.primary_keys

+    def get_config(self) -> MilvusVectorConfig:
+        """Get the vector store config."""
+        return self._vector_store_config
+
    def load_document(self, chunks: List[Chunk]) -> List[str]:
        """Load document in vector database."""
        batch_size = 500
@@ -718,6 +718,8 @@ class OceanBaseStore(VectorStoreBase):
        if vector_store_config.embedding_fn is None:
            raise ValueError("embedding_fn is required for OceanBaseStore")
        super().__init__()
        self._vector_store_config = vector_store_config

        self.embeddings = vector_store_config.embedding_fn
        self.collection_name = vector_store_config.name
        vector_store_config = vector_store_config.dict()
@@ -760,6 +762,10 @@ class OceanBaseStore(VectorStoreBase):
            enable_normalize_vector=self.OB_ENABLE_NORMALIZE_VECTOR,
        )

    def get_config(self) -> OceanBaseConfig:
        """Get the vector store config."""
        return self._vector_store_config

    def similar_search(
        self, text, topk, filters: Optional[MetadataFilters] = None, **kwargs: Any
    ) -> List[Chunk]:
@@ -64,6 +64,8 @@ class PGVectorStore(VectorStoreBase):
                "Please install the `langchain` package to use the PGVector."
            )
        super().__init__()
        self._vector_store_config = vector_store_config

        self.connection_string = vector_store_config.connection_string
        self.embeddings = vector_store_config.embedding_fn
        self.collection_name = vector_store_config.name
@@ -74,6 +76,10 @@ class PGVectorStore(VectorStoreBase):
            connection_string=self.connection_string,
        )

    def get_config(self) -> PGVectorConfig:
        """Get the vector store config."""
        return self._vector_store_config

    def similar_search(
        self, text: str, topk: int, filters: Optional[MetadataFilters] = None
    ) -> List[Chunk]:
@@ -69,6 +69,8 @@ class WeaviateStore(VectorStoreBase):
                "Please install it with `pip install weaviate-client`."
            )
        super().__init__()
        self._vector_store_config = vector_store_config

        self.weaviate_url = vector_store_config.weaviate_url
        self.embedding = vector_store_config.embedding_fn
        self.vector_name = vector_store_config.name
@@ -78,6 +80,10 @@ class WeaviateStore(VectorStoreBase):

        self.vector_store_client = weaviate.Client(self.weaviate_url)

    def get_config(self) -> WeaviateVectorConfig:
        """Get the vector store config."""
        return self._vector_store_config

    def similar_search(
        self, text: str, topk: int, filters: Optional[MetadataFilters] = None
    ) -> List[Chunk]:
@@ -1,12 +1,19 @@
import asyncio
import os

import pytest

from dbgpt.configs.model_config import ROOT_PATH
from dbgpt.core import Chunk, HumanPromptTemplate, ModelMessage, ModelRequest
from dbgpt.model.proxy.llms.chatgpt import OpenAILLMClient
from dbgpt.rag import ChunkParameters
from dbgpt.rag.assembler import EmbeddingAssembler
from dbgpt.rag.embedding import DefaultEmbeddingFactory
from dbgpt.rag.knowledge import KnowledgeFactory
from dbgpt.rag.retriever import RetrieverStrategy
from dbgpt.storage.knowledge_graph.community_summary import (
    CommunitySummaryKnowledgeGraph,
    CommunitySummaryKnowledgeGraphConfig,
)
from dbgpt.storage.knowledge_graph.knowledge_graph import (
    BuiltinKnowledgeGraph,
    BuiltinKnowledgeGraphConfig,
@@ -15,7 +22,7 @@ from dbgpt.storage.knowledge_graph.knowledge_graph import (
"""GraphRAG example.
pre-requirements:
* Set LLM config (url/sk) in `.env`.
* Setup/startup TuGraph from: https://github.com/TuGraph-family/tugraph-db
* Install pytest utils: `pip install pytest pytest-asyncio`
* Config TuGraph following the format below.
```
GRAPH_STORE_TYPE=TuGraph
@@ -24,46 +31,100 @@ from dbgpt.storage.knowledge_graph.knowledge_graph import (
TUGRAPH_USERNAME=admin
TUGRAPH_PASSWORD=73@TuGraph
```

Examples:
.. code-block:: shell
    python examples/rag/graph_rag_example.py
    pytest -s examples/rag/graph_rag_example.py
"""

llm_client = OpenAILLMClient()
model_name = "gpt-4o-mini"

def _create_kg_connector():

@pytest.mark.asyncio
async def test_naive_graph_rag():
    await __run_graph_rag(
        knowledge_file="examples/test_files/graphrag-mini.md",
        chunk_strategy="CHUNK_BY_SIZE",
        knowledge_graph=__create_naive_kg_connector(),
        question="What's the relationship between TuGraph and DB-GPT ?",
    )


@pytest.mark.asyncio
async def test_community_graph_rag():
    await __run_graph_rag(
        knowledge_file="examples/test_files/graphrag-mini.md",
        chunk_strategy="CHUNK_BY_MARKDOWN_HEADER",
        knowledge_graph=__create_community_kg_connector(),
        question="What's the relationship between TuGraph and DB-GPT ?",
    )


def __create_naive_kg_connector():
    """Create knowledge graph connector."""
    return BuiltinKnowledgeGraph(
        config=BuiltinKnowledgeGraphConfig(
            name="graph_rag_test",
            name="naive_graph_rag_test",
            embedding_fn=None,
            llm_client=OpenAILLMClient(),
            model_name="gpt-3.5-turbo",
            llm_client=llm_client,
            model_name=model_name,
            graph_store_type="MemoryGraph",
        ),
    )


async def main():
    file_path = os.path.join(ROOT_PATH, "examples/test_files/tranformers_story.md")
def __create_community_kg_connector():
    """Create community knowledge graph connector."""
    return CommunitySummaryKnowledgeGraph(
        config=CommunitySummaryKnowledgeGraphConfig(
            name="community_graph_rag_test",
            embedding_fn=DefaultEmbeddingFactory.openai(),
            llm_client=llm_client,
            model_name=model_name,
            graph_store_type="TuGraphGraph",
        ),
    )


async def ask_chunk(chunk: Chunk, question) -> str:
    rag_template = (
        "Based on the following [Context] {context}, " "answer [Question] {question}."
    )
    template = HumanPromptTemplate.from_template(rag_template)
    messages = template.format_messages(context=chunk.content, question=question)
    model_messages = ModelMessage.from_base_messages(messages)
    request = ModelRequest(model=model_name, messages=model_messages)
    response = await llm_client.generate(request=request)

    if not response.success:
        code = str(response.error_code)
        reason = response.text
        raise Exception(f"request llm failed ({code}) {reason}")

    return response.text


async def __run_graph_rag(knowledge_file, chunk_strategy, knowledge_graph, question):
    file_path = os.path.join(ROOT_PATH, knowledge_file).format()
    knowledge = KnowledgeFactory.from_file_path(file_path)
    graph_store = _create_kg_connector()
    chunk_parameters = ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
    # get embedding assembler
    assembler = await EmbeddingAssembler.aload_from_knowledge(
        knowledge=knowledge,
        chunk_parameters=chunk_parameters,
        index_store=graph_store,
        retrieve_strategy=RetrieverStrategy.GRAPH,
    )
    await assembler.apersist()
    # get embeddings retriever
    retriever = assembler.as_retriever(3)
    chunks = await retriever.aretrieve_with_scores(
        "What actions has Megatron taken ?", score_threshold=0.3
    )
    print(f"embedding rag example results:{chunks}")
    graph_store.delete_vector_name("graph_rag_test")
    try:
        chunk_parameters = ChunkParameters(chunk_strategy=chunk_strategy)

        # get embedding assembler
        assembler = await EmbeddingAssembler.aload_from_knowledge(
            knowledge=knowledge,
            chunk_parameters=chunk_parameters,
            index_store=knowledge_graph,
            retrieve_strategy=RetrieverStrategy.GRAPH,
        )
        await assembler.apersist()

if __name__ == "__main__":
    asyncio.run(main())
        # get embeddings retriever
        retriever = assembler.as_retriever(1)
        chunks = await retriever.aretrieve_with_scores(question, score_threshold=0.3)

        # chat
        print(f"{await ask_chunk(chunks[0], question)}")

    finally:
        knowledge_graph.delete_vector_name(knowledge_graph.get_config().name)
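Note that both connectors above depend on OpenAI-compatible endpoints (`OpenAILLMClient`, `DefaultEmbeddingFactory.openai()`), so the corresponding API key and base URL must be configured in `.env` before running either test.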
185 examples/test_files/dbgpt.md Normal file
@@ -0,0 +1,185 @@
# DB-GPT: Defining the Next Generation of Database Interaction with Private LLM Technology
## What is DB-GPT?
🤖️ **DB-GPT is an open-source AI Native Data App Development framework with AWEL (Agentic Workflow Expression Language) and Agents.**
Its purpose is to build infrastructure for the era of large models: through capabilities such as multi-model management (SMMF), Text2SQL optimization, a RAG framework with ongoing optimization, Multi-Agents collaboration, and AWEL (agentic workflow orchestration), it makes building LLM applications around databases simpler and more convenient.
🚀 **In the Data 3.0 era, enterprises and developers can build their own bespoke applications with less code, on top of models and databases.**
## Demo
### AI-Native Data Intelligence Applications

---

- 🔥🔥🔥 [V0.5.0 Released: Developing Native Data Applications through Workflows and Agents](https://www.yuque.com/eosphoros/dbgpt-docs/owcrh9423f9rqkg2)

---

### Data Agents



## Table of Contents

- [Architecture](#architecture)
- [Installation](#installation)
- [Features](#features)
- [Contributing](#contributing)
- [Roadmap](#roadmap)
- [Contact Us](#contact-us)
## Architecture

The core capabilities are mainly the following:

- **RAG (Retrieval-Augmented Generation)**: RAG is currently the most widely practiced and most urgently needed area. DB-GPT has implemented a RAG-based framework on which users can build knowledge-centric applications.
- **GBI**: Generative BI is one of DB-GPT's core capabilities, providing the foundational data-intelligence technology for enterprise report analysis and business insight.
- **Fine-tuning framework**: Model fine-tuning is indispensable for any enterprise landing in vertical, niche domains. DB-GPT provides a complete fine-tuning framework that integrates seamlessly with the project; in recent fine-tuning runs, accuracy on the Spider benchmark reached 82.5%.
- **Data-driven Multi-Agents framework**: DB-GPT provides a data-driven, self-evolving Multi-Agents framework, aiming at continuous decision-making and execution based on data.
- **Data factory**: The data factory cleans and processes trustworthy knowledge and data in the large-model era.
- **Data sources**: Connectors for various data sources, seamlessly plugging production business data into DB-GPT's core capabilities.
### Agentic Workflow Expression Language (AWEL)
AWEL (Agentic Workflow Expression Language) is an agentic workflow expression language designed specifically for developing applications on large models, offering both power and flexibility. With the AWEL API you can focus on the business logic of your LLM application without attending to tedious model and environment details. AWEL adopts a layered API design, with the layers shown in the figure below:



AWEL is designed in three layers: the operator layer, the AgentFrame layer, and the DSL layer, each briefly introduced below (a minimal code sketch follows the list).

- Operator layer

  The operator layer consists of the most basic atomic operations in LLM application development. In a RAG application, for example, retrieval, vectorization, model interaction, and prompt handling are all basic operators. The framework will further abstract and standardize operator design so that sets of operators can be implemented quickly against standard APIs.

- AgentFrame layer

  The AgentFrame layer wraps operators further and supports chained computation over them. The chaining also supports distribution, with operations such as filter, join, map, and reduce; more computation logic will be supported over time.

- DSL layer

  The DSL layer provides a standard structured expression language. Driving AgentFrame and the operators by writing DSL statements makes LLM applications around data more deterministic, avoiding the uncertainty of natural-language programming and turning data-plus-LLM application development into deterministic programming.
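To make the layering concrete, here is a minimal sketch at the operator layer, assuming the `DAG`/`MapOperator` API of `dbgpt.core.awel`:

```python
import asyncio

from dbgpt.core.awel import DAG, MapOperator

# A DAG with a single operator-layer node: MapOperator applies a
# plain function to whatever input the node is called with.
with DAG("awel_hello_world") as dag:
    task = MapOperator(map_function=lambda x: x + 1)

# Trigger the operator directly with input data.
print(asyncio.run(task.call(call_data=1)))  # -> 2
```

Chaining such operators (the AgentFrame layer) and expressing the same flow declaratively (the DSL layer) build on this primitive.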
### RAG Architecture

### Agent Architecture
DB-GPT Agent is a multi-agent framework whose goal is to provide foundational capabilities for building production-grade agents. In our view, production-grade agent applications require data-driven decision-making and must be orchestrated within controllable workflows.
Our design provides a production-grade data application development framework centered on Agents, integrating multi-model management, RAGs, API calls, visualization, AWEL agent orchestration, Text2SQL, intent recognition, and other techniques.

As the figure shows, Agents are first-class citizens in DB-GPT; RAGs, tools, data sources, and the like are resources Agents depend on, and models are a resource too.
An Agent's core modules are Memory, Profile, Planning, and Action.
On top of these core modules, collaboration across multiple Agents is built, taking three main forms:

1. Single Agent: a single Agent with a specific task and goal, not involving multi-agent collaboration.
2. Auto-Plan: Agents make their own plans and, in multi-agent collaboration, handle path planning and division of labor.
3. AWEL: orchestration, where multi-agent collaboration is realized through programmatic workflow orchestration.
### Multi-Model Architecture
In exploring and productionizing AIGC applications, integrating directly with model services is hard to avoid; yet LLM inference deployment still has no de-facto standard, new models and training methods keep appearing, and adapting to the shifting underlying model environment costs significant time, which constrains the exploration and adoption of AIGC applications.

SMMF consists of a model inference layer and a model deployment layer. The inference layer corresponds to inference frameworks such as vLLM, TGI, and TensorRT. The deployment layer connects to the inference layer below and exposes model-serving capabilities above; sitting on top of the inference frameworks, it provides multiple model instances, multiple inference frameworks, multi-cloud support, automatic scaling, and observability.
### Submodules

- [DB-GPT-Hub](https://github.com/eosphoros-ai/DB-GPT-Hub): continuously improving Text2SQL through fine-tuning
- [DB-GPT-Plugins](https://github.com/eosphoros-ai/DB-GPT-Plugins): the DB-GPT plugin repository, compatible with Auto-GPT
- [GPT-Vis](https://github.com/eosphoros-ai/DB-GPT-Web): visualization protocol
- [dbgpts](https://github.com/eosphoros-ai/dbgpts): the official data-app repository, containing data intelligence apps, agent workflow templates, and generic operators built on DB-GPT.
## Installation
[**Tutorial**](https://www.yuque.com/eosphoros/dbgpt-docs/bex30nsv60ru0fmx)

- [**Quick Start**](https://www.yuque.com/eosphoros/dbgpt-docs/ew0kf1plm0bru2ga)
  - [Install from Source](https://www.yuque.com/eosphoros/dbgpt-docs/urh3fcx8tu0s9xmb)
  - [Docker Installation](https://www.yuque.com/eosphoros/dbgpt-docs/glf87qg4xxcyrp89)
  - [Docker Compose Installation](https://www.yuque.com/eosphoros/dbgpt-docs/wwdu11e0v5nkfzin)
- [**User Manual**](https://www.yuque.com/eosphoros/dbgpt-docs/tkspdd0tcy2vlnu4)
  - [Knowledge Base](https://www.yuque.com/eosphoros/dbgpt-docs/ycyz3d9b62fccqxh)
  - [Chat Data](https://www.yuque.com/eosphoros/dbgpt-docs/gd9hbhi1dextqgbz)
  - [Chat Excel](https://www.yuque.com/eosphoros/dbgpt-docs/prugoype0xd2g4bb)
  - [Chat DB](https://www.yuque.com/eosphoros/dbgpt-docs/wswpv3zcm2c9snmg)
  - [Report Analysis](https://www.yuque.com/eosphoros/dbgpt-docs/vsv49p33eg4p5xc1)
  - [Agents](https://www.yuque.com/eosphoros/dbgpt-docs/pom41m7oqtdd57hm)
- [**Advanced Tutorials**](https://www.yuque.com/eosphoros/dbgpt-docs/dxalqb8wsv2xkm5f)
  - [Agentic Workflows](https://www.yuque.com/eosphoros/dbgpt-docs/hcomfb3yrleg7gmq)
  - [Intelligent Apps](https://www.yuque.com/eosphoros/dbgpt-docs/aiagvxeb86iarq6r)
  - [Multi-Model Management](https://www.yuque.com/eosphoros/dbgpt-docs/huzgcf2abzvqy8uv)
  - [Command-Line Usage](https://www.yuque.com/eosphoros/dbgpt-docs/gd4kgumgd004aly8)
- [**Model Service Deployment**](https://www.yuque.com/eosphoros/dbgpt-docs/vubxiv9cqed5mc6o)
  - [Standalone Deployment](https://www.yuque.com/eosphoros/dbgpt-docs/kwg1ed88lu5fgawb)
  - [Cluster Deployment](https://www.yuque.com/eosphoros/dbgpt-docs/gmbp9619ytyn2v1s)
  - [vLLM](https://www.yuque.com/eosphoros/dbgpt-docs/bhy9igdvanx1uluf)
- [**How to Debug**](https://www.yuque.com/eosphoros/dbgpt-docs/eyg0ocbc2ce3q95r)
- [**AWEL**](https://www.yuque.com/eosphoros/dbgpt-docs/zozbzslbfk0m0op5)
- [**FAQ**](https://www.yuque.com/eosphoros/dbgpt-docs/gomtc46qonmyt44l)
## Features

- **Private-domain Q&A, data processing & RAG**: build knowledge bases from built-in sources, multi-format file uploads, or plugin-based crawling, with unified vector storage and retrieval over massive structured and unstructured data.
- **Multi-data-source & GBI**: interact in natural language with Excel, databases, data warehouses, and other data sources, with support for analysis reports.
- **Automated fine-tuning**: a lightweight automated fine-tuning framework built around LLMs, Text2SQL datasets, and methods such as LoRA/QLoRA/P-tuning, making Text2SQL fine-tuning as smooth as an assembly line. See: [DB-GPT-Hub](https://github.com/eosphoros-ai/DB-GPT-Hub)
- **Data-driven Agent plugins**: run tasks via custom plugins, with native support for the Auto-GPT plugin model; the Agents protocol follows the Agent Protocol standard.
- **Multi-model support and management**: broad model support covering dozens of LLMs, open-source and API-proxied alike, such as LLaMA/LLaMA2, Baichuan, ChatGLM, Wenxin, Tongyi, and Zhipu. Currently supported models:
  - Newly supported models
    - 🔥🔥🔥 [Meta-Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct)
    - 🔥🔥🔥 [Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)
    - 🔥🔥🔥 [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
    - 🔥🔥🔥 [gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)
    - 🔥🔥🔥 [gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)
    - 🔥🔥🔥 [DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)
    - 🔥🔥🔥 [DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)
    - 🔥🔥🔥 [Qwen2-57B-A14B-Instruct](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct)
    - 🔥🔥🔥 [Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct)
    - 🔥🔥🔥 [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct)
    - 🔥🔥🔥 [Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct)
    - 🔥🔥🔥 [Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)
    - 🔥🔥🔥 [glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat)
    - 🔥🔥🔥 [Phi-3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
    - 🔥🔥🔥 [Yi-1.5-34B-Chat](https://huggingface.co/01-ai/Yi-1.5-34B-Chat)
    - 🔥🔥🔥 [Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)
    - 🔥🔥🔥 [Yi-1.5-6B-Chat](https://huggingface.co/01-ai/Yi-1.5-6B-Chat)
    - 🔥🔥🔥 [Qwen1.5-110B-Chat](https://huggingface.co/Qwen/Qwen1.5-110B-Chat)
    - 🔥🔥🔥 [Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)
    - 🔥🔥🔥 [Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)
    - 🔥🔥🔥 [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
    - 🔥🔥🔥 [CodeQwen1.5-7B-Chat](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat)
    - 🔥🔥🔥 [Qwen1.5-32B-Chat](https://huggingface.co/Qwen/Qwen1.5-32B-Chat)
    - 🔥🔥🔥 [Starling-LM-7B-beta](https://huggingface.co/Nexusflow/Starling-LM-7B-beta)
    - 🔥🔥🔥 [gemma-7b-it](https://huggingface.co/google/gemma-7b-it)
    - 🔥🔥🔥 [gemma-2b-it](https://huggingface.co/google/gemma-2b-it)
    - 🔥🔥🔥 [SOLAR-10.7B](https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0)
    - 🔥🔥🔥 [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
    - 🔥🔥🔥 [Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat)
    - 🔥🔥🔥 [Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat)
    - [More open-source models](https://www.yuque.com/eosphoros/dbgpt-docs/iqaaqwriwhp6zslc#qQktR)
  - Supported online proxy models
    - [x] [DeepSeek.deepseek-chat](https://platform.deepseek.com/api-docs/)
    - [x] [Ollama.API](https://github.com/ollama/ollama/blob/main/docs/api.md)
    - [x] [Moonshot](https://platform.moonshot.cn/docs/)
    - [x] [Yi](https://platform.lingyiwanwu.com/docs)
    - [x] [OpenAI·ChatGPT](https://api.openai.com/)
    - [x] [Baichuan](https://platform.baichuan-ai.com/)
    - [x] [Alibaba Tongyi](https://www.aliyun.com/product/dashscope)
    - [x] [Baidu Wenxin](https://cloud.baidu.com/product/wenxinworkshop?track=dingbutonglan)
    - [x] [Zhipu ChatGLM](http://open.bigmodel.cn/)
    - [x] [iFlytek Spark](https://xinghuo.xfyun.cn/)
    - [x] [Google·Bard](https://bard.google.com/)
    - [x] [Google·Gemini](https://makersuite.google.com/app/apikey)
- **Privacy and security**: data privacy is safeguarded through private LLM deployment, proxy-based de-identification, and other techniques.
- [Supported data sources](https://www.yuque.com/eosphoros/dbgpt-docs/rc4r27ybmdwg9472)
## Image
🌐 [AutoDL Image](https://www.codewithgpu.com/i/eosphoros-ai/DB-GPT/dbgpt)
🌐 [Mini-Program Cloud Deployment](https://www.yuque.com/eosphoros/dbgpt-docs/ek12ly8k661tbyn8)
### Language Switching
Modify the LANGUAGE parameter in the .env configuration file to switch languages; the default is English (zh for Chinese, en for English, with more languages to come).
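For example, switching the interface to Chinese is a one-line change in `.env`:

```
LANGUAGE=zh
```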
## Usage
### Using Multiple Models
### Using Data Agents

- [Data Agents](https://www.yuque.com/eosphoros/dbgpt-docs/gwz4rayfuwz78fbq)
## Contributing
For a more detailed contribution guide, see [How to Contribute](https://github.com/eosphoros-ai/DB-GPT/blob/main/CONTRIBUTING.md).
This is a complex and innovative tool for databases, and the project is under intensive development, with new features released continuously. If you run into any specific problems while using it, please open an issue on the project first; if needed, contact the WeChat account below and I will do my best to help. Everyone is warmly welcome to take part in building the project.
## Licence
The MIT License (MIT)
## Citation
If you find `DB-GPT` useful for your research or development, please cite the following [paper](https://arxiv.org/abs/2312.17449):
```
@article{xue2023dbgpt,
    title={DB-GPT: Empowering Database Interactions with Private Large Language Models},
    author={Siqiao Xue and Caigao Jiang and Wenhui Shi and Fangyin Cheng and Keting Chen and Hongjun Yang and Zhiping Zhang and Jianshan He and Hongyang Zhang and Ganglin Wei and Wang Zhao and Fan Zhou and Danrui Qi and Hong Yi and Shaodong Liu and Faqiang Chen},
    year={2023},
    journal={arXiv preprint arXiv:2312.17449},
    url={https://arxiv.org/abs/2312.17449}
}
```
97 examples/test_files/graphrag-mini.md Normal file
@@ -0,0 +1,97 @@
# TuGraph DB Project Ecosystem Graph
Entities:
(TuGraph-family/tugraph-db#github_repo)
(vesoft-inc/nebula#github_repo)
(PaddlePaddle/Paddle#github_repo)
(apache/brpc#github_repo)
(TuGraph-family/tugraph-web#github_repo)
(TuGraph-family/tugraph-db-client-java#github_repo)
(alibaba/GraphScope#github_repo)
(ClickHouse/ClickHouse#github_repo)
(TuGraph-family/fma-common#github_repo)
(vesoft-inc/nebula-docs-cn#github_repo)
(eosphoros-ai/DB-GPT#github_repo)
(eosphoros-ai#github_organization)
(yandex#github_organization)
(alibaba#github_organization)
(TuGraph-family#github_organization)
(baidu#github_organization)
(apache#github_organization)
(vesoft-inc#github_organization)

Relationships:
(TuGraph-family/tugraph-db#common_developer#vesoft-inc/nebula#common_developer count 10)
(TuGraph-family/tugraph-db#common_developer#PaddlePaddle/Paddle#common_developer count 9)
(TuGraph-family/tugraph-db#common_developer#apache/brpc#common_developer count 7)
(TuGraph-family/tugraph-db#common_developer#TuGraph-family/tugraph-web#common_developer count 7)
(TuGraph-family/tugraph-db#common_developer#TuGraph-family/tugraph-db-client-java#common_developer count 7)
(TuGraph-family/tugraph-db#common_developer#alibaba/GraphScope#common_developer count 6)
(TuGraph-family/tugraph-db#common_developer#ClickHouse/ClickHouse#common_developer count 6)
(TuGraph-family/tugraph-db#common_developer#TuGraph-family/fma-common#common_developer count 6)
(TuGraph-family/tugraph-db#common_developer#vesoft-inc/nebula-docs-cn#common_developer count 6)
(TuGraph-family/tugraph-db#common_developer#eosphoros-ai/DB-GPT#common_developer count 6)
(eosphoros-ai/DB-GPT#belong_to#eosphoros-ai#belong_to)
(ClickHouse/ClickHouse#belong_to#yandex#belong_to)
(alibaba/GraphScope#belong_to#alibaba#belong_to)
(TuGraph-family/tugraph-db#belong_to#TuGraph-family#belong_to)
(TuGraph-family/tugraph-web#belong_to#TuGraph-family#belong_to)
(TuGraph-family/fma-common#belong_to#TuGraph-family#belong_to)
(TuGraph-family/tugraph-db-client-java#belong_to#TuGraph-family#belong_to)
(PaddlePaddle/Paddle#belong_to#baidu#belong_to)
(apache/brpc#belong_to#apache#belong_to)
(vesoft-inc/nebula#belong_to#vesoft-inc#belong_to)
(vesoft-inc/nebula-docs-cn#belong_to#vesoft-inc#belong_to)


# DB-GPT Project Ecosystem Graph
Entities:
(eosphoros-ai/DB-GPT#github_repo)
(chatchat-space/Langchain-Chatchat#github_repo)
(hiyouga/LLaMA-Factory#github_repo)
(lm-sys/FastChat#github_repo)
(langchain-ai/langchain#github_repo)
(eosphoros-ai/DB-GPT-Hub#github_repo)
(THUDM/ChatGLM-6B#github_repo)
(langgenius/dify#github_repo)
(vllm-project/vllm#github_repo)
(QwenLM/Qwen#github_repo)
(PaddlePaddle/PaddleOCR#github_repo)
(vllm-project#github_organization)
(eosphoros-ai#github_organization)
(PaddlePaddle#github_organization)
(QwenLM#github_organization)
(THUDM#github_organization)
(lm-sys#github_organization)
(chatchat-space#github_organization)
(langchain-ai#github_organization)
(langgenius#github_organization)

Relationships:
(eosphoros-ai/DB-GPT#common_developer#chatchat-space/Langchain-Chatchat#common_developer count 82)
(eosphoros-ai/DB-GPT#common_developer#hiyouga/LLaMA-Factory#common_developer count 45)
(eosphoros-ai/DB-GPT#common_developer#lm-sys/FastChat#common_developer count 39)
(eosphoros-ai/DB-GPT#common_developer#langchain-ai/langchain#common_developer count 37)
(eosphoros-ai/DB-GPT#common_developer#eosphoros-ai/DB-GPT-Hub#common_developer count 37)
(eosphoros-ai/DB-GPT#common_developer#THUDM/ChatGLM-6B#common_developer count 31)
(eosphoros-ai/DB-GPT#common_developer#langgenius/dify#common_developer count 30)
(eosphoros-ai/DB-GPT#common_developer#vllm-project/vllm#common_developer count 27)
(eosphoros-ai/DB-GPT#common_developer#QwenLM/Qwen#common_developer count 26)
(eosphoros-ai/DB-GPT#common_developer#PaddlePaddle/PaddleOCR#common_developer count 24)
(vllm-project/vllm#belong_to#vllm-project#belong_to)
(eosphoros-ai/DB-GPT#belong_to#eosphoros-ai#belong_to)
(eosphoros-ai/DB-GPT-Hub#belong_to#eosphoros-ai#belong_to)
(PaddlePaddle/PaddleOCR#belong_to#PaddlePaddle#belong_to)
(QwenLM/Qwen#belong_to#QwenLM#belong_to)
(THUDM/ChatGLM-6B#belong_to#THUDM#belong_to)
(lm-sys/FastChat#belong_to#lm-sys#belong_to)
(chatchat-space/Langchain-Chatchat#belong_to#chatchat-space#belong_to)
(langchain-ai/langchain#belong_to#langchain-ai#belong_to)
(langgenius/dify#belong_to#langgenius#belong_to)


# About TuGraph
The TuGraph graph database, jointly developed by Ant Group and Tsinghua University, provides a complete graph technology stack covering graph storage, graph computing, graph learning, and a graph development platform. It supports real-time processing of massive multi-source linked data, significantly improves data analysis efficiency, and powers more than 300 applications at Ant Group across payments, security, social, public welfare, and data governance. With graph clusters of industry-leading scale, it has solved the major challenges graph analytics faces in data volume, throughput, and latency, and is key infrastructure for Ant Group's financial risk control, markedly improving real-time identification of financial risks such as fraud and money laundering as well as case-analysis efficiency; it also serves customers in finance, industry, and government services. In the TuGraph product family, the open-source products include TuGraph DB, TuGraph Analytics, OSGraph, and ChatTuGraph; the internal products include GeaBase, GeaFlow, GeaLearn, and GeaMaker.

# About DB-GPT
DB-GPT is an open-source AI Native Data App Development framework with AWEL (Agentic Workflow Expression Language) and Agents. Its purpose is to build infrastructure for the era of large models: through capabilities such as multi-model management (SMMF), Text2SQL optimization, a RAG framework with ongoing optimization, Multi-Agents collaboration, and AWEL (agentic workflow orchestration), it makes building LLM applications around databases simpler and more convenient.
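For orientation, this fixture encodes its graph in an extractor-style triplet notation: an `(entity#type)` line declares a vertex, while a `(subject#relation#object#relation ...)` line declares an edge, optionally carrying a property such as `count`. This reading is inferred from the fixture itself rather than from a formal spec.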
1814 examples/test_files/osgraph.md Normal file (file diff suppressed because it is too large)
286 examples/test_files/tugraph.md Normal file
@@ -0,0 +1,286 @@
# TuGraph
The TuGraph graph database, jointly developed by Ant Group and Tsinghua University, provides a complete graph technology stack covering graph storage, graph computing, graph learning, and a graph development platform. It supports real-time processing of massive multi-source linked data, significantly improves data analysis efficiency, and powers more than 300 applications at Ant Group across payments, security, social, public welfare, and data governance. With graph clusters of industry-leading scale, it has solved the major challenges graph analytics faces in data volume, throughput, and latency, and is key infrastructure for Ant Group's financial risk control, markedly improving real-time identification of financial risks such as fraud and money laundering as well as case-analysis efficiency; it also serves customers in finance, industry, and government services. In the TuGraph product family, the open-source products include TuGraph DB, TuGraph Analytics, OSGraph, and ChatTuGraph; the internal products include GeaBase, GeaFlow, GeaLearn, and GeaMaker.

The TuGraph enterprise-grade graph data management platform provides complex, deep analysis of linked data. With a distributed cluster architecture, TuGraph supports high throughput, high availability, highly concurrent reads and writes, and ACID transactions over massive data. Through data sharding and partitioning it scales horizontally, and it provides querying, filtering, and indexing over vertices, edges, properties, and topology. TuGraph offers offline, nearline, and online graph algorithms and graph learning, with dozens of built-in algorithms able to process the patterns and features of full graphs, subgraphs, and dynamic graphs, interacting with external data sources through visualization or data services. TuGraph also provides a visual interface for display and operation covering the full lifecycle of graph development and services, supports mainstream graph query languages, offers convenient access and development interfaces, and supports import/export, full/incremental/batch updates, and backups against external multi-modal data sources. In addition, TuGraph ships polished, practical monitoring for graph production environments, meeting the technical and business needs of enterprise users.

TuGraph's practice in financial risk control mainly covers personal credit business, anti-fraud, and money-laundering path tracing. Multi-dimensional cross-linked information deeply profiles application and transaction behavior, identifying complex, large-scale, concealed fraud and money-laundering networks; combined with algorithms such as clustering and risk propagation, user risk scores are computed in real time so risky behavior is identified before it occurs, helping financial institutions raise efficiency and lower risk. Built on the TuGraph enterprise platform, Ant Group increased audited anti-fraud amounts by 6% and improved anti-money-laundering case-analysis efficiency by 90%. Around 20 billion edge relationships over nearly 1 billion users are computed daily, and the ability to identify suspected gang-crime risk improved nearly 10x. Furthermore, a credit graph platform delivered for one bank improved risk-model discrimination by 13%; a credit-card application gang-fraud analysis solution for another bank cut computation time to 1/60 of the original; and an enterprise risk graph platform built for a bank achieved over 90% accuracy in guarantee-circle identification for SME rating and lending.


## 1. TuGraph DB

### 1.1 Overview
TuGraph DB is an efficient graph database supporting large data capacity, low-latency lookups, and fast graph analytics. The TuGraph community edition, open-sourced in September 2022, provides complete graph database fundamentals and mature product design (such as ACID-compliant transactions, programming APIs, and companion tools) and suits single-instance deployment. The community edition supports TB-scale data and gives users an efficient, easy-to-use, reliable platform for managing and analyzing complex linked data; it is an ideal choice for learning TuGraph and implementing small projects.

### 1.2 TuGraph Features
TuGraph is an efficient graph database supporting large data volumes, low-latency lookups, and fast graph analytics. It is also a disk-based database, supporting storage of up to tens of terabytes of data. TuGraph provides multiple APIs that let users build applications easily and make them straightforward to extend and optimize.

Its functional features include:

* Property graph model
* Real-time create/read/update/delete
* Multigraph (multiple edges allowed between a pair of vertices)
* Multiple graphs (a large graph plus multiple subgraphs)
* Complete ACID transaction processing with serializable isolation
* Vertex and edge indexes
* Hybrid transactional/analytical processing (HTAP), supporting graph query, graph analytics, and graph learning
* Mainstream graph query languages (OpenCypher, ISO GQL, etc.)
* OLAP API support with 30+ built-in graph analysis algorithms
* C++/Python-based stored procedures, including an in-transaction parallel Traversal API
* Graph visualization tooling
* On performance and scalability:
  * High throughput of tens of millions of vertices per second
  * TB-scale capacity
  * High-availability support
  * High-performance bulk import
  * Online/offline backup and restore


Main features:

- Labeled property graph model
- Complete ACID transaction processing
- 34 built-in graph analysis algorithms
- Full-text / primary-key / secondary index support
- OpenCypher graph query language
- C++/Python-based stored procedures

Performance and scalability:

- LDBC SNB world record holder (2022/9/1, https://ldbcouncil.org/benchmarks/snb/)
- Supports storing up to tens of TB of data
- Accesses millions of vertices per second
- Fast bulk import

TuGraph DB's documentation is available at [this link](https://tugraph-db.readthedocs.io/zh_CN/latest); you are welcome to visit our [official website](https://www.tugraph.org).
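Since TuGraph speaks OpenCypher, here is a minimal, hedged sketch of querying it from Python via DB-GPT's connector. The `run()` call matches the connector tests in this commit; the `from_uri_db` constructor, its import path, and the connection details are assumptions to adapt:

```python
from dbgpt.datasource.conn_tugraph import TuGraphConnector  # import path assumed

# Connection details are illustrative; point them at your TuGraph instance.
connector = TuGraphConnector.from_uri_db(
    host="127.0.0.1", port=7687, user="admin", pwd="73@TuGraph", db_name="default"
)

# An OpenCypher query, the same shape the connector tests in this commit use.
result = connector.run("MATCH (n) RETURN n LIMIT 10")
print(len(result))
```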
### 1.3 Quick Start

An easy way to get started is with Docker. Images are available on [DockerHub](https://hub.docker.com/u/tugraph) under the name `tugraph/tugraph-runtime-[os]:[tugraph version]`,
for example `tugraph/tugraph-runtime-centos7:3.3.0`.

For more details, see the [quick start guide](./docs/zh-CN/source/3.quick-start/1.preparation.md) and the [development guide](./docs/zh-CN/source/development_guide.md).

### 1.4 Building from Source

Building TuGraph DB on a Linux system is recommended, and a Docker environment is a good choice. To set up a fresh environment, see the [Dockerfile](ci/images).

The steps to compile TuGraph DB are:

1. Run `deps/build_deps.sh` if you need the web interface; skip this step otherwise.
2. Run `cmake .. -DOURSYSTEM=centos` or `cmake .. -DOURSYSTEM=ubuntu`, matching the container's OS.
3. `make`
4. `make package` or `cpack --config CPackConfig.cmake`

Example, in the `tugraph/tugraph-compile-centos7` Docker environment:

```bash
$ git clone --recursive https://github.com/TuGraph-family/tugraph-db.git
$ cd tugraph-db
$ deps/build_deps.sh
$ mkdir build && cd build
$ cmake .. -DOURSYSTEM=centos7
$ make
$ make package
```

### 1.5 Development

We provide Docker images on DockerHub for compilation, to help developers get started easily; they are named `tugraph/tugraph-compile-[os]:[compile version]`, for example `tugraph/tugraph-compile-centos7:1.1.0`.

## 2. TuGraph Analytics

### 2.1 Introduction
**TuGraph Analytics** (alias: GeaFlow) is Ant Group's open-source OLAP graph database with [world-class performance](https://ldbcouncil.org/benchmarks/snb-bi/), offering trillion-scale graph storage, hybrid graph/table processing, real-time graph computation, and interactive graph analysis. It is widely used in data warehouse acceleration, financial risk control, knowledge graphs, and social networks.

For more about GeaFlow, see the [GeaFlow introduction](docs/docs-cn/introduction.md).

For GeaFlow's design, see the paper: [GeaFlow: A Graph Extended and Accelerated Dataflow System](https://dl.acm.org/doi/abs/10.1145/3589771).

### 2.2 Origins

Early big-data analytics was mostly offline, and the Hadoop-style stack solved large-scale data analysis well, but its timeliness fell short of highly real-time scenarios. Stream engines typified by Storm solved real-time data processing and improved timeliness, yet Storm provides no state management and is powerless for stateful computation such as aggregation. Flink filled this gap: by introducing state management and a checkpoint mechanism, it achieved efficient stateful stream computation.

As real-time scenarios grew richer, especially in real-time data warehouses, real-time relational computation (stream joins) increasingly became the hard part of making data real-time. Although Flink has excellent state management and outstanding performance, its bottleneck grows obvious when handling joins, especially of three or more degrees: state must be kept for the inputs on both sides of every join, so as joins multiply, state balloons and performance becomes unacceptable. The root cause is that stream systems like Flink take the table as their data model, and a table is a two-dimensional structure that neither defines nor stores relationships, so relational computation can only be expressed through join operations, at high cost.

In Ant's big-data scenarios, especially financial risk control and real-time data warehouses, join computation is pervasive, and improving its timeliness and performance became a major challenge, so we introduced the graph model. A graph describes entity relationships with a vertex-edge structure: vertices represent entities, edges represent relationships, and at the storage level vertices and edges are stored together. The graph model therefore defines data relationships natively while materializing them in storage. Building on it, we implemented the new-generation real-time computation engine GeaFlow, which solves real-time complex relational computation well. GeaFlow is now widely used in warehouse acceleration, financial risk control, knowledge graphs, and social networks.

### 2.3 Features

* Distributed real-time graph computation
* Hybrid graph/table processing (SQL+GQL)
* Unified stream/batch/graph computation
* Trillion-scale native graph storage
* Interactive graph analysis
* High availability and exactly-once semantics
* High-level API operator development
* UDF / graph algorithm / connector plugin support
* One-stop graph development platform
* Cloud-native deployment

### 2.4 Quick Start

1. Prepare Git, JDK 8, Maven, and Docker.
2. Download the source: `git clone https://github.com/TuGraph-family/tugraph-analytics`
3. Build the project: `mvn clean install -DskipTests`
4. Run a test job: `./bin/gql_submit.sh --gql geaflow/geaflow-examples/gql/loop_detection.sql`
5. Build the image: `./build.sh --all`
6. Start the container: `docker run -d --name geaflow-console -p 8888:8888 geaflow-console:0.1`

For more details, see the [quick start guide](docs/docs-cn/quick_start.md).

### 2.5 Development Manual

GeaFlow offers two programming interfaces, DSL and API: you can develop streaming graph jobs in GeaFlow's SQL-like extension language (SQL + ISO/GQL), or build applications in Java against GeaFlow's high-level APIs.
* DSL application development: [DSL documentation](docs/docs-cn/application-development/dsl/overview.md)
* API application development: [API documentation](docs/docs-cn/application-development/api/guid.md)


### 2.6 Architecture

GeaFlow's overall architecture is shown below:



* [DSL layer](./principle/dsl_principle.md): the language layer. GeaFlow designed the fused SQL+GQL analysis language, handling table and graph models uniformly.
* [Framework layer](./principle/framework_principle.md): the framework layer. GeaFlow designed two sets of APIs, for Graph and Stream, supporting fused stream/batch/graph computation, and implemented a unified cycle-based distributed scheduling model.
* [State layer](./principle/state_principle.md): the storage layer. GeaFlow designed two sets of APIs, for Graph and KV, supporting hybrid storage of table and graph data; the overall design is shared-nothing, and data can be persisted to remote storage.
* [Console platform](./principle/console_principle.md): GeaFlow provides a one-stop graph development platform for modeling, processing, and analyzing graph data, with operations and management support for graph jobs.
* **Execution environments**: GeaFlow runs on heterogeneous environments such as K8s, Ray, and local mode.

### 2.7 Application Scenarios

#### 2.7.1 Real-Time Warehouse Acceleration
Warehouse workloads involve heavy join computation; the DWD layer often flattens multiple tables into one wide table to speed up later queries. When the number of joined tables grows, traditional real-time engines struggle to keep joins timely and performant, which has become a thorny problem in real-time warehousing. GeaFlow's real-time graph engine solves this well: it uses a graph as the data model in place of DWD wide tables, builds the graph from data in real time, and at query time exploits the materialized vertex-edge structure to greatly accelerate relational queries.

#### 2.7.2 Real-Time Attribution Analysis
Under informatization, channel attribution and path analysis of user behavior are central to traffic analytics. Computing users' effective behavior paths in real time and reconstructing complete conversion paths quickly shows the business a product's value and lets operations adjust strategy in time. The crux of real-time attribution is accuracy and timeliness: accuracy demands correct behavior-path analysis at controllable cost; timeliness demands computation real-time enough to support fast business decisions.
GeaFlow's streaming graph engine satisfies both requirements well, as shown below:

GeaFlow first converts user behavior logs into a user-behavior topology graph in real time, with each user as a vertex and each related behavior as an edge from that user to the instrumented page. It then uses streaming graph computation to analyze the user's behavior subgraph, matches attribution-path rules against the subgraph to derive the attribution path of the transaction behavior, and outputs the result downstream.

#### 2.7.3 Real-Time Anti-Cash-Out
In credit risk control, detecting credit-card cash-out is a typical requirement. Judging from existing cash-out patterns, cash-out forms a cyclic subgraph, so determining cash-out quickly and efficiently within a large graph greatly improves risk identification. As illustrated below, input sources such as real-time transaction and transfer streams are converted into a real-time transaction graph; graph features of user transaction behavior, such as cycle detection, are computed according to risk-control policies and supplied in real time to decision and monitoring platforms for cash-out determination. With GeaFlow's real-time graph construction and computation, anomalous transactions such as cash-out can be discovered quickly, greatly reducing platform risk.




## 3. OSGraph

**OSGraph (Open Source Graph)** is an open-source graph-insight tool built on the full-domain graph of GitHub open-source data, providing analysis and insight into developer behavior and project community ecosystems. It offers simple, intuitive views of open-source data for developers, project owners, open-source evangelists, and community operators, helping you and your projects craft a bespoke open-source business card, find compatible collaborators, and mine deep community value.


### 3.1 Product URL

**[https://osgraph.com](https://osgraph.com)**


### 3.2 Quick Start

For local setup and testing, see the [OSGraph deployment guide](docs/zh-CN/DeveloperManual.md).


### 3.3 Features

The product currently ships six open-source data graphs to explore: three project graphs (contribution, ecosystem, community) and three developer graphs (activity, partners, interests).


#### 3.3.1 Project Contribution Graph

**Discover core project contributors**: find a project's core contributors from developer activity (issues, PRs, commits, code reviews, etc.).

**Q**: Who writes the code for the Apache Spark project?

**A**: Choose "Project Contribution Graph", search for spark, and select apache/spark. You can see core contributors such as HyukjinKwon and dongjoon-hyun, and incidentally catch two conspicuous characters: AmplabJenkins and SparkQA, bot accounts that only take part in code review.



#### 3.3.2 Project Ecosystem Graph

**Gain insight into a project's ecosystem partners**: extract inter-project development activity, organizations, and other linkage information to build the project's core ecosystem relationships.

**Q**: What does the ecosystem around the recently popular open-source LLM Llama 3 roughly look like?

**A**: Choose "Project Ecosystem Graph", search for llama3, and select meta-llama3/llama3. You can see well-known AI projects such as pytorch, tensorflow, and transformers, plus the headline-making llama.cpp. A pleasant surprise: ray shares quite a few developers with llama3, which is worth digging into.



#### 3.3.3 Project Community Graph

**Analyze a project's community distribution**: extract the distribution of a project's core developer community from development activity, developer organizations, and other information.

**Q**: What is the community of the big-data engine Flink like after all these years of growth?

**A**: Choose "Project Community Graph", search for flink, and select apache/flink. The project's followers come mainly from China, the US, and Germany, and the Alibaba organization is the backbone of code contribution.



#### 3.3.4 Development Activity Graph

**Show personal open-source contributions**: find the core projects a developer takes part in from their activity (issues, PRs, commits, code reviews, etc.).

**Q**: Which open-source projects has the legendary Linus Torvalds been involved in lately?

**A**: Choose "Development Activity Graph" and search for torvalds. Sure enough, linux is torvalds's main work, though he also takes part in llvm, mody, and libgit2, and contributes heavily to subsurface, a dive-log management tool; the greats really do have broad hobbies.



#### 3.3.5 Open-Source Partner Graph

**Find open-source partners**: find the other developers a given developer collaborates with closely in the open-source community.

**Q**: Is there anyone in the open-source community who shares my interests?

**A**: Choose "Open-Source Partner Graph" and search for my own ID. What stunned me was how many strangers follow the same set of projects I do; I should find a chance to meet them and perhaps make new friends. The people I co-author PRs with are mostly friends and colleagues I already know; keep exploring those friends' open-source partners, and the community's "six degrees of separation" emerges.



#### 3.3.6 Open-Source Interest Graph

**Mine personal open-source interests**: analyze a developer's technical domains and interests from the topics, tags, and other information of the projects they join.

**Q**: What technologies interest GitHub's most active developer?

**A**: Choose "Open-Source Interest Graph" and search for sindresorhus (No. 1 on the [GitHub user ranking](https://gitstar-ranking.com)). Overall, sindresorhus is interested in node, npm, and js, and the awesome project he started has a staggering 300k stars. Current interest data mainly comes from projects' limited tag information; AI techniques may present this better in the future.



### 3.4 Roadmap

More interesting graphs and features will come to OSGraph:

* Simple, flexible API design so graphs can be extended without limit.
* Free, efficient canvas interaction for unbounded exploration of data value.
* Graph URLs embeddable in Markdown, to craft your own open-source business card.
* AI-based analysis of project topic tags.
* Joint analysis across multiple people and projects, with one-click graph insight.
* Richer data presentation and multi-dimensional analysis.
* **More features, built hand in hand with you...**



## 4. ChatTuGraph

ChatTuGraph empowers TuGraph with AI, opening up richer application scenarios in graph engineering productivity, graph product solutions, intelligent graph data analysis, automated graph task management, and more.
Currently, ChatTuGraph generates graph-language corpora and uses LLM fine-tuning to achieve natural-language analysis of graph data, builds Graph RAG for knowledge-graph-based retrieval-augmented generation to reduce LLM hallucination, and applies multi-agent techniques (Multiple Agents System) to enable AIGC and intelligence capabilities on graph data.
7 setup.py
@@ -519,6 +519,11 @@ def knowledge_requires():
        "sentence-transformers",
    ]

    setup_spec.extras["graph_rag"] = setup_spec.extras["rag"] + [
        "neo4j",
        "dbgpt-tugraph-plugins>=0.1.0rc1",
    ]
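With this extra in place, the graph RAG dependencies can presumably be pulled in via `pip install "dbgpt[graph_rag]"` (the extra name comes from the block above; whether the published wheel carries the same extra is an assumption).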

def llama_cpp_requires():
    """
@@ -617,7 +622,6 @@ def all_datasource_requires():
        "pyhive",
        "thrift",
        "thrift_sasl",
        "neo4j",
        "vertica_python",
    ]

@@ -691,6 +695,7 @@ def default_requires():
    ]
    setup_spec.extras["default"] += setup_spec.extras["framework"]
    setup_spec.extras["default"] += setup_spec.extras["rag"]
    setup_spec.extras["default"] += setup_spec.extras["graph_rag"]
    setup_spec.extras["default"] += setup_spec.extras["datasource"]
    setup_spec.extras["default"] += setup_spec.extras["torch"]
    setup_spec.extras["default"] += setup_spec.extras["cache"]
@@ -40,3 +40,15 @@ def test_get_indexes(connector):
    # Get the index information of the vertex table named 'person'.
    indexes = connector.get_indexes("person", "vertex")
    assert len(indexes) > 0


def test_run_without_stream(connector):
    query = "MATCH (n) RETURN n limit 10"
    result = connector.run(query)
    assert len(result) == 10


def test_run_with_stream(connector):
    query = "MATCH (n) RETURN n limit 10"
    result = list(connector.run_stream(query))
    assert len(result) == 10
@@ -23,13 +23,13 @@ def test_graph_store(graph_store):
    graph_store.insert_triplet("E", "8", "F")

    subgraph = graph_store.explore(["A"])
    print(f"\n{subgraph.graphviz()}")
    print(f"\n{subgraph.format()}")
    assert subgraph.edge_count == 9

    graph_store.delete_triplet("A", "0", "A")
    graph_store.delete_triplet("B", "4", "D")
    subgraph = graph_store.explore(["A"])
    print(f"\n{subgraph.graphviz()}")
    print(f"\n{subgraph.format()}")
    assert subgraph.edge_count == 7

    triplets = graph_store.get_triplets("B")
@@ -38,4 +38,4 @@ def test_graph_store(graph_store):

    schema = graph_store.get_schema()
    print(f"\nSchema: {schema}")
    assert len(schema) == 138
    assert len(schema) == 86
@@ -2,17 +2,12 @@

import pytest

from dbgpt.storage.graph_store.tugraph_store import TuGraphStore


class TuGraphStoreConfig:
    def __init__(self, name):
        self.name = name
from dbgpt.storage.graph_store.tugraph_store import TuGraphStore, TuGraphStoreConfig


@pytest.fixture(scope="module")
def store():
    config = TuGraphStoreConfig(name="TestGraph")
    config = TuGraphStoreConfig(name="TestGraph", summary_enabled=False)
    store = TuGraphStore(config=config)
    yield store
    store.conn.close()
@@ -29,7 +24,7 @@ def test_insert_and_get_triplets(store):
    store.insert_triplet("F", "7", "E")
    store.insert_triplet("E", "8", "F")
    triplets = store.get_triplets("A")
    assert len(triplets) == 3
    assert len(triplets) == 2
    triplets = store.get_triplets("B")
    assert len(triplets) == 3
    triplets = store.get_triplets("C")
@@ -47,7 +42,7 @@ def test_query(store):
    result = store.query(query)
    v_c = result.vertex_count
    e_c = result.edge_count
    assert v_c == 2 and e_c == 3
    assert v_c == 3 and e_c == 3


def test_explore(store):
@@ -55,13 +50,13 @@ def test_explore(store):
    result = store.explore(subs, depth=2, fan=None, limit=10)
    v_c = result.vertex_count
    e_c = result.edge_count
    assert v_c == 2 and e_c == 3
    assert v_c == 5 and e_c == 5


# def test_delete_triplet(store):
#     subj = "A"
#     rel = "0"
#     obj = "B"
#     store.delete_triplet(subj, rel, obj)
#     triplets = store.get_triplets(subj)
#     assert len(triplets) == 0
def test_delete_triplet(store):
    subj = "A"
    rel = "0"
    obj = "B"
    store.delete_triplet(subj, rel, obj)
    triplets = store.get_triplets(subj)
    assert len(triplets) == 0
@@ -0,0 +1,58 @@
import pytest

from dbgpt.storage.graph_store.tugraph_store import TuGraphStore, TuGraphStoreConfig
from dbgpt.storage.graph_store.graph import MemoryGraph, Edge, Vertex


@pytest.fixture(scope="module")
def store():
    config = TuGraphStoreConfig(name="TestSummaryGraph", summary_enabled=True)
    store_instance = TuGraphStore(config=config)
    yield store_instance
    store_instance.conn.close()


def test_insert_graph(store):
    graph = MemoryGraph()
    vertex_list = [
        Vertex("A", "A", description="Vertex A", _document_id="Test doc"),
        Vertex("B", "B", description="Vertex B", _document_id="Test doc"),
        Vertex("C", "C", description="Vertex C", _document_id="Test doc"),
        Vertex("D", "D", description="Vertex D", _document_id="Test doc"),
        Vertex("E", "E", description="Vertex E", _document_id="Test doc"),
        Vertex("F", "F", description="Vertex F", _document_id="Test doc"),
        Vertex("G", "G", description="Vertex G", _document_id="Test doc"),
    ]
    edge_list = [
        Edge("A", "B", name="A-B", description="description of edge"),
        Edge("B", "C", name="B-C", description="description of edge"),
        Edge("C", "D", name="C-D", description="description of edge"),
        Edge("D", "E", name="D-E", description="description of edge"),
        Edge("E", "F", name="E-F", description="description of edge"),
        Edge("F", "G", name="F-G", description="description of edge"),
    ]
    for vertex in vertex_list:
        graph.upsert_vertex(vertex)
    for edge in edge_list:
        graph.append_edge(edge)
    store.insert_graph(graph)


def test_leiden_query(store):
    query = "CALL db.plugin.callPlugin('CPP','leiden','{\"leiden_val\":\"_community_id\"}',60.00,false)"
    result = store.query(query)
    assert result.vertex_count == 1


def test_query_node_and_edge(store):
    query = 'MATCH (n)-[r]->(m) WHERE n._community_id = "0" RETURN n,r,m'
    result = store.query(query)
    assert result.vertex_count == 7 and result.edge_count == 6


def test_stream_query_path(store):
    query = 'MATCH p=(n)-[r:relation*2]->(m) WHERE n._community_id = "0" RETURN p'
    result = store.query(query)
    for v in result.vertices():
        print(v.get_prop("_community_id"))
    assert result.vertex_count == 7 and result.edge_count == 6
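Note that `test_leiden_query` assumes the C++ `leiden` plugin has already been loaded into the target TuGraph instance; the `CALL db.plugin.callPlugin(...)` invocation will fail on a vanilla server without it, and the `_community_id` property written by the plugin is what the two follow-up tests filter on.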
@@ -6,15 +6,15 @@ from dbgpt.storage.graph_store.graph import MemoryGraph, Edge, Vertex, Direction
@pytest.fixture
def g():
    g = MemoryGraph()
    g.append_edge(Edge("A", "A", label="0"))
    g.append_edge(Edge("A", "A", label="1"))
    g.append_edge(Edge("A", "B", label="2"))
    g.append_edge(Edge("B", "C", label="3"))
    g.append_edge(Edge("B", "D", label="4"))
    g.append_edge(Edge("C", "D", label="5"))
    g.append_edge(Edge("B", "E", label="6"))
    g.append_edge(Edge("F", "E", label="7"))
    g.append_edge(Edge("E", "F", label="8"))
    g.append_edge(Edge("A", "A", "0"))
    g.append_edge(Edge("A", "A", "1"))
    g.append_edge(Edge("A", "B", "2"))
    g.append_edge(Edge("B", "C", "3"))
    g.append_edge(Edge("B", "D", "4"))
    g.append_edge(Edge("C", "D", "5"))
    g.append_edge(Edge("B", "E", "6"))
    g.append_edge(Edge("F", "E", "7"))
    g.append_edge(Edge("E", "F", "8"))
    g.upsert_vertex(Vertex("G"))
    yield g

@@ -25,14 +25,20 @@ def g():
        (lambda g: g.del_vertices("G", "G"), 6, 9),
        (lambda g: g.del_vertices("C"), 6, 7),
        (lambda g: g.del_vertices("A", "G"), 5, 6),
        (lambda g: g.del_edges("E", "F", label="8"), 7, 8),
        (lambda g: g.del_edges("A", "A"), 7, 7),
        (lambda g: g.del_edges("A", "B"), 7, 8),
        (lambda g: g.del_edges("A", "A", "0"), 7, 8),
        (lambda g: g.del_edges("E", "F", "8"), 7, 8),
        (lambda g: g.del_edges("E", "F", "9"), 7, 9),
        (lambda g: g.del_edges("E", "F", val=1), 7, 9),
        (lambda g: g.del_edges("E", "F", "8", val=1), 7, 9),
        (lambda g: g.del_edges("E", "F", "9", val=1), 7, 9),
        (lambda g: g.del_neighbor_edges("A", Direction.IN), 7, 7),
    ],
)
def test_delete(g, action, vc, ec):
    action(g)
    result = g.graphviz()
    result = g.format()
    print(f"\n{result}")
    assert g.vertex_count == vc
    assert g.edge_count == ec
@@ -50,7 +56,7 @@ def test_delete(g, action, vc, ec):
)
def test_search(g, vids, dir, vc, ec):
    subgraph = g.search(vids, dir)
    print(f"\n{subgraph.graphviz()}")
    print(f"\n{subgraph.format()}")
    assert subgraph.vertex_count == vc
    assert subgraph.edge_count == ec

@@ -65,7 +71,7 @@ def test_search(g, vids, dir, vc, ec):
)
def test_search_result_limit(g, vids, dir, ec):
    subgraph = g.search(vids, dir, limit=ec)
    print(f"\n{subgraph.graphviz()}")
    print(f"\n{subgraph.format()}")
    assert subgraph.edge_count == ec


@@ -79,7 +85,7 @@ def test_search_result_limit(g, vids, dir, ec):
)
def test_search_fan_limit(g, vids, dir, fan, ec):
    subgraph = g.search(vids, dir, fan=fan)
    print(f"\n{subgraph.graphviz()}")
    print(f"\n{subgraph.format()}")
    assert subgraph.edge_count == ec


@@ -97,5 +103,5 @@ def test_search_fan_limit(g, vids, dir, fan, ec):
)
def test_search_depth_limit(g, vids, dir, dep, ec):
    subgraph = g.search(vids, dir, depth=dep)
    print(f"\n{subgraph.graphviz()}")
    print(f"\n{subgraph.format()}")
    assert subgraph.edge_count == ec
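Grounded in the parametrized cases above, a small sketch of the MemoryGraph deletion API (imports match this test file; the printed counts depend on the vertices implicitly created by `append_edge`):

```python
from dbgpt.storage.graph_store.graph import Direction, Edge, MemoryGraph, Vertex

g = MemoryGraph()
g.append_edge(Edge("A", "B", "knows"))   # third positional argument is the edge name
g.append_edge(Edge("B", "A", "likes"))
g.upsert_vertex(Vertex("C"))             # an isolated vertex

g.del_edges("A", "B", "knows")           # delete by endpoints plus edge name
g.del_neighbor_edges("A", Direction.IN)  # delete A's incoming edges ("B -> A")
print(g.vertex_count, g.edge_count)      # vertices remain; both edges are gone
```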
@@ -83,9 +83,9 @@ const RecallTestModal: React.FC<RecallTestModalProps> = ({ open, setOpen, space
      destroyOnClose={true}
    >
      <Card
        title='召回配置'
        size='small'
        className='my-4'
        title="召回配置"
        size="small"
        className="my-4"
        extra={
          <Popover
            placement='bottomRight'
@@ -160,7 +160,7 @@ const RecallTestModal: React.FC<RecallTestModalProps> = ({ open, setOpen, space
          )} */}
        </Form>
      </Card>
      <Card title='召回结果' size='small'>
      <Card title="召回结果" size="small">
        <Spin spinning={loading}>
          {resultList.length > 0 ? (
            <div
@@ -172,9 +172,9 @@ const RecallTestModal: React.FC<RecallTestModalProps> = ({ open, setOpen, space
            {resultList.map(item => (
              <Card
                title={
                  <div className='flex items-center'>
                    <Tag color='blue'># {item.chunk_id}</Tag>
                    {item.metadata.prop_field.title}
                  <div className="flex items-center">
                    <Tag color="blue"># {item.chunk_id}</Tag>
                    {item.metadata.source}
                  </div>
                }
                extra={
@@ -1,10 +1,10 @@
/** @type {import('next').NextConfig} */
const CopyPlugin = require('copy-webpack-plugin');
const MonacoWebpackPlugin = require('monaco-editor-webpack-plugin');
const path = require('path');
const CopyPlugin = require("copy-webpack-plugin");
const MonacoWebpackPlugin = require("monaco-editor-webpack-plugin");
const path = require("path");
const nextConfig = {
  experimental: {
    esmExternals: 'loose',
    esmExternals: "loose",
  },
  typescript: {
    ignoreBuildErrors: true,
@@ -27,30 +27,35 @@ const nextConfig = {
      new CopyPlugin({
        patterns: [
          {
            from: path.join(__dirname, 'node_modules/@oceanbase-odc/monaco-plugin-ob/worker-dist/'),
            to: 'static/ob-workers',
            from: path.join(
              __dirname,
              "node_modules/@oceanbase-odc/monaco-plugin-ob/worker-dist/"
            ),
            to: "static/ob-workers",
          },
        ],
      }),
      })
    );
    // Add the monaco-editor-webpack-plugin plugin
    config.plugins.push(
      new MonacoWebpackPlugin({
        // Plugin options can be configured here, for example:
        languages: ['sql'],
        filename: 'static/[name].worker.js',
      }),
        languages: ["sql"],
        filename: "static/[name].worker.js",
      })
    );
  }
  return config;
},
};

const withTM = require('next-transpile-modules')([
  '@berryv/g2-react',
  '@antv/g2',
  'react-syntax-highlighter',
  '@antv/gpt-vis',
const withTM = require("next-transpile-modules")([
  "@berryv/g2-react",
  "@antv/g2",
  "react-syntax-highlighter",
  "@antv/g6",
  "@antv/graphin",
  "@antv/gpt-vis",
]);

module.exports = withTM({
13767 web/package-lock.json generated (file diff suppressed because it is too large)
@@ -23,9 +23,12 @@
  "dependencies": {
    "@ant-design/cssinjs": "^1.18.4",
    "@ant-design/icons": "^5.2.5",
    "@antv/algorithm": "^0.1.26",
    "@antv/ava": "3.5.0-alpha.4",
    "@antv/g2": "^5.1.8",
    "@antv/gpt-vis": "^0.0.5",
    "@antv/g6": "^5.0.17",
    "@antv/graphin": "^3.0.2",
    "@antv/s2": "^1.51.2",
    "@berryv/g2-react": "^0.1.0",
    "@emotion/react": "^11.11.4",
@@ -44,6 +47,9 @@
    "classnames": "^2.3.2",
    "cookies-next": "^4.0.0",
    "copy-to-clipboard": "^3.3.3",
    "framer-motion": "^10.16.4",
    "google-auth-library": "^9.2.0",
    "google-one-tap": "^1.0.6",
    "cytoscape": "^3.29.2",
    "cytoscape-euler": "^1.2.2",
    "eslint-plugin-prettier": "^5.2.1",
@@ -53,16 +59,22 @@
    "dayjs": "^1.11.12",
    "i18next": "^23.4.5",
    "iron-session": "^6.3.1",
    "lodash": "^4.17.21",
    "markdown-it": "^14.1.0",
    "moment": "^2.29.4",
    "monaco-editor": ">=0.31.0",
    "multer": "^1.4.5-lts.1",
    "mysql2": "^3.6.2",
    "next": "13.4.7",
    "next-auth": "^4.20.1",
    "next-connect": "^1.0.0-next.4",
    "next-transpile-modules": "^10.0.1",
    "nprogress": "^0.2.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
@@ -82,7 +94,6 @@
  },
  "devDependencies": {
    "@types/crypto-js": "^4.1.2",
    "@types/cytoscape": "^3.21.0",
    "@types/google-one-tap": "^1.2.4",
    "@types/lodash": "^4.14.195",
    "@types/markdown-it": "^14.1.1",
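The `@antv/g6` and `@antv/graphin` dependencies added above, together with their registration in `next-transpile-modules` in the config change earlier, support the new knowledge-graph visualization page that follows.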
232 web/pages/knowledge/graph/index.tsx Normal file
@@ -0,0 +1,232 @@
|
||||
import React, { useEffect, useMemo, useRef, useState } from "react";
|
||||
import { Button, Spin } from "antd";
|
||||
import { RollbackOutlined } from "@ant-design/icons";
|
||||
import { apiInterceptors, getGraphVis } from "@/client/api";
|
||||
import { useRouter } from "next/router";
|
||||
import { idOf } from "@antv/g6";
|
||||
import type {
|
||||
Graph,
|
||||
GraphData,
|
||||
GraphOptions,
|
||||
ID,
|
||||
IPointerEvent,
|
||||
PluginOptions,
|
||||
} from "@antv/g6";
|
||||
import type { GraphVisResult } from "../../../types/knowledge";
|
||||
import { Graphin } from "@antv/graphin";
|
||||
import { getDegree, getSize, isInCommunity } from "../../../utils/graph";
|
||||
import { groupBy } from "lodash";
|
||||
|
||||
type GraphVisData = GraphVisResult | null;
|
||||
|
||||
const PALETTE = [
|
||||
"#5F95FF",
|
||||
"#61DDAA",
|
||||
"#F6BD16",
|
||||
"#7262FD",
|
||||
"#78D3F8",
|
||||
"#9661BC",
|
||||
"#F6903D",
|
||||
"#008685",
|
||||
"#F08BB4",
|
||||
];
|
||||
|
||||
function GraphVis() {
|
||||
const LIMIT = 500;
|
||||
const router = useRouter();
|
||||
const [data, setData] = useState<GraphVisData>(null);
|
||||
const graphRef = useRef<Graph | null>();
|
||||
const [isReady, setIsReady] = useState(false);
|
||||
|
||||
const fetchGraphVis = async () => {
|
||||
const [_, data] = await apiInterceptors(
|
||||
getGraphVis(spaceName as string, { limit: LIMIT })
|
||||
);
|
||||
setData(data);
|
||||
};
|
||||
|
||||
const transformData = (data: GraphVisData): GraphData => {
|
||||
if (!data) return { nodes: [], edges: [] };
|
||||
|
||||
const nodes = data.nodes.map((node) => ({ id: node.id, data: node }));
|
||||
const edges = data.edges.map((edge) => ({
|
||||
source: edge.source,
|
||||
target: edge.target,
|
||||
data: edge,
|
||||
}));
|
||||
|
||||
return { nodes, edges };
|
||||
};
|
||||
|
||||
const back = () => {
|
||||
router.push(`/construct/knowledge`);
|
||||
};
|
||||
|
||||
const {
|
||||
query: { spaceName },
|
||||
} = useRouter();
|
||||
|
||||
useEffect(() => {
|
||||
if (spaceName) fetchGraphVis();
|
||||
}, [spaceName]);
|
||||
|
||||
  const graphData = useMemo(() => transformData(data), [data]);

  // Once the graph is ready, wrap each multi-node community (same communityId)
  // with a bubble-sets hull tinted with the community's palette color.
  useEffect(() => {
    if (isReady && graphRef.current) {
      const groupedNodes = groupBy(
        graphData.nodes,
        (node) => node.data!.communityId
      );
      const plugins: PluginOptions = [];
      Object.entries(groupedNodes).forEach(([key, nodes]) => {
        if (!key || nodes.length < 2) return;
        const color = graphRef.current?.getElementRenderStyle(
          idOf(nodes[0])
        ).fill;
        plugins.push({
          key,
          type: "bubble-sets",
          members: nodes.map(idOf),
          stroke: color,
          fill: color,
          fillOpacity: 0.1,
        });
      });

      graphRef.current.setPlugins((prev) => [...prev, ...plugins]);
    }
  }, [isReady]);

  const getNodeSize = (nodeId: ID) => {
    return getSize(getNodeDegree(nodeId));
  };

  const getNodeDegree = (nodeId?: ID) => {
    if (!nodeId) return 0;
    return getDegree(graphData.edges!, nodeId);
  };

  const options: GraphOptions = {
    data: graphData,
    autoFit: "center",
    node: {
      style: (d) => {
        const style = {
          size: getNodeSize(idOf(d)),
          label: true,
          labelLineWidth: 2,
          labelText: d.data?.name as string,
          labelFontSize: 10,
          labelBackground: true,
          labelBackgroundFill: "#e5e7eb",
          labelPadding: [0, 6],
          labelBackgroundRadius: 4,
          labelMaxWidth: "400%",
          labelWordWrap: true,
        };
        // Gray out nodes that do not belong to a sizeable community.
        if (!isInCommunity(graphData, idOf(d))) {
          Object.assign(style, { fill: "#b0b0b0" });
        }
        return style;
      },
      state: {
        active: {
          lineWidth: 2,
          labelWordWrap: false,
          labelFontSize: 12,
          labelFontWeight: "bold",
        },
        inactive: {
          label: false,
        },
      },
      palette: {
        type: "group",
        field: "communityId",
        color: PALETTE,
      },
    },
    edge: {
      style: {
        lineWidth: 1,
        stroke: "#e2e2e2",
        endArrow: true,
        endArrowType: "vee",
        label: true,
        labelFontSize: 8,
        labelBackground: true,
        labelText: (e) => e.data!.name as string,
        labelBackgroundFill: "#e5e7eb",
        labelPadding: [0, 6],
        labelBackgroundRadius: 4,
        labelMaxWidth: "60%",
        labelWordWrap: true,
      },
      state: {
        active: {
          stroke: "#b0b0b0",
          labelWordWrap: false,
          labelFontSize: 10,
          labelFontWeight: "bold",
        },
        inactive: {
          label: false,
        },
      },
    },
    behaviors: [
      "drag-canvas",
      "zoom-canvas",
      "drag-element",
      {
        type: "hover-activate",
        degree: 1,
        state: "active",
        enable: (event: IPointerEvent) => ["node"].includes(event.targetType),
      },
    ],
    animation: false,
    layout: {
      type: "force",
      preventOverlap: true,
      nodeSize: (d) => getNodeSize(d?.id as ID),
      // Spread high-degree nodes further apart, capping the link length at 700.
      linkDistance: (edge) => {
        const { source, target } = edge as { source: ID; target: ID };
        const nodeSize = Math.min(getNodeSize(source), getNodeSize(target));
        const degree = Math.min(getNodeDegree(source), getNodeDegree(target));
        return degree === 1
          ? nodeSize * 2
          : Math.min(degree * nodeSize * 1.5, 700);
      },
    },
    transforms: ["process-parallel-edges"],
  };

  if (!data) return <Spin className="h-full justify-center content-center" />;

  return (
    <div className="p-4 h-full overflow-y-scroll relative px-2">
      <Graphin
        ref={(ref) => {
          graphRef.current = ref;
        }}
        style={{ height: "100%", width: "100%" }}
        options={options}
        onReady={() => {
          setIsReady(true);
        }}
      >
        <Button
          style={{ background: "#fff" }}
          onClick={back}
          icon={<RollbackOutlined />}
        >
          Back
        </Button>
      </Graphin>
    </div>
  );
}

export default GraphVis;
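
Note: `transformData` is not shown in this diff. From how the options and utilities above read the data, it is assumed to produce G6 `GraphData` with `name` and `communityId` on each node's `data`, and `name` (plus the `label` consumed by `getCommunityId`) on each edge's `data`. A minimal sketch under those assumptions; the `ApiGraph` response shape below is hypothetical and may differ from the actual API:

import type { GraphData } from "@antv/g6";

// Hypothetical server response shape; actual field names may differ.
interface ApiGraph {
  nodes: { id: string; name: string; communityId?: string }[];
  edges: { source: string; target: string; name: string }[];
}

function transformData(data?: ApiGraph): GraphData {
  if (!data) return { nodes: [], edges: [] };
  return {
    nodes: data.nodes.map((n) => ({
      id: n.id,
      // `communityId` drives the palette grouping and the bubble-sets hulls.
      data: { name: n.name, communityId: n.communityId ?? "" },
    })),
    edges: data.edges.map((e) => ({
      source: e.source,
      target: e.target,
      // `name` feeds the edge label; `label` is read by getCommunityId.
      data: { name: e.name, label: e.name },
    })),
  };
}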
100
web/utils/graph.ts
Normal file
@@ -0,0 +1,100 @@
import { idOf } from "@antv/g6";
import { pick, groupBy } from "lodash";
import type { EdgeData, GraphData, ID } from "@antv/g6";

/**
 * Reassign the layout style (x/y/z positions) to the original graph data
 * @param model - original graph data
 * @param layoutResult - layout result
 */
export function reassignLayoutStyle(model: GraphData, layoutResult: GraphData) {
  layoutResult.nodes?.forEach((layoutNode) => {
    const modelNode = model.nodes?.find((node) => node.id === layoutNode.id);
    if (modelNode?.style)
      Object.assign(modelNode.style, pick(layoutNode.style, ["x", "y", "z"]));
  });
}
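
// Illustrative usage (not part of the module): copy positions produced by a
// separately computed layout back onto the original data before rendering;
// `runForceLayout` here is a hypothetical helper, not a real API.
//   const layoutResult = await runForceLayout(model);
//   reassignLayoutStyle(model, layoutResult);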

/**
 * Calculate node size based on degree
 * @param degree - degree of the node
 * @param minSize - minimum size of the node
 * @param maxSize - maximum size of the node
 * @param minDegree - minimum degree
 * @param maxDegree - maximum degree
 * @returns size of the node
 */
export function getSize(
  degree: number,
  minSize = 24,
  maxSize = 60,
  minDegree = 1,
  maxDegree = 10
): number {
  const _degree = Math.max(minDegree, Math.min(maxDegree, degree));

  const size =
    minSize +
    ((_degree - minDegree) / (maxDegree - minDegree)) * (maxSize - minSize);

  return size;
}
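
// Illustrative values (not part of the module): with the defaults, degree is
// clamped to [1, 10] and size interpolates linearly between 24 and 60:
//   getSize(1)  === 24
//   getSize(5)  === 40
//   getSize(10) === 60
//   getSize(25) === 60  // clamped to maxDegree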

/**
 * Get node degree, i.e. the number of edges connected to the node
 * @param edges - all edges data
 * @param nodeId - node id
 * @returns degree of the node
 */
export function getDegree(edges: EdgeData[], nodeId: ID) {
  return getRelatedEdgesData(edges, nodeId).length;
}

/**
 * Get related edges data of a node
 * @param edges - all edges data
 * @param nodeId - node id
 * @returns related edges data
 */
export function getRelatedEdgesData(edges: EdgeData[], nodeId: ID) {
  return edges.filter(
    (edge) => edge.source === nodeId || edge.target === nodeId
  );
}
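
// Illustrative usage (not part of the module): for edges A->B, A->C, B->C,
// getDegree(edges, "A") is 2, and getRelatedEdgesData(edges, "B") returns
// the A->B and B->C records.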

/**
 * Concatenate the direction, label, and opposite endpoint of each related edge
 * into a stable key for the node
 * @param edges - all edges data
 * @param nodeId - node id
 * @returns edge key
 */
export function getCommunityId(edges: EdgeData[], nodeId: ID) {
  const relatedEdges = getRelatedEdgesData(edges, nodeId);
  const key = relatedEdges
    .map((edge) => {
      const direction = edge.source === nodeId ? "->" : "<-";
      const otherEnd = edge.source === nodeId ? edge.target : edge.source;
      return `${direction}_${edge.data!.label}_${otherEnd}`;
    })
    .sort()
    .join("+");
  return key;
}
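
// Illustrative key (not part of the module): for node "A" with edges
// A->B (label "knows") and C->A (label "likes"), the sorted key is
// "->_knows_B+<-_likes_C" ("-" sorts before "<" in ASCII).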

/**
 * Whether the node is in a community (same communityId) with more than `limit` nodes
 * @param data - graph data
 * @param nodeId - node id
 * @param limit - limit
 * @returns boolean
 */
export function isInCommunity(data: GraphData, nodeId: string, limit = 2) {
  const groupedNodes = groupBy(data.nodes, (node) => node.data!.communityId);
  const filtered = Object.values(groupedNodes).find((nodes) =>
    nodes.map(idOf).includes(nodeId)
  )!;
  return filtered.length > limit;
}
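
// Illustrative check (not part of the module): with nodes A, B, and C all
// sharing communityId "c1", isInCommunity(data, "A") is true for the default
// limit of 2; a node in a singleton community yields false. Note the non-null
// assertion assumes every node falls into some communityId group.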
10526
web/yarn.lock
Normal file
File diff suppressed because it is too large