feat(GraphRAG): enhance GraphRAG by graph community summary (#1801)

Co-authored-by: Florian <fanzhidongyzby@163.com>
Co-authored-by: KingSkyLi <15566300566@163.com>
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
Co-authored-by: yvonneyx <zhuyuxin0627@gmail.com>
Author: M1n9X
Date: 2024-08-30 21:59:44 +08:00 (committed by GitHub)
Parent: 471689ba20
Commit: 759f7d99cc
59 changed files with 29316 additions and 411 deletions


@@ -71,7 +71,7 @@ EMBEDDING_MODEL=text2vec
#EMBEDDING_MODEL=bge-large-zh
KNOWLEDGE_CHUNK_SIZE=500
KNOWLEDGE_SEARCH_TOP_SIZE=5
KNOWLEDGE_GRAPH_SEARCH_TOP_SIZE=50
KNOWLEDGE_GRAPH_SEARCH_TOP_SIZE=200
## Maximum number of chunks to load at once. If a single document is very large,
## you can raise this value for better performance.
## If you run out of memory when loading a large document, lower this value.
@@ -157,6 +157,11 @@ EXECUTE_LOCAL_COMMANDS=False
#*******************************************************************#
VECTOR_STORE_TYPE=Chroma
GRAPH_STORE_TYPE=TuGraph
GRAPH_COMMUNITY_SUMMARY_ENABLED=True
KNOWLEDGE_GRAPH_EXTRACT_SEARCH_TOP_SIZE=5
KNOWLEDGE_GRAPH_EXTRACT_SEARCH_RECALL_SCORE=0.3
KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_TOP_SIZE=20
KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_RECALL_SCORE=0.0
### Chroma vector db config
#CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data
@@ -187,7 +192,7 @@ ElasticSearch_PASSWORD={your_password}
#TUGRAPH_PASSWORD=73@TuGraph
#TUGRAPH_VERTEX_TYPE=entity
#TUGRAPH_EDGE_TYPE=relation
#TUGRAPH_EDGE_NAME_KEY=label
#TUGRAPH_PLUGIN_NAMES=leiden
#*******************************************************************#
#** WebServer Language Support **#

.gitignore

@@ -4,7 +4,6 @@ __pycache__/
*$py.class
# C extensions
*.so
message/
dbgpt/util/extensions/
.env*
@@ -185,4 +184,4 @@ thirdparty
/examples/**/*.gv
/examples/**/*.gv.pdf
/i18n/locales/**/**/*_ai_translated.po
/i18n/locales/**/**/*~
/i18n/locales/**/**/*~


@@ -213,6 +213,9 @@ class Config(metaclass=Singleton):
# Vector Store Configuration
self.VECTOR_STORE_TYPE = os.getenv("VECTOR_STORE_TYPE", "Chroma")
self.GRAPH_COMMUNITY_SUMMARY_ENABLED = (
os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
)
self.MILVUS_URL = os.getenv("MILVUS_URL", "127.0.0.1")
self.MILVUS_PORT = os.getenv("MILVUS_PORT", "19530")
self.MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", None)


@@ -112,13 +112,15 @@ def arguments(space_id: str):
@router.post("/knowledge/{space_name}/recall_test")
def recall_test(
async def recall_test(
space_name: str,
request: DocumentRecallTestRequest,
):
print(f"/knowledge/{space_name}/recall_test params:")
try:
return Result.succ(knowledge_space_service.recall_test(space_name, request))
return Result.succ(
await knowledge_space_service.recall_test(space_name, request)
)
except Exception as e:
return Result.failed(code="E000X", msg=f"{space_name} recall_test error {e}")


@@ -309,7 +309,7 @@ class KnowledgeService:
"""
return knowledge_space_dao.get_knowledge_space_by_ids(ids)
def recall_test(
async def recall_test(
self, space_name, doc_recall_test_request: DocumentRecallTestRequest
):
logger.info(f"recall_test {space_name}, {doc_recall_test_request}")
@@ -338,7 +338,7 @@ class KnowledgeService:
knowledge_space_retriever = KnowledgeSpaceRetriever(
space_id=space.id, top_k=top_k
)
chunks = knowledge_space_retriever.retrieve_with_scores(
chunks = await knowledge_space_retriever.aretrieve_with_scores(
question, score_threshold
)
retrievers_end_time = timeit.default_timer()
@@ -646,13 +646,16 @@ class KnowledgeService:
graph = vector_store_connector.client.query_graph(limit=limit)
res = {"nodes": [], "edges": []}
for node in graph.vertices():
res["nodes"].append({"vid": node.vid})
for edge in graph.edges():
res["edges"].append(
res["nodes"].append(
{
"src": edge.sid,
"dst": edge.tid,
"label": edge.props[graph.edge_label],
"id": node.vid,
"communityId": node.get_prop("_community_id"),
"name": node.vid,
"type": "",
}
)
for edge in graph.edges():
res["edges"].append(
{"source": edge.sid, "target": edge.tid, "name": edge.name, "type": ""}
)
return res

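For orientation, the reshaped payload returned by the updated query_graph endpoint now looks roughly like this (entity names are invented for illustration):

example_graph_payload = {
    "nodes": [
        {"id": "Philz Coffee", "communityId": "0", "name": "Philz Coffee", "type": ""},
    ],
    "edges": [
        {"source": "Phil Jaber", "target": "Philz Coffee", "name": "Founded", "type": ""},
    ],
}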

@@ -1,7 +1,7 @@
"""TuGraph Connector."""
import json
from typing import Dict, List, cast
from typing import Dict, Generator, List, cast
from .base import BaseConnector
@@ -23,11 +23,16 @@ class TuGraphConnector(BaseConnector):
def create_graph(self, graph_name: str) -> None:
"""Create a new graph."""
# Check whether the graph already exists and create it if not.
with self._driver.session(database="default") as session:
graph_list = session.run("CALL dbms.graph.listGraphs()").data()
exists = any(item["graph_name"] == graph_name for item in graph_list)
if not exists:
session.run(f"CALL dbms.graph.createGraph('{graph_name}', '', 2048)")
try:
with self._driver.session(database="default") as session:
graph_list = session.run("CALL dbms.graph.listGraphs()").data()
exists = any(item["graph_name"] == graph_name for item in graph_list)
if not exists:
session.run(
f"CALL dbms.graph.createGraph('{graph_name}', '', 2048)"
)
except Exception as e:
raise Exception(f"Failed to create graph '{graph_name}': {str(e)}")
def delete_graph(self, graph_name: str) -> None:
"""Delete a graph."""
@@ -89,10 +94,19 @@ class TuGraphConnector(BaseConnector):
self._driver.close()
def run(self, query: str, fetch: str = "all") -> List:
"""Run query."""
with self._driver.session(database=self._graph) as session:
try:
result = session.run(query)
return list(result)
except Exception as e:
raise Exception(f"Query execution failed: {e}")
def run_stream(self, query: str) -> Generator:
"""Run GQL."""
with self._driver.session(database=self._graph) as session:
result = session.run(query)
return list(result)
yield from result
def get_columns(self, table_name: str, table_type: str = "vertex") -> List[Dict]:
"""Get fields about specified graph.

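A minimal usage sketch of the reworked connector, assuming a reachable TuGraph instance (the host, credentials and graph name below are placeholders, not values from this commit):

from dbgpt.datasource.conn_tugraph import TuGraphConnector

connector = TuGraphConnector.from_uri_db(
    host="127.0.0.1", port=7687, user="admin", pwd="73@TuGraph", db_name="default"
)
# run() materializes all records into a list and raises on failure;
# run_stream() is a generator that yields records lazily, which suits large result sets.
for record in connector.run_stream("MATCH (n) RETURN n LIMIT 10"):
    print(record)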

@@ -20,19 +20,19 @@ from .embeddings import ( # noqa: F401
from .rerank import CrossEncoderRerankEmbeddings, OpenAPIRerankEmbeddings # noqa: F401
__ALL__ = [
"CrossEncoderRerankEmbeddings",
"DefaultEmbeddingFactory",
"EmbeddingFactory",
"Embeddings",
"HuggingFaceBgeEmbeddings",
"HuggingFaceEmbeddings",
"HuggingFaceInferenceAPIEmbeddings",
"HuggingFaceInstructEmbeddings",
"JinaEmbeddings",
"OpenAPIEmbeddings",
"OllamaEmbeddings",
"DefaultEmbeddingFactory",
"EmbeddingFactory",
"WrappedEmbeddingFactory",
"TongYiEmbeddings",
"CrossEncoderRerankEmbeddings",
"OpenAPIEmbeddings",
"OpenAPIRerankEmbeddings",
"QianFanEmbeddings",
"TongYiEmbeddings",
"WrappedEmbeddingFactory",
]


@@ -54,6 +54,10 @@ class IndexStoreBase(ABC):
"""Init index store."""
self._executor = executor or ThreadPoolExecutor()
@abstractmethod
def get_config(self) -> IndexStoreConfig:
"""Get the index store config."""
@abstractmethod
def load_document(self, chunks: List[Chunk]) -> List[str]:
"""Load document in index database.
@@ -104,6 +108,10 @@ class IndexStoreBase(ABC):
ids(str): The vector ids to delete, separated by comma.
"""
@abstractmethod
def truncate(self) -> List[str]:
"""Truncate data by name."""
@abstractmethod
def delete_vector_name(self, index_name: str):
"""Delete index by name.
@@ -188,7 +196,7 @@ class IndexStoreBase(ABC):
Return:
List[Chunk]: The similar documents.
"""
return self.similar_search_with_scores(text, topk, 1.0, filters)
return self.similar_search_with_scores(text, topk, 0.0, filters)
async def asimilar_search_with_scores(
self,


@@ -9,11 +9,27 @@ logger = logging.getLogger(__name__)
class TransformerBase:
"""Transformer base class."""
@abstractmethod
def truncate(self):
"""Truncate operation."""
@abstractmethod
def drop(self):
"""Clean operation."""
class EmbedderBase(TransformerBase, ABC):
"""Embedder base class."""
class SummarizerBase(TransformerBase, ABC):
"""Summarizer base class."""
@abstractmethod
async def summarize(self, **args) -> str:
"""Summarize result."""
class ExtractorBase(TransformerBase, ABC):
"""Extractor base class."""


@@ -0,0 +1,208 @@
"""CommunitySummarizer class."""
import logging
from dbgpt.core import LLMClient
from dbgpt.rag.transformer.llm_summarizer import LLMSummarizer
logger = logging.getLogger(__name__)
class CommunitySummarizer(LLMSummarizer):
"""CommunitySummarizer class."""
def __init__(self, llm_client: LLMClient, model_name: str):
"""Initialize the CommunitySummaryExtractor."""
super().__init__(llm_client, model_name, COMMUNITY_SUMMARY_PT_CN)
COMMUNITY_SUMMARY_PT_CN = (
"## 角色\n"
"你非常擅长知识图谱的信息总结,能根据给定的知识图谱中的实体和关系的名称以及描述"
"信息,全面、恰当地对知识图谱子图信息做出总结性描述,并且不会丢失关键的信息。\n"
"\n"
"## 技能\n"
"### 技能 1: 实体识别\n"
"- 准确地识别[Entities:]章节中的实体信息,包括实体名、实体描述信息。\n"
"- 实体信息的一般格式有:\n"
"(实体名)\n"
"(实体名:实体描述)\n"
"(实体名:实体属性表)\n"
"(文本块ID:文档块内容)\n"
"(目录ID:目录名)\n"
"(文档ID:文档名称)\n"
"\n"
"### 技能 2: 关系识别\n"
"- 准确地识别[Relationships:]章节中的关系信息,包括来源实体名、关系名、"
"目标实体名、关系描述信息实体名也可能是文档ID、目录ID、文本块ID。\n"
"- 关系信息的一般格式有:\n"
"(来源实体名)-[关系名]->(目标实体名)\n"
"(来源实体名)-[关系名:关系描述]->(目标实体名)\n"
"(来源实体名)-[关系名:关系属性表]->(目标实体名)\n"
"(文本块ID)-[包含]->(实体名)\n"
"(目录ID)-[包含]->(文本块实体)\n"
"(目录ID)-[包含]->(子目录ID)\n"
"(文档ID)-[包含]->(文本块实体)\n"
"(文档ID)-[包含]->(目录ID)\n"
"\n"
"### 技能 3: 图结构理解\n"
"--请按照如下步骤理解图结构--\n"
"1. 正确地将关系信息中的来源实体名与实体信息关联。\n"
"2. 正确地将关系信息中的目标实体名与实体信息关联。\n"
"3. 根据提供的关系信息还原出图结构。"
"\n"
"### 技能 4: 知识图谱总结\n"
"--请按照如下步骤总结知识图谱--\n"
"1. 确定知识图谱表达的主题或话题,突出关键实体和关系。"
"2. 使用准确、恰当、简洁的语言总结图结构表达的信息,不要生成与图结构中无关的信息。"
"\n"
"## 约束条件\n"
"- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。\n"
"- 确保以第三人称书写,从客观角度对知识图谱表达的信息进行总结性描述。\n"
"- 如果实体或关系的描述信息为空,对最终的总结信息没有贡献,不要生成无关信息。\n"
"- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
"- 避免使用停用词和过于常见的词汇。\n"
"\n"
"## 参考案例\n"
"--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
"输入:\n"
"```\n"
"Entities:\n"
"(菲尔・贾伯#菲尔兹咖啡创始人)\n"
"(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
"(雅各布・贾伯#菲尔・贾伯的儿子)\n"
"(美国多地#菲尔兹咖啡的扩展地区)\n"
"\n"
"Relationships:\n"
"(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
"(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
"(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子)\n"
"(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官)\n"
"(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
"```\n"
"\n"
"输出:\n"
"```\n"
"菲尔兹咖啡是由菲尔・贾伯在1978年于加利福尼亚州伯克利创立的咖啡品牌。"
"菲尔・贾伯的儿子雅各布・贾伯在2005年接任首席执行官领导公司扩展到了美国多地"
"进一步巩固了菲尔兹咖啡作为加利福尼亚州伯克利创立的咖啡品牌的市场地位。\n"
"```\n"
"\n"
"----\n"
"\n"
"请根据接下来[知识图谱]提供的信息,按照上述要求,总结知识图谱表达的信息。\n"
"\n"
"[知识图谱]:\n"
"{graph}\n"
"\n"
"[总结]:\n"
"\n"
)
COMMUNITY_SUMMARY_PT_EN = (
"## Role\n"
"You are highly skilled in summarizing information from knowledge graphs. "
"Based on the names and descriptions of entities and relationships in a "
"given knowledge graph, you can comprehensively and appropriately summarize"
" the information of the subgraph without losing critical details.\n"
"\n"
"## Skills\n"
"### Skill 1: Entity Recognition\n"
"- Accurately recognize entity information in the [Entities:] section, "
"including entity names and descriptions.\n"
"- The general formats for entity information are:\n"
"(entity_name)\n"
"(entity_name: entity_description)\n"
"(entity_name: entity_property_map)\n"
"(chunk_id: chunk_content)\n"
"(catalog_id: catalog_name)\n"
"(document_id: document_name)\n"
"\n"
"### Skill 2: Relationship Recognition\n"
"- Accurately recognize relationship information in the [Relationships:] "
"section, including source_entity_name, relationship_name, "
"target_entity_name, and relationship_description, The entity_name may "
"also be the document_id, catalog_id, or chunk_id.\n"
"- The general formats for relationship information are:\n"
"(source_entity_name)-[relationship_name]->(target_entity_name)\n"
"(source_entity_name)-[relationship_name: relationship_description]->"
"(target_entity_name)\n"
"(source_entity_name)-[relationship_name: relationship_property_map]->"
"(target_entity_name)\n"
"(chunk_id)-[Contains]->(entity_name)\n"
"(catalog_id)-[Contains]->(chunk_id)\n"
"(catalog_id)-[Contains]->(sub_catalog_id)\n"
"(document_id)-[Contains]->(chunk_id)\n"
"(document_id)-[Contains]->(catalog_id)\n"
"\n"
"### Skill 3: Graph Structure Understanding\n"
"--Follow these steps to understand the graph structure--\n"
"1. Correctly associate the source entity name in the "
"relationship information with the entity information.\n"
"2. Correctly associate the target entity name in the "
"relationship information with the entity information.\n"
"3. Reconstruct the graph structure based on the provided "
"relationship information."
"\n"
"### Skill 4: Knowledge Graph Summarization\n"
"--Follow these steps to summarize the knowledge graph--\n"
"1. Determine the theme or topic expressed by the knowledge graph, "
"highlighting key entities and relationships."
"2. Use accurate, appropriate, and concise language to summarize "
"the information expressed by the graph "
"without generating irrelevant information."
"\n"
"## Constraints\n"
"- Don't describe your thought process in the answer, provide the answer "
"to the user's question directly without generating irrelevant information."
"- Ensure the summary is written in the third person and objectively "
"reflects the information conveyed by the knowledge graph.\n"
"- If the descriptions of entities or relationships are empty and "
"contribute nothing to the final summary, "
"do not generate unrelated information.\n"
"- If the provided descriptions are contradictory, resolve the conflicts "
"and provide a single, coherent description.\n"
"- Avoid using stop words and overly common words.\n"
"\n"
"## Reference Example\n"
"--The case is only to help you understand the input and output format of "
"the prompt, please do not use it in your answer.--\n"
"Input:\n"
"```\n"
"Entities:\n"
"(Phil Jaber#Founder of Philz Coffee)\n"
"(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
"(Jacob Jaber#Son of Phil Jaber)\n"
"(Multiple locations in the USA#Expansion regions of Philz Coffee)\n"
"\n"
"Relationships:\n"
"(Phil Jaber#Created#Philz Coffee"
"#Founded in Berkeley, California in 1978)\n"
"(Philz Coffee#Located in#Berkeley, California"
"#Founding location of Philz Coffee)\n"
"(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber)\n"
"(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005)\n"
"(Philz Coffee#Expanded to#Multiple locations in the USA"
"#Expansion regions of Philz Coffee)\n"
"```\n"
"\n"
"Output:\n"
"```\n"
"Philz Coffee is a coffee brand founded by Phil Jaber in 1978 in "
"Berkeley, California. Phil Jaber's son, Jacob Jaber, took over as CEO in "
"2005, leading the company to expand to multiple locations in the USA, "
"further solidifying Philz Coffee's market position as a coffee brand "
"founded in Berkeley, California.\n"
"```\n"
"\n"
"----\n"
"\n"
"Please summarize the information expressed by the [KnowledgeGraph] "
"provided in the following section according to the above requirements.\n"
"\n"
"[KnowledgeGraph]:\n"
"{graph}\n"
"\n"
"[Summary]:\n"
"\n"
)

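A sketch of how the summarizer is intended to be driven; the module path, the LLM client and the model name are assumptions, and the graph argument is simply the formatted text of a community subgraph:

from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer  # path assumed

async def summarize_community(llm_client, community_subgraph):
    # The prompt template above exposes a single {graph} slot, so the subgraph is
    # rendered to text and passed as the `graph` keyword.
    summarizer = CommunitySummarizer(llm_client, model_name="chatglm3-6b")  # placeholder model
    return await summarizer.summarize(graph=community_subgraph.format())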

@@ -0,0 +1,304 @@
"""GraphExtractor class."""
import logging
import re
from typing import List, Optional
from dbgpt.core import Chunk, LLMClient
from dbgpt.rag.transformer.llm_extractor import LLMExtractor
from dbgpt.storage.graph_store.graph import Edge, Graph, MemoryGraph, Vertex
from dbgpt.storage.vector_store.base import VectorStoreBase
logger = logging.getLogger(__name__)
class GraphExtractor(LLMExtractor):
"""GraphExtractor class."""
def __init__(
self, llm_client: LLMClient, model_name: str, chunk_history: VectorStoreBase
):
"""Initialize the GraphExtractor."""
super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN)
self._chunk_history = chunk_history
config = self._chunk_history.get_config()
self._vector_space = config.name
self._max_chunks_once_load = config.max_chunks_once_load
self._max_threads = config.max_threads
self._topk = config.topk
self._score_threshold = config.score_threshold
async def extract(self, text: str, limit: Optional[int] = None) -> List:
"""Load similar chunks."""
# load similar chunks
chunks = await self._chunk_history.asimilar_search_with_scores(
text, self._topk, self._score_threshold
)
history = [
f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks)
]
context = "\n".join(history) if history else ""
try:
# extract with chunk history
return await super()._extract(text, context, limit)
finally:
# save chunk to history
await self._chunk_history.aload_document_with_limit(
[Chunk(content=text, metadata={"relevant_cnt": len(history)})],
self._max_chunks_once_load,
self._max_threads,
)
def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]:
graph = MemoryGraph()
edge_count = 0
current_section = None
for line in text.split("\n"):
line = line.strip()
if line in ["Entities:", "Relationships:"]:
current_section = line[:-1]
elif line and current_section:
if current_section == "Entities":
match = re.match(r"\((.*?)#(.*?)\)", line)
if match:
name, summary = [part.strip() for part in match.groups()]
graph.upsert_vertex(Vertex(name, description=summary))
elif current_section == "Relationships":
match = re.match(r"\((.*?)#(.*?)#(.*?)#(.*?)\)", line)
if match:
source, name, target, summary = [
part.strip() for part in match.groups()
]
edge_count += 1
graph.append_edge(
Edge(source, target, name, description=summary)
)
if limit and edge_count >= limit:
break
return [graph]
def truncate(self):
"""Truncate chunk history."""
self._chunk_history.truncate()
def drop(self):
"""Drop chunk history."""
self._chunk_history.delete_vector_name(self._vector_space)
GRAPH_EXTRACT_PT_CN = (
"## 角色\n"
"你是一个知识图谱工程专家,非常擅长从文本中精确抽取知识图谱的实体"
"(主体、客体)和关系,并能对实体和关系的含义做出恰当的总结性描述。\n"
"\n"
"## 技能\n"
"### 技能 1: 实体抽取\n"
"--请按照如下步骤抽取实体--\n"
"1. 准确地识别文本中的实体信息,一般是名词、代词等。\n"
"2. 准确地识别实体的修饰性描述,一般作为定语对实体特征做补充。\n"
"3. 对相同概念的实体(同义词、别称、代指),请合并为单一简洁的实体名,"
"并合并它们的描述信息。\n"
"4. 对合并后的实体描述信息做简洁、恰当、连贯的总结。\n"
"\n"
"### 技能 2: 关系抽取\n"
"--请按照如下步骤抽取关系--\n"
"1. 准确地识别文本中实体之间的关联信息,一般是动词、代词等。\n"
"2. 准确地识别关系的修饰性描述,一般作为状语对关系特征做补充。\n"
"3. 对相同概念的关系(同义词、别称、代指),请合并为单一简洁的关系名,"
"并合并它们的描述信息。\n"
"4. 对合并后的关系描述信息做简洁、恰当、连贯的总结。\n"
"\n"
"### 技能 3: 关联上下文\n"
"- 关联上下文来自与当前待抽取文本相关的前置段落内容,"
"可以为知识抽取提供信息补充。\n"
"- 合理利用提供的上下文信息,知识抽取过程中出现的内容引用可能来自关联上下文。\n"
"- 不要对关联上下文的内容做知识抽取,而仅作为关联信息参考。\n"
"- 关联上下文是可选信息,可能为空。\n"
"\n"
"## 约束条件\n"
"- 如果文本已提供了图结构格式的数据,直接转换为输出格式返回,"
"不要修改实体或ID名称。"
"- 尽可能多的生成文本中提及的实体和关系信息,但不要随意创造不存在的实体和关系。\n"
"- 确保以第三人称书写,从客观角度描述实体名称、关系名称,以及他们的总结性描述。\n"
"- 尽可能多地使用关联上下文中的信息丰富实体和关系的内容,这非常重要。\n"
"- 如果实体或关系的总结描述为空,不提供总结描述信息,不要生成无关的描述信息。\n"
"- 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
"- 实体和关系的名称或者描述文本出现#和:字符时使用_字符替换其他字符不要修改。"
"- 避免使用停用词和过于常见的词汇。\n"
"\n"
"## 输出格式\n"
"Entities:\n"
"(实体名#实体总结)\n"
"...\n\n"
"Relationships:\n"
"(来源实体名#关系名#目标实体名#关系总结)\n"
"...\n"
"\n"
"## 参考案例"
"--案例仅帮助你理解提示词的输入和输出格式,请不要在答案中使用它们。--\n"
"输入:\n"
"```\n"
"[上下文]:\n"
"Section 1:\n"
"菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
"Section 2:\n"
"菲尔・贾伯的小儿子叫比尔・贾伯。\n"
"..."
"\n"
"[文本]:\n"
"菲尔兹咖啡由菲尔・贾伯于1978年在加利福尼亚州伯克利创立。"
"因其独特的混合咖啡而闻名,菲尔兹已扩展到美国多地。"
"他的大儿子于2005年成为首席执行官并带领公司实现了显著增长。\n"
"```\n"
"\n"
"输出:\n"
"```\n"
"Entities:\n"
"(菲尔・贾伯#菲尔兹咖啡创始人)\n"
"(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
"(雅各布・贾伯#菲尔・贾伯的大儿子)\n"
"(美国多地#菲尔兹咖啡的扩展地区)\n"
"\n"
"Relationships:\n"
"(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
"(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
"(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的大儿子)\n"
"(雅各布・贾伯#管理#菲尔兹咖啡#在2005年担任首席执行官)\n"
"(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
"```\n"
"\n"
"----\n"
"\n"
"请根据接下来[上下文]提供的信息,按照上述要求,抽取[文本]中的实体和关系数据。\n"
"\n"
"[上下文]:\n"
"{history}\n"
"\n"
"[文本]:\n"
"{text}\n"
"\n"
"[结果]:\n"
"\n"
)
GRAPH_EXTRACT_PT_EN = (
"## Role\n"
"You are an expert in Knowledge Graph Engineering, skilled at extracting "
"entities (subjects, objects) and relations from text, and summarizing "
"their meanings effectively.\n"
"\n"
"## Skills\n"
"### Skill 1: Entity Extraction\n"
"--Please follow these steps to extract entities--\n"
"1. Accurately identify entity information in the text, "
"usually nouns, pronouns, etc.\n"
"2. Accurately identify descriptive information, "
"usually as adjectives, that supplements entity features.\n"
"3. Merge synonymous, alias, or reference entities into "
"a single concise entity name, and merge their descriptive information.\n"
"4. Provide a concise, appropriate, and coherent summary "
"of the combined entity descriptions.\n"
"\n"
"### Skill 2: Relation Extraction\n"
"--Please follow these steps to extract relations--\n"
"1. Accurately identify relation information between entities in the text, "
"usually verbs, pronouns, etc.\n"
"2. Accurately identify descriptive information, usually as adverbs, "
"that supplements relation features.\n"
"3. Merge synonymous, alias, or reference relations into "
"a single concise relation name, and merge their descriptive information.\n"
"4. Provide a concise, appropriate, and coherent summary "
"of the combined relation descriptions.\n"
"\n"
"### Skill 3: Contextual Association\n"
"- Context comes from preceding paragraphs related to the current "
"extraction text and can provide supplementary information.\n"
"- Appropriately use contextual information, content references "
"during extraction may come from this context.\n"
"- Do not extract knowledge from contextual content, "
"use it only as a reference.\n"
"- Context is optional and may be empty.\n"
"\n"
"## Constraints\n"
"- If the text has provided data that is similar to or the same as the "
"output format, please format the output directly according to the "
"output format requirements."
"- Generate as much entity and relation information mentioned in the text "
"as possible, but do not create nonexistent entities or relations.\n"
"- Ensure the writing is in the third person, describing entity names, "
"relation names, and their summaries objectively.\n"
"- Use as much contextual information as possible to enrich the content "
"of entities and relations, this is very important.\n"
"- If a summary of an entity or relation is empty, do not provide "
"summary information, and do not generate irrelevant descriptions.\n"
"- If provided descriptions are contradictory, resolve the conflict "
"and provide a single, coherent description.\n"
"- Replace any # or : characters in entity's and relation's "
"names or descriptions with an _ character.\n"
"- Avoid using stop words and overly common terms.\n"
"\n"
"## Output Format\n"
"Entities:\n"
"(entity_name#entity_summary)\n"
"...\n\n"
"Relationships:\n"
"(source_entity_name#relation_name#target_entity_name#relation_summary)\n"
"...\n"
"\n"
"## Reference Example\n"
"--The case is only to help you understand the input and output format of "
"the prompt, please do not use it in your answer.--\n"
"Input:\n"
"```\n"
"[Context]:\n"
"Section 1:\n"
"Phil Jabber's eldest son is named Jacob Jabber.\n"
"Section 2:\n"
"Phil Jabber's youngest son is named Bill Jabber.\n"
"..."
"\n"
"[Text]:\n"
"Philz Coffee was founded by Phil Jabber in 1978 in Berkeley, California. "
"Known for its distinctive blend coffee, Philz has expanded to multiple "
"locations in the USA. His eldest son became CEO in 2005, "
"leading significant growth for the company.\n"
"```\n"
"\n"
"Output:\n"
"```\n"
"Entities:\n"
"(Phil Jabber#Founder of Philz Coffee)\n"
"(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
"(Jacob Jabber#Phil Jabber's eldest son)\n"
"(Multiple locations in the USA#Philz Coffee's expansion area)\n"
"\n"
"Relationships:\n"
"(Phil Jabber#Founded#Philz Coffee"
"#Founded in 1978 in Berkeley, California)\n"
"(Philz Coffee#Located in#Berkeley, California"
"#Philz Coffee's founding location)\n"
"(Phil Jabber#Has#Jacob Jabber#Phil Jabber's eldest son)\n"
"(Jacob Jabber#Manage#Philz Coffee#Serve as CEO in 2005)\n"
"(Philz Coffee#Expanded to#Multiple locations in the USA"
"#Philz Coffee's expansion area)\n"
"```\n"
"\n"
"----\n"
"\n"
"Please extract the entities and relationships data from the [Text] "
"according to the above requirements, using the provided [Context].\n"
"\n"
"[Context]:\n"
"{history}\n"
"\n"
"[Text]:\n"
"{text}\n"
"\n"
"[Results]:\n"
"\n"
)

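A rough usage sketch; the module path and the vector store used as chunk history are assumptions, and error handling is omitted:

from dbgpt.rag.transformer.graph_extractor import GraphExtractor  # path assumed

async def extract_graph(llm_client, model_name, chunk_vector_store, text):
    # extract() first recalls similar chunks as context, then prompts the LLM to emit
    # "Entities:" / "Relationships:" blocks, which _parse_response turns into a MemoryGraph.
    extractor = GraphExtractor(llm_client, model_name, chunk_history=chunk_vector_store)
    graphs = await extractor.extract(text)
    for graph in graphs:
        print(graph.format())  # "Entities:\n...\n\nRelationships:\n..."
    return graphs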

@@ -19,9 +19,20 @@ class LLMExtractor(ExtractorBase, ABC):
self._prompt_template = prompt_template
async def extract(self, text: str, limit: Optional[int] = None) -> List:
"""Extract by LLm."""
"""Extract by LLM."""
return await self._extract(text, None, limit)
async def _extract(
self, text: str, history: str = None, limit: Optional[int] = None
) -> List:
"""Inner extract by LLM."""
template = HumanPromptTemplate.from_template(self._prompt_template)
messages = template.format_messages(text=text)
messages = (
template.format_messages(text=text, history=history)
if history is not None
else template.format_messages(text=text)
)
# use default model if needed
if not self._model_name:
@@ -45,6 +56,12 @@ class LLMExtractor(ExtractorBase, ABC):
ValueError("optional argument limit >= 1")
return self._parse_response(response.text, limit)
def truncate(self):
"""Do nothing by default."""
def drop(self):
"""Do nothing by default."""
@abstractmethod
def _parse_response(self, text: str, limit: Optional[int] = None) -> List:
"""Parse llm response."""


@@ -0,0 +1,48 @@
"""LLMSummarizer class."""
import logging
from abc import ABC
from dbgpt.core import HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest
from dbgpt.rag.transformer.base import SummarizerBase
logger = logging.getLogger(__name__)
class LLMSummarizer(SummarizerBase, ABC):
"""LLMSummarizer class."""
def __init__(self, llm_client: LLMClient, model_name: str, prompt_template: str):
"""Initialize the LLMSummarizer."""
self._llm_client = llm_client
self._model_name = model_name
self._prompt_template = prompt_template
async def summarize(self, **args) -> str:
"""Summarize by LLM."""
template = HumanPromptTemplate.from_template(self._prompt_template)
messages = template.format_messages(**args)
# use default model if needed
if not self._model_name:
models = await self._llm_client.models()
if not models:
raise Exception("No models available")
self._model_name = models[0].model
logger.info(f"Using model {self._model_name} to extract")
model_messages = ModelMessage.from_base_messages(messages)
request = ModelRequest(model=self._model_name, messages=model_messages)
response = await self._llm_client.generate(request=request)
if not response.success:
code = str(response.error_code)
reason = response.text
logger.error(f"request llm failed ({code}) {reason}")
return response.text
def truncate(self):
"""Do nothing by default."""
def drop(self):
"""Do nothing by default."""


@@ -6,6 +6,7 @@ import os
from collections import defaultdict
from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Type, cast
from dbgpt.app.component_configs import CFG
from dbgpt.core import Chunk, Embeddings
from dbgpt.core.awel.flow import (
FunctionDynamicOptions,
@@ -95,6 +96,7 @@ class VectorStoreConnector:
self._index_store_config = vector_store_config
self._register()
vector_store_type = self.__rewrite_index_store_type(vector_store_type)
if self._match(vector_store_type):
self.connector_class, self.config_class = connector[vector_store_type]
else:
@@ -124,6 +126,13 @@ class VectorStoreConnector:
logger.error("connect vector store failed: %s", e)
raise e
def __rewrite_index_store_type(self, index_store_type):
# Rewrite Knowledge Graph Type
if CFG.GRAPH_COMMUNITY_SUMMARY_ENABLED:
if index_store_type == "KnowledgeGraph":
return "CommunitySummaryKnowledgeGraph"
return index_store_type
@classmethod
def from_default(
cls,
@@ -270,6 +279,10 @@ class VectorStoreConnector:
"""
return self.client.delete_by_ids(ids=ids)
def truncate(self):
"""Truncate data."""
return self.client.truncate()
@property
def current_embeddings(self) -> Optional[Embeddings]:
"""Return the current embeddings."""

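The practical effect of the rewrite, sketched with placeholder objects (the module path and the config instance are assumptions, not part of this commit):

from dbgpt.serve.rag.connector import VectorStoreConnector  # path assumed

def build_graph_connector(knowledge_graph_config):
    # With GRAPH_COMMUNITY_SUMMARY_ENABLED=True in the environment, the "KnowledgeGraph"
    # type is rewritten to "CommunitySummaryKnowledgeGraph" before the client is created.
    connector = VectorStoreConnector(
        vector_store_type="KnowledgeGraph",
        vector_store_config=knowledge_graph_config,
    )
    connector.truncate()  # new pass-through that clears the underlying index store
    return connector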

@@ -5,7 +5,7 @@ from concurrent.futures import Executor, ThreadPoolExecutor
from typing import List, Optional
from dbgpt.core import Chunk
from dbgpt.rag.index.base import logger
from dbgpt.rag.index.base import IndexStoreConfig, logger
from dbgpt.storage.full_text.base import FullTextStoreBase
from dbgpt.storage.vector_store.elastic_store import ElasticsearchVectorConfig
from dbgpt.storage.vector_store.filters import MetadataFilters
@@ -35,6 +35,7 @@ class ElasticDocumentStore(FullTextStoreBase):
This similarity has the following options:
"""
super().__init__()
self._es_config = es_config
from elasticsearch import Elasticsearch
self._es_config = es_config
@@ -94,6 +95,10 @@ class ElasticDocumentStore(FullTextStoreBase):
)
self._executor = executor or ThreadPoolExecutor()
def get_config(self) -> IndexStoreConfig:
"""Get the es store config."""
return self._es_config
def load_document(self, chunks: List[Chunk]) -> List[str]:
"""Load document in elasticsearch.


@@ -2,11 +2,11 @@
from typing import List, Optional
from dbgpt.core import Chunk
from dbgpt.rag.index.base import IndexStoreBase
from dbgpt.storage.full_text.base import FullTextStoreBase
from dbgpt.storage.vector_store.filters import MetadataFilters
class OpenSearch(IndexStoreBase):
class OpenSearch(FullTextStoreBase):
"""OpenSearch index store."""
def load_document(self, chunks: List[Chunk]) -> List[str]:


@@ -1,7 +1,7 @@
"""Graph store base class."""
import logging
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple
from typing import Generator, List, Optional, Tuple
from dbgpt._private.pydantic import BaseModel, ConfigDict, Field
from dbgpt.core import Embeddings
@@ -23,15 +23,35 @@ class GraphStoreConfig(BaseModel):
default=None,
description="The embedding function of graph store, optional.",
)
summary_enabled: bool = Field(
default=False,
description="Enable graph community summary or not.",
)
class GraphStoreBase(ABC):
"""Graph store base class."""
@abstractmethod
def get_config(self) -> GraphStoreConfig:
"""Get the graph store config."""
@abstractmethod
def get_vertex_type(self) -> str:
"""Get the vertex type."""
@abstractmethod
def get_edge_type(self) -> str:
"""Get the edge type."""
@abstractmethod
def insert_triplet(self, sub: str, rel: str, obj: str):
"""Add triplet."""
@abstractmethod
def insert_graph(self, graph: Graph):
"""Add graph."""
@abstractmethod
def get_triplets(self, sub: str) -> List[Tuple[str, str]]:
"""Get triplets."""
@@ -40,6 +60,10 @@ class GraphStoreBase(ABC):
def delete_triplet(self, sub: str, rel: str, obj: str):
"""Delete triplet."""
@abstractmethod
def truncate(self):
"""Truncate Graph."""
@abstractmethod
def drop(self):
"""Drop graph."""
@@ -66,3 +90,11 @@ class GraphStoreBase(ABC):
@abstractmethod
def query(self, query: str, **args) -> Graph:
"""Execute a query."""
def aquery(self, query: str, **args) -> Graph:
"""Async execute a query."""
return self.query(query, **args)
@abstractmethod
def stream_query(self, query: str) -> Generator[Graph, None, None]:
"""Execute stream query."""


@@ -1,4 +1,4 @@
"""Connector for vector store."""
"""Graph store factory."""
import logging
from typing import Tuple, Type


@@ -1,4 +1,4 @@
"""Graph store base class."""
"""Graph definition."""
import itertools
import json
import logging
@@ -24,9 +24,15 @@ class Direction(Enum):
class Elem(ABC):
"""Elem class."""
def __init__(self):
def __init__(self, name: Optional[str] = None):
"""Initialize Elem."""
self._props = {}
self._name = name
self._props: Dict[str, Any] = {}
@property
def name(self) -> str:
"""Return the edge label."""
return self._name or ""
@property
def props(self) -> Dict[str, Any]:
@@ -46,14 +52,17 @@ class Elem(ABC):
self._props.pop(key, None)
def has_props(self, **props):
"""Check if the element has the specified properties with the given values."""
"""Check all key-value pairs exist."""
return all(self._props.get(k) == v for k, v in props.items())
@abstractmethod
def format(self, label_key: Optional[str] = None):
def format(self) -> str:
"""Format properties into a string."""
if len(self._props) == 1:
return str(next(iter(self._props.values())))
formatted_props = [
f"{k}:{json.dumps(v)}" for k, v in self._props.items() if k != label_key
f"{k}:{json.dumps(v, ensure_ascii=False)}" for k, v in self._props.items()
]
return f"{{{';'.join(formatted_props)}}}"
@@ -61,9 +70,9 @@ class Elem(ABC):
class Vertex(Elem):
"""Vertex class."""
def __init__(self, vid: str, **props):
def __init__(self, vid: str, name: Optional[str] = None, **props):
"""Initialize Vertex."""
super().__init__()
super().__init__(name)
self._vid = vid
for k, v in props.items():
self.set_prop(k, v)
@@ -73,26 +82,43 @@ class Vertex(Elem):
"""Return the vertex ID."""
return self._vid
def format(self, label_key: Optional[str] = None):
"""Format vertex properties into a string."""
label = self.get_prop(label_key) if label_key else self._vid
props_str = super().format(label_key)
if props_str == "{}":
return f"({label})"
@property
def name(self) -> str:
"""Return the vertex name."""
return super().name or self._vid
def format(self, concise: bool = False):
"""Format vertex into a string."""
name = self._name or self._vid
if concise:
return f"({name})"
if self._props:
return f"({name}:{super().format()})"
else:
return f"({label}:{props_str})"
return f"({name})"
def __str__(self):
"""Return the vertex ID as its string representation."""
return f"({self._vid})"
class IdVertex(Vertex):
"""IdVertex class."""
def __init__(self, vid: str):
"""Initialize Idvertex."""
super().__init__(vid)
class Edge(Elem):
"""Edge class."""
def __init__(self, sid: str, tid: str, **props):
def __init__(self, sid: str, tid: str, name: str, **props):
"""Initialize Edge."""
super().__init__()
assert name, "Edge name is required"
super().__init__(name)
self._sid = sid
self._tid = tid
for k, v in props.items():
@@ -117,23 +143,20 @@ class Edge(Elem):
else:
raise ValueError(f"Get nid of {vid} on {self} failed")
def format(self, label_key: Optional[str] = None):
def format(self):
"""Format the edge properties into a string."""
label = self.get_prop(label_key) if label_key else ""
props_str = super().format(label_key)
if props_str == "{}":
return f"-[{label}]->" if label else "->"
if self._props:
return f"-[{self._name}:{super().format()}]->"
else:
return f"-[{label}:{props_str}]->" if label else f"-[{props_str}]->"
return f"-[{self._name}]->"
def triplet(self, label_key: str) -> Tuple[str, str, str]:
def triplet(self) -> Tuple[str, str, str]:
"""Return a triplet."""
assert label_key, "label key is needed"
return self._sid, str(self.get_prop(label_key)), self._tid
return self.sid, self.name, self.tid
def __str__(self):
"""Return the edge '(sid)->(tid)'."""
return f"({self._sid})->({self._tid})"
return f"({self._sid})-[{self._name}]->({self._tid})"
class Graph(ABC):
@@ -177,8 +200,8 @@ class Graph(ABC):
"""Delete vertices and their neighbor edges."""
@abstractmethod
def del_edges(self, sid: str, tid: str, **props):
"""Delete edges(sid -> tid) matches props."""
def del_edges(self, sid: str, tid: str, name: str, **props):
"""Delete edges(sid -[name]-> tid) matches props."""
@abstractmethod
def del_neighbor_edges(self, vid: str, direction: Direction = Direction.OUT):
@@ -203,19 +226,19 @@ class Graph(ABC):
def format(self) -> str:
"""Format graph data to string."""
@abstractmethod
def truncate(self):
"""Truncate graph."""
class MemoryGraph(Graph):
"""Graph class."""
def __init__(self, vertex_label: Optional[str] = None, edge_label: str = "label"):
def __init__(self):
"""Initialize MemoryGraph with vertex label and edge label."""
assert edge_label, "Edge label is needed"
# metadata
self._vertex_label = vertex_label
self._edge_label = edge_label
self._vertex_prop_keys = {vertex_label} if vertex_label else set()
self._edge_prop_keys = {edge_label}
self._vertex_prop_keys = set()
self._edge_prop_keys = set()
self._edge_count = 0
# init vertices, out edges, in edges index
@@ -223,26 +246,6 @@ class MemoryGraph(Graph):
self._oes: Any = defaultdict(lambda: defaultdict(set))
self._ies: Any = defaultdict(lambda: defaultdict(set))
@property
def vertex_label(self):
"""Return the label for vertices."""
return self._vertex_label
@property
def edge_label(self):
"""Return the label for edges."""
return self._edge_label
@property
def vertex_prop_keys(self):
"""Return a set of property keys for vertices."""
return self._vertex_prop_keys
@property
def edge_prop_keys(self):
"""Return a set of property keys for edges."""
return self._edge_prop_keys
@property
def vertex_count(self):
"""Return the number of vertices in the graph."""
@@ -256,7 +259,10 @@ class MemoryGraph(Graph):
def upsert_vertex(self, vertex: Vertex):
"""Insert or update a vertex based on its ID."""
if vertex.vid in self._vs:
self._vs[vertex.vid].props.update(vertex.props)
if isinstance(self._vs[vertex.vid], IdVertex):
self._vs[vertex.vid] = vertex
else:
self._vs[vertex.vid].props.update(vertex.props)
else:
self._vs[vertex.vid] = vertex
@@ -265,9 +271,6 @@ class MemoryGraph(Graph):
def append_edge(self, edge: Edge):
"""Append an edge if it doesn't exist; requires edge label."""
if self.edge_label not in edge.props.keys():
raise ValueError(f"Edge prop '{self.edge_label}' is needed")
sid = edge.sid
tid = edge.tid
@@ -275,8 +278,8 @@ class MemoryGraph(Graph):
return False
# init vertex index
self._vs.setdefault(sid, Vertex(sid))
self._vs.setdefault(tid, Vertex(tid))
self._vs.setdefault(sid, IdVertex(sid))
self._vs.setdefault(tid, IdVertex(tid))
# update edge index
self._oes[sid][tid].add(edge)
@@ -346,18 +349,19 @@ class MemoryGraph(Graph):
self.del_neighbor_edges(vid, Direction.BOTH)
self._vs.pop(vid, None)
def del_edges(self, sid: str, tid: str, **props):
def del_edges(self, sid: str, tid: str, name: str, **props):
"""Delete edges."""
old_edge_cnt = len(self._oes[sid][tid])
if not props:
self._edge_count -= old_edge_cnt
self._oes[sid].pop(tid, None)
self._ies[tid].pop(sid, None)
return
def remove_matches(es):
return set(filter(lambda e: not e.has_props(**props), es))
return set(
filter(
lambda e: not (
(name == e.name if name else True) and e.has_props(**props)
),
es,
)
)
self._oes[sid][tid] = remove_matches(self._oes[sid][tid])
self._ies[tid][sid] = remove_matches(self._ies[tid][sid])
@@ -439,12 +443,10 @@ class MemoryGraph(Graph):
"schema": [
{
"type": "VERTEX",
"label": f"{self._vertex_label}",
"properties": [{"name": k} for k in self._vertex_prop_keys],
},
{
"type": "EDGE",
"label": f"{self._edge_label}",
"properties": [{"name": k} for k in self._edge_prop_keys],
},
]
@@ -452,14 +454,30 @@ class MemoryGraph(Graph):
def format(self) -> str:
"""Format graph to string."""
vs_str = "\n".join(v.format(self.vertex_label) for v in self.vertices())
vs_str = "\n".join(v.format() for v in self.vertices())
es_str = "\n".join(
f"{self.get_vertex(e.sid).format(self.vertex_label)}"
f"{e.format(self.edge_label)}"
f"{self.get_vertex(e.tid).format(self.vertex_label)}"
f"{self.get_vertex(e.sid).format(concise=True)}"
f"{e.format()}"
f"{self.get_vertex(e.tid).format(concise=True)}"
for e in self.edges()
)
return f"Vertices:\n{vs_str}\n\nEdges:\n{es_str}"
return (
f"Entities:\n{vs_str}\n\n" f"Relationships:\n{es_str}"
if (vs_str or es_str)
else ""
)
def truncate(self):
"""Truncate graph."""
# clean metadata
self._vertex_prop_keys.clear()
self._edge_prop_keys.clear()
self._edge_count = 0
# clean data and index
self._vs.clear()
self._oes.clear()
self._ies.clear()
def graphviz(self, name="g"):
"""View graphviz graph: https://dreampuf.github.io/GraphvizOnline."""
@@ -468,7 +486,7 @@ class MemoryGraph(Graph):
g.add_node(vertex.vid)
for edge in self.edges():
triplet = edge.triplet(self.edge_label)
triplet = edge.triplet()
g.add_edge(triplet[0], triplet[2], label=triplet[1])
digraph = nx.nx_agraph.to_agraph(g).to_string()

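The reworked Vertex/Edge API in brief, using only what this file defines (entity names are illustrative):

from dbgpt.storage.graph_store.graph import Edge, MemoryGraph, Vertex

g = MemoryGraph()
g.upsert_vertex(Vertex("Phil Jaber", description="Founder of Philz Coffee"))
g.upsert_vertex(Vertex("Philz Coffee", description="Coffee brand founded in Berkeley"))
# Edges now carry an explicit name instead of a "label" property.
g.append_edge(Edge("Phil Jaber", "Philz Coffee", "Founded", description="Founded in 1978"))

for edge in g.edges():
    print(edge.triplet())  # ('Phil Jaber', 'Founded', 'Philz Coffee')
print(g.format())          # "Entities:\n...\n\nRelationships:\n..."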

@@ -1,9 +1,9 @@
"""Graph store base class."""
"""Memory graph store."""
import json
import logging
from typing import List, Optional, Tuple
from typing import Generator, List, Optional, Tuple
from dbgpt._private.pydantic import ConfigDict, Field
from dbgpt._private.pydantic import ConfigDict
from dbgpt.storage.graph_store.base import GraphStoreBase, GraphStoreConfig
from dbgpt.storage.graph_store.graph import Direction, Edge, Graph, MemoryGraph
@@ -15,32 +15,51 @@ class MemoryGraphStoreConfig(GraphStoreConfig):
model_config = ConfigDict(arbitrary_types_allowed=True)
edge_name_key: str = Field(
default="label",
description="The label of edge name, `label` by default.",
)
class MemoryGraphStore(GraphStoreBase):
"""Memory graph store."""
def __init__(self, graph_store_config: MemoryGraphStoreConfig):
"""Initialize MemoryGraphStore with a memory graph."""
self._edge_name_key = graph_store_config.edge_name_key
self._graph = MemoryGraph(edge_label=self._edge_name_key)
self._graph_store_config = graph_store_config
self._graph = MemoryGraph()
def get_config(self):
"""Get the graph store config."""
return self._graph_store_config
def get_edge_type(self) -> str:
"""Get the edge type."""
raise NotImplementedError("Memory graph store does not have edge type")
def get_vertex_type(self) -> str:
"""Get the vertex type."""
raise NotImplementedError("Memory graph store does not have vertex type")
def insert_triplet(self, sub: str, rel: str, obj: str):
"""Insert a triplet into the graph."""
self._graph.append_edge(Edge(sub, obj, **{self._edge_name_key: rel}))
self._graph.append_edge(Edge(sub, obj, rel))
def insert_graph(self, graph: Graph):
"""Add graph."""
for vertex in graph.vertices():
self._graph.upsert_vertex(vertex)
for edge in graph.edges():
self._graph.append_edge(edge)
def get_triplets(self, sub: str) -> List[Tuple[str, str]]:
"""Retrieve triplets originating from a subject."""
subgraph = self.explore([sub], direct=Direction.OUT, depth=1)
return [(e.get_prop(self._edge_name_key), e.tid) for e in subgraph.edges()]
return [(e.name, e.tid) for e in subgraph.edges()]
def delete_triplet(self, sub: str, rel: str, obj: str):
"""Delete a specific triplet from the graph."""
self._graph.del_edges(sub, obj, **{self._edge_name_key: rel})
self._graph.del_edges(sub, obj, rel)
def truncate(self):
"""Truncate graph."""
self._graph.truncate()
def drop(self):
"""Drop graph."""
@@ -50,7 +69,7 @@ class MemoryGraphStore(GraphStoreBase):
"""Return the graph schema as a JSON string."""
return json.dumps(self._graph.schema())
def get_full_graph(self, limit: Optional[int] = None) -> MemoryGraph:
def get_full_graph(self, limit: Optional[int] = None) -> Graph:
"""Return self."""
if not limit:
return self._graph
@@ -79,3 +98,7 @@ class MemoryGraphStore(GraphStoreBase):
def query(self, query: str, **args) -> Graph:
"""Execute a query on graph."""
raise NotImplementedError("Query memory graph not allowed")
def stream_query(self, query: str) -> Generator[Graph, None, None]:
"""Execute stream query."""
raise NotImplementedError("Stream query memory graph not allowed")

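A quick smoke test of the in-memory store under the new edge-name semantics (the module path and the no-argument config construction are assumptions):

from dbgpt.storage.graph_store.memgraph_store import (  # path assumed
    MemoryGraphStore,
    MemoryGraphStoreConfig,
)

store = MemoryGraphStore(MemoryGraphStoreConfig())
store.insert_triplet("Phil Jaber", "Founded", "Philz Coffee")
print(store.get_triplets("Phil Jaber"))  # [('Founded', 'Philz Coffee')]
store.delete_triplet("Phil Jaber", "Founded", "Philz Coffee")
store.truncate()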

@@ -1,10 +1,8 @@
"""Neo4j vector store."""
"""Neo4j store."""
import logging
from typing import List, Optional, Tuple
from dbgpt._private.pydantic import ConfigDict
from dbgpt.storage.graph_store.base import GraphStoreBase, GraphStoreConfig
from dbgpt.storage.graph_store.graph import Direction, Graph, MemoryGraph
logger = logging.getLogger(__name__)
@@ -19,46 +17,3 @@ class Neo4jStore(GraphStoreBase):
"""Neo4j graph store."""
# todo: add neo4j implementation
def __init__(self, graph_store_config: Neo4jStoreConfig):
"""Initialize the Neo4jStore with connection details."""
pass
def insert_triplet(self, sub: str, rel: str, obj: str):
"""Insert triplets."""
pass
def get_triplets(self, sub: str) -> List[Tuple[str, str]]:
"""Get triplets."""
return []
def delete_triplet(self, sub: str, rel: str, obj: str):
"""Delete triplets."""
pass
def drop(self):
"""Drop graph."""
pass
def get_schema(self, refresh: bool = False) -> str:
"""Get schema."""
return ""
def get_full_graph(self, limit: Optional[int] = None) -> Graph:
"""Get full graph."""
return MemoryGraph()
def explore(
self,
subs: List[str],
direct: Direction = Direction.BOTH,
depth: Optional[int] = None,
fan: Optional[int] = None,
limit: Optional[int] = None,
) -> Graph:
"""Explore the graph from given subjects up to a depth."""
return MemoryGraph()
def query(self, query: str, **args) -> Graph:
"""Execute a query on graph."""
return MemoryGraph()


@@ -1,12 +1,14 @@
"""TuGraph vector store."""
"""TuGraph store."""
import base64
import json
import logging
import os
from typing import List, Optional, Tuple
from typing import Any, Generator, Iterator, List, Optional, Tuple
from dbgpt._private.pydantic import ConfigDict, Field
from dbgpt.datasource.conn_tugraph import TuGraphConnector
from dbgpt.storage.graph_store.base import GraphStoreBase, GraphStoreConfig
from dbgpt.storage.graph_store.graph import Direction, Edge, MemoryGraph, Vertex
from dbgpt.storage.graph_store.graph import Direction, Edge, Graph, MemoryGraph, Vertex
logger = logging.getLogger(__name__)
@@ -29,20 +31,24 @@ class TuGraphStoreConfig(GraphStoreConfig):
description="login username",
)
password: str = Field(
default="123456",
default="73@TuGraph",
description="login password",
)
vertex_type: str = Field(
default="entity",
description="The type of graph vertex, `entity` by default.",
description="The type of vertex, `entity` by default.",
)
edge_type: str = Field(
default="relation",
description="The type of graph edge, `relation` by default.",
description="The type of edge, `relation` by default.",
)
edge_name_key: str = Field(
default="label",
description="The label of edge name, `label` by default.",
plugin_names: List[str] = Field(
default=["leiden"],
description=(
"Plugins need to be loaded when initialize TuGraph, "
"code: https://github.com/TuGraph-family"
"/dbgpt-tugraph-plugins/tree/master/cpp"
),
)
@@ -51,20 +57,23 @@ class TuGraphStore(GraphStoreBase):
def __init__(self, config: TuGraphStoreConfig) -> None:
"""Initialize the TuGraphStore with connection details."""
self._host = os.getenv("TUGRAPH_HOST", "127.0.0.1") or config.host
self._port = int(os.getenv("TUGRAPH_PORT", 7687)) or config.port
self._username = os.getenv("TUGRAPH_USERNAME", "admin") or config.username
self._password = os.getenv("TUGRAPH_PASSWORD", "73@TuGraph") or config.password
self._node_label = (
os.getenv("TUGRAPH_VERTEX_TYPE", "entity") or config.vertex_type
self._config = config
self._host = os.getenv("TUGRAPH_HOST", config.host)
self._port = int(os.getenv("TUGRAPH_PORT", config.port))
self._username = os.getenv("TUGRAPH_USERNAME", config.username)
self._password = os.getenv("TUGRAPH_PASSWORD", config.password)
self._summary_enabled = (
os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
or config.summary_enabled
)
self._edge_label = (
os.getenv("TUGRAPH_EDGE_TYPE", "relation") or config.edge_type
)
self.edge_name_key = (
os.getenv("TUGRAPH_EDGE_NAME_KEY", "label") or config.edge_name_key
self._plugin_names = (
os.getenv("TUGRAPH_PLUGIN_NAMES", "leiden").split(",")
or config.plugin_names
)
self._graph_name = config.name
self._vertex_type = os.getenv("TUGRAPH_VERTEX_TYPE", config.vertex_type)
self._edge_type = os.getenv("TUGRAPH_EDGE_TYPE", config.edge_type)
self.conn = TuGraphConnector.from_uri_db(
host=self._host,
port=self._port,
@@ -72,35 +81,197 @@ class TuGraphStore(GraphStoreBase):
pwd=self._password,
db_name=config.name,
)
self.conn.create_graph(graph_name=config.name)
self._create_graph(config.name)
def get_vertex_type(self) -> str:
"""Get the vertex type."""
return self._vertex_type
def get_edge_type(self) -> str:
"""Get the edge type."""
return self._edge_type
def _create_graph(self, graph_name: str):
self.conn.create_graph(graph_name=graph_name)
self._create_schema()
if self._summary_enabled:
self._upload_plugin()
def _check_label(self, elem_type: str):
result = self.conn.get_table_names()
if elem_type == "vertex":
return self._node_label in result["vertex_tables"]
return self._vertex_type in result["vertex_tables"]
if elem_type == "edge":
return self._edge_label in result["edge_tables"]
return self._edge_type in result["edge_tables"]
def _add_vertex_index(self, field_name):
gql = f"CALL db.addIndex('{self._vertex_type}', '{field_name}', false)"
self.conn.run(gql)
def _upload_plugin(self):
gql = "CALL db.plugin.listPlugin('CPP','v1')"
result = self.conn.run(gql)
result_names = [
json.loads(record["plugin_description"])["name"] for record in result
]
missing_plugins = [
name for name in self._plugin_names if name not in result_names
]
if len(missing_plugins):
for name in missing_plugins:
try:
from dbgpt_tugraph_plugins import ( # type: ignore # noqa
get_plugin_binary_path,
)
except ImportError:
logger.error(
"dbgpt-tugraph-plugins is not installed, "
"pip install dbgpt-tugraph-plugins==0.1.0rc1 -U -i "
"https://pypi.org/simple"
)
plugin_path = get_plugin_binary_path("leiden")
with open(plugin_path, "rb") as f:
content = f.read()
content = base64.b64encode(content).decode()
gql = (
f"CALL db.plugin.loadPlugin('CPP', '{name}', '{content}', "
"'SO', '{name} Plugin', false, 'v1')"
)
self.conn.run(gql)
def _create_schema(self):
if not self._check_label("vertex"):
create_vertex_gql = (
f"CALL db.createLabel("
f"'vertex', '{self._node_label}', "
f"'id', ['id',string,false])"
)
self.conn.run(create_vertex_gql)
if self._summary_enabled:
create_vertex_gql = (
f"CALL db.createLabel("
f"'vertex', '{self._vertex_type}', "
f"'id', ['id',string,false],"
f"['name',string,false],"
f"['_document_id',string,true],"
f"['_chunk_id',string,true],"
f"['_community_id',string,true],"
f"['description',string,true])"
)
self.conn.run(create_vertex_gql)
self._add_vertex_index("_community_id")
else:
create_vertex_gql = (
f"CALL db.createLabel("
f"'vertex', '{self._vertex_type}', "
f"'id', ['id',string,false],"
f"['name',string,false])"
)
self.conn.run(create_vertex_gql)
if not self._check_label("edge"):
create_edge_gql = f"""CALL db.createLabel(
'edge', '{self._edge_label}', '[["{self._node_label}",
"{self._node_label}"]]', ["id",STRING,false])"""
'edge', '{self._edge_type}',
'[["{self._vertex_type}",
"{self._vertex_type}"]]',
["id",STRING,false],
["name",STRING,false])"""
if self._summary_enabled:
create_edge_gql = f"""CALL db.createLabel(
'edge', '{self._edge_type}',
'[["{self._vertex_type}",
"{self._vertex_type}"]]',
["id",STRING,false],
["name",STRING,false],
["description",STRING,true])"""
self.conn.run(create_edge_gql)
def _format_query_data(self, data, white_prop_list: List[str]):
nodes_list = []
rels_list: List[Any] = []
_white_list = white_prop_list
from neo4j import graph
def get_filtered_properties(properties, white_list):
return {
key: value
for key, value in properties.items()
if (not key.startswith("_") and key not in ["id", "name"])
or key in white_list
}
def process_node(node: graph.Node):
node_id = node._properties.get("id")
node_name = node._properties.get("name")
node_properties = get_filtered_properties(node._properties, _white_list)
nodes_list.append(
{"id": node_id, "name": node_name, "properties": node_properties}
)
def process_relationship(rel: graph.Relationship):
name = rel._properties.get("name", "")
rel_nodes = rel.nodes
src_id = rel_nodes[0]._properties.get("id")
dst_id = rel_nodes[1]._properties.get("id")
for node in rel_nodes:
process_node(node)
edge_properties = get_filtered_properties(rel._properties, _white_list)
if not any(
existing_edge.get("name") == name
and existing_edge.get("src_id") == src_id
and existing_edge.get("dst_id") == dst_id
for existing_edge in rels_list
):
rels_list.append(
{
"src_id": src_id,
"dst_id": dst_id,
"name": name,
"properties": edge_properties,
}
)
def process_path(path: graph.Path):
for rel in path.relationships:
process_relationship(rel)
def process_other(value):
if not any(
existing_node.get("id") == "json_node" for existing_node in nodes_list
):
nodes_list.append(
{
"id": "json_node",
"name": "json_node",
"properties": {"description": value},
}
)
for record in data:
for key in record.keys():
value = record[key]
if isinstance(value, graph.Node):
process_node(value)
elif isinstance(value, graph.Relationship):
process_relationship(value)
elif isinstance(value, graph.Path):
process_path(value)
else:
process_other(value)
nodes = [
Vertex(node["id"], node["name"], **node["properties"])
for node in nodes_list
]
rels = [
Edge(edge["src_id"], edge["dst_id"], edge["name"], **edge["properties"])
for edge in rels_list
]
return {"nodes": nodes, "edges": rels}
def get_config(self):
"""Get the graph store config."""
return self._config
def get_triplets(self, subj: str) -> List[Tuple[str, str]]:
"""Get triplets."""
query = (
f"MATCH (n1:{self._node_label})-[r]->(n2:{self._node_label}) "
f"MATCH (n1:{self._vertex_type})-[r]->(n2:{self._vertex_type}) "
f'WHERE n1.id = "{subj}" RETURN r.id as rel, n2.id as obj;'
)
data = self.conn.run(query)
@@ -117,16 +288,83 @@ class TuGraphStore(GraphStoreBase):
rel_escaped = escape_quotes(rel)
obj_escaped = escape_quotes(obj)
subj_query = f"MERGE (n1:{self._node_label} {{id:'{subj_escaped}'}})"
obj_query = f"MERGE (n1:{self._node_label} {{id:'{obj_escaped}'}})"
rel_query = (
f"MERGE (n1:{self._node_label} {{id:'{subj_escaped}'}})"
f"-[r:{self._edge_label} {{id:'{rel_escaped}'}}]->"
f"(n2:{self._node_label} {{id:'{obj_escaped}'}})"
node_query = f"""CALL db.upsertVertex(
'{self._vertex_type}',
[{{id:'{subj_escaped}',name:'{subj_escaped}'}},
{{id:'{obj_escaped}',name:'{obj_escaped}'}}])"""
edge_query = f"""CALL db.upsertEdge(
'{self._edge_type}',
{{type:"{self._vertex_type}",key:"sid"}},
{{type:"{self._vertex_type}", key:"tid"}},
[{{sid:"{subj_escaped}",
tid: "{obj_escaped}",
id:"{rel_escaped}",
name: "{rel_escaped}"}}])"""
self.conn.run(query=node_query)
self.conn.run(query=edge_query)
def insert_graph(self, graph: Graph) -> None:
"""Add graph."""
def escape_quotes(value: str) -> str:
"""Escape single and double quotes in a string for queries."""
if value is not None:
return value.replace("'", "").replace('"', "")
nodes: Iterator[Vertex] = graph.vertices()
edges: Iterator[Edge] = graph.edges()
node_list = []
edge_list = []
def parser(node_list):
formatted_nodes = [
"{"
+ ", ".join(
f'{k}: "{v}"' if isinstance(v, str) else f"{k}: {v}"
for k, v in node.items()
)
+ "}"
for node in node_list
]
return f"""{', '.join(formatted_nodes)}"""
for node in nodes:
node_list.append(
{
"id": escape_quotes(node.vid),
"name": escape_quotes(node.name),
"description": escape_quotes(node.get_prop("description")) or "",
"_document_id": "0",
"_chunk_id": "0",
"_community_id": "0",
}
)
node_query = (
f"""CALL db.upsertVertex("{self._vertex_type}", [{parser(node_list)}])"""
)
self.conn.run(query=subj_query)
self.conn.run(query=obj_query)
self.conn.run(query=rel_query)
for edge in edges:
edge_list.append(
{
"sid": escape_quotes(edge.sid),
"tid": escape_quotes(edge.tid),
"id": escape_quotes(edge.name),
"name": escape_quotes(edge.name),
"description": escape_quotes(edge.get_prop("description")),
}
)
edge_query = f"""CALL db.upsertEdge(
"{self._edge_type}",
{{type:"{self._vertex_type}", key:"sid"}},
{{type:"{self._vertex_type}", key:"tid"}},
[{parser(edge_list)}])"""
self.conn.run(query=node_query)
self.conn.run(query=edge_query)
def truncate(self):
"""Truncate Graph."""
gql = "MATCH (n) DELETE n"
self.conn.run(gql)
def drop(self):
"""Delete Graph."""
@@ -135,9 +373,9 @@ class TuGraphStore(GraphStoreBase):
def delete_triplet(self, sub: str, rel: str, obj: str) -> None:
"""Delete triplet."""
del_query = (
f"MATCH (n1:{self._node_label} {{id:'{sub}'}})"
f"-[r:{self._edge_label} {{id:'{rel}'}}]->"
f"(n2:{self._node_label} {{id:'{obj}'}}) DELETE n1,n2,r"
f"MATCH (n1:{self._vertex_type} {{id:'{sub}'}})"
f"-[r:{self._edge_type} {{id:'{rel}'}}]->"
f"(n2:{self._vertex_type} {{id:'{obj}'}}) DELETE n1,n2,r"
)
self.conn.run(query=del_query)
@@ -148,11 +386,20 @@ class TuGraphStore(GraphStoreBase):
schema = data[0]["schema"]
return schema
def get_full_graph(self, limit: Optional[int] = None) -> MemoryGraph:
def get_full_graph(self, limit: Optional[int] = None) -> Graph:
"""Get full graph."""
if not limit:
raise Exception("limit must be set")
return self.query(f"MATCH (n)-[r]-(m) RETURN n,m,r LIMIT {limit}")
graph_result = self.query(
f"MATCH (n)-[r]-(m) RETURN n,r,m LIMIT {limit}",
white_list=["_community_id"],
)
all_graph = MemoryGraph()
for vertex in graph_result.vertices():
all_graph.upsert_vertex(vertex)
for edge in graph_result.edges():
all_graph.append_edge(edge)
return all_graph
def explore(
self,
@@ -161,8 +408,11 @@ class TuGraphStore(GraphStoreBase):
depth: Optional[int] = None,
fan: Optional[int] = None,
limit: Optional[int] = None,
) -> MemoryGraph:
) -> Graph:
"""Explore the graph from given subjects up to a depth."""
if not subs:
return MemoryGraph()
if fan is not None:
raise ValueError("Fan functionality is not supported at this time.")
else:
@@ -173,67 +423,88 @@ class TuGraphStore(GraphStoreBase):
limit_string = f"LIMIT {limit}"
if limit is None:
limit_string = ""
if direct.name == "OUT":
rel = f"-[r:{self._edge_type}*{depth_string}]->"
elif direct.name == "IN":
rel = f"<-[r:{self._edge_type}*{depth_string}]-"
else:
rel = f"-[r:{self._edge_type}*{depth_string}]-"
query = (
f"MATCH p=(n:{self._node_label})"
f"-[r:{self._edge_label}*{depth_string}]-(m:{self._node_label}) "
f"MATCH p=(n:{self._vertex_type})"
f"{rel}(m:{self._vertex_type}) "
f"WHERE n.id IN {subs} RETURN p {limit_string}"
)
return self.query(query)
def query(self, query: str, **args) -> MemoryGraph:
"""Execute a query on graph."""
def _format_paths(paths):
formatted_paths = []
for path in paths:
formatted_path = []
nodes = list(path["p"].nodes)
rels = list(path["p"].relationships)
for i in range(len(nodes)):
formatted_path.append(nodes[i]._properties["id"])
if i < len(rels):
formatted_path.append(rels[i]._properties["id"])
formatted_paths.append(formatted_path)
return formatted_paths
def _format_query_data(data):
node_ids_set = set()
rels_set = set()
from neo4j import graph
for record in data:
for key in record.keys():
value = record[key]
if isinstance(value, graph.Node):
node_id = value._properties["id"]
node_ids_set.add(node_id)
elif isinstance(value, graph.Relationship):
rel_nodes = value.nodes
prop_id = value._properties["id"]
src_id = rel_nodes[0]._properties["id"]
dst_id = rel_nodes[1]._properties["id"]
rels_set.add((src_id, dst_id, prop_id))
elif isinstance(value, graph.Path):
formatted_paths = _format_paths(data)
for path in formatted_paths:
for i in range(0, len(path), 2):
node_ids_set.add(path[i])
if i + 2 < len(path):
rels_set.add((path[i], path[i + 2], path[i + 1]))
nodes = [Vertex(node_id) for node_id in node_ids_set]
rels = [
Edge(src_id, dst_id, label=prop_id)
for (src_id, dst_id, prop_id) in rels_set
]
return {"nodes": nodes, "edges": rels}
result = self.conn.run(query=query)
graph = _format_query_data(result)
white_list = args.get("white_list", [])
graph = self._format_query_data(result, white_list)
mg = MemoryGraph()
for vertex in graph["nodes"]:
mg.upsert_vertex(vertex)
for edge in graph["edges"]:
mg.append_edge(edge)
return mg
def stream_query(self, query: str) -> Generator[Graph, None, None]:
"""Execute a stream query."""
from neo4j import graph
for record in self.conn.run_stream(query):
mg = MemoryGraph()
for key in record.keys():
value = record[key]
if isinstance(value, graph.Node):
node_id = value._properties["id"]
description = value._properties["description"]
vertex = Vertex(node_id, name=node_id, description=description)
mg.upsert_vertex(vertex)
elif isinstance(value, graph.Relationship):
rel_nodes = value.nodes
prop_id = value._properties["id"]
src_id = rel_nodes[0]._properties["id"]
dst_id = rel_nodes[1]._properties["id"]
description = value._properties["description"]
edge = Edge(src_id, dst_id, name=prop_id, description=description)
mg.append_edge(edge)
elif isinstance(value, graph.Path):
nodes = list(record["p"].nodes)
rels = list(record["p"].relationships)
formatted_path = []
for i in range(len(nodes)):
formatted_path.append(
{
"id": nodes[i]._properties["id"],
"description": nodes[i]._properties["description"],
}
)
if i < len(rels):
formatted_path.append(
{
"id": rels[i]._properties["id"],
"description": rels[i]._properties["description"],
}
)
for i in range(0, len(formatted_path), 2):
mg.upsert_vertex(
Vertex(
formatted_path[i]["id"],
name=formatted_path[i]["id"],
description=formatted_path[i]["description"],
)
)
if i + 2 < len(formatted_path):
mg.append_edge(
Edge(
formatted_path[i]["id"],
formatted_path[i + 2]["id"],
name=formatted_path[i + 1]["id"],
description=formatted_path[i + 1]["description"],
)
)
else:
vertex = Vertex("json_node", name="json_node", description=value)
mg.upsert_vertex(vertex)
yield mg

View File

@@ -19,6 +19,10 @@ class KnowledgeGraphConfig(IndexStoreConfig):
class KnowledgeGraphBase(IndexStoreBase, ABC):
"""Knowledge graph base class."""
@abstractmethod
def get_config(self) -> KnowledgeGraphConfig:
"""Get the knowledge graph config."""
@abstractmethod
def query_graph(self, limit: Optional[int] = None) -> Graph:
"""Get graph data."""

View File

@@ -0,0 +1 @@
"""Community Module."""

View File

@@ -0,0 +1,73 @@
"""Define Classes about Community."""
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional
from dbgpt.storage.graph_store.base import GraphStoreBase
from dbgpt.storage.graph_store.graph import Graph
logger = logging.getLogger(__name__)
@dataclass
class Community:
"""Community class."""
id: str
data: Optional[Graph] = None
summary: Optional[str] = None
@dataclass
class CommunityTree:
"""Represents a community tree."""
class CommunityStoreAdapter(ABC):
"""Community Store Adapter."""
def __init__(self, graph_store: GraphStoreBase):
"""Initialize Community Store Adapter."""
self._graph_store = graph_store
@property
def graph_store(self) -> GraphStoreBase:
"""Get graph store."""
return self._graph_store
@abstractmethod
async def discover_communities(self, **kwargs) -> List[str]:
"""Run community discovery."""
@abstractmethod
async def get_community(self, community_id: str) -> Community:
"""Get community."""
class CommunityMetastore(ABC):
"""Community metastore class."""
@abstractmethod
def get(self, community_id: str) -> Community:
"""Get community."""
@abstractmethod
def list(self) -> List[Community]:
"""Get all communities."""
@abstractmethod
async def search(self, query: str) -> List[Community]:
"""Search communities relevant to query."""
@abstractmethod
async def save(self, communities: List[Community]):
"""Save communities."""
@abstractmethod
async def truncate(self):
"""Truncate all communities."""
@abstractmethod
def drop(self):
"""Drop community metastore."""

View File

@@ -0,0 +1,63 @@
"""Builtin Community metastore."""
import logging
from typing import List, Optional
from dbgpt.core import Chunk
from dbgpt.datasource.rdbms.base import RDBMSConnector
from dbgpt.storage.knowledge_graph.community.base import Community, CommunityMetastore
from dbgpt.storage.vector_store.base import VectorStoreBase
logger = logging.getLogger(__name__)
class BuiltinCommunityMetastore(CommunityMetastore):
"""Builtin Community metastore."""
def __init__(
self, vector_store: VectorStoreBase, rdb_store: Optional[RDBMSConnector] = None
):
"""Initialize Community metastore."""
self._vector_store = vector_store
self._rdb_store = rdb_store
config = self._vector_store.get_config()
self._vector_space = config.name
self._max_chunks_once_load = config.max_chunks_once_load
self._max_threads = config.max_threads
self._topk = config.topk
self._score_threshold = config.score_threshold
def get(self, community_id: str) -> Community:
"""Get community."""
raise NotImplementedError("Get community not allowed")
def list(self) -> List[Community]:
"""Get all communities."""
raise NotImplementedError("List communities not allowed")
async def search(self, query: str) -> List[Community]:
"""Search communities relevant to query."""
chunks = await self._vector_store.asimilar_search_with_scores(
query, self._topk, self._score_threshold
)
return [Community(id=chunk.chunk_id, summary=chunk.content) for chunk in chunks]
async def save(self, communities: List[Community]):
"""Save communities."""
chunks = [
Chunk(id=c.id, content=c.summary, metadata={"total": len(communities)})
for c in communities
]
await self._vector_store.aload_document_with_limit(
chunks, self._max_chunks_once_load, self._max_threads
)
logger.info(f"Save {len(communities)} communities")
async def truncate(self):
"""Truncate community metastore."""
self._vector_store.truncate()
def drop(self):
"""Drop community metastore."""
if self._vector_store.vector_name_exists():
self._vector_store.delete_vector_name(self._vector_space)
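# Usage sketch (hypothetical, assuming `vector_store` is an already-configured
# VectorStoreBase with topk/score_threshold set as above):
#   metastore = BuiltinCommunityMetastore(vector_store)
#   await metastore.save([Community(id="1", summary="TuGraph and DB-GPT ...")])
#   relevant = await metastore.search("What is TuGraph?")  # -> List[Community]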

View File

@@ -0,0 +1,83 @@
"""Define the CommunityStore class."""
import logging
from typing import List
from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer
from dbgpt.storage.knowledge_graph.community.base import (
Community,
CommunityStoreAdapter,
)
from dbgpt.storage.knowledge_graph.community.community_metastore import (
BuiltinCommunityMetastore,
)
from dbgpt.storage.vector_store.base import VectorStoreBase
logger = logging.getLogger(__name__)
class CommunityStore:
"""CommunityStore Class."""
def __init__(
self,
community_store_adapter: CommunityStoreAdapter,
community_summarizer: CommunitySummarizer,
vector_store: VectorStoreBase,
):
"""Initialize the CommunityStore class."""
self._community_store_adapter = community_store_adapter
self._community_summarizer = community_summarizer
self._meta_store = BuiltinCommunityMetastore(vector_store)
async def build_communities(self):
"""Discover communities."""
community_ids = await (self._community_store_adapter.discover_communities())
# summarize communities
communities = []
for community_id in community_ids:
community = await (
self._community_store_adapter.get_community(community_id)
)
graph = community.data.format()
if not graph:
break
community.summary = await (
self._community_summarizer.summarize(graph=graph)
)
communities.append(community)
logger.info(
f"Summarize community {community_id}: " f"{community.summary[:50]}..."
)
# truncate then save new summaries
await self._meta_store.truncate()
await self._meta_store.save(communities)
async def search_communities(self, query: str) -> List[Community]:
"""Search communities."""
return await self._meta_store.search(query)
def truncate(self):
"""Truncate community store."""
logger.info("Truncate community metastore")
self._meta_store.truncate()
logger.info("Truncate community summarizer")
self._community_summarizer.truncate()
logger.info("Truncate graph")
self._community_store_adapter.graph_store.truncate()
def drop(self):
"""Drop community store."""
logger.info("Remove community metastore")
self._meta_store.drop()
logger.info("Remove community summarizer")
self._community_summarizer.drop()
logger.info("Remove graph")
self._community_store_adapter.graph_store.drop()
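# Driving sketch (hypothetical, mirrors how community_summary.py wires things up):
#   store = CommunityStore(adapter, summarizer, vector_store)
#   await store.build_communities()                   # discover, summarize, persist
#   hits = await store.search_communities("TuGraph")  # retrieve relevant summaries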

View File

@@ -0,0 +1,30 @@
"""CommunityStoreAdapter factory."""
import logging
from dbgpt.storage.graph_store.base import GraphStoreBase
from dbgpt.storage.graph_store.tugraph_store import TuGraphStore
from dbgpt.storage.knowledge_graph.community.base import CommunityStoreAdapter
from dbgpt.storage.knowledge_graph.community.tugraph_adapter import (
TuGraphCommunityStoreAdapter,
)
logger = logging.getLogger(__name__)
class CommunityStoreAdapterFactory:
"""Factory for community store adapter."""
@staticmethod
def create(graph_store: GraphStoreBase) -> CommunityStoreAdapter:
"""Create a CommunityStoreAdapter instance.
Args:
- graph_store: graph store instance, e.g. TuGraphStore
"""
if isinstance(graph_store, TuGraphStore):
return TuGraphCommunityStoreAdapter(graph_store)
else:
raise Exception(
"create community store adapter for "
f"{graph_store.__class__.__name__} failed"
)
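# Usage sketch (hypothetical, assuming `tugraph_store` is a connected TuGraphStore):
#   adapter = CommunityStoreAdapterFactory.create(tugraph_store)
#   community_ids = await adapter.discover_communities()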

View File

@@ -0,0 +1,52 @@
"""TuGraph Community Store Adapter."""
import json
import logging
from typing import List
from dbgpt.storage.graph_store.graph import MemoryGraph
from dbgpt.storage.knowledge_graph.community.base import (
Community,
CommunityStoreAdapter,
)
logger = logging.getLogger(__name__)
class TuGraphCommunityStoreAdapter(CommunityStoreAdapter):
"""TuGraph Community Store Adapter."""
MAX_HIERARCHY_LEVEL = 3
async def discover_communities(self, **kwargs) -> List[str]:
"""Run community discovery with leiden."""
mg = self._graph_store.query(
"CALL db.plugin.callPlugin"
"('CPP','leiden','{\"leiden_val\":\"_community_id\"}',60.00,false)"
)
result = mg.get_vertex("json_node").get_prop("description")
community_ids = json.loads(result)["community_id_list"]
logger.info(f"Discovered {len(community_ids)} communities.")
return community_ids
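# Note: the description payload parsed above is expected to carry the discovered ids,
# roughly '{"community_id_list": [0, 1, 2]}' (shape inferred from the parsing code;
# the actual leiden plugin output may include additional fields).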
async def get_community(self, community_id: str) -> Community:
"""Get community."""
query = (
f"MATCH (n:{self._graph_store.get_vertex_type()})"
f"WHERE n._community_id = '{community_id}' RETURN n"
)
edge_query = (
f"MATCH (n:{self._graph_store.get_vertex_type()})-"
f"[r:{self._graph_store.get_edge_type()}]-"
f"(m:{self._graph_store.get_vertex_type()})"
f"WHERE n._community_id = '{community_id}' RETURN n,r,m"
)
all_vertex_graph = self._graph_store.aquery(query)
all_edge_graph = self._graph_store.aquery(edge_query)
all_graph = MemoryGraph()
for vertex in all_vertex_graph.vertices():
all_graph.upsert_vertex(vertex)
for edge in all_edge_graph.edges():
all_graph.append_edge(edge)
return Community(id=community_id, data=all_graph)

View File

@@ -0,0 +1,373 @@
"""Define the CommunitySummaryKnowledgeGraph."""
import logging
import os
from typing import List, Optional
from dbgpt._private.pydantic import ConfigDict, Field
from dbgpt.core import Chunk
from dbgpt.rag.transformer.community_summarizer import CommunitySummarizer
from dbgpt.rag.transformer.graph_extractor import GraphExtractor
from dbgpt.storage.knowledge_graph.community.community_store import CommunityStore
from dbgpt.storage.knowledge_graph.community.factory import CommunityStoreAdapterFactory
from dbgpt.storage.knowledge_graph.knowledge_graph import (
BuiltinKnowledgeGraph,
BuiltinKnowledgeGraphConfig,
)
from dbgpt.storage.vector_store.base import VectorStoreConfig
from dbgpt.storage.vector_store.factory import VectorStoreFactory
from dbgpt.storage.vector_store.filters import MetadataFilters
logger = logging.getLogger(__name__)
class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig):
"""Community summary knowledge graph config."""
model_config = ConfigDict(arbitrary_types_allowed=True)
vector_store_type: str = Field(
default="Chroma", description="The type of vector store."
)
user: Optional[str] = Field(
default=None,
description="The user of vector store, if not set, will use the default user.",
)
password: Optional[str] = Field(
default=None,
description=(
"The password of vector store, if not set, will use the default password."
),
)
extract_topk: int = Field(
default=5,
description="Topk of knowledge graph extract",
)
extract_score_threshold: float = Field(
default=0.3,
description="Recall score of knowledge graph extract",
)
community_topk: int = Field(
default=50,
description="Topk of community search in knowledge graph",
)
community_score_threshold: float = Field(
default=0.0,
description="Recall score of community search in knowledge graph",
)
class CommunitySummaryKnowledgeGraph(BuiltinKnowledgeGraph):
"""Community summary knowledge graph class."""
def __init__(self, config: CommunitySummaryKnowledgeGraphConfig):
"""Initialize community summary knowledge graph class."""
super().__init__(config)
self._config = config
self._vector_store_type = os.getenv(
"VECTOR_STORE_TYPE", config.vector_store_type
)
self._extract_topk = int(
os.getenv("KNOWLEDGE_GRAPH_EXTRACT_SEARCH_TOP_SIZE", config.extract_topk)
)
self._extract_score_threshold = float(
os.getenv(
"KNOWLEDGE_GRAPH_EXTRACT_SEARCH_RECALL_SCORE",
config.extract_score_threshold,
)
)
self._community_topk = int(
os.getenv(
"KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_TOP_SIZE", config.community_topk
)
)
self._community_score_threshold = float(
os.getenv(
"KNOWLEDGE_GRAPH_COMMUNITY_SEARCH_RECALL_SCORE",
config.community_score_threshold,
)
)
def extractor_configure(name: str, cfg: VectorStoreConfig):
cfg.name = name
cfg.embedding_fn = config.embedding_fn
cfg.max_chunks_once_load = config.max_chunks_once_load
cfg.max_threads = config.max_threads
cfg.user = config.user
cfg.password = config.password
cfg.topk = self._extract_topk
cfg.score_threshold = self._extract_score_threshold
self._graph_extractor = GraphExtractor(
self._llm_client,
self._model_name,
VectorStoreFactory.create(
self._vector_store_type,
config.name + "_CHUNK_HISTORY",
extractor_configure,
),
)
def community_store_configure(name: str, cfg: VectorStoreConfig):
cfg.name = name
cfg.embedding_fn = config.embedding_fn
cfg.max_chunks_once_load = config.max_chunks_once_load
cfg.max_threads = config.max_threads
cfg.user = config.user
cfg.password = config.password
cfg.topk = self._community_topk
cfg.score_threshold = self._community_score_threshold
self._community_store = CommunityStore(
CommunityStoreAdapterFactory.create(self._graph_store),
CommunitySummarizer(self._llm_client, self._model_name),
VectorStoreFactory.create(
self._vector_store_type,
config.name + "_COMMUNITY_SUMMARY",
community_store_configure,
),
)
def get_config(self) -> BuiltinKnowledgeGraphConfig:
"""Get the knowledge graph config."""
return self._config
async def aload_document(self, chunks: List[Chunk]) -> List[str]:
"""Extract and persist graph."""
# todo add doc node
for chunk in chunks:
# todo add chunk node
# todo add relation doc-chunk
# extract graphs and save
graphs = await self._graph_extractor.extract(chunk.content)
for graph in graphs:
self._graph_store.insert_graph(graph)
# build communities and save
await self._community_store.build_communities()
return [chunk.chunk_id for chunk in chunks]
async def asimilar_search_with_scores(
self,
text,
topk,
score_threshold: float,
filters: Optional[MetadataFilters] = None,
) -> List[Chunk]:
"""Retrieve relevant community summaries."""
# global search: retrieve relevant community summaries
communities = await self._community_store.search_communities(text)
summaries = [
f"Section {i + 1}:\n{community.summary}"
for i, community in enumerate(communities)
]
context = "\n".join(summaries) if summaries else ""
# local search: extract keywords and explore subgraph
keywords = await self._keyword_extractor.extract(text)
subgraph = self._graph_store.explore(keywords, limit=topk).format()
logger.info(f"Search subgraph from {len(keywords)} keywords")
if not summaries and not subgraph:
return []
# merge search results into context
content = HYBRID_SEARCH_PT_CN.format(context=context, graph=subgraph)
return [Chunk(content=content)]
def truncate(self) -> List[str]:
"""Truncate knowledge graph."""
logger.info("Truncate community store")
self._community_store.truncate()
logger.info("Truncate keyword extractor")
self._keyword_extractor.truncate()
logger.info("Truncate triplet extractor")
self._graph_extractor.truncate()
return [self._config.name]
def delete_vector_name(self, index_name: str):
"""Delete knowledge graph."""
logger.info("Drop community store")
self._community_store.drop()
logger.info("Drop keyword extractor")
self._keyword_extractor.drop()
logger.info("Drop triplet extractor")
self._graph_extractor.drop()
HYBRID_SEARCH_PT_CN = (
"## 角色\n"
"你非常擅长结合提示词模板提供的[上下文]信息与[知识图谱]信息,"
"准确恰当地回答用户的问题,并保证不会输出与上下文和知识图谱无关的信息。"
"\n"
"## 技能\n"
"### 技能 1: 上下文理解\n"
"- 准确地理解[上下文]提供的信息,上下文信息可能被拆分为多个章节。\n"
"- 上下文的每个章节内容都会以[Section]开始,并按需进行了编号。\n"
"- 上下文信息提供了与用户问题相关度最高的总结性描述,请合理使用它们。"
"### 技能 2: 知识图谱理解\n"
"- 准确地识别[知识图谱]中提供的[Entities:]章节中的实体信息"
"和[Relationships:]章节中的关系信息,实体和关系信息的一般格式为:\n"
"```"
"* 实体信息格式:\n"
"- (实体名)\n"
"- (实体名:实体描述)\n"
"- (实体名:实体属性表)\n"
"- (文本块ID:文档块内容)\n"
"- (目录ID:目录名)\n"
"- (文档ID:文档名称)\n"
"\n"
"* 关系信息的格式:\n"
"- (来源实体名)-[关系名]->(目标实体名)\n"
"- (来源实体名)-[关系名:关系描述]->(目标实体名)\n"
"- (来源实体名)-[关系名:关系属性表]->(目标实体名)\n"
"- (文本块实体)-[包含]->(实体名)\n"
"- (目录ID)-[包含]->(文本块实体)\n"
"- (目录ID)-[包含]->(子目录ID)\n"
"- (文档ID)-[包含]->(文本块实体)\n"
"- (文档ID)-[包含]->(目录ID)\n"
"```"
"- 正确地将关系信息中的实体名/ID与实体信息关联还原出图结构。"
"- 将图结构所表达的信息作为用户提问的明细上下文,辅助生成更好的答案。\n"
"\n"
"## 约束条件\n"
"- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。\n"
"- 若[知识图谱]没有提供信息,此时应根据[上下文]提供的信息回答问题。"
"- 确保以第三人称书写,从客观角度结合[上下文]和[知识图谱]表达的信息回答问题。\n"
"- 若提供的信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。\n"
"- 避免使用停用词和过于常见的词汇。\n"
"\n"
"## 参考案例\n"
"```\n"
"[上下文]:\n"
"Section 1:\n"
"菲尔・贾伯的大儿子叫雅各布・贾伯。\n"
"Section 2:\n"
"菲尔・贾伯的小儿子叫比尔・贾伯。\n"
"[知识图谱]:\n"
"Entities:\n"
"(菲尔・贾伯#菲尔兹咖啡创始人)\n"
"(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌)\n"
"(雅各布・贾伯#菲尔・贾伯的儿子)\n"
"(美国多地#菲尔兹咖啡的扩展地区)\n"
"\n"
"Relationships:\n"
"(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立)\n"
"(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点)\n"
"(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子)\n"
"(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官)\n"
"(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围)\n"
"```\n"
"\n"
"----\n"
"\n"
"接下来的[上下文]和[知识图谱]的信息,可以帮助你回答更好地用户的问题。\n"
"\n"
"[上下文]:\n"
"{context}\n"
"\n"
"[知识图谱]:\n"
"{graph}\n"
"\n"
)
HYBRID_SEARCH_PT_EN = (
"## Role\n"
"You excel at combining the information provided in the [Context] with "
"information from the [KnowledgeGraph] to accurately and appropriately "
"answer user questions, ensuring that you do not output information "
"unrelated to the context and knowledge graph.\n"
"\n"
"## Skills\n"
"### Skill 1: Context Understanding\n"
"- Accurately understand the information provided in the [Context], "
"which may be divided into several sections.\n"
"- Each section in the context will start with [Section] "
"and may be numbered as needed.\n"
"- The context provides a summary description most relevant to the users "
"question, and it should be used wisely."
"### Skill 2: Knowledge Graph Understanding\n"
"- Accurately identify entity information in the [Entities:] section and "
"relationship information in the [Relationships:] section "
"of the [KnowledgeGraph]. The general format for entity "
"and relationship information is:\n"
"```"
"* Entity Information Format:\n"
"- (entity_name)\n"
"- (entity_name: entity_description)\n"
"- (entity_name: entity_property_map)\n"
"- (chunk_id: chunk_content)\n"
"- (catalog_id: catalog_name)\n"
"- (document_id: document_name)\n"
"\n"
"* Relationship Information Format:\n"
"- (source_entity_name)-[relationship_name]->(target_entity_name)\n"
"- (source_entity_name)-[relationship_name: relationship_description]->"
"(target_entity_name)\n"
"- (source_entity_name)-[relationship_name: relationship_property_map]->"
"(target_entity_name)\n"
"- (chunk_id)-[Contains]->(entity_name)\n"
"- (catalog_id)-[Contains]->(chunk_id)\n"
"- (catalog_id)-[Contains]->(sub_catalog_id)\n"
"- (document_id)-[Contains]->(chunk_id)\n"
"- (document_id)-[Contains]->(catalog_id)\n"
"```"
"- Correctly associate entity names/IDs in the relationship information "
"with entity information to restore the graph structure."
"- Use the information expressed by the graph structure as detailed "
"context for the user's query to assist in generating better answers.\n"
"\n"
"## Constraints\n"
"- Don't describe your thought process in the answer, provide the answer "
"to the user's question directly without generating irrelevant information."
"- If the [KnowledgeGraph] does not provide information, you should answer "
"the question based on the information provided in the [Context]."
"- Ensure to write in the third person, responding to questions from "
"an objective perspective based on the information combined from the "
"[Context] and the [KnowledgeGraph].\n"
"- If the provided information is contradictory, resolve the "
"contradictions and provide a single, coherent description.\n"
"- Avoid using stop words and overly common vocabulary.\n"
"\n"
"## Reference Example\n"
"```\n"
"[Context]:\n"
"Section 1:\n"
"Phil Schiller's eldest son is Jacob Schiller.\n"
"Section 2:\n"
"Phil Schiller's youngest son is Bill Schiller.\n"
"[KnowledgeGraph]:\n"
"Entities:\n"
"(Phil Jaber#Founder of Philz Coffee)\n"
"(Philz Coffee#Coffee brand founded in Berkeley, California)\n"
"(Jacob Jaber#Son of Phil Jaber)\n"
"(Multiple locations in the USA#Expansion regions of Philz Coffee)\n"
"\n"
"Relationships:\n"
"(Phil Jaber#Created#Philz Coffee"
"#Founded in Berkeley, California in 1978)\n"
"(Philz Coffee#Located in#Berkeley, California"
"#Founding location of Philz Coffee)\n"
"(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber)\n"
"(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005)\n"
"(Philz Coffee#Expanded to#Multiple locations in the USA"
"#Expansion regions of Philz Coffee)\n"
"```\n"
"\n"
"----\n"
"\n"
"The following information from the [Context] and [KnowledgeGraph] can "
"help you better answer user questions.\n"
"\n"
"[Context]:\n"
"{context}\n"
"\n"
"[KnowledgeGraph]:\n"
"{graph}\n"
"\n"
)
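# Usage note: asimilar_search_with_scores() above fills the template via
# HYBRID_SEARCH_PT_CN.format(context=context, graph=subgraph); HYBRID_SEARCH_PT_EN is the
# English counterpart with the same {context}/{graph} placeholders.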

View File

@@ -36,8 +36,9 @@ class BuiltinKnowledgeGraph(KnowledgeGraphBase):
def __init__(self, config: BuiltinKnowledgeGraphConfig):
"""Create builtin knowledge graph instance."""
self._config = config
super().__init__()
self._config = config
self._llm_client = config.llm_client
if not self._llm_client:
raise ValueError("No llm client provided.")
@@ -45,17 +46,19 @@ class BuiltinKnowledgeGraph(KnowledgeGraphBase):
self._model_name = config.model_name
self._triplet_extractor = TripletExtractor(self._llm_client, self._model_name)
self._keyword_extractor = KeywordExtractor(self._llm_client, self._model_name)
self._graph_store_type = (
os.getenv("GRAPH_STORE_TYPE", "TuGraph") or config.graph_store_type
)
self._graph_store = self.__init_graph_store(config)
def __init_graph_store(self, config) -> GraphStoreBase:
def configure(cfg: GraphStoreConfig):
cfg.name = self._config.name
cfg.embedding_fn = self._config.embedding_fn
cfg.name = config.name
cfg.embedding_fn = config.embedding_fn
self._graph_store: GraphStoreBase = GraphStoreFactory.create(
self._graph_store_type, configure
)
graph_store_type = os.getenv("GRAPH_STORE_TYPE") or config.graph_store_type
return GraphStoreFactory.create(graph_store_type, configure)
def get_config(self) -> BuiltinKnowledgeGraphConfig:
"""Get the knowledge graph config."""
return self._config
def load_document(self, chunks: List[Chunk]) -> List[str]:
"""Extract and persist triplets to graph store."""
@@ -113,35 +116,59 @@ class BuiltinKnowledgeGraph(KnowledgeGraphBase):
# extract keywords and explore graph store
keywords = await self._keyword_extractor.extract(text)
subgraph = self._graph_store.explore(keywords, limit=topk)
subgraph = self._graph_store.explore(keywords, limit=topk).format()
logger.info(f"Search subgraph from {len(keywords)} keywords")
if not subgraph:
return []
content = (
"The following vertices and edges data after [Subgraph Data] "
"are retrieved from the knowledge graph based on the keywords:\n"
f"Keywords:\n{','.join(keywords)}\n"
"The following entities and relationships provided after "
"[Subgraph] are retrieved from the knowledge graph "
"based on the keywords:\n"
f"\"{','.join(keywords)}\".\n"
"---------------------\n"
"You can refer to the sample vertices and edges to understand "
"the real knowledge graph data provided by [Subgraph Data].\n"
"Sample vertices:\n"
"The following examples after [Entities] and [Relationships] that "
"can help you understand the data format of the knowledge graph, "
"but do not use them in the answer.\n"
"[Entities]:\n"
"(alice)\n"
"(bob:{age:28})\n"
'(carry:{age:18;role:"teacher"})\n\n'
"Sample edges:\n"
"[Relationships]:\n"
"(alice)-[reward]->(alice)\n"
'(alice)-[notify:{method:"email"}]->'
'(carry:{age:18;role:"teacher"})\n'
'(bob:{age:28})-[teach:{course:"math";hour:180}]->(alice)\n'
"---------------------\n"
f"Subgraph Data:\n{subgraph.format()}\n"
f"[Subgraph]:\n{subgraph}\n"
)
return [Chunk(content=content, metadata=subgraph.schema())]
return [Chunk(content=content)]
def query_graph(self, limit: Optional[int] = None) -> Graph:
"""Query graph."""
return self._graph_store.get_full_graph(limit)
def truncate(self) -> List[str]:
"""Truncate knowledge graph."""
logger.info(f"Truncate graph {self._config.name}")
self._graph_store.truncate()
logger.info("Truncate keyword extractor")
self._keyword_extractor.truncate()
logger.info("Truncate triplet extractor")
self._triplet_extractor.truncate()
return [self._config.name]
def delete_vector_name(self, index_name: str):
"""Delete vector name."""
logger.info(f"Remove graph index {index_name}")
logger.info(f"Drop graph {index_name}")
self._graph_store.drop()
logger.info("Drop keyword extractor")
self._keyword_extractor.drop()
logger.info("Drop triplet extractor")
self._triplet_extractor.drop()

View File

@@ -1,12 +1,8 @@
"""OpenSPG class."""
import logging
from typing import List, Optional
from dbgpt._private.pydantic import ConfigDict
from dbgpt.core import Chunk
from dbgpt.storage.graph_store.graph import Graph, MemoryGraph
from dbgpt.storage.knowledge_graph.base import KnowledgeGraphBase, KnowledgeGraphConfig
from dbgpt.storage.vector_store.filters import MetadataFilters
logger = logging.getLogger(__name__)
@@ -21,29 +17,3 @@ class OpenSPG(KnowledgeGraphBase):
"""OpenSPG class."""
# todo: add OpenSPG implementation
def __init__(self, config: OpenSPGConfig):
"""Initialize the OpenSPG with config details."""
pass
def load_document(self, chunks: List[Chunk]) -> List[str]:
"""Load document."""
return []
def similar_search_with_scores(
self,
text,
topk,
score_threshold: float,
filters: Optional[MetadataFilters] = None,
) -> List[Chunk]:
"""Similar with scores."""
return []
def query_graph(self, limit: Optional[int] = None) -> Graph:
"""Query graph."""
return MemoryGraph()
def delete_vector_name(self, index_name: str):
"""Delete vector name."""
pass

View File

@@ -56,6 +56,15 @@ def _import_builtin_knowledge_graph() -> Tuple[Type, Type]:
return BuiltinKnowledgeGraph, BuiltinKnowledgeGraphConfig
def _import_community_summary_knowledge_graph() -> Tuple[Type, Type]:
from dbgpt.storage.knowledge_graph.community_summary import (
CommunitySummaryKnowledgeGraph,
CommunitySummaryKnowledgeGraphConfig,
)
return CommunitySummaryKnowledgeGraph, CommunitySummaryKnowledgeGraphConfig
def _import_openspg() -> Tuple[Type, Type]:
from dbgpt.storage.knowledge_graph.open_spg import OpenSPG, OpenSPGConfig
@@ -86,6 +95,8 @@ def __getattr__(name: str) -> Tuple[Type, Type]:
return _import_elastic()
elif name == "KnowledgeGraph":
return _import_builtin_knowledge_graph()
elif name == "CommunitySummaryKnowledgeGraph":
return _import_community_summary_knowledge_graph()
elif name == "OpenSPG":
return _import_openspg()
elif name == "FullText":
@@ -103,7 +114,7 @@ __vector_store__ = [
"ElasticSearch",
]
__knowledge_graph__ = ["KnowledgeGraph", "OpenSPG"]
__knowledge_graph__ = ["KnowledgeGraph", "CommunitySummaryKnowledgeGraph", "OpenSPG"]
__document_store__ = ["FullText"]

View File

@@ -99,6 +99,14 @@ class VectorStoreConfig(IndexStoreConfig):
"The password of vector store, if not set, will use the default password."
),
)
topk: int = Field(
default=5,
description="Topk of vector search",
)
score_threshold: float = Field(
default=0.3,
description="Recall score of vector search",
)
class VectorStoreBase(IndexStoreBase, ABC):
@@ -108,6 +116,10 @@ class VectorStoreBase(IndexStoreBase, ABC):
"""Initialize vector store."""
super().__init__(executor)
@abstractmethod
def get_config(self) -> VectorStoreConfig:
"""Get the vector store config."""
def filter_by_score_threshold(
self, chunks: List[Chunk], score_threshold: float
) -> List[Chunk]:
@@ -126,7 +138,7 @@ class VectorStoreBase(IndexStoreBase, ABC):
metadata=chunk.metadata,
content=chunk.content,
score=chunk.score,
chunk_id=str(id),
chunk_id=chunk.chunk_id,
)
for chunk in chunks
if chunk.score >= score_threshold

View File

@@ -63,6 +63,8 @@ class ChromaStore(VectorStoreBase):
vector_store_config(ChromaVectorConfig): vector store config.
"""
super().__init__()
self._vector_store_config = vector_store_config
chroma_vector_config = vector_store_config.to_dict(exclude_none=True)
chroma_path = chroma_vector_config.get(
"persist_path", os.path.join(PILOT_PATH, "data")
@@ -89,6 +91,10 @@ class ChromaStore(VectorStoreBase):
metadata=collection_metadata,
)
def get_config(self) -> ChromaVectorConfig:
"""Get the vector store config."""
return self._vector_store_config
def similar_search(
self, text, topk, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
@@ -100,10 +106,16 @@ class ChromaStore(VectorStoreBase):
filters=filters,
)
return [
Chunk(content=chroma_result[0], metadata=chroma_result[1] or {}, score=0.0)
Chunk(
content=chroma_result[0],
metadata=chroma_result[1] or {},
score=0.0,
chunk_id=chroma_result[2],
)
for chroma_result in zip(
chroma_results["documents"][0],
chroma_results["metadatas"][0],
chroma_results["ids"][0],
)
]
@@ -134,12 +146,14 @@ class ChromaStore(VectorStoreBase):
content=chroma_result[0],
metadata=chroma_result[1] or {},
score=(1 - chroma_result[2]),
chunk_id=chroma_result[3],
)
)
for chroma_result in zip(
chroma_results["documents"][0],
chroma_results["metadatas"][0],
chroma_results["distances"][0],
chroma_results["ids"][0],
)
]
return self.filter_by_score_threshold(chunks, score_threshold)
@@ -181,6 +195,20 @@ class ChromaStore(VectorStoreBase):
if len(ids) > 0:
self._collection.delete(ids=ids)
def truncate(self) -> List[str]:
"""Truncate data index_name."""
logger.info(f"begin truncate chroma collection:{self._collection.name}")
results = self._collection.get()
ids = results.get("ids")
if ids:
self._collection.delete(ids=ids)
logger.info(
f"truncate chroma collection {self._collection.name} "
f"{len(ids)} chunks success"
)
return ids
return []
def convert_metadata_filters(
self,
filters: MetadataFilters,

View File

@@ -126,6 +126,8 @@ class ElasticStore(VectorStoreBase):
vector_store_config (ElasticsearchVectorConfig): ElasticsearchStore config.
"""
super().__init__()
self._vector_store_config = vector_store_config
connect_kwargs = {}
elasticsearch_vector_config = vector_store_config.dict()
self.uri = elasticsearch_vector_config.get("uri") or os.getenv(
@@ -234,6 +236,10 @@ class ElasticStore(VectorStoreBase):
except Exception as e:
logger.error(f"ElasticSearch connection failed: {e}")
def get_config(self) -> ElasticsearchVectorConfig:
"""Get the vector store config."""
return self._vector_store_config
def load_document(
self,
chunks: List[Chunk],

View File

@@ -0,0 +1,44 @@
"""Vector store factory."""
import logging
from typing import Tuple, Type
from dbgpt.storage import vector_store
from dbgpt.storage.vector_store.base import VectorStoreBase, VectorStoreConfig
logger = logging.getLogger(__name__)
class VectorStoreFactory:
"""Factory for vector store."""
@staticmethod
def create(
vector_store_type: str, vector_space_name: str, vector_store_configure=None
) -> VectorStoreBase:
"""Create a VectorStore instance.
Args:
- vector_store_type: vector store type Chroma, Milvus, etc.
- vector_space_name: vector space (collection) name
- vector_store_configure: optional callback to configure the vector store config
"""
store_cls, cfg_cls = VectorStoreFactory.__find_type(vector_store_type)
try:
config = cfg_cls()
if vector_store_configure:
vector_store_configure(vector_space_name, config)
return store_cls(config)
except Exception as e:
logger.error("create vector store failed: %s", e)
raise e
@staticmethod
def __find_type(vector_store_type: str) -> Tuple[Type, Type]:
for t in vector_store.__vector_store__:
if t.lower() == vector_store_type.lower():
store_cls, cfg_cls = getattr(vector_store, t)
if issubclass(store_cls, VectorStoreBase) and issubclass(
cfg_cls, VectorStoreConfig
):
return store_cls, cfg_cls
raise Exception(f"Vector store {vector_store_type} not supported")

View File

@@ -150,6 +150,8 @@ class MilvusStore(VectorStoreBase):
refer to https://milvus.io/docs/v2.0.x/manage_connection.md
"""
super().__init__()
self._vector_store_config = vector_store_config
try:
from pymilvus import connections
except ImportError:
@@ -363,6 +365,10 @@ class MilvusStore(VectorStoreBase):
return res.primary_keys
def get_config(self) -> MilvusVectorConfig:
"""Get the vector store config."""
return self._vector_store_config
def load_document(self, chunks: List[Chunk]) -> List[str]:
"""Load document in vector database."""
batch_size = 500

View File

@@ -718,6 +718,8 @@ class OceanBaseStore(VectorStoreBase):
if vector_store_config.embedding_fn is None:
raise ValueError("embedding_fn is required for OceanBaseStore")
super().__init__()
self._vector_store_config = vector_store_config
self.embeddings = vector_store_config.embedding_fn
self.collection_name = vector_store_config.name
vector_store_config = vector_store_config.dict()
@@ -760,6 +762,10 @@ class OceanBaseStore(VectorStoreBase):
enable_normalize_vector=self.OB_ENABLE_NORMALIZE_VECTOR,
)
def get_config(self) -> OceanBaseConfig:
"""Get the vector store config."""
return self._vector_store_config
def similar_search(
self, text, topk, filters: Optional[MetadataFilters] = None, **kwargs: Any
) -> List[Chunk]:

View File

@@ -64,6 +64,8 @@ class PGVectorStore(VectorStoreBase):
"Please install the `langchain` package to use the PGVector."
)
super().__init__()
self._vector_store_config = vector_store_config
self.connection_string = vector_store_config.connection_string
self.embeddings = vector_store_config.embedding_fn
self.collection_name = vector_store_config.name
@@ -74,6 +76,10 @@ class PGVectorStore(VectorStoreBase):
connection_string=self.connection_string,
)
def get_config(self) -> PGVectorConfig:
"""Get the vector store config."""
return self._vector_store_config
def similar_search(
self, text: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:

View File

@@ -69,6 +69,8 @@ class WeaviateStore(VectorStoreBase):
"Please install it with `pip install weaviate-client`."
)
super().__init__()
self._vector_store_config = vector_store_config
self.weaviate_url = vector_store_config.weaviate_url
self.embedding = vector_store_config.embedding_fn
self.vector_name = vector_store_config.name
@@ -78,6 +80,10 @@ class WeaviateStore(VectorStoreBase):
self.vector_store_client = weaviate.Client(self.weaviate_url)
def get_config(self) -> WeaviateVectorConfig:
"""Get the vector store config."""
return self._vector_store_config
def similar_search(
self, text: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:

View File

@@ -1,12 +1,19 @@
import asyncio
import os
import pytest
from dbgpt.configs.model_config import ROOT_PATH
from dbgpt.core import Chunk, HumanPromptTemplate, ModelMessage, ModelRequest
from dbgpt.model.proxy.llms.chatgpt import OpenAILLMClient
from dbgpt.rag import ChunkParameters
from dbgpt.rag.assembler import EmbeddingAssembler
from dbgpt.rag.embedding import DefaultEmbeddingFactory
from dbgpt.rag.knowledge import KnowledgeFactory
from dbgpt.rag.retriever import RetrieverStrategy
from dbgpt.storage.knowledge_graph.community_summary import (
CommunitySummaryKnowledgeGraph,
CommunitySummaryKnowledgeGraphConfig,
)
from dbgpt.storage.knowledge_graph.knowledge_graph import (
BuiltinKnowledgeGraph,
BuiltinKnowledgeGraphConfig,
@@ -15,7 +22,7 @@ from dbgpt.storage.knowledge_graph.knowledge_graph import (
"""GraphRAG example.
pre-requirements:
* Set LLM config (url/sk) in `.env`.
* Setup/startup TuGraph from: https://github.com/TuGraph-family/tugraph-db
* Install pytest utils: `pip install pytest pytest-asyncio`
* Config TuGraph following the format below.
```
GRAPH_STORE_TYPE=TuGraph
@@ -24,46 +31,100 @@ from dbgpt.storage.knowledge_graph.knowledge_graph import (
TUGRAPH_USERNAME=admin
TUGRAPH_PASSWORD=73@TuGraph
```
Examples:
.. code-block:: shell
python examples/rag/graph_rag_example.py
pytest -s examples/rag/graph_rag_example.py
"""
llm_client = OpenAILLMClient()
model_name = "gpt-4o-mini"
def _create_kg_connector():
@pytest.mark.asyncio
async def test_naive_graph_rag():
await __run_graph_rag(
knowledge_file="examples/test_files/graphrag-mini.md",
chunk_strategy="CHUNK_BY_SIZE",
knowledge_graph=__create_naive_kg_connector(),
question="What's the relationship between TuGraph and DB-GPT ?",
)
@pytest.mark.asyncio
async def test_community_graph_rag():
await __run_graph_rag(
knowledge_file="examples/test_files/graphrag-mini.md",
chunk_strategy="CHUNK_BY_MARKDOWN_HEADER",
knowledge_graph=__create_community_kg_connector(),
question="What's the relationship between TuGraph and DB-GPT ?",
)
def __create_naive_kg_connector():
"""Create knowledge graph connector."""
return BuiltinKnowledgeGraph(
config=BuiltinKnowledgeGraphConfig(
name="graph_rag_test",
name="naive_graph_rag_test",
embedding_fn=None,
llm_client=OpenAILLMClient(),
model_name="gpt-3.5-turbo",
llm_client=llm_client,
model_name=model_name,
graph_store_type="MemoryGraph",
),
)
async def main():
file_path = os.path.join(ROOT_PATH, "examples/test_files/tranformers_story.md")
def __create_community_kg_connector():
"""Create community knowledge graph connector."""
return CommunitySummaryKnowledgeGraph(
config=CommunitySummaryKnowledgeGraphConfig(
name="community_graph_rag_test",
embedding_fn=DefaultEmbeddingFactory.openai(),
llm_client=llm_client,
model_name=model_name,
graph_store_type="TuGraphGraph",
),
)
async def ask_chunk(chunk: Chunk, question) -> str:
rag_template = (
"Based on the following [Context] {context}, " "answer [Question] {question}."
)
template = HumanPromptTemplate.from_template(rag_template)
messages = template.format_messages(context=chunk.content, question=question)
model_messages = ModelMessage.from_base_messages(messages)
request = ModelRequest(model=model_name, messages=model_messages)
response = await llm_client.generate(request=request)
if not response.success:
code = str(response.error_code)
reason = response.text
raise Exception(f"request llm failed ({code}) {reason}")
return response.text
async def __run_graph_rag(knowledge_file, chunk_strategy, knowledge_graph, question):
file_path = os.path.join(ROOT_PATH, knowledge_file).format()
knowledge = KnowledgeFactory.from_file_path(file_path)
graph_store = _create_kg_connector()
chunk_parameters = ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
# get embedding assembler
assembler = await EmbeddingAssembler.aload_from_knowledge(
knowledge=knowledge,
chunk_parameters=chunk_parameters,
index_store=graph_store,
retrieve_strategy=RetrieverStrategy.GRAPH,
)
await assembler.apersist()
# get embeddings retriever
retriever = assembler.as_retriever(3)
chunks = await retriever.aretrieve_with_scores(
"What actions has Megatron taken ?", score_threshold=0.3
)
print(f"embedding rag example results:{chunks}")
graph_store.delete_vector_name("graph_rag_test")
try:
chunk_parameters = ChunkParameters(chunk_strategy=chunk_strategy)
# get embedding assembler
assembler = await EmbeddingAssembler.aload_from_knowledge(
knowledge=knowledge,
chunk_parameters=chunk_parameters,
index_store=knowledge_graph,
retrieve_strategy=RetrieverStrategy.GRAPH,
)
await assembler.apersist()
if __name__ == "__main__":
asyncio.run(main())
# get embeddings retriever
retriever = assembler.as_retriever(1)
chunks = await retriever.aretrieve_with_scores(question, score_threshold=0.3)
# chat
print(f"{await ask_chunk(chunks[0], question)}")
finally:
knowledge_graph.delete_vector_name(knowledge_graph.get_config().name)

View File

@@ -0,0 +1,185 @@
# DB-GPT: 用私有化LLM技术定义数据库下一代交互方式
## DB-GPT 是什么?
🤖️ **DB-GPT是一个开源的AI原生数据应用开发框架(AI Native Data App Development framework with AWEL(Agentic Workflow Expression Language) and Agents)。**
目的是构建大模型领域的基础设施,通过开发多模型管理(SMMF)、Text2SQL效果优化、RAG框架以及优化、Multi-Agents框架协作、AWEL(智能体工作流编排)等多种技术能力,让围绕数据库构建大模型应用更简单,更方便。
🚀 **数据3.0 时代,基于模型、数据库,企业/开发者可以用更少的代码搭建自己的专属应用。**
## 效果演示
### AI原生数据智能应用
---
- 🔥🔥🔥 [V0.5.0发布——通过工作流与智能体开发原生数据应用](https://www.yuque.com/eosphoros/dbgpt-docs/owcrh9423f9rqkg2)
---
### Data Agents
![](https://github.com/eosphoros-ai/DB-GPT/assets/17919400/37d116fc-d9dd-4efa-b4df-9ab02b22541c#id=KpPbI&originHeight=1880&originWidth=3010&originalType=binary&ratio=1&rotation=0&showTitle=false&status=done&style=none)
![](https://github.com/eosphoros-ai/DB-GPT/assets/17919400/a7bf6d65-92d1-4f0e-aaf0-259ccdde22fd#id=EHUr0&originHeight=1872&originWidth=3396&originalType=binary&ratio=1&rotation=0&showTitle=false&status=done&style=none)
![](https://github.com/eosphoros-ai/DB-GPT/assets/17919400/1849a79a-f7fd-40cf-bc9c-b117a041dd6a#id=gveW4&originHeight=1868&originWidth=2996&originalType=binary&ratio=1&rotation=0&showTitle=false&status=done&style=none)
## 目录
- [架构方案](#架构方案)
- [安装](#安装)
- [特性简介](#特性一览)
- [贡献](#贡献)
- [路线图](#路线图)
- [联系我们](#联系我们)
## 架构方案
![image.png](https://intranetproxy.alipay.com/skylark/lark/0/2024/png/26456775/1724764757479-314c8ed2-24e6-4cc2-8a29-e84e626d6755.png#clientId=u47bade0c-6d5b-4&from=paste&height=721&id=u6344fee6&originHeight=1442&originWidth=1590&originalType=binary&ratio=2&rotation=0&showTitle=false&size=766959&status=done&style=none&taskId=u0f69fc62-9392-468b-a990-84de8e3a3eb&title=&width=795)
核心能力主要有以下几个部分:
- **RAG(Retrieval Augmented Generation)**RAG是当下落地实践最多也是最迫切的领域DB-GPT目前已经实现了一套基于RAG的框架用户可以基于DB-GPT的RAG能力构建知识类应用。
- **GBI**生成式BI是DB-GPT项目的核心能力之一为构建企业报表分析、业务洞察提供基础的数智化技术保障。
- **微调框架**: 模型微调是任何一个企业在垂直、细分领域落地不可或缺的能力DB-GPT提供了完整的微调框架实现与DB-GPT项目的无缝打通在最近的微调中基于spider的准确率已经做到了82.5%
- **数据驱动的Multi-Agents框架**: DB-GPT提供了数据驱动的自进化Multi-Agents框架目标是可以持续基于数据做决策与执行。
- **数据工厂**: 数据工厂主要是在大模型时代,做可信知识、数据的清洗加工。
- **数据源**: 对接各类数据源实现生产业务数据无缝对接到DB-GPT核心能力。
### 智能体编排语言(AWEL)
AWELAgentic Workflow Expression Language是一套专为大模型应用开发设计的智能体工作流表达语言它提供了强大的功能和灵活性。通过 AWEL API 您可以专注于大模型应用业务逻辑的开发而不需要关注繁琐的模型和环境细节AWEL 采用分层 API 的设计, AWEL 的分层 API 设计架构如下图所示:
![image.png](https://cdn.nlark.com/yuque/0/2023/png/23108892/1700743735979-fcae1255-5b21-4071-a805-84d9f98247ef.png#averageHue=%23efefef&clientId=u62c750d6-91b4-4&from=paste&height=588&id=ua7e2a75b&originHeight=819&originWidth=586&originalType=binary&ratio=2&rotation=0&showTitle=false&size=101075&status=done&style=shadow&taskId=u753583cb-7d4f-4267-962d-a892e5150d2&title=&width=421)
AWEL在设计上分为三个层次依次为算子层、AgentFrame层以及DSL层以下对三个层次做简要介绍。
- 算子层
算子层是指LLM应用开发过程中一个个最基本的操作原子比如在一个RAG应用开发时。 检索、向量化、模型交互、Prompt处理等都是一个个基础算子。 在后续的发展中,框架会进一步对算子进行抽象与标准化设计。 可以根据标准API快速实现一组算子。
- AgentFrame层
AgentFrame层将算子做进一步封装可以基于算子做链式计算。 这一层链式计算也支持分布式支持如filter、join、map、reduce等一套链式计算操作。 后续也将支持更多的计算逻辑。
- DSL层
DSL层提供一套标准的结构化表示语言可以通过写DSL语句完成AgentFrame与算子的操作让围绕数据编写大模型应用更具确定性避免通过自然语言编写的不确定性使得围绕数据与大模型的应用编程变为确定性应用编程。
### RAG架构
![](https://github.com/eosphoros-ai/DB-GPT/raw/main/assets/RAG-IN-ACTION.jpg#from=url&id=JsJTm&originHeight=1300&originWidth=2272&originalType=binary&ratio=2&rotation=0&showTitle=false&status=done&style=none&title=)
### Agent架构
DB-GPT Agent是一个多Agent框架目的是提供生产级Agent构建的基础框架能力。我们认为生产级代理应用程序需要基于数据驱动的决策并且可以在可控制的工作流中进行编排。
在我们的设计中提供了一套以Agent为核心融合多模型管理、RAGs、API调用、可视化、AWEL智能体编排、Text2SQL、意图识别等一系列技术的生产级数据应用开发框架。
![image.png](https://intranetproxy.alipay.com/skylark/lark/0/2024/png/26456775/1724765648901-d048c6fc-8b08-4623-bc2d-66db8edb893f.png#clientId=u47bade0c-6d5b-4&from=paste&height=376&id=u580c84f4&originHeight=558&originWidth=1076&originalType=binary&ratio=2&rotation=0&showTitle=false&size=862016&status=done&style=none&taskId=ue3fa55ab-171a-4aeb-a7ec-8bcf8e13474&title=&width=725)
如同所示: 在DB-GPT中Agent是一等公民其他RAGs、Tools、数据源等都是Agent依赖的资源包括模型也是一种资源。
Agent的核心模块主要有Memory、Profile、Planing、Action等模块。
围绕Agent的核心模块往上构建多Agent之间的协作能力协作主要有三种形式。
1. 单一Agent: 单个Agent有具体任务与目标不涉及多模型协作。
2. Auto-Plan: Agent自己制定计划在多Agent协作时负责路径规划、分工协作等。
3. AWEL: 编排,通过程序编排来实现多智能体的协作。
### 多模型架构
在AIGC应用探索与生产落地中难以避免直接与模型服务对接但是目前大模型的推理部署还没有一个事实标准不断有新的模型发布也不断有新的训练方法被提出我们需要花大量的时间来适配多变的底层模型环境而这在一定程度上制约了AIGC应用的探索和落地。
![](https://intranetproxy.alipay.com/skylark/lark/0/2024/png/26456775/1724765743005-eb151d72-79a2-4a91-9d85-f46b68bfe031.png#clientId=u47bade0c-6d5b-4&from=paste&id=u26061337&originHeight=1087&originWidth=1439&originalType=url&ratio=2&rotation=0&showTitle=false&status=done&style=none&taskId=u181cfde4-f672-414c-a030-07d40dee916&title=)
SMMF由模型推理层、模型部署层两部分组成。模型推理层对应模型推理框架vLLM、TGI和TensorRT等。模型部署层向下对接推理层向上提供模型服务能力。 模型部署框架在推理框架之上,提供了多模型实例、多推理框架、多云、自动扩缩容与可观测性等能力。
### 子模块
- [DB-GPT-Hub](https://github.com/eosphoros-ai/DB-GPT-Hub) 通过微调来持续提升Text2SQL效果
- [DB-GPT-Plugins](https://github.com/eosphoros-ai/DB-GPT-Plugins) DB-GPT 插件仓库, 兼容Auto-GPT
- [GPT-Vis](https://github.com/eosphoros-ai/DB-GPT-Web) 可视化协议
- [dbgpts](https://github.com/eosphoros-ai/dbgpts) dbgpts 是官方提供的数据应用仓库, 包含数据智能应用, 智能体编排流程模版, 通用算子等构建在DB-GPT之上的资源。
## 安装
[**教程**](https://www.yuque.com/eosphoros/dbgpt-docs/bex30nsv60ru0fmx)
- [**快速开始**](https://www.yuque.com/eosphoros/dbgpt-docs/ew0kf1plm0bru2ga)
- [源码安装](https://www.yuque.com/eosphoros/dbgpt-docs/urh3fcx8tu0s9xmb)
- [Docker安装](https://www.yuque.com/eosphoros/dbgpt-docs/glf87qg4xxcyrp89)
- [Docker Compose安装](https://www.yuque.com/eosphoros/dbgpt-docs/wwdu11e0v5nkfzin)
- [**使用手册**](https://www.yuque.com/eosphoros/dbgpt-docs/tkspdd0tcy2vlnu4)
- [知识库](https://www.yuque.com/eosphoros/dbgpt-docs/ycyz3d9b62fccqxh)
- [数据对话](https://www.yuque.com/eosphoros/dbgpt-docs/gd9hbhi1dextqgbz)
- [Excel对话](https://www.yuque.com/eosphoros/dbgpt-docs/prugoype0xd2g4bb)
- [数据库对话](https://www.yuque.com/eosphoros/dbgpt-docs/wswpv3zcm2c9snmg)
- [报表分析](https://www.yuque.com/eosphoros/dbgpt-docs/vsv49p33eg4p5xc1)
- [Agents](https://www.yuque.com/eosphoros/dbgpt-docs/pom41m7oqtdd57hm)
- [**进阶教程**](https://www.yuque.com/eosphoros/dbgpt-docs/dxalqb8wsv2xkm5f)
- [智能体工作流使用](https://www.yuque.com/eosphoros/dbgpt-docs/hcomfb3yrleg7gmq)
- [智能应用使用](https://www.yuque.com/eosphoros/dbgpt-docs/aiagvxeb86iarq6r)
- [多模型管理](https://www.yuque.com/eosphoros/dbgpt-docs/huzgcf2abzvqy8uv)
- [命令行使用](https://www.yuque.com/eosphoros/dbgpt-docs/gd4kgumgd004aly8)
- [**模型服务部署**](https://www.yuque.com/eosphoros/dbgpt-docs/vubxiv9cqed5mc6o)
- [单机部署](https://www.yuque.com/eosphoros/dbgpt-docs/kwg1ed88lu5fgawb)
- [集群部署](https://www.yuque.com/eosphoros/dbgpt-docs/gmbp9619ytyn2v1s)
- [vLLM](https://www.yuque.com/eosphoros/dbgpt-docs/bhy9igdvanx1uluf)
- [**如何Debug**](https://www.yuque.com/eosphoros/dbgpt-docs/eyg0ocbc2ce3q95r)
- [**AWEL**](https://www.yuque.com/eosphoros/dbgpt-docs/zozbzslbfk0m0op5)
- [**FAQ**](https://www.yuque.com/eosphoros/dbgpt-docs/gomtc46qonmyt44l)
## 特性一览
- **私域问答&数据处理&RAG**支持内置、多文件格式上传、插件自抓取等方式自定义构建知识库,对海量结构化,非结构化数据做统一向量存储与检索
- **多数据源&GBI**支持自然语言与Excel、数据库、数仓等多种数据源交互并支持分析报告。
- **自动化微调**围绕大语言模型、Text2SQL数据集、LoRA/QLoRA/Pturning等微调方法构建的自动化微调轻量框架, 让TextSQL微调像流水线一样方便。详见: [DB-GPT-Hub](https://github.com/eosphoros-ai/DB-GPT-Hub)
- **数据驱动的Agents插件**支持自定义插件执行任务原生支持Auto-GPT插件模型Agents协议采用Agent Protocol标准
- **多模型支持与管理**海量模型支持包括开源、API代理等几十种大语言模型。如LLaMA/LLaMA2、Baichuan、ChatGLM、文心、通义、智谱等。当前已支持如下模型:
- 新增支持模型
- 🔥🔥🔥 [Meta-Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct)
- 🔥🔥🔥 [Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)
- 🔥🔥🔥 [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
- 🔥🔥🔥 [gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)
- 🔥🔥🔥 [gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)
- 🔥🔥🔥 [DeepSeek-Coder-V2-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct)
- 🔥🔥🔥 [DeepSeek-Coder-V2-Lite-Instruct](https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct)
- 🔥🔥🔥 [Qwen2-57B-A14B-Instruct](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct)
- 🔥🔥🔥 [Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct)
- 🔥🔥🔥 [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct)
- 🔥🔥🔥 [Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct)
- 🔥🔥🔥 [Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)
- 🔥🔥🔥 [glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat)
- 🔥🔥🔥 [Phi-3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
- 🔥🔥🔥 [Yi-1.5-34B-Chat](https://huggingface.co/01-ai/Yi-1.5-34B-Chat)
- 🔥🔥🔥 [Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)
- 🔥🔥🔥 [Yi-1.5-6B-Chat](https://huggingface.co/01-ai/Yi-1.5-6B-Chat)
- 🔥🔥🔥 [Qwen1.5-110B-Chat](https://huggingface.co/Qwen/Qwen1.5-110B-Chat)
- 🔥🔥🔥 [Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)
- 🔥🔥🔥 [Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)
- 🔥🔥🔥 [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- 🔥🔥🔥 [CodeQwen1.5-7B-Chat](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat)
- 🔥🔥🔥 [Qwen1.5-32B-Chat](https://huggingface.co/Qwen/Qwen1.5-32B-Chat)
- 🔥🔥🔥 [Starling-LM-7B-beta](https://huggingface.co/Nexusflow/Starling-LM-7B-beta)
- 🔥🔥🔥 [gemma-7b-it](https://huggingface.co/google/gemma-7b-it)
- 🔥🔥🔥 [gemma-2b-it](https://huggingface.co/google/gemma-2b-it)
- 🔥🔥🔥 [SOLAR-10.7B](https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0)
- 🔥🔥🔥 [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- 🔥🔥🔥 [Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat)
- 🔥🔥🔥 [Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat)
- [更多开源模型](https://www.yuque.com/eosphoros/dbgpt-docs/iqaaqwriwhp6zslc#qQktR)
- 支持在线代理模型
- [x] [DeepSeek.deepseek-chat](https://platform.deepseek.com/api-docs/)
- [x] [Ollama.API](https://github.com/ollama/ollama/blob/main/docs/api.md)
- [x] [月之暗面.Moonshot](https://platform.moonshot.cn/docs/)
- [x] [零一万物.Yi](https://platform.lingyiwanwu.com/docs)
- [x] [OpenAI·ChatGPT](https://api.openai.com/)
- [x] [百川·Baichuan](https://platform.baichuan-ai.com/)
- [x] [阿里·通义](https://www.aliyun.com/product/dashscope)
- [x] [百度·文心](https://cloud.baidu.com/product/wenxinworkshop?track=dingbutonglan)
- [x] [智谱·ChatGLM](http://open.bigmodel.cn/)
- [x] [讯飞·星火](https://xinghuo.xfyun.cn/)
- [x] [Google·Bard](https://bard.google.com/)
- [x] [Google·Gemini](https://makersuite.google.com/app/apikey)
- **隐私安全**通过私有化大模型、代理脱敏等多种技术保障数据的隐私安全。
- [支持数据源](https://www.yuque.com/eosphoros/dbgpt-docs/rc4r27ybmdwg9472)
## Image
🌐 [AutoDL镜像](https://www.codewithgpu.com/i/eosphoros-ai/DB-GPT/dbgpt)
🌐 [小程序云部署](https://www.yuque.com/eosphoros/dbgpt-docs/ek12ly8k661tbyn8)
### 多语言切换
在.env 配置文件当中修改LANGUAGE参数来切换使用不同的语言默认是英文(中文zh, 英文en, 其他语言待补充)
## 使用说明
### 多模型使用
### 数据Agents使用
- [数据Agents](https://www.yuque.com/eosphoros/dbgpt-docs/gwz4rayfuwz78fbq)
## 贡献
更加详细的贡献指南请参考[如何贡献](https://github.com/eosphoros-ai/DB-GPT/blob/main/CONTRIBUTING.md)。
这是一个用于数据库的复杂且创新的工具, 我们的项目也在紧急的开发当中, 会陆续发布一些新的feature。如在使用当中有任何具体问题, 优先在项目下提issue, 如有需要, 请联系如下微信,我会尽力提供帮助,同时也非常欢迎大家参与到项目建设中。
## Licence
The MIT License (MIT)
## 引用
如果您发现`DB-GPT`对您的研究或开发有用,请引用以下[论文](https://arxiv.org/abs/2312.17449)
```
@article{xue2023dbgpt,
title={DB-GPT: Empowering Database Interactions with Private Large Language Models},
author={Siqiao Xue and Caigao Jiang and Wenhui Shi and Fangyin Cheng and Keting Chen and Hongjun Yang and Zhiping Zhang and Jianshan He and Hongyang Zhang and Ganglin Wei and Wang Zhao and Fan Zhou and Danrui Qi and Hong Yi and Shaodong Liu and Faqiang Chen},
year={2023},
journal={arXiv preprint arXiv:2312.17449},
url={https://arxiv.org/abs/2312.17449}
}
```

View File

@@ -0,0 +1,97 @@
# TuGraph DB项目生态图谱
Entities:
(TuGraph-family/tugraph-db#github_repo)
(vesoft-inc/nebula#github_repo)
(PaddlePaddle/Paddle#github_repo)
(apache/brpc#github_repo)
(TuGraph-family/tugraph-web#github_repo)
(TuGraph-family/tugraph-db-client-java#github_repo)
(alibaba/GraphScope#github_repo)
(ClickHouse/ClickHouse#github_repo)
(TuGraph-family/fma-common#github_repo)
(vesoft-inc/nebula-docs-cn#github_repo)
(eosphoros-ai/DB-GPT#github_repo)
(eosphoros-ai#github_organization)
(yandex#github_organization)
(alibaba#github_organization)
(TuGraph-family#github_organization)
(baidu#github_organization)
(apache#github_organization)
(vesoft-inc#github_organization)
Relationships:
(TuGraph-family/tugraph-db#common_developer#vesoft-inc/nebula#common_developer count 10)
(TuGraph-family/tugraph-db#common_developer#PaddlePaddle/Paddle#common_developer count 9)
(TuGraph-family/tugraph-db#common_developer#apache/brpc#common_developer count 7)
(TuGraph-family/tugraph-db#common_developer#TuGraph-family/tugraph-web#common_developer count 7)
(TuGraph-family/tugraph-db#common_developer#TuGraph-family/tugraph-db-client-java#common_developer count 7)
(TuGraph-family/tugraph-db#common_developer#alibaba/GraphScope#common_developer count 6)
(TuGraph-family/tugraph-db#common_developer#ClickHouse/ClickHouse#common_developer count 6)
(TuGraph-family/tugraph-db#common_developer#TuGraph-family/fma-common#common_developer count 6)
(TuGraph-family/tugraph-db#common_developer#vesoft-inc/nebula-docs-cn#common_developer count 6)
(TuGraph-family/tugraph-db#common_developer#eosphoros-ai/DB-GPT#common_developer count 6)
(eosphoros-ai/DB-GPT#belong_to#eosphoros-ai#belong_to)
(ClickHouse/ClickHouse#belong_to#yandex#belong_to)
(alibaba/GraphScope#belong_to#alibaba#belong_to)
(TuGraph-family/tugraph-db#belong_to#TuGraph-family#belong_to)
(TuGraph-family/tugraph-web#belong_to#TuGraph-family#belong_to)
(TuGraph-family/fma-common#belong_to#TuGraph-family#belong_to)
(TuGraph-family/tugraph-db-client-java#belong_to#TuGraph-family#belong_to)
(PaddlePaddle/Paddle#belong_to#baidu#belong_to)
(apache/brpc#belong_to#apache#belong_to)
(vesoft-inc/nebula#belong_to#vesoft-inc#belong_to)
(vesoft-inc/nebula-docs-cn#belong_to#vesoft-inc#belong_to)
# DB-GPT项目生态图谱
Entities:
(eosphoros-ai/DB-GPT#github_repo)
(chatchat-space/Langchain-Chatchat#github_repo)
(hiyouga/LLaMA-Factory#github_repo)
(lm-sys/FastChat#github_repo)
(langchain-ai/langchain#github_repo)
(eosphoros-ai/DB-GPT-Hub#github_repo)
(THUDM/ChatGLM-6B#github_repo)
(langgenius/dify#github_repo)
(vllm-project/vllm#github_repo)
(QwenLM/Qwen#github_repo)
(PaddlePaddle/PaddleOCR#github_repo)
(vllm-project#github_organization)
(eosphoros-ai#github_organization)
(PaddlePaddle#github_organization)
(QwenLM#github_organization)
(THUDM#github_organization)
(lm-sys#github_organization)
(chatchat-space#github_organization)
(langchain-ai#github_organization)
(langgenius#github_organization)
Relationships:
(eosphoros-ai/DB-GPT#common_developer#chatchat-space/Langchain-Chatchat#common_developer count 82)
(eosphoros-ai/DB-GPT#common_developer#hiyouga/LLaMA-Factory#common_developer count 45)
(eosphoros-ai/DB-GPT#common_developer#lm-sys/FastChat#common_developer count 39)
(eosphoros-ai/DB-GPT#common_developer#langchain-ai/langchain#common_developer count 37)
(eosphoros-ai/DB-GPT#common_developer#eosphoros-ai/DB-GPT-Hub#common_developer count 37)
(eosphoros-ai/DB-GPT#common_developer#THUDM/ChatGLM-6B#common_developer count 31)
(eosphoros-ai/DB-GPT#common_developer#langgenius/dify#common_developer count 30)
(eosphoros-ai/DB-GPT#common_developer#vllm-project/vllm#common_developer count 27)
(eosphoros-ai/DB-GPT#common_developer#QwenLM/Qwen#common_developer count 26)
(eosphoros-ai/DB-GPT#common_developer#PaddlePaddle/PaddleOCR#common_developer count 24)
(vllm-project/vllm#belong_to#vllm-project#belong_to)
(eosphoros-ai/DB-GPT#belong_to#eosphoros-ai#belong_to)
(eosphoros-ai/DB-GPT-Hub#belong_to#eosphoros-ai#belong_to)
(PaddlePaddle/PaddleOCR#belong_to#PaddlePaddle#belong_to)
(QwenLM/Qwen#belong_to#QwenLM#belong_to)
(THUDM/ChatGLM-6B#belong_to#THUDM#belong_to)
(lm-sys/FastChat#belong_to#lm-sys#belong_to)
(chatchat-space/Langchain-Chatchat#belong_to#chatchat-space#belong_to)
(langchain-ai/langchain#belong_to#langchain-ai#belong_to)
(langgenius/dify#belong_to#langgenius#belong_to)
# About TuGraph
TuGraph is a graph database jointly developed by Ant Group and Tsinghua University. It provides a complete graph technology stack covering graph storage, graph computing, graph learning, and a graph development platform, supports real-time processing of massive multi-source interconnected data, and significantly improves data analysis efficiency, powering more than 300 application scenarios at Ant Group such as payments, security, social networking, public welfare, and data governance. With graph clusters of industry-leading scale, it addresses the major challenges of large data volume, high throughput, and low latency in graph data analysis; it is a key piece of infrastructure for Ant Group's financial risk-control capabilities, markedly improving the real-time identification of financial risks such as fraud and money laundering as well as the efficiency of case investigation and analysis, and it serves enterprise customers in finance, industry, government services, and other sectors. Within the TuGraph product family, the open-source products include TuGraph DB, TuGraph Analytics, OSGraph, and ChatTuGraph; the internal products include GeaBase, GeaFlow, GeaLearn, and GeaMaker.
# About DB-GPT
DB-GPT is an open-source AI Native Data App Development framework with AWEL (Agentic Workflow Expression Language) and Agents. Its goal is to build infrastructure for the large-model era: through capabilities such as multi-model management (SMMF), Text2SQL effect optimization, a RAG framework and its optimizations, multi-agent collaboration, and AWEL (agentic workflow orchestration), it makes building large-model applications around databases simpler and more convenient.
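The Entities and Relationships sections above follow a simple parenthesized, `#`-separated line format: `(name#type)` for entities and `(src#relation#dst#relation count N)` for relationships, where the trailing `count` is optional. A purely illustrative parsing sketch (these helpers are hypothetical and not part of this commit):

```python
# Hypothetical helpers (illustration only) for the line formats shown above:
#   entity:       (name#type)
#   relationship: (src#relation#dst#relation count N)  -- "count N" is optional
def parse_entity(line: str) -> tuple:
    name, entity_type = line.strip().strip("()").rsplit("#", 1)
    return name, entity_type

def parse_relationship(line: str) -> tuple:
    src, relation, dst, tail = line.strip().strip("()").split("#", 3)
    count = int(tail.rsplit("count", 1)[-1]) if " count " in tail else 1
    return src, relation, dst, count

assert parse_entity("(eosphoros-ai/DB-GPT#github_repo)") == ("eosphoros-ai/DB-GPT", "github_repo")
assert parse_relationship(
    "(TuGraph-family/tugraph-db#common_developer#vesoft-inc/nebula#common_developer count 10)"
) == ("TuGraph-family/tugraph-db", "common_developer", "vesoft-inc/nebula", 10)
```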

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,286 @@
# TuGraph
TuGraph is a graph database jointly developed by Ant Group and Tsinghua University. It provides a complete graph technology stack covering graph storage, graph computing, graph learning, and a graph development platform, supports real-time processing of massive multi-source interconnected data, and significantly improves data analysis efficiency, powering more than 300 application scenarios at Ant Group such as payments, security, social networking, public welfare, and data governance. With graph clusters of industry-leading scale, it addresses the major challenges of large data volume, high throughput, and low latency in graph data analysis; it is a key piece of infrastructure for Ant Group's financial risk-control capabilities, markedly improving the real-time identification of financial risks such as fraud and money laundering as well as the efficiency of case investigation and analysis, and it serves enterprise customers in finance, industry, government services, and other sectors. Within the TuGraph product family, the open-source products include TuGraph DB, TuGraph Analytics, OSGraph, and ChatTuGraph; the internal products include GeaBase, GeaFlow, GeaLearn, and GeaMaker.
The TuGraph enterprise-grade graph data management platform provides complex, in-depth analysis of interconnected data. With a distributed cluster architecture, TuGraph supports high-throughput, highly available, highly concurrent reads and writes and ACID transactions over massive data. Through sharding and partitioning it scales horizontally, and it offers query, filtering, and indexing over vertices, edges, properties, and topology. TuGraph provides offline, nearline, and online graph algorithms and graph learning capabilities, with dozens of built-in algorithms that can process the patterns and features of full graphs, subgraphs, and dynamic graphs, and it exchanges data with external sources through visualization or data services. In addition, TuGraph offers a visual interface for display and operation covering the full lifecycle of graph development and serving, supports mainstream graph query languages, provides convenient access and development interfaces, and supports import/export, full/incremental/batch updates, and backup against external multi-model data sources. TuGraph also ships a polished, practical management and monitoring experience for graph production environments, meeting the technical and business needs of enterprise users.
TuGraph's applications in financial risk control include personal credit business, anti-fraud, and money-laundering path tracing. Using multi-dimensional, cross-correlated information to deeply profile application and transaction behavior, it identifies complex, large-scale, and well-hidden fraud and money-laundering networks; combined with algorithms such as clustering analysis and risk propagation, it computes user risk scores in real time and flags risky behavior before it occurs, helping financial institutions improve efficiency and reduce risk. Based on the TuGraph enterprise-grade graph data management platform, Ant Group increased the amount covered by anti-fraud audits by 6% and improved the efficiency of anti-money-laundering risk investigation and analysis by 90%. Computing roughly 20 billion edge relationships for nearly 1 billion users every day, the ability to identify suspected gang-related crime risk improved nearly 10x. In addition, a credit graph platform built for one bank improved the discrimination of its risk-control models by 13%; a credit-card application gang-fraud analysis solution for another bank cut computation time to 1/60 of the original; and an enterprise risk graph platform built for a third bank achieved over 90% accuracy in guarantee-circle identification for SME credit rating and lending.
## 1. TuGraph DB
### 1.1 Overview
TuGraph DB is an efficient graph database that supports large data volumes, low-latency lookups, and fast graph analytics. The TuGraph community edition, open-sourced in September 2022, provides complete basic graph database functionality and mature product design, such as ACID-compliant transactions, programming APIs, and accompanying tools, and is suitable for single-instance deployment. The community edition supports data at the TB scale, offering an efficient, easy-to-use, and reliable platform for managing and analyzing complex interconnected data, and is an ideal choice for learning TuGraph and implementing small projects.
### 1.2 TuGraph Features
TuGraph is an efficient graph database that supports large data volumes, low-latency lookups, and fast graph analytics. TuGraph is also a disk-based database that can store up to tens of terabytes of data. TuGraph provides multiple APIs, enabling users to build applications easily and to extend and optimize them.
Its functional features include:
* Property graph model
* Real-time create, read, update, and delete
* Multigraph (multiple edges allowed between two vertices)
* Multiple graphs (a large graph plus multiple subgraphs)
* Full ACID transaction support with serializable isolation
* Vertex and edge indexes
* Hybrid transactional and analytical processing (HTAP), supporting graph query, graph analytics, and graph learning
* Mainstream graph query languages (OpenCypher, ISO GQL, etc.)
* OLAP API support with 30+ built-in graph analytics algorithms
* C++/Python-based stored procedures, including an in-transaction parallel Traversal API
* Graph visualization tools
* Performance and scalability support:
* High throughput of tens of millions of vertices per second
* TB-scale capacity
* High availability
* High-performance batch import
* Online/offline backup and recovery
Main features:
- Labeled property graph model
- Full ACID transaction support
- 34 built-in graph analytics algorithms
- Full-text / primary-key / secondary indexes
- OpenCypher graph query language
- C++/Python-based stored procedures
Performance and scalability:
- LDBC SNB world record holder (2022/9/1, https://ldbcouncil.org/benchmarks/snb/)
- Stores up to tens of terabytes of data
- Visits millions of vertices per second
- Fast batch import
TuGraph DB documentation is available at [this link](https://tugraph-db.readthedocs.io/zh_CN/latest); you are also welcome to visit our [official website](https://www.tugraph.org).
### 1.3 Quick Start
An easy way to get started is to use Docker. Images are available on [DockerHub](https://hub.docker.com/u/tugraph) under the name `tugraph/tugraph-runtime-[os]:[tugraph version]`,
for example, `tugraph/tugraph-runtime-centos7:3.3.0`.
For more details, see the [quick start guide](./docs/zh-CN/source/3.quick-start/1.preparation.md) and the [development guide](./docs/zh-CN/source/development_guide.md).
### 1.4 Building from Source
Building TuGraph DB on a Linux system is recommended, and a Docker environment is a good choice. To set up a new environment, refer to the [Dockerfile](ci/images).
The steps to compile TuGraph DB are:
1. If you need the web interface, run `deps/build_deps.sh`; otherwise skip this step.
2. Depending on the container OS, run `cmake .. -DOURSYSTEM=centos` or `cmake .. -DOURSYSTEM=ubuntu`.
3. `make`
4. `make package` or `cpack --config CPackConfig.cmake`
Example (in the `tugraph/tugraph-compile-centos7` Docker environment):
```bash
$ git clone --recursive https://github.com/TuGraph-family/tugraph-db.git
$ cd tugraph-db
$ deps/build_deps.sh
$ mkdir build && cd build
$ cmake .. -DOURSYSTEM=centos7
$ make
$ make package
```
### 1.5 Development
We provide compile-environment Docker images on DockerHub to help developers get started easily, named `tugraph/tugraph-compile-[os]:[compile version]`, for example, `tugraph/tugraph-compile-centos7:1.1.0`.
## 2. TuGraph Analytics
### 2.1 Introduction
**TuGraph Analytics** (alias: GeaFlow) is an OLAP graph database open-sourced by Ant Group with [**world-class performance**](https://ldbcouncil.org/benchmarks/snb-bi/). It supports core capabilities such as trillion-scale graph storage, hybrid graph/table processing, real-time graph computing, and interactive graph analysis, and is widely used in scenarios such as data warehouse acceleration, financial risk control, knowledge graphs, and social networks.
For more about GeaFlow, see the [GeaFlow introduction](docs/docs-cn/introduction.md).
For the GeaFlow design paper, see [GeaFlow: A Graph Extended and Accelerated Dataflow System](https://dl.acm.org/doi/abs/10.1145/3589771).
### 2.2 Origins
Early big data analytics was mostly offline processing, and the Hadoop-centered technology stack solved large-scale data analysis well. However, its timeliness was limited, which made it hard to serve scenarios with strong real-time requirements. Stream computing engines represented by Storm then solved real-time data processing and improved timeliness, but Storm itself provides no state management and is therefore unable to handle stateful computations such as aggregation. Flink filled this gap: by introducing state management and a checkpoint mechanism, it delivers efficient stateful stream computing.
As real-time processing scenarios grew richer, especially in real-time data warehouses, real-time relational operations (i.e., stream joins) increasingly became the hard part of making data real-time. Although Flink has excellent state management and outstanding performance, its bottleneck becomes more and more apparent when handling joins, especially joins of three or more degrees: since the input state on both sides of a join must be kept, the state size grows sharply as joins multiply and performance becomes unacceptable. The root cause is that stream computing systems such as Flink use the table as their data model, and a table is a two-dimensional structure that contains neither the definition nor the storage of relationships, so relational operations can only be expressed as joins, at high cost.
In Ant Group's big data scenarios, especially financial risk control and real-time data warehouses, there are large numbers of join operations, and improving their timeliness and performance became an important challenge for us, so we introduced the graph model. The graph model describes entity relationships with a vertex-edge structure: vertices represent entities, edges represent relationships, and vertices and edges are stored together at the storage level. The graph model therefore naturally defines the relationships in the data and materializes vertex-edge relationships in storage. On top of the graph model we built GeaFlow, a new-generation real-time computing engine that makes complex relational operations real-time. GeaFlow is now widely used in scenarios such as data warehouse acceleration, financial risk control, knowledge graphs, and social networks.
### 2.3 Features
* Distributed real-time graph computing
* Hybrid graph/table processing (SQL+GQL)
* Unified stream, batch, and graph computing
* Trillion-scale native graph storage
* Interactive graph analysis
* High availability and exactly-once semantics
* High-level API operator development
* UDF / graph algorithm / connector plugin support
* One-stop graph development platform
* Cloud-native deployment
### 2.4 Quick Start
1. Prepare Git, JDK8, Maven, and Docker environments.
2. Download the source: `git clone https://github.com/TuGraph-family/tugraph-analytics`
3. Build the project: `mvn clean install -DskipTests`
4. Run a test job: `./bin/gql_submit.sh --gql geaflow/geaflow-examples/gql/loop_detection.sql`
5. Build the image: `./build.sh --all`
6. Start the container: `docker run -d --name geaflow-console -p 8888:8888 geaflow-console:0.1`
For more details, see the [quick start guide](docs/docs-cn/quick_start.md).
### 2.5 Development Manual
GeaFlow offers two programming interfaces, DSL and API: you can develop streaming graph computing jobs with GeaFlow's SQL-like extension language (SQL+ISO/GQL), or develop applications in Java through GeaFlow's high-level API.
* DSL application development: [DSL development docs](docs/docs-cn/application-development/dsl/overview.md)
* API application development: [API development docs](docs/docs-cn/application-development/api/guid.md)
### 2.6 Architecture
The overall GeaFlow architecture is shown below:
![GeaFlow architecture](../static/img/geaflow_arch_new.png)
* [DSL layer](./principle/dsl_principle.md): the language layer. GeaFlow designed a fused SQL+GQL analysis language that supports unified processing of table and graph models.
* [Framework layer](./principle/framework_principle.md): the framework layer. GeaFlow designed two sets of APIs, for Graph and Stream, supporting fused stream, batch, and graph computing, and implements a unified Cycle-based distributed scheduling model.
* [State layer](./principle/state_principle.md): the storage layer. GeaFlow designed two sets of APIs, for Graph and KV, supporting mixed storage of table data and graph data; the overall design is shared-nothing and data can be persisted to remote storage.
* [Console platform](./principle/console_principle.md): GeaFlow provides a one-stop graph development platform with modeling, processing, and analysis of graph data, plus operations and management support for graph jobs.
* **Execution environments**: GeaFlow can run in a variety of heterogeneous execution environments, such as K8S, Ray, and local mode.
### 2.7 Application Scenarios
#### 2.7.1 Real-Time Data Warehouse Acceleration
Data warehouse workloads involve a large number of join operations; at the DWD layer, multiple tables often have to be flattened into one wide table to speed up subsequent queries. As the number of joined tables grows, traditional real-time computing engines struggle to guarantee the timeliness and performance of the joins, which has become a thorny problem in the real-time data warehouse field. GeaFlow's real-time graph computing engine addresses this well:
GeaFlow uses a graph as the data model in place of the wide tables at the DWD layer, so the graph can be constructed from data in real time, and at query time the materialized vertex-edge structure of the graph greatly accelerates relational queries.
#### 2.7.2 Real-Time Attribution Analysis
In an increasingly digital world, channel attribution and path analysis of user behavior are at the core of traffic analysis. By computing users' effective behavior paths in real time and building complete conversion paths, a business can quickly see the value of its product, and operators can adjust their approach in time. The key requirements of real-time attribution analysis are accuracy and timeliness: accuracy demands that behavior-path analysis stay accurate at a controllable cost, while timeliness demands computation fast enough to support quick business decisions.
The GeaFlow streaming graph computing engine satisfies both the accuracy and the timeliness requirements of attribution analysis, as shown below:
![Attribution analysis](../static/img/guiyin_analysis.png)
GeaFlow first converts user behavior logs into a user behavior topology graph through real-time graph construction, with each user as a vertex and each of that user's behaviors as an edge from the user to the instrumented page. It then uses streaming graph computing to analyze the user behavior subgraph ahead of time, matches attribution-path rules on that subgraph to derive the attribution path of the user behind a given transaction, and outputs the result to downstream systems.
#### 2.7.3 Real-Time Anti-Cash-Out
In credit risk control, detecting credit card cash-out is a typical requirement. Analysis of known cash-out patterns shows that cash-out forms a loop subgraph, so quickly and efficiently detecting such loops in a large graph greatly improves risk identification efficiency. As shown below, input sources such as real-time transaction streams and transfer streams are converted into a real-time transaction graph; user transaction behavior is then analyzed for graph features, such as loop detection, according to risk-control policies, and the features are computed in real time and fed to decision and monitoring platforms to judge cash-out behavior. With GeaFlow's real-time graph construction and real-time graph computing, abnormal transactions such as cash-out can be discovered quickly, greatly reducing platform risk.
![Real-time anti-cash-out](../static/img/fantaoxian.png)
## 3. OSGraph
**OSGraph (Open Source Graph)** is an open-source graph insight tool. Built on a full-domain graph of GitHub open-source data, it analyzes developer behavior and project community ecosystems. It provides concise and intuitive open-source data views for developers, project owners, open-source evangelists, community operators, and others, helping you and your projects create a personalized open-source profile card, find well-matched collaborators, and mine deep community value.
### 3.1 Product URL
**[https://osgraph.com](https://osgraph.com)**
### 3.2 Quick Start
To run and test it locally, see the [OSGraph deployment guide](docs/zh-CN/DeveloperManual.md).
### 3.3 Features
The product currently offers six open-source data graphs by default: three project-oriented graphs (contribution, ecosystem, community) and three developer-oriented graphs (activity, partners, interests).
#### 3.3.1 Project Contribution Graph
**Discover core project contributions**: find a project's core contributors based on developer activity (Issues, PRs, Commits, CRs, etc.).
**Q**: I want to see who writes code for the Apache Spark project.
**A**: Choose "Project Contribution Graph", search for spark, and select apache/spark. You can see core contributors such as HyukjinKwon and dongjoon-hyun, and we also happen to catch two "attention grabbers": AmplabJenkins and SparkQA, two bot accounts that only take part in code review.
![](docs/img/spark-contrib.png)
#### 3.3.2 Project Ecosystem Graph
**Gain insight into ecosystem partners**: extract cross-project information such as development activity and organizations to build a project's core ecosystem relationships.
**Q**: What does the ecosystem around Llama3, the recently popular open-source LLM, roughly look like?
**A**: Choose "Project Ecosystem Graph", search for llama3, and select meta-llama3/llama3. You can see well-known AI projects such as pytorch, tensorflow, and transformers, and of course the headline-making llama.cpp. A pleasant surprise is that ray shares quite a few developers with llama3, which is worth digging into.
![](docs/img/llama3-eco.png)
#### 3.3.3 Project Community Graph
**Analyze project community distribution**: extract the distribution of a project's core developer community from its development activity, developer organizations, and other information.
**Q**: After so many years of development, what is the community of the big data engine Flink like today?
**A**: Choose "Project Community Graph", search for flink, and select apache/flink. The project's followers mainly come from China, the United States, and Germany, and the Alibaba organization is the backbone of code contribution.
![](docs/img/flink-comm.png)
#### 3.3.4 Development Activity Graph
**Show personal open-source contributions**: find the core projects a developer participates in, based on their activity (Issues, PRs, Commits, CRs, etc.).
**Q**: Which open-source projects has Linus Torvalds been working on recently?
**A**: Choose "Development Activity Graph" and search for torvalds. As expected, the linux project is torvalds' main work, though he also contributes to llvm, mody, and libgit2; we can also see his substantial contributions to subsurface, a "dive log management tool". Great developers clearly have broad hobbies.
![](docs/img/torvalds-act.png)
#### 3.3.5 Open-Source Partner Graph
**Find open-source partners**: find the other developers in the open-source community who collaborate most closely with a given developer.
**Q**: I want to know whether there is anyone in the open-source community who shares my interests.
**A**: Choose "Open-Source Partner Graph" and search for my ID. What surprised me is how many strangers follow the same set of projects as I do; I should find a chance to get to know them and might make new friends. The people who collaborate with me on PRs are basically friends and colleagues I already know. Keep exploring my friends' open-source partners, and the open-source community's "six degrees of separation" emerges.
![](docs/img/fanzhidongyzby-part.png)
#### 3.3.6 Open-Source Interest Graph
**Mine personal open-source interests**: analyze a developer's technical domains and interests from the topics and tags of the projects they participate in.
**Q**: What technologies is the most active developer on GitHub interested in?
**A**: Choose "Open-Source Interest Graph" and search for sindresorhus (No. 1 on the [GitHub user ranking](https://gitstar-ranking.com)). Overall, sindresorhus is very interested in node, npm, and js; the awesome project he started has a staggering 300k stars. The current open-source interest data mainly comes from projects' limited tag information; future use of AI techniques may present it better.
![](docs/img/sindresorhus-intr.png)
### 3.4 Roadmap
More interesting graphs and features will be added to OSGraph in the future:
* A simple and flexible API design that lets the graphs extend without limit.
* Free and efficient canvas interaction for unlimited exploration of data value.
* Graph URLs embeddable in Markdown to build your own open-source profile card.
* AI-based analysis of project topics and tags.
* Joint analysis across multiple people and projects, with graph insights one click away.
* Richer data display and multi-dimensional analysis.
* **More features, built together with you...**
## 4. ChatTuGraph
ChatTuGraph empowers TuGraph with AI, bringing richer application scenarios to areas such as graph business development efficiency, graph product solutions, intelligent graph data analysis, and automated graph task management.
At present, ChatTuGraph generates graph-language corpora and applies LLM fine-tuning to enable natural-language analysis of graph data, builds Graph RAG (retrieval-augmented generation over a knowledge graph) to reduce LLM hallucination, and uses multi-agent technology (Multiple Agents System) to provide AIGC and other intelligent capabilities on graph data.
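For orientation, the sketch below strings together the community-oriented storage flow behind this Graph RAG support, using only the `TuGraphStore` calls exercised by the tests added elsewhere in this commit. The sample vertices, the community id `"0"`, and the Leiden plugin arguments are assumptions borrowed from those tests and may differ in a real deployment.

```python
# Minimal sketch of the community-summary storage flow; not a full GraphRAG pipeline.
from dbgpt.storage.graph_store.graph import Edge, MemoryGraph, Vertex
from dbgpt.storage.graph_store.tugraph_store import TuGraphStore, TuGraphStoreConfig

# Enable community-summary support on the TuGraph-backed store.
config = TuGraphStoreConfig(name="DemoSummaryGraph", summary_enabled=True)
store = TuGraphStore(config=config)

# Build a tiny in-memory graph and persist it as a whole.
graph = MemoryGraph()
graph.upsert_vertex(Vertex("TuGraph", "TuGraph", description="Graph database"))
graph.upsert_vertex(Vertex("DB-GPT", "DB-GPT", description="AI data app framework"))
graph.append_edge(Edge("DB-GPT", "TuGraph", name="uses", description="DB-GPT uses TuGraph"))
store.insert_graph(graph)

# Run the Leiden plugin to tag vertices with a community id, then read one community back.
store.query(
    "CALL db.plugin.callPlugin('CPP','leiden','{\"leiden_val\":\"_community_id\"}',60.00,false)"
)
community = store.query('MATCH (n)-[r]->(m) WHERE n._community_id = "0" RETURN n,r,m')
print(community.vertex_count, community.edge_count)

store.conn.close()
```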

View File

@@ -519,6 +519,11 @@ def knowledge_requires():
"sentence-transformers",
]
setup_spec.extras["graph_rag"] = setup_spec.extras["rag"] + [
"neo4j",
"dbgpt-tugraph-plugins>=0.1.0rc1",
]
def llama_cpp_requires():
"""
@@ -617,7 +622,6 @@ def all_datasource_requires():
"pyhive",
"thrift",
"thrift_sasl",
"neo4j",
"vertica_python",
]
@@ -691,6 +695,7 @@ def default_requires():
]
setup_spec.extras["default"] += setup_spec.extras["framework"]
setup_spec.extras["default"] += setup_spec.extras["rag"]
setup_spec.extras["default"] += setup_spec.extras["graph_rag"]
setup_spec.extras["default"] += setup_spec.extras["datasource"]
setup_spec.extras["default"] += setup_spec.extras["torch"]
setup_spec.extras["default"] += setup_spec.extras["cache"]

View File

@@ -40,3 +40,15 @@ def test_get_indexes(connector):
# Get the index information of the vertex table named 'person'.
indexes = connector.get_indexes("person", "vertex")
assert len(indexes) > 0
def test_run_without_stream(connector):
query = "MATCH (n) RETURN n limit 10"
result = connector.run(query)
assert len(result) == 10
def test_run_with_stream(connector):
query = "MATCH (n) RETURN n limit 10"
result = list(connector.run_stream(query))
assert len(result) == 10
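A short usage-style sketch of the two query paths exercised above; `connector` is assumed to be the same TuGraph connector object the existing test fixture provides (its construction sits outside this hunk):

```python
# Sketch only: `connector` is expected to be a TuGraph connector like the one
# provided by the test fixture above; its construction is not shown in this diff.
def run_queries(connector) -> None:
    query = "MATCH (n) RETURN n limit 10"

    # run() materializes the whole result set before returning it.
    rows = connector.run(query)
    print(len(rows))

    # run_stream() yields records lazily, which suits larger result sets.
    for record in connector.run_stream(query):
        print(record)
```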

View File

@@ -23,13 +23,13 @@ def test_graph_store(graph_store):
graph_store.insert_triplet("E", "8", "F")
subgraph = graph_store.explore(["A"])
print(f"\n{subgraph.graphviz()}")
print(f"\n{subgraph.format()}")
assert subgraph.edge_count == 9
graph_store.delete_triplet("A", "0", "A")
graph_store.delete_triplet("B", "4", "D")
subgraph = graph_store.explore(["A"])
print(f"\n{subgraph.graphviz()}")
print(f"\n{subgraph.format()}")
assert subgraph.edge_count == 7
triplets = graph_store.get_triplets("B")
@@ -38,4 +38,4 @@ def test_graph_store(graph_store):
schema = graph_store.get_schema()
print(f"\nSchema: {schema}")
assert len(schema) == 138
assert len(schema) == 86

View File

@@ -2,17 +2,12 @@
import pytest
from dbgpt.storage.graph_store.tugraph_store import TuGraphStore
class TuGraphStoreConfig:
def __init__(self, name):
self.name = name
from dbgpt.storage.graph_store.tugraph_store import TuGraphStore, TuGraphStoreConfig
@pytest.fixture(scope="module")
def store():
config = TuGraphStoreConfig(name="TestGraph")
config = TuGraphStoreConfig(name="TestGraph", summary_enabled=False)
store = TuGraphStore(config=config)
yield store
store.conn.close()
@@ -29,7 +24,7 @@ def test_insert_and_get_triplets(store):
store.insert_triplet("F", "7", "E")
store.insert_triplet("E", "8", "F")
triplets = store.get_triplets("A")
assert len(triplets) == 3
assert len(triplets) == 2
triplets = store.get_triplets("B")
assert len(triplets) == 3
triplets = store.get_triplets("C")
@@ -47,7 +42,7 @@ def test_query(store):
result = store.query(query)
v_c = result.vertex_count
e_c = result.edge_count
assert v_c == 2 and e_c == 3
assert v_c == 3 and e_c == 3
def test_explore(store):
@@ -55,13 +50,13 @@ def test_explore(store):
result = store.explore(subs, depth=2, fan=None, limit=10)
v_c = result.vertex_count
e_c = result.edge_count
assert v_c == 2 and e_c == 3
assert v_c == 5 and e_c == 5
# def test_delete_triplet(store):
# subj = "A"
# rel = "0"
# obj = "B"
# store.delete_triplet(subj, rel, obj)
# triplets = store.get_triplets(subj)
# assert len(triplets) == 0
def test_delete_triplet(store):
subj = "A"
rel = "0"
obj = "B"
store.delete_triplet(subj, rel, obj)
triplets = store.get_triplets(subj)
assert len(triplets) == 0

View File

@@ -0,0 +1,58 @@
import pytest
from dbgpt.storage.graph_store.tugraph_store import TuGraphStore, TuGraphStoreConfig
from dbgpt.storage.graph_store.graph import MemoryGraph, Edge, Vertex
@pytest.fixture(scope="module")
def store():
config = TuGraphStoreConfig(name="TestSummaryGraph", summary_enabled=True)
store_instance = TuGraphStore(config=config)
yield store_instance
store_instance.conn.close()
def test_insert_graph(store):
graph = MemoryGraph()
vertex_list = [
Vertex("A", "A", description="Vertex A", _document_id="Test doc"),
Vertex("B", "B", description="Vertex B", _document_id="Test doc"),
Vertex("C", "C", description="Vertex C", _document_id="Test doc"),
Vertex("D", "D", description="Vertex D", _document_id="Test doc"),
Vertex("E", "E", description="Vertex E", _document_id="Test doc"),
Vertex("F", "F", description="Vertex F", _document_id="Test doc"),
Vertex("G", "G", description="Vertex G", _document_id="Test doc"),
]
edge_list = [
Edge("A", "B", name="A-B", description="description of edge"),
Edge("B", "C", name="B-C", description="description of edge"),
Edge("C", "D", name="C-D", description="description of edge"),
Edge("D", "E", name="D-E", description="description of edge"),
Edge("E", "F", name="E-F", description="description of edge"),
Edge("F", "G", name="F-G", description="description of edge"),
]
for vertex in vertex_list:
graph.upsert_vertex(vertex)
for edge in edge_list:
graph.append_edge(edge)
store.insert_graph(graph)
def test_leiden_query(store):
query = "CALL db.plugin.callPlugin('CPP','leiden','{\"leiden_val\":\"_community_id\"}',60.00,false)"
result = store.query(query)
assert result.vertex_count == 1
def test_query_node_and_edge(store):
query = 'MATCH (n)-[r]->(m) WHERE n._community_id = "0" RETURN n,r,m'
result = store.query(query)
assert result.vertex_count == 7 and result.edge_count == 6
def test_stream_query_path(store):
query = 'MATCH p=(n)-[r:relation*2]->(m) WHERE n._community_id = "0" RETURN p'
result = store.query(query)
for v in result.vertices():
print(v.get_prop("_community_id"))
assert result.vertex_count == 7 and result.edge_count == 6

View File

@@ -6,15 +6,15 @@ from dbgpt.storage.graph_store.graph import MemoryGraph, Edge, Vertex, Direction
@pytest.fixture
def g():
g = MemoryGraph()
g.append_edge(Edge("A", "A", label="0"))
g.append_edge(Edge("A", "A", label="1"))
g.append_edge(Edge("A", "B", label="2"))
g.append_edge(Edge("B", "C", label="3"))
g.append_edge(Edge("B", "D", label="4"))
g.append_edge(Edge("C", "D", label="5"))
g.append_edge(Edge("B", "E", label="6"))
g.append_edge(Edge("F", "E", label="7"))
g.append_edge(Edge("E", "F", label="8"))
g.append_edge(Edge("A", "A", "0"))
g.append_edge(Edge("A", "A", "1"))
g.append_edge(Edge("A", "B", "2"))
g.append_edge(Edge("B", "C", "3"))
g.append_edge(Edge("B", "D", "4"))
g.append_edge(Edge("C", "D", "5"))
g.append_edge(Edge("B", "E", "6"))
g.append_edge(Edge("F", "E", "7"))
g.append_edge(Edge("E", "F", "8"))
g.upsert_vertex(Vertex("G"))
yield g
@@ -25,14 +25,20 @@ def g():
(lambda g: g.del_vertices("G", "G"), 6, 9),
(lambda g: g.del_vertices("C"), 6, 7),
(lambda g: g.del_vertices("A", "G"), 5, 6),
(lambda g: g.del_edges("E", "F", label="8"), 7, 8),
(lambda g: g.del_edges("A", "A"), 7, 7),
(lambda g: g.del_edges("A", "B"), 7, 8),
(lambda g: g.del_edges("A", "A", "0"), 7, 8),
(lambda g: g.del_edges("E", "F", "8"), 7, 8),
(lambda g: g.del_edges("E", "F", "9"), 7, 9),
(lambda g: g.del_edges("E", "F", val=1), 7, 9),
(lambda g: g.del_edges("E", "F", "8", val=1), 7, 9),
(lambda g: g.del_edges("E", "F", "9", val=1), 7, 9),
(lambda g: g.del_neighbor_edges("A", Direction.IN), 7, 7),
],
)
def test_delete(g, action, vc, ec):
action(g)
result = g.graphviz()
result = g.format()
print(f"\n{result}")
assert g.vertex_count == vc
assert g.edge_count == ec
@@ -50,7 +56,7 @@ def test_delete(g, action, vc, ec):
)
def test_search(g, vids, dir, vc, ec):
subgraph = g.search(vids, dir)
print(f"\n{subgraph.graphviz()}")
print(f"\n{subgraph.format()}")
assert subgraph.vertex_count == vc
assert subgraph.edge_count == ec
@@ -65,7 +71,7 @@ def test_search(g, vids, dir, vc, ec):
)
def test_search_result_limit(g, vids, dir, ec):
subgraph = g.search(vids, dir, limit=ec)
print(f"\n{subgraph.graphviz()}")
print(f"\n{subgraph.format()}")
assert subgraph.edge_count == ec
@@ -79,7 +85,7 @@ def test_search_result_limit(g, vids, dir, ec):
)
def test_search_fan_limit(g, vids, dir, fan, ec):
subgraph = g.search(vids, dir, fan=fan)
print(f"\n{subgraph.graphviz()}")
print(f"\n{subgraph.format()}")
assert subgraph.edge_count == ec
@@ -97,5 +103,5 @@ def test_search_fan_limit(g, vids, dir, fan, ec):
)
def test_search_depth_limit(g, vids, dir, dep, ec):
subgraph = g.search(vids, dir, depth=dep)
print(f"\n{subgraph.graphviz()}")
print(f"\n{subgraph.format()}")
assert subgraph.edge_count == ec

View File

@@ -83,9 +83,9 @@ const RecallTestModal: React.FC<RecallTestModalProps> = ({ open, setOpen, space
destroyOnClose={true}
>
<Card
title='召回配置'
size='small'
className='my-4'
title="召回配置"
size="small"
className="my-4"
extra={
<Popover
placement='bottomRight'
@@ -160,7 +160,7 @@ const RecallTestModal: React.FC<RecallTestModalProps> = ({ open, setOpen, space
)} */}
</Form>
</Card>
<Card title='召回结果' size='small'>
<Card title="召回结果" size="small">
<Spin spinning={loading}>
{resultList.length > 0 ? (
<div
@@ -172,9 +172,9 @@ const RecallTestModal: React.FC<RecallTestModalProps> = ({ open, setOpen, space
{resultList.map(item => (
<Card
title={
<div className='flex items-center'>
<Tag color='blue'># {item.chunk_id}</Tag>
{item.metadata.prop_field.title}
<div className="flex items-center">
<Tag color="blue"># {item.chunk_id}</Tag>
{item.metadata.source}
</div>
}
extra={

View File

@@ -1,10 +1,10 @@
/** @type {import('next').NextConfig} */
const CopyPlugin = require('copy-webpack-plugin');
const MonacoWebpackPlugin = require('monaco-editor-webpack-plugin');
const path = require('path');
const CopyPlugin = require("copy-webpack-plugin");
const MonacoWebpackPlugin = require("monaco-editor-webpack-plugin");
const path = require("path");
const nextConfig = {
experimental: {
esmExternals: 'loose',
esmExternals: "loose",
},
typescript: {
ignoreBuildErrors: true,
@@ -27,30 +27,35 @@ const nextConfig = {
new CopyPlugin({
patterns: [
{
from: path.join(__dirname, 'node_modules/@oceanbase-odc/monaco-plugin-ob/worker-dist/'),
to: 'static/ob-workers',
from: path.join(
__dirname,
"node_modules/@oceanbase-odc/monaco-plugin-ob/worker-dist/"
),
to: "static/ob-workers",
},
],
}),
})
);
// Add the monaco-editor-webpack-plugin plugin
config.plugins.push(
new MonacoWebpackPlugin({
// You can configure the plugin options here, for example:
languages: ['sql'],
filename: 'static/[name].worker.js',
}),
languages: ["sql"],
filename: "static/[name].worker.js",
})
);
}
return config;
},
};
const withTM = require('next-transpile-modules')([
'@berryv/g2-react',
'@antv/g2',
'react-syntax-highlighter',
'@antv/gpt-vis',
const withTM = require("next-transpile-modules")([
"@berryv/g2-react",
"@antv/g2",
"react-syntax-highlighter",
"@antv/g6",
"@antv/graphin",
"@antv/gpt-vis",
]);
module.exports = withTM({

13767
web/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -23,9 +23,12 @@
"dependencies": {
"@ant-design/cssinjs": "^1.18.4",
"@ant-design/icons": "^5.2.5",
"@antv/algorithm": "^0.1.26",
"@antv/ava": "3.5.0-alpha.4",
"@antv/g2": "^5.1.8",
"@antv/gpt-vis": "^0.0.5",
"@antv/g6": "^5.0.17",
"@antv/graphin": "^3.0.2",
"@antv/s2": "^1.51.2",
"@berryv/g2-react": "^0.1.0",
"@emotion/react": "^11.11.4",
@@ -44,6 +47,9 @@
"classnames": "^2.3.2",
"cookies-next": "^4.0.0",
"copy-to-clipboard": "^3.3.3",
"framer-motion": "^10.16.4",
"google-auth-library": "^9.2.0",
"google-one-tap": "^1.0.6",
"cytoscape": "^3.29.2",
"cytoscape-euler": "^1.2.2",
"eslint-plugin-prettier": "^5.2.1",
@@ -53,16 +59,22 @@
"dayjs": "^1.11.12",
"i18next": "^23.4.5",
"iron-session": "^6.3.1",
"iron-session": "^6.3.1",
"lodash": "^4.17.21",
"markdown-it": "^14.1.0",
"markdown-it": "^14.1.0",
"moment": "^2.29.4",
"monaco-editor": ">=0.31.0",
"multer": "^1.4.5-lts.1",
"mysql2": "^3.6.2",
"multer": "^1.4.5-lts.1",
"mysql2": "^3.6.2",
"next": "13.4.7",
"next-auth": "^4.20.1",
"next-connect": "^1.0.0-next.4",
"next-transpile-modules": "^10.0.1",
"next-connect": "^1.0.0-next.4",
"next-transpile-modules": "^10.0.1",
"nprogress": "^0.2.0",
"react": "^18.3.1",
"react-dom": "^18.3.1",
@@ -82,7 +94,6 @@
},
"devDependencies": {
"@types/crypto-js": "^4.1.2",
"@types/cytoscape": "^3.21.0",
"@types/google-one-tap": "^1.2.4",
"@types/lodash": "^4.14.195",
"@types/markdown-it": "^14.1.1",

View File

@@ -0,0 +1,232 @@
import React, { useEffect, useMemo, useRef, useState } from "react";
import { Button, Spin } from "antd";
import { RollbackOutlined } from "@ant-design/icons";
import { apiInterceptors, getGraphVis } from "@/client/api";
import { useRouter } from "next/router";
import { idOf } from "@antv/g6";
import type {
Graph,
GraphData,
GraphOptions,
ID,
IPointerEvent,
PluginOptions,
} from "@antv/g6";
import type { GraphVisResult } from "../../../types/knowledge";
import { Graphin } from "@antv/graphin";
import { getDegree, getSize, isInCommunity } from "../../../utils/graph";
import { groupBy } from "lodash";
type GraphVisData = GraphVisResult | null;
const PALETTE = [
"#5F95FF",
"#61DDAA",
"#F6BD16",
"#7262FD",
"#78D3F8",
"#9661BC",
"#F6903D",
"#008685",
"#F08BB4",
];
function GraphVis() {
const LIMIT = 500;
const router = useRouter();
const [data, setData] = useState<GraphVisData>(null);
const graphRef = useRef<Graph | null>();
const [isReady, setIsReady] = useState(false);
const fetchGraphVis = async () => {
const [_, data] = await apiInterceptors(
getGraphVis(spaceName as string, { limit: LIMIT })
);
setData(data);
};
const transformData = (data: GraphVisData): GraphData => {
if (!data) return { nodes: [], edges: [] };
const nodes = data.nodes.map((node) => ({ id: node.id, data: node }));
const edges = data.edges.map((edge) => ({
source: edge.source,
target: edge.target,
data: edge,
}));
return { nodes, edges };
};
const back = () => {
router.push(`/construct/knowledge`);
};
const {
query: { spaceName },
} = useRouter();
useEffect(() => {
if (spaceName) fetchGraphVis();
}, [spaceName]);
const graphData = useMemo(() => transformData(data), [data]);
useEffect(() => {
if (isReady && graphRef.current) {
const groupedNodes = groupBy(
graphData.nodes,
(node) => node.data!.communityId
);
const plugins: PluginOptions = [];
Object.entries(groupedNodes).forEach(([key, nodes]) => {
if (!key || nodes.length < 2) return;
const color = graphRef.current?.getElementRenderStyle(
idOf(nodes[0])
).fill;
plugins.push({
key,
type: "bubble-sets",
members: nodes.map(idOf),
stroke: color,
fill: color,
fillOpacity: 0.1,
});
});
graphRef.current.setPlugins((prev) => [...prev, ...plugins]);
}
}, [isReady]);
const getNodeSize = (nodeId: ID) => {
return getSize(getNodeDegree(nodeId));
};
const getNodeDegree = (nodeId?: ID) => {
if (!nodeId) return 0;
return getDegree(graphData.edges!, nodeId);
};
const options: GraphOptions = {
data: graphData,
autoFit: "center",
node: {
style: (d) => {
const style = {
size: getNodeSize(idOf(d)),
label: true,
labelLineWidth: 2,
labelText: d.data?.name as string,
labelFontSize: 10,
labelBackground: true,
labelBackgroundFill: "#e5e7eb",
labelPadding: [0, 6],
labelBackgroundRadius: 4,
labelMaxWidth: "400%",
labelWordWrap: true,
};
if (!isInCommunity(graphData, idOf(d))) {
Object.assign(style, { fill: "#b0b0b0" });
}
return style;
},
state: {
active: {
lineWidth: 2,
labelWordWrap: false,
labelFontSize: 12,
labelFontWeight: "bold",
},
inactive: {
label: false,
},
},
palette: {
type: "group",
field: "communityId",
color: PALETTE,
},
},
edge: {
style: {
lineWidth: 1,
stroke: "#e2e2e2",
endArrow: true,
endArrowType: "vee",
label: true,
labelFontSize: 8,
labelBackground: true,
labelText: (e) => e.data!.name as string,
labelBackgroundFill: "#e5e7eb",
labelPadding: [0, 6],
labelBackgroundRadius: 4,
labelMaxWidth: "60%",
labelWordWrap: true,
},
state: {
active: {
stroke: "#b0b0b0",
labelWordWrap: false,
labelFontSize: 10,
labelFontWeight: "bold",
},
inactive: {
label: false,
},
},
},
behaviors: [
"drag-canvas",
"zoom-canvas",
"drag-element",
{
type: "hover-activate",
degree: 1,
state: "active",
enable: (event: IPointerEvent) => ["node"].includes(event.targetType),
},
],
animation: false,
layout: {
type: "force",
preventOverlap: true,
nodeSize: (d) => getNodeSize(d?.id as ID),
linkDistance: (edge) => {
const { source, target } = edge as { source: ID; target: ID };
const nodeSize = Math.min(getNodeSize(source), getNodeSize(target));
const degree = Math.min(getNodeDegree(source), getNodeDegree(target));
return degree === 1
? nodeSize * 2
: Math.min(degree * nodeSize * 1.5, 700);
},
},
transforms: ["process-parallel-edges"],
};
if (!data) return <Spin className="h-full justify-center content-center" />;
return (
<div className="p-4 h-full overflow-y-scroll relative px-2">
<Graphin
ref={(ref) => {
graphRef.current = ref;
}}
style={{ height: "100%", width: "100%" }}
options={options}
onReady={() => {
setIsReady(true);
}}
>
<Button
style={{ background: "#fff" }}
onClick={back}
icon={<RollbackOutlined />}
>
Back
</Button>
</Graphin>
</div>
);
}
export default GraphVis;

100
web/utils/graph.ts Normal file
View File

@@ -0,0 +1,100 @@
import { idOf } from "@antv/g6";
import { pick, groupBy } from "lodash";
import type { EdgeData, GraphData, ID } from "@antv/g6";
/**
* Reassign the layout style to the original graph data
* @param model - original graph data
* @param layoutResult - layout result
*/
export function reassignLayoutStyle(model: GraphData, layoutResult: GraphData) {
layoutResult.nodes?.forEach((layoutNode) => {
const modelNode = model.nodes?.find((node) => node.id === layoutNode.id);
if (modelNode?.style)
Object.assign(
modelNode.style || {},
pick(layoutNode.style, ["x", "y", "z"])
);
});
}
/**
* Calculate node size based on degree
* @param degree - degree of the node
* @param minSize - minimum size of the node
* @param maxSize - maximum size of the node
* @param minDegree - minimum degree
* @param maxDegree - maximum degree
* @returns size of the node
*/
export function getSize(
degree: number,
minSize = 24,
maxSize = 60,
minDegree = 1,
maxDegree = 10
): number {
const _degree = Math.max(minDegree, Math.min(maxDegree, degree));
const size =
minSize +
((_degree - minDegree) / (maxDegree - minDegree)) * (maxSize - minSize);
return size;
}
/**
* Get node degree, means the number of edges connected to the node
* @param edges - all edges data
* @param nodeId - node id
* @returns degree of the node
*/
export function getDegree(edges: EdgeData[], nodeId: ID) {
return getRelatedEdgesData(edges, nodeId).length;
}
/**
* Get related edges data of a node
* @param edges - all edges data
* @param nodeId - node id
* @returns related edges data
*/
export function getRelatedEdgesData(edges: EdgeData[], nodeId: ID) {
return edges.filter(
(edge) => edge.source === nodeId || edge.target === nodeId
);
}
/**
* Concatenate the labels of the related edges to the node as the node's edge key
* @param edges - all edges data
* @param nodeId - node id
* @returns edge key
*/
export function getCommunityId(edges: EdgeData[], nodeId: ID) {
const relatedEdges = getRelatedEdgesData(edges, nodeId);
const key = relatedEdges
.map((edge) => {
const direction = edge.source === nodeId ? "->" : "<-";
const otherEnd = edge.source === nodeId ? edge.target : edge.source;
return `${direction}_${edge.data!.label}_${otherEnd}`;
})
.sort()
.join("+");
return key;
}
/**
* Whether the node is in a community(same communityId) with more than `limit` nodes
* @param data - graph data
* @param nodeId - node id
* @param limit - limit
* @returns boolean
*/
export function isInCommunity(data: GraphData, nodeId: string, limit = 2) {
const groupedNodes = groupBy(data.nodes, (node) => node.data!.communityId);
const filtered = Object.values(groupedNodes).find((nodes) =>
nodes.map(idOf).includes(nodeId)
)!;
return filtered.length > limit;
}

10526
web/yarn.lock Normal file

File diff suppressed because it is too large Load Diff