feat: add GraphRAG framework and integrate TuGraph (#1506)

Co-authored-by: KingSkyLi <15566300566@163.com>
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
This commit is contained in:
Florian
2024-05-16 15:39:50 +08:00
committed by GitHub
parent 593e974405
commit a9087c3853
133 changed files with 10139 additions and 6631 deletions

View File

@@ -1,3 +1,4 @@
import asyncio
import json
import logging
import os
@@ -21,8 +22,10 @@ from dbgpt.configs.model_config import (
EMBEDDING_MODEL_CONFIG,
KNOWLEDGE_UPLOAD_ROOT_PATH,
)
from dbgpt.core import Chunk
from dbgpt.core import Chunk, LLMClient
from dbgpt.core.awel.dag.dag_manager import DAGManager
from dbgpt.model import DefaultLLMClient
from dbgpt.model.cluster import WorkerManagerFactory
from dbgpt.rag.assembler import EmbeddingAssembler
from dbgpt.rag.chunk_manager import ChunkParameters
from dbgpt.rag.embedding import EmbeddingFactory
@@ -71,7 +74,7 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
document_dao: Optional[KnowledgeDocumentDao] = None,
chunk_dao: Optional[DocumentChunkDao] = None,
):
self._system_app = None
self._system_app = system_app
self._dao: KnowledgeSpaceDao = dao
self._document_dao: KnowledgeDocumentDao = document_dao
self._chunk_dao: DocumentChunkDao = chunk_dao
@@ -112,6 +115,13 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
"""Returns the internal ServeConfig."""
return self._serve_config
@property
def llm_client(self) -> LLMClient:
    """Build an LLM client for this service.

    Resolves the worker-manager factory registered on the system app,
    creates a worker manager from it, and wraps it in a
    ``DefaultLLMClient``.

    Returns:
        LLMClient: a ``DefaultLLMClient`` bound to the default worker
        manager.

    Note:
        A new client is constructed on every property access; nothing is
        cached here.
    """
    worker_manager = self._system_app.get_component(
        ComponentType.WORKER_MANAGER_FACTORY, WorkerManagerFactory
    ).create()
    # NOTE(review): second positional argument is passed as True —
    # presumably an auto-convert/compat flag on DefaultLLMClient; its
    # meaning is not visible from this diff, confirm against the
    # DefaultLLMClient signature.
    return DefaultLLMClient(worker_manager, True)
def create_space(self, request: SpaceServeRequest) -> SpaceServeResponse:
"""Create a new Space entity
@@ -198,7 +208,7 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
raise Exception(f"create document failed, {request.doc_name}")
return doc_id
def sync_document(self, requests: List[KnowledgeSyncRequest]) -> List:
async def sync_document(self, requests: List[KnowledgeSyncRequest]) -> List:
"""Create a new document entity
Args:
@@ -236,7 +246,7 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
if space_context is None
else int(space_context["embedding"]["chunk_overlap"])
)
self._sync_knowledge_document(space_id, doc, chunk_parameters)
await self._sync_knowledge_document(space_id, doc, chunk_parameters)
doc_ids.append(doc.id)
return doc_ids
@@ -284,10 +294,11 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
space = self.get(query_request)
if space is None:
raise HTTPException(status_code=400, detail=f"Space {space_id} not found")
config = VectorStoreConfig(name=space.name)
config = VectorStoreConfig(
name=space.name, llm_client=self.llm_client, model_name=None
)
vector_store_connector = VectorStoreConnector(
vector_store_type=CFG.VECTOR_STORE_TYPE,
vector_store_config=config,
vector_store_type=space.vector_type, vector_store_config=config
)
# delete vectors
vector_store_connector.delete_vector_name(space.name)
@@ -316,12 +327,22 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
docuemnt = self._document_dao.get_one(query_request)
if docuemnt is None:
raise Exception(f"there are no or more than one document {document_id}")
# get space by name
spaces = self._dao.get_knowledge_space(
KnowledgeSpaceEntity(name=docuemnt.space)
)
if len(spaces) != 1:
raise Exception(f"invalid space name: {docuemnt.space}")
space = spaces[0]
vector_ids = docuemnt.vector_ids
if vector_ids is not None:
config = VectorStoreConfig(name=docuemnt.space)
config = VectorStoreConfig(
name=space.name, llm_client=self.llm_client, model_name=None
)
vector_store_connector = VectorStoreConnector(
vector_store_type=CFG.VECTOR_STORE_TYPE,
vector_store_config=config,
vector_store_type=space.vector_type, vector_store_config=config
)
# delete vector by ids
vector_store_connector.delete_by_ids(vector_ids)
@@ -375,7 +396,7 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
"""
return self._document_dao.get_list_page(request, page, page_size)
def _batch_document_sync(
async def _batch_document_sync(
self, space_id, sync_requests: List[KnowledgeSyncRequest]
) -> List[int]:
"""batch sync knowledge document chunk into vector store
@@ -413,11 +434,11 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
if space_context is None
else int(space_context["embedding"]["chunk_overlap"])
)
self._sync_knowledge_document(space_id, doc, chunk_parameters)
await self._sync_knowledge_document(space_id, doc, chunk_parameters)
doc_ids.append(doc.id)
return doc_ids
def _sync_knowledge_document(
async def _sync_knowledge_document(
self,
space_id,
doc_vo: DocumentVO,
@@ -439,10 +460,11 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
name=space.name,
embedding_fn=embedding_fn,
max_chunks_once_load=CFG.KNOWLEDGE_MAX_CHUNKS_ONCE_LOAD,
llm_client=self.llm_client,
model_name=None,
)
vector_store_connector = VectorStoreConnector(
vector_store_type=CFG.VECTOR_STORE_TYPE,
vector_store_config=config,
vector_store_type=space.vector_type, vector_store_config=config
)
knowledge = KnowledgeFactory.create(
datasource=doc.content,
@@ -458,15 +480,16 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
doc.chunk_size = len(chunk_docs)
doc.gmt_modified = datetime.now()
self._document_dao.update_knowledge_document(doc)
executor = CFG.SYSTEM_APP.get_component(
ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
).create()
executor.submit(self.async_doc_embedding, assembler, chunk_docs, doc)
# executor = CFG.SYSTEM_APP.get_component(
# ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
# ).create()
# executor.submit(self.async_doc_embedding, assembler, chunk_docs, doc)
asyncio.create_task(self.async_doc_embedding(assembler, chunk_docs, doc))
logger.info(f"begin save document chunks, doc:{doc.doc_name}")
return chunk_docs
@trace("async_doc_embedding")
def async_doc_embedding(self, assembler, chunk_docs, doc):
async def async_doc_embedding(self, assembler, chunk_docs, doc):
"""async document embedding into vector db
Args:
- client: EmbeddingEngine Client
@@ -475,14 +498,19 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
"""
logger.info(
f"async doc embedding sync, doc:{doc.doc_name}, chunks length is {len(chunk_docs)}, begin embedding to vector store-{CFG.VECTOR_STORE_TYPE}"
f"async doc embedding sync, doc:{doc.doc_name}, chunks length is {len(chunk_docs)}"
)
try:
with root_tracer.start_span(
"app.knowledge.assembler.persist",
metadata={"doc": doc.doc_name, "chunks": len(chunk_docs)},
):
vector_ids = assembler.persist()
# vector_ids = assembler.persist()
space = self.get({"name": doc.space})
if space and space.vector_type == "KnowledgeGraph":
vector_ids = await assembler.apersist()
else:
vector_ids = assembler.persist()
doc.status = SyncStatus.FINISHED.name
doc.result = "document embedding success"
if vector_ids is not None: