mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-11 22:09:44 +00:00
feat: add GraphRAG framework and integrate TuGraph (#1506)
Co-authored-by: KingSkyLi <15566300566@163.com> Co-authored-by: aries_ckt <916701291@qq.com> Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -21,8 +22,10 @@ from dbgpt.configs.model_config import (
|
||||
EMBEDDING_MODEL_CONFIG,
|
||||
KNOWLEDGE_UPLOAD_ROOT_PATH,
|
||||
)
|
||||
from dbgpt.core import Chunk
|
||||
from dbgpt.core import Chunk, LLMClient
|
||||
from dbgpt.core.awel.dag.dag_manager import DAGManager
|
||||
from dbgpt.model import DefaultLLMClient
|
||||
from dbgpt.model.cluster import WorkerManagerFactory
|
||||
from dbgpt.rag.assembler import EmbeddingAssembler
|
||||
from dbgpt.rag.chunk_manager import ChunkParameters
|
||||
from dbgpt.rag.embedding import EmbeddingFactory
|
||||
@@ -71,7 +74,7 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
document_dao: Optional[KnowledgeDocumentDao] = None,
|
||||
chunk_dao: Optional[DocumentChunkDao] = None,
|
||||
):
|
||||
self._system_app = None
|
||||
self._system_app = system_app
|
||||
self._dao: KnowledgeSpaceDao = dao
|
||||
self._document_dao: KnowledgeDocumentDao = document_dao
|
||||
self._chunk_dao: DocumentChunkDao = chunk_dao
|
||||
@@ -112,6 +115,13 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
"""Returns the internal ServeConfig."""
|
||||
return self._serve_config
|
||||
|
||||
@property
|
||||
def llm_client(self) -> LLMClient:
|
||||
worker_manager = self._system_app.get_component(
|
||||
ComponentType.WORKER_MANAGER_FACTORY, WorkerManagerFactory
|
||||
).create()
|
||||
return DefaultLLMClient(worker_manager, True)
|
||||
|
||||
def create_space(self, request: SpaceServeRequest) -> SpaceServeResponse:
|
||||
"""Create a new Space entity
|
||||
|
||||
@@ -198,7 +208,7 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
raise Exception(f"create document failed, {request.doc_name}")
|
||||
return doc_id
|
||||
|
||||
def sync_document(self, requests: List[KnowledgeSyncRequest]) -> List:
|
||||
async def sync_document(self, requests: List[KnowledgeSyncRequest]) -> List:
|
||||
"""Create a new document entity
|
||||
|
||||
Args:
|
||||
@@ -236,7 +246,7 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
if space_context is None
|
||||
else int(space_context["embedding"]["chunk_overlap"])
|
||||
)
|
||||
self._sync_knowledge_document(space_id, doc, chunk_parameters)
|
||||
await self._sync_knowledge_document(space_id, doc, chunk_parameters)
|
||||
doc_ids.append(doc.id)
|
||||
return doc_ids
|
||||
|
||||
@@ -284,10 +294,11 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
space = self.get(query_request)
|
||||
if space is None:
|
||||
raise HTTPException(status_code=400, detail=f"Space {space_id} not found")
|
||||
config = VectorStoreConfig(name=space.name)
|
||||
config = VectorStoreConfig(
|
||||
name=space.name, llm_client=self.llm_client, model_name=None
|
||||
)
|
||||
vector_store_connector = VectorStoreConnector(
|
||||
vector_store_type=CFG.VECTOR_STORE_TYPE,
|
||||
vector_store_config=config,
|
||||
vector_store_type=space.vector_type, vector_store_config=config
|
||||
)
|
||||
# delete vectors
|
||||
vector_store_connector.delete_vector_name(space.name)
|
||||
@@ -316,12 +327,22 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
docuemnt = self._document_dao.get_one(query_request)
|
||||
if docuemnt is None:
|
||||
raise Exception(f"there are no or more than one document {document_id}")
|
||||
|
||||
# get space by name
|
||||
spaces = self._dao.get_knowledge_space(
|
||||
KnowledgeSpaceEntity(name=docuemnt.space)
|
||||
)
|
||||
if len(spaces) != 1:
|
||||
raise Exception(f"invalid space name: {docuemnt.space}")
|
||||
space = spaces[0]
|
||||
|
||||
vector_ids = docuemnt.vector_ids
|
||||
if vector_ids is not None:
|
||||
config = VectorStoreConfig(name=docuemnt.space)
|
||||
config = VectorStoreConfig(
|
||||
name=space.name, llm_client=self.llm_client, model_name=None
|
||||
)
|
||||
vector_store_connector = VectorStoreConnector(
|
||||
vector_store_type=CFG.VECTOR_STORE_TYPE,
|
||||
vector_store_config=config,
|
||||
vector_store_type=space.vector_type, vector_store_config=config
|
||||
)
|
||||
# delete vector by ids
|
||||
vector_store_connector.delete_by_ids(vector_ids)
|
||||
@@ -375,7 +396,7 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
"""
|
||||
return self._document_dao.get_list_page(request, page, page_size)
|
||||
|
||||
def _batch_document_sync(
|
||||
async def _batch_document_sync(
|
||||
self, space_id, sync_requests: List[KnowledgeSyncRequest]
|
||||
) -> List[int]:
|
||||
"""batch sync knowledge document chunk into vector store
|
||||
@@ -413,11 +434,11 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
if space_context is None
|
||||
else int(space_context["embedding"]["chunk_overlap"])
|
||||
)
|
||||
self._sync_knowledge_document(space_id, doc, chunk_parameters)
|
||||
await self._sync_knowledge_document(space_id, doc, chunk_parameters)
|
||||
doc_ids.append(doc.id)
|
||||
return doc_ids
|
||||
|
||||
def _sync_knowledge_document(
|
||||
async def _sync_knowledge_document(
|
||||
self,
|
||||
space_id,
|
||||
doc_vo: DocumentVO,
|
||||
@@ -439,10 +460,11 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
name=space.name,
|
||||
embedding_fn=embedding_fn,
|
||||
max_chunks_once_load=CFG.KNOWLEDGE_MAX_CHUNKS_ONCE_LOAD,
|
||||
llm_client=self.llm_client,
|
||||
model_name=None,
|
||||
)
|
||||
vector_store_connector = VectorStoreConnector(
|
||||
vector_store_type=CFG.VECTOR_STORE_TYPE,
|
||||
vector_store_config=config,
|
||||
vector_store_type=space.vector_type, vector_store_config=config
|
||||
)
|
||||
knowledge = KnowledgeFactory.create(
|
||||
datasource=doc.content,
|
||||
@@ -458,15 +480,16 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
doc.chunk_size = len(chunk_docs)
|
||||
doc.gmt_modified = datetime.now()
|
||||
self._document_dao.update_knowledge_document(doc)
|
||||
executor = CFG.SYSTEM_APP.get_component(
|
||||
ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
|
||||
).create()
|
||||
executor.submit(self.async_doc_embedding, assembler, chunk_docs, doc)
|
||||
# executor = CFG.SYSTEM_APP.get_component(
|
||||
# ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
|
||||
# ).create()
|
||||
# executor.submit(self.async_doc_embedding, assembler, chunk_docs, doc)
|
||||
asyncio.create_task(self.async_doc_embedding(assembler, chunk_docs, doc))
|
||||
logger.info(f"begin save document chunks, doc:{doc.doc_name}")
|
||||
return chunk_docs
|
||||
|
||||
@trace("async_doc_embedding")
|
||||
def async_doc_embedding(self, assembler, chunk_docs, doc):
|
||||
async def async_doc_embedding(self, assembler, chunk_docs, doc):
|
||||
"""async document embedding into vector db
|
||||
Args:
|
||||
- client: EmbeddingEngine Client
|
||||
@@ -475,14 +498,19 @@ class Service(BaseService[KnowledgeSpaceEntity, SpaceServeRequest, SpaceServeRes
|
||||
"""
|
||||
|
||||
logger.info(
|
||||
f"async doc embedding sync, doc:{doc.doc_name}, chunks length is {len(chunk_docs)}, begin embedding to vector store-{CFG.VECTOR_STORE_TYPE}"
|
||||
f"async doc embedding sync, doc:{doc.doc_name}, chunks length is {len(chunk_docs)}"
|
||||
)
|
||||
try:
|
||||
with root_tracer.start_span(
|
||||
"app.knowledge.assembler.persist",
|
||||
metadata={"doc": doc.doc_name, "chunks": len(chunk_docs)},
|
||||
):
|
||||
vector_ids = assembler.persist()
|
||||
# vector_ids = assembler.persist()
|
||||
space = self.get({"name": doc.space})
|
||||
if space and space.vector_type == "KnowledgeGraph":
|
||||
vector_ids = await assembler.apersist()
|
||||
else:
|
||||
vector_ids = assembler.persist()
|
||||
doc.status = SyncStatus.FINISHED.name
|
||||
doc.result = "document embedding success"
|
||||
if vector_ids is not None:
|
||||
|
Reference in New Issue
Block a user