mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-12 05:32:32 +00:00
feat:document summary
This commit is contained in:
parent
e4c96e325e
commit
53b1fc4090
@ -96,6 +96,13 @@ class ChatScene(Enum):
|
|||||||
["Extract Select"],
|
["Extract Select"],
|
||||||
True,
|
True,
|
||||||
)
|
)
|
||||||
|
ExtractRefineSummary = Scene(
|
||||||
|
"extract_refine_summary",
|
||||||
|
"Extract Summary",
|
||||||
|
"Extract Summary",
|
||||||
|
["Extract Select"],
|
||||||
|
True,
|
||||||
|
)
|
||||||
ExtractEntity = Scene(
|
ExtractEntity = Scene(
|
||||||
"extract_entity", "Extract Entity", "Extract Entity", ["Extract Select"], True
|
"extract_entity", "Extract Entity", "Extract Entity", ["Extract Select"], True
|
||||||
)
|
)
|
||||||
|
@ -127,8 +127,6 @@ class BaseChat(ABC):
|
|||||||
speak_to_user = prompt_define_response
|
speak_to_user = prompt_define_response
|
||||||
return speak_to_user
|
return speak_to_user
|
||||||
|
|
||||||
async def __call_base(self):
|
|
||||||
input_values = await self.generate_input_values()
|
|
||||||
async def __call_base(self):
|
async def __call_base(self):
|
||||||
import inspect
|
import inspect
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@ class ChatFactory(metaclass=Singleton):
|
|||||||
from pilot.scene.chat_knowledge.extract_triplet.chat import ExtractTriplet
|
from pilot.scene.chat_knowledge.extract_triplet.chat import ExtractTriplet
|
||||||
from pilot.scene.chat_knowledge.extract_entity.chat import ExtractEntity
|
from pilot.scene.chat_knowledge.extract_entity.chat import ExtractEntity
|
||||||
from pilot.scene.chat_knowledge.summary.chat import ExtractSummary
|
from pilot.scene.chat_knowledge.summary.chat import ExtractSummary
|
||||||
|
from pilot.scene.chat_knowledge.refine_summary.chat import ExtractRefineSummary
|
||||||
from pilot.scene.chat_data.chat_excel.excel_analyze.chat import ChatExcel
|
from pilot.scene.chat_data.chat_excel.excel_analyze.chat import ChatExcel
|
||||||
from pilot.scene.chat_agent.chat import ChatAgent
|
from pilot.scene.chat_agent.chat import ChatAgent
|
||||||
|
|
||||||
|
@ -30,11 +30,12 @@ class KnowledgeDocumentEntity(Base):
|
|||||||
content = Column(Text)
|
content = Column(Text)
|
||||||
result = Column(Text)
|
result = Column(Text)
|
||||||
vector_ids = Column(Text)
|
vector_ids = Column(Text)
|
||||||
|
summary = Column(Text)
|
||||||
gmt_created = Column(DateTime)
|
gmt_created = Column(DateTime)
|
||||||
gmt_modified = Column(DateTime)
|
gmt_modified = Column(DateTime)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"KnowledgeDocumentEntity(id={self.id}, doc_name='{self.doc_name}', doc_type='{self.doc_type}', chunk_size='{self.chunk_size}', status='{self.status}', last_sync='{self.last_sync}', content='{self.content}', result='{self.result}', gmt_created='{self.gmt_created}', gmt_modified='{self.gmt_modified}')"
|
return f"KnowledgeDocumentEntity(id={self.id}, doc_name='{self.doc_name}', doc_type='{self.doc_type}', chunk_size='{self.chunk_size}', status='{self.status}', last_sync='{self.last_sync}', content='{self.content}', result='{self.result}', summary='{self.summary}', gmt_created='{self.gmt_created}', gmt_modified='{self.gmt_modified}')"
|
||||||
|
|
||||||
|
|
||||||
class KnowledgeDocumentDao(BaseDao):
|
class KnowledgeDocumentDao(BaseDao):
|
||||||
|
@ -5,8 +5,9 @@ from pydantic import BaseModel
|
|||||||
|
|
||||||
class ChunkQueryResponse(BaseModel):
|
class ChunkQueryResponse(BaseModel):
|
||||||
"""data: data"""
|
"""data: data"""
|
||||||
|
|
||||||
data: List = None
|
data: List = None
|
||||||
|
"""summary: document summary"""
|
||||||
|
summary: str = None
|
||||||
"""total: total size"""
|
"""total: total size"""
|
||||||
total: int = None
|
total: int = None
|
||||||
"""page: current page"""
|
"""page: current page"""
|
||||||
|
@ -288,8 +288,8 @@ class KnowledgeService:
|
|||||||
executor = CFG.SYSTEM_APP.get_component(
|
executor = CFG.SYSTEM_APP.get_component(
|
||||||
ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
|
ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
|
||||||
).create()
|
).create()
|
||||||
executor.submit(self.async_knowledge_graph, chunk_docs, doc)
|
executor.submit(self.async_document_summary, chunk_docs, doc)
|
||||||
# executor.submit(self.async_doc_embedding, client, chunk_docs, doc)
|
executor.submit(self.async_doc_embedding, client, chunk_docs, doc)
|
||||||
logger.info(f"begin save document chunks, doc:{doc.doc_name}")
|
logger.info(f"begin save document chunks, doc:{doc.doc_name}")
|
||||||
# save chunk details
|
# save chunk details
|
||||||
chunk_entities = [
|
chunk_entities = [
|
||||||
@ -384,38 +384,59 @@ class KnowledgeService:
|
|||||||
doc_name=request.doc_name,
|
doc_name=request.doc_name,
|
||||||
doc_type=request.doc_type,
|
doc_type=request.doc_type,
|
||||||
)
|
)
|
||||||
|
document_query = KnowledgeDocumentEntity(id=request.document_id)
|
||||||
|
documents = knowledge_document_dao.get_documents(document_query)
|
||||||
|
|
||||||
res = ChunkQueryResponse()
|
res = ChunkQueryResponse()
|
||||||
res.data = document_chunk_dao.get_document_chunks(
|
res.data = document_chunk_dao.get_document_chunks(
|
||||||
query, page=request.page, page_size=request.page_size
|
query, page=request.page, page_size=request.page_size
|
||||||
)
|
)
|
||||||
|
res.summary = documents[0].summary
|
||||||
res.total = document_chunk_dao.get_document_chunks_count(query)
|
res.total = document_chunk_dao.get_document_chunks_count(query)
|
||||||
res.page = request.page
|
res.page = request.page
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def async_knowledge_graph(self, chunk_docs, doc):
|
def async_knowledge_graph(self, chunk_docs, doc):
|
||||||
"""async document extract triplets and save into graph db
|
"""async document extract triplets and save into graph db
|
||||||
Args:
|
Args:
|
||||||
- chunk_docs: List[Document]
|
- chunk_docs: List[Document]
|
||||||
- doc: KnowledgeDocumentEntity
|
- doc: KnowledgeDocumentEntity
|
||||||
"""
|
"""
|
||||||
for doc in chunk_docs:
|
|
||||||
text = doc.page_content
|
|
||||||
self._llm_extract_summary(text)
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"async_knowledge_graph, doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin embedding to graph store"
|
f"async_knowledge_graph, doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin embedding to graph store"
|
||||||
)
|
)
|
||||||
# try:
|
try:
|
||||||
# from pilot.graph_engine.graph_factory import RAGGraphFactory
|
from pilot.graph_engine.graph_factory import RAGGraphFactory
|
||||||
#
|
|
||||||
# rag_engine = CFG.SYSTEM_APP.get_component(
|
rag_engine = CFG.SYSTEM_APP.get_component(
|
||||||
# ComponentType.RAG_GRAPH_DEFAULT.value, RAGGraphFactory
|
ComponentType.RAG_GRAPH_DEFAULT.value, RAGGraphFactory
|
||||||
# ).create()
|
).create()
|
||||||
# rag_engine.knowledge_graph(chunk_docs)
|
rag_engine.knowledge_graph(chunk_docs)
|
||||||
# doc.status = SyncStatus.FINISHED.name
|
doc.status = SyncStatus.FINISHED.name
|
||||||
# doc.result = "document build graph success"
|
doc.result = "document build graph success"
|
||||||
# except Exception as e:
|
except Exception as e:
|
||||||
# doc.status = SyncStatus.FAILED.name
|
doc.status = SyncStatus.FAILED.name
|
||||||
# doc.result = "document build graph failed" + str(e)
|
doc.result = "document build graph failed" + str(e)
|
||||||
# logger.error(f"document build graph failed:{doc.doc_name}, {str(e)}")
|
logger.error(f"document build graph failed:{doc.doc_name}, {str(e)}")
|
||||||
|
return knowledge_document_dao.update_knowledge_document(doc)
|
||||||
|
|
||||||
|
def async_document_summary(self, chunk_docs, doc):
|
||||||
|
"""async document extract summary
|
||||||
|
Args:
|
||||||
|
- chunk_docs: List[Document]
|
||||||
|
- doc: KnowledgeDocumentEntity
|
||||||
|
"""
|
||||||
|
from llama_index import PromptHelper
|
||||||
|
from llama_index.prompts.default_prompt_selectors import DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
|
||||||
|
texts = [doc.page_content for doc in chunk_docs]
|
||||||
|
prompt_helper = PromptHelper()
|
||||||
|
texts = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=texts)
|
||||||
|
summary = self._llm_extract_summary(chunk_docs[0])
|
||||||
|
outputs, summary = self._refine_extract_summary(texts[1:], summary)
|
||||||
|
logger.info(
|
||||||
|
f"async_document_summary, doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin embedding to graph store"
|
||||||
|
)
|
||||||
|
doc.summary = summary
|
||||||
return knowledge_document_dao.update_knowledge_document(doc)
|
return knowledge_document_dao.update_knowledge_document(doc)
|
||||||
|
|
||||||
|
|
||||||
@ -491,15 +512,39 @@ class KnowledgeService:
|
|||||||
|
|
||||||
chat_param = {
|
chat_param = {
|
||||||
"chat_session_id": uuid.uuid1(),
|
"chat_session_id": uuid.uuid1(),
|
||||||
"current_user_input": doc,
|
"current_user_input": doc.page_content,
|
||||||
"select_param": "summery",
|
"select_param": "summary",
|
||||||
"model_name": "proxyllm",
|
"model_name": "proxyllm",
|
||||||
}
|
}
|
||||||
from pilot.utils import utils
|
from pilot.utils import utils
|
||||||
loop = utils.get_or_create_event_loop()
|
loop = utils.get_or_create_event_loop()
|
||||||
triplets = loop.run_until_complete(
|
summary = loop.run_until_complete(
|
||||||
llm_chat_response_nostream(
|
llm_chat_response_nostream(
|
||||||
ChatScene.ExtractSummary.value(), **{"chat_param": chat_param}
|
ChatScene.ExtractSummary.value(), **{"chat_param": chat_param}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return triplets
|
return summary
|
||||||
|
def _refine_extract_summary(self, docs, summary: str):
|
||||||
|
"""Extract refine summary by llm"""
|
||||||
|
from pilot.scene.base import ChatScene
|
||||||
|
from pilot.common.chat_util import llm_chat_response_nostream
|
||||||
|
import uuid
|
||||||
|
outputs = []
|
||||||
|
for doc in docs:
|
||||||
|
chat_param = {
|
||||||
|
"chat_session_id": uuid.uuid1(),
|
||||||
|
"current_user_input": doc,
|
||||||
|
"select_param": summary,
|
||||||
|
"model_name": "proxyllm",
|
||||||
|
}
|
||||||
|
from pilot.utils import utils
|
||||||
|
loop = utils.get_or_create_event_loop()
|
||||||
|
summary = loop.run_until_complete(
|
||||||
|
llm_chat_response_nostream(
|
||||||
|
ChatScene.ExtractRefineSummary.value(), **{"chat_param": chat_param}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
outputs.append(summary)
|
||||||
|
return outputs, summary
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user