feat: document summary

This commit is contained in:
aries_ckt 2023-10-30 19:06:09 +08:00
parent e4c96e325e
commit 53b1fc4090
6 changed files with 79 additions and 26 deletions

View File

@@ -96,6 +96,13 @@ class ChatScene(Enum):
         ["Extract Select"],
         True,
     )
+    ExtractRefineSummary = Scene(
+        "extract_refine_summary",
+        "Extract Summary",
+        "Extract Summary",
+        ["Extract Select"],
+        True,
+    )
     ExtractEntity = Scene(
         "extract_entity", "Extract Entity", "Extract Entity", ["Extract Select"], True
     )
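The new scene mirrors the Scene signature of its neighbors. A minimal sketch of how such an entry is meant to be driven, reusing the chat_param keys that appear later in this commit; the helper function itself is illustrative and not part of the codebase:

import uuid

from pilot.scene.base import ChatScene
from pilot.common.chat_util import llm_chat_response_nostream

async def run_refine_summary(previous_summary: str, chunk_text: str) -> str:
    # chat_param keys copied from _refine_extract_summary later in this diff
    chat_param = {
        "chat_session_id": uuid.uuid1(),
        "current_user_input": chunk_text,
        "select_param": previous_summary,  # the running summary to refine
        "model_name": "proxyllm",
    }
    return await llm_chat_response_nostream(
        ChatScene.ExtractRefineSummary.value(), **{"chat_param": chat_param}
    )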

View File

@@ -127,8 +127,6 @@ class BaseChat(ABC):
             speak_to_user = prompt_define_response
         return speak_to_user

-    async def __call_base(self):
-        input_values = await self.generate_input_values()
     async def __call_base(self):
         import inspect

View File

@@ -17,6 +17,7 @@ class ChatFactory(metaclass=Singleton):
        from pilot.scene.chat_knowledge.extract_triplet.chat import ExtractTriplet
        from pilot.scene.chat_knowledge.extract_entity.chat import ExtractEntity
        from pilot.scene.chat_knowledge.summary.chat import ExtractSummary
+       from pilot.scene.chat_knowledge.refine_summary.chat import ExtractRefineSummary
        from pilot.scene.chat_data.chat_excel.excel_analyze.chat import ChatExcel
        from pilot.scene.chat_agent.chat import ChatAgent
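The import makes ExtractRefineSummary visible when the factory instantiates a chat by scene name. The factory's matching logic is not shown in this diff; a hedged sketch of one common pattern, with the import path and the chat_scene attribute assumed:

from pilot.scene.base_chat import BaseChat  # path assumed from this commit

def get_implementation(chat_mode: str, **kwargs) -> BaseChat:
    # Illustrative only: match the scene's mode string (e.g.
    # "extract_refine_summary") against the imported BaseChat subclasses.
    for cls in BaseChat.__subclasses__():
        if getattr(cls, "chat_scene", None) == chat_mode:
            return cls(**kwargs)
    raise ValueError(f"Unsupported chat mode: {chat_mode}")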

View File

@@ -30,11 +30,12 @@ class KnowledgeDocumentEntity(Base):
     content = Column(Text)
     result = Column(Text)
     vector_ids = Column(Text)
+    summary = Column(Text)
     gmt_created = Column(DateTime)
     gmt_modified = Column(DateTime)

     def __repr__(self):
-        return f"KnowledgeDocumentEntity(id={self.id}, doc_name='{self.doc_name}', doc_type='{self.doc_type}', chunk_size='{self.chunk_size}', status='{self.status}', last_sync='{self.last_sync}', content='{self.content}', result='{self.result}', gmt_created='{self.gmt_created}', gmt_modified='{self.gmt_modified}')"
+        return f"KnowledgeDocumentEntity(id={self.id}, doc_name='{self.doc_name}', doc_type='{self.doc_type}', chunk_size='{self.chunk_size}', status='{self.status}', last_sync='{self.last_sync}', content='{self.content}', result='{self.result}', summary='{self.summary}', gmt_created='{self.gmt_created}', gmt_modified='{self.gmt_modified}')"

 class KnowledgeDocumentDao(BaseDao):
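The ORM change needs a matching column in the backing table. A one-off migration sketch, assuming the table is named knowledge_document and a SQLAlchemy engine is at hand (the DSN is a placeholder):

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:pass@localhost/dbgpt")  # placeholder DSN
with engine.begin() as conn:
    # Text column mirrors summary = Column(Text) on the entity
    conn.execute(text("ALTER TABLE knowledge_document ADD COLUMN summary TEXT"))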

View File

@@ -5,8 +5,9 @@ from pydantic import BaseModel
 class ChunkQueryResponse(BaseModel):
     """data: data"""
     data: List = None
+    """summary: document summary"""
+    summary: str = None
     """total: total size"""
     total: int = None
     """page: current page"""

View File

@@ -288,8 +288,8 @@ class KnowledgeService:
         executor = CFG.SYSTEM_APP.get_component(
             ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
         ).create()
-        executor.submit(self.async_knowledge_graph, chunk_docs, doc)
-        # executor.submit(self.async_doc_embedding, client, chunk_docs, doc)
+        executor.submit(self.async_document_summary, chunk_docs, doc)
+        executor.submit(self.async_doc_embedding, client, chunk_docs, doc)
         logger.info(f"begin save document chunks, doc:{doc.doc_name}")
         # save chunk details
         chunk_entities = [
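This hunk swaps the graph job for the new summary job and re-enables embedding, so a document sync fans out to two background tasks. A stand-in sketch of that fan-out, where ThreadPoolExecutor substitutes for the ExecutorFactory component and service, client, chunk_docs, and doc are assumed in scope:

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=2)  # stand-in for ExecutorFactory
executor.submit(service.async_document_summary, chunk_docs, doc)
executor.submit(service.async_doc_embedding, client, chunk_docs, doc)
# Neither future is awaited here: each job persists its own status on the doc row.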
@@ -384,38 +384,59 @@
             doc_name=request.doc_name,
             doc_type=request.doc_type,
         )
+        document_query = KnowledgeDocumentEntity(id=request.document_id)
+        documents = knowledge_document_dao.get_documents(document_query)
         res = ChunkQueryResponse()
         res.data = document_chunk_dao.get_document_chunks(
             query, page=request.page, page_size=request.page_size
         )
+        res.summary = documents[0].summary
         res.total = document_chunk_dao.get_document_chunks_count(query)
         res.page = request.page
         return res

     def async_knowledge_graph(self, chunk_docs, doc):
         """async document extract triplets and save into graph db
         Args:
             - chunk_docs: List[Document]
             - doc: KnowledgeDocumentEntity
         """
-        for doc in chunk_docs:
-            text = doc.page_content
-            self._llm_extract_summary(text)
         logger.info(
             f"async_knowledge_graph, doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin embedding to graph store"
         )
-        # try:
-        #     from pilot.graph_engine.graph_factory import RAGGraphFactory
-        #
-        #     rag_engine = CFG.SYSTEM_APP.get_component(
-        #         ComponentType.RAG_GRAPH_DEFAULT.value, RAGGraphFactory
-        #     ).create()
-        #     rag_engine.knowledge_graph(chunk_docs)
-        #     doc.status = SyncStatus.FINISHED.name
-        #     doc.result = "document build graph success"
-        # except Exception as e:
-        #     doc.status = SyncStatus.FAILED.name
-        #     doc.result = "document build graph failed" + str(e)
-        #     logger.error(f"document build graph failed:{doc.doc_name}, {str(e)}")
+        try:
+            from pilot.graph_engine.graph_factory import RAGGraphFactory
+
+            rag_engine = CFG.SYSTEM_APP.get_component(
+                ComponentType.RAG_GRAPH_DEFAULT.value, RAGGraphFactory
+            ).create()
+            rag_engine.knowledge_graph(chunk_docs)
+            doc.status = SyncStatus.FINISHED.name
+            doc.result = "document build graph success"
+        except Exception as e:
+            doc.status = SyncStatus.FAILED.name
+            doc.result = "document build graph failed" + str(e)
+            logger.error(f"document build graph failed:{doc.doc_name}, {str(e)}")
+        return knowledge_document_dao.update_knowledge_document(doc)
+
+    def async_document_summary(self, chunk_docs, doc):
+        """async document extract summary
+        Args:
+            - chunk_docs: List[Document]
+            - doc: KnowledgeDocumentEntity
+        """
+        from llama_index import PromptHelper
+        from llama_index.prompts.default_prompt_selectors import DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
+
+        texts = [doc.page_content for doc in chunk_docs]
+        prompt_helper = PromptHelper()
+        texts = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=texts)
+        summary = self._llm_extract_summary(chunk_docs[0])
+        outputs, summary = self._refine_extract_summary(texts[1:], summary)
+        logger.info(
+            f"async_document_summary, doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin save summary"
+        )
+        doc.summary = summary
         return knowledge_document_dao.update_knowledge_document(doc)
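async_document_summary is a refine chain: the first chunk seeds the summary and every remaining repacked chunk is folded into it in turn. Restated as a minimal sketch, where seed and refine are placeholders for the ExtractSummary and ExtractRefineSummary scene calls:

def summarize(chunks, seed, refine):
    summary = seed(chunks[0])             # ExtractSummary on the first chunk
    for chunk in chunks[1:]:              # chunks repacked by PromptHelper.repack
        summary = refine(chunk, summary)  # ExtractRefineSummary folds each chunk in
    return summary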
@@ -491,15 +512,39 @@ class KnowledgeService:
         chat_param = {
             "chat_session_id": uuid.uuid1(),
-            "current_user_input": doc,
-            "select_param": "summery",
+            "current_user_input": doc.page_content,
+            "select_param": "summary",
             "model_name": "proxyllm",
         }
         from pilot.utils import utils

         loop = utils.get_or_create_event_loop()
-        triplets = loop.run_until_complete(
+        summary = loop.run_until_complete(
             llm_chat_response_nostream(
                 ChatScene.ExtractSummary.value(), **{"chat_param": chat_param}
             )
         )
-        return triplets
+        return summary
+
+    def _refine_extract_summary(self, docs, summary: str):
+        """Extract refine summary by llm"""
+        from pilot.scene.base import ChatScene
+        from pilot.common.chat_util import llm_chat_response_nostream
+        import uuid
+
+        outputs = []
+        for doc in docs:
+            chat_param = {
+                "chat_session_id": uuid.uuid1(),
+                "current_user_input": doc,
+                "select_param": summary,
+                "model_name": "proxyllm",
+            }
+            from pilot.utils import utils
+
+            loop = utils.get_or_create_event_loop()
+            summary = loop.run_until_complete(
+                llm_chat_response_nostream(
+                    ChatScene.ExtractRefineSummary.value(), **{"chat_param": chat_param}
+                )
+            )
+            outputs.append(summary)
+        return outputs, summary
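A usage sketch of the two helpers together, assuming chunk_docs and the repacked texts are prepared as in async_document_summary above:

service = KnowledgeService()
first = service._llm_extract_summary(chunk_docs[0])
outputs, final = service._refine_extract_summary(texts[1:], first)
# outputs holds every intermediate refinement; final is the last one,
# which async_document_summary persists to doc.summary.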