mirror of https://github.com/csunny/DB-GPT.git, synced 2025-08-04 18:10:02 +00:00

feat:add summary

commit dca3ddb931 (parent 07ad8fac67)
@@ -1,4 +1,5 @@
 import asyncio
+from typing import Coroutine, List, Any

 from starlette.responses import StreamingResponse
@@ -18,3 +19,37 @@ async def llm_chat_response_nostream(chat_scene: str, **chat_param):
 async def llm_chat_response(chat_scene: str, **chat_param):
     chat: BaseChat = chat_factory.get_implementation(chat_scene, **chat_param)
     return chat.stream_call()
+
+
+def run_async_tasks(
+    tasks: List[Coroutine],
+    show_progress: bool = False,
+    progress_bar_desc: str = "Running async tasks",
+) -> List[Any]:
+    """Run a list of async tasks."""
+
+    tasks_to_execute: List[Any] = tasks
+    if show_progress:
+        try:
+            import nest_asyncio
+            from tqdm.asyncio import tqdm
+
+            nest_asyncio.apply()
+            loop = asyncio.get_event_loop()
+
+            async def _tqdm_gather() -> List[Any]:
+                return await tqdm.gather(*tasks_to_execute, desc=progress_bar_desc)
+
+            tqdm_outputs: List[Any] = loop.run_until_complete(_tqdm_gather())
+            return tqdm_outputs
+        # run the operation w/o tqdm on hitting a fatal
+        # may occur in some environments where tqdm.asyncio
+        # is not supported
+        except Exception:
+            pass
+
+    async def _gather() -> List[Any]:
+        return await asyncio.gather(*tasks_to_execute)
+
+    outputs: List[Any] = asyncio.run(_gather())
+    return outputs
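For reference, here is a minimal usage sketch of the run_async_tasks helper added above. It assumes the repository's pilot package is importable (the pilot.common.chat_util module path matches the imports used later in this commit); the coroutine is a placeholder stand-in for the llm_chat_response_nostream calls the knowledge service actually gathers.

import asyncio
from typing import List

from pilot.common.chat_util import run_async_tasks  # module path taken from this commit's imports


async def _fake_llm_summary(chunk: str) -> str:
    # placeholder coroutine standing in for llm_chat_response_nostream(...)
    await asyncio.sleep(0)
    return f"summary of: {chunk}"


def demo() -> List[str]:
    tasks = [_fake_llm_summary(c) for c in ["chunk one", "chunk two", "chunk three"]]
    # Results come back in task order; with show_progress=True the helper tries
    # tqdm.asyncio and silently falls back to plain asyncio.gather on any failure.
    return run_async_tasks(tasks)


if __name__ == "__main__":
    print(demo())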
pilot/scene/chat_knowledge/refine_summary/chat.py  (new file, 37 lines)
@@ -0,0 +1,37 @@
from typing import Dict

from pilot.scene.base_chat import BaseChat
from pilot.scene.base import ChatScene
from pilot.configs.config import Config

from pilot.scene.chat_knowledge.refine_summary.prompt import prompt

CFG = Config()


class ExtractRefineSummary(BaseChat):
    chat_scene: str = ChatScene.ExtractRefineSummary.value()

    """get summary by llm"""

    def __init__(self, chat_param: Dict):
        """ """
        chat_param["chat_mode"] = ChatScene.ExtractRefineSummary
        super().__init__(
            chat_param=chat_param,
        )

        self.user_input = chat_param["current_user_input"]
        self.existing_answer = chat_param["select_param"]
        # self.extract_mode = chat_param["select_param"]

    def generate_input_values(self):
        input_values = {
            "context": self.user_input,
            "existing_answer": self.existing_answer,
        }
        return input_values

    @property
    def chat_type(self) -> str:
        return ChatScene.ExtractRefineSummary.value
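A sketch of the chat_param dictionary this scene consumes, limited to the keys read in __init__ above; the values are illustrative placeholders only, and chat_mode does not need to be supplied because __init__ sets it.

import uuid

chat_param = {
    "chat_session_id": uuid.uuid1(),
    # read as self.user_input and rendered into the {context} prompt variable
    "current_user_input": "New chunk of document text to fold into the summary...",
    # read as self.existing_answer and rendered into {existing_answer}
    "select_param": "The summary accumulated from earlier chunks.",
    "model_name": "proxyllm",  # illustrative; KnowledgeService passes CFG.LLM_MODEL
}
# chat_param["chat_mode"] is set to ChatScene.ExtractRefineSummary inside __init__.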
pilot/scene/chat_knowledge/refine_summary/out_parser.py  (new file, 57 lines)
@@ -0,0 +1,57 @@
import json
import logging
import re
from typing import List, Tuple

from pilot.out_parser.base import BaseOutputParser, T
from pilot.configs.config import Config

CFG = Config()


logger = logging.getLogger(__name__)


class ExtractRefineSummaryParser(BaseOutputParser):
    def __init__(self, sep: str, is_stream_out: bool):
        super().__init__(sep=sep, is_stream_out=is_stream_out)

    def parse_prompt_response(
        self, response, max_length: int = 128
    ) -> List[Tuple[str, str, str]]:
        # clean_str = super().parse_prompt_response(response)
        print("clean prompt response:", response)

        # if response.startswith("Triplets:"):
        #     response = response[len("Triplets:") :]
        # pattern = r"\([^()]+\)"
        # response = re.findall(pattern, response)
        # # response = response.strip().split("\n")
        # print("parse prompt response:", response)
        # results = []
        # for text in response:
        #     if not text or text[0] != "(" or text[-1] != ")":
        #         # skip empty lines and non-triplets
        #         continue
        #     tokens = text[1:-1].split(",")
        #     if len(tokens) != 3:
        #         continue
        #
        #     if any(len(s.encode("utf-8")) > max_length for s in tokens):
        #         # We count byte-length instead of len() for UTF-8 chars,
        #         # will skip if any of the tokens are too long.
        #         # This is normally due to a poorly formatted triplet
        #         # extraction, in more serious KG building cases
        #         # we'll need NLP models to better extract triplets.
        #         continue
        #
        #     subject, predicate, obj = map(str.strip, tokens)
        #     if not subject or not predicate or not obj:
        #         # skip partial triplets
        #         continue
        #     results.append((subject.lower(), predicate.lower(), obj.lower()))
        return response

    def parse_view_response(self, speak, data) -> str:
        ### tool out data to table view
        return data
pilot/scene/chat_knowledge/refine_summary/prompt.py  (new file, 40 lines)
@@ -0,0 +1,40 @@
from pilot.prompts.prompt_new import PromptTemplate
from pilot.configs.config import Config
from pilot.scene.base import ChatScene
from pilot.common.schema import SeparatorStyle

from pilot.scene.chat_knowledge.refine_summary.out_parser import ExtractRefineSummaryParser

CFG = Config()


PROMPT_SCENE_DEFINE = """Your job is to produce a final summary."""

_DEFAULT_TEMPLATE = """
We have provided an existing summary up to a certain point: {existing_answer}\nWe have the opportunity to refine the existing summary (only if needed) with some more context below.\n------------\n{context}\n------------\nGiven the new context, refine the original summary.\nIf the context isn't useful, return the original summary.

please use original language.
"""
PROMPT_RESPONSE = """"""


RESPONSE_FORMAT = """"""


PROMPT_SEP = SeparatorStyle.SINGLE.value

PROMPT_NEED_NEED_STREAM_OUT = False

prompt = PromptTemplate(
    template_scene=ChatScene.ExtractRefineSummary.value(),
    input_variables=["existing_answer", "context"],
    response_format="",
    template_define=PROMPT_SCENE_DEFINE,
    template=_DEFAULT_TEMPLATE + PROMPT_RESPONSE,
    stream_out=PROMPT_NEED_NEED_STREAM_OUT,
    output_parser=ExtractRefineSummaryParser(
        sep=PROMPT_SEP, is_stream_out=PROMPT_NEED_NEED_STREAM_OUT
    ),
)

CFG.prompt_template_registry.register(prompt, is_default=True)
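To make the refine step concrete, the following small sketch renders the template above with plain {name} substitution, which is assumed here to approximate how PromptTemplate fills its input_variables; the sample values are illustrative only.

refine_template = (
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary (only if needed) "
    "with some more context below.\n------------\n{context}\n------------\n"
    "Given the new context, refine the original summary.\n"
    "If the context isn't useful, return the original summary.\n"
    "please use original language."
)

print(refine_template.format(
    existing_answer="DB-GPT ships a knowledge-base module.",
    context="This commit adds LLM-based document summarization to that module.",
))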
pilot/scene/chat_knowledge/summary/__init__.py  (new file, 0 lines)

pilot/scene/chat_knowledge/summary/chat.py  (new file, 35 lines)
@@ -0,0 +1,35 @@
from typing import Dict

from pilot.scene.base_chat import BaseChat
from pilot.scene.base import ChatScene
from pilot.configs.config import Config

from pilot.scene.chat_knowledge.summary.prompt import prompt

CFG = Config()


class ExtractSummary(BaseChat):
    chat_scene: str = ChatScene.ExtractSummary.value()

    """get summary by llm"""

    def __init__(self, chat_param: Dict):
        """ """
        chat_param["chat_mode"] = ChatScene.ExtractSummary
        super().__init__(
            chat_param=chat_param,
        )

        self.user_input = chat_param["current_user_input"]
        # self.extract_mode = chat_param["select_param"]

    def generate_input_values(self):
        input_values = {
            "context": self.user_input,
        }
        return input_values

    @property
    def chat_type(self) -> str:
        return ChatScene.ExtractSummary.value
pilot/scene/chat_knowledge/summary/out_parser.py  (new file, 28 lines)
@@ -0,0 +1,28 @@
import json
import logging
import re
from typing import List, Tuple

from pilot.out_parser.base import BaseOutputParser, T
from pilot.configs.config import Config

CFG = Config()


logger = logging.getLogger(__name__)


class ExtractSummaryParser(BaseOutputParser):
    def __init__(self, sep: str, is_stream_out: bool):
        super().__init__(sep=sep, is_stream_out=is_stream_out)

    def parse_prompt_response(
        self, response, max_length: int = 128
    ) -> List[Tuple[str, str, str]]:
        # clean_str = super().parse_prompt_response(response)
        print("clean prompt response:", response)
        return response

    def parse_view_response(self, speak, data) -> str:
        ### tool out data to table view
        return data
pilot/scene/chat_knowledge/summary/prompt.py  (new file, 47 lines)
@@ -0,0 +1,47 @@
from pilot.prompts.prompt_new import PromptTemplate
from pilot.configs.config import Config
from pilot.scene.base import ChatScene
from pilot.common.schema import SeparatorStyle

from pilot.scene.chat_knowledge.summary.out_parser import ExtractSummaryParser

CFG = Config()

# PROMPT_SCENE_DEFINE = """You are an expert Q&A system that is trusted around the world.\nAlways answer the query using the provided context information, and not prior knowledge.\nSome rules to follow:\n1. Never directly reference the given context in your answer.\n2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines."""

PROMPT_SCENE_DEFINE = """Your job is to produce a final summary."""

# _DEFAULT_TEMPLATE = """
# Context information from multiple sources is below.\n---------------------\n
# {context}
# Given the information from multiple sources and not prior knowledge, answer the query.\nQuery: Describe what the provided text is about. Also describe some of the questions that this text can answer. \nAnswer: "
# """

_DEFAULT_TEMPLATE = """
Write a concise summary of the following context:
{context}
please use original language.
"""
PROMPT_RESPONSE = """"""


RESPONSE_FORMAT = """"""


PROMPT_SEP = SeparatorStyle.SINGLE.value

PROMPT_NEED_NEED_STREAM_OUT = False

prompt = PromptTemplate(
    template_scene=ChatScene.ExtractSummary.value(),
    input_variables=["context"],
    response_format="",
    template_define=PROMPT_SCENE_DEFINE,
    template=_DEFAULT_TEMPLATE + PROMPT_RESPONSE,
    stream_out=PROMPT_NEED_NEED_STREAM_OUT,
    output_parser=ExtractSummaryParser(
        sep=PROMPT_SEP, is_stream_out=PROMPT_NEED_NEED_STREAM_OUT
    ),
)

CFG.prompt_template_registry.register(prompt, is_default=True)
@@ -288,7 +288,7 @@ class KnowledgeService:
         executor = CFG.SYSTEM_APP.get_component(
             ComponentType.EXECUTOR_DEFAULT, ExecutorFactory
         ).create()
-        executor.submit(self.async_document_summary, chunk_docs, doc)
+        # executor.submit(self.async_document_summary, chunk_docs, doc)
         executor.submit(self.async_doc_embedding, client, chunk_docs, doc)
         logger.info(f"begin save document chunks, doc:{doc.doc_name}")
         # save chunk details
@@ -431,7 +431,8 @@ class KnowledgeService:
         texts = [doc.page_content for doc in chunk_docs]
         prompt_helper = PromptHelper()
         texts = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=texts)
-        summary = self._llm_extract_summary(chunk_docs[0])
+        summary = self._llm_extract_summary(texts[0])
+        # summaries = self._mapreduce_extract_summary(texts)
         outputs, summary = self._refine_extract_summary(texts[1:], summary)
         logger.info(
             f"async_document_summary, doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin embedding to graph store"
@@ -452,6 +453,7 @@ class KnowledgeService:
         )
         try:
             vector_ids = client.knowledge_embedding_batch(chunk_docs)
+            self.async_document_summary(chunk_docs, doc)
             doc.status = SyncStatus.FINISHED.name
             doc.result = "document embedding success"
             if vector_ids is not None:
@@ -512,9 +514,9 @@ class KnowledgeService:

         chat_param = {
             "chat_session_id": uuid.uuid1(),
-            "current_user_input": doc.page_content,
+            "current_user_input": doc,
             "select_param": "summary",
-            "model_name": "proxyllm",
+            "model_name": CFG.LLM_MODEL,
         }
         from pilot.utils import utils
         loop = utils.get_or_create_event_loop()
@@ -535,7 +537,7 @@ class KnowledgeService:
             "chat_session_id": uuid.uuid1(),
             "current_user_input": doc,
             "select_param": summary,
-            "model_name": "proxyllm",
+            "model_name": CFG.LLM_MODEL,
         }
         from pilot.utils import utils
         loop = utils.get_or_create_event_loop()
@@ -547,4 +549,33 @@ class KnowledgeService:
             outputs.append(summary)
         return outputs, summary
+
+    def _mapreduce_extract_summary(self, docs):
+        """Extract mapreduce summary by llm"""
+        from pilot.scene.base import ChatScene
+        from pilot.common.chat_util import llm_chat_response_nostream
+        import uuid
+        outputs = []
+        tasks = []
+        for doc in docs:
+            chat_param = {
+                "chat_session_id": uuid.uuid1(),
+                "current_user_input": doc,
+                "select_param": "summary",
+                "model_name": CFG.LLM_MODEL,
+            }
+            tasks.append(llm_chat_response_nostream(
+                ChatScene.ExtractSummary.value(), **{"chat_param": chat_param}
+            ))
+        from pilot.common.chat_util import run_async_tasks
+        summaries = run_async_tasks(tasks)
+        # from pilot.utils import utils
+        # loop = utils.get_or_create_event_loop()
+        # summary = loop.run_until_complete(
+        #     llm_chat_response_nostream(
+        #         ChatScene.ExtractRefineSummary.value(), **{"chat_param": chat_param}
+        #     )
+        # )
+        # outputs.append(summary)
+        return summaries
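Taken together, the hunks above move the summary work out of the executor submission and into async_doc_embedding, which now calls async_document_summary after the vector embedding succeeds. The condensed sketch below only restates the flow those hunks produce (method body simplified; persistence, error handling, and the graph-store embedding step are omitted); names come from the diff.

def async_document_summary(self, chunk_docs, doc):
    texts = [d.page_content for d in chunk_docs]
    # repack chunks so each piece fits the summarize prompt's context budget
    prompt_helper = PromptHelper()
    texts = prompt_helper.repack(
        prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=texts
    )
    # 1) concise summary of the first chunk via the new ExtractSummary scene
    summary = self._llm_extract_summary(texts[0])
    # 2) fold the remaining chunks in sequentially via ExtractRefineSummary
    outputs, summary = self._refine_extract_summary(texts[1:], summary)
    # alternative left commented out in the commit: summarize all chunks
    # concurrently with _mapreduce_extract_summary and run_async_tasks
    # summaries = self._mapreduce_extract_summary(texts)
    logger.info(
        f"async_document_summary, doc:{doc.doc_name}, chunk_size:{len(chunk_docs)}, begin embedding to graph store"
    )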