feat:document summary

Author: aries_ckt
Date:   2023-10-31 18:52:58 +08:00
Commit: de90244828 (parent 04dcd90502)
4 changed files with 58 additions and 47 deletions

View File

@@ -13,7 +13,7 @@ class RemoteModelWorker(ModelWorker):
     def __init__(self) -> None:
         self.headers = {}
         # TODO Configured by ModelParameters
-        self.timeout = 180
+        self.timeout = 360
         self.host = None
         self.port = None
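Only change here: the hard-coded remote-worker timeout is doubled from 180 to 360 seconds; the TODO about sourcing it from ModelParameters is still open. A minimal sketch of that TODO, using a hypothetical ModelParameters stand-in rather than the project's real class:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ModelParameters:
        # Hypothetical stand-in; the project's real ModelParameters may differ.
        timeout: int = 360  # seconds to wait on the remote model worker

    class RemoteModelWorker:
        def __init__(self, params: Optional[ModelParameters] = None) -> None:
            params = params or ModelParameters()
            self.headers = {}
            # Read the timeout from the parameter object instead of hard-coding it.
            self.timeout = params.timeout
            self.host = None
            self.port = None

    print(RemoteModelWorker(ModelParameters(timeout=600)).timeout)  # 600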

View File

@@ -8,19 +8,21 @@ from pilot.scene.chat_knowledge.refine_summary.out_parser import ExtractRefineSu
 CFG = Config()
-PROMPT_SCENE_DEFINE = """Your job is to produce a final summary."""
-_DEFAULT_TEMPLATE = """
+PROMPT_SCENE_DEFINE = """"""
+_DEFAULT_TEMPLATE_ZH = """根据提供的上下文信息,我们已经提供了一个到某一点的现有总结:{existing_answer}\n 我们有机会在下面提供的更多上下文信息的基础上进一步完善现有的总结(仅在需要的情况下)。请根据新的上下文信息,完善原来的总结。\n------------\n{context}\n------------\n如果上下文信息没有用处,请返回原来的总结。"""
+_DEFAULT_TEMPLATE_EN = """
 We have provided an existing summary up to a certain point: {existing_answer}\nWe have the opportunity to refine the existing summary (only if needed) with some more context below.\n------------\n{context}\n------------\nGiven the new context, refine the original summary. \nIf the context isn't useful, return the original summary.
 please use original language.
 """
+_DEFAULT_TEMPLATE = (
+    _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == "en" else _DEFAULT_TEMPLATE_ZH
+)
 PROMPT_RESPONSE = """"""
-RESPONSE_FORMAT = """"""
 PROMPT_SEP = SeparatorStyle.SINGLE.value
 PROMPT_NEED_NEED_STREAM_OUT = False
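The refine template is consumed once per chunk: the running summary fills {existing_answer} and the next chunk fills {context}. A self-contained sketch of a single refine step, assuming plain str.format substitution (the chat-scene machinery may bind these values differently):

    # Trimmed copy of the EN refine template from the hunk above.
    _DEFAULT_TEMPLATE_EN = (
        "We have provided an existing summary up to a certain point: {existing_answer}\n"
        "We have the opportunity to refine the existing summary (only if needed) "
        "with some more context below.\n"
        "------------\n{context}\n------------\n"
        "Given the new context, refine the original summary.\n"
        "If the context isn't useful, return the original summary."
    )

    existing_answer = "The service builds document summaries with an LLM."
    context = "It now repacks chunks and recursively merges partial summaries."

    # One refine step: feed the running summary plus one new chunk back in.
    prompt = _DEFAULT_TEMPLATE_EN.format(
        existing_answer=existing_answer, context=context
    )
    print(prompt)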

View File

@@ -9,19 +9,22 @@ CFG = Config()
 # PROMPT_SCENE_DEFINE = """You are an expert Q&A system that is trusted around the world.\nAlways answer the query using the provided context information, and not prior knowledge.\nSome rules to follow:\n1. Never directly reference the given context in your answer.\n2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines."""
-PROMPT_SCENE_DEFINE = """Your job is to produce a final summary."""
-# _DEFAULT_TEMPLATE = """
-# Context information from multiple sources is below.\n---------------------\n
-# {context}
-# Given the information from multiple sources and not prior knowledge, answer the query.\nQuery: Describe what the provided text is about. Also describe some of the questions that this text can answer. \nAnswer: "
-# """
-_DEFAULT_TEMPLATE = """
+PROMPT_SCENE_DEFINE = """"""
+_DEFAULT_TEMPLATE_ZH = """请根据提供的上下文信息的进行简洁地总结:
+{context}
+"""
+_DEFAULT_TEMPLATE_EN = """
 Write a concise summary of the following context:
 {context}
 please use original language.
 """
+_DEFAULT_TEMPLATE = (
+    _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == "en" else _DEFAULT_TEMPLATE_ZH
+)
 PROMPT_RESPONSE = """"""
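Both prompt files now share the same pattern: define ZH and EN templates and pick one at import time from CFG.LANGUAGE. A runnable sketch of that pattern with a stand-in Config (pilot's real Config resolves LANGUAGE from the environment):

    class Config:
        # Stand-in for pilot.configs.config.Config; only the attribute we need.
        LANGUAGE = "en"

    CFG = Config()

    _DEFAULT_TEMPLATE_ZH = "请根据提供的上下文信息进行简洁的总结:\n{context}\n"
    _DEFAULT_TEMPLATE_EN = "Write a concise summary of the following context:\n{context}\n"

    # Selected once at module import, exactly as the diff does.
    _DEFAULT_TEMPLATE = (
        _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == "en" else _DEFAULT_TEMPLATE_ZH
    )

    print(_DEFAULT_TEMPLATE.format(context="DB-GPT document summary feature."))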

View File

@@ -429,19 +429,22 @@ class KnowledgeService:
         from llama_index import PromptHelper
         from llama_index.prompts.default_prompt_selectors import DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
         texts = [doc.page_content for doc in chunk_docs]
-        prompt_helper = PromptHelper()
+        prompt_helper = PromptHelper(context_window=2500)
         texts = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=texts)
         logger.info(
             f"async_document_summary, doc:{doc.doc_name}, chunk_size:{len(texts)}, begin generate summary"
         )
-        summary = self._llm_extract_summary(texts[0])
-        # summaries = self._mapreduce_extract_summary(texts)
-        outputs, summary = self._refine_extract_summary(texts[1:], summary)
-        print(
-            f"refine summary outputs:{outputs}"
-        )
-        summaries = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=outputs)
-        summary = self._llm_extract_summary("|".join(summaries))
+        # summary = self._llm_extract_summary(texts[0])
+        summary = self._mapreduce_extract_summary(texts)
+        # summaries = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=summaries)
+        # if (len(summaries)) > 1:
+        #     outputs, summary = self._refine_extract_summary(summaries[1:], summaries[0])
+        # else:
+        #     summary = self._llm_extract_summary("\n".join(summaries))
+        # print(
+        #     f"refine summary outputs:{summaries}"
+        # )
         print(
             f"final summary:{summary}"
         )
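The switch from refine to map-reduce leans on PromptHelper.repack, which merges the split chunks back into the largest pieces that fit the stated context window (2500 here), minimizing LLM calls. A simplified character-based illustration of the merging behavior; llama_index's real repack counts tokens and reserves room for the prompt itself:

    from typing import List

    def repack(text_chunks: List[str], max_chars: int = 2500) -> List[str]:
        """Greedily merge adjacent chunks until the next one would overflow."""
        packed: List[str] = []
        current = ""
        for chunk in text_chunks:
            if current and len(current) + len(chunk) + 1 > max_chars:
                packed.append(current)
                current = chunk
            else:
                current = f"{current}\n{chunk}" if current else chunk
        if current:
            packed.append(current)
        return packed

    chunks = [f"paragraph {i} " * 40 for i in range(12)]
    print([len(c) for c in repack(chunks)])  # a handful of ~2500-char chunks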
@@ -565,12 +568,20 @@ class KnowledgeService:
         return outputs, summary

     def _mapreduce_extract_summary(self, docs):
-        """Extract mapreduce summary by llm"""
+        """Extract mapreduce summary by llm
+        map -> multi thread generate summary
+        reduce -> merge the summaries by map process
+        Args:
+            docs:List[str]
+        """
         from pilot.scene.base import ChatScene
         from pilot.common.chat_util import llm_chat_response_nostream
         import uuid
-        outputs = []
         tasks = []
-        for doc in docs:
-            chat_param = {
-                "chat_session_id": uuid.uuid1(),
+        if len(docs) == 1:
+            summary = self._llm_extract_summary(doc=docs[0])
+            return summary
+        else:
+            for doc in docs:
+                chat_param = {
+                    "chat_session_id": uuid.uuid1(),
@@ -583,15 +594,10 @@ class KnowledgeService:
                 ))
         from pilot.common.chat_util import run_async_tasks
         summary_iters = run_async_tasks(tasks)
-        summary = self._llm_extract_summary(" ".join(summary_iters))
-        # from pilot.utils import utils
-        # loop = utils.get_or_create_event_loop()
-        # summary = loop.run_until_complete(
-        #     llm_chat_response_nostream(
-        #         ChatScene.ExtractRefineSummary.value(), **{"chat_param": chat_param}
-        #     )
-        # )
-        # outputs.append(summary)
-        return summary
+        from pilot.common.prompt_util import PromptHelper
+        from llama_index.prompts.default_prompt_selectors import DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
+        prompt_helper = PromptHelper(context_window=2500)
+        summary_iters = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=summary_iters)
+        return self._mapreduce_extract_summary(summary_iters)
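The rewritten _mapreduce_extract_summary recurses: map every chunk to a partial summary, repack the partials into fewer chunks, and call itself until one chunk remains. A synchronous toy version of that control flow, with a stubbed summarize() standing in for the async LLM tasks:

    from typing import List

    def summarize(text: str) -> str:
        # Stub for the per-chunk LLM call (the map step).
        return text[:60]

    def mapreduce_extract_summary(docs: List[str], fan_in: int = 3) -> str:
        if not docs:
            return ""
        # Base case: a single chunk is summarized directly.
        if len(docs) == 1:
            return summarize(docs[0])
        # Map: summarize each chunk (the real code fans these out as async tasks).
        partials = [summarize(d) for d in docs]
        # Reduce: repack partial summaries into fewer chunks, then recurse.
        grouped = [
            "\n".join(partials[i : i + fan_in])
            for i in range(0, len(partials), fan_in)
        ]
        return mapreduce_extract_summary(grouped)

    print(mapreduce_extract_summary([f"chunk {i}: some source text" for i in range(7)]))

Each recursion level shrinks the chunk count by roughly the fan-in factor, so the depth is logarithmic in the number of chunks; the repack step is what keeps each reduce call within the context window.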