From de902448280a16fdd0b7756e904cc256280d3551 Mon Sep 17 00:00:00 2001
From: aries_ckt <916701291@qq.com>
Date: Tue, 31 Oct 2023 18:52:58 +0800
Subject: [PATCH] feat:document summary

---
 pilot/model/cluster/worker/remote_worker.py  |  2 +-
 .../chat_knowledge/refine_summary/prompt.py  | 16 +++--
 pilot/scene/chat_knowledge/summary/prompt.py | 17 +++--
 pilot/server/knowledge/service.py            | 70 ++++++++++---------
 4 files changed, 58 insertions(+), 47 deletions(-)

diff --git a/pilot/model/cluster/worker/remote_worker.py b/pilot/model/cluster/worker/remote_worker.py
index f974ba714..149f8b86a 100644
--- a/pilot/model/cluster/worker/remote_worker.py
+++ b/pilot/model/cluster/worker/remote_worker.py
@@ -13,7 +13,7 @@ class RemoteModelWorker(ModelWorker):
     def __init__(self) -> None:
         self.headers = {}
         # TODO Configured by ModelParameters
-        self.timeout = 180
+        self.timeout = 360
         self.host = None
         self.port = None
diff --git a/pilot/scene/chat_knowledge/refine_summary/prompt.py b/pilot/scene/chat_knowledge/refine_summary/prompt.py
index 0161cee35..69d4e46df 100644
--- a/pilot/scene/chat_knowledge/refine_summary/prompt.py
+++ b/pilot/scene/chat_knowledge/refine_summary/prompt.py
@@ -8,19 +8,21 @@ from pilot.scene.chat_knowledge.refine_summary.out_parser import ExtractRefineSu
 
 CFG = Config()
 
-PROMPT_SCENE_DEFINE = """Your job is to produce a final summary."""
+PROMPT_SCENE_DEFINE = """"""
 
-_DEFAULT_TEMPLATE = """
-We have provided an existing summary up to a certain point: {existing_answer}\nWe have the opportunity to refine the existing summary (only if needed) with some more context below.\n------------\n{context}\n------------\nGiven the new context, refine the original summary.\nIf the context isn't useful, return the original summary.
+_DEFAULT_TEMPLATE_ZH = """根据提供的上下文信息,我们已经提供了一个到某一点的现有总结:{existing_answer}\n 我们有机会在下面提供的更多上下文信息的基础上进一步完善现有的总结(仅在需要的情况下)。请根据新的上下文信息,完善原来的总结。\n------------\n{context}\n------------\n如果上下文信息没有用处,请返回原来的总结。"""

+_DEFAULT_TEMPLATE_EN = """
+We have provided an existing summary up to a certain point: {existing_answer}\nWe have the opportunity to refine the existing summary (only if needed) with some more context below.\n------------\n{context}\n------------\nGiven the new context, refine the original summary. \nIf the context isn't useful, return the original summary. please use original language.
 """
 
+_DEFAULT_TEMPLATE = (
+    _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == "en" else _DEFAULT_TEMPLATE_ZH
+)
+
 PROMPT_RESPONSE = """"""
-
-RESPONSE_FORMAT = """"""
-
-
 PROMPT_SEP = SeparatorStyle.SINGLE.value
 
 PROMPT_NEED_NEED_STREAM_OUT = False
diff --git a/pilot/scene/chat_knowledge/summary/prompt.py b/pilot/scene/chat_knowledge/summary/prompt.py
index cbf452c99..ec7c05c32 100644
--- a/pilot/scene/chat_knowledge/summary/prompt.py
+++ b/pilot/scene/chat_knowledge/summary/prompt.py
@@ -9,19 +9,22 @@ CFG = Config()
 
 # PROMPT_SCENE_DEFINE = """You are an expert Q&A system that is trusted around the world.\nAlways answer the query using the provided context information, and not prior knowledge.\nSome rules to follow:\n1. Never directly reference the given context in your answer.\n2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines."""
-PROMPT_SCENE_DEFINE = """Your job is to produce a final summary."""
+PROMPT_SCENE_DEFINE = """"""
 
-# _DEFAULT_TEMPLATE = """
-# Context information from multiple sources is below.\n---------------------\n
-# {context}
-# Given the information from multiple sources and not prior knowledge, answer the query.\nQuery: Describe what the provided text is about. Also describe some of the questions that this text can answer. \nAnswer: "
-# """
+_DEFAULT_TEMPLATE_ZH = """请根据提供的上下文信息的进行简洁地总结:
+{context}
+"""
 
-_DEFAULT_TEMPLATE = """
+_DEFAULT_TEMPLATE_EN = """
 Write a concise summary of the following context:
 {context}
 please use original language.
 """
+
+_DEFAULT_TEMPLATE = (
+    _DEFAULT_TEMPLATE_EN if CFG.LANGUAGE == "en" else _DEFAULT_TEMPLATE_ZH
+)
+
 PROMPT_RESPONSE = """"""
diff --git a/pilot/server/knowledge/service.py b/pilot/server/knowledge/service.py
index 4db3d6c51..d7fb476d7 100644
--- a/pilot/server/knowledge/service.py
+++ b/pilot/server/knowledge/service.py
@@ -429,19 +429,22 @@ class KnowledgeService:
         from llama_index import PromptHelper
         from llama_index.prompts.default_prompt_selectors import DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
         texts = [doc.page_content for doc in chunk_docs]
-        prompt_helper = PromptHelper()
+        prompt_helper = PromptHelper(context_window=2500)
+        texts = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=texts)
         logger.info(
             f"async_document_summary, doc:{doc.doc_name}, chunk_size:{len(texts)}, begin generate summary"
         )
-        summary = self._llm_extract_summary(texts[0])
-        # summaries = self._mapreduce_extract_summary(texts)
-        outputs, summary = self._refine_extract_summary(texts[1:], summary)
-        print(
-            f"refine summary outputs:{outputs}"
-        )
-        summaries = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=outputs)
-        summary = self._llm_extract_summary("|".join(summaries))
+        # summary = self._llm_extract_summary(texts[0])
+        summary = self._mapreduce_extract_summary(texts)
+        # summaries = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=summaries)
+        # if (len(summaries)) > 1:
+        #     outputs, summary = self._refine_extract_summary(summaries[1:], summaries[0])
+        # else:
+        #     summary = self._llm_extract_summary("\n".join(summaries))
+        # print(
+        #     f"refine summary outputs:{summaries}"
+        # )
         print(
             f"final summary:{summary}"
         )
@@ -565,33 +568,36 @@ class KnowledgeService:
         return outputs, summary
 
     def _mapreduce_extract_summary(self, docs):
-        """Extract mapreduce summary by llm"""
+        """Extract mapreduce summary by llm
+        map -> multi thread generate summary
+        reduce -> merge the summaries by map process
+        Args:
+            docs:List[str]
+        """
         from pilot.scene.base import ChatScene
         from pilot.common.chat_util import llm_chat_response_nostream
         import uuid
 
-        outputs = []
         tasks = []
-        for doc in docs:
-            chat_param = {
-                "chat_session_id": uuid.uuid1(),
-                "current_user_input": doc,
-                "select_param": "summary",
-                "model_name": CFG.LLM_MODEL,
-            }
-            tasks.append(llm_chat_response_nostream(
+        if len(docs) == 1:
+            summary = self._llm_extract_summary(doc=docs[0])
+            return summary
+        else:
+            for doc in docs:
+                chat_param = {
+                    "chat_session_id": uuid.uuid1(),
+                    "current_user_input": doc,
+                    "select_param": "summary",
+                    "model_name": CFG.LLM_MODEL,
+                }
+                tasks.append(llm_chat_response_nostream(
                 ChatScene.ExtractSummary.value(), **{"chat_param": chat_param}
-            ))
-        from pilot.common.chat_util import run_async_tasks
-        summary_iters = run_async_tasks(tasks)
-        summary = self._llm_extract_summary(" ".join(summary_iters))
-        # from pilot.utils import utils
-        # loop = utils.get_or_create_event_loop()
-        # summary = loop.run_until_complete(
-        #     llm_chat_response_nostream(
-        #         ChatScene.ExtractRefineSummary.value(), **{"chat_param": chat_param}
-        #     )
-        # )
-        # outputs.append(summary)
-        return summary
+                ))
+            from pilot.common.chat_util import run_async_tasks
+            summary_iters = run_async_tasks(tasks)
+            from pilot.common.prompt_util import PromptHelper
+            from llama_index.prompts.default_prompt_selectors import DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
+            prompt_helper = PromptHelper(context_window=2500)
+            summary_iters = prompt_helper.repack(prompt=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL, text_chunks=summary_iters)
+            return self._mapreduce_extract_summary(summary_iters)