fix: url embedding

This commit is contained in:
aries-ckt
2023-06-01 22:07:33 +08:00
parent 8dd25815e1
commit 1d432e4d29
3 changed files with 18 additions and 8 deletions

View File

@@ -53,9 +53,11 @@ class BaseOutputParser(ABC):
"""
if data["error_code"] == 0:
if "vicuna" in CFG.LLM_MODEL:
output = data["text"][skip_echo_len + 11:].strip()
# output = data["text"][skip_echo_len + 11:].strip()
output = data["text"][skip_echo_len:].strip()
elif "guanaco" in CFG.LLM_MODEL:
output = data["text"][skip_echo_len + 14:].replace("<s>", "").strip()
# output = data["text"][skip_echo_len + 14:].replace("<s>", "").strip()
output = data["text"][skip_echo_len:].replace("<s>", "").strip()
else:
output = data["text"].strip()

View File

@@ -11,7 +11,7 @@ from pilot.scene.chat_normal.out_parser import NormalChatOutputParser
CFG = Config()
PROMPT_SCENE_DEFINE = """A chat between a curious user and an artificial intelligence assistant, who very familiar with database related knowledge.
PROMPT_SCENE_DEFINE = """A chat between a curious human and an artificial intelligence assistant, who very familiar with database related knowledge.
The assistant gives helpful, detailed, professional and polite answers to the user's questions. """

View File

@@ -5,9 +5,12 @@ from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from pilot.configs.config import Config
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
CFG = Config()
class URLEmbedding(SourceEmbedding):
"""url embedding for read url document."""
@@ -22,10 +25,15 @@ class URLEmbedding(SourceEmbedding):
def read(self):
    """Load the web page at ``self.file_path`` and split it into chunks.

    Chooses a splitter based on the configured language: a plain
    character splitter (with the shared ``KNOWLEDGE_CHUNK_SPLIT_SIZE``
    chunk size) for English, and the Chinese-aware document splitter
    otherwise.

    Returns:
        list[Document]: the split document chunks produced by
        ``load_and_split``.
    """
    loader = WebBaseLoader(web_path=self.file_path)
    if CFG.LANGUAGE == "en":
        text_splitter = CharacterTextSplitter(
            chunk_size=KNOWLEDGE_CHUNK_SPLIT_SIZE,
            chunk_overlap=20,
            length_function=len,
        )
    else:
        # Non-English (Chinese) content: sentence-aware splitting.
        # NOTE(review): pdf=True for a web page looks odd — confirm the
        # splitter's pdf flag is intentional for URL sources.
        text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
    return loader.load_and_split(text_splitter)
@register
def data_process(self, documents: List[Document]):