doc:knowledge docs update

2025-09-02 01:27:14 +00:00 · 2023-07-12 14:28:40 +08:00
parent f85def5a52
commit 16d6ce8c89
9 changed files with 135 additions and 68 deletions
--- a/docs/modules/knowledge/markdown/markdown_embedding.md
+++ b/docs/modules/knowledge/markdown/markdown_embedding.md
@@ -6,13 +6,14 @@ inheriting the SourceEmbedding

 ```
 class  MarkdownEmbedding(SourceEmbedding):
-    """pdf embedding for read pdf document."""
+    """markdown embedding for read markdown document."""

-    def __init__(self, file_path, vector_store_config):
-        """Initialize with pdf path."""
-        super().__init__(file_path, vector_store_config)
+    def __init__(self, file_path, vector_store_config, text_splitter):
+        """Initialize with markdown path."""
+        super().__init__(file_path, vector_store_config, text_splitter)
        self.file_path = file_path
        self.vector_store_config = vector_store_config
+        self.text_splitter = text_splitter or None
 ```
 implement read() and data_process()
 The read() method allows you to read data and split it into chunks
@@ -22,12 +23,19 @@ read() method allows you to read data and split data into chunk
    def read(self):
        """Load from markdown path."""
        loader = EncodeTextLoader(self.file_path)
-        textsplitter = SpacyTextSplitter(
-            pipeline="zh_core_web_sm",
-            chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
-            chunk_overlap=100,
-        )
-        return loader.load_and_split(textsplitter)
+        if self.text_splitter is None:
+            try:
+                self.text_splitter = SpacyTextSplitter(
+                    pipeline="zh_core_web_sm",
+                    chunk_size=100,
+                    chunk_overlap=100,
+                )
+            except Exception:
+                self.text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=100, chunk_overlap=50
+                )
+
+        return loader.load_and_split(self.text_splitter)
 ```

 The data_process() method allows you to preprocess the data in your own way
--- a/docs/modules/knowledge/pdf/pdf_embedding.md
+++ b/docs/modules/knowledge/pdf/pdf_embedding.md
@@ -7,11 +7,12 @@ inheriting the SourceEmbedding
 class PDFEmbedding(SourceEmbedding):
    """pdf embedding for read pdf document."""

-    def __init__(self, file_path, vector_store_config):
+    def __init__(self, file_path, vector_store_config, text_splitter):
        """Initialize with pdf path."""
-        super().__init__(file_path, vector_store_config)
+        super().__init__(file_path, vector_store_config, text_splitter)
        self.file_path = file_path
        self.vector_store_config = vector_store_config
+        self.text_splitter = text_splitter or None
 ```

 implement read() and data_process()
@@ -21,15 +22,19 @@ read() method allows you to read data and split data into chunk
    def read(self):
        """Load from pdf path."""
        loader = PyPDFLoader(self.file_path)
-        # textsplitter = CHNDocumentSplitter(
-        #     pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
-        # )
-        textsplitter = SpacyTextSplitter(
-            pipeline="zh_core_web_sm",
-            chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
-            chunk_overlap=100,
-        )
-        return loader.load_and_split(textsplitter)
+        if self.text_splitter is None:
+            try:
+                self.text_splitter = SpacyTextSplitter(
+                    pipeline="zh_core_web_sm",
+                    chunk_size=100,
+                    chunk_overlap=100,
+                )
+            except Exception:
+                self.text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=100, chunk_overlap=50
+                )
+
+        return loader.load_and_split(self.text_splitter)
 ```
 The data_process() method allows you to preprocess the data in your own way
 ```
--- a/docs/modules/knowledge/ppt/ppt_embedding.md
+++ b/docs/modules/knowledge/ppt/ppt_embedding.md
@@ -7,11 +7,17 @@ inheriting the SourceEmbedding
 class PPTEmbedding(SourceEmbedding):
    """ppt embedding for read ppt document."""

-    def __init__(self, file_path, vector_store_config):
-        """Initialize with pdf path."""
-        super().__init__(file_path, vector_store_config)
+    def __init__(
+        self,
+        file_path,
+        vector_store_config,
+        text_splitter: Optional[TextSplitter] = None,
+    ):
+        """Initialize with ppt path."""
+        super().__init__(file_path, vector_store_config, text_splitter=text_splitter)
        self.file_path = file_path
        self.vector_store_config = vector_store_config
+        self.text_splitter = text_splitter or None
 ```

 implement read() and data_process()
@@ -21,12 +27,19 @@ read() method allows you to read data and split data into chunk
    def read(self):
        """Load from ppt path."""
        loader = UnstructuredPowerPointLoader(self.file_path)
-        textsplitter = SpacyTextSplitter(
-            pipeline="zh_core_web_sm",
-            chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
-            chunk_overlap=200,
-        )
-        return loader.load_and_split(textsplitter)
+        if self.text_splitter is None:
+            try:
+                self.text_splitter = SpacyTextSplitter(
+                    pipeline="zh_core_web_sm",
+                    chunk_size=100,
+                    chunk_overlap=100,
+                )
+            except Exception:
+                self.text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=100, chunk_overlap=50
+                )
+
+        return loader.load_and_split(self.text_splitter)
 ```
 The data_process() method allows you to preprocess the data in your own way
 ```
--- a/docs/modules/knowledge/url/url_embedding.md
+++ b/docs/modules/knowledge/url/url_embedding.md
@@ -7,11 +7,17 @@ inheriting the SourceEmbedding
 class URLEmbedding(SourceEmbedding):
    """url embedding for read url document."""

-    def __init__(self, file_path, vector_store_config):
-        """Initialize with url path."""
-        super().__init__(file_path, vector_store_config)
+    def __init__(
+        self,
+        file_path,
+        vector_store_config,
+        text_splitter: Optional[TextSplitter] = None,
+    ):
+        """Initialize with url path."""
+        super().__init__(file_path, vector_store_config, text_splitter=text_splitter)
        self.file_path = file_path
        self.vector_store_config = vector_store_config
+        self.text_splitter = text_splitter or None
 ```

 implement read() and data_process()
@@ -21,15 +27,19 @@ read() method allows you to read data and split data into chunk
    def read(self):
        """Load from url path."""
        loader = WebBaseLoader(web_path=self.file_path)
-        if CFG.LANGUAGE == "en":
-            text_splitter = CharacterTextSplitter(
-                chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
-                chunk_overlap=20,
-                length_function=len,
-            )
-        else:
-            text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
-        return loader.load_and_split(text_splitter)
+        if self.text_splitter is None:
+            try:
+                self.text_splitter = SpacyTextSplitter(
+                    pipeline="zh_core_web_sm",
+                    chunk_size=100,
+                    chunk_overlap=100,
+                )
+            except Exception:
+                self.text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=100, chunk_overlap=50
+                )
+
+        return loader.load_and_split(self.text_splitter)
 ```
 The data_process() method allows you to preprocess the data in your own way
 ```
--- a/docs/modules/knowledge/word/word_embedding.md
+++ b/docs/modules/knowledge/word/word_embedding.md
@@ -7,11 +7,12 @@ inheriting the SourceEmbedding
 class WordEmbedding(SourceEmbedding):
    """word embedding for read word document."""

-    def __init__(self, file_path, vector_store_config):
-        """Initialize with word path."""
-        super().__init__(file_path, vector_store_config)
+    def __init__(self, file_path, vector_store_config, text_splitter):
+        """Initialize with word path."""
+        super().__init__(file_path, vector_store_config, text_splitter)
        self.file_path = file_path
        self.vector_store_config = vector_store_config
+        self.text_splitter = text_splitter or None
 ```

 implement read() and data_process()
@@ -21,10 +22,19 @@ read() method allows you to read data and split data into chunk
    def read(self):
        """Load from word path."""
        loader = UnstructuredWordDocumentLoader(self.file_path)
-        textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
-        )
-        return loader.load_and_split(textsplitter)
+        if self.text_splitter is None:
+            try:
+                self.text_splitter = SpacyTextSplitter(
+                    pipeline="zh_core_web_sm",
+                    chunk_size=100,
+                    chunk_overlap=100,
+                )
+            except Exception:
+                self.text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=100, chunk_overlap=50
+                )
+
+        return loader.load_and_split(self.text_splitter)
 ```
 The data_process() method allows you to preprocess the data in your own way
 ```