diff --git a/pilot/embedding_engine/markdown_embedding.py b/pilot/embedding_engine/markdown_embedding.py index 225cdad82..524872d1d 100644 --- a/pilot/embedding_engine/markdown_embedding.py +++ b/pilot/embedding_engine/markdown_embedding.py @@ -31,7 +31,7 @@ class MarkdownEmbedding(SourceEmbedding): loader = EncodeTextLoader(self.file_path) if CFG.LANGUAGE == "en": - text_splitter = CharacterTextSplitter( + text_splitter = RecursiveCharacterTextSplitter( chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=20, length_function=len, diff --git a/pilot/embedding_engine/pdf_embedding.py b/pilot/embedding_engine/pdf_embedding.py index 3a47e115e..77ff5f2f3 100644 --- a/pilot/embedding_engine/pdf_embedding.py +++ b/pilot/embedding_engine/pdf_embedding.py @@ -4,7 +4,7 @@ from typing import List from langchain.document_loaders import PyPDFLoader from langchain.schema import Document -from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter +from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter from pilot.configs.config import Config from pilot.embedding_engine import SourceEmbedding, register @@ -34,7 +34,7 @@ class PDFEmbedding(SourceEmbedding): # chunk_overlap=100, # ) if CFG.LANGUAGE == "en": - text_splitter = CharacterTextSplitter( + text_splitter = RecursiveCharacterTextSplitter( chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=20, length_function=len, diff --git a/pilot/embedding_engine/ppt_embedding.py b/pilot/embedding_engine/ppt_embedding.py index 2ddc583f9..c8991c36f 100644 --- a/pilot/embedding_engine/ppt_embedding.py +++ b/pilot/embedding_engine/ppt_embedding.py @@ -4,7 +4,7 @@ from typing import List from langchain.document_loaders import UnstructuredPowerPointLoader from langchain.schema import Document -from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter +from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter from pilot.configs.config import Config from pilot.embedding_engine import SourceEmbedding, register @@ -32,7 +32,7 @@ class PPTEmbedding(SourceEmbedding): # chunk_overlap=200, # ) if CFG.LANGUAGE == "en": - text_splitter = CharacterTextSplitter( + text_splitter = RecursiveCharacterTextSplitter( chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=20, length_function=len, diff --git a/pilot/embedding_engine/url_embedding.py b/pilot/embedding_engine/url_embedding.py index 273b5e777..587c8d913 100644 --- a/pilot/embedding_engine/url_embedding.py +++ b/pilot/embedding_engine/url_embedding.py @@ -3,7 +3,7 @@ from typing import List from bs4 import BeautifulSoup from langchain.document_loaders import WebBaseLoader from langchain.schema import Document -from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter +from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter from pilot.configs.config import Config from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE @@ -27,7 +27,7 @@ class URLEmbedding(SourceEmbedding): """Load from url path.""" loader = WebBaseLoader(web_path=self.file_path) if CFG.LANGUAGE == "en": - text_splitter = CharacterTextSplitter( + text_splitter = RecursiveCharacterTextSplitter( chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=20, length_function=len, diff --git a/pilot/embedding_engine/word_embedding.py b/pilot/embedding_engine/word_embedding.py index efb337a9a..0f4732050 100644 --- a/pilot/embedding_engine/word_embedding.py +++ b/pilot/embedding_engine/word_embedding.py @@ -4,7 +4,7 @@ from typing import List from langchain.document_loaders import UnstructuredWordDocumentLoader from langchain.schema import Document -from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter +from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter from pilot.configs.config import Config from pilot.embedding_engine import SourceEmbedding, register @@ -26,7 +26,7 @@ class WordEmbedding(SourceEmbedding): """Load from word path.""" loader = UnstructuredWordDocumentLoader(self.file_path) if CFG.LANGUAGE == "en": - text_splitter = CharacterTextSplitter( + text_splitter = RecursiveCharacterTextSplitter( chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=20, length_function=len,