fix: use RecursiveCharacterTextSplitter for chunking

This commit is contained in:
aries_ckt 2023-07-05 15:28:58 +08:00
parent 71f386d0b9
commit e37ecf262e
5 changed files with 9 additions and 9 deletions

View File

@@ -31,7 +31,7 @@ class MarkdownEmbedding(SourceEmbedding):
loader = EncodeTextLoader(self.file_path)
if CFG.LANGUAGE == "en":
text_splitter = CharacterTextSplitter(
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=20,
length_function=len,

View File

@@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register
@@ -34,7 +34,7 @@ class PDFEmbedding(SourceEmbedding):
# chunk_overlap=100,
# )
if CFG.LANGUAGE == "en":
text_splitter = CharacterTextSplitter(
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=20,
length_function=len,

View File

@@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import UnstructuredPowerPointLoader
from langchain.schema import Document
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register
@@ -32,7 +32,7 @@ class PPTEmbedding(SourceEmbedding):
# chunk_overlap=200,
# )
if CFG.LANGUAGE == "en":
text_splitter = CharacterTextSplitter(
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=20,
length_function=len,

View File

@@ -3,7 +3,7 @@ from typing import List
from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
@@ -27,7 +27,7 @@ class URLEmbedding(SourceEmbedding):
"""Load from url path."""
loader = WebBaseLoader(web_path=self.file_path)
if CFG.LANGUAGE == "en":
text_splitter = CharacterTextSplitter(
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=20,
length_function=len,

View File

@@ -4,7 +4,7 @@ from typing import List
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
from pilot.configs.config import Config
from pilot.embedding_engine import SourceEmbedding, register
@@ -26,7 +26,7 @@ class WordEmbedding(SourceEmbedding):
"""Load from word path."""
loader = UnstructuredWordDocumentLoader(self.file_path)
if CFG.LANGUAGE == "en":
text_splitter = CharacterTextSplitter(
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=20,
length_function=len,