mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-10-11 20:03:53 +00:00
fix: chunk text using RecursiveCharacterTextSplitter
This commit is contained in:
@@ -4,7 +4,7 @@ from typing import List
|
||||
|
||||
from langchain.document_loaders import PyPDFLoader
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||
|
||||
from pilot.configs.config import Config
|
||||
from pilot.embedding_engine import SourceEmbedding, register
|
||||
@@ -34,7 +34,7 @@ class PDFEmbedding(SourceEmbedding):
|
||||
# chunk_overlap=100,
|
||||
# )
|
||||
if CFG.LANGUAGE == "en":
|
||||
text_splitter = CharacterTextSplitter(
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=20,
|
||||
length_function=len,
|
||||
|
Reference in New Issue
Block a user