mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-04 10:00:17 +00:00
fix: chunk using RecursiveCharacterTextSplitter
This commit is contained in:
parent
71f386d0b9
commit
e37ecf262e
@@ -31,7 +31,7 @@ class MarkdownEmbedding(SourceEmbedding):
|
|||||||
loader = EncodeTextLoader(self.file_path)
|
loader = EncodeTextLoader(self.file_path)
|
||||||
|
|
||||||
if CFG.LANGUAGE == "en":
|
if CFG.LANGUAGE == "en":
|
||||||
text_splitter = CharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
chunk_overlap=20,
|
chunk_overlap=20,
|
||||||
length_function=len,
|
length_function=len,
|
||||||
|
@@ -4,7 +4,7 @@ from typing import List
|
|||||||
|
|
||||||
from langchain.document_loaders import PyPDFLoader
|
from langchain.document_loaders import PyPDFLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
@@ -34,7 +34,7 @@ class PDFEmbedding(SourceEmbedding):
|
|||||||
# chunk_overlap=100,
|
# chunk_overlap=100,
|
||||||
# )
|
# )
|
||||||
if CFG.LANGUAGE == "en":
|
if CFG.LANGUAGE == "en":
|
||||||
text_splitter = CharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
chunk_overlap=20,
|
chunk_overlap=20,
|
||||||
length_function=len,
|
length_function=len,
|
||||||
|
@@ -4,7 +4,7 @@ from typing import List
|
|||||||
|
|
||||||
from langchain.document_loaders import UnstructuredPowerPointLoader
|
from langchain.document_loaders import UnstructuredPowerPointLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
@@ -32,7 +32,7 @@ class PPTEmbedding(SourceEmbedding):
|
|||||||
# chunk_overlap=200,
|
# chunk_overlap=200,
|
||||||
# )
|
# )
|
||||||
if CFG.LANGUAGE == "en":
|
if CFG.LANGUAGE == "en":
|
||||||
text_splitter = CharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
chunk_overlap=20,
|
chunk_overlap=20,
|
||||||
length_function=len,
|
length_function=len,
|
||||||
|
@@ -3,7 +3,7 @@ from typing import List
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from langchain.document_loaders import WebBaseLoader
|
from langchain.document_loaders import WebBaseLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
|
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
|
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
|
||||||
@@ -27,7 +27,7 @@ class URLEmbedding(SourceEmbedding):
|
|||||||
"""Load from url path."""
|
"""Load from url path."""
|
||||||
loader = WebBaseLoader(web_path=self.file_path)
|
loader = WebBaseLoader(web_path=self.file_path)
|
||||||
if CFG.LANGUAGE == "en":
|
if CFG.LANGUAGE == "en":
|
||||||
text_splitter = CharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
chunk_overlap=20,
|
chunk_overlap=20,
|
||||||
length_function=len,
|
length_function=len,
|
||||||
|
@@ -4,7 +4,7 @@ from typing import List
|
|||||||
|
|
||||||
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
|
from langchain.text_splitter import SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
@@ -26,7 +26,7 @@ class WordEmbedding(SourceEmbedding):
|
|||||||
"""Load from word path."""
|
"""Load from word path."""
|
||||||
loader = UnstructuredWordDocumentLoader(self.file_path)
|
loader = UnstructuredWordDocumentLoader(self.file_path)
|
||||||
if CFG.LANGUAGE == "en":
|
if CFG.LANGUAGE == "en":
|
||||||
text_splitter = CharacterTextSplitter(
|
text_splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
chunk_overlap=20,
|
chunk_overlap=20,
|
||||||
length_function=len,
|
length_function=len,
|
||||||
|
Loading…
Reference in New Issue
Block a user