mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-26 03:49:10 +00:00
fix: use spaCy to replace the chunk method
Use SpacyTextSplitter instead of CHNDocumentSplitter as the chunking method.
This commit is contained in:
parent
69d07c46ee
commit
24130a6097
@ -11,7 +11,6 @@ from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
|
|||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
from pilot.embedding_engine.EncodeTextLoader import EncodeTextLoader
|
from pilot.embedding_engine.EncodeTextLoader import EncodeTextLoader
|
||||||
from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter
|
|
||||||
|
|
||||||
CFG = Config()
|
CFG = Config()
|
||||||
|
|
||||||
@ -30,11 +29,7 @@ class MarkdownEmbedding(SourceEmbedding):
|
|||||||
def read(self):
|
def read(self):
|
||||||
"""Load from markdown path."""
|
"""Load from markdown path."""
|
||||||
loader = EncodeTextLoader(self.file_path)
|
loader = EncodeTextLoader(self.file_path)
|
||||||
# text_splitter = SpacyTextSplitter(
|
|
||||||
# pipeline="zh_core_web_sm",
|
|
||||||
# chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
|
||||||
# chunk_overlap=100,
|
|
||||||
# )
|
|
||||||
if CFG.LANGUAGE == "en":
|
if CFG.LANGUAGE == "en":
|
||||||
text_splitter = CharacterTextSplitter(
|
text_splitter = CharacterTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
@ -42,7 +37,11 @@ class MarkdownEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
|
text_splitter = SpacyTextSplitter(
|
||||||
|
pipeline="zh_core_web_sm",
|
||||||
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
@ -8,7 +8,6 @@ from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
|
|||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter
|
|
||||||
|
|
||||||
CFG = Config()
|
CFG = Config()
|
||||||
|
|
||||||
@ -41,7 +40,11 @@ class PDFEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
|
text_splitter = SpacyTextSplitter(
|
||||||
|
pipeline="zh_core_web_sm",
|
||||||
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
@ -38,7 +38,11 @@ class PPTEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
|
text_splitter = SpacyTextSplitter(
|
||||||
|
pipeline="zh_core_web_sm",
|
||||||
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
@ -2,13 +2,12 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
|
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import CharacterTextSplitter
|
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter
|
|
||||||
|
|
||||||
CFG = Config()
|
CFG = Config()
|
||||||
|
|
||||||
@ -33,7 +32,11 @@ class WordEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
|
text_splitter = SpacyTextSplitter(
|
||||||
|
pipeline="zh_core_web_sm",
|
||||||
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
Loading…
Reference in New Issue
Block a user