mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-08-23 02:27:55 +00:00
fix: embedding chunk update
integrate RecursiveCharacterTextSplitter
This commit is contained in:
parent
870c704ef4
commit
7ee68c905b
@ -6,7 +6,7 @@ from typing import List
|
|||||||
import markdown
|
import markdown
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
|
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
@ -37,11 +37,14 @@ class MarkdownEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = SpacyTextSplitter(
|
try:
|
||||||
pipeline="zh_core_web_sm",
|
text_splitter = SpacyTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
pipeline="zh_core_web_sm",
|
||||||
chunk_overlap=100,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
)
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
@ -4,7 +4,7 @@ from typing import List
|
|||||||
|
|
||||||
from langchain.document_loaders import PyPDFLoader
|
from langchain.document_loaders import PyPDFLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
|
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
@ -40,11 +40,14 @@ class PDFEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = SpacyTextSplitter(
|
try:
|
||||||
pipeline="zh_core_web_sm",
|
text_splitter = SpacyTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
pipeline="zh_core_web_sm",
|
||||||
chunk_overlap=100,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
)
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
@ -4,7 +4,7 @@ from typing import List
|
|||||||
|
|
||||||
from langchain.document_loaders import UnstructuredPowerPointLoader
|
from langchain.document_loaders import UnstructuredPowerPointLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter
|
from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
@ -38,11 +38,14 @@ class PPTEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = SpacyTextSplitter(
|
try:
|
||||||
pipeline="zh_core_web_sm",
|
text_splitter = SpacyTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
pipeline="zh_core_web_sm",
|
||||||
chunk_overlap=100,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
)
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
@ -3,7 +3,7 @@ from typing import List
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from langchain.document_loaders import WebBaseLoader
|
from langchain.document_loaders import WebBaseLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
|
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
|
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
|
||||||
@ -33,11 +33,14 @@ class URLEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = SpacyTextSplitter(
|
try:
|
||||||
pipeline="zh_core_web_sm",
|
text_splitter = SpacyTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
pipeline="zh_core_web_sm",
|
||||||
chunk_overlap=100,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
)
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
@ -4,7 +4,7 @@ from typing import List
|
|||||||
|
|
||||||
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
from langchain.document_loaders import UnstructuredWordDocumentLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
|
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from pilot.configs.config import Config
|
from pilot.configs.config import Config
|
||||||
from pilot.embedding_engine import SourceEmbedding, register
|
from pilot.embedding_engine import SourceEmbedding, register
|
||||||
@ -32,11 +32,14 @@ class WordEmbedding(SourceEmbedding):
|
|||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
text_splitter = SpacyTextSplitter(
|
try:
|
||||||
pipeline="zh_core_web_sm",
|
text_splitter = SpacyTextSplitter(
|
||||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
pipeline="zh_core_web_sm",
|
||||||
chunk_overlap=100,
|
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||||
)
|
chunk_overlap=100,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, chunk_overlap=50)
|
||||||
return loader.load_and_split(text_splitter)
|
return loader.load_and_split(text_splitter)
|
||||||
|
|
||||||
@register
|
@register
|
||||||
|
Loading…
Reference in New Issue
Block a user