mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-10-11 11:53:55 +00:00
fix: knowledge_init path
1. knowledge_init path 2. URL embedding chunking
This commit is contained in:
@@ -3,7 +3,7 @@ from typing import List
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain.document_loaders import WebBaseLoader
|
||||
from langchain.schema import Document
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
|
||||
|
||||
from pilot.configs.config import Config
|
||||
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
|
||||
@@ -33,7 +33,11 @@ class URLEmbedding(SourceEmbedding):
|
||||
length_function=len,
|
||||
)
|
||||
else:
|
||||
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
|
||||
text_splitter = SpacyTextSplitter(
|
||||
pipeline="zh_core_web_sm",
|
||||
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
|
||||
chunk_overlap=100,
|
||||
)
|
||||
return loader.load_and_split(text_splitter)
|
||||
|
||||
@register
|
||||
|
Reference in New Issue
Block a user