fix:knowledge_init path

1.knowledge_init path
2.url embedding chunk
This commit is contained in:
aries_ckt 2023-07-03 16:09:18 +08:00
parent 8baa1d9464
commit 1868b228bc
2 changed files with 7 additions and 3 deletions

View File

@ -3,7 +3,7 @@ from typing import List
from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
from pilot.configs.config import Config
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
@ -33,7 +33,11 @@ class URLEmbedding(SourceEmbedding):
length_function=len,
)
else:
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
text_splitter = SpacyTextSplitter(
pipeline="zh_core_web_sm",
chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
chunk_overlap=100,
)
return loader.load_and_split(text_splitter)
@register

View File

@ -3,12 +3,12 @@
import argparse
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from pilot.embedding_engine.knowledge_type import KnowledgeType
from pilot.openapi.knowledge.knowledge_service import KnowledgeService
from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from pilot.configs.config import Config
from pilot.configs.model_config import (