From 38d7a37ef83e02ae6c3e0bd8dad71d0ab628fcf4 Mon Sep 17 00:00:00 2001
From: aries-ckt <916701291@qq.com>
Date: Wed, 7 Jun 2023 18:10:05 +0800
Subject: [PATCH] update:spacy chunk splitter

---
 pilot/source_embedding/pdf_embedding.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pilot/source_embedding/pdf_embedding.py b/pilot/source_embedding/pdf_embedding.py
index 54f77fb81..aee498b31 100644
--- a/pilot/source_embedding/pdf_embedding.py
+++ b/pilot/source_embedding/pdf_embedding.py
@@ -4,10 +4,10 @@ from typing import List
 
 from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document
+from langchain.text_splitter import SpacyTextSplitter
 
 from pilot.configs.config import Config
 from pilot.source_embedding import SourceEmbedding, register
-from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 
 CFG = Config()
 
@@ -24,11 +24,11 @@ class PDFEmbedding(SourceEmbedding):
     @register
     def read(self):
         """Load from pdf path."""
-        # loader = UnstructuredPaddlePDFLoader(self.file_path)
         loader = PyPDFLoader(self.file_path)
-        textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
-        )
+        # textsplitter = CHNDocumentSplitter(
+        #     pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
+        # )
+        textsplitter = SpacyTextSplitter(pipeline='zh_core_web_sm', chunk_size=1000, chunk_overlap=200)
         return loader.load_and_split(textsplitter)
 
     @register