From 24130a60973897d41cb8dedbb2f833c2fce4896f Mon Sep 17 00:00:00 2001 From: aries_ckt <916701291@qq.com> Date: Thu, 29 Jun 2023 18:32:36 +0800 Subject: [PATCH] fix:use spacy replace chunk method use spacy replace chunk method --- pilot/embedding_engine/markdown_embedding.py | 13 ++++++------- pilot/embedding_engine/pdf_embedding.py | 7 +++++-- pilot/embedding_engine/ppt_embedding.py | 6 +++++- pilot/embedding_engine/word_embedding.py | 11 +++++++---- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pilot/embedding_engine/markdown_embedding.py b/pilot/embedding_engine/markdown_embedding.py index 2bbd20878..0d70ba34f 100644 --- a/pilot/embedding_engine/markdown_embedding.py +++ b/pilot/embedding_engine/markdown_embedding.py @@ -11,7 +11,6 @@ from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter from pilot.configs.config import Config from pilot.embedding_engine import SourceEmbedding, register from pilot.embedding_engine.EncodeTextLoader import EncodeTextLoader -from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter CFG = Config() @@ -30,11 +29,7 @@ class MarkdownEmbedding(SourceEmbedding): def read(self): """Load from markdown path.""" loader = EncodeTextLoader(self.file_path) - # text_splitter = SpacyTextSplitter( - # pipeline="zh_core_web_sm", - # chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, - # chunk_overlap=100, - # ) + if CFG.LANGUAGE == "en": text_splitter = CharacterTextSplitter( chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, @@ -42,7 +37,11 @@ class MarkdownEmbedding(SourceEmbedding): length_function=len, ) else: - text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000) + text_splitter = SpacyTextSplitter( + pipeline="zh_core_web_sm", + chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, + chunk_overlap=100, + ) return loader.load_and_split(text_splitter) @register diff --git a/pilot/embedding_engine/pdf_embedding.py b/pilot/embedding_engine/pdf_embedding.py index a51eccbda..2b8f244e3 100644 --- a/pilot/embedding_engine/pdf_embedding.py +++ b/pilot/embedding_engine/pdf_embedding.py @@ -8,7 +8,6 @@ from langchain.text_splitter import SpacyTextSplitter, CharacterTextSplitter from pilot.configs.config import Config from pilot.embedding_engine import SourceEmbedding, register -from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter CFG = Config() @@ -41,7 +40,11 @@ class PDFEmbedding(SourceEmbedding): length_function=len, ) else: - text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000) + text_splitter = SpacyTextSplitter( + pipeline="zh_core_web_sm", + chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, + chunk_overlap=100, + ) return loader.load_and_split(text_splitter) @register diff --git a/pilot/embedding_engine/ppt_embedding.py b/pilot/embedding_engine/ppt_embedding.py index 4ff06c6b7..da4390849 100644 --- a/pilot/embedding_engine/ppt_embedding.py +++ b/pilot/embedding_engine/ppt_embedding.py @@ -38,7 +38,11 @@ class PPTEmbedding(SourceEmbedding): length_function=len, ) else: - text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000) + text_splitter = SpacyTextSplitter( + pipeline="zh_core_web_sm", + chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, + chunk_overlap=100, + ) return loader.load_and_split(text_splitter) @register diff --git a/pilot/embedding_engine/word_embedding.py b/pilot/embedding_engine/word_embedding.py index 9668700a1..55988a240 100644 --- a/pilot/embedding_engine/word_embedding.py +++ b/pilot/embedding_engine/word_embedding.py @@ -2,13 +2,12 @@ # -*- coding: utf-8 -*- from typing import List -from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader +from langchain.document_loaders import UnstructuredWordDocumentLoader from langchain.schema import Document -from langchain.text_splitter import CharacterTextSplitter +from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter from pilot.configs.config import Config from pilot.embedding_engine import SourceEmbedding, register -from pilot.embedding_engine.chn_document_splitter import CHNDocumentSplitter CFG = Config() @@ -33,7 +32,11 @@ class WordEmbedding(SourceEmbedding): length_function=len, ) else: - text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000) + text_splitter = SpacyTextSplitter( + pipeline="zh_core_web_sm", + chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE, + chunk_overlap=100, + ) return loader.load_and_split(text_splitter) @register