diff --git a/pilot/source_embedding/chinese_text_splitter.py b/pilot/source_embedding/chn_document_splitter.py similarity index 76% rename from pilot/source_embedding/chinese_text_splitter.py rename to pilot/source_embedding/chn_document_splitter.py index 091276af6..090a6af56 100644 --- a/pilot/source_embedding/chinese_text_splitter.py +++ b/pilot/source_embedding/chn_document_splitter.py @@ -1,30 +1,29 @@ -from langchain.text_splitter import CharacterTextSplitter import re from typing import List -# from configs.model_config import SENTENCE_SIZE +from langchain.text_splitter import CharacterTextSplitter -class ChineseTextSplitter(CharacterTextSplitter): +class CHNDocumentSplitter(CharacterTextSplitter): def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs): super().__init__(**kwargs) self.pdf = pdf self.sentence_size = sentence_size - def split_text1(self, text: str) -> List[str]: - if self.pdf: - text = re.sub(r"\n{3,}", "\n", text) - text = re.sub('\s', ' ', text) - text = text.replace("\n\n", "") - sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; - sent_list = [] - for ele in sent_sep_pattern.split(text): - if sent_sep_pattern.match(ele) and sent_list: - sent_list[-1] += ele - elif ele: - sent_list.append(ele) - return sent_list + # def split_text_version2(self, text: str) -> List[str]: + # if self.pdf: + # text = re.sub(r"\n{3,}", "\n", text) + # text = re.sub('\s', ' ', text) + # text = text.replace("\n\n", "") + # sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; + # sent_list = [] + # for ele in sent_sep_pattern.split(text): + # if sent_sep_pattern.match(ele) and sent_list: + # sent_list[-1] += ele + # elif ele: + # sent_list.append(ele) + # return sent_list - def split_text(self, text: str) -> List[str]: ##此处需要进一步优化逻辑 + def split_text(self, text: str) -> List[str]: if self.pdf: text = re.sub(r"\n{3,}", r"\n", text) text = re.sub('\s', " ", text)