update:knowledge load script

This commit is contained in:
aries-ckt
2023-05-19 21:17:39 +08:00
parent 03f7ed32e5
commit 336ba1e042
9 changed files with 49 additions and 53 deletions

View File

@@ -9,33 +9,17 @@ class CHNDocumentSplitter(CharacterTextSplitter):
self.pdf = pdf
self.sentence_size = sentence_size
# def split_text_version2(self, text: str) -> List[str]:
# if self.pdf:
# text = re.sub(r"\n{3,}", "\n", text)
# text = re.sub('\s', ' ', text)
# text = text.replace("\n\n", "")
# sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del
# sent_list = []
# for ele in sent_sep_pattern.split(text):
# if sent_sep_pattern.match(ele) and sent_list:
# sent_list[-1] += ele
# elif ele:
# sent_list.append(ele)
# return sent_list
def split_text(self, text: str) -> List[str]:
if self.pdf:
text = re.sub(r"\n{3,}", r"\n", text)
text = re.sub('\s', " ", text)
text = re.sub("\n\n", "", text)
text = re.sub(r'([;.!?。!?\?])([^”’])', r"\1\n\2", text) # 单字符断句符
text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # 英文省略号
text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # 中文省略号
text = re.sub(r'([;.!?。!?\?])([^”’])', r"\1\n\2", text)
text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)
text = re.sub(r'([;!?。!?\?]["’”」』]{0,2})([^;!?,。!?\?])', r'\1\n\2', text)
# 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后注意前面的几句都小心保留了双引号
text = text.rstrip() # 段尾如果有多余的\n就去掉它
# 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
text = text.rstrip()
ls = [i for i in text.split("\n") if i]
for ele in ls:
if len(ele) > self.sentence_size:

View File

@@ -4,13 +4,15 @@ from bs4 import BeautifulSoup
from langchain.document_loaders import PyPDFLoader, TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from pilot.configs.model_config import DATASETS_DIR
from pilot.configs.model_config import DATASETS_DIR, KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.csv_embedding import CSVEmbedding
from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
class KnowledgeEmbedding:
def __init__(self, file_path, model_name, vector_store_config, local_persist=True):
@@ -63,7 +65,7 @@ class KnowledgeEmbedding:
print("directly return vector store")
vector_store = Chroma(persist_directory=persist_dir, embedding_function=self.embeddings)
else:
print(vector_name + "is new vector store, knowledge begin load...")
print(vector_name + " is new vector store, knowledge begin load...")
documents = self._load_knownlege(self.file_path)
vector_store = Chroma.from_documents(documents=documents,
embedding=self.embeddings,
@@ -88,7 +90,7 @@ class KnowledgeEmbedding:
def _load_file(self, filename):
if filename.lower().endswith(".md"):
loader = TextLoader(filename)
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(text_splitter)
i = 0
for d in docs:
@@ -100,11 +102,15 @@ class KnowledgeEmbedding:
docs[i].page_content = docs[i].page_content.replace("\n", " ")
i += 1
elif filename.lower().endswith(".pdf"):
loader = PyPDFLoader(filename)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
loader = UnstructuredPaddlePDFLoader(filename)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(textsplitter)
i = 0
for d in docs:
docs[i].page_content = d.page_content.replace("\n", " ").replace("\ufffd", "")
i += 1
else:
loader = TextLoader(filename)
text_splitor = CHNDocumentSplitter(sentence_size=100)
text_splitor = CHNDocumentSplitter(sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(text_splitor)
return docs

View File

@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader
from langchain.schema import Document
import markdown
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
@@ -26,7 +27,7 @@ class MarkdownEmbedding(SourceEmbedding):
def read(self):
"""Load from markdown path."""
loader = TextLoader(self.file_path)
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
return loader.load_and_split(text_splitter)
@register

View File

@@ -2,11 +2,12 @@
# -*- coding: utf-8 -*-
from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.pdf_loader import UnstructuredPaddlePDFLoader
class PDFEmbedding(SourceEmbedding):
@@ -22,8 +23,8 @@ class PDFEmbedding(SourceEmbedding):
@register
def read(self):
"""Load from pdf path."""
loader = PyPDFLoader(self.file_path)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
loader = UnstructuredPaddlePDFLoader(self.file_path)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
return loader.load_and_split(textsplitter)
@register

View File

@@ -50,7 +50,7 @@
#
# # text_embeddings = Text2Vectors()
# mivuls = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530", "alias": "default", "table_name": "test_k"})
#
#
# mivuls.insert(["textc","tezt2"])
# print("success")
# ct