ci: make CI happy — lint the code, delete unused imports

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
yihong0618
2023-05-24 18:42:55 +08:00
parent 562d5a98cc
commit b098a48898
75 changed files with 1110 additions and 824 deletions

View File

@@ -1,8 +1,3 @@
from pilot.source_embedding.source_embedding import SourceEmbedding
from pilot.source_embedding.source_embedding import register
from pilot.source_embedding.source_embedding import SourceEmbedding, register
__all__ = [
"SourceEmbedding",
"register"
]
__all__ = ["SourceEmbedding", "register"]

View File

@@ -1,5 +1,6 @@
import re
from typing import List
from langchain.text_splitter import CharacterTextSplitter
@@ -12,32 +13,43 @@ class CHNDocumentSplitter(CharacterTextSplitter):
def split_text(self, text: str) -> List[str]:
if self.pdf:
text = re.sub(r"\n{3,}", r"\n", text)
text = re.sub('\s', " ", text)
text = re.sub("\s", " ", text)
text = re.sub("\n\n", "", text)
text = re.sub(r'([;.!?。!?\?])([^”’])', r"\1\n\2", text)
text = re.sub(r"([;.!?。!?\?])([^”’])", r"\1\n\2", text)
text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
text = re.sub(r'(\{2})([^"’”」』])', r"\1\n\2", text)
text = re.sub(r'([;!?。!?\?]["’”」』]{0,2})([^;!?,。!?\?])', r'\1\n\2', text)
text = re.sub(r'([;!?。!?\?]["’”」』]{0,2})([^;!?,。!?\?])', r"\1\n\2", text)
text = text.rstrip()
ls = [i for i in text.split("\n") if i]
for ele in ls:
if len(ele) > self.sentence_size:
ele1 = re.sub(r'([,.]["’”」』]{0,2})([^,.])', r'\1\n\2', ele)
ele1 = re.sub(r'([,.]["’”」』]{0,2})([^,.])', r"\1\n\2", ele)
ele1_ls = ele1.split("\n")
for ele_ele1 in ele1_ls:
if len(ele_ele1) > self.sentence_size:
ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
ele_ele2 = re.sub(
r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r"\1\n\2", ele_ele1
)
ele2_ls = ele_ele2.split("\n")
for ele_ele2 in ele2_ls:
if len(ele_ele2) > self.sentence_size:
ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
ele_ele3 = re.sub(
'( ["’”」』]{0,2})([^ ])', r"\1\n\2", ele_ele2
)
ele2_id = ele2_ls.index(ele_ele2)
ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
ele2_id + 1:]
ele2_ls = (
ele2_ls[:ele2_id]
+ [i for i in ele_ele3.split("\n") if i]
+ ele2_ls[ele2_id + 1 :]
)
ele_id = ele1_ls.index(ele_ele1)
ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
ele1_ls = (
ele1_ls[:ele_id]
+ [i for i in ele2_ls if i]
+ ele1_ls[ele_id + 1 :]
)
id = ls.index(ele)
ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1 :]
return ls

View File

@@ -1,14 +1,21 @@
from typing import List, Optional, Dict
from pilot.source_embedding import SourceEmbedding, register
from typing import Dict, List, Optional
from langchain.document_loaders import CSVLoader
from langchain.schema import Document
from pilot.source_embedding import SourceEmbedding, register
class CSVEmbedding(SourceEmbedding):
"""csv embedding for read csv document."""
def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
def __init__(
self,
file_path,
model_name,
vector_store_config,
embedding_args: Optional[Dict] = None,
):
"""Initialize with csv path."""
super().__init__(file_path, model_name, vector_store_config)
self.file_path = file_path
@@ -29,6 +36,3 @@ class CSVEmbedding(SourceEmbedding):
documents[i].page_content = d.page_content.replace("\n", "")
i += 1
return documents

View File

@@ -1,7 +1,8 @@
import os
import markdown
from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader, markdown, PyPDFLoader
from langchain.document_loaders import PyPDFLoader, TextLoader, markdown
from langchain.embeddings import HuggingFaceEmbeddings
from pilot.configs.config import Config
@@ -10,12 +11,11 @@ from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
from pilot.source_embedding.csv_embedding import CSVEmbedding
from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
from pilot.source_embedding.pdf_embedding import PDFEmbedding
import markdown
from pilot.vector_store.connector import VectorStoreConnector
CFG = Config()
class KnowledgeEmbedding:
def __init__(self, file_path, model_name, vector_store_config, local_persist=True):
"""Initialize with Loader url, model_name, vector_store_config"""
@@ -37,16 +37,30 @@ class KnowledgeEmbedding:
def init_knowledge_embedding(self):
if self.file_path.endswith(".pdf"):
embedding = PDFEmbedding(file_path=self.file_path, model_name=self.model_name,
vector_store_config=self.vector_store_config)
embedding = PDFEmbedding(
file_path=self.file_path,
model_name=self.model_name,
vector_store_config=self.vector_store_config,
)
elif self.file_path.endswith(".md"):
embedding = MarkdownEmbedding(file_path=self.file_path, model_name=self.model_name, vector_store_config=self.vector_store_config)
embedding = MarkdownEmbedding(
file_path=self.file_path,
model_name=self.model_name,
vector_store_config=self.vector_store_config,
)
elif self.file_path.endswith(".csv"):
embedding = CSVEmbedding(file_path=self.file_path, model_name=self.model_name,
vector_store_config=self.vector_store_config)
embedding = CSVEmbedding(
file_path=self.file_path,
model_name=self.model_name,
vector_store_config=self.vector_store_config,
)
elif self.file_type == "default":
embedding = MarkdownEmbedding(file_path=self.file_path, model_name=self.model_name, vector_store_config=self.vector_store_config)
embedding = MarkdownEmbedding(
file_path=self.file_path,
model_name=self.model_name,
vector_store_config=self.vector_store_config,
)
return embedding
@@ -55,7 +69,9 @@ class KnowledgeEmbedding:
def knowledge_persist_initialization(self, append_mode):
documents = self._load_knownlege(self.file_path)
self.vector_client = VectorStoreConnector(CFG.VECTOR_STORE_TYPE, self.vector_store_config)
self.vector_client = VectorStoreConnector(
CFG.VECTOR_STORE_TYPE, self.vector_store_config
)
self.vector_client.load_document(documents)
return self.vector_client
@@ -67,7 +83,9 @@ class KnowledgeEmbedding:
docs = self._load_file(filename)
new_docs = []
for doc in docs:
doc.metadata = {"source": doc.metadata["source"].replace(DATASETS_DIR, "")}
doc.metadata = {
"source": doc.metadata["source"].replace(DATASETS_DIR, "")
}
print("doc is embedding...", doc.metadata)
new_docs.append(doc)
docments += new_docs
@@ -76,27 +94,33 @@ class KnowledgeEmbedding:
def _load_file(self, filename):
if filename.lower().endswith(".md"):
loader = TextLoader(filename)
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
text_splitter = CHNDocumentSplitter(
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
)
docs = loader.load_and_split(text_splitter)
i = 0
for d in docs:
content = markdown.markdown(d.page_content)
soup = BeautifulSoup(content, 'html.parser')
for tag in soup(['!doctype', 'meta', 'i.fa']):
soup = BeautifulSoup(content, "html.parser")
for tag in soup(["!doctype", "meta", "i.fa"]):
tag.extract()
docs[i].page_content = soup.get_text()
docs[i].page_content = docs[i].page_content.replace("\n", " ")
i += 1
elif filename.lower().endswith(".pdf"):
loader = PyPDFLoader(filename)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
textsplitter = CHNDocumentSplitter(
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
)
docs = loader.load_and_split(textsplitter)
i = 0
for d in docs:
docs[i].page_content = d.page_content.replace("\n", " ").replace("<EFBFBD>", "")
docs[i].page_content = d.page_content.replace("\n", " ").replace(
"<EFBFBD>", ""
)
i += 1
else:
loader = TextLoader(filename)
text_splitor = CHNDocumentSplitter(sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
docs = loader.load_and_split(text_splitor)
return docs
return docs

View File

@@ -3,12 +3,12 @@
import os
from typing import List
import markdown
from bs4 import BeautifulSoup
from langchain.document_loaders import TextLoader
from langchain.schema import Document
import markdown
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
@@ -27,7 +27,9 @@ class MarkdownEmbedding(SourceEmbedding):
def read(self):
"""Load from markdown path."""
loader = TextLoader(self.file_path)
text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
text_splitter = CHNDocumentSplitter(
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
)
return loader.load_and_split(text_splitter)
@register
@@ -44,7 +46,9 @@ class MarkdownEmbedding(SourceEmbedding):
# 更新metadata数据
new_docs = []
for doc in docs:
doc.metadata = {"source": doc.metadata["source"].replace(self.file_path, "")}
doc.metadata = {
"source": doc.metadata["source"].replace(self.file_path, "")
}
print("doc is embedding ... ", doc.metadata)
new_docs.append(doc)
docments += new_docs
@@ -55,13 +59,10 @@ class MarkdownEmbedding(SourceEmbedding):
i = 0
for d in documents:
content = markdown.markdown(d.page_content)
soup = BeautifulSoup(content, 'html.parser')
for tag in soup(['!doctype', 'meta', 'i.fa']):
soup = BeautifulSoup(content, "html.parser")
for tag in soup(["!doctype", "meta", "i.fa"]):
tag.extract()
documents[i].page_content = soup.get_text()
documents[i].page_content = documents[i].page_content.replace("\n", " ")
i += 1
return documents

View File

@@ -4,8 +4,8 @@ from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
from pilot.source_embedding import SourceEmbedding, register
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
@@ -25,7 +25,9 @@ class PDFEmbedding(SourceEmbedding):
"""Load from pdf path."""
# loader = UnstructuredPaddlePDFLoader(self.file_path)
loader = PyPDFLoader(self.file_path)
textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
textsplitter = CHNDocumentSplitter(
pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
)
return loader.load_and_split(textsplitter)
@register
@@ -35,6 +37,3 @@ class PDFEmbedding(SourceEmbedding):
documents[i].page_content = d.page_content.replace("\n", "")
i += 1
return documents

View File

@@ -1,10 +1,10 @@
"""Loader that loads image files."""
import os
from typing import List
import fitz
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from paddleocr import PaddleOCR
import os
import fitz
class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
@@ -19,9 +19,8 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
doc = fitz.open(filepath)
txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
img_name = os.path.join(full_dir_path, '.tmp.png')
with open(txt_file_path, 'w', encoding='utf-8') as fout:
img_name = os.path.join(full_dir_path, ".tmp.png")
with open(txt_file_path, "w", encoding="utf-8") as fout:
for i in range(doc.page_count):
page = doc[i]
text = page.get_text("")
@@ -42,11 +41,14 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
txt_file_path = pdf_ocr_txt(self.file_path)
from unstructured.partition.text import partition_text
return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
if __name__ == "__main__":
filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.pdf")
filepath = os.path.join(
os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.pdf"
)
loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
docs = loader.load()
for doc in docs:

View File

@@ -58,4 +58,4 @@
# # docs,
# # embedding=embeddings,
# # connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
# # )
# # )

View File

@@ -1,9 +1,9 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
from langchain.embeddings import HuggingFaceEmbeddings
from typing import List, Optional, Dict
from pilot.configs.config import Config
from pilot.vector_store.connector import VectorStoreConnector
@@ -23,7 +23,13 @@ class SourceEmbedding(ABC):
Implementations should implement the method
"""
def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
def __init__(
self,
file_path,
model_name,
vector_store_config,
embedding_args: Optional[Dict] = None,
):
"""Initialize with Loader url, model_name, vector_store_config"""
self.file_path = file_path
self.model_name = model_name
@@ -32,12 +38,15 @@ class SourceEmbedding(ABC):
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
vector_store_config["embeddings"] = self.embeddings
self.vector_client = VectorStoreConnector(CFG.VECTOR_STORE_TYPE, vector_store_config)
self.vector_client = VectorStoreConnector(
CFG.VECTOR_STORE_TYPE, vector_store_config
)
@abstractmethod
@register
def read(self) -> List[ABC]:
"""read datasource into document objects."""
@register
def data_process(self, text):
"""pre process data."""
@@ -63,25 +72,25 @@ class SourceEmbedding(ABC):
return self.vector_client.similar_search(doc, topk)
def source_embedding(self):
if 'read' in registered_methods:
if "read" in registered_methods:
text = self.read()
if 'data_process' in registered_methods:
if "data_process" in registered_methods:
text = self.data_process(text)
if 'text_split' in registered_methods:
if "text_split" in registered_methods:
self.text_split(text)
if 'text_to_vector' in registered_methods:
if "text_to_vector" in registered_methods:
self.text_to_vector(text)
if 'index_to_store' in registered_methods:
if "index_to_store" in registered_methods:
self.index_to_store(text)
def batch_embedding(self):
if 'read_batch' in registered_methods:
if "read_batch" in registered_methods:
text = self.read_batch()
if 'data_process' in registered_methods:
if "data_process" in registered_methods:
text = self.data_process(text)
if 'text_split' in registered_methods:
if "text_split" in registered_methods:
self.text_split(text)
if 'text_to_vector' in registered_methods:
if "text_to_vector" in registered_methods:
self.text_to_vector(text)
if 'index_to_store' in registered_methods:
if "index_to_store" in registered_methods:
self.index_to_store(text)

View File

@@ -1,13 +1,11 @@
from typing import List
from langchain.text_splitter import CharacterTextSplitter
from pilot.source_embedding import SourceEmbedding, register
from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from pilot.source_embedding import SourceEmbedding, register
class URLEmbedding(SourceEmbedding):
@@ -23,7 +21,9 @@ class URLEmbedding(SourceEmbedding):
def read(self):
"""Load from url path."""
loader = WebBaseLoader(web_path=self.file_path)
text_splitor = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, length_function=len)
text_splitor = CharacterTextSplitter(
chunk_size=1000, chunk_overlap=20, length_function=len
)
return loader.load_and_split(text_splitor)
@register
@@ -31,12 +31,9 @@ class URLEmbedding(SourceEmbedding):
i = 0
for d in documents:
content = d.page_content.replace("\n", "")
soup = BeautifulSoup(content, 'html.parser')
for tag in soup(['!doctype', 'meta']):
soup = BeautifulSoup(content, "html.parser")
for tag in soup(["!doctype", "meta"]):
tag.extract()
documents[i].page_content = soup.get_text()
i += 1
return documents