Mirror of https://github.com/csunny/DB-GPT.git (synced 2025-09-12 12:37:14 +00:00)
ci: make ci happy, lint the code, delete unused imports
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
@@ -1,8 +1,3 @@
-from pilot.source_embedding.source_embedding import SourceEmbedding
-from pilot.source_embedding.source_embedding import register
+from pilot.source_embedding.source_embedding import SourceEmbedding, register

-__all__ = [
-    "SourceEmbedding",
-    "register"
-]
+__all__ = ["SourceEmbedding", "register"]
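For context, the collapsed import and single-line __all__ are equivalent to the multi-line originals; __all__ still controls what a wildcard import of the package re-exports. A minimal sketch (the consumer module is hypothetical):

# Only the names listed in __all__ come in via a wildcard import.
from pilot.source_embedding import *  # SourceEmbedding, register

class MyEmbedding(SourceEmbedding):
    @register
    def read(self):
        return []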
@@ -1,5 +1,6 @@
 import re
 from typing import List
+
 from langchain.text_splitter import CharacterTextSplitter
@@ -12,32 +13,43 @@ class CHNDocumentSplitter(CharacterTextSplitter):
     def split_text(self, text: str) -> List[str]:
         if self.pdf:
             text = re.sub(r"\n{3,}", r"\n", text)
-            text = re.sub('\s', " ", text)
+            text = re.sub("\s", " ", text)
             text = re.sub("\n\n", "", text)

-        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)
+        text = re.sub(r"([;;.!?。!?\?])([^”’])", r"\1\n\2", text)
         text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
         text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)
-        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
+        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r"\1\n\2", text)
         text = text.rstrip()
         ls = [i for i in text.split("\n") if i]
         for ele in ls:
             if len(ele) > self.sentence_size:
-                ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
+                ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r"\1\n\2", ele)
                 ele1_ls = ele1.split("\n")
                 for ele_ele1 in ele1_ls:
                     if len(ele_ele1) > self.sentence_size:
-                        ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
+                        ele_ele2 = re.sub(
+                            r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r"\1\n\2", ele_ele1
+                        )
                         ele2_ls = ele_ele2.split("\n")
                         for ele_ele2 in ele2_ls:
                             if len(ele_ele2) > self.sentence_size:
-                                ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
+                                ele_ele3 = re.sub(
+                                    '( ["’”」』]{0,2})([^ ])', r"\1\n\2", ele_ele2
+                                )
                                 ele2_id = ele2_ls.index(ele_ele2)
-                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
-                                    ele2_id + 1:]
+                                ele2_ls = (
+                                    ele2_ls[:ele2_id]
+                                    + [i for i in ele_ele3.split("\n") if i]
+                                    + ele2_ls[ele2_id + 1 :]
+                                )
                         ele_id = ele1_ls.index(ele_ele1)
-                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
+                        ele1_ls = (
+                            ele1_ls[:ele_id]
+                            + [i for i in ele2_ls if i]
+                            + ele1_ls[ele_id + 1 :]
+                        )

                 id = ls.index(ele)
-                ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
+                ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1 :]
         return ls
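For context, split_text first normalizes whitespace (in PDF mode), then breaks on CJK and ASCII sentence punctuation, and recursively re-splits any chunk longer than sentence_size on commas and whitespace runs. A usage sketch, assuming the constructor accepts the pdf and sentence_size keywords that the callers later in this commit pass (the sample text is made up):

splitter = CHNDocumentSplitter(pdf=True, sentence_size=100)
# Each sentence-ending mark becomes a split point; oversized pieces are
# further broken on commas and runs of spaces.
for chunk in splitter.split_text("First sentence.第二句!A third, English sentence?还有一句……"):
    print(chunk)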
@@ -1,14 +1,21 @@
-from typing import List, Optional, Dict
-from pilot.source_embedding import SourceEmbedding, register
+from typing import Dict, List, Optional
+
 from langchain.document_loaders import CSVLoader
 from langchain.schema import Document

+from pilot.source_embedding import SourceEmbedding, register
+

 class CSVEmbedding(SourceEmbedding):
     """csv embedding for read csv document."""

-    def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
+    def __init__(
+        self,
+        file_path,
+        model_name,
+        vector_store_config,
+        embedding_args: Optional[Dict] = None,
+    ):
         """Initialize with csv path."""
         super().__init__(file_path, model_name, vector_store_config)
         self.file_path = file_path
@@ -29,6 +36,3 @@ class CSVEmbedding(SourceEmbedding):
             documents[i].page_content = d.page_content.replace("\n", "")
             i += 1
         return documents
-
-
-
@@ -1,7 +1,8 @@
 import os

+import markdown
 from bs4 import BeautifulSoup
-from langchain.document_loaders import TextLoader, markdown, PyPDFLoader
+from langchain.document_loaders import PyPDFLoader, TextLoader, markdown
 from langchain.embeddings import HuggingFaceEmbeddings

 from pilot.configs.config import Config
@@ -10,12 +11,11 @@ from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
 from pilot.source_embedding.csv_embedding import CSVEmbedding
 from pilot.source_embedding.markdown_embedding import MarkdownEmbedding
 from pilot.source_embedding.pdf_embedding import PDFEmbedding
-import markdown

 from pilot.vector_store.connector import VectorStoreConnector

 CFG = Config()


 class KnowledgeEmbedding:
     def __init__(self, file_path, model_name, vector_store_config, local_persist=True):
         """Initialize with Loader url, model_name, vector_store_config"""
@@ -37,16 +37,30 @@ class KnowledgeEmbedding:

     def init_knowledge_embedding(self):
         if self.file_path.endswith(".pdf"):
-            embedding = PDFEmbedding(file_path=self.file_path, model_name=self.model_name,
-                                     vector_store_config=self.vector_store_config)
+            embedding = PDFEmbedding(
+                file_path=self.file_path,
+                model_name=self.model_name,
+                vector_store_config=self.vector_store_config,
+            )
         elif self.file_path.endswith(".md"):
-            embedding = MarkdownEmbedding(file_path=self.file_path, model_name=self.model_name, vector_store_config=self.vector_store_config)
+            embedding = MarkdownEmbedding(
+                file_path=self.file_path,
+                model_name=self.model_name,
+                vector_store_config=self.vector_store_config,
+            )

         elif self.file_path.endswith(".csv"):
-            embedding = CSVEmbedding(file_path=self.file_path, model_name=self.model_name,
-                                     vector_store_config=self.vector_store_config)
+            embedding = CSVEmbedding(
+                file_path=self.file_path,
+                model_name=self.model_name,
+                vector_store_config=self.vector_store_config,
+            )
         elif self.file_type == "default":
-            embedding = MarkdownEmbedding(file_path=self.file_path, model_name=self.model_name, vector_store_config=self.vector_store_config)
+            embedding = MarkdownEmbedding(
+                file_path=self.file_path,
+                model_name=self.model_name,
+                vector_store_config=self.vector_store_config,
+            )

         return embedding
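All four branches above now build the same three keyword arguments; only the embedding class varies with the file suffix. A table-driven sketch of the same dispatch (an illustration using the classes imported above, not code from this commit):

# Hypothetical suffix -> class table; the fallback mirrors the "default"
# branch above.
SUFFIX_TO_EMBEDDING = {
    ".pdf": PDFEmbedding,
    ".md": MarkdownEmbedding,
    ".csv": CSVEmbedding,
}

def pick_embedding(file_path, model_name, vector_store_config):
    for suffix, cls in SUFFIX_TO_EMBEDDING.items():
        if file_path.endswith(suffix):
            break
    else:
        cls = MarkdownEmbedding  # default when no suffix matches
    return cls(
        file_path=file_path,
        model_name=model_name,
        vector_store_config=vector_store_config,
    )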
@@ -55,7 +69,9 @@ class KnowledgeEmbedding:

     def knowledge_persist_initialization(self, append_mode):
         documents = self._load_knownlege(self.file_path)
-        self.vector_client = VectorStoreConnector(CFG.VECTOR_STORE_TYPE, self.vector_store_config)
+        self.vector_client = VectorStoreConnector(
+            CFG.VECTOR_STORE_TYPE, self.vector_store_config
+        )
         self.vector_client.load_document(documents)
         return self.vector_client
@@ -67,7 +83,9 @@ class KnowledgeEmbedding:
         docs = self._load_file(filename)
         new_docs = []
         for doc in docs:
-            doc.metadata = {"source": doc.metadata["source"].replace(DATASETS_DIR, "")}
+            doc.metadata = {
+                "source": doc.metadata["source"].replace(DATASETS_DIR, "")
+            }
             print("doc is embedding...", doc.metadata)
             new_docs.append(doc)
         docments += new_docs
@@ -76,27 +94,33 @@ class KnowledgeEmbedding:
     def _load_file(self, filename):
         if filename.lower().endswith(".md"):
             loader = TextLoader(filename)
-            text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
+            text_splitter = CHNDocumentSplitter(
+                pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            )
             docs = loader.load_and_split(text_splitter)
             i = 0
             for d in docs:
                 content = markdown.markdown(d.page_content)
-                soup = BeautifulSoup(content, 'html.parser')
-                for tag in soup(['!doctype', 'meta', 'i.fa']):
+                soup = BeautifulSoup(content, "html.parser")
+                for tag in soup(["!doctype", "meta", "i.fa"]):
                     tag.extract()
                 docs[i].page_content = soup.get_text()
                 docs[i].page_content = docs[i].page_content.replace("\n", " ")
                 i += 1
         elif filename.lower().endswith(".pdf"):
             loader = PyPDFLoader(filename)
-            textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
+            textsplitter = CHNDocumentSplitter(
+                pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+            )
             docs = loader.load_and_split(textsplitter)
             i = 0
             for d in docs:
-                docs[i].page_content = d.page_content.replace("\n", " ").replace("�", "")
+                docs[i].page_content = d.page_content.replace("\n", " ").replace(
+                    "�", ""
+                )
                 i += 1
         else:
             loader = TextLoader(filename)
             text_splitor = CHNDocumentSplitter(sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
             docs = loader.load_and_split(text_splitor)
-            return docs
+        return docs
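The markdown branch of _load_file renders the source to HTML and then strips markup to recover plain text. Isolated, the transformation is (a sketch built from the same markdown and BeautifulSoup calls used above):

import markdown
from bs4 import BeautifulSoup

def markdown_to_plain_text(md_source: str) -> str:
    html = markdown.markdown(md_source)  # Markdown -> HTML
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["!doctype", "meta", "i.fa"]):  # drop non-content tags
        tag.extract()
    return soup.get_text().replace("\n", " ")  # flatten to a single line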
@@ -3,12 +3,12 @@
 import os
 from typing import List

+import markdown
 from bs4 import BeautifulSoup
 from langchain.document_loaders import TextLoader
 from langchain.schema import Document
-import markdown
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE

+from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
@@ -27,7 +27,9 @@ class MarkdownEmbedding(SourceEmbedding):
     def read(self):
         """Load from markdown path."""
         loader = TextLoader(self.file_path)
-        text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
+        text_splitter = CHNDocumentSplitter(
+            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+        )
         return loader.load_and_split(text_splitter)

     @register
@@ -44,7 +46,9 @@ class MarkdownEmbedding(SourceEmbedding):
         # update the metadata
         new_docs = []
         for doc in docs:
-            doc.metadata = {"source": doc.metadata["source"].replace(self.file_path, "")}
+            doc.metadata = {
+                "source": doc.metadata["source"].replace(self.file_path, "")
+            }
             print("doc is embedding ... ", doc.metadata)
             new_docs.append(doc)
         docments += new_docs
@@ -55,13 +59,10 @@ class MarkdownEmbedding(SourceEmbedding):
         i = 0
         for d in documents:
             content = markdown.markdown(d.page_content)
-            soup = BeautifulSoup(content, 'html.parser')
-            for tag in soup(['!doctype', 'meta', 'i.fa']):
+            soup = BeautifulSoup(content, "html.parser")
+            for tag in soup(["!doctype", "meta", "i.fa"]):
                 tag.extract()
             documents[i].page_content = soup.get_text()
             documents[i].page_content = documents[i].page_content.replace("\n", " ")
             i += 1
         return documents
-
-
-
@@ -4,8 +4,8 @@ from typing import List

 from langchain.document_loaders import PyPDFLoader
 from langchain.schema import Document
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE

+from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
 from pilot.source_embedding import SourceEmbedding, register
 from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
@@ -25,7 +25,9 @@ class PDFEmbedding(SourceEmbedding):
         """Load from pdf path."""
         # loader = UnstructuredPaddlePDFLoader(self.file_path)
         loader = PyPDFLoader(self.file_path)
-        textsplitter = CHNDocumentSplitter(pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE)
+        textsplitter = CHNDocumentSplitter(
+            pdf=True, sentence_size=KNOWLEDGE_CHUNK_SPLIT_SIZE
+        )
         return loader.load_and_split(textsplitter)

     @register
@@ -35,6 +37,3 @@ class PDFEmbedding(SourceEmbedding):
             documents[i].page_content = d.page_content.replace("\n", "")
             i += 1
         return documents
-
-
-
@@ -1,10 +1,10 @@
 """Loader that loads image files."""
+import os
 from typing import List

+import fitz
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 from paddleocr import PaddleOCR
-import os
-import fitz


 class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
@@ -19,9 +19,8 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
            ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
            doc = fitz.open(filepath)
            txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
-           img_name = os.path.join(full_dir_path, '.tmp.png')
-           with open(txt_file_path, 'w', encoding='utf-8') as fout:
-
+           img_name = os.path.join(full_dir_path, ".tmp.png")
+           with open(txt_file_path, "w", encoding="utf-8") as fout:
                for i in range(doc.page_count):
                    page = doc[i]
                    text = page.get_text("")
@@ -42,11 +41,14 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):

         txt_file_path = pdf_ocr_txt(self.file_path)
         from unstructured.partition.text import partition_text
+
         return partition_text(filename=txt_file_path, **self.unstructured_kwargs)


 if __name__ == "__main__":
-    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.pdf")
+    filepath = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test.pdf"
+    )
     loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
     docs = loader.load()
     for doc in docs:
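The loader's flow is: OCR the PDF into a plain-text file with PaddleOCR, then hand that file to unstructured's partition_text to produce elements; the __main__ block above shows the intended usage. The OCR primitive it builds on, in isolation (the image path is hypothetical):

from paddleocr import PaddleOCR

ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)  # same flags as above
result = ocr.ocr("page.png")  # per-line results, typically (box, (text, confidence))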
@@ -58,4 +58,4 @@
 # #     docs,
 # #     embedding=embeddings,
 # #     connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
-# #     )
+# # )
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 from abc import ABC, abstractmethod
+from typing import Dict, List, Optional

 from langchain.embeddings import HuggingFaceEmbeddings
-from typing import List, Optional, Dict

 from pilot.configs.config import Config
 from pilot.vector_store.connector import VectorStoreConnector
@@ -23,7 +23,13 @@ class SourceEmbedding(ABC):
     Implementations should implement the method
     """

-    def __init__(self, file_path, model_name, vector_store_config, embedding_args: Optional[Dict] = None):
+    def __init__(
+        self,
+        file_path,
+        model_name,
+        vector_store_config,
+        embedding_args: Optional[Dict] = None,
+    ):
         """Initialize with Loader url, model_name, vector_store_config"""
         self.file_path = file_path
         self.model_name = model_name
@@ -32,12 +38,15 @@ class SourceEmbedding(ABC):
         self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)

         vector_store_config["embeddings"] = self.embeddings
-        self.vector_client = VectorStoreConnector(CFG.VECTOR_STORE_TYPE, vector_store_config)
+        self.vector_client = VectorStoreConnector(
+            CFG.VECTOR_STORE_TYPE, vector_store_config
+        )
+

     @abstractmethod
     @register
     def read(self) -> List[ABC]:
         """read datasource into document objects."""

     @register
     def data_process(self, text):
         """pre process data."""
@@ -63,25 +72,25 @@ class SourceEmbedding(ABC):
         return self.vector_client.similar_search(doc, topk)

     def source_embedding(self):
-        if 'read' in registered_methods:
+        if "read" in registered_methods:
             text = self.read()
-        if 'data_process' in registered_methods:
+        if "data_process" in registered_methods:
             text = self.data_process(text)
-        if 'text_split' in registered_methods:
+        if "text_split" in registered_methods:
             self.text_split(text)
-        if 'text_to_vector' in registered_methods:
+        if "text_to_vector" in registered_methods:
             self.text_to_vector(text)
-        if 'index_to_store' in registered_methods:
+        if "index_to_store" in registered_methods:
             self.index_to_store(text)

     def batch_embedding(self):
-        if 'read_batch' in registered_methods:
+        if "read_batch" in registered_methods:
             text = self.read_batch()
-        if 'data_process' in registered_methods:
+        if "data_process" in registered_methods:
             text = self.data_process(text)
-        if 'text_split' in registered_methods:
+        if "text_split" in registered_methods:
             self.text_split(text)
-        if 'text_to_vector' in registered_methods:
+        if "text_to_vector" in registered_methods:
             self.text_to_vector(text)
-        if 'index_to_store' in registered_methods:
+        if "index_to_store" in registered_methods:
             self.index_to_store(text)
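source_embedding and batch_embedding consult registered_methods to decide which pipeline steps exist on the subclass. The register decorator itself is outside this diff; a minimal sketch of the pattern these checks imply (a reconstruction, not the repo's exact code):

registered_methods = []

def register(method):
    # Record the method name so the pipeline later knows the step is available.
    registered_methods.append(method.__name__)
    return method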
@@ -1,13 +1,11 @@
 from typing import List

-from langchain.text_splitter import CharacterTextSplitter
-
-from pilot.source_embedding import SourceEmbedding, register
-
 from bs4 import BeautifulSoup
 from langchain.document_loaders import WebBaseLoader
 from langchain.schema import Document
+from langchain.text_splitter import CharacterTextSplitter
+
+from pilot.source_embedding import SourceEmbedding, register


 class URLEmbedding(SourceEmbedding):
@@ -23,7 +21,9 @@ class URLEmbedding(SourceEmbedding):
     def read(self):
         """Load from url path."""
         loader = WebBaseLoader(web_path=self.file_path)
-        text_splitor = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20, length_function=len)
+        text_splitor = CharacterTextSplitter(
+            chunk_size=1000, chunk_overlap=20, length_function=len
+        )
         return loader.load_and_split(text_splitor)

     @register
@@ -31,12 +31,9 @@ class URLEmbedding(SourceEmbedding):
         i = 0
         for d in documents:
             content = d.page_content.replace("\n", "")
-            soup = BeautifulSoup(content, 'html.parser')
-            for tag in soup(['!doctype', 'meta']):
+            soup = BeautifulSoup(content, "html.parser")
+            for tag in soup(["!doctype", "meta"]):
                 tag.extract()
             documents[i].page_content = soup.get_text()
             i += 1
         return documents
-
-
-
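A usage sketch for the URL source; the URL, model name, and store-config keys are hypothetical, and read() applies WebBaseLoader plus the 1000-character, 20-overlap splitter shown above:

url_embedding = URLEmbedding(
    file_path="https://example.com/docs",  # hypothetical URL
    model_name="GanymedeNil/text2vec-large-chinese",  # hypothetical model id
    vector_store_config={"vector_store_name": "default"},  # hypothetical keys
)
docs = url_embedding.read()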