Commit b91a6f1c95 (mirror of https://github.com/csunny/DB-GPT.git)
Merge branch 'llm_framework' into dev_ty_06_end
@@ -8,7 +8,7 @@ const nextConfig = {
     ignoreBuildErrors: true
   },
   env: {
-    API_BASE_URL: process.env.API_BASE_URL || 'http://localhost:5000'
+    API_BASE_URL: process.env.API_BASE_URL || 'https://u158074-879a-d00019a9.westa.seetacloud.com:8443'
  }
 }
 
@@ -33,9 +33,6 @@ class SourceEmbedding(ABC):
         self.vector_store_config = vector_store_config
         self.embedding_args = embedding_args
         self.embeddings = vector_store_config["embeddings"]
-        self.vector_client = VectorStoreConnector(
-            CFG.VECTOR_STORE_TYPE, vector_store_config
-        )
 
     @abstractmethod
     @register
@@ -59,11 +56,17 @@ class SourceEmbedding(ABC):
     @register
     def index_to_store(self, docs):
         """index to vector store"""
-        return self.vector_client.load_document(docs)
+        self.vector_client = VectorStoreConnector(
+            CFG.VECTOR_STORE_TYPE, self.vector_store_config
+        )
+        self.vector_client.load_document(docs)
 
     @register
     def similar_search(self, doc, topk):
         """vector store similarity_search"""
+        self.vector_client = VectorStoreConnector(
+            CFG.VECTOR_STORE_TYPE, self.vector_store_config
+        )
         try:
             ans = self.vector_client.similar_search(doc, topk)
         except NotEnoughElementsException:
@@ -71,6 +74,9 @@ class SourceEmbedding(ABC):
         return ans
 
     def vector_name_exist(self):
+        self.vector_client = VectorStoreConnector(
+            CFG.VECTOR_STORE_TYPE, self.vector_store_config
+        )
         return self.vector_client.vector_name_exists()
 
     def source_embedding(self):
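Net effect of the three SourceEmbedding hunks above: the VectorStoreConnector is no longer cached by __init__ but is rebuilt inside index_to_store, similar_search and vector_name_exist, so constructing an embedder stays cheap and every call re-reads CFG.VECTOR_STORE_TYPE and the current vector_store_config. A small self-contained sketch of that construct-per-call shape; the class and key names below are stand-ins, not the pilot APIs:

# Illustrative stand-ins only; VectorStoreConnector/CFG from the diff are not used here.
class DemoConnector:
    def __init__(self, store_type, config):
        # In the real connector this is where the store client would be opened.
        self.store_type, self.config = store_type, config

    def load_document(self, docs):
        print(f"indexed {len(docs)} docs into {self.store_type}:{self.config['name']}")

class DemoEmbedder:
    def __init__(self, config):
        self.config = config  # cheap: no connector is created up front

    def index_to_store(self, docs):
        connector = DemoConnector("Chroma", self.config)  # fresh connector per call
        connector.load_document(docs)

DemoEmbedder({"name": "demo"}).index_to_store(["doc-a", "doc-b"])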
@@ -3,7 +3,7 @@ from typing import List
 from bs4 import BeautifulSoup
 from langchain.document_loaders import WebBaseLoader
 from langchain.schema import Document
-from langchain.text_splitter import CharacterTextSplitter
+from langchain.text_splitter import CharacterTextSplitter, SpacyTextSplitter
 
 from pilot.configs.config import Config
 from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
@@ -33,7 +33,11 @@ class URLEmbedding(SourceEmbedding):
                 length_function=len,
             )
         else:
-            text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
+            text_splitter = SpacyTextSplitter(
+                pipeline="zh_core_web_sm",
+                chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
+                chunk_overlap=100,
+            )
         return loader.load_and_split(text_splitter)
 
     @register
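The hunk above swaps the Chinese branch from CHNDocumentSplitter to langchain's SpacyTextSplitter, which only works once the zh_core_web_sm spaCy pipeline is installed. A minimal sketch of the splitter on its own; the sample text and sizes are arbitrary:

# Assumes `pip install spacy` and `python -m spacy download zh_core_web_sm` have been run.
from langchain.text_splitter import SpacyTextSplitter

splitter = SpacyTextSplitter(pipeline="zh_core_web_sm", chunk_size=100, chunk_overlap=10)
chunks = splitter.split_text("DB-GPT是一个开源项目。它支持本地化的大模型问答。")
print(len(chunks), chunks[0])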
|
@ -37,7 +37,7 @@ from pilot.conversation import (
|
|||||||
|
|
||||||
from pilot.server.gradio_css import code_highlight_css
|
from pilot.server.gradio_css import code_highlight_css
|
||||||
from pilot.server.gradio_patch import Chatbot as grChatbot
|
from pilot.server.gradio_patch import Chatbot as grChatbot
|
||||||
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding
|
from pilot.embedding_engine.knowledge_embedding import KnowledgeEmbedding
|
||||||
from pilot.utils import build_logger
|
from pilot.utils import build_logger
|
||||||
from pilot.vector_store.extract_tovec import (
|
from pilot.vector_store.extract_tovec import (
|
||||||
get_vector_storelist,
|
get_vector_storelist,
|
||||||
@@ -297,54 +297,40 @@ def http_bot(
 
     if ChatScene.ChatWithDbExecute == scene:
         chat_param = {
-            "temperature": temperature,
-            "max_new_tokens": max_new_tokens,
             "chat_session_id": state.conv_id,
             "db_name": db_selector,
             "user_input": state.last_user_input,
         }
     elif ChatScene.ChatWithDbQA == scene:
         chat_param = {
-            "temperature": temperature,
-            "max_new_tokens": max_new_tokens,
             "chat_session_id": state.conv_id,
             "db_name": db_selector,
             "user_input": state.last_user_input,
         }
     elif ChatScene.ChatExecution == scene:
         chat_param = {
-            "temperature": temperature,
-            "max_new_tokens": max_new_tokens,
             "chat_session_id": state.conv_id,
             "plugin_selector": plugin_selector,
             "user_input": state.last_user_input,
         }
     elif ChatScene.ChatNormal == scene:
         chat_param = {
-            "temperature": temperature,
-            "max_new_tokens": max_new_tokens,
             "chat_session_id": state.conv_id,
             "user_input": state.last_user_input,
         }
     elif ChatScene.ChatDefaultKnowledge == scene:
         chat_param = {
-            "temperature": temperature,
-            "max_new_tokens": max_new_tokens,
            "chat_session_id": state.conv_id,
             "user_input": state.last_user_input,
         }
     elif ChatScene.ChatNewKnowledge == scene:
         chat_param = {
-            "temperature": temperature,
-            "max_new_tokens": max_new_tokens,
             "chat_session_id": state.conv_id,
             "user_input": state.last_user_input,
             "knowledge_name": knowledge_name,
         }
     elif ChatScene.ChatUrlKnowledge == scene:
         chat_param = {
-            "temperature": temperature,
-            "max_new_tokens": max_new_tokens,
             "chat_session_id": state.conv_id,
             "user_input": state.last_user_input,
             "url": url_input,
@@ -1,26 +0,0 @@
-from typing import List, Optional
-
-import chardet
-
-from langchain.docstore.document import Document
-from langchain.document_loaders.base import BaseLoader
-
-
-class EncodeTextLoader(BaseLoader):
-    """Load text files."""
-
-    def __init__(self, file_path: str, encoding: Optional[str] = None):
-        """Initialize with file path."""
-        self.file_path = file_path
-        self.encoding = encoding
-
-    def load(self) -> List[Document]:
-        """Load from file path."""
-        with open(self.file_path, "rb") as f:
-            raw_text = f.read()
-            result = chardet.detect(raw_text)
-            if result["encoding"] is None:
-                text = raw_text.decode("utf-8")
-            else:
-                text = raw_text.decode(result["encoding"])
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
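The EncodeTextLoader deleted above (presumably relocated as part of the source_embedding to embedding_engine move; the new location is not shown in this diff) wraps chardet so text files with unknown encodings can still be loaded as langchain Documents. A usage sketch against the class exactly as shown, with a hypothetical file path and the pre-move import path:

# Hypothetical path; the import below is the module this commit deletes.
from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader

loader = EncodeTextLoader("./docs/notes_gbk.txt")
docs = loader.load()         # chardet guesses the encoding, falling back to utf-8
print(docs[0].metadata)      # {'source': './docs/notes_gbk.txt'}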
@@ -1,3 +0,0 @@
-from pilot.source_embedding.source_embedding import SourceEmbedding, register
-
-__all__ = ["SourceEmbedding", "register"]
@@ -1,55 +0,0 @@
-import re
-from typing import List
-
-from langchain.text_splitter import CharacterTextSplitter
-
-
-class CHNDocumentSplitter(CharacterTextSplitter):
-    def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
-        super().__init__(**kwargs)
-        self.pdf = pdf
-        self.sentence_size = sentence_size
-
-    def split_text(self, text: str) -> List[str]:
-        if self.pdf:
-            text = re.sub(r"\n{3,}", r"\n", text)
-            text = re.sub("\s", " ", text)
-            text = re.sub("\n\n", "", text)
-
-        text = re.sub(r"([;;.!?。!?\?])([^”’])", r"\1\n\2", text)
-        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
-        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)
-        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r"\1\n\2", text)
-        text = text.rstrip()
-        ls = [i for i in text.split("\n") if i]
-        for ele in ls:
-            if len(ele) > self.sentence_size:
-                ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r"\1\n\2", ele)
-                ele1_ls = ele1.split("\n")
-                for ele_ele1 in ele1_ls:
-                    if len(ele_ele1) > self.sentence_size:
-                        ele_ele2 = re.sub(
-                            r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r"\1\n\2", ele_ele1
-                        )
-                        ele2_ls = ele_ele2.split("\n")
-                        for ele_ele2 in ele2_ls:
-                            if len(ele_ele2) > self.sentence_size:
-                                ele_ele3 = re.sub(
-                                    '( ["’”」』]{0,2})([^ ])', r"\1\n\2", ele_ele2
-                                )
-                                ele2_id = ele2_ls.index(ele_ele2)
-                                ele2_ls = (
-                                    ele2_ls[:ele2_id]
-                                    + [i for i in ele_ele3.split("\n") if i]
-                                    + ele2_ls[ele2_id + 1 :]
-                                )
-                        ele_id = ele1_ls.index(ele_ele1)
-                        ele1_ls = (
-                            ele1_ls[:ele_id]
-                            + [i for i in ele2_ls if i]
-                            + ele1_ls[ele_id + 1 :]
-                        )
-
-                id = ls.index(ele)
-                ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1 :]
-        return ls
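The deleted CHNDocumentSplitter splits Chinese text on sentence-ending punctuation and then recursively re-splits any piece that is still longer than sentence_size, first on commas, then on whitespace. A short sketch of calling it as defined above; the sample text is arbitrary:

# Uses the class exactly as deleted above (pre-move import path).
from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter

splitter = CHNDocumentSplitter(pdf=False, sentence_size=50)
pieces = splitter.split_text("数据库是什么?DB-GPT可以帮你查询数据。它还支持知识库问答!")
print(pieces)  # roughly one chunk per sentence; over-long sentences are re-split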
@@ -1,36 +0,0 @@
-from typing import Dict, List, Optional
-
-from langchain.document_loaders import CSVLoader
-from langchain.schema import Document
-
-from pilot.source_embedding import SourceEmbedding, register
-
-
-class CSVEmbedding(SourceEmbedding):
-    """csv embedding for read csv document."""
-
-    def __init__(
-        self,
-        file_path,
-        vector_store_config,
-        embedding_args: Optional[Dict] = None,
-    ):
-        """Initialize with csv path."""
-        super().__init__(file_path, vector_store_config)
-        self.file_path = file_path
-        self.vector_store_config = vector_store_config
-        self.embedding_args = embedding_args
-
-    @register
-    def read(self):
-        """Load from csv path."""
-        loader = CSVLoader(file_path=self.file_path)
-        return loader.load()
-
-    @register
-    def data_process(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            documents[i].page_content = d.page_content.replace("\n", "")
-            i += 1
-        return documents
@@ -1,60 +0,0 @@
-from typing import Optional
-
-from chromadb.errors import NotEnoughElementsException
-from langchain.embeddings import HuggingFaceEmbeddings
-
-from pilot.configs.config import Config
-from pilot.embedding_engine.knowledge_type import get_knowledge_embedding
-from pilot.vector_store.connector import VectorStoreConnector
-
-CFG = Config()
-
-
-class KnowledgeEmbedding:
-    def __init__(
-        self,
-        model_name,
-        vector_store_config,
-        file_type: Optional[str] = "default",
-        file_path: Optional[str] = None,
-    ):
-        """Initialize with Loader url, model_name, vector_store_config"""
-        self.file_path = file_path
-        self.model_name = model_name
-        self.vector_store_config = vector_store_config
-        self.file_type = file_type
-        self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
-        self.vector_store_config["embeddings"] = self.embeddings
-
-    def knowledge_embedding(self):
-        self.knowledge_embedding_client = self.init_knowledge_embedding()
-        self.knowledge_embedding_client.source_embedding()
-
-    def knowledge_embedding_batch(self, docs):
-        # docs = self.knowledge_embedding_client.read_batch()
-        self.knowledge_embedding_client.index_to_store(docs)
-
-    def read(self):
-        self.knowledge_embedding_client = self.init_knowledge_embedding()
-        return self.knowledge_embedding_client.read_batch()
-
-    def init_knowledge_embedding(self):
-        return get_knowledge_embedding(
-            self.file_type.upper(), self.file_path, self.vector_store_config
-        )
-
-    def similar_search(self, text, topk):
-        vector_client = VectorStoreConnector(
-            CFG.VECTOR_STORE_TYPE, self.vector_store_config
-        )
-        try:
-            ans = vector_client.similar_search(text, topk)
-        except NotEnoughElementsException:
-            ans = vector_client.similar_search(text, 1)
-        return ans
-
-    def vector_exist(self):
-        vector_client = VectorStoreConnector(
-            CFG.VECTOR_STORE_TYPE, self.vector_store_config
-        )
-        return vector_client.vector_name_exists()
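The KnowledgeEmbedding client deleted above (webserver.py now imports its replacement from pilot.embedding_engine.knowledge_embedding) ties a HuggingFace embedding model to a file-type-specific SourceEmbedding and to the configured vector store. A sketch of how it was typically driven; the model path and the config key are assumptions, not taken from this diff:

# Illustrative values only; the import is the pre-move path removed by this commit.
from pilot.source_embedding.knowledge_embedding import KnowledgeEmbedding

client = KnowledgeEmbedding(
    model_name="/models/text2vec-large-chinese",
    vector_store_config={"vector_store_name": "default_knowledge"},
    file_type="default",
    file_path="./docs/guide.md",
)
client.knowledge_embedding()                        # read -> data_process -> index into the store
print(client.similar_search("如何部署DB-GPT?", 5))  # top-5 chunks, fewer if the store is small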
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-import os
-from typing import List
-
-import markdown
-from bs4 import BeautifulSoup
-from langchain.schema import Document
-from langchain.text_splitter import SpacyTextSplitter
-
-from pilot.configs.config import Config
-from pilot.source_embedding import SourceEmbedding, register
-from pilot.source_embedding.EncodeTextLoader import EncodeTextLoader
-from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
-
-CFG = Config()
-
-
-class MarkdownEmbedding(SourceEmbedding):
-    """markdown embedding for read markdown document."""
-
-    def __init__(self, file_path, vector_store_config):
-        """Initialize with markdown path."""
-        super().__init__(file_path, vector_store_config)
-        self.file_path = file_path
-        self.vector_store_config = vector_store_config
-        # self.encoding = encoding
-
-    @register
-    def read(self):
-        """Load from markdown path."""
-        loader = EncodeTextLoader(self.file_path)
-        textsplitter = SpacyTextSplitter(
-            pipeline="zh_core_web_sm",
-            chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
-            chunk_overlap=100,
-        )
-        return loader.load_and_split(textsplitter)
-
-    @register
-    def data_process(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            content = markdown.markdown(d.page_content)
-            soup = BeautifulSoup(content, "html.parser")
-            for tag in soup(["!doctype", "meta", "i.fa"]):
-                tag.extract()
-            documents[i].page_content = soup.get_text()
-            documents[i].page_content = documents[i].page_content.replace("\n", " ")
-            i += 1
-        return documents
@@ -1,44 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-from typing import List
-
-from langchain.document_loaders import PyPDFLoader
-from langchain.schema import Document
-from langchain.text_splitter import SpacyTextSplitter
-
-from pilot.configs.config import Config
-from pilot.source_embedding import SourceEmbedding, register
-
-CFG = Config()
-
-
-class PDFEmbedding(SourceEmbedding):
-    """pdf embedding for read pdf document."""
-
-    def __init__(self, file_path, vector_store_config):
-        """Initialize with pdf path."""
-        super().__init__(file_path, vector_store_config)
-        self.file_path = file_path
-        self.vector_store_config = vector_store_config
-
-    @register
-    def read(self):
-        """Load from pdf path."""
-        loader = PyPDFLoader(self.file_path)
-        # textsplitter = CHNDocumentSplitter(
-        #     pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
-        # )
-        textsplitter = SpacyTextSplitter(
-            pipeline="zh_core_web_sm",
-            chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
-            chunk_overlap=100,
-        )
-        return loader.load_and_split(textsplitter)
-
-    @register
-    def data_process(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            documents[i].page_content = d.page_content.replace("\n", "")
-            i += 1
-        return documents
@@ -1,55 +0,0 @@
-"""Loader that loads image files."""
-import os
-from typing import List
-
-import fitz
-from langchain.document_loaders.unstructured import UnstructuredFileLoader
-from paddleocr import PaddleOCR
-
-
-class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load image files, such as PNGs and JPGs."""
-
-    def _get_elements(self) -> List:
-        def pdf_ocr_txt(filepath, dir_path="tmp_files"):
-            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
-            if not os.path.exists(full_dir_path):
-                os.makedirs(full_dir_path)
-            filename = os.path.split(filepath)[-1]
-            ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
-            doc = fitz.open(filepath)
-            txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
-            img_name = os.path.join(full_dir_path, ".tmp.png")
-            with open(txt_file_path, "w", encoding="utf-8") as fout:
-                for i in range(doc.page_count):
-                    page = doc[i]
-                    text = page.get_text("")
-                    fout.write(text)
-                    fout.write("\n")
-
-                    img_list = page.get_images()
-                    for img in img_list:
-                        pix = fitz.Pixmap(doc, img[0])
-
-                        pix.save(img_name)
-
-                        result = ocr.ocr(img_name)
-                        ocr_result = [i[1][0] for line in result for i in line]
-                        fout.write("\n".join(ocr_result))
-            os.remove(img_name)
-            return txt_file_path
-
-        txt_file_path = pdf_ocr_txt(self.file_path)
-        from unstructured.partition.text import partition_text
-
-        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
-
-
-if __name__ == "__main__":
-    filepath = os.path.join(
-        os.path.dirname(os.path.dirname(__file__)), "content", "samples", "test_py.pdf"
-    )
-    loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
-    docs = loader.load()
-    for doc in docs:
-        print(doc)
@@ -1,41 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-from typing import List
-
-from langchain.document_loaders import UnstructuredPowerPointLoader
-from langchain.schema import Document
-from langchain.text_splitter import SpacyTextSplitter
-
-from pilot.configs.config import Config
-from pilot.source_embedding import SourceEmbedding, register
-
-CFG = Config()
-
-
-class PPTEmbedding(SourceEmbedding):
-    """ppt embedding for read ppt document."""
-
-    def __init__(self, file_path, vector_store_config):
-        """Initialize with pdf path."""
-        super().__init__(file_path, vector_store_config)
-        self.file_path = file_path
-        self.vector_store_config = vector_store_config
-
-    @register
-    def read(self):
-        """Load from ppt path."""
-        loader = UnstructuredPowerPointLoader(self.file_path)
-        textsplitter = SpacyTextSplitter(
-            pipeline="zh_core_web_sm",
-            chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
-            chunk_overlap=200,
-        )
-        return loader.load_and_split(textsplitter)
-
-    @register
-    def data_process(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            documents[i].page_content = d.page_content.replace("\n", "")
-            i += 1
-        return documents
@@ -1,61 +0,0 @@
-# from langchain.embeddings import HuggingFaceEmbeddings
-# from langchain.vectorstores import Milvus
-# from pymilvus import Collection,utility
-# from pymilvus import connections, DataType, FieldSchema, CollectionSchema
-#
-# # milvus = connections.connect(
-# #   alias="default",
-# #   host='localhost',
-# #   port="19530"
-# # )
-# # collection = Collection("book")
-#
-#
-# # Get an existing collection.
-# # collection.load()
-# #
-# # search_params = {"metric_type": "L2", "params": {}, "offset": 5}
-# #
-# # results = collection.search(
-# #     data=[[0.1, 0.2]],
-# #     anns_field="book_intro",
-# #     param=search_params,
-# #     limit=10,
-# #     expr=None,
-# #     output_fields=['book_id'],
-# #     consistency_level="Strong"
-# # )
-# #
-# # # get the IDs of all returned hits
-# # results[0].ids
-# #
-# # # get the distances to the query vector from all returned hits
-# # results[0].distances
-# #
-# # # get the value of an output field specified in the search request.
-# # # vector fields are not supported yet.
-# # hit = results[0][0]
-# # hit.entity.get('title')
-#
-# # milvus = connections.connect(
-# #   alias="default",
-# #   host='localhost',
-# #   port="19530"
-# # )
-# from pilot.vector_store.milvus_store import MilvusStore
-#
-# data = ["aaa", "bbb"]
-# model_name = "xx/all-MiniLM-L6-v2"
-# embeddings = HuggingFaceEmbeddings(model_name=model_name)
-#
-# # text_embeddings = Text2Vectors()
-# mivuls = MilvusStore(cfg={"url": "127.0.0.1", "port": "19530", "alias": "default", "table_name": "test_k"})
-#
-# mivuls.insert(["textc","tezt2"])
-# print("success")
-# ct
-# # mivuls.from_texts(texts=data, embedding=embeddings)
-# #     docs,
-# #     embedding=embeddings,
-# #     connection_args={"host": "127.0.0.1", "port": "19530", "alias": "default"}
-# # )
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-from abc import ABC, abstractmethod
-from typing import Dict, List, Optional
-
-from chromadb.errors import NotEnoughElementsException
-from pilot.configs.config import Config
-from pilot.vector_store.connector import VectorStoreConnector
-
-registered_methods = []
-CFG = Config()
-
-
-def register(method):
-    registered_methods.append(method.__name__)
-    return method
-
-
-class SourceEmbedding(ABC):
-    """base class for read data source embedding pipeline.
-    include data read, data process, data split, data to vector, data index vector store
-    Implementations should implement the method
-    """
-
-    def __init__(
-        self,
-        file_path,
-        vector_store_config,
-        embedding_args: Optional[Dict] = None,
-    ):
-        """Initialize with Loader url, model_name, vector_store_config"""
-        self.file_path = file_path
-        self.vector_store_config = vector_store_config
-        self.embedding_args = embedding_args
-        self.embeddings = vector_store_config["embeddings"]
-
-    @abstractmethod
-    @register
-    def read(self) -> List[ABC]:
-        """read datasource into document objects."""
-
-    @register
-    def data_process(self, text):
-        """pre process data."""
-
-    @register
-    def text_split(self, text):
-        """text split chunk"""
-        pass
-
-    @register
-    def text_to_vector(self, docs):
-        """transform vector"""
-        pass
-
-    @register
-    def index_to_store(self, docs):
-        """index to vector store"""
-        self.vector_client = VectorStoreConnector(
-            CFG.VECTOR_STORE_TYPE, self.vector_store_config
-        )
-        self.vector_client.load_document(docs)
-
-    @register
-    def similar_search(self, doc, topk):
-        """vector store similarity_search"""
-        self.vector_client = VectorStoreConnector(
-            CFG.VECTOR_STORE_TYPE, self.vector_store_config
-        )
-        try:
-            ans = self.vector_client.similar_search(doc, topk)
-        except NotEnoughElementsException:
-            ans = self.vector_client.similar_search(doc, 1)
-        return ans
-
-    def vector_name_exist(self):
-        self.vector_client = VectorStoreConnector(
-            CFG.VECTOR_STORE_TYPE, self.vector_store_config
-        )
-        return self.vector_client.vector_name_exists()
-
-    def source_embedding(self):
-        if "read" in registered_methods:
-            text = self.read()
-        if "data_process" in registered_methods:
-            text = self.data_process(text)
-        if "text_split" in registered_methods:
-            self.text_split(text)
-        if "text_to_vector" in registered_methods:
-            self.text_to_vector(text)
-        if "index_to_store" in registered_methods:
-            self.index_to_store(text)
-
-    def read_batch(self):
-        if "read" in registered_methods:
-            text = self.read()
-        if "data_process" in registered_methods:
-            text = self.data_process(text)
-        if "text_split" in registered_methods:
-            self.text_split(text)
-        return text
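Two details of the deleted base class are easy to miss: @register records only the method name in a module-level list at import time, and source_embedding()/read_batch() then run whichever pipeline steps appear in that list. A self-contained sketch of that registration idea, stripped of the vector-store parts; every name here is illustrative and independent of the pilot package:

registered_methods = []

def register(method):
    # Same trick as above: remember the *name* at class-definition time.
    registered_methods.append(method.__name__)
    return method

class TinySource:
    def __init__(self, text):
        self.text = text

    @register
    def read(self):
        return [self.text]

    @register
    def data_process(self, docs):
        return [d.strip() for d in docs]

    def run(self):
        # Mirrors source_embedding(): only registered steps execute.
        docs = self.read() if "read" in registered_methods else []
        if "data_process" in registered_methods:
            docs = self.data_process(docs)
        return docs

print(TinySource("  hello DB-GPT  ").run())  # ['hello DB-GPT']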
@@ -1,29 +0,0 @@
-from typing import List
-
-from langchain.schema import Document
-
-from pilot import SourceEmbedding, register
-
-
-class StringEmbedding(SourceEmbedding):
-    """string embedding for read string document."""
-
-    def __init__(self, file_path, vector_store_config):
-        """Initialize with pdf path."""
-        super().__init__(file_path, vector_store_config)
-        self.file_path = file_path
-        self.vector_store_config = vector_store_config
-
-    @register
-    def read(self):
-        """Load from String path."""
-        metadata = {"source": "db_summary"}
-        return [Document(page_content=self.file_path, metadata=metadata)]
-
-    @register
-    def data_process(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            documents[i].page_content = d.page_content.replace("\n", "")
-            i += 1
-        return documents
@@ -1,49 +0,0 @@
-from typing import List
-
-from bs4 import BeautifulSoup
-from langchain.document_loaders import WebBaseLoader
-from langchain.schema import Document
-from langchain.text_splitter import CharacterTextSplitter
-
-from pilot.configs.config import Config
-from pilot.configs.model_config import KNOWLEDGE_CHUNK_SPLIT_SIZE
-from pilot.source_embedding import SourceEmbedding, register
-from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
-
-CFG = Config()
-
-
-class URLEmbedding(SourceEmbedding):
-    """url embedding for read url document."""
-
-    def __init__(self, file_path, vector_store_config):
-        """Initialize with url path."""
-        super().__init__(file_path, vector_store_config)
-        self.file_path = file_path
-        self.vector_store_config = vector_store_config
-
-    @register
-    def read(self):
-        """Load from url path."""
-        loader = WebBaseLoader(web_path=self.file_path)
-        if CFG.LANGUAGE == "en":
-            text_splitter = CharacterTextSplitter(
-                chunk_size=CFG.KNOWLEDGE_CHUNK_SIZE,
-                chunk_overlap=20,
-                length_function=len,
-            )
-        else:
-            text_splitter = CHNDocumentSplitter(pdf=True, sentence_size=1000)
-        return loader.load_and_split(text_splitter)
-
-    @register
-    def data_process(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            content = d.page_content.replace("\n", "")
-            soup = BeautifulSoup(content, "html.parser")
-            for tag in soup(["!doctype", "meta"]):
-                tag.extract()
-            documents[i].page_content = soup.get_text()
-            i += 1
-        return documents
@@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-from typing import List
-
-from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
-from langchain.schema import Document
-
-from pilot.configs.config import Config
-from pilot.source_embedding import SourceEmbedding, register
-from pilot.source_embedding.chn_document_splitter import CHNDocumentSplitter
-
-CFG = Config()
-
-
-class WordEmbedding(SourceEmbedding):
-    """word embedding for read word document."""
-
-    def __init__(self, file_path, vector_store_config):
-        """Initialize with word path."""
-        super().__init__(file_path, vector_store_config)
-        self.file_path = file_path
-        self.vector_store_config = vector_store_config
-
-    @register
-    def read(self):
-        """Load from word path."""
-        loader = UnstructuredWordDocumentLoader(self.file_path)
-        textsplitter = CHNDocumentSplitter(
-            pdf=True, sentence_size=CFG.KNOWLEDGE_CHUNK_SIZE
-        )
-        return loader.load_and_split(textsplitter)
-
-    @register
-    def data_process(self, documents: List[Document]):
-        i = 0
-        for d in documents:
-            documents[i].page_content = d.page_content.replace("\n", "")
-            i += 1
-        return documents
@@ -3,12 +3,12 @@
 import argparse
 import os
 import sys
 
+sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+
 from pilot.embedding_engine.knowledge_type import KnowledgeType
 from pilot.openapi.knowledge.knowledge_service import KnowledgeService
 from pilot.openapi.knowledge.request.knowledge_request import KnowledgeSpaceRequest
 
-sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
-
 from pilot.configs.config import Config
 from pilot.configs.model_config import (
@@ -49,6 +49,7 @@ class LocalKnowledgeInit:
         space = KnowledgeSpaceRequest
         space.name = self.vector_store_config["vector_store_name"]
         space.desc = "knowledge_init.py"
+        space.vector_type = CFG.VECTOR_STORE_TYPE
         space.owner = "DB-GPT"
         knowledge_space_service.create_knowledge_space(space)
 