DB-GPT/pilot/vector_store/file_loader.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os

from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

from pilot.configs.model_config import (
    DATASETS_DIR,
    LLM_MODEL_CONFIG,
    VECTORE_PATH,
)


class KnownLedge2Vector:
    """Load documents into embeddings and persist them to a vector store.

    Args:
        - model_name: name of the sentence-transformers embedding model.

    Usage:
        k2v = KnownLedge2Vector()
        persist_dir = os.path.join(VECTORE_PATH, ".vectordb")
        print(persist_dir)
        for s, dc in k2v.query("what is oceanbase?"):
            print(s, dc.page_content, dc.metadata)
    """

    embeddings: object = None
    model_name = LLM_MODEL_CONFIG["sentence-transforms"]
    top_k: int = 10  # number of hits returned by similarity search

    def __init__(self, model_name=None) -> None:
        if not model_name:
            # use the default embedding model
            self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
        else:
            self.embeddings = HuggingFaceEmbeddings(model_name=model_name)

    def init_vector_store(self):
        persist_dir = os.path.join(VECTORE_PATH, ".vectordb")
        print("Vector store persist directory: ", persist_dir)
        if os.path.exists(persist_dir):
            # Load the previously persisted vector store from disk.
            print("Loading data from the local persisted vector store...")
            vector_store = Chroma(
                persist_directory=persist_dir, embedding_function=self.embeddings
            )
            # vector_store.add_documents(documents=documents)
        else:
            # Build the vector store from the documents under DATASETS_DIR.
            documents = self.load_knowledge()
            vector_store = Chroma.from_documents(
                documents=documents,
                embedding=self.embeddings,
                persist_directory=persist_dir,
            )
            vector_store.persist()
        return vector_store

    def load_knowledge(self):
        documents = []
        for root, _, files in os.walk(DATASETS_DIR, topdown=False):
            for file in files:
                filename = os.path.join(root, file)
                docs = self._load_file(filename)
                # Rewrite metadata so sources are relative to DATASETS_DIR.
                new_docs = []
                for doc in docs:
                    doc.metadata = {
                        "source": doc.metadata["source"].replace(DATASETS_DIR, "")
                    }
                    print("Vectorizing documents, please wait...", doc.metadata)
                    new_docs.append(doc)
                documents += new_docs
        return documents

    def _load_file(self, filename):
        # Choose a loader based on the file type.
        if filename.lower().endswith(".pdf"):
            loader = UnstructuredFileLoader(filename)
            text_splitter = CharacterTextSplitter()
            docs = loader.load_and_split(text_splitter)
        else:
            loader = UnstructuredFileLoader(filename, mode="elements")
            text_splitter = CharacterTextSplitter()
            docs = loader.load_and_split(text_splitter)
        return docs

    def _load_from_url(self, url):
        """Load data from a URL. Not implemented yet."""
        pass

    def query(self, q):
        """Query similar documents from the vector store."""
        vector_store = self.init_vector_store()
        docs = vector_store.similarity_search_with_score(q, k=self.top_k)
        for doc in docs:
            dc, s = doc
            yield s, dc
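

# A minimal usage sketch, mirroring the class docstring. It assumes
# DATASETS_DIR contains documents and that the configured embedding
# model is available locally; the question string is illustrative.
if __name__ == "__main__":
    k2v = KnownLedge2Vector()
    # Chroma returns distance scores: lower means more similar.
    for score, doc in k2v.query("what is oceanbase?"):
        print(score, doc.page_content, doc.metadata)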