.env + LlamaCpp + PDF/CSV + Ingest All

.env

Added a .env file to make configuration easier.
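For reference, a minimal .env along these lines is what the new code reads. The variable names match those used in ingest.py and in this commit message; the values are only illustrative (the model path is the one the old hard-coded version used):

    PERSIST_DIRECTORY=db
    MODEL_TYPE=LlamaCpp
    LLAMA_EMBEDDINGS_MODEL=./models/ggml-model-q4_0.bin
    MODEL_N_CTX=1000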

LlamaCpp

Added support for LlamaCpp models, selected via MODEL_TYPE=LlamaCpp in .env.
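The embedding side of this is visible in the diff below. On the LLM side, a minimal sketch of how MODEL_TYPE could be branched on, assuming LangChain's llms.LlamaCpp wrapper and a hypothetical MODEL_PATH variable (neither appears in this file's diff):

    import os
    from langchain.llms import LlamaCpp

    model_type = os.environ.get('MODEL_TYPE')
    if model_type == "LlamaCpp":
        # environment variables come back as strings, so n_ctx needs a cast
        llm = LlamaCpp(model_path=os.environ.get('MODEL_PATH'),
                       n_ctx=int(os.environ.get('MODEL_N_CTX')))
    else:
        raise ValueError(f"Unsupported MODEL_TYPE: {model_type}")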

PDF/CSV

Added support for PDF and CSV files.
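Both loaders live in langchain.document_loaders and return the same Document objects as TextLoader, so they drop straight into the existing split/embed pipeline. A quick sketch with placeholder file names:

    from langchain.document_loaders import PDFMinerLoader, CSVLoader

    # report.pdf and table.csv are hypothetical example files
    pdf_docs = PDFMinerLoader("source_documents/report.pdf").load()  # whole PDF as one Document
    csv_docs = CSVLoader("source_documents/table.csv").load()        # one Document per row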

Ingest All

When running ingest, every file in source_documents is automatically loaded according to its file type and stored in the vector store; a path argument is no longer needed.
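In other words, usage goes from python ingest.py <path-to-file> to simply:

    python ingest.py

with everything to be ingested dropped into source_documents/ beforehand.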
alxspiker
2023-05-11 14:24:39 -06:00
parent 60225698b6
commit 52ae6c0866
4 changed files with 49 additions and 18 deletions

ingest.py

@@ -1,19 +1,28 @@
-from langchain.document_loaders import TextLoader
+import os
+from langchain.document_loaders import TextLoader, PDFMinerLoader, CSVLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings
-from sys import argv
 
 def main():
+    llama_embeddings_model = os.environ.get('LLAMA_EMBEDDINGS_MODEL')
+    persist_directory = os.environ.get('PERSIST_DIRECTORY')
+    model_n_ctx = os.environ.get('MODEL_N_CTX')
     # Load document and split in chunks
-    loader = TextLoader(argv[1], encoding="utf8")
+    for root, dirs, files in os.walk("source_documents"):
+        for file in files:
+            if file.endswith(".txt"):
+                loader = TextLoader(os.path.join(root, file), encoding="utf8")
+            elif file.endswith(".pdf"):
+                loader = PDFMinerLoader(os.path.join(root, file))
+            elif file.endswith(".csv"):
+                loader = CSVLoader(os.path.join(root, file))
     documents = loader.load()
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     texts = text_splitter.split_documents(documents)
     # Create embeddings
-    llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")
+    llama = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx)
     # Create and store locally vectorstore
-    persist_directory = 'db'
     db = Chroma.from_documents(texts, llama, persist_directory=persist_directory)
     db.persist()
     db = None