From 23d24c88e938067191896edbcc4a31eba4955f95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Iv=C3=A1n=20Mart=C3=ADnez?=
Date: Wed, 17 May 2023 00:32:41 +0200
Subject: [PATCH] Update code to use sentence-transformers through
 HuggingFaceEmbeddings

---
 example.env   |  2 +-
 ingest.py     | 15 ++++++++-------
 privateGPT.py |  8 ++++----
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/example.env b/example.env
index 149eca2e..82907845 100644
--- a/example.env
+++ b/example.env
@@ -1,5 +1,5 @@
 PERSIST_DIRECTORY=db
-LLAMA_EMBEDDINGS_MODEL=models/ggml-model-q4_0.bin
 MODEL_TYPE=GPT4All
 MODEL_PATH=models/ggml-gpt4all-j-v1.3-groovy.bin
+EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
 MODEL_N_CTX=1000
\ No newline at end of file
diff --git a/ingest.py b/ingest.py
index 4c955862..2c703623 100644
--- a/ingest.py
+++ b/ingest.py
@@ -6,7 +6,7 @@ from dotenv import load_dotenv
 from langchain.document_loaders import TextLoader, PDFMinerLoader, CSVLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
-from langchain.embeddings import LlamaCppEmbeddings
+from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 from constants import CHROMA_SETTINGS
 
@@ -38,22 +38,23 @@ def main():
     # Load environment variables
     persist_directory = os.environ.get('PERSIST_DIRECTORY')
     source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
-    llama_embeddings_model = os.environ.get('LLAMA_EMBEDDINGS_MODEL')
-    model_n_ctx = os.environ.get('MODEL_N_CTX')
+    embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
 
     # Load documents and split in chunks
     print(f"Loading documents from {source_directory}")
+    chunk_size = 500
+    chunk_overlap = 50
     documents = load_documents(source_directory)
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     texts = text_splitter.split_documents(documents)
     print(f"Loaded {len(documents)} documents from {source_directory}")
-    print(f"Split into {len(texts)} chunks of text (max. 500 tokens each)")
+    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
 
     # Create embeddings
-    llama = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx)
+    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
 
     # Create and store locally vectorstore
-    db = Chroma.from_documents(texts, llama, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
+    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
     db.persist()
     db = None
 
diff --git a/privateGPT.py b/privateGPT.py
index 4c603a27..ae08bb93 100644
--- a/privateGPT.py
+++ b/privateGPT.py
@@ -1,6 +1,6 @@
 from dotenv import load_dotenv
 from langchain.chains import RetrievalQA
-from langchain.embeddings import LlamaCppEmbeddings
+from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.vectorstores import Chroma
 from langchain.llms import GPT4All, LlamaCpp
@@ -8,7 +8,7 @@ import os
 
 load_dotenv()
 
-llama_embeddings_model = os.environ.get("LLAMA_EMBEDDINGS_MODEL")
+embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
 persist_directory = os.environ.get('PERSIST_DIRECTORY')
 
 model_type = os.environ.get('MODEL_TYPE')
@@ -18,8 +18,8 @@ model_n_ctx = os.environ.get('MODEL_N_CTX')
 from constants import CHROMA_SETTINGS
 
 def main():
-    llama = LlamaCppEmbeddings(model_path=llama_embeddings_model, n_ctx=model_n_ctx)
-    db = Chroma(persist_directory=persist_directory, embedding_function=llama, client_settings=CHROMA_SETTINGS)
+    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
     retriever = db.as_retriever()
     # Prepare the LLM
     callbacks = [StreamingStdOutCallbackHandler()]
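
For anyone testing this patch locally, below is a minimal sketch of the embedding path it introduces, using only the calls that appear in the diff. Assumptions: langchain, chromadb, and sentence-transformers are installed; "all-MiniLM-L6-v2" is the default from the new EMBEDDINGS_MODEL_NAME in example.env; the two toy documents and the "db" directory are illustrative, and CHROMA_SETTINGS from the repo's constants.py is omitted so the snippet stays self-contained (Chroma then uses its default client settings).

# Reviewer sketch, not part of the patch: exercises the new
# sentence-transformers embedding path end to end.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document

# Same default model as the new EMBEDDINGS_MODEL_NAME in example.env;
# downloaded from the Hugging Face Hub on first use.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Two toy documents stand in for the output of load_documents().
texts = [
    Document(page_content="privateGPT ingests local files into a Chroma index."),
    Document(page_content="Embeddings are now computed by sentence-transformers."),
]

# Ingest side (mirrors ingest.py): embed the chunks and persist the vectorstore.
db = Chroma.from_documents(texts, embeddings, persist_directory="db")
db.persist()

# Query side (mirrors privateGPT.py): reopen the store and retrieve by similarity.
db = Chroma(persist_directory="db", embedding_function=embeddings)
retriever = db.as_retriever()
print(retriever.get_relevant_documents("How are embeddings computed now?"))

A practical consequence of the switch: embeddings no longer require a local GGML model file or an n_ctx setting, since HuggingFaceEmbeddings fetches the named sentence-transformers model automatically and handles input length itself.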