diff --git a/templates/rag-mongo/README.md b/templates/rag-mongo/README.md index 0ff7f79978a..92341f91e7c 100644 --- a/templates/rag-mongo/README.md +++ b/templates/rag-mongo/README.md @@ -5,11 +5,13 @@ This template performs RAG using MongoDB and OpenAI. ## Environment Setup -The environment variables that need to be set are: +You should export two environment variables, one being your MongoDB URI, the other being your OpenAI API key. +If you do not have a MongoDB URI, see the `MongoDB Setup` section at the bottom for instructions on how to get one. -Set the `MONGO_URI` for connecting to MongoDB Atlas Vector Search. - -Set the `OPENAI_API_KEY` environment variable to access the OpenAI models. +```shell +export MONGO_URI=... +export OPENAI_API_KEY=... +``` ## Usage @@ -50,6 +52,10 @@ export LANGCHAIN_API_KEY= export LANGCHAIN_PROJECT= # if not specified, defaults to "default" ``` +If you DO NOT already have a MongoDB Search index you want to connect to, see the `MongoDB Setup` section below before proceeding. + +If you DO have a MongoDB Search index you want to connect to, edit the connection details in `rag_mongo/chain.py`. + If you are inside this directory, then you can spin up a LangServe instance directly by: ```shell @@ -72,3 +78,87 @@ runnable = RemoteRunnable("http://localhost:8000/rag-mongo") For additional context, please refer to [this notebook](https://colab.research.google.com/drive/1cr2HBAHyBmwKUerJq2if0JaNhy-hIq7I#scrollTo=TZp7_CBfxTOB). + +## MongoDB Setup + +Use this step if you need to set up your MongoDB account and ingest data. +We will first follow the standard MongoDB Atlas setup instructions [here](https://www.mongodb.com/docs/atlas/getting-started/). + +1. Create an account (if not already done) +2. Create a new project (if not already done) +3. Locate your MongoDB URI. 
+ +This can be done by going to the deployment overview page and connecting to your database + +![connect.png](_images/connect.png) + +We then look at the drivers available + +![driver.png](_images/driver.png) + +Among which we will see our URI listed + +![uri.png](_images/uri.png) + +Let's then set that as an environment variable locally: + +```shell +export MONGO_URI=... +``` + +4. Let's also set an environment variable for OpenAI (which we will use as an LLM) + +```shell +export OPENAI_API_KEY=... +``` + +5. Let's now ingest some data! We can do that by moving into this directory and running the code in `ingest.py`, e.g.: + +```shell +python ingest.py +``` + +Note that you can (and should!) change this to ingest data of your choice. + +6. We now need to set up a vector index on our data. + +We can first connect to the cluster where our database lives + +![cluster.png](_images/cluster.png) + +We can then navigate to where all our collections are listed + +![collections.png](_images/collections.png) + +We can then find the collection we want and look at the search indexes for that collection + +![search-indexes.png](_images/search-indexes.png) + +That should likely be empty, and we want to create a new one: + +![create.png](_images/create.png) + +We will use the JSON editor to create it + +![json_editor.png](_images/json_editor.png) + +And we will paste the following JSON in: + +```text + { + "mappings": { + "dynamic": true, + "fields": { + "embedding": { + "dimensions": 1536, + "similarity": "cosine", + "type": "knnVector" + } + } + } + } +``` +![json.png](_images/json.png) + +From there, hit "Next" and then "Create Search Index". It will take a little bit but you should then have an index over your data! 
+ diff --git a/templates/rag-mongo/_images/cluster.png b/templates/rag-mongo/_images/cluster.png new file mode 100644 index 00000000000..e94a4a1b772 Binary files /dev/null and b/templates/rag-mongo/_images/cluster.png differ diff --git a/templates/rag-mongo/_images/collections.png b/templates/rag-mongo/_images/collections.png new file mode 100644 index 00000000000..40f6cd2cdef Binary files /dev/null and b/templates/rag-mongo/_images/collections.png differ diff --git a/templates/rag-mongo/_images/connect.png b/templates/rag-mongo/_images/connect.png new file mode 100644 index 00000000000..6460d9863e1 Binary files /dev/null and b/templates/rag-mongo/_images/connect.png differ diff --git a/templates/rag-mongo/_images/create.png b/templates/rag-mongo/_images/create.png new file mode 100644 index 00000000000..29b501de07f Binary files /dev/null and b/templates/rag-mongo/_images/create.png differ diff --git a/templates/rag-mongo/_images/driver.png b/templates/rag-mongo/_images/driver.png new file mode 100644 index 00000000000..98f9276bed8 Binary files /dev/null and b/templates/rag-mongo/_images/driver.png differ diff --git a/templates/rag-mongo/_images/json.png b/templates/rag-mongo/_images/json.png new file mode 100644 index 00000000000..635a2e0c8f3 Binary files /dev/null and b/templates/rag-mongo/_images/json.png differ diff --git a/templates/rag-mongo/_images/json_editor.png b/templates/rag-mongo/_images/json_editor.png new file mode 100644 index 00000000000..47f69c57d41 Binary files /dev/null and b/templates/rag-mongo/_images/json_editor.png differ diff --git a/templates/rag-mongo/_images/search-indexes.png b/templates/rag-mongo/_images/search-indexes.png new file mode 100644 index 00000000000..aba1b86682d Binary files /dev/null and b/templates/rag-mongo/_images/search-indexes.png differ diff --git a/templates/rag-mongo/_images/uri.png b/templates/rag-mongo/_images/uri.png new file mode 100644 index 00000000000..958db8d8ca5 Binary files /dev/null and 
b/templates/rag-mongo/_images/uri.png differ diff --git a/templates/rag-mongo/ingest.py b/templates/rag-mongo/ingest.py new file mode 100644 index 00000000000..a69b7d7681e --- /dev/null +++ b/templates/rag-mongo/ingest.py @@ -0,0 +1,35 @@ +import os + +from langchain.document_loaders import PyPDFLoader +from langchain.embeddings import OpenAIEmbeddings +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.vectorstores import MongoDBAtlasVectorSearch +from pymongo import MongoClient + +MONGO_URI = os.environ["MONGO_URI"] + +# Note that if you change this, you also need to change it in `rag_mongo/chain.py` +DB_NAME = "langchain-test-2" +COLLECTION_NAME = "test" +ATLAS_VECTOR_SEARCH_INDEX_NAME = "default" +EMBEDDING_FIELD_NAME = "embedding" +client = MongoClient(MONGO_URI) +db = client[DB_NAME] +MONGODB_COLLECTION = db[COLLECTION_NAME] + +if __name__ == "__main__": + # Load docs + loader = PyPDFLoader("https://arxiv.org/pdf/2303.08774.pdf") + data = loader.load() + + # Split docs + text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) + docs = text_splitter.split_documents(data) + + # Insert the documents in MongoDB Atlas Vector Search + _ = MongoDBAtlasVectorSearch.from_documents( + documents=docs, + embedding=OpenAIEmbeddings(disallowed_special=()), + collection=MONGODB_COLLECTION, + index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME, + ) diff --git a/templates/rag-mongo/rag_mongo/chain.py b/templates/rag-mongo/rag_mongo/chain.py index 454fd502d32..455c4a3421f 100644 --- a/templates/rag-mongo/rag_mongo/chain.py +++ b/templates/rag-mongo/rag_mongo/chain.py @@ -22,29 +22,6 @@ client = MongoClient(MONGO_URI) db = client[DB_NAME] MONGODB_COLLECTION = db[COLLECTION_NAME] -### Ingest code - you may need to run this the first time -""" -# Load -from langchain.document_loaders import WebBaseLoader -loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/") -data = loader.load() - -# Split -from 
langchain.text_splitter import RecursiveCharacterTextSplitter -text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) -all_splits = text_splitter.split_documents(data) - -# Add to vectorDB -# Insert the documents in MongoDB Atlas Vector Search -vectorstore = MongoDBAtlasVectorSearch.from_documents( - documents=all_splits, - embedding=OpenAIEmbeddings(disallowed_special=()), - collection=MONGODB_COLLECTION, - index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME - ) -retriever = vectorstore.as_retriever() -""" - # Read from MongoDB Atlas Vector Search vectorstore = MongoDBAtlasVectorSearch.from_connection_string( MONGO_URI,