update mongo template (#12838)

2025-08-18 17:11:25 +00:00 · 2023-11-03 10:31:53 -07:00 · 2023-11-03 10:31:53 -07:00 · 523e5803bb
commit 523e5803bb
parent 18005c6384
12 changed files with 129 additions and 27 deletions
--- a/templates/rag-mongo/README.md
+++ b/templates/rag-mongo/README.md
@ -5,11 +5,13 @@ This template performs RAG using MongoDB and OpenAI.

 ## Environment Setup

-The environment variables that need to be set are:
+You should export two environment variables, one being your MongoDB URI, the other being your OpenAI API key.
+If you do not have a MongoDB URI, see the `MongoDB Setup` section at the bottom for instructions on how to obtain one.

-Set the `MONGO_URI` for connecting to MongoDB Atlas Vector Search.
-
-Set the `OPENAI_API_KEY` environment variable to access the OpenAI models.
+```shell
+export MONGO_URI=...
+export OPENAI_API_KEY=...
+```

 ## Usage

@ -50,6 +52,10 @@ export LANGCHAIN_API_KEY=<your-api-key>
 export LANGCHAIN_PROJECT=<your-project>  # if not specified, defaults to "default"
 ```

+If you DO NOT already have a MongoDB Search Index you want to connect to, see the `MongoDB Setup` section below before proceeding.
+
+If you DO have a MongoDB Search Index you want to connect to, edit the connection details in `rag_mongo/chain.py`.
+
 If you are inside this directory, then you can spin up a LangServe instance directly by:

 ```shell
@ -72,3 +78,87 @@ runnable = RemoteRunnable("http://localhost:8000/rag-mongo")

 For additional context, please refer to [this notebook](https://colab.research.google.com/drive/1cr2HBAHyBmwKUerJq2if0JaNhy-hIq7I#scrollTo=TZp7_CBfxTOB).

+
+## MongoDB Setup
+
+Use this step if you need to set up your MongoDB account and ingest data.
+We will first follow the standard MongoDB Atlas setup instructions [here](https://www.mongodb.com/docs/atlas/getting-started/).
+
+1. Create an account (if not already done)
+2. Create a new project (if not already done)
+3. Locate your MongoDB URI.
+
+This can be done by going to the deployment overview page and connecting to your database.
+
+![connect.png](_images/connect.png)
+
+We then look at the drivers available
+
+![driver.png](_images/driver.png)
+
+Among which we will see our URI listed
+
+![uri.png](_images/uri.png)
+
+Let's then set that as an environment variable locally:
+
+```shell
+export MONGO_URI=...
+```
+
+4. Let's also set an environment variable for OpenAI (which we will use as an LLM)
+
+```shell
+export OPENAI_API_KEY=...
+```
+
+5. Let's now ingest some data! We can do that by moving into this directory and running the code in `ingest.py`, e.g.:
+
+```shell
+python ingest.py
+```
+
+Note that you can (and should!) change this to ingest data of your choice
+
+6. We now need to set up a vector index on our data.
+
+We can first connect to the cluster where our database lives
+
+![cluster.png](_images/cluster.png)
+
+We can then navigate to where all our collections are listed
+
+![collections.png](_images/collections.png)
+
+We can then find the collection we want and look at the search indexes for that collection
+
+![search-indexes.png](_images/search-indexes.png)
+
+That should likely be empty, and we want to create a new one:
+
+![create.png](_images/create.png)
+
+We will use the JSON editor to create it
+
+![json_editor.png](_images/json_editor.png)
+
+And we will paste the following JSON in:
+
+```json
+ {
+   "mappings": {
+     "dynamic": true,
+     "fields": {
+       "embedding": {
+         "dimensions": 1536,
+         "similarity": "cosine",
+         "type": "knnVector"
+       }
+     }
+   }
+ }
+```
+![json.png](_images/json.png)
+
+From there, hit "Next" and then "Create Search Index". It will take a little bit but you should then have an index over your data!
+
--- a/templates/rag-mongo/_images/cluster.png
+++ b/templates/rag-mongo/_images/cluster.png
--- a/templates/rag-mongo/_images/collections.png
+++ b/templates/rag-mongo/_images/collections.png
--- a/templates/rag-mongo/_images/connect.png
+++ b/templates/rag-mongo/_images/connect.png
--- a/templates/rag-mongo/_images/create.png
+++ b/templates/rag-mongo/_images/create.png
--- a/templates/rag-mongo/_images/driver.png
+++ b/templates/rag-mongo/_images/driver.png
--- a/templates/rag-mongo/_images/json.png
+++ b/templates/rag-mongo/_images/json.png
--- a/templates/rag-mongo/_images/json_editor.png
+++ b/templates/rag-mongo/_images/json_editor.png
--- a/templates/rag-mongo/_images/search-indexes.png
+++ b/templates/rag-mongo/_images/search-indexes.png
--- a/templates/rag-mongo/_images/uri.png
+++ b/templates/rag-mongo/_images/uri.png
--- a/templates/rag-mongo/ingest.py
+++ b/templates/rag-mongo/ingest.py
@ -0,0 +1,35 @@
+import os
+
+from langchain.document_loaders import PyPDFLoader
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import MongoDBAtlasVectorSearch
+from pymongo import MongoClient
+
+MONGO_URI = os.environ["MONGO_URI"]
+
+# Note that if you change this, you also need to change it in `rag_mongo/chain.py`
+DB_NAME = "langchain-test-2"
+COLLECTION_NAME = "test"
+ATLAS_VECTOR_SEARCH_INDEX_NAME = "default"
+EMBEDDING_FIELD_NAME = "embedding"
+client = MongoClient(MONGO_URI)
+db = client[DB_NAME]
+MONGODB_COLLECTION = db[COLLECTION_NAME]
+
+if __name__ == "__main__":
+    # Load docs
+    loader = PyPDFLoader("https://arxiv.org/pdf/2303.08774.pdf")
+    data = loader.load()
+
+    # Split docs
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
+    docs = text_splitter.split_documents(data)
+
+    # Insert the documents in MongoDB Atlas Vector Search
+    _ = MongoDBAtlasVectorSearch.from_documents(
+        documents=docs,
+        embedding=OpenAIEmbeddings(disallowed_special=()),
+        collection=MONGODB_COLLECTION,
+        index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
+    )
--- a/templates/rag-mongo/rag_mongo/chain.py
+++ b/templates/rag-mongo/rag_mongo/chain.py
@ -22,29 +22,6 @@ client = MongoClient(MONGO_URI)
 db = client[DB_NAME]
 MONGODB_COLLECTION = db[COLLECTION_NAME]

-### Ingest code - you may need to run this the first time
-""" 
-# Load
-from langchain.document_loaders import WebBaseLoader
-loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
-data = loader.load()
-
-# Split
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
-all_splits = text_splitter.split_documents(data)
-
-# Add to vectorDB
-# Insert the documents in MongoDB Atlas Vector Search
-vectorstore = MongoDBAtlasVectorSearch.from_documents(
-     documents=all_splits, 
-     embedding=OpenAIEmbeddings(disallowed_special=()), 
-     collection=MONGODB_COLLECTION, 
-     index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME
- )
-retriever = vectorstore.as_retriever()
-"""
-
 # Read from MongoDB Atlas Vector Search
 vectorstore = MongoDBAtlasVectorSearch.from_connection_string(
    MONGO_URI,