diff --git a/templates/rag-mongo/README.md b/templates/rag-mongo/README.md
index ab0d1e4c1dd..d17eb5bd5b9 100644
--- a/templates/rag-mongo/README.md
+++ b/templates/rag-mongo/README.md
@@ -40,6 +40,13 @@ from rag_mongo import chain as rag_mongo_chain
 
 add_routes(app, rag_mongo_chain, path="/rag-mongo")
 ```
+If you want to set up an ingestion pipeline, you can add the following code to your `server.py` file:
+```python
+from rag_mongo import ingest as rag_mongo_ingest
+
+add_routes(app, rag_mongo_ingest, path="/rag-mongo-ingest")
+```
+
 (Optional) Let's now configure LangSmith.
 LangSmith will help us trace, monitor and debug LangChain applications.
 LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/).
diff --git a/templates/rag-mongo/rag_mongo/chain.py b/templates/rag-mongo/rag_mongo/chain.py
index 455c4a3421f..c75159881ba 100644
--- a/templates/rag-mongo/rag_mongo/chain.py
+++ b/templates/rag-mongo/rag_mongo/chain.py
@@ -1,11 +1,17 @@
 import os
 
 from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders import PyPDFLoader
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.prompts import ChatPromptTemplate
 from langchain.pydantic_v1 import BaseModel
 from langchain.schema.output_parser import StrOutputParser
-from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
+from langchain.schema.runnable import (
+    RunnableLambda,
+    RunnableParallel,
+    RunnablePassthrough,
+)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import MongoDBAtlasVectorSearch
 from pymongo import MongoClient
 
@@ -54,3 +60,24 @@ class Question(BaseModel):
 
 
 chain = chain.with_types(input_type=Question)
+
+
+def _ingest(url: str) -> dict:  # Load a PDF, split it, and index the chunks
+    loader = PyPDFLoader(url)
+    data = loader.load()
+
+    # Split the loaded documents into chunks (chunk_size=500, no overlap)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
+    docs = text_splitter.split_documents(data)
+
+    # Embed the chunks and insert them into MongoDB Atlas Vector Search
+    _ = MongoDBAtlasVectorSearch.from_documents(
+        documents=docs,
+        embedding=OpenAIEmbeddings(disallowed_special=()),
+        collection=MONGODB_COLLECTION,
+        index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
+    )
+    return {}  # nothing to report; ingestion is done for its side effect
+
+
+ingest = RunnableLambda(_ingest)  # NOTE(review): confirm __init__ re-exports it