add ingest for mongo (#12897)

This commit is contained in:
Harrison Chase 2023-11-06 19:28:22 -08:00 committed by GitHub
parent ce21308f29
commit 99ffeb239f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 1 deletions

View File

@ -40,6 +40,13 @@ from rag_mongo import chain as rag_mongo_chain
add_routes(app, rag_mongo_chain, path="/rag-mongo") add_routes(app, rag_mongo_chain, path="/rag-mongo")
``` ```
If you want to set up an ingestion pipeline, you can add the following code to your `server.py` file:
```python
from rag_mongo import ingest as rag_mongo_ingest
add_routes(app, rag_mongo_ingest, path="/rag-mongo-ingest")
```
(Optional) Let's now configure LangSmith. (Optional) Let's now configure LangSmith.
LangSmith will help us trace, monitor and debug LangChain applications. LangSmith will help us trace, monitor and debug LangChain applications.
LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/). LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/).

View File

@ -1,11 +1,17 @@
import os import os
from langchain.chat_models import ChatOpenAI from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel from langchain.pydantic_v1 import BaseModel
from langchain.schema.output_parser import StrOutputParser from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough from langchain.schema.runnable import (
RunnableLambda,
RunnableParallel,
RunnablePassthrough,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient from pymongo import MongoClient
@ -54,3 +60,24 @@ class Question(BaseModel):
chain = chain.with_types(input_type=Question) chain = chain.with_types(input_type=Question)
def _ingest(url: str) -> dict:
    """Load a PDF, split it into chunks, and index them in MongoDB Atlas.

    Args:
        url: Location of the PDF; handed directly to ``PyPDFLoader``.

    Returns:
        An empty dict. The function is run for its side effect of inserting
        the embedded chunks into ``MONGODB_COLLECTION``.
    """
    pages = PyPDFLoader(url).load()

    # Chunk the loaded pages: chunk_size=500 with no overlap between chunks.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    chunks = splitter.split_documents(pages)

    # Embed the chunks and insert them via MongoDB Atlas Vector Search.
    # The vector store object returned by from_documents is not needed here.
    embedder = OpenAIEmbeddings(disallowed_special=())
    MongoDBAtlasVectorSearch.from_documents(
        documents=chunks,
        embedding=embedder,
        collection=MONGODB_COLLECTION,
        index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    )
    return {}
# Wrap `_ingest` in a RunnableLambda and expose it under the public name `ingest`.
ingest = RunnableLambda(_ingest)