add ingest for mongo (#12897)

This commit is contained in:
Harrison Chase 2023-11-06 19:28:22 -08:00 committed by GitHub
parent ce21308f29
commit 99ffeb239f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 1 deletions

View File

@ -40,6 +40,13 @@ from rag_mongo import chain as rag_mongo_chain
add_routes(app, rag_mongo_chain, path="/rag-mongo")
```
If you want to set up an ingestion pipeline, you can add the following code to your `server.py` file:
```python
from rag_mongo import ingest as rag_mongo_ingest
add_routes(app, rag_mongo_ingest, path="/rag-mongo-ingest")
```
(Optional) Let's now configure LangSmith.
LangSmith will help us trace, monitor and debug LangChain applications.
LangSmith is currently in private beta, you can sign up [here](https://smith.langchain.com/).

View File

@ -1,11 +1,17 @@
import os
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
from langchain.schema.runnable import (
RunnableLambda,
RunnableParallel,
RunnablePassthrough,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
@ -54,3 +60,24 @@ class Question(BaseModel):
chain = chain.with_types(input_type=Question)
def _ingest(url: str) -> dict:
loader = PyPDFLoader(url)
data = loader.load()
# Split docs
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
docs = text_splitter.split_documents(data)
# Insert the documents in MongoDB Atlas Vector Search
_ = MongoDBAtlasVectorSearch.from_documents(
documents=docs,
embedding=OpenAIEmbeddings(disallowed_special=()),
collection=MONGODB_COLLECTION,
index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)
return {}
ingest = RunnableLambda(_ingest)