mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 21:35:08 +00:00
mongodb doc loader init (#10645)
- **Description:** A Document Loader for MongoDB - **Issue:** n/a - **Dependencies:** Motor, the async driver for MongoDB - **Tag maintainer:** n/a - **Twitter handle:** pigpenblue Note that an initial mongodb document loader was created 4 months ago, but the [PR ](https://github.com/langchain-ai/langchain/pull/4285)was never pulled in. @leo-gan had commented on that PR, but given it is extremely far behind the master branch and a ton has changed in Langchain since then (including repo name and structure), I rewrote the branch and issued a new PR with the expectation that the old one can be closed. Please reference that old PR for comments/context, but it can be closed in favor of this one. Thanks! --------- Co-authored-by: Bagatur <baskaryan@gmail.com> Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
523898ab9c
commit
715ffda28b
163
docs/extras/integrations/document_loaders/mongodb.ipynb
Normal file
163
docs/extras/integrations/document_loaders/mongodb.ipynb
Normal file
@ -0,0 +1,163 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "vm8vn9t8DvC_"
|
||||
},
|
||||
"source": [
|
||||
"# MongoDB"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"[MongoDB](https://www.mongodb.com/) is a NoSQL , document-oriented database that supports JSON-like documents with a dynamic schema."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "5WjXERXzFEhg"
|
||||
},
|
||||
"source": [
|
||||
"## Overview"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "juAmbgoWD17u"
|
||||
},
|
||||
"source": [
|
||||
"The MongoDB Document Loader returns a list of Langchain Documents from a MongoDB database.\n",
|
||||
"\n",
|
||||
"The Loader requires the following parameters:\n",
|
||||
"\n",
|
||||
"* MongoDB connection string\n",
|
||||
"* MongoDB database name\n",
|
||||
"* MongoDB collection name\n",
|
||||
"* (Optional) Content Filter dictionary\n",
|
||||
"\n",
|
||||
"The output takes the following format:\n",
|
||||
"\n",
|
||||
"- pageContent= Mongo Document\n",
|
||||
"- metadata={'database': '[database_name]', 'collection': '[collection_name]'}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load the Document Loader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# add this import for running in jupyter notebook\n",
|
||||
"import nest_asyncio\n",
|
||||
"nest_asyncio.apply()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders.mongodb import MongodbLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = MongodbLoader(connection_string=\"mongodb://localhost:27017/\",\n",
|
||||
" db_name=\"sample_restaurants\", \n",
|
||||
" collection_name=\"restaurants\",\n",
|
||||
" filter_criteria={\"borough\": \"Bronx\", \"cuisine\": \"Bakery\" },\n",
|
||||
" ) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"25359"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"\n",
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content=\"{'_id': ObjectId('5eb3d668b31de5d588f4292a'), 'address': {'building': '2780', 'coord': [-73.98241999999999, 40.579505], 'street': 'Stillwell Avenue', 'zipcode': '11224'}, 'borough': 'Brooklyn', 'cuisine': 'American', 'grades': [{'date': datetime.datetime(2014, 6, 10, 0, 0), 'grade': 'A', 'score': 5}, {'date': datetime.datetime(2013, 6, 5, 0, 0), 'grade': 'A', 'score': 7}, {'date': datetime.datetime(2012, 4, 13, 0, 0), 'grade': 'A', 'score': 12}, {'date': datetime.datetime(2011, 10, 12, 0, 0), 'grade': 'A', 'score': 12}], 'name': 'Riviera Caterer', 'restaurant_id': '40356018'}\", metadata={'database': 'sample_restaurants', 'collection': 'restaurants'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"collapsed_sections": [
|
||||
"5WjXERXzFEhg"
|
||||
],
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@ -108,6 +108,7 @@ from langchain.document_loaders.mediawikidump import MWDumpLoader
|
||||
from langchain.document_loaders.merge import MergedDataLoader
|
||||
from langchain.document_loaders.mhtml import MHTMLLoader
|
||||
from langchain.document_loaders.modern_treasury import ModernTreasuryLoader
|
||||
from langchain.document_loaders.mongodb import MongodbLoader
|
||||
from langchain.document_loaders.news import NewsURLLoader
|
||||
from langchain.document_loaders.notebook import NotebookLoader
|
||||
from langchain.document_loaders.notion import NotionDirectoryLoader
|
||||
@ -284,6 +285,7 @@ __all__ = [
|
||||
"MaxComputeLoader",
|
||||
"MergedDataLoader",
|
||||
"ModernTreasuryLoader",
|
||||
"MongodbLoader",
|
||||
"NewsURLLoader",
|
||||
"NotebookLoader",
|
||||
"NotionDBLoader",
|
||||
|
76
libs/langchain/langchain/document_loaders/mongodb.py
Normal file
76
libs/langchain/langchain/document_loaders/mongodb.py
Normal file
@ -0,0 +1,76 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MongodbLoader(BaseLoader):
|
||||
"""Load MongoDB documents."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
connection_string: str,
|
||||
db_name: str,
|
||||
collection_name: str,
|
||||
*,
|
||||
filter_criteria: Optional[Dict] = None,
|
||||
) -> None:
|
||||
try:
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"Cannot import from motor, please install with `pip install motor`."
|
||||
) from e
|
||||
if not connection_string:
|
||||
raise ValueError("connection_string must be provided.")
|
||||
|
||||
if not db_name:
|
||||
raise ValueError("db_name must be provided.")
|
||||
|
||||
if not collection_name:
|
||||
raise ValueError("collection_name must be provided.")
|
||||
|
||||
self.client = AsyncIOMotorClient(connection_string)
|
||||
self.db_name = db_name
|
||||
self.collection_name = collection_name
|
||||
self.filter_criteria = filter_criteria or {}
|
||||
|
||||
self.db = self.client.get_database(db_name)
|
||||
self.collection = self.db.get_collection(collection_name)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load data into Document objects.
|
||||
|
||||
Attention:
|
||||
|
||||
This implementation starts an asyncio event loop which
|
||||
will only work if running in a sync env. In an async env, it should
|
||||
fail since there is already an event loop running.
|
||||
|
||||
This code should be updated to kick off the event loop from a separate
|
||||
thread if running within an async context.
|
||||
"""
|
||||
return asyncio.run(self.aload())
|
||||
|
||||
async def aload(self) -> List[Document]:
|
||||
"""Load data into Document objects."""
|
||||
result = []
|
||||
total_docs = await self.collection.count_documents(self.filter_criteria)
|
||||
async for doc in self.collection.find(self.filter_criteria):
|
||||
metadata = {
|
||||
"database": self.db_name,
|
||||
"collection": self.collection_name,
|
||||
}
|
||||
result.append(Document(page_content=str(doc), metadata=metadata))
|
||||
|
||||
if len(result) != total_docs:
|
||||
logger.warning(
|
||||
f"Only partial collection of documents returned. Loaded {len(result)} "
|
||||
f"docs, expected {total_docs}."
|
||||
)
|
||||
|
||||
return result
|
34
libs/langchain/poetry.lock
generated
34
libs/langchain/poetry.lock
generated
@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "absl-py"
|
||||
@ -4687,6 +4687,30 @@ files = [
|
||||
{file = "more_itertools-10.1.0-py3-none-any.whl", hash = "sha256:64e0735fcfdc6f3464ea133afe8ea4483b1c5fe3a3d69852e6503b43a0b222e6"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "motor"
|
||||
version = "3.3.1"
|
||||
description = "Non-blocking MongoDB driver for Tornado or asyncio"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "motor-3.3.1-py3-none-any.whl", hash = "sha256:a0dee83ad0d47b353932ac37467ba397b1e649ce7e3eea7f5a90554883d7cdbe"},
|
||||
{file = "motor-3.3.1.tar.gz", hash = "sha256:c5eb400e27d722a3db03a9826656b6d13acf9b6c70c2fb4604f474eac9da5be4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pymongo = ">=4.5,<5"
|
||||
|
||||
[package.extras]
|
||||
aws = ["pymongo[aws] (>=4.5,<5)"]
|
||||
encryption = ["pymongo[encryption] (>=4.5,<5)"]
|
||||
gssapi = ["pymongo[gssapi] (>=4.5,<5)"]
|
||||
ocsp = ["pymongo[ocsp] (>=4.5,<5)"]
|
||||
snappy = ["pymongo[snappy] (>=4.5,<5)"]
|
||||
srv = ["pymongo[srv] (>=4.5,<5)"]
|
||||
test = ["aiohttp", "mockupdb", "motor[encryption]", "pytest (>=7)", "tornado (>=5)"]
|
||||
zstd = ["pymongo[zstd] (>=4.5,<5)"]
|
||||
|
||||
[[package]]
|
||||
name = "mpmath"
|
||||
version = "1.3.0"
|
||||
@ -5848,7 +5872,7 @@ files = [
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.20.3", markers = "python_version < \"3.10\""},
|
||||
{version = ">=1.21.0", markers = "python_version >= \"3.10\""},
|
||||
{version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.23.2", markers = "python_version >= \"3.11\""},
|
||||
]
|
||||
python-dateutil = ">=2.8.2"
|
||||
@ -8850,7 +8874,7 @@ files = [
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""}
|
||||
greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""}
|
||||
typing-extensions = ">=4.2.0"
|
||||
|
||||
[package.extras]
|
||||
@ -10642,7 +10666,7 @@ clarifai = ["clarifai"]
|
||||
cohere = ["cohere"]
|
||||
docarray = ["docarray"]
|
||||
embeddings = ["sentence-transformers"]
|
||||
extended-testing = ["amazon-textract-caller", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "xata", "xmltodict"]
|
||||
extended-testing = ["amazon-textract-caller", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "motor", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "xata", "xmltodict"]
|
||||
javascript = ["esprima"]
|
||||
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
@ -10652,4 +10676,4 @@ text-helpers = ["chardet"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "89d73e8d026d8df841d4aaedd687dbd14900bc36ae52d5bb8cf03e8877d8222c"
|
||||
content-hash = "bb3019a7df3d22c5cd8d8c6a9a22effb86ca7e41e1d45d41db0b9d8173fac5ed"
|
||||
|
@ -129,6 +129,7 @@ markdownify = {version = "^0.11.6", optional = true}
|
||||
assemblyai = {version = "^0.17.0", optional = true}
|
||||
dashvector = {version = "^1.0.1", optional = true}
|
||||
sqlite-vss = {version = "^0.1.2", optional = true}
|
||||
motor = {version = "^3.3.1", optional = true}
|
||||
anyio = "<4.0"
|
||||
jsonpatch = "^1.33"
|
||||
timescale-vector = {version = "^0.0.1", optional = true}
|
||||
@ -351,6 +352,7 @@ extended_testing = [
|
||||
"arxiv",
|
||||
"dashvector",
|
||||
"sqlite-vss",
|
||||
"motor",
|
||||
"timescale-vector",
|
||||
"anthropic",
|
||||
]
|
||||
|
@ -0,0 +1,60 @@
|
||||
from typing import Dict, List
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.mongodb import MongodbLoader
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def raw_docs() -> List[Dict]:
|
||||
return [
|
||||
{"_id": "1", "address": {"building": "1", "room": "1"}},
|
||||
{"_id": "2", "address": {"building": "2", "room": "2"}},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def expected_documents() -> List[Document]:
|
||||
return [
|
||||
Document(
|
||||
page_content="{'_id': '1', 'address': {'building': '1', 'room': '1'}}",
|
||||
metadata={"database": "sample_restaurants", "collection": "restaurants"},
|
||||
),
|
||||
Document(
|
||||
page_content="{'_id': '2', 'address': {'building': '2', 'room': '2'}}",
|
||||
metadata={"database": "sample_restaurants", "collection": "restaurants"},
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.requires("motor")
|
||||
@pytest.mark.asyncio
|
||||
async def test_load_mocked(expected_documents: List[Document]) -> None:
|
||||
mock_async_load = AsyncMock()
|
||||
mock_async_load.return_value = expected_documents
|
||||
|
||||
mock_find = AsyncMock()
|
||||
mock_find.return_value = iter(expected_documents)
|
||||
|
||||
mock_count_documents = MagicMock()
|
||||
mock_count_documents.return_value = len(expected_documents)
|
||||
|
||||
mock_collection = MagicMock()
|
||||
mock_collection.find = mock_find
|
||||
mock_collection.count_documents = mock_count_documents
|
||||
|
||||
with patch(
|
||||
"motor.motor_asyncio.AsyncIOMotorClient", return_value=MagicMock()
|
||||
), patch(
|
||||
"langchain.document_loaders.mongodb.MongodbLoader.aload",
|
||||
new=mock_async_load,
|
||||
):
|
||||
loader = MongodbLoader(
|
||||
"mongodb://localhost:27017", "test_db", "test_collection"
|
||||
)
|
||||
loader.collection = mock_collection
|
||||
documents = await loader.aload()
|
||||
|
||||
assert documents == expected_documents
|
Loading…
Reference in New Issue
Block a user