From 715ffda28b9eede61e32a6a595bacba1cc9f85e4 Mon Sep 17 00:00:00 2001 From: Jon Saginaw Date: Fri, 29 Sep 2023 11:44:07 -0400 Subject: [PATCH] mongodb doc loader init (#10645) - **Description:** A Document Loader for MongoDB - **Issue:** n/a - **Dependencies:** Motor, the async driver for MongoDB - **Tag maintainer:** n/a - **Twitter handle:** pigpenblue Note that an initial mongodb document loader was created 4 months ago, but the [PR ](https://github.com/langchain-ai/langchain/pull/4285)was never pulled in. @leo-gan had commented on that PR, but given it is extremely far behind the master branch and a ton has changed in Langchain since then (including repo name and structure), I rewrote the branch and issued a new PR with the expectation that the old one can be closed. Please reference that old PR for comments/context, but it can be closed in favor of this one. Thanks! --------- Co-authored-by: Bagatur Co-authored-by: Eugene Yurtsev --- .../document_loaders/mongodb.ipynb | 163 ++++++++++++++++++ .../langchain/document_loaders/__init__.py | 2 + .../langchain/document_loaders/mongodb.py | 76 ++++++++ libs/langchain/poetry.lock | 34 +++- libs/langchain/pyproject.toml | 2 + .../document_loaders/test_mongodb.py | 60 +++++++ 6 files changed, 332 insertions(+), 5 deletions(-) create mode 100644 docs/extras/integrations/document_loaders/mongodb.ipynb create mode 100644 libs/langchain/langchain/document_loaders/mongodb.py create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_mongodb.py diff --git a/docs/extras/integrations/document_loaders/mongodb.ipynb b/docs/extras/integrations/document_loaders/mongodb.ipynb new file mode 100644 index 00000000000..6e486200499 --- /dev/null +++ b/docs/extras/integrations/document_loaders/mongodb.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "vm8vn9t8DvC_" + }, + "source": [ + "# MongoDB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[MongoDB](https://www.mongodb.com/) is a NoSQL , document-oriented database that supports JSON-like documents with a dynamic schema." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "5WjXERXzFEhg" + }, + "source": [ + "## Overview" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "juAmbgoWD17u" + }, + "source": [ + "The MongoDB Document Loader returns a list of Langchain Documents from a MongoDB database.\n", + "\n", + "The Loader requires the following parameters:\n", + "\n", + "* MongoDB connection string\n", + "* MongoDB database name\n", + "* MongoDB collection name\n", + "* (Optional) Content Filter dictionary\n", + "\n", + "The output takes the following format:\n", + "\n", + "- pageContent= Mongo Document\n", + "- metadata={'database': '[database_name]', 'collection': '[collection_name]'}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the Document Loader" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# add this import for running in jupyter notebook\n", + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.mongodb import MongodbLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "loader = MongodbLoader(connection_string=\"mongodb://localhost:27017/\",\n", + " db_name=\"sample_restaurants\", \n", + " collection_name=\"restaurants\",\n", + " filter_criteria={\"borough\": \"Bronx\", \"cuisine\": \"Bakery\" },\n", + " ) " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "25359" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs = loader.load()\n", + "\n", + "len(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content=\"{'_id': ObjectId('5eb3d668b31de5d588f4292a'), 'address': {'building': '2780', 'coord': [-73.98241999999999, 40.579505], 'street': 'Stillwell Avenue', 'zipcode': '11224'}, 'borough': 'Brooklyn', 'cuisine': 'American', 'grades': [{'date': datetime.datetime(2014, 6, 10, 0, 0), 'grade': 'A', 'score': 5}, {'date': datetime.datetime(2013, 6, 5, 0, 0), 'grade': 'A', 'score': 7}, {'date': datetime.datetime(2012, 4, 13, 0, 0), 'grade': 'A', 'score': 12}, {'date': datetime.datetime(2011, 10, 12, 0, 0), 'grade': 'A', 'score': 12}], 'name': 'Riviera Caterer', 'restaurant_id': '40356018'}\", metadata={'database': 'sample_restaurants', 'collection': 'restaurants'})" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "5WjXERXzFEhg" + ], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/libs/langchain/langchain/document_loaders/__init__.py b/libs/langchain/langchain/document_loaders/__init__.py index 0badf53af20..da26274c33f 100644 --- a/libs/langchain/langchain/document_loaders/__init__.py +++ b/libs/langchain/langchain/document_loaders/__init__.py @@ -108,6 +108,7 @@ from langchain.document_loaders.mediawikidump import MWDumpLoader from langchain.document_loaders.merge import MergedDataLoader from langchain.document_loaders.mhtml import MHTMLLoader from langchain.document_loaders.modern_treasury import ModernTreasuryLoader +from langchain.document_loaders.mongodb import MongodbLoader from langchain.document_loaders.news import NewsURLLoader from langchain.document_loaders.notebook import NotebookLoader from langchain.document_loaders.notion import NotionDirectoryLoader @@ -284,6 +285,7 @@ __all__ = [ "MaxComputeLoader", "MergedDataLoader", "ModernTreasuryLoader", + "MongodbLoader", "NewsURLLoader", "NotebookLoader", "NotionDBLoader", diff --git a/libs/langchain/langchain/document_loaders/mongodb.py b/libs/langchain/langchain/document_loaders/mongodb.py new file mode 100644 index 00000000000..7f48f88cd67 --- /dev/null +++ b/libs/langchain/langchain/document_loaders/mongodb.py @@ -0,0 +1,76 @@ +import asyncio +import logging +from typing import Dict, List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + +logger = logging.getLogger(__name__) + + +class MongodbLoader(BaseLoader): + """Load MongoDB documents.""" + + def __init__( + self, + connection_string: str, + db_name: str, + collection_name: str, + *, + filter_criteria: Optional[Dict] = None, + ) -> None: + try: + from motor.motor_asyncio import AsyncIOMotorClient + except ImportError as e: + raise ImportError( + "Cannot import from motor, please install with `pip install motor`." + ) from e + if not connection_string: + raise ValueError("connection_string must be provided.") + + if not db_name: + raise ValueError("db_name must be provided.") + + if not collection_name: + raise ValueError("collection_name must be provided.") + + self.client = AsyncIOMotorClient(connection_string) + self.db_name = db_name + self.collection_name = collection_name + self.filter_criteria = filter_criteria or {} + + self.db = self.client.get_database(db_name) + self.collection = self.db.get_collection(collection_name) + + def load(self) -> List[Document]: + """Load data into Document objects. + + Attention: + + This implementation starts an asyncio event loop which + will only work if running in a sync env. In an async env, it should + fail since there is already an event loop running. + + This code should be updated to kick off the event loop from a separate + thread if running within an async context. + """ + return asyncio.run(self.aload()) + + async def aload(self) -> List[Document]: + """Load data into Document objects.""" + result = [] + total_docs = await self.collection.count_documents(self.filter_criteria) + async for doc in self.collection.find(self.filter_criteria): + metadata = { + "database": self.db_name, + "collection": self.collection_name, + } + result.append(Document(page_content=str(doc), metadata=metadata)) + + if len(result) != total_docs: + logger.warning( + f"Only partial collection of documents returned. Loaded {len(result)} " + f"docs, expected {total_docs}." + ) + + return result diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock index 4aad20a3d40..ba8cdcd3730 100644 --- a/libs/langchain/poetry.lock +++ b/libs/langchain/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "absl-py" @@ -4687,6 +4687,30 @@ files = [ {file = "more_itertools-10.1.0-py3-none-any.whl", hash = "sha256:64e0735fcfdc6f3464ea133afe8ea4483b1c5fe3a3d69852e6503b43a0b222e6"}, ] +[[package]] +name = "motor" +version = "3.3.1" +description = "Non-blocking MongoDB driver for Tornado or asyncio" +optional = true +python-versions = ">=3.7" +files = [ + {file = "motor-3.3.1-py3-none-any.whl", hash = "sha256:a0dee83ad0d47b353932ac37467ba397b1e649ce7e3eea7f5a90554883d7cdbe"}, + {file = "motor-3.3.1.tar.gz", hash = "sha256:c5eb400e27d722a3db03a9826656b6d13acf9b6c70c2fb4604f474eac9da5be4"}, +] + +[package.dependencies] +pymongo = ">=4.5,<5" + +[package.extras] +aws = ["pymongo[aws] (>=4.5,<5)"] +encryption = ["pymongo[encryption] (>=4.5,<5)"] +gssapi = ["pymongo[gssapi] (>=4.5,<5)"] +ocsp = ["pymongo[ocsp] (>=4.5,<5)"] +snappy = ["pymongo[snappy] (>=4.5,<5)"] +srv = ["pymongo[srv] (>=4.5,<5)"] +test = ["aiohttp", "mockupdb", "motor[encryption]", "pytest (>=7)", "tornado (>=5)"] +zstd = ["pymongo[zstd] (>=4.5,<5)"] + [[package]] name = "mpmath" version = "1.3.0" @@ -5848,7 +5872,7 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] python-dateutil = ">=2.8.2" @@ -8850,7 +8874,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "platform_machine == \"win32\" or platform_machine == \"WIN32\" or platform_machine == \"AMD64\" or platform_machine == \"amd64\" or platform_machine == \"x86_64\" or platform_machine == \"ppc64le\" or platform_machine == \"aarch64\""} +greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} typing-extensions = ">=4.2.0" [package.extras] @@ -10642,7 +10666,7 @@ clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["amazon-textract-caller", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "xata", "xmltodict"] +extended-testing = ["amazon-textract-caller", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "motor", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "xata", "xmltodict"] javascript = ["esprima"] llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -10652,4 +10676,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "89d73e8d026d8df841d4aaedd687dbd14900bc36ae52d5bb8cf03e8877d8222c" +content-hash = "bb3019a7df3d22c5cd8d8c6a9a22effb86ca7e41e1d45d41db0b9d8173fac5ed" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index aed20ebd833..3064a953298 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -129,6 +129,7 @@ markdownify = {version = "^0.11.6", optional = true} assemblyai = {version = "^0.17.0", optional = true} dashvector = {version = "^1.0.1", optional = true} sqlite-vss = {version = "^0.1.2", optional = true} +motor = {version = "^3.3.1", optional = true} anyio = "<4.0" jsonpatch = "^1.33" timescale-vector = {version = "^0.0.1", optional = true} @@ -351,6 +352,7 @@ extended_testing = [ "arxiv", "dashvector", "sqlite-vss", + "motor", "timescale-vector", "anthropic", ] diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_mongodb.py b/libs/langchain/tests/unit_tests/document_loaders/test_mongodb.py new file mode 100644 index 00000000000..c6b380596d3 --- /dev/null +++ b/libs/langchain/tests/unit_tests/document_loaders/test_mongodb.py @@ -0,0 +1,60 @@ +from typing import Dict, List +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from langchain.docstore.document import Document +from langchain.document_loaders.mongodb import MongodbLoader + + +@pytest.fixture +def raw_docs() -> List[Dict]: + return [ + {"_id": "1", "address": {"building": "1", "room": "1"}}, + {"_id": "2", "address": {"building": "2", "room": "2"}}, + ] + + +@pytest.fixture +def expected_documents() -> List[Document]: + return [ + Document( + page_content="{'_id': '1', 'address': {'building': '1', 'room': '1'}}", + metadata={"database": "sample_restaurants", "collection": "restaurants"}, + ), + Document( + page_content="{'_id': '2', 'address': {'building': '2', 'room': '2'}}", + metadata={"database": "sample_restaurants", "collection": "restaurants"}, + ), + ] + + +@pytest.mark.requires("motor") +@pytest.mark.asyncio +async def test_load_mocked(expected_documents: List[Document]) -> None: + mock_async_load = AsyncMock() + mock_async_load.return_value = expected_documents + + mock_find = AsyncMock() + mock_find.return_value = iter(expected_documents) + + mock_count_documents = MagicMock() + mock_count_documents.return_value = len(expected_documents) + + mock_collection = MagicMock() + mock_collection.find = mock_find + mock_collection.count_documents = mock_count_documents + + with patch( + "motor.motor_asyncio.AsyncIOMotorClient", return_value=MagicMock() + ), patch( + "langchain.document_loaders.mongodb.MongodbLoader.aload", + new=mock_async_load, + ): + loader = MongodbLoader( + "mongodb://localhost:27017", "test_db", "test_collection" + ) + loader.collection = mock_collection + documents = await loader.aload() + + assert documents == expected_documents