From 393f469eb31716387396445e47b2dab067335044 Mon Sep 17 00:00:00 2001 From: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Date: Fri, 23 Jun 2023 13:02:48 -0700 Subject: [PATCH] Create merge loader that combines documents from a set of loaders (#6659) Simple utility loader that combines documents from a set of specified loaders. --- .../integrations/merge_doc_loader.ipynb | 99 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/merge.py | 27 +++++ 3 files changed, 128 insertions(+) create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/merge_doc_loader.ipynb create mode 100644 langchain/document_loaders/merge.py diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/merge_doc_loader.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/merge_doc_loader.ipynb new file mode 100644 index 00000000000..2144c777cb3 --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/merge_doc_loader.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dd7c3503", + "metadata": {}, + "source": [ + "# MergedDataLoader\n", + "\n", + "Merge the documents returned from a set of specified data loaders." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e08dfff1", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import WebBaseLoader\n", + "loader_web = WebBaseLoader(\"https://github.com/basecamp/handbook/blob/master/37signals-is-you.md\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "07b42b2e", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PyPDFLoader\n", + "loader_pdf = PyPDFLoader(\"../MachineLearning-Lecture01.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "912ede96", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.merge import MergedDataLoader\n", + "loader_all=MergedDataLoader(loaders=[loader_web,loader_pdf])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9d001311", + "metadata": {}, + "outputs": [], + "source": [ + "docs_all=loader_all.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b9097486", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "23" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs_all)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3d80f1c0fd1..ad6160ea04f 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -67,6 +67,7 @@ from langchain.document_loaders.markdown import 
UnstructuredMarkdownLoader
 from langchain.document_loaders.mastodon import MastodonTootsLoader
 from langchain.document_loaders.max_compute import MaxComputeLoader
 from langchain.document_loaders.mediawikidump import MWDumpLoader
+from langchain.document_loaders.merge import MergedDataLoader
 from langchain.document_loaders.modern_treasury import ModernTreasuryLoader
 from langchain.document_loaders.notebook import NotebookLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
@@ -201,6 +202,7 @@ __all__ = [
     "MastodonTootsLoader",
     "MathpixPDFLoader",
     "MaxComputeLoader",
+    "MergedDataLoader",
     "ModernTreasuryLoader",
     "NotebookLoader",
     "NotionDBLoader",
diff --git a/langchain/document_loaders/merge.py b/langchain/document_loaders/merge.py
new file mode 100644
index 00000000000..496a404b131
--- /dev/null
+++ b/langchain/document_loaders/merge.py
@@ -0,0 +1,27 @@
+from typing import Iterator, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class MergedDataLoader(BaseLoader):
+    """Combine the documents produced by several loaders into one stream."""
+
+    def __init__(self, loaders: List):
+        """Keep the given loaders; they are consulted in list order."""
+        self.loaders = loaders
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Yield the documents of each loader in turn."""
+        for source in self.loaders:
+            try:
+                # Prefer lazy_load(); fall back to load() if not implemented.
+                docs = source.lazy_load()
+            except NotImplementedError:
+                docs = source.load()
+            yield from docs
+
+    def load(self) -> List[Document]:
+        """Return all merged documents as a list."""
+        merged = self.lazy_load()
+        return list(merged)