diff --git a/docs/docs/how_to/document_loader_custom.ipynb b/docs/docs/how_to/document_loader_custom.ipynb index 30009972db0..2eecb72bfdb 100644 --- a/docs/docs/how_to/document_loader_custom.ipynb +++ b/docs/docs/how_to/document_loader_custom.ipynb @@ -67,9 +67,34 @@ "When implementing a document loader do **NOT** provide parameters via the `lazy_load` or `alazy_load` methods.\n", "\n", "All configuration is expected to be passed through the initializer (__init__). This was a design choice made by LangChain to make sure that once a document loader has been instantiated it has all the information needed to load documents.\n", - ":::\n", - "\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "520edbbabde7df6e", + "metadata": {}, + "source": [ + "### Installation\n", "\n", + "Install **langchain-core** and **langchain-community**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "936bd5fc", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain_core langchain_community" + ] + }, + { + "cell_type": "markdown", + "id": "a93f17a87d323bdd", + "metadata": {}, + "source": [ "### Implementation\n", "\n", "Let's create an example of a standard document loader that loads a file and creates a document from each line in the file." 
@@ -77,9 +102,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "20f128c1-1a2c-43b9-9e7b-cf9b3a86d1db", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:56.764714Z", + "start_time": "2025-04-21T08:49:56.623508Z" + }, "tags": [] }, "outputs": [], @@ -122,7 +151,8 @@ " self,\n", " ) -> AsyncIterator[Document]: # <-- Does not take any arguments\n", " \"\"\"An async lazy loader that reads a file line by line.\"\"\"\n", - " # Requires aiofiles (install with pip)\n", + " # Requires aiofiles\n", + " # Install with `pip install aiofiles`\n", " # https://github.com/Tinche/aiofiles\n", " import aiofiles\n", "\n", @@ -151,9 +181,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "b1751198-c6dd-4149-95bd-6370ce8fa06f", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:56.776521Z", + "start_time": "2025-04-21T08:49:56.773511Z" + }, "tags": [] }, "outputs": [], @@ -167,9 +201,23 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, + "id": "c5210428", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -q aiofiles" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "71ef1482-f9de-4852-b5a4-0938f350612e", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:57.972675Z", + "start_time": "2025-04-21T08:49:57.969411Z" + }, "tags": [] }, "outputs": [ @@ -179,10 +227,12 @@ "text": [ "\n", "\n", - "page_content='meow meow🐱 \\n' metadata={'line_number': 0, 'source': './meow.txt'}\n", + "page_content='meow meow🐱 \n", + "' metadata={'line_number': 0, 'source': './meow.txt'}\n", "\n", "\n", - "page_content=' meow meow🐱 \\n' metadata={'line_number': 1, 'source': './meow.txt'}\n", + "page_content=' meow meow🐱 \n", + "' metadata={'line_number': 1, 'source': './meow.txt'}\n", "\n", "\n", "page_content=' meow😻😻' metadata={'line_number': 2, 'source': './meow.txt'}\n" @@ -199,9 +249,13 @@ }, { "cell_type": "code", - 
"execution_count": 4, + "execution_count": 6, "id": "1588e78c-e81a-4d40-b36c-634242c84a6a", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.028989Z", + "start_time": "2025-04-21T08:49:58.021972Z" + }, "tags": [] }, "outputs": [ @@ -211,10 +265,12 @@ "text": [ "\n", "\n", - "page_content='meow meow🐱 \\n' metadata={'line_number': 0, 'source': './meow.txt'}\n", + "page_content='meow meow🐱 \n", + "' metadata={'line_number': 0, 'source': './meow.txt'}\n", "\n", "\n", - "page_content=' meow meow🐱 \\n' metadata={'line_number': 1, 'source': './meow.txt'}\n", + "page_content=' meow meow🐱 \n", + "' metadata={'line_number': 1, 'source': './meow.txt'}\n", "\n", "\n", "page_content=' meow😻😻' metadata={'line_number': 2, 'source': './meow.txt'}\n" @@ -245,21 +301,25 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "df5ad46a-9e00-4073-8505-489fc4f3799e", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.078111Z", + "start_time": "2025-04-21T08:49:58.071421Z" + }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='meow meow🐱 \\n', metadata={'line_number': 0, 'source': './meow.txt'}),\n", - " Document(page_content=' meow meow🐱 \\n', metadata={'line_number': 1, 'source': './meow.txt'}),\n", - " Document(page_content=' meow😻😻', metadata={'line_number': 2, 'source': './meow.txt'})]" + "[Document(metadata={'line_number': 0, 'source': './meow.txt'}, page_content='meow meow🐱 \\n'),\n", + " Document(metadata={'line_number': 1, 'source': './meow.txt'}, page_content=' meow meow🐱 \\n'),\n", + " Document(metadata={'line_number': 2, 'source': './meow.txt'}, page_content=' meow😻😻')]" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -286,9 +346,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "209f6a91-2f15-4cb2-9237-f79fc9493b82", "metadata": { + "ExecuteTime": { + "end_time": 
"2025-04-21T08:49:58.124363Z", + "start_time": "2025-04-21T08:49:58.120782Z" + }, "tags": [] }, "outputs": [], @@ -313,9 +377,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "b1275c59-06d4-458f-abd2-fcbad0bde442", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.172506Z", + "start_time": "2025-04-21T08:49:58.167416Z" + }, "tags": [] }, "outputs": [], @@ -326,21 +394,25 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "56a3d707-2086-413b-ae82-50e92ddb27f6", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.218426Z", + "start_time": "2025-04-21T08:49:58.214684Z" + }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='meow meow🐱 \\n', metadata={'line_number': 1, 'source': './meow.txt'}),\n", - " Document(page_content=' meow meow🐱 \\n', metadata={'line_number': 2, 'source': './meow.txt'}),\n", - " Document(page_content=' meow😻😻', metadata={'line_number': 3, 'source': './meow.txt'})]" + "[Document(metadata={'line_number': 1, 'source': './meow.txt'}, page_content='meow meow🐱 \\n'),\n", + " Document(metadata={'line_number': 2, 'source': './meow.txt'}, page_content=' meow meow🐱 \\n'),\n", + " Document(metadata={'line_number': 3, 'source': './meow.txt'}, page_content=' meow😻😻')]" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -359,20 +431,24 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "20d03092-ba35-47d7-b612-9d1631c261cd", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.267755Z", + "start_time": "2025-04-21T08:49:58.264369Z" + }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='some data from memory\\n', metadata={'line_number': 1, 'source': None}),\n", - " Document(page_content='meow', metadata={'line_number': 2, 'source': None})]" + "[Document(metadata={'line_number': 1, 'source': 
None}, page_content='some data from memory\\n'),\n", + " Document(metadata={'line_number': 2, 'source': None}, page_content='meow')]" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -394,9 +470,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "a9e92e0e-c8da-401c-b8c6-f0676004cf58", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.330432Z", + "start_time": "2025-04-21T08:49:58.327223Z" + }, "tags": [] }, "outputs": [], @@ -406,9 +486,13 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "id": "6b559d30-8b0c-4e45-86b1-e4602d9aaa7e", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.383905Z", + "start_time": "2025-04-21T08:49:58.380658Z" + }, "tags": [] }, "outputs": [ @@ -418,7 +502,7 @@ "'utf-8'" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -429,9 +513,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "2f7b145a-9c6f-47f9-9487-1f4b25aff46f", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.443829Z", + "start_time": "2025-04-21T08:49:58.440222Z" + }, "tags": [] }, "outputs": [ @@ -441,7 +529,7 @@ "b'meow meow\\xf0\\x9f\\x90\\xb1 \\n meow meow\\xf0\\x9f\\x90\\xb1 \\n meow\\xf0\\x9f\\x98\\xbb\\xf0\\x9f\\x98\\xbb'" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -452,9 +540,13 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "9b9482fa-c49c-42cd-a2ef-80bc93214631", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.498609Z", + "start_time": "2025-04-21T08:49:58.494903Z" + }, "tags": [] }, "outputs": [ @@ -464,7 +556,7 @@ "'meow meow🐱 \\n meow meow🐱 \\n meow😻😻'" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -475,19 +567,23 
@@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "id": "04cc7a81-290e-4ef8-b7e1-d885fcc59ece", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.551353Z", + "start_time": "2025-04-21T08:49:58.547518Z" + }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -498,9 +594,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "id": "ec8de0ab-51d7-4e41-82c9-3ce0a6fdc2cd", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.599576Z", + "start_time": "2025-04-21T08:49:58.596567Z" + }, "tags": [] }, "outputs": [ @@ -510,7 +610,7 @@ "{'foo': 'bar'}" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -521,9 +621,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "19eae991-ae48-43c2-8952-7347cdb76a34", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.649634Z", + "start_time": "2025-04-21T08:49:58.646313Z" + }, "tags": [] }, "outputs": [ @@ -533,7 +637,7 @@ "'./meow.txt'" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -551,65 +655,50 @@ "\n", "While a parser encapsulates the logic needed to parse binary data into documents, *blob loaders* encapsulate the logic that's necessary to load blobs from a given storage location.\n", "\n", - "At the moment, `LangChain` only supports `FileSystemBlobLoader`.\n", + "At the moment, `LangChain` supports `FileSystemBlobLoader` and `CloudBlobLoader`.\n", "\n", "You can use the `FileSystemBlobLoader` to load blobs and then use the parser to parse them." 
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "c093becb-2e84-4329-89e3-956a3bd765e5", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:49:58.718259Z", + "start_time": "2025-04-21T08:49:58.705367Z" + }, "tags": [] }, "outputs": [], "source": [ "from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader\n", "\n", - "blob_loader = FileSystemBlobLoader(path=\".\", glob=\"*.mdx\", show_progress=True)" + "filesystem_blob_loader = FileSystemBlobLoader(\n", + " path=\".\", glob=\"*.mdx\", show_progress=True\n", + ")" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "77739dab-2a1e-4b64-8daa-fee8aa029972", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "45e85d3f63224bb59db02a40ae2e3268", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/8 [00:00[The Microsoft Office](https://www.office.com/) suite of productivity software includes Microsoft Word, Microsoft Excel, Microsoft PowerPoint, Microsoft Outlook, and Microsoft OneNote. It is available for Microsoft Windows and macOS operating systems. 
It is also available on Android and iOS.\\n' metadata={'line_number': 3, 'source': 'office_file.mdx'}\n", - "page_content='\\n' metadata={'line_number': 4, 'source': 'office_file.mdx'}\n", - "page_content='This covers how to load commonly used file formats including `DOCX`, `XLSX` and `PPTX` documents into a document format that we can use downstream.\\n' metadata={'line_number': 5, 'source': 'office_file.mdx'}\n", + "metadata={} mimetype='application/pdf' path='s3://bucket-01/Annual-Report-2016.pdf'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 33%|███▎ | 2/6 [00:05<00:09, 2.28s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata={} mimetype='application/pdf' path='s3://bucket-01/ComingHomeToNature_ActivityBooklet.pdf'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 50%|█████ | 3/6 [00:06<00:06, 2.01s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata={} mimetype='application/pdf' path='s3://bucket-01/ComingHomeToNature_ActivityBookletFoyles.pdf'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 67%|██████▋ | 4/6 [00:07<00:02, 1.44s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata={} mimetype='application/pdf' path='s3://bucket-01/EVENTS E-POSTER_DAYS OF AWE.pdf'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 83%|████████▎ | 5/6 [00:07<00:01, 1.11s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata={} mimetype='application/pdf' path='s3://bucket-01/MH.pdf'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 6/6 [00:08<00:00, 1.02s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "metadata={} mimetype='application/pdf' path='s3://bucket-01/SRT Annual Report 2018.pdf'\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "100%|██████████| 6/6 [00:11<00:00, 1.87s/it]\n" + ] + } + ], + "source": [ + "from cloudpathlib import S3Client, S3Path\n", + "from langchain_community.document_loaders.blob_loaders import CloudBlobLoader\n", + "\n", + "client = S3Client(no_sign_request=True)\n", + "client.set_as_default_client()\n", + "\n", + "path = S3Path(\n", + " \"s3://bucket-01\", client=client\n", + ") # Supports s3://, az://, gs://, file:// schemes.\n", + "\n", + "cloud_loader = CloudBlobLoader(path, glob=\"**/*.pdf\", show_progress=True)\n", + "\n", + "for blob in cloud_loader.yield_blobs():\n", + " print(blob)" + ] + }, + { + "cell_type": "markdown", + "id": "40c361ba4cd30164", + "metadata": {}, + "source": [ + "### Generic Loader\n", + "\n", + "LangChain has a `GenericLoader` abstraction which composes a `BlobLoader` with a `BaseBlobParser`.\n", + "\n", + "`GenericLoader` is meant to provide standardized classmethods that make it easy to use existing `BlobLoader` implementations. At the moment, the `FileSystemBlobLoader` and `CloudBlobLoader` are supported. 
See example below:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "5dfb2be02fe662c5", + "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:50:16.244917Z", + "start_time": "2025-04-21T08:50:15.527562Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 1224.82it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='# Text embedding models\n", + "' metadata={'line_number': 1, 'source': 'embed_text.mdx'}\n", + "page_content='\n", + "' metadata={'line_number': 2, 'source': 'embed_text.mdx'}\n", + "page_content=':::info\n", + "' metadata={'line_number': 3, 'source': 'embed_text.mdx'}\n", + "page_content='Head to [Integrations](/docs/integrations/text_embedding/) for documentation on built-in integrations with text embedding model providers.\n", + "' metadata={'line_number': 4, 'source': 'embed_text.mdx'}\n", + "page_content=':::\n", + "' metadata={'line_number': 5, 'source': 'embed_text.mdx'}\n", "... 
output truncated for demo purposes\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] } ], "source": [ "from langchain_community.document_loaders.generic import GenericLoader\n", "\n", - "loader = GenericLoader.from_filesystem(\n", - " path=\".\", glob=\"*.mdx\", show_progress=True, parser=MyParser()\n", + "generic_loader_filesystem = GenericLoader(\n", + " blob_loader=filesystem_blob_loader, blob_parser=parser\n", ")\n", - "\n", - "for idx, doc in enumerate(loader.lazy_load()):\n", + "for idx, doc in enumerate(generic_loader_filesystem.lazy_load()):\n", " if idx < 5:\n", " print(doc)\n", "\n", @@ -690,9 +924,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 28, "id": "23633102-dc44-4fed-a4e1-8159489101c8", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:50:34.841862Z", + "start_time": "2025-04-21T08:50:34.838375Z" + }, "tags": [] }, "outputs": [], @@ -709,37 +947,46 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 29, "id": "dc95be85-4a29-4c6f-a260-08afa3c95538", "metadata": { + "ExecuteTime": { + "end_time": "2025-04-21T08:50:34.901734Z", + "start_time": "2025-04-21T08:50:34.888098Z" + }, "tags": [] }, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4320598ea3b44a52b1873e1c801db312", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/8 [00:00[The Microsoft Office](https://www.office.com/) suite of productivity software includes Microsoft Word, Microsoft Excel, Microsoft PowerPoint, Microsoft Outlook, and Microsoft OneNote. It is available for Microsoft Windows and macOS operating systems. 
It is also available on Android and iOS.\\n' metadata={'line_number': 3, 'source': 'office_file.mdx'}\n", - "page_content='\\n' metadata={'line_number': 4, 'source': 'office_file.mdx'}\n", - "page_content='This covers how to load commonly used file formats including `DOCX`, `XLSX` and `PPTX` documents into a document format that we can use downstream.\\n' metadata={'line_number': 5, 'source': 'office_file.mdx'}\n", + "page_content='# Text embedding models\n", + "' metadata={'line_number': 1, 'source': 'embed_text.mdx'}\n", + "page_content='\n", + "' metadata={'line_number': 2, 'source': 'embed_text.mdx'}\n", + "page_content=':::info\n", + "' metadata={'line_number': 3, 'source': 'embed_text.mdx'}\n", + "page_content='Head to [Integrations](/docs/integrations/text_embedding/) for documentation on built-in integrations with text embedding model providers.\n", + "' metadata={'line_number': 4, 'source': 'embed_text.mdx'}\n", + "page_content=':::\n", + "' metadata={'line_number': 5, 'source': 'embed_text.mdx'}\n", "... output truncated for demo purposes\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] } ], "source": [