diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/email.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/email.ipynb index dff531e4550..09eedd2e759 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/email.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/email.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "40cd9806", "metadata": { "tags": [] @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "2d20b852", "metadata": { "tags": [] @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "579fa702", "metadata": { "tags": [] @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "90c1d899", "metadata": { "tags": [] @@ -80,7 +80,7 @@ "[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', metadata={'source': 'example_data/fake-email.eml'})]" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -128,7 +128,7 @@ { "data": { "text/plain": [ - "Document(page_content='This is a test email to use for unit tests.', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)" + "Document(page_content='This is a test email to use for unit tests.', metadata={'source': 'example_data/fake-email.eml', 'filename': 'fake-email.eml', 'file_directory': 'example_data', 'date': '2022-12-16T17:04:16-05:00', 'filetype': 'message/rfc822', 'sent_from': ['Matthew Robinson '], 'sent_to': ['Matthew Robinson '], 'subject': 'Test Email', 'category': 'NarrativeText'})" ] }, "execution_count": 7, @@ -140,6 +140,61 @@ "data[0]" ] }, + { + "cell_type": "markdown", + "id": "5021f20a", + "metadata": {}, + "source": [ + "### Processing Attachments\n", + 
"\n", + "You can process attachments with `UnstructuredEmailLoader` by setting `process_attachments=True` in the constructor. By default, attachments will be partitioned using the `partition` function from `unstructured`. You can use a different partitioning function by passing the function to the `attachment_partitioner` kwarg." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6539f166", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredEmailLoader(\n", + " \"example_data/fake-email.eml\",\n", + " mode=\"elements\",\n", + " process_attachments=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "aebead38", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ddeb60f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='This is a test email to use for unit tests.', metadata={'source': 'example_data/fake-email.eml', 'filename': 'fake-email.eml', 'file_directory': 'example_data', 'date': '2022-12-16T17:04:16-05:00', 'filetype': 'message/rfc822', 'sent_from': ['Matthew Robinson '], 'sent_to': ['Matthew Robinson '], 'subject': 'Test Email', 'category': 'NarrativeText'})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, { "cell_type": "markdown", "id": "6a074515", @@ -234,7 +289,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/example_data/fake-email-attachment.eml b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/fake-email-attachment.eml new file mode 100644 index 00000000000..5d8b0367247 --- /dev/null +++ 
b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/fake-email-attachment.eml @@ -0,0 +1,50 @@ +MIME-Version: 1.0 +Date: Fri, 23 Dec 2022 12:08:48 -0600 +Message-ID: +Subject: Fake email with attachment +From: Mallori Harrell +To: Mallori Harrell +Content-Type: multipart/mixed; boundary="0000000000005d654405f082adb7" + +--0000000000005d654405f082adb7 +Content-Type: multipart/alternative; boundary="0000000000005d654205f082adb5" + +--0000000000005d654205f082adb5 +Content-Type: text/plain; charset="UTF-8" + +Hello! + +Here's the attachments! + +It includes: + + - Lots of whitespace + - Little to no content + - and is a quick read + +Best, + +Mallori + +--0000000000005d654205f082adb5 +Content-Type: text/html; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +
Hello!=C2=A0

Here's the attachments= +!

It includes:
  • Lots of whitespace
  • Little=C2= +=A0to no content
  • and is a quick read
Best,

Mallori

+ +--0000000000005d654205f082adb5-- +--0000000000005d654405f082adb7 +Content-Type: text/plain; charset="US-ASCII"; name="fake-attachment.txt" +Content-Disposition: attachment; filename="fake-attachment.txt" +Content-Transfer-Encoding: base64 +X-Attachment-Id: f_lc0tto5j0 +Content-ID: + +SGV5IHRoaXMgaXMgYSBmYWtlIGF0dGFjaG1lbnQh +--0000000000005d654405f082adb7-- \ No newline at end of file diff --git a/langchain/document_loaders/email.py b/langchain/document_loaders/email.py index ce0e2817053..f68fcbec369 100644 --- a/langchain/document_loaders/email.py +++ b/langchain/document_loaders/email.py @@ -1,6 +1,6 @@ """Loader that loads email files.""" import os -from typing import List +from typing import Any, List from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -11,7 +11,45 @@ from langchain.document_loaders.unstructured import ( class UnstructuredEmailLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load email files.""" + """Loader that uses unstructured to load email files. Works with both + .eml and .msg files. You can process attachments in addition to the + e-mail message itself by passing process_attachments=True into the + constructor for the loader. By default, attachments will be processed + with the unstructured partition function. If you already know the document + types of the attachments, you can specify another partitioning function + with the attachment_partitioner kwarg. 
+ + Example + ------- + from langchain.document_loaders import UnstructuredEmailLoader + + loader = UnstructuredEmailLoader("example_data/fake-email.eml", mode="elements") + loader.load() + + Example + ------- + from langchain.document_loaders import UnstructuredEmailLoader + + loader = UnstructuredEmailLoader( + "example_data/fake-email-attachment.eml", + mode="elements", + process_attachments=True, + ) + loader.load() + """ + + def __init__( + self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + ): + process_attachments = unstructured_kwargs.get("process_attachments") + attachment_partitioner = unstructured_kwargs.get("attachment_partitioner") + + if process_attachments and attachment_partitioner is None: + from unstructured.partition.auto import partition + + unstructured_kwargs["attachment_partitioner"] = partition + + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) def _get_elements(self) -> List: from unstructured.file_utils.filetype import FileType, detect_filetype diff --git a/tests/integration_tests/document_loaders/test_email.py b/tests/integration_tests/document_loaders/test_email.py index 327bff5173a..b89cc19c29b 100644 --- a/tests/integration_tests/document_loaders/test_email.py +++ b/tests/integration_tests/document_loaders/test_email.py @@ -1,6 +1,6 @@ from pathlib import Path -from langchain.document_loaders import OutlookMessageLoader +from langchain.document_loaders import OutlookMessageLoader, UnstructuredEmailLoader def test_outlook_message_loader() -> None: @@ -18,3 +18,15 @@ def test_outlook_message_loader() -> None: "Extractor\r\n\r\n\r\n-- \r\n\r\n\r\nKind regards" "\r\n\r\n\r\n\r\n\r\nBrian Zhou\r\n\r\n" ) + + +def test_unstructured_email_loader_with_attachments() -> None: + file_path = Path(__file__).parent.parent / "examples/fake-email-attachment.eml" + loader = UnstructuredEmailLoader( + str(file_path), mode="elements", process_attachments=True + ) + docs = loader.load() + + assert 
docs[-1].page_content == "Hey this is a fake attachment!" + assert docs[-1].metadata["filename"] == "fake-attachment.txt" + assert docs[-1].metadata["source"].endswith("fake-email-attachment.eml") diff --git a/tests/integration_tests/examples/fake-email-attachment.eml b/tests/integration_tests/examples/fake-email-attachment.eml new file mode 100644 index 00000000000..5d8b0367247 --- /dev/null +++ b/tests/integration_tests/examples/fake-email-attachment.eml @@ -0,0 +1,50 @@ +MIME-Version: 1.0 +Date: Fri, 23 Dec 2022 12:08:48 -0600 +Message-ID: +Subject: Fake email with attachment +From: Mallori Harrell +To: Mallori Harrell +Content-Type: multipart/mixed; boundary="0000000000005d654405f082adb7" + +--0000000000005d654405f082adb7 +Content-Type: multipart/alternative; boundary="0000000000005d654205f082adb5" + +--0000000000005d654205f082adb5 +Content-Type: text/plain; charset="UTF-8" + +Hello! + +Here's the attachments! + +It includes: + + - Lots of whitespace + - Little to no content + - and is a quick read + +Best, + +Mallori + +--0000000000005d654205f082adb5 +Content-Type: text/html; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +
Hello!=C2=A0

Here's the attachments= +!

It includes:
  • Lots of whitespace
  • Little=C2= +=A0to no content
  • and is a quick read
Best,

Mallori

+ +--0000000000005d654205f082adb5-- +--0000000000005d654405f082adb7 +Content-Type: text/plain; charset="US-ASCII"; name="fake-attachment.txt" +Content-Disposition: attachment; filename="fake-attachment.txt" +Content-Transfer-Encoding: base64 +X-Attachment-Id: f_lc0tto5j0 +Content-ID: + +SGV5IHRoaXMgaXMgYSBmYWtlIGF0dGFjaG1lbnQh +--0000000000005d654405f082adb7-- \ No newline at end of file