mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 20:58:25 +00:00
Harrison/msg files (#2375)
Co-authored-by: Sahil Masand <masand.sahil@gmail.com> Co-authored-by: Sahil Masand <masands@cbh.com.au>
This commit is contained in:
parent
585f60a5aa
commit
e90d007db3
@ -7,7 +7,15 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# Email\n",
|
"# Email\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This notebook shows how to load email (`.eml`) files."
|
"This notebook shows how to load email (`.eml`) and Microsoft Outlook (`.msg`) files."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "89caa348",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Using Unstructured"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -66,7 +74,7 @@
|
|||||||
"id": "8bf50cba",
|
"id": "8bf50cba",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Retain Elements\n",
|
"### Retain Elements\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||||
]
|
]
|
||||||
@ -112,10 +120,69 @@
|
|||||||
"data[0]"
|
"data[0]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "6a074515",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Using OutlookMessageLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "1e7a8444",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import OutlookMessageLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "77a055e6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = OutlookMessageLoader('example_data/fake-email.msg')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "789882de",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "46aa0632",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Document(page_content='This is a test email to experiment with the MS Outlook MSG Extractor\\r\\n\\r\\n\\r\\n-- \\r\\n\\r\\n\\r\\nKind regards\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nBrian Zhou\\r\\n\\r\\n', metadata={'subject': 'Test for TIF files', 'sender': 'Brian Zhou <brizhou@gmail.com>', 'date': 'Mon, 18 Nov 2013 16:26:24 +0800'})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "6a074515",
|
"id": "2b223ce2",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
|
Binary file not shown.
@ -11,13 +11,18 @@ from langchain.document_loaders.azure_blob_storage_file import (
|
|||||||
)
|
)
|
||||||
from langchain.document_loaders.bigquery import BigQueryLoader
|
from langchain.document_loaders.bigquery import BigQueryLoader
|
||||||
from langchain.document_loaders.blackboard import BlackboardLoader
|
from langchain.document_loaders.blackboard import BlackboardLoader
|
||||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
from langchain.document_loaders.college_confidential import (
|
||||||
|
CollegeConfidentialLoader,
|
||||||
|
)
|
||||||
from langchain.document_loaders.conllu import CoNLLULoader
|
from langchain.document_loaders.conllu import CoNLLULoader
|
||||||
from langchain.document_loaders.csv_loader import CSVLoader
|
from langchain.document_loaders.csv_loader import CSVLoader
|
||||||
from langchain.document_loaders.dataframe import DataFrameLoader
|
from langchain.document_loaders.dataframe import DataFrameLoader
|
||||||
from langchain.document_loaders.directory import DirectoryLoader
|
from langchain.document_loaders.directory import DirectoryLoader
|
||||||
from langchain.document_loaders.duckdb_loader import DuckDBLoader
|
from langchain.document_loaders.duckdb_loader import DuckDBLoader
|
||||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
from langchain.document_loaders.email import (
|
||||||
|
OutlookMessageLoader,
|
||||||
|
UnstructuredEmailLoader,
|
||||||
|
)
|
||||||
from langchain.document_loaders.epub import UnstructuredEPubLoader
|
from langchain.document_loaders.epub import UnstructuredEPubLoader
|
||||||
from langchain.document_loaders.evernote import EverNoteLoader
|
from langchain.document_loaders.evernote import EverNoteLoader
|
||||||
from langchain.document_loaders.facebook_chat import FacebookChatLoader
|
from langchain.document_loaders.facebook_chat import FacebookChatLoader
|
||||||
@ -61,7 +66,9 @@ from langchain.document_loaders.url import UnstructuredURLLoader
|
|||||||
from langchain.document_loaders.url_selenium import SeleniumURLLoader
|
from langchain.document_loaders.url_selenium import SeleniumURLLoader
|
||||||
from langchain.document_loaders.web_base import WebBaseLoader
|
from langchain.document_loaders.web_base import WebBaseLoader
|
||||||
from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
|
from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
|
||||||
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
|
from langchain.document_loaders.word_document import (
|
||||||
|
UnstructuredWordDocumentLoader,
|
||||||
|
)
|
||||||
from langchain.document_loaders.youtube import (
|
from langchain.document_loaders.youtube import (
|
||||||
GoogleApiClient,
|
GoogleApiClient,
|
||||||
GoogleApiYoutubeLoader,
|
GoogleApiYoutubeLoader,
|
||||||
@ -89,6 +96,7 @@ __all__ = [
|
|||||||
"UnstructuredImageLoader",
|
"UnstructuredImageLoader",
|
||||||
"ObsidianLoader",
|
"ObsidianLoader",
|
||||||
"UnstructuredEmailLoader",
|
"UnstructuredEmailLoader",
|
||||||
|
"OutlookMessageLoader",
|
||||||
"UnstructuredEPubLoader",
|
"UnstructuredEPubLoader",
|
||||||
"UnstructuredMarkdownLoader",
|
"UnstructuredMarkdownLoader",
|
||||||
"RoamLoader",
|
"RoamLoader",
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
"""Loader that loads email files."""
|
"""Loader that loads email files."""
|
||||||
|
import os
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
|
||||||
@ -11,3 +14,42 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
|
|||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
|
|
||||||
return partition_email(filename=self.file_path)
|
return partition_email(filename=self.file_path)
|
||||||
|
|
||||||
|
|
||||||
|
class OutlookMessageLoader(BaseLoader):
    """Loader that loads Outlook Message (``.msg``) files using extract_msg.

    https://github.com/TeamMsgExtractor/msg-extractor
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path to the Outlook ``.msg`` file to load.

        Raises:
            ValueError: If ``file_path`` is not an existing regular file.
            ImportError: If the ``extract_msg`` package is not installed.
        """
        self.file_path = file_path

        if not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file" % self.file_path)

        # Fail early at construction time rather than on the first load() call.
        try:
            import extract_msg  # noqa:F401
        except ImportError:
            raise ImportError(
                "extract_msg is not installed. Please install it with "
                "`pip install extract_msg`"
            )

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            A single-element list holding one ``Document`` whose
            ``page_content`` is the message body and whose metadata carries
            the message's ``subject``, ``sender`` and ``date``.
        """
        import extract_msg

        msg = extract_msg.Message(self.file_path)
        try:
            # Read every field while the message is still open.
            return [
                Document(
                    page_content=msg.body,
                    metadata={
                        "subject": msg.subject,
                        "sender": msg.sender,
                        "date": msg.date,
                    },
                )
            ]
        finally:
            # Release the underlying file handle; the original never closed
            # the message and leaked it on every call.
            msg.close()
|
||||||
|
20
tests/integration_tests/document_loaders/test_email.py
Normal file
20
tests/integration_tests/document_loaders/test_email.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from langchain.document_loaders import OutlookMessageLoader
|
||||||
|
|
||||||
|
|
||||||
|
def test_outlook_message_loader() -> None:
    """Load the sample ``hello.msg`` fixture and verify body and metadata."""
    examples_dir = Path(__file__).parent.parent
    msg_path = examples_dir / "examples/hello.msg"
    documents = OutlookMessageLoader(str(msg_path)).load()

    # The loader yields exactly one document per message file.
    assert len(documents) == 1

    metadata = documents[0].metadata
    assert metadata["subject"] == "Test for TIF files"
    assert metadata["sender"] == "Brian Zhou <brizhou@gmail.com>"
    assert metadata["date"] == "Mon, 18 Nov 2013 16:26:24 +0800"

    expected_body = (
        "This is a test email to experiment with the MS Outlook MSG "
        "Extractor\r\n\r\n\r\n-- \r\n\r\n\r\nKind regards"
        "\r\n\r\n\r\n\r\n\r\nBrian Zhou\r\n\r\n"
    )
    assert documents[0].page_content == expected_body
|
BIN
tests/integration_tests/examples/hello.msg
Normal file
BIN
tests/integration_tests/examples/hello.msg
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user