mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 04:38:26 +00:00
Harrison/msg files (#2375)
Co-authored-by: Sahil Masand <masand.sahil@gmail.com> Co-authored-by: Sahil Masand <masands@cbh.com.au>
This commit is contained in:
parent
585f60a5aa
commit
e90d007db3
@ -7,7 +7,15 @@
|
||||
"source": [
|
||||
"# Email\n",
|
||||
"\n",
|
||||
"This notebook shows how to load email (`.eml`) files."
|
||||
"This notebook shows how to load email (`.eml`) and Microsoft Outlook (`.msg`) files."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "89caa348",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using Unstructured"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -66,7 +74,7 @@
|
||||
"id": "8bf50cba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retain Elements\n",
|
||||
"### Retain Elements\n",
|
||||
"\n",
|
||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||
]
|
||||
@ -112,10 +120,69 @@
|
||||
"data[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6a074515",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using OutlookMessageLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "1e7a8444",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import OutlookMessageLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "77a055e6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = OutlookMessageLoader('example_data/fake-email.msg')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "789882de",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "46aa0632",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='This is a test email to experiment with the MS Outlook MSG Extractor\\r\\n\\r\\n\\r\\n-- \\r\\n\\r\\n\\r\\nKind regards\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nBrian Zhou\\r\\n\\r\\n', metadata={'subject': 'Test for TIF files', 'sender': 'Brian Zhou <brizhou@gmail.com>', 'date': 'Mon, 18 Nov 2013 16:26:24 +0800'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6a074515",
|
||||
"id": "2b223ce2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
|
Binary file not shown.
@ -11,13 +11,18 @@ from langchain.document_loaders.azure_blob_storage_file import (
|
||||
)
|
||||
from langchain.document_loaders.bigquery import BigQueryLoader
|
||||
from langchain.document_loaders.blackboard import BlackboardLoader
|
||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||
from langchain.document_loaders.college_confidential import (
|
||||
CollegeConfidentialLoader,
|
||||
)
|
||||
from langchain.document_loaders.conllu import CoNLLULoader
|
||||
from langchain.document_loaders.csv_loader import CSVLoader
|
||||
from langchain.document_loaders.dataframe import DataFrameLoader
|
||||
from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.duckdb_loader import DuckDBLoader
|
||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||
from langchain.document_loaders.email import (
|
||||
OutlookMessageLoader,
|
||||
UnstructuredEmailLoader,
|
||||
)
|
||||
from langchain.document_loaders.epub import UnstructuredEPubLoader
|
||||
from langchain.document_loaders.evernote import EverNoteLoader
|
||||
from langchain.document_loaders.facebook_chat import FacebookChatLoader
|
||||
@ -61,7 +66,9 @@ from langchain.document_loaders.url import UnstructuredURLLoader
|
||||
from langchain.document_loaders.url_selenium import SeleniumURLLoader
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
|
||||
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
|
||||
from langchain.document_loaders.word_document import (
|
||||
UnstructuredWordDocumentLoader,
|
||||
)
|
||||
from langchain.document_loaders.youtube import (
|
||||
GoogleApiClient,
|
||||
GoogleApiYoutubeLoader,
|
||||
@ -89,6 +96,7 @@ __all__ = [
|
||||
"UnstructuredImageLoader",
|
||||
"ObsidianLoader",
|
||||
"UnstructuredEmailLoader",
|
||||
"OutlookMessageLoader",
|
||||
"UnstructuredEPubLoader",
|
||||
"UnstructuredMarkdownLoader",
|
||||
"RoamLoader",
|
||||
|
@ -1,6 +1,9 @@
|
||||
"""Loader that loads email files."""
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
@ -11,3 +14,42 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||
from unstructured.partition.email import partition_email
|
||||
|
||||
return partition_email(filename=self.file_path)
|
||||
|
||||
|
||||
class OutlookMessageLoader(BaseLoader):
    """Loader that loads Outlook Message (``.msg``) files using extract_msg.

    The message body becomes the document's ``page_content``; the subject,
    sender and date of the message are stored in the document's metadata.

    https://github.com/TeamMsgExtractor/msg-extractor
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path to the ``.msg`` file to load.

        Raises:
            ValueError: If ``file_path`` does not point to an existing file.
            ImportError: If the ``extract_msg`` package is not installed.
        """
        self.file_path = file_path

        if not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file" % self.file_path)

        # Probe for the optional dependency here so a missing package is
        # reported when the loader is constructed, not on the first load().
        try:
            import extract_msg  # noqa:F401
        except ImportError as exc:
            raise ImportError(
                "extract_msg is not installed. Please install it with "
                "`pip install extract_msg`"
            ) from exc

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            A single-element list holding one ``Document`` built from the
            message body, with ``subject``, ``sender`` and ``date`` metadata.
        """
        import extract_msg

        msg = extract_msg.Message(self.file_path)
        try:
            # Read every field before the message is closed.
            return [
                Document(
                    page_content=msg.body,
                    metadata={
                        "subject": msg.subject,
                        "sender": msg.sender,
                        "date": msg.date,
                    },
                )
            ]
        finally:
            # Release the underlying file handle even if reading a field fails.
            msg.close()
|
||||
|
20
tests/integration_tests/document_loaders/test_email.py
Normal file
20
tests/integration_tests/document_loaders/test_email.py
Normal file
@ -0,0 +1,20 @@
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import OutlookMessageLoader
|
||||
|
||||
|
||||
def test_outlook_message_loader() -> None:
    """Check the document OutlookMessageLoader builds from the hello.msg fixture."""
    examples_dir = Path(__file__).parent.parent
    msg_path = examples_dir / "examples/hello.msg"

    expected_metadata = {
        "subject": "Test for TIF files",
        "sender": "Brian Zhou <brizhou@gmail.com>",
        "date": "Mon, 18 Nov 2013 16:26:24 +0800",
    }
    expected_body = (
        "This is a test email to experiment with the MS Outlook MSG "
        "Extractor\r\n\r\n\r\n-- \r\n\r\n\r\nKind regards"
        "\r\n\r\n\r\n\r\n\r\nBrian Zhou\r\n\r\n"
    )

    docs = OutlookMessageLoader(str(msg_path)).load()

    # The loader yields exactly one document per message file.
    assert len(docs) == 1
    document = docs[0]

    # Check each expected key individually, in the same order as before.
    for key, value in expected_metadata.items():
        assert document.metadata[key] == value
    assert document.page_content == expected_body
|
BIN
tests/integration_tests/examples/hello.msg
Normal file
BIN
tests/integration_tests/examples/hello.msg
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user