diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index ab5fc4b0de3..c4cc744838f 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -57,6 +57,7 @@ from langchain.document_loaders.pdf import (
 from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 from langchain.document_loaders.roam import RoamLoader
+from langchain.document_loaders.rtf import UnstructuredRTFLoader
 from langchain.document_loaders.s3_directory import S3DirectoryLoader
 from langchain.document_loaders.s3_file import S3FileLoader
 from langchain.document_loaders.sitemap import SitemapLoader
@@ -106,6 +107,7 @@ __all__ = [
     "OutlookMessageLoader",
     "UnstructuredEPubLoader",
     "UnstructuredMarkdownLoader",
+    "UnstructuredRTFLoader",
     "RoamLoader",
     "YoutubeLoader",
     "S3FileLoader",
diff --git a/langchain/document_loaders/rtf.py b/langchain/document_loaders/rtf.py
new file mode 100644
index 00000000000..c4113be2062
--- /dev/null
+++ b/langchain/document_loaders/rtf.py
@@ -0,0 +1,50 @@
+"""Loader that loads rich text files."""
+from typing import Any, List
+
+from langchain.document_loaders.unstructured import (
+    UnstructuredFileLoader,
+    satisfies_min_unstructured_version,
+)
+
+
+class UnstructuredRTFLoader(UnstructuredFileLoader):
+    """Loader that uses unstructured to load rtf files.
+
+    Construction fails unless ``satisfies_min_unstructured_version`` accepts
+    version 0.5.12; partitioning is delegated to ``partition_rtf``.
+    """
+
+    def __init__(
+        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+    ):
+        """Verify the unstructured version, then initialise the base loader.
+
+        Args:
+            file_path: Passed through to ``UnstructuredFileLoader``.
+            mode: Passed through to ``UnstructuredFileLoader``; defaults to
+                ``"single"``.
+            **unstructured_kwargs: Passed through to ``UnstructuredFileLoader``.
+
+        Raises:
+            ValueError: If ``satisfies_min_unstructured_version("0.5.12")``
+                returns a falsy value.
+        """
+        required_version = "0.5.12"
+        version_ok = satisfies_min_unstructured_version(required_version)
+        if not version_ok:
+            message = (
+                "Partitioning rtf files is only supported in "
+                f"unstructured>={required_version}."
+            )
+            raise ValueError(message)
+
+        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
+
+    def _get_elements(self) -> List:
+        """Return the output of ``partition_rtf`` for ``self.file_path``."""
+        # Imported here rather than at module level, so importing this module
+        # does not by itself import the rtf partitioner.
+        from unstructured.partition.rtf import partition_rtf
+
+        elements = partition_rtf(filename=self.file_path, **self.unstructured_kwargs)
+        return elements