feat: add loader for rich text files (#3227)

### Summary

Adds a loader for rich text files. Requires `unstructured>=0.5.12`.

### Testing

The following test uses the example RTF file from the [`unstructured`
repo](https://github.com/Unstructured-IO/unstructured/tree/main/example-docs).

```python
from langchain.document_loaders import UnstructuredRTFLoader

loader = UnstructuredRTFLoader("fake-doc.rtf", mode="elements")
docs = loader.load()
docs[0].page_content
```
This commit is contained in:
Matt Robinson 2023-04-20 18:51:49 -04:00 committed by GitHub
parent 5ef2d1e2a1
commit 3943759a90
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 0 deletions

View File

@ -57,6 +57,7 @@ from langchain.document_loaders.pdf import (
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader from langchain.document_loaders.readthedocs import ReadTheDocsLoader
from langchain.document_loaders.roam import RoamLoader from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.sitemap import SitemapLoader from langchain.document_loaders.sitemap import SitemapLoader
@ -106,6 +107,7 @@ __all__ = [
"OutlookMessageLoader", "OutlookMessageLoader",
"UnstructuredEPubLoader", "UnstructuredEPubLoader",
"UnstructuredMarkdownLoader", "UnstructuredMarkdownLoader",
"UnstructuredRTFLoader",
"RoamLoader", "RoamLoader",
"YoutubeLoader", "YoutubeLoader",
"S3FileLoader", "S3FileLoader",

View File

@ -0,0 +1,28 @@
"""Loader that loads rich text files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
)
class UnstructuredRTFLoader(UnstructuredFileLoader):
    """Load rich text (``.rtf``) files by partitioning them with unstructured."""

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """Check the unstructured version, then initialize the base loader.

        Args:
            file_path: Path of the rtf file to load.
            mode: Forwarded unchanged to ``UnstructuredFileLoader``;
                defaults to ``"single"``.
            **unstructured_kwargs: Extra keyword arguments, forwarded to the
                base loader and later passed on to ``partition_rtf``.

        Raises:
            ValueError: If ``satisfies_min_unstructured_version`` reports that
                the 0.5.12 minimum is not met.
        """
        min_unstructured_version = "0.5.12"
        # Gate on the version before the base class is initialized, so an
        # unsupported install fails up front.
        version_is_supported = satisfies_min_unstructured_version(
            min_unstructured_version
        )
        if not version_is_supported:
            raise ValueError(
                "Partitioning rtf files is only supported in "
                f"unstructured>={min_unstructured_version}."
            )

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the rtf file at ``self.file_path`` and return the result."""
        # The import is deferred to call time rather than done at module level.
        from unstructured.partition.rtf import partition_rtf

        elements = partition_rtf(
            filename=self.file_path, **self.unstructured_kwargs
        )
        return elements