mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-28 05:54:55 +00:00
feat: add loader for rich text files (#3227)
### Summary

Adds a loader for rich text files. Requires `unstructured>=0.5.12`.

### Testing

The following test uses the example RTF file from the [`unstructured` repo](https://github.com/Unstructured-IO/unstructured/tree/main/example-docs).

```python
from langchain.document_loaders import UnstructuredRTFLoader

loader = UnstructuredRTFLoader("fake-doc.rtf", mode="elements")
docs = loader.load()
docs[0].page_content
```
This commit is contained in:
parent
5ef2d1e2a1
commit
3943759a90
@ -57,6 +57,7 @@ from langchain.document_loaders.pdf import (
|
||||
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||
from langchain.document_loaders.roam import RoamLoader
|
||||
from langchain.document_loaders.rtf import UnstructuredRTFLoader
|
||||
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
||||
from langchain.document_loaders.s3_file import S3FileLoader
|
||||
from langchain.document_loaders.sitemap import SitemapLoader
|
||||
@ -106,6 +107,7 @@ __all__ = [
|
||||
"OutlookMessageLoader",
|
||||
"UnstructuredEPubLoader",
|
||||
"UnstructuredMarkdownLoader",
|
||||
"UnstructuredRTFLoader",
|
||||
"RoamLoader",
|
||||
"YoutubeLoader",
|
||||
"S3FileLoader",
|
||||
|
28
langchain/document_loaders/rtf.py
Normal file
28
langchain/document_loaders/rtf.py
Normal file
@ -0,0 +1,28 @@
|
||||
"""Loader that loads rich text files."""
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
satisfies_min_unstructured_version,
|
||||
)
|
||||
|
||||
|
||||
class UnstructuredRTFLoader(UnstructuredFileLoader):
    """Load rich text (``.rtf``) files by partitioning them with unstructured.

    Construction raises ``ValueError`` unless
    ``satisfies_min_unstructured_version`` accepts version ``0.5.12``.
    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """Check the unstructured version, then defer to the base loader.

        Args:
            file_path: Path of the RTF file; forwarded to the base class.
            mode: Forwarded unchanged to the base class (defaults to
                ``"single"``).
            **unstructured_kwargs: Extra keyword arguments forwarded to the
                base class.

        Raises:
            ValueError: If the minimum-version check for unstructured fails.
        """
        required_version = "0.5.12"
        version_ok = satisfies_min_unstructured_version(required_version)
        if not version_ok:
            message = (
                "Partitioning rtf files is only supported in "
                f"unstructured>={required_version}."
            )
            raise ValueError(message)

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Return the result of ``partition_rtf`` for ``self.file_path``.

        ``self.unstructured_kwargs`` is passed through as keyword arguments.
        """
        # Imported at call time rather than at module import time.
        from unstructured.partition.rtf import partition_rtf

        elements = partition_rtf(filename=self.file_path, **self.unstructured_kwargs)
        return elements
|
Loading…
Reference in New Issue
Block a user