mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-28 22:14:18 +00:00
feat: add loader for rich text files (#3227)
### Summary

Adds a loader for rich text files. Requires `unstructured>=0.5.12`.

### Testing

The following test uses the example RTF file from the [`unstructured` repo](https://github.com/Unstructured-IO/unstructured/tree/main/example-docs).

```python
from langchain.document_loaders import UnstructuredRTFLoader

loader = UnstructuredRTFLoader("fake-doc.rtf", mode="elements")
docs = loader.load()
docs[0].page_content
```
This commit is contained in:
parent
5ef2d1e2a1
commit
3943759a90
@ -57,6 +57,7 @@ from langchain.document_loaders.pdf import (
|
|||||||
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
|
||||||
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
|
||||||
from langchain.document_loaders.roam import RoamLoader
|
from langchain.document_loaders.roam import RoamLoader
|
||||||
|
from langchain.document_loaders.rtf import UnstructuredRTFLoader
|
||||||
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
from langchain.document_loaders.s3_directory import S3DirectoryLoader
|
||||||
from langchain.document_loaders.s3_file import S3FileLoader
|
from langchain.document_loaders.s3_file import S3FileLoader
|
||||||
from langchain.document_loaders.sitemap import SitemapLoader
|
from langchain.document_loaders.sitemap import SitemapLoader
|
||||||
@ -106,6 +107,7 @@ __all__ = [
|
|||||||
"OutlookMessageLoader",
|
"OutlookMessageLoader",
|
||||||
"UnstructuredEPubLoader",
|
"UnstructuredEPubLoader",
|
||||||
"UnstructuredMarkdownLoader",
|
"UnstructuredMarkdownLoader",
|
||||||
|
"UnstructuredRTFLoader",
|
||||||
"RoamLoader",
|
"RoamLoader",
|
||||||
"YoutubeLoader",
|
"YoutubeLoader",
|
||||||
"S3FileLoader",
|
"S3FileLoader",
|
||||||
|
28
langchain/document_loaders/rtf.py
Normal file
28
langchain/document_loaders/rtf.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
"""Loader that loads rich text files."""
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
|
from langchain.document_loaders.unstructured import (
|
||||||
|
UnstructuredFileLoader,
|
||||||
|
satisfies_min_unstructured_version,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredRTFLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load rtf files.

    Construction is gated on a version check: the loader refuses to
    initialise unless ``satisfies_min_unstructured_version("0.5.12")``
    passes.
    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ) -> None:
        """Check the unstructured version, then defer to the parent loader.

        Args:
            file_path: Path of the rtf file to load.
            mode: Forwarded unchanged to ``UnstructuredFileLoader``;
                defaults to ``"single"``.
            **unstructured_kwargs: Extra keyword arguments forwarded
                unchanged to ``UnstructuredFileLoader``.

        Raises:
            ValueError: If the minimum unstructured version check fails.
        """
        min_unstructured_version = "0.5.12"
        # Fail at construction time rather than later inside
        # ``_get_elements``, so the caller sees the version problem first.
        if not satisfies_min_unstructured_version(min_unstructured_version):
            raise ValueError(
                "Partitioning rtf files is only supported in "
                f"unstructured>={min_unstructured_version}."
            )

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the rtf file and return the result of ``partition_rtf``.

        NOTE(review): ``self.file_path`` and ``self.unstructured_kwargs``
        are presumably set by ``UnstructuredFileLoader.__init__`` -- confirm
        against the parent class.
        """
        # Imported lazily so that importing this module does not itself
        # require the optional ``unstructured`` package.
        from unstructured.partition.rtf import partition_rtf

        return partition_rtf(filename=self.file_path, **self.unstructured_kwargs)
|
Loading…
Reference in New Issue
Block a user