feat: Add UnstructuredRSTLoader (#6594)

### Summary

Adds an `UnstructuredRSTLoader` for loading
[reStructuredText](https://en.wikipedia.org/wiki/ReStructuredText) files.

### Testing

```python
from langchain.document_loaders import UnstructuredRSTLoader

loader = UnstructuredRSTLoader(
    file_path="example_data/README.rst", mode="elements"
)
docs = loader.load()
print(docs[0])
```

### Reviewers

- @hwchase17 
- @rlancemartin 
- @eyurtsev
This commit is contained in:
Matt Robinson 2023-06-25 15:41:57 -04:00 committed by GitHub
parent b32cc01c9f
commit be68f6f8ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 183 additions and 0 deletions

View File

@ -0,0 +1,28 @@
Example Docs
------------
The sample docs directory contains the following files:
- ``example-10k.html`` - A 10-K SEC filing in HTML format
- ``layout-parser-paper.pdf`` - A PDF copy of the layout parser paper
- ``factbook.xml``/``factbook.xsl`` - Example XML/XSL files that you
can use to test stylesheets
These documents can be used to test out the parsers in the library. In
addition, here are instructions for pulling in some sample docs that are
too big to store in the repo.
XBRL 10-K
^^^^^^^^^
You can get an example 10-K in inline XBRL format using the following
``curl``. Note, you need to have the user agent set in the header or the
SEC site will reject your request.
.. code:: bash
curl -O \
-A '${organization} ${email}' \
https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
You can parse this document using the HTML parser.

View File

@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RST\n",
"\n",
">[reStructuredText (RST)](https://en.wikipedia.org/wiki/ReStructuredText) is a file format for textual data used primarily in the Python programming language community for technical documentation."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## `UnstructuredRSTLoader`\n",
"\n",
"You can load data from RST files with `UnstructuredRSTLoader` using the following workflow."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import UnstructuredRSTLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredRSTLoader(\n",
" file_path=\"example_data/README.rst\", mode=\"elements\"\n",
")\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_content='Example Docs' metadata={'source': 'example_data/README.rst', 'filename': 'README.rst', 'file_directory': 'example_data', 'filetype': 'text/x-rst', 'page_number': 1, 'category': 'Title'}\n"
]
}
],
"source": [
"print(docs[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -97,6 +97,7 @@ from langchain.document_loaders.readthedocs import ReadTheDocsLoader
from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader
from langchain.document_loaders.reddit import RedditPostsLoader from langchain.document_loaders.reddit import RedditPostsLoader
from langchain.document_loaders.roam import RoamLoader from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rst import UnstructuredRSTLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader from langchain.document_loaders.s3_file import S3FileLoader
@ -261,6 +262,7 @@ __all__ = [
"UnstructuredODTLoader", "UnstructuredODTLoader",
"UnstructuredPDFLoader", "UnstructuredPDFLoader",
"UnstructuredPowerPointLoader", "UnstructuredPowerPointLoader",
"UnstructuredRSTLoader",
"UnstructuredRTFLoader", "UnstructuredRTFLoader",
"UnstructuredURLLoader", "UnstructuredURLLoader",
"UnstructuredWordDocumentLoader", "UnstructuredWordDocumentLoader",

View File

@ -0,0 +1,22 @@
"""Loader that loads RST files."""
from typing import Any, List
from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)
class UnstructuredRSTLoader(UnstructuredFileLoader):
    """Load reStructuredText (RST) files using the ``unstructured`` package.

    Element extraction is delegated to
    ``unstructured.partition.rst.partition_rst``.
    """

    def __init__(
        self,
        file_path: str,
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Validate the ``unstructured`` version, then initialise the base loader.

        Args:
            file_path: Path of the RST file to load.
            mode: Loading mode handed to ``UnstructuredFileLoader``;
                defaults to ``"single"``.
            **unstructured_kwargs: Extra keyword arguments handed to the
                base loader.
        """
        # The version check is requested with a minimum of 0.7.5 and runs
        # before the base class is initialised.
        validate_unstructured_version(min_unstructured_version="0.7.5")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Return the result of ``partition_rst`` for ``self.file_path``.

        ``self.unstructured_kwargs`` is passed through as keyword arguments.
        """
        # Imported inside the method rather than at module import time.
        from unstructured.partition.rst import partition_rst

        elements = partition_rst(filename=self.file_path, **self.unstructured_kwargs)
        return elements

View File

@ -0,0 +1,15 @@
import os
from pathlib import Path
from langchain.document_loaders import UnstructuredRSTLoader
# "examples" directory located one level above this test module's directory.
# Both module-level names are bound to the same path.
EXAMPLE_DIRECTORY = file_path = Path(__file__).parent.parent / "examples"


def test_unstructured_rst_loader() -> None:
    """Load the sample ``README.rst`` and expect exactly one document."""
    rst_path = os.path.join(EXAMPLE_DIRECTORY, "README.rst")
    documents = UnstructuredRSTLoader(str(rst_path)).load()

    assert len(documents) == 1

View File

@ -0,0 +1,28 @@
Example Docs
------------
The sample docs directory contains the following files:
- ``example-10k.html`` - A 10-K SEC filing in HTML format
- ``layout-parser-paper.pdf`` - A PDF copy of the layout parser paper
- ``factbook.xml``/``factbook.xsl`` - Example XML/XSL files that you
can use to test stylesheets
These documents can be used to test out the parsers in the library. In
addition, here are instructions for pulling in some sample docs that are
too big to store in the repo.
XBRL 10-K
^^^^^^^^^
You can get an example 10-K in inline XBRL format using the following
``curl``. Note, you need to have the user agent set in the header or the
SEC site will reject your request.
.. code:: bash
curl -O \
-A '${organization} ${email}' \
https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
You can parse this document using the HTML parser.