diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/example_data/README.rst b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/README.rst new file mode 100644 index 00000000000..45630d0385d --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/README.rst @@ -0,0 +1,28 @@ +Example Docs +------------ + +The sample docs directory contains the following files: + +- ``example-10k.html`` - A 10-K SEC filing in HTML format +- ``layout-parser-paper.pdf`` - A PDF copy of the layout parser paper +- ``factbook.xml``/``factbook.xsl`` - Example XML/XSL files that you + can use to test stylesheets + +These documents can be used to test out the parsers in the library. In +addition, here are instructions for pulling in some sample docs that are +too big to store in the repo. + +XBRL 10-K +^^^^^^^^^ + +You can get an example 10-K in inline XBRL format using the following +``curl``. Note, you need to have the user agent set in the header or the +SEC site will reject your request. + +.. code:: bash + + curl -O \ + -A '${organization} ${email}' \ + https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt + +You can parse this document using the HTML parser. diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/rst.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/rst.ipynb new file mode 100644 index 00000000000..1e2b6dd1372 --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/rst.ipynb @@ -0,0 +1,88 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RST\n", + "\n", + ">[reStructuredText (RST)](https://en.wikipedia.org/wiki/ReStructuredText) is a file format for textual data used primarily in the Python programming language community for technical documentation."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `UnstructuredRSTLoader`\n", + "\n", + "You can load data from RST files with `UnstructuredRSTLoader` using the following workflow." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredRSTLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredRSTLoader(\n", + " file_path=\"example_data/README.rst\", mode=\"elements\"\n", + ")\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Example Docs' metadata={'source': 'example_data/README.rst', 'filename': 'README.rst', 'file_directory': 'example_data', 'filetype': 'text/x-rst', 'page_number': 1, 'category': 'Title'}\n" + ] + } + ], + "source": [ + "print(docs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 53c0bf23b45..1503734f773 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -97,6 +97,7 @@ from langchain.document_loaders.readthedocs import ReadTheDocsLoader from langchain.document_loaders.recursive_url_loader import RecusiveUrlLoader from 
langchain.document_loaders.reddit import RedditPostsLoader from langchain.document_loaders.roam import RoamLoader +from langchain.document_loaders.rst import UnstructuredRSTLoader from langchain.document_loaders.rtf import UnstructuredRTFLoader from langchain.document_loaders.s3_directory import S3DirectoryLoader from langchain.document_loaders.s3_file import S3FileLoader @@ -261,6 +262,7 @@ __all__ = [ "UnstructuredODTLoader", "UnstructuredPDFLoader", "UnstructuredPowerPointLoader", + "UnstructuredRSTLoader", "UnstructuredRTFLoader", "UnstructuredURLLoader", "UnstructuredWordDocumentLoader", diff --git a/langchain/document_loaders/rst.py b/langchain/document_loaders/rst.py new file mode 100644 index 00000000000..9b20e7bab4d --- /dev/null +++ b/langchain/document_loaders/rst.py @@ -0,0 +1,22 @@ +"""Loader that loads reStructuredText (RST) files.""" +from typing import Any, List + +from langchain.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) + + +class UnstructuredRSTLoader(UnstructuredFileLoader): + """Loader that uses unstructured's partition_rst to load RST files.""" + + def __init__( + self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + ): + validate_unstructured_version(min_unstructured_version="0.7.5") + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + + def _get_elements(self) -> List: + from unstructured.partition.rst import partition_rst + + return partition_rst(filename=self.file_path, **self.unstructured_kwargs) diff --git a/tests/integration_tests/document_loaders/test_rst.py b/tests/integration_tests/document_loaders/test_rst.py new file mode 100644 index 00000000000..ead71c3dc6c --- /dev/null +++ b/tests/integration_tests/document_loaders/test_rst.py @@ -0,0 +1,15 @@ +import os +from pathlib import Path + +from langchain.document_loaders import UnstructuredRSTLoader + +EXAMPLE_DIRECTORY = file_path = Path(__file__).parent.parent / "examples" + + +def test_unstructured_rst_loader() -> 
None: + """Test unstructured loader.""" + file_path = os.path.join(EXAMPLE_DIRECTORY, "README.rst") + loader = UnstructuredRSTLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 diff --git a/tests/integration_tests/examples/README.rst b/tests/integration_tests/examples/README.rst new file mode 100644 index 00000000000..45630d0385d --- /dev/null +++ b/tests/integration_tests/examples/README.rst @@ -0,0 +1,28 @@ +Example Docs +------------ + +The sample docs directory contains the following files: + +- ``example-10k.html`` - A 10-K SEC filing in HTML format +- ``layout-parser-paper.pdf`` - A PDF copy of the layout parser paper +- ``factbook.xml``/``factbook.xsl`` - Example XML/XSL files that you + can use to test stylesheets + +These documents can be used to test out the parsers in the library. In +addition, here are instructions for pulling in some sample docs that are +too big to store in the repo. + +XBRL 10-K +^^^^^^^^^ + +You can get an example 10-K in inline XBRL format using the following +``curl``. Note, you need to have the user agent set in the header or the +SEC site will reject your request. + +.. code:: bash + + curl -O \ + -A '${organization} ${email}' \ + https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt + +You can parse this document using the HTML parser.