From 2a70a07aad613614f48059a4b9cdfbe4e91d7fa5 Mon Sep 17 00:00:00 2001 From: Alexander Golodkov <55749660+alexander1999-hub@users.noreply.github.com> Date: Tue, 23 Jul 2024 05:04:53 +0300 Subject: [PATCH] community[minor]: added new document loaders based on dedoc library (#24303) ### Description This pull request added new document loaders to load documents of various formats using [Dedoc](https://github.com/ispras/dedoc): - `DedocFileLoader` (determine file types automatically and parse) - `DedocPDFLoader` (for `PDF` and images parsing) - `DedocAPIFileLoader` (determine file types automatically and parse using Dedoc API without library installation) [Dedoc](https://dedoc.readthedocs.io) is an open-source library/service that extracts texts, tables, attached files and document structure (e.g., titles, list items, etc.) from files of various formats. The library is actively developed and maintained by a group of developers. `Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more. Full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1). For `PDF` documents, `Dedoc` can determine textual layer correctness and split the document into paragraphs. ### Issue This pull request extends the variety of document loaders supported by `langchain_community` allowing users to choose the most suitable option for raw document parsing. ### Dependencies The PR added a new (optional) dependency `dedoc>=2.2.6,<3` ([library documentation](https://dedoc.readthedocs.io)) to the `extended_testing_deps.txt` ### Twitter handle None ### Add tests and docs 1. Test for the integration: `libs/community/tests/integration_tests/document_loaders/test_dedoc.py` 2. Example notebook: `docs/docs/integrations/document_loaders/dedoc.ipynb` 3. 
Information about the library: `docs/docs/integrations/providers/dedoc.mdx` ### Lint and test Done locally: - `make format` - `make lint` - `make integration_tests` - `make docs_build` (from the project root) --------- Co-authored-by: Nasty --- .../integrations/document_loaders/dedoc.ipynb | 484 ++++++++++++++++ docs/docs/integrations/providers/dedoc.mdx | 56 ++ libs/community/extended_testing_deps.txt | 1 + .../document_loaders/__init__.py | 11 + .../document_loaders/dedoc.py | 546 ++++++++++++++++++ .../document_loaders/pdf.py | 99 ++++ .../document_loaders/test_dedoc.py | 146 +++++ .../document_loaders/test_imports.py | 3 + 8 files changed, 1346 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/dedoc.ipynb create mode 100644 docs/docs/integrations/providers/dedoc.mdx create mode 100644 libs/community/langchain_community/document_loaders/dedoc.py create mode 100644 libs/community/tests/integration_tests/document_loaders/test_dedoc.py diff --git a/docs/docs/integrations/document_loaders/dedoc.ipynb b/docs/docs/integrations/document_loaders/dedoc.ipynb new file mode 100644 index 00000000000..084537141f4 --- /dev/null +++ b/docs/docs/integrations/document_loaders/dedoc.ipynb @@ -0,0 +1,484 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6b74f73d-1763-42d0-9c24-8f65f445bb72", + "metadata": {}, + "source": [ + "# Dedoc\n", + "\n", + "This sample demonstrates the use of `Dedoc` in combination with `LangChain` as a `DocumentLoader`.\n", + "\n", + "## Overview\n", + "\n", + "[Dedoc](https://dedoc.readthedocs.io) is an [open-source](https://github.com/ispras/dedoc)\n", + "library/service that extracts texts, tables, attached files and document structure\n", + "(e.g., titles, list items, etc.) 
from files of various formats.\n", + "\n", + "`Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more.\n", + "Full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1).\n", + "\n", + "\n", + "### Integration details\n", + "\n", + "| Class | Package | Local | Serializable | JS support |\n", + "|:-----------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|:-----:|:------------:|:----------:|\n", + "| [DedocFileLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.dedoc.DedocFileLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | beta | ❌ |\n", + "| [DedocPDFLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.DedocPDFLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | beta | ❌ | \n", + "| [DedocAPIFileLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.dedoc.DedocAPIFileLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | beta | ❌ | \n", + "\n", + "\n", + "### Loader features\n", + "\n", + "Methods for lazy loading and async loading are available, but in fact, document loading is executed synchronously.\n", + "\n", + "| Source | Document Lazy Loading | Async Support |\n", + "|:------------------:|:---------------------:|:-------------:| \n", + "| DedocFileLoader | ❌ | ❌ |\n", + "| DedocPDFLoader | ❌ | ❌ | \n", + "| DedocAPIFileLoader | ❌ | ❌ | \n", + "\n", + "## Setup\n", + "\n", + "* To access `DedocFileLoader` and `DedocPDFLoader` document loaders, you'll need to install the 
`dedoc` integration package.\n", + "* To access `DedocAPIFileLoader`, you'll need to run the `Dedoc` service, e.g. `Docker` container (please see [the documentation](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker) \n", + "for more details):\n", + "\n", + "```bash\n", + "docker pull dedocproject/dedoc\n", + "docker run -p 1231:1231 dedocproject/dedoc\n", + "```\n", + "\n", + "`Dedoc` installation instructions are given [here](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "511c109d-a5c3-42ba-914e-5d1b385bc40f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "# Install package\n", + "%pip install --quiet \"dedoc[torch]\"" + ] + }, + { + "cell_type": "markdown", + "id": "6820c0e9-d56d-4899-b8c8-374760360e2b", + "metadata": {}, + "source": [ + "## Instantiation" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c1f98cae-71ec-4d60-87fb-96c1a76851d8", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import DedocFileLoader\n", + "\n", + "loader = DedocFileLoader(\"./example_data/state_of_the_union.txt\")" + ] + }, + { + "cell_type": "markdown", + "id": "5d7bc2b3-73a0-4cd6-8014-cc7184aa9d4a", + "metadata": {}, + "source": [ + "## Load" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b9097c14-6168-4726-819e-24abb9a63b13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nMadam Speaker, Madam Vice President, our First Lady and Second Gentleman. 
Members of Congress and t'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs = loader.load()\n", + "docs[0].page_content[:100]" + ] + }, + { + "cell_type": "markdown", + "id": "9ed8bd46-0047-4ccc-b2d6-beb7761f7312", + "metadata": {}, + "source": [ + "## Lazy Load" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6ae12d7e-8105-4bbe-9031-0e968475f6bf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and t\n" + ] + } + ], + "source": [ + "docs = loader.lazy_load()\n", + "\n", + "for doc in docs:\n", + " print(doc.page_content[:100])\n", + " break" + ] + }, + { + "cell_type": "markdown", + "id": "8772ae40-6239-4751-bb2d-b4a9415c1ad1", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed information on configuring and calling `Dedoc` loaders, please see the API references: \n", + "\n", + "* https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.dedoc.DedocFileLoader.html\n", + "* https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.DedocPDFLoader.html\n", + "* https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.dedoc.DedocAPIFileLoader.html" + ] + }, + { + "cell_type": "markdown", + "id": "c4d5e702-0e21-4cad-a4c3-b9b3bff77203", + "metadata": {}, + "source": [ + "## Loading any file\n", + "\n", + "For automatic handling of any file in a [supported format](https://dedoc.readthedocs.io/en/latest/#id1),\n", + "`DedocFileLoader` can be useful.\n", + "The file loader automatically detects the file type with a correct extension.\n", + "\n", + "File parsing process can be configured through `dedoc_kwargs` during the `DedocFileLoader` class initialization.\n", + "Here the basic 
examples of some options usage are given, \n", + "please see the documentation of `DedocFileLoader` and \n", + "[dedoc documentation](https://dedoc.readthedocs.io/en/latest/parameters/parameters.html) \n", + "to get more details about configuration parameters." + ] + }, + { + "cell_type": "markdown", + "id": "de97d0ed-d6b1-44e0-b392-1f3d89c762f9", + "metadata": {}, + "source": [ + "### Basic example" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "50ffeeee-db12-4801-b208-7e32ea3d72ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nMadam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\n\\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\n\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\n\\n\\nWith a duty to one another to the American people to '" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_community.document_loaders import DedocFileLoader\n", + "\n", + "loader = DedocFileLoader(\"./example_data/state_of_the_union.txt\")\n", + "\n", + "docs = loader.load()\n", + "\n", + "docs[0].page_content[:400]" + ] + }, + { + "cell_type": "markdown", + "id": "457e5d4c-a4ee-4f31-ae74-3f75a1bbd0af", + "metadata": {}, + "source": [ + "### Modes of split\n", + "\n", + "`DedocFileLoader` supports different types of document splitting into parts (each part is returned separately).\n", + "For this purpose, `split` parameter is used with the following options:\n", + "* `document` (default value): document text is returned as a single langchain `Document` object (don't split);\n", + "* `page`: split document text into pages (works for `PDF`, `DJVU`, `PPTX`, `PPT`, `ODP`);\n", + "* `node`: split document text into `Dedoc` tree nodes (title nodes, list item 
nodes, raw text nodes);\n", + "* `line`: split document text into textual lines." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "eec54d31-ae7a-4a3c-aa10-4ae276b1e4c4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader = DedocFileLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " split=\"page\",\n", + " pages=\":2\",\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", + "len(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "61e11769-4780-4f77-b10e-27db6936f226", + "metadata": {}, + "source": [ + "### Handling tables\n", + "\n", + "`DedocFileLoader` supports tables handling when `with_tables` parameter is \n", + "set to `True` during loader initialization (`with_tables=True` by default). \n", + "\n", + "Tables are not split - each table corresponds to one langchain `Document` object.\n", + "For tables, `Document` object has additional `metadata` fields `type=\"table\"` \n", + "and `text_as_html` with table `HTML` representation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bbeb2f8a-ac5e-4b59-8026-7ea3fc14c928", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('table',\n", + " '\\n\\n\\n\\n\\n\n\n' in docs[1].metadata["text_as_html"] + assert "Maple Leafs\tTOR\t13" in docs[1].page_content + + +def test_dedoc_api_file_loader() -> None: + file_name = "whatsapp_chat.txt" + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) + loader = DedocAPIFileLoader( + file_path, split="line", url="https://dedoc-readme.hf.space" + ) + docs = loader.load() + + assert len(docs) == 10 + assert docs[0].metadata["file_name"] == "whatsapp_chat.txt" + assert docs[0].metadata["file_type"] == "text/plain" + assert "[05.05.23, 15:48:11] James: Hi here" in docs[0].page_content + assert "[11/8/21, 9:41:32 AM] User name: Message 123" in docs[1].page_content + assert "1/23/23, 3:19 AM - User 2: Bye!" in docs[2].page_content diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index 0f8628d20e8..5cd9ce3d404 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -51,6 +51,9 @@ EXPECTED_ALL = [ "CubeSemanticLoader", "DataFrameLoader", "DatadogLogsLoader", + "DedocAPIFileLoader", + "DedocFileLoader", + "DedocPDFLoader", "PebbloSafeLoader", "DiffbotLoader", "DirectoryLoader",
Team "Payroll (millions)"\\nMIME-Version\\n1.0\\nMessage-ID\\n\\nSubject\\nFake email with attachment\\nTo\\nMallori Harrell ')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader = DedocFileLoader(\n", + " \"./example_data/fake-email-attachment.eml\",\n", + " with_attachments=True,\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", + "docs[1].metadata[\"type\"], docs[1].page_content" + ] + }, + { + "cell_type": "markdown", + "id": "d435c3f6-703a-4064-8307-ace140de967a", + "metadata": {}, + "source": [ + "## Loading PDF file\n", + "\n", + "If you want to handle only `PDF` documents, you can use `DedocPDFLoader` with only `PDF` support.\n", + "The loader supports the same parameters for document split, tables and attachments extraction.\n", + "\n", + "`Dedoc` can extract `PDF` with or without a textual layer, \n", + "as well as automatically detect its presence and correctness.\n", + "Several `PDF` handlers are available, you can use `pdf_with_text_layer` \n", + "parameter to choose one of them.\n", + "Please see [parameters description](https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html) \n", + "to get more details.\n", + "\n", + "For `PDF` without a textual layer, `Tesseract OCR` and its language packages should be installed.\n", + "In this case, [the instruction](https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html) can be useful." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0103a7f3-6b5e-4444-8f4d-83dd3724a9af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n2\\n\\nZ. Shen et al.\\n\\n37], layout detection [38, 22], table detection [26], and scene text detection [4].\\n\\nA generalized learning-based framework dramatically reduces the need for the\\n\\nmanual specification of complicated rules, which is the status quo with traditional\\n\\nmethods. 
DL has the potential to transform DIA pipelines and benefit a broad\\n\\nspectrum of large-scale document digitization projects.\\n'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_community.document_loaders import DedocPDFLoader\n", + "\n", + "loader = DedocPDFLoader(\n", + " \"./example_data/layout-parser-paper.pdf\", pdf_with_text_layer=\"true\", pages=\"2:2\"\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", + "docs[0].page_content[:400]" + ] + }, + { + "cell_type": "markdown", + "id": "13061995-1805-40c2-a77a-a6cd80999e20", + "metadata": {}, + "source": [ + "## Dedoc API\n", + "\n", + "If you want to get up and running with less setup, you can use `Dedoc` as a service.\n", + "**`DedocAPIFileLoader` can be used without installation of `dedoc` library.**\n", + "The loader supports the same parameters as `DedocFileLoader` and\n", + "also automatically detects input file types.\n", + "\n", + "To use `DedocAPIFileLoader`, you should run the `Dedoc` service, e.g. `Docker` container (please see [the documentation](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker) \n", + "for more details):\n", + "\n", + "```bash\n", + "docker pull dedocproject/dedoc\n", + "docker run -p 1231:1231 dedocproject/dedoc\n", + "```\n", + "\n", + "Please do not use our demo URL `https://dedoc-readme.hf.space` in your code." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "211fc0b5-6080-4974-a6c1-f982bafd87d6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nMadam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\n\\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\n\\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 
\\n\\n\\n\\nWith a duty to one another to the American people to '" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_community.document_loaders import DedocAPIFileLoader\n", + "\n", + "loader = DedocAPIFileLoader(\n", + " \"./example_data/state_of_the_union.txt\",\n", + " url=\"https://dedoc-readme.hf.space\",\n", + ")\n", + "\n", + "docs = loader.load()\n", + "\n", + "docs[0].page_content[:400]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "faaff475-5209-436f-bcde-97d58daed05c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/docs/integrations/providers/dedoc.mdx b/docs/docs/integrations/providers/dedoc.mdx new file mode 100644 index 00000000000..3f2aaa206e3 --- /dev/null +++ b/docs/docs/integrations/providers/dedoc.mdx @@ -0,0 +1,56 @@ +# Dedoc + +>[Dedoc](https://dedoc.readthedocs.io) is an [open-source](https://github.com/ispras/dedoc) +library/service that extracts texts, tables, attached files and document structure +(e.g., titles, list items, etc.) from files of various formats. + +`Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more. +Full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1). + +## Installation and Setup + +### Dedoc library + +You can install `Dedoc` using `pip`. 
+In this case, you will need to install dependencies; +please go [here](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html) +to get more information. + +```bash +pip install dedoc +``` + +### Dedoc API + +If you are going to use `Dedoc` API, you don't need to install `dedoc` library. +In this case, you should run the `Dedoc` service, e.g. `Docker` container (please see +[the documentation](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker) +for more details): + +```bash +docker pull dedocproject/dedoc +docker run -p 1231:1231 dedocproject/dedoc +``` + +## Document Loader + +* For handling files of any format (supported by `Dedoc`), you can use `DedocFileLoader`: + + ```python + from langchain_community.document_loaders import DedocFileLoader + ``` + +* For handling PDF files (with or without a textual layer), you can use `DedocPDFLoader`: + + ```python + from langchain_community.document_loaders import DedocPDFLoader + ``` + +* For handling files of any format without library installation, +you can use `Dedoc API` with `DedocAPIFileLoader`: + + ```python + from langchain_community.document_loaders import DedocAPIFileLoader + ``` + +Please see a [usage example](/docs/integrations/document_loaders/dedoc) for more details. 
diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index 198e8954f82..55335cad97d 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -16,6 +16,7 @@ cloudpickle>=2.0.0 cohere>=4,<6 databricks-vectorsearch>=0.21,<0.22 datasets>=2.15.0,<3 +dedoc>=2.2.6,<3 dgml-utils>=0.3.0,<0.4 elasticsearch>=8.12.0,<9 esprima>=4.0.1,<5 diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 7f111b1aae9..e03f7693127 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -142,6 +142,10 @@ if TYPE_CHECKING: from langchain_community.document_loaders.dataframe import ( DataFrameLoader, ) + from langchain_community.document_loaders.dedoc import ( + DedocAPIFileLoader, + DedocFileLoader, + ) from langchain_community.document_loaders.diffbot import ( DiffbotLoader, ) @@ -340,6 +344,7 @@ if TYPE_CHECKING: ) from langchain_community.document_loaders.pdf import ( AmazonTextractPDFLoader, + DedocPDFLoader, MathpixPDFLoader, OnlinePDFLoader, PagedPDFSplitter, @@ -570,6 +575,9 @@ _module_lookup = { "CubeSemanticLoader": "langchain_community.document_loaders.cube_semantic", "DataFrameLoader": "langchain_community.document_loaders.dataframe", "DatadogLogsLoader": "langchain_community.document_loaders.datadog_logs", + "DedocAPIFileLoader": "langchain_community.document_loaders.dedoc", + "DedocFileLoader": "langchain_community.document_loaders.dedoc", + "DedocPDFLoader": "langchain_community.document_loaders.pdf", "DiffbotLoader": "langchain_community.document_loaders.diffbot", "DirectoryLoader": "langchain_community.document_loaders.directory", "DiscordChatLoader": "langchain_community.document_loaders.discord", @@ -771,6 +779,9 @@ __all__ = [ "CubeSemanticLoader", "DataFrameLoader", "DatadogLogsLoader", + 
"DedocAPIFileLoader", + "DedocFileLoader", + "DedocPDFLoader", "DiffbotLoader", "DirectoryLoader", "DiscordChatLoader", diff --git a/libs/community/langchain_community/document_loaders/dedoc.py b/libs/community/langchain_community/document_loaders/dedoc.py new file mode 100644 index 00000000000..ed8ebc11d73 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/dedoc.py @@ -0,0 +1,546 @@ +import html +import json +import os +from abc import ABC, abstractmethod +from typing import ( + Dict, + Iterator, + Optional, + Tuple, + Union, +) + +from langchain_core.documents import Document + +from langchain_community.document_loaders.base import BaseLoader + + +class DedocBaseLoader(BaseLoader, ABC): + """ + Base Loader that uses `dedoc` (https://dedoc.readthedocs.io). + + Loader enables extracting text, tables and attached files from the given file: + * `Text` can be split by pages, `dedoc` tree nodes, textual lines + (according to the `split` parameter). + * `Attached files` (when with_attachments=True) + are split according to the `split` parameter. + For attachments, langchain Document object has an additional metadata field + `type`="attachment". + * `Tables` (when with_tables=True) are not split - each table corresponds to one + langchain Document object. + For tables, Document object has additional metadata fields `type`="table" + and `text_as_html` with table HTML representation. 
+ """ + + def __init__( + self, + file_path: str, + *, + split: str = "document", + with_tables: bool = True, + with_attachments: Union[str, bool] = False, + recursion_deep_attachments: int = 10, + pdf_with_text_layer: str = "auto_tabby", + language: str = "rus+eng", + pages: str = ":", + is_one_column_document: str = "auto", + document_orientation: str = "auto", + need_header_footer_analysis: Union[str, bool] = False, + need_binarization: Union[str, bool] = False, + need_pdf_table_analysis: Union[str, bool] = True, + delimiter: Optional[str] = None, + encoding: Optional[str] = None, + ) -> None: + """ + Initialize with file path and parsing parameters. + + Args: + file_path: path to the file for processing + split: type of document splitting into parts (each part is returned + separately), default value "document" + "document": document text is returned as a single langchain Document + object (don't split) + "page": split document text into pages (works for PDF, DJVU, PPTX, PPT, + ODP) + "node": split document text into tree nodes (title nodes, list item + nodes, raw text nodes) + "line": split document text into lines + with_tables: add tables to the result - each table is returned as a single + langchain Document object + + Parameters used for document parsing via `dedoc` + (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html): + + with_attachments: enable attached files extraction + recursion_deep_attachments: recursion level for attached files + extraction, works only when with_attachments==True + pdf_with_text_layer: type of handler for parsing PDF documents, + available options + ["true", "false", "tabby", "auto", "auto_tabby" (default)] + language: language of the document for PDF without a textual layer and + images, available options ["eng", "rus", "rus+eng" (default)], + the list of languages can be extended, please see + https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html + pages: page slice to define the reading range 
for parsing PDF documents + is_one_column_document: detect number of columns for PDF without + a textual layer and images, available options + ["true", "false", "auto" (default)] + document_orientation: fix document orientation (90, 180, 270 degrees) + for PDF without a textual layer and images, available options + ["auto" (default), "no_change"] + need_header_footer_analysis: remove headers and footers from the output + result for parsing PDF and images + need_binarization: clean pages background (binarize) for PDF without a + textual layer and images + need_pdf_table_analysis: parse tables for PDF without a textual layer + and images + delimiter: column separator for CSV, TSV files + encoding: encoding of TXT, CSV, TSV + """ + self.parsing_parameters = { + key: value + for key, value in locals().items() + if key not in {"self", "file_path", "split", "with_tables"} + } + self.valid_split_values = {"document", "page", "node", "line"} + if split not in self.valid_split_values: + raise ValueError( + f"Got {split} for `split`, but should be one of " + f"`{self.valid_split_values}`" + ) + self.split = split + self.with_tables = with_tables + self.file_path = file_path + + structure_type = "tree" if self.split == "node" else "linear" + self.parsing_parameters["structure_type"] = structure_type + self.parsing_parameters["need_content_analysis"] = with_attachments + + def lazy_load(self) -> Iterator[Document]: + """Lazily load documents.""" + import tempfile + + try: + from dedoc import DedocManager + except ImportError: + raise ImportError( + "`dedoc` package not found, please install it with `pip install dedoc`" + ) + dedoc_manager = DedocManager(manager_config=self._make_config()) + dedoc_manager.config["logger"].disabled = True + + with tempfile.TemporaryDirectory() as tmpdir: + document_tree = dedoc_manager.parse( + file_path=self.file_path, + parameters={**self.parsing_parameters, "attachments_dir": tmpdir}, + ) + yield from self._split_document( + 
document_tree=document_tree.to_api_schema().dict(), split=self.split + ) + + @abstractmethod + def _make_config(self) -> dict: + """ + Make configuration for DedocManager according to the file extension and + parsing parameters. + """ + pass + + def _json2txt(self, paragraph: dict) -> str: + """Get text (recursively) of the document tree node.""" + subparagraphs_text = "\n".join( + [ + self._json2txt(subparagraph) + for subparagraph in paragraph["subparagraphs"] + ] + ) + text = ( + f"{paragraph['text']}\n{subparagraphs_text}" + if subparagraphs_text + else paragraph["text"] + ) + return text + + def _parse_subparagraphs( + self, document_tree: dict, document_metadata: dict + ) -> Iterator[Document]: + """Parse recursively document tree obtained by `dedoc`.""" + if len(document_tree["subparagraphs"]) > 0: + for subparagraph in document_tree["subparagraphs"]: + yield from self._parse_subparagraphs( + document_tree=subparagraph, document_metadata=document_metadata + ) + else: + yield Document( + page_content=document_tree["text"], + metadata={**document_metadata, **document_tree["metadata"]}, + ) + + def _split_document( + self, + document_tree: dict, + split: str, + additional_metadata: Optional[dict] = None, + ) -> Iterator[Document]: + """Split document into parts according to the `split` parameter.""" + document_metadata = document_tree["metadata"] + if additional_metadata: + document_metadata = {**document_metadata, **additional_metadata} + + if split == "document": + text = self._json2txt(paragraph=document_tree["content"]["structure"]) + yield Document(page_content=text, metadata=document_metadata) + + elif split == "page": + nodes = document_tree["content"]["structure"]["subparagraphs"] + page_id = nodes[0]["metadata"]["page_id"] + page_text = "" + + for node in nodes: + if node["metadata"]["page_id"] == page_id: + page_text += self._json2txt(node) + else: + yield Document( + page_content=page_text, + metadata={**document_metadata, "page_id": page_id}, + ) + 
page_id = node["metadata"]["page_id"] + page_text = self._json2txt(node) + + yield Document( + page_content=page_text, + metadata={**document_metadata, "page_id": page_id}, + ) + + elif split == "line": + for node in document_tree["content"]["structure"]["subparagraphs"]: + line_metadata = node["metadata"] + yield Document( + page_content=self._json2txt(node), + metadata={**document_metadata, **line_metadata}, + ) + + elif split == "node": + yield from self._parse_subparagraphs( + document_tree=document_tree["content"]["structure"], + document_metadata=document_metadata, + ) + + else: + raise ValueError( + f"Got {split} for `split`, but should be one of " + f"`{self.valid_split_values}`" + ) + + if self.with_tables: + for table in document_tree["content"]["tables"]: + table_text, table_html = self._get_table(table) + yield Document( + page_content=table_text, + metadata={ + **table["metadata"], + "type": "table", + "text_as_html": table_html, + }, + ) + + for attachment in document_tree["attachments"]: + yield from self._split_document( + document_tree=attachment, + split=self.split, + additional_metadata={"type": "attachment"}, + ) + + def _get_table(self, table: dict) -> Tuple[str, str]: + """Get text and HTML representation of the table.""" + table_text = "" + for row in table["cells"]: + for cell in row: + table_text += " ".join(line["text"] for line in cell["lines"]) + table_text += "\t" + table_text += "\n" + + table_html = ( + '\n\n' + ) + for row in table["cells"]: + table_html += "\n" + for cell in row: + cell_text = "\n".join(line["text"] for line in cell["lines"]) + cell_text = html.escape(cell_text) + table_html += "{cell_text}\n' + ) + table_html += "\n" + table_html += "\n
" + + return table_text, table_html + + +class DedocFileLoader(DedocBaseLoader): + """ + DedocFileLoader document loader integration to load files using `dedoc`. + + The file loader automatically detects the file type (with the correct extension). + The list of supported file types is given at + https://dedoc.readthedocs.io/en/latest/index.html#id1. + Please see the documentation of DedocBaseLoader to get more details. + + Setup: + Install ``dedoc`` package. + + .. code-block:: bash + + pip install -U dedoc + + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import DedocFileLoader + + loader = DedocFileLoader( + file_path="example.pdf", + # split=..., + # with_tables=..., + # pdf_with_text_layer=..., + # pages=..., + # ... + ) + + Load: + .. code-block:: python + + docs = loader.load() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + Some text + { + 'file_name': 'example.pdf', + 'file_type': 'application/pdf', + # ... + } + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + Some text + { + 'file_name': 'example.pdf', + 'file_type': 'application/pdf', + # ... + } + """ + + def _make_config(self) -> dict: + from dedoc.utils.langchain import make_manager_config + + return make_manager_config( + file_path=self.file_path, + parsing_params=self.parsing_parameters, + split=self.split, + ) + + +class DedocAPIFileLoader(DedocBaseLoader): + """ + Load files using `dedoc` API. + The file loader automatically detects the file type (even with the wrong extension). + By default, the loader makes a call to the locally hosted `dedoc` API. 
+ More information about `dedoc` API can be found in `dedoc` documentation: + https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html + + Please see the documentation of DedocBaseLoader to get more details. + + Setup: + You don't need to install `dedoc` library for using this loader. + Instead, the `dedoc` API needs to be run. + You may use a Docker container for this purpose. + Please see `dedoc` documentation for more details: + https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker + + .. code-block:: bash + + docker pull dedocproject/dedoc + docker run -p 1231:1231 dedocproject/dedoc + + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import DedocAPIFileLoader + + loader = DedocAPIFileLoader( + file_path="example.pdf", + # url=..., + # split=..., + # with_tables=..., + # pdf_with_text_layer=..., + # pages=..., + # ... + ) + + Load: + .. code-block:: python + + docs = loader.load() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + Some text + { + 'file_name': 'example.pdf', + 'file_type': 'application/pdf', + # ... + } + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + Some text + { + 'file_name': 'example.pdf', + 'file_type': 'application/pdf', + # ... 
+ } + """ + + def __init__( + self, + file_path: str, + *, + url: str = "http://0.0.0.0:1231", + split: str = "document", + with_tables: bool = True, + with_attachments: Union[str, bool] = False, + recursion_deep_attachments: int = 10, + pdf_with_text_layer: str = "auto_tabby", + language: str = "rus+eng", + pages: str = ":", + is_one_column_document: str = "auto", + document_orientation: str = "auto", + need_header_footer_analysis: Union[str, bool] = False, + need_binarization: Union[str, bool] = False, + need_pdf_table_analysis: Union[str, bool] = True, + delimiter: Optional[str] = None, + encoding: Optional[str] = None, + ) -> None: + """Initialize with file path, API url and parsing parameters. + + Args: + file_path: path to the file for processing + url: URL to call `dedoc` API + split: type of document splitting into parts (each part is returned + separately), default value "document" + "document": document is returned as a single langchain Document object + (don't split) + "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP) + "node": split document into tree nodes (title nodes, list item nodes, + raw text nodes) + "line": split document into lines + with_tables: add tables to the result - each table is returned as a single + langchain Document object + + Parameters used for document parsing via `dedoc` + (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html): + + with_attachments: enable attached files extraction + recursion_deep_attachments: recursion level for attached files + extraction, works only when with_attachments==True + pdf_with_text_layer: type of handler for parsing PDF documents, + available options + ["true", "false", "tabby", "auto", "auto_tabby" (default)] + language: language of the document for PDF without a textual layer and + images, available options ["eng", "rus", "rus+eng" (default)], + the list of languages can be extended, please see + 
https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html + pages: page slice to define the reading range for parsing PDF documents + is_one_column_document: detect number of columns for PDF without + a textual layer and images, available options + ["true", "false", "auto" (default)] + document_orientation: fix document orientation (90, 180, 270 degrees) + for PDF without a textual layer and images, available options + ["auto" (default), "no_change"] + need_header_footer_analysis: remove headers and footers from the output + result for parsing PDF and images + need_binarization: clean pages background (binarize) for PDF without a + textual layer and images + need_pdf_table_analysis: parse tables for PDF without a textual layer + and images + delimiter: column separator for CSV, TSV files + encoding: encoding of TXT, CSV, TSV + """ + super().__init__( + file_path=file_path, + split=split, + with_tables=with_tables, + with_attachments=with_attachments, + recursion_deep_attachments=recursion_deep_attachments, + pdf_with_text_layer=pdf_with_text_layer, + language=language, + pages=pages, + is_one_column_document=is_one_column_document, + document_orientation=document_orientation, + need_header_footer_analysis=need_header_footer_analysis, + need_binarization=need_binarization, + need_pdf_table_analysis=need_pdf_table_analysis, + delimiter=delimiter, + encoding=encoding, + ) + self.url = url + self.parsing_parameters["return_format"] = "json" + + def lazy_load(self) -> Iterator[Document]: + """Lazily load documents.""" + doc_tree = self._send_file( + url=self.url, file_path=self.file_path, parameters=self.parsing_parameters + ) + yield from self._split_document(document_tree=doc_tree, split=self.split) + + def _make_config(self) -> dict: + return {} + + def _send_file( + self, url: str, file_path: str, parameters: dict + ) -> Dict[str, Union[list, dict, str]]: + """Send POST-request to `dedoc` API and return the results""" + import requests + + file_name = 
os.path.basename(file_path) + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post(f"{url}/upload", files=files, data=parameters) + + if r.status_code != 200: + raise ValueError(f"Error during file handling: {r.content.decode()}") + + result = json.loads(r.content.decode()) + return result diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 02c416a135d..ea4485e5891 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -26,6 +26,7 @@ from langchain_core.utils import get_from_dict_or_env from langchain_community.document_loaders.base import BaseLoader from langchain_community.document_loaders.blob_loaders import Blob +from langchain_community.document_loaders.dedoc import DedocBaseLoader from langchain_community.document_loaders.parsers.pdf import ( AmazonTextractPDFParser, DocumentIntelligenceParser, @@ -738,6 +739,104 @@ class AmazonTextractPDFLoader(BasePDFLoader): raise ValueError(f"unsupported mime type: {blob.mimetype}") # type: ignore[attr-defined] +class DedocPDFLoader(DedocBaseLoader): + """ + DedocPDFLoader document loader integration to load PDF files using `dedoc`. + The file loader can automatically detect the correctness of a textual layer in the + PDF document. + Note that `__init__` method supports parameters that differ from ones of + DedocBaseLoader. + + Setup: + Install ``dedoc`` package. + + .. code-block:: bash + + pip install -U dedoc + + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import DedocPDFLoader + + loader = DedocPDFLoader( + file_path="example.pdf", + # split=..., + # with_tables=..., + # pdf_with_text_layer=..., + # pages=..., + # ... + ) + + Load: + .. code-block:: python + + docs = loader.load() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. 
code-block:: python + + Some text + { + 'file_name': 'example.pdf', + 'file_type': 'application/pdf', + # ... + } + + Lazy load: + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + Some text + { + 'file_name': 'example.pdf', + 'file_type': 'application/pdf', + # ... + } + + Parameters used for document parsing via `dedoc` + (https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html): + + with_attachments: enable attached files extraction + recursion_deep_attachments: recursion level for attached files extraction, + works only when with_attachments==True + pdf_with_text_layer: type of handler for parsing, available options + ["true", "false", "tabby", "auto", "auto_tabby" (default)] + language: language of the document for PDF without a textual layer, + available options ["eng", "rus", "rus+eng" (default)], the list of + languages can be extended, please see + https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html + pages: page slice to define the reading range for parsing + is_one_column_document: detect number of columns for PDF without a textual + layer, available options ["true", "false", "auto" (default)] + document_orientation: fix document orientation (90, 180, 270 degrees) for PDF + without a textual layer, available options ["auto" (default), "no_change"] + need_header_footer_analysis: remove headers and footers from the output result + need_binarization: clean pages background (binarize) for PDF without a textual + layer + need_pdf_table_analysis: parse tables for PDF without a textual layer + """ + + def _make_config(self) -> dict: + from dedoc.utils.langchain import make_manager_pdf_config + + return make_manager_pdf_config( + file_path=self.file_path, + parsing_params=self.parsing_parameters, + split=self.split, + ) + + class DocumentIntelligenceLoader(BasePDFLoader): 
"""Load a PDF with Azure Document Intelligence""" diff --git a/libs/community/tests/integration_tests/document_loaders/test_dedoc.py b/libs/community/tests/integration_tests/document_loaders/test_dedoc.py new file mode 100644 index 00000000000..3499167a9f3 --- /dev/null +++ b/libs/community/tests/integration_tests/document_loaders/test_dedoc.py @@ -0,0 +1,146 @@ +import os +from pathlib import Path + +from langchain_community.document_loaders import ( + DedocAPIFileLoader, + DedocFileLoader, + DedocPDFLoader, +) + +EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/") + +FILE_NAMES = [ + "example.html", + "example.json", + "fake-email-attachment.eml", + "layout-parser-paper.pdf", + "slack_export.zip", + "stanley-cups.csv", + "stanley-cups.xlsx", + "whatsapp_chat.txt", +] + + +def test_dedoc_file_loader() -> None: + for file_name in FILE_NAMES: + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) + loader = DedocFileLoader( + file_path, + split="document", + with_tables=False, + pdf_with_text_layer="tabby", + pages=":1", + ) + docs = loader.load() + + assert len(docs) == 1 + + +def test_dedoc_pdf_loader() -> None: + file_name = "layout-parser-paper.pdf" + for mode in ("true", "tabby"): + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) + loader = DedocPDFLoader( + file_path, + split="document", + with_tables=False, + pdf_with_text_layer=mode, + pages=":1", + ) + docs = loader.load() + + assert len(docs) == 1 + + +def test_dedoc_content_html() -> None: + file_name = "example.html" + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) + loader = DedocFileLoader( + file_path, + split="line", + with_tables=False, + ) + docs = loader.load() + + assert docs[0].metadata["file_name"] == "example.html" + assert docs[0].metadata["file_type"] == "text/html" + assert "Instead of drinking water from the cat bowl" in docs[0].page_content + assert "Chase the red dot" not in docs[0].page_content + + +def test_dedoc_content_pdf() -> 
None: + file_name = "layout-parser-paper.pdf" + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) + loader = DedocFileLoader( + file_path, split="page", pdf_with_text_layer="tabby", pages=":5" + ) + docs = loader.load() + table_list = [item for item in docs if item.metadata.get("type", "") == "table"] + + assert len(docs) == 6 + assert docs[0].metadata["file_name"] == "layout-parser-paper.pdf" + assert docs[0].metadata["file_type"] == "application/pdf" + assert "This paper introduces LayoutParser, an open-source" in docs[0].page_content + assert "layout detection [38, 22], table detection [26]" in docs[1].page_content + assert "LayoutParser: A Unified Toolkit for DL-Based DIA" in docs[2].page_content + assert len(table_list) > 0 + assert ( + '\n
' + in table_list[0].metadata["text_as_html"] + ) + + +def test_dedoc_content_json() -> None: + file_name = "example.json" + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) + loader = DedocFileLoader(file_path, split="node") + docs = loader.load() + + assert len(docs) == 11 + assert docs[0].metadata["file_name"] == "example.json" + assert docs[0].metadata["file_type"] == "application/json" + assert "Bye!" in docs[0].page_content + + +def test_dedoc_content_txt() -> None: + file_name = "whatsapp_chat.txt" + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) + loader = DedocFileLoader(file_path, split="line") + docs = loader.load() + + assert len(docs) == 10 + assert docs[0].metadata["file_name"] == "whatsapp_chat.txt" + assert docs[0].metadata["file_type"] == "text/plain" + assert "[05.05.23, 15:48:11] James: Hi here" in docs[0].page_content + assert "[11/8/21, 9:41:32 AM] User name: Message 123" in docs[1].page_content + assert "1/23/23, 3:19 AM - User 2: Bye!" in docs[2].page_content + + +def test_dedoc_table_handling() -> None: + file_name = "stanley-cups.csv" + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) + loader = DedocFileLoader(file_path, split="document") + docs = loader.load() + + assert len(docs) == 2 + assert docs[0].metadata["file_name"] == "stanley-cups.csv" + assert docs[0].metadata["file_type"] == "text/csv" + assert docs[1].metadata["type"] == "table" + assert '1