From 0c7f1d8b219e87e3ffd14a15a452622c532c7e95 Mon Sep 17 00:00:00 2001 From: Martin Schade <45048633+schadem@users.noreply.github.com> Date: Tue, 31 Oct 2023 02:02:10 +0100 Subject: [PATCH] Textract linearizer (#12446) **Description:** Textract PDF Loader generating linearized output, meaning it will replicate the structure of the source document as closely as possible based on the features passed into the call (e.g. LAYOUT, FORMS, TABLES). With LAYOUT, reading order for multi-column documents and identification of lists and figures are supported, and with TABLES it will generate the table structure as well. FORMS will indicate "key: value" with colons. - **Issue:** fixes #12068 - **Dependencies:** amazon-textract-textractor is added, which provides the linearization - **Tag maintainer:** @3coins --------- Co-authored-by: Bagatur --- .../pdf-amazonTextractPDFLoader.ipynb | 148 +++++++++++++++++- .../langchain/document_loaders/parsers/pdf.py | 74 +++++++-- libs/langchain/poetry.lock | 38 +---- libs/langchain/pyproject.toml | 2 - .../document_loaders/test_pdf.py | 42 ++++- 5 files changed, 241 insertions(+), 63 deletions(-) diff --git a/docs/docs/integrations/document_loaders/pdf-amazonTextractPDFLoader.ipynb b/docs/docs/integrations/document_loaders/pdf-amazonTextractPDFLoader.ipynb index 3cbaa76c76f..5e05701d0d8 100644 --- a/docs/docs/integrations/document_loaders/pdf-amazonTextractPDFLoader.ipynb +++ b/docs/docs/integrations/document_loaders/pdf-amazonTextractPDFLoader.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "c049beaf-f904-4ce6-91ca-805da62084c2", "metadata": { "tags": [] @@ -28,14 +28,135 @@ "name": "stdout", "output_type": "stream", "text": [ + "\u001b[33mDEPRECATION: amazon-textract-pipeline-pagedimensions 0.0.8 has a non-standard dependency specifier Pillow>=9.4.*. pip 23.3 will enforce this behaviour change. 
A possible replacement is to upgrade to a newer version of amazon-textract-pipeline-pagedimensions or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mDEPRECATION: amazon-textract-pipeline-pagedimensions 0.0.8 has a non-standard dependency specifier pypdf>=2.5.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of amazon-textract-pipeline-pagedimensions or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", + "Obtaining file:///Users/schadem/code/github/schadem/langchain/libs/langchain\n", + " Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", + "\u001b[?25h Preparing editable metadata (pyproject.toml) ... 
\u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (6.0.1)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (2.0.22)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (3.8.6)\n", + "Requirement already satisfied: amazon-textract-textractor<2 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (1.4.1)\n", + "Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain==0.0.267)\n", + " Obtaining dependency information for dataclasses-json<0.6.0,>=0.5.7 from https://files.pythonhosted.org/packages/97/5f/e7cc90f36152810cab08b6c9c1125e8bcb9d76f8b3018d101b5f877b386c/dataclasses_json-0.5.14-py3-none-any.whl.metadata\n", + " Downloading dataclasses_json-0.5.14-py3-none-any.whl.metadata (22 kB)\n", + "Requirement already satisfied: langsmith<0.1.0,>=0.0.21 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (0.0.44)\n", + "Requirement already satisfied: numexpr<3.0.0,>=2.8.4 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (2.8.7)\n", + "Requirement already satisfied: numpy<2,>=1 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (1.24.4)\n", + "Requirement already satisfied: pydantic<3,>=1 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (1.10.13)\n", + "Requirement already satisfied: requests<3,>=2 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) 
(2.31.0)\n", + "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from langchain==0.0.267) (8.2.3)\n", + "Requirement already satisfied: attrs>=17.3.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain==0.0.267) (23.1.0)\n", + "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain==0.0.267) (3.3.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain==0.0.267) (6.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain==0.0.267) (4.0.3)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain==0.0.267) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain==0.0.267) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain==0.0.267) (1.3.1)\n", + "Requirement already satisfied: Pillow in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-textractor<2->langchain==0.0.267) (10.1.0)\n", + "Requirement already satisfied: XlsxWriter<3.1,>=3.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-textractor<2->langchain==0.0.267) (3.0.9)\n", + 
"Collecting amazon-textract-caller<0.1.0,>=0.0.27 (from amazon-textract-textractor<2->langchain==0.0.267)\n", + " Using cached amazon_textract_caller-0.0.29-py2.py3-none-any.whl (13 kB)\n", + "Requirement already satisfied: amazon-textract-pipeline-pagedimensions in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-textractor<2->langchain==0.0.267) (0.0.8)\n", + "Requirement already satisfied: amazon-textract-response-parser<0.2.0,>=0.1.45 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-textractor<2->langchain==0.0.267) (0.1.48)\n", + "Requirement already satisfied: editdistance==0.6.2 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-textractor<2->langchain==0.0.267) (0.6.2)\n", + "Requirement already satisfied: tabulate<0.10,>=0.9 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-textractor<2->langchain==0.0.267) (0.9.0)\n", + "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.267) (3.20.1)\n", + "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.267) (0.9.0)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from pydantic<3,>=1->langchain==0.0.267) (4.8.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from requests<3,>=2->langchain==0.0.267) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in 
/Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from requests<3,>=2->langchain==0.0.267) (1.26.18)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from requests<3,>=2->langchain==0.0.267) (2023.7.22)\n", + "Requirement already satisfied: boto3>=1.26.35 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-caller<0.1.0,>=0.0.27->amazon-textract-textractor<2->langchain==0.0.267) (1.28.67)\n", + "Requirement already satisfied: botocore in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-caller<0.1.0,>=0.0.27->amazon-textract-textractor<2->langchain==0.0.267) (1.31.67)\n", + "Requirement already satisfied: packaging>=17.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.267) (23.2)\n", + "Requirement already satisfied: mypy-extensions>=0.3.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.267) (1.0.0)\n", + "Requirement already satisfied: pypdf>=2.5.* in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-pipeline-pagedimensions->amazon-textract-textractor<2->langchain==0.0.267) (3.16.4)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from boto3>=1.26.35->amazon-textract-caller<0.1.0,>=0.0.27->amazon-textract-textractor<2->langchain==0.0.267) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.8.0,>=0.7.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from 
boto3>=1.26.35->amazon-textract-caller<0.1.0,>=0.0.27->amazon-textract-textractor<2->langchain==0.0.267) (0.7.0)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from botocore->amazon-textract-caller<0.1.0,>=0.0.27->amazon-textract-textractor<2->langchain==0.0.267) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from python-dateutil<3.0.0,>=2.1->botocore->amazon-textract-caller<0.1.0,>=0.0.27->amazon-textract-textractor<2->langchain==0.0.267) (1.16.0)\n", + "Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)\n", + "Building wheels for collected packages: langchain\n", + " Building editable for langchain (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for langchain: filename=langchain-0.0.267-py3-none-any.whl size=5553 sha256=daaf68d6658b27d69a4a092aa0a39e31f32b96868ef195102d2a17cf119f9d86\n", + " Stored in directory: /private/var/folders/s4/y_t_mj094c95t80n023c9wym0000gr/T/pip-ephem-wheel-cache-v1ynlirx/wheels/9f/73/28/b1d250633de6bd5759f959e16889c6c841dd0e0ffb6474185a\n", + "Successfully built langchain\n", + "\u001b[33mDEPRECATION: amazon-textract-pipeline-pagedimensions 0.0.8 has a non-standard dependency specifier Pillow>=9.4.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of amazon-textract-pipeline-pagedimensions or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mDEPRECATION: amazon-textract-pipeline-pagedimensions 0.0.8 has a non-standard dependency specifier pypdf>=2.5.*. pip 23.3 will enforce this behaviour change. 
A possible replacement is to upgrade to a newer version of amazon-textract-pipeline-pagedimensions or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", + "\u001b[0mInstalling collected packages: dataclasses-json, amazon-textract-caller, langchain\n", + " Attempting uninstall: dataclasses-json\n", + " Found existing installation: dataclasses-json 0.6.1\n", + " Uninstalling dataclasses-json-0.6.1:\n", + " Successfully uninstalled dataclasses-json-0.6.1\n", + " Attempting uninstall: amazon-textract-caller\n", + " Found existing installation: amazon-textract-caller 0.2.0\n", + " Uninstalling amazon-textract-caller-0.2.0:\n", + " Successfully uninstalled amazon-textract-caller-0.2.0\n", + " Attempting uninstall: langchain\n", + " Found existing installation: langchain 0.0.319\n", + " Uninstalling langchain-0.0.319:\n", + " Successfully uninstalled langchain-0.0.319\n", + "Successfully installed amazon-textract-caller-0.0.29 dataclasses-json-0.5.14 langchain-0.0.267\n", "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ - "!pip install langchain boto3 openai tiktoken python-dotenv -q" + "# !pip install langchain boto3 openai tiktoken python-dotenv -q\n", + "!pip install boto3 openai tiktoken python-dotenv -q\n", + "!pip install -e 
/Users/schadem/code/github/schadem/langchain/libs/langchain" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e4305a0d-37da-41f9-a52c-7d166d7dbabf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting amazon-textract-caller>=0.2.0\n", + " Obtaining dependency information for amazon-textract-caller>=0.2.0 from https://files.pythonhosted.org/packages/35/42/17daacf400060ee1f768553980b7bd6bb77d5b80bcb8a82d8a9665e5bb9b/amazon_textract_caller-0.2.0-py2.py3-none-any.whl.metadata\n", + " Using cached amazon_textract_caller-0.2.0-py2.py3-none-any.whl.metadata (7.1 kB)\n", + "Requirement already satisfied: boto3>=1.26.35 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-caller>=0.2.0) (1.28.67)\n", + "Requirement already satisfied: botocore in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-caller>=0.2.0) (1.31.67)\n", + "Requirement already satisfied: amazon-textract-response-parser>=0.1.39 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-caller>=0.2.0) (0.1.48)\n", + "Requirement already satisfied: marshmallow<4,>=3.14 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from amazon-textract-response-parser>=0.1.39->amazon-textract-caller>=0.2.0) (3.20.1)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from boto3>=1.26.35->amazon-textract-caller>=0.2.0) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.8.0,>=0.7.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from boto3>=1.26.35->amazon-textract-caller>=0.2.0) (0.7.0)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in 
/Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from botocore->amazon-textract-caller>=0.2.0) (2.8.2)\n", + "Requirement already satisfied: urllib3<2.1,>=1.25.4 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from botocore->amazon-textract-caller>=0.2.0) (1.26.18)\n", + "Requirement already satisfied: packaging>=17.0 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from marshmallow<4,>=3.14->amazon-textract-response-parser>=0.1.39->amazon-textract-caller>=0.2.0) (23.2)\n", + "Requirement already satisfied: six>=1.5 in /Users/schadem/.pyenv/versions/3.11.1/envs/langchain/lib/python3.11/site-packages (from python-dateutil<3.0.0,>=2.1->botocore->amazon-textract-caller>=0.2.0) (1.16.0)\n", + "Using cached amazon_textract_caller-0.2.0-py2.py3-none-any.whl (13 kB)\n", + "\u001b[33mDEPRECATION: amazon-textract-pipeline-pagedimensions 0.0.8 has a non-standard dependency specifier Pillow>=9.4.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of amazon-textract-pipeline-pagedimensions or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mDEPRECATION: amazon-textract-pipeline-pagedimensions 0.0.8 has a non-standard dependency specifier pypdf>=2.5.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of amazon-textract-pipeline-pagedimensions or contact the author to suggest that they release a version with a conforming dependency specifiers. 
Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", + "\u001b[0mInstalling collected packages: amazon-textract-caller\n", + " Attempting uninstall: amazon-textract-caller\n", + " Found existing installation: amazon-textract-caller 0.0.29\n", + " Uninstalling amazon-textract-caller-0.0.29:\n", + " Successfully uninstalled amazon-textract-caller-0.0.29\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "amazon-textract-textractor 1.4.1 requires amazon-textract-caller<0.1.0,>=0.0.27, but you have amazon-textract-caller 0.2.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed amazon-textract-caller-0.2.0\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install \"amazon-textract-caller>=0.2.0\"" ] }, { @@ -53,12 +174,27 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "id": "1becee92-e82f-42d4-9b4e-b23d77cbe88d", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'DocumentIntelligenceParser' from 'langchain.document_loaders.parsers.pdf' (/Users/schadem/code/github/schadem/langchain/libs/langchain/langchain/document_loaders/parsers/pdf.py)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], 
line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AmazonTextractPDFLoader\n\u001b[1;32m 2\u001b[0m loader \u001b[38;5;241m=\u001b[39m AmazonTextractPDFLoader(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexample_data/alejandro_rosalez_sample-small.jpeg\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 3\u001b[0m documents \u001b[38;5;241m=\u001b[39m loader\u001b[38;5;241m.\u001b[39mload()\n", + "File \u001b[0;32m~/code/github/schadem/langchain/libs/langchain/langchain/document_loaders/__init__.py:46\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbigquery\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BigQueryLoader\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbilibili\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BiliBiliLoader\n\u001b[0;32m---> 46\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mblackboard\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BlackboardLoader\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mblob_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 48\u001b[0m Blob,\n\u001b[1;32m 49\u001b[0m 
BlobLoader,\n\u001b[1;32m 50\u001b[0m FileSystemBlobLoader,\n\u001b[1;32m 51\u001b[0m YoutubeAudioLoader,\n\u001b[1;32m 52\u001b[0m )\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mblockchain\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BlockchainDocumentLoader\n", + "File \u001b[0;32m~/code/github/schadem/langchain/libs/langchain/langchain/document_loaders/blackboard.py:9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocstore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Document\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdirectory\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DirectoryLoader\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpdf\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PyPDFLoader\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mweb_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WebBaseLoader\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mBlackboardLoader\u001b[39;00m(WebBaseLoader):\n", + "File 
\u001b[0;32m~/code/github/schadem/langchain/libs/langchain/langchain/document_loaders/pdf.py:17\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseLoader\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mblob_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Blob\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mparsers\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpdf\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 18\u001b[0m AmazonTextractPDFParser,\n\u001b[1;32m 19\u001b[0m DocumentIntelligenceParser,\n\u001b[1;32m 20\u001b[0m PDFMinerParser,\n\u001b[1;32m 21\u001b[0m PDFPlumberParser,\n\u001b[1;32m 22\u001b[0m PyMuPDFParser,\n\u001b[1;32m 23\u001b[0m PyPDFium2Parser,\n\u001b[1;32m 24\u001b[0m PyPDFParser,\n\u001b[1;32m 25\u001b[0m )\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01munstructured\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UnstructuredFileLoader\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_from_dict_or_env\n", + "\u001b[0;31mImportError\u001b[0m: 
cannot import name 'DocumentIntelligenceParser' from 'langchain.document_loaders.parsers.pdf' (/Users/schadem/code/github/schadem/langchain/libs/langchain/langchain/document_loaders/parsers/pdf.py)" + ] + } + ], "source": [ "from langchain.document_loaders import AmazonTextractPDFLoader\n", "\n", @@ -876,7 +1012,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.11.6" } }, "nbformat": 4, diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 74373ba9b6d..fb0cfd64a54 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -106,7 +106,7 @@ class PyPDFParser(BaseBlobParser): if not self.extract_images or "/XObject" not in page["/Resources"].keys(): return "" - xObject = page["/Resources"]["/XObject"].get_object() + xObject = page["/Resources"]["/XObject"].get_object() # type: ignore images = [] for obj in xObject: if xObject[obj]["/Subtype"] == "/Image": @@ -387,6 +387,46 @@ class AmazonTextractPDFParser(BaseBlobParser): """Send `PDF` files to `Amazon Textract` and parse them. For parsing multi-page PDFs, they have to reside on S3. + + The AmazonTextractPDFLoader calls the + [Amazon Textract Service](https://aws.amazon.com/textract/) + to convert PDFs into a Document structure. + Single and multi-page documents are supported with up to 3000 pages + and 512 MB of size. + + For the call to be successful an AWS account is required, + similar to the + [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) + requirements. + + Besides the AWS configuration, it is very similar to the other PDF + loaders, while also supporting JPEG, PNG and TIFF and non-native + PDF formats. 
+ + ```python + from langchain.document_loaders import AmazonTextractPDFLoader + loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg") + documents = loader.load() + ``` + + One feature is the linearization of the output. + When using the features LAYOUT, FORMS or TABLES together with Textract + + ```python + from langchain.document_loaders import AmazonTextractPDFLoader + # you can mix and match each of the features + loader=AmazonTextractPDFLoader( + "example_data/alejandro_rosalez_sample-small.jpeg", + textract_features=["TABLES", "LAYOUT"]) + documents = loader.load() + ``` + + it will generate output that formats the text in reading order and + try to output the information in a tabular structure or + output the key/value pairs with a colon (key: value). + This helps most LLMs to achieve better accuracy when + processing these texts. + """ def __init__( @@ -405,8 +445,11 @@ class AmazonTextractPDFParser(BaseBlobParser): try: import textractcaller as tc + import textractor.entities.document as textractor self.tc = tc + self.textractor = textractor + if textract_features is not None: self.textract_features = [ tc.Textract_Features(f) for f in textract_features @@ -435,7 +478,8 @@ class AmazonTextractPDFParser(BaseBlobParser): def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Iterates over the Blob pages and returns an Iterator with a Document for each page, like the other parsers If multi-page document, blob.path - has to be set to the S3 URI and for single page docs the blob.data is taken + has to be set to the S3 URI and for single page docs + the blob.data is taken """ url_parse_result = urlparse(str(blob.path)) if blob.path else None @@ -458,23 +502,19 @@ class AmazonTextractPDFParser(BaseBlobParser): boto3_textract_client=self.boto3_textract_client, ) - current_text = "" - current_page = 1 - for block in textract_response_json["Blocks"]: - if "Page" in block and not (int(block["Page"]) == current_page): - yield Document( 
- page_content=current_text, - metadata={"source": blob.source, "page": current_page}, - ) - current_text = "" - current_page = int(block["Page"]) - if "Text" in block: - current_text += block["Text"] + " " + document = self.textractor.Document.open(textract_response_json) - yield Document( - page_content=current_text, - metadata={"source": blob.source, "page": current_page}, + linearizer_config = self.textractor.TextLinearizationConfig( + hide_figure_layout=True, + title_prefix="# ", + section_header_prefix="## ", + list_element_prefix="*", ) + for idx, page in enumerate(document.pages): + yield Document( + page_content=page.get_text(config=linearizer_config), + metadata={"source": blob.source, "page": idx + 1}, + ) class DocumentIntelligenceParser(BaseBlobParser): diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock index 5deb1a73c10..57a42cdf58a 100644 --- a/libs/langchain/poetry.lock +++ b/libs/langchain/poetry.lock @@ -300,40 +300,6 @@ files = [ {file = "amadeus-9.0.0.tar.gz", hash = "sha256:d19805e19d699d2633911c5b52400f82c6719676cc1488f8ccf344dbc4eb3202"}, ] -[[package]] -name = "amazon-textract-caller" -version = "0.2.1" -description = "Amazon Textract Caller tools" -optional = true -python-versions = ">=3.6" -files = [ - {file = "amazon-textract-caller-0.2.1.tar.gz", hash = "sha256:7a531ba4841fb64718b9430c05796958b426f41a4d674d4996f9e56cd3849f4e"}, - {file = "amazon_textract_caller-0.2.1-py2.py3-none-any.whl", hash = "sha256:ccdeb364e02ce7c2034b69c09209954e995a0ee19f5d3dea79f25171a9565c37"}, -] - -[package.dependencies] -amazon-textract-response-parser = ">=0.1.39" -boto3 = ">=1.26.35" -botocore = "*" - -[package.extras] -testing = ["amazon-textract-response-parser", "pytest"] - -[[package]] -name = "amazon-textract-response-parser" -version = "1.0.1" -description = "Easily parse JSON returned by Amazon Textract." 
-optional = true -python-versions = ">=3.8" -files = [ - {file = "amazon-textract-response-parser-1.0.1.tar.gz", hash = "sha256:d9ddedb75d12c9f5dc7cf65811c96c3934c0dfa8ef76543882cc1077618a301f"}, - {file = "amazon_textract_response_parser-1.0.1-py2.py3-none-any.whl", hash = "sha256:890eba2c6bc33f4088c08c4df93088cd540896eca3243b7612635ea456f759c7"}, -] - -[package.dependencies] -boto3 = "*" -marshmallow = ">=3.14,<4" - [[package]] name = "anthropic" version = "0.3.11" @@ -11076,7 +11042,7 @@ cli = ["typer"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["aiosqlite", "aleph-alpha-client", "amazon-textract-caller", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "google-cloud-documentai", "gql", "html2text", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"] +extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "google-cloud-documentai", "gql", "html2text", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "openai", "openai", "openapi-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", 
"rapidocr-onnxruntime", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict"] javascript = ["esprima"] llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -11086,4 +11052,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "9345cd37346e9f369702f51b7e10dde8da91d5f7b659c8c204e5b46c360cd028" +content-hash = "9ffdcad5f675571917ffb0f222acdb578f406939695977da3c19e55192cac513" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index cb002198687..a42a25525aa 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -123,7 +123,6 @@ gitpython = {version = "^3.1.32", optional = true} librosa = {version="^0.10.0.post2", optional = true } feedparser = {version = "^6.0.10", optional = true} newspaper3k = {version = "^0.2.8", optional = true} -amazon-textract-caller = {version = "<2", optional = true} xata = {version = "^1.0.0a7", optional = true} xmltodict = {version = "^0.13.0", optional = true} markdownify = {version = "^0.11.6", optional = true} @@ -318,7 +317,6 @@ cli = [ # merge-conflicts extended_testing = [ "aleph-alpha-client", - "amazon-textract-caller", "aiosqlite", "assemblyai", "beautifulsoup4", diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py index aea75c11fda..1fac61f3503 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_pdf.py @@ -150,14 +150,51 @@ def test_mathpix_loader() -> None: "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com" "/langchain/alejandro_rosalez_sample_1.jpg" ), - ["FORMS", "TABLES"], + ["FORMS", "TABLES", 
"LAYOUT"], + 1, + False, + ), + ( + ( + "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com" + "/langchain/alejandro_rosalez_sample_1.jpg" + ), + [], + 1, + False, + ), + ( + ( + "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com" + "/langchain/alejandro_rosalez_sample_1.jpg" + ), + ["TABLES"], + 1, + False, + ), + ( + ( + "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com" + "/langchain/alejandro_rosalez_sample_1.jpg" + ), + ["FORMS"], + 1, + False, + ), + ( + ( + "https://amazon-textract-public-content.s3.us-east-2.amazonaws.com" + "/langchain/alejandro_rosalez_sample_1.jpg" + ), + ["LAYOUT"], 1, False, ), (str(Path(__file__).parent.parent / "examples/hello.pdf"), ["FORMS"], 1, False), + (str(Path(__file__).parent.parent / "examples/hello.pdf"), [], 1, False), ( "s3://amazon-textract-public-content/langchain/layout-parser-paper.pdf", - None, + ["FORMS", "TABLES", "LAYOUT"], 16, True, ), @@ -180,6 +217,7 @@ def test_amazontextract_loader( else: loader = AmazonTextractPDFLoader(file_path, textract_features=features) docs = loader.load() + print(docs) assert len(docs) == docs_length