From 028069ca427508cbc680482dbaa29a307b8a61e8 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Tue, 15 Apr 2025 11:50:52 +0200 Subject: [PATCH] Fix notebooks --- .../document_loaders/pdfplumber.ipynb | 400 +++++++++--------- .../document_loaders/parsers/pdf.py | 6 +- 2 files changed, 204 insertions(+), 202 deletions(-) diff --git a/docs/docs/integrations/document_loaders/pdfplumber.ipynb b/docs/docs/integrations/document_loaders/pdfplumber.ipynb index c2e505ac57d..82e404eb7e3 100644 --- a/docs/docs/integrations/document_loaders/pdfplumber.ipynb +++ b/docs/docs/integrations/document_loaders/pdfplumber.ipynb @@ -41,18 +41,18 @@ }, { "cell_type": "code", + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:32:31.030959Z", "start_time": "2025-04-15T09:32:31.027427Z" } }, + "outputs": [], "source": [ "# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n", "# os.environ[\"LANGSMITH_TRACING\"] = \"true\"" - ], - "outputs": [], - "execution_count": 1 + ] }, { "cell_type": "markdown", @@ -65,13 +65,13 @@ }, { "cell_type": "code", + "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:32:34.953716Z", "start_time": "2025-04-15T09:32:32.674410Z" } }, - "source": "%pip install -qU langchain_community pdfplumber", "outputs": [ { "name": "stdout", @@ -82,7 +82,9 @@ ] } ], - "execution_count": 2 + "source": [ + "%pip install -qU langchain_community pdfplumber" + ] }, { "cell_type": "markdown", @@ -95,20 +97,20 @@ }, { "cell_type": "code", + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:32:55.327932Z", "start_time": "2025-04-15T09:32:54.354899Z" } }, + "outputs": [], "source": [ "from langchain_community.document_loaders import PDFPlumberLoader\n", "\n", "file_path = \"./example_data/layout-parser-paper.pdf\"\n", "loader = PDFPlumberLoader(file_path, metadata_format=\"standard\")" - ], - "outputs": [], - "execution_count": 3 + ] }, { "cell_type": 
"markdown", @@ -119,16 +121,13 @@ }, { "cell_type": "code", + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:32:59.036774Z", "start_time": "2025-04-15T09:32:57.033035Z" } }, - "source": [ - "docs = loader.load()\n", - "docs[0]" - ], "outputs": [ { "name": "stderr", @@ -163,21 +162,20 @@ "output_type": "execute_result" } ], - "execution_count": 4 + "source": [ + "docs = loader.load()\n", + "docs[0]" + ] }, { "cell_type": "code", + "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:32:59.047149Z", "start_time": "2025-04-15T09:32:59.043526Z" } }, - "source": [ - "import pprint\n", - "\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -201,7 +199,11 @@ ] } ], - "execution_count": 5 + "source": [ + "import pprint\n", + "\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -212,23 +214,13 @@ }, { "cell_type": "code", + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:33:03.931290Z", "start_time": "2025-04-15T09:33:02.092848Z" } }, - "source": [ - "pages = []\n", - "for doc in loader.lazy_load():\n", - " pages.append(doc)\n", - " if len(pages) >= 10:\n", - " # do some paged operation, e.g.\n", - " # index.upsert(page)\n", - "\n", - " pages = []\n", - "len(pages)" - ], "outputs": [ { "name": "stderr", @@ -263,20 +255,27 @@ "output_type": "execute_result" } ], - "execution_count": 6 + "source": [ + "pages = []\n", + "for doc in loader.lazy_load():\n", + " pages.append(doc)\n", + " if len(pages) >= 10:\n", + " # do some paged operation, e.g.\n", + " # index.upsert(page)\n", + "\n", + " pages = []\n", + "len(pages)" + ] }, { "cell_type": "code", + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:33:05.116002Z", "start_time": "2025-04-15T09:33:05.102235Z" } }, - "source": [ - "print(pages[0].page_content[:100])\n", - "pprint.pp(pages[0].metadata)" - ], "outputs": [ { "name": "stdout", @@ -302,7 +301,10 @@ ] 
} ], - "execution_count": 7 + "source": [ + "print(pages[0].page_content[:100])\n", + "pprint.pp(pages[0].metadata)" + ] }, { "cell_type": "markdown", @@ -347,22 +349,13 @@ }, { "cell_type": "code", + "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:33:13.625065Z", "start_time": "2025-04-15T09:33:11.686326Z" } }, - "source": [ - "loader = PDFPlumberLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " metadata_format=\"standard\",\n", - " mode=\"page\",\n", - ")\n", - "docs = loader.load()\n", - "print(len(docs))\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stderr", @@ -409,7 +402,16 @@ ] } ], - "execution_count": 8 + "source": [ + "loader = PDFPlumberLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", + " mode=\"page\",\n", + ")\n", + "docs = loader.load()\n", + "print(len(docs))\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -427,22 +429,13 @@ }, { "cell_type": "code", + "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:33:30.520801Z", "start_time": "2025-04-15T09:33:28.785067Z" } }, - "source": [ - "loader = PDFPlumberLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " metadata_format=\"standard\",\n", - " mode=\"single\",\n", - ")\n", - "docs = loader.load()\n", - "print(len(docs))\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stderr", @@ -488,7 +481,16 @@ ] } ], - "execution_count": 9 + "source": [ + "loader = PDFPlumberLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", + " mode=\"single\",\n", + ")\n", + "docs = loader.load()\n", + "print(len(docs))\n", + "pprint.pp(docs[0].metadata)" + ] }, { "cell_type": "markdown", @@ -504,22 +506,13 @@ }, { "cell_type": "code", + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:33:34.209872Z", "start_time": "2025-04-15T09:33:32.242569Z" } 
}, - "source": [ - "loader = PDFPlumberLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " metadata_format=\"standard\",\n", - " mode=\"single\",\n", - " pages_delimiter=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n", - ")\n", - "docs = loader.load()\n", - "print(docs[0].page_content[:5780])" - ], "outputs": [ { "name": "stderr", @@ -643,7 +636,16 @@ ] } ], - "execution_count": 10 + "source": [ + "loader = PDFPlumberLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", + " mode=\"single\",\n", + " pages_delimiter=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n", + ")\n", + "docs = loader.load()\n", + "print(docs[0].page_content[:5780])" + ] }, { "cell_type": "markdown", @@ -682,15 +684,13 @@ }, { "cell_type": "code", + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:33:56.217580Z", "start_time": "2025-04-15T09:33:42.788726Z" } }, - "source": [ - "%pip install -qU rapidocr-onnxruntime" - ], "outputs": [ { "name": "stdout", @@ -701,30 +701,19 @@ ] } ], - "execution_count": 11 + "source": [ + "%pip install -qU rapidocr-onnxruntime" + ] }, { "cell_type": "code", + "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:34:24.118706Z", "start_time": "2025-04-15T09:33:56.230529Z" } }, - "source": [ - "from langchain_community.document_loaders.parsers import RapidOCRBlobParser\n", - "\n", - "loader = PDFPlumberLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " metadata_format=\"standard\",\n", - " mode=\"page\",\n", - " images_inner_format=\"markdown-img\",\n", - " images_parser=RapidOCRBlobParser(),\n", - ")\n", - "docs = loader.load()\n", - "\n", - "print(docs[5].page_content)" - ], "outputs": [ { "name": "stderr", @@ -819,7 +808,20 @@ ] } ], - "execution_count": 12 + "source": [ + "from langchain_community.document_loaders.parsers import RapidOCRBlobParser\n", + "\n", + "loader = PDFPlumberLoader(\n", + " 
\"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", + " mode=\"page\",\n", + " images_inner_format=\"markdown-img\",\n", + " images_parser=RapidOCRBlobParser(),\n", + ")\n", + "docs = loader.load()\n", + "\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -837,15 +839,13 @@ }, { "cell_type": "code", + "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:35:29.381269Z", "start_time": "2025-04-15T09:35:26.711980Z" } }, - "source": [ - "%pip install -qU pytesseract" - ], "outputs": [ { "name": "stdout", @@ -856,29 +856,19 @@ ] } ], - "execution_count": 13 + "source": [ + "%pip install -qU pytesseract" + ] }, { "cell_type": "code", + "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:35:45.154957Z", "start_time": "2025-04-15T09:35:33.162485Z" } }, - "source": [ - "from langchain_community.document_loaders.parsers import TesseractBlobParser\n", - "\n", - "loader = PDFPlumberLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " metadata_format=\"standard\",\n", - " mode=\"page\",\n", - " images_inner_format=\"html-img\",\n", - " images_parser=TesseractBlobParser(),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[5].page_content)" - ], "outputs": [ { "name": "stderr", @@ -973,7 +963,19 @@ ] } ], - "execution_count": 14 + "source": [ + "from langchain_community.document_loaders.parsers import TesseractBlobParser\n", + "\n", + "loader = PDFPlumberLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", + " mode=\"page\",\n", + " images_inner_format=\"html-img\",\n", + " images_parser=TesseractBlobParser(),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -984,15 +986,13 @@ }, { "cell_type": "code", + "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:36:02.237828Z", "start_time": 
"2025-04-15T09:35:57.078164Z" } }, - "source": [ - "%pip install -qU langchain_openai" - ], "outputs": [ { "name": "stdout", @@ -1003,23 +1003,19 @@ ] } ], - "execution_count": 15 + "source": [ + "%pip install -qU langchain_openai" + ] }, { "cell_type": "code", + "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:36:03.749164Z", "start_time": "2025-04-15T09:36:03.558685Z" } }, - "source": [ - "import os\n", - "\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()" - ], "outputs": [ { "data": { @@ -1032,47 +1028,40 @@ "output_type": "execute_result" } ], - "execution_count": 16 + "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] }, { "cell_type": "code", + "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:36:04.863788Z", "start_time": "2025-04-15T09:36:04.852010Z" } }, + "outputs": [], "source": [ "from getpass import getpass\n", "\n", "if not os.environ.get(\"OPENAI_API_KEY\"):\n", " os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key =\")" - ], - "outputs": [], - "execution_count": 17 + ] }, { "cell_type": "code", + "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:37:03.954718Z", "start_time": "2025-04-15T09:36:09.023976Z" } }, - "source": [ - "from langchain_community.document_loaders.parsers import LLMImageBlobParser\n", - "from langchain_openai import ChatOpenAI\n", - "\n", - "loader = PDFPlumberLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " metadata_format=\"standard\",\n", - " mode=\"page\",\n", - " images_inner_format=\"markdown-img\",\n", - " images_parser=LLMImageBlobParser(model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024)),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[5].page_content)" - ], "outputs": [ { "name": "stderr", @@ -1164,7 +1153,20 @@ ] } ], - "execution_count": 18 + "source": [ + "from langchain_community.document_loaders.parsers import 
LLMImageBlobParser\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "loader = PDFPlumberLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", + " mode=\"page\",\n", + " images_inner_format=\"markdown-img\",\n", + " images_parser=LLMImageBlobParser(model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024)),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[5].page_content)" + ] }, { "cell_type": "markdown", @@ -1179,29 +1181,14 @@ ] }, { + "cell_type": "code", + "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2025-04-15T09:39:00.809544Z", "start_time": "2025-04-15T09:38:58.847574Z" } }, - "cell_type": "code", - "source": [ - "from langchain_community.document_loaders import FileSystemBlobLoader\n", - "from langchain_community.document_loaders.generic import GenericLoader\n", - "from langchain_community.document_loaders.parsers import PDFPlumberParser\n", - "\n", - "loader = GenericLoader(\n", - " blob_loader=FileSystemBlobLoader(\n", - " path=\"./example_data/\",\n", - " glob=\"*.pdf\",\n", - " ),\n", - " blob_parser=PDFPlumberParser(),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[0].page_content)\n", - "pprint.pp(docs[0].metadata)" - ], "outputs": [ { "name": "stderr", @@ -1294,49 +1281,6 @@ ] } ], - "execution_count": 20 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": "It is possible to work with files from cloud storage." 
- }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "from langchain_community.document_loaders import CloudBlobLoader\n", - "from langchain_community.document_loaders.generic import GenericLoader\n", - "from langchain_community.document_loaders.parsers import PDFPlumberParser\n", - "\n", - "loader = GenericLoader(\n", - " blob_loader=CloudBlobLoader(\n", - " url=\"s3://mybucket\", # Supports s3://, az://, gs://, file:// schemes.\n", - " glob=\"*.pdf\",\n", - " ),\n", - " blob_parser=PDFPlumberParser(\n", - " metadata_format=\"standard\",\n", - " ),\n", - ")\n", - "docs = loader.load()\n", - "print(docs[0].page_content)\n", - "pprint.pp(docs[0].metadata)" - ], - "outputs": [], - "execution_count": null - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## API reference\n", - "\n", - "For detailed documentation of all `PDFMinerLoader` features and configurations head to the API reference: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html" - ] - }, - { - "cell_type": "code", - "metadata": {}, "source": [ "from langchain_community.document_loaders import FileSystemBlobLoader\n", "from langchain_community.document_loaders.generic import GenericLoader\n", @@ -1354,9 +1298,69 @@ "docs = loader.load()\n", "print(docs[0].page_content)\n", "pprint.pp(docs[0].metadata)" - ], + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "It is possible to work with files from cloud storage." 
+ }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], - "execution_count": null + "source": [ + "from langchain_community.document_loaders import CloudBlobLoader\n", + "from langchain_community.document_loaders.generic import GenericLoader\n", + "from langchain_community.document_loaders.parsers import PDFPlumberParser\n", + "\n", + "loader = GenericLoader(\n", + " blob_loader=CloudBlobLoader(\n", + " url=\"s3://mybucket\", # Supports s3://, az://, gs://, file:// schemes.\n", + " glob=\"*.pdf\",\n", + " ),\n", + " blob_parser=PDFPlumberParser(\n", + " metadata_format=\"standard\",\n", + " ),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[0].page_content)\n", + "pprint.pp(docs[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all `PDFPlumberLoader` features and configurations head to the API reference: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import FileSystemBlobLoader\n", + "from langchain_community.document_loaders.generic import GenericLoader\n", + "from langchain_community.document_loaders.parsers import PDFPlumberParser\n", + "\n", + "loader = GenericLoader(\n", + " blob_loader=FileSystemBlobLoader(\n", + " path=\"./example_data/\",\n", + " glob=\"*.pdf\",\n", + " ),\n", + " blob_parser=PDFPlumberParser(\n", + " metadata_format=\"standard\",\n", + " ),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[0].page_content)\n", + "pprint.pp(docs[0].metadata)" + ] } ], "metadata": { diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 775c29ada70..6af6014035f 100644 ---
a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -1607,7 +1607,7 @@ class PDFPlumberParser(BaseBlobParser): mime_type="application/x-npy", ) text_from_image = next( - self.images_parser.lazy_parse(blob) # type: ignore + self.images_parser.lazy_parse(blob) ).page_content extras.append( _format_inner_image( @@ -1764,9 +1764,7 @@ class PDFPlumberParser(BaseBlobParser): ) elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: buf = np.frombuffer(img["stream"].get_data(), dtype=np.uint8) - images.append( - np.array(Image.open(io.BytesIO(buf.tobytes()))) # type: ignore - ) + images.append(np.array(Image.open(io.BytesIO(buf.tobytes())))) else: logger.warning("Unknown PDF Filter!")