diff --git a/docs/docs/integrations/document_loaders/pdfplumber.ipynb b/docs/docs/integrations/document_loaders/pdfplumber.ipynb index b850d1e1300..c2e505ac57d 100644 --- a/docs/docs/integrations/document_loaders/pdfplumber.ipynb +++ b/docs/docs/integrations/document_loaders/pdfplumber.ipynb @@ -6,24 +6,24 @@ "source": [ "# PDFPlumberLoader\n", "\n", - "This notebook provides a quick overview for getting started with `PDFMiner` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html).\n", + "This sample provides a quick overview for getting started with `PDFPlumber` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all PDFPlumberLoader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html).\n", "\n", " \n", "\n", "## Overview\n", "### Integration details\n", "\n", - "| Class | Package | Local | Serializable | JS support|\n", - "|:---------------------------------------------------------------------------------------------------------------------------------------------------------| :--- | :---: | :---: | :---: |\n", - "| [PDFPlumberLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ✅ | ❌ | ❌ |\n", + "| Class | Package | Local | Serializable | JS support|\n", + "|:-----------------------------------------------------------------------------------------------------------------------------------------------------| :--- | :---: | :---: 
| :---: |\n", + "| [PDFPlumberLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ✅ | ❌ | ❌ |\n", "\n", "--------- \n", "\n", "### Loader features\n", "\n", - "| Source | Document Lazy Loading | Native Async Support | Extract Images | Extract Tables |\n", - "|:----------------:| :---: | :---: | :---: |:---: |\n", - "| PDFPlumberLoader | ✅ | ❌ | ✅ | ✅ |\n", + "| Source | Document Lazy Loading | Native Async Support | Extract Images | Extract Tables |\n", + "|:--------------:| :---: | :---: | :---: |:---: |\n", + "| PDFPlumberLoader | ✅ | ❌ | ✅ | ✅ |\n", "\n", " \n", "\n", @@ -31,7 +31,7 @@ "\n", "### Credentials\n", "\n", - "No credentials are required to use PyMuPDFLoader" + "No credentials are required to use PDFPlumberLoader" ] }, { @@ -43,8 +43,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:39:37.440900Z", - "start_time": "2025-02-10T08:39:37.438441Z" + "end_time": "2025-04-15T09:32:31.030959Z", + "start_time": "2025-04-15T09:32:31.027427Z" } }, "source": [ @@ -60,15 +60,15 @@ "source": [ "### Installation\n", "\n", - "Install **langchain_community** and **pymupdf**." + "Install **langchain_community** and **pdfplumber**." 
] }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:39:41.487372Z", - "start_time": "2025-02-10T08:39:39.209073Z" + "end_time": "2025-04-15T09:32:34.953716Z", + "start_time": "2025-04-15T09:32:32.674410Z" } }, "source": "%pip install -qU langchain_community pdfplumber", @@ -77,7 +77,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\r\n", + "\u001B[0mNote: you may need to restart the kernel to use updated packages.\n" ] } ], @@ -96,15 +97,15 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:39:45.246502Z", - "start_time": "2025-02-10T08:39:44.229183Z" + "end_time": "2025-04-15T09:32:55.327932Z", + "start_time": "2025-04-15T09:32:54.354899Z" } }, "source": [ "from langchain_community.document_loaders import PDFPlumberLoader\n", "\n", "file_path = \"./example_data/layout-parser-paper.pdf\"\n", - "loader = PDFPlumberLoader(file_path)" + "loader = PDFPlumberLoader(file_path, metadata_format=\"standard\")" ], "outputs": [], "execution_count": 3 @@ -120,8 +121,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:40:11.901128Z", - "start_time": "2025-02-10T08:39:46.905899Z" + "end_time": "2025-04-15T09:32:59.036774Z", + "start_time": "2025-04-15T09:32:57.033035Z" } }, "source": [ @@ -129,6 +130,28 @@ "docs[0]" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox 
missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "data": { "text/plain": [ @@ -146,8 +169,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:40:20.094848Z", - "start_time": "2025-02-10T08:40:20.083124Z" + "end_time": "2025-04-15T09:32:59.047149Z", + "start_time": "2025-04-15T09:32:59.043526Z" } }, "source": [ @@ -191,8 +214,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:40:45.605691Z", - "start_time": "2025-02-10T08:40:22.639608Z" + "end_time": "2025-04-15T09:33:03.931290Z", + "start_time": "2025-04-15T09:33:02.092848Z" } }, "source": [ @@ -207,6 +230,28 @@ "len(pages)" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting 
to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "data": { "text/plain": [ @@ -224,8 +269,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:40:46.969036Z", - "start_time": "2025-02-10T08:40:46.964794Z" + "end_time": "2025-04-15T09:33:05.116002Z", + "start_time": "2025-04-15T09:33:05.102235Z" } }, "source": [ @@ -290,7 +335,7 @@ "- By page\n", "- As a single text flow\n", "\n", - "By default PDFPlumberLoader will split the PDF by page." + "By default PDFPlumberLoader will split the PDF by page." ] }, { @@ -304,13 +349,14 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:41:14.319842Z", - "start_time": "2025-02-10T08:40:50.665569Z" + "end_time": "2025-04-15T09:33:13.625065Z", + "start_time": "2025-04-15T09:33:11.686326Z" } }, "source": [ "loader = PDFPlumberLoader(\n", " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", " mode=\"page\",\n", ")\n", "docs = loader.load()\n", @@ -318,6 +364,28 @@ "pprint.pp(docs[0].metadata)" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, 
defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -361,13 +429,14 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:41:41.244786Z", - "start_time": "2025-02-10T08:41:17.564901Z" + "end_time": "2025-04-15T09:33:30.520801Z", + "start_time": "2025-04-15T09:33:28.785067Z" } }, "source": [ "loader = PDFPlumberLoader(\n", " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", " mode=\"single\",\n", ")\n", "docs = loader.load()\n", @@ -375,6 +444,28 @@ "pprint.pp(docs[0].metadata)" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -415,13 +506,14 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:42:07.936745Z", - "start_time": "2025-02-10T08:41:44.463505Z" + "end_time": "2025-04-15T09:33:34.209872Z", + "start_time": 
"2025-04-15T09:33:32.242569Z" } }, "source": [ "loader = PDFPlumberLoader(\n", " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", " mode=\"single\",\n", " pages_delimiter=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n", ")\n", @@ -429,6 +521,28 @@ "print(docs[0].page_content[:5780])" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -570,8 +684,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:42:14.077933Z", - "start_time": "2025-02-10T08:42:12.007265Z" + "end_time": "2025-04-15T09:33:56.217580Z", + "start_time": "2025-04-15T09:33:42.788726Z" } }, "source": [ @@ -582,7 +696,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\r\n", + "\u001B[0mNote: you may need to restart the kernel to use updated packages.\n" ] } ], @@ 
-592,8 +707,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:42:38.934138Z", - "start_time": "2025-02-10T08:42:15.188110Z" + "end_time": "2025-04-15T09:34:24.118706Z", + "start_time": "2025-04-15T09:33:56.230529Z" } }, "source": [ @@ -601,6 +716,7 @@ "\n", "loader = PDFPlumberLoader(\n", " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", " mode=\"page\",\n", " images_inner_format=\"markdown-img\",\n", " images_parser=RapidOCRBlobParser(),\n", @@ -610,6 +726,28 @@ "print(docs[5].page_content)" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -701,8 +839,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:44:01.430937Z", - "start_time": "2025-02-10T08:43:59.573391Z" + "end_time": "2025-04-15T09:35:29.381269Z", + "start_time": "2025-04-15T09:35:26.711980Z" } }, "source": [ @@ -713,7 +851,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Note: you may need 
to restart the kernel to use updated packages.\n" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\r\n", + "\u001B[0mNote: you may need to restart the kernel to use updated packages.\n" ] } ], @@ -723,8 +862,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:44:12.786743Z", - "start_time": "2025-02-10T08:44:02.309333Z" + "end_time": "2025-04-15T09:35:45.154957Z", + "start_time": "2025-04-15T09:35:33.162485Z" } }, "source": [ @@ -732,6 +871,7 @@ "\n", "loader = PDFPlumberLoader(\n", " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", " mode=\"page\",\n", " images_inner_format=\"html-img\",\n", " images_parser=TesseractBlobParser(),\n", @@ -740,6 +880,28 @@ "print(docs[5].page_content)" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -824,8 +986,8 @@ "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:46:05.694249Z", - 
"start_time": "2025-02-10T08:46:03.558918Z" + "end_time": "2025-04-15T09:36:02.237828Z", + "start_time": "2025-04-15T09:35:57.078164Z" } }, "source": [ @@ -836,18 +998,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Note: you may need to restart the kernel to use updated packages.\n" + "\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\r\n", + "\u001B[0mNote: you may need to restart the kernel to use updated packages.\n" ] } ], - "execution_count": 16 + "execution_count": 15 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:46:07.818185Z", - "start_time": "2025-02-10T08:46:07.794265Z" + "end_time": "2025-04-15T09:36:03.749164Z", + "start_time": "2025-04-15T09:36:03.558685Z" } }, "source": [ @@ -864,19 +1027,19 @@ "True" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 17 + "execution_count": 16 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:46:09.139886Z", - "start_time": "2025-02-10T08:46:09.137577Z" + "end_time": "2025-04-15T09:36:04.863788Z", + "start_time": "2025-04-15T09:36:04.852010Z" } }, "source": [ @@ -886,14 +1049,14 @@ " os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key =\")" ], "outputs": [], - "execution_count": 18 + "execution_count": 17 }, { "cell_type": "code", "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:47:19.810461Z", - "start_time": "2025-02-10T08:46:10.995012Z" + "end_time": "2025-04-15T09:37:03.954718Z", + "start_time": "2025-04-15T09:36:09.023976Z" } }, "source": [ @@ -902,6 +1065,7 @@ "\n", "loader = PDFPlumberLoader(\n", " \"./example_data/layout-parser-paper.pdf\",\n", + " metadata_format=\"standard\",\n", " mode=\"page\",\n", " images_inner_format=\"markdown-img\",\n", " images_parser=LLMImageBlobParser(model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024)),\n", @@ -910,6 +1074,28 @@ 
"print(docs[5].page_content)" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -941,71 +1127,44 @@ "levels of abstraction for the layout data, and a set of APIs are supported for\n", "transformations or operations on these classes.\n", "\n", - "![**Image Summary:**\n", - "Diagram illustrating coordinate, text block, and layout elements with transformation and operation APIs. 
Includes coordinate intervals, rectangles, quadrilaterals, and extra features like block text, block type, and reading order.\n", + "![**Image Summary:** Diagram illustrating a system for document layout processing, featuring elements like coordinate, textblock, and layout with features such as rectangles and quadrilaterals, combined with extra features for processing through transformation and operation APIs.\n", "\n", "**Extracted Text:**\n", "```\n", "Coordinate\n", - "\n", + "Coordinate\n", "(x1, y1)\n", + "Quadrilateral\n", + "(x1, y1)\n", + "(x2, y2)\n", "\n", "Rectangle\n", - "\n", "(x2, y2)\n", "\n", - "(x1, y1)\n", - "\n", - "(x2, y2)\n", - "\n", - "(x4, y4)\n", - "\n", - "(x3, y3)\n", - "\n", - "Quadrilateral\n", - "\n", - "The same transformation and operation APIs\n", - "\n", "textblock\n", - "\n", "Coordinate\n", - "\n", "+\n", - "\n", "Extra features\n", - "\n", "Block Text\n", - "\n", "Block Type\n", - "\n", - "Reading Order\n", - "\n", - "…\n", + "Reading\n", + "Order\n", + "...\n", "\n", "layout\n", "\n", - "[ coordinate1, textblock1, ...\n", - "\n", - "…, textblock2, layout1 \\\\]\n", - "\n", + "[ coordinate1, textblock1,\n", + "..., textblock2, layout1 \\\\]\n", "A list of the layout elements\n", "\n", - "x- interval\n", - "st a rt\n", - "\n", - "start\n", - "\n", - "y-interval\n", - "\n", - "en d\n", - "\n", - "end\n", + "The same transformation\n", + "and operation APIs\n", "```](#)\n", "\n" ] } ], - "execution_count": 19 + "execution_count": 18 }, { "cell_type": "markdown", @@ -1022,8 +1181,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2025-02-10T08:49:08.041155Z", - "start_time": "2025-02-10T08:48:40.584715Z" + "end_time": "2025-04-15T09:39:00.809544Z", + "start_time": "2025-04-15T09:38:58.847574Z" } }, "cell_type": "code", @@ -1044,6 +1203,30 @@ "pprint.pp(docs[0].metadata)" ], "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/home/pprados/workspace.bda/langchain/libs/community/langchain_community/document_loaders/parsers/pdf.py:1511: UserWarning: The default value 'legacy' use some CamelCase keys. It's will be deprecated in the next major version.\n", + " warnings.warn(\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n", + "CropBox missing from /Page, defaulting to MediaBox\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -1090,17 +1273,20 @@ "documentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,\n", "1202 nuJ 12 ]VC.sc[ 2v84351.3012:viXra\n", "\n", - "{'author': '',\n", - " 'creationdate': '2021-06-22T01:27:10+00:00',\n", - " 'creator': 'LaTeX with hyperref',\n", - " 'keywords': '',\n", - " 'moddate': '2021-06-22T01:27:10+00:00',\n", - " 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live '\n", + "{'producer': 'PDFPlumber',\n", + " 'creator': 'PDFPlumber',\n", + " 'creationdate': '',\n", + " 'Author': '',\n", + " 'CreationDate': 'D:20210622012710Z',\n", + " 'Creator': 'LaTeX with hyperref',\n", + " 'Keywords': '',\n", + " 'ModDate': 'D:20210622012710Z',\n", + " 'PTEX.Fullbanner': 'This is pdfTeX, Version 
3.14159265-2.6-1.40.21 (TeX Live '\n", " '2020) kpathsea version 6.3.2',\n", - " 'producer': 'pdfTeX-1.40.21',\n", - " 'subject': '',\n", - " 'title': '',\n", - " 'trapped': 'False',\n", + " 'Producer': 'pdfTeX-1.40.21',\n", + " 'Subject': '',\n", + " 'Title': '',\n", + " 'Trapped': 'False',\n", " 'source': 'example_data/layout-parser-paper.pdf',\n", " 'file_path': 'example_data/layout-parser-paper.pdf',\n", " 'total_pages': 16,\n", @@ -1121,13 +1307,49 @@ "source": [ "from langchain_community.document_loaders import CloudBlobLoader\n", "from langchain_community.document_loaders.generic import GenericLoader\n", + "from langchain_community.document_loaders.parsers import PDFPlumberParser\n", "\n", "loader = GenericLoader(\n", " blob_loader=CloudBlobLoader(\n", " url=\"s3://mybucket\", # Supports s3://, az://, gs://, file:// schemes.\n", " glob=\"*.pdf\",\n", " ),\n", - " blob_parser=PDFPlumberParser(),\n", + " blob_parser=PDFPlumberParser(\n", + " metadata_format=\"standard\",\n", + " ),\n", + ")\n", + "docs = loader.load()\n", + "print(docs[0].page_content)\n", + "pprint.pp(docs[0].metadata)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all `PDFPlumberLoader` features and configurations head to the API reference: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from langchain_community.document_loaders import FileSystemBlobLoader\n", + "from langchain_community.document_loaders.generic import GenericLoader\n", + "from langchain_community.document_loaders.parsers import PDFPlumberParser\n", + "\n", + "loader = GenericLoader(\n", + " blob_loader=FileSystemBlobLoader(\n", + " path=\"./example_data/\",\n", + " glob=\"*.pdf\",\n", + " ),\n", + " blob_parser=PDFPlumberParser(\n", + " 
metadata_format=\"standard\",\n", + " ),\n", ")\n", "docs = loader.load()\n", "print(docs[0].page_content)\n", @@ -1158,184 +1380,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -======= - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PDFPlumber\n", - "\n", - "Like PyMuPDF, the output Documents contain detailed metadata about the PDF and its pages, and returns one document per page.\n", - "\n", - "## Overview\n", - "### Integration details\n", - "\n", - "| Class | Package | Local | Serializable | JS support|\n", - "| :--- | :--- | :---: | :---: | :---: |\n", - "| [PDFPlumberLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ✅ | ❌ | ❌ | \n", - "### Loader features\n", - "| Source | Document Lazy Loading | Native Async Support\n", - "| :---: | :---: | :---: | \n", - "| PDFPlumberLoader | ✅ | ❌ | \n", - "\n", - "## Setup\n", - "\n", - "### Credentials\n", - "\n", - "No credentials are needed to use this loader." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": "To enable automated tracing of your model calls, set your [LangSmith](https://docs.smith.langchain.com/) API key:" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n", - "# os.environ[\"LANGSMITH_TRACING\"] = \"true\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Installation\n", - "\n", - "Install **langchain_community**." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -qU langchain_community" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialization\n", - "\n", - "Now we can instantiate our model object and load documents:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_community.document_loaders import PDFPlumberLoader\n", - "\n", - "loader = PDFPlumberLoader(\"./example_data/layout-parser-paper.pdf\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}, page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recentadvancesindocumentimageanalysis(DIA)havebeen\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomescouldbeeasilydeployedinproductionandextendedforfurther\\ninvestigation. 
However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportantinnovationsbyawideaudience.Thoughtherehavebeenon-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopmentindisciplineslikenaturallanguageprocessingandcomputer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademicresearchacross awiderangeof disciplinesinthesocialsciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitiveinterfacesforapplyingandcustomizingDLmodelsforlayoutde-\\ntection,characterrecognition,andmanyotherdocumentprocessingtasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: DocumentImageAnalysis·DeepLearning·LayoutAnalysis\\n· Character Recognition · Open Source library · Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocumentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,\\n1202\\nnuJ\\n12\\n]VC.sc[\\n2v84351.3012:viXra\\n')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs = loader.load()\n", - "docs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}\n" - ] - } - ], - "source": [ - "print(docs[0].metadata)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Lazy Load" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "page = []\n", - "for doc in loader.lazy_load():\n", - " page.append(doc)\n", - " if len(page) >= 10:\n", - " # do some paged operation, e.g.\n", - " # index.upsert(page)\n", - "\n", - " page = []" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## API reference\n", - "\n", - "For detailed documentation of all PDFPlumberLoader features and configurations head to the API reference: 
https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 }