mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-06 15:13:56 +00:00
Fix notebooks
This commit is contained in:
parent
61b68bb8e8
commit
ada3ba2567
@ -6,7 +6,7 @@
|
||||
"source": [
|
||||
"# PDFPlumberLoader\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with `PDFMiner` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html).\n",
|
||||
"This sample provides a quick overview for getting started with `PDFPlumber` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all PDFPlumberLoader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html).\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"\n",
|
||||
@ -14,16 +14,16 @@
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"|:---------------------------------------------------------------------------------------------------------------------------------------------------------| :--- | :---: | :---: | :---: |\n",
|
||||
"| [PDFPlumberLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ✅ | ❌ | ❌ |\n",
|
||||
"|:-----------------------------------------------------------------------------------------------------------------------------------------------------| :--- | :---: | :---: | :---: |\n",
|
||||
"| [PDFPlumberLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ✅ | ❌ | ❌ |\n",
|
||||
"\n",
|
||||
"--------- \n",
|
||||
"\n",
|
||||
"### Loader features\n",
|
||||
"\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support | Extract Images | Extract Tables |\n",
|
||||
"|:----------------:| :---: | :---: | :---: |:---: |\n",
|
||||
"| PDFPlumberLoader | ✅ | ❌ | ✅ | ✅ |\n",
|
||||
"|:--------------:| :---: | :---: | :---: |:---: |\n",
|
||||
"| PDFPlumberLoader | ✅ | ❌ | ✅ | ✅ |\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"\n",
|
||||
@ -31,7 +31,7 @@
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are required to use PyMuPDFLoader"
|
||||
"No credentials are required to use PDFPlumberLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -43,8 +43,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:39:37.440900Z",
|
||||
"start_time": "2025-02-10T08:39:37.438441Z"
|
||||
"end_time": "2025-04-15T09:32:31.030959Z",
|
||||
"start_time": "2025-04-15T09:32:31.027427Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -60,15 +60,15 @@
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community** and **pymupdf**."
|
||||
"Install **langchain_community** and **pdfplumber**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:39:41.487372Z",
|
||||
"start_time": "2025-02-10T08:39:39.209073Z"
|
||||
"end_time": "2025-04-15T09:32:34.953716Z",
|
||||
"start_time": "2025-04-15T09:32:32.674410Z"
|
||||
}
|
||||
},
|
||||
"source": "%pip install -qU langchain_community pdfplumber",
|
||||
@ -77,7 +77,8 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
"\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\r\n",
|
||||
"\u001B[0mNote: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -96,15 +97,15 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:39:45.246502Z",
|
||||
"start_time": "2025-02-10T08:39:44.229183Z"
|
||||
"end_time": "2025-04-15T09:32:55.327932Z",
|
||||
"start_time": "2025-04-15T09:32:54.354899Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PDFPlumberLoader\n",
|
||||
"\n",
|
||||
"file_path = \"./example_data/layout-parser-paper.pdf\"\n",
|
||||
"loader = PDFPlumberLoader(file_path)"
|
||||
"loader = PDFPlumberLoader(file_path, metadata_format=\"standard\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 3
|
||||
@ -120,8 +121,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:40:11.901128Z",
|
||||
"start_time": "2025-02-10T08:39:46.905899Z"
|
||||
"end_time": "2025-04-15T09:32:59.036774Z",
|
||||
"start_time": "2025-04-15T09:32:57.033035Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -129,6 +130,28 @@
|
||||
"docs[0]"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
@ -146,8 +169,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:40:20.094848Z",
|
||||
"start_time": "2025-02-10T08:40:20.083124Z"
|
||||
"end_time": "2025-04-15T09:32:59.047149Z",
|
||||
"start_time": "2025-04-15T09:32:59.043526Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -191,8 +214,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:40:45.605691Z",
|
||||
"start_time": "2025-02-10T08:40:22.639608Z"
|
||||
"end_time": "2025-04-15T09:33:03.931290Z",
|
||||
"start_time": "2025-04-15T09:33:02.092848Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -207,6 +230,28 @@
|
||||
"len(pages)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
@ -224,8 +269,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:40:46.969036Z",
|
||||
"start_time": "2025-02-10T08:40:46.964794Z"
|
||||
"end_time": "2025-04-15T09:33:05.116002Z",
|
||||
"start_time": "2025-04-15T09:33:05.102235Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -290,7 +335,7 @@
|
||||
"- By page\n",
|
||||
"- As a single text flow\n",
|
||||
"\n",
|
||||
"By default PDFPlumberLoader will split the PDF by page."
|
||||
"By default PDFPlumberLoader will split the PDF by page."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -304,13 +349,14 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:41:14.319842Z",
|
||||
"start_time": "2025-02-10T08:40:50.665569Z"
|
||||
"end_time": "2025-04-15T09:33:13.625065Z",
|
||||
"start_time": "2025-04-15T09:33:11.686326Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"loader = PDFPlumberLoader(\n",
|
||||
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||
" metadata_format=\"standard\",\n",
|
||||
" mode=\"page\",\n",
|
||||
")\n",
|
||||
"docs = loader.load()\n",
|
||||
@ -318,6 +364,28 @@
|
||||
"pprint.pp(docs[0].metadata)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@ -361,13 +429,14 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:41:41.244786Z",
|
||||
"start_time": "2025-02-10T08:41:17.564901Z"
|
||||
"end_time": "2025-04-15T09:33:30.520801Z",
|
||||
"start_time": "2025-04-15T09:33:28.785067Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"loader = PDFPlumberLoader(\n",
|
||||
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||
" metadata_format=\"standard\",\n",
|
||||
" mode=\"single\",\n",
|
||||
")\n",
|
||||
"docs = loader.load()\n",
|
||||
@ -375,6 +444,28 @@
|
||||
"pprint.pp(docs[0].metadata)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@ -415,13 +506,14 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:42:07.936745Z",
|
||||
"start_time": "2025-02-10T08:41:44.463505Z"
|
||||
"end_time": "2025-04-15T09:33:34.209872Z",
|
||||
"start_time": "2025-04-15T09:33:32.242569Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"loader = PDFPlumberLoader(\n",
|
||||
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||
" metadata_format=\"standard\",\n",
|
||||
" mode=\"single\",\n",
|
||||
" pages_delimiter=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n",
|
||||
")\n",
|
||||
@ -429,6 +521,28 @@
|
||||
"print(docs[0].page_content[:5780])"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@ -570,8 +684,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:42:14.077933Z",
|
||||
"start_time": "2025-02-10T08:42:12.007265Z"
|
||||
"end_time": "2025-04-15T09:33:56.217580Z",
|
||||
"start_time": "2025-04-15T09:33:42.788726Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -582,7 +696,8 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
"\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\r\n",
|
||||
"\u001B[0mNote: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -592,8 +707,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:42:38.934138Z",
|
||||
"start_time": "2025-02-10T08:42:15.188110Z"
|
||||
"end_time": "2025-04-15T09:34:24.118706Z",
|
||||
"start_time": "2025-04-15T09:33:56.230529Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -601,6 +716,7 @@
|
||||
"\n",
|
||||
"loader = PDFPlumberLoader(\n",
|
||||
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||
" metadata_format=\"standard\",\n",
|
||||
" mode=\"page\",\n",
|
||||
" images_inner_format=\"markdown-img\",\n",
|
||||
" images_parser=RapidOCRBlobParser(),\n",
|
||||
@ -610,6 +726,28 @@
|
||||
"print(docs[5].page_content)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@ -701,8 +839,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:44:01.430937Z",
|
||||
"start_time": "2025-02-10T08:43:59.573391Z"
|
||||
"end_time": "2025-04-15T09:35:29.381269Z",
|
||||
"start_time": "2025-04-15T09:35:26.711980Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -713,7 +851,8 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
"\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\r\n",
|
||||
"\u001B[0mNote: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -723,8 +862,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:44:12.786743Z",
|
||||
"start_time": "2025-02-10T08:44:02.309333Z"
|
||||
"end_time": "2025-04-15T09:35:45.154957Z",
|
||||
"start_time": "2025-04-15T09:35:33.162485Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -732,6 +871,7 @@
|
||||
"\n",
|
||||
"loader = PDFPlumberLoader(\n",
|
||||
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||
" metadata_format=\"standard\",\n",
|
||||
" mode=\"page\",\n",
|
||||
" images_inner_format=\"html-img\",\n",
|
||||
" images_parser=TesseractBlobParser(),\n",
|
||||
@ -740,6 +880,28 @@
|
||||
"print(docs[5].page_content)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@ -824,8 +986,8 @@
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:46:05.694249Z",
|
||||
"start_time": "2025-02-10T08:46:03.558918Z"
|
||||
"end_time": "2025-04-15T09:36:02.237828Z",
|
||||
"start_time": "2025-04-15T09:35:57.078164Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -836,18 +998,19 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
"\u001B[33mWARNING: There was an error checking the latest version of pip.\u001B[0m\u001B[33m\r\n",
|
||||
"\u001B[0mNote: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 16
|
||||
"execution_count": 15
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:46:07.818185Z",
|
||||
"start_time": "2025-02-10T08:46:07.794265Z"
|
||||
"end_time": "2025-04-15T09:36:03.749164Z",
|
||||
"start_time": "2025-04-15T09:36:03.558685Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -864,19 +1027,19 @@
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"execution_count": 17
|
||||
"execution_count": 16
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:46:09.139886Z",
|
||||
"start_time": "2025-02-10T08:46:09.137577Z"
|
||||
"end_time": "2025-04-15T09:36:04.863788Z",
|
||||
"start_time": "2025-04-15T09:36:04.852010Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -886,14 +1049,14 @@
|
||||
" os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key =\")"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 18
|
||||
"execution_count": 17
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:47:19.810461Z",
|
||||
"start_time": "2025-02-10T08:46:10.995012Z"
|
||||
"end_time": "2025-04-15T09:37:03.954718Z",
|
||||
"start_time": "2025-04-15T09:36:09.023976Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
@ -902,6 +1065,7 @@
|
||||
"\n",
|
||||
"loader = PDFPlumberLoader(\n",
|
||||
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||
" metadata_format=\"standard\",\n",
|
||||
" mode=\"page\",\n",
|
||||
" images_inner_format=\"markdown-img\",\n",
|
||||
" images_parser=LLMImageBlobParser(model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024)),\n",
|
||||
@ -910,6 +1074,28 @@
|
||||
"print(docs[5].page_content)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@ -941,71 +1127,44 @@
|
||||
"levels of abstraction for the layout data, and a set of APIs are supported for\n",
|
||||
"transformations or operations on these classes.\n",
|
||||
"\n",
|
||||
"![**Image Summary:**\n",
|
||||
"Diagram illustrating coordinate, text block, and layout elements with transformation and operation APIs. Includes coordinate intervals, rectangles, quadrilaterals, and extra features like block text, block type, and reading order.\n",
|
||||
"![**Image Summary:** Diagram illustrating a system for document layout processing, featuring elements like coordinate, textblock, and layout with features such as rectangles and quadrilaterals, combined with extra features for processing through transformation and operation APIs.\n",
|
||||
"\n",
|
||||
"**Extracted Text:**\n",
|
||||
"```\n",
|
||||
"Coordinate\n",
|
||||
"\n",
|
||||
"Coordinate\n",
|
||||
"(x1, y1)\n",
|
||||
"Quadrilateral\n",
|
||||
"(x1, y1)\n",
|
||||
"(x2, y2)\n",
|
||||
"\n",
|
||||
"Rectangle\n",
|
||||
"\n",
|
||||
"(x2, y2)\n",
|
||||
"\n",
|
||||
"(x1, y1)\n",
|
||||
"\n",
|
||||
"(x2, y2)\n",
|
||||
"\n",
|
||||
"(x4, y4)\n",
|
||||
"\n",
|
||||
"(x3, y3)\n",
|
||||
"\n",
|
||||
"Quadrilateral\n",
|
||||
"\n",
|
||||
"The same transformation and operation APIs\n",
|
||||
"\n",
|
||||
"textblock\n",
|
||||
"\n",
|
||||
"Coordinate\n",
|
||||
"\n",
|
||||
"+\n",
|
||||
"\n",
|
||||
"Extra features\n",
|
||||
"\n",
|
||||
"Block Text\n",
|
||||
"\n",
|
||||
"Block Type\n",
|
||||
"\n",
|
||||
"Reading Order\n",
|
||||
"\n",
|
||||
"…\n",
|
||||
"Reading\n",
|
||||
"Order\n",
|
||||
"...\n",
|
||||
"\n",
|
||||
"layout\n",
|
||||
"\n",
|
||||
"[ coordinate1, textblock1, ...\n",
|
||||
"\n",
|
||||
"…, textblock2, layout1 \\\\]\n",
|
||||
"\n",
|
||||
"[ coordinate1, textblock1,\n",
|
||||
"..., textblock2, layout1 \\\\]\n",
|
||||
"A list of the layout elements\n",
|
||||
"\n",
|
||||
"x- interval\n",
|
||||
"st a rt\n",
|
||||
"\n",
|
||||
"start\n",
|
||||
"\n",
|
||||
"y-interval\n",
|
||||
"\n",
|
||||
"en d\n",
|
||||
"\n",
|
||||
"end\n",
|
||||
"The same transformation\n",
|
||||
"and operation APIs\n",
|
||||
"```](#)\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 19
|
||||
"execution_count": 18
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
@ -1022,8 +1181,8 @@
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-02-10T08:49:08.041155Z",
|
||||
"start_time": "2025-02-10T08:48:40.584715Z"
|
||||
"end_time": "2025-04-15T09:39:00.809544Z",
|
||||
"start_time": "2025-04-15T09:38:58.847574Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
@ -1044,6 +1203,30 @@
|
||||
"pprint.pp(docs[0].metadata)"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/pprados/workspace.bda/langchain/libs/community/langchain_community/document_loaders/parsers/pdf.py:1511: UserWarning: The default value 'legacy' use some CamelCase keys. It's will be deprecated in the next major version.\n",
|
||||
" warnings.warn(\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n",
|
||||
"CropBox missing from /Page, defaulting to MediaBox\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
@ -1090,17 +1273,20 @@
|
||||
"documentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,\n",
|
||||
"1202 nuJ 12 ]VC.sc[ 2v84351.3012:viXra\n",
|
||||
"\n",
|
||||
"{'author': '',\n",
|
||||
" 'creationdate': '2021-06-22T01:27:10+00:00',\n",
|
||||
" 'creator': 'LaTeX with hyperref',\n",
|
||||
" 'keywords': '',\n",
|
||||
" 'moddate': '2021-06-22T01:27:10+00:00',\n",
|
||||
" 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live '\n",
|
||||
"{'producer': 'PDFPlumber',\n",
|
||||
" 'creator': 'PDFPlumber',\n",
|
||||
" 'creationdate': '',\n",
|
||||
" 'Author': '',\n",
|
||||
" 'CreationDate': 'D:20210622012710Z',\n",
|
||||
" 'Creator': 'LaTeX with hyperref',\n",
|
||||
" 'Keywords': '',\n",
|
||||
" 'ModDate': 'D:20210622012710Z',\n",
|
||||
" 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live '\n",
|
||||
" '2020) kpathsea version 6.3.2',\n",
|
||||
" 'producer': 'pdfTeX-1.40.21',\n",
|
||||
" 'subject': '',\n",
|
||||
" 'title': '',\n",
|
||||
" 'trapped': 'False',\n",
|
||||
" 'Producer': 'pdfTeX-1.40.21',\n",
|
||||
" 'Subject': '',\n",
|
||||
" 'Title': '',\n",
|
||||
" 'Trapped': 'False',\n",
|
||||
" 'source': 'example_data/layout-parser-paper.pdf',\n",
|
||||
" 'file_path': 'example_data/layout-parser-paper.pdf',\n",
|
||||
" 'total_pages': 16,\n",
|
||||
@ -1121,13 +1307,49 @@
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import CloudBlobLoader\n",
|
||||
"from langchain_community.document_loaders.generic import GenericLoader\n",
|
||||
"from langchain_community.document_loaders.parsers import PDFPlumberParser\n",
|
||||
"\n",
|
||||
"loader = GenericLoader(\n",
|
||||
" blob_loader=CloudBlobLoader(\n",
|
||||
" url=\"s3://mybucket\", # Supports s3://, az://, gs://, file:// schemes.\n",
|
||||
" glob=\"*.pdf\",\n",
|
||||
" ),\n",
|
||||
" blob_parser=PDFPlumberParser(),\n",
|
||||
" blob_parser=PDFPlumberParser(\n",
|
||||
" metadata_format=\"standard\",\n",
|
||||
" ),\n",
|
||||
")\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0].page_content)\n",
|
||||
"pprint.pp(docs[0].metadata)"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all `PDFPlumberLoader` features and configurations head to the API reference: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import FileSystemBlobLoader\n",
|
||||
"from langchain_community.document_loaders.generic import GenericLoader\n",
|
||||
"from langchain_community.document_loaders.parsers import PDFPlumberParser\n",
|
||||
"\n",
|
||||
"loader = GenericLoader(\n",
|
||||
" blob_loader=FileSystemBlobLoader(\n",
|
||||
" path=\"./example_data/\",\n",
|
||||
" glob=\"*.pdf\",\n",
|
||||
" ),\n",
|
||||
" blob_parser=PDFPlumberParser(\n",
|
||||
" metadata_format=\"standard\",\n",
|
||||
" ),\n",
|
||||
")\n",
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0].page_content)\n",
|
||||
@ -1158,184 +1380,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
=======
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# PDFPlumber\n",
|
||||
"\n",
|
||||
"Like PyMuPDF, this loader returns one document per page, and the output Documents contain detailed metadata about the PDF and its pages.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [PDFPlumberLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ✅ | ❌ | ❌ | \n",
|
||||
"### Loader features\n",
|
||||
"| Source | Document Lazy Loading | Native Async Support |\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| PDFPlumberLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"No credentials are needed to use this loader."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": "To enable automated tracing of your model calls, set your [LangSmith](https://docs.smith.langchain.com/) API key:"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install **langchain_community**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain_community"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our loader object and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import PDFPlumberLoader\n",
|
||||
"\n",
|
||||
"loader = PDFPlumberLoader(\"./example_data/layout-parser-paper.pdf\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(metadata={'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}, page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\nshannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\n{melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n5 University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recentadvancesindocumentimageanalysis(DIA)havebeen\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomescouldbeeasilydeployedinproductionandextendedforfurther\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\nportantinnovationsbyawideaudience.Thoughtherehavebeenon-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopmentindisciplineslikenaturallanguageprocessingandcomputer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademicresearchacross awiderangeof disciplinesinthesocialsciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. 
The core LayoutParser library comes with a set of simple and\\nintuitiveinterfacesforapplyingandcustomizingDLmodelsforlayoutde-\\ntection,characterrecognition,andmanyotherdocumentprocessingtasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: DocumentImageAnalysis·DeepLearning·LayoutAnalysis\\n· Character Recognition · Open Source library · Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocumentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,\\n1202\\nnuJ\\n12\\n]VC.sc[\\n2v84351.3012:viXra\\n')"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'source': './example_data/layout-parser-paper.pdf', 'file_path': './example_data/layout-parser-paper.pdf', 'page': 0, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
"\n",
|
||||
" page = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all PDFPlumberLoader features and configurations head to the API reference: https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.PDFPlumberLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user