From 05d125ac235bffe086e7bf0218519e4dd753a57f Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Thu, 9 Feb 2023 23:44:14 -0800 Subject: [PATCH] cr --- .../examples/paged_pdf_splitter.ipynb | 100 +++++++++++++++--- langchain/document_loaders/paged_pdf.py | 54 +++------- poetry.lock | 2 +- .../test_pdf_pagesplitter.py | 6 +- 4 files changed, 109 insertions(+), 53 deletions(-) diff --git a/docs/modules/document_loaders/examples/paged_pdf_splitter.ipynb b/docs/modules/document_loaders/examples/paged_pdf_splitter.ipynb index 6586e1c1dec..0258af0ca3a 100644 --- a/docs/modules/document_loaders/examples/paged_pdf_splitter.ipynb +++ b/docs/modules/document_loaders/examples/paged_pdf_splitter.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -19,20 +18,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from langchain.document_loaders import PagedPDFSplitter\n", "\n", - "loader = PagedPDFSplitter(chunk_size=250)\n", - "splits, metadatas = loader.load_and_split(\n", - " \"examples/example_data/layout-parser-paper.pdf\"\n", - ")" + "loader = PagedPDFSplitter(\"example_data/layout-parser-paper.pdf\")\n", + "pages = loader.load_and_split()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -43,18 +39,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9: 10 Z. Shen et al.\n", + "Fig. 4: Illustration of (a) the original historical Japanese document with layout\n", + "detection results and (b) a recreated version of the document image that achieves\n", + "much better character recognition recall. The reorganization algorithm rearranges\n", + "the tokens based on the their detected bounding boxes given a maximum allowed\n", + "height.\n", + "4LayoutParser Community Platform\n", + "Another focus of LayoutParser is promoting the reusability of layout detection\n", + "models and full digitization pipelines. Similar to many existing deep learning\n", + "libraries, LayoutParser comes with a community model hub for distributing\n", + "layout models. End-users can upload their self-trained models to the model hub,\n", + "and these models can be loaded into a similar interface as the currently available\n", + "LayoutParser pre-trained models. For example, the model trained on the News\n", + "Navigator dataset [17] has been incorporated in the model hub.\n", + "Beyond DL models, LayoutParser also promotes the sharing of entire doc-\n", + "ument digitization pipelines. For example, sometimes the pipeline requires the\n", + "combination of multiple DL models to achieve better accuracy. Currently, pipelines\n", + "are mainly described in academic papers and implementations are often not pub-\n", + "licly available. To this end, the LayoutParser community platform also enables\n", + "the sharing of layout pipelines to promote the discussion and reuse of techniques.\n", + "For each shared pipeline, it has a dedicated project page, with links to the source\n", + "code, documentation, and an outline of the approaches. A discussion panel is\n", + "provided for exchanging ideas. 
Combined with the core LayoutParser library,\n", + "users can easily build reusable components based on the shared pipelines and\n", + "apply them to solve their unique problems.\n", + "5 Use Cases\n", + "The core objective of LayoutParser is to make it easier to create both large-scale\n", + "and light-weight document digitization pipelines. Large-scale document processing\n", + "3: 4 Z. Shen et al.\n", + "Efficient Data AnnotationC u s t o m i z e d M o d e l T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument Images \n", + "T h e C o r e L a y o u t P a r s e r L i b r a r yOCR ModuleSt or age & VisualizationLa y out Data Structur e\n", + "Fig. 1: The overall architecture of LayoutParser . For an input document image,\n", + "the core LayoutParser library provides a set of o\u000b", + "-the-shelf tools for layout\n", + "detection, OCR, visualization, and storage, backed by a carefully designed layout\n", + "data structure. LayoutParser also supports high level customization via e\u000ecient\n", + "layout annotation and model training functions. These improve model accuracy\n", + "on the target samples. The community platform enables the easy sharing of DIA\n", + "models and whole digitization pipelines to promote reusability and reproducibility.\n", + "A collection of detailed documentation, tutorials and exemplar projects make\n", + "LayoutParser easy to learn and use.\n", + "AllenNLP [ 8] and transformers [ 34] have provided the community with complete\n", + "DL-based support for developing and deploying models for general computer\n", + "vision and natural language processing problems. LayoutParser , on the other\n", + "hand, specializes speci\f", + "cally in DIA tasks. LayoutParser is also equipped with a\n", + "community platform inspired by established model hubs such as Torch Hub [23]\n", + "andTensorFlow Hub [1]. It enables the sharing of pretrained models as well as\n", + "full document processing pipelines that are unique to DIA tasks.\n", + "There have been a variety of document data collections to facilitate the\n", + "development of DL models. Some examples include PRImA [ 3](magazine layouts),\n", + "PubLayNet [ 38](academic paper layouts), Table Bank [ 18](tables in academic\n", + "papers), Newspaper Navigator Dataset [ 16,17](newspaper \f", + "gure layouts) and\n", + "HJDataset [31](historical Japanese document layouts). A spectrum of models\n", + "trained on these datasets are currently available in the LayoutParser model zoo\n", + "to support di\u000b", + "erent use cases.\n", + "3 The Core LayoutParser Library\n", + "At the core of LayoutParser is an o\u000b", + "-the-shelf toolkit that streamlines DL-\n", + "based document image analysis. Five components support a simple interface\n", + "with comprehensive functionalities: 1) The layout detection models enable using\n", + "pre-trained or self-trained DL models for layout detection with just four lines\n", + "of code. 
2) The detected layout information is stored in carefully engineered\n" + ] + } + ], "source": [ "from langchain.vectorstores import FAISS\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "\n", - "faiss_index = FAISS.from_texts(splits, OpenAIEmbeddings(), metadatas=metadatas)\n", + "faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())\n", "docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)\n", "for doc in docs:\n", - " print(doc.metadata[\"pages\"] + \":\", doc.page_content)" + " print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -73,9 +148,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.9.1" }, - "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a" diff --git a/langchain/document_loaders/paged_pdf.py b/langchain/document_loaders/paged_pdf.py index 081705db958..940f2965f78 100644 --- a/langchain/document_loaders/paged_pdf.py +++ b/langchain/document_loaders/paged_pdf.py @@ -1,14 +1,17 @@ """Loads a PDF with pypdf and chunks at character level.""" from typing import Dict, List, Optional, Tuple +from langchain.document_loaders.base import BaseLoader +from langchain.docstore.document import Document -class PagedPDFSplitter: +class PagedPDFSplitter(BaseLoader): """Loads a PDF with pypdf and chunks at character level. Loader also stores page numbers in metadatas. """ - def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200): + + def __init__(self, file_path: str): """Initialize with file path.""" try: import pypdf # noqa:F401 @@ -16,41 +19,18 @@ class PagedPDFSplitter: raise ValueError( "pypdf package not found, please install it with " "`pip install pypdf`" ) - self.chunk_size = chunk_size - self.chunk_overlap = chunk_overlap + self._file_path = file_path - def load_and_split( - self, file_path: str, metadata: Optional[Dict] = None - ) -> Tuple[List[str], List[Dict]]: - """Load given path and split into texts and metadatas. - - If given, the metadata given will - be duplicated and attached to each split along with page number. - If key is present in metadata, it also has page number - included (e.g., Foo2012 Pages 3-4). - """ + def load(self) -> List[Document]: + """Load given path as pages.""" import pypdf - pdfFileObj = open(file_path, "rb") - pdfReader = pypdf.PdfReader(pdfFileObj) - splits = [] - split = "" - pages = [] - metadatas = [] - key = ( - metadata["key"] if metadata is not None and "key" in metadata else file_path - ) - for i, page in enumerate(pdfReader.pages): - split += page.extract_text() - pages.append(str(i + 1)) - if len(split) > self.chunk_size or i == len(pdfReader.pages) - 1: - splits.append(split[: self.chunk_size]) - # pretty formatting of pages (e.g. 
1-3, 4, 5-7)
-                pg = "-".join([pages[0], pages[-1]])
-                metadatas.append(dict(key=f"{key} pages {pg}", pages=pg))
-                if metadata is not None:
-                    metadatas[-1].update(metadata)
-                split = str(splits[self.chunk_size: self.chunk_overlap])
-                pages = [str(i + 1)]
-        pdfFileObj.close()
-        return splits, metadatas
+        pdf_file_obj = open(self._file_path, "rb")
+        pdf_reader = pypdf.PdfReader(pdf_file_obj)
+        docs = []
+        for i, page in enumerate(pdf_reader.pages):
+            text = page.extract_text()
+            metadata = {"source": self._file_path, "page": i}
+            docs.append(Document(page_content=text, metadata=metadata))
+        pdf_file_obj.close()
+        return docs
diff --git a/poetry.lock b/poetry.lock
index 1cb44d9c7c2..bf4292d309e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -7008,4 +7008,4 @@ llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "b190d518d7a99484ccb0aaf0ed43dfaf9c8cc74481b00da9d4fadd9d02c0dda2"
+content-hash = "55ff8e2f70840a299ca72a27468cf18ec732514bdc2aa2ed9e8faf9bc5caa71f"
diff --git a/tests/integration_tests/test_pdf_pagesplitter.py b/tests/integration_tests/test_pdf_pagesplitter.py
index be3375cc777..ba46385b531 100644
--- a/tests/integration_tests/test_pdf_pagesplitter.py
+++ b/tests/integration_tests/test_pdf_pagesplitter.py
@@ -1,8 +1,9 @@
 """Test splitting with page numbers included."""
+import os
+
 from langchain.document_loaders import PagedPDFSplitter
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
-import os


 def test_pdf_pagesplitter() -> None:
@@ -10,7 +11,7 @@ def test_pdf_pagesplitter() -> None:
-    loader = PagedPDFSplitter(chunk_size=250)
-    script_dir = os.path.dirname(__file__)
-    splits, metadatas = loader.load_and_split(
-        os.path.join(script_dir, "examples/hello.pdf"))
-    assert "pages" in metadatas[0]
-    assert "key" in metadatas[0]
-    assert len(splits) == len(metadatas)
+    script_dir = os.path.dirname(__file__)
+    loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
+    docs = loader.load()
+    assert len(docs) > 0
+    assert "page" in docs[0].metadata
+    assert "source" in docs[0].metadata
+    assert docs[0].metadata["page"] == 0
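
Note on downstream chunking: with this change, PagedPDFSplitter returns one Document per page and no longer exposes chunk_size/chunk_overlap. Callers that want the old character-window behaviour can re-chunk the page Documents themselves. The sketch below is illustrative only and is not part of this patch; the chunk_pages helper, the 4000/200 defaults, and the example PDF path are assumptions.

"""Illustrative sketch (not part of the patch): re-chunk page-level Documents."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders import PagedPDFSplitter


def chunk_pages(
    pages: List[Document], chunk_size: int = 4000, chunk_overlap: int = 200
) -> List[Document]:
    """Split each page's text into overlapping character windows.

    Every chunk copies its source page's metadata, so the "page" and
    "source" keys set by the loader survive the split.
    """
    step = max(chunk_size - chunk_overlap, 1)
    chunks: List[Document] = []
    for page in pages:
        text = page.page_content
        for start in range(0, len(text) or 1, step):
            piece = text[start : start + chunk_size]
            if piece:
                chunks.append(
                    Document(page_content=piece, metadata=dict(page.metadata))
                )
    return chunks


if __name__ == "__main__":
    # Assumed example path from the notebook; point this at a real PDF.
    loader = PagedPDFSplitter("example_data/layout-parser-paper.pdf")
    docs = chunk_pages(loader.load(), chunk_size=250, chunk_overlap=50)
    print(len(docs), docs[0].metadata)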