From 05d125ac235bffe086e7bf0218519e4dd753a57f Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Thu, 9 Feb 2023 23:44:14 -0800 Subject: [PATCH] cr --- .../examples/paged_pdf_splitter.ipynb | 100 +++++++++++++++--- langchain/document_loaders/paged_pdf.py | 54 +++------- poetry.lock | 2 +- .../test_pdf_pagesplitter.py | 6 +- 4 files changed, 109 insertions(+), 53 deletions(-) diff --git a/docs/modules/document_loaders/examples/paged_pdf_splitter.ipynb b/docs/modules/document_loaders/examples/paged_pdf_splitter.ipynb index 6586e1c1dec..0258af0ca3a 100644 --- a/docs/modules/document_loaders/examples/paged_pdf_splitter.ipynb +++ b/docs/modules/document_loaders/examples/paged_pdf_splitter.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -19,20 +18,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from langchain.document_loaders import PagedPDFSplitter\n", "\n", - "loader = PagedPDFSplitter(chunk_size=250)\n", - "splits, metadatas = loader.load_and_split(\n", - " \"examples/example_data/layout-parser-paper.pdf\"\n", - ")" + "loader = PagedPDFSplitter(\"example_data/layout-parser-paper.pdf\")\n", + "pages = loader.load_and_split()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -43,18 +39,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9: 10 Z. Shen et al.\n", + "Fig. 4: Illustration of (a) the original historical Japanese document with layout\n", + "detection results and (b) a recreated version of the document image that achieves\n", + "much better character recognition recall. The reorganization algorithm rearranges\n", + "the tokens based on the their detected bounding boxes given a maximum allowed\n", + "height.\n", + "4LayoutParser Community Platform\n", + "Another focus of LayoutParser is promoting the reusability of layout detection\n", + "models and full digitization pipelines. Similar to many existing deep learning\n", + "libraries, LayoutParser comes with a community model hub for distributing\n", + "layout models. End-users can upload their self-trained models to the model hub,\n", + "and these models can be loaded into a similar interface as the currently available\n", + "LayoutParser pre-trained models. For example, the model trained on the News\n", + "Navigator dataset [17] has been incorporated in the model hub.\n", + "Beyond DL models, LayoutParser also promotes the sharing of entire doc-\n", + "ument digitization pipelines. For example, sometimes the pipeline requires the\n", + "combination of multiple DL models to achieve better accuracy. Currently, pipelines\n", + "are mainly described in academic papers and implementations are often not pub-\n", + "licly available. To this end, the LayoutParser community platform also enables\n", + "the sharing of layout pipelines to promote the discussion and reuse of techniques.\n", + "For each shared pipeline, it has a dedicated project page, with links to the source\n", + "code, documentation, and an outline of the approaches. A discussion panel is\n", + "provided for exchanging ideas. 
Combined with the core LayoutParser library,\n", + "users can easily build reusable components based on the shared pipelines and\n", + "apply them to solve their unique problems.\n", + "5 Use Cases\n", + "The core objective of LayoutParser is to make it easier to create both large-scale\n", + "and light-weight document digitization pipelines. Large-scale document processing\n", + "3: 4 Z. Shen et al.\n", + "Efficient Data AnnotationC u s t o m i z e d M o d e l T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument Images \n", + "T h e C o r e L a y o u t P a r s e r L i b r a r yOCR ModuleSt or age & VisualizationLa y out Data Structur e\n", + "Fig. 1: The overall architecture of LayoutParser . For an input document image,\n", + "the core LayoutParser library provides a set of o\u000b", + "-the-shelf tools for layout\n", + "detection, OCR, visualization, and storage, backed by a carefully designed layout\n", + "data structure. LayoutParser also supports high level customization via e\u000ecient\n", + "layout annotation and model training functions. These improve model accuracy\n", + "on the target samples. The community platform enables the easy sharing of DIA\n", + "models and whole digitization pipelines to promote reusability and reproducibility.\n", + "A collection of detailed documentation, tutorials and exemplar projects make\n", + "LayoutParser easy to learn and use.\n", + "AllenNLP [ 8] and transformers [ 34] have provided the community with complete\n", + "DL-based support for developing and deploying models for general computer\n", + "vision and natural language processing problems. LayoutParser , on the other\n", + "hand, specializes speci\f", + "cally in DIA tasks. LayoutParser is also equipped with a\n", + "community platform inspired by established model hubs such as Torch Hub [23]\n", + "andTensorFlow Hub [1]. It enables the sharing of pretrained models as well as\n", + "full document processing pipelines that are unique to DIA tasks.\n", + "There have been a variety of document data collections to facilitate the\n", + "development of DL models. Some examples include PRImA [ 3](magazine layouts),\n", + "PubLayNet [ 38](academic paper layouts), Table Bank [ 18](tables in academic\n", + "papers), Newspaper Navigator Dataset [ 16,17](newspaper \f", + "gure layouts) and\n", + "HJDataset [31](historical Japanese document layouts). A spectrum of models\n", + "trained on these datasets are currently available in the LayoutParser model zoo\n", + "to support di\u000b", + "erent use cases.\n", + "3 The Core LayoutParser Library\n", + "At the core of LayoutParser is an o\u000b", + "-the-shelf toolkit that streamlines DL-\n", + "based document image analysis. Five components support a simple interface\n", + "with comprehensive functionalities: 1) The layout detection models enable using\n", + "pre-trained or self-trained DL models for layout detection with just four lines\n", + "of code. 
2) The detected layout information is stored in carefully engineered\n" + ] + } + ], "source": [ "from langchain.vectorstores import FAISS\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "\n", - "faiss_index = FAISS.from_texts(splits, OpenAIEmbeddings(), metadatas=metadatas)\n", + "faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())\n", "docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)\n", "for doc in docs:\n", - " print(doc.metadata[\"pages\"] + \":\", doc.page_content)" + " print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -73,9 +148,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.9.1" }, - "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a" diff --git a/langchain/document_loaders/paged_pdf.py b/langchain/document_loaders/paged_pdf.py index 081705db958..940f2965f78 100644 --- a/langchain/document_loaders/paged_pdf.py +++ b/langchain/document_loaders/paged_pdf.py @@ -1,14 +1,17 @@ """Loads a PDF with pypdf and chunks at character level.""" from typing import Dict, List, Optional, Tuple +from langchain.document_loaders.base import BaseLoader +from langchain.docstore.document import Document -class PagedPDFSplitter: +class PagedPDFSplitter(BaseLoader): """Loads a PDF with pypdf and chunks at character level. Loader also stores page numbers in metadatas. """ - def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200): + + def __init__(self, file_path: str): """Initialize with file path.""" try: import pypdf # noqa:F401 @@ -16,41 +19,18 @@ class PagedPDFSplitter: raise ValueError( "pypdf package not found, please install it with " "`pip install pypdf`" ) - self.chunk_size = chunk_size - self.chunk_overlap = chunk_overlap + self._file_path = file_path - def load_and_split( - self, file_path: str, metadata: Optional[Dict] = None - ) -> Tuple[List[str], List[Dict]]: - """Load given path and split into texts and metadatas. - - If given, the metadata given will - be duplicated and attached to each split along with page number. - If key is present in metadata, it also has page number - included (e.g., Foo2012 Pages 3-4). - """ + def load(self) -> List[Document]: + """Load given path as pages.""" import pypdf - pdfFileObj = open(file_path, "rb") - pdfReader = pypdf.PdfReader(pdfFileObj) - splits = [] - split = "" - pages = [] - metadatas = [] - key = ( - metadata["key"] if metadata is not None and "key" in metadata else file_path - ) - for i, page in enumerate(pdfReader.pages): - split += page.extract_text() - pages.append(str(i + 1)) - if len(split) > self.chunk_size or i == len(pdfReader.pages) - 1: - splits.append(split[: self.chunk_size]) - # pretty formatting of pages (e.g. 
1-3, 4, 5-7)
-                pg = "-".join([pages[0], pages[-1]])
-                metadatas.append(dict(key=f"{key} pages {pg}", pages=pg))
-                if metadata is not None:
-                    metadatas[-1].update(metadata)
-                split = str(splits[self.chunk_size: self.chunk_overlap])
-                pages = [str(i + 1)]
-        pdfFileObj.close()
-        return splits, metadatas
+        pdf_file_obj = open(self._file_path, "rb")
+        pdf_reader = pypdf.PdfReader(pdf_file_obj)
+        docs = []
+        for i, page in enumerate(pdf_reader.pages):
+            text = page.extract_text()
+            metadata = {"source": self._file_path, "page": i}
+            docs.append(Document(page_content=text, metadata=metadata))
+        pdf_file_obj.close()
+        return docs
diff --git a/poetry.lock b/poetry.lock
index 1cb44d9c7c2..bf4292d309e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -7008,4 +7008,4 @@ llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "b190d518d7a99484ccb0aaf0ed43dfaf9c8cc74481b00da9d4fadd9d02c0dda2"
+content-hash = "55ff8e2f70840a299ca72a27468cf18ec732514bdc2aa2ed9e8faf9bc5caa71f"
diff --git a/tests/integration_tests/test_pdf_pagesplitter.py b/tests/integration_tests/test_pdf_pagesplitter.py
index be3375cc777..ba46385b531 100644
--- a/tests/integration_tests/test_pdf_pagesplitter.py
+++ b/tests/integration_tests/test_pdf_pagesplitter.py
@@ -1,8 +1,9 @@
 """Test splitting with page numbers included."""
+import os
+
 from langchain.document_loaders import PagedPDFSplitter
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
-import os


 def test_pdf_pagesplitter() -> None:
@@ -10,7 +11,7 @@ def test_pdf_pagesplitter() -> None:
-    loader = PagedPDFSplitter(chunk_size=250)
-    script_dir = os.path.dirname(__file__)
-    splits, metadatas = loader.load_and_split(
-        os.path.join(script_dir, "examples/hello.pdf"))
-    assert "pages" in metadatas[0]
-    assert "key" in metadatas[0]
-    assert len(splits) == len(metadatas)
+    script_dir = os.path.dirname(__file__)
+    loader = PagedPDFSplitter(os.path.join(script_dir, "examples/hello.pdf"))
+    docs = loader.load()
+    assert len(docs) > 0
+    assert "page" in docs[0].metadata
+    assert "source" in docs[0].metadata
+    assert docs[0].metadata["page"] == 0
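
Note on downstream chunking: with this change, PagedPDFSplitter returns one Document per page and no longer exposes chunk_size/chunk_overlap. Callers that want the old character-window behaviour can re-chunk the page Documents themselves. The sketch below is illustrative only and is not part of this patch; the chunk_pages helper, the 4000/200 defaults, and the example PDF path are assumptions.

"""Illustrative sketch (not part of the patch): re-chunk page-level Documents."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders import PagedPDFSplitter


def chunk_pages(
    pages: List[Document], chunk_size: int = 4000, chunk_overlap: int = 200
) -> List[Document]:
    """Split each page's text into overlapping character windows.

    Every chunk copies its source page's metadata, so the "page" and
    "source" keys set by the loader survive the split.
    """
    step = max(chunk_size - chunk_overlap, 1)
    chunks: List[Document] = []
    for page in pages:
        text = page.page_content
        for start in range(0, len(text) or 1, step):
            piece = text[start : start + chunk_size]
            if piece:
                chunks.append(
                    Document(page_content=piece, metadata=dict(page.metadata))
                )
    return chunks


if __name__ == "__main__":
    # Assumed example path from the notebook; point this at a real PDF.
    loader = PagedPDFSplitter("example_data/layout-parser-paper.pdf")
    docs = chunk_pages(loader.load(), chunk_size=250, chunk_overlap=50)
    print(len(docs), docs[0].metadata)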