diff --git a/langchain/document_loaders/parsers/pdf.py b/langchain/document_loaders/parsers/pdf.py
index 3e0283738d5..eb3c0d9d165 100644
--- a/langchain/document_loaders/parsers/pdf.py
+++ b/langchain/document_loaders/parsers/pdf.py
@@ -1,5 +1,5 @@
 """Module contains common parsers for PDFs."""
-from typing import Any, Iterator, Mapping, Optional
+from typing import Any, Iterator, Mapping, Optional, Union
 
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
@@ -9,12 +9,15 @@ from langchain.schema import Document
 class PyPDFParser(BaseBlobParser):
     """Loads a PDF with pypdf and chunks at character level."""
 
+    def __init__(self, password: Optional[Union[str, bytes]] = None):
+        self.password = password
+
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
         import pypdf
 
         with blob.as_bytes_io() as pdf_file_obj:
-            pdf_reader = pypdf.PdfReader(pdf_file_obj)
+            pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
             yield from [
                 Document(
                     page_content=page.extract_text(),
diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
index 1d334e097af..8dc035a44ff 100644
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -7,7 +7,7 @@ import time
 from abc import ABC
 from io import StringIO
 from pathlib import Path
-from typing import Any, Iterator, List, Mapping, Optional
+from typing import Any, Iterator, List, Mapping, Optional, Union
 from urllib.parse import urlparse
 
 import requests
@@ -100,7 +100,9 @@ class PyPDFLoader(BasePDFLoader):
     Loader also stores page numbers in metadatas.
     """
 
-    def __init__(self, file_path: str) -> None:
+    def __init__(
+        self, file_path: str, password: Optional[Union[str, bytes]] = None
+    ) -> None:
         """Initialize with file path."""
         try:
             import pypdf  # noqa:F401
@@ -108,7 +110,7 @@ class PyPDFLoader(BasePDFLoader):
             raise ImportError(
                 "pypdf package not found, please install it with " "`pip install pypdf`"
             )
-        self.parser = PyPDFParser()
+        self.parser = PyPDFParser(password=password)
         super().__init__(file_path)
 
     def load(self) -> List[Document]: