mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 07:09:31 +00:00
Add password to PyPDF loader and parser (#6908)
Add password to PyPDF loader and parser --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
This commit is contained in:
parent
429f4dbe4d
commit
e9950392dd
@ -1,5 +1,5 @@
|
||||
"""Module contains common parsers for PDFs."""
|
||||
from typing import Any, Iterator, Mapping, Optional
|
||||
from typing import Any, Iterator, Mapping, Optional, Union
|
||||
|
||||
from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
@ -9,12 +9,15 @@ from langchain.schema import Document
|
||||
class PyPDFParser(BaseBlobParser):
|
||||
"""Loads a PDF with pypdf and chunks at character level."""
|
||||
|
||||
def __init__(self, password: Optional[Union[str, bytes]] = None):
|
||||
self.password = password
|
||||
|
||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||
"""Lazily parse the blob."""
|
||||
import pypdf
|
||||
|
||||
with blob.as_bytes_io() as pdf_file_obj:
|
||||
pdf_reader = pypdf.PdfReader(pdf_file_obj)
|
||||
pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
|
||||
yield from [
|
||||
Document(
|
||||
page_content=page.extract_text(),
|
||||
|
@ -7,7 +7,7 @@ import time
|
||||
from abc import ABC
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterator, List, Mapping, Optional
|
||||
from typing import Any, Iterator, List, Mapping, Optional, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
@ -100,7 +100,9 @@ class PyPDFLoader(BasePDFLoader):
|
||||
Loader also stores page numbers in metadatas.
|
||||
"""
|
||||
|
||||
def __init__(self, file_path: str) -> None:
|
||||
def __init__(
|
||||
self, file_path: str, password: Optional[Union[str, bytes]] = None
|
||||
) -> None:
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import pypdf # noqa:F401
|
||||
@ -108,7 +110,7 @@ class PyPDFLoader(BasePDFLoader):
|
||||
raise ImportError(
|
||||
"pypdf package not found, please install it with " "`pip install pypdf`"
|
||||
)
|
||||
self.parser = PyPDFParser()
|
||||
self.parser = PyPDFParser(password=password)
|
||||
super().__init__(file_path)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
|
Loading…
Reference in New Issue
Block a user