Add password to PyPDF loader and parser (#6908)

Add password to PyPDF loader and parser

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
This commit is contained in:
lucasiscovici 2023-06-30 02:35:50 +02:00 committed by GitHub
parent 429f4dbe4d
commit e9950392dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 10 additions and 5 deletions

View File

@ -1,5 +1,5 @@
"""Module contains common parsers for PDFs.""" """Module contains common parsers for PDFs."""
from typing import Any, Iterator, Mapping, Optional from typing import Any, Iterator, Mapping, Optional, Union
from langchain.document_loaders.base import BaseBlobParser from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob from langchain.document_loaders.blob_loaders import Blob
@ -9,12 +9,15 @@ from langchain.schema import Document
class PyPDFParser(BaseBlobParser): class PyPDFParser(BaseBlobParser):
"""Loads a PDF with pypdf and chunks at character level.""" """Loads a PDF with pypdf and chunks at character level."""
def __init__(self, password: Optional[Union[str, bytes]] = None):
self.password = password
def lazy_parse(self, blob: Blob) -> Iterator[Document]: def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob.""" """Lazily parse the blob."""
import pypdf import pypdf
with blob.as_bytes_io() as pdf_file_obj: with blob.as_bytes_io() as pdf_file_obj:
pdf_reader = pypdf.PdfReader(pdf_file_obj) pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
yield from [ yield from [
Document( Document(
page_content=page.extract_text(), page_content=page.extract_text(),

View File

@ -7,7 +7,7 @@ import time
from abc import ABC from abc import ABC
from io import StringIO from io import StringIO
from pathlib import Path from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional from typing import Any, Iterator, List, Mapping, Optional, Union
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@ -100,7 +100,9 @@ class PyPDFLoader(BasePDFLoader):
Loader also stores page numbers in metadatas. Loader also stores page numbers in metadatas.
""" """
def __init__(self, file_path: str) -> None: def __init__(
self, file_path: str, password: Optional[Union[str, bytes]] = None
) -> None:
"""Initialize with file path.""" """Initialize with file path."""
try: try:
import pypdf # noqa:F401 import pypdf # noqa:F401
@ -108,7 +110,7 @@ class PyPDFLoader(BasePDFLoader):
raise ImportError( raise ImportError(
"pypdf package not found, please install it with " "`pip install pypdf`" "pypdf package not found, please install it with " "`pip install pypdf`"
) )
self.parser = PyPDFParser() self.parser = PyPDFParser(password=password)
super().__init__(file_path) super().__init__(file_path)
def load(self) -> List[Document]: def load(self) -> List[Document]: