From 203258b4d694a3e95618472ce639a63be4cbdc15 Mon Sep 17 00:00:00 2001 From: Kyle Pancamo <50267605+KylePancamo@users.noreply.github.com> Date: Thu, 5 Oct 2023 13:22:40 -0500 Subject: [PATCH] Update pdf.py comment for PyPDFLoader (#10495) To my understanding, PyPDF does not chunk at the character level. Description: PyPDF does not chunk at the character level, but instead breaks up content by page. Fix up the comments to match. --------- Co-authored-by: Eugene Yurtsev Co-authored-by: Bagatur --- libs/langchain/langchain/document_loaders/parsers/pdf.py | 2 +- libs/langchain/langchain/document_loaders/pdf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/parsers/pdf.py b/libs/langchain/langchain/document_loaders/parsers/pdf.py index 2ec7a684be6..22701f2c3e3 100644 --- a/libs/langchain/langchain/document_loaders/parsers/pdf.py +++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: class PyPDFParser(BaseBlobParser): - """Load `PDF` using `pypdf` and chunk at character level.""" + """Load `PDF` using `pypdf`""" def __init__(self, password: Optional[Union[str, bytes]] = None): self.password = password diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index dfccf9c4bd2..67743effd98 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -135,9 +135,9 @@ class OnlinePDFLoader(BasePDFLoader): class PyPDFLoader(BasePDFLoader): - """Load `PDF using `pypdf` and chunks at character level. + """Load PDF using pypdf into list of documents. - Loader also stores page numbers in metadata. + Loader chunks by page and stores page numbers in metadata. """ def __init__(