mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-10 15:06:18 +00:00
Update pdf.py comment for PyPDFLoader (#10495)
Description: To my understanding, PyPDF does not chunk at the character level; instead, it breaks up content by page. This commit fixes up the comments (docstrings) accordingly.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
4236ae3851
commit
203258b4d6
@ -13,7 +13,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFParser(BaseBlobParser):
|
class PyPDFParser(BaseBlobParser):
|
||||||
"""Load `PDF` using `pypdf` and chunk at character level."""
|
"""Load `PDF` using `pypdf`"""
|
||||||
|
|
||||||
def __init__(self, password: Optional[Union[str, bytes]] = None):
|
def __init__(self, password: Optional[Union[str, bytes]] = None):
|
||||||
self.password = password
|
self.password = password
|
||||||
|
@ -135,9 +135,9 @@ class OnlinePDFLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFLoader(BasePDFLoader):
|
class PyPDFLoader(BasePDFLoader):
|
||||||
"""Load `PDF using `pypdf` and chunks at character level.
|
"""Load PDF using pypdf into list of documents.
|
||||||
|
|
||||||
Loader also stores page numbers in metadata.
|
Loader chunks by page and stores page numbers in metadata.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
Loading…
Reference in New Issue
Block a user