From 5771e561fb06e02b177b7038fc09499b98785a7f Mon Sep 17 00:00:00 2001 From: Philippe PRADOS Date: Tue, 4 Feb 2025 15:24:40 +0100 Subject: [PATCH] [Bugfix langchain_community] Fix PyMuPDFLoader (#29550) - **Description:** add legacy properties - **Issue:** #29470 - **Twitter handle:** pprados --- .../document_loaders/parsers/pdf.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index e6a6338f83d..124a153388f 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -836,9 +836,9 @@ class PyMuPDFParser(BaseBlobParser): Returns: dict: The extracted metadata. """ - return _purge_metadata( - dict( - { + metadata = _purge_metadata( + { + **{ "producer": "PyMuPDF", "creator": "PyMuPDF", "creationdate": "", @@ -851,8 +851,12 @@ class PyMuPDFParser(BaseBlobParser): for k in doc.metadata if isinstance(doc.metadata[k], (str, int)) }, - ) + } ) + for k in ("modDate", "creationDate"): + if k in doc.metadata: + metadata[k] = doc.metadata[k] + return metadata def _extract_images_from_page( self, doc: pymupdf.Document, page: pymupdf.Page