Fix review

This commit is contained in:
Philippe Prados 2025-03-13 16:43:48 +01:00
parent dd909d2914
commit 38b50e3277
5 changed files with 96 additions and 83 deletions

View File

@ -454,9 +454,7 @@ class PyPDFParser(BaseBlobParser):
image_bytes = io.BytesIO() image_bytes = io.BytesIO()
Image.fromarray(np_image).save(image_bytes, format="PNG") Image.fromarray(np_image).save(image_bytes, format="PNG")
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png") blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
image_text = next( image_text = next(self.images_parser.lazy_parse(blob)).page_content
self.images_parser.lazy_parse(blob) # type: ignore
).page_content
images.append( images.append(
_format_inner_image(blob, image_text, self.images_inner_format) _format_inner_image(blob, image_text, self.images_inner_format)
) )
@ -751,7 +749,7 @@ class PDFMinerParser(BaseBlobParser):
blob = Blob.from_path(Path(tempdir) / filename) blob = Blob.from_path(Path(tempdir) / filename)
blob.metadata["source"] = "#" blob.metadata["source"] = "#"
image_text = next( image_text = next(
self.images_parser.lazy_parse(blob) # type: ignore self.images_parser.lazy_parse(blob)
).page_content ).page_content
text_io.write( text_io.write(
@ -1104,9 +1102,7 @@ class PyMuPDFParser(BaseBlobParser):
blob = Blob.from_data( blob = Blob.from_data(
image_bytes.getvalue(), mime_type="application/x-npy" image_bytes.getvalue(), mime_type="application/x-npy"
) )
image_text = next( image_text = next(self.images_parser.lazy_parse(blob)).page_content
self.images_parser.lazy_parse(blob) # type: ignore
).page_content
images.append( images.append(
_format_inner_image(blob, image_text, self.images_inner_format) _format_inner_image(blob, image_text, self.images_inner_format)
@ -1196,6 +1192,8 @@ class PyPDFium2Parser(BaseBlobParser):
# password=None, # password=None,
mode="page", mode="page",
pages_delimiter="\n\f", pages_delimiter="\n\f",
# extract_images = True,
# images_to_text = convert_images_to_text_with_tesseract(),
) )
Lazily parse the blob: Lazily parse the blob:
@ -1365,9 +1363,7 @@ class PyPDFium2Parser(BaseBlobParser):
continue continue
numpy.save(image_bytes, image.get_bitmap().to_numpy()) numpy.save(image_bytes, image.get_bitmap().to_numpy())
blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy") blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy")
text_from_image = next( text_from_image = next(self.images_parser.lazy_parse(blob)).page_content
self.images_parser.lazy_parse(blob) # type: ignore
).page_content
str_images.append( str_images.append(
_format_inner_image(blob, text_from_image, self.images_inner_format) _format_inner_image(blob, text_from_image, self.images_inner_format)
) )
@ -1410,6 +1406,7 @@ class PDFPlumberParser(BaseBlobParser):
mode = "single", mode = "single",
pages_delimiter = "\n\f", pages_delimiter = "\n\f",
# extract_tables="markdown", # extract_tables="markdown",
metadata_format="standard",
) )
Lazily parse the blob: Lazily parse the blob:
@ -1438,6 +1435,7 @@ class PDFPlumberParser(BaseBlobParser):
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
extract_tables_settings: Optional[dict[str, Any]] = None, extract_tables_settings: Optional[dict[str, Any]] = None,
metadata_format: Literal["legacy", "standard"] = "legacy",
) -> None: ) -> None:
"""Initialize the parser. """Initialize the parser.
@ -1461,6 +1459,8 @@ class PDFPlumberParser(BaseBlobParser):
dedupe: Avoiding the error of duplicate characters if `dedupe=True` dedupe: Avoiding the error of duplicate characters if `dedupe=True`
extract_tables_settings: Optional dictionary of settings for customizing extract_tables_settings: Optional dictionary of settings for customizing
table extraction. table extraction.
metadata_format: Use CamelCase keys with 'legacy'
and lower keys with 'standard'.
Returns: Returns:
This method does not directly return data. Use the `parse` or `lazy_parse` This method does not directly return data. Use the `parse` or `lazy_parse`
@ -1492,6 +1492,19 @@ class PDFPlumberParser(BaseBlobParser):
"snap_y_tolerance": 5, "snap_y_tolerance": 5,
"intersection_x_tolerance": 15, "intersection_x_tolerance": 15,
} }
if metadata_format == "legacy":
warnings.warn(
"The default value 'legacy' uses some CamelCase keys. "
"It will be deprecated in the next major version."
)
self.metadata_format = metadata_format
def _validate_metadata(self, metadata: dict[str, Any]) -> dict[str, Any]:
if self.metadata_format == "legacy":
return metadata
else:
return _validate_metadata(metadata)
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob. """Lazily parse the blob.
@ -1520,19 +1533,8 @@ class PDFPlumberParser(BaseBlobParser):
contents = [] contents = []
# The legacy version, use CreationDate, Creator, etc. # The legacy version, use CreationDate, Creator, etc.
# The new 'standard' version must use lower case key. # The new 'standard' version must use lower case key.
# This next line, merge the legacy keys and standard keys if self.metadata_format == "legacy":
# in the same dictionary.
# - The CreationDate is duplicate to `creationdate` with iso format.
# - The Creator is duplicate to 'creator', etc.
# With this strategy, the legacy code can continue to use CreationDate
# or Creator. The new code, can use `creationdate` or `creator`.
# _purge_metadata() convert and normalize the name and format of
# the metadatas.
doc_metadata = ( doc_metadata = (
doc.metadata # Legacy metadata with...
| _purge_metadata(
(
doc.metadata # Add parser metadata doc.metadata # Add parser metadata
| { # with more keys | { # with more keys
"source": blob.source, "source": blob.source,
@ -1540,6 +1542,15 @@ class PDFPlumberParser(BaseBlobParser):
"total_pages": len(doc.pages), "total_pages": len(doc.pages),
} }
) )
else:
doc_metadata = _purge_metadata(
(
doc.metadata # Add parser metadata
| { # with more keys
"source": blob.source,
"file_path": blob.source,
"total_pages": len(doc.pages),
}
) )
) )
@ -1596,7 +1607,7 @@ class PDFPlumberParser(BaseBlobParser):
all_text += "\n" all_text += "\n"
yield Document( yield Document(
page_content=all_text, page_content=all_text,
metadata=_validate_metadata( metadata=self._validate_metadata(
doc_metadata doc_metadata
| { | {
"page": page.page_number - 1, "page": page.page_number - 1,
@ -1608,7 +1619,7 @@ class PDFPlumberParser(BaseBlobParser):
if self.mode == "single": if self.mode == "single":
yield Document( yield Document(
page_content=self.pages_delimiter.join(contents), page_content=self.pages_delimiter.join(contents),
metadata=_validate_metadata(doc_metadata), metadata=self._validate_metadata(doc_metadata),
) )
def _process_page_content(self, page: pdfplumber.page.Page) -> str: def _process_page_content(self, page: pdfplumber.page.Page) -> str:

View File

@ -1044,6 +1044,7 @@ class PDFPlumberLoader(BasePDFLoader):
# extract_tables_settings = None, # extract_tables_settings = None,
# text_kwargs = {"use_text_flow": False, "keep_blank_chars": False}, # text_kwargs = {"use_text_flow": False, "keep_blank_chars": False},
# dedupe = False, # dedupe = False,
metadata_format="standard",
) )
Lazy load documents: Lazy load documents:
@ -1082,6 +1083,7 @@ class PDFPlumberLoader(BasePDFLoader):
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, extract_tables: Optional[Literal["csv", "markdown", "html"]] = None,
extract_tables_settings: Optional[dict[str, Any]] = None, extract_tables_settings: Optional[dict[str, Any]] = None,
metadata_format: Literal["legacy", "standard"] = "legacy",
) -> None: ) -> None:
"""Initialize with a file path. """Initialize with a file path.
@ -1108,6 +1110,8 @@ class PDFPlumberLoader(BasePDFLoader):
table extraction. table extraction.
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
dedupe: Avoiding the error of duplicate characters if `dedupe=True` dedupe: Avoiding the error of duplicate characters if `dedupe=True`
metadata_format: Use CamelCase keys with 'legacy'
and lower keys with 'standard'.
Returns: Returns:
This method does not directly return data. Use the `load`, `lazy_load`, This method does not directly return data. Use the `load`, `lazy_load`,
@ -1129,6 +1133,7 @@ class PDFPlumberLoader(BasePDFLoader):
text_kwargs=text_kwargs, text_kwargs=text_kwargs,
extract_tables_settings=extract_tables_settings, extract_tables_settings=extract_tables_settings,
dedupe=dedupe, dedupe=dedupe,
metadata_format=metadata_format,
) )
def lazy_load( def lazy_load(

View File

@ -2,19 +2,18 @@
import re import re
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Iterator from typing import TYPE_CHECKING, Iterator, Type
import pytest import pytest
import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import ( from langchain_community.document_loaders.parsers import (
BaseImageBlobParser, BaseImageBlobParser,
PDFPlumberParser,
) )
from langchain_community.document_loaders.parsers.pdf import ( from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser, PDFMinerParser,
PDFPlumberParser,
PyMuPDFParser, PyMuPDFParser,
PyPDFium2Parser, PyPDFium2Parser,
PyPDFParser, PyPDFParser,
@ -114,7 +113,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
"parser_class,params", "parser_class,params",
[ [
(PDFMinerParser, {}), (PDFMinerParser, {}),
(PDFPlumberParser, {}), (PDFPlumberParser, {"metadata_format": "standard"}),
(PyMuPDFParser, {}), (PyMuPDFParser, {}),
(PyPDFium2Parser, {}), (PyPDFium2Parser, {}),
(PyPDFParser, {"extraction_mode": "plain"}), (PyPDFParser, {"extraction_mode": "plain"}),
@ -145,7 +144,7 @@ def test_mode_and_extract_images_variations(
"parser_class,params", "parser_class,params",
[ [
(PDFMinerParser, {}), (PDFMinerParser, {}),
(PDFPlumberParser, {}), (PDFPlumberParser, {"metadata_format": "standard"}),
(PyMuPDFParser, {}), (PyMuPDFParser, {}),
(PyPDFium2Parser, {}), (PyPDFium2Parser, {}),
(PyPDFParser, {"extraction_mode": "plain"}), (PyPDFParser, {"extraction_mode": "plain"}),
@ -254,8 +253,6 @@ def test_parser_with_table(
mode: str, mode: str,
extract_tables: str, extract_tables: str,
) -> None: ) -> None:
parser_class = getattr(pdf_parsers, parser_factory)
from PIL.Image import Image from PIL.Image import Image
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser from langchain_community.document_loaders.parsers.images import BaseImageBlobParser

View File

@ -1,15 +1,15 @@
import os import os
from pathlib import Path from pathlib import Path
from typing import Sequence, Union from typing import Sequence, Type, Union
import pytest import pytest
import langchain_community.document_loaders as pdf_loaders from langchain_community.document_loaders.pdf import (
from langchain_community.document_loaders import (
AmazonTextractPDFLoader, AmazonTextractPDFLoader,
MathpixPDFLoader, MathpixPDFLoader,
PDFMinerLoader, PDFMinerLoader,
PDFMinerPDFasHTMLLoader, PDFMinerPDFasHTMLLoader,
PDFPlumberLoader,
PyMuPDFLoader, PyMuPDFLoader,
PyPDFium2Loader, PyPDFium2Loader,
PyPDFLoader, PyPDFLoader,
@ -171,7 +171,7 @@ def test_amazontextract_loader_failures() -> None:
"loader_class,params", "loader_class,params",
[ [
(PDFMinerLoader, {}), (PDFMinerLoader, {}),
(PDFPlumberLoader, {}), (PDFPlumberLoader, {"metadata_format": "standard"}),
(PyMuPDFLoader, {}), (PyMuPDFLoader, {}),
(PyPDFium2Loader, {}), (PyPDFium2Loader, {}),
(PyPDFLoader, {}), (PyPDFLoader, {}),
@ -181,7 +181,6 @@ def test_standard_parameters(
loader_class: Type, loader_class: Type,
params: dict, params: dict,
) -> None: ) -> None:
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = loader_class(file_path) loader = loader_class(file_path)
docs = loader.load() docs = loader.load()

View File

@ -2,15 +2,15 @@
import importlib import importlib
from pathlib import Path from pathlib import Path
from typing import Any, Iterator from typing import Any, Iterator, Type
import pytest import pytest
import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import ( from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser, PDFMinerParser,
PDFPlumberParser,
PyMuPDFParser, PyMuPDFParser,
PyPDFium2Parser, PyPDFium2Parser,
PyPDFParser, PyPDFParser,
@ -78,24 +78,25 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_class,require,params", "parser_class,require,ctr_params,params",
[ [
(PDFMinerParser, "pdfminer", {"splits_by_page": False}), (PDFMinerParser, "pdfminer", {}, {"splits_by_page": False}),
(PDFPlumberParser, "pdfplumber", {}), (PDFPlumberParser, "pdfplumber", {"metadata_format": "standard"}, {}),
(PyMuPDFParser, "pymupdf", {}), (PyMuPDFParser, "pymupdf", {}, {}),
(PyPDFParser, "pypdf", {}), (PyPDFParser, "pypdf", {}, {}),
(PyPDFium2Parser, "pypdfium2", {}), (PyPDFium2Parser, "pypdfium2", {}, {}),
], ],
) )
def test_parsers( def test_parsers(
parser_class: Type, parser_class: Type,
require: str, require: str,
ctr_params: dict[str, Any],
params: dict[str, Any], params: dict[str, Any],
) -> None: ) -> None:
try: try:
require = require.replace("-", "") require = require.replace("-", "")
importlib.import_module(require, package=None) importlib.import_module(require, package=None)
parser = parser_class() parser = parser_class(**ctr_params)
_assert_with_parser(parser, **params) _assert_with_parser(parser, **params)
except ModuleNotFoundError: except ModuleNotFoundError:
pytest.skip(f"{parser_class} skipped. Require '{require}'") pytest.skip(f"{parser_class} skipped. Require '{require}'")