Fix test_parser_with_table

This commit is contained in:
Philippe Prados 2025-02-26 14:07:36 +01:00
parent 898e2a5b51
commit 4f9bcf285b
3 changed files with 26 additions and 18 deletions

View File

@@ -48,7 +48,10 @@ class BaseImageBlobParser(BaseBlobParser):
with blob.as_bytes_io() as buf:
if blob.mimetype == "application/x-npy":
try:
img = Img.fromarray(numpy.load(buf))
except EOFError:
return # Ignore too small images
else:
img = Img.open(buf)
content = self._analyze_image(img)

View File

@@ -1476,7 +1476,7 @@ class PDFPlumberParser(BaseBlobParser):
raise ValueError("mode must be single or page")
if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
raise ValueError("mode must be csv, markdown or html")
if not extract_images and not images_parser:
if extract_images and not images_parser:
images_parser = RapidOCRBlobParser()
self.password = password
self.extract_images = extract_images
@@ -1554,6 +1554,10 @@ class PDFPlumberParser(BaseBlobParser):
elif isinstance(content, list): # Table
page_text.append(_JOIN_TABLES + self._convert_table(content))
else: # Image
if self.images_parser:
try:
from PIL import Image as Img
Img.fromarray(content) # Check if image is valid
image_bytes = io.BytesIO()
numpy.save(image_bytes, content)
blob = Blob.from_data(
@@ -1567,6 +1571,10 @@ class PDFPlumberParser(BaseBlobParser):
blob, text_from_image, self.images_inner_format
)
)
except TypeError:
pass
except EOFError:
pass
all_text = _merge_text_and_extras(extras, "".join(page_text).strip())

View File

@@ -247,9 +247,6 @@ def test_parser_with_table(
mode: str,
extract_tables: str,
) -> None:
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
parser_class = getattr(pdf_parsers, parser_factory)
parser = parser_class(