Fix test_parser_with_table

This commit is contained in:
Philippe Prados 2025-02-26 14:07:36 +01:00
parent 898e2a5b51
commit 4f9bcf285b
3 changed files with 26 additions and 18 deletions

View File

@@ -48,7 +48,10 @@ class BaseImageBlobParser(BaseBlobParser):
with blob.as_bytes_io() as buf: with blob.as_bytes_io() as buf:
if blob.mimetype == "application/x-npy": if blob.mimetype == "application/x-npy":
img = Img.fromarray(numpy.load(buf)) try:
img = Img.fromarray(numpy.load(buf))
except EOFError:
return # Ignore too small images
else: else:
img = Img.open(buf) img = Img.open(buf)
content = self._analyze_image(img) content = self._analyze_image(img)

View File

@@ -1476,7 +1476,7 @@ class PDFPlumberParser(BaseBlobParser):
raise ValueError("mode must be single or page") raise ValueError("mode must be single or page")
if extract_tables and extract_tables not in ["csv", "markdown", "html"]: if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
raise ValueError("mode must be csv, markdown or html") raise ValueError("mode must be csv, markdown or html")
if not extract_images and not images_parser: if extract_images and not images_parser:
images_parser = RapidOCRBlobParser() images_parser = RapidOCRBlobParser()
self.password = password self.password = password
self.extract_images = extract_images self.extract_images = extract_images
@@ -1554,19 +1554,27 @@ class PDFPlumberParser(BaseBlobParser):
elif isinstance(content, list): # Table elif isinstance(content, list): # Table
page_text.append(_JOIN_TABLES + self._convert_table(content)) page_text.append(_JOIN_TABLES + self._convert_table(content))
else: # Image else: # Image
image_bytes = io.BytesIO() if self.images_parser:
numpy.save(image_bytes, content) try:
blob = Blob.from_data( from PIL import Image as Img
image_bytes.getvalue(), mime_type="application/x-npy" Img.fromarray(content) # Check if image is valid
) image_bytes = io.BytesIO()
text_from_image = next( numpy.save(image_bytes, content)
self.images_parser.lazy_parse(blob) # type: ignore blob = Blob.from_data(
).page_content image_bytes.getvalue(), mime_type="application/x-npy"
extras.append( )
_format_inner_image( text_from_image = next(
blob, text_from_image, self.images_inner_format self.images_parser.lazy_parse(blob) # type: ignore
) ).page_content
) extras.append(
_format_inner_image(
blob, text_from_image, self.images_inner_format
)
)
except TypeError:
pass
except EOFError:
pass
all_text = _merge_text_and_extras(extras, "".join(page_text).strip()) all_text = _merge_text_and_extras(extras, "".join(page_text).strip())

View File

@@ -247,9 +247,6 @@ def test_parser_with_table(
mode: str, mode: str,
extract_tables: str, extract_tables: str,
) -> None: ) -> None:
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
parser_class = getattr(pdf_parsers, parser_factory) parser_class = getattr(pdf_parsers, parser_factory)
parser = parser_class( parser = parser_class(