Fix images parser

This commit is contained in:
Philippe Prados 2025-03-26 15:01:16 +01:00
parent 38b50e3277
commit 09c4c1f867

View File

@ -45,27 +45,28 @@ class BaseImageBlobParser(BaseBlobParser):
"""
try:
from PIL import Image as Img
with blob.as_bytes_io() as buf:
if blob.mimetype == "application/x-npy":
try:
img = Img.fromarray(numpy.load(buf))
except EOFError:
return # Ignore too small images
else:
img = Img.open(buf)
content = self._analyze_image(img)
logger.debug("Image text: %s", content.replace("\n", "\\n"))
yield Document(
page_content=content,
metadata={**blob.metadata, **{"source": blob.source}},
)
except ImportError:
raise ImportError(
"`Pillow` package not found, please install it with "
"`pip install Pillow`"
)
with blob.as_bytes_io() as buf:
if blob.mimetype == "application/x-npy":
array = numpy.load(buf)
if array.ndim == 3 and array.shape[2] == 1: # Grayscale image
img = Img.fromarray(numpy.squeeze(array, axis=2), mode="L")
else:
img = Img.fromarray(array)
else:
img = Img.open(buf)
content = self._analyze_image(img)
logger.debug("Image text: %s", content.replace("\n", "\\n"))
yield Document(
page_content=content,
metadata={**blob.metadata, **{"source": blob.source}},
)
class RapidOCRBlobParser(BaseImageBlobParser):
"""Parser for extracting text from images using the RapidOCR library.