mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-28 09:28:48 +00:00
Fix test_parser_with_table
This commit is contained in:
parent
898e2a5b51
commit
4f9bcf285b
@ -48,7 +48,10 @@ class BaseImageBlobParser(BaseBlobParser):
|
||||
|
||||
with blob.as_bytes_io() as buf:
|
||||
if blob.mimetype == "application/x-npy":
|
||||
img = Img.fromarray(numpy.load(buf))
|
||||
try:
|
||||
img = Img.fromarray(numpy.load(buf))
|
||||
except EOFError:
|
||||
return # Ignore too small images
|
||||
else:
|
||||
img = Img.open(buf)
|
||||
content = self._analyze_image(img)
|
||||
|
@ -1476,7 +1476,7 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
raise ValueError("mode must be single or page")
|
||||
if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
|
||||
raise ValueError("mode must be csv, markdown or html")
|
||||
if not extract_images and not images_parser:
|
||||
if extract_images and not images_parser:
|
||||
images_parser = RapidOCRBlobParser()
|
||||
self.password = password
|
||||
self.extract_images = extract_images
|
||||
@ -1554,19 +1554,27 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
elif isinstance(content, list): # Table
|
||||
page_text.append(_JOIN_TABLES + self._convert_table(content))
|
||||
else: # Image
|
||||
image_bytes = io.BytesIO()
|
||||
numpy.save(image_bytes, content)
|
||||
blob = Blob.from_data(
|
||||
image_bytes.getvalue(), mime_type="application/x-npy"
|
||||
)
|
||||
text_from_image = next(
|
||||
self.images_parser.lazy_parse(blob) # type: ignore
|
||||
).page_content
|
||||
extras.append(
|
||||
_format_inner_image(
|
||||
blob, text_from_image, self.images_inner_format
|
||||
)
|
||||
)
|
||||
if self.images_parser:
|
||||
try:
|
||||
from PIL import Image as Img
|
||||
Img.fromarray(content) # Check if image is valid
|
||||
image_bytes = io.BytesIO()
|
||||
numpy.save(image_bytes, content)
|
||||
blob = Blob.from_data(
|
||||
image_bytes.getvalue(), mime_type="application/x-npy"
|
||||
)
|
||||
text_from_image = next(
|
||||
self.images_parser.lazy_parse(blob) # type: ignore
|
||||
).page_content
|
||||
extras.append(
|
||||
_format_inner_image(
|
||||
blob, text_from_image, self.images_inner_format
|
||||
)
|
||||
)
|
||||
except TypeError:
|
||||
pass
|
||||
except EOFError:
|
||||
pass
|
||||
|
||||
all_text = _merge_text_and_extras(extras, "".join(page_text).strip())
|
||||
|
||||
|
@ -247,9 +247,6 @@ def test_parser_with_table(
|
||||
mode: str,
|
||||
extract_tables: str,
|
||||
) -> None:
|
||||
|
||||
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
|
||||
|
||||
parser_class = getattr(pdf_parsers, parser_factory)
|
||||
|
||||
parser = parser_class(
|
||||
|
Loading…
Reference in New Issue
Block a user