mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 09:58:44 +00:00
Fix test_parser_with_table
This commit is contained in:
parent
898e2a5b51
commit
4f9bcf285b
@ -48,7 +48,10 @@ class BaseImageBlobParser(BaseBlobParser):
|
|||||||
|
|
||||||
with blob.as_bytes_io() as buf:
|
with blob.as_bytes_io() as buf:
|
||||||
if blob.mimetype == "application/x-npy":
|
if blob.mimetype == "application/x-npy":
|
||||||
|
try:
|
||||||
img = Img.fromarray(numpy.load(buf))
|
img = Img.fromarray(numpy.load(buf))
|
||||||
|
except EOFError:
|
||||||
|
return # Ignore too small images
|
||||||
else:
|
else:
|
||||||
img = Img.open(buf)
|
img = Img.open(buf)
|
||||||
content = self._analyze_image(img)
|
content = self._analyze_image(img)
|
||||||
|
@ -1476,7 +1476,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
raise ValueError("mode must be single or page")
|
raise ValueError("mode must be single or page")
|
||||||
if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
|
if extract_tables and extract_tables not in ["csv", "markdown", "html"]:
|
||||||
raise ValueError("mode must be csv, markdown or html")
|
raise ValueError("mode must be csv, markdown or html")
|
||||||
if not extract_images and not images_parser:
|
if extract_images and not images_parser:
|
||||||
images_parser = RapidOCRBlobParser()
|
images_parser = RapidOCRBlobParser()
|
||||||
self.password = password
|
self.password = password
|
||||||
self.extract_images = extract_images
|
self.extract_images = extract_images
|
||||||
@ -1554,6 +1554,10 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
elif isinstance(content, list): # Table
|
elif isinstance(content, list): # Table
|
||||||
page_text.append(_JOIN_TABLES + self._convert_table(content))
|
page_text.append(_JOIN_TABLES + self._convert_table(content))
|
||||||
else: # Image
|
else: # Image
|
||||||
|
if self.images_parser:
|
||||||
|
try:
|
||||||
|
from PIL import Image as Img
|
||||||
|
Img.fromarray(content) # Check if image is valid
|
||||||
image_bytes = io.BytesIO()
|
image_bytes = io.BytesIO()
|
||||||
numpy.save(image_bytes, content)
|
numpy.save(image_bytes, content)
|
||||||
blob = Blob.from_data(
|
blob = Blob.from_data(
|
||||||
@ -1567,6 +1571,10 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
blob, text_from_image, self.images_inner_format
|
blob, text_from_image, self.images_inner_format
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
except TypeError:
|
||||||
|
pass
|
||||||
|
except EOFError:
|
||||||
|
pass
|
||||||
|
|
||||||
all_text = _merge_text_and_extras(extras, "".join(page_text).strip())
|
all_text = _merge_text_and_extras(extras, "".join(page_text).strip())
|
||||||
|
|
||||||
|
@ -247,9 +247,6 @@ def test_parser_with_table(
|
|||||||
mode: str,
|
mode: str,
|
||||||
extract_tables: str,
|
extract_tables: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
|
|
||||||
|
|
||||||
parser_class = getattr(pdf_parsers, parser_factory)
|
parser_class = getattr(pdf_parsers, parser_factory)
|
||||||
|
|
||||||
parser = parser_class(
|
parser = parser_class(
|
||||||
|
Loading…
Reference in New Issue
Block a user