community[patch]: move pdf text tests to integration (#18746)

This commit is contained in:
Erick Friis
2024-03-07 10:34:22 -08:00
committed by GitHub
parent 4a7d73b39d
commit 1beb84b061
4 changed files with 34 additions and 9 deletions

View File

@@ -1,4 +1,5 @@
"""Tests for the various PDF parsers."""
from pathlib import Path
from typing import Iterator
@@ -110,3 +111,23 @@ def test_pdfplumber_parser() -> None:
_assert_with_parser(PDFPlumberParser())
_assert_with_duplicate_parser(PDFPlumberParser())
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
def test_extract_images_text_from_pdf_pypdfparser() -> None:
    """Check PyPDFParser when images are extracted and their text recognized.

    Builds the parser with ``extract_images=True`` (image text is recognized
    with rapid ocr) and passes it to the shared ``_assert_with_parser`` helper.
    """
    image_aware_parser = PyPDFParser(extract_images=True)
    _assert_with_parser(image_aware_parser)
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
    """Check PDFMinerParser when images are extracted and their text recognized.

    Builds the parser with ``extract_images=True`` (image text is recognized
    with rapid ocr) and passes it to the shared ``_assert_with_parser`` helper.
    """
    image_aware_parser = PDFMinerParser(extract_images=True)
    _assert_with_parser(image_aware_parser)
def test_extract_images_text_from_pdf_pymupdfparser() -> None:
    """Check PyMuPDFParser when images are extracted and their text recognized.

    Builds the parser with ``extract_images=True`` (image text is recognized
    with rapid ocr) and passes it to the shared ``_assert_with_parser`` helper.
    """
    image_aware_parser = PyMuPDFParser(extract_images=True)
    _assert_with_parser(image_aware_parser)
def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
    """Check PyPDFium2Parser when images are extracted and their text recognized.

    Builds the parser with ``extract_images=True`` (image text is recognized
    with rapid ocr) and passes it to the shared ``_assert_with_parser`` helper.
    """
    image_aware_parser = PyPDFium2Parser(extract_images=True)
    _assert_with_parser(image_aware_parser)