community[patch]: move pdf text tests to integration (#18746)

2025-09-04 20:46:45 +00:00 · 2024-03-07 10:34:22 -08:00
parent 4a7d73b39d
commit 1beb84b061
4 changed files with 34 additions and 9 deletions
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -1,4 +1,5 @@
 """Tests for the various PDF parsers."""
+
 from pathlib import Path
 from typing import Iterator

@@ -110,3 +111,23 @@ def test_pdfplumber_parser() -> None:
    _assert_with_parser(PDFPlumberParser())
    _assert_with_duplicate_parser(PDFPlumberParser())
    _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
+
+
+def test_extract_images_text_from_pdf_pypdfparser() -> None:
+    """Test image extraction and RapidOCR text recognition with PyPDFParser."""
+    _assert_with_parser(PyPDFParser(extract_images=True))
+
+
+def test_extract_images_text_from_pdf_pdfminerparser() -> None:
+    """Test image extraction and RapidOCR text recognition with PDFMinerParser."""
+    _assert_with_parser(PDFMinerParser(extract_images=True))
+
+
+def test_extract_images_text_from_pdf_pymupdfparser() -> None:
+    """Test image extraction and RapidOCR text recognition with PyMuPDFParser."""
+    _assert_with_parser(PyMuPDFParser(extract_images=True))
+
+
+def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
+    """Test image extraction and RapidOCR text recognition with PyPDFium2Parser."""
+    _assert_with_parser(PyPDFium2Parser(extract_images=True))