community[patch]: move pdf text tests to integration (#18746)

This commit is contained in:
Erick Friis 2024-03-07 10:34:22 -08:00 committed by GitHub
parent 4a7d73b39d
commit 1beb84b061
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 34 additions and 9 deletions

View File

@ -1,5 +1,6 @@
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, cast
from langchain_core._api.deprecation import deprecated
from langchain_core.callbacks import (
AsyncCallbackManagerForLLMRun,
CallbackManagerForLLMRun,
@ -70,6 +71,11 @@ def convert_messages_to_prompt_anthropic(
return text.rstrip()
@deprecated(
since="0.0.28",
removal="0.2",
alternative_import="langchain_anthropic.ChatAnthropic",
)
class ChatAnthropic(BaseChatModel, _AnthropicCommon):
"""`Anthropic` chat large language models.

View File

@ -11,6 +11,7 @@ from typing import (
Optional,
)
from langchain_core._api.deprecation import deprecated
from langchain_core.callbacks import (
AsyncCallbackManagerForLLMRun,
CallbackManagerForLLMRun,
@ -147,6 +148,11 @@ class _AnthropicCommon(BaseLanguageModel):
return stop
@deprecated(
since="0.0.28",
removal="0.2",
alternative_import="langchain_anthropic.AnthropicLLM",
)
class Anthropic(LLM, _AnthropicCommon):
"""Anthropic large language models.

View File

@ -1,4 +1,5 @@
"""Tests for the various PDF parsers."""
from pathlib import Path
from typing import Iterator
@ -110,3 +111,23 @@ def test_pdfplumber_parser() -> None:
_assert_with_parser(PDFPlumberParser())
_assert_with_duplicate_parser(PDFPlumberParser())
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
def test_extract_images_text_from_pdf_pypdfparser() -> None:
    """Test PyPDFParser with image extraction enabled.

    Images in the PDF are extracted and their text recognized with RapidOCR;
    the parser is then run through the shared ``_assert_with_parser`` checks.
    """
    ocr_parser = PyPDFParser(extract_images=True)
    _assert_with_parser(ocr_parser)
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
    """Test PDFMinerParser with image extraction enabled.

    Images in the PDF are extracted and their text recognized with RapidOCR;
    the parser is then run through the shared ``_assert_with_parser`` checks.
    """
    ocr_parser = PDFMinerParser(extract_images=True)
    _assert_with_parser(ocr_parser)
def test_extract_images_text_from_pdf_pymupdfparser() -> None:
    """Test PyMuPDFParser with image extraction enabled.

    Images in the PDF are extracted and their text recognized with RapidOCR;
    the parser is then run through the shared ``_assert_with_parser`` checks.
    """
    ocr_parser = PyMuPDFParser(extract_images=True)
    _assert_with_parser(ocr_parser)
def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
    """Test PyPDFium2Parser with image extraction enabled.

    Images in the PDF are extracted and their text recognized with RapidOCR;
    the parser is then run through the shared ``_assert_with_parser`` checks.
    """
    ocr_parser = PyPDFium2Parser(extract_images=True)
    _assert_with_parser(ocr_parser)

View File

@ -1,4 +1,5 @@
"""Tests for the various PDF parsers."""
from pathlib import Path
from typing import Iterator
@ -85,12 +86,3 @@ def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PyPDFium2Parser())
@pytest.mark.requires("rapidocr_onnxruntime")
def test_extract_images_text_from_pdf() -> None:
    """Test image extraction plus RapidOCR text recognition for each PDF parser."""
    # Each parser is built and checked in turn, in the same order as before.
    parser_classes = (PyPDFParser, PDFMinerParser, PyMuPDFParser, PyPDFium2Parser)
    for parser_cls in parser_classes:
        _assert_with_parser(parser_cls(extract_images=True))