diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index e1a23db35fc..a5e73bd4014 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -14,8 +14,8 @@ on: jobs: codspeed: name: Run benchmarks - if: (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'run-benchmarks')) - runs-on: codspeed-macro + if: (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'run-codspeed-benchmarks')) || github.event_name == 'workflow_dispatch' || github.event_name == 'push' + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -41,3 +41,4 @@ jobs: run: | cd libs/core uv run --no-sync pytest ./tests/benchmarks --codspeed + mode: walltime diff --git a/docs/docs/how_to/tools_human.ipynb b/docs/docs/how_to/tools_human.ipynb index 9b61d1d6a69..7a827f057e0 100644 --- a/docs/docs/how_to/tools_human.ipynb +++ b/docs/docs/how_to/tools_human.ipynb @@ -15,7 +15,7 @@ "\n", "To build a production application, you will need to do more work to keep track of application state appropriately.\n", "\n", - "We recommend using `langgraph` for powering such a capability. For more details, please see this [guide](https://langchain-ai.github.io/langgraph/how-tos/human-in-the-loop/).\n", + "We recommend using `langgraph` for powering such a capability. For more details, please see this [guide](https://langchain-ai.github.io/langgraph/concepts/human_in_the_loop/).\n", ":::\n" ] }, @@ -209,7 +209,7 @@ "metadata": {}, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ "Do you approve of the following tool invocations\n", @@ -252,7 +252,7 @@ "metadata": {}, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ "Do you approve of the following tool invocations\n", diff --git a/libs/core/langchain_core/language_models/chat_models.py b/libs/core/langchain_core/language_models/chat_models.py index a62e06f460a..93fdc325898 100644 --- a/libs/core/langchain_core/language_models/chat_models.py +++ b/libs/core/langchain_core/language_models/chat_models.py @@ -53,6 +53,8 @@ from langchain_core.messages import ( BaseMessageChunk, HumanMessage, convert_to_messages, + convert_to_openai_image_block, + is_data_content_block, message_chunk_to_message, ) from langchain_core.outputs import ( @@ -103,6 +105,41 @@ def _generate_response_from_error(error: BaseException) -> list[ChatGeneration]: return generations +def _format_for_tracing(messages: list[BaseMessage]) -> list[BaseMessage]: + """Format messages for tracing in on_chat_model_start. + + For backward compatibility, we update image content blocks to OpenAI Chat + Completions format. + + Args: + messages: List of messages to format. + + Returns: + List of messages formatted for tracing. 
+ """ + messages_to_trace = [] + for message in messages: + message_to_trace = message + if isinstance(message.content, list): + for idx, block in enumerate(message.content): + if ( + isinstance(block, dict) + and block.get("type") == "image" + and is_data_content_block(block) + ): + if message_to_trace is message: + message_to_trace = message.model_copy() + # Also shallow-copy content + message_to_trace.content = list(message_to_trace.content) + + message_to_trace.content[idx] = ( # type: ignore[index] # mypy confused by .model_copy + convert_to_openai_image_block(block) + ) + messages_to_trace.append(message_to_trace) + + return messages_to_trace + + def generate_from_stream(stream: Iterator[ChatGenerationChunk]) -> ChatResult: """Generate from a stream. @@ -439,7 +476,7 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC): ) (run_manager,) = callback_manager.on_chat_model_start( self._serialized, - [messages], + [_format_for_tracing(messages)], invocation_params=params, options=options, name=config.get("run_name"), @@ -524,7 +561,7 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC): ) (run_manager,) = await callback_manager.on_chat_model_start( self._serialized, - [messages], + [_format_for_tracing(messages)], invocation_params=params, options=options, name=config.get("run_name"), @@ -703,9 +740,12 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC): inheritable_metadata, self.metadata, ) + messages_to_trace = [ + _format_for_tracing(message_list) for message_list in messages + ] run_managers = callback_manager.on_chat_model_start( self._serialized, - messages, + messages_to_trace, invocation_params=params, options=options, name=run_name, @@ -812,9 +852,12 @@ class BaseChatModel(BaseLanguageModel[BaseMessage], ABC): self.metadata, ) + messages_to_trace = [ + _format_for_tracing(message_list) for message_list in messages + ] run_managers = await callback_manager.on_chat_model_start( self._serialized, - messages, + messages_to_trace, invocation_params=params, options=options, name=run_name, diff --git a/libs/core/langchain_core/messages/__init__.py b/libs/core/langchain_core/messages/__init__.py index 4c28efd71db..a7f8db60f62 100644 --- a/libs/core/langchain_core/messages/__init__.py +++ b/libs/core/langchain_core/messages/__init__.py @@ -31,6 +31,10 @@ if TYPE_CHECKING: messages_to_dict, ) from langchain_core.messages.chat import ChatMessage, ChatMessageChunk + from langchain_core.messages.content_blocks import ( + convert_to_openai_image_block, + is_data_content_block, + ) from langchain_core.messages.function import FunctionMessage, FunctionMessageChunk from langchain_core.messages.human import HumanMessage, HumanMessageChunk from langchain_core.messages.modifier import RemoveMessage @@ -78,8 +82,10 @@ __all__ = [ "ToolMessageChunk", "RemoveMessage", "_message_from_dict", + "convert_to_openai_image_block", "convert_to_messages", "get_buffer_string", + "is_data_content_block", "merge_content", "message_chunk_to_message", "message_to_dict", @@ -117,9 +123,11 @@ _dynamic_imports = { "MessageLikeRepresentation": "utils", "_message_from_dict": "utils", "convert_to_messages": "utils", + "convert_to_openai_image_block": "content_blocks", "convert_to_openai_messages": "utils", "filter_messages": "utils", "get_buffer_string": "utils", + "is_data_content_block": "content_blocks", "merge_message_runs": "utils", "message_chunk_to_message": "utils", "messages_from_dict": "utils", diff --git a/libs/core/langchain_core/messages/content_blocks.py 
b/libs/core/langchain_core/messages/content_blocks.py new file mode 100644 index 00000000000..889e25b421c --- /dev/null +++ b/libs/core/langchain_core/messages/content_blocks.py @@ -0,0 +1,112 @@ +"""Types for content blocks.""" + +from typing import Any, Literal, Union + +from pydantic import TypeAdapter, ValidationError +from typing_extensions import NotRequired, TypedDict + + +class BaseDataContentBlock(TypedDict): + """Base class for data content blocks.""" + + mime_type: NotRequired[str] + """MIME type of the content block (if needed).""" + metadata: NotRequired[dict] + """Provider-specific metadata such as citations or filenames.""" + + +class URLContentBlock(BaseDataContentBlock): + """Content block for data from a URL.""" + + type: Literal["image", "audio", "file"] + """Type of the content block.""" + source_type: Literal["url"] + """Source type (url).""" + url: str + """URL for data.""" + + +class Base64ContentBlock(BaseDataContentBlock): + """Content block for inline data from a base64 string.""" + + type: Literal["image", "audio", "file"] + """Type of the content block.""" + source_type: Literal["base64"] + """Source type (base64).""" + data: str + """Data as a base64 string.""" + + +class PlainTextContentBlock(BaseDataContentBlock): + """Content block for plain text data (e.g., from a document).""" + + type: Literal["file"] + """Type of the content block.""" + source_type: Literal["text"] + """Source type (text).""" + text: str + """Text data.""" + + +class IDContentBlock(TypedDict): + """Content block for data specified by an identifier.""" + + type: Literal["image", "audio", "file"] + """Type of the content block.""" + source_type: Literal["id"] + """Source type (id).""" + id: str + """Identifier for data source.""" + + +DataContentBlock = Union[ + URLContentBlock, + Base64ContentBlock, + PlainTextContentBlock, + IDContentBlock, +] + +_DataContentBlockAdapter: TypeAdapter[DataContentBlock] = TypeAdapter(DataContentBlock) + + +def is_data_content_block( + content_block: dict, +) -> bool: + """Check if the content block is a standard data content block. + + Args: + content_block: The content block to check. + + Returns: + True if the content block is a data content block, False otherwise. + """ + try: + _ = _DataContentBlockAdapter.validate_python(content_block) + except ValidationError: + return False + else: + return True + + +def convert_to_openai_image_block(content_block: dict[str, Any]) -> dict: + """Convert image content block to format expected by OpenAI Chat Completions API.""" + if content_block["source_type"] == "url": + return { + "type": "image_url", + "image_url": { + "url": content_block["url"], + }, + } + if content_block["source_type"] == "base64": + if "mime_type" not in content_block: + error_message = "mime_type key is required for base64 data." + raise ValueError(error_message) + mime_type = content_block["mime_type"] + return { + "type": "image_url", + "image_url": { + "url": f"data:{mime_type};base64,{content_block['data']}", + }, + } + error_message = "Unsupported source type. Only 'url' and 'base64' are supported." 
+ raise ValueError(error_message) diff --git a/libs/core/tests/benchmarks/test_async_callbacks.py b/libs/core/tests/benchmarks/test_async_callbacks.py index 508934a64b1..5cb58f0210e 100644 --- a/libs/core/tests/benchmarks/test_async_callbacks.py +++ b/libs/core/tests/benchmarks/test_async_callbacks.py @@ -46,7 +46,7 @@ class MyCustomAsyncHandler(AsyncCallbackHandler): @pytest.mark.benchmark async def test_async_callbacks_in_sync(benchmark: BenchmarkFixture) -> None: - infinite_cycle = cycle([AIMessage(content=" ".join(["hello", "goodbye"] * 500))]) + infinite_cycle = cycle([AIMessage(content=" ".join(["hello", "goodbye"] * 5))]) model = GenericFakeChatModel(messages=infinite_cycle) @benchmark # type: ignore[misc] diff --git a/libs/core/tests/unit_tests/language_models/chat_models/test_base.py b/libs/core/tests/unit_tests/language_models/chat_models/test_base.py index 04fd163726e..eb1a5960542 100644 --- a/libs/core/tests/unit_tests/language_models/chat_models/test_base.py +++ b/libs/core/tests/unit_tests/language_models/chat_models/test_base.py @@ -8,7 +8,11 @@ import pytest from typing_extensions import override from langchain_core.callbacks import CallbackManagerForLLMRun -from langchain_core.language_models import BaseChatModel, FakeListChatModel +from langchain_core.language_models import ( + BaseChatModel, + FakeListChatModel, + ParrotFakeChatModel, +) from langchain_core.language_models.fake_chat_models import FakeListChatModelError from langchain_core.messages import ( AIMessage, @@ -396,3 +400,58 @@ async def test_disable_streaming_no_streaming_model_async( async for c in model.astream([], tools=[{}]): assert c.content == "invoke" break + + +class FakeChatModelStartTracer(FakeTracer): + def __init__(self) -> None: + super().__init__() + self.messages: list = [] + + def on_chat_model_start(self, *args: Any, **kwargs: Any) -> Run: + _, messages = args + self.messages.append(messages) + return super().on_chat_model_start( + *args, + **kwargs, + ) + + +def test_trace_images_in_openai_format() -> None: + """Test that images are traced in OpenAI format.""" + llm = ParrotFakeChatModel() + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "source_type": "url", + "url": "https://example.com/image.png", + } + ], + } + ] + tracer = FakeChatModelStartTracer() + response = llm.invoke(messages, config={"callbacks": [tracer]}) + assert tracer.messages == [ + [ + [ + HumanMessage( + content=[ + { + "type": "image_url", + "image_url": {"url": "https://example.com/image.png"}, + } + ] + ) + ] + ] + ] + # Test no mutation + assert response.content == [ + { + "type": "image", + "source_type": "url", + "url": "https://example.com/image.png", + } + ] diff --git a/libs/core/tests/unit_tests/messages/test_imports.py b/libs/core/tests/unit_tests/messages/test_imports.py index 18d8cd5243f..1b97e647e01 100644 --- a/libs/core/tests/unit_tests/messages/test_imports.py +++ b/libs/core/tests/unit_tests/messages/test_imports.py @@ -24,6 +24,7 @@ EXPECTED_ALL = [ "RemoveMessage", "convert_to_messages", "get_buffer_string", + "is_data_content_block", "merge_content", "message_chunk_to_message", "message_to_dict", @@ -32,6 +33,7 @@ EXPECTED_ALL = [ "filter_messages", "merge_message_runs", "trim_messages", + "convert_to_openai_image_block", "convert_to_openai_messages", ] diff --git a/libs/core/tests/unit_tests/test_messages.py b/libs/core/tests/unit_tests/test_messages.py index ecd940ec499..5ba3fc8ac57 100644 --- a/libs/core/tests/unit_tests/test_messages.py +++ 
b/libs/core/tests/unit_tests/test_messages.py @@ -21,7 +21,9 @@ from langchain_core.messages import ( SystemMessage, ToolMessage, convert_to_messages, + convert_to_openai_image_block, get_buffer_string, + is_data_content_block, merge_content, message_chunk_to_message, message_to_dict, @@ -1087,3 +1089,86 @@ def test_message_text() -> None: ).text() == "" ) + + +def test_is_data_content_block() -> None: + assert is_data_content_block( + { + "type": "image", + "source_type": "url", + "url": "https://...", + } + ) + assert is_data_content_block( + { + "type": "image", + "source_type": "base64", + "data": "", + "mime_type": "image/jpeg", + } + ) + assert is_data_content_block( + { + "type": "image", + "source_type": "base64", + "data": "", + "mime_type": "image/jpeg", + "metadata": {"cache_control": {"type": "ephemeral"}}, + } + ) + + assert not is_data_content_block( + { + "type": "text", + "text": "foo", + } + ) + assert not is_data_content_block( + { + "type": "image_url", + "image_url": {"url": "https://..."}, + } + ) + assert not is_data_content_block( + { + "type": "image", + "source_type": "base64", + } + ) + assert not is_data_content_block( + { + "type": "image", + "source": "", + } + ) + + +def test_convert_to_openai_image_block() -> None: + input_block = { + "type": "image", + "source_type": "url", + "url": "https://...", + "metadata": {"cache_control": {"type": "ephemeral"}}, + } + expected = { + "type": "image_url", + "image_url": {"url": "https://..."}, + } + result = convert_to_openai_image_block(input_block) + assert result == expected + + input_block = { + "type": "image", + "source_type": "base64", + "data": "", + "mime_type": "image/jpeg", + "metadata": {"cache_control": {"type": "ephemeral"}}, + } + expected = { + "type": "image_url", + "image_url": { + "url": "data:image/jpeg;base64,", + }, + } + result = convert_to_openai_image_block(input_block) + assert result == expected diff --git a/libs/partners/anthropic/langchain_anthropic/chat_models.py b/libs/partners/anthropic/langchain_anthropic/chat_models.py index d998f94c5da..16d13a9cb6d 100644 --- a/libs/partners/anthropic/langchain_anthropic/chat_models.py +++ b/libs/partners/anthropic/langchain_anthropic/chat_models.py @@ -35,6 +35,7 @@ from langchain_core.messages import ( SystemMessage, ToolCall, ToolMessage, + is_data_content_block, ) from langchain_core.messages.ai import InputTokenDetails, UsageMetadata from langchain_core.messages.tool import tool_call_chunk as create_tool_call_chunk @@ -177,8 +178,78 @@ def _merge_messages( return merged +def _format_data_content_block(block: dict) -> dict: + """Format standard data content block to format expected by Anthropic.""" + if block["type"] == "image": + if block["source_type"] == "url": + if block["url"].startswith("data:"): + # Data URI + formatted_block = { + "type": "image", + "source": _format_image(block["url"]), + } + else: + formatted_block = { + "type": "image", + "source": {"type": "url", "url": block["url"]}, + } + elif block["source_type"] == "base64": + formatted_block = { + "type": "image", + "source": { + "type": "base64", + "media_type": block["mime_type"], + "data": block["data"], + }, + } + else: + raise ValueError( + "Anthropic only supports 'url' and 'base64' source_type for image " + "content blocks." 
+ ) + + elif block["type"] == "file": + if block["source_type"] == "url": + formatted_block = { + "type": "document", + "source": { + "type": "url", + "url": block["url"], + }, + } + elif block["source_type"] == "base64": + formatted_block = { + "type": "document", + "source": { + "type": "base64", + "media_type": block.get("mime_type") or "application/pdf", + "data": block["data"], + }, + } + elif block["source_type"] == "text": + formatted_block = { + "type": "document", + "source": { + "type": "text", + "media_type": block.get("mime_type") or "text/plain", + "data": block["text"], + }, + } + + else: + raise ValueError(f"Block of type {block['type']} is not supported.") + + if formatted_block and (metadata := block.get("metadata")): + if "cache_control" in metadata: + formatted_block["cache_control"] = metadata["cache_control"] + if "citations" in metadata: + formatted_block["citations"] = metadata["citations"] + + return formatted_block + + def _format_messages( - messages: list[BaseMessage], + messages: Sequence[BaseMessage], ) -> tuple[Union[str, list[dict], None], list[dict]]: """Format messages for anthropic.""" @@ -233,6 +304,8 @@ def _format_messages( # convert format source = _format_image(block["image_url"]["url"]) content.append({"type": "image", "source": source}) + elif is_data_content_block(block): + content.append(_format_data_content_block(block)) elif block["type"] == "tool_use": # If a tool_call with the same id as a tool_use content block # exists, the tool_call is preferred. diff --git a/libs/partners/anthropic/tests/integration_tests/test_standard.py b/libs/partners/anthropic/tests/integration_tests/test_standard.py index 1a11a4850af..b216141d261 100644 --- a/libs/partners/anthropic/tests/integration_tests/test_standard.py +++ b/libs/partners/anthropic/tests/integration_tests/test_standard.py @@ -25,6 +25,14 @@ class TestAnthropicStandard(ChatModelIntegrationTests): def supports_image_inputs(self) -> bool: return True + @property + def supports_image_urls(self) -> bool: + return True + + @property + def supports_pdf_inputs(self) -> bool: + return True + @property def supports_image_tool_message(self) -> bool: return True diff --git a/libs/partners/anthropic/tests/unit_tests/test_chat_models.py b/libs/partners/anthropic/tests/unit_tests/test_chat_models.py index 7281a104cbb..87811a6807b 100644 --- a/libs/partners/anthropic/tests/unit_tests/test_chat_models.py +++ b/libs/partners/anthropic/tests/unit_tests/test_chat_models.py @@ -690,6 +690,85 @@ def test__format_messages_with_cache_control() -> None: assert expected_system == actual_system assert expected_messages == actual_messages + # Test standard multi-modal format + messages = [ + HumanMessage( + [ + { + "type": "text", + "text": "Summarize this document:", + }, + { + "type": "file", + "source_type": "base64", + "mime_type": "application/pdf", + "data": "", + "metadata": {"cache_control": {"type": "ephemeral"}}, + }, + ] + ) + ] + actual_system, actual_messages = _format_messages(messages) + assert actual_system is None + expected_messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Summarize this document:", + }, + { + "type": "document", + "source": { + "type": "base64", + "media_type": "application/pdf", + "data": "", + }, + "cache_control": {"type": "ephemeral"}, + }, + ], + } + ] + assert actual_messages == expected_messages + + +def test__format_messages_with_citations() -> None: + input_messages = [ + HumanMessage( + content=[ + { + "type": "file", + "source_type": "text", + 
"text": "The grass is green. The sky is blue.", + "mime_type": "text/plain", + "metadata": {"citations": {"enabled": True}}, + }, + {"type": "text", "text": "What color is the grass and sky?"}, + ] + ) + ] + expected_messages = [ + { + "role": "user", + "content": [ + { + "type": "document", + "source": { + "type": "text", + "media_type": "text/plain", + "data": "The grass is green. The sky is blue.", + }, + "citations": {"enabled": True}, + }, + {"type": "text", "text": "What color is the grass and sky?"}, + ], + } + ] + actual_system, actual_messages = _format_messages(input_messages) + assert actual_system is None + assert actual_messages == expected_messages + def test__format_messages_with_multiple_system() -> None: messages = [ diff --git a/libs/partners/openai/langchain_openai/chat_models/base.py b/libs/partners/openai/langchain_openai/chat_models/base.py index c3d5fa4bc04..6670a3b7668 100644 --- a/libs/partners/openai/langchain_openai/chat_models/base.py +++ b/libs/partners/openai/langchain_openai/chat_models/base.py @@ -61,6 +61,8 @@ from langchain_core.messages import ( ToolCall, ToolMessage, ToolMessageChunk, + convert_to_openai_image_block, + is_data_content_block, ) from langchain_core.messages.ai import ( InputTokenDetails, @@ -184,6 +186,32 @@ def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage: return ChatMessage(content=_dict.get("content", ""), role=role, id=id_) # type: ignore[arg-type] +def _format_data_content_block(block: dict) -> dict: + """Format standard data content block to format expected by OpenAI.""" + if block["type"] == "image": + formatted_block = convert_to_openai_image_block(block) + + elif block["type"] == "file": + if block["source_type"] == "base64": + file = {"file_data": f"data:{block['mime_type']};base64,{block['data']}"} + if (metadata := block.get("metadata")) and ("filename" in metadata): + file["filename"] = metadata["filename"] + else: + warnings.warn( + "OpenAI may require a filename for file inputs. 
Specify a filename " + "in the metadata: {'type': 'file', 'source_type': 'base64', " + "'mime_type': 'application/pdf', 'data': '...', " + "'metadata': {'filename': 'my-pdf'}}" + ) + formatted_block = {"type": "file", "file": file} + elif block["source_type"] == "id": + formatted_block = {"type": "file", "file": {"file_id": block["id"]}} + else: + raise ValueError(f"Block of type {block['type']} is not supported.") + + return formatted_block + + def _format_message_content(content: Any) -> Any: """Format message content.""" if content and isinstance(content, list): @@ -196,6 +224,8 @@ def _format_message_content(content: Any) -> Any: and block["type"] in ("tool_use", "thinking") ): continue + elif isinstance(block, dict) and is_data_content_block(block): + formatted_content.append(_format_data_content_block(block)) # Anthropic image blocks elif ( isinstance(block, dict) @@ -3122,6 +3152,9 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list: if block["image_url"].get("detail"): new_block["detail"] = block["image_url"]["detail"] new_blocks.append(new_block) + elif block["type"] == "file": + new_block = {"type": "input_file", **block["file"]} + new_blocks.append(new_block) elif block["type"] in ("input_text", "input_image", "input_file"): new_blocks.append(block) else: diff --git a/libs/partners/openai/tests/integration_tests/chat_models/test_azure_standard.py b/libs/partners/openai/tests/integration_tests/chat_models/test_azure_standard.py index 1e7d6f0cdcf..41e6aec4bd5 100644 --- a/libs/partners/openai/tests/integration_tests/chat_models/test_azure_standard.py +++ b/libs/partners/openai/tests/integration_tests/chat_models/test_azure_standard.py @@ -30,6 +30,10 @@ class TestAzureOpenAIStandard(ChatModelIntegrationTests): def supports_image_inputs(self) -> bool: return True + @property + def supports_image_urls(self) -> bool: + return True + @property def supports_json_mode(self) -> bool: return True diff --git a/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py b/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py index e23bbffa7c5..0bbf3ce7a18 100644 --- a/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py +++ b/libs/partners/openai/tests/integration_tests/chat_models/test_base_standard.py @@ -1,10 +1,12 @@ """Standard LangChain interface tests""" +import base64 from pathlib import Path from typing import Literal, cast +import httpx from langchain_core.language_models import BaseChatModel -from langchain_core.messages import AIMessage +from langchain_core.messages import AIMessage, HumanMessage from langchain_tests.integration_tests import ChatModelIntegrationTests from langchain_openai import ChatOpenAI @@ -25,6 +27,10 @@ class TestOpenAIStandard(ChatModelIntegrationTests): def supports_image_inputs(self) -> bool: return True + @property + def supports_image_urls(self) -> bool: + return True + @property def supports_json_mode(self) -> bool: return True @@ -71,6 +77,31 @@ class TestOpenAIStandard(ChatModelIntegrationTests): ) return _invoke(llm, input_, stream) + @property + def supports_pdf_inputs(self) -> bool: + # OpenAI requires a filename for PDF inputs + # For now, we test with filename in OpenAI-specific tests + return False + + def test_openai_pdf_inputs(self, model: BaseChatModel) -> None: + """Test that the model can process PDF inputs.""" + url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" + pdf_data = 
base64.b64encode(httpx.get(url).content).decode("utf-8") + + message = HumanMessage( + [ + {"type": "text", "text": "Summarize this document:"}, + { + "type": "file", + "source_type": "base64", + "mime_type": "application/pdf", + "data": pdf_data, + "metadata": {"filename": "my-pdf"}, # OpenAI requires a filename + }, + ] + ) + _ = model.invoke([message]) + def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage: if stream: diff --git a/libs/partners/openai/tests/unit_tests/chat_models/test_base.py b/libs/partners/openai/tests/unit_tests/chat_models/test_base.py index 5b8d219d923..ded98f4d639 100644 --- a/libs/partners/openai/tests/unit_tests/chat_models/test_base.py +++ b/libs/partners/openai/tests/unit_tests/chat_models/test_base.py @@ -649,6 +649,51 @@ def test_format_message_content() -> None: ] assert [{"type": "text", "text": "hello"}] == _format_message_content(content) + # Standard multi-modal inputs + content = [{"type": "image", "source_type": "url", "url": "https://..."}] + expected = [{"type": "image_url", "image_url": {"url": "https://..."}}] + assert expected == _format_message_content(content) + + content = [ + { + "type": "image", + "source_type": "base64", + "data": "", + "mime_type": "image/png", + } + ] + expected = [ + { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,"}, + } + ] + assert expected == _format_message_content(content) + + content = [ + { + "type": "file", + "source_type": "base64", + "data": "", + "mime_type": "application/pdf", + "metadata": {"filename": "my_file"}, + } + ] + expected = [ + { + "type": "file", + "file": { + "filename": "my_file", + "file_data": "data:application/pdf;base64,", + }, + } + ] + assert expected == _format_message_content(content) + + content = [{"type": "file", "source_type": "id", "id": "file-abc123"}] + expected = [{"type": "file", "file": {"file_id": "file-abc123"}}] + assert expected == _format_message_content(content) + class GenerateUsername(BaseModel): "Get a username based on someone's name and hair color." diff --git a/libs/standard-tests/langchain_tests/integration_tests/chat_models.py b/libs/standard-tests/langchain_tests/integration_tests/chat_models.py index b5e294ffc8d..ae083c5bf36 100644 --- a/libs/standard-tests/langchain_tests/integration_tests/chat_models.py +++ b/libs/standard-tests/langchain_tests/integration_tests/chat_models.py @@ -298,13 +298,21 @@ class ChatModelIntegrationTests(ChatModelTests): .. code-block:: python - [ - {"type": "text", "text": "describe the weather in this image"}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, - }, - ] + { + "type": "image", + "source_type": "base64", + "data": "", + "mime_type": "image/jpeg", # or appropriate mime-type + } + + In addition to OpenAI-style content blocks: + + .. code-block:: python + + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, + } See https://python.langchain.com/docs/concepts/multimodality/ @@ -316,6 +324,59 @@ class ChatModelIntegrationTests(ChatModelTests): def supports_image_inputs(self) -> bool: return True + .. dropdown:: supports_image_urls + + Boolean property indicating whether the chat model supports image inputs from + URLs. Defaults to ``False``. + + If set to ``True``, the chat model will be tested using content blocks of the + form + + .. 
code-block:: python + + { + "type": "image", + "source_type": "url", + "url": "https://...", + } + + See https://python.langchain.com/docs/concepts/multimodality/ + + Example: + + .. code-block:: python + + @property + def supports_image_urls(self) -> bool: + return True + + .. dropdown:: supports_pdf_inputs + + Boolean property indicating whether the chat model supports PDF inputs. + Defaults to ``False``. + + If set to ``True``, the chat model will be tested using content blocks of the + form + + .. code-block:: python + + { + "type": "file", + "source_type": "base64", + "data": "", + "mime_type": "application/pdf", + } + + See https://python.langchain.com/docs/concepts/multimodality/ + + Example: + + .. code-block:: python + + @property + def supports_pdf_inputs(self) -> bool: + return True + .. dropdown:: supports_video_inputs Boolean property indicating whether the chat model supports video inputs. @@ -1891,11 +1952,79 @@ class ChatModelIntegrationTests(ChatModelTests): result = model_with_tools.invoke(messages) assert isinstance(result, AIMessage) + def test_pdf_inputs(self, model: BaseChatModel) -> None: + """Test that the model can process PDF inputs. + + This test should be skipped (see Configuration below) if the model does not + support PDF inputs. These will take the form: + + .. code-block:: python + + { + "type": "file", + "source_type": "base64", + "data": "", + "mime_type": "application/pdf", + } + + See https://python.langchain.com/docs/concepts/multimodality/ + + .. dropdown:: Configuration + + To disable this test, set ``supports_pdf_inputs`` to False in your + test class: + + .. code-block:: python + + class TestMyChatModelIntegration(ChatModelIntegrationTests): + + @property + def supports_pdf_inputs(self) -> bool: + return False + + .. dropdown:: Troubleshooting + + If this test fails, check that the model can correctly handle messages + with PDF content blocks, including base64-encoded files. Otherwise, set + the ``supports_pdf_inputs`` property to False. + """ + if not self.supports_pdf_inputs: + pytest.skip("Model does not support PDF inputs.") + url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" + pdf_data = base64.b64encode(httpx.get(url).content).decode("utf-8") + + message = HumanMessage( + [ + { + "type": "text", + "text": "Summarize this document:", + }, + { + "type": "file", + "source_type": "base64", + "mime_type": "application/pdf", + "data": pdf_data, + }, + ] + ) + _ = model.invoke([message]) + def test_image_inputs(self, model: BaseChatModel) -> None: """Test that the model can process image inputs. This test should be skipped (see Configuration below) if the model does not - support image inputs These will take the form of messages with OpenAI-style + support image inputs. These will take the form: + + .. code-block:: python + + { + "type": "image", + "source_type": "base64", + "data": "", + "mime_type": "image/jpeg", # or appropriate mime-type + } + + For backward-compatibility, we must also support OpenAI-style image content blocks: .. code-block:: python @@ -1910,6 +2039,17 @@ See https://python.langchain.com/docs/concepts/multimodality/ + If the property ``supports_image_urls`` is set to True, the test will also + check that we can process content blocks of the form: + + .. code-block:: python + + { + "type": "image", + "source_type": "url", + "url": "", + } + + ..
dropdown:: Configuration To disable this test, set ``supports_image_inputs`` to False in your test class: .. code-block:: python class TestMyChatModelIntegration(ChatModelIntegrationTests): @property def supports_image_inputs(self) -> bool: return False + # Can also explicitly disable testing image URLs: + @property + def supports_image_urls(self) -> bool: + return False + .. dropdown:: Troubleshooting If this test fails, check that the model can correctly handle messages - with image content blocks in OpenAI format, including base64-encoded - images. Otherwise, set the ``supports_image_inputs`` property to False. + with image content blocks, including base64-encoded images. Otherwise, set + the ``supports_image_inputs`` property to False. """ if not self.supports_image_inputs: pytest.skip("Model does not support image message.") image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" image_data = base64.b64encode(httpx.get(image_url).content).decode("utf-8") + + # OpenAI format, base64 data message = HumanMessage( content=[ {"type": "text", "text": "describe the weather in this image"}, { "type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, }, ], ) - model.invoke([message]) + _ = model.invoke([message]) + + # Standard format, base64 data + message = HumanMessage( + content=[ + {"type": "text", "text": "describe the weather in this image"}, + { + "type": "image", + "source_type": "base64", + "mime_type": "image/jpeg", + "data": image_data, + }, + ], + ) + _ = model.invoke([message]) + + # Standard format, URL + if self.supports_image_urls: + message = HumanMessage( + content=[ + {"type": "text", "text": "describe the weather in this image"}, + { + "type": "image", + "source_type": "url", + "url": image_url, + }, + ], + ) + _ = model.invoke([message]) def test_image_tool_message(self, model: BaseChatModel) -> None: """Test that the model can process ToolMessages with image inputs. diff --git a/libs/standard-tests/langchain_tests/unit_tests/chat_models.py b/libs/standard-tests/langchain_tests/unit_tests/chat_models.py index beec0b98cb1..f3e22d04adf 100644 --- a/libs/standard-tests/langchain_tests/unit_tests/chat_models.py +++ b/libs/standard-tests/langchain_tests/unit_tests/chat_models.py @@ -160,6 +160,17 @@ class ChatModelTests(BaseStandardTests): ``False``.""" return False + @property + def supports_image_urls(self) -> bool: + """(bool) whether the chat model supports image inputs from URLs, defaults to + ``False``.""" + return False + + @property + def supports_pdf_inputs(self) -> bool: + """(bool) whether the chat model supports PDF inputs, defaults to ``False``.""" + return False + @property def supports_video_inputs(self) -> bool: """(bool) whether the chat model supports video inputs, defaults to ``False``. @@ -373,13 +384,21 @@ class ChatModelUnitTests(ChatModelTests): .. code-block:: python - [ - {"type": "text", "text": "describe the weather in this image"}, - { - "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, - }, - ] + { + "type": "image", + "source_type": "base64", + "data": "", + "mime_type": "image/jpeg", # or appropriate mime-type + } + + In addition to OpenAI-style content blocks: + + ..
code-block:: python + + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, + } See https://python.langchain.com/docs/concepts/multimodality/ Example: .. code-block:: python @property def supports_image_inputs(self) -> bool: return True + .. dropdown:: supports_image_urls + + Boolean property indicating whether the chat model supports image inputs from + URLs. Defaults to ``False``. + + If set to ``True``, the chat model will be tested using content blocks of the + form + + .. code-block:: python + + { + "type": "image", + "source_type": "url", + "url": "https://...", + } + + See https://python.langchain.com/docs/concepts/multimodality/ + + Example: + + .. code-block:: python + + @property + def supports_image_urls(self) -> bool: + return True + + .. dropdown:: supports_pdf_inputs + + Boolean property indicating whether the chat model supports PDF inputs. + Defaults to ``False``. + + If set to ``True``, the chat model will be tested using content blocks of the + form + + .. code-block:: python + + { + "type": "file", + "source_type": "base64", + "data": "", + "mime_type": "application/pdf", + } + + See https://python.langchain.com/docs/concepts/multimodality/ + + Example: + + .. code-block:: python + + @property + def supports_pdf_inputs(self) -> bool: + return True + .. dropdown:: supports_video_inputs Boolean property indicating whether the chat model supports video inputs.
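
As a quick illustration of the helpers this diff exports from langchain_core.messages, here is a minimal sketch, assuming a langchain-core build that includes the new content_blocks.py above; the URL and base64 payloads are placeholder values, not part of the diff:

from langchain_core.messages import (
    convert_to_openai_image_block,
    is_data_content_block,
)

# Standard data content blocks, per the TypedDicts in content_blocks.py.
url_block = {
    "type": "image",
    "source_type": "url",
    "url": "https://example.com/image.png",  # placeholder URL
}
base64_block = {
    "type": "image",
    "source_type": "base64",
    "data": "iVBORw0KGgo=",  # placeholder base64 payload
    "mime_type": "image/png",
}

# Both validate against the DataContentBlock union.
assert is_data_content_block(url_block)
assert is_data_content_block(base64_block)
# OpenAI-style blocks are not standard data content blocks.
assert not is_data_content_block({"type": "text", "text": "hello"})

# Conversion to the Chat Completions shape used by _format_for_tracing
# and by langchain-openai's _format_message_content.
assert convert_to_openai_image_block(url_block) == {
    "type": "image_url",
    "image_url": {"url": "https://example.com/image.png"},
}
assert convert_to_openai_image_block(base64_block) == {
    "type": "image_url",
    "image_url": {"url": "data:image/png;base64,iVBORw0KGgo="},
}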
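
Similarly, a sketch of how a single standard PDF block maps to each provider's wire format, mirroring the unit tests above; note that _format_data_content_block is private in both packages, so this is illustrative rather than a supported entry point, and the base64 payload is again a placeholder:

from langchain_anthropic.chat_models import (
    _format_data_content_block as to_anthropic,
)
from langchain_openai.chat_models.base import (
    _format_data_content_block as to_openai,
)

pdf_block = {
    "type": "file",
    "source_type": "base64",
    "mime_type": "application/pdf",
    "data": "JVBERi0=",  # placeholder base64 payload
    "metadata": {"filename": "my-pdf"},
}

# Anthropic receives a "document" block with a base64 source.
assert to_anthropic(pdf_block) == {
    "type": "document",
    "source": {
        "type": "base64",
        "media_type": "application/pdf",
        "data": "JVBERi0=",
    },
}

# OpenAI receives a "file" block with a data URI; the filename is taken
# from metadata, so no missing-filename warning is emitted.
assert to_openai(pdf_block) == {
    "type": "file",
    "file": {
        "file_data": "data:application/pdf;base64,JVBERi0=",
        "filename": "my-pdf",
    },
}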