multiple: multi-modal content blocks (#30746)

Introduces standard content block format for images, audio, and files. ## Examples Image from url: ``` { "type": "image", "source_type": "url", "url": "https://path.to.image.png", } ``` Image, in-line data: ``` { "type": "image", "source_type": "base64", "data": "<base64 string>", "mime_type": "image/png", } ``` PDF, in-line data: ``` { "type": "file", "source_type": "base64", "data": "<base64 string>", "mime_type": "application/pdf", } ``` File from ID: ``` { "type": "file", "source_type": "id", "id": "file-abc123", } ``` Plain-text file: ``` { "type": "file", "source_type": "text", "text": "foo bar", } ```
2025-09-23 19:39:58 +00:00 · 2025-04-15 09:48:06 -04:00
parent 09438857e8
commit 9cfe6bcacd
15 changed files with 854 additions and 25 deletions
--- a/libs/standard-tests/langchain_tests/unit_tests/chat_models.py
+++ b/libs/standard-tests/langchain_tests/unit_tests/chat_models.py
@@ -160,6 +160,17 @@ class ChatModelTests(BaseStandardTests):
        ``False``."""
        return False

+    @property
+    def supports_image_urls(self) -> bool:
+        """(bool) whether the chat model supports image inputs from URLs, defaults to
+        ``False``."""
+        return False
+
+    @property
+    def supports_pdf_inputs(self) -> bool:
+        """(bool) whether the chat model supports PDF inputs, defaults to ``False``."""
+        return False
+
    @property
    def supports_video_inputs(self) -> bool:
        """(bool) whether the chat model supports video inputs, efaults to ``False``.
@@ -373,13 +384,21 @@ class ChatModelUnitTests(ChatModelTests):

        .. code-block:: python

-            [
-                {"type": "text", "text": "describe the weather in this image"},
-                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
-                },
-            ]
+            {
+                "type": "image",
+                "source_type": "base64",
+                "data": "<base64 image data>",
+                "mime_type": "image/jpeg",  # or appropriate mime-type
+            }
+
+        In addition to OpenAI-style content blocks:
+
+        .. code-block:: python
+
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{image_data}"},
+            }

        See https://python.langchain.com/docs/concepts/multimodality/

@@ -391,6 +410,59 @@ class ChatModelUnitTests(ChatModelTests):
            def supports_image_inputs(self) -> bool:
                return True

+    .. dropdown:: supports_image_urls
+
+        Boolean property indicating whether the chat model supports image inputs from
+        URLs. Defaults to ``False``.
+
+        If set to ``True``, the chat model will be tested using content blocks of the
+        form
+
+        .. code-block:: python
+
+            {
+                "type": "image",
+                "source_type": "url",
+                "url": "https://...",
+            }
+
+        See https://python.langchain.com/docs/concepts/multimodality/
+
+        Example:
+
+        .. code-block:: python
+
+            @property
+            def supports_image_urls(self) -> bool:
+                return True
+
+    .. dropdown:: supports_pdf_inputs
+
+        Boolean property indicating whether the chat model supports PDF inputs.
+        Defaults to ``False``.
+
+        If set to ``True``, the chat model will be tested using content blocks of the
+        form
+
+        .. code-block:: python
+
+            {
+                "type": "file",
+                "source_type": "base64",
+                "data": "<base64 file data>",
+                "mime_type": "application/pdf",
+            }
+
+        See https://python.langchain.com/docs/concepts/multimodality/
+
+        Example:
+
+        .. code-block:: python
+
+            @property
+            def supports_pdf_inputs(self) -> bool:
+                return True
+
    .. dropdown:: supports_video_inputs

        Boolean property indicating whether the chat model supports image inputs.