feat(openai): (v1) support pdfs passed via url in standard format (#32876)

Author: ccurme
Date: 2025-09-12 10:44:00 -04:00
Committed by: GitHub
Parent: 67aa37b144
Commit: b88115f6fc

5 changed files with 200 additions and 17 deletions
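
For context, a minimal usage sketch of the feature this commit adds. The model
name and document URL are illustrative; the message shape is the v1 standard
format exercised by the tests below:

    from langchain_core.messages import HumanMessage
    from langchain_openai import ChatOpenAI

    # File URLs are only supported by the Responses API
    llm = ChatOpenAI(model="gpt-4.1-mini", use_responses_api=True)
    message = HumanMessage(
        [
            {"type": "text", "text": "What is the document title, verbatim?"},
            # v1 standard format: a PDF passed by URL
            {"type": "file", "url": "https://example.com/test.pdf"},
        ]
    )
    response = llm.invoke([message])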


@@ -5,7 +5,7 @@ from __future__ import annotations

 import json
 import warnings
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast

 from langchain_core.language_models._utils import (
     _is_openai_data_block,
@@ -42,10 +42,23 @@ def convert_to_openai_image_block(block: dict[str, Any]) -> dict:
         raise ValueError(error_message)


-def convert_to_openai_data_block(block: dict) -> dict:
+def convert_to_openai_data_block(
+    block: dict, api: Literal["chat/completions", "responses"] = "chat/completions"
+) -> dict:
     """Format standard data content block to format expected by OpenAI."""
     if block["type"] == "image":
-        formatted_block = convert_to_openai_image_block(block)
+        chat_completions_block = convert_to_openai_image_block(block)
+        if api == "responses":
+            formatted_block = {
+                "type": "input_image",
+                "image_url": chat_completions_block["image_url"]["url"],
+            }
+            if chat_completions_block["image_url"].get("detail"):
+                formatted_block["detail"] = chat_completions_block["image_url"][
+                    "detail"
+                ]
+        else:
+            formatted_block = chat_completions_block

     elif block["type"] == "file":
         if "base64" in block or block.get("source_type") == "base64":
@@ -68,13 +81,23 @@ def convert_to_openai_data_block(block: dict) -> dict:
                     stacklevel=1,
                 )
             formatted_block = {"type": "file", "file": file}
+            if api == "responses":
+                formatted_block = {"type": "input_file", **formatted_block["file"]}
         elif "file_id" in block or block.get("source_type") == "id":
             # Handle v0 format: {"source_type": "id", "id": "...", ...}
             # Handle v1 format: {"file_id": "...", ...}
             file_id = block["id"] if "source_type" in block else block["file_id"]
             formatted_block = {"type": "file", "file": {"file_id": file_id}}
+            if api == "responses":
+                formatted_block = {"type": "input_file", **formatted_block["file"]}
+        elif "url" in block:
+            if api == "chat/completions":
+                error_msg = "OpenAI Chat Completions does not support file URLs."
+                raise ValueError(error_msg)
+            # Only supported by Responses API; return in that format
+            formatted_block = {"type": "input_file", "file_url": block["url"]}
         else:
-            error_msg = "Keys base64 or file_id required for file blocks."
+            error_msg = "Keys base64, url, or file_id required for file blocks."
             raise ValueError(error_msg)

     elif block["type"] == "audio":
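
In short, the new url branch emits Responses API input only. A sketch of the
mapping, mirroring the unit tests below:

    block = {"type": "file", "url": "https://example.com/test.pdf"}

    convert_to_openai_data_block(block, api="responses")
    # -> {"type": "input_file", "file_url": "https://example.com/test.pdf"}

    convert_to_openai_data_block(block)  # default api="chat/completions"
    # -> raises ValueError: OpenAI Chat Completions does not support file URLs.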


@@ -1,7 +1,12 @@
 from typing import Optional

 import pytest
 from langchain_core.messages import AIMessage, AIMessageChunk, HumanMessage
 from langchain_core.messages import content as types
+from langchain_core.messages.block_translators.openai import (
+    convert_to_openai_data_block,
+)
 from tests.unit_tests.language_models.chat_models.test_base import (
     _content_blocks_equal_ignore_id,
 )
@@ -442,3 +447,132 @@ def test_compat_responses_v03() -> None:
         {"type": "reasoning", "reasoning": "reasoning text", "id": "rs_abc"}
     ]
     assert chunk.content_blocks == expected_content
+
+
+def test_convert_to_openai_data_block() -> None:
+    # Chat completions
+    ## Image / url
+    block = {
+        "type": "image",
+        "url": "https://example.com/test.png",
+    }
+    expected = {
+        "type": "image_url",
+        "image_url": {"url": "https://example.com/test.png"},
+    }
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    ## Image / base64
+    block = {
+        "type": "image",
+        "base64": "<base64 string>",
+        "mime_type": "image/png",
+    }
+    expected = {
+        "type": "image_url",
+        "image_url": {"url": "data:image/png;base64,<base64 string>"},
+    }
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    ## File / url
+    block = {
+        "type": "file",
+        "url": "https://example.com/test.pdf",
+    }
+    with pytest.raises(ValueError, match="does not support"):
+        result = convert_to_openai_data_block(block)
+
+    ## File / base64
+    block = {
+        "type": "file",
+        "base64": "<base64 string>",
+        "mime_type": "application/pdf",
+        "filename": "test.pdf",
+    }
+    expected = {
+        "type": "file",
+        "file": {
+            "file_data": "data:application/pdf;base64,<base64 string>",
+            "filename": "test.pdf",
+        },
+    }
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    ## File / file ID
+    block = {
+        "type": "file",
+        "file_id": "file-abc123",
+    }
+    expected = {"type": "file", "file": {"file_id": "file-abc123"}}
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    ## Audio / base64
+    block = {
+        "type": "audio",
+        "base64": "<base64 string>",
+        "mime_type": "audio/wav",
+    }
+    expected = {
+        "type": "input_audio",
+        "input_audio": {"data": "<base64 string>", "format": "wav"},
+    }
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    # Responses
+    ## Image / url
+    block = {
+        "type": "image",
+        "url": "https://example.com/test.png",
+    }
+    expected = {"type": "input_image", "image_url": "https://example.com/test.png"}
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected
+
+    ## Image / base64
+    block = {
+        "type": "image",
+        "base64": "<base64 string>",
+        "mime_type": "image/png",
+    }
+    expected = {
+        "type": "input_image",
+        "image_url": "data:image/png;base64,<base64 string>",
+    }
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected
+
+    ## File / url
+    block = {
+        "type": "file",
+        "url": "https://example.com/test.pdf",
+    }
+    expected = {"type": "input_file", "file_url": "https://example.com/test.pdf"}
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected
+
+    ## File / base64
+    block = {
+        "type": "file",
+        "base64": "<base64 string>",
+        "mime_type": "application/pdf",
+        "filename": "test.pdf",
+    }
+    expected = {
+        "type": "input_file",
+        "file_data": "data:application/pdf;base64,<base64 string>",
+        "filename": "test.pdf",
+    }
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected
+
+    ## File / file ID
+    block = {
+        "type": "file",
+        "file_id": "file-abc123",
+    }
+    expected = {"type": "input_file", "file_id": "file-abc123"}
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected


@@ -206,7 +206,11 @@ def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage:
     return ChatMessage(content=_dict.get("content", ""), role=role, id=id_)  # type: ignore[arg-type]


-def _format_message_content(content: Any, responses_ai_msg: bool = False) -> Any:
+def _format_message_content(
+    content: Any,
+    api: Literal["chat/completions", "responses"] = "chat/completions",
+    role: Optional[str] = None,
+) -> Any:
     """Format message content."""
     if content and isinstance(content, list):
         formatted_content = []
@@ -223,9 +227,9 @@ def _format_message_content(content: Any, responses_ai_msg: bool = False) -> Any
                 and is_data_content_block(block)
                 # Responses API messages handled separately in _compat (parsed into
                 # image generation calls)
-                and not responses_ai_msg
+                and not (api == "responses" and str(role).lower().startswith("ai"))
             ):
-                formatted_content.append(convert_to_openai_data_block(block))
+                formatted_content.append(convert_to_openai_data_block(block, api=api))
             # Anthropic image blocks
             elif (
                 isinstance(block, dict)
@@ -258,13 +262,12 @@ def _format_message_content(content: Any, responses_ai_msg: bool = False) -> Any
 def _convert_message_to_dict(
-    message: BaseMessage, responses_ai_msg: bool = False
+    message: BaseMessage,
+    api: Literal["chat/completions", "responses"] = "chat/completions",
 ) -> dict:
     """Convert a LangChain message to dictionary format expected by OpenAI."""
     message_dict: dict[str, Any] = {
-        "content": _format_message_content(
-            message.content, responses_ai_msg=responses_ai_msg
-        )
+        "content": _format_message_content(message.content, api=api, role=message.type)
     }
     if (name := message.name or message.additional_kwargs.get("name")) is not None:
         message_dict["name"] = name
@@ -306,7 +309,7 @@ def _convert_message_to_dict(
                 isinstance(block, dict)
                 and block.get("type") == "audio"
                 and (id_ := block.get("id"))
-                and not responses_ai_msg
+                and api != "responses"
             ):
                 # openai doesn't support passing the data back - only the id
                 # https://platform.openai.com/docs/guides/audio/multi-turn-conversations
@@ -3702,7 +3705,7 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
     for lc_msg in messages:
         if isinstance(lc_msg, AIMessage):
             lc_msg = _convert_from_v03_ai_message(lc_msg)
-            msg = _convert_message_to_dict(lc_msg, responses_ai_msg=True)
+            msg = _convert_message_to_dict(lc_msg, api="responses")
             if isinstance(msg.get("content"), list) and all(
                 isinstance(block, dict) for block in msg["content"]
             ):
@@ -3717,7 +3720,7 @@
             ]
             msg["content"] = _convert_from_v1_to_responses(msg["content"], tcs)
         else:
-            msg = _convert_message_to_dict(lc_msg)
+            msg = _convert_message_to_dict(lc_msg, api="responses")
         # Get content from non-standard content blocks
         if isinstance(msg["content"], list):
             for i, block in enumerate(msg["content"]):
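
Illustrative trace of the plumbing above (private helpers; output shapes are
inferred from this diff and the unit tests, not asserted by the commit):
_construct_responses_api_input now passes api="responses" for every message, so
standard data blocks in user messages are formatted for the Responses API too:

    message = HumanMessage([{"type": "file", "url": "https://example.com/test.pdf"}])
    _convert_message_to_dict(message, api="responses")
    # -> {"role": "user",
    #     "content": [{"type": "input_file", "file_url": "https://example.com/test.pdf"}]}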


@@ -95,7 +95,7 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
         message = HumanMessage(
             [
-                {"type": "text", "text": "Summarize this document:"},
+                {"type": "text", "text": "What is the document title, verbatim?"},
                 {
                     "type": "file",
                     "mime_type": "application/pdf",
@@ -109,7 +109,7 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
         # Test OpenAI Chat Completions format
         message = HumanMessage(
             [
-                {"type": "text", "text": "Summarize this document:"},
+                {"type": "text", "text": "What is the document title, verbatim?"},
                 {
                     "type": "file",
                     "file": {

@@ -5,7 +5,7 @@ from typing import cast

 import pytest
 from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import AIMessage
+from langchain_core.messages import AIMessage, HumanMessage

 from langchain_openai import ChatOpenAI
 from tests.integration_tests.chat_models.test_base_standard import TestOpenAIStandard
@@ -48,6 +48,29 @@ class TestOpenAIResponses(TestOpenAIStandard):
         input_ = "What was the 3rd highest building in 2000?"
         return _invoke(llm, input_, stream)

+    def test_openai_pdf_inputs(self, model: BaseChatModel) -> None:
+        """Test that the model can process PDF inputs."""
+        super().test_openai_pdf_inputs(model)
+
+        # Responses API additionally supports files via URL
+        url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
+        message = HumanMessage(
+            [
+                {"type": "text", "text": "What is the document title, verbatim?"},
+                {"type": "file", "url": url},
+            ]
+        )
+        _ = model.invoke([message])
+
+        # Test OpenAI Responses format
+        message = HumanMessage(
+            [
+                {"type": "text", "text": "What is the document title, verbatim?"},
+                {"type": "input_file", "file_url": url},
+            ]
+        )
+        _ = model.invoke([message])
+

 def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
     if stream: