feat(openai): (v1) support pdfs passed via url in standard format (#32876)

Author: ccurme
Date: 2025-09-12 10:44:00 -04:00
Committed by: GitHub
Parent: 67aa37b144
Commit: b88115f6fc

5 changed files with 200 additions and 17 deletions
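
For context, a minimal usage sketch of the feature this commit adds. The model
name and document URL are illustrative; the message shape is the v1 standard
format exercised by the tests below:

    from langchain_core.messages import HumanMessage
    from langchain_openai import ChatOpenAI

    # File URLs are only supported by the Responses API
    llm = ChatOpenAI(model="gpt-4.1-mini", use_responses_api=True)
    message = HumanMessage(
        [
            {"type": "text", "text": "What is the document title, verbatim?"},
            # v1 standard format: a PDF passed by URL
            {"type": "file", "url": "https://example.com/test.pdf"},
        ]
    )
    response = llm.invoke([message])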


@@ -5,7 +5,7 @@ from __future__ import annotations

 import json
 import warnings
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast

 from langchain_core.language_models._utils import (
     _is_openai_data_block,
@@ -42,10 +42,23 @@ def convert_to_openai_image_block(block: dict[str, Any]) -> dict:
         raise ValueError(error_message)


-def convert_to_openai_data_block(block: dict) -> dict:
+def convert_to_openai_data_block(
+    block: dict, api: Literal["chat/completions", "responses"] = "chat/completions"
+) -> dict:
     """Format standard data content block to format expected by OpenAI."""
     if block["type"] == "image":
-        formatted_block = convert_to_openai_image_block(block)
+        chat_completions_block = convert_to_openai_image_block(block)
+        if api == "responses":
+            formatted_block = {
+                "type": "input_image",
+                "image_url": chat_completions_block["image_url"]["url"],
+            }
+            if chat_completions_block["image_url"].get("detail"):
+                formatted_block["detail"] = chat_completions_block["image_url"][
+                    "detail"
+                ]
+        else:
+            formatted_block = chat_completions_block

     elif block["type"] == "file":
         if "base64" in block or block.get("source_type") == "base64":
@@ -68,13 +81,23 @@ def convert_to_openai_data_block(block: dict) -> dict:
                     stacklevel=1,
                 )
             formatted_block = {"type": "file", "file": file}
+            if api == "responses":
+                formatted_block = {"type": "input_file", **formatted_block["file"]}
         elif "file_id" in block or block.get("source_type") == "id":
             # Handle v0 format: {"source_type": "id", "id": "...", ...}
             # Handle v1 format: {"file_id": "...", ...}
             file_id = block["id"] if "source_type" in block else block["file_id"]
             formatted_block = {"type": "file", "file": {"file_id": file_id}}
+            if api == "responses":
+                formatted_block = {"type": "input_file", **formatted_block["file"]}
+        elif "url" in block:
+            if api == "chat/completions":
+                error_msg = "OpenAI Chat Completions does not support file URLs."
+                raise ValueError(error_msg)
+            # Only supported by Responses API; return in that format
+            formatted_block = {"type": "input_file", "file_url": block["url"]}
         else:
-            error_msg = "Keys base64 or file_id required for file blocks."
+            error_msg = "Keys base64, url, or file_id required for file blocks."
             raise ValueError(error_msg)

     elif block["type"] == "audio":
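
In short, the new url branch emits Responses API input only. A sketch of the
mapping, mirroring the unit tests below:

    block = {"type": "file", "url": "https://example.com/test.pdf"}

    convert_to_openai_data_block(block, api="responses")
    # -> {"type": "input_file", "file_url": "https://example.com/test.pdf"}

    convert_to_openai_data_block(block)  # default api="chat/completions"
    # -> raises ValueError: OpenAI Chat Completions does not support file URLs.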


@@ -1,7 +1,12 @@
 from typing import Optional

 import pytest
 from langchain_core.messages import AIMessage, AIMessageChunk, HumanMessage
 from langchain_core.messages import content as types
+from langchain_core.messages.block_translators.openai import (
+    convert_to_openai_data_block,
+)
 from tests.unit_tests.language_models.chat_models.test_base import (
     _content_blocks_equal_ignore_id,
 )
@@ -442,3 +447,132 @@ def test_compat_responses_v03() -> None:
         {"type": "reasoning", "reasoning": "reasoning text", "id": "rs_abc"}
     ]
     assert chunk.content_blocks == expected_content
+
+
+def test_convert_to_openai_data_block() -> None:
+    # Chat completions
+    ## Image / url
+    block = {
+        "type": "image",
+        "url": "https://example.com/test.png",
+    }
+    expected = {
+        "type": "image_url",
+        "image_url": {"url": "https://example.com/test.png"},
+    }
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    ## Image / base64
+    block = {
+        "type": "image",
+        "base64": "<base64 string>",
+        "mime_type": "image/png",
+    }
+    expected = {
+        "type": "image_url",
+        "image_url": {"url": "data:image/png;base64,<base64 string>"},
+    }
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    ## File / url
+    block = {
+        "type": "file",
+        "url": "https://example.com/test.pdf",
+    }
+    with pytest.raises(ValueError, match="does not support"):
+        result = convert_to_openai_data_block(block)
+
+    ## File / base64
+    block = {
+        "type": "file",
+        "base64": "<base64 string>",
+        "mime_type": "application/pdf",
+        "filename": "test.pdf",
+    }
+    expected = {
+        "type": "file",
+        "file": {
+            "file_data": "data:application/pdf;base64,<base64 string>",
+            "filename": "test.pdf",
+        },
+    }
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    ## File / file ID
+    block = {
+        "type": "file",
+        "file_id": "file-abc123",
+    }
+    expected = {"type": "file", "file": {"file_id": "file-abc123"}}
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    ## Audio / base64
+    block = {
+        "type": "audio",
+        "base64": "<base64 string>",
+        "mime_type": "audio/wav",
+    }
+    expected = {
+        "type": "input_audio",
+        "input_audio": {"data": "<base64 string>", "format": "wav"},
+    }
+    result = convert_to_openai_data_block(block)
+    assert result == expected
+
+    # Responses
+    ## Image / url
+    block = {
+        "type": "image",
+        "url": "https://example.com/test.png",
+    }
+    expected = {"type": "input_image", "image_url": "https://example.com/test.png"}
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected
+
+    ## Image / base64
+    block = {
+        "type": "image",
+        "base64": "<base64 string>",
+        "mime_type": "image/png",
+    }
+    expected = {
+        "type": "input_image",
+        "image_url": "data:image/png;base64,<base64 string>",
+    }
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected
+
+    ## File / url
+    block = {
+        "type": "file",
+        "url": "https://example.com/test.pdf",
+    }
+    expected = {"type": "input_file", "file_url": "https://example.com/test.pdf"}
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected
+
+    ## File / base64
+    block = {
+        "type": "file",
+        "base64": "<base64 string>",
+        "mime_type": "application/pdf",
+        "filename": "test.pdf",
+    }
+    expected = {
+        "type": "input_file",
+        "file_data": "data:application/pdf;base64,<base64 string>",
+        "filename": "test.pdf",
+    }
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected
+
+    ## File / file ID
+    block = {
+        "type": "file",
+        "file_id": "file-abc123",
+    }
+    expected = {"type": "input_file", "file_id": "file-abc123"}
+    result = convert_to_openai_data_block(block, api="responses")
+    assert result == expected


@@ -206,7 +206,11 @@ def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage:
     return ChatMessage(content=_dict.get("content", ""), role=role, id=id_)  # type: ignore[arg-type]


-def _format_message_content(content: Any, responses_ai_msg: bool = False) -> Any:
+def _format_message_content(
+    content: Any,
+    api: Literal["chat/completions", "responses"] = "chat/completions",
+    role: Optional[str] = None,
+) -> Any:
     """Format message content."""
     if content and isinstance(content, list):
         formatted_content = []
@@ -223,9 +227,9 @@ def _format_message_content(content: Any, responses_ai_msg: bool = False) -> Any
                 and is_data_content_block(block)
                 # Responses API messages handled separately in _compat (parsed into
                 # image generation calls)
-                and not responses_ai_msg
+                and not (api == "responses" and str(role).lower().startswith("ai"))
             ):
-                formatted_content.append(convert_to_openai_data_block(block))
+                formatted_content.append(convert_to_openai_data_block(block, api=api))
             # Anthropic image blocks
             elif (
                 isinstance(block, dict)
@@ -258,13 +262,12 @@ def _format_message_content(content: Any, responses_ai_msg: bool = False) -> Any
 def _convert_message_to_dict(
-    message: BaseMessage, responses_ai_msg: bool = False
+    message: BaseMessage,
+    api: Literal["chat/completions", "responses"] = "chat/completions",
 ) -> dict:
     """Convert a LangChain message to dictionary format expected by OpenAI."""
     message_dict: dict[str, Any] = {
-        "content": _format_message_content(
-            message.content, responses_ai_msg=responses_ai_msg
-        )
+        "content": _format_message_content(message.content, api=api, role=message.type)
     }
     if (name := message.name or message.additional_kwargs.get("name")) is not None:
         message_dict["name"] = name
@@ -306,7 +309,7 @@ def _convert_message_to_dict(
                 isinstance(block, dict)
                 and block.get("type") == "audio"
                 and (id_ := block.get("id"))
-                and not responses_ai_msg
+                and api != "responses"
             ):
                 # openai doesn't support passing the data back - only the id
                 # https://platform.openai.com/docs/guides/audio/multi-turn-conversations
@@ -3702,7 +3705,7 @@ def _construct_responses_api_input(messages: Sequence[BaseMessage]) -> list:
     for lc_msg in messages:
         if isinstance(lc_msg, AIMessage):
             lc_msg = _convert_from_v03_ai_message(lc_msg)
-            msg = _convert_message_to_dict(lc_msg, responses_ai_msg=True)
+            msg = _convert_message_to_dict(lc_msg, api="responses")
             if isinstance(msg.get("content"), list) and all(
                 isinstance(block, dict) for block in msg["content"]
             ):
@@ -3717,7 +3720,7 @@
             ]
             msg["content"] = _convert_from_v1_to_responses(msg["content"], tcs)
         else:
-            msg = _convert_message_to_dict(lc_msg)
+            msg = _convert_message_to_dict(lc_msg, api="responses")
         # Get content from non-standard content blocks
         if isinstance(msg["content"], list):
             for i, block in enumerate(msg["content"]):
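
Illustrative trace of the plumbing above (private helpers; output shapes are
inferred from this diff and the unit tests, not asserted by the commit):
_construct_responses_api_input now passes api="responses" for every message, so
standard data blocks in user messages are formatted for the Responses API too:

    message = HumanMessage([{"type": "file", "url": "https://example.com/test.pdf"}])
    _convert_message_to_dict(message, api="responses")
    # -> {"role": "user",
    #     "content": [{"type": "input_file", "file_url": "https://example.com/test.pdf"}]}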


@@ -95,7 +95,7 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
         message = HumanMessage(
             [
-                {"type": "text", "text": "Summarize this document:"},
+                {"type": "text", "text": "What is the document title, verbatim?"},
                 {
                     "type": "file",
                     "mime_type": "application/pdf",
@@ -109,7 +109,7 @@ class TestOpenAIStandard(ChatModelIntegrationTests):
         # Test OpenAI Chat Completions format
         message = HumanMessage(
             [
-                {"type": "text", "text": "Summarize this document:"},
+                {"type": "text", "text": "What is the document title, verbatim?"},
                 {
                     "type": "file",
                     "file": {

@@ -5,7 +5,7 @@ from typing import cast

 import pytest
 from langchain_core.language_models import BaseChatModel
-from langchain_core.messages import AIMessage
+from langchain_core.messages import AIMessage, HumanMessage

 from langchain_openai import ChatOpenAI
 from tests.integration_tests.chat_models.test_base_standard import TestOpenAIStandard
@@ -48,6 +48,29 @@ class TestOpenAIResponses(TestOpenAIStandard):
         input_ = "What was the 3rd highest building in 2000?"
         return _invoke(llm, input_, stream)

+    def test_openai_pdf_inputs(self, model: BaseChatModel) -> None:
+        """Test that the model can process PDF inputs."""
+        super().test_openai_pdf_inputs(model)
+
+        # Responses API additionally supports files via URL
+        url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
+        message = HumanMessage(
+            [
+                {"type": "text", "text": "What is the document title, verbatim?"},
+                {"type": "file", "url": url},
+            ]
+        )
+        _ = model.invoke([message])
+
+        # Test OpenAI Responses format
+        message = HumanMessage(
+            [
+                {"type": "text", "text": "What is the document title, verbatim?"},
+                {"type": "input_file", "file_url": url},
+            ]
+        )
+        _ = model.invoke([message])
+

 def _invoke(llm: ChatOpenAI, input_: str, stream: bool) -> AIMessage:
     if stream: