From 6b0b317cb5ebd70e004ea1891899696606781596 Mon Sep 17 00:00:00 2001 From: Jacob Lee Date: Thu, 24 Apr 2025 06:36:31 -0700 Subject: [PATCH] feat(core): Autogenerate filenames for when converting file content blocks to OpenAI format (#30984) CC @ccurme --------- Co-authored-by: Chester Curme --- libs/core/langchain_core/messages/utils.py | 13 ++++++- .../tests/unit_tests/messages/test_utils.py | 35 +++++++++++++++---- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/libs/core/langchain_core/messages/utils.py b/libs/core/langchain_core/messages/utils.py index 6582f298abe..d3e27c502a4 100644 --- a/libs/core/langchain_core/messages/utils.py +++ b/libs/core/langchain_core/messages/utils.py @@ -12,6 +12,7 @@ from __future__ import annotations import base64 import inspect import json +import logging import math from collections.abc import Iterable, Sequence from functools import partial @@ -47,6 +48,8 @@ if TYPE_CHECKING: from langchain_core.prompt_values import PromptValue from langchain_core.runnables.base import Runnable +logger = logging.getLogger(__name__) + def _get_type(v: Any) -> str: """Get the type associated with the object for serialization purposes.""" @@ -1070,7 +1073,15 @@ def convert_to_openai_messages( ) # Standard multi-modal content block elif is_data_content_block(block): - content.append(convert_to_openai_data_block(block)) + formatted_block = convert_to_openai_data_block(block) + if ( + formatted_block.get("type") == "file" + and "file" in formatted_block + and "filename" not in formatted_block["file"] + ): + logger.info("Generating a fallback filename.") + formatted_block["file"]["filename"] = "LC_AUTOGENERATED" + content.append(formatted_block) # Anthropic and Bedrock converse format elif (block.get("type") == "image") or "image" in block: # Anthropic diff --git a/libs/core/tests/unit_tests/messages/test_utils.py b/libs/core/tests/unit_tests/messages/test_utils.py index 9031e1be5da..db75921e1a3 100644 --- a/libs/core/tests/unit_tests/messages/test_utils.py +++ b/libs/core/tests/unit_tests/messages/test_utils.py @@ -1202,12 +1202,6 @@ def test_convert_to_openai_messages_multimodal() -> None: "data": "", "mime_type": "image/png", }, - { - "type": "file", - "source_type": "base64", - "data": "", - "mime_type": "application/pdf", - }, { "type": "file", "source_type": "base64", @@ -1232,7 +1226,34 @@ def test_convert_to_openai_messages_multimodal() -> None: result = convert_to_openai_messages(messages, text_format="block") assert len(result) == 1 message = result[0] - assert len(message["content"]) == 7 + assert len(message["content"]) == 6 + + # Test adding filename + messages = [ + HumanMessage( + content=[ + { + "type": "file", + "source_type": "base64", + "data": "", + "mime_type": "application/pdf", + }, + ] + ) + ] + with pytest.warns(match="filename"): + result = convert_to_openai_messages(messages, text_format="block") + assert len(result) == 1 + message = result[0] + assert len(message["content"]) == 1 + block = message["content"][0] + assert block == { + "type": "file", + "file": { + "file_data": "data:application/pdf;base64,", + "filename": "LC_AUTOGENERATED", + }, + } def test_count_tokens_approximately_empty_messages() -> None: