From 8a69de5c2464f7a451e4a82606331e8d1124a1a3 Mon Sep 17 00:00:00 2001 From: ccurme Date: Tue, 1 Apr 2025 15:29:33 -0400 Subject: [PATCH] openai[patch]: ignore file blocks when counting tokens (#30601) OpenAI does not appear to document how it transforms PDF pages to images, which determines how tokens are counted: https://platform.openai.com/docs/guides/pdf-files?api-mode=chat#usage-considerations Currently these block types raise ValueError inside `get_num_tokens_from_messages`. Here we update to generate a warning and continue. --- .../langchain_openai/chat_models/base.py | 6 ++++++ .../tests/unit_tests/chat_models/test_base.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/libs/partners/openai/langchain_openai/chat_models/base.py b/libs/partners/openai/langchain_openai/chat_models/base.py index ce9c25cb6cc..83c8fda6a67 100644 --- a/libs/partners/openai/langchain_openai/chat_models/base.py +++ b/libs/partners/openai/langchain_openai/chat_models/base.py @@ -1298,6 +1298,12 @@ class BaseChatOpenAI(BaseChatModel): encoding.encode(val["function"]["arguments"]) ) num_tokens += len(encoding.encode(val["function"]["name"])) + elif val["type"] == "file": + warnings.warn( + "Token counts for file inputs are not supported. " + "Ignoring file inputs." + ) + pass else: raise ValueError( f"Unrecognized content block type\n\n{val}" diff --git a/libs/partners/openai/tests/unit_tests/chat_models/test_base.py b/libs/partners/openai/tests/unit_tests/chat_models/test_base.py index 7dacf9d9547..fc4666c10c6 100644 --- a/libs/partners/openai/tests/unit_tests/chat_models/test_base.py +++ b/libs/partners/openai/tests/unit_tests/chat_models/test_base.py @@ -752,6 +752,25 @@ def test_get_num_tokens_from_messages() -> None: actual = llm.get_num_tokens_from_messages(messages) assert expected == actual + # Test file inputs + messages = [ + HumanMessage( + [ + "Summarize this document.", + { + "type": "file", + "file": { + "filename": "my file", + "file_data": "data:application/pdf;base64,", + }, + }, + ] + ) + ] + with pytest.warns(match="file inputs are not supported"): + actual = llm.get_num_tokens_from_messages(messages) + assert actual == 13 + class Foo(BaseModel): bar: int