From 8a69de5c2464f7a451e4a82606331e8d1124a1a3 Mon Sep 17 00:00:00 2001
From: ccurme <chester.curme@gmail.com>
Date: Tue, 1 Apr 2025 15:29:33 -0400
Subject: [PATCH] openai[patch]: ignore file blocks when counting tokens
 (#30601)

OpenAI does not appear to document how it converts PDF pages to
images, and that conversion determines how tokens are counted:
https://platform.openai.com/docs/guides/pdf-files?api-mode=chat#usage-considerations

Currently, content blocks of type `file` raise ValueError inside
`get_num_tokens_from_messages`. This change emits a warning instead and
ignores those blocks when counting tokens.
---
 .../langchain_openai/chat_models/base.py      |  6 ++++++
 .../tests/unit_tests/chat_models/test_base.py | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+)
diff --git a/libs/partners/openai/langchain_openai/chat_models/base.py b/libs/partners/openai/langchain_openai/chat_models/base.py
index ce9c25cb6cc..83c8fda6a67 100644
--- a/libs/partners/openai/langchain_openai/chat_models/base.py
+++ b/libs/partners/openai/langchain_openai/chat_models/base.py
@@ -1298,6 +1298,12 @@ class BaseChatOpenAI(BaseChatModel):
                                 encoding.encode(val["function"]["arguments"])
                             )
                             num_tokens += len(encoding.encode(val["function"]["name"]))
+                        elif val["type"] == "file":
+                            warnings.warn(
+                                "Token counts for file inputs are not supported. "
+                                "Ignoring file inputs."
+                            )
+                            pass
                         else:
                             raise ValueError(
                                 f"Unrecognized content block type\n\n{val}"
diff --git a/libs/partners/openai/tests/unit_tests/chat_models/test_base.py b/libs/partners/openai/tests/unit_tests/chat_models/test_base.py
index 7dacf9d9547..fc4666c10c6 100644
--- a/libs/partners/openai/tests/unit_tests/chat_models/test_base.py
+++ b/libs/partners/openai/tests/unit_tests/chat_models/test_base.py
@@ -752,6 +752,25 @@ def test_get_num_tokens_from_messages() -> None:
     actual = llm.get_num_tokens_from_messages(messages)
     assert expected == actual
 
+    # Test file inputs
+    messages = [
+        HumanMessage(
+            [
+                "Summarize this document.",
+                {
+                    "type": "file",
+                    "file": {
+                        "filename": "my file",
+                        "file_data": "data:application/pdf;base64,<data>",
+                    },
+                },
+            ]
+        )
+    ]
+    with pytest.warns(match="file inputs are not supported"):
+        actual = llm.get_num_tokens_from_messages(messages)
+        assert actual == 13
+
 
 class Foo(BaseModel):
     bar: int