openai[patch]: ignore file blocks when counting tokens (#30601)

OpenAI does not appear to document how it transforms PDF pages to
images, which determines how tokens are counted:
https://platform.openai.com/docs/guides/pdf-files?api-mode=chat#usage-considerations

Currently, `file` content blocks raise a ValueError inside
`get_num_tokens_from_messages`. With this change, the method emits a warning,
skips the file block, and continues counting tokens for the remaining content.
This commit is contained in:
ccurme 2025-04-01 15:29:33 -04:00 committed by GitHub
parent 558191198f
commit 8a69de5c24
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 25 additions and 0 deletions

View File

@ -1298,6 +1298,12 @@ class BaseChatOpenAI(BaseChatModel):
encoding.encode(val["function"]["arguments"])
)
num_tokens += len(encoding.encode(val["function"]["name"]))
elif val["type"] == "file":
warnings.warn(
"Token counts for file inputs are not supported. "
"Ignoring file inputs."
)
pass
else:
raise ValueError(
f"Unrecognized content block type\n\n{val}"

View File

@ -752,6 +752,25 @@ def test_get_num_tokens_from_messages() -> None:
actual = llm.get_num_tokens_from_messages(messages)
assert expected == actual
# Test file inputs
messages = [
HumanMessage(
[
"Summarize this document.",
{
"type": "file",
"file": {
"filename": "my file",
"file_data": "data:application/pdf;base64,<data>",
},
},
]
)
]
with pytest.warns(match="file inputs are not supported"):
actual = llm.get_num_tokens_from_messages(messages)
assert actual == 13
class Foo(BaseModel):
bar: int