openai[patch]: support streaming with json_schema response format (#29044)

- Stream JSON string content. Final chunk includes parsed representation
(following OpenAI
[docs](https://platform.openai.com/docs/guides/structured-outputs#streaming)).
- Mildly (?) breaking change: if you were using streaming with
`response_format` before, usage metadata will disappear unless you set
`stream_usage=True` (see the sketch after this list).
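
A minimal sketch of the opt-in (not part of this PR's diff; the model and the `Foo` schema are just placeholders):

```python
# Rough sketch: stream with a Pydantic response_format and opt back in to
# usage metadata via stream_usage=True.
from typing import Optional

from langchain_core.messages import BaseMessageChunk
from langchain_openai import ChatOpenAI
from pydantic import BaseModel


class Foo(BaseModel):
    response: str


llm = ChatOpenAI(model="gpt-4o-mini", stream_usage=True)  # usage is now opt-in

full: Optional[BaseMessageChunk] = None
for chunk in llm.stream("how are ya", response_format=Foo):
    full = chunk if full is None else full + chunk

print(full.usage_metadata)  # present only because stream_usage=True was set
```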

## Response format

Before:

![Screenshot 2025-01-06 at 11 59 01 AM](https://github.com/user-attachments/assets/e54753f7-47d5-421d-b8f3-172f32b3364d)


After:

![Screenshot 2025-01-06 at 11 58 13 AM](https://github.com/user-attachments/assets/34882c6c-2284-45b4-92f7-5b5b69896903)
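
Roughly the same behavior as a hedged sketch rather than a screenshot (model, prompt, and `Foo` are illustrative; it mirrors the new tests further down):

```python
# Sketch of the "After" behavior: content arrives as fragments of a JSON
# string, and the aggregated message carries the parsed object.
# Exact chunking will vary.
from typing import Optional

from langchain_core.messages import BaseMessageChunk
from langchain_openai import ChatOpenAI
from pydantic import BaseModel


class Foo(BaseModel):
    response: str


llm = ChatOpenAI(model="gpt-4o-mini")

full: Optional[BaseMessageChunk] = None
for chunk in llm.stream("how are ya", response_format=Foo):
    print(chunk.content, end="|")  # pieces of the JSON string
    full = chunk if full is None else full + chunk

print()
print(full.additional_kwargs["parsed"])  # e.g. Foo(response="...")
```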


## with_structured_output

For pydantic output, the behavior of `with_structured_output` is unchanged
(except that the warning disappears), because we pluck the parsed
representation straight from OpenAI, and OpenAI doesn't return it until
the stream has completed (a minimal sketch follows the screenshots below).
Open to alternatives (e.g., parsing from content or from the intermediate
dict chunks generated by OpenAI).

Before:

![Screenshot 2025-01-06 at 12 38 11 PM](https://github.com/user-attachments/assets/913d320d-f49e-4cbb-a800-b394ae817fd1)

After:

![Screenshot 2025-01-06 at 12 38 58 PM](https://github.com/user-attachments/assets/f7a45dd6-d886-48a6-8d76-d0e21ca767c6)
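
For completeness, a minimal sketch of that unchanged path (not part of the diff; assumes `method="json_schema"` with an illustrative model and prompt):

```python
# Sketch: with_structured_output streaming for Pydantic output. The parsed Foo
# is plucked from OpenAI's final payload, so it shows up only once the stream
# has completed.
from langchain_openai import ChatOpenAI
from pydantic import BaseModel


class Foo(BaseModel):
    response: str


structured_llm = ChatOpenAI(model="gpt-4o-mini").with_structured_output(
    Foo, method="json_schema"
)

for parsed in structured_llm.stream("how are ya"):
    print(parsed)  # expect a single Foo(...) at the end of the stream
```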
Commit 815bfa1913 (parent 858f655a25), authored by ccurme on 2025-01-09 10:32:30 -05:00, committed by GitHub. 3 changed files with 146 additions and 35 deletions.


```diff
@@ -13,6 +13,7 @@ from langchain_core.messages import (
     HumanMessage,
 )
 from langchain_core.outputs import ChatGeneration, ChatResult, LLMResult
+from pydantic import BaseModel
 
 from langchain_openai import AzureChatOpenAI
 from tests.unit_tests.fake.callbacks import FakeCallbackHandler
@@ -262,3 +263,37 @@ async def test_json_mode_async(llm: AzureChatOpenAI) -> None:
     assert isinstance(full, AIMessageChunk)
     assert isinstance(full.content, str)
     assert json.loads(full.content) == {"a": 1}
+
+
+class Foo(BaseModel):
+    response: str
+
+
+def test_stream_response_format(llm: AzureChatOpenAI) -> None:
+    full: Optional[BaseMessageChunk] = None
+    chunks = []
+    for chunk in llm.stream("how are ya", response_format=Foo):
+        chunks.append(chunk)
+        full = chunk if full is None else full + chunk
+    assert len(chunks) > 1
+    assert isinstance(full, AIMessageChunk)
+    parsed = full.additional_kwargs["parsed"]
+    assert isinstance(parsed, Foo)
+    assert isinstance(full.content, str)
+    parsed_content = json.loads(full.content)
+    assert parsed.response == parsed_content["response"]
+
+
+async def test_astream_response_format(llm: AzureChatOpenAI) -> None:
+    full: Optional[BaseMessageChunk] = None
+    chunks = []
+    async for chunk in llm.astream("how are ya", response_format=Foo):
+        chunks.append(chunk)
+        full = chunk if full is None else full + chunk
+    assert len(chunks) > 1
+    assert isinstance(full, AIMessageChunk)
+    parsed = full.additional_kwargs["parsed"]
+    assert isinstance(parsed, Foo)
+    assert isinstance(full.content, str)
+    parsed_content = json.loads(full.content)
+    assert parsed.response == parsed_content["response"]
```


```diff
@@ -1092,14 +1092,37 @@ class Foo(BaseModel):
 
 
 def test_stream_response_format() -> None:
-    list(ChatOpenAI(model="gpt-4o-mini").stream("how are ya", response_format=Foo))
+    full: Optional[BaseMessageChunk] = None
+    chunks = []
+    for chunk in ChatOpenAI(model="gpt-4o-mini").stream(
+        "how are ya", response_format=Foo
+    ):
+        chunks.append(chunk)
+        full = chunk if full is None else full + chunk
+    assert len(chunks) > 1
+    assert isinstance(full, AIMessageChunk)
+    parsed = full.additional_kwargs["parsed"]
+    assert isinstance(parsed, Foo)
+    assert isinstance(full.content, str)
+    parsed_content = json.loads(full.content)
+    assert parsed.response == parsed_content["response"]
 
 
 async def test_astream_response_format() -> None:
-    async for _ in ChatOpenAI(model="gpt-4o-mini").astream(
+    full: Optional[BaseMessageChunk] = None
+    chunks = []
+    async for chunk in ChatOpenAI(model="gpt-4o-mini").astream(
         "how are ya", response_format=Foo
     ):
-        pass
+        chunks.append(chunk)
+        full = chunk if full is None else full + chunk
+    assert len(chunks) > 1
+    assert isinstance(full, AIMessageChunk)
+    parsed = full.additional_kwargs["parsed"]
+    assert isinstance(parsed, Foo)
+    assert isinstance(full.content, str)
+    parsed_content = json.loads(full.content)
+    assert parsed.response == parsed_content["response"]
 
 
 @pytest.mark.parametrize("use_max_completion_tokens", [True, False])
```