mirror of
https://github.com/hwchase17/langchain.git
synced 2026-06-09 10:17:00 +00:00
fix(langchain): land final answer in last AIMessage for TodoListMiddleware (#37643)
This commit is contained in:
226
.github/workflows/middleware_evals.yml
vendored
226
.github/workflows/middleware_evals.yml
vendored
@@ -1,226 +0,0 @@
|
||||
# Real-model evals for langchain agent middleware.
|
||||
#
|
||||
# Triggered manually via workflow_dispatch — these tests call live model APIs
|
||||
# and incur cost, so they are not run on every PR. Use this workflow when
|
||||
# making changes to middleware prompts (e.g., WRITE_TODOS_SYSTEM_PROMPT) or
|
||||
# other behavior-affecting middleware code to confirm the change works against
|
||||
# real models.
|
||||
#
|
||||
# Pass a comma-separated list of model ids to run the suite across providers
|
||||
# in parallel. Each model id is a separate matrix job.
|
||||
#
|
||||
# Required repository secrets (only set the ones for providers you actually
|
||||
# evaluate against — unset providers just can't be picked):
|
||||
# LANGSMITH_API_KEY — trace/experiment recording (always set this)
|
||||
# ANTHROPIC_API_KEY — `claude-*` ids or `anthropic:claude-*`
|
||||
# OPENAI_API_KEY — `gpt-*` ids or `openai:*`
|
||||
# GOOGLE_API_KEY — `google_genai:gemini-*`
|
||||
# GROQ_API_KEY — `groq:*`
|
||||
# MISTRAL_API_KEY — `mistralai:*`
|
||||
# XAI_API_KEY — `xai:grok-*`
|
||||
# DEEPSEEK_API_KEY — `deepseek:*`
|
||||
# FIREWORKS_API_KEY — `fireworks:*`
|
||||
# NVIDIA_API_KEY — `nvidia:*`
|
||||
# OLLAMA_API_KEY — `ollama:*` (Ollama Cloud)
|
||||
# OPENROUTER_API_KEY — `openrouter:*`
|
||||
#
|
||||
# OLLAMA_HOST is hardcoded to https://ollama.com below since it is not
|
||||
# sensitive.
|
||||
#
|
||||
# To add another provider:
|
||||
# 1. Add the provider SDK to the `--with` list in libs/langchain_v1/Makefile
|
||||
# (under the `evals` target) and to the `uv run --with` list in the job below.
|
||||
# 2. Add the API key secret to the `env` block in the job below.
|
||||
# 3. Document the matching `--model` syntax above.
|
||||
|
||||
name: "📊 Middleware Evals"
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
models:
|
||||
description: >-
|
||||
Comma-separated model ids passed to --model. Each becomes a matrix job.
|
||||
Examples: claude-sonnet-4-6,claude-haiku-4-5,openai:gpt-5,
|
||||
google_genai:gemini-2.5-pro, groq:llama-3.3-70b-versatile,
|
||||
mistralai:mistral-large-latest, xai:grok-4,
|
||||
deepseek:deepseek-chat,
|
||||
fireworks:accounts/fireworks/models/llama-v3p1-70b-instruct,
|
||||
openrouter:anthropic/claude-sonnet-4-6
|
||||
required: true
|
||||
default: "claude-sonnet-4-6"
|
||||
type: string
|
||||
tier:
|
||||
description: "Eval tier to run"
|
||||
required: false
|
||||
default: "baseline"
|
||||
type: choice
|
||||
options:
|
||||
- baseline
|
||||
- hillclimb
|
||||
- all
|
||||
eval_category:
|
||||
description: >-
|
||||
Optional eval_category to filter to (e.g., "middleware/todo").
|
||||
Empty runs all categories.
|
||||
required: false
|
||||
default: ""
|
||||
type: string
|
||||
openai_reasoning_effort:
|
||||
description: >-
|
||||
Forwarded to ChatOpenAI when the model is OpenAI. Use `minimal` for
|
||||
faster/cheaper GPT-5 runs. Ignored for non-OpenAI providers.
|
||||
required: false
|
||||
default: ""
|
||||
type: choice
|
||||
options:
|
||||
- ""
|
||||
- minimal
|
||||
- low
|
||||
- medium
|
||||
- high
|
||||
pytest_extra:
|
||||
description: "Additional pytest args (e.g., '-k density' to filter)"
|
||||
required: false
|
||||
default: ""
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
UV_FROZEN: "true"
|
||||
|
||||
jobs:
|
||||
parse:
|
||||
name: "Parse model list"
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
models: ${{ steps.set.outputs.models }}
|
||||
steps:
|
||||
- id: set
|
||||
env:
|
||||
MODELS_INPUT: ${{ inputs.models }}
|
||||
run: |
|
||||
MODELS_JSON=$(python3 -c '
|
||||
import json, os
|
||||
raw = os.environ.get("MODELS_INPUT", "")
|
||||
out = [m.strip() for m in raw.split(",") if m.strip()]
|
||||
print(json.dumps(out))
|
||||
')
|
||||
echo "Parsed models: $MODELS_JSON"
|
||||
echo "models=$MODELS_JSON" >> "$GITHUB_OUTPUT"
|
||||
|
||||
evals:
|
||||
name: "📊 ${{ matrix.model }} (${{ inputs.tier }})"
|
||||
needs: parse
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 30
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 4
|
||||
matrix:
|
||||
model: ${{ fromJSON(needs.parse.outputs.models) }}
|
||||
steps:
|
||||
- name: "📋 Checkout"
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
|
||||
|
||||
- name: "🛠️ Set up uv"
|
||||
uses: astral-sh/setup-uv@0ca8f610542aa7f4acaf39e65cf4eb3c35091883 # v7
|
||||
with:
|
||||
enable-cache: true
|
||||
|
||||
- name: "📊 Run evals"
|
||||
working-directory: libs/langchain_v1
|
||||
# SECURITY: every value derived from `inputs.*` / `matrix.*` is passed
|
||||
# via `env:`, NOT interpolated into the `run:` script body. GitHub
|
||||
# expands `${{ ... }}` textually before the shell runs, so splicing
|
||||
# those expressions into a `run:` block lets a value containing `'`
|
||||
# break out of the literal and execute arbitrary commands with API
|
||||
# keys in scope. Passing them as env vars keeps the values out of the
|
||||
# script source and lets bash quote them safely as `"$VAR"`. We also
|
||||
# invoke `pytest` directly here instead of going through `make`,
|
||||
# because Make's `$(VAR)` is textual expansion and would re-introduce
|
||||
# the same class of injection at the Make layer.
|
||||
env:
|
||||
# Provider keys. Unset secrets resolve to empty strings; the SDK only
|
||||
# complains if the matrix model id actually requires that provider.
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
|
||||
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
|
||||
MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
|
||||
XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
|
||||
DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
|
||||
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
|
||||
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
|
||||
OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
|
||||
OLLAMA_HOST: "https://ollama.com"
|
||||
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
||||
# LangSmith trace recording.
|
||||
LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }}
|
||||
LANGSMITH_TRACING: "true"
|
||||
LANGSMITH_PROJECT: "langchain-middleware-evals"
|
||||
# User-controlled inputs surfaced as env vars (not script splices).
|
||||
MODEL: ${{ matrix.model }}
|
||||
TIER: ${{ inputs.tier }}
|
||||
CATEGORY: ${{ inputs.eval_category }}
|
||||
REASONING: ${{ inputs.openai_reasoning_effort }}
|
||||
USER_EXTRA: ${{ inputs.pytest_extra }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Build the pytest arg array from env-provided inputs. Each element
|
||||
# is added as a discrete word in the array so word-splitting and
|
||||
# quoting are handled by bash, not by the script source. Filter
|
||||
# expressions for `-m` are constructed from a fixed template and
|
||||
# concatenated with user-supplied values; the values themselves are
|
||||
# never interpreted as shell.
|
||||
PYTEST_ARGS=(-v --tb=short --model "$MODEL")
|
||||
|
||||
MARKER=""
|
||||
if [ "$TIER" != "all" ]; then
|
||||
MARKER="eval_tier(\"$TIER\")"
|
||||
fi
|
||||
if [ -n "$CATEGORY" ]; then
|
||||
if [ -n "$MARKER" ]; then
|
||||
MARKER="$MARKER and eval_category(\"$CATEGORY\")"
|
||||
else
|
||||
MARKER="eval_category(\"$CATEGORY\")"
|
||||
fi
|
||||
fi
|
||||
if [ -n "$MARKER" ]; then
|
||||
PYTEST_ARGS+=(-m "$MARKER")
|
||||
fi
|
||||
if [ -n "$REASONING" ]; then
|
||||
PYTEST_ARGS+=("--openai-reasoning-effort=$REASONING")
|
||||
fi
|
||||
if [ -n "$USER_EXTRA" ]; then
|
||||
# USER_EXTRA is split on whitespace into discrete args. Users with
|
||||
# workflow_dispatch access cannot keep spaces inside a single arg:
|
||||
# `read -r -a` treats quotes and backslashes literally. The typical
|
||||
# use is things like `-k density`.
|
||||
# shellcheck disable=SC2086
|
||||
read -r -a USER_ARGS <<< "$USER_EXTRA"
|
||||
PYTEST_ARGS+=("${USER_ARGS[@]}")
|
||||
fi
|
||||
|
||||
printf 'Model: %s\n' "$MODEL"
|
||||
printf 'Tier: %s\n' "$TIER"
|
||||
printf 'Category: %s\n' "${CATEGORY:-(all)}"
|
||||
printf 'Reasoning: %s\n' "${REASONING:-(default)}"
|
||||
printf 'Pytest args: %s\n' "${PYTEST_ARGS[*]}"
|
||||
|
||||
uv run --group test \
|
||||
--with langchain-anthropic \
|
||||
--with langchain-deepseek \
|
||||
--with langchain-fireworks \
|
||||
--with langchain-google-genai \
|
||||
--with langchain-groq \
|
||||
--with langchain-mistralai \
|
||||
--with langchain-nvidia-ai-endpoints \
|
||||
--with langchain-ollama \
|
||||
--with langchain-openai \
|
||||
--with langchain-openrouter \
|
||||
--with langchain-xai \
|
||||
pytest tests/evals "${PYTEST_ARGS[@]}"
|
||||
@@ -49,7 +49,7 @@ class WriteTodosInput(BaseModel):
|
||||
todos: list[Todo]
|
||||
|
||||
|
||||
WRITE_TODOS_TOOL_DESCRIPTION = """Use this tool to create and manage a structured task list for your current work session. This helps you track progress, organize complex tasks, and demonstrate thoroughness to the user.
|
||||
WRITE_TODOS_TOOL_DESCRIPTION = """Use this tool to create and manage a structured task list for your current work session. This helps you track progress and organize complex tasks.
|
||||
|
||||
Only use this tool if you think it will be helpful in staying organized. If the user's request is trivial and takes less than 3 steps, it is better to NOT use this tool and just do the task directly.
|
||||
|
||||
@@ -91,7 +91,7 @@ It is important to skip using this tool when:
|
||||
- Complete current tasks before starting new ones
|
||||
- Remove tasks that are no longer relevant from the list entirely
|
||||
- IMPORTANT: When you write this todo list, you should mark your first task (or tasks) as in_progress immediately!.
|
||||
- IMPORTANT: Unless all tasks are completed, you should always have at least one task in_progress to show the user that you are working on something.
|
||||
- IMPORTANT: Unless all tasks are completed, you should always have at least one task in_progress.
|
||||
|
||||
3. **Task Completion Requirements**:
|
||||
- ONLY mark a task as completed when you have FULLY accomplished it
|
||||
@@ -109,13 +109,17 @@ It is important to skip using this tool when:
|
||||
- Break complex tasks into smaller, manageable steps
|
||||
- Use clear, descriptive task names
|
||||
|
||||
Being proactive with task management demonstrates attentiveness and ensures you complete all requirements successfully
|
||||
Remember: If you only need to make a few tool calls to complete a task, and it is clear what you need to do, it is better to just do the task directly and NOT call this tool at all.""" # noqa: E501
|
||||
Being proactive with task management ensures you complete all requirements successfully
|
||||
Remember: If you only need to make a few tool calls to complete a task, and it is clear what you need to do, it is better to just do the task directly and NOT call this tool at all.
|
||||
|
||||
## When You Finish
|
||||
|
||||
`write_todos` tracks your work; it does not deliver the answer. Whatever the user asked for — computations, summaries, comparisons, data — must appear as text content in a message after your final `write_todos` call. Marking the last todo complete is not itself an answer to the user.""" # noqa: E501
|
||||
|
||||
WRITE_TODOS_SYSTEM_PROMPT = """## `write_todos`
|
||||
|
||||
You have access to the `write_todos` tool to help you manage and plan complex objectives.
|
||||
Use this tool for complex objectives to ensure that you are tracking each necessary step and giving the user visibility into your progress.
|
||||
Use this tool for complex objectives to ensure that you are tracking each necessary step.
|
||||
This tool is very helpful for planning complex objectives, and for breaking down these larger complex objectives into smaller steps.
|
||||
|
||||
It is critical that you mark todos as completed as soon as you are done with a step. Do not batch up multiple steps before marking them as completed.
|
||||
@@ -125,7 +129,11 @@ Writing todos takes time and tokens, use it when it is helpful for managing comp
|
||||
## Important To-Do List Usage Notes to Remember
|
||||
|
||||
- The `write_todos` tool should never be called multiple times in parallel.
|
||||
- Don't be afraid to revise the To-Do list as you go. New information may reveal new tasks that need to be done, or old tasks that are irrelevant.""" # noqa: E501
|
||||
- Don't be afraid to revise the To-Do list as you go. New information may reveal new tasks that need to be done, or old tasks that are irrelevant.
|
||||
|
||||
## Finishing a task
|
||||
|
||||
When you finish all work, write your final answer in the message AFTER your last `write_todos` call — not in the same turn as that call. Start the final message with the substantive content the user asked for — the data, computation, summary, or analysis. The user wants the result, not confirmation that the work is done.""" # noqa: E501
|
||||
|
||||
|
||||
@tool(description=WRITE_TODOS_TOOL_DESCRIPTION)
|
||||
|
||||
Reference in New Issue
Block a user