diff --git a/.github/workflows/middleware_evals.yml b/.github/workflows/middleware_evals.yml deleted file mode 100644 index a036f585467..00000000000 --- a/.github/workflows/middleware_evals.yml +++ /dev/null @@ -1,226 +0,0 @@ -# Real-model evals for langchain agent middleware. -# -# Triggered manually via workflow_dispatch — these tests call live model APIs -# and incur cost, so they are not run on every PR. Use this workflow when -# making changes to middleware prompts (e.g., WRITE_TODOS_SYSTEM_PROMPT) or -# other behavior-affecting middleware code to confirm the change works against -# real models. -# -# Pass a comma-separated list of model ids to run the suite across providers -# in parallel. Each model id is a separate matrix job. -# -# Required repository secrets (only set the ones for providers you actually -# evaluate against — unset providers just can't be picked): -# LANGSMITH_API_KEY — trace/experiment recording (always set this) -# ANTHROPIC_API_KEY — `claude-*` ids or `anthropic:claude-*` -# OPENAI_API_KEY — `gpt-*` ids or `openai:*` -# GOOGLE_API_KEY — `google_genai:gemini-*` -# GROQ_API_KEY — `groq:*` -# MISTRAL_API_KEY — `mistralai:*` -# XAI_API_KEY — `xai:grok-*` -# DEEPSEEK_API_KEY — `deepseek:*` -# FIREWORKS_API_KEY — `fireworks:*` -# NVIDIA_API_KEY — `nvidia:*` -# OLLAMA_API_KEY — `ollama:*` (Ollama Cloud) -# OPENROUTER_API_KEY — `openrouter:*` -# -# OLLAMA_HOST is hardcoded to https://ollama.com below since it is not -# sensitive. -# -# To add another provider: -# 1. Add the provider SDK to the `--with` list in libs/langchain_v1/Makefile -# (under the `evals` target). -# 2. Add the API key secret to the `env` block in the job below. -# 3. Document the matching `--model` syntax above. - -name: "📊 Middleware Evals" - -on: - workflow_dispatch: - inputs: - models: - description: >- - Comma-separated model ids passed to --model. Each becomes a matrix job. 
- Examples: claude-sonnet-4-6,claude-haiku-4-5,openai:gpt-5, - google_genai:gemini-2.5-pro, groq:llama-3.3-70b-versatile, - mistralai:mistral-large-latest, xai:grok-4, - deepseek:deepseek-chat, - fireworks:accounts/fireworks/models/llama-v3p1-70b-instruct, - openrouter:anthropic/claude-sonnet-4-6 - required: true - default: "claude-sonnet-4-6" - type: string - tier: - description: "Eval tier to run" - required: false - default: "baseline" - type: choice - options: - - baseline - - hillclimb - - all - eval_category: - description: >- - Optional eval_category to filter to (e.g., "middleware/todo"). - Empty runs all categories. - required: false - default: "" - type: string - openai_reasoning_effort: - description: >- - Forwarded to ChatOpenAI when the model is OpenAI. Use `minimal` for - faster/cheaper GPT-5 runs. Ignored for non-OpenAI providers. - required: false - default: "" - type: choice - options: - - "" - - minimal - - low - - medium - - high - pytest_extra: - description: "Additional pytest args (e.g., '-k density' to filter)" - required: false - default: "" - type: string - -permissions: - contents: read - -env: - UV_FROZEN: "true" - -jobs: - parse: - name: "Parse model list" - runs-on: ubuntu-latest - outputs: - models: ${{ steps.set.outputs.models }} - steps: - - id: set - env: - MODELS_INPUT: ${{ inputs.models }} - run: | - MODELS_JSON=$(python3 -c ' - import json, os - raw = os.environ.get("MODELS_INPUT", "") - out = [m.strip() for m in raw.split(",") if m.strip()] - print(json.dumps(out)) - ') - echo "Parsed models: $MODELS_JSON" - echo "models=$MODELS_JSON" >> "$GITHUB_OUTPUT" - - evals: - name: "📊 ${{ matrix.model }} (${{ inputs.tier }})" - needs: parse - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - fail-fast: false - max-parallel: 4 - matrix: - model: ${{ fromJSON(needs.parse.outputs.models) }} - steps: - - name: "📋 Checkout" - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - - - name: "🛠️ Set up uv" - uses: 
astral-sh/setup-uv@0ca8f610542aa7f4acaf39e65cf4eb3c35091883 # v7 - with: - enable-cache: true - - - name: "📊 Run evals" - working-directory: libs/langchain_v1 - # SECURITY: every value derived from `inputs.*` / `matrix.*` is passed - # via `env:`, NOT interpolated into the `run:` script body. GitHub - # expands `${{ ... }}` textually before the shell runs, so splicing - # those expressions into a `run:` block lets a value containing `'` - # break out of the literal and execute arbitrary commands with API - # keys in scope. Passing them as env vars keeps the values out of the - # script source and lets bash quote them safely as `"$VAR"`. We also - # invoke `pytest` directly here instead of going through `make`, - # because Make's `$(VAR)` is textual expansion and would re-introduce - # the same class of injection at the Make layer. - env: - # Provider keys. Unset secrets resolve to empty strings; the SDK only - # complains if the matrix model id actually requires that provider. - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} - GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} - MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} - XAI_API_KEY: ${{ secrets.XAI_API_KEY }} - DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }} - FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} - NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} - OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }} - OLLAMA_HOST: "https://ollama.com" - OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} - # LangSmith trace recording. - LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }} - LANGSMITH_TRACING: "true" - LANGSMITH_PROJECT: "langchain-middleware-evals" - # User-controlled inputs surfaced as env vars (not script splices). 
- MODEL: ${{ matrix.model }} - TIER: ${{ inputs.tier }} - CATEGORY: ${{ inputs.eval_category }} - REASONING: ${{ inputs.openai_reasoning_effort }} - USER_EXTRA: ${{ inputs.pytest_extra }} - run: | - set -euo pipefail - - # Build the pytest arg array from env-provided inputs. Each element - # is added as a discrete word in the array so word-splitting and - # quoting are handled by bash, not by the script source. Filter - # expressions for `-m` are constructed from a fixed template and - # concatenated with user-supplied values; the values themselves are - # never interpreted as shell. - PYTEST_ARGS=(-v --tb=short --model "$MODEL") - - MARKER="" - if [ "$TIER" != "all" ]; then - MARKER="eval_tier(\"$TIER\")" - fi - if [ -n "$CATEGORY" ]; then - if [ -n "$MARKER" ]; then - MARKER="$MARKER and eval_category(\"$CATEGORY\")" - else - MARKER="eval_category(\"$CATEGORY\")" - fi - fi - if [ -n "$MARKER" ]; then - PYTEST_ARGS+=(-m "$MARKER") - fi - if [ -n "$REASONING" ]; then - PYTEST_ARGS+=("--openai-reasoning-effort=$REASONING") - fi - if [ -n "$USER_EXTRA" ]; then - # USER_EXTRA is split on whitespace into discrete args. Users with - # workflow_dispatch access who want spaces inside a single arg can - # escape with quotes inside the input, but the typical use is - # things like `-k density`. 
- # shellcheck disable=SC2086 - read -r -a USER_ARGS <<< "$USER_EXTRA" - PYTEST_ARGS+=("${USER_ARGS[@]}") - fi - - printf 'Model: %s\n' "$MODEL" - printf 'Tier: %s\n' "$TIER" - printf 'Category: %s\n' "${CATEGORY:-(all)}" - printf 'Reasoning: %s\n' "${REASONING:-(default)}" - printf 'Pytest args: %s\n' "${PYTEST_ARGS[*]}" - - uv run --group test \ - --with langchain-anthropic \ - --with langchain-deepseek \ - --with langchain-fireworks \ - --with langchain-google-genai \ - --with langchain-groq \ - --with langchain-mistralai \ - --with langchain-nvidia-ai-endpoints \ - --with langchain-ollama \ - --with langchain-openai \ - --with langchain-openrouter \ - --with langchain-xai \ - pytest tests/evals "${PYTEST_ARGS[@]}" diff --git a/libs/langchain_v1/langchain/agents/middleware/todo.py b/libs/langchain_v1/langchain/agents/middleware/todo.py index 2cb0b94154f..0513f4caf11 100644 --- a/libs/langchain_v1/langchain/agents/middleware/todo.py +++ b/libs/langchain_v1/langchain/agents/middleware/todo.py @@ -49,7 +49,7 @@ class WriteTodosInput(BaseModel): todos: list[Todo] -WRITE_TODOS_TOOL_DESCRIPTION = """Use this tool to create and manage a structured task list for your current work session. This helps you track progress, organize complex tasks, and demonstrate thoroughness to the user. +WRITE_TODOS_TOOL_DESCRIPTION = """Use this tool to create and manage a structured task list for your current work session. This helps you track progress and organize complex tasks. Only use this tool if you think it will be helpful in staying organized. If the user's request is trivial and takes less than 3 steps, it is better to NOT use this tool and just do the task directly. @@ -91,7 +91,7 @@ It is important to skip using this tool when: - Complete current tasks before starting new ones - Remove tasks that are no longer relevant from the list entirely - IMPORTANT: When you write this todo list, you should mark your first task (or tasks) as in_progress immediately!. 
- - IMPORTANT: Unless all tasks are completed, you should always have at least one task in_progress to show the user that you are working on something. + - IMPORTANT: Unless all tasks are completed, you should always have at least one task in_progress. 3. **Task Completion Requirements**: - ONLY mark a task as completed when you have FULLY accomplished it @@ -109,13 +109,17 @@ It is important to skip using this tool when: - Break complex tasks into smaller, manageable steps - Use clear, descriptive task names -Being proactive with task management demonstrates attentiveness and ensures you complete all requirements successfully -Remember: If you only need to make a few tool calls to complete a task, and it is clear what you need to do, it is better to just do the task directly and NOT call this tool at all.""" # noqa: E501 +Being proactive with task management ensures you complete all requirements successfully. +Remember: If you only need to make a few tool calls to complete a task, and it is clear what you need to do, it is better to just do the task directly and NOT call this tool at all. + +## When You Finish + +`write_todos` tracks your work; it does not deliver the answer. Whatever the user asked for — computations, summaries, comparisons, data — must appear as text content in a message after your final `write_todos` call. Marking the last todo complete is not itself an answer to the user.""" # noqa: E501 WRITE_TODOS_SYSTEM_PROMPT = """## `write_todos` You have access to the `write_todos` tool to help you manage and plan complex objectives. -Use this tool for complex objectives to ensure that you are tracking each necessary step and giving the user visibility into your progress. +Use this tool for complex objectives to ensure that you are tracking each necessary step. This tool is very helpful for planning complex objectives, and for breaking down these larger complex objectives into smaller steps. 
It is critical that you mark todos as completed as soon as you are done with a step. Do not batch up multiple steps before marking them as completed. @@ -125,7 +129,11 @@ Writing todos takes time and tokens, use it when it is helpful for managing comp ## Important To-Do List Usage Notes to Remember - The `write_todos` tool should never be called multiple times in parallel. -- Don't be afraid to revise the To-Do list as you go. New information may reveal new tasks that need to be done, or old tasks that are irrelevant.""" # noqa: E501 +- Don't be afraid to revise the To-Do list as you go. New information may reveal new tasks that need to be done, or old tasks that are irrelevant. + +## Finishing a task + +When you finish all work, write your final answer in the message AFTER your last `write_todos` call — not in the same turn as that call. Start the final message with the substantive content the user asked for — the data, computation, summary, or analysis. The user wants the result, not confirmation that the work is done.""" # noqa: E501 @tool(description=WRITE_TODOS_TOOL_DESCRIPTION)