From 11cdce91dc4867613a8ff49fb942c5a72fe2ff96 Mon Sep 17 00:00:00 2001
From: Nick Hollon
Date: Fri, 22 May 2026 21:41:46 -0400
Subject: [PATCH] ci(infra): add middleware evals workflow for `workflow_dispatch` discovery (#37644)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fast-track companion to #37643.

GitHub's `workflow_dispatch` event is only discoverable when the workflow
file exists on the default branch — even though the workflow code that
runs comes from the `ref` passed to the dispatch. This PR lands the
`Middleware Evals` workflow file on master so that #37643 (which adds
`libs/langchain_v1/tests/evals/`) can be dispatched against the feature
branch via:

```bash
gh workflow run middleware_evals.yml \
  --ref nh/todo-middleware-loop-contract \
  --field models='claude-sonnet-4-6,...'
```

without first merging the full eval framework.

## Caveats

- The workflow's pytest invocation depends on
  `libs/langchain_v1/tests/evals/` and the partner SDK list, neither of
  which exists on master yet. Dispatching with `--ref master` before
  #37643 lands will fail at pytest collection. That's the intended
  behavior — the workflow's purpose is to dispatch against branches that
  ship the eval suite.
- Once #37643 merges to master, this workflow file already matches what
  #37643 adds. The merge will be a no-op for `middleware_evals.yml`
  itself.
---
 .github/workflows/middleware_evals.yml | 245 +++++++++++++++++++++++++
 1 file changed, 245 insertions(+)
 create mode 100644 .github/workflows/middleware_evals.yml

diff --git a/.github/workflows/middleware_evals.yml b/.github/workflows/middleware_evals.yml
new file mode 100644
index 00000000000..a036f585467
--- /dev/null
+++ b/.github/workflows/middleware_evals.yml
@@ -0,0 +1,245 @@
+# Real-model evals for langchain agent middleware.
+#
+# Triggered manually via workflow_dispatch — these tests call live model APIs
+# and incur cost, so they are not run on every PR. Use this workflow when
+# making changes to middleware prompts (e.g., WRITE_TODOS_SYSTEM_PROMPT) or
+# other behavior-affecting middleware code to confirm the change works against
+# real models.
+#
+# Pass a comma-separated list of model ids to run the suite across providers
+# in parallel. Each model id is a separate matrix job.
+#
+# Required repository secrets (only set the ones for providers you actually
+# evaluate against — unset providers just can't be picked):
+#   LANGSMITH_API_KEY   — trace/experiment recording (always set this)
+#   ANTHROPIC_API_KEY   — `claude-*` ids or `anthropic:claude-*`
+#   OPENAI_API_KEY      — `gpt-*` ids or `openai:*`
+#   GOOGLE_API_KEY      — `google_genai:gemini-*`
+#   GROQ_API_KEY        — `groq:*`
+#   MISTRAL_API_KEY     — `mistralai:*`
+#   XAI_API_KEY         — `xai:grok-*`
+#   DEEPSEEK_API_KEY    — `deepseek:*`
+#   FIREWORKS_API_KEY   — `fireworks:*`
+#   NVIDIA_API_KEY      — `nvidia:*`
+#   OLLAMA_API_KEY      — `ollama:*` (Ollama Cloud)
+#   OPENROUTER_API_KEY  — `openrouter:*`
+#
+# OLLAMA_HOST is hardcoded to https://ollama.com below since it is not
+# sensitive.
+#
+# To add another provider:
+#   1. Add the provider SDK to the `--with` list in libs/langchain_v1/Makefile
+#      (under the `evals` target).
+#   2. Add the API key secret to the `env` block in the job below.
+#   3. Document the matching `--model` syntax above.
+
+name: "📊 Middleware Evals"
+
+on:
+  workflow_dispatch:
+    inputs:
+      models:
+        description: >-
+          Comma-separated model ids passed to --model. Each becomes a matrix job.
+          Examples: claude-sonnet-4-6,claude-haiku-4-5,openai:gpt-5,
+          google_genai:gemini-2.5-pro, groq:llama-3.3-70b-versatile,
+          mistralai:mistral-large-latest, xai:grok-4,
+          deepseek:deepseek-chat,
+          fireworks:accounts/fireworks/models/llama-v3p1-70b-instruct,
+          openrouter:anthropic/claude-sonnet-4-6
+        required: true
+        default: "claude-sonnet-4-6"
+        type: string
+      tier:
+        description: "Eval tier to run"
+        required: false
+        default: "baseline"
+        type: choice
+        options:
+          - baseline
+          - hillclimb
+          - all
+      eval_category:
+        description: >-
+          Optional eval_category to filter to (e.g., "middleware/todo").
+          Empty runs all categories.
+        required: false
+        default: ""
+        type: string
+      openai_reasoning_effort:
+        description: >-
+          Forwarded to ChatOpenAI when the model is OpenAI. Use `minimal` for
+          faster/cheaper GPT-5 runs. Ignored for non-OpenAI providers.
+        required: false
+        default: ""
+        type: choice
+        options:
+          - ""
+          - minimal
+          - low
+          - medium
+          - high
+      pytest_extra:
+        description: "Additional pytest args (e.g., '-k density' to filter)"
+        required: false
+        default: ""
+        type: string
+
+permissions:
+  contents: read
+
+env:
+  UV_FROZEN: "true"
+
+jobs:
+  # Turns the comma-separated `models` input into a JSON array that the
+  # `evals` job expands into its matrix (one job per model id).
+  parse:
+    name: "Parse model list"
+    runs-on: ubuntu-latest
+    outputs:
+      models: ${{ steps.set.outputs.models }}
+    steps:
+      - id: set
+        env:
+          MODELS_INPUT: ${{ inputs.models }}
+        run: |
+          MODELS_JSON=$(python3 -c '
+          import json, os
+          raw = os.environ.get("MODELS_INPUT", "")
+          out = [m.strip() for m in raw.split(",") if m.strip()]
+          print(json.dumps(out))
+          ')
+          echo "Parsed models: $MODELS_JSON"
+          # An empty list cannot be expanded into a matrix, so fail here with
+          # a clear message instead of an opaque error in the `evals` job.
+          if [ "$MODELS_JSON" = "[]" ]; then
+            echo "::error::No model ids found in the 'models' input."
+            exit 1
+          fi
+          echo "models=$MODELS_JSON" >> "$GITHUB_OUTPUT"
+
+  # Runs `pytest tests/evals` once per parsed model id, at most 4 at a time.
+  evals:
+    name: "📊 ${{ matrix.model }} (${{ inputs.tier }})"
+    needs: parse
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      max-parallel: 4
+      matrix:
+        model: ${{ fromJSON(needs.parse.outputs.models) }}
+    steps:
+      - name: "📋 Checkout"
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
+        with:
+          # No later step needs git credentials, so the token is not kept in
+          # the local git configuration while user-supplied eval args run.
+          persist-credentials: false
+
+      - name: "🛠️ Set up uv"
+        uses: astral-sh/setup-uv@0ca8f610542aa7f4acaf39e65cf4eb3c35091883 # v7
+        with:
+          enable-cache: true
+
+      - name: "📊 Run evals"
+        working-directory: libs/langchain_v1
+        # SECURITY: every value derived from `inputs.*` / `matrix.*` is passed
+        # via `env:`, NOT interpolated into the `run:` script body. GitHub
+        # expands `${{ ... }}` textually before the shell runs, so splicing
+        # those expressions into a `run:` block lets a value containing `'`
+        # break out of the literal and execute arbitrary commands with API
+        # keys in scope. Passing them as env vars keeps the values out of the
+        # script source and lets bash quote them safely as `"$VAR"`. We also
+        # invoke `pytest` directly here instead of going through `make`,
+        # because Make's `$(VAR)` is textual expansion and would re-introduce
+        # the same class of injection at the Make layer.
+        env:
+          # Provider keys. Unset secrets resolve to empty strings; the SDK only
+          # complains if the matrix model id actually requires that provider.
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
+          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
+          XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
+          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
+          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
+          NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
+          OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
+          OLLAMA_HOST: "https://ollama.com"
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          # LangSmith trace recording.
+          LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }}
+          LANGSMITH_TRACING: "true"
+          LANGSMITH_PROJECT: "langchain-middleware-evals"
+          # User-controlled inputs surfaced as env vars (not script splices).
+          MODEL: ${{ matrix.model }}
+          TIER: ${{ inputs.tier }}
+          CATEGORY: ${{ inputs.eval_category }}
+          REASONING: ${{ inputs.openai_reasoning_effort }}
+          USER_EXTRA: ${{ inputs.pytest_extra }}
+        run: |
+          set -euo pipefail
+
+          # Build the pytest arg array from env-provided inputs. Each element
+          # is added as a discrete word in the array so word-splitting and
+          # quoting are handled by bash, not by the script source. Filter
+          # expressions for `-m` are constructed from a fixed template and
+          # concatenated with user-supplied values; the values themselves are
+          # never interpreted as shell.
+          PYTEST_ARGS=(-v --tb=short --model "$MODEL")
+
+          MARKER=""
+          if [ "$TIER" != "all" ]; then
+            MARKER="eval_tier(\"$TIER\")"
+          fi
+          if [ -n "$CATEGORY" ]; then
+            if [ -n "$MARKER" ]; then
+              MARKER="$MARKER and eval_category(\"$CATEGORY\")"
+            else
+              MARKER="eval_category(\"$CATEGORY\")"
+            fi
+          fi
+          if [ -n "$MARKER" ]; then
+            PYTEST_ARGS+=(-m "$MARKER")
+          fi
+          if [ -n "$REASONING" ]; then
+            PYTEST_ARGS+=("--openai-reasoning-effort=$REASONING")
+          fi
+          if [ -n "$USER_EXTRA" ]; then
+            # USER_EXTRA is tokenized with shell-style quoting rules (Python's
+            # shlex.split): `-k density` becomes two args, and
+            # `-k "density and not slow"` keeps the quoted expression as one
+            # arg. The tokens are only appended to the array; the shell never
+            # evaluates them. If shlex rejects the input (e.g. unbalanced
+            # quotes), the failed substitution aborts the step under `set -e`.
+            USER_ARGS_RAW=$(
+              python3 -c 'import os, shlex; print("\n".join(shlex.split(os.environ["USER_EXTRA"])))'
+            )
+            if [ -n "$USER_ARGS_RAW" ]; then
+              mapfile -t USER_ARGS <<< "$USER_ARGS_RAW"
+              PYTEST_ARGS+=("${USER_ARGS[@]}")
+            fi
+          fi
+
+          printf 'Model:       %s\n' "$MODEL"
+          printf 'Tier:        %s\n' "$TIER"
+          printf 'Category:    %s\n' "${CATEGORY:-(all)}"
+          printf 'Reasoning:   %s\n' "${REASONING:-(default)}"
+          printf 'Pytest args: %s\n' "${PYTEST_ARGS[*]}"
+
+          uv run --group test \
+            --with langchain-anthropic \
+            --with langchain-deepseek \
+            --with langchain-fireworks \
+            --with langchain-google-genai \
+            --with langchain-groq \
+            --with langchain-mistralai \
+            --with langchain-nvidia-ai-endpoints \
+            --with langchain-ollama \
+            --with langchain-openai \
+            --with langchain-openrouter \
+            --with langchain-xai \
+            pytest tests/evals "${PYTEST_ARGS[@]}"