# langchain/.github/workflows/middleware_evals.yml

# Real-model evals for langchain agent middleware.
#
# Triggered manually via workflow_dispatch — these tests call live model APIs
# and incur cost, so they are not run on every PR. Use this workflow when
# making changes to middleware prompts (e.g., WRITE_TODOS_SYSTEM_PROMPT) or
# other behavior-affecting middleware code to confirm the change works against
# real models.
#
# Pass a comma-separated list of model ids to run the suite across providers
# in parallel. Each model id is a separate matrix job.
#
# Required repository secrets (only set the ones for providers you actually
# evaluate against — unset providers just can't be picked):
#   LANGSMITH_API_KEY      — trace/experiment recording (always set this)
#   ANTHROPIC_API_KEY      — `claude-*` ids or `anthropic:claude-*`
#   OPENAI_API_KEY         — `gpt-*` ids or `openai:*`
#   GOOGLE_API_KEY         — `google_genai:gemini-*`
#   GROQ_API_KEY           — `groq:*`
#   MISTRAL_API_KEY        — `mistralai:*`
#   XAI_API_KEY            — `xai:grok-*`
#   DEEPSEEK_API_KEY       — `deepseek:*`
#   FIREWORKS_API_KEY      — `fireworks:*`
#   NVIDIA_API_KEY         — `nvidia:*`
#   OLLAMA_API_KEY         — `ollama:*` (Ollama Cloud)
#   OPENROUTER_API_KEY     — `openrouter:*`
#
# OLLAMA_HOST is hardcoded to https://ollama.com below since it is not
# sensitive.
#
# To add another provider:
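#   1. Add the provider SDK to the `--with` list of the `uv run` command in
#      the "Run evals" step below (this workflow calls pytest directly, not
#      `make`), and mirror it in the `evals` target of
#      libs/langchain_v1/Makefile.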
#   2. Add the API key secret to the `env` block in the job below.
#   3. Document the matching `--model` syntax above.

# Workflow name as displayed in the Actions UI.
name: "📊 Middleware Evals"

# Manual trigger only: a run starts when someone dispatches the workflow and
# fills in the inputs below.
on:
  workflow_dispatch:
    inputs:
      # Split on commas by the `parse` job; every non-empty id becomes one
      # `evals` matrix job and is forwarded to pytest as `--model <id>`.
      models:
        description: >-
          Comma-separated model ids passed to --model. Each becomes a matrix job.
          Examples: claude-sonnet-4-6,claude-haiku-4-5,openai:gpt-5,
          google_genai:gemini-2.5-pro, groq:llama-3.3-70b-versatile,
          mistralai:mistral-large-latest, xai:grok-4,
          deepseek:deepseek-chat,
          fireworks:accounts/fireworks/models/llama-v3p1-70b-instruct,
          openrouter:anthropic/claude-sonnet-4-6
        required: true
        default: "claude-sonnet-4-6"
        type: string
      # Becomes an `eval_tier("<tier>")` term in the pytest `-m` expression;
      # `all` adds no tier term.
      tier:
        description: "Eval tier to run"
        required: false
        default: "baseline"
        type: choice
        options:
          - baseline
          - hillclimb
          - all
      # When non-empty, adds an `eval_category("<value>")` term to the pytest
      # `-m` expression (and-ed with the tier term if there is one).
      eval_category:
        description: >-
          Optional eval_category to filter to (e.g., "middleware/todo").
          Empty runs all categories.
        required: false
        default: ""
        type: string
      # When non-empty, forwarded to pytest as
      # `--openai-reasoning-effort=<value>`; the empty option leaves the flag
      # off.
      openai_reasoning_effort:
        description: >-
          Forwarded to ChatOpenAI when the model is OpenAI. Use `minimal` for
          faster/cheaper GPT-5 runs. Ignored for non-OpenAI providers.
        required: false
        default: ""
        type: choice
        options:
          - ""
          - minimal
          - low
          - medium
          - high
      # Appended to the pytest command line after the generated arguments.
      pytest_extra:
        description: "Additional pytest args (e.g., '-k density' to filter)"
        required: false
        default: ""
        type: string

# Least privilege: the workflow token is limited to read access on repository
# contents.
permissions:
  contents: read

# Workflow-level environment, applied to every job below.
env:
  # uv reads UV_FROZEN as its `--frozen` switch (per uv's environment-variable
  # reference): use the committed uv.lock as-is instead of updating it.
  UV_FROZEN: "true"

jobs:
  # Turns the comma-separated `models` input into a JSON array that the
  # `evals` job expands into its matrix via `fromJSON`.
  parse:
    name: "Parse model list"
    runs-on: ubuntu-latest
    # Pure string processing; anything beyond a few seconds means a hung runner.
    timeout-minutes: 5
    outputs:
      # JSON array of model ids, e.g. ["claude-sonnet-4-6","openai:gpt-5"].
      models: ${{ steps.set.outputs.models }}
    steps:
      - id: set
        name: "🧮 Build model matrix"
        env:
          # Passed through the environment instead of being spliced into the
          # script text, so the input is never interpreted as shell.
          MODELS_INPUT: ${{ inputs.models }}
        run: |
          set -euo pipefail

          # Split on commas, trim whitespace, and drop empty entries (so
          # stray or trailing commas are tolerated). `json.dumps` emits a
          # single line, which is what the `key=value` form of GITHUB_OUTPUT
          # requires.
          MODELS_JSON=$(python3 -c '
          import json, os
          raw = os.environ.get("MODELS_INPUT", "")
          out = [m.strip() for m in raw.split(",") if m.strip()]
          print(json.dumps(out))
          ')

          # Fail here with a readable message: an empty list would otherwise
          # only surface later as a matrix evaluation failure in `evals`.
          if [ "$MODELS_JSON" = "[]" ]; then
            echo "::error::The 'models' input contains no model ids."
            exit 1
          fi

          echo "Parsed models: $MODELS_JSON"
          echo "models=$MODELS_JSON" >> "$GITHUB_OUTPUT"

  # Runs the eval suite against live model APIs, one matrix job per model id
  # produced by the `parse` job.
  evals:
    name: "📊 ${{ matrix.model }} (${{ inputs.tier }})"
    needs: parse
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      # Let every model finish even if another model's job fails.
      fail-fast: false
      max-parallel: 4
      matrix:
        model: ${{ fromJSON(needs.parse.outputs.models) }}
    steps:
      - name: "📋 Checkout"
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
        with:
          # No later step talks to git, and the eval step runs pytest with
          # user-supplied arguments, so do not leave the token on disk.
          persist-credentials: false

      - name: "🛠️ Set up uv"
        uses: astral-sh/setup-uv@0ca8f610542aa7f4acaf39e65cf4eb3c35091883 # v7
        with:
          enable-cache: true

      - name: "📊 Run evals"
        working-directory: libs/langchain_v1
        # SECURITY: every value derived from `inputs.*` / `matrix.*` is passed
        # via `env:`, NOT interpolated into the `run:` script body. GitHub
        # expands `${{ ... }}` textually before the shell runs, so splicing
        # those expressions into a `run:` block lets a value containing `'`
        # break out of the literal and execute arbitrary commands with API
        # keys in scope. Passing them as env vars keeps the values out of the
        # script source and lets bash quote them safely as `"$VAR"`. We also
        # invoke `pytest` directly here instead of going through `make`,
        # because Make's `$(VAR)` is textual expansion and would re-introduce
        # the same class of injection at the Make layer.
        env:
          # Provider keys. Unset secrets resolve to empty strings; the SDK only
          # complains if the matrix model id actually requires that provider.
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
          XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
          OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
          # Not a secret, so it is hardcoded rather than read from `secrets`.
          OLLAMA_HOST: "https://ollama.com"
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          # LangSmith trace recording.
          LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }}
          LANGSMITH_TRACING: "true"
          LANGSMITH_PROJECT: "langchain-middleware-evals"
          # User-controlled inputs surfaced as env vars (not script splices).
          MODEL: ${{ matrix.model }}
          TIER: ${{ inputs.tier }}
          CATEGORY: ${{ inputs.eval_category }}
          REASONING: ${{ inputs.openai_reasoning_effort }}
          USER_EXTRA: ${{ inputs.pytest_extra }}
        run: |
          set -euo pipefail

          # Build the pytest arg array from env-provided inputs. Each element
          # is added as a discrete word in the array so word-splitting and
          # quoting are handled by bash, not by the script source. Filter
          # expressions for `-m` are constructed from a fixed template and
          # concatenated with user-supplied values; the values themselves are
          # never interpreted as shell.
          PYTEST_ARGS=(-v --tb=short --model "$MODEL")

          # Marker expression: the tier term is skipped for `all`, the
          # category term is skipped when no category was given, and the two
          # are joined with `and` when both are present.
          MARKER=""
          if [ "$TIER" != "all" ]; then
            MARKER="eval_tier(\"$TIER\")"
          fi
          if [ -n "$CATEGORY" ]; then
            if [ -n "$MARKER" ]; then
              MARKER="$MARKER and eval_category(\"$CATEGORY\")"
            else
              MARKER="eval_category(\"$CATEGORY\")"
            fi
          fi
          if [ -n "$MARKER" ]; then
            PYTEST_ARGS+=(-m "$MARKER")
          fi
          if [ -n "$REASONING" ]; then
            PYTEST_ARGS+=("--openai-reasoning-effort=$REASONING")
          fi
          if [ -n "$USER_EXTRA" ]; then
            # USER_EXTRA is tokenized with POSIX shell quoting rules by
            # Python's `shlex.split`: `-k density` becomes two args, and
            # `-k "density and todo"` keeps the quoted expression as a single
            # arg. The text is only split, never evaluated by a shell. An
            # unbalanced quote makes `shlex` raise; because the command
            # substitution is a plain assignment, `set -e` then aborts the
            # step instead of silently dropping the extra args.
            USER_ARGS_TEXT=$(
              python3 -c \
                'import os, shlex; print("\n".join(shlex.split(os.environ["USER_EXTRA"])))'
            )
            # Whitespace-only input tokenizes to nothing; skip it so no empty
            # argument is handed to pytest.
            if [ -n "$USER_ARGS_TEXT" ]; then
              mapfile -t USER_ARGS <<< "$USER_ARGS_TEXT"
              PYTEST_ARGS+=("${USER_ARGS[@]}")
            fi
          fi

          # Echo the resolved configuration so the job log shows exactly what
          # was run.
          printf 'Model:        %s\n' "$MODEL"
          printf 'Tier:         %s\n' "$TIER"
          printf 'Category:     %s\n' "${CATEGORY:-(all)}"
          printf 'Reasoning:    %s\n' "${REASONING:-(default)}"
          printf 'Pytest args:  %s\n' "${PYTEST_ARGS[*]}"

          # Every provider integration is layered on with `--with`, so any
          # supported model id can be resolved whichever one this matrix job
          # targets.
          uv run --group test \
            --with langchain-anthropic \
            --with langchain-deepseek \
            --with langchain-fireworks \
            --with langchain-google-genai \
            --with langchain-groq \
            --with langchain-mistralai \
            --with langchain-nvidia-ai-endpoints \
            --with langchain-ollama \
            --with langchain-openai \
            --with langchain-openrouter \
            --with langchain-xai \
            pytest tests/evals "${PYTEST_ARGS[@]}"