mirror of
https://github.com/hwchase17/langchain.git
synced 2026-06-09 10:17:00 +00:00
ci(infra): add middleware evals workflow for workflow_dispatch discovery (#37644)
Fast-track companion to #37643. GitHub's `workflow_dispatch` event is only discoverable when the workflow file exists on the default branch — even though the workflow code that runs comes from the `ref` passed to the dispatch. This PR lands the `Middleware Evals` workflow file on master so that #37643 (which adds `libs/langchain_v1/tests/evals/`) can be dispatched against the feature branch via: ```bash gh workflow run middleware_evals.yml \ --ref nh/todo-middleware-loop-contract \ --field models='claude-sonnet-4-6,...' ``` without first merging the full eval framework. ## Caveats - The workflow's pytest invocation depends on `libs/langchain_v1/tests/evals/` and the partner SDK list, neither of which exists on master yet. Dispatching with `--ref master` before #37643 lands will fail at pytest collection. That's the intended behavior — the workflow's purpose is to dispatch against branches that ship the eval suite. - Once #37643 merges to master, this workflow file already matches what #37643 adds. The merge will be a no-op for `middleware_evals.yml` itself.
This commit is contained in:
226
.github/workflows/middleware_evals.yml
vendored
Normal file
226
.github/workflows/middleware_evals.yml
vendored
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
# Real-model evals for langchain agent middleware.
|
||||||
|
#
|
||||||
|
# Triggered manually via workflow_dispatch — these tests call live model APIs
|
||||||
|
# and incur cost, so they are not run on every PR. Use this workflow when
|
||||||
|
# making changes to middleware prompts (e.g., WRITE_TODOS_SYSTEM_PROMPT) or
|
||||||
|
# other behavior-affecting middleware code to confirm the change works against
|
||||||
|
# real models.
|
||||||
|
#
|
||||||
|
# Pass a comma-separated list of model ids to run the suite across providers
|
||||||
|
# in parallel. Each model id is a separate matrix job.
|
||||||
|
#
|
||||||
|
# Required repository secrets (only set the ones for providers you actually
|
||||||
|
# evaluate against — unset providers just can't be picked):
|
||||||
|
# LANGSMITH_API_KEY — trace/experiment recording (always set this)
|
||||||
|
# ANTHROPIC_API_KEY — `claude-*` ids or `anthropic:claude-*`
|
||||||
|
# OPENAI_API_KEY — `gpt-*` ids or `openai:*`
|
||||||
|
# GOOGLE_API_KEY — `google_genai:gemini-*`
|
||||||
|
# GROQ_API_KEY — `groq:*`
|
||||||
|
# MISTRAL_API_KEY — `mistralai:*`
|
||||||
|
# XAI_API_KEY — `xai:grok-*`
|
||||||
|
# DEEPSEEK_API_KEY — `deepseek:*`
|
||||||
|
# FIREWORKS_API_KEY — `fireworks:*`
|
||||||
|
# NVIDIA_API_KEY — `nvidia:*`
|
||||||
|
# OLLAMA_API_KEY — `ollama:*` (Ollama Cloud)
|
||||||
|
# OPENROUTER_API_KEY — `openrouter:*`
|
||||||
|
#
|
||||||
|
# OLLAMA_HOST is hardcoded to https://ollama.com below since it is not
|
||||||
|
# sensitive.
|
||||||
|
#
|
||||||
|
# To add another provider:
|
||||||
|
# 1. Add the provider SDK to the `--with` list in the "Run evals" step below
|
||||||
|
# (and to the `evals` target in libs/langchain_v1/Makefile to keep it in sync).
|
||||||
|
# 2. Add the API key secret to the `env` block in the job below.
|
||||||
|
# 3. Document the matching `--model` syntax above.
|
||||||
|
|
||||||
|
# Display name of the workflow.
name: '📊 Middleware Evals'
|
||||||
|
|
||||||
|
# Manual trigger only: every input below is supplied by the person dispatching
# the run and is consumed by the jobs further down.
on:
  workflow_dispatch:
    inputs:
      # Comma-separated list; split into one matrix job per id.
      models:
        description: >-
          Comma-separated model ids passed to --model.
          Each becomes a matrix job.
          Examples: claude-sonnet-4-6,claude-haiku-4-5,openai:gpt-5,
          google_genai:gemini-2.5-pro, groq:llama-3.3-70b-versatile,
          mistralai:mistral-large-latest, xai:grok-4, deepseek:deepseek-chat,
          fireworks:accounts/fireworks/models/llama-v3p1-70b-instruct,
          openrouter:anthropic/claude-sonnet-4-6
        required: true
        default: 'claude-sonnet-4-6'
        type: string

      # Selects the `eval_tier` marker filter; `all` applies no tier filter.
      tier:
        description: 'Eval tier to run'
        required: false
        default: 'baseline'
        type: choice
        options:
          - baseline
          - hillclimb
          - all

      # Optional `eval_category` marker filter; empty means no category filter.
      eval_category:
        description: >-
          Optional eval_category to filter to (e.g., "middleware/todo").
          Empty runs all categories.
        required: false
        default: ''
        type: string

      # Forwarded as `--openai-reasoning-effort=<value>` when non-empty.
      openai_reasoning_effort:
        description: >-
          Forwarded to ChatOpenAI when the model is OpenAI.
          Use `minimal` for faster/cheaper GPT-5 runs.
          Ignored for non-OpenAI providers.
        required: false
        default: ''
        type: choice
        options:
          - ''
          - minimal
          - low
          - medium
          - high

      # Extra pytest arguments, split on whitespace before being appended.
      pytest_extra:
        description: >-
          Additional pytest args (e.g., '-k density' to filter)
        required: false
        default: ''
        type: string
|
||||||
|
|
||||||
|
# The workflow token is limited to read-only access to repository contents.
permissions:
  contents: read

# Workflow-wide environment. UV_FROZEN is kept as a quoted string because
# environment values are strings; it presumably makes uv use the existing
# lockfile without updating it (see uv's `--frozen`).
env:
  UV_FROZEN: 'true'
|
||||||
|
|
||||||
|
jobs:
  # Converts the comma-separated `models` input into a JSON array that the
  # `evals` job expands into its matrix.
  parse:
    name: "Parse model list"
    runs-on: ubuntu-latest
    outputs:
      models: ${{ steps.set.outputs.models }}
    steps:
      - id: set
        env:
          # Passed via the environment so the raw input never becomes part of
          # the script source.
          MODELS_INPUT: ${{ inputs.models }}
        run: |
          # Split on commas, trim whitespace, drop empty entries, emit JSON.
          MODELS_JSON=$(python3 -c '
          import json, os
          raw = os.environ.get("MODELS_INPUT", "")
          out = [m.strip() for m in raw.split(",") if m.strip()]
          print(json.dumps(out))
          ')
          echo "Parsed models: $MODELS_JSON"
          # An input such as "," or only whitespace yields an empty list, which
          # cannot be expanded into a matrix. Fail here with a clear message
          # instead of letting the `evals` job error out at matrix evaluation.
          if [ "$MODELS_JSON" = "[]" ]; then
            echo "::error::No model ids found in the 'models' input."
            exit 1
          fi
          echo "models=$MODELS_JSON" >> "$GITHUB_OUTPUT"

  # Runs the eval suite once per parsed model id.
  evals:
    name: "📊 ${{ matrix.model }} (${{ inputs.tier }})"
    needs: parse
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      # One failing model must not cancel the remaining matrix jobs.
      fail-fast: false
      max-parallel: 4
      matrix:
        model: ${{ fromJSON(needs.parse.outputs.models) }}
    steps:
      - name: "📋 Checkout"
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6
        with:
          # No later step uses git credentials, so do not leave the token in
          # the local git config while branch-supplied test code runs.
          persist-credentials: false

      - name: "🛠️ Set up uv"
        uses: astral-sh/setup-uv@0ca8f610542aa7f4acaf39e65cf4eb3c35091883  # v7
        with:
          enable-cache: true

      - name: "📊 Run evals"
        working-directory: libs/langchain_v1
        # SECURITY: every value derived from `inputs.*` / `matrix.*` is passed
        # via `env:`, NOT interpolated into the `run:` script body. GitHub
        # expands `${{ ... }}` textually before the shell runs, so splicing
        # those expressions into a `run:` block lets a value containing `'`
        # break out of the literal and execute arbitrary commands with API
        # keys in scope. Passing them as env vars keeps the values out of the
        # script source and lets bash quote them safely as `"$VAR"`. We also
        # invoke `pytest` directly here instead of going through `make`,
        # because Make's `$(VAR)` is textual expansion and would re-introduce
        # the same class of injection at the Make layer.
        env:
          # Provider keys. Unset secrets resolve to empty strings; the SDK only
          # complains if the matrix model id actually requires that provider.
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
          XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
          OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
          OLLAMA_HOST: "https://ollama.com"
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          # LangSmith trace recording.
          LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }}
          LANGSMITH_TRACING: "true"
          LANGSMITH_PROJECT: "langchain-middleware-evals"
          # User-controlled inputs surfaced as env vars (not script splices).
          MODEL: ${{ matrix.model }}
          TIER: ${{ inputs.tier }}
          CATEGORY: ${{ inputs.eval_category }}
          REASONING: ${{ inputs.openai_reasoning_effort }}
          USER_EXTRA: ${{ inputs.pytest_extra }}
        run: |
          set -euo pipefail

          # Build the pytest arg array from env-provided inputs. Each element
          # is added as a discrete word in the array so word-splitting and
          # quoting are handled by bash, not by the script source. Filter
          # expressions for `-m` are constructed from a fixed template and
          # concatenated with user-supplied values; the values themselves are
          # never interpreted as shell.
          PYTEST_ARGS=(-v --tb=short --model "$MODEL")

          # Combine the optional tier and category filters into one `-m`
          # expression; `all` means no tier filter.
          MARKER=""
          if [ "$TIER" != "all" ]; then
            MARKER="eval_tier(\"$TIER\")"
          fi
          if [ -n "$CATEGORY" ]; then
            if [ -n "$MARKER" ]; then
              MARKER="$MARKER and eval_category(\"$CATEGORY\")"
            else
              MARKER="eval_category(\"$CATEGORY\")"
            fi
          fi
          if [ -n "$MARKER" ]; then
            PYTEST_ARGS+=(-m "$MARKER")
          fi
          if [ -n "$REASONING" ]; then
            PYTEST_ARGS+=("--openai-reasoning-effort=$REASONING")
          fi
          if [ -n "$USER_EXTRA" ]; then
            # USER_EXTRA is split on whitespace into discrete args. `read -r -a`
            # does not interpret quotes or backslash escapes, so a single arg
            # cannot contain spaces; quote characters in the input are passed
            # to pytest literally. The typical use is things like `-k density`.
            read -r -a USER_ARGS <<< "$USER_EXTRA"
            PYTEST_ARGS+=("${USER_ARGS[@]}")
          fi

          printf 'Model: %s\n' "$MODEL"
          printf 'Tier: %s\n' "$TIER"
          printf 'Category: %s\n' "${CATEGORY:-(all)}"
          printf 'Reasoning: %s\n' "${REASONING:-(default)}"
          printf 'Pytest args: %s\n' "${PYTEST_ARGS[*]}"

          uv run --group test \
            --with langchain-anthropic \
            --with langchain-deepseek \
            --with langchain-fireworks \
            --with langchain-google-genai \
            --with langchain-groq \
            --with langchain-mistralai \
            --with langchain-nvidia-ai-endpoints \
            --with langchain-ollama \
            --with langchain-openai \
            --with langchain-openrouter \
            --with langchain-xai \
            pytest tests/evals "${PYTEST_ARGS[@]}"
|
||||||
Reference in New Issue
Block a user