Add progress bar + runner fixes (#10348)

- Add progress bar to eval runs
- Use thread pool for concurrency
- Update some error messages
- Friendlier project name
- Print out quantiles of the final stats 

Closes LS-902
William FH 2023-09-08 07:45:28 -07:00 committed by GitHub
parent 0672533b3e
commit 46e9abdc75
8 changed files with 1098 additions and 515 deletions


@@ -2,29 +2,20 @@
 from __future__ import annotations
 
 import logging
-from concurrent.futures import Future, ThreadPoolExecutor, wait
+from concurrent.futures import Future, ThreadPoolExecutor
 from typing import Any, Dict, List, Optional, Sequence, Set, Union
 from uuid import UUID
 
 import langsmith
 from langsmith import schemas as langsmith_schemas
 
-from langchain.callbacks.manager import tracing_v2_enabled
+from langchain.callbacks import manager
+from langchain.callbacks.tracers import langchain as langchain_tracer
 from langchain.callbacks.tracers.base import BaseTracer
-from langchain.callbacks.tracers.langchain import _get_client
 from langchain.callbacks.tracers.schemas import Run
 
 logger = logging.getLogger(__name__)
 
-_TRACERS: List[EvaluatorCallbackHandler] = []
-
-
-def wait_for_all_evaluators() -> None:
-    """Wait for all tracers to finish."""
-    global _TRACERS
-    for tracer in _TRACERS:
-        tracer.wait_for_futures()
-
 
 class EvaluatorCallbackHandler(BaseTracer):
     """A tracer that runs a run evaluator whenever a run is persisted.
@@ -79,17 +70,13 @@ class EvaluatorCallbackHandler(BaseTracer):
         self.example_id = (
             UUID(example_id) if isinstance(example_id, str) else example_id
         )
-        self.client = client or _get_client()
+        self.client = client or langchain_tracer.get_client()
         self.evaluators = evaluators
-        self.executor = ThreadPoolExecutor(
-            max_workers=max(max_workers or len(evaluators), 1)
-        )
+        self.max_workers = max_workers or len(evaluators)
         self.futures: Set[Future] = set()
         self.skip_unfinished = skip_unfinished
         self.project_name = project_name
         self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {}
-        global _TRACERS
-        _TRACERS.append(self)
 
     def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None:
         """Evaluate the run in the project.
@@ -105,7 +92,7 @@ class EvaluatorCallbackHandler(BaseTracer):
         try:
             if self.project_name is None:
                 feedback = self.client.evaluate_run(run, evaluator)
-            with tracing_v2_enabled(
+            with manager.tracing_v2_enabled(
                 project_name=self.project_name, tags=["eval"], client=self.client
             ):
                 feedback = self.client.evaluate_run(run, evaluator)
@@ -133,14 +120,15 @@ class EvaluatorCallbackHandler(BaseTracer):
             return
         run_ = run.copy()
         run_.reference_example_id = self.example_id
-        for evaluator in self.evaluators:
-            self.futures.add(
-                self.executor.submit(self._evaluate_in_project, run_, evaluator)
-            )
-
-    def wait_for_futures(self) -> None:
-        """Wait for all futures to complete."""
-        futures = list(self.futures)
-        wait(futures)
-        for future in futures:
-            self.futures.remove(future)
+        if self.max_workers > 0:
+            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                list(
+                    executor.map(
+                        self._evaluate_in_project,
+                        [run_ for _ in range(len(self.evaluators))],
+                        self.evaluators,
+                    )
+                )
+        else:
+            for evaluator in self.evaluators:
+                self._evaluate_in_project(run_, evaluator)
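For context, here is a minimal standalone sketch of the fan-out pattern the new _persist_run uses: the same run is paired with each evaluator via ThreadPoolExecutor.map inside a scoped pool, with a plain loop as the max_workers=0 fallback. The names below (evaluate_run, evaluate_all, the dict-based run) are illustrative stand-ins, not the actual LangChain/LangSmith objects.

from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, Dict, List

Evaluator = Callable[[Dict[str, Any]], Any]


def evaluate_run(run: Dict[str, Any], evaluator: Evaluator) -> Any:
    # Stand-in for EvaluatorCallbackHandler._evaluate_in_project.
    return evaluator(run)


def evaluate_all(run: Dict[str, Any], evaluators: List[Evaluator], max_workers: int) -> List[Any]:
    """Fan one run out to every evaluator, mirroring the new _persist_run logic."""
    if max_workers > 0:
        # A scoped executor is created, used, and joined per persisted run,
        # so callers no longer need a separate wait_for_all_evaluators() step.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            return list(
                executor.map(
                    evaluate_run,
                    [run for _ in evaluators],  # pair the same run with each evaluator
                    evaluators,
                )
            )
    # max_workers == 0 disables threading entirely.
    return [evaluate_run(run, evaluator) for evaluator in evaluators]


if __name__ == "__main__":
    run = {"outputs": {"text": "hello"}}
    checks = [lambda r: len(r["outputs"]["text"]), lambda r: r["outputs"]["text"].upper()]
    print(evaluate_all(run, checks, max_workers=2))  # [5, 'HELLO']

The trade-off is a fresh pool per run instead of one shared executor: shutdown is implicit in the with block, at the cost of re-creating worker threads for every persisted run.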


@@ -42,7 +42,7 @@ def wait_for_all_tracers() -> None:
         tracer.wait_for_futures()
 
 
-def _get_client() -> Client:
+def get_client() -> Client:
     """Get the client."""
     global _CLIENT
     if _CLIENT is None:
@@ -83,7 +83,7 @@ class LangChainTracer(BaseTracer):
             _EXECUTORS.append(self.executor)
         else:
            self.executor = None
-        self.client = client or _get_client()
+        self.client = client or get_client()
         self._futures: Set[Future] = set()
         self.tags = tags or []
         global _TRACERS


@@ -0,0 +1,729 @@
import random
adjectives = [
"abandoned",
"aching",
"advanced",
"ample",
"artistic",
"back",
"best",
"bold",
"brief",
"clear",
"cold",
"complicated",
"cooked",
"crazy",
"crushing",
"damp",
"dear",
"definite",
"dependable",
"diligent",
"drab",
"earnest",
"elderly",
"enchanted",
"essential",
"excellent",
"extraneous",
"fixed",
"flowery",
"formal",
"fresh",
"frosty",
"giving",
"glossy",
"healthy",
"helpful",
"impressionable",
"kind",
"large",
"left",
"long",
"loyal",
"mealy",
"memorable",
"monthly",
"new",
"notable",
"only",
"ordinary",
"passionate",
"perfect",
"pertinent",
"proper",
"puzzled",
"reflecting",
"respectful",
"roasted",
"scholarly",
"shiny",
"slight",
"sparkling",
"spotless",
"stupendous",
"sunny",
"tart",
"terrific",
"timely",
"unique",
"upbeat",
"vacant",
"virtual",
"warm",
"weary",
"whispered",
"worthwhile",
"yellow",
]
nouns = [
"account",
"acknowledgment",
"address",
"advertising",
"airplane",
"animal",
"appointment",
"arrival",
"artist",
"attachment",
"attitude",
"availability",
"backpack",
"bag",
"balance",
"bass",
"bean",
"beauty",
"bibliography",
"bill",
"bite",
"blossom",
"boat",
"book",
"box",
"boy",
"bread",
"bridge",
"broccoli",
"building",
"butter",
"button",
"cabbage",
"cake",
"camera",
"camp",
"candle",
"candy",
"canvas",
"car",
"card",
"carrot",
"cart",
"case",
"cat",
"chain",
"chair",
"chalk",
"chance",
"change",
"channel",
"character",
"charge",
"charm",
"chart",
"check",
"cheek",
"cheese",
"chef",
"cherry",
"chicken",
"child",
"church",
"circle",
"class",
"clay",
"click",
"clock",
"cloth",
"cloud",
"clove",
"club",
"coach",
"coal",
"coast",
"coat",
"cod",
"coffee",
"collar",
"color",
"comb",
"comfort",
"comic",
"committee",
"community",
"company",
"comparison",
"competition",
"condition",
"connection",
"control",
"cook",
"copper",
"copy",
"corn",
"cough",
"country",
"cover",
"crate",
"crayon",
"cream",
"creator",
"crew",
"crown",
"current",
"curtain",
"curve",
"cushion",
"dad",
"daughter",
"day",
"death",
"debt",
"decision",
"deer",
"degree",
"design",
"desire",
"desk",
"detail",
"development",
"digestion",
"dime",
"dinner",
"direction",
"dirt",
"discovery",
"discussion",
"disease",
"disgust",
"distance",
"distribution",
"division",
"doctor",
"dog",
"door",
"drain",
"drawer",
"dress",
"drink",
"driving",
"dust",
"ear",
"earth",
"edge",
"education",
"effect",
"egg",
"end",
"energy",
"engine",
"error",
"event",
"example",
"exchange",
"existence",
"expansion",
"experience",
"expert",
"eye",
"face",
"fact",
"fall",
"family",
"farm",
"father",
"fear",
"feeling",
"field",
"finger",
"fire",
"fish",
"flag",
"flight",
"floor",
"flower",
"fold",
"food",
"football",
"force",
"form",
"frame",
"friend",
"frog",
"fruit",
"fuel",
"furniture",
"game",
"garden",
"gate",
"girl",
"glass",
"glove",
"goat",
"gold",
"government",
"grade",
"grain",
"grass",
"green",
"grip",
"group",
"growth",
"guide",
"guitar",
"hair",
"hall",
"hand",
"harbor",
"harmony",
"hat",
"head",
"health",
"heart",
"heat",
"hill",
"history",
"hobbies",
"hole",
"hope",
"horn",
"horse",
"hospital",
"hour",
"house",
"humor",
"idea",
"impulse",
"income",
"increase",
"industry",
"ink",
"insect",
"instrument",
"insurance",
"interest",
"invention",
"iron",
"island",
"jelly",
"jet",
"jewel",
"join",
"judge",
"juice",
"jump",
"kettle",
"key",
"kick",
"kiss",
"kitten",
"knee",
"knife",
"knowledge",
"land",
"language",
"laugh",
"law",
"lead",
"learning",
"leather",
"leg",
"lettuce",
"level",
"library",
"lift",
"light",
"limit",
"line",
"linen",
"lip",
"liquid",
"list",
"look",
"loss",
"love",
"lunch",
"machine",
"man",
"manager",
"map",
"marble",
"mark",
"market",
"mass",
"match",
"meal",
"measure",
"meat",
"meeting",
"memory",
"metal",
"middle",
"milk",
"mind",
"mine",
"minute",
"mist",
"mitten",
"mom",
"money",
"monkey",
"month",
"moon",
"morning",
"mother",
"motion",
"mountain",
"mouth",
"muscle",
"music",
"nail",
"name",
"nation",
"neck",
"need",
"news",
"night",
"noise",
"note",
"number",
"nut",
"observation",
"offer",
"oil",
"operation",
"opinion",
"orange",
"order",
"organization",
"ornament",
"oven",
"page",
"pail",
"pain",
"paint",
"pan",
"pancake",
"paper",
"parcel",
"parent",
"part",
"passenger",
"paste",
"payment",
"peace",
"pear",
"pen",
"pencil",
"person",
"pest",
"pet",
"picture",
"pie",
"pin",
"pipe",
"pizza",
"place",
"plane",
"plant",
"plastic",
"plate",
"play",
"pleasure",
"plot",
"plough",
"pocket",
"point",
"poison",
"police",
"pollution",
"popcorn",
"porter",
"position",
"pot",
"potato",
"powder",
"power",
"price",
"print",
"process",
"produce",
"product",
"profit",
"property",
"prose",
"protest",
"pull",
"pump",
"punishment",
"purpose",
"push",
"quarter",
"question",
"quiet",
"quill",
"quilt",
"quince",
"rabbit",
"rail",
"rain",
"range",
"rat",
"rate",
"ray",
"reaction",
"reading",
"reason",
"record",
"regret",
"relation",
"religion",
"representative",
"request",
"respect",
"rest",
"reward",
"rhythm",
"rice",
"river",
"road",
"roll",
"room",
"root",
"rose",
"route",
"rub",
"rule",
"run",
"sack",
"sail",
"salt",
"sand",
"scale",
"scarecrow",
"scarf",
"scene",
"scent",
"school",
"science",
"scissors",
"screw",
"sea",
"seat",
"secretary",
"seed",
"selection",
"self",
"sense",
"servant",
"shade",
"shake",
"shame",
"shape",
"sheep",
"sheet",
"shelf",
"ship",
"shirt",
"shock",
"shoe",
"shop",
"show",
"side",
"sign",
"silk",
"sink",
"sister",
"size",
"sky",
"slave",
"sleep",
"smash",
"smell",
"smile",
"smoke",
"snail",
"snake",
"sneeze",
"snow",
"soap",
"society",
"sock",
"soda",
"sofa",
"son",
"song",
"sort",
"sound",
"soup",
"space",
"spark",
"speed",
"sponge",
"spoon",
"spray",
"spring",
"spy",
"square",
"stamp",
"star",
"start",
"statement",
"station",
"steam",
"steel",
"stem",
"step",
"stew",
"stick",
"stitch",
"stocking",
"stomach",
"stone",
"stop",
"store",
"story",
"stove",
"stranger",
"straw",
"stream",
"street",
"stretch",
"string",
"structure",
"substance",
"sugar",
"suggestion",
"suit",
"summer",
"sun",
"support",
"surprise",
"sweater",
"swim",
"system",
"table",
"tail",
"talk",
"tank",
"taste",
"tax",
"tea",
"teaching",
"team",
"tendency",
"test",
"texture",
"theory",
"thing",
"thought",
"thread",
"throat",
"thumb",
"thunder",
"ticket",
"time",
"tin",
"title",
"toad",
"toe",
"tooth",
"toothpaste",
"touch",
"town",
"toy",
"trade",
"train",
"transport",
"tray",
"treatment",
"tree",
"trick",
"trip",
"trouble",
"trousers",
"truck",
"tub",
"turkey",
"turn",
"twist",
"umbrella",
"uncle",
"underwear",
"unit",
"use",
"vacation",
"value",
"van",
"vase",
"vegetable",
"veil",
"vein",
"verse",
"vessel",
"view",
"visitor",
"voice",
"volcano",
"walk",
"wall",
"war",
"wash",
"waste",
"watch",
"water",
"wave",
"wax",
"way",
"wealth",
"weather",
"week",
"weight",
"wheel",
"whip",
"whistle",
"window",
"wine",
"wing",
"winter",
"wire",
"wish",
"woman",
"wood",
"wool",
"word",
"work",
"worm",
"wound",
"wrist",
"writer",
"yard",
"yoke",
"zebra",
"zinc",
"zipper",
"zone",
]
def random_name(prefix: str = "test") -> str:
    """Generate a random name."""
    adjective = random.choice(adjectives)
    noun = random.choice(nouns)
    number = random.randint(1, 100)
    return f"{prefix}-{adjective}-{noun}-{number}"
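A quick usage sketch of the friendlier project names this enables; the prefix and the example outputs are illustrative only, since the result is random.

# Generate readable default project/run names.
print(random_name())               # e.g. "test-sparkling-cushion-42"
print(random_name(prefix="eval"))  # e.g. "eval-timely-harbor-7"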


@@ -0,0 +1,82 @@
"""A simple progress bar for the console."""
import threading
from typing import Any, Dict, Optional, Sequence
from uuid import UUID
from langchain.callbacks import base as base_callbacks
from langchain.schema.document import Document
from langchain.schema.output import LLMResult
class ProgressBarCallback(base_callbacks.BaseCallbackHandler):
"""A simple progress bar for the console."""
def __init__(self, total: int, ncols: int = 50, **kwargs: Any):
"""Initialize the progress bar.
Args:
total: int, the total number of items to be processed.
ncols: int, the character width of the progress bar.
"""
self.total = total
self.ncols = ncols
self.counter = 0
self.lock = threading.Lock()
self._print_bar()
def increment(self) -> None:
"""Increment the counter and update the progress bar."""
with self.lock:
self.counter += 1
self._print_bar()
def _print_bar(self) -> None:
"""Print the progress bar to the console."""
progress = self.counter / self.total
arrow = "-" * int(round(progress * self.ncols) - 1) + ">"
spaces = " " * (self.ncols - len(arrow))
print(f"\r[{arrow + spaces}] {self.counter}/{self.total}", end="")
def on_chain_end(
self,
outputs: Dict[str, Any],
*,
run_id: UUID,
parent_run_id: Optional[UUID] = None,
**kwargs: Any,
) -> Any:
if parent_run_id is None:
self.increment()
def on_retriever_end(
self,
documents: Sequence[Document],
*,
run_id: UUID,
parent_run_id: Optional[UUID] = None,
**kwargs: Any,
) -> Any:
if parent_run_id is None:
self.increment()
def on_llm_end(
self,
response: LLMResult,
*,
run_id: UUID,
parent_run_id: Optional[UUID] = None,
**kwargs: Any,
) -> Any:
if parent_run_id is None:
self.increment()
def on_tool_end(
self,
output: str,
*,
run_id: UUID,
parent_run_id: Optional[UUID] = None,
**kwargs: Any,
) -> Any:
if parent_run_id is None:
self.increment()
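A small driver that exercises the callback directly to show the rendering; in the eval runner it would instead travel through the callbacks list so on_chain_end / on_llm_end / on_tool_end fire once per top-level run. This snippet is illustrative only.

import time

bar = ProgressBarCallback(total=10, ncols=40)
for _ in range(10):
    time.sleep(0.1)   # stand-in for one evaluated example finishing
    bar.increment()
print()               # move past the carriage-returned bar line

Only runs with parent_run_id is None bump the counter, so nested chain/LLM calls inside one example do not inflate the bar.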

File diff suppressed because it is too large.
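The suppressed diff carries the runner changes; per the summary above, one of them is printing quantiles of the final stats. A hedged sketch of that kind of summary using pandas follows — the feedback keys, scores, and quantile levels are assumptions for illustration, not the committed code.

import pandas as pd

# Hypothetical per-evaluator feedback scores collected during a run.
feedback_scores = {
    "correctness": [1.0, 0.0, 1.0, 1.0],
    "helpfulness": [0.5, 0.75, 1.0, 0.25],
}

df = pd.DataFrame(feedback_scores)
# One row per quantile, one column per feedback key.
print(df.quantile([0.25, 0.5, 0.75]))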


@@ -148,13 +148,27 @@ class ChainStringRunMapper(StringRunMapper):
     def map(self, run: Run) -> Dict[str, str]:
         """Maps the Run to a dictionary."""
         if not run.outputs:
-            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
-        if self.input_key is not None and self.input_key not in run.inputs:
-            raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
-        elif self.prediction_key is not None and self.prediction_key not in run.outputs:
-            raise ValueError(
-                f"Run {run.id} does not have prediction key {self.prediction_key}."
-            )
+            raise ValueError(
+                f"Run with ID {run.id} lacks outputs required for evaluation."
+                " Ensure the Run has valid outputs."
+            )
+        if self.input_key is not None and self.input_key not in run.inputs:
+            raise ValueError(
+                f"Run with ID {run.id} is missing the expected input key"
+                f" '{self.input_key}'.\nAvailable input keys in this Run"
+                f" are: {run.inputs.keys()}.\nAdjust the evaluator's"
+                f" input_key or ensure your input data includes key"
+                f" '{self.input_key}'."
+            )
+        elif self.prediction_key is not None and self.prediction_key not in run.outputs:
+            available_keys = ", ".join(run.outputs.keys())
+            raise ValueError(
+                f"Run with ID {run.id} doesn't have the expected prediction key"
+                f" '{self.prediction_key}'. Available prediction keys in this Run are:"
+                f" {available_keys}. Adjust the evaluator's prediction_key or"
+                " ensure the Run object's outputs the expected key."
+            )
         else:
             input_ = self._get_key(run.inputs, self.input_key, "input")
             prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
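One small formatting difference between the two new messages is worth noting: the input-key error interpolates run.inputs.keys() directly, which renders as a dict_keys(...) repr, while the prediction-key error joins the keys into a plain comma-separated string first. A quick illustration with made-up keys:

inputs = {"question": "hi", "context": "doc"}
print(f"Available input keys in this Run are: {inputs.keys()}")
# -> Available input keys in this Run are: dict_keys(['question', 'context'])
print(f"Available prediction keys in this Run are: {', '.join(inputs.keys())}")
# -> Available prediction keys in this Run are: question, context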


@@ -5,7 +5,6 @@ import pytest
 from langsmith import Client as Client
 from langsmith.schemas import DataType
 
-from langchain.callbacks.tracers.evaluation import wait_for_all_evaluators
 from langchain.chains.llm import LLMChain
 from langchain.chat_models import ChatOpenAI
 from langchain.evaluation import EvaluatorType
@@ -22,7 +21,6 @@ def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
     # chain or llm passes for the feedback provided.
     runs = list(client.list_runs(project_name=_project_name, execution_order=1))
     assert len(runs) == 4
-    wait_for_all_evaluators()
     feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
     assert len(feedback) == 8
     assert all([f.score == 1 for f in feedback])


@@ -181,11 +181,15 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
         assert "the wrong input" in inputs
         return {"the right input": inputs["the wrong input"]}
 
-    result = _run_llm_or_chain(example, lambda: mock_chain, input_mapper=input_mapper)
+    result = _run_llm_or_chain(
+        example,
+        {"callbacks": [], "tags": []},
+        llm_or_chain_factory=lambda: mock_chain,
+        input_mapper=input_mapper,
+    )
     assert result == {"output": "2", "the right input": "1"}
 
     bad_result = _run_llm_or_chain(
-        example,
-        lambda: mock_chain,
+        example, {"callbacks": [], "tags": []}, llm_or_chain_factory=lambda: mock_chain
     )
     assert "Error" in bad_result
@@ -195,7 +199,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
         return "the right input"
 
     mock_llm = FakeLLM(queries={"the right input": "somenumber"})
-    llm_result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper)
+    llm_result = _run_llm_or_chain(
+        example,
+        {"callbacks": [], "tags": []},
+        llm_or_chain_factory=mock_llm,
+        input_mapper=llm_input_mapper,
+    )
     assert isinstance(llm_result, str)
     assert llm_result == "somenumber"
@@ -324,10 +333,14 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     )
     expected = {
-        uuid_: {
-            "output": {"result": f"Result for example {uuid.UUID(uuid_)}"},
+        str(example.id): {
+            "output": {
+                "result": f"Result for example {uuid.UUID(str(example.id))}"
+            },
+            "input": {"input": example.inputs["input"]},
+            "reference": {"output": example.outputs["output"]},
             "feedback": [],
         }
-        for uuid_ in uuids
+        for example in examples
     }
 
     assert results["results"] == expected