Add progress bar + runner fixes (#10348)

- Add progress bar to eval runs
- Use thread pool for concurrency
- Update some error messages
- Friendlier project name
- Print out quantiles of the final stats 

Closes LS-902
William FH 2023-09-08 07:45:28 -07:00 committed by GitHub
parent 0672533b3e
commit 46e9abdc75
8 changed files with 1098 additions and 515 deletions


@@ -2,29 +2,20 @@
 from __future__ import annotations
 
 import logging
-from concurrent.futures import Future, ThreadPoolExecutor, wait
+from concurrent.futures import Future, ThreadPoolExecutor
 from typing import Any, Dict, List, Optional, Sequence, Set, Union
 from uuid import UUID
 
 import langsmith
 from langsmith import schemas as langsmith_schemas
 
-from langchain.callbacks.manager import tracing_v2_enabled
+from langchain.callbacks import manager
+from langchain.callbacks.tracers import langchain as langchain_tracer
 from langchain.callbacks.tracers.base import BaseTracer
-from langchain.callbacks.tracers.langchain import _get_client
 from langchain.callbacks.tracers.schemas import Run
 
 logger = logging.getLogger(__name__)
 
-_TRACERS: List[EvaluatorCallbackHandler] = []
-
-
-def wait_for_all_evaluators() -> None:
-    """Wait for all tracers to finish."""
-    global _TRACERS
-    for tracer in _TRACERS:
-        tracer.wait_for_futures()
-
 
 class EvaluatorCallbackHandler(BaseTracer):
     """A tracer that runs a run evaluator whenever a run is persisted.
@@ -79,17 +70,13 @@ class EvaluatorCallbackHandler(BaseTracer):
         self.example_id = (
             UUID(example_id) if isinstance(example_id, str) else example_id
         )
-        self.client = client or _get_client()
+        self.client = client or langchain_tracer.get_client()
         self.evaluators = evaluators
-        self.executor = ThreadPoolExecutor(
-            max_workers=max(max_workers or len(evaluators), 1)
-        )
+        self.max_workers = max_workers or len(evaluators)
         self.futures: Set[Future] = set()
         self.skip_unfinished = skip_unfinished
         self.project_name = project_name
         self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {}
-        global _TRACERS
-        _TRACERS.append(self)
 
     def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None:
         """Evaluate the run in the project.
@@ -105,7 +92,7 @@ class EvaluatorCallbackHandler(BaseTracer):
         try:
             if self.project_name is None:
                 feedback = self.client.evaluate_run(run, evaluator)
-            with tracing_v2_enabled(
+            with manager.tracing_v2_enabled(
                 project_name=self.project_name, tags=["eval"], client=self.client
             ):
                 feedback = self.client.evaluate_run(run, evaluator)
@@ -133,14 +120,15 @@ class EvaluatorCallbackHandler(BaseTracer):
             return
         run_ = run.copy()
         run_.reference_example_id = self.example_id
-        for evaluator in self.evaluators:
-            self.futures.add(
-                self.executor.submit(self._evaluate_in_project, run_, evaluator)
-            )
-
-    def wait_for_futures(self) -> None:
-        """Wait for all futures to complete."""
-        futures = list(self.futures)
-        wait(futures)
-        for future in futures:
-            self.futures.remove(future)
+        if self.max_workers > 0:
+            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+                list(
+                    executor.map(
+                        self._evaluate_in_project,
+                        [run_ for _ in range(len(self.evaluators))],
+                        self.evaluators,
+                    )
+                )
+        else:
+            for evaluator in self.evaluators:
+                self._evaluate_in_project(run_, evaluator)
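For context, here is a minimal standalone sketch of the fan-out pattern the new _persist_run uses: the same run is paired with each evaluator via ThreadPoolExecutor.map inside a scoped pool, with a plain loop as the max_workers=0 fallback. The names below (evaluate_run, evaluate_all, the dict-based run) are illustrative stand-ins, not the actual LangChain/LangSmith objects.

from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, Dict, List

Evaluator = Callable[[Dict[str, Any]], Any]


def evaluate_run(run: Dict[str, Any], evaluator: Evaluator) -> Any:
    # Stand-in for EvaluatorCallbackHandler._evaluate_in_project.
    return evaluator(run)


def evaluate_all(run: Dict[str, Any], evaluators: List[Evaluator], max_workers: int) -> List[Any]:
    """Fan one run out to every evaluator, mirroring the new _persist_run logic."""
    if max_workers > 0:
        # A scoped executor is created, used, and joined per persisted run,
        # so callers no longer need a separate wait_for_all_evaluators() step.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            return list(
                executor.map(
                    evaluate_run,
                    [run for _ in evaluators],  # pair the same run with each evaluator
                    evaluators,
                )
            )
    # max_workers == 0 disables threading entirely.
    return [evaluate_run(run, evaluator) for evaluator in evaluators]


if __name__ == "__main__":
    run = {"outputs": {"text": "hello"}}
    checks = [lambda r: len(r["outputs"]["text"]), lambda r: r["outputs"]["text"].upper()]
    print(evaluate_all(run, checks, max_workers=2))  # [5, 'HELLO']

The trade-off is a fresh pool per run instead of one shared executor: shutdown is implicit in the with block, at the cost of re-creating worker threads for every persisted run.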


@@ -42,7 +42,7 @@ def wait_for_all_tracers() -> None:
         tracer.wait_for_futures()
 
 
-def _get_client() -> Client:
+def get_client() -> Client:
     """Get the client."""
     global _CLIENT
     if _CLIENT is None:
@@ -83,7 +83,7 @@ class LangChainTracer(BaseTracer):
             _EXECUTORS.append(self.executor)
         else:
            self.executor = None
-        self.client = client or _get_client()
+        self.client = client or get_client()
         self._futures: Set[Future] = set()
         self.tags = tags or []
         global _TRACERS


@@ -0,0 +1,729 @@
import random
adjectives = [
"abandoned",
"aching",
"advanced",
"ample",
"artistic",
"back",
"best",
"bold",
"brief",
"clear",
"cold",
"complicated",
"cooked",
"crazy",
"crushing",
"damp",
"dear",
"definite",
"dependable",
"diligent",
"drab",
"earnest",
"elderly",
"enchanted",
"essential",
"excellent",
"extraneous",
"fixed",
"flowery",
"formal",
"fresh",
"frosty",
"giving",
"glossy",
"healthy",
"helpful",
"impressionable",
"kind",
"large",
"left",
"long",
"loyal",
"mealy",
"memorable",
"monthly",
"new",
"notable",
"only",
"ordinary",
"passionate",
"perfect",
"pertinent",
"proper",
"puzzled",
"reflecting",
"respectful",
"roasted",
"scholarly",
"shiny",
"slight",
"sparkling",
"spotless",
"stupendous",
"sunny",
"tart",
"terrific",
"timely",
"unique",
"upbeat",
"vacant",
"virtual",
"warm",
"weary",
"whispered",
"worthwhile",
"yellow",
]
nouns = [
"account",
"acknowledgment",
"address",
"advertising",
"airplane",
"animal",
"appointment",
"arrival",
"artist",
"attachment",
"attitude",
"availability",
"backpack",
"bag",
"balance",
"bass",
"bean",
"beauty",
"bibliography",
"bill",
"bite",
"blossom",
"boat",
"book",
"box",
"boy",
"bread",
"bridge",
"broccoli",
"building",
"butter",
"button",
"cabbage",
"cake",
"camera",
"camp",
"candle",
"candy",
"canvas",
"car",
"card",
"carrot",
"cart",
"case",
"cat",
"chain",
"chair",
"chalk",
"chance",
"change",
"channel",
"character",
"charge",
"charm",
"chart",
"check",
"cheek",
"cheese",
"chef",
"cherry",
"chicken",
"child",
"church",
"circle",
"class",
"clay",
"click",
"clock",
"cloth",
"cloud",
"clove",
"club",
"coach",
"coal",
"coast",
"coat",
"cod",
"coffee",
"collar",
"color",
"comb",
"comfort",
"comic",
"committee",
"community",
"company",
"comparison",
"competition",
"condition",
"connection",
"control",
"cook",
"copper",
"copy",
"corn",
"cough",
"country",
"cover",
"crate",
"crayon",
"cream",
"creator",
"crew",
"crown",
"current",
"curtain",
"curve",
"cushion",
"dad",
"daughter",
"day",
"death",
"debt",
"decision",
"deer",
"degree",
"design",
"desire",
"desk",
"detail",
"development",
"digestion",
"dime",
"dinner",
"direction",
"dirt",
"discovery",
"discussion",
"disease",
"disgust",
"distance",
"distribution",
"division",
"doctor",
"dog",
"door",
"drain",
"drawer",
"dress",
"drink",
"driving",
"dust",
"ear",
"earth",
"edge",
"education",
"effect",
"egg",
"end",
"energy",
"engine",
"error",
"event",
"example",
"exchange",
"existence",
"expansion",
"experience",
"expert",
"eye",
"face",
"fact",
"fall",
"family",
"farm",
"father",
"fear",
"feeling",
"field",
"finger",
"fire",
"fish",
"flag",
"flight",
"floor",
"flower",
"fold",
"food",
"football",
"force",
"form",
"frame",
"friend",
"frog",
"fruit",
"fuel",
"furniture",
"game",
"garden",
"gate",
"girl",
"glass",
"glove",
"goat",
"gold",
"government",
"grade",
"grain",
"grass",
"green",
"grip",
"group",
"growth",
"guide",
"guitar",
"hair",
"hall",
"hand",
"harbor",
"harmony",
"hat",
"head",
"health",
"heart",
"heat",
"hill",
"history",
"hobbies",
"hole",
"hope",
"horn",
"horse",
"hospital",
"hour",
"house",
"humor",
"idea",
"impulse",
"income",
"increase",
"industry",
"ink",
"insect",
"instrument",
"insurance",
"interest",
"invention",
"iron",
"island",
"jelly",
"jet",
"jewel",
"join",
"judge",
"juice",
"jump",
"kettle",
"key",
"kick",
"kiss",
"kitten",
"knee",
"knife",
"knowledge",
"land",
"language",
"laugh",
"law",
"lead",
"learning",
"leather",
"leg",
"lettuce",
"level",
"library",
"lift",
"light",
"limit",
"line",
"linen",
"lip",
"liquid",
"list",
"look",
"loss",
"love",
"lunch",
"machine",
"man",
"manager",
"map",
"marble",
"mark",
"market",
"mass",
"match",
"meal",
"measure",
"meat",
"meeting",
"memory",
"metal",
"middle",
"milk",
"mind",
"mine",
"minute",
"mist",
"mitten",
"mom",
"money",
"monkey",
"month",
"moon",
"morning",
"mother",
"motion",
"mountain",
"mouth",
"muscle",
"music",
"nail",
"name",
"nation",
"neck",
"need",
"news",
"night",
"noise",
"note",
"number",
"nut",
"observation",
"offer",
"oil",
"operation",
"opinion",
"orange",
"order",
"organization",
"ornament",
"oven",
"page",
"pail",
"pain",
"paint",
"pan",
"pancake",
"paper",
"parcel",
"parent",
"part",
"passenger",
"paste",
"payment",
"peace",
"pear",
"pen",
"pencil",
"person",
"pest",
"pet",
"picture",
"pie",
"pin",
"pipe",
"pizza",
"place",
"plane",
"plant",
"plastic",
"plate",
"play",
"pleasure",
"plot",
"plough",
"pocket",
"point",
"poison",
"police",
"pollution",
"popcorn",
"porter",
"position",
"pot",
"potato",
"powder",
"power",
"price",
"print",
"process",
"produce",
"product",
"profit",
"property",
"prose",
"protest",
"pull",
"pump",
"punishment",
"purpose",
"push",
"quarter",
"question",
"quiet",
"quill",
"quilt",
"quince",
"rabbit",
"rail",
"rain",
"range",
"rat",
"rate",
"ray",
"reaction",
"reading",
"reason",
"record",
"regret",
"relation",
"religion",
"representative",
"request",
"respect",
"rest",
"reward",
"rhythm",
"rice",
"river",
"road",
"roll",
"room",
"root",
"rose",
"route",
"rub",
"rule",
"run",
"sack",
"sail",
"salt",
"sand",
"scale",
"scarecrow",
"scarf",
"scene",
"scent",
"school",
"science",
"scissors",
"screw",
"sea",
"seat",
"secretary",
"seed",
"selection",
"self",
"sense",
"servant",
"shade",
"shake",
"shame",
"shape",
"sheep",
"sheet",
"shelf",
"ship",
"shirt",
"shock",
"shoe",
"shop",
"show",
"side",
"sign",
"silk",
"sink",
"sister",
"size",
"sky",
"slave",
"sleep",
"smash",
"smell",
"smile",
"smoke",
"snail",
"snake",
"sneeze",
"snow",
"soap",
"society",
"sock",
"soda",
"sofa",
"son",
"song",
"sort",
"sound",
"soup",
"space",
"spark",
"speed",
"sponge",
"spoon",
"spray",
"spring",
"spy",
"square",
"stamp",
"star",
"start",
"statement",
"station",
"steam",
"steel",
"stem",
"step",
"stew",
"stick",
"stitch",
"stocking",
"stomach",
"stone",
"stop",
"store",
"story",
"stove",
"stranger",
"straw",
"stream",
"street",
"stretch",
"string",
"structure",
"substance",
"sugar",
"suggestion",
"suit",
"summer",
"sun",
"support",
"surprise",
"sweater",
"swim",
"system",
"table",
"tail",
"talk",
"tank",
"taste",
"tax",
"tea",
"teaching",
"team",
"tendency",
"test",
"texture",
"theory",
"thing",
"thought",
"thread",
"throat",
"thumb",
"thunder",
"ticket",
"time",
"tin",
"title",
"toad",
"toe",
"tooth",
"toothpaste",
"touch",
"town",
"toy",
"trade",
"train",
"transport",
"tray",
"treatment",
"tree",
"trick",
"trip",
"trouble",
"trousers",
"truck",
"tub",
"turkey",
"turn",
"twist",
"umbrella",
"uncle",
"underwear",
"unit",
"use",
"vacation",
"value",
"van",
"vase",
"vegetable",
"veil",
"vein",
"verse",
"vessel",
"view",
"visitor",
"voice",
"volcano",
"walk",
"wall",
"war",
"wash",
"waste",
"watch",
"water",
"wave",
"wax",
"way",
"wealth",
"weather",
"week",
"weight",
"wheel",
"whip",
"whistle",
"window",
"wine",
"wing",
"winter",
"wire",
"wish",
"woman",
"wood",
"wool",
"word",
"work",
"worm",
"wound",
"wrist",
"writer",
"yard",
"yoke",
"zebra",
"zinc",
"zipper",
"zone",
]
def random_name(prefix: str = "test") -> str:
    """Generate a random name."""
    adjective = random.choice(adjectives)
    noun = random.choice(nouns)
    number = random.randint(1, 100)
    return f"{prefix}-{adjective}-{noun}-{number}"
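A quick usage sketch of the friendlier project names this enables; the prefix and the example outputs are illustrative only, since the result is random.

# Generate readable default project/run names.
print(random_name())               # e.g. "test-sparkling-cushion-42"
print(random_name(prefix="eval"))  # e.g. "eval-timely-harbor-7"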


@@ -0,0 +1,82 @@
"""A simple progress bar for the console."""
import threading
from typing import Any, Dict, Optional, Sequence
from uuid import UUID
from langchain.callbacks import base as base_callbacks
from langchain.schema.document import Document
from langchain.schema.output import LLMResult
class ProgressBarCallback(base_callbacks.BaseCallbackHandler):
"""A simple progress bar for the console."""
def __init__(self, total: int, ncols: int = 50, **kwargs: Any):
"""Initialize the progress bar.
Args:
total: int, the total number of items to be processed.
ncols: int, the character width of the progress bar.
"""
self.total = total
self.ncols = ncols
self.counter = 0
self.lock = threading.Lock()
self._print_bar()
def increment(self) -> None:
"""Increment the counter and update the progress bar."""
with self.lock:
self.counter += 1
self._print_bar()
def _print_bar(self) -> None:
"""Print the progress bar to the console."""
progress = self.counter / self.total
arrow = "-" * int(round(progress * self.ncols) - 1) + ">"
spaces = " " * (self.ncols - len(arrow))
print(f"\r[{arrow + spaces}] {self.counter}/{self.total}", end="")
def on_chain_end(
self,
outputs: Dict[str, Any],
*,
run_id: UUID,
parent_run_id: Optional[UUID] = None,
**kwargs: Any,
) -> Any:
if parent_run_id is None:
self.increment()
def on_retriever_end(
self,
documents: Sequence[Document],
*,
run_id: UUID,
parent_run_id: Optional[UUID] = None,
**kwargs: Any,
) -> Any:
if parent_run_id is None:
self.increment()
def on_llm_end(
self,
response: LLMResult,
*,
run_id: UUID,
parent_run_id: Optional[UUID] = None,
**kwargs: Any,
) -> Any:
if parent_run_id is None:
self.increment()
def on_tool_end(
self,
output: str,
*,
run_id: UUID,
parent_run_id: Optional[UUID] = None,
**kwargs: Any,
) -> Any:
if parent_run_id is None:
self.increment()
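A small driver that exercises the callback directly to show the rendering; in the eval runner it would instead travel through the callbacks list so on_chain_end / on_llm_end / on_tool_end fire once per top-level run. This snippet is illustrative only.

import time

bar = ProgressBarCallback(total=10, ncols=40)
for _ in range(10):
    time.sleep(0.1)   # stand-in for one evaluated example finishing
    bar.increment()
print()               # move past the carriage-returned bar line

Only runs with parent_run_id is None bump the counter, so nested chain/LLM calls inside one example do not inflate the bar.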

File diff suppressed because it is too large.
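The suppressed diff carries the runner changes; per the summary above, one of them is printing quantiles of the final stats. A hedged sketch of that kind of summary using pandas follows — the feedback keys, scores, and quantile levels are assumptions for illustration, not the committed code.

import pandas as pd

# Hypothetical per-evaluator feedback scores collected during a run.
feedback_scores = {
    "correctness": [1.0, 0.0, 1.0, 1.0],
    "helpfulness": [0.5, 0.75, 1.0, 0.25],
}

df = pd.DataFrame(feedback_scores)
# One row per quantile, one column per feedback key.
print(df.quantile([0.25, 0.5, 0.75]))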


@@ -148,13 +148,27 @@ class ChainStringRunMapper(StringRunMapper):
     def map(self, run: Run) -> Dict[str, str]:
         """Maps the Run to a dictionary."""
         if not run.outputs:
-            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
-        if self.input_key is not None and self.input_key not in run.inputs:
-            raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
-        elif self.prediction_key is not None and self.prediction_key not in run.outputs:
-            raise ValueError(
-                f"Run {run.id} does not have prediction key {self.prediction_key}."
-            )
+            raise ValueError(
+                f"Run with ID {run.id} lacks outputs required for evaluation."
+                " Ensure the Run has valid outputs."
+            )
+        if self.input_key is not None and self.input_key not in run.inputs:
+            raise ValueError(
+                f"Run with ID {run.id} is missing the expected input key"
+                f" '{self.input_key}'.\nAvailable input keys in this Run"
+                f" are: {run.inputs.keys()}.\nAdjust the evaluator's"
+                f" input_key or ensure your input data includes key"
+                f" '{self.input_key}'."
+            )
+        elif self.prediction_key is not None and self.prediction_key not in run.outputs:
+            available_keys = ", ".join(run.outputs.keys())
+            raise ValueError(
+                f"Run with ID {run.id} doesn't have the expected prediction key"
+                f" '{self.prediction_key}'. Available prediction keys in this Run are:"
+                f" {available_keys}. Adjust the evaluator's prediction_key or"
+                " ensure the Run object's outputs the expected key."
+            )
         else:
             input_ = self._get_key(run.inputs, self.input_key, "input")
             prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
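One small formatting difference between the two new messages is worth noting: the input-key error interpolates run.inputs.keys() directly, which renders as a dict_keys(...) repr, while the prediction-key error joins the keys into a plain comma-separated string first. A quick illustration with made-up keys:

inputs = {"question": "hi", "context": "doc"}
print(f"Available input keys in this Run are: {inputs.keys()}")
# -> Available input keys in this Run are: dict_keys(['question', 'context'])
print(f"Available prediction keys in this Run are: {', '.join(inputs.keys())}")
# -> Available prediction keys in this Run are: question, context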


@@ -5,7 +5,6 @@ import pytest
 from langsmith import Client as Client
 from langsmith.schemas import DataType
 
-from langchain.callbacks.tracers.evaluation import wait_for_all_evaluators
 from langchain.chains.llm import LLMChain
 from langchain.chat_models import ChatOpenAI
 from langchain.evaluation import EvaluatorType
@@ -22,7 +21,6 @@ def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
     # chain or llm passes for the feedback provided.
     runs = list(client.list_runs(project_name=_project_name, execution_order=1))
     assert len(runs) == 4
-    wait_for_all_evaluators()
     feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
     assert len(feedback) == 8
     assert all([f.score == 1 for f in feedback])


@@ -181,11 +181,15 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
         assert "the wrong input" in inputs
         return {"the right input": inputs["the wrong input"]}
 
-    result = _run_llm_or_chain(example, lambda: mock_chain, input_mapper=input_mapper)
+    result = _run_llm_or_chain(
+        example,
+        {"callbacks": [], "tags": []},
+        llm_or_chain_factory=lambda: mock_chain,
+        input_mapper=input_mapper,
+    )
     assert result == {"output": "2", "the right input": "1"}
 
     bad_result = _run_llm_or_chain(
-        example,
-        lambda: mock_chain,
+        example, {"callbacks": [], "tags": []}, llm_or_chain_factory=lambda: mock_chain
     )
     assert "Error" in bad_result
@@ -195,7 +199,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
         return "the right input"
 
     mock_llm = FakeLLM(queries={"the right input": "somenumber"})
-    llm_result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper)
+    llm_result = _run_llm_or_chain(
+        example,
+        {"callbacks": [], "tags": []},
+        llm_or_chain_factory=mock_llm,
+        input_mapper=llm_input_mapper,
+    )
     assert isinstance(llm_result, str)
     assert llm_result == "somenumber"
@@ -324,10 +333,14 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     )
     expected = {
-        uuid_: {
-            "output": {"result": f"Result for example {uuid.UUID(uuid_)}"},
+        str(example.id): {
+            "output": {
+                "result": f"Result for example {uuid.UUID(str(example.id))}"
+            },
+            "input": {"input": example.inputs["input"]},
+            "reference": {"output": example.outputs["output"]},
             "feedback": [],
         }
-        for uuid_ in uuids
+        for example in examples
     }
 
     assert results["results"] == expected