Add progress bar + runner fixes (#10348)

- Add progress bar to eval runs
- Use thread pool for concurrency
- Update some error messages
- Friendlier project name
- Print out quantiles of the final stats 

Closes LS-902
This commit is contained in:
William FH 2023-09-08 07:45:28 -07:00 committed by GitHub
parent 0672533b3e
commit 46e9abdc75
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 1098 additions and 515 deletions

View File

@ -2,29 +2,20 @@
from __future__ import annotations
import logging
from concurrent.futures import Future, ThreadPoolExecutor, wait
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Dict, List, Optional, Sequence, Set, Union
from uuid import UUID
import langsmith
from langsmith import schemas as langsmith_schemas
from langchain.callbacks.manager import tracing_v2_enabled
from langchain.callbacks import manager
from langchain.callbacks.tracers import langchain as langchain_tracer
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.langchain import _get_client
from langchain.callbacks.tracers.schemas import Run
logger = logging.getLogger(__name__)
_TRACERS: List[EvaluatorCallbackHandler] = []
def wait_for_all_evaluators() -> None:
    """Block until every tracked evaluator tracer has finished its futures.

    Walks the module-level ``_TRACERS`` list in order and calls
    ``wait_for_futures()`` on each entry.
    """
    # Only reading the module-level list, so no ``global`` declaration is needed.
    for handler in _TRACERS:
        handler.wait_for_futures()
class EvaluatorCallbackHandler(BaseTracer):
"""A tracer that runs a run evaluator whenever a run is persisted.
@ -79,17 +70,13 @@ class EvaluatorCallbackHandler(BaseTracer):
self.example_id = (
UUID(example_id) if isinstance(example_id, str) else example_id
)
self.client = client or _get_client()
self.client = client or langchain_tracer.get_client()
self.evaluators = evaluators
self.executor = ThreadPoolExecutor(
max_workers=max(max_workers or len(evaluators), 1)
)
self.max_workers = max_workers or len(evaluators)
self.futures: Set[Future] = set()
self.skip_unfinished = skip_unfinished
self.project_name = project_name
self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {}
global _TRACERS
_TRACERS.append(self)
def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None:
"""Evaluate the run in the project.
@ -105,7 +92,7 @@ class EvaluatorCallbackHandler(BaseTracer):
try:
if self.project_name is None:
feedback = self.client.evaluate_run(run, evaluator)
with tracing_v2_enabled(
with manager.tracing_v2_enabled(
project_name=self.project_name, tags=["eval"], client=self.client
):
feedback = self.client.evaluate_run(run, evaluator)
@ -133,14 +120,15 @@ class EvaluatorCallbackHandler(BaseTracer):
return
run_ = run.copy()
run_.reference_example_id = self.example_id
for evaluator in self.evaluators:
self.futures.add(
self.executor.submit(self._evaluate_in_project, run_, evaluator)
)
def wait_for_futures(self) -> None:
    """Block until the currently tracked futures finish, then forget them.

    A snapshot of ``self.futures`` is taken first, so only the futures
    present at call time are waited on and removed.
    """
    snapshot = list(self.futures)
    wait(snapshot)
    # Drop exactly the futures that were waited on.
    for finished in snapshot:
        self.futures.remove(finished)
if self.max_workers > 0:
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
list(
executor.map(
self._evaluate_in_project,
[run_ for _ in range(len(self.evaluators))],
self.evaluators,
)
)
else:
for evaluator in self.evaluators:
self._evaluate_in_project(run_, evaluator)

View File

@ -42,7 +42,7 @@ def wait_for_all_tracers() -> None:
tracer.wait_for_futures()
def _get_client() -> Client:
def get_client() -> Client:
"""Get the client."""
global _CLIENT
if _CLIENT is None:
@ -83,7 +83,7 @@ class LangChainTracer(BaseTracer):
_EXECUTORS.append(self.executor)
else:
self.executor = None
self.client = client or _get_client()
self.client = client or get_client()
self._futures: Set[Future] = set()
self.tags = tags or []
global _TRACERS

View File

@ -0,0 +1,729 @@
import random
adjectives = [
"abandoned",
"aching",
"advanced",
"ample",
"artistic",
"back",
"best",
"bold",
"brief",
"clear",
"cold",
"complicated",
"cooked",
"crazy",
"crushing",
"damp",
"dear",
"definite",
"dependable",
"diligent",
"drab",
"earnest",
"elderly",
"enchanted",
"essential",
"excellent",
"extraneous",
"fixed",
"flowery",
"formal",
"fresh",
"frosty",
"giving",
"glossy",
"healthy",
"helpful",
"impressionable",
"kind",
"large",
"left",
"long",
"loyal",
"mealy",
"memorable",
"monthly",
"new",
"notable",
"only",
"ordinary",
"passionate",
"perfect",
"pertinent",
"proper",
"puzzled",
"reflecting",
"respectful",
"roasted",
"scholarly",
"shiny",
"slight",
"sparkling",
"spotless",
"stupendous",
"sunny",
"tart",
"terrific",
"timely",
"unique",
"upbeat",
"vacant",
"virtual",
"warm",
"weary",
"whispered",
"worthwhile",
"yellow",
]
nouns = [
"account",
"acknowledgment",
"address",
"advertising",
"airplane",
"animal",
"appointment",
"arrival",
"artist",
"attachment",
"attitude",
"availability",
"backpack",
"bag",
"balance",
"bass",
"bean",
"beauty",
"bibliography",
"bill",
"bite",
"blossom",
"boat",
"book",
"box",
"boy",
"bread",
"bridge",
"broccoli",
"building",
"butter",
"button",
"cabbage",
"cake",
"camera",
"camp",
"candle",
"candy",
"canvas",
"car",
"card",
"carrot",
"cart",
"case",
"cat",
"chain",
"chair",
"chalk",
"chance",
"change",
"channel",
"character",
"charge",
"charm",
"chart",
"check",
"cheek",
"cheese",
"chef",
"cherry",
"chicken",
"child",
"church",
"circle",
"class",
"clay",
"click",
"clock",
"cloth",
"cloud",
"clove",
"club",
"coach",
"coal",
"coast",
"coat",
"cod",
"coffee",
"collar",
"color",
"comb",
"comfort",
"comic",
"committee",
"community",
"company",
"comparison",
"competition",
"condition",
"connection",
"control",
"cook",
"copper",
"copy",
"corn",
"cough",
"country",
"cover",
"crate",
"crayon",
"cream",
"creator",
"crew",
"crown",
"current",
"curtain",
"curve",
"cushion",
"dad",
"daughter",
"day",
"death",
"debt",
"decision",
"deer",
"degree",
"design",
"desire",
"desk",
"detail",
"development",
"digestion",
"dime",
"dinner",
"direction",
"dirt",
"discovery",
"discussion",
"disease",
"disgust",
"distance",
"distribution",
"division",
"doctor",
"dog",
"door",
"drain",
"drawer",
"dress",
"drink",
"driving",
"dust",
"ear",
"earth",
"edge",
"education",
"effect",
"egg",
"end",
"energy",
"engine",
"error",
"event",
"example",
"exchange",
"existence",
"expansion",
"experience",
"expert",
"eye",
"face",
"fact",
"fall",
"family",
"farm",
"father",
"fear",
"feeling",
"field",
"finger",
"fire",
"fish",
"flag",
"flight",
"floor",
"flower",
"fold",
"food",
"football",
"force",
"form",
"frame",
"friend",
"frog",
"fruit",
"fuel",
"furniture",
"game",
"garden",
"gate",
"girl",
"glass",
"glove",
"goat",
"gold",
"government",
"grade",
"grain",
"grass",
"green",
"grip",
"group",
"growth",
"guide",
"guitar",
"hair",
"hall",
"hand",
"harbor",
"harmony",
"hat",
"head",
"health",
"heart",
"heat",
"hill",
"history",
"hobbies",
"hole",
"hope",
"horn",
"horse",
"hospital",
"hour",
"house",
"humor",
"idea",
"impulse",
"income",
"increase",
"industry",
"ink",
"insect",
"instrument",
"insurance",
"interest",
"invention",
"iron",
"island",
"jelly",
"jet",
"jewel",
"join",
"judge",
"juice",
"jump",
"kettle",
"key",
"kick",
"kiss",
"kitten",
"knee",
"knife",
"knowledge",
"land",
"language",
"laugh",
"law",
"lead",
"learning",
"leather",
"leg",
"lettuce",
"level",
"library",
"lift",
"light",
"limit",
"line",
"linen",
"lip",
"liquid",
"list",
"look",
"loss",
"love",
"lunch",
"machine",
"man",
"manager",
"map",
"marble",
"mark",
"market",
"mass",
"match",
"meal",
"measure",
"meat",
"meeting",
"memory",
"metal",
"middle",
"milk",
"mind",
"mine",
"minute",
"mist",
"mitten",
"mom",
"money",
"monkey",
"month",
"moon",
"morning",
"mother",
"motion",
"mountain",
"mouth",
"muscle",
"music",
"nail",
"name",
"nation",
"neck",
"need",
"news",
"night",
"noise",
"note",
"number",
"nut",
"observation",
"offer",
"oil",
"operation",
"opinion",
"orange",
"order",
"organization",
"ornament",
"oven",
"page",
"pail",
"pain",
"paint",
"pan",
"pancake",
"paper",
"parcel",
"parent",
"part",
"passenger",
"paste",
"payment",
"peace",
"pear",
"pen",
"pencil",
"person",
"pest",
"pet",
"picture",
"pie",
"pin",
"pipe",
"pizza",
"place",
"plane",
"plant",
"plastic",
"plate",
"play",
"pleasure",
"plot",
"plough",
"pocket",
"point",
"poison",
"police",
"pollution",
"popcorn",
"porter",
"position",
"pot",
"potato",
"powder",
"power",
"price",
"print",
"process",
"produce",
"product",
"profit",
"property",
"prose",
"protest",
"pull",
"pump",
"punishment",
"purpose",
"push",
"quarter",
"question",
"quiet",
"quill",
"quilt",
"quince",
"rabbit",
"rail",
"rain",
"range",
"rat",
"rate",
"ray",
"reaction",
"reading",
"reason",
"record",
"regret",
"relation",
"religion",
"representative",
"request",
"respect",
"rest",
"reward",
"rhythm",
"rice",
"river",
"road",
"roll",
"room",
"root",
"rose",
"route",
"rub",
"rule",
"run",
"sack",
"sail",
"salt",
"sand",
"scale",
"scarecrow",
"scarf",
"scene",
"scent",
"school",
"science",
"scissors",
"screw",
"sea",
"seat",
"secretary",
"seed",
"selection",
"self",
"sense",
"servant",
"shade",
"shake",
"shame",
"shape",
"sheep",
"sheet",
"shelf",
"ship",
"shirt",
"shock",
"shoe",
"shop",
"show",
"side",
"sign",
"silk",
"sink",
"sister",
"size",
"sky",
"slave",
"sleep",
"smash",
"smell",
"smile",
"smoke",
"snail",
"snake",
"sneeze",
"snow",
"soap",
"society",
"sock",
"soda",
"sofa",
"son",
"song",
"sort",
"sound",
"soup",
"space",
"spark",
"speed",
"sponge",
"spoon",
"spray",
"spring",
"spy",
"square",
"stamp",
"star",
"start",
"statement",
"station",
"steam",
"steel",
"stem",
"step",
"stew",
"stick",
"stitch",
"stocking",
"stomach",
"stone",
"stop",
"store",
"story",
"stove",
"stranger",
"straw",
"stream",
"street",
"stretch",
"string",
"structure",
"substance",
"sugar",
"suggestion",
"suit",
"summer",
"sun",
"support",
"surprise",
"sweater",
"swim",
"system",
"table",
"tail",
"talk",
"tank",
"taste",
"tax",
"tea",
"teaching",
"team",
"tendency",
"test",
"texture",
"theory",
"thing",
"thought",
"thread",
"throat",
"thumb",
"thunder",
"ticket",
"time",
"tin",
"title",
"toad",
"toe",
"tooth",
"toothpaste",
"touch",
"town",
"toy",
"trade",
"train",
"transport",
"tray",
"treatment",
"tree",
"trick",
"trip",
"trouble",
"trousers",
"truck",
"tub",
"turkey",
"turn",
"twist",
"umbrella",
"uncle",
"underwear",
"unit",
"use",
"vacation",
"value",
"van",
"vase",
"vegetable",
"veil",
"vein",
"verse",
"vessel",
"view",
"visitor",
"voice",
"volcano",
"walk",
"wall",
"war",
"wash",
"waste",
"watch",
"water",
"wave",
"wax",
"way",
"wealth",
"weather",
"week",
"weight",
"wheel",
"whip",
"whistle",
"window",
"wine",
"wing",
"winter",
"wire",
"wish",
"woman",
"wood",
"wool",
"word",
"work",
"worm",
"wound",
"wrist",
"writer",
"yard",
"yoke",
"zebra",
"zinc",
"zipper",
"zone",
]
def random_name(prefix: str = "test") -> str:
    """Build a random name of the form ``<prefix>-<adjective>-<noun>-<number>``.

    Args:
        prefix: Leading segment of the generated name.

    Returns:
        The prefix, one entry drawn from ``adjectives``, one entry drawn from
        ``nouns``, and an integer between 1 and 100 inclusive, joined by
        hyphens.
    """
    # Draw in the same order as the pieces appear in the result.
    picks = (
        random.choice(adjectives),
        random.choice(nouns),
        random.randint(1, 100),
    )
    return "{}-{}-{}-{}".format(prefix, *picks)

View File

@ -0,0 +1,82 @@
"""A simple progress bar for the console."""
import threading
from typing import Any, Dict, Optional, Sequence
from uuid import UUID
from langchain.callbacks import base as base_callbacks
from langchain.schema.document import Document
from langchain.schema.output import LLMResult
class ProgressBarCallback(base_callbacks.BaseCallbackHandler):
    """A simple progress bar for the console.

    The bar advances by one each time a top-level run (one without a parent
    run) of a chain, retriever, LLM, or tool finishes.
    """

    def __init__(self, total: int, ncols: int = 50, **kwargs: Any):
        """Initialize the progress bar and draw it once.

        Args:
            total: int, the total number of items to be processed.
            ncols: int, the character width of the progress bar.
            **kwargs: Accepted for call-site compatibility; not used here.
        """
        self.total = total
        self.ncols = ncols
        self.counter = 0
        self.lock = threading.Lock()
        self._print_bar()

    def increment(self) -> None:
        """Increment the counter and update the progress bar."""
        # Count and redraw under the same lock so concurrent callbacks cannot
        # lose an update or interleave their output.
        with self.lock:
            self.counter += 1
            self._print_bar()

    def _print_bar(self) -> None:
        """Print the progress bar to the console."""
        if self.total > 0:
            # Clamp so an over-count never makes the bar wider than ncols.
            progress = min(self.counter / self.total, 1.0)
        else:
            # Nothing to process: draw an empty bar instead of dividing by 0.
            progress = 0.0
        arrow = "-" * int(round(progress * self.ncols) - 1) + ">"
        spaces = " " * (self.ncols - len(arrow))
        # The line has no trailing newline, so flush explicitly or a buffered
        # stdout may not show the update.
        print(
            f"\r[{arrow + spaces}] {self.counter}/{self.total}",
            end="",
            flush=True,
        )

    def _increment_if_root(self, parent_run_id: Optional[UUID]) -> None:
        """Advance the bar only for runs that have no parent run."""
        if parent_run_id is None:
            self.increment()

    def on_chain_end(
        self,
        outputs: Dict[str, Any],
        *,
        run_id: UUID,
        parent_run_id: Optional[UUID] = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level chain run ends."""
        self._increment_if_root(parent_run_id)

    def on_retriever_end(
        self,
        documents: Sequence[Document],
        *,
        run_id: UUID,
        parent_run_id: Optional[UUID] = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level retriever run ends."""
        self._increment_if_root(parent_run_id)

    def on_llm_end(
        self,
        response: LLMResult,
        *,
        run_id: UUID,
        parent_run_id: Optional[UUID] = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level LLM run ends."""
        self._increment_if_root(parent_run_id)

    def on_tool_end(
        self,
        output: str,
        *,
        run_id: UUID,
        parent_run_id: Optional[UUID] = None,
        **kwargs: Any,
    ) -> Any:
        """Advance the bar when a top-level tool run ends."""
        self._increment_if_root(parent_run_id)

File diff suppressed because it is too large Load Diff

View File

@ -148,13 +148,27 @@ class ChainStringRunMapper(StringRunMapper):
def map(self, run: Run) -> Dict[str, str]:
"""Maps the Run to a dictionary."""
if not run.outputs:
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
if self.input_key is not None and self.input_key not in run.inputs:
raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
elif self.prediction_key is not None and self.prediction_key not in run.outputs:
raise ValueError(
f"Run {run.id} does not have prediction key {self.prediction_key}."
f"Run with ID {run.id} lacks outputs required for evaluation."
" Ensure the Run has valid outputs."
)
if self.input_key is not None and self.input_key not in run.inputs:
raise ValueError(
f"Run with ID {run.id} is missing the expected input key"
f" '{self.input_key}'.\nAvailable input keys in this Run"
f" are: {run.inputs.keys()}.\nAdjust the evaluator's"
f" input_key or ensure your input data includes key"
f" '{self.input_key}'."
)
elif self.prediction_key is not None and self.prediction_key not in run.outputs:
available_keys = ", ".join(run.outputs.keys())
raise ValueError(
f"Run with ID {run.id} doesn't have the expected prediction key"
f" '{self.prediction_key}'. Available prediction keys in this Run are:"
f" {available_keys}. Adjust the evaluator's prediction_key or"
" ensure the Run object's outputs the expected key."
)
else:
input_ = self._get_key(run.inputs, self.input_key, "input")
prediction = self._get_key(run.outputs, self.prediction_key, "prediction")

View File

@ -5,7 +5,6 @@ import pytest
from langsmith import Client as Client
from langsmith.schemas import DataType
from langchain.callbacks.tracers.evaluation import wait_for_all_evaluators
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType
@ -22,7 +21,6 @@ def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
# chain or llm passes for the feedback provided.
runs = list(client.list_runs(project_name=_project_name, execution_order=1))
assert len(runs) == 4
wait_for_all_evaluators()
feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
assert len(feedback) == 8
assert all([f.score == 1 for f in feedback])

View File

@ -181,11 +181,15 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
assert "the wrong input" in inputs
return {"the right input": inputs["the wrong input"]}
result = _run_llm_or_chain(example, lambda: mock_chain, input_mapper=input_mapper)
result = _run_llm_or_chain(
example,
{"callbacks": [], "tags": []},
llm_or_chain_factory=lambda: mock_chain,
input_mapper=input_mapper,
)
assert result == {"output": "2", "the right input": "1"}
bad_result = _run_llm_or_chain(
example,
lambda: mock_chain,
example, {"callbacks": [], "tags": []}, llm_or_chain_factory=lambda: mock_chain
)
assert "Error" in bad_result
@ -195,7 +199,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
return "the right input"
mock_llm = FakeLLM(queries={"the right input": "somenumber"})
llm_result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper)
llm_result = _run_llm_or_chain(
example,
{"callbacks": [], "tags": []},
llm_or_chain_factory=mock_llm,
input_mapper=llm_input_mapper,
)
assert isinstance(llm_result, str)
assert llm_result == "somenumber"
@ -324,10 +333,14 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
)
expected = {
uuid_: {
"output": {"result": f"Result for example {uuid.UUID(uuid_)}"},
str(example.id): {
"output": {
"result": f"Result for example {uuid.UUID(str(example.id))}"
},
"input": {"input": example.inputs["input"]},
"reference": {"output": example.outputs["output"]},
"feedback": [],
}
for uuid_ in uuids
for example in examples
}
assert results["results"] == expected