Add error rate (#13568)

Adds the error to the in-memory outputs, separated out from the outputs themselves so that it's present in the dataframe.describe() results.
William FH 2023-11-21 07:51:30 -08:00 committed by GitHub
parent 8329f81072
commit 17c6551c18
2 changed files with 43 additions and 19 deletions
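
For orientation, a minimal editorial sketch (not part of the diff, toy column names) of why recording the error in its own column matters: pandas' describe(include="all") can then summarize failures next to the feedback scores.

import pandas as pd

# Stand-in for TestResult.to_dataframe(): one row per evaluated example.
df = pd.DataFrame(
    {
        "feedback.correctness": [1.0, 0.0, None],
        "execution_time": [0.8, 1.1, 0.2],
        "error": [None, None, "ValueError('boom')"],
    }
)

# Numeric columns get count/mean/std/quantiles; the object-typed "error"
# column gets count/unique/top/freq, so failures show up in the summary.
summary = df.describe(include="all")

# The "error rate" from the commit title is then just the share of non-null errors.
error_rate = df["error"].notna().mean()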


@@ -550,7 +550,6 @@ nouns = [
     "sister",
     "size",
     "sky",
-    "slave",
     "sleep",
     "smash",
     "smell",


@ -88,15 +88,8 @@ class TestResult(dict):
A DataFrame containing the quantiles for each feedback key. A DataFrame containing the quantiles for each feedback key.
""" """
df = self.to_dataframe() df = self.to_dataframe()
feedback_cols = [ to_drop = {"input", "output", "reference"}.intersection(df.columns)
col for col in df.columns if col not in ["input", "output", "reference"] return df.describe(include="all").drop(to_drop, axis=1)
]
_quantiles = df[feedback_cols].quantile(
quantiles or [0.25, 0.5, 0.75], numeric_only=True
)
_quantiles.loc["mean"] = df[feedback_cols].mean()
_quantiles.loc["mode"] = df[feedback_cols].mode().iloc[0]
return _quantiles.transpose()
def to_dataframe(self) -> pd.DataFrame: def to_dataframe(self) -> pd.DataFrame:
"""Convert the results to a dataframe.""" """Convert the results to a dataframe."""
@@ -112,20 +105,46 @@ class TestResult(dict):
         records = []
         for example_id, result in self["results"].items():
             feedback = result["feedback"]
+            output_ = result.get("output")
+            if isinstance(output_, dict):
+                output = {f"outputs.{k}": v for k, v in output_.items()}
+            elif output_ is None:
+                output = {}
+            else:
+                output = {"output": output_}
             r = {
-                **{f.key: f.score for f in feedback},
-                "input": result["input"],
-                "output": result["output"],
-                "execution_time": result["execution_time"],
+                **{f"inputs.{k}": v for k, v in result["input"].items()},
+                **output,
             }
             if "reference" in result:
                 r["reference"] = result["reference"]
+            r.update(
+                {
+                    **{f"feedback.{f.key}": f.score for f in feedback},
+                    "error": result.get("error"),
+                    "execution_time": result["execution_time"],
+                }
+            )
             records.append(r)
             indices.append(example_id)
 
         return pd.DataFrame(records, index=indices)
 
 
+class EvalError(dict):
+    """Your architecture raised an error."""
+
+    def __init__(self, Error: BaseException, **kwargs: Any) -> None:
+        super().__init__(Error=Error, **kwargs)
+
+    def __getattr__(self, name: str) -> Any:
+        try:
+            return self[name]
+        except KeyError:
+            raise AttributeError(f"'EvalError' object has no attribute '{name}'")
+
+
 def _wrap_in_chain_factory(
     llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
     dataset_name: str = "<my_dataset>",
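
A quick behavioral check (editorial sketch that copies the EvalError definition from the hunk above so it runs standalone): the class is still a plain dict, but the stored exception is also reachable as an attribute, which is what the isinstance/attribute handling later in this diff relies on.

from typing import Any


class EvalError(dict):
    """Copied from the diff above for a self-contained check."""

    def __init__(self, Error: BaseException, **kwargs: Any) -> None:
        super().__init__(Error=Error, **kwargs)

    def __getattr__(self, name: str) -> Any:
        try:
            return self[name]
        except KeyError:
            raise AttributeError(f"'EvalError' object has no attribute '{name}'")


err = EvalError(Error=ValueError("boom"))
assert isinstance(err, dict)      # still a dict, so existing result plumbing keeps working
assert err.Error is err["Error"]  # attribute access falls back to the dict keys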
@@ -742,7 +761,7 @@ async def _arun_llm_or_chain(
             f"with inputs {example.inputs}"
             f"\n{repr(e)}"
         )
-        result = {"Error": repr(e)}
+        result = EvalError(Error=e)
     return result
@@ -874,7 +893,7 @@ def _run_llm_or_chain(
             f"with inputs {example.inputs}"
             f"\nError Type: {error_type}, Message: {e}"
         )
-        result = {"Error": repr(e)}
+        result = EvalError(Error=e)
     return result
@@ -979,6 +998,7 @@ def _collect_test_results(
 ) -> TestResult:
     wait_for_all_evaluators()
     all_eval_results = {}
+    all_execution_time = {}
     for c in configs:
         for callback in cast(list, c["callbacks"]):
             if isinstance(callback, EvaluatorCallbackHandler):
@@ -988,21 +1008,26 @@ def _collect_test_results(
                 )
             elif isinstance(callback, LangChainTracer):
                 run = callback.latest_run
+                example_id = callback.example_id
                 execution_time = (
                     (run.end_time - run.start_time).total_seconds()
                     if run and run.end_time
                     else None
                 )
+                all_execution_time[str(example_id)] = execution_time
 
-    results = {}
+    results: dict = {}
     for example, output in zip(examples, batch_results):
         feedback = all_eval_results.get(str(example.id), [])
         results[str(example.id)] = {
-            "output": output,
             "input": example.inputs,
             "feedback": feedback,
-            "execution_time": execution_time,
+            "execution_time": all_execution_time.get(str(example.id)),
         }
+        if isinstance(output, EvalError):
+            results[str(example.id)]["error"] = output.Error
+        else:
+            results[str(example.id)]["output"] = output
         if example.outputs:
             results[str(example.id)]["reference"] = example.outputs
     return TestResult(
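
Finally, an editorial toy of the collected shape above (hypothetical example ids and values): because a failed example carries an "error" key instead of an "output" key, the error rate the commit title refers to falls straight out of the results, and to_dataframe() surfaces it as its own column.

# Hypothetical collected results, shaped like the dict built above:
results = {
    "ex-1": {"input": {"q": "2+2"}, "feedback": [], "execution_time": 0.4, "output": "4"},
    "ex-2": {"input": {"q": "1/0"}, "feedback": [], "execution_time": 0.1,
             "error": ZeroDivisionError("division by zero")},
}

# Errors live under their own key, so the error rate is a one-liner.
error_rate = sum("error" in r for r in results.values()) / len(results)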