mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-11 13:55:03 +00:00
[Nits] Evaluation - Some Rendering Improvements (#14097)
- Improve rendering of aggregate results at the end
- Flatten reference if present
This commit is contained in:
parent
f15859bd86
commit
71c2e184b4
@ -14,7 +14,6 @@ from typing import (
|
|||||||
Dict,
|
Dict,
|
||||||
List,
|
List,
|
||||||
Optional,
|
Optional,
|
||||||
Sequence,
|
|
||||||
Tuple,
|
Tuple,
|
||||||
Union,
|
Union,
|
||||||
cast,
|
cast,
|
||||||
@ -77,7 +76,7 @@ class TestResult(dict):
|
|||||||
"""A dictionary of the results of a single test run."""
|
"""A dictionary of the results of a single test run."""
|
||||||
|
|
||||||
def get_aggregate_feedback(
|
def get_aggregate_feedback(
|
||||||
self, quantiles: Optional[Sequence[float]] = None
|
self,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""Return quantiles for the feedback scores.
|
"""Return quantiles for the feedback scores.
|
||||||
|
|
||||||
@ -88,7 +87,14 @@ class TestResult(dict):
|
|||||||
A DataFrame containing the quantiles for each feedback key.
|
A DataFrame containing the quantiles for each feedback key.
|
||||||
"""
|
"""
|
||||||
df = self.to_dataframe()
|
df = self.to_dataframe()
|
||||||
to_drop = {"input", "output", "reference"}.intersection(df.columns)
|
# Drop all things starting with inputs., outputs., and reference
|
||||||
|
to_drop = [
|
||||||
|
col
|
||||||
|
for col in df.columns
|
||||||
|
if col.startswith("inputs.")
|
||||||
|
or col.startswith("outputs.")
|
||||||
|
or col.startswith("reference")
|
||||||
|
]
|
||||||
return df.describe(include="all").drop(to_drop, axis=1)
|
return df.describe(include="all").drop(to_drop, axis=1)
|
||||||
|
|
||||||
def to_dataframe(self) -> pd.DataFrame:
|
def to_dataframe(self) -> pd.DataFrame:
|
||||||
@ -118,7 +124,12 @@ class TestResult(dict):
|
|||||||
**output,
|
**output,
|
||||||
}
|
}
|
||||||
if "reference" in result:
|
if "reference" in result:
|
||||||
r["reference"] = result["reference"]
|
if isinstance(result["reference"], dict):
|
||||||
|
r.update(
|
||||||
|
{f"reference.{k}": v for k, v in result["reference"].items()}
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
r["reference"] = result["reference"]
|
||||||
r.update(
|
r.update(
|
||||||
{
|
{
|
||||||
**{f"feedback.{f.key}": f.score for f in feedback},
|
**{f"feedback.{f.key}": f.score for f in feedback},
|
||||||
@ -910,6 +921,7 @@ def _prepare_eval_run(
|
|||||||
) -> Tuple[MCF, str, Dataset, List[Example]]:
|
) -> Tuple[MCF, str, Dataset, List[Example]]:
|
||||||
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
|
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
|
||||||
dataset = client.read_dataset(dataset_name=dataset_name)
|
dataset = client.read_dataset(dataset_name=dataset_name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
project_extra: dict = {"metadata": project_metadata} if project_metadata else {}
|
project_extra: dict = {"metadata": project_metadata} if project_metadata else {}
|
||||||
if tags:
|
if tags:
|
||||||
@ -933,9 +945,10 @@ run_on_dataset(
|
|||||||
f"Test project {project_name} already exists. Please use a different name:"
|
f"Test project {project_name} already exists. Please use a different name:"
|
||||||
f"\n\n{example_msg}"
|
f"\n\n{example_msg}"
|
||||||
)
|
)
|
||||||
|
comparison_url = dataset.url + f"/compare?selectedSessions={project.id}"
|
||||||
print(
|
print(
|
||||||
f"View the evaluation results for project '{project_name}'"
|
f"View the evaluation results for project '{project_name}'"
|
||||||
f" at:\n{project.url}?eval=true\n\n"
|
f" at:\n{comparison_url}\n\n"
|
||||||
f"View all tests for Dataset {dataset_name} at:\n{dataset.url}",
|
f"View all tests for Dataset {dataset_name} at:\n{dataset.url}",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
@ -1042,6 +1055,30 @@ def _collect_test_results(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_jupyter_environment() -> bool:
|
||||||
|
try:
|
||||||
|
from IPython import get_ipython
|
||||||
|
|
||||||
|
res = get_ipython()
|
||||||
|
return get_ipython() is not None and "zmqshell" in str(type(res))
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _display_aggregate_results(aggregate_results: pd.DataFrame) -> None:
    """Show the aggregate results table to the user.

    In a Jupyter environment the table is rendered through IPython's display
    machinery under an HTML heading; everywhere else it is printed as plain
    text with floats formatted to two decimal places.

    Args:
        aggregate_results: The DataFrame of aggregate results to show.
    """
    if not _is_jupyter_environment():
        # Build the text first so a formatting failure prints nothing at all.
        table_text = aggregate_results.to_string(
            float_format="{:.2f}".format,
            justify="right",
        )
        print("\n Experiment Results:")
        print(table_text)
        return

    # Imported lazily: IPython is only needed when running inside a notebook.
    from IPython.display import HTML, display

    heading = HTML("<h3>Experiment Results:</h3>")
    display(heading)
    display(aggregate_results)
|
||||||
|
|
||||||
|
|
||||||
_INPUT_MAPPER_DEP_WARNING = (
|
_INPUT_MAPPER_DEP_WARNING = (
|
||||||
"The input_mapper argument is deprecated and "
|
"The input_mapper argument is deprecated and "
|
||||||
"will be removed in a future release. Please add a "
|
"will be removed in a future release. Please add a "
|
||||||
@ -1182,8 +1219,7 @@ def run_on_dataset(
|
|||||||
if verbose:
|
if verbose:
|
||||||
try:
|
try:
|
||||||
agg_feedback = results.get_aggregate_feedback()
|
agg_feedback = results.get_aggregate_feedback()
|
||||||
print("\n Eval quantiles:")
|
_display_aggregate_results(agg_feedback)
|
||||||
print(agg_feedback)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
|
logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
|
||||||
return results
|
return results
|
||||||
|
@ -248,6 +248,7 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
|
|||||||
owner_id="owner",
|
owner_id="owner",
|
||||||
created_at=_CREATED_AT,
|
created_at=_CREATED_AT,
|
||||||
tenant_id=_TENANT_ID,
|
tenant_id=_TENANT_ID,
|
||||||
|
_host_url="http://localhost:1984",
|
||||||
)
|
)
|
||||||
uuids = [
|
uuids = [
|
||||||
"0c193153-2309-4704-9a47-17aee4fb25c8",
|
"0c193153-2309-4704-9a47-17aee4fb25c8",
|
||||||
|
Loading…
Reference in New Issue
Block a user