mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-28 18:48:50 +00:00
Add execution time (#13542)
Also log a warning instead of raising an error when the input or prediction key does not match the chain's declared keys, since the chain API is too inconsistent.
This commit is contained in:
parent
0fb5f857f9
commit
c56faa6ef1
@ -116,6 +116,7 @@ class TestResult(dict):
|
||||
**{f.key: f.score for f in feedback},
|
||||
"input": result["input"],
|
||||
"output": result["output"],
|
||||
"execution_time": result["execution_time"],
|
||||
}
|
||||
if "reference" in result:
|
||||
r["reference"] = result["reference"]
|
||||
@ -418,12 +419,17 @@ def _determine_input_key(
|
||||
if config.input_key:
|
||||
input_key = config.input_key
|
||||
if run_inputs and input_key not in run_inputs:
|
||||
raise ValueError(f"Input key {input_key} not in run inputs {run_inputs}")
|
||||
logger.warning(
|
||||
f"Input key {input_key} not in chain's specified"
|
||||
f" input keys {run_inputs}. Evaluation behavior may be undefined."
|
||||
)
|
||||
elif run_inputs and len(run_inputs) == 1:
|
||||
input_key = run_inputs[0]
|
||||
elif run_inputs is not None and len(run_inputs) > 1:
|
||||
raise ValueError(
|
||||
f"Must specify input key for model with multiple inputs: {run_inputs}"
|
||||
logger.warning(
|
||||
f"Chain expects multiple input keys: {run_inputs},"
|
||||
f" Evaluator is likely to fail. Evaluation behavior may be undefined."
|
||||
" Specify an input_key in the RunEvalConfig to avoid this warning."
|
||||
)
|
||||
|
||||
return input_key
|
||||
@ -437,15 +443,17 @@ def _determine_prediction_key(
|
||||
if config.prediction_key:
|
||||
prediction_key = config.prediction_key
|
||||
if run_outputs and prediction_key not in run_outputs:
|
||||
raise ValueError(
|
||||
f"Prediction key {prediction_key} not in run outputs {run_outputs}"
|
||||
logger.warning(
|
||||
f"Prediction key {prediction_key} not in chain's specified"
|
||||
f" output keys {run_outputs}. Evaluation behavior may be undefined."
|
||||
)
|
||||
elif run_outputs and len(run_outputs) == 1:
|
||||
prediction_key = run_outputs[0]
|
||||
elif run_outputs is not None and len(run_outputs) > 1:
|
||||
raise ValueError(
|
||||
f"Must specify prediction key for model"
|
||||
f" with multiple outputs: {run_outputs}"
|
||||
logger.warning(
|
||||
f"Chain expects multiple output keys: {run_outputs},"
|
||||
f" Evaluation behavior may be undefined. Specify a prediction_key"
|
||||
" in the RunEvalConfig to avoid this warning."
|
||||
)
|
||||
return prediction_key
|
||||
|
||||
@ -978,6 +986,14 @@ def _collect_test_results(
|
||||
all_eval_results.update(
|
||||
{example_id: v for (_, example_id), v in eval_results.items()}
|
||||
)
|
||||
elif isinstance(callback, LangChainTracer):
|
||||
run = callback.latest_run
|
||||
execution_time = (
|
||||
(run.end_time - run.start_time).total_seconds()
|
||||
if run and run.end_time
|
||||
else None
|
||||
)
|
||||
|
||||
results = {}
|
||||
for example, output in zip(examples, batch_results):
|
||||
feedback = all_eval_results.get(str(example.id), [])
|
||||
@ -985,6 +1001,7 @@ def _collect_test_results(
|
||||
"output": output,
|
||||
"input": example.inputs,
|
||||
"feedback": feedback,
|
||||
"execution_time": execution_time,
|
||||
}
|
||||
if example.outputs:
|
||||
results[str(example.id)]["reference"] = example.outputs
|
||||
|
@ -5,6 +5,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from freezegun import freeze_time
|
||||
from langsmith.client import Client
|
||||
from langsmith.schemas import Dataset, Example
|
||||
|
||||
@ -239,6 +240,7 @@ def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@freeze_time("2023-01-01")
|
||||
async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
dataset = Dataset(
|
||||
id=uuid.uuid4(),
|
||||
@ -341,6 +343,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
else None
|
||||
},
|
||||
"feedback": [],
|
||||
# No run since we mock the call to the llm above
|
||||
"execution_time": None,
|
||||
}
|
||||
for example in examples
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user