Mirror of https://github.com/hwchase17/langchain.git (synced 2025-11-03 17:54:10 +00:00)

	Add execution time (#13542)
And warn instead of raising an error, since the chain API is too inconsistent.
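Because key mismatches now surface as log warnings instead of exceptions, callers that previously relied on the old ValueError should make sure warnings are actually visible. A minimal sketch using the standard logging module (the logger name below is an assumption, not taken from this diff):

    import logging

    # Show WARNING-level records on stderr, including those emitted by the
    # evaluation runner when input/prediction keys do not line up.
    logging.basicConfig(level=logging.WARNING)

    # Optionally narrow the filter to the langchain namespace (assumed logger name).
    logging.getLogger("langchain").setLevel(logging.WARNING)
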
@@ -116,6 +116,7 @@ class TestResult(dict):
                 **{f.key: f.score for f in feedback},
                 "input": result["input"],
                 "output": result["output"],
+                "execution_time": result["execution_time"],
             }
             if "reference" in result:
                 r["reference"] = result["reference"]
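The rows built above end up with one feedback-score column per evaluator plus the new "execution_time" column. A hedged sketch of what consuming those rows might look like, assuming pandas is installed and the row shape shown in this hunk (all values below are made up):

    import pandas as pd

    # Example rows shaped like the dict constructed above (hypothetical data).
    rows = {
        "example-1": {"input": {"q": "hi"}, "output": {"a": "hello"},
                      "correctness": 1.0, "execution_time": 0.42},
        "example-2": {"input": {"q": "bye"}, "output": {"a": "goodbye"},
                      "correctness": 0.0, "execution_time": 0.37},
    }

    df = pd.DataFrame.from_dict(rows, orient="index")
    print(df["execution_time"].mean())  # average latency across examples
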
@@ -418,12 +419,17 @@ def _determine_input_key(
     if config.input_key:
         input_key = config.input_key
         if run_inputs and input_key not in run_inputs:
-            raise ValueError(f"Input key {input_key} not in run inputs {run_inputs}")
+            logger.warning(
+                f"Input key {input_key} not in chain's specified"
+                f" input keys {run_inputs}. Evaluation behavior may be undefined."
+            )
     elif run_inputs and len(run_inputs) == 1:
         input_key = run_inputs[0]
     elif run_inputs is not None and len(run_inputs) > 1:
-        raise ValueError(
-            f"Must specify input key for model with multiple inputs: {run_inputs}"
+        logger.warning(
+            f"Chain expects multiple input keys: {run_inputs},"
+            f" Evaluator is likely to fail. Evaluation behavior may be undefined."
+            " Specify an input_key in the RunEvalConfig to avoid this warning."
         )
 
     return input_key
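The new control flow keeps going after a mismatch instead of aborting the whole test run. A self-contained sketch of the same resolution pattern, outside the LangChain codebase (the function name and signature here are illustrative, not the module's real API):

    import logging
    from typing import Optional, Sequence

    logger = logging.getLogger(__name__)

    def resolve_input_key(
        configured_key: Optional[str], run_inputs: Optional[Sequence[str]]
    ) -> Optional[str]:
        """Pick an input key, warning (not raising) on inconsistencies."""
        if configured_key:
            if run_inputs and configured_key not in run_inputs:
                # Old behavior: raise ValueError. New behavior: warn and continue.
                logger.warning(
                    "Input key %s not in chain's input keys %s;"
                    " evaluation behavior may be undefined.",
                    configured_key,
                    run_inputs,
                )
            return configured_key
        if run_inputs and len(run_inputs) == 1:
            return run_inputs[0]
        if run_inputs and len(run_inputs) > 1:
            logger.warning(
                "Chain expects multiple input keys %s; specify one explicitly.",
                run_inputs,
            )
        return None
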
@@ -437,15 +443,17 @@ def _determine_prediction_key(
     if config.prediction_key:
         prediction_key = config.prediction_key
         if run_outputs and prediction_key not in run_outputs:
-            raise ValueError(
-                f"Prediction key {prediction_key} not in run outputs {run_outputs}"
+            logger.warning(
+                f"Prediction key {prediction_key} not in chain's specified"
+                f" output keys {run_outputs}. Evaluation behavior may be undefined."
             )
     elif run_outputs and len(run_outputs) == 1:
         prediction_key = run_outputs[0]
     elif run_outputs is not None and len(run_outputs) > 1:
-        raise ValueError(
-            f"Must specify prediction key for model"
-            f" with multiple outputs: {run_outputs}"
+        logger.warning(
+            f"Chain expects multiple output keys: {run_outputs},"
+            f" Evaluation behavior may be undefined. Specify a prediction_key"
+            " in the RunEvalConfig to avoid this warning."
         )
     return prediction_key
 
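Both warning messages point users at RunEvalConfig. A hedged usage sketch showing how the keys could be pinned explicitly; the import path, evaluator name, and key values are assumptions about the API of this era, not taken from this diff:

    from langchain.smith import RunEvalConfig

    eval_config = RunEvalConfig(
        evaluators=["qa"],
        # Pin the keys so _determine_input_key / _determine_prediction_key
        # do not have to guess for chains with multiple inputs or outputs.
        input_key="question",       # which chain input the evaluator should read
        prediction_key="answer",    # which chain output counts as the prediction
    )
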
@@ -978,6 +986,14 @@ def _collect_test_results(
                 all_eval_results.update(
                     {example_id: v for (_, example_id), v in eval_results.items()}
                 )
+            elif isinstance(callback, LangChainTracer):
+                run = callback.latest_run
+                execution_time = (
+                    (run.end_time - run.start_time).total_seconds()
+                    if run and run.end_time
+                    else None
+                )
+
     results = {}
     for example, output in zip(examples, batch_results):
         feedback = all_eval_results.get(str(example.id), [])
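The elapsed time is derived from the tracer's latest run, guarding against a missing run or a missing end_time. The same arithmetic in isolation, with plain datetime objects (a sketch; the helper name and the timestamps below are made up, only the start_time/end_time subtraction mirrors the hunk):

    from datetime import datetime, timedelta
    from typing import Optional

    def elapsed_seconds(
        start: Optional[datetime], end: Optional[datetime]
    ) -> Optional[float]:
        """Return end - start in seconds, or None if either bound is missing."""
        if start is None or end is None:
            return None
        return (end - start).total_seconds()

    start = datetime(2023, 1, 1, 12, 0, 0)
    end = start + timedelta(milliseconds=420)
    print(elapsed_seconds(start, end))   # 0.42
    print(elapsed_seconds(start, None))  # None, mirrors the `else None` branch
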
@@ -985,6 +1001,7 @@ def _collect_test_results(
             "output": output,
             "input": example.inputs,
             "feedback": feedback,
+            "execution_time": execution_time,
         }
         if example.outputs:
             results[str(example.id)]["reference"] = example.outputs
 
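Each per-example entry in the returned results now carries "execution_time", which can be None when no tracer run was recorded (see the test change below). A small sketch of aggregating it defensively, with hypothetical data:

    results = {
        "id-1": {"feedback": [], "execution_time": 0.42},
        "id-2": {"feedback": [], "execution_time": None},  # e.g. mocked run, no tracer data
        "id-3": {"feedback": [], "execution_time": 0.58},
    }

    # Skip entries without timing information before averaging.
    times = [r["execution_time"] for r in results.values() if r["execution_time"] is not None]
    avg = sum(times) / len(times) if times else None
    print(avg)  # 0.5
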
@@ -5,6 +5,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union
 from unittest import mock
 
 import pytest
+from freezegun import freeze_time
 from langsmith.client import Client
 from langsmith.schemas import Dataset, Example
 
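freezegun pins the wall clock, which keeps datetime-based values in the tests deterministic now that run timestamps feed into execution_time. A minimal, standalone illustration of freeze_time, independent of this test file:

    from datetime import datetime

    from freezegun import freeze_time

    @freeze_time("2023-01-01")
    def clock_check() -> None:
        # Inside the frozen scope, "now" always resolves to the pinned instant.
        assert datetime.now() == datetime(2023, 1, 1)
        assert datetime.utcnow() == datetime(2023, 1, 1)

    clock_check()
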
@@ -239,6 +240,7 @@ def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
 
 
 @pytest.mark.asyncio
+@freeze_time("2023-01-01")
 async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     dataset = Dataset(
         id=uuid.uuid4(),
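freeze_time can also decorate coroutine test functions in recent freezegun releases, which is why it stacks under @pytest.mark.asyncio here. A hedged skeleton of the pattern (the test body is illustrative only; requires pytest-asyncio and freezegun):

    from datetime import datetime

    import pytest
    from freezegun import freeze_time

    @pytest.mark.asyncio
    @freeze_time("2023-01-01")
    async def test_time_is_pinned() -> None:
        # Timestamps captured while the chain runs all resolve to the frozen
        # instant, so any derived duration is deterministic (here: zero).
        start = datetime.now()
        end = datetime.now()
        assert (end - start).total_seconds() == 0.0
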
@@ -341,6 +343,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
                     else None
                 },
                 "feedback": [],
+                # No run since we mock the call to the llm above
+                "execution_time": None,
             }
             for example in examples
         }
 