diff --git a/libs/langchain/langchain/evaluation/loading.py b/libs/langchain/langchain/evaluation/loading.py
index 81ffec6d797..b5601b04434 100644
--- a/libs/langchain/langchain/evaluation/loading.py
+++ b/libs/langchain/langchain/evaluation/loading.py
@@ -108,7 +108,6 @@ def load_evaluator(
     >>> from langchain.evaluation import load_evaluator, EvaluatorType
     >>> evaluator = load_evaluator(EvaluatorType.QA)
     """
-    llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
     if evaluator not in _EVALUATOR_MAP:
         raise ValueError(
             f"Unknown evaluator type: {evaluator}"
@@ -116,6 +115,16 @@ def load_evaluator(
         )
     evaluator_cls = _EVALUATOR_MAP[evaluator]
     if issubclass(evaluator_cls, LLMEvalChain):
+        try:
+            llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
+        except Exception as e:
+            raise ValueError(
+                f"Evaluation with the {evaluator_cls} requires a "
+                "language model to function."
+                " Failed to create the default 'gpt-4' model."
+                " Please manually provide an evaluation LLM"
+                " or check your openai credentials."
+            ) from e
         return evaluator_cls.from_llm(llm=llm, **kwargs)
     else:
         return evaluator_cls(**kwargs)
@@ -154,7 +163,6 @@ def load_evaluators(
     >>> evaluators = [EvaluatorType.QA, EvaluatorType.CRITERIA]
     >>> loaded_evaluators = load_evaluators(evaluators, criteria="helpfulness")
     """
-    llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
     loaded = []
     for evaluator in evaluators:
         _kwargs = config.get(evaluator, {}) if config else {}
diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py
index f20f5cea4ba..1f432053e20 100644
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -35,7 +35,6 @@ from langchain.callbacks.tracers.base import BaseTracer
 from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
 from langchain.callbacks.tracers.langchain import LangChainTracer
 from langchain.chains.base import Chain
-from langchain.chat_models.openai import ChatOpenAI
 from langchain.evaluation.loading import load_evaluator
 from langchain.evaluation.schema import EvaluatorType, StringEvaluator
 from langchain.schema import ChatResult, LLMResult
@@ -493,7 +492,7 @@ def _determine_reference_key(
 
 def _construct_run_evaluator(
     eval_config: Union[EvaluatorType, str, EvalConfig],
-    eval_llm: BaseLanguageModel,
+    eval_llm: Optional[BaseLanguageModel],
     run_type: str,
     data_type: DataType,
     example_outputs: Optional[List[str]],
@@ -563,7 +562,6 @@ def _load_run_evaluators(
     Returns:
         A list of run evaluators.
     """
-    eval_llm = config.eval_llm or ChatOpenAI(model="gpt-4", temperature=0.0)
     run_evaluators = []
     input_key, prediction_key, reference_key = None, None, None
     if (
@@ -580,7 +578,7 @@ def _load_run_evaluators(
     for eval_config in config.evaluators:
         run_evaluator = _construct_run_evaluator(
             eval_config,
-            eval_llm,
+            config.eval_llm,
             run_type,
             data_type,
             example_outputs,
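
The sketch below is not part of the patch; it only illustrates the intended effect, using names that already appear in the diff (load_evaluator, EvaluatorType, ChatOpenAI, and the llm keyword). After this change the default gpt-4 model is created lazily, and only for evaluators that subclass LLMEvalChain, so callers can sidestep the default entirely by supplying their own model.

    # Usage sketch; the default path assumes OpenAI credentials are configured.
    from langchain.chat_models.openai import ChatOpenAI
    from langchain.evaluation import EvaluatorType, load_evaluator

    # Lazy default: ChatOpenAI(model="gpt-4", temperature=0) is constructed here
    # only because the QA evaluator is an LLMEvalChain; if construction fails,
    # a ValueError now points at the missing evaluation LLM or credentials.
    qa_evaluator = load_evaluator(EvaluatorType.QA)

    # Explicit model: no default is constructed, so nothing depends on the
    # hard-coded gpt-4 fallback.
    qa_evaluator_explicit = load_evaluator(
        EvaluatorType.QA, llm=ChatOpenAI(model="gpt-4", temperature=0)
    )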