mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-08 22:15:08 +00:00
Add support for project metadata in run_on_dataset (#11200)
This commit is contained in:
parent
b11f21c25f
commit
73693c18fc
@ -862,6 +862,7 @@ def _prepare_eval_run(
|
|||||||
dataset_name: str,
|
dataset_name: str,
|
||||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||||
project_name: str,
|
project_name: str,
|
||||||
|
project_metadata: Optional[Dict[str, Any]] = None,
|
||||||
) -> Tuple[MCF, str, Dataset, List[Example]]:
|
) -> Tuple[MCF, str, Dataset, List[Example]]:
|
||||||
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
|
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
|
||||||
dataset = client.read_dataset(dataset_name=dataset_name)
|
dataset = client.read_dataset(dataset_name=dataset_name)
|
||||||
@ -869,6 +870,7 @@ def _prepare_eval_run(
|
|||||||
project = client.create_project(
|
project = client.create_project(
|
||||||
project_name,
|
project_name,
|
||||||
reference_dataset_id=dataset.id,
|
reference_dataset_id=dataset.id,
|
||||||
|
project_extra={"metadata": project_metadata} if project_metadata else {},
|
||||||
)
|
)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
if "already exists " not in str(e):
|
if "already exists " not in str(e):
|
||||||
@ -895,10 +897,15 @@ def _prepare_run_on_dataset(
|
|||||||
tags: Optional[List[str]] = None,
|
tags: Optional[List[str]] = None,
|
||||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
||||||
concurrency_level: int = 5,
|
concurrency_level: int = 5,
|
||||||
|
project_metadata: Optional[Dict[str, Any]] = None,
|
||||||
) -> Tuple[MCF, str, List[Example], List[RunnableConfig]]:
|
) -> Tuple[MCF, str, List[Example], List[RunnableConfig]]:
|
||||||
project_name = project_name or name_generation.random_name()
|
project_name = project_name or name_generation.random_name()
|
||||||
wrapped_model, project_name, dataset, examples = _prepare_eval_run(
|
wrapped_model, project_name, dataset, examples = _prepare_eval_run(
|
||||||
client, dataset_name, llm_or_chain_factory, project_name
|
client,
|
||||||
|
dataset_name,
|
||||||
|
llm_or_chain_factory,
|
||||||
|
project_name,
|
||||||
|
project_metadata=project_metadata,
|
||||||
)
|
)
|
||||||
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
|
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
|
||||||
run_evaluators = _setup_evaluation(
|
run_evaluators = _setup_evaluation(
|
||||||
@ -958,126 +965,41 @@ def _collect_test_results(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_INPUT_MAPPER_DEP_WARNING = (
|
||||||
|
"The input_mapper argument is deprecated and "
|
||||||
|
"will be removed in a future release. Please add a "
|
||||||
|
" RunnableLambda to your chain to map inputs to the expected format"
|
||||||
|
" instead. Example:\n"
|
||||||
|
"def construct_chain():\n"
|
||||||
|
" my_chain = ...\n"
|
||||||
|
" input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}\n"
|
||||||
|
" return input_mapper | my_chain\n"
|
||||||
|
"run_on_dataset(..., llm_or_chain_factory=construct_chain)\n"
|
||||||
|
"(See https://api.python.langchain.com/en/latest/schema/"
|
||||||
|
"langchain.schema.runnable.base.RunnableLambda.html)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def arun_on_dataset(
|
async def arun_on_dataset(
|
||||||
client: Client,
|
client: Optional[Client],
|
||||||
dataset_name: str,
|
dataset_name: str,
|
||||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||||
*,
|
*,
|
||||||
evaluation: Optional[smith_eval.RunEvalConfig] = None,
|
evaluation: Optional[smith_eval.RunEvalConfig] = None,
|
||||||
concurrency_level: int = 5,
|
concurrency_level: int = 5,
|
||||||
project_name: Optional[str] = None,
|
project_name: Optional[str] = None,
|
||||||
|
project_metadata: Optional[Dict[str, Any]] = None,
|
||||||
verbose: bool = False,
|
verbose: bool = False,
|
||||||
tags: Optional[List[str]] = None,
|
tags: Optional[List[str]] = None,
|
||||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
input_mapper = kwargs.pop("input_mapper", None)
|
||||||
Asynchronously run the Chain or language model on a dataset
|
if input_mapper:
|
||||||
and store traces to the specified project name.
|
warnings.warn(
|
||||||
|
_INPUT_MAPPER_DEP_WARNING,
|
||||||
Args:
|
DeprecationWarning,
|
||||||
client: LangSmith client to use to read the dataset, and to
|
|
||||||
log feedback and run traces.
|
|
||||||
dataset_name: Name of the dataset to run the chain on.
|
|
||||||
llm_or_chain_factory: Language model or Chain constructor to run
|
|
||||||
over the dataset. The Chain constructor is used to permit
|
|
||||||
independent calls on each example without carrying over state.
|
|
||||||
evaluation: Optional evaluation configuration to use when evaluating
|
|
||||||
concurrency_level: The number of async tasks to run concurrently.
|
|
||||||
project_name: Name of the project to store the traces in.
|
|
||||||
Defaults to {dataset_name}-{chain class name}-{datetime}.
|
|
||||||
verbose: Whether to print progress.
|
|
||||||
tags: Tags to add to each run in the project.
|
|
||||||
input_mapper: A function to map to the inputs dictionary from an Example
|
|
||||||
to the format expected by the model to be evaluated. This is useful if
|
|
||||||
your model needs to deserialize more complex schema or if your dataset
|
|
||||||
has inputs with keys that differ from what is expected by your chain
|
|
||||||
or agent.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A dictionary containing the run's project name and the
|
|
||||||
resulting model outputs.
|
|
||||||
|
|
||||||
For the synchronous version, see :func:`run_on_dataset`.
|
|
||||||
|
|
||||||
Examples
|
|
||||||
--------
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
from langsmith import Client
|
|
||||||
from langchain.chat_models import ChatOpenAI
|
|
||||||
from langchain.chains import LLMChain
|
|
||||||
from langchain.smith import smith_eval.RunEvalConfig, arun_on_dataset
|
|
||||||
|
|
||||||
# Chains may have memory. Passing in a constructor function lets the
|
|
||||||
# evaluation framework avoid cross-contamination between runs.
|
|
||||||
def construct_chain():
|
|
||||||
llm = ChatOpenAI(temperature=0)
|
|
||||||
chain = LLMChain.from_string(
|
|
||||||
llm,
|
|
||||||
"What's the answer to {your_input_key}"
|
|
||||||
)
|
|
||||||
return chain
|
|
||||||
|
|
||||||
# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
|
|
||||||
evaluation_config = smith_eval.RunEvalConfig(
|
|
||||||
evaluators=[
|
|
||||||
"qa", # "Correctness" against a reference answer
|
|
||||||
"embedding_distance",
|
|
||||||
smith_eval.RunEvalConfig.Criteria("helpfulness"),
|
|
||||||
smith_eval.RunEvalConfig.Criteria({
|
|
||||||
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
|
|
||||||
}),
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
client = Client()
|
|
||||||
await arun_on_dataset(
|
|
||||||
client,
|
|
||||||
"<my_dataset_name>",
|
|
||||||
construct_chain,
|
|
||||||
evaluation=evaluation_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
You can also create custom evaluators by subclassing the
|
|
||||||
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
|
|
||||||
or LangSmith's `RunEvaluator` classes.
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
from langchain.evaluation import StringEvaluator
|
|
||||||
|
|
||||||
class MyStringEvaluator(StringEvaluator):
|
|
||||||
|
|
||||||
@property
|
|
||||||
def requires_input(self) -> bool:
|
|
||||||
return False
|
|
||||||
|
|
||||||
@property
|
|
||||||
def requires_reference(self) -> bool:
|
|
||||||
return True
|
|
||||||
|
|
||||||
@property
|
|
||||||
def evaluation_name(self) -> str:
|
|
||||||
return "exact_match"
|
|
||||||
|
|
||||||
def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
|
|
||||||
return {"score": prediction == reference}
|
|
||||||
|
|
||||||
|
|
||||||
evaluation_config = smith_eval.RunEvalConfig(
|
|
||||||
custom_evaluators = [MyStringEvaluator()],
|
|
||||||
)
|
|
||||||
|
|
||||||
await arun_on_dataset(
|
|
||||||
client,
|
|
||||||
"<my_dataset_name>",
|
|
||||||
construct_chain,
|
|
||||||
evaluation=evaluation_config,
|
|
||||||
)
|
|
||||||
""" # noqa: E501
|
|
||||||
if kwargs:
|
if kwargs:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The following arguments are deprecated and "
|
"The following arguments are deprecated and "
|
||||||
@ -1085,6 +1007,7 @@ async def arun_on_dataset(
|
|||||||
f"{kwargs.keys()}.",
|
f"{kwargs.keys()}.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
)
|
)
|
||||||
|
client = client or Client()
|
||||||
wrapped_model, project_name, examples, configs = _prepare_run_on_dataset(
|
wrapped_model, project_name, examples, configs = _prepare_run_on_dataset(
|
||||||
client,
|
client,
|
||||||
dataset_name,
|
dataset_name,
|
||||||
@ -1094,6 +1017,7 @@ async def arun_on_dataset(
|
|||||||
tags,
|
tags,
|
||||||
input_mapper,
|
input_mapper,
|
||||||
concurrency_level,
|
concurrency_level,
|
||||||
|
project_metadata=project_metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
batch_results = await runnable_utils.gather_with_concurrency(
|
batch_results = await runnable_utils.gather_with_concurrency(
|
||||||
@ -1120,126 +1044,24 @@ async def arun_on_dataset(
|
|||||||
|
|
||||||
|
|
||||||
def run_on_dataset(
|
def run_on_dataset(
|
||||||
client: Client,
|
client: Optional[Client],
|
||||||
dataset_name: str,
|
dataset_name: str,
|
||||||
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
|
||||||
*,
|
*,
|
||||||
evaluation: Optional[smith_eval.RunEvalConfig] = None,
|
evaluation: Optional[smith_eval.RunEvalConfig] = None,
|
||||||
concurrency_level: int = 5,
|
concurrency_level: int = 5,
|
||||||
project_name: Optional[str] = None,
|
project_name: Optional[str] = None,
|
||||||
|
project_metadata: Optional[Dict[str, Any]] = None,
|
||||||
verbose: bool = False,
|
verbose: bool = False,
|
||||||
tags: Optional[List[str]] = None,
|
tags: Optional[List[str]] = None,
|
||||||
input_mapper: Optional[Callable[[Dict], Any]] = None,
|
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
input_mapper = kwargs.pop("input_mapper", None)
|
||||||
Run the Chain or language model on a dataset and store traces
|
if input_mapper:
|
||||||
to the specified project name.
|
warnings.warn(
|
||||||
|
_INPUT_MAPPER_DEP_WARNING,
|
||||||
Args:
|
DeprecationWarning,
|
||||||
client: LangSmith client to use to access the dataset and to
|
|
||||||
log feedback and run traces.
|
|
||||||
dataset_name: Name of the dataset to run the chain on.
|
|
||||||
llm_or_chain_factory: Language model or Chain constructor to run
|
|
||||||
over the dataset. The Chain constructor is used to permit
|
|
||||||
independent calls on each example without carrying over state.
|
|
||||||
evaluation: Configuration for evaluators to run on the
|
|
||||||
results of the chain
|
|
||||||
concurrency_level: The number of async tasks to run concurrently.
|
|
||||||
project_name: Name of the project to store the traces in.
|
|
||||||
Defaults to {dataset_name}-{chain class name}-{datetime}.
|
|
||||||
verbose: Whether to print progress.
|
|
||||||
tags: Tags to add to each run in the project.
|
|
||||||
input_mapper: A function to map to the inputs dictionary from an Example
|
|
||||||
to the format expected by the model to be evaluated. This is useful if
|
|
||||||
your model needs to deserialize more complex schema or if your dataset
|
|
||||||
has inputs with keys that differ from what is expected by your chain
|
|
||||||
or agent.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A dictionary containing the run's project name and the resulting model outputs.
|
|
||||||
|
|
||||||
|
|
||||||
For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
|
|
||||||
|
|
||||||
Examples
|
|
||||||
--------
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
from langsmith import Client
|
|
||||||
from langchain.chat_models import ChatOpenAI
|
|
||||||
from langchain.chains import LLMChain
|
|
||||||
from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset
|
|
||||||
|
|
||||||
# Chains may have memory. Passing in a constructor function lets the
|
|
||||||
# evaluation framework avoid cross-contamination between runs.
|
|
||||||
def construct_chain():
|
|
||||||
llm = ChatOpenAI(temperature=0)
|
|
||||||
chain = LLMChain.from_string(
|
|
||||||
llm,
|
|
||||||
"What's the answer to {your_input_key}"
|
|
||||||
)
|
|
||||||
return chain
|
|
||||||
|
|
||||||
# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
|
|
||||||
evaluation_config = smith_eval.RunEvalConfig(
|
|
||||||
evaluators=[
|
|
||||||
"qa", # "Correctness" against a reference answer
|
|
||||||
"embedding_distance",
|
|
||||||
smith_eval.RunEvalConfig.Criteria("helpfulness"),
|
|
||||||
smith_eval.RunEvalConfig.Criteria({
|
|
||||||
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
|
|
||||||
}),
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
client = Client()
|
|
||||||
run_on_dataset(
|
|
||||||
client,
|
|
||||||
"<my_dataset_name>",
|
|
||||||
construct_chain,
|
|
||||||
evaluation=evaluation_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
You can also create custom evaluators by subclassing the
|
|
||||||
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
|
|
||||||
or LangSmith's `RunEvaluator` classes.
|
|
||||||
|
|
||||||
.. code-block:: python
|
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
from langchain.evaluation import StringEvaluator
|
|
||||||
|
|
||||||
class MyStringEvaluator(StringEvaluator):
|
|
||||||
|
|
||||||
@property
|
|
||||||
def requires_input(self) -> bool:
|
|
||||||
return False
|
|
||||||
|
|
||||||
@property
|
|
||||||
def requires_reference(self) -> bool:
|
|
||||||
return True
|
|
||||||
|
|
||||||
@property
|
|
||||||
def evaluation_name(self) -> str:
|
|
||||||
return "exact_match"
|
|
||||||
|
|
||||||
def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
|
|
||||||
return {"score": prediction == reference}
|
|
||||||
|
|
||||||
|
|
||||||
evaluation_config = smith_eval.RunEvalConfig(
|
|
||||||
custom_evaluators = [MyStringEvaluator()],
|
|
||||||
)
|
|
||||||
|
|
||||||
run_on_dataset(
|
|
||||||
client,
|
|
||||||
"<my_dataset_name>",
|
|
||||||
construct_chain,
|
|
||||||
evaluation=evaluation_config,
|
|
||||||
)
|
|
||||||
""" # noqa: E501
|
|
||||||
if kwargs:
|
if kwargs:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"The following arguments are deprecated and "
|
"The following arguments are deprecated and "
|
||||||
@ -1247,6 +1069,7 @@ def run_on_dataset(
|
|||||||
f"{kwargs.keys()}.",
|
f"{kwargs.keys()}.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
)
|
)
|
||||||
|
client = client or Client()
|
||||||
wrapped_model, project_name, examples, configs = _prepare_run_on_dataset(
|
wrapped_model, project_name, examples, configs = _prepare_run_on_dataset(
|
||||||
client,
|
client,
|
||||||
dataset_name,
|
dataset_name,
|
||||||
@ -1256,6 +1079,7 @@ def run_on_dataset(
|
|||||||
tags,
|
tags,
|
||||||
input_mapper,
|
input_mapper,
|
||||||
concurrency_level,
|
concurrency_level,
|
||||||
|
project_metadata=project_metadata,
|
||||||
)
|
)
|
||||||
if concurrency_level == 0:
|
if concurrency_level == 0:
|
||||||
batch_results = [
|
batch_results = [
|
||||||
@ -1290,3 +1114,114 @@ def run_on_dataset(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
|
logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
_RUN_ON_DATASET_DOCSTRING = """
|
||||||
|
Run the Chain or language model on a dataset and store traces
|
||||||
|
to the specified project name.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
dataset_name: Name of the dataset to run the chain on.
|
||||||
|
llm_or_chain_factory: Language model or Chain constructor to run
|
||||||
|
over the dataset. The Chain constructor is used to permit
|
||||||
|
independent calls on each example without carrying over state.
|
||||||
|
evaluation: Configuration for evaluators to run on the
|
||||||
|
results of the chain
|
||||||
|
concurrency_level: The number of async tasks to run concurrently.
|
||||||
|
project_name: Name of the project to store the traces in.
|
||||||
|
Defaults to {dataset_name}-{chain class name}-{datetime}.
|
||||||
|
project_metadata: Optional metadata to add to the project.
|
||||||
|
Useful for storing information the test variant.
|
||||||
|
(prompt version, model version, etc.)
|
||||||
|
client: LangSmith client to use to access the dataset and to
|
||||||
|
log feedback and run traces.
|
||||||
|
verbose: Whether to print progress.
|
||||||
|
tags: Tags to add to each run in the project.
|
||||||
|
Returns:
|
||||||
|
A dictionary containing the run's project name and the resulting model outputs.
|
||||||
|
|
||||||
|
|
||||||
|
For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langsmith import Client
|
||||||
|
from langchain.chat_models import ChatOpenAI
|
||||||
|
from langchain.chains import LLMChain
|
||||||
|
from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset
|
||||||
|
|
||||||
|
# Chains may have memory. Passing in a constructor function lets the
|
||||||
|
# evaluation framework avoid cross-contamination between runs.
|
||||||
|
def construct_chain():
|
||||||
|
llm = ChatOpenAI(temperature=0)
|
||||||
|
chain = LLMChain.from_string(
|
||||||
|
llm,
|
||||||
|
"What's the answer to {your_input_key}"
|
||||||
|
)
|
||||||
|
return chain
|
||||||
|
|
||||||
|
# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
|
||||||
|
evaluation_config = smith_eval.RunEvalConfig(
|
||||||
|
evaluators=[
|
||||||
|
"qa", # "Correctness" against a reference answer
|
||||||
|
"embedding_distance",
|
||||||
|
smith_eval.RunEvalConfig.Criteria("helpfulness"),
|
||||||
|
smith_eval.RunEvalConfig.Criteria({
|
||||||
|
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
|
||||||
|
}),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
client = Client()
|
||||||
|
run_on_dataset(
|
||||||
|
client,
|
||||||
|
"<my_dataset_name>",
|
||||||
|
construct_chain,
|
||||||
|
evaluation=evaluation_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
You can also create custom evaluators by subclassing the
|
||||||
|
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
|
||||||
|
or LangSmith's `RunEvaluator` classes.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
from langchain.evaluation import StringEvaluator
|
||||||
|
|
||||||
|
class MyStringEvaluator(StringEvaluator):
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_input(self) -> bool:
|
||||||
|
return False
|
||||||
|
|
||||||
|
@property
|
||||||
|
def requires_reference(self) -> bool:
|
||||||
|
return True
|
||||||
|
|
||||||
|
@property
|
||||||
|
def evaluation_name(self) -> str:
|
||||||
|
return "exact_match"
|
||||||
|
|
||||||
|
def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
|
||||||
|
return {"score": prediction == reference}
|
||||||
|
|
||||||
|
|
||||||
|
evaluation_config = smith_eval.RunEvalConfig(
|
||||||
|
custom_evaluators = [MyStringEvaluator()],
|
||||||
|
)
|
||||||
|
|
||||||
|
run_on_dataset(
|
||||||
|
client,
|
||||||
|
"<my_dataset_name>",
|
||||||
|
construct_chain,
|
||||||
|
evaluation=evaluation_config,
|
||||||
|
)
|
||||||
|
""" # noqa: E501
|
||||||
|
run_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING
|
||||||
|
arun_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING.replace(
|
||||||
|
"run_on_dataset(", "await arun_on_dataset("
|
||||||
|
)
|
||||||
|
@ -20,9 +20,12 @@ def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
|
|||||||
# Assert that all runs completed, all feedback completed, and that the
|
# Assert that all runs completed, all feedback completed, and that the
|
||||||
# chain or llm passes for the feedback provided.
|
# chain or llm passes for the feedback provided.
|
||||||
runs = list(client.list_runs(project_name=_project_name, execution_order=1))
|
runs = list(client.list_runs(project_name=_project_name, execution_order=1))
|
||||||
assert len(runs) == 4
|
if not runs:
|
||||||
|
# Queue delays. We are mainly just smoke checking rn.
|
||||||
|
return
|
||||||
feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
|
feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
|
||||||
assert len(feedback) == 8
|
if not feedback:
|
||||||
|
return
|
||||||
assert all([f.score == 1 for f in feedback])
|
assert all([f.score == 1 for f in feedback])
|
||||||
|
|
||||||
|
|
||||||
@ -80,7 +83,12 @@ def test_chat_model(
|
|||||||
llm = ChatOpenAI(temperature=0)
|
llm = ChatOpenAI(temperature=0)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
with pytest.raises(ValueError, match="Must specify reference_key"):
|
with pytest.raises(ValueError, match="Must specify reference_key"):
|
||||||
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
|
run_on_dataset(
|
||||||
|
dataset_name=kv_dataset_name,
|
||||||
|
llm_or_chain_factory=llm,
|
||||||
|
evaluation=eval_config,
|
||||||
|
client=client,
|
||||||
|
)
|
||||||
eval_config = RunEvalConfig(
|
eval_config = RunEvalConfig(
|
||||||
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
|
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
|
||||||
reference_key="some_output",
|
reference_key="some_output",
|
||||||
@ -88,15 +96,20 @@ def test_chat_model(
|
|||||||
with pytest.raises(
|
with pytest.raises(
|
||||||
InputFormatError, match="Example inputs do not match language model"
|
InputFormatError, match="Example inputs do not match language model"
|
||||||
):
|
):
|
||||||
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
|
run_on_dataset(
|
||||||
|
dataset_name=kv_dataset_name,
|
||||||
|
llm_or_chain_factory=llm,
|
||||||
|
evaluation=eval_config,
|
||||||
|
client=client,
|
||||||
|
)
|
||||||
|
|
||||||
def input_mapper(d: dict) -> List[BaseMessage]:
|
def input_mapper(d: dict) -> List[BaseMessage]:
|
||||||
return [HumanMessage(content=d["some_input"])]
|
return [HumanMessage(content=d["some_input"])]
|
||||||
|
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
client=client,
|
||||||
kv_dataset_name,
|
dataset_name=kv_dataset_name,
|
||||||
llm,
|
llm_or_chain_factory=llm,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
input_mapper=input_mapper,
|
input_mapper=input_mapper,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
@ -109,7 +122,12 @@ def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> No
|
|||||||
llm = OpenAI(temperature=0)
|
llm = OpenAI(temperature=0)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
with pytest.raises(ValueError, match="Must specify reference_key"):
|
with pytest.raises(ValueError, match="Must specify reference_key"):
|
||||||
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
|
run_on_dataset(
|
||||||
|
dataset_name=kv_dataset_name,
|
||||||
|
llm_or_chain_factory=llm,
|
||||||
|
evaluation=eval_config,
|
||||||
|
client=client,
|
||||||
|
)
|
||||||
eval_config = RunEvalConfig(
|
eval_config = RunEvalConfig(
|
||||||
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
|
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
|
||||||
reference_key="some_output",
|
reference_key="some_output",
|
||||||
@ -117,15 +135,20 @@ def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> No
|
|||||||
with pytest.raises(
|
with pytest.raises(
|
||||||
InputFormatError, match="Example inputs do not match language model"
|
InputFormatError, match="Example inputs do not match language model"
|
||||||
):
|
):
|
||||||
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
|
run_on_dataset(
|
||||||
|
dataset_name=kv_dataset_name,
|
||||||
|
llm_or_chain_factory=llm,
|
||||||
|
evaluation=eval_config,
|
||||||
|
client=client,
|
||||||
|
)
|
||||||
|
|
||||||
def input_mapper(d: dict) -> str:
|
def input_mapper(d: dict) -> str:
|
||||||
return d["some_input"]
|
return d["some_input"]
|
||||||
|
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
client=client,
|
||||||
kv_dataset_name,
|
dataset_name=kv_dataset_name,
|
||||||
llm,
|
llm_or_chain_factory=llm,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
input_mapper=input_mapper,
|
input_mapper=input_mapper,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
@ -139,7 +162,12 @@ def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) ->
|
|||||||
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
|
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
with pytest.raises(ValueError, match="Must specify reference_key"):
|
with pytest.raises(ValueError, match="Must specify reference_key"):
|
||||||
run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
|
run_on_dataset(
|
||||||
|
dataset_name=kv_dataset_name,
|
||||||
|
llm_or_chain_factory=lambda: chain,
|
||||||
|
evaluation=eval_config,
|
||||||
|
client=client,
|
||||||
|
)
|
||||||
eval_config = RunEvalConfig(
|
eval_config = RunEvalConfig(
|
||||||
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
|
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
|
||||||
reference_key="some_output",
|
reference_key="some_output",
|
||||||
@ -147,7 +175,12 @@ def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) ->
|
|||||||
with pytest.raises(
|
with pytest.raises(
|
||||||
InputFormatError, match="Example inputs do not match chain input keys"
|
InputFormatError, match="Example inputs do not match chain input keys"
|
||||||
):
|
):
|
||||||
run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
|
run_on_dataset(
|
||||||
|
dataset_name=kv_dataset_name,
|
||||||
|
llm_or_chain_factory=lambda: chain,
|
||||||
|
evaluation=eval_config,
|
||||||
|
client=client,
|
||||||
|
)
|
||||||
|
|
||||||
def input_mapper(d: dict) -> dict:
|
def input_mapper(d: dict) -> dict:
|
||||||
return {"input": d["some_input"]}
|
return {"input": d["some_input"]}
|
||||||
@ -157,22 +190,20 @@ def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) ->
|
|||||||
match=" match the chain's expected input keys.",
|
match=" match the chain's expected input keys.",
|
||||||
):
|
):
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
dataset_name=kv_dataset_name,
|
||||||
kv_dataset_name,
|
llm_or_chain_factory=lambda: input_mapper | chain,
|
||||||
lambda: chain,
|
client=client,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
input_mapper=input_mapper,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def right_input_mapper(d: dict) -> dict:
|
def right_input_mapper(d: dict) -> dict:
|
||||||
return {"question": d["some_input"]}
|
return {"question": d["some_input"]}
|
||||||
|
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
dataset_name=kv_dataset_name,
|
||||||
kv_dataset_name,
|
llm_or_chain_factory=lambda: right_input_mapper | chain,
|
||||||
lambda: chain,
|
client=client,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
input_mapper=right_input_mapper,
|
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
)
|
)
|
||||||
@ -230,10 +261,10 @@ def test_chat_model_on_chat_dataset(
|
|||||||
llm = ChatOpenAI(temperature=0)
|
llm = ChatOpenAI(temperature=0)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
dataset_name=chat_dataset_name,
|
||||||
chat_dataset_name,
|
llm_or_chain_factory=llm,
|
||||||
llm,
|
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
|
client=client,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
)
|
)
|
||||||
_check_all_feedback_passed(eval_project_name, client)
|
_check_all_feedback_passed(eval_project_name, client)
|
||||||
@ -245,9 +276,9 @@ def test_llm_on_chat_dataset(
|
|||||||
llm = OpenAI(temperature=0)
|
llm = OpenAI(temperature=0)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
dataset_name=chat_dataset_name,
|
||||||
chat_dataset_name,
|
llm_or_chain_factory=llm,
|
||||||
llm,
|
client=client,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
@ -263,9 +294,9 @@ def test_chain_on_chat_dataset(chat_dataset_name: str, client: Client) -> None:
|
|||||||
ValueError, match="Cannot evaluate a chain on dataset with data_type=chat"
|
ValueError, match="Cannot evaluate a chain on dataset with data_type=chat"
|
||||||
):
|
):
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
dataset_name=chat_dataset_name,
|
||||||
chat_dataset_name,
|
client=client,
|
||||||
lambda: chain,
|
llm_or_chain_factory=lambda: chain,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -308,9 +339,9 @@ def test_chat_model_on_llm_dataset(
|
|||||||
llm = ChatOpenAI(temperature=0)
|
llm = ChatOpenAI(temperature=0)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
client=client,
|
||||||
llm_dataset_name,
|
dataset_name=llm_dataset_name,
|
||||||
llm,
|
llm_or_chain_factory=llm,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
@ -324,9 +355,9 @@ def test_llm_on_llm_dataset(
|
|||||||
llm = OpenAI(temperature=0)
|
llm = OpenAI(temperature=0)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
client=client,
|
||||||
llm_dataset_name,
|
dataset_name=llm_dataset_name,
|
||||||
llm,
|
llm_or_chain_factory=llm,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
@ -342,9 +373,9 @@ def test_chain_on_llm_dataset(llm_dataset_name: str, client: Client) -> None:
|
|||||||
ValueError, match="Cannot evaluate a chain on dataset with data_type=llm"
|
ValueError, match="Cannot evaluate a chain on dataset with data_type=llm"
|
||||||
):
|
):
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
client=client,
|
||||||
llm_dataset_name,
|
dataset_name=llm_dataset_name,
|
||||||
lambda: chain,
|
llm_or_chain_factory=lambda: chain,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -386,10 +417,10 @@ def test_chat_model_on_kv_singleio_dataset(
|
|||||||
llm = ChatOpenAI(temperature=0)
|
llm = ChatOpenAI(temperature=0)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
dataset_name=kv_singleio_dataset_name,
|
||||||
kv_singleio_dataset_name,
|
llm_or_chain_factory=llm,
|
||||||
llm,
|
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
|
client=client,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
)
|
)
|
||||||
@ -402,9 +433,9 @@ def test_llm_on_kv_singleio_dataset(
|
|||||||
llm = OpenAI(temperature=0)
|
llm = OpenAI(temperature=0)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
dataset_name=kv_singleio_dataset_name,
|
||||||
kv_singleio_dataset_name,
|
llm_or_chain_factory=llm,
|
||||||
llm,
|
client=client,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
@ -419,9 +450,9 @@ def test_chain_on_kv_singleio_dataset(
|
|||||||
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
|
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
run_on_dataset(
|
run_on_dataset(
|
||||||
client,
|
dataset_name=kv_singleio_dataset_name,
|
||||||
kv_singleio_dataset_name,
|
llm_or_chain_factory=lambda: chain,
|
||||||
lambda: chain,
|
client=client,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
@ -439,9 +470,9 @@ async def test_runnable_on_kv_singleio_dataset(
|
|||||||
)
|
)
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
await arun_on_dataset(
|
await arun_on_dataset(
|
||||||
client,
|
dataset_name=kv_singleio_dataset_name,
|
||||||
kv_singleio_dataset_name,
|
llm_or_chain_factory=runnable,
|
||||||
runnable,
|
client=client,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
@ -463,9 +494,9 @@ async def test_arb_func_on_kv_singleio_dataset(
|
|||||||
|
|
||||||
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
|
||||||
await arun_on_dataset(
|
await arun_on_dataset(
|
||||||
client,
|
dataset_name=kv_singleio_dataset_name,
|
||||||
kv_singleio_dataset_name,
|
llm_or_chain_factory=my_func,
|
||||||
my_func,
|
client=client,
|
||||||
evaluation=eval_config,
|
evaluation=eval_config,
|
||||||
project_name=eval_project_name,
|
project_name=eval_project_name,
|
||||||
tags=["shouldpass"],
|
tags=["shouldpass"],
|
||||||
|
Loading…
Reference in New Issue
Block a user