From c58d35765daa063b87733e006a4d65e2e1f388d3 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Sun, 16 Jul 2023 12:05:56 -0700 Subject: [PATCH] Add examples to docstrings (#7796) and: - remove dataset name from autogenerated project name - print out project name to view --- .../guides/evaluation/comparisons.ipynb | 2 +- langchain/smith/evaluation/runner_utils.py | 229 ++++++++++++++++-- .../smith/evaluation/test_runner_utils.py | 6 +- tests/unit_tests/smith/test_runner_utils.py | 6 +- 4 files changed, 221 insertions(+), 22 deletions(-) diff --git a/docs/extras/guides/evaluation/comparisons.ipynb b/docs/extras/guides/evaluation/comparisons.ipynb index 300623bb31b..0e544824a94 100644 --- a/docs/extras/guides/evaluation/comparisons.ipynb +++ b/docs/extras/guides/evaluation/comparisons.ipynb @@ -10,7 +10,7 @@ "\n", "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`[[1]](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n", "\n", - "For this evalution, we will need 3 things:\n", + "For this evaluation, we will need 3 things:\n", "1. An evaluator\n", "2. A dataset of inputs\n", "3. 2 (or more) LLMs, Chains, or Agents to compare\n", diff --git a/langchain/smith/evaluation/runner_utils.py b/langchain/smith/evaluation/runner_utils.py index 4c7645ead6b..cc32e514c63 100644 --- a/langchain/smith/evaluation/runner_utils.py +++ b/langchain/smith/evaluation/runner_utils.py @@ -19,9 +19,10 @@ from typing import ( Tuple, Union, ) +from urllib.parse import urlparse, urlunparse from langsmith import Client, RunEvaluator -from langsmith.schemas import DataType, Example, RunTypeEnum +from langsmith.schemas import Dataset, DataType, Example, RunTypeEnum from langchain.callbacks.base import BaseCallbackHandler from langchain.callbacks.manager import Callbacks @@ -50,6 +51,19 @@ class InputFormatError(Exception): ## Shared Utilities +def _get_eval_project_url(api_url: str, project_id: str) -> str: + """Get the project url from the api url.""" + parsed = urlparse(api_url) + hostname = parsed.hostname or "" + if "api." in hostname: + hostname = hostname.replace("api.", "", 1) + if "localhost" in hostname: + # Remove the port + hostname = "localhost" + url = urlunparse(parsed._replace(netloc=hostname)) + return f"{url}/projects/p/{project_id}?eval=true" + + def _wrap_in_chain_factory( llm_or_chain_factory: Union[Chain, MODEL_OR_CHAIN_FACTORY], dataset_name: str = "", @@ -206,7 +220,6 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]: def _get_project_name( project_name: Optional[str], llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, - dataset_name: Optional[str], ) -> str: """ Get the project name. @@ -214,7 +227,6 @@ def _get_project_name( Args: project_name: The project name if manually specified. llm_or_chain_factory: The Chain or language model constructor. - dataset_name: The dataset name. Returns: The project name. @@ -226,8 +238,7 @@ def _get_project_name( model_name = llm_or_chain_factory.__class__.__name__ else: model_name = llm_or_chain_factory().__class__.__name__ - dataset_prefix = f"{dataset_name}-" if dataset_name else "" - return f"{dataset_prefix}{model_name}-{current_time}" + return f"{current_time}-{model_name}" ## Shared Validation Utilities @@ -801,7 +812,7 @@ async def _arun_on_examples( A dictionary mapping example ids to the model outputs. 
""" llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory) - project_name = _get_project_name(project_name, llm_or_chain_factory, None) + project_name = _get_project_name(project_name, llm_or_chain_factory) run_evaluators, examples = _setup_evaluation( llm_or_chain_factory, examples, evaluation, data_type ) @@ -1033,7 +1044,7 @@ def _run_on_examples( """ results: Dict[str, Any] = {} llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory) - project_name = _get_project_name(project_name, llm_or_chain_factory, None) + project_name = _get_project_name(project_name, llm_or_chain_factory) tracer = LangChainTracer( project_name=project_name, client=client, use_threading=False ) @@ -1068,6 +1079,31 @@ def _run_on_examples( ## Public API +def _prepare_eval_run( + client: Client, + dataset_name: str, + llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, + project_name: Optional[str], +) -> Tuple[MODEL_OR_CHAIN_FACTORY, str, Dataset, Iterator[Example]]: + llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name) + project_name = _get_project_name(project_name, llm_or_chain_factory) + try: + project = client.create_project(project_name) + except ValueError as e: + if "already exists " not in str(e): + raise e + raise ValueError( + f"Project {project_name} already exists. Please use a different name." + ) + project_url = _get_eval_project_url(client.api_url, project.id) + print( + f"View the evaluation results for project '{project_name}' at:\n{project_url}" + ) + dataset = client.read_dataset(dataset_name=dataset_name) + examples = client.list_examples(dataset_id=str(dataset.id)) + return llm_or_chain_factory, project_name, dataset, examples + + async def arun_on_dataset( client: Client, dataset_name: str, @@ -1110,11 +1146,90 @@ async def arun_on_dataset( Returns: A dictionary containing the run's project name and the resulting model outputs. - """ - llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name) - project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name) - dataset = client.read_dataset(dataset_name=dataset_name) - examples = client.list_examples(dataset_id=str(dataset.id)) + + For the synchronous version, see :func:`run_on_dataset`. + + Examples + -------- + + .. code-block:: python + + from langsmith import Client + from langchain.chat_models import ChatOpenAI + from langchain.chains import LLMChain + from langchain.smith import RunEvalConfig, arun_on_dataset + + # Chains may have memory. Passing in a constructor function lets the + # evaluation framework avoid cross-contamination between runs. + def construct_chain(): + llm = ChatOpenAI(temperature=0) + chain = LLMChain.from_string( + llm, + "What's the answer to {your_input_key}" + ) + return chain + + # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum) + evaluation_config = RunEvalConfig( + evaluators=[ + "qa", # "Correctness" against a reference answer + "embedding_distance", + RunEvalConfig.Criteria("helpfulness"), + RunEvalConfig.Criteria({ + "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?" + }), + ] + ) + + client = Client() + await arun_on_dataset( + client, + "", + construct_chain, + evaluation=evaluation_config, + ) + + You can also create custom evaluators by subclassing the + :class:`StringEvaluator ` + or LangSmith's `RunEvaluator` classes. + + .. 
code-block:: python + + from typing import Optional + from langchain.evaluation import StringEvaluator + + class MyStringEvaluator(StringEvaluator): + + @property + def requires_input(self) -> bool: + return False + + @property + def requires_reference(self) -> bool: + return True + + @property + def evaluation_name(self) -> str: + return "exact_match" + + def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict: + return {"score": prediction == reference} + + + evaluation_config = RunEvalConfig( + custom_evaluators = [MyStringEvaluator()], + ) + + await arun_on_dataset( + client, + "", + construct_chain, + evaluation=evaluation_config, + ) + """ # noqa: E501 + llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run( + client, dataset_name, llm_or_chain_factory, project_name + ) results = await _arun_on_examples( client, examples, @@ -1174,11 +1289,91 @@ def run_on_dataset( Returns: A dictionary containing the run's project name and the resulting model outputs. - """ - llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name) - project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name) - dataset = client.read_dataset(dataset_name=dataset_name) - examples = client.list_examples(dataset_id=str(dataset.id)) + + + For the (usually faster) async version of this function, see :func:`arun_on_dataset`. + + Examples + -------- + + .. code-block:: python + + from langsmith import Client + from langchain.chat_models import ChatOpenAI + from langchain.chains import LLMChain + from langchain.smith import RunEvalConfig, run_on_dataset + + # Chains may have memory. Passing in a constructor function lets the + # evaluation framework avoid cross-contamination between runs. + def construct_chain(): + llm = ChatOpenAI(temperature=0) + chain = LLMChain.from_string( + llm, + "What's the answer to {your_input_key}" + ) + return chain + + # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum) + evaluation_config = RunEvalConfig( + evaluators=[ + "qa", # "Correctness" against a reference answer + "embedding_distance", + RunEvalConfig.Criteria("helpfulness"), + RunEvalConfig.Criteria({ + "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?" + }), + ] + ) + + client = Client() + run_on_dataset( + client, + "", + construct_chain, + evaluation=evaluation_config, + ) + + You can also create custom evaluators by subclassing the + :class:`StringEvaluator ` + or LangSmith's `RunEvaluator` classes. + + .. 
code-block:: python + + from typing import Optional + from langchain.evaluation import StringEvaluator + + class MyStringEvaluator(StringEvaluator): + + @property + def requires_input(self) -> bool: + return False + + @property + def requires_reference(self) -> bool: + return True + + @property + def evaluation_name(self) -> str: + return "exact_match" + + def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict: + return {"score": prediction == reference} + + + evaluation_config = RunEvalConfig( + custom_evaluators = [MyStringEvaluator()], + ) + + run_on_dataset( + client, + "", + construct_chain, + evaluation=evaluation_config, + ) + """ # noqa: E501 + llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run( + client, dataset_name, llm_or_chain_factory, project_name + ) results = _run_on_examples( client, examples, diff --git a/tests/unit_tests/smith/evaluation/test_runner_utils.py b/tests/unit_tests/smith/evaluation/test_runner_utils.py index f93909bb40e..3bf9a1d32c8 100644 --- a/tests/unit_tests/smith/evaluation/test_runner_utils.py +++ b/tests/unit_tests/smith/evaluation/test_runner_utils.py @@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None: {"result": f"Result for example {example.id}"} for _ in range(n_repetitions) ] - def mock_create_project(*args: Any, **kwargs: Any) -> None: - pass + def mock_create_project(*args: Any, **kwargs: Any) -> Any: + proj = mock.MagicMock() + proj.id = "123" + return proj with mock.patch.object( Client, "read_dataset", new=mock_read_dataset diff --git a/tests/unit_tests/smith/test_runner_utils.py b/tests/unit_tests/smith/test_runner_utils.py index f93909bb40e..3bf9a1d32c8 100644 --- a/tests/unit_tests/smith/test_runner_utils.py +++ b/tests/unit_tests/smith/test_runner_utils.py @@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None: {"result": f"Result for example {example.id}"} for _ in range(n_repetitions) ] - def mock_create_project(*args: Any, **kwargs: Any) -> None: - pass + def mock_create_project(*args: Any, **kwargs: Any) -> Any: + proj = mock.MagicMock() + proj.id = "123" + return proj with mock.patch.object( Client, "read_dataset", new=mock_read_dataset
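
Note: the snippet below is not part of the patch; it is a standalone sketch of the URL rewrite performed by the new `_get_eval_project_url` helper, handy for checking the printed link by hand. The example hosts and project id are illustrative only.

.. code-block:: python

    # Sketch of the rewrite in _get_eval_project_url: strip a leading "api."
    # from the hostname (and drop the port for localhost), then append the
    # project path. Hosts and ids here are illustrative, not from the patch.
    from urllib.parse import urlparse, urlunparse

    def eval_project_url(api_url: str, project_id: str) -> str:
        parsed = urlparse(api_url)
        hostname = parsed.hostname or ""
        if "api." in hostname:
            hostname = hostname.replace("api.", "", 1)
        if "localhost" in hostname:
            # Local deployments serve the UI on the bare host, so drop the port.
            hostname = "localhost"
        url = urlunparse(parsed._replace(netloc=hostname))
        return f"{url}/projects/p/{project_id}?eval=true"

    print(eval_project_url("https://api.smith.langchain.com", "abc123"))
    # https://smith.langchain.com/projects/p/abc123?eval=true
    print(eval_project_url("http://localhost:1984", "abc123"))
    # http://localhost/projects/p/abc123?eval=true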
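
The test change above follows from `_prepare_eval_run` reading `project.id` to build the printed URL: a `mock_create_project` that returns `None` would now raise an AttributeError, so the mock must hand back an object carrying an `id`. Below is a minimal standalone sketch of that mocking pattern; the function names and values are illustrative, not taken from the patch.

.. code-block:: python

    from unittest import mock

    def create_project(name: str):
        # Stand-in for Client.create_project; replaced in the test below.
        raise NotImplementedError("network call")

    def prepare(name: str) -> str:
        # Stand-in for the part of _prepare_eval_run that uses project.id.
        project = create_project(name)
        return f"/projects/p/{project.id}?eval=true"

    def mock_create_project(*args, **kwargs):
        proj = mock.MagicMock()
        proj.id = "123"
        return proj

    with mock.patch(f"{__name__}.create_project", new=mock_create_project):
        assert prepare("test") == "/projects/p/123?eval=true"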