From c58d35765daa063b87733e006a4d65e2e1f388d3 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Sun, 16 Jul 2023 12:05:56 -0700
Subject: [PATCH] Add examples to docstrings (#7796)
and:
- remove dataset name from autogenerated project name
- print out project name to view
---
.../guides/evaluation/comparisons.ipynb | 2 +-
langchain/smith/evaluation/runner_utils.py | 229 ++++++++++++++++--
.../smith/evaluation/test_runner_utils.py | 6 +-
 3 files changed, 217 insertions(+), 20 deletions(-)
diff --git a/docs/extras/guides/evaluation/comparisons.ipynb b/docs/extras/guides/evaluation/comparisons.ipynb
index 300623bb31b..0e544824a94 100644
--- a/docs/extras/guides/evaluation/comparisons.ipynb
+++ b/docs/extras/guides/evaluation/comparisons.ipynb
@@ -10,7 +10,7 @@
"\n",
"One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`[[1]](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
"\n",
- "For this evalution, we will need 3 things:\n",
+ "For this evaluation, we will need 3 things:\n",
"1. An evaluator\n",
"2. A dataset of inputs\n",
"3. 2 (or more) LLMs, Chains, or Agents to compare\n",
diff --git a/langchain/smith/evaluation/runner_utils.py b/langchain/smith/evaluation/runner_utils.py
index 4c7645ead6b..cc32e514c63 100644
--- a/langchain/smith/evaluation/runner_utils.py
+++ b/langchain/smith/evaluation/runner_utils.py
@@ -19,9 +19,10 @@ from typing import (
Tuple,
Union,
)
+from urllib.parse import urlparse, urlunparse
from langsmith import Client, RunEvaluator
-from langsmith.schemas import DataType, Example, RunTypeEnum
+from langsmith.schemas import Dataset, DataType, Example, RunTypeEnum
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import Callbacks
@@ -50,6 +51,19 @@ class InputFormatError(Exception):
## Shared Utilities
+def _get_eval_project_url(api_url: str, project_id: str) -> str:
+ """Get the project url from the api url."""
+ parsed = urlparse(api_url)
+ hostname = parsed.hostname or ""
+ if "api." in hostname:
+ hostname = hostname.replace("api.", "", 1)
+ if "localhost" in hostname:
+ # Remove the port
+ hostname = "localhost"
+ url = urlunparse(parsed._replace(netloc=hostname))
+ return f"{url}/projects/p/{project_id}?eval=true"
+
+
def _wrap_in_chain_factory(
llm_or_chain_factory: Union[Chain, MODEL_OR_CHAIN_FACTORY],
dataset_name: str = "",
@@ -206,7 +220,6 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]:
def _get_project_name(
project_name: Optional[str],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
- dataset_name: Optional[str],
) -> str:
"""
Get the project name.
@@ -214,7 +227,6 @@ def _get_project_name(
Args:
project_name: The project name if manually specified.
llm_or_chain_factory: The Chain or language model constructor.
- dataset_name: The dataset name.
Returns:
The project name.
@@ -226,8 +238,7 @@ def _get_project_name(
model_name = llm_or_chain_factory.__class__.__name__
else:
model_name = llm_or_chain_factory().__class__.__name__
- dataset_prefix = f"{dataset_name}-" if dataset_name else ""
- return f"{dataset_prefix}{model_name}-{current_time}"
+ return f"{current_time}-{model_name}"
## Shared Validation Utilities
@@ -801,7 +812,7 @@ async def _arun_on_examples(
A dictionary mapping example ids to the model outputs.
"""
llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
- project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+ project_name = _get_project_name(project_name, llm_or_chain_factory)
run_evaluators, examples = _setup_evaluation(
llm_or_chain_factory, examples, evaluation, data_type
)
@@ -1033,7 +1044,7 @@ def _run_on_examples(
"""
results: Dict[str, Any] = {}
llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
- project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+ project_name = _get_project_name(project_name, llm_or_chain_factory)
tracer = LangChainTracer(
project_name=project_name, client=client, use_threading=False
)
@@ -1068,6 +1079,31 @@ def _run_on_examples(
## Public API
+def _prepare_eval_run(
+ client: Client,
+ dataset_name: str,
+ llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
+ project_name: Optional[str],
+) -> Tuple[MODEL_OR_CHAIN_FACTORY, str, Dataset, Iterator[Example]]:
+ llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
+ project_name = _get_project_name(project_name, llm_or_chain_factory)
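+    # Each evaluation run gets its own tracing project; if the requested name
+    # already exists, the error below asks for a different name so results
+    # from separate runs are not mixed together.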
+ try:
+ project = client.create_project(project_name)
+ except ValueError as e:
+ if "already exists " not in str(e):
+ raise e
+ raise ValueError(
+ f"Project {project_name} already exists. Please use a different name."
+ )
+ project_url = _get_eval_project_url(client.api_url, project.id)
+ print(
+ f"View the evaluation results for project '{project_name}' at:\n{project_url}"
+ )
+ dataset = client.read_dataset(dataset_name=dataset_name)
+ examples = client.list_examples(dataset_id=str(dataset.id))
+ return llm_or_chain_factory, project_name, dataset, examples
+
+
async def arun_on_dataset(
client: Client,
dataset_name: str,
@@ -1110,11 +1146,90 @@ async def arun_on_dataset(
Returns:
A dictionary containing the run's project name and the
resulting model outputs.
- """
- llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
- project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
- dataset = client.read_dataset(dataset_name=dataset_name)
- examples = client.list_examples(dataset_id=str(dataset.id))
+
+ For the synchronous version, see :func:`run_on_dataset`.
+
+ Examples
+ --------
+
+ .. code-block:: python
+
+ from langsmith import Client
+ from langchain.chat_models import ChatOpenAI
+ from langchain.chains import LLMChain
+ from langchain.smith import RunEvalConfig, arun_on_dataset
+
+ # Chains may have memory. Passing in a constructor function lets the
+ # evaluation framework avoid cross-contamination between runs.
+ def construct_chain():
+ llm = ChatOpenAI(temperature=0)
+ chain = LLMChain.from_string(
+ llm,
+ "What's the answer to {your_input_key}"
+ )
+ return chain
+
+ # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+ evaluation_config = RunEvalConfig(
+ evaluators=[
+ "qa", # "Correctness" against a reference answer
+ "embedding_distance",
+ RunEvalConfig.Criteria("helpfulness"),
+ RunEvalConfig.Criteria({
+ "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+ }),
+ ]
+ )
+
+ client = Client()
+ await arun_on_dataset(
+ client,
+ "",
+ construct_chain,
+ evaluation=evaluation_config,
+ )
+
+ You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+ or LangSmith's `RunEvaluator` classes.
+
+ .. code-block:: python
+
+ from typing import Optional
+ from langchain.evaluation import StringEvaluator
+
+ class MyStringEvaluator(StringEvaluator):
+
+ @property
+ def requires_input(self) -> bool:
+ return False
+
+ @property
+ def requires_reference(self) -> bool:
+ return True
+
+ @property
+ def evaluation_name(self) -> str:
+ return "exact_match"
+
+ def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+ return {"score": prediction == reference}
+
+
+ evaluation_config = RunEvalConfig(
+ custom_evaluators = [MyStringEvaluator()],
+ )
+
+ await arun_on_dataset(
+ client,
+ "",
+ construct_chain,
+ evaluation=evaluation_config,
+ )
+ """ # noqa: E501
+ llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+ client, dataset_name, llm_or_chain_factory, project_name
+ )
results = await _arun_on_examples(
client,
examples,
@@ -1174,11 +1289,91 @@ def run_on_dataset(
Returns:
A dictionary containing the run's project name and the resulting model outputs.
- """
- llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
- project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
- dataset = client.read_dataset(dataset_name=dataset_name)
- examples = client.list_examples(dataset_id=str(dataset.id))
+
+
+ For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
+
+ Examples
+ --------
+
+ .. code-block:: python
+
+ from langsmith import Client
+ from langchain.chat_models import ChatOpenAI
+ from langchain.chains import LLMChain
+ from langchain.smith import RunEvalConfig, run_on_dataset
+
+ # Chains may have memory. Passing in a constructor function lets the
+ # evaluation framework avoid cross-contamination between runs.
+ def construct_chain():
+ llm = ChatOpenAI(temperature=0)
+ chain = LLMChain.from_string(
+ llm,
+ "What's the answer to {your_input_key}"
+ )
+ return chain
+
+ # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+ evaluation_config = RunEvalConfig(
+ evaluators=[
+ "qa", # "Correctness" against a reference answer
+ "embedding_distance",
+ RunEvalConfig.Criteria("helpfulness"),
+ RunEvalConfig.Criteria({
+ "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+ }),
+ ]
+ )
+
+ client = Client()
+ run_on_dataset(
+ client,
+ "",
+ construct_chain,
+ evaluation=evaluation_config,
+ )
+
+ You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+ or LangSmith's `RunEvaluator` classes.
+
+ .. code-block:: python
+
+ from typing import Optional
+ from langchain.evaluation import StringEvaluator
+
+ class MyStringEvaluator(StringEvaluator):
+
+ @property
+ def requires_input(self) -> bool:
+ return False
+
+ @property
+ def requires_reference(self) -> bool:
+ return True
+
+ @property
+ def evaluation_name(self) -> str:
+ return "exact_match"
+
+ def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+ return {"score": prediction == reference}
+
+
+ evaluation_config = RunEvalConfig(
+ custom_evaluators = [MyStringEvaluator()],
+ )
+
+ run_on_dataset(
+ client,
+ "",
+ construct_chain,
+ evaluation=evaluation_config,
+ )
+ """ # noqa: E501
+ llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+ client, dataset_name, llm_or_chain_factory, project_name
+ )
results = _run_on_examples(
client,
examples,
diff --git a/tests/unit_tests/smith/evaluation/test_runner_utils.py b/tests/unit_tests/smith/evaluation/test_runner_utils.py
index f93909bb40e..3bf9a1d32c8 100644
--- a/tests/unit_tests/smith/evaluation/test_runner_utils.py
+++ b/tests/unit_tests/smith/evaluation/test_runner_utils.py
@@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
{"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
]
- def mock_create_project(*args: Any, **kwargs: Any) -> None:
- pass
+ def mock_create_project(*args: Any, **kwargs: Any) -> Any:
+ proj = mock.MagicMock()
+ proj.id = "123"
+ return proj
with mock.patch.object(
Client, "read_dataset", new=mock_read_dataset