Mirror of https://github.com/hwchase17/langchain.git
Add examples to docstrings (#7796)
Also:
- remove the dataset name from the autogenerated project name
- print out the project name to view the results
This commit is contained in:
parent ed97af423c
commit c58d35765d
@@ -10,7 +10,7 @@
     "\n",
     "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
     "\n",
-    "For this evalution, we will need 3 things:\n",
+    "For this evaluation, we will need 3 things:\n",
     "1. An evaluator\n",
     "2. A dataset of inputs\n",
     "3. 2 (or more) LLMs, Chains, or Agents to compare\n",
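For context on the notebook change above: the pairwise comparison it describes can be run with a `PairwiseStringEvaluator` loaded through `langchain.evaluation`. A minimal sketch, assuming `load_evaluator("pairwise_string")` is available in this version of the library and an LLM API key is configured; the question and answers are illustrative only.

.. code-block:: python

    from langchain.evaluation import load_evaluator

    # Load the off-the-shelf pairwise evaluator (backed by PairwiseStringEvalChain).
    evaluator = load_evaluator("pairwise_string")

    # Compare two candidate outputs for the same input; the grading LLM picks a winner.
    result = evaluator.evaluate_string_pairs(
        prediction="Paris is the capital of France.",          # output of model/chain A
        prediction_b="France's capital is Paris, of course.",  # output of model/chain B
        input="What is the capital of France?",
    )
    print(result)  # typically a dict with "value", "score", and "reasoning" keys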
@@ -19,9 +19,10 @@ from typing import (
     Tuple,
     Union,
 )
+from urllib.parse import urlparse, urlunparse

 from langsmith import Client, RunEvaluator
-from langsmith.schemas import DataType, Example, RunTypeEnum
+from langsmith.schemas import Dataset, DataType, Example, RunTypeEnum

 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.manager import Callbacks
@@ -50,6 +51,19 @@ class InputFormatError(Exception):
 ## Shared Utilities


+def _get_eval_project_url(api_url: str, project_id: str) -> str:
+    """Get the project url from the api url."""
+    parsed = urlparse(api_url)
+    hostname = parsed.hostname or ""
+    if "api." in hostname:
+        hostname = hostname.replace("api.", "", 1)
+    if "localhost" in hostname:
+        # Remove the port
+        hostname = "localhost"
+    url = urlunparse(parsed._replace(netloc=hostname))
+    return f"{url}/projects/p/{project_id}?eval=true"
+
+
 def _wrap_in_chain_factory(
     llm_or_chain_factory: Union[Chain, MODEL_OR_CHAIN_FACTORY],
     dataset_name: str = "<my_dataset>",
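The helper added above rewrites the API host to the corresponding web host before appending the project path. A quick sketch of the expected behaviour; the hostnames and project id are examples, not values from the commit.

.. code-block:: python

    # Illustrative inputs for _get_eval_project_url; hostnames are examples only.
    _get_eval_project_url("https://api.smith.langchain.com", "abc123")
    # -> "https://smith.langchain.com/projects/p/abc123?eval=true"

    _get_eval_project_url("http://localhost:1984", "abc123")
    # -> "http://localhost/projects/p/abc123?eval=true"  (port dropped for localhost)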
@@ -206,7 +220,6 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]:
 def _get_project_name(
     project_name: Optional[str],
     llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
-    dataset_name: Optional[str],
 ) -> str:
     """
     Get the project name.
@@ -214,7 +227,6 @@ def _get_project_name(
     Args:
         project_name: The project name if manually specified.
         llm_or_chain_factory: The Chain or language model constructor.
-        dataset_name: The dataset name.

     Returns:
         The project name.
@@ -226,8 +238,7 @@ def _get_project_name(
         model_name = llm_or_chain_factory.__class__.__name__
     else:
         model_name = llm_or_chain_factory().__class__.__name__
-    dataset_prefix = f"{dataset_name}-" if dataset_name else ""
-    return f"{dataset_prefix}{model_name}-{current_time}"
+    return f"{current_time}-{model_name}"


 ## Shared Validation Utilities
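The change above is the "remove the dataset name from the autogenerated project name" item from the commit message. A rough illustration of the naming before and after; the real format of `current_time` is set elsewhere in the file and is not shown in this diff, so the timestamp here is a placeholder.

.. code-block:: python

    # Illustrative only: current_time's actual format comes from code outside this hunk.
    current_time = "2023-07-17-15-30-00"
    model_name = "LLMChain"
    dataset_name = "my-dataset"

    old_name = f"{dataset_name}-{model_name}-{current_time}"  # before: "my-dataset-LLMChain-2023-07-17-15-30-00"
    new_name = f"{current_time}-{model_name}"                 # after:  "2023-07-17-15-30-00-LLMChain"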
@@ -801,7 +812,7 @@ async def _arun_on_examples(
         A dictionary mapping example ids to the model outputs.
     """
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     run_evaluators, examples = _setup_evaluation(
         llm_or_chain_factory, examples, evaluation, data_type
     )
@@ -1033,7 +1044,7 @@ def _run_on_examples(
     """
     results: Dict[str, Any] = {}
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     tracer = LangChainTracer(
         project_name=project_name, client=client, use_threading=False
     )
@@ -1068,6 +1079,31 @@ def _run_on_examples(
 ## Public API


+def _prepare_eval_run(
+    client: Client,
+    dataset_name: str,
+    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
+    project_name: Optional[str],
+) -> Tuple[MODEL_OR_CHAIN_FACTORY, str, Dataset, Iterator[Example]]:
+    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
+    try:
+        project = client.create_project(project_name)
+    except ValueError as e:
+        if "already exists " not in str(e):
+            raise e
+        raise ValueError(
+            f"Project {project_name} already exists. Please use a different name."
+        )
+    project_url = _get_eval_project_url(client.api_url, project.id)
+    print(
+        f"View the evaluation results for project '{project_name}' at:\n{project_url}"
+    )
+    dataset = client.read_dataset(dataset_name=dataset_name)
+    examples = client.list_examples(dataset_id=str(dataset.id))
+    return llm_or_chain_factory, project_name, dataset, examples
+
+
 async def arun_on_dataset(
     client: Client,
     dataset_name: str,
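The new `_prepare_eval_run` helper above implements the "print out the project name to view the results" item: it creates the tracing project, prints a link built by `_get_eval_project_url`, and loads the dataset examples. A rough sketch of how `run_on_dataset` / `arun_on_dataset` call it, with placeholder arguments borrowed from the docstring examples added later in this commit.

.. code-block:: python

    # Placeholder usage sketch; "<my_dataset_name>" and construct_chain mirror the
    # docstring examples and are not real values.
    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
        client=Client(),
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        project_name=None,  # None -> autogenerated "<timestamp>-<model name>"
    )
    # Prints something like:
    # View the evaluation results for project '<project name>' at:
    # <project URL from _get_eval_project_url>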
@@ -1110,11 +1146,90 @@ async def arun_on_dataset(
     Returns:
         A dictionary containing the run's project name and the
             resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+    For the synchronous version, see :func:`run_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, arun_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa", # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """ # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = await _arun_on_examples(
         client,
         examples,
@@ -1174,11 +1289,91 @@ def run_on_dataset(

     Returns:
         A dictionary containing the run's project name and the resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+
+    For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, run_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa", # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """ # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = _run_on_examples(
         client,
         examples,
@@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
         {"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
     ]

-    def mock_create_project(*args: Any, **kwargs: Any) -> None:
-        pass
+    def mock_create_project(*args: Any, **kwargs: Any) -> Any:
+        proj = mock.MagicMock()
+        proj.id = "123"
+        return proj

     with mock.patch.object(
         Client, "read_dataset", new=mock_read_dataset
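The test change above follows from the new `_prepare_eval_run` path: `client.create_project(...)` is now called and its return value's `id` is passed to `_get_eval_project_url`, so a mock that returned `None` would break. A minimal sketch of the dependency the `MagicMock` satisfies; this paraphrases the hunks above rather than quoting new test code.

.. code-block:: python

    # Sketch only: mirrors the calls added earlier in this commit.
    project = client.create_project(project_name)                    # mocked -> MagicMock with id="123"
    project_url = _get_eval_project_url(client.api_url, project.id)  # requires project.id to exist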