Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-23 07:09:31 +00:00
Add examples to docstrings (#7796)

and:
- remove dataset name from autogenerated project name
- print out project name to view

This commit is contained in: parent ed97af423c · commit c58d35765d
@@ -10,7 +10,7 @@
 "\n",
 "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
 "\n",
-"For this evalution, we will need 3 things:\n",
+"For this evaluation, we will need 3 things:\n",
 "1. An evaluator\n",
 "2. A dataset of inputs\n",
 "3. 2 (or more) LLMs, Chains, or Agents to compare\n",
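For orientation, a minimal sketch of the pairwise comparison the notebook describes, using LangChain's evaluator loader; the example strings below and the assumption that an LLM API key is configured are illustrative, not part of this change:

    # Sketch: compare two candidate outputs for the same input using the
    # off-the-shelf pairwise string evaluator (a PairwiseStringEvalChain under the hood).
    from langchain.evaluation import load_evaluator

    evaluator = load_evaluator("pairwise_string")
    result = evaluator.evaluate_string_pairs(
        prediction="Paris is the capital of France.",    # output from configuration A
        prediction_b="France's capital city is Paris.",  # output from configuration B
        input="What is the capital of France?",
    )
    print(result)  # a dict with the preferred output, a score, and the LLM's reasoning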
@@ -19,9 +19,10 @@ from typing import (
     Tuple,
     Union,
 )
+from urllib.parse import urlparse, urlunparse
 
 from langsmith import Client, RunEvaluator
-from langsmith.schemas import DataType, Example, RunTypeEnum
+from langsmith.schemas import Dataset, DataType, Example, RunTypeEnum
 
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.manager import Callbacks
@@ -50,6 +51,19 @@ class InputFormatError(Exception):
 ## Shared Utilities
 
 
+def _get_eval_project_url(api_url: str, project_id: str) -> str:
+    """Get the project url from the api url."""
+    parsed = urlparse(api_url)
+    hostname = parsed.hostname or ""
+    if "api." in hostname:
+        hostname = hostname.replace("api.", "", 1)
+    if "localhost" in hostname:
+        # Remove the port
+        hostname = "localhost"
+    url = urlunparse(parsed._replace(netloc=hostname))
+    return f"{url}/projects/p/{project_id}?eval=true"
+
+
 def _wrap_in_chain_factory(
     llm_or_chain_factory: Union[Chain, MODEL_OR_CHAIN_FACTORY],
     dataset_name: str = "<my_dataset>",
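To make the URL rewrite above concrete, here is a standalone sketch of the same logic with illustrative inputs; the hosted hostname and the localhost port are assumptions used only for the example:

    from urllib.parse import urlparse, urlunparse

    def get_eval_project_url(api_url: str, project_id: str) -> str:
        # Mirrors the helper above: strip the "api." prefix (or a localhost port)
        # and point at the project's evaluation view.
        parsed = urlparse(api_url)
        hostname = parsed.hostname or ""
        if "api." in hostname:
            hostname = hostname.replace("api.", "", 1)
        if "localhost" in hostname:
            hostname = "localhost"  # remove the port
        url = urlunparse(parsed._replace(netloc=hostname))
        return f"{url}/projects/p/{project_id}?eval=true"

    print(get_eval_project_url("https://api.smith.langchain.com", "abc123"))
    # -> https://smith.langchain.com/projects/p/abc123?eval=true
    print(get_eval_project_url("http://localhost:1984", "abc123"))
    # -> http://localhost/projects/p/abc123?eval=true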
@@ -206,7 +220,6 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]:
 def _get_project_name(
     project_name: Optional[str],
     llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
-    dataset_name: Optional[str],
 ) -> str:
     """
     Get the project name.
@@ -214,7 +227,6 @@ def _get_project_name(
     Args:
         project_name: The project name if manually specified.
         llm_or_chain_factory: The Chain or language model constructor.
-        dataset_name: The dataset name.
 
     Returns:
         The project name.
@@ -226,8 +238,7 @@ def _get_project_name(
         model_name = llm_or_chain_factory.__class__.__name__
     else:
         model_name = llm_or_chain_factory().__class__.__name__
-    dataset_prefix = f"{dataset_name}-" if dataset_name else ""
-    return f"{dataset_prefix}{model_name}-{current_time}"
+    return f"{current_time}-{model_name}"
 
 
 ## Shared Validation Utilities
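A quick sketch of how the auto-generated project name changes with this hunk; the timestamp, model class, and dataset name below are illustrative stand-ins (the exact timestamp format comes from code outside this hunk):

    # Illustrative values only.
    current_time = "2023-07-17-15-04-05"
    model_name = "ChatOpenAI"
    dataset_name = "qa-dataset"

    # Before: dataset-prefixed, timestamp last.
    old_name = f"{dataset_name}-{model_name}-{current_time}"  # "qa-dataset-ChatOpenAI-2023-07-17-15-04-05"
    # After: timestamp first, dataset name dropped.
    new_name = f"{current_time}-{model_name}"                 # "2023-07-17-15-04-05-ChatOpenAI"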
@@ -801,7 +812,7 @@ async def _arun_on_examples(
         A dictionary mapping example ids to the model outputs.
     """
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     run_evaluators, examples = _setup_evaluation(
         llm_or_chain_factory, examples, evaluation, data_type
     )
@@ -1033,7 +1044,7 @@ def _run_on_examples(
     """
     results: Dict[str, Any] = {}
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     tracer = LangChainTracer(
         project_name=project_name, client=client, use_threading=False
     )
@@ -1068,6 +1079,31 @@ def _run_on_examples(
 ## Public API
 
 
+def _prepare_eval_run(
+    client: Client,
+    dataset_name: str,
+    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
+    project_name: Optional[str],
+) -> Tuple[MODEL_OR_CHAIN_FACTORY, str, Dataset, Iterator[Example]]:
+    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
+    try:
+        project = client.create_project(project_name)
+    except ValueError as e:
+        if "already exists " not in str(e):
+            raise e
+        raise ValueError(
+            f"Project {project_name} already exists. Please use a different name."
+        )
+    project_url = _get_eval_project_url(client.api_url, project.id)
+    print(
+        f"View the evaluation results for project '{project_name}' at:\n{project_url}"
+    )
+    dataset = client.read_dataset(dataset_name=dataset_name)
+    examples = client.list_examples(dataset_id=str(dataset.id))
+    return llm_or_chain_factory, project_name, dataset, examples
+
+
 async def arun_on_dataset(
     client: Client,
     dataset_name: str,
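A hedged sketch of how the new helper is meant to be used by the public entry points later in this diff; `construct_chain` is a hypothetical zero-argument chain factory, and the printed URL is only an example of the format produced by `_get_eval_project_url`:

    # Sketch only: _prepare_eval_run wraps the factory, resolves the project name,
    # creates the tracing project up front (failing fast if the name is taken),
    # prints the evaluation URL, and returns the dataset plus its examples.
    from langsmith import Client

    client = Client()
    chain_factory, project_name, dataset, examples = _prepare_eval_run(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,  # hypothetical chain constructor
        project_name=None,  # None -> auto-generated "<timestamp>-<model name>"
    )
    # Example output:
    #   View the evaluation results for project '2023-07-17-15-04-05-LLMChain' at:
    #   https://smith.langchain.com/projects/p/<project-id>?eval=true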
@@ -1110,11 +1146,90 @@ async def arun_on_dataset(
     Returns:
         A dictionary containing the run's project name and the
         resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+    For the synchronous version, see :func:`run_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, arun_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """ # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = await _arun_on_examples(
         client,
         examples,
@@ -1174,11 +1289,91 @@ def run_on_dataset(
 
     Returns:
         A dictionary containing the run's project name and the resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+
+    For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, run_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """ # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = _run_on_examples(
         client,
         examples,
@@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
         {"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
     ]
 
-    def mock_create_project(*args: Any, **kwargs: Any) -> None:
-        pass
+    def mock_create_project(*args: Any, **kwargs: Any) -> Any:
+        proj = mock.MagicMock()
+        proj.id = "123"
+        return proj
 
     with mock.patch.object(
         Client, "read_dataset", new=mock_read_dataset
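The mock change above follows from `_prepare_eval_run`: the helper reads `project.id` off whatever `client.create_project()` returns, so a stub that returns None would break when the evaluation URL is built. A tiny illustration of why the MagicMock with a fixed id keeps the test deterministic:

    from unittest import mock

    proj = mock.MagicMock()
    proj.id = "123"
    # _get_eval_project_url interpolates this id into the printed link:
    print(f"/projects/p/{proj.id}?eval=true")  # -> /projects/p/123?eval=true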