From c58d35765daa063b87733e006a4d65e2e1f388d3 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Sun, 16 Jul 2023 12:05:56 -0700
Subject: [PATCH] Add examples to docstrings (#7796)
and:
- remove dataset name from autogenerated project name
- print out project name to view
---
.../guides/evaluation/comparisons.ipynb | 2 +-
langchain/smith/evaluation/runner_utils.py | 229 ++++++++++++++++--
.../smith/evaluation/test_runner_utils.py | 6 +-
 3 files changed, 217 insertions(+), 20 deletions(-)
diff --git a/docs/extras/guides/evaluation/comparisons.ipynb b/docs/extras/guides/evaluation/comparisons.ipynb
index 300623bb31b..0e544824a94 100644
--- a/docs/extras/guides/evaluation/comparisons.ipynb
+++ b/docs/extras/guides/evaluation/comparisons.ipynb
@@ -10,7 +10,7 @@
"\n",
"One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`[[1]](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
"\n",
- "For this evalution, we will need 3 things:\n",
+ "For this evaluation, we will need 3 things:\n",
"1. An evaluator\n",
"2. A dataset of inputs\n",
"3. 2 (or more) LLMs, Chains, or Agents to compare\n",
diff --git a/langchain/smith/evaluation/runner_utils.py b/langchain/smith/evaluation/runner_utils.py
index 4c7645ead6b..cc32e514c63 100644
--- a/langchain/smith/evaluation/runner_utils.py
+++ b/langchain/smith/evaluation/runner_utils.py
@@ -19,9 +19,10 @@ from typing import (
Tuple,
Union,
)
+from urllib.parse import urlparse, urlunparse
from langsmith import Client, RunEvaluator
-from langsmith.schemas import DataType, Example, RunTypeEnum
+from langsmith.schemas import Dataset, DataType, Example, RunTypeEnum
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import Callbacks
@@ -50,6 +51,19 @@ class InputFormatError(Exception):
## Shared Utilities
+def _get_eval_project_url(api_url: str, project_id: str) -> str:
+ """Get the project url from the api url."""
+ parsed = urlparse(api_url)
+ hostname = parsed.hostname or ""
+ if "api." in hostname:
+ hostname = hostname.replace("api.", "", 1)
+ if "localhost" in hostname:
+ # Remove the port
+ hostname = "localhost"
+ url = urlunparse(parsed._replace(netloc=hostname))
+ return f"{url}/projects/p/{project_id}?eval=true"
+
+
def _wrap_in_chain_factory(
llm_or_chain_factory: Union[Chain, MODEL_OR_CHAIN_FACTORY],
dataset_name: str = "",
@@ -206,7 +220,6 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]:
def _get_project_name(
project_name: Optional[str],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
- dataset_name: Optional[str],
) -> str:
"""
Get the project name.
@@ -214,7 +227,6 @@ def _get_project_name(
Args:
project_name: The project name if manually specified.
llm_or_chain_factory: The Chain or language model constructor.
- dataset_name: The dataset name.
Returns:
The project name.
@@ -226,8 +238,7 @@ def _get_project_name(
model_name = llm_or_chain_factory.__class__.__name__
else:
model_name = llm_or_chain_factory().__class__.__name__
- dataset_prefix = f"{dataset_name}-" if dataset_name else ""
- return f"{dataset_prefix}{model_name}-{current_time}"
+ return f"{current_time}-{model_name}"
## Shared Validation Utilities
@@ -801,7 +812,7 @@ async def _arun_on_examples(
A dictionary mapping example ids to the model outputs.
"""
llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
- project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+ project_name = _get_project_name(project_name, llm_or_chain_factory)
run_evaluators, examples = _setup_evaluation(
llm_or_chain_factory, examples, evaluation, data_type
)
@@ -1033,7 +1044,7 @@ def _run_on_examples(
"""
results: Dict[str, Any] = {}
llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
- project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+ project_name = _get_project_name(project_name, llm_or_chain_factory)
tracer = LangChainTracer(
project_name=project_name, client=client, use_threading=False
)
@@ -1068,6 +1079,31 @@ def _run_on_examples(
## Public API
+def _prepare_eval_run(
+ client: Client,
+ dataset_name: str,
+ llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
+ project_name: Optional[str],
+) -> Tuple[MODEL_OR_CHAIN_FACTORY, str, Dataset, Iterator[Example]]:
+ llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
+ project_name = _get_project_name(project_name, llm_or_chain_factory)
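+    # Each evaluation run gets its own tracing project; if the requested name
+    # already exists, the error below asks for a different name so results
+    # from separate runs are not mixed together.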
+ try:
+ project = client.create_project(project_name)
+ except ValueError as e:
+ if "already exists " not in str(e):
+ raise e
+ raise ValueError(
+ f"Project {project_name} already exists. Please use a different name."
+ )
+ project_url = _get_eval_project_url(client.api_url, project.id)
+ print(
+ f"View the evaluation results for project '{project_name}' at:\n{project_url}"
+ )
+ dataset = client.read_dataset(dataset_name=dataset_name)
+ examples = client.list_examples(dataset_id=str(dataset.id))
+ return llm_or_chain_factory, project_name, dataset, examples
+
+
async def arun_on_dataset(
client: Client,
dataset_name: str,
@@ -1110,11 +1146,90 @@ async def arun_on_dataset(
Returns:
A dictionary containing the run's project name and the
resulting model outputs.
- """
- llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
- project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
- dataset = client.read_dataset(dataset_name=dataset_name)
- examples = client.list_examples(dataset_id=str(dataset.id))
+
+ For the synchronous version, see :func:`run_on_dataset`.
+
+ Examples
+ --------
+
+ .. code-block:: python
+
+ from langsmith import Client
+ from langchain.chat_models import ChatOpenAI
+ from langchain.chains import LLMChain
+ from langchain.smith import RunEvalConfig, arun_on_dataset
+
+ # Chains may have memory. Passing in a constructor function lets the
+ # evaluation framework avoid cross-contamination between runs.
+ def construct_chain():
+ llm = ChatOpenAI(temperature=0)
+ chain = LLMChain.from_string(
+ llm,
+ "What's the answer to {your_input_key}"
+ )
+ return chain
+
+ # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+ evaluation_config = RunEvalConfig(
+ evaluators=[
+ "qa", # "Correctness" against a reference answer
+ "embedding_distance",
+ RunEvalConfig.Criteria("helpfulness"),
+ RunEvalConfig.Criteria({
+ "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+ }),
+ ]
+ )
+
+ client = Client()
+ await arun_on_dataset(
+ client,
+ "",
+ construct_chain,
+ evaluation=evaluation_config,
+ )
+
+ You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+ or LangSmith's `RunEvaluator` classes.
+
+ .. code-block:: python
+
+ from typing import Optional
+ from langchain.evaluation import StringEvaluator
+
+ class MyStringEvaluator(StringEvaluator):
+
+ @property
+ def requires_input(self) -> bool:
+ return False
+
+ @property
+ def requires_reference(self) -> bool:
+ return True
+
+ @property
+ def evaluation_name(self) -> str:
+ return "exact_match"
+
+ def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+ return {"score": prediction == reference}
+
+
+ evaluation_config = RunEvalConfig(
+ custom_evaluators = [MyStringEvaluator()],
+ )
+
+ await arun_on_dataset(
+ client,
+ "",
+ construct_chain,
+ evaluation=evaluation_config,
+ )
+ """ # noqa: E501
+ llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+ client, dataset_name, llm_or_chain_factory, project_name
+ )
results = await _arun_on_examples(
client,
examples,
@@ -1174,11 +1289,91 @@ def run_on_dataset(
Returns:
A dictionary containing the run's project name and the resulting model outputs.
- """
- llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
- project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
- dataset = client.read_dataset(dataset_name=dataset_name)
- examples = client.list_examples(dataset_id=str(dataset.id))
+
+
+ For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
+
+ Examples
+ --------
+
+ .. code-block:: python
+
+ from langsmith import Client
+ from langchain.chat_models import ChatOpenAI
+ from langchain.chains import LLMChain
+ from langchain.smith import RunEvalConfig, run_on_dataset
+
+ # Chains may have memory. Passing in a constructor function lets the
+ # evaluation framework avoid cross-contamination between runs.
+ def construct_chain():
+ llm = ChatOpenAI(temperature=0)
+ chain = LLMChain.from_string(
+ llm,
+ "What's the answer to {your_input_key}"
+ )
+ return chain
+
+ # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+ evaluation_config = RunEvalConfig(
+ evaluators=[
+ "qa", # "Correctness" against a reference answer
+ "embedding_distance",
+ RunEvalConfig.Criteria("helpfulness"),
+ RunEvalConfig.Criteria({
+ "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+ }),
+ ]
+ )
+
+ client = Client()
+ run_on_dataset(
+ client,
+ "",
+ construct_chain,
+ evaluation=evaluation_config,
+ )
+
+ You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+ or LangSmith's `RunEvaluator` classes.
+
+ .. code-block:: python
+
+ from typing import Optional
+ from langchain.evaluation import StringEvaluator
+
+ class MyStringEvaluator(StringEvaluator):
+
+ @property
+ def requires_input(self) -> bool:
+ return False
+
+ @property
+ def requires_reference(self) -> bool:
+ return True
+
+ @property
+ def evaluation_name(self) -> str:
+ return "exact_match"
+
+ def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+ return {"score": prediction == reference}
+
+
+ evaluation_config = RunEvalConfig(
+ custom_evaluators = [MyStringEvaluator()],
+ )
+
+ run_on_dataset(
+ client,
+ "",
+ construct_chain,
+ evaluation=evaluation_config,
+ )
+ """ # noqa: E501
+ llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+ client, dataset_name, llm_or_chain_factory, project_name
+ )
results = _run_on_examples(
client,
examples,
diff --git a/tests/unit_tests/smith/evaluation/test_runner_utils.py b/tests/unit_tests/smith/evaluation/test_runner_utils.py
index f93909bb40e..3bf9a1d32c8 100644
--- a/tests/unit_tests/smith/evaluation/test_runner_utils.py
+++ b/tests/unit_tests/smith/evaluation/test_runner_utils.py
@@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
{"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
]
- def mock_create_project(*args: Any, **kwargs: Any) -> None:
- pass
+ def mock_create_project(*args: Any, **kwargs: Any) -> Any:
+ proj = mock.MagicMock()
+ proj.id = "123"
+ return proj
with mock.patch.object(
Client, "read_dataset", new=mock_read_dataset