Mirror of https://github.com/hwchase17/langchain.git
Add examples to docstrings (#7796)
Also:
- remove the dataset name from the autogenerated project name
- print out the project name to view the results
This commit is contained in:
parent ed97af423c
commit c58d35765d
@@ -10,7 +10,7 @@
     "\n",
     "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
     "\n",
-    "For this evalution, we will need 3 things:\n",
+    "For this evaluation, we will need 3 things:\n",
     "1. An evaluator\n",
     "2. A dataset of inputs\n",
     "3. 2 (or more) LLMs, Chains, or Agents to compare\n",
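For context on the notebook change above: the pairwise comparison it describes can be run with a `PairwiseStringEvaluator` loaded through `langchain.evaluation`. A minimal sketch, assuming `load_evaluator("pairwise_string")` is available in this version of the library and an LLM API key is configured; the question and answers are illustrative only.

.. code-block:: python

    from langchain.evaluation import load_evaluator

    # Load the off-the-shelf pairwise evaluator (backed by PairwiseStringEvalChain).
    evaluator = load_evaluator("pairwise_string")

    # Compare two candidate outputs for the same input; the grading LLM picks a winner.
    result = evaluator.evaluate_string_pairs(
        prediction="Paris is the capital of France.",          # output of model/chain A
        prediction_b="France's capital is Paris, of course.",  # output of model/chain B
        input="What is the capital of France?",
    )
    print(result)  # typically a dict with "value", "score", and "reasoning" keys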
@@ -19,9 +19,10 @@ from typing import (
     Tuple,
     Union,
 )
+from urllib.parse import urlparse, urlunparse

 from langsmith import Client, RunEvaluator
-from langsmith.schemas import DataType, Example, RunTypeEnum
+from langsmith.schemas import Dataset, DataType, Example, RunTypeEnum

 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.manager import Callbacks
@@ -50,6 +51,19 @@ class InputFormatError(Exception):
 ## Shared Utilities


+def _get_eval_project_url(api_url: str, project_id: str) -> str:
+    """Get the project url from the api url."""
+    parsed = urlparse(api_url)
+    hostname = parsed.hostname or ""
+    if "api." in hostname:
+        hostname = hostname.replace("api.", "", 1)
+    if "localhost" in hostname:
+        # Remove the port
+        hostname = "localhost"
+    url = urlunparse(parsed._replace(netloc=hostname))
+    return f"{url}/projects/p/{project_id}?eval=true"
+
+
 def _wrap_in_chain_factory(
     llm_or_chain_factory: Union[Chain, MODEL_OR_CHAIN_FACTORY],
     dataset_name: str = "<my_dataset>",
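The helper added above rewrites the API host to the corresponding web host before appending the project path. A quick sketch of the expected behaviour; the hostnames and project id are examples, not values from the commit.

.. code-block:: python

    # Illustrative inputs for _get_eval_project_url; hostnames are examples only.
    _get_eval_project_url("https://api.smith.langchain.com", "abc123")
    # -> "https://smith.langchain.com/projects/p/abc123?eval=true"

    _get_eval_project_url("http://localhost:1984", "abc123")
    # -> "http://localhost/projects/p/abc123?eval=true"  (port dropped for localhost)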
@@ -206,7 +220,6 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]:
 def _get_project_name(
     project_name: Optional[str],
     llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
-    dataset_name: Optional[str],
 ) -> str:
     """
     Get the project name.
@@ -214,7 +227,6 @@ def _get_project_name(
     Args:
         project_name: The project name if manually specified.
         llm_or_chain_factory: The Chain or language model constructor.
-        dataset_name: The dataset name.

     Returns:
         The project name.
@@ -226,8 +238,7 @@ def _get_project_name(
         model_name = llm_or_chain_factory.__class__.__name__
     else:
         model_name = llm_or_chain_factory().__class__.__name__
-    dataset_prefix = f"{dataset_name}-" if dataset_name else ""
-    return f"{dataset_prefix}{model_name}-{current_time}"
+    return f"{current_time}-{model_name}"


 ## Shared Validation Utilities
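The change above is the "remove the dataset name from the autogenerated project name" item from the commit message. A rough illustration of the naming before and after; the real format of `current_time` is set elsewhere in the file and is not shown in this diff, so the timestamp here is a placeholder.

.. code-block:: python

    # Illustrative only: current_time's actual format comes from code outside this hunk.
    current_time = "2023-07-17-15-30-00"
    model_name = "LLMChain"
    dataset_name = "my-dataset"

    old_name = f"{dataset_name}-{model_name}-{current_time}"  # before: "my-dataset-LLMChain-2023-07-17-15-30-00"
    new_name = f"{current_time}-{model_name}"                 # after:  "2023-07-17-15-30-00-LLMChain"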
@@ -801,7 +812,7 @@ async def _arun_on_examples(
         A dictionary mapping example ids to the model outputs.
     """
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     run_evaluators, examples = _setup_evaluation(
         llm_or_chain_factory, examples, evaluation, data_type
     )
@@ -1033,7 +1044,7 @@ def _run_on_examples(
     """
     results: Dict[str, Any] = {}
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     tracer = LangChainTracer(
         project_name=project_name, client=client, use_threading=False
     )
@@ -1068,6 +1079,31 @@ def _run_on_examples(
 ## Public API


+def _prepare_eval_run(
+    client: Client,
+    dataset_name: str,
+    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
+    project_name: Optional[str],
+) -> Tuple[MODEL_OR_CHAIN_FACTORY, str, Dataset, Iterator[Example]]:
+    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
+    try:
+        project = client.create_project(project_name)
+    except ValueError as e:
+        if "already exists " not in str(e):
+            raise e
+        raise ValueError(
+            f"Project {project_name} already exists. Please use a different name."
+        )
+    project_url = _get_eval_project_url(client.api_url, project.id)
+    print(
+        f"View the evaluation results for project '{project_name}' at:\n{project_url}"
+    )
+    dataset = client.read_dataset(dataset_name=dataset_name)
+    examples = client.list_examples(dataset_id=str(dataset.id))
+    return llm_or_chain_factory, project_name, dataset, examples
+
+
 async def arun_on_dataset(
     client: Client,
     dataset_name: str,
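The new `_prepare_eval_run` helper above implements the "print out the project name to view the results" item: it creates the tracing project, prints a link built by `_get_eval_project_url`, and loads the dataset examples. A rough sketch of how `run_on_dataset` / `arun_on_dataset` call it, with placeholder arguments borrowed from the docstring examples added later in this commit.

.. code-block:: python

    # Placeholder usage sketch; "<my_dataset_name>" and construct_chain mirror the
    # docstring examples and are not real values.
    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
        client=Client(),
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        project_name=None,  # None -> autogenerated "<timestamp>-<model name>"
    )
    # Prints something like:
    # View the evaluation results for project '<project name>' at:
    # <project URL from _get_eval_project_url>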
@@ -1110,11 +1146,90 @@ async def arun_on_dataset(
     Returns:
         A dictionary containing the run's project name and the
             resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+    For the synchronous version, see :func:`run_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, arun_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa", # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """ # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = await _arun_on_examples(
         client,
         examples,
@@ -1174,11 +1289,91 @@ def run_on_dataset(

     Returns:
         A dictionary containing the run's project name and the resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+
+    For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, run_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa", # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """ # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = _run_on_examples(
         client,
         examples,
@@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
         {"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
     ]

-    def mock_create_project(*args: Any, **kwargs: Any) -> None:
-        pass
+    def mock_create_project(*args: Any, **kwargs: Any) -> Any:
+        proj = mock.MagicMock()
+        proj.id = "123"
+        return proj

     with mock.patch.object(
         Client, "read_dataset", new=mock_read_dataset
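The test change above follows from the new `_prepare_eval_run` path: `client.create_project(...)` is now called and its return value's `id` is passed to `_get_eval_project_url`, so a mock that returned `None` would break. A minimal sketch of the dependency the `MagicMock` satisfies; this paraphrases the hunks above rather than quoting new test code.

.. code-block:: python

    # Sketch only: mirrors the calls added earlier in this commit.
    project = client.create_project(project_name)                    # mocked -> MagicMock with id="123"
    project_url = _get_eval_project_url(client.api_url, project.id)  # requires project.id to exist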