Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-23 07:09:31 +00:00
Add examples to docstrings (#7796)

and:
- remove dataset name from autogenerated project name
- print out project name to view

This commit is contained in: parent ed97af423c · commit c58d35765d
@@ -10,7 +10,7 @@
 "\n",
 "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
 "\n",
-"For this evalution, we will need 3 things:\n",
+"For this evaluation, we will need 3 things:\n",
 "1. An evaluator\n",
 "2. A dataset of inputs\n",
 "3. 2 (or more) LLMs, Chains, or Agents to compare\n",
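For orientation, a minimal sketch of the pairwise comparison the notebook describes, using LangChain's evaluator loader; the example strings below and the assumption that an LLM API key is configured are illustrative, not part of this change:

    # Sketch: compare two candidate outputs for the same input using the
    # off-the-shelf pairwise string evaluator (a PairwiseStringEvalChain under the hood).
    from langchain.evaluation import load_evaluator

    evaluator = load_evaluator("pairwise_string")
    result = evaluator.evaluate_string_pairs(
        prediction="Paris is the capital of France.",    # output from configuration A
        prediction_b="France's capital city is Paris.",  # output from configuration B
        input="What is the capital of France?",
    )
    print(result)  # a dict with the preferred output, a score, and the LLM's reasoning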
@@ -19,9 +19,10 @@ from typing import (
     Tuple,
     Union,
 )
+from urllib.parse import urlparse, urlunparse
 
 from langsmith import Client, RunEvaluator
-from langsmith.schemas import DataType, Example, RunTypeEnum
+from langsmith.schemas import Dataset, DataType, Example, RunTypeEnum
 
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.manager import Callbacks
@@ -50,6 +51,19 @@ class InputFormatError(Exception):
 ## Shared Utilities
 
 
+def _get_eval_project_url(api_url: str, project_id: str) -> str:
+    """Get the project url from the api url."""
+    parsed = urlparse(api_url)
+    hostname = parsed.hostname or ""
+    if "api." in hostname:
+        hostname = hostname.replace("api.", "", 1)
+    if "localhost" in hostname:
+        # Remove the port
+        hostname = "localhost"
+    url = urlunparse(parsed._replace(netloc=hostname))
+    return f"{url}/projects/p/{project_id}?eval=true"
+
+
 def _wrap_in_chain_factory(
     llm_or_chain_factory: Union[Chain, MODEL_OR_CHAIN_FACTORY],
     dataset_name: str = "<my_dataset>",
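To make the URL rewrite above concrete, here is a standalone sketch of the same logic with illustrative inputs; the hosted hostname and the localhost port are assumptions used only for the example:

    from urllib.parse import urlparse, urlunparse

    def get_eval_project_url(api_url: str, project_id: str) -> str:
        # Mirrors the helper above: strip the "api." prefix (or a localhost port)
        # and point at the project's evaluation view.
        parsed = urlparse(api_url)
        hostname = parsed.hostname or ""
        if "api." in hostname:
            hostname = hostname.replace("api.", "", 1)
        if "localhost" in hostname:
            hostname = "localhost"  # remove the port
        url = urlunparse(parsed._replace(netloc=hostname))
        return f"{url}/projects/p/{project_id}?eval=true"

    print(get_eval_project_url("https://api.smith.langchain.com", "abc123"))
    # -> https://smith.langchain.com/projects/p/abc123?eval=true
    print(get_eval_project_url("http://localhost:1984", "abc123"))
    # -> http://localhost/projects/p/abc123?eval=true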
@@ -206,7 +220,6 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]:
 def _get_project_name(
     project_name: Optional[str],
     llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
-    dataset_name: Optional[str],
 ) -> str:
     """
     Get the project name.
@@ -214,7 +227,6 @@ def _get_project_name(
     Args:
         project_name: The project name if manually specified.
         llm_or_chain_factory: The Chain or language model constructor.
-        dataset_name: The dataset name.
 
     Returns:
         The project name.
@@ -226,8 +238,7 @@ def _get_project_name(
         model_name = llm_or_chain_factory.__class__.__name__
     else:
         model_name = llm_or_chain_factory().__class__.__name__
-    dataset_prefix = f"{dataset_name}-" if dataset_name else ""
-    return f"{dataset_prefix}{model_name}-{current_time}"
+    return f"{current_time}-{model_name}"
 
 
 ## Shared Validation Utilities
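A quick sketch of how the auto-generated project name changes with this hunk; the timestamp, model class, and dataset name below are illustrative stand-ins (the exact timestamp format comes from code outside this hunk):

    # Illustrative values only.
    current_time = "2023-07-17-15-04-05"
    model_name = "ChatOpenAI"
    dataset_name = "qa-dataset"

    # Before: dataset-prefixed, timestamp last.
    old_name = f"{dataset_name}-{model_name}-{current_time}"  # "qa-dataset-ChatOpenAI-2023-07-17-15-04-05"
    # After: timestamp first, dataset name dropped.
    new_name = f"{current_time}-{model_name}"                 # "2023-07-17-15-04-05-ChatOpenAI"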
@@ -801,7 +812,7 @@ async def _arun_on_examples(
         A dictionary mapping example ids to the model outputs.
     """
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     run_evaluators, examples = _setup_evaluation(
         llm_or_chain_factory, examples, evaluation, data_type
     )
@@ -1033,7 +1044,7 @@ def _run_on_examples(
     """
     results: Dict[str, Any] = {}
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     tracer = LangChainTracer(
         project_name=project_name, client=client, use_threading=False
     )
@@ -1068,6 +1079,31 @@ def _run_on_examples(
 ## Public API
 
 
+def _prepare_eval_run(
+    client: Client,
+    dataset_name: str,
+    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
+    project_name: Optional[str],
+) -> Tuple[MODEL_OR_CHAIN_FACTORY, str, Dataset, Iterator[Example]]:
+    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
+    try:
+        project = client.create_project(project_name)
+    except ValueError as e:
+        if "already exists " not in str(e):
+            raise e
+        raise ValueError(
+            f"Project {project_name} already exists. Please use a different name."
+        )
+    project_url = _get_eval_project_url(client.api_url, project.id)
+    print(
+        f"View the evaluation results for project '{project_name}' at:\n{project_url}"
+    )
+    dataset = client.read_dataset(dataset_name=dataset_name)
+    examples = client.list_examples(dataset_id=str(dataset.id))
+    return llm_or_chain_factory, project_name, dataset, examples
+
+
 async def arun_on_dataset(
     client: Client,
     dataset_name: str,
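A hedged sketch of how the new helper is meant to be used by the public entry points later in this diff; `construct_chain` is a hypothetical zero-argument chain factory, and the printed URL is only an example of the format produced by `_get_eval_project_url`:

    # Sketch only: _prepare_eval_run wraps the factory, resolves the project name,
    # creates the tracing project up front (failing fast if the name is taken),
    # prints the evaluation URL, and returns the dataset plus its examples.
    from langsmith import Client

    client = Client()
    chain_factory, project_name, dataset, examples = _prepare_eval_run(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,  # hypothetical chain constructor
        project_name=None,  # None -> auto-generated "<timestamp>-<model name>"
    )
    # Example output:
    #   View the evaluation results for project '2023-07-17-15-04-05-LLMChain' at:
    #   https://smith.langchain.com/projects/p/<project-id>?eval=true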
@@ -1110,11 +1146,90 @@ async def arun_on_dataset(
     Returns:
         A dictionary containing the run's project name and the
         resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+    For the synchronous version, see :func:`run_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, arun_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """ # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = await _arun_on_examples(
         client,
         examples,
@@ -1174,11 +1289,91 @@ def run_on_dataset(
 
     Returns:
         A dictionary containing the run's project name and the resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+
+    For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, run_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """ # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = _run_on_examples(
         client,
         examples,
@@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
         {"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
     ]
 
-    def mock_create_project(*args: Any, **kwargs: Any) -> None:
-        pass
+    def mock_create_project(*args: Any, **kwargs: Any) -> Any:
+        proj = mock.MagicMock()
+        proj.id = "123"
+        return proj
 
     with mock.patch.object(
         Client, "read_dataset", new=mock_read_dataset
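The mock change above follows from `_prepare_eval_run`: the helper reads `project.id` off whatever `client.create_project()` returns, so a stub that returns None would break when the evaluation URL is built. A tiny illustration of why the MagicMock with a fixed id keeps the test deterministic:

    from unittest import mock

    proj = mock.MagicMock()
    proj.id = "123"
    # _get_eval_project_url interpolates this id into the printed link:
    print(f"/projects/p/{proj.id}?eval=true")  # -> /projects/p/123?eval=true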