Compare commits

...

2 Commits

Author               SHA1        Message                                    Date
William Fu-Hinthorn  4ecbb3aeac  Delete deprecated run evaluator loaders    2023-07-10 10:52:19 -07:00
William Fu-Hinthorn  4d50092103  Switch to langsmith                        2023-07-10 10:49:37 -07:00
17 changed files with 1295 additions and 1096 deletions

View File

@@ -4,7 +4,7 @@ from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import Any, Optional, Sequence, Set, Union
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
from langsmith import Client, RunEvaluator
from langchain.callbacks.manager import tracing_v2_enabled
from langchain.callbacks.tracers.base import BaseTracer
@@ -23,8 +23,8 @@ class EvaluatorCallbackHandler(BaseTracer):
max_workers : int, optional
The maximum number of worker threads to use for running the evaluators.
If not specified, it will default to the number of evaluators.
client : LangChainPlusClient, optional
The LangChainPlusClient instance to use for evaluating the runs.
client : LangSmith Client, optional
The LangSmith client instance to use for evaluating the runs.
If not specified, a new instance will be created.
example_id : Union[UUID, str], optional
The example ID to be associated with the runs.
@@ -35,8 +35,8 @@ class EvaluatorCallbackHandler(BaseTracer):
----------
example_id : Union[UUID, None]
The example ID associated with the runs.
client : LangChainPlusClient
The LangChainPlusClient instance used for evaluating the runs.
client : Client
The LangSmith client instance used for evaluating the runs.
evaluators : Sequence[RunEvaluator]
The sequence of run evaluators to be executed.
executor : ThreadPoolExecutor
@@ -56,7 +56,7 @@ class EvaluatorCallbackHandler(BaseTracer):
self,
evaluators: Sequence[RunEvaluator],
max_workers: Optional[int] = None,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
example_id: Optional[Union[UUID, str]] = None,
skip_unfinished: bool = True,
project_name: Optional[str] = None,
@@ -66,7 +66,7 @@ class EvaluatorCallbackHandler(BaseTracer):
self.example_id = (
UUID(example_id) if isinstance(example_id, str) else example_id
)
self.client = client or LangChainPlusClient()
self.client = client or Client()
self.evaluators = evaluators
self.executor = ThreadPoolExecutor(
max_workers=max(max_workers or len(evaluators), 1)
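
For illustration only, a minimal sketch of constructing the handler against the renamed client (the module path, the empty evaluator list, and the project name are assumptions, not part of this diff):

from langsmith import Client

# Assumed module path; this compare view does not show file names.
from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler

# After this change the handler accepts, and defaults to, a langsmith Client.
handler = EvaluatorCallbackHandler(
    evaluators=[],                   # placeholder: supply RunEvaluator instances
    client=Client(),                 # equivalent to omitting `client`
    project_name="my-eval-project",  # hypothetical project name
)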

View File

@@ -8,7 +8,7 @@ from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Union
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient
from langsmith import Client
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.schemas import Run, RunTypeEnum, TracerSession
@@ -44,7 +44,7 @@ class LangChainTracer(BaseTracer):
self,
example_id: Optional[Union[UUID, str]] = None,
project_name: Optional[str] = None,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
tags: Optional[List[str]] = None,
**kwargs: Any,
) -> None:
@@ -59,7 +59,7 @@ class LangChainTracer(BaseTracer):
)
# set max_workers to 1 to process tasks in order
self.executor = ThreadPoolExecutor(max_workers=1)
self.client = client or LangChainPlusClient()
self.client = client or Client()
self._futures: Set[Future] = set()
self.tags = tags or []
global _TRACERS
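
A hedged usage sketch under the new import (module path, project name, and tags are placeholders not shown in this diff):

from langsmith import Client

# Assumed module path; not visible in this compare view.
from langchain.callbacks.tracers.langchain import LangChainTracer

# The tracer now takes an optional langsmith Client and falls back to
# Client() when none is given, as in the hunk above.
tracer = LangChainTracer(
    project_name="my-project",     # hypothetical project
    client=Client(),
    tags=["langsmith-migration"],  # illustrative tag
)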

View File

@@ -5,8 +5,8 @@ import datetime
from typing import Any, Dict, List, Optional
from uuid import UUID
from langchainplus_sdk.schemas import RunBase as BaseRunV2
from langchainplus_sdk.schemas import RunTypeEnum
from langsmith.schemas import RunBase as BaseRunV2
from langsmith.schemas import RunTypeEnum
from pydantic import BaseModel, Field, root_validator
from langchain.schema import LLMResult

View File

@@ -18,8 +18,8 @@ from typing import (
Union,
)
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
from langchainplus_sdk.schemas import Example
from langsmith import Client, RunEvaluator
from langsmith.schemas import Example
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import Callbacks
@@ -306,7 +306,7 @@ async def _gather_with_concurrency(
async def _callbacks_initializer(
project_name: Optional[str],
client: LangChainPlusClient,
client: Client,
run_evaluators: Sequence[RunEvaluator],
evaluation_handler_collector: List[EvaluatorCallbackHandler],
) -> List[BaseTracer]:
@@ -348,7 +348,7 @@ async def arun_on_examples(
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
@@ -369,7 +369,7 @@ async def arun_on_examples(
project_name: Project name to use when tracing runs.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to read the dataset. If not provided, a new
client: LangSmith client to use to read the dataset. If not provided, a new
client will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
@@ -383,7 +383,7 @@ async def arun_on_examples(
A dictionary mapping example ids to the model outputs.
"""
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
client_ = client or LangChainPlusClient()
client_ = client or Client()
client_.create_project(project_name)
results: Dict[str, List[Any]] = {}
@@ -548,7 +548,7 @@ def run_on_examples(
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
@@ -568,7 +568,7 @@ def run_on_examples(
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to access the dataset. If None, a new client
client: LangSmith client to use to access the dataset. If None, a new client
will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
@@ -583,7 +583,7 @@ def run_on_examples(
"""
results: Dict[str, Any] = {}
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
client_ = client or LangChainPlusClient()
client_ = client or Client()
client_.create_project(project_name)
tracer = LangChainTracer(project_name=project_name)
evaluator_project_name = f"{project_name}-evaluators"
@@ -645,7 +645,7 @@ async def arun_on_dataset(
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
@@ -666,8 +666,8 @@ async def arun_on_dataset(
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to read the dataset. If not provided,
a new client will be created using the credentials in the environment.
client: LangSmith client to use to read the dataset. If not provided, a new
client will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map to the inputs dictionary from an Example
@@ -678,7 +678,7 @@ async def arun_on_dataset(
Returns:
A dictionary containing the run's project name and the resulting model outputs.
"""
client_ = client or LangChainPlusClient()
client_ = client or Client()
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
dataset = client_.read_dataset(dataset_name=dataset_name)
examples = client_.list_examples(dataset_id=str(dataset.id))
@@ -707,7 +707,7 @@ def run_on_dataset(
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
client: Optional[Client] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
@@ -727,8 +727,8 @@ def run_on_dataset(
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
client: Client to use to access the dataset. If None,
a new client will be created using the credentials in the environment.
client: LangSmith client to use to access the dataset. If None, a new client
will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
input_mapper: A function to map to the inputs dictionary from an Example
@@ -740,7 +740,7 @@ def run_on_dataset(
Returns:
A dictionary containing the run's project name and the resulting model outputs.
"""
client_ = client or LangChainPlusClient()
client_ = client or Client()
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
dataset = client_.read_dataset(dataset_name=dataset_name)
examples = client_.list_examples(dataset_id=str(dataset.id))
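
As a hedged sketch of calling the updated entry point (the dataset name and the fake model are placeholders; only the change of `client` to a langsmith Client comes from this diff):

from langsmith import Client

from langchain.client.runner_utils import run_on_dataset
from langchain.llms.fake import FakeListLLM  # stand-in model for the sketch

# Hypothetical call after the rename: `client` is now a langsmith.Client;
# the dataset is assumed to already exist in LangSmith.
client = Client()  # credentials are read from the environment
results = run_on_dataset(
    "my-dataset",
    FakeListLLM(responses=["ok"]),
    client=client,
    project_name="my-eval-run",
)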

View File

@@ -4,13 +4,6 @@ from langchain.evaluation.run_evaluators.base import (
RunEvaluatorInputMapper,
RunEvaluatorOutputParser,
)
from langchain.evaluation.run_evaluators.implementations import (
ChoicesOutputParser,
StringRunEvaluatorInputMapper,
get_criteria_evaluator,
get_qa_evaluator,
get_trajectory_evaluator,
)
from langchain.evaluation.run_evaluators.loading import (
load_run_evaluator_for_model,
load_run_evaluators_for_model,

View File

@@ -3,8 +3,8 @@ from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional
from langchainplus_sdk import EvaluationResult, RunEvaluator
from langchainplus_sdk.schemas import Example, Run
from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
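
For context, a minimal sketch of a custom evaluator written directly against the renamed langsmith interfaces (the class name and feedback key are illustrative, not part of this change):

from typing import Optional

from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

class ExactMatchEvaluator(RunEvaluator):
    """Toy evaluator: scores 1 when the run output matches the reference."""

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        prediction = (run.outputs or {}).get("output")
        reference = (example.outputs or {}).get("output") if example else None
        return EvaluationResult(key="exact_match", score=int(prediction == reference))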

View File

@@ -1,306 +0,0 @@
from typing import Any, Dict, Mapping, Optional, Sequence, Union
from langchainplus_sdk.evaluation import EvaluationResult
from langchainplus_sdk.schemas import Example, Run, RunTypeEnum
from pydantic import BaseModel, Field
from langchain.chat_models.base import BaseChatModel
from langchain.evaluation.agents.trajectory_eval_chain import (
TrajectoryEvalChain,
TrajectoryOutputParser,
)
from langchain.evaluation.criteria.eval_chain import (
CriteriaEvalChain,
CriteriaResultOutputParser,
)
from langchain.evaluation.qa.eval_chain import QAEvalChain
from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
from langchain.evaluation.run_evaluators.base import (
RunEvaluatorChain,
RunEvaluatorInputMapper,
RunEvaluatorOutputParser,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
from langchain.tools.base import BaseTool
_QA_PROMPTS = {
"qa": QA_DEFAULT_PROMPT,
"sql": SQL_PROMPT,
}
class StringRunEvaluatorInputMapper(RunEvaluatorInputMapper, BaseModel):
"""Maps the Run and Optional[Example] to a dictionary."""
prediction_map: Dict[str, str]
"""Map from run outputs to the evaluation inputs."""
input_map: Dict[str, str]
"""Map from run inputs to the evaluation inputs."""
answer_map: Optional[Dict[str, str]] = None
"""Map from example outputs to the evaluation inputs."""
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
"""Maps the Run and Optional[Example] to a dictionary"""
if run.outputs is None and self.prediction_map:
raise ValueError(f"Run {run.id} has no outputs.")
if self.answer_map and (not example or not example.outputs):
raise ValueError("This evaluator requires references, but none were given.")
outputs = run.outputs or {}
data = {value: outputs[key] for key, value in self.prediction_map.items()}
data.update({value: run.inputs[key] for key, value in self.input_map.items()})
if self.answer_map and example and example.outputs:
data.update(
{value: example.outputs[key] for key, value in self.answer_map.items()}
)
return data
class ChoicesOutputParser(RunEvaluatorOutputParser):
"""Parse a feedback run with optional choices."""
evaluation_name: str
choices_map: Optional[Dict[str, int]] = None
@property
def _type(self) -> str:
return "choices_run_eval"
def parse(self, text: str) -> EvaluationResult:
"""Parse the last line of the text and return an evaluation result."""
lines = text.strip().split()
value = lines[-1].strip()
score = self.choices_map.get(value) if self.choices_map else None
comment = " ".join(lines[:-1]) if len(lines) > 1 else None
return EvaluationResult(
key=self.evaluation_name,
score=score,
value=value,
comment=comment,
)
def get_qa_evaluator(
llm: BaseLanguageModel,
*,
prompt: Union[PromptTemplate, str] = QA_DEFAULT_PROMPT,
input_key: str = "input",
prediction_key: str = "output",
answer_key: str = "output",
evaluation_name: Optional[str] = None,
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain that compares response against ground truth."""
if isinstance(prompt, str):
prompt = _QA_PROMPTS[prompt]
eval_chain = QAEvalChain.from_llm(llm=llm, prompt=prompt, **kwargs)
input_mapper = kwargs.pop(
"input_mapper",
StringRunEvaluatorInputMapper(
input_map={input_key: "query"},
prediction_map={prediction_key: "result"},
answer_map={answer_key: "answer"},
),
)
evaluation_name = evaluation_name or "Correctness"
output_parser = kwargs.pop(
"output_parser",
ChoicesOutputParser(
evaluation_name=evaluation_name,
choices_map={"CORRECT": 1, "INCORRECT": 0},
),
)
tags = kwargs.pop("tags", [])
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=output_parser,
tags=tags + [evaluation_name],
**kwargs,
)
class CriteriaOutputParser(RunEvaluatorOutputParser):
"""Parse a criteria results into an evaluation result."""
evaluation_name: str
@property
def _type(self) -> str:
return "criteria"
def parse(self, parsed_output: Union[str, dict]) -> EvaluationResult:
"""Parse the last line of the text and return an evaluation result."""
if isinstance(parsed_output, str):
parsed_output_ = CriteriaResultOutputParser().parse(parsed_output)
else:
parsed_output_ = parsed_output
return EvaluationResult(
key=self.evaluation_name,
score=parsed_output_["score"],
value=parsed_output_["value"],
comment=parsed_output_["reasoning"],
)
def get_criteria_evaluator(
llm: BaseLanguageModel,
criteria: Union[Mapping[str, str], Sequence[str], str],
*,
input_key: str = "input",
prediction_key: str = "output",
prompt: Optional[BasePromptTemplate] = None,
evaluation_name: Optional[str] = None,
requires_reference: bool = False,
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain for grading a model's response against a map of criteria."""
input_mapper = kwargs.pop(
"input_mapper",
StringRunEvaluatorInputMapper(
input_map={input_key: "input"},
prediction_map={prediction_key: "output"},
),
)
criteria_ = CriteriaEvalChain.resolve_criteria(criteria)
evaluation_name = evaluation_name or " ".join(criteria_.keys())
parser = kwargs.pop(
"output_parser",
CriteriaOutputParser(
choices_map={"Y": 1, "N": 0}, evaluation_name=evaluation_name
),
)
tags = kwargs.pop("tags", [])
eval_chain = CriteriaEvalChain.from_llm(
llm=llm,
criteria=criteria_,
prompt=prompt,
requires_reference=requires_reference,
**kwargs,
)
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
tags=tags + [evaluation_name],
**kwargs,
)
class TrajectoryRunEvalOutputParser(RunEvaluatorOutputParser, TrajectoryOutputParser):
evaluation_name: str = "Agent Trajectory"
"""The name assigned to the evaluation feedback."""
evaluator_info: dict = Field(default_factory=dict)
"""Additional information to log as feedback metadata."""
@property
def _type(self) -> str:
return "agent_trajectory_run_eval"
def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
"""Parse the output of a run."""
return EvaluationResult(
key=self.evaluation_name,
score=int(output["score"]),
comment=output["reasoning"],
evaluator_info=self.evaluator_info,
)
class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
"""Maps the Run and Optional[Example] to a dictionary."""
agent_input_key: str = "input"
"""The key to load from the agent executor's run input dictionary."""
agent_output_key: str = "output"
"""The key to load from the agent executor's run output dictionary."""
tool_input_key: str = "input"
"""The key to load from the tool executor's run input dictionary."""
tool_output_key: str = "output"
"""The key to load from the tool executor's run output dictionary."""
reference_output_key: Optional[str] = None
"""The key to use for selecting the reference answer."""
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
"""Maps the Run and Optional[Example] to a dictionary"""
if run.child_runs is None:
raise ValueError("Run must have child runs to be evaluated.")
if run.outputs is None:
raise ValueError("Run must have outputs to be evaluated.")
reference = ""
if example is not None and example.outputs:
if self.reference_output_key is not None:
reference = example.outputs[self.reference_output_key]
elif "output" in example.outputs:
reference = example.outputs["output"]
elif len(example.outputs) == 1:
reference = next(iter(example.outputs.values()))
else:
raise ValueError("Could not infer the reference answer from ")
question = run.inputs[self.agent_input_key]
tool_runs = [
run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool
]
agent_steps = []
for i, run_ in enumerate(tool_runs, 1):
tool_output = (
f"Tool output: {run_.outputs.get(self.tool_output_key, run_.outputs)}"
if run_.outputs
else (f"Tool error: {run_.error}" if run_.error else "No output")
)
agent_steps.append(
f"""Step {i}:
Tool used: {run_.name}
Tool input: {run_.inputs.get(self.tool_input_key, run_.inputs)}
Tool output: {tool_output}"""
)
return {
"question": question,
"agent_trajectory": "\n\n".join(agent_steps),
"answer": run.outputs[self.agent_output_key],
"reference": reference,
}
def get_trajectory_evaluator(
llm: BaseChatModel,
agent_tools: Sequence[BaseTool],
*,
input_key: str = "input",
prediction_key: str = "output",
tool_input_key: str = "input",
tool_output_key: str = "output",
reference_output_key: Optional[str] = None,
evaluation_name: str = "Agent Trajectory",
**kwargs: Any,
) -> RunEvaluatorChain:
"""Get an eval chain for grading a model's response against a map of criteria."""
input_mapper = kwargs.pop(
"input_mapper",
TrajectoryInputMapper(
agent_input_key=input_key,
agent_output_key=prediction_key,
tool_input_key=tool_input_key,
tool_output_key=tool_output_key,
reference_output_key=reference_output_key,
),
)
parser = kwargs.pop(
"output_parser",
TrajectoryRunEvalOutputParser(evaluation_name=evaluation_name),
)
eval_chain = TrajectoryEvalChain.from_llm(
llm=llm, agent_tools=agent_tools, return_reasoning=True, **kwargs
)
tags = kwargs.pop("tags", [])
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
tags=tags + [evaluation_name],
**kwargs,
)

View File

@@ -1,7 +1,7 @@
""""Loading helpers for run evaluators."""
from typing import Any, List, Optional, Sequence, Union
from langchainplus_sdk import RunEvaluator
from langsmith import RunEvaluator
from langchain.base_language import BaseLanguageModel
from langchain.chains.base import Chain

View File

@@ -4,8 +4,8 @@ from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Union
from langchainplus_sdk import EvaluationResult, RunEvaluator
from langchainplus_sdk.schemas import Example, Run
from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import (

File diff suppressed because it is too large.

View File

@@ -2,7 +2,7 @@
import subprocess
from pathlib import Path
from langchainplus_sdk.cli.main import get_docker_compose_command
from langsmith.cli.main import get_docker_compose_command
def main() -> None:

poetry.lock (generated): 588 lines changed

File diff suppressed because it is too large.

View File

@@ -108,7 +108,6 @@ pyspark = {version = "^3.4.0", optional = true}
clarifai = {version = ">=9.1.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true}
langchainplus-sdk = "^0.0.20"
awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
esprima = {version = "^4.0.1", optional = true}
@@ -118,6 +117,7 @@ psychicapi = {version = "^0.8.0", optional = true}
cassio = {version = "^0.0.7", optional = true}
rdflib = {version = "^6.3.2", optional = true}
rapidfuzz = {version = "^3.1.1", optional = true}
langsmith = "^0.0.2"
[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
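
For downstream code that has to run on either side of this dependency swap, a purely illustrative import shim (not part of this change):

# Hypothetical compatibility shim during the langchainplus-sdk -> langsmith transition.
try:
    from langsmith import Client
except ImportError:  # older environments that still ship langchainplus-sdk
    from langchainplus_sdk import LangChainPlusClient as Client

client = Client()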

View File

@@ -3,7 +3,7 @@ from typing import Iterator
from uuid import uuid4
import pytest
from langchainplus_sdk import LangChainPlusClient as Client
from langsmith import Client as Client
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI

View File

@@ -5,8 +5,8 @@ from typing import Any, Dict, List, Optional, Union
from unittest import mock
import pytest
from langchainplus_sdk.client import LangChainPlusClient
from langchainplus_sdk.schemas import Dataset, Example
from langsmith.client import Client
from langsmith.schemas import Dataset, Example
from langchain.chains.base import Chain
from langchain.chains.transform import TransformChain
@@ -235,15 +235,13 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
pass
with mock.patch.object(
LangChainPlusClient, "read_dataset", new=mock_read_dataset
), mock.patch.object(
LangChainPlusClient, "list_examples", new=mock_list_examples
), mock.patch(
Client, "read_dataset", new=mock_read_dataset
), mock.patch.object(Client, "list_examples", new=mock_list_examples), mock.patch(
"langchain.client.runner_utils._arun_llm_or_chain", new=mock_arun_chain
), mock.patch.object(
LangChainPlusClient, "create_project", new=mock_create_project
Client, "create_project", new=mock_create_project
):
client = LangChainPlusClient(api_url="http://localhost:1984", api_key="123")
client = Client(api_url="http://localhost:1984", api_key="123")
chain = mock.MagicMock()
num_repetitions = 3
results = await arun_on_dataset(

View File

@@ -3,7 +3,7 @@
from uuid import UUID
import pytest
from langchainplus_sdk.schemas import Example, Run
from langsmith.schemas import Example, Run
from langchain.evaluation.run_evaluators import get_criteria_evaluator, get_qa_evaluator
from tests.unit_tests.llms.fake_llm import FakeLLM

View File

@@ -38,7 +38,7 @@ def test_required_dependencies(poetry_conf: Mapping[str, Any]) -> None:
"aiohttp",
"async-timeout",
"dataclasses-json",
"langchainplus-sdk",
"langsmith",
"numexpr",
"numpy",
"openapi-schema-pydantic",