Mirror of https://github.com/hwchase17/langchain.git (synced 2026-01-23 13:19:22 +00:00)

Compare commits (2 commits): sr/async-i...wfh/delete
| Author | SHA1 | Date |
|---|---|---|
| | 4ecbb3aeac | |
| | 4d50092103 | |
@@ -4,7 +4,7 @@ from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import Any, Optional, Sequence, Set, Union
from uuid import UUID

-from langchainplus_sdk import LangChainPlusClient, RunEvaluator
+from langsmith import Client, RunEvaluator

from langchain.callbacks.manager import tracing_v2_enabled
from langchain.callbacks.tracers.base import BaseTracer
@@ -23,8 +23,8 @@ class EvaluatorCallbackHandler(BaseTracer):
    max_workers : int, optional
        The maximum number of worker threads to use for running the evaluators.
        If not specified, it will default to the number of evaluators.
-    client : LangChainPlusClient, optional
-        The LangChainPlusClient instance to use for evaluating the runs.
+    client : LangSmith Client, optional
+        The LangSmith client instance to use for evaluating the runs.
        If not specified, a new instance will be created.
    example_id : Union[UUID, str], optional
        The example ID to be associated with the runs.
@@ -35,8 +35,8 @@ class EvaluatorCallbackHandler(BaseTracer):
    ----------
    example_id : Union[UUID, None]
        The example ID associated with the runs.
-    client : LangChainPlusClient
-        The LangChainPlusClient instance used for evaluating the runs.
+    client : Client
+        The LangSmith client instance used for evaluating the runs.
    evaluators : Sequence[RunEvaluator]
        The sequence of run evaluators to be executed.
    executor : ThreadPoolExecutor
@@ -56,7 +56,7 @@ class EvaluatorCallbackHandler(BaseTracer):
        self,
        evaluators: Sequence[RunEvaluator],
        max_workers: Optional[int] = None,
-        client: Optional[LangChainPlusClient] = None,
+        client: Optional[Client] = None,
        example_id: Optional[Union[UUID, str]] = None,
        skip_unfinished: bool = True,
        project_name: Optional[str] = None,
@@ -66,7 +66,7 @@ class EvaluatorCallbackHandler(BaseTracer):
        self.example_id = (
            UUID(example_id) if isinstance(example_id, str) else example_id
        )
-        self.client = client or LangChainPlusClient()
+        self.client = client or Client()
        self.evaluators = evaluators
        self.executor = ThreadPoolExecutor(
            max_workers=max(max_workers or len(evaluators), 1)
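Illustrative sketch (not part of the diff): for code that builds this handler directly, the migration is only an import and type swap. The evaluator below is a toy stand-in, and the EvaluatorCallbackHandler import path is assumed from the class shown in the hunks above.

from typing import Optional

from langsmith import Client, EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler  # assumed module path


class AlwaysPassEvaluator(RunEvaluator):
    """Toy evaluator so the sketch is self-contained."""

    def evaluate_run(self, run: Run, example: Optional[Example] = None) -> EvaluationResult:
        return EvaluationResult(key="always_pass", score=1)


handler = EvaluatorCallbackHandler(
    evaluators=[AlwaysPassEvaluator()],
    client=Client(),  # was LangChainPlusClient(); credentials are read from the environment
)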
@@ -8,7 +8,7 @@ from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Union
from uuid import UUID

-from langchainplus_sdk import LangChainPlusClient
+from langsmith import Client

from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.schemas import Run, RunTypeEnum, TracerSession
@@ -44,7 +44,7 @@ class LangChainTracer(BaseTracer):
        self,
        example_id: Optional[Union[UUID, str]] = None,
        project_name: Optional[str] = None,
-        client: Optional[LangChainPlusClient] = None,
+        client: Optional[Client] = None,
        tags: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> None:
@@ -59,7 +59,7 @@ class LangChainTracer(BaseTracer):
        )
        # set max_workers to 1 to process tasks in order
        self.executor = ThreadPoolExecutor(max_workers=1)
-        self.client = client or LangChainPlusClient()
+        self.client = client or Client()
        self._futures: Set[Future] = set()
        self.tags = tags or []
        global _TRACERS
@@ -5,8 +5,8 @@ import datetime
from typing import Any, Dict, List, Optional
from uuid import UUID

-from langchainplus_sdk.schemas import RunBase as BaseRunV2
-from langchainplus_sdk.schemas import RunTypeEnum
+from langsmith.schemas import RunBase as BaseRunV2
+from langsmith.schemas import RunTypeEnum
from pydantic import BaseModel, Field, root_validator

from langchain.schema import LLMResult
@@ -18,8 +18,8 @@ from typing import (
    Union,
)

-from langchainplus_sdk import LangChainPlusClient, RunEvaluator
-from langchainplus_sdk.schemas import Example
+from langsmith import Client, RunEvaluator
+from langsmith.schemas import Example

from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import Callbacks
@@ -306,7 +306,7 @@ async def _gather_with_concurrency(

async def _callbacks_initializer(
    project_name: Optional[str],
-    client: LangChainPlusClient,
+    client: Client,
    run_evaluators: Sequence[RunEvaluator],
    evaluation_handler_collector: List[EvaluatorCallbackHandler],
) -> List[BaseTracer]:
@@ -348,7 +348,7 @@ async def arun_on_examples(
    num_repetitions: int = 1,
    project_name: Optional[str] = None,
    verbose: bool = False,
-    client: Optional[LangChainPlusClient] = None,
+    client: Optional[Client] = None,
    tags: Optional[List[str]] = None,
    run_evaluators: Optional[Sequence[RunEvaluator]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
@@ -369,7 +369,7 @@ async def arun_on_examples(
        project_name: Project name to use when tracing runs.
            Defaults to {dataset_name}-{chain class name}-{datetime}.
        verbose: Whether to print progress.
-        client: Client to use to read the dataset. If not provided, a new
+        client: LangSmith client to use to read the dataset. If not provided, a new
            client will be created using the credentials in the environment.
        tags: Tags to add to each run in the project.
        run_evaluators: Evaluators to run on the results of the chain.
@@ -383,7 +383,7 @@ async def arun_on_examples(
        A dictionary mapping example ids to the model outputs.
    """
    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
-    client_ = client or LangChainPlusClient()
+    client_ = client or Client()
    client_.create_project(project_name)

    results: Dict[str, List[Any]] = {}
@@ -548,7 +548,7 @@ def run_on_examples(
    num_repetitions: int = 1,
    project_name: Optional[str] = None,
    verbose: bool = False,
-    client: Optional[LangChainPlusClient] = None,
+    client: Optional[Client] = None,
    tags: Optional[List[str]] = None,
    run_evaluators: Optional[Sequence[RunEvaluator]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
@@ -568,7 +568,7 @@ def run_on_examples(
        project_name: Name of the project to store the traces in.
            Defaults to {dataset_name}-{chain class name}-{datetime}.
        verbose: Whether to print progress.
-        client: Client to use to access the dataset. If None, a new client
+        client: LangSmith client to use to access the dataset. If None, a new client
            will be created using the credentials in the environment.
        tags: Tags to add to each run in the project.
        run_evaluators: Evaluators to run on the results of the chain.
@@ -583,7 +583,7 @@ def run_on_examples(
    """
    results: Dict[str, Any] = {}
    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
-    client_ = client or LangChainPlusClient()
+    client_ = client or Client()
    client_.create_project(project_name)
    tracer = LangChainTracer(project_name=project_name)
    evaluator_project_name = f"{project_name}-evaluators"
@@ -645,7 +645,7 @@ async def arun_on_dataset(
    num_repetitions: int = 1,
    project_name: Optional[str] = None,
    verbose: bool = False,
-    client: Optional[LangChainPlusClient] = None,
+    client: Optional[Client] = None,
    tags: Optional[List[str]] = None,
    run_evaluators: Optional[Sequence[RunEvaluator]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
@@ -666,8 +666,8 @@ async def arun_on_dataset(
        project_name: Name of the project to store the traces in.
            Defaults to {dataset_name}-{chain class name}-{datetime}.
        verbose: Whether to print progress.
-        client: Client to use to read the dataset. If not provided,
-            a new client will be created using the credentials in the environment.
+        client: LangSmith client to use to read the dataset. If not provided, a new
+            client will be created using the credentials in the environment.
        tags: Tags to add to each run in the project.
        run_evaluators: Evaluators to run on the results of the chain.
        input_mapper: A function to map to the inputs dictionary from an Example
@@ -678,7 +678,7 @@ async def arun_on_dataset(
    Returns:
        A dictionary containing the run's project name and the resulting model outputs.
    """
-    client_ = client or LangChainPlusClient()
+    client_ = client or Client()
    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
    dataset = client_.read_dataset(dataset_name=dataset_name)
    examples = client_.list_examples(dataset_id=str(dataset.id))
@@ -707,7 +707,7 @@ def run_on_dataset(
    num_repetitions: int = 1,
    project_name: Optional[str] = None,
    verbose: bool = False,
-    client: Optional[LangChainPlusClient] = None,
+    client: Optional[Client] = None,
    tags: Optional[List[str]] = None,
    run_evaluators: Optional[Sequence[RunEvaluator]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
@@ -727,8 +727,8 @@ def run_on_dataset(
        project_name: Name of the project to store the traces in.
            Defaults to {dataset_name}-{chain class name}-{datetime}.
        verbose: Whether to print progress.
-        client: Client to use to access the dataset. If None,
-            a new client will be created using the credentials in the environment.
+        client: LangSmith client to use to access the dataset. If None, a new client
+            will be created using the credentials in the environment.
        tags: Tags to add to each run in the project.
        run_evaluators: Evaluators to run on the results of the chain.
        input_mapper: A function to map to the inputs dictionary from an Example
@@ -740,7 +740,7 @@ def run_on_dataset(
    Returns:
        A dictionary containing the run's project name and the resulting model outputs.
    """
-    client_ = client or LangChainPlusClient()
+    client_ = client or Client()
    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
    dataset = client_.read_dataset(dataset_name=dataset_name)
    examples = client_.list_examples(dataset_id=str(dataset.id))
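Illustrative usage sketch, not taken from the diff: callers of run_on_dataset only see the optional client argument change type from LangChainPlusClient to langsmith.Client. The dataset name here is a placeholder, the top-level import path is assumed, and actually running this requires LangSmith and OpenAI credentials in the environment.

from langsmith import Client

from langchain.chat_models import ChatOpenAI
from langchain.client import run_on_dataset  # assumed re-export of runner_utils.run_on_dataset

client = Client()  # replaces LangChainPlusClient(); reads credentials from the environment

results = run_on_dataset(
    dataset_name="my-eval-dataset",  # hypothetical dataset already uploaded to LangSmith
    llm_or_chain_factory=ChatOpenAI(temperature=0),  # a chain factory callable also works
    client=client,  # optional: a Client() is created when omitted
    verbose=True,
)
# Per the docstring above, the result holds the tracing project name and the model outputs.
print(results)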
@@ -4,13 +4,6 @@ from langchain.evaluation.run_evaluators.base import (
    RunEvaluatorInputMapper,
    RunEvaluatorOutputParser,
)
-from langchain.evaluation.run_evaluators.implementations import (
-    ChoicesOutputParser,
-    StringRunEvaluatorInputMapper,
-    get_criteria_evaluator,
-    get_qa_evaluator,
-    get_trajectory_evaluator,
-)
from langchain.evaluation.run_evaluators.loading import (
    load_run_evaluator_for_model,
    load_run_evaluators_for_model,
@@ -3,8 +3,8 @@ from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional

-from langchainplus_sdk import EvaluationResult, RunEvaluator
-from langchainplus_sdk.schemas import Example, Run
+from langsmith import EvaluationResult, RunEvaluator
+from langsmith.schemas import Example, Run

from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
@@ -1,306 +0,0 @@
from typing import Any, Dict, Mapping, Optional, Sequence, Union

from langchainplus_sdk.evaluation import EvaluationResult
from langchainplus_sdk.schemas import Example, Run, RunTypeEnum
from pydantic import BaseModel, Field

from langchain.chat_models.base import BaseChatModel
from langchain.evaluation.agents.trajectory_eval_chain import (
    TrajectoryEvalChain,
    TrajectoryOutputParser,
)
from langchain.evaluation.criteria.eval_chain import (
    CriteriaEvalChain,
    CriteriaResultOutputParser,
)
from langchain.evaluation.qa.eval_chain import QAEvalChain
from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
from langchain.evaluation.run_evaluators.base import (
    RunEvaluatorChain,
    RunEvaluatorInputMapper,
    RunEvaluatorOutputParser,
)
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
from langchain.tools.base import BaseTool

_QA_PROMPTS = {
    "qa": QA_DEFAULT_PROMPT,
    "sql": SQL_PROMPT,
}


class StringRunEvaluatorInputMapper(RunEvaluatorInputMapper, BaseModel):
    """Maps the Run and Optional[Example] to a dictionary."""

    prediction_map: Dict[str, str]
    """Map from run outputs to the evaluation inputs."""
    input_map: Dict[str, str]
    """Map from run inputs to the evaluation inputs."""
    answer_map: Optional[Dict[str, str]] = None
    """Map from example outputs to the evaluation inputs."""

    def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
        """Maps the Run and Optional[Example] to a dictionary"""
        if run.outputs is None and self.prediction_map:
            raise ValueError(f"Run {run.id} has no outputs.")
        if self.answer_map and (not example or not example.outputs):
            raise ValueError("This evaluator requires references, but none were given.")
        outputs = run.outputs or {}
        data = {value: outputs[key] for key, value in self.prediction_map.items()}
        data.update({value: run.inputs[key] for key, value in self.input_map.items()})
        if self.answer_map and example and example.outputs:
            data.update(
                {value: example.outputs[key] for key, value in self.answer_map.items()}
            )
        return data


class ChoicesOutputParser(RunEvaluatorOutputParser):
    """Parse a feedback run with optional choices."""

    evaluation_name: str
    choices_map: Optional[Dict[str, int]] = None

    @property
    def _type(self) -> str:
        return "choices_run_eval"

    def parse(self, text: str) -> EvaluationResult:
        """Parse the last line of the text and return an evaluation result."""
        lines = text.strip().split()
        value = lines[-1].strip()
        score = self.choices_map.get(value) if self.choices_map else None
        comment = " ".join(lines[:-1]) if len(lines) > 1 else None
        return EvaluationResult(
            key=self.evaluation_name,
            score=score,
            value=value,
            comment=comment,
        )


def get_qa_evaluator(
    llm: BaseLanguageModel,
    *,
    prompt: Union[PromptTemplate, str] = QA_DEFAULT_PROMPT,
    input_key: str = "input",
    prediction_key: str = "output",
    answer_key: str = "output",
    evaluation_name: Optional[str] = None,
    **kwargs: Any,
) -> RunEvaluatorChain:
    """Get an eval chain that compares response against ground truth."""
    if isinstance(prompt, str):
        prompt = _QA_PROMPTS[prompt]
    eval_chain = QAEvalChain.from_llm(llm=llm, prompt=prompt, **kwargs)
    input_mapper = kwargs.pop(
        "input_mapper",
        StringRunEvaluatorInputMapper(
            input_map={input_key: "query"},
            prediction_map={prediction_key: "result"},
            answer_map={answer_key: "answer"},
        ),
    )
    evaluation_name = evaluation_name or "Correctness"
    output_parser = kwargs.pop(
        "output_parser",
        ChoicesOutputParser(
            evaluation_name=evaluation_name,
            choices_map={"CORRECT": 1, "INCORRECT": 0},
        ),
    )
    tags = kwargs.pop("tags", [])
    return RunEvaluatorChain(
        eval_chain=eval_chain,
        input_mapper=input_mapper,
        output_parser=output_parser,
        tags=tags + [evaluation_name],
        **kwargs,
    )


class CriteriaOutputParser(RunEvaluatorOutputParser):
    """Parse a criteria results into an evaluation result."""

    evaluation_name: str

    @property
    def _type(self) -> str:
        return "criteria"

    def parse(self, parsed_output: Union[str, dict]) -> EvaluationResult:
        """Parse the last line of the text and return an evaluation result."""
        if isinstance(parsed_output, str):
            parsed_output_ = CriteriaResultOutputParser().parse(parsed_output)
        else:
            parsed_output_ = parsed_output
        return EvaluationResult(
            key=self.evaluation_name,
            score=parsed_output_["score"],
            value=parsed_output_["value"],
            comment=parsed_output_["reasoning"],
        )


def get_criteria_evaluator(
    llm: BaseLanguageModel,
    criteria: Union[Mapping[str, str], Sequence[str], str],
    *,
    input_key: str = "input",
    prediction_key: str = "output",
    prompt: Optional[BasePromptTemplate] = None,
    evaluation_name: Optional[str] = None,
    requires_reference: bool = False,
    **kwargs: Any,
) -> RunEvaluatorChain:
    """Get an eval chain for grading a model's response against a map of criteria."""
    input_mapper = kwargs.pop(
        "input_mapper",
        StringRunEvaluatorInputMapper(
            input_map={input_key: "input"},
            prediction_map={prediction_key: "output"},
        ),
    )
    criteria_ = CriteriaEvalChain.resolve_criteria(criteria)
    evaluation_name = evaluation_name or " ".join(criteria_.keys())
    parser = kwargs.pop(
        "output_parser",
        CriteriaOutputParser(
            choices_map={"Y": 1, "N": 0}, evaluation_name=evaluation_name
        ),
    )
    tags = kwargs.pop("tags", [])
    eval_chain = CriteriaEvalChain.from_llm(
        llm=llm,
        criteria=criteria_,
        prompt=prompt,
        requires_reference=requires_reference,
        **kwargs,
    )
    return RunEvaluatorChain(
        eval_chain=eval_chain,
        input_mapper=input_mapper,
        output_parser=parser,
        tags=tags + [evaluation_name],
        **kwargs,
    )


class TrajectoryRunEvalOutputParser(RunEvaluatorOutputParser, TrajectoryOutputParser):
    evaluation_name: str = "Agent Trajectory"
    """The name assigned to the evaluation feedback."""
    evaluator_info: dict = Field(default_factory=dict)
    """Additional information to log as feedback metadata."""

    @property
    def _type(self) -> str:
        return "agent_trajectory_run_eval"

    def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
        """Parse the output of a run."""
        return EvaluationResult(
            key=self.evaluation_name,
            score=int(output["score"]),
            comment=output["reasoning"],
            evaluator_info=self.evaluator_info,
        )


class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
    """Maps the Run and Optional[Example] to a dictionary."""

    agent_input_key: str = "input"
    """The key to load from the agent executor's run input dictionary."""
    agent_output_key: str = "output"
    """The key to load from the agent executor's run output dictionary."""
    tool_input_key: str = "input"
    """The key to load from the tool executor's run input dictionary."""
    tool_output_key: str = "output"
    """The key to load from the tool executor's run output dictionary."""
    reference_output_key: Optional[str] = None
    """The key to use for selecting the reference answer."""

    def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
        """Maps the Run and Optional[Example] to a dictionary"""
        if run.child_runs is None:
            raise ValueError("Run must have child runs to be evaluated.")
        if run.outputs is None:
            raise ValueError("Run must have outputs to be evaluated.")
        reference = ""
        if example is not None and example.outputs:
            if self.reference_output_key is not None:
                reference = example.outputs[self.reference_output_key]
            elif "output" in example.outputs:
                reference = example.outputs["output"]
            elif len(example.outputs) == 1:
                reference = next(iter(example.outputs.values()))
            else:
                raise ValueError("Could not infer the reference answer from ")

        question = run.inputs[self.agent_input_key]
        tool_runs = [
            run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool
        ]
        agent_steps = []
        for i, run_ in enumerate(tool_runs, 1):
            tool_output = (
                f"Tool output: {run_.outputs.get(self.tool_output_key, run_.outputs)}"
                if run_.outputs
                else (f"Tool error: {run_.error}" if run_.error else "No output")
            )
            agent_steps.append(
                f"""Step {i}:
Tool used: {run_.name}
Tool input: {run_.inputs.get(self.tool_input_key, run_.inputs)}
Tool output: {tool_output}"""
            )

        return {
            "question": question,
            "agent_trajectory": "\n\n".join(agent_steps),
            "answer": run.outputs[self.agent_output_key],
            "reference": reference,
        }


def get_trajectory_evaluator(
    llm: BaseChatModel,
    agent_tools: Sequence[BaseTool],
    *,
    input_key: str = "input",
    prediction_key: str = "output",
    tool_input_key: str = "input",
    tool_output_key: str = "output",
    reference_output_key: Optional[str] = None,
    evaluation_name: str = "Agent Trajectory",
    **kwargs: Any,
) -> RunEvaluatorChain:
    """Get an eval chain for grading a model's response against a map of criteria."""
    input_mapper = kwargs.pop(
        "input_mapper",
        TrajectoryInputMapper(
            agent_input_key=input_key,
            agent_output_key=prediction_key,
            tool_input_key=tool_input_key,
            tool_output_key=tool_output_key,
            reference_output_key=reference_output_key,
        ),
    )
    parser = kwargs.pop(
        "output_parser",
        TrajectoryRunEvalOutputParser(evaluation_name=evaluation_name),
    )
    eval_chain = TrajectoryEvalChain.from_llm(
        llm=llm, agent_tools=agent_tools, return_reasoning=True, **kwargs
    )
    tags = kwargs.pop("tags", [])
    return RunEvaluatorChain(
        eval_chain=eval_chain,
        input_mapper=input_mapper,
        output_parser=parser,
        tags=tags + [evaluation_name],
        **kwargs,
    )
@@ -1,7 +1,7 @@
""""Loading helpers for run evaluators."""
from typing import Any, List, Optional, Sequence, Union

-from langchainplus_sdk import RunEvaluator
+from langsmith import RunEvaluator

from langchain.base_language import BaseLanguageModel
from langchain.chains.base import Chain
@@ -4,8 +4,8 @@ from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Union

-from langchainplus_sdk import EvaluationResult, RunEvaluator
-from langchainplus_sdk.schemas import Example, Run
+from langsmith import EvaluationResult, RunEvaluator
+from langsmith.schemas import Example, Run

from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import (
File diff suppressed because it is too large.
@@ -2,7 +2,7 @@
import subprocess
from pathlib import Path

-from langchainplus_sdk.cli.main import get_docker_compose_command
+from langsmith.cli.main import get_docker_compose_command


def main() -> None:
poetry.lock (generated) — 588 changed lines; file diff suppressed because it is too large.
@@ -108,7 +108,6 @@ pyspark = {version = "^3.4.0", optional = true}
clarifai = {version = ">=9.1.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true}
-langchainplus-sdk = "^0.0.20"
awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
esprima = {version = "^4.0.1", optional = true}
@@ -118,6 +117,7 @@ psychicapi = {version = "^0.8.0", optional = true}
cassio = {version = "^0.0.7", optional = true}
rdflib = {version = "^6.3.2", optional = true}
rapidfuzz = {version = "^3.1.1", optional = true}
+langsmith = "^0.0.2"

[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
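A hypothetical post-migration check, not part of the diff: after reinstalling the environment, the renamed langsmith dependency should resolve in place of the removed langchainplus-sdk pin.

from importlib.metadata import PackageNotFoundError, version

# Report whether the new and old SDK packages are present in the environment.
for package in ("langsmith", "langchainplus-sdk"):
    try:
        print(package, version(package))
    except PackageNotFoundError:
        print(package, "is not installed")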
@@ -3,7 +3,7 @@ from typing import Iterator
from uuid import uuid4

import pytest
-from langchainplus_sdk import LangChainPlusClient as Client
+from langsmith import Client as Client

from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
@@ -5,8 +5,8 @@ from typing import Any, Dict, List, Optional, Union
from unittest import mock

import pytest
-from langchainplus_sdk.client import LangChainPlusClient
-from langchainplus_sdk.schemas import Dataset, Example
+from langsmith.client import Client
+from langsmith.schemas import Dataset, Example

from langchain.chains.base import Chain
from langchain.chains.transform import TransformChain
@@ -235,15 +235,13 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
        pass

    with mock.patch.object(
-        LangChainPlusClient, "read_dataset", new=mock_read_dataset
-    ), mock.patch.object(
-        LangChainPlusClient, "list_examples", new=mock_list_examples
-    ), mock.patch(
+        Client, "read_dataset", new=mock_read_dataset
+    ), mock.patch.object(Client, "list_examples", new=mock_list_examples), mock.patch(
        "langchain.client.runner_utils._arun_llm_or_chain", new=mock_arun_chain
    ), mock.patch.object(
-        LangChainPlusClient, "create_project", new=mock_create_project
+        Client, "create_project", new=mock_create_project
    ):
-        client = LangChainPlusClient(api_url="http://localhost:1984", api_key="123")
+        client = Client(api_url="http://localhost:1984", api_key="123")
        chain = mock.MagicMock()
        num_repetitions = 3
        results = await arun_on_dataset(
@@ -3,7 +3,7 @@
from uuid import UUID

import pytest
-from langchainplus_sdk.schemas import Example, Run
+from langsmith.schemas import Example, Run

from langchain.evaluation.run_evaluators import get_criteria_evaluator, get_qa_evaluator
from tests.unit_tests.llms.fake_llm import FakeLLM
@@ -38,7 +38,7 @@ def test_required_dependencies(poetry_conf: Mapping[str, Any]) -> None:
        "aiohttp",
        "async-timeout",
        "dataclasses-json",
-        "langchainplus-sdk",
+        "langsmith",
        "numexpr",
        "numpy",
        "openapi-schema-pydantic",