Mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-18 18:53:10 +00:00)
Add Trajectory Eval RunEvaluator (#6449)
parent 6a157cf8bb
commit bc0af67aaf
@@ -1,11 +1,15 @@
-from typing import Any, Dict, Mapping, Optional, Sequence, Union
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
 
 from langchainplus_sdk.evaluation import EvaluationResult
-from langchainplus_sdk.schemas import Example, Run
-from pydantic import BaseModel
+from langchainplus_sdk.schemas import Example, Run, RunTypeEnum
+from pydantic import BaseModel, Field
 
 from langchain.base_language import BaseLanguageModel
+from langchain.chains.llm import LLMChain
 from langchain.chat_models.base import BaseChatModel
+from langchain.evaluation.agents.trajectory_eval_prompt import (
+    EVAL_CHAT_PROMPT as TRAJECTORY_PROMPT,
+)
 from langchain.evaluation.qa.eval_chain import QAEvalChain
 from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
 from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
@@ -17,7 +21,10 @@ from langchain.evaluation.run_evaluators.base import (
 from langchain.evaluation.run_evaluators.criteria_prompt import (
     PROMPT as CRITERIA_PROMPT,
 )
+from langchain.prompts.base import BasePromptTemplate
 from langchain.prompts.prompt import PromptTemplate
+from langchain.schema import OutputParserException
+from langchain.tools.base import BaseTool
 
 _QA_PROMPTS = {
     "qa": QA_DEFAULT_PROMPT,
@@ -28,18 +35,13 @@ _QA_PROMPTS = {
 class StringRunEvaluatorInputMapper(RunEvaluatorInputMapper, BaseModel):
     """Maps the Run and Optional[Example] to a dictionary."""
 
-    prediction_map: Mapping[str, str]
+    prediction_map: Dict[str, str]
     """Map from run outputs to the evaluation inputs."""
-    input_map: Mapping[str, str]
+    input_map: Dict[str, str]
     """Map from run inputs to the evaluation inputs."""
-    answer_map: Optional[Mapping[str, str]] = None
+    answer_map: Optional[Dict[str, str]] = None
     """Map from example outputs to the evaluation inputs."""
 
-    class Config:
-        """Pydantic config."""
-
-        arbitrary_types_allowed = True
-
     def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
         """Maps the Run and Optional[Example] to a dictionary"""
         if run.outputs is None:
@@ -166,7 +168,7 @@ def get_criteria_evaluator(
     *,
     input_key: str = "input",
     prediction_key: str = "output",
-    prompt: PromptTemplate = CRITERIA_PROMPT,
+    prompt: BasePromptTemplate = CRITERIA_PROMPT,
     evaluation_name: Optional[str] = None,
     **kwargs: Any,
 ) -> RunEvaluatorChain:
@@ -198,3 +200,125 @@ def get_criteria_evaluator(
         output_parser=parser,
         **kwargs,
     )
+
+
+class TrajectoryEvalOutputParser(RunEvaluatorOutputParser):
+    evaluation_name: str = "Agent Trajectory"
+    """The name assigned to the evaluation feedback."""
+    evaluator_info: dict = Field(default_factory=dict)
+    """Additional information to log as feedback metadata."""
+
+    def parse(self, text: str) -> EvaluationResult:
+        if "Score:" not in text:
+            raise OutputParserException(
+                f"Could not find score in model eval output: {text}"
+            )
+
+        reasoning, score_str = text.split("Score: ")
+
+        reasoning, score_str = reasoning.strip(), score_str.strip()
+
+        score_str = next(
+            (char for char in score_str if char.isdigit()), "0"
+        )  # Scan for first digit
+
+        if not 1 <= int(score_str) <= 5:
+            raise OutputParserException(
+                f"Score is not a digit in the range 1-5: {text}"
+            )
+
+        return EvaluationResult(
+            key=self.evaluation_name,
+            score=int(score_str),
+            comment=reasoning,
+            evaluator_info=self.evaluator_info,
+        )
+
+
+class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
+    """Maps the Run and Optional[Example] to a dictionary."""
+
+    tool_descriptions: List[str]
+    """The descriptions for each of the tools available to the agent."""
+    agent_input_key: str = "input"
+    """The key to load from the agent executor's run input dictionary."""
+    agent_output_key: str = "output"
+    """The key to load from the agent executor's run output dictionary."""
+    tool_input_key: str = "input"
+    """The key to load from the tool executor's run input dictionary."""
+    tool_output_key: str = "output"
+    """The key to load from the tool executor's run output dictionary."""
+
+    def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
+        """Maps the Run and Optional[Example] to a dictionary"""
+        if run.child_runs is None:
+            raise ValueError("Run must have child runs to be evaluated.")
+        if run.outputs is None:
+            raise ValueError("Run must have outputs to be evaluated.")
+        question = run.inputs[self.agent_input_key]
+        tool_runs = [
+            run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool
+        ]
+        agent_steps = []
+        for i, run_ in enumerate(tool_runs, 1):
+            tool_output = (
+                f"Tool output: {run_.outputs.get(self.tool_output_key, run_.outputs)}"
+                if run_.outputs
+                else (f"Tool error: {run_.error}" if run_.error else "No output")
+            )
+            agent_steps.append(
+                f"""Step {i}:
+Tool used: {run_.name}
+Tool input: {run_.inputs.get(self.tool_input_key, run_.inputs)}
+{tool_output}"""
+            )
+
+        return {
+            "tool_descriptions": "\n\n".join(self.tool_descriptions),
+            "question": question,
+            "agent_trajectory": "\n\n".join(agent_steps),
+            "answer": run.outputs[self.agent_output_key],
+        }
+
+
+def get_trajectory_evaluator(
+    llm: BaseChatModel,
+    agent_tools: Union[Sequence[str], Sequence[BaseTool]],
+    *,
+    input_key: str = "input",
+    prediction_key: str = "output",
+    tool_input_key: str = "input",
+    tool_output_key: str = "output",
+    prompt: BasePromptTemplate = TRAJECTORY_PROMPT,
+    evaluation_name: str = "Agent Trajectory",
+    **kwargs: Any,
+) -> RunEvaluatorChain:
+    """Get an eval chain for grading an agent's tool-use trajectory."""
+    tool_descriptions = [
+        f"Tool {i}: {tool.name}\nDescription: {tool.description}"
+        if isinstance(tool, BaseTool)
+        else f"Tool {i}: {tool}"
+        for i, tool in enumerate(agent_tools, 1)
+    ]
+
+    input_mapper = kwargs.pop(
+        "input_mapper",
+        TrajectoryInputMapper(
+            agent_input_key=input_key,
+            agent_output_key=prediction_key,
+            tool_input_key=tool_input_key,
+            tool_output_key=tool_output_key,
+            tool_descriptions=tool_descriptions,
+        ),
+    )
+    parser = kwargs.pop(
+        "output_parser",
+        TrajectoryEvalOutputParser(evaluation_name=evaluation_name),
+    )
+    eval_chain = LLMChain(llm=llm, prompt=prompt, **kwargs)
+    return RunEvaluatorChain(
+        eval_chain=eval_chain,
+        input_mapper=input_mapper,
+        output_parser=parser,
+        **kwargs,
+    )
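
For context, the new TrajectoryInputMapper flattens a traced agent run into the inputs the grading prompt expects. A minimal sketch of that dictionary's shape, with illustrative values (only the keys come from the code above):

# Shape of the dict TrajectoryInputMapper.map builds from an agent run; the
# keys come from the commit above, the values are made up for illustration.
eval_inputs = {
    "tool_descriptions": "Tool 1: Search\nDescription: Look up current information.",
    "question": "What is the population of Canada?",
    "agent_trajectory": (
        "Step 1:\n"
        "Tool used: Search\n"
        "Tool input: population of Canada\n"
        "Tool output: About 39 million people."
    ),
    "answer": "Canada has roughly 39 million people.",
}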
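
The grader's reply is then parsed by the new TrajectoryEvalOutputParser. A small sketch of that step; the sample text and the module path are assumptions for illustration, while the "reasoning then Score: N" convention and the EvaluationResult fields come from the code above:

# Illustrative only: the sample grader text is invented, and the module path is
# an assumption about where this commit defines the parser.
from langchain.evaluation.run_evaluators.implementations import TrajectoryEvalOutputParser

parser = TrajectoryEvalOutputParser()
sample = (
    "The agent chose a relevant tool and grounded its final answer in the result.\n"
    "Score: 4"
)

result = parser.parse(sample)
print(result.key)      # "Agent Trajectory"
print(result.score)    # 4
print(result.comment)  # the reasoning text preceding "Score:"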
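
Finally, a hedged end-to-end sketch of wiring the evaluator up. The chat model, tool names, and the traced run are placeholders; evaluate_run is the RunEvaluator interface method that RunEvaluatorChain implements, and the import paths are assumptions rather than something this commit prescribes:

# Sketch only, under the assumptions noted above; nothing here is prescriptive.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators.implementations import get_trajectory_evaluator

llm = ChatOpenAI(model_name="gpt-4", temperature=0)

# Tool names may be plain strings; BaseTool instances also work and contribute
# their descriptions to the grading prompt.
evaluator = get_trajectory_evaluator(llm, agent_tools=["Search", "Calculator"])

# `run` would be a traced agent-executor Run (with tool child runs) fetched from
# the tracing backend, so it is left as a placeholder here.
# feedback = evaluator.evaluate_run(run)  # -> EvaluationResult keyed "Agent Trajectory"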