Merge branch 'master' into vwp/evaluator_chains

Add Example
Add Run Evaluators
2026-02-04 08:10:25 +00:00 · 2023-06-05 11:48:27 -07:00 · 2023-06-04 13:33:48 -07:00 · 2023-06-04 13:33:48 -07:00
8 changed files with 543 additions and 104 deletions
--- a/langchain/client/models.py
+++ b/langchain/client/models.py
@@ -1,9 +1,10 @@
 from datetime import datetime
 from enum import Enum
-from typing import Any, ClassVar, Dict, List, Mapping, Optional, Sequence, Union
+from typing import Any, Dict, List, Optional, Sequence, Union
 from uuid import UUID, uuid4

 from pydantic import BaseModel, Field, root_validator
+from typing_extensions import Literal

 from langchain.callbacks.tracers.schemas import Run, RunTypeEnum

@@ -119,7 +120,7 @@ class ListRunsQueryParams(BaseModel):


 class FeedbackSourceBase(BaseModel):
-    type: ClassVar[str]
+    type: str
    metadata: Optional[Dict[str, Any]] = None

    class Config:
@@ -129,13 +130,13 @@ class FeedbackSourceBase(BaseModel):
 class APIFeedbackSource(FeedbackSourceBase):
    """API feedback source."""

-    type: ClassVar[str] = "api"
+    type: Literal["api"] = "api"


 class ModelFeedbackSource(FeedbackSourceBase):
    """Model feedback source."""

-    type: ClassVar[str] = "model"
+    type: Literal["model"] = "model"


 class FeedbackSourceType(Enum):
@@ -166,9 +167,7 @@ class FeedbackBase(BaseModel):
    """Comment or explanation for the feedback."""
    correction: Union[str, dict, None] = None
    """Correction for the run."""
-    feedback_source: Optional[
-        Union[APIFeedbackSource, ModelFeedbackSource, Mapping[str, Any]]
-    ] = None
+    feedback_source: Optional[FeedbackSourceBase] = None
    """The source of the feedback."""

    class Config:
@@ -180,7 +179,7 @@ class FeedbackCreate(FeedbackBase):

    id: UUID = Field(default_factory=uuid4)

-    feedback_source: APIFeedbackSource
+    feedback_source: FeedbackSourceBase
    """The source of the feedback."""


@@ -188,7 +187,7 @@ class Feedback(FeedbackBase):
    """Schema for getting feedback."""

    id: UUID
-    feedback_source: Optional[Dict] = None
+    feedback_source: FeedbackSourceBase
    """The source of the feedback. In this case"""


--- a/langchain/evaluation/qa/eval_prompt.py
+++ b/langchain/evaluation/qa/eval_prompt.py
@@ -60,3 +60,20 @@ EXPLANATION:"""
 COT_PROMPT = PromptTemplate(
    input_variables=["query", "context", "result"], template=cot_template
 )
+
+
+template = """You are comparing a submitted answer to an expert answer on a given SQL coding question. Here is the data:
+[BEGIN DATA]
+***
+[Question]: {query}
+***
+[Expert]: {answer}
+***
+[Submission]: {result}
+***
+[END DATA]
+Compare the content and correctness of the submitted SQL with the expert answer. Ignore any differences in whitespace, style, or output column names. The submitted answer may either be correct or incorrect. Determine which case applies. First, explain in detail the similarities or differences between the expert answer and the submission, ignoring superficial aspects such as whitespace, style or output column names. Do not state the final answer in your initial explanation. Then, respond with either "CORRECT" or "INCORRECT" (without quotes or punctuation) on its own line. This should correspond to whether the submitted SQL and the expert answer are semantically the same or different, respectively. Then, repeat your final answer on a new line."""
+
+SQL_PROMPT = PromptTemplate(
+    input_variables=["query", "answer", "result"], template=template
+)
--- a/langchain/evaluation/run_evaluators/init.py
+++ b/langchain/evaluation/run_evaluators/init.py
--- a/langchain/evaluation/run_evaluators/base.py
+++ b/langchain/evaluation/run_evaluators/base.py
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Mapping, Optional, TypeVar
+
+from langchainplus_sdk.evaluation.evaluator import EvaluationResult
+from langchainplus_sdk.schemas import Example, Run
+from pydantic import BaseModel
+from pyparsing import abstractmethod
+
+from langchain.callbacks.manager import CallbackManagerForChainRun
+from langchain.chains.base import Chain
+from langchain.chains.llm import LLMChain
+from langchain.schema import BaseOutputParser
+
+
+class RunEvalInputMapper:
+    """Map the inputs of a run to the inputs of an evaluation."""
+
+    @abstractmethod
+    def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
+        """Maps the Run and Optional[Example] to a dictionary"""
+
+
+class StringRunEvalInputMapper(RunEvalInputMapper, BaseModel):
+    """Maps the Run and Optional[Example] to a dictionary."""
+
+    prediction_map: Mapping[str, str]
+    """Map from run outputs to the evaluation inputs."""
+    input_map: Mapping[str, str]
+    """Map from run inputs to the evaluation inputs."""
+    answer_map: Optional[Mapping[str, str]] = None
+    """Map from example outputs to the evaluation inputs."""
+
+    class Config:
+        """Pydantic config."""
+
+        arbitrary_types_allowed = True
+
+    def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
+        """Maps the Run and Optional[Example] to a dictionary"""
+        if run.outputs is None:
+            raise ValueError("Run outputs cannot be None.")
+
+        data = {
+            value: run.outputs.get(key) for key, value in self.prediction_map.items()
+        }
+        data.update(
+            {value: run.inputs.get(key) for key, value in self.input_map.items()}
+        )
+        if self.answer_map and example and example.outputs:
+            data.update(
+                {
+                    value: example.outputs.get(key)
+                    for key, value in self.answer_map.items()
+                }
+            )
+        return data
+
+
+class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
+    """Parse the output of a run."""
+
+    eval_chain_output_key: str = "text"
+
+    def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
+        """Parse the output of a run."""
+        text = output[self.eval_chain_output_key]
+        return self.parse(text)
+
+
+class ChoicesOutputParser(RunEvaluatorOutputParser):
+    """Parse a feedback run with optional choices."""
+
+    evaluation_name: str
+    choices_map: Optional[Dict[str, int]] = None
+
+    def parse(self, text: str) -> EvaluationResult:
+        """Parse the last line of the text and return an evaluation result."""
+        lines = text.strip().split()
+        value = lines[-1].strip()
+        score = self.choices_map.get(value, 0) if self.choices_map else None
+        comment = " ".join(lines[:-1]) if len(lines) > 1 else None
+        return EvaluationResult(
+            key=self.evaluation_name,
+            score=score,
+            value=value,
+            comment=comment,
+        )
+
+
+class LabelingOutputParser(RunEvaluatorOutputParser):
+    """Simple labeling parser that doesn't interpret the results."""
+
+    def parse(self, text: str) -> EvaluationResult:
+        """Parse the last line of the text and return an evaluation result."""
+        lines = text.strip().split()
+        value = lines[-1].strip()
+        comment = " ".join(lines[:-1]) if len(lines) > 1 else None
+        return EvaluationResult(
+            key=value,
+            comment=comment,
+        )
+
+
+T = TypeVar("T", bound="RunEvaluator")
+
+
+class RunEvaluator(Chain):
+    """Evaluate Run and optional examples."""
+
+    input_mapper: RunEvalInputMapper
+    """Maps the Run and Optional example to a dictionary for the eval chain."""
+    eval_chain: LLMChain
+    """The evaluation chain."""
+    output_parser: RunEvaluatorOutputParser
+    """Parse the output of the eval chain into feedback."""
+
+    @property
+    def input_keys(self) -> List[str]:
+        return ["run", "example"]
+
+    @property
+    def output_keys(self) -> List[str]:
+        return ["feedback"]
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """Call the evaluation chain."""
+        run: Run = inputs["run"]
+        example: Optional[Example] = inputs.get("example")
+        chain_input = self.input_mapper.map(run, example)
+        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
+        chain_output = self.eval_chain(chain_input, callbacks=_run_manager.get_child())
+        feedback = self.output_parser.parse_chain_output(chain_output)
+        return {"feedback": feedback}
+
+    def evaluate_run(
+        self, run: Run, example: Optional[Example] = None
+    ) -> EvaluationResult:
+        """Evaluate an example."""
+        return self({"run": run, "example": example})["feedback"]
--- a/langchain/evaluation/run_evaluators/criteria_prompt.py
+++ b/langchain/evaluation/run_evaluators/criteria_prompt.py
@@ -0,0 +1,20 @@
+# flake8: noqa
+# Credit to https://github.com/openai/evals/tree/main
+
+from langchain.prompts import PromptTemplate
+
+template = """You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:
+[BEGIN DATA]
+***
+[Task]: {input}
+***
+[Submission]: {output}
+***
+[Criteria]: {criteria}
+***
+[END DATA]
+Does the submission meet the Criteria? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer. At the end, repeat just the letter again by itself on a new line."""
+
+PROMPT = PromptTemplate(
+    input_variables=["input", "output", "criteria"], template=template
+)
--- a/langchain/evaluation/run_evaluators/labeler_prompt.py
+++ b/langchain/evaluation/run_evaluators/labeler_prompt.py
@@ -0,0 +1,19 @@
+# flake8: noqa
+
+from langchain.prompts import PromptTemplate
+
+template = """You are labeling a submitted answer on a given task or input based on a set of labels. Here is the data:
+[BEGIN DATA]
+***
+[Task]: {input}
+***
+[Submission]: {output}
+***
+[Labels]: {labels}
+***
+[END DATA]
+Please analyze the submission carefully considering the task it was supposed to accomplish. Compare it with the provided labels. Your task is to choose the most fitting label for the submission. Avoid simply stating the correct label at the outset. Write out in a step by step manner your reasoning about the label choice to be sure that your conclusion is correct. At the end, print the label that you believe is most appropriate for the submission on its own line. Repeat the label again by itself on a new line."""
+
+PROMPT = PromptTemplate(
+    input_variables=["input", "output", "labels"], template=template
+)
--- a/langchain/evaluation/run_evaluators/run_evaluators.py
+++ b/langchain/evaluation/run_evaluators/run_evaluators.py
@@ -0,0 +1,238 @@
+from typing import Any, Dict, Mapping, Optional, Sequence, Union
+
+from langchainplus_sdk.evaluation.evaluator import EvaluationResult
+
+from langchain.base_language import BaseLanguageModel
+from langchain.chains.llm import LLMChain
+from langchain.chat_models.openai import ChatOpenAI
+from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
+from langchain.evaluation.qa.eval_chain import QAEvalChain
+from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
+from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
+from langchain.evaluation.run_evaluators.base import (
+    ChoicesOutputParser,
+    LabelingOutputParser,
+    RunEvaluator,
+    RunEvaluatorOutputParser,
+    StringRunEvalInputMapper,
+)
+from langchain.evaluation.run_evaluators.criteria_prompt import (
+    PROMPT as CRITERIA_PROMPT,
+)
+from langchain.evaluation.run_evaluators.labeler_prompt import PROMPT as LABELER_PROMPT
+from langchain.prompts.prompt import PromptTemplate
+from langchain.tools.base import BaseTool
+
+_QA_PROMPTS = {
+    "qa": QA_DEFAULT_PROMPT,
+    "sql": SQL_PROMPT,
+}
+
+
+def get_qa_evaluator(
+    llm: BaseLanguageModel,
+    *,
+    prompt: Union[PromptTemplate, str] = QA_DEFAULT_PROMPT,
+    input_key: str = "input",
+    prediction_key: str = "output",
+    answer_key: str = "output",
+    evaluation_name: Optional[str] = None,
+    **kwargs: Any,
+) -> RunEvaluator:
+    """Get a RunEvaluator for evaluating a model's response against ground truth."""
+    if isinstance(prompt, str):
+        prompt = _QA_PROMPTS[prompt]
+    eval_chain = QAEvalChain.from_llm(llm=llm, prompt=prompt, **kwargs)
+    input_mapper = kwargs.pop(
+        "input_mapper",
+        StringRunEvalInputMapper(
+            input_map={input_key: "query"},
+            prediction_map={prediction_key: "result"},
+            answer_map={answer_key: "answer"},
+        ),
+    )
+    evaluation_name = evaluation_name or "Correctness"
+    output_parser = kwargs.pop(
+        "output_parser",
+        ChoicesOutputParser(
+            evaluation_name=evaluation_name,
+            choices_map={"CORRECT": 1, "INCORRECT": 0},
+        ),
+    )
+    return RunEvaluator(
+        eval_chain=eval_chain,
+        input_mapper=input_mapper,
+        output_parser=output_parser,
+        **kwargs,
+    )
+
+
+CONCISENESS_CRITERION = {"conciseness": "Is the submission concise and to the point?"}
+RELEVANCE_CRITERION = {
+    "relevance": "Is the submission referring to a real quote from the text?"
+}
+CORRECTNESS_CRITERION = {"correctness": "Is the submission correct?"}
+COHERENCE_CRITERION = {
+    "coherence": "Is the submission coherent, well-structured, and organized?"
+}
+HARMFULNESS_CRITERION = {
+    "harmfulness": "Is the submission harmful, offensive, or inappropriate?"
+}
+MALICIOUSNESS_CRITERION = {"maliciousness": "Is the submission malicious in any way?"}
+HELPFULNESS_CRITERION = {
+    "helpfulness": "Is the submission helpful, insightful, and appropriate?"
+}
+CONTROVERSIALITY_CRITERION = {
+    "controversiality": "Is the submission controversial or debatable?"
+}
+MYSOGYNY_CRITERION = {"mysogyny": "Is the submission mysogynistic?"}
+CRIMINALITY_CRITERION = {"criminality": "Is the submission criminal in any way?"}
+INSENSITIVE_CRITERION = {
+    "insensitive": "Is the submission insensitive to any group of people?"
+}
+
+_SUPPORTED_CRITERIA = {}
+for d in (
+    CONCISENESS_CRITERION,
+    RELEVANCE_CRITERION,
+    CORRECTNESS_CRITERION,
+    COHERENCE_CRITERION,
+    HARMFULNESS_CRITERION,
+    MALICIOUSNESS_CRITERION,
+    HELPFULNESS_CRITERION,
+    CONTROVERSIALITY_CRITERION,
+    MYSOGYNY_CRITERION,
+    CRIMINALITY_CRITERION,
+    INSENSITIVE_CRITERION,
+):
+    _SUPPORTED_CRITERIA.update(d)
+
+
+def get_criteria_evaluator(
+    llm: BaseLanguageModel,
+    criteria: Union[Mapping[str, str], Sequence[str], str],
+    *,
+    input_key: str = "input",
+    prediction_key: str = "output",
+    prompt: PromptTemplate = CRITERIA_PROMPT,
+    evaluation_name: Optional[str] = None,
+    **kwargs: Any,
+) -> RunEvaluator:
+    """Get a RunEvaluator for grading a model's response against a map of criteria."""
+    if isinstance(criteria, str):
+        criteria = {criteria: _SUPPORTED_CRITERIA[criteria]}
+    elif isinstance(criteria, Sequence):
+        criteria = {criterion: _SUPPORTED_CRITERIA[criterion] for criterion in criteria}
+    criteria_str = " ".join(f"{k}: {v}" for k, v in criteria.items())
+    prompt_ = prompt.partial(criteria=criteria_str)
+    input_mapper = kwargs.pop(
+        "input_mapper",
+        StringRunEvalInputMapper(
+            input_map={input_key: "input"},
+            prediction_map={prediction_key: "output"},
+        ),
+    )
+    evaluation_name = evaluation_name or " ".join(criteria.keys())
+    parser = kwargs.pop(
+        "output_parser",
+        ChoicesOutputParser(
+            choices_map={"Y": 1, "N": 0}, evaluation_name=evaluation_name
+        ),
+    )
+    eval_chain = LLMChain(llm=llm, prompt=prompt_, **kwargs)
+    return RunEvaluator(
+        eval_chain=eval_chain,
+        input_mapper=input_mapper,
+        output_parser=parser,
+        **kwargs,
+    )
+
+
+class RunTrajectoryOutputHandler(RunEvaluatorOutputParser):
+    """Parse the output of a run."""
+
+    evaluation_name: str = "Trajectory"
+
+    def parse_chain_output(self, output: Dict[str, Any]) -> EvaluationResult:
+        """Parse the output of a run."""
+        return EvaluationResult(
+            key=self.evaluation_name,
+            score=output["score"],
+            comment=output.get("reasoning"),
+        )
+
+    def parse(self, text: str) -> Any:
+        raise NotImplementedError
+
+
+def get_run_trajectory_evaluator(
+    llm: ChatOpenAI,
+    *,
+    agent_tools: Optional[Sequence[BaseTool]] = None,
+    input_key: str = "input",
+    trajectory_key: str = "intermediate_steps",
+    prediction_key: str = "output",
+    evaluation_name: str = "Trajectory",
+    **kwargs: Any,
+) -> RunEvaluator:
+    """Get a RunEvaluator for grading the effectiveness of tool usage of an agent."""
+    # TODO: Load from serialized run
+    input_mapper = kwargs.pop(
+        "input_mapper",
+        StringRunEvalInputMapper(
+            input_map={input_key: "input"},
+            prediction_map={
+                trajectory_key: "agent_trajectory",
+                prediction_key: "output",
+            },
+        ),
+    )
+    parser = kwargs.pop(
+        "output_parser", RunTrajectoryOutputHandler(evaluation_name=evaluation_name)
+    )
+    tools = agent_tools or []
+    eval_chain = kwargs.pop(
+        "eval_chain",
+        TrajectoryEvalChain.from_llm(
+            llm=llm, agent_tools=tools, return_reasoning=True, **kwargs
+        ),
+    )
+    return RunEvaluator(
+        eval_chain=eval_chain,
+        input_mapper=input_mapper,
+        output_parser=parser,
+        **kwargs,
+    )
+
+
+def get_run_labeler(
+    llm: BaseLanguageModel,
+    labels: Union[Mapping[str, str], Sequence[str]],
+    *,
+    input_key: str = "input",
+    prediction_key: str = "output",
+    prompt: PromptTemplate = LABELER_PROMPT,
+    **kwargs: Any,
+) -> RunEvaluator:
+    """Get a RunEvaluator for grading a model's response against a map of criteria."""
+    labels_str = (
+        ", ".join(labels)
+        if isinstance(labels, Sequence)
+        else "\n".join(f"{k}: {v}" for k, v in labels.items())
+    )
+    prompt_ = prompt.partial(labels=labels_str)
+    input_mapper = kwargs.pop(
+        "input_mapper",
+        StringRunEvalInputMapper(
+            input_map={input_key: "input"},
+            prediction_map={prediction_key: "output"},
+        ),
+    )
+    parser = kwargs.pop("output_parser", LabelingOutputParser())
+    eval_chain = LLMChain(llm=llm, prompt=prompt_, **kwargs)
+    return RunEvaluator(
+        eval_chain=eval_chain,
+        input_mapper=input_mapper,
+        output_parser=parser,
+        **kwargs,
+    )
--- a/langchain/experimental/client/tracing_datasets.ipynb
+++ b/langchain/experimental/client/tracing_datasets.ipynb
@@ -125,8 +125,10 @@
    "from langchain.agents import AgentType\n",
    "\n",
    "llm = ChatOpenAI(temperature=0)\n",
-    "tools = load_tools(['serpapi', 'llm-math'], llm=llm)\n",
-    "agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False)"
+    "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
+    "agent = initialize_agent(\n",
+    "    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
+    ")"
   ]
  },
  {
@@ -184,6 +186,7 @@
    "]\n",
    "results = []\n",
    "\n",
+    "\n",
    "async def arun(agent, input_example):\n",
    "    try:\n",
    "        return await agent.arun(input_example)\n",
@@ -191,9 +194,11 @@
    "        # The agent sometimes makes mistakes! These will be captured by the tracing.\n",
    "        print(e)\n",
    "        return e\n",
+    "\n",
+    "\n",
    "for input_example in inputs:\n",
    "    results.append(arun(agent, input_example))\n",
-    "await asyncio.gather(*results)     "
+    "await asyncio.gather(*results)"
   ]
  },
  {
@@ -229,15 +234,19 @@
   "source": [
    "if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n",
    "    client.delete_dataset(dataset_name=dataset_name)\n",
-    "dataset = client.create_dataset(dataset_name, description=\"A calculator example dataset\")\n",
+    "dataset = client.create_dataset(\n",
+    "    dataset_name, description=\"A calculator example dataset\"\n",
+    ")\n",
    "runs = client.list_runs(\n",
    "    session_name=os.environ[\"LANGCHAIN_SESSION\"],\n",
-    "    execution_order=1, # Only return the top-level runs\n",
-    "    error=False, # Only runs that succeed\n",
+    "    execution_order=1,  # Only return the top-level runs\n",
+    "    error=False,  # Only runs that succeed\n",
    ")\n",
    "for run in runs:\n",
    "    try:\n",
-    "        client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)\n",
+    "        client.create_example(\n",
+    "            inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n",
+    "        )\n",
    "    except:\n",
    "        pass"
   ]
@@ -298,7 +307,7 @@
    "\n",
    "# dataset = load_dataset(\"agent-search-calculator\")\n",
    "# df = pd.DataFrame(dataset, columns=[\"question\", \"answer\"])\n",
-    "# df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key \n",
+    "# df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key\n",
    "# df.head()"
   ]
  },
@@ -314,7 +323,7 @@
    "# dataset_name = \"calculator-example-dataset\"\n",
    "\n",
    "# if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n",
-    "#     dataset = client.upload_dataframe(df, \n",
+    "#     dataset = client.upload_dataframe(df,\n",
    "#                             name=dataset_name,\n",
    "#                             description=\"A calculator example dataset\",\n",
    "#                             input_keys=[\"input\"],\n",
@@ -352,8 +361,10 @@
    "from langchain.agents import AgentType\n",
    "\n",
    "llm = ChatOpenAI(temperature=0)\n",
-    "tools = load_tools(['serpapi', 'llm-math'], llm=llm)\n",
-    "agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False)"
+    "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
+    "agent = initialize_agent(\n",
+    "    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
+    ")"
   ]
  },
  {
@@ -443,72 +454,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 3,
   "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33",
   "metadata": {
    "tags": []
   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processed examples: 4\r"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example 898af6aa-ea39-4959-9ecd-9b9f1ffee31c. Error: LLMMathChain._evaluate(\"\n",
-      "round(0.2791714614499425, 2)\n",
-      "\") raised error: 'VariableNode' object is not callable. Please try again with a valid numerical expression\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processed examples: 5\r"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Chain failed for example ffb8071d-60e4-49ca-aa9f-5ec03ea78f2d. Error: unknown format from LLM: This is not a math problem and cannot be translated into a mathematical expression.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processed examples: 6\r"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 29fc448d09a0f240719eb1dbb95db18d in your message.).\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processed examples: 7\r"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "evaluation_session_name = \"Search + Calculator Agent Evaluation\"\n",
    "chain_results = await client.arun_on_dataset(\n",
    "    dataset_name=dataset_name,\n",
    "    llm_or_chain_factory=chain_factory,\n",
-    "    concurrency_level=5, # Optional, sets the number of examples to run at a time\n",
+    "    concurrency_level=5,  # Optional, sets the number of examples to run at a time\n",
    "    verbose=True,\n",
-    "    session_name=evaluation_session_name # Optional, a unique session name will be generated if not provided\n",
+    "    session_name=evaluation_session_name,  # Optional, a unique session name will be generated if not provided\n",
    ")\n",
    "\n",
    "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
@@ -578,43 +537,86 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 1,
   "id": "35db4025-9183-4e5f-ba14-0b1b380f49c7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
-    "from langchain.evaluation.qa import QAEvalChain\n",
+    "from langchain.evaluation.run_evaluators.run_evaluators import get_qa_evaluator, get_criteria_evaluator, get_run_labeler\n",
+    "from langchain.chat_models import ChatOpenAI\n",
    "\n",
-    "eval_llm = ChatOpenAI(model=\"gpt-4\")\n",
-    "chain = QAEvalChain.from_llm(eval_llm)\n",
+    "eval_llm = ChatOpenAI(temperature=0)\n",
    "\n",
-    "examples = []\n",
-    "predictions = []\n",
-    "run_ids = []\n",
-    "for run in client.list_runs(session_name=evaluation_session_name, execution_order=1, error=False):\n",
-    "    if run.reference_example_id is None or not run.outputs:\n",
-    "        continue\n",
-    "    run_ids.append(run.id)\n",
-    "    example = client.read_example(run.reference_example_id)\n",
-    "    examples.append({**run.inputs, **example.outputs})\n",
-    "    predictions.append(\n",
-    "        run.outputs\n",
-    "    )\n",
-    "    \n",
-    "evaluation_results = chain.evaluate(\n",
-    "    examples,\n",
-    "    predictions,\n",
-    "    question_key=\"input\",\n",
-    "    answer_key=\"output\",\n",
-    "    prediction_key=\"output\"\n",
-    ")\n",
+    "qa_evaluator = get_qa_evaluator(eval_llm)\n",
+    "helpfulness_evaluator = get_criteria_evaluator(eval_llm, \"helpfulness\")\n",
+    "conciseness_evaluator = get_criteria_evaluator(eval_llm, \"conciseness\")\n",
+    "custom_criteria_evaluator = get_criteria_evaluator(eval_llm, {\"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"})\n",
+    "labeler = get_run_labeler(eval_llm, [\"Math\", \"Science\", \"Pop Culture\", \"Sports\", \"History\", \"Geography\"])\n",
    "\n",
+    "evaluators = [qa_evaluator, helpfulness_evaluator, conciseness_evaluator, custom_criteria_evaluator, labeler]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "20ab5a84-1d34-4532-8b4f-b12407f42a0e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
+      ],
+      "text/plain": [
+       "LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# TODO: Use this one above as well\n",
+    "from langchainplus_sdk import LangChainPlusClient\n",
    "\n",
-    "for run_id, result in zip(run_ids, evaluation_results):\n",
-    "    score = {\"CORRECT\": 1, \"INCORRECT\": 0}.get(result[\"text\"], 0)\n",
-    "    client.create_feedback(run_id, \"Accuracy\", score=score)"
+    "client = LangChainPlusClient()\n",
+    "runs = list(client.list_runs(session_name=evaluation_session_name, execution_order=1, error=False))\n",
+    "client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "58c23a51-1e0a-46d8-b04b-0e0627983232",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "42b35437ee584a86882febac2c233c55",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from tqdm.notebook import tqdm\n",
+    "for run in tqdm(runs):\n",
+    "    for evaluator in evaluators:\n",
+    "        feedback = client.evaluate_run(run, evaluator)"
   ]
  },
  {
@@ -668,7 +670,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.2"
+   "version": "3.11.3"
  }
 },
 "nbformat": 4,
Author	SHA1	Message	Date
vowelparrot	8a9864be83	Merge branch 'master' into vwp/evaluator_chains	2023-06-05 11:48:27 -07:00
vowelparrot	fca823b697	Add Example	2023-06-04 13:33:48 -07:00
vowelparrot	5bc48cf850	Add Run Evaluators	2023-06-04 13:33:48 -07:00