Load Run Evaluator (#7101)

Current problems:
1. Evaluating LLMs or chat models isn't smooth. Even specifying
'generations' as the output key inserts a redundant list into the eval
template.
2. Configuring input / prediction / reference keys in the
`get_qa_evaluator` function is confusing. Unless you are using a chain
with the default keys, you have to specify all the variables and reason
about whether each key corresponds to the traced run's inputs or
outputs, or to the example's inputs or outputs (see the sketch after
this list).
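
For context, a rough sketch of the key juggling described in point 2. This is illustrative only: the keyword names approximate the old-style `get_qa_evaluator` configuration rather than quoting its exact signature.

```
# Illustrative sketch of the old-style configuration (parameter names are
# approximate, not the exact API): every key has to be mapped by hand, and you
# have to remember which keys refer to the run and which to the dataset example.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators import get_qa_evaluator

eval_llm = ChatOpenAI(temperature=0)
qa_evaluator = get_qa_evaluator(
    eval_llm,
    input_key="question",     # key in the traced run's inputs
    prediction_key="output",  # key in the traced run's outputs
    answer_key="answer",      # key in the dataset example's outputs
)
```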


Proposal:
- Configure the run evaluator according to the model. Use the model type
and input/output keys to assert compatibility where possible. You only
need to specify a `reference_key` for certain evaluators, which is less
confusing than specifying input keys (see the sketch below).
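
A minimal sketch of what this buys you, based on the unit tests further down (assumes an OpenAI key is set): compatibility is asserted when the evaluator is configured, so a multi-input chain has to name the input to grade on, while plain LLMs and chat models need no key mapping at all.

```
# Sketch only: mirrors the unit tests below. The lower-level
# StringRunEvaluatorChain.from_model_and_evaluator does the per-model wiring.
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.run_evaluators.string_run_evaluator import (
    StringRunEvaluatorChain,
)
from langchain.prompts import PromptTemplate

llm = ChatOpenAI(temperature=0)
prompt = PromptTemplate.from_template("Answer {question} using {context}.")
chain = LLMChain(llm=llm, prompt=prompt)  # two input keys: question, context

evaluator = load_evaluators(["qa"], llm=llm)[0]

# Raises at configuration time if input_key is omitted ("multiple input keys"),
# instead of failing later with a confusing template error.
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
    chain, evaluator, input_key="question", reference_key="answer"
)
# run_evaluator can then be passed to run_on_dataset(..., run_evaluators=[run_evaluator]).
```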


When does this work:
- If you have your langchain model available (assumed always for the
run_on_dataset flow)
- If you are evaluating an LLM, chat model, or chain
- If the LLM or chat model runs are traced by langchain (it wouldn't work
if you add an incompatible schema via the REST API)

When would this fail:
- Currently, if you create an example directly from an LLM run, the
outputs are generations with all the extra metadata present. A simple
`example_key` that dumps everything into the template could make the
evaluations unreliable (see the sketch after this list)
- Doesn't help if you're not using the low-level API
- If you want to instantiate the evaluator without instantiating your
chain or LLM (maybe common for monitoring, for instance) -> though it
could also be loaded from the run or run type
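
To make the first failure mode concrete, this is roughly the shape of `example.outputs` for an example created straight from an LLM run (the standard LLMResult dict; exact metadata fields vary by provider). Dumping the whole structure into an eval template, rather than just the generated text, is what makes the grading unreliable.

```
# Roughly what an example created from an LLM run carries in its outputs.
# Only the nested "text" field is the prediction actually worth grading.
example_outputs = {
    "generations": [
        [
            {
                "text": "The answer is 8.",
                "generation_info": {"finish_reason": "stop", "logprobs": None},
            }
        ]
    ],
    "llm_output": {"token_usage": {"total_tokens": 42}, "model_name": "text-davinci-003"},
}
```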

What's ugly:
- Personally, I think it's better to load evaluators one by one, since
passing a config down is pretty confusing.
- Lots of testing still needs to be added
- Inconsistent in that it uses separate run and example input mappers
instead of the original `RunEvaluatorInputMapper`, which maps a run and
example to a single input (see the sketch after this list).
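
A purely illustrative sketch of the inconsistency in the last point; the interfaces below are paraphrased, not the real class definitions.

```
# Paraphrased interfaces, not the actual classes.
from typing import Any, Dict, Optional, Protocol


class RunAndExampleInputMapper(Protocol):
    """Original shape: one mapper builds the eval input from (run, example)."""

    def map(self, run: Any, example: Optional[Any]) -> Dict[str, Any]:
        ...


class RunMapper(Protocol):
    """New shape: one mapper for the traced run..."""

    def map(self, run: Any) -> Dict[str, str]:
        ...


class ExampleMapper(Protocol):
    """...and a separate mapper for the dataset example, merged before grading."""

    def map(self, example: Any) -> Dict[str, str]:
        ...
```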

Example usage for an LLM, Chat Model, and Agent:

```
# Test running the string evaluators
from langchain.chat_models import ChatOpenAI
from langchain.client.runner_utils import run_on_dataset
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model

evaluator_names = ["qa", "criteria"]

model = ChatOpenAI()
# ds_name is the dataset created in the full example below
configured_evaluators = load_run_evaluators_for_model(evaluator_names, model=model, reference_key="answer")
run_on_dataset(ds_name, model, run_evaluators=configured_evaluators)
```


<details>
  <summary>Full code with dataset upload</summary>

```
## Create dataset
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
from langchain.evaluation import load_dataset
import pandas as pd

lcds = load_dataset("llm-math")
df = pd.DataFrame(lcds)

from uuid import uuid4
from langsmith import Client
client = Client()
ds_name = "llm-math - " + str(uuid4())[0:8]
ds = client.upload_dataframe(df, name=ds_name, input_keys=["question"], output_keys=["answer"])



## Define the models we'll test over
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType

from langchain.tools import tool

llm = OpenAI(temperature=0)
chat_model = ChatOpenAI(temperature=0)

@tool
def sum(a: float, b: float) -> float:
    """Add two numbers."""
    return a + b
    
def construct_agent():
    return initialize_agent(
        llm=chat_model,
        tools=[sum],
        agent=AgentType.OPENAI_MULTI_FUNCTIONS,
    )

agent = construct_agent()

# Test running for the string evaluators
evaluator_names = ["qa", "criteria"]

models = [llm, chat_model, agent]
run_evaluators = []
for model in models:
    run_evaluators.append(load_run_evaluators_for_model(evaluator_names, model=model, reference_key="answer"))
    

# Run on LLM, Chat Model, and Agent
from langchain.client.runner_utils import run_on_dataset

to_test = [llm, chat_model, construct_agent]

for model, configured_evaluators in zip(to_test, run_evaluators):
    run_on_dataset(ds_name, model, run_evaluators=configured_evaluators, verbose=True)
```
</details>

---------

Co-authored-by: Nuno Campos <nuno@boringbits.io>
Commit c5edbea34a (parent 1ac347b4e3) by William FH, committed via GitHub on 2023-07-07 19:57:59 -07:00. 13 changed files with 730 additions and 3 deletions.


New file (+81 lines), integration tests:

```
import sys
from typing import Iterator
from uuid import uuid4

import pytest
from langchainplus_sdk import LangChainPlusClient as Client

from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.client.runner_utils import run_on_dataset
from langchain.evaluation import EvaluatorType
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
from langchain.llms.openai import OpenAI


@pytest.fixture(
    scope="module",
)
def dataset_name() -> Iterator[str]:
    import pandas as pd

    client = Client()
    df = pd.DataFrame(
        [
            {"question": "5", "answer": 5.0},
            {"question": "5 + 3", "answer": 8.0},
            {"question": "2^3.171", "answer": 9.006708689094099},
            {"question": " 2 ^3.171 ", "answer": 9.006708689094099},
        ]
    )
    uid = str(uuid4())[-8:]
    _dataset_name = f"lcp integration tests - {uid}"
    client.upload_dataframe(
        df,
        name=_dataset_name,
        input_keys=["question"],
        output_keys=["answer"],
        description="Integration test dataset",
    )
    yield _dataset_name


def test_chat_model(dataset_name: str) -> None:
    llm = ChatOpenAI(temperature=0)
    evaluators = load_run_evaluators_for_model(
        [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
    )
    results = run_on_dataset(
        dataset_name,
        llm,
        run_evaluators=evaluators,
    )
    print("CHAT", results, file=sys.stderr)


def test_llm(dataset_name: str) -> None:
    llm = OpenAI(temperature=0)
    evaluators = load_run_evaluators_for_model(
        [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
    )
    results = run_on_dataset(
        dataset_name,
        llm,
        run_evaluators=evaluators,
    )
    print("LLM", results, file=sys.stderr)


def test_chain(dataset_name: str) -> None:
    llm = ChatOpenAI(temperature=0)
    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
    evaluators = load_run_evaluators_for_model(
        [EvaluatorType.QA, EvaluatorType.CRITERIA], chain, reference_key="answer"
    )
    results = run_on_dataset(
        dataset_name,
        lambda: chain,
        run_evaluators=evaluators,
    )
    print("CHAIN", results, file=sys.stderr)
```


New file (+114 lines), unit tests for loading the string run evaluators:

```
"""Test the loading function for evaluators."""
from unittest.mock import MagicMock

import pytest

from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.run_evaluators.string_run_evaluator import (
    StringRunEvaluatorChain,
)
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.chains.test_base import FakeChain
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None:
    """Test loading evaluators."""
    fake_llm = FakeLLM(
        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
    )
    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
    if not isinstance(evaluator, StringEvaluator):
        raise ValueError("Evaluator is not a string evaluator")
    model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True)
    kwargs = {}
    if evaluator.requires_reference:
        kwargs["reference_key"] = "generations"
    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
        model, evaluator, **kwargs
    )
    callback = RunCollectorCallbackHandler()
    model.predict("Foo input", callbacks=[callback])
    run = callback.traced_runs[0]
    example = MagicMock()
    example.inputs = {}
    example.outputs = {"generations": "Foo output"}
    result = run_evaluator._prepare_input({"run": run, "example": example})
    assert result["input"] == "Foo input"
    assert result["prediction"] == "Foo output"
    if evaluator.requires_reference:
        assert "reference" in result
        assert result["reference"] == "Foo output"


@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None:
    """Test loading evaluators."""
    fake_llm = FakeLLM(
        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
    )
    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
    if not isinstance(evaluator, StringEvaluator):
        raise ValueError("Evaluator is not a string evaluator")
    model = FakeChatModel()
    kwargs = {}
    if evaluator.requires_reference:
        kwargs["reference_key"] = "generations"
    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
        model, evaluator, **kwargs
    )
    callback = RunCollectorCallbackHandler()
    model.predict("Foo input", callbacks=[callback])
    run = callback.traced_runs[0]
    example = MagicMock()
    example.inputs = {}
    example.outputs = {"generations": "Another fake response"}
    result = run_evaluator._prepare_input({"run": run, "example": example})
    assert result["input"] == "Human: Foo input"
    assert result["prediction"] == "AI: fake response"
    if evaluator.requires_reference:
        assert "reference" in result
        assert result["reference"] == "Another fake response"


@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
    model = FakeChain(
        the_input_keys=["an_input", "another_input"],
    )
    fake_llm = FakeChatModel()
    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
    if not isinstance(evaluator, StringEvaluator):
        raise ValueError("Evaluator is not a string evaluator")
    # No input key
    with pytest.raises(ValueError, match="multiple input keys"):
        StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
    with pytest.raises(ValueError, match="does not have specified"):
        StringRunEvaluatorChain.from_model_and_evaluator(
            model, evaluator, input_key="some_input"
        )
    kwargs = {}
    if evaluator.requires_reference:
        kwargs["reference_key"] = "label_column"
    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
        model, evaluator, input_key="an_input", **kwargs
    )
    callback = RunCollectorCallbackHandler()
    model(
        {"an_input": "Foo input", "another_input": "Another fake response"},
        callbacks=[callback],
    )
    run = callback.traced_runs[0]
    example = MagicMock()
    example.inputs = {}
    example.outputs = {"label_column": "Another fake response"}
    result = run_evaluator._prepare_input({"run": run, "example": example})
    assert result["input"] == "Foo input"
    assert result["prediction"] == "baz"
    if evaluator.requires_reference:
        assert "reference" in result
        assert result["reference"] == "Another fake response"
```


Changed file (two hunks), existing evaluator-loading tests:

```
@@ -3,7 +3,9 @@
import pytest

from langchain.evaluation.loading import EvaluatorType, load_evaluators
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.mark.parametrize("evaluator_type", EvaluatorType)
@@ -14,3 +16,16 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
    # Test as string
    load_evaluators([evaluator_type.value], llm=fake_llm)  # type: ignore


def test_criteria_eval_chain_requires_reference() -> None:
    """Test loading evaluators."""
    fake_llm = FakeLLM(
        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
    )
    evaluator = load_evaluators(
        [EvaluatorType.CRITERIA], llm=fake_llm, requires_reference=True
    )[0]
    if not isinstance(evaluator, StringEvaluator):
        raise ValueError("Evaluator is not a string evaluator")
    assert evaluator.requires_reference
```