mirror of https://github.com/hwchase17/langchain.git (synced 2025-06-01 12:38:45 +00:00)
Return feedback (#9629)

Return the feedback values in an eval run result. Also made a helper method to display the results as a dataframe, but it may be overkill.
parent 5e2d0cf54e
commit cb642ef658
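With this change, run_on_dataset / arun_on_dataset return a TestResult dict whose "results" entry maps each example id to {"output": ..., "feedback": [...]}. A minimal usage sketch of how that shape can be consumed; not part of the commit, and the dataset name, model, and evaluator config below are placeholders:

    # Sketch only: dataset name, model, and evaluator choice are hypothetical.
    from langsmith import Client

    from langchain.chat_models import ChatOpenAI
    from langchain.smith import RunEvalConfig, run_on_dataset

    client = Client()
    results = run_on_dataset(
        client=client,
        dataset_name="my-eval-dataset",
        llm_or_chain_factory=lambda: ChatOpenAI(temperature=0),
        evaluation=RunEvalConfig(evaluators=["qa"]),
    )

    # Each entry now carries the model output and the feedback logged for that example.
    for example_id, entry in results["results"].items():
        print(example_id, entry["output"], [f.key for f in entry["feedback"]])

    # Optional helper added in this commit (requires pandas):
    df = results.to_dataframe()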
@@ -3,10 +3,11 @@ from __future__ import annotations
 import logging
 from concurrent.futures import Future, ThreadPoolExecutor, wait
-from typing import Any, List, Optional, Sequence, Set, Union
+from typing import Any, Dict, List, Optional, Sequence, Set, Union
 from uuid import UUID
 
-from langsmith import Client, RunEvaluator
+import langsmith
+from langsmith import schemas as langsmith_schemas
 
 from langchain.callbacks.manager import tracing_v2_enabled
 from langchain.callbacks.tracers.base import BaseTracer
@@ -62,13 +63,13 @@ class EvaluatorCallbackHandler(BaseTracer):
         The LangSmith project name to be organize eval chain runs under.
     """
 
-    name: str = "evaluator_callback_handler"
+    name = "evaluator_callback_handler"
 
     def __init__(
         self,
-        evaluators: Sequence[RunEvaluator],
+        evaluators: Sequence[langsmith.RunEvaluator],
         max_workers: Optional[int] = None,
-        client: Optional[Client] = None,
+        client: Optional[langsmith.Client] = None,
         example_id: Optional[Union[UUID, str]] = None,
         skip_unfinished: bool = True,
         project_name: Optional[str] = "evaluators",
@@ -86,10 +87,11 @@ class EvaluatorCallbackHandler(BaseTracer):
         self.futures: Set[Future] = set()
         self.skip_unfinished = skip_unfinished
         self.project_name = project_name
+        self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {}
         global _TRACERS
         _TRACERS.append(self)
 
-    def _evaluate_in_project(self, run: Run, evaluator: RunEvaluator) -> None:
+    def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None:
         """Evaluate the run in the project.
 
         Parameters
@@ -102,11 +104,11 @@ class EvaluatorCallbackHandler(BaseTracer):
         """
         try:
             if self.project_name is None:
-                self.client.evaluate_run(run, evaluator)
+                feedback = self.client.evaluate_run(run, evaluator)
             with tracing_v2_enabled(
                 project_name=self.project_name, tags=["eval"], client=self.client
             ):
-                self.client.evaluate_run(run, evaluator)
+                feedback = self.client.evaluate_run(run, evaluator)
         except Exception as e:
             logger.error(
                 f"Error evaluating run {run.id} with "
@@ -114,6 +116,8 @@ class EvaluatorCallbackHandler(BaseTracer):
                 exc_info=True,
             )
             raise e
+        example_id = str(run.reference_example_id)
+        self.logged_feedback.setdefault(example_id, []).append(feedback)
 
     def _persist_run(self, run: Run) -> None:
         """Run the evaluator on the run.
@@ -11,6 +11,7 @@ import uuid
 import warnings
 from enum import Enum
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Coroutine,
@@ -44,6 +45,9 @@ from langchain.schema.runnable import Runnable, RunnableConfig, RunnableLambda
 from langchain.smith.evaluation.config import EvalConfig, RunEvalConfig
 from langchain.smith.evaluation.string_run_evaluator import StringRunEvaluatorChain
 
+if TYPE_CHECKING:
+    import pandas as pd
+
 logger = logging.getLogger(__name__)
 
 MODEL_OR_CHAIN_FACTORY = Union[
@@ -63,6 +67,31 @@ class InputFormatError(Exception):
 ## Shared Utilities
 
 
+class TestResult(dict):
+    """A dictionary of the results of a single test run."""
+
+    def to_dataframe(self) -> pd.DataFrame:
+        """Convert the results to a dataframe."""
+        try:
+            import pandas as pd
+        except ImportError as e:
+            raise ImportError(
+                "Pandas is required to convert the results to a dataframe."
+                " to install pandas, run `pip install pandas`."
+            ) from e
+
+        indices = []
+        records = []
+        for example_id, result in self["results"].items():
+            feedback = result["feedback"]
+            records.append(
+                {**{f.key: f.score for f in feedback}, "output": result["output"]}
+            )
+            indices.append(example_id)
+
+        return pd.DataFrame(records, index=indices)
+
+
 def _get_eval_project_url(api_url: str, project_id: str) -> str:
     """Get the project url from the api url."""
     parsed = urlparse(api_url)
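For reference, a standalone sketch (not part of the commit) of what TestResult.to_dataframe() produces, assuming TestResult is importable from langchain.smith.evaluation.runner_utils, where this diff defines it. The _Feedback dataclass is a hypothetical stand-in for the langsmith Feedback schema, mimicking only the .key and .score attributes the helper reads:

    # Sketch: _Feedback is a stand-in, not the real langsmith_schemas.Feedback.
    from dataclasses import dataclass

    from langchain.smith.evaluation.runner_utils import TestResult


    @dataclass
    class _Feedback:
        key: str
        score: float


    result = TestResult(
        project_name="demo",
        results={
            "example-1": {"output": "Paris", "feedback": [_Feedback("correctness", 1.0)]},
            "example-2": {"output": "Berlin", "feedback": [_Feedback("correctness", 0.0)]},
        },
    )
    # Feedback keys become score columns, indexed by example id; roughly:
    #            correctness  output
    # example-1          1.0   Paris
    # example-2          0.0  Berlin
    print(result.to_dataframe())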
@@ -667,7 +696,7 @@ async def _arun_llm_or_chain(
     tags: Optional[List[str]] = None,
     callbacks: Optional[List[BaseCallbackHandler]] = None,
     input_mapper: Optional[Callable[[Dict], Any]] = None,
-) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
+) -> Union[dict, str, LLMResult, ChatResult]:
     """Asynchronously run the Chain or language model.
 
     Args:
@@ -689,10 +718,10 @@ async def _arun_llm_or_chain(
                 tracer.example_id = example.id
     else:
         previous_example_ids = None
-    outputs = []
     chain_or_llm = (
         "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
     )
+    result = None
     try:
         if isinstance(llm_or_chain_factory, BaseLanguageModel):
             output: Any = await _arun_llm(
@@ -711,15 +740,15 @@ async def _arun_llm_or_chain(
                 callbacks=callbacks,
                 input_mapper=input_mapper,
             )
-        outputs.append(output)
+        result = output
     except Exception as e:
         logger.warning(f"{chain_or_llm} failed for example {example.id}. Error: {e}")
-        outputs.append({"Error": str(e)})
+        result = {"Error": str(e)}
     if callbacks and previous_example_ids:
         for example_id, tracer in zip(previous_example_ids, callbacks):
             if hasattr(tracer, "example_id"):
                 tracer.example_id = example_id
-    return outputs
+    return result
 
 
 async def _gather_with_concurrency(
@@ -856,7 +885,7 @@ async def _arun_on_examples(
         wrapped_model, examples, evaluation, data_type
     )
     examples = _validate_example_inputs(examples, wrapped_model, input_mapper)
-    results: Dict[str, List[Any]] = {}
+    results: Dict[str, dict] = {}
 
     async def process_example(
         example: Example, callbacks: List[BaseCallbackHandler], job_state: dict
@@ -869,7 +898,7 @@ async def _arun_on_examples(
             callbacks=callbacks,
             input_mapper=input_mapper,
         )
-        results[str(example.id)] = result
+        results[str(example.id)] = {"output": result}
         job_state["num_processed"] += 1
         if verbose:
             print(
@@ -890,8 +919,14 @@ async def _arun_on_examples(
         ),
         *(functools.partial(process_example, e) for e in examples),
     )
+    all_feedback = {}
     for handler in evaluation_handlers:
         handler.wait_for_futures()
+        all_feedback.update(handler.logged_feedback)
+    # join the results and feedback on the example id
+    for example_id, output_dict in results.items():
+        feedback = all_feedback.get(example_id, [])
+        output_dict["feedback"] = feedback
     return results
@@ -978,7 +1013,7 @@ def _run_llm_or_chain(
     tags: Optional[List[str]] = None,
     callbacks: Optional[List[BaseCallbackHandler]] = None,
     input_mapper: Optional[Callable[[Dict], Any]] = None,
-) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
+) -> Union[dict, str, LLMResult, ChatResult]:
     """
     Run the Chain or language model synchronously.
 
@@ -1001,10 +1036,10 @@ def _run_llm_or_chain(
                 tracer.example_id = example.id
     else:
         previous_example_ids = None
-    outputs = []
     chain_or_llm = (
         "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
     )
+    result = None
     try:
         if isinstance(llm_or_chain_factory, BaseLanguageModel):
             output: Any = _run_llm(
@@ -1023,18 +1058,18 @@ def _run_llm_or_chain(
                 tags=tags,
                 input_mapper=input_mapper,
             )
-        outputs.append(output)
+        result = output
     except Exception as e:
         logger.warning(
             f"{chain_or_llm} failed for example {example.id} with inputs:"
             f" {example.inputs}.\nError: {e}",
         )
-        outputs.append({"Error": str(e)})
+        result = {"Error": str(e)}
     if callbacks and previous_example_ids:
         for example_id, tracer in zip(previous_example_ids, callbacks):
             if hasattr(tracer, "example_id"):
                 tracer.example_id = example_id
-    return outputs
+    return result
 
 
 def _run_on_examples(
@@ -1075,7 +1110,7 @@ def _run_on_examples(
     Returns:
         A dictionary mapping example ids to the model outputs.
     """
-    results: Dict[str, Any] = {}
+    results: Dict[str, dict] = {}
     wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
     project_name = _get_project_name(project_name, wrapped_model)
     tracer = LangChainTracer(
@@ -1085,11 +1120,11 @@ def _run_on_examples(
         wrapped_model, examples, evaluation, data_type
     )
     examples = _validate_example_inputs(examples, wrapped_model, input_mapper)
-    evalution_handler = EvaluatorCallbackHandler(
+    evaluation_handler = EvaluatorCallbackHandler(
         evaluators=run_evaluators or [],
         client=client,
     )
-    callbacks: List[BaseCallbackHandler] = [tracer, evalution_handler]
+    callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler]
     for i, example in enumerate(examples):
         result = _run_llm_or_chain(
             example,
@@ -1100,9 +1135,14 @@ def _run_on_examples(
         )
         if verbose:
             print(f"{i+1} processed", flush=True, end="\r")
-        results[str(example.id)] = result
+        results[str(example.id)] = {"output": result}
     tracer.wait_for_futures()
-    evalution_handler.wait_for_futures()
+    evaluation_handler.wait_for_futures()
+    all_feedback = evaluation_handler.logged_feedback
+    # join the results and feedback on the example id
+    for example_id, output_dict in results.items():
+        feedback = all_feedback.get(example_id, [])
+        output_dict["feedback"] = feedback
     return results
@@ -1276,10 +1316,10 @@ async def arun_on_dataset(
         input_mapper=input_mapper,
         data_type=dataset.data_type,
     )
-    return {
-        "project_name": project_name,
-        "results": results,
-    }
+    return TestResult(
+        project_name=project_name,
+        results=results,
+    )


 def _handle_coroutine(coro: Coroutine) -> Any:
@@ -1461,7 +1501,7 @@ def run_on_dataset(
         data_type=dataset.data_type,
     )
     results = _handle_coroutine(coro)
-    return {
-        "project_name": project_name,
-        "results": results,
-    }
+    return TestResult(
+        project_name=project_name,
+        results=results,
+    )
@@ -182,14 +182,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
         return {"the right input": inputs["the wrong input"]}
 
     result = _run_llm_or_chain(example, lambda: mock_chain, input_mapper=input_mapper)
-    assert len(result) == 1
-    assert result[0] == {"output": "2", "the right input": "1"}
+    assert result == {"output": "2", "the right input": "1"}
     bad_result = _run_llm_or_chain(
         example,
         lambda: mock_chain,
     )
-    assert len(bad_result) == 1
-    assert "Error" in bad_result[0]
+    assert "Error" in bad_result
 
     # Try with LLM
     def llm_input_mapper(inputs: dict) -> str:
@@ -197,9 +195,7 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
         return "the right input"
 
     mock_llm = FakeLLM(queries={"the right input": "somenumber"})
-    result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper)
-    assert len(result) == 1
-    llm_result = result[0]
+    llm_result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper)
     assert isinstance(llm_result, str)
     assert llm_result == "somenumber"
 
@@ -300,8 +296,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
         tags: Optional[List[str]] = None,
         callbacks: Optional[Any] = None,
         **kwargs: Any,
-    ) -> List[Dict[str, Any]]:
-        return [{"result": f"Result for example {example.id}"}]
+    ) -> Dict[str, Any]:
+        return {"result": f"Result for example {example.id}"}
 
     def mock_create_project(*args: Any, **kwargs: Any) -> Any:
         proj = mock.MagicMock()
@@ -328,9 +324,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     )
 
     expected = {
-        uuid_: [
-            {"result": f"Result for example {uuid.UUID(uuid_)}"} for _ in range(1)
-        ]
+        uuid_: {
+            "output": {"result": f"Result for example {uuid.UUID(uuid_)}"},
+            "feedback": [],
+        }
         for uuid_ in uuids
     }
     assert results["results"] == expected