Rm deprecated (#15920)

Remove the usage of deprecated methods in the test runner.
William FH 2024-01-11 18:10:49 -08:00 committed by GitHub
parent 438beb6c94
commit 129552e3d6
2 changed files with 64 additions and 62 deletions
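
Every hunk below applies the same migration: deprecated call sites such as llm.predict(...), llm.apredict_messages(...), and chain(...)/chain.acall(...) become invoke/ainvoke on the unified Runnable interface, with callbacks and tags moved into an explicit RunnableConfig. The following minimal sketch of that pattern is illustrative only (it is not part of the commit) and uses a fake model from langchain_community so it runs without credentials:

    from langchain_community.llms.fake import FakeListLLM
    from langchain_core.runnables import RunnableConfig

    llm = FakeListLLM(responses=["hello"])
    tags = ["smoke-test"]

    # Before (deprecated): output = llm.predict("Hi", callbacks=None, tags=tags)
    # After: callbacks and tags travel inside an explicit RunnableConfig.
    output = llm.invoke("Hi", config=RunnableConfig(callbacks=None, tags=tags or []))
    print(output)  # -> "hello"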

View File

@@ -674,15 +674,14 @@ async def _arun_llm(
     """
     if input_mapper is not None:
         prompt_or_messages = input_mapper(inputs)
-        if isinstance(prompt_or_messages, str):
-            return await llm.apredict(
-                prompt_or_messages, callbacks=callbacks, tags=tags
-            )
-        elif isinstance(prompt_or_messages, list) and all(
-            isinstance(msg, BaseMessage) for msg in prompt_or_messages
+        if (
+            isinstance(prompt_or_messages, str)
+            or isinstance(prompt_or_messages, list)
+            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
         ):
-            return await llm.apredict_messages(
-                prompt_or_messages, callbacks=callbacks, tags=tags
+            return await llm.ainvoke(
+                prompt_or_messages,
+                config=RunnableConfig(callbacks=callbacks, tags=tags or []),
             )
         else:
             raise InputFormatError(
@@ -694,13 +693,13 @@ async def _arun_llm(
     else:
         try:
             prompt = _get_prompt(inputs)
-            llm_output: Union[str, BaseMessage] = await llm.apredict(
-                prompt, callbacks=callbacks, tags=tags
+            llm_output: Union[str, BaseMessage] = await llm.ainvoke(
+                prompt, config=RunnableConfig(callbacks=callbacks, tags=tags or [])
             )
         except InputFormatError:
             messages = _get_messages(inputs)
-            llm_output = await llm.apredict_messages(
-                messages, callbacks=callbacks, tags=tags
+            llm_output = await llm.ainvoke(
+                messages, config=RunnableConfig(callbacks=callbacks, tags=tags or [])
             )
     return llm_output
@@ -722,7 +721,9 @@ async def _arun_chain(
         and chain.input_keys
     ):
         val = next(iter(inputs_.values()))
-        output = await chain.acall(val, callbacks=callbacks, tags=tags)
+        output = await chain.ainvoke(
+            val, config=RunnableConfig(callbacks=callbacks, tags=tags or [])
+        )
     else:
         runnable_config = RunnableConfig(tags=tags or [], callbacks=callbacks)
         output = await chain.ainvoke(inputs_, config=runnable_config)
@@ -807,17 +808,17 @@ def _run_llm(
         ValueError: If the LLM type is unsupported.
         InputFormatError: If the input format is invalid.
     """
+    # Most of this is legacy code; we could probably remove a lot of it.
     if input_mapper is not None:
         prompt_or_messages = input_mapper(inputs)
-        if isinstance(prompt_or_messages, str):
-            llm_output: Union[str, BaseMessage] = llm.predict(
-                prompt_or_messages, callbacks=callbacks, tags=tags
-            )
-        elif isinstance(prompt_or_messages, list) and all(
-            isinstance(msg, BaseMessage) for msg in prompt_or_messages
+        if (
+            isinstance(prompt_or_messages, str)
+            or isinstance(prompt_or_messages, list)
+            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
         ):
-            llm_output = llm.predict_messages(
-                prompt_or_messages, callbacks=callbacks, tags=tags
+            llm_output: Union[str, BaseMessage] = llm.invoke(
+                prompt_or_messages,
+                config=RunnableConfig(callbacks=callbacks, tags=tags or []),
             )
         else:
             raise InputFormatError(
@@ -828,10 +829,14 @@ def _run_llm(
     else:
         try:
             llm_prompts = _get_prompt(inputs)
-            llm_output = llm.predict(llm_prompts, callbacks=callbacks, tags=tags)
+            llm_output = llm.invoke(
+                llm_prompts, config=RunnableConfig(callbacks=callbacks, tags=tags or [])
+            )
         except InputFormatError:
             llm_messages = _get_messages(inputs)
-            llm_output = llm.predict_messages(llm_messages, callbacks=callbacks)
+            llm_output = llm.invoke(
+                llm_messages, config=RunnableConfig(callbacks=callbacks)
+            )
     return llm_output
@@ -852,7 +857,9 @@ def _run_chain(
         and chain.input_keys
     ):
         val = next(iter(inputs_.values()))
-        output = chain(val, callbacks=callbacks, tags=tags)
+        output = chain.invoke(
+            val, config=RunnableConfig(callbacks=callbacks, tags=tags or [])
+        )
     else:
         runnable_config = RunnableConfig(tags=tags or [], callbacks=callbacks)
         output = chain.invoke(inputs_, config=runnable_config)
@@ -1313,7 +1320,7 @@ Examples
     .. code-block:: python

         from langsmith import Client
-        from langchain_community.chat_models import ChatOpenAI
+        from langchain_openai import ChatOpenAI
         from langchain.chains import LLMChain
         from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset

View File

@@ -1,4 +1,4 @@
-from typing import Iterator, List
+from typing import Iterator, List, Optional
 from uuid import uuid4

 import pytest
@@ -7,7 +7,8 @@ from langchain_community.llms.openai import OpenAI
 from langchain_core.messages import BaseMessage, HumanMessage
 from langchain_core.prompts.chat import ChatPromptTemplate
 from langsmith import Client as Client
-from langsmith.schemas import DataType
+from langsmith.evaluation import run_evaluator
+from langsmith.schemas import DataType, Example, Run

 from langchain.chains.llm import LLMChain
 from langchain.evaluation import EvaluatorType
@@ -26,7 +27,15 @@ def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
     feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
     if not feedback:
         return
-    assert all([f.score == 1 for f in feedback])
+    assert all([bool(f.score) for f in feedback])
+
+
+@run_evaluator
+def not_empty(run: Run, example: Optional[Example] = None) -> dict:
+    return {
+        "score": run.outputs and next(iter(run.outputs.values())),
+        "key": "not_empty",
+    }


 @pytest.fixture
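
The not_empty evaluator added above replaces the LLM-graded CRITERIA evaluator in most of the tests that follow. As a rough, self-contained illustration of the same @run_evaluator pattern (the has_output name and its scoring rule are hypothetical, not part of the commit), a custom evaluator is just a function from a Run and optional Example to a feedback dict, passed to RunEvalConfig via custom_evaluators:

    from typing import Optional

    from langsmith.evaluation import run_evaluator
    from langsmith.schemas import Example, Run

    from langchain.smith import RunEvalConfig


    @run_evaluator
    def has_output(run: Run, example: Optional[Example] = None) -> dict:
        # Score 1 when the run produced any non-empty output value, else 0.
        outputs = run.outputs or {}
        return {"key": "has_output", "score": int(any(bool(v) for v in outputs.values()))}


    # Custom evaluators can be combined with, or used instead of, the built-in ones.
    eval_config = RunEvalConfig(custom_evaluators=[has_output])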
@@ -81,7 +90,9 @@ def test_chat_model(
     kv_dataset_name: str, eval_project_name: str, client: Client
 ) -> None:
     llm = ChatOpenAI(temperature=0)
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(
+        evaluators=[EvaluatorType.QA], custom_evaluators=[not_empty]
+    )
     with pytest.raises(ValueError, match="Must specify reference_key"):
         run_on_dataset(
             dataset_name=kv_dataset_name,
@@ -90,7 +101,7 @@ def test_chat_model(
             client=client,
         )
     eval_config = RunEvalConfig(
-        evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
+        evaluators=[EvaluatorType.QA],
         reference_key="some_output",
     )
     with pytest.raises(
@@ -109,9 +120,8 @@ def test_chat_model(
     run_on_dataset(
         client=client,
         dataset_name=kv_dataset_name,
-        llm_or_chain_factory=llm,
+        llm_or_chain_factory=input_mapper | llm,
         evaluation=eval_config,
-        input_mapper=input_mapper,
         project_name=eval_project_name,
         tags=["shouldpass"],
     )
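
Instead of passing input_mapper= to run_on_dataset, the tests above now compose the mapper into the target itself with the pipe operator; piping a plain function into a Runnable coerces it into a RunnableLambda that runs as the first step. A small sketch under that assumption (the mapper body and the fake chat model are illustrative, not taken from the test file):

    from langchain_community.chat_models.fake import FakeListChatModel
    from langchain_core.messages import HumanMessage


    def input_mapper(d: dict) -> list:
        # Map a key/value example onto the message list a chat model expects.
        return [HumanMessage(content=d["some_input"])]


    llm = FakeListChatModel(responses=["ok"])
    chain = input_mapper | llm  # the function is coerced to a RunnableLambda
    print(chain.invoke({"some_input": "hello"}).content)  # -> "ok"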
@@ -120,7 +130,7 @@ def test_chat_model(
 def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
     llm = OpenAI(temperature=0)
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA])
     with pytest.raises(ValueError, match="Must specify reference_key"):
         run_on_dataset(
             dataset_name=kv_dataset_name,
@@ -132,9 +142,7 @@ def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
         evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
         reference_key="some_output",
     )
-    with pytest.raises(
-        InputFormatError, match="Example inputs do not match language model"
-    ):
+    with pytest.raises(InputFormatError, match="Example inputs"):
         run_on_dataset(
             dataset_name=kv_dataset_name,
             llm_or_chain_factory=llm,
@@ -148,9 +156,8 @@ def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
     run_on_dataset(
         client=client,
         dataset_name=kv_dataset_name,
-        llm_or_chain_factory=llm,
+        llm_or_chain_factory=input_mapper | llm,
         evaluation=eval_config,
-        input_mapper=input_mapper,
         project_name=eval_project_name,
         tags=["shouldpass"],
     )
@@ -172,9 +179,7 @@ def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
         evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
         reference_key="some_output",
     )
-    with pytest.raises(
-        InputFormatError, match="Example inputs do not match chain input keys"
-    ):
+    with pytest.raises(InputFormatError, match="Example inputs"):
         run_on_dataset(
             dataset_name=kv_dataset_name,
             llm_or_chain_factory=lambda: chain,
@@ -182,18 +187,8 @@ def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) -> None:
             client=client,
         )

-    def input_mapper(d: dict) -> dict:
-        return {"input": d["some_input"]}
-
-    with pytest.raises(
-        InputFormatError,
-        match=" match the chain's expected input keys.",
-    ):
-        run_on_dataset(
-            dataset_name=kv_dataset_name,
-            llm_or_chain_factory=lambda: input_mapper | chain,
-            client=client,
-            evaluation=eval_config,
-        )
+    eval_config = RunEvalConfig(
+        custom_evaluators=[not_empty],
+    )

     def right_input_mapper(d: dict) -> dict:
@@ -259,7 +254,7 @@ def test_chat_model_on_chat_dataset(
     chat_dataset_name: str, eval_project_name: str, client: Client
 ) -> None:
     llm = ChatOpenAI(temperature=0)
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(custom_evaluators=[not_empty])
     run_on_dataset(
         dataset_name=chat_dataset_name,
         llm_or_chain_factory=llm,
@@ -274,7 +269,7 @@ def test_llm_on_chat_dataset(
     chat_dataset_name: str, eval_project_name: str, client: Client
 ) -> None:
     llm = OpenAI(temperature=0)
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(custom_evaluators=[not_empty])
     run_on_dataset(
         dataset_name=chat_dataset_name,
         llm_or_chain_factory=llm,
@@ -337,7 +332,7 @@ def test_chat_model_on_llm_dataset(
     llm_dataset_name: str, eval_project_name: str, client: Client
 ) -> None:
     llm = ChatOpenAI(temperature=0)
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(custom_evaluators=[not_empty])
     run_on_dataset(
         client=client,
         dataset_name=llm_dataset_name,
@@ -353,7 +348,7 @@ def test_llm_on_llm_dataset(
     llm_dataset_name: str, eval_project_name: str, client: Client
 ) -> None:
     llm = OpenAI(temperature=0)
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(custom_evaluators=[not_empty])
     run_on_dataset(
         client=client,
         dataset_name=llm_dataset_name,
@@ -431,7 +426,7 @@ def test_llm_on_kv_singleio_dataset(
     kv_singleio_dataset_name: str, eval_project_name: str, client: Client
 ) -> None:
     llm = OpenAI(temperature=0)
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(custom_evaluators=[not_empty])
     run_on_dataset(
         dataset_name=kv_singleio_dataset_name,
         llm_or_chain_factory=llm,
@@ -448,7 +443,7 @@ def test_chain_on_kv_singleio_dataset(
 ) -> None:
     llm = ChatOpenAI(temperature=0)
     chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(custom_evaluators=[not_empty])
     run_on_dataset(
         dataset_name=kv_singleio_dataset_name,
         llm_or_chain_factory=lambda: chain,
@@ -467,7 +462,7 @@ async def test_runnable_on_kv_singleio_dataset(
         ChatPromptTemplate.from_messages([("human", "{the wackiest input}")])
         | ChatOpenAI()
     )
-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(custom_evaluators=[not_empty])
     await arun_on_dataset(
         dataset_name=kv_singleio_dataset_name,
         llm_or_chain_factory=runnable,
@@ -496,7 +491,7 @@ async def test_arb_func_on_kv_singleio_dataset(
                 f"Expected message with content type string, got {content}"
             )

-    eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
+    eval_config = RunEvalConfig(custom_evaluators=[not_empty])
     await arun_on_dataset(
         dataset_name=kv_singleio_dataset_name,
         llm_or_chain_factory=my_func,