From c29fbede5940328ea27e409a8d645f31854df583 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Fri, 18 Aug 2023 10:08:39 -0700
Subject: [PATCH] Wfh/rm num repetitions (#9425)

Makes it hard to do test run comparison views and we'd probably want to
just run multiple runs right now
---
 .../smith/evaluation/runner_utils.py      | 136 ++++++++----------
 .../smith/evaluation/test_runner_utils.py |  19 +--
 2 files changed, 66 insertions(+), 89 deletions(-)

diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py
index 64139f95e30..9e06fcd65f4 100644
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -8,6 +8,7 @@ import inspect
 import itertools
 import logging
 import uuid
+import warnings
 from enum import Enum
 from typing import (
     Any,
@@ -662,7 +663,6 @@ async def _arun_chain(
 async def _arun_llm_or_chain(
     example: Example,
     llm_or_chain_factory: MCF,
-    n_repetitions: int,
     *,
     tags: Optional[List[str]] = None,
     callbacks: Optional[List[BaseCallbackHandler]] = None,
@@ -673,7 +673,6 @@
     Args:
         example: The example to run.
         llm_or_chain_factory: The Chain or language model constructor to run.
-        n_repetitions: The number of times to run the model on each example.
         tags: Optional tags to add to the run.
         callbacks: Optional callbacks to use during the run.
         input_mapper: Optional function to map the input to the expected format.
@@ -694,31 +693,28 @@
     chain_or_llm = (
         "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
     )
-    for _ in range(n_repetitions):
-        try:
-            if isinstance(llm_or_chain_factory, BaseLanguageModel):
-                output: Any = await _arun_llm(
-                    llm_or_chain_factory,
-                    example.inputs,
-                    tags=tags,
-                    callbacks=callbacks,
-                    input_mapper=input_mapper,
-                )
-            else:
-                chain = llm_or_chain_factory()
-                output = await _arun_chain(
-                    chain,
-                    example.inputs,
-                    tags=tags,
-                    callbacks=callbacks,
-                    input_mapper=input_mapper,
-                )
-            outputs.append(output)
-        except Exception as e:
-            logger.warning(
-                f"{chain_or_llm} failed for example {example.id}. Error: {e}"
+    try:
+        if isinstance(llm_or_chain_factory, BaseLanguageModel):
+            output: Any = await _arun_llm(
+                llm_or_chain_factory,
+                example.inputs,
+                tags=tags,
+                callbacks=callbacks,
+                input_mapper=input_mapper,
             )
-            outputs.append({"Error": str(e)})
+        else:
+            chain = llm_or_chain_factory()
+            output = await _arun_chain(
+                chain,
+                example.inputs,
+                tags=tags,
+                callbacks=callbacks,
+                input_mapper=input_mapper,
+            )
+        outputs.append(output)
+    except Exception as e:
+        logger.warning(f"{chain_or_llm} failed for example {example.id}. Error: {e}")
+        outputs.append({"Error": str(e)})
     if callbacks and previous_example_ids:
         for example_id, tracer in zip(previous_example_ids, callbacks):
             if hasattr(tracer, "example_id"):
@@ -822,7 +818,6 @@ async def _arun_on_examples(
     *,
     evaluation: Optional[RunEvalConfig] = None,
     concurrency_level: int = 5,
-    num_repetitions: int = 1,
     project_name: Optional[str] = None,
     verbose: bool = False,
     tags: Optional[List[str]] = None,
@@ -841,9 +836,6 @@
             independent calls on each example without carrying over state.
         evaluation: Optional evaluation configuration to use when evaluating
         concurrency_level: The number of async tasks to run concurrently.
-        num_repetitions: Number of times to run the model on each example.
-            This is useful when testing success rates or generating confidence
-            intervals.
         project_name: Project name to use when tracing runs.
             Defaults to {dataset_name}-{chain class name}-{datetime}.
         verbose: Whether to print progress.
@@ -873,7 +865,6 @@
         result = await _arun_llm_or_chain(
             example,
             wrapped_model,
-            num_repetitions,
             tags=tags,
             callbacks=callbacks,
             input_mapper=input_mapper,
@@ -983,7 +974,6 @@ def _run_chain(
 def _run_llm_or_chain(
     example: Example,
     llm_or_chain_factory: MCF,
-    n_repetitions: int,
     *,
     tags: Optional[List[str]] = None,
     callbacks: Optional[List[BaseCallbackHandler]] = None,
@@ -995,7 +985,6 @@
     Args:
         example: The example to run.
         llm_or_chain_factory: The Chain or language model constructor to run.
-        n_repetitions: The number of times to run the model on each example.
         tags: Optional tags to add to the run.
         callbacks: Optional callbacks to use during the run.

@@ -1016,32 +1005,31 @@
     chain_or_llm = (
         "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
     )
-    for _ in range(n_repetitions):
-        try:
-            if isinstance(llm_or_chain_factory, BaseLanguageModel):
-                output: Any = _run_llm(
-                    llm_or_chain_factory,
-                    example.inputs,
-                    callbacks,
-                    tags=tags,
-                    input_mapper=input_mapper,
-                )
-            else:
-                chain = llm_or_chain_factory()
-                output = _run_chain(
-                    chain,
-                    example.inputs,
-                    callbacks,
-                    tags=tags,
-                    input_mapper=input_mapper,
-                )
-            outputs.append(output)
-        except Exception as e:
-            logger.warning(
-                f"{chain_or_llm} failed for example {example.id} with inputs:"
-                f" {example.inputs}.\nError: {e}",
+    try:
+        if isinstance(llm_or_chain_factory, BaseLanguageModel):
+            output: Any = _run_llm(
+                llm_or_chain_factory,
+                example.inputs,
+                callbacks,
+                tags=tags,
+                input_mapper=input_mapper,
             )
-            outputs.append({"Error": str(e)})
+        else:
+            chain = llm_or_chain_factory()
+            output = _run_chain(
+                chain,
+                example.inputs,
+                callbacks,
+                tags=tags,
+                input_mapper=input_mapper,
+            )
+        outputs.append(output)
+    except Exception as e:
+        logger.warning(
+            f"{chain_or_llm} failed for example {example.id} with inputs:"
+            f" {example.inputs}.\nError: {e}",
+        )
+        outputs.append({"Error": str(e)})
     if callbacks and previous_example_ids:
         for example_id, tracer in zip(previous_example_ids, callbacks):
             if hasattr(tracer, "example_id"):
@@ -1055,7 +1043,6 @@ def _run_on_examples(
     llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
     *,
     evaluation: Optional[RunEvalConfig] = None,
-    num_repetitions: int = 1,
     project_name: Optional[str] = None,
     verbose: bool = False,
     tags: Optional[List[str]] = None,
@@ -1073,9 +1060,6 @@
             over the dataset. The Chain constructor is used to permit
             independent calls on each example without carrying over state.
         evaluation: Optional evaluation configuration to use when evaluating
-        num_repetitions: Number of times to run the model on each example.
-            This is useful when testing success rates or generating confidence
-            intervals.
         project_name: Name of the project to store the traces in.
             Defaults to {dataset_name}-{chain class name}-{datetime}.
         verbose: Whether to print progress.
@@ -1110,7 +1094,6 @@ def _run_on_examples(
         result = _run_llm_or_chain(
             example,
             wrapped_model,
-            num_repetitions,
             tags=tags,
             callbacks=callbacks,
             input_mapper=input_mapper,
@@ -1158,11 +1141,11 @@ async def arun_on_dataset(
     *,
     evaluation: Optional[RunEvalConfig] = None,
     concurrency_level: int = 5,
-    num_repetitions: int = 1,
     project_name: Optional[str] = None,
     verbose: bool = False,
     tags: Optional[List[str]] = None,
     input_mapper: Optional[Callable[[Dict], Any]] = None,
+    **kwargs: Any,
 ) -> Dict[str, Any]:
     """
     Asynchronously run the Chain or language model on a dataset
@@ -1177,9 +1160,6 @@
             independent calls on each example without carrying over state.
         evaluation: Optional evaluation configuration to use when evaluating
         concurrency_level: The number of async tasks to run concurrently.
-        num_repetitions: Number of times to run the model on each example.
-            This is useful when testing success rates or generating confidence
-            intervals.
         project_name: Name of the project to store the traces in.
             Defaults to {dataset_name}-{chain class name}-{datetime}.
         verbose: Whether to print progress.
@@ -1274,6 +1254,13 @@ async def arun_on_dataset(
             evaluation=evaluation_config,
         )
     """  # noqa: E501
+    if kwargs:
+        warnings.warn(
+            "The following arguments are deprecated and will "
+            "be removed in a future release: "
+            f"{kwargs.keys()}.",
+            DeprecationWarning,
+        )
     wrapped_model, project_name, dataset, examples = _prepare_eval_run(
         client, dataset_name, llm_or_chain_factory, project_name
     )
@@ -1282,7 +1269,6 @@
         examples,
         wrapped_model,
         concurrency_level=concurrency_level,
-        num_repetitions=num_repetitions,
         project_name=project_name,
         verbose=verbose,
         tags=tags,
@@ -1323,12 +1309,12 @@ def run_on_dataset(
     llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
     *,
     evaluation: Optional[RunEvalConfig] = None,
-    num_repetitions: int = 1,
     concurrency_level: int = 5,
     project_name: Optional[str] = None,
     verbose: bool = False,
     tags: Optional[List[str]] = None,
     input_mapper: Optional[Callable[[Dict], Any]] = None,
+    **kwargs: Any,
 ) -> Dict[str, Any]:
     """
     Run the Chain or language model on a dataset and store traces
@@ -1344,9 +1330,6 @@
         evaluation: Configuration for evaluators to run on the
             results of the chain
         concurrency_level: The number of async tasks to run concurrently.
-        num_repetitions: Number of times to run the model on each example.
-            This is useful when testing success rates or generating confidence
-            intervals.
         project_name: Name of the project to store the traces in.
             Defaults to {dataset_name}-{chain class name}-{datetime}.
         verbose: Whether to print progress.
@@ -1441,6 +1424,13 @@ def run_on_dataset(
             evaluation=evaluation_config,
         )
     """  # noqa: E501
+    if kwargs:
+        warnings.warn(
+            "The following arguments are deprecated and "
+            "will be removed in a future release: "
+            f"{kwargs.keys()}.",
+            DeprecationWarning,
+        )
     wrapped_model, project_name, dataset, examples = _prepare_eval_run(
         client, dataset_name, llm_or_chain_factory, project_name
     )
@@ -1449,7 +1439,6 @@ def run_on_dataset(
             client,
             examples,
             wrapped_model,
-            num_repetitions=num_repetitions,
             project_name=project_name,
             verbose=verbose,
             tags=tags,
@@ -1464,7 +1453,6 @@ def run_on_dataset(
                 examples,
                 wrapped_model,
                 concurrency_level=concurrency_level,
-                num_repetitions=num_repetitions,
                 project_name=project_name,
                 verbose=verbose,
                 tags=tags,
diff --git a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
index de7f9e9434c..5c34f9032fa 100644
--- a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
+++ b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
@@ -181,15 +181,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
         assert "the wrong input" in inputs
         return {"the right input": inputs["the wrong input"]}

-    result = _run_llm_or_chain(
-        example, lambda: mock_chain, n_repetitions=1, input_mapper=input_mapper
-    )
+    result = _run_llm_or_chain(example, lambda: mock_chain, input_mapper=input_mapper)
     assert len(result) == 1
     assert result[0] == {"output": "2", "the right input": "1"}
     bad_result = _run_llm_or_chain(
         example,
         lambda: mock_chain,
-        n_repetitions=1,
     )
     assert len(bad_result) == 1
     assert "Error" in bad_result[0]
@@ -200,9 +197,7 @@ def test_run_llm_or_chain_with_input_mapper() -> None:
         return "the right input"

     mock_llm = FakeLLM(queries={"the right input": "somenumber"})
-    result = _run_llm_or_chain(
-        example, mock_llm, n_repetitions=1, input_mapper=llm_input_mapper
-    )
+    result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper)
     assert len(result) == 1
     llm_result = result[0]
     assert isinstance(llm_result, str)
@@ -302,14 +297,11 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     async def mock_arun_chain(
         example: Example,
         llm_or_chain: Union[BaseLanguageModel, Chain],
-        n_repetitions: int,
         tags: Optional[List[str]] = None,
         callbacks: Optional[Any] = None,
         **kwargs: Any,
     ) -> List[Dict[str, Any]]:
-        return [
-            {"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
-        ]
+        return [{"result": f"Result for example {example.id}"}]

     def mock_create_project(*args: Any, **kwargs: Any) -> Any:
         proj = mock.MagicMock()
@@ -327,20 +319,17 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     client = Client(api_url="http://localhost:1984", api_key="123")
     chain = mock.MagicMock()
     chain.input_keys = ["foothing"]
-    num_repetitions = 3
     results = await arun_on_dataset(
         dataset_name="test",
         llm_or_chain_factory=lambda: chain,
         concurrency_level=2,
         project_name="test_project",
-        num_repetitions=num_repetitions,
         client=client,
     )

     expected = {
         uuid_: [
-            {"result": f"Result for example {uuid.UUID(uuid_)}"}
-            for _ in range(num_repetitions)
+            {"result": f"Result for example {uuid.UUID(uuid_)}"} for _ in range(1)
         ]
         for uuid_ in uuids
     }
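
Migration note (editorial, not part of the patch): with num_repetitions removed, repeated measurements come from launching one evaluation run per repetition, and a stale num_repetitions= keyword now falls into **kwargs where it only triggers the DeprecationWarning added above. Below is a minimal caller-side sketch, assuming a configured LangSmith Client, an existing dataset named "my-dataset", and a hypothetical chain_factory callable; those names are illustrative and do not come from the patch.

    import warnings

    from langsmith import Client
    from langchain.smith import RunEvalConfig, run_on_dataset

    client = Client()  # assumes LangSmith credentials are set in the environment
    evaluation = RunEvalConfig(evaluators=["qa"])  # illustrative evaluator choice


    def chain_factory():
        """Hypothetical factory; build and return a fresh chain per example here."""
        ...


    # Before this patch: run_on_dataset(..., num_repetitions=3).
    # After it: launch one traced project per repetition so runs stay comparable.
    for i in range(3):
        run_on_dataset(
            client=client,
            dataset_name="my-dataset",              # hypothetical dataset name
            llm_or_chain_factory=chain_factory,
            evaluation=evaluation,
            project_name=f"my-dataset-repeat-{i}",  # one project per run
        )

    # Passing the removed keyword does not raise a TypeError: it is absorbed by
    # **kwargs and run_on_dataset emits a DeprecationWarning (see the hunks
    # above); the value itself is ignored.
    with warnings.catch_warnings():
        warnings.simplefilter("always", DeprecationWarning)
        run_on_dataset(
            client=client,
            dataset_name="my-dataset",
            llm_or_chain_factory=chain_factory,
            num_repetitions=3,  # deprecated: only triggers the warning
        )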