From b9669444fc49c976da27f8b48ed43ab2cb19b92b Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 12 Mar 2023 16:36:15 -0700 Subject: [PATCH] cr --- docs/modules/chains/examples/sqlite.ipynb | 2 +- docs/use_cases/evaluation.rst | 75 ++++++++++++++++++++++- langchain/evaluation/loading.py | 4 +- langchain/evaluation/qa/evaluator.py | 59 ------------------ 4 files changed, 76 insertions(+), 64 deletions(-) delete mode 100644 langchain/evaluation/qa/evaluator.py diff --git a/docs/modules/chains/examples/sqlite.ipynb b/docs/modules/chains/examples/sqlite.ipynb index c85d3f1059e..b598a2a89e9 100644 --- a/docs/modules/chains/examples/sqlite.ipynb +++ b/docs/modules/chains/examples/sqlite.ipynb @@ -675,7 +675,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/use_cases/evaluation.rst b/docs/use_cases/evaluation.rst index 9517bb23711..cfea42aa698 100644 --- a/docs/use_cases/evaluation.rst +++ b/docs/use_cases/evaluation.rst @@ -1,9 +1,80 @@ Evaluation ============== -Generative models are notoriously hard to evaluate with traditional metrics. One new way of evaluating them is using language models themselves to do the evaluation. LangChain provides some prompts/chains for assisting in this. +This section of documentation covers how we approach and think about evaluation in LangChain. +Both evaluation of internal chains/agents, but also how we would recommend people building on top of LangChain approach evaluation. -The examples here all highlight how to use language models to assist in evaluation of themselves. +The Problem +----------- + +It can be really hard to evaluate LangChain chains and agents. +There are two main reasons for this: + +**# 1: Lack of data** + +You generally don't have a ton of data to evaluate your chains/agents over before starting a project. 
+This is usually because Large Language Models (the core of most chains/agents) are terrific few-shot and zero-shot learners,
+meaning you are almost always able to get started on a particular task (text-to-SQL, question answering, etc) without
+a large dataset of examples.
+This is in stark contrast to traditional machine learning where you had to first collect a bunch of datapoints
+before even getting started using a model.
+
+**# 2: Lack of metrics**
+
+Most chains/agents are performing tasks for which there are not very good metrics to evaluate performance.
+For example, one of the most common use cases is generating text of some form.
+Evaluating generated text is much more complicated than evaluating a classification prediction, or a numeric prediction.
+
+The Solution
+------------
+
+LangChain attempts to tackle both of those issues.
+What we have so far are initial passes at solutions - we do not think we have a perfect solution.
+So we very much welcome feedback, contributions, integrations, and thoughts on this.
+
+Here is what we have for each problem so far:
+
+**# 1: Lack of data**
+
+We have started `LangChainDatasets`_, a Community space on Hugging Face.
+We intend this to be a collection of open source datasets for evaluating common chains and agents.
+We have contributed five datasets of our own to start, but we fully intend this to be a community effort.
+In order to contribute a dataset, you simply need to join the community and then you will be able to upload datasets.
+
+**# 2: Lack of metrics**
+
+We have two solutions to the lack of metrics.
+
+The first solution is to use no metrics, and rather just rely on looking at results by eye to get a sense for how the chain/agent is performing.
+To assist in this, we have developed (and will continue to develop) `tracing <../tracing.md>`_, a UI-based visualizer of your chain and agent runs.
+
+The second solution we recommend is to use Language Models themselves to evaluate outputs.
+For this we have a few different chains and prompts aimed at tackling this issue.
+
+The Examples
+------------
+
+We have created a bunch of examples combining the above two solutions to show how we internally evaluate chains and agents when we are developing.
+In addition to the examples we've curated, we also highly welcome contributions here.
+To facilitate that, we've included a `template notebook <./evaluation/benchmarking_template.html>`_ for community members to use to build their own examples.
+
+The existing examples we have are:
+
+`Question Answering (State of Union) <./evaluation/qa_benchmarking_sota.html>`_: A notebook showing evaluation of a question-answering task over a State-of-the-Union address.
+
+`Question Answering (Paul Graham Essay) <./evaluation/qa_benchmarking_pg.html>`_: A notebook showing evaluation of a question-answering task over a Paul Graham essay.
+
+`SQL Question Answering (Chinook) <./evaluation/sql_qa_benchmarking_chinook.html>`_: A notebook showing evaluation of a question-answering task over a SQL database (the Chinook database).
+
+`Agent Vectorstore <./evaluation/vectordb_agent_qa_benchmarking.html>`_: A notebook showing evaluation of an agent doing question answering while routing between two different vector databases.
+
+`Agent Search + Calculator <./evaluation/agent_benchmarking.html>`_: A notebook showing evaluation of an agent doing question answering using a Search engine and a Calculator as tools.
+
+
+Other Examples
+--------------
+
+In addition, we also have some more generic resources for evaluation.
+
+`Question Answering <./evaluation/question_answering.html>`_: An overview of LLMs aimed at evaluating question answering systems in general.
diff --git a/langchain/evaluation/loading.py b/langchain/evaluation/loading.py index 7d481fe58fe..613e261303b 100644 --- a/langchain/evaluation/loading.py +++ b/langchain/evaluation/loading.py @@ -1,8 +1,8 @@ -from typing import List, Dict +from typing import Dict, List def load_dataset(uri: str) -> List[Dict]: from datasets import load_dataset dataset = load_dataset(f"LangChainDatasets/{uri}") - return [d for d in dataset['train']] + return [d for d in dataset["train"]] diff --git a/langchain/evaluation/qa/evaluator.py b/langchain/evaluation/qa/evaluator.py deleted file mode 100644 index 2acaba5d20f..00000000000 --- a/langchain/evaluation/qa/evaluator.py +++ /dev/null @@ -1,59 +0,0 @@ -from pydantic import BaseModel -from typing import Optional, List -from langchain.chains.base import Chain - - - -class QADataPoint(BaseModel): - question: str - answer: str - error: Optional[str] - prediction: Optional[str] - ai_grade: Optional[str] - - -def predict_qa(chain: Chain, datapoints: List[QADataPoint], question_key: str="question", prediction_key: Optional[str] = None, silent_errors: bool = False) -> None: - for data in datapoints: - try: - prediction_dict = chain({question_key: data.question}, return_only_outputs=True) - except Exception as e: - if silent_errors: - data.error = str(e) - continue - else: - raise e - if prediction_key is not None: - data.prediction = prediction_dict[prediction_key] - elif len(prediction_dict) == 1: - data.prediction = list(prediction_dict.values())[0] - else: - raise ValueError( - "No prediction key was specified, and got multiple outputs so not " - f"sure which one to use: {prediction_dict}. 
Please either " - f"specify a `prediction_key` or change the chain to return " - f"a single output" - ) - -def eval_qa( - eval_chain: Chain, - datapoints: List[QADataPoint], - question_key: str = "query", - answer_key: str = "answer", - prediction_key: str = "result", - grade_key: str = "text" - ) -> None: - """Evaluate question answering examples and predictions.""" - data_to_grade = [d for d in datapoints if d.prediction is not None] - inputs = [ - { - question_key: data.question, - answer_key: data.answer, - prediction_key: data.prediction, - } - for data in data_to_grade - ] - - grades = eval_chain.apply(inputs) - for i, grade in enumerate(grades): - data_to_grade[i].ai_grade = grade[grade_key] - \ No newline at end of file