From b9669444fc49c976da27f8b48ed43ab2cb19b92b Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 12 Mar 2023 16:36:15 -0700 Subject: [PATCH] cr --- docs/modules/chains/examples/sqlite.ipynb | 2 +- docs/use_cases/evaluation.rst | 75 ++++++++++++++++++++++- langchain/evaluation/loading.py | 4 +- langchain/evaluation/qa/evaluator.py | 59 ------------------ 4 files changed, 76 insertions(+), 64 deletions(-) delete mode 100644 langchain/evaluation/qa/evaluator.py diff --git a/docs/modules/chains/examples/sqlite.ipynb b/docs/modules/chains/examples/sqlite.ipynb index c85d3f1059e..b598a2a89e9 100644 --- a/docs/modules/chains/examples/sqlite.ipynb +++ b/docs/modules/chains/examples/sqlite.ipynb @@ -675,7 +675,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/use_cases/evaluation.rst b/docs/use_cases/evaluation.rst index 9517bb23711..cfea42aa698 100644 --- a/docs/use_cases/evaluation.rst +++ b/docs/use_cases/evaluation.rst @@ -1,9 +1,80 @@ Evaluation ============== -Generative models are notoriously hard to evaluate with traditional metrics. One new way of evaluating them is using language models themselves to do the evaluation. LangChain provides some prompts/chains for assisting in this. +This section of documentation covers how we approach and think about evaluation in LangChain. +Both evaluation of internal chains/agents, but also how we would recommend people building on top of LangChain approach evaluation. -The examples here all highlight how to use language models to assist in evaluation of themselves. +The Problem +----------- + +It can be really hard to evaluate LangChain chains and agents. +There are two main reasons for this: + +**# 1: Lack of data** + +You generally don't have a ton of data to evaluate your chains/agents over before starting a project. 
+This is usually because Large Language Models (the core of most chains/agents) are terrific few-shot and zero-shot learners,
+meaning you are almost always able to get started on a particular task (text-to-SQL, question answering, etc) without
+a large dataset of examples.
+This is in stark contrast to traditional machine learning where you had to first collect a bunch of datapoints
+before even getting started using a model.
+
+**# 2: Lack of metrics**
+
+Most chains/agents are performing tasks for which there are not very good metrics to evaluate performance.
+For example, one of the most common use cases is generating text of some form.
+Evaluating generated text is much more complicated than evaluating a classification prediction, or a numeric prediction.
+
+The Solution
+------------
+
+LangChain attempts to tackle both of those issues.
+What we have so far are initial passes at solutions - we do not think we have a perfect solution.
+So we very much welcome feedback, contributions, integrations, and thoughts on this.
+
+Here is what we have for each problem so far:
+
+**# 1: Lack of data**
+
+We have started `LangChainDatasets`_, a Community space on Hugging Face.
+We intend this to be a collection of open source datasets for evaluating common chains and agents.
+We have contributed five datasets of our own to start, but we fully intend this to be a community effort.
+In order to contribute a dataset, you simply need to join the community and then you will be able to upload datasets.
+
+**# 2: Lack of metrics**
+
+We have two solutions to the lack of metrics.
+
+The first solution is to use no metrics, and rather just rely on looking at results by eye to get a sense for how the chain/agent is performing.
+To assist in this, we have developed (and will continue to develop) `tracing <../tracing.md>`_, a UI-based visualizer of your chain and agent runs.
+
+The second solution we recommend is to use Language Models themselves to evaluate outputs.
+For this we have a few different chains and prompts aimed at tackling this issue.
+
+The Examples
+------------
+
+We have created a bunch of examples combining the above two solutions to show how we internally evaluate chains and agents when we are developing.
+In addition to the examples we've curated, we also highly welcome contributions here.
+To facilitate that, we've included a `template notebook <./evaluation/benchmarking_template.html>`_ for community members to use to build their own examples.
+
+The existing examples we have are:
+
+`Question Answering (State of Union) <./evaluation/qa_benchmarking_sota.html>`_: A notebook showing evaluation of a question-answering task over a State-of-the-Union address.
+
+`Question Answering (Paul Graham Essay) <./evaluation/qa_benchmarking_pg.html>`_: A notebook showing evaluation of a question-answering task over a Paul Graham essay.
+
+`SQL Question Answering (Chinook) <./evaluation/sql_qa_benchmarking_chinook.html>`_: A notebook showing evaluation of a question-answering task over a SQL database (the Chinook database).
+
+`Agent Vectorstore <./evaluation/vectordb_agent_qa_benchmarking.html>`_: A notebook showing evaluation of an agent doing question answering while routing between two different vector databases.
+
+`Agent Search + Calculator <./evaluation/agent_benchmarking.html>`_: A notebook showing evaluation of an agent doing question answering using a Search engine and a Calculator as tools.
+
+
+Other Examples
+--------------
+
+In addition, we also have some more generic resources for evaluation.
+
+`Question Answering <./evaluation/question_answering.html>`_: An overview of LLMs aimed at evaluating question answering systems in general.
diff --git a/langchain/evaluation/loading.py b/langchain/evaluation/loading.py index 7d481fe58fe..613e261303b 100644 --- a/langchain/evaluation/loading.py +++ b/langchain/evaluation/loading.py @@ -1,8 +1,8 @@ -from typing import List, Dict +from typing import Dict, List def load_dataset(uri: str) -> List[Dict]: from datasets import load_dataset dataset = load_dataset(f"LangChainDatasets/{uri}") - return [d for d in dataset['train']] + return [d for d in dataset["train"]] diff --git a/langchain/evaluation/qa/evaluator.py b/langchain/evaluation/qa/evaluator.py deleted file mode 100644 index 2acaba5d20f..00000000000 --- a/langchain/evaluation/qa/evaluator.py +++ /dev/null @@ -1,59 +0,0 @@ -from pydantic import BaseModel -from typing import Optional, List -from langchain.chains.base import Chain - - - -class QADataPoint(BaseModel): - question: str - answer: str - error: Optional[str] - prediction: Optional[str] - ai_grade: Optional[str] - - -def predict_qa(chain: Chain, datapoints: List[QADataPoint], question_key: str="question", prediction_key: Optional[str] = None, silent_errors: bool = False) -> None: - for data in datapoints: - try: - prediction_dict = chain({question_key: data.question}, return_only_outputs=True) - except Exception as e: - if silent_errors: - data.error = str(e) - continue - else: - raise e - if prediction_key is not None: - data.prediction = prediction_dict[prediction_key] - elif len(prediction_dict) == 1: - data.prediction = list(prediction_dict.values())[0] - else: - raise ValueError( - "No prediction key was specified, and got multiple outputs so not " - f"sure which one to use: {prediction_dict}. 
Please either " - f"specify a `prediction_key` or change the chain to return " - f"a single output" - ) - -def eval_qa( - eval_chain: Chain, - datapoints: List[QADataPoint], - question_key: str = "query", - answer_key: str = "answer", - prediction_key: str = "result", - grade_key: str = "text" - ) -> None: - """Evaluate question answering examples and predictions.""" - data_to_grade = [d for d in datapoints if d.prediction is not None] - inputs = [ - { - question_key: data.question, - answer_key: data.answer, - prediction_key: data.prediction, - } - for data in data_to_grade - ] - - grades = eval_chain.apply(inputs) - for i, grade in enumerate(grades): - data_to_grade[i].ai_grade = grade[grade_key] - \ No newline at end of file