diff --git a/docs/use_cases/evaluation/evaluating_traced_examples.ipynb b/docs/use_cases/evaluation/evaluating_traced_examples.ipynb new file mode 100644 index 00000000000..ce9f3e54dab --- /dev/null +++ b/docs/use_cases/evaluation/evaluating_traced_examples.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1a4596ea-a631-416d-a2a4-3577c140493d", + "metadata": {}, + "source": [ + "# Running Chains on Traced Datasets\n", + "\n", + "Developing applications with language models can be uniquely challenging. To manage this complexity and ensure reliable performance, LangChain provides tracing and evaluation functionality. This notebook demonstrates how to run Chains, which are language model functions, on previously captured datasets or traces. Some common use cases for this approach include:\n", + "\n", + "- Running an evaluation chain to grade previous runs.\n", + "- Comparing different chains, LLMs, and agents on traced datasets.\n", + "- Executing a stochastic chain multiple times over a dataset to generate metrics before deployment.\n", + "\n", + "Please note that this notebook assumes you have LangChain+ tracing running in the background. 
To set it up, follow the [tracing directions here](..\\/..\\/tracing\\/local_installation.md).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "904db9a5-f387-4a57-914c-c8af8d39e249", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.client import LangChainPlusClient\n", + "\n", + "client = LangChainPlusClient(\n", + " api_url=\"http://localhost:8000\",\n", + " api_key=None,\n", + " #tenant_id=\"\", Required for connecting to a hosted tenant\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6f7775bf-b8a9-4236-8d21-a17611bf2c4f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'309d7a02-99cc-4a5f-abf0-2b389747bb95'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.tenant_id" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "814c206c-717a-4847-836f-5fedf0e55326", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain import OpenAI\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.schema import HumanMessage, AIMessage, SystemMessage" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "7a02d0ad-b997-4c2f-a145-15051592ebb4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LLMResult(generations=[[Generation(text=\"\\n\\nMy name is Adam and I'm excited to be part of this community. I'm a big fan of technology, and I'm always looking for ways to learn and grow. I'm also passionate about helping others, and I'm always looking for new opportunities to do so. 
I look forward to connecting with all of you!\", generation_info={'finish_reason': 'stop', 'logprobs': None})]], llm_output={'token_usage': {'completion_tokens': 67, 'prompt_tokens': 4, 'total_tokens': 71}, 'model_name': 'text-davinci-003'})" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "llm = OpenAI()\n", + "llm.generate([\"Hello, world!\"], callbacks=[tracer])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "12894f25-c308-48ff-855b-c719122c6c5f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Failed to load default session, using empty session: 0\n", + "WARNING:root:Failed to load default session, using empty session: 0\n" + ] + } + ], + "source": [ + "from langchain.callbacks.tracers.langchain import LangChainTracerV2\n", + "tracer = LangChainTracerV2(session_name='default')\n", + "tracer.load_session(\"default\")\n", + "model = ChatOpenAI() # callbacks=[tracer])\n", + "messages = [[\n", + " HumanMessage(content=\"I am human roar\"),\n", + " AIMessage(content=\"I am AI beep boop\"),\n", + " SystemMessage(content=\"I am a system message\"),\n", + " ]\n", + "]\n", + "results = model.generate(messages=messages, callbacks=[tracer])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "479756b0-1158-42d2-af95-a57870020c6b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "db79dea2-fbaa-4c12-9083-f6154b51e2d3", + "metadata": { + "tags": [] + }, + "source": [ + "## Seed an example dataset\n", + "\n", + "If you have been using LangChainPlus already, you may have datasets available. 
To view all saved datasets, run:\n", + "\n", + "```\n", + "datasets = client.list_datasets()\n", + "datasets\n", + "```\n", + "Datasets can be created in a number of ways, most often by collecting `Run`s captured through the LangChain tracing API.\n", + "\n", + "However, this notebook assumes you're running locally for the first time, so we'll start by uploading an example evaluation dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1baa677c-5642-4378-8e01-3aa1647f19d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# !pip install datasets > /dev/null\n", + "# !pip install pandas > /dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60d14593-c61f-449f-a38f-772ca43707c2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from langchain.evaluation.loading import load_dataset\n", + "\n", + "dataset = load_dataset(\"agent-search-calculator\")\n", + "df = pd.DataFrame(dataset, columns=[\"question\", \"answer\"])\n", + "df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6dfe2e58-41bf-4deb-8fe7-944ce0fa3548", + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f27e45f1-e299-4de8-a538-ee1272ac5024", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dataset_name = f\"calculator_example.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52a7ea76-79ca-4765-abf7-231e884040d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n", + " dataset = client.upload_dataframe(df, \n", + " name=dataset_name,\n", + " description=\"Acalculator example dataset\",\n", + " input_keys=[\"input\"],\n", + " 
output_keys=[\"output\"],\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "07885b10", + "metadata": { + "tags": [] + }, + "source": [ + "## Running a Chain on a Traced Dataset\n", + "\n", + "Once you have a dataset, you can run a chain over it to see its results. The run traces will automatically be associated with the dataset for easy attribution and analysis.\n", + "\n", + "**First, we'll define the chain we wish to run over the dataset.**\n", + "\n", + "In this case, we're using an agent, but it can be any simple chain." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2b59104-b90e-466a-b7ea-c5bd0194263b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.agents import initialize_agent, load_tools\n", + "from langchain.agents import AgentType\n", + "\n", + "llm = ChatOpenAI(temperature=0)\n", + "tools = load_tools(['serpapi', 'llm-math'], llm=llm)\n", + "agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False)" + ] + }, + { + "cell_type": "markdown", + "id": "84094a4a-1d76-461c-bc37-8c537939b466", + "metadata": {}, + "source": [ + "**Now we're ready to run the chain!**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "chain_results = await client.arun_on_dataset(\n", + " dataset_name=dataset_name,\n", + " chain=agent,\n", + " num_workers=5, # Optional, sets the number of examples to run at a time\n", + " session_name=\"Calculator Dataset Runs\", # Optional. 
Will be sent to the 'default' session otherwise\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d2737458-b20c-4288-8790-1f4a8d237b2a", + "metadata": {}, + "source": [ + "## Reviewing the Chain Results\n", + "\n", + "The method called above returns a dictionary mapping Example IDs to the output of the chain.\n", + "You can directly inspect the results below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49bc839d-8320-446c-982a-391c55bc9e2a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "chain_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "136db492-d6ca-4215-96f9-439c23538241", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# You can navigate to the UI by clicking on the link below\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b05f1aab-6bc9-4963-9059-c5cd9c656992", + "metadata": {}, + "outputs": [], + "source": [ + "# You can review all the chain runs over a given example as follows:\n", + "example_id = next(iter(chain_results))\n", + "example = client.read_example(example_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d28b7752-7abb-4b4d-8575-1e98d6baf1b1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# For example, view the chain runs on this example\n", + "example.chain_runs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}