mirror of
https://github.com/hwchase17/langchain.git
synced 2026-03-18 11:07:36 +00:00
375 lines
12 KiB
Jupyter Notebook
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1a4596ea-a631-416d-a2a4-3577c140493d",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Running chains on Datasets"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "185fc992-7472-415b-8eda-cb4e11c9068f",
|
|
"metadata": {},
|
|
"source": [
|
|
"Some setup to upload the dataset to LangChainPlus."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "54b4cfb2-2a4b-4017-a474-b088023ea3ec",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Install the HF `datasets` package; %pip (rather than !pip) guarantees the\n",
"# install targets this kernel's environment, and -q replaces the /dev/null redirect.\n",
"%pip install -q datasets"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "443eacbd",
|
|
"metadata": {},
|
|
"source": [
|
|
"You may already have a dataset in LangChainPlus, but we'll upload this one as an example."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "e360961b-65e9-4fd6-b609-916bae62ff72",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
"from io import BytesIO\n",
"from typing import Optional, Sequence, Tuple\n",
"\n",
"import pandas as pd\n",
"import requests\n",
"\n",
"\n",
"def upload_dataframe(\n",
"    base_url: str,\n",
"    df: pd.DataFrame,\n",
"    input_keys: Sequence[str],\n",
"    output_keys: Sequence[str],\n",
"    name: str,\n",
"    description: str,\n",
"    auth: Optional[dict] = None,\n",
"    timeout: Optional[float] = None,\n",
") -> Tuple[int, dict]:\n",
"    \"\"\"Upload ``df`` as a CSV dataset to the ``/datasets/upload`` endpoint.\n",
"\n",
"    Args:\n",
"        base_url: Root URL of the LangChainPlus API (no trailing slash).\n",
"        df: Frame containing both the input and output columns.\n",
"        input_keys: Column names treated as chain inputs.\n",
"        output_keys: Column names treated as expected outputs.\n",
"        name: Dataset name; the server receives the file as ``<name>.csv``.\n",
"        description: Human-readable description stored with the dataset.\n",
"        auth: Optional requests-style auth object/tuple.\n",
"        timeout: Optional requests timeout in seconds. ``None`` preserves the\n",
"            original wait-forever behavior; pass a value to avoid hanging.\n",
"\n",
"    Returns:\n",
"        ``(status_code, parsed_json_body)`` from the server response.\n",
"        NOTE(review): ``response.json()`` raises if the server returns a\n",
"        non-JSON error body.\n",
"    \"\"\"\n",
"    # Serialize in memory; the endpoint expects a multipart CSV upload.\n",
"    buffer = BytesIO()\n",
"    df.to_csv(buffer, index=False)\n",
"    buffer.seek(0)  # rewind so requests streams the file from the start\n",
"    files = {\"file\": (f\"{name}.csv\", buffer)}\n",
"    data = {\n",
"        # The endpoint takes comma-separated column lists, not JSON arrays.\n",
"        \"input_keys\": \",\".join(input_keys),\n",
"        \"output_keys\": \",\".join(output_keys),\n",
"        \"description\": description,\n",
"    }\n",
"    response = requests.post(\n",
"        base_url + \"/datasets/upload\", auth=auth, data=data, files=files, timeout=timeout\n",
"    )\n",
"    return response.status_code, response.json()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "79b4605d-0adc-49a6-b542-5b3fe22fa020",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"endpoint = os.getenv(\"LANGCHAIN_ENDPOINT\", \"http://localhost:8000\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "69af24a2-9892-408d-91ac-6b265b9c903c",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Found cached dataset json (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--agent-search-calculator-8a025c0ce5fb99d2/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "e8c0711e6f3e40909da0459419166960",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
" 0%| | 0/1 [00:00<?, ?it/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from langchain.evaluation.loading import load_dataset\n",
"# Pull the example dataset from the LangChainDatasets hub (cached locally by HF datasets).\n",
"dataset = load_dataset(\"agent-search-calculator\")\n",
"df = pd.DataFrame(dataset)\n",
"# Drop the intermediate agent steps; only question/answer pairs are uploaded.\n",
"del df['steps']\n",
"# NOTE(review): positional rename — assumes the two remaining columns are\n",
"# (question, answer) in that order after the delete; confirm against the dataset.\n",
"df.columns = ['input', 'output']\n",
"dataset_name = \"agent-search-calculator-dataset-test\"\n",
"# Upload via the helper defined above; only the HTTP status is kept here.\n",
"response_code, _ = upload_dataframe(endpoint,df, [\"input\"], [\"output\"], dataset_name, \"A math dataset\") "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "07885b10",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Running a Chain on a Dataset\n",
|
|
"The actual \"running on a dataset\" logic starts here"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "b7f91628-1336-422e-9571-a26cb8bed3ae",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"dataset = requests.get(endpoint +\"/datasets\", params={\"name\": f\"{dataset_name}.csv\"}).json()[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "c0a2eaf4-8021-4d43-bd73-a694b183b9b9",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import asyncio\n",
"import logging\n",
"\n",
"from langchain.callbacks.tracers.langchain import LangChainTracer\n",
"from langchain.chains.base import Chain\n",
"\n",
"logger = logging.getLogger(__name__)\n",
"\n",
"\n",
"async def arun_chain(\n",
"    example: dict, langchain_tracer: LangChainTracer, chain: Chain\n",
") -> dict:\n",
"    \"\"\"Run the chain asynchronously on a single dataset example.\n",
"\n",
"    The tracer's ``example_id`` is set for the duration of the run so the\n",
"    resulting trace is linked to this example, and is always cleared in the\n",
"    ``finally`` block, even on failure. Errors are logged and returned as\n",
"    ``{\"Error\": str(e)}`` instead of raised, so one bad example does not\n",
"    abort the whole batch.\n",
"\n",
"    NOTE(review): the success path returns whatever ``chain.arun`` returns\n",
"    (often a str), not a dict — annotation kept for interface compatibility.\n",
"    \"\"\"\n",
"    langchain_tracer.example_id = example[\"id\"]\n",
"    inputs = example[\"inputs\"]\n",
"    try:\n",
"        return await chain.arun(inputs, callbacks=[langchain_tracer])\n",
"    except Exception as e:\n",
"        logger.error(e)\n",
"        return {\"Error\": str(e)}\n",
"    finally:\n",
"        # Unlink the tracer from this example on every exit path (the original\n",
"        # also reset it inside ``try``, redundantly with this ``finally``).\n",
"        langchain_tracer.example_id = None\n",
"\n",
"\n",
"async def run_dataset(dataset: dict, chain: Chain, batch_size: int = 5):\n",
"    \"\"\"Run ``chain`` over every example in ``dataset``, ``batch_size`` at a time.\n",
"\n",
"    One tracer is created per concurrency slot; within a batch each coroutine\n",
"    gets its own tracer, so concurrent runs never share an ``example_id``.\n",
"    Returns the list of per-example outputs (or ``{\"Error\": ...}`` dicts).\n",
"    \"\"\"\n",
"    logger.info(\"Running chain on dataset ...\")\n",
"    tracers = [LangChainTracer() for _ in range(batch_size)]\n",
"    for tracer in tracers:\n",
"        tracer.load_session(\"default\")\n",
"    graded_outputs = []\n",
"    total_examples = len(dataset[\"examples\"])\n",
"    for i in range(0, total_examples, batch_size):\n",
"        # Consecutive j values within a batch map to distinct tracers.\n",
"        batch_results = [\n",
"            arun_chain(dataset[\"examples\"][j], tracers[j % batch_size], chain)\n",
"            for j in range(i, min(total_examples, i + batch_size))\n",
"        ]\n",
"        graded_outputs.extend(await asyncio.gather(*batch_results))\n",
"    return graded_outputs\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "6c588a1f-f869-4d63-a088-e01d0707116c",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.agents import AgentType, initialize_agent, load_tools\n",
"\n",
"# temperature=0 keeps generations as deterministic as the API allows, which\n",
"# makes repeated evaluation runs comparable. (The unused `from langchain\n",
"# import OpenAI` was removed — only ChatOpenAI is used in this notebook.)\n",
"llm = ChatOpenAI(temperature=0)\n",
"# 'serpapi' presumably requires SERPAPI_API_KEY in the environment — confirm;\n",
"# 'llm-math' reuses the same LLM for calculator-style reasoning.\n",
"tools = load_tools(['serpapi', 'llm-math'], llm=llm)\n",
"agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "d8483f0e-e2ed-4835-96a9-30c2a22aa471",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"'age'. Please try again with a valid numerical expression\n",
|
|
"unknown format from LLM: Assuming we don't have any information about the actual number of points scored in the 2023 super bowl, we cannot provide a mathematical expression to solve this problem.\n",
|
|
"invalid syntax. Perhaps you forgot a comma? (<expr>, line 1). Please try again with a valid numerical expression\n",
|
|
"Could not parse LLM output: `The final answer is that there were no more points scored in the 2023 Super Bowl than in the 2022 Super Bowl.`\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['The current population of Canada as of May 3, 2023 is 38,677,281.',\n",
|
|
" \"Anwar Hadid's age raised to the .43 power is approximately 2.68.\",\n",
|
|
" {'Error': \"'age'. Please try again with a valid numerical expression\"},\n",
|
|
" 'The distance between Paris and Boston is 3448 miles.',\n",
|
|
" {'Error': \"unknown format from LLM: Assuming we don't have any information about the actual number of points scored in the 2023 super bowl, we cannot provide a mathematical expression to solve this problem.\"},\n",
|
|
" {'Error': 'invalid syntax. Perhaps you forgot a comma? (<expr>, line 1). Please try again with a valid numerical expression'},\n",
|
|
" {'Error': 'Could not parse LLM output: `The final answer is that there were no more points scored in the 2023 Super Bowl than in the 2022 Super Bowl.`'},\n",
|
|
" '1.9347796717823205',\n",
|
|
" \"Bad Bunny's height (in inches) raised to the .13 power is approximately 1.740 inches.\",\n",
|
|
" '0.2791714614499425']"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Per-example failures surface as {'Error': ...} entries rather than aborting the run.\n",
"results = await run_dataset(dataset, agent)\n",
"results"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b71e72db-fd81-498f-afc6-4596662aacc4",
|
|
"metadata": {},
|
|
"source": [
|
|
"scratch\n",
|
|
"----------\n",
|
|
"\n",
|
|
"ignore below"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "63528052-06e9-4488-ab7b-6489b32531bd",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# from langchain import PromptTemplate\n",
|
|
"# template = \"\"\"You are assessing a submitted student answer to a question relative to the true answer based on the provided criteria: \n",
|
|
" \n",
|
|
"# ***\n",
|
|
"# QUESTION: {query}\n",
|
|
"# ***\n",
|
|
"# STUDENT ANSWER: {result}\n",
|
|
"# ***\n",
|
|
"# TRUE ANSWER: {answer}\n",
|
|
"# ***\n",
|
|
"# Criteria: \n",
|
|
"# relevance: Is the submission referring to a real quote from the text?\"\n",
|
|
"# conciseness: Is the answer concise and to the point?\"\n",
|
|
"# correct: Is the answer correct?\"\n",
|
|
"# ***\n",
|
|
"# Does the submission meet the criterion? First, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print \"Correct\" or \"Incorrect\" (without quotes or punctuation) on its own line corresponding to the correct answer.\n",
|
|
"# Reasoning:\n",
|
|
"# \"\"\"\n",
|
|
"\n",
|
|
"# GRADE_ANSWER_PROMPT_OPENAI = PromptTemplate(\n",
|
|
"# input_variables=[\"query\", \"result\", \"answer\"], template=template\n",
|
|
"# )\n",
|
|
"\n",
|
|
"# eval_chain = QAEvalChain.from_llm(\n",
|
|
"# llm=ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0),\n",
|
|
"# prompt=GRADE_ANSWER_PROMPT_OPENAI,\n",
|
|
"# )\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6145782c-5a85-4a7e-b62e-eeeb467c6924",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# # TODO: Update the runs syntax\n",
|
|
"# chain_runs = requests.get(endpoint + \"/chain-runs\").json()\n",
|
|
"# chain_run_ids = [chain_run['id'] for chain_run in chain_runs]\n",
|
|
"# from typing import List, Optional\n",
|
|
"# import pandas as pd\n",
|
|
"# from io import BytesIO\n",
|
|
"# import json\n",
|
|
"\n",
|
|
"# # Could do runs or whatever\n",
|
|
"# def create_dataset(chain_run_ids: List[str], name: str, description: str, auth: Optional[dict] = None) -> str:\n",
|
|
"# examples = []\n",
|
|
"# for run_id in chain_run_ids:\n",
|
|
"# run_url = endpoint + f\"/chain-runs/{run_id}\"\n",
|
|
"# run_response = requests.get(run_url).json()\n",
|
|
"# examples.append({\"chain_runs\": run_response})\n",
|
|
"# values[\"inputs\"].append(run_response[\"inputs\"])\n",
|
|
"# values[\"outputs\"].append((run_response[\"outputs\"])\n",
|
|
"# dataset_request_body = {\"name\": \"foo\", \n",
|
|
"# \"description\": \"bar\", \n",
|
|
"# \"examples\": [],\n",
|
|
"# }\n",
|
|
"# response = requests.post(endpoint + \"/datasets\", json=dataset_request_body)\n",
|
|
"# return response.json()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "62b5ae32-2112-441e-acc0-c21428ba8bdc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|