mirror of
https://github.com/hwchase17/langchain.git
synced 2026-03-18 11:07:36 +00:00
494 lines
19 KiB
Plaintext
494 lines
19 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "6591df9f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<style>.container { width:100% !important; }</style>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from IPython.display import display, HTML\n",
|
|
"display(HTML(\"<style>.container { width:100% !important; }</style>\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "22522eb8",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%load_ext autoreload\n",
|
|
"%autoreload 2"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d2b11cce",
|
|
"metadata": {},
|
|
"source": [
|
|
"evaluating a (task-oriented) agent\n",
|
|
"\n",
|
|
"- for now, task is information retrieval/QA\n",
|
|
" - because these are the cases Harrison started with\n",
|
|
"- desiderata+challenges:\n",
|
|
" - evaluate goal state:\n",
|
|
" - evaluate intermediate states:\n",
|
|
" - challenges:\n",
|
|
" - non-deterministic/different trajectories may be acceptable\n",
|
|
" - coupled/cascading errors\n",
|
|
" - we can rely on another LM for evaluation, but this is error-prone too"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "62418f6d",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Evaluate each of these w/ DaVinci (quick few shot, chain-of-thought binary classifiers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "56b5f44b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from distutils.util import strtobool\n",
|
|
"from typing import List\n",
|
|
"\n",
|
|
"from langchain.llms import OpenAI\n",
|
|
"from langchain.schema import AgentAction\n",
|
|
"\n",
|
|
"\n",
|
|
"davinci_003 = OpenAI(model_name=\"text-davinci-003\", temperature=0.0)\n",
|
|
"\n",
|
|
"\n",
|
|
"def extract_binary_classification(completion: str) -> bool:\n",
|
|
" \"\"\"Extract (try to extract) binary classification from text completion.\n",
|
|
" \"\"\"\n",
|
|
" boolean_as_str = completion.strip().split('.')[0]\n",
|
|
" boolean = False\n",
|
|
" try:\n",
|
|
" boolean = bool(strtobool(boolean_as_str))\n",
|
|
" except ValueError as e:\n",
|
|
" print(e)\n",
|
|
" return boolean\n",
|
|
"\n",
|
|
"\n",
|
|
"def evaluate_candidate_action_plan(question: str, desired_action_plan: List[AgentAction], candidate_action_plan: List[AgentAction], model: 'LLM ABC', verbose: bool=False) -> bool:\n",
|
|
" \"\"\"Use a few-shot classifier to verify whether 2 action plans are \"roughly\" equivalent for a given question.\n",
|
|
"\n",
|
|
" This approach is itself highly error prone!\n",
|
|
" \"\"\"\n",
|
|
" \n",
|
|
" prompt_prefix = \"\"\"Decide whether the Candidate action plan would give the same outcome as the Desired action plan in answering a given Question. Actions correspond to calling on a tool like a search engine, data store, calculator, etc.\n",
|
|
"\n",
|
|
"Examples:\n",
|
|
"\n",
|
|
"Question: How far is the Earth from the Moon?\n",
|
|
"Desired action plan: Search(distance between Earth and Moon)\n",
|
|
"Candidate action plan: Calculator(distance from Earth to Moon)\n",
|
|
"Satisfactory? No.\n",
|
|
"Explanation: The Candidate plan uses a Calculator instead of a Search engine.\n",
|
|
"\n",
|
|
"Question: What is the number of kids our current president has to the power of two?\n",
|
|
"Desired action plan: Search(how many kids the president has?), Calculator(4^2)\n",
|
|
"Candidate action plan: Search(who is current president?), Search(how many kids Joe Biden has?), Calculator(4*4)\n",
|
|
"Satisfactory? Yes\n",
|
|
"Explanation: The Candidate plan reaches the same result as the Desired plan with one step broken down into two. \n",
|
|
"\n",
|
|
"Question: how long does it take to drive from Boston to New York?\n",
|
|
"Desired action plan: Search(distance from Boston to New York), Search(speed limit Boston to New York), Calculator(190.04/40)\n",
|
|
"Candidate action plan: Search(driving time from Boston to New York)\n",
|
|
"Satisfactory? Yes.\n",
|
|
"Explanation: The Candidate plan uses a tool to answer the question directly, rather than breaking it down like the Desired plan.\n",
|
|
" \"\"\"\n",
|
|
" \n",
|
|
" def serialize_action_plan(action_plan):\n",
|
|
" return ', '.join([\n",
|
|
" f\"{action.tool}({action.tool_input})\"\n",
|
|
" for action in action_plan\n",
|
|
" ])\n",
|
|
" desired_action_plan_str = serialize_action_plan(desired_action_plan)\n",
|
|
" candidate_action_plan_str = serialize_action_plan(candidate_action_plan)\n",
|
|
"\n",
|
|
" prompt = prompt_prefix + f\"\"\"\n",
|
|
"Question: {question}\n",
|
|
"Desired action plan: {desired_action_plan_str}\n",
|
|
"Candidate action plan: {candidate_action_plan_str}\n",
|
|
"Satisfactory?\"\"\"\n",
|
|
"\n",
|
|
" completion = model(prompt) \n",
|
|
" if verbose:\n",
|
|
" print(\"Prompt:\\n\", prompt)\n",
|
|
" print(\"Completion:\\n\", completion)\n",
|
|
" \n",
|
|
" return extract_binary_classification(completion)\n",
|
|
" \n",
|
|
"\n",
|
|
"def evaluate_candidate_answer(question: str, answer: str, candidate_answer: str, model: 'LLM ABC', verbose: bool=False) -> bool:\n",
|
|
" \"\"\"Use a few-shot classifier to verify whether 2 answers are \"roughly\" equivalent for a given question.\n",
|
|
"\n",
|
|
" This approach is itself highly error prone!\n",
|
|
" \"\"\"\n",
|
|
" \n",
|
|
" prompt_prefix = f\"\"\"Decide whether a Candidate answer gives the same information as a Desired answer for a given Question.\n",
|
|
"\n",
|
|
"Examples:\n",
|
|
"\n",
|
|
"Question: What is the distance from Earth to the Moon?\n",
|
|
"Desired answer: 238,900 mi\n",
|
|
"Candidate answer: The distance is about 250k miles\n",
|
|
"Satisfactory? Yes. \n",
|
|
"Explanation: The Candidate answer roughly gives the same information as the Desired answer.\n",
|
|
"\n",
|
|
"Question: How many kids does Joe Biden have?\n",
|
|
"Desired answer: 4\n",
|
|
"Candidate answer: 42\n",
|
|
"Satisfactory? No.\n",
|
|
"Explanation: The Candidate answer 42 is not the same as the Desired answer.\n",
|
|
"\"\"\"\n",
|
|
" \n",
|
|
" prompt = prompt_prefix + f\"\"\"\n",
|
|
"Question: {question}\n",
|
|
"Desired answer: {answer}\n",
|
|
"Candidate answer: {candidate_answer}\n",
|
|
"Satisfactory?\"\"\"\n",
|
|
" \n",
|
|
" completion = model(prompt)\n",
|
|
" if verbose:\n",
|
|
" print(\"Prompt:\\n\", prompt)\n",
|
|
" print(\"Completion:\\n\", completion)\n",
|
|
" \n",
|
|
" return extract_binary_classification(completion)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3f5bf13c",
|
|
"metadata": {},
|
|
"source": [
|
|
"# A couple test cases for ReAct-agent QA"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "71daffa5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain import LLMMathChain, OpenAI, SerpAPIWrapper, SQLDatabase, SQLDatabaseChain\n",
|
|
"from langchain.agents import initialize_agent, Tool, load_tools\n",
|
|
"\n",
|
|
"tools = load_tools(['serpapi', 'llm-math'], llm=OpenAI(temperature=0))\n",
|
|
"agent = initialize_agent(tools, OpenAI(temperature=0), agent=\"zero-shot-react-description\", verbose=True, return_intermediate_steps=True)\n",
|
|
"\n",
|
|
"test_cases = [\n",
|
|
" {\n",
|
|
" \"question\": \"How many people live in canada as of 2023?\",\n",
|
|
" \"answer\": \"approximately 38,625,801\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"Population of Canada 2023\"}\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n",
|
|
" \"answer\": \"her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 4.9373857399466665\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"Dua Lipa's boyfriend\"},\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"Romain Gravas age\"},\n",
|
|
" {\"tool\": \"Calculator\", \"tool_input\": \"41^.43\"}\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"what is dua lipa's boyfriend age raised to the .43 power?\",\n",
|
|
" \"answer\": \"her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 4.9373857399466665\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"Dua Lipa's boyfriend\"},\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"Romain Gravas age\"},\n",
|
|
" {\"tool\": \"Calculator\", \"tool_input\": \"41^.43\"}\n",
|
|
" ]\n",
|
|
" \n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"how far is it from paris to boston in miles\",\n",
|
|
" \"answer\": \"approximately 3,435 mi\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"paris to boston distance\"},\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"what was the total number of points scored in the 2023 super bowl? what is that number raised to the .23 power?\",\n",
|
|
" \"answer\": \"approximately 2.682651500990882\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"2023 super bowl score\"},\n",
|
|
" {\"tool\": \"Calculator\", \"tool_input\": \"73^.23\"},\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"what was the total number of points scored in the 2023 super bowl raised to the .23 power?\",\n",
|
|
" \"answer\": \"approximately 2.682651500990882\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"2023 super bowl score\"},\n",
|
|
" {\"tool\": \"Calculator\", \"tool_input\": \"73^.23\"},\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"how many more points were scored in the 2023 super bowl than in the 2022 super bowl?\",\n",
|
|
" \"answer\": \"30\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"2023 super bowl score\"},\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"2022 super bowl score\"},\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"what is 153 raised to .1312 power?\",\n",
|
|
" \"answer\": \"approximately 1.9347796717823205\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Calculator\", \"tool_input\": \"153**.1312\"},\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n",
|
|
" \"answer\": \"approximately 1.7589107138176394\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"kendall jenner boyfriend\"},\n",
|
|
" {\"tool\": \"Search\", \"tool_input\": \"devin booker height\"},\n",
|
|
" {\"tool\": \"Calculator\", \"tool_input\": \"77**.13\"},\n",
|
|
" ]\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"question\": \"what is 1213 divided by 4345?\",\n",
|
|
" \"answer\": \"approximately 0.2791714614499425\",\n",
|
|
" \"steps\": [\n",
|
|
" {\"tool\": \"Calculator\", \"tool_input\": \"1213/4345\"},\n",
|
|
" ]\n",
|
|
" },\n",
|
|
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "4fb64085",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"\n",
|
|
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
|
"\u001b[32;1m\u001b[1;3m I need to find out who Dua Lipa's boyfriend is and then calculate his age raised to the .43 power\n",
|
|
"Action: Search\n",
|
|
"Action Input: \"Dua Lipa's boyfriend\"\u001b[0m\n",
|
|
"Observation: \u001b[36;1m\u001b[1;3mDua and Isaac, a model and a chef, dated on and off from 2013 to 2019. The two first split in early 2017, which is when Dua went on to date LANY ...\u001b[0m\n",
|
|
"Thought:\u001b[32;1m\u001b[1;3m I need to find out Isaac's age\n",
|
|
"Action: Search\n",
|
|
"Action Input: \"Isaac Carew age\"\u001b[0m\n",
|
|
"Observation: \u001b[36;1m\u001b[1;3m36 years\u001b[0m\n",
|
|
"Thought:\u001b[32;1m\u001b[1;3m I need to calculate 36 raised to the .43 power\n",
|
|
"Action: Calculator\n",
|
|
"Action Input: 36^.43\u001b[0m\n",
|
|
"Observation: \u001b[33;1m\u001b[1;3mAnswer: 4.6688516567750975\n",
|
|
"\u001b[0m\n",
|
|
"Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n",
|
|
"Final Answer: Isaac Carew, Dua Lipa's boyfriend, is 36 years old and his age raised to the .43 power is 4.6688516567750975.\u001b[0m\n",
|
|
"\n",
|
|
"\u001b[1m> Finished chain.\u001b[0m\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"test_case = test_cases[1]\n",
|
|
"\n",
|
|
"question = test_case['question']\n",
|
|
"out = agent(question)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "fbe14d7b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"desired_action_plan = [\n",
|
|
" AgentAction(tool=step['tool'], tool_input=step['tool_input'], log=None)\n",
|
|
" for step in test_case['steps']\n",
|
|
"]\n",
|
|
"desired_answer = test_case['answer']\n",
|
|
"\n",
|
|
"candidate_action_plan = [\n",
|
|
" action for action, observation in out['intermediate_steps']\n",
|
|
"]\n",
|
|
"candidate_answer = out['output']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "8e4d5f86",
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Prompt:\n",
|
|
" Decide whether a Candidate answer gives the same information as a Desired answer for a given Question.\n",
|
|
"\n",
|
|
"Examples:\n",
|
|
"\n",
|
|
"Question: What is the distance from Earth to the Moon?\n",
|
|
"Desired answer: 238,900 mi\n",
|
|
"Candidate answer: The distance is about 250k miles\n",
|
|
"Satisfactory? Yes. \n",
|
|
"Explanation: The Candidate answer roughly gives the same information as the Desired answer.\n",
|
|
"\n",
|
|
"Question: How many kids does Joe Biden have?\n",
|
|
"Desired answer: 4\n",
|
|
"Candidate answer: 42\n",
|
|
"Satisfactory? No.\n",
|
|
"Explanation: The candidate answer 42 is not the same as the Desired answer.\n",
|
|
"\n",
|
|
"Question: who is dua lipa's boyfriend? what is his age raised to the .43 power?\n",
|
|
"Desired answer: her boyfriend is Romain Gravas. his age raised to the .43 power is approximately 4.9373857399466665\n",
|
|
"Candidate answer: Isaac Carew, Dua Lipa's boyfriend, is 36 years old and his age raised to the .43 power is 4.6688516567750975.\n",
|
|
"Satisfactory?\n",
|
|
"Completion:\n",
|
|
" Yes.\n",
|
|
"Explanation: The Candidate answer roughly gives the same information as the Desired answer.\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"evaluate_candidate_answer(question, desired_answer, candidate_answer, davinci_003, verbose=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8b4d6595",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Not quite! From CoT explanation, appears to attend to the 2nd question only.**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "51ff802c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Prompt:\n",
|
|
" Decide whether the Candidate action plan would give the same outcome as the Desired action plan in answering a given Question. Actions correspond to calling on a tool like a search engine, data store, calculator, etc.\n",
|
|
"\n",
|
|
"Examples:\n",
|
|
"\n",
|
|
"Question: How far is the Earth from the Moon?\n",
|
|
"Desired action plan: Search(distance between Earth and Moon)\n",
|
|
"Candidate action plan: Calculator(distance from Earth to Moon)\n",
|
|
"Satisfactory? No.\n",
|
|
"Explanation: The Candidate plan uses a Calculator instead of a Search engine.\n",
|
|
"\n",
|
|
"Question: What is the number of kids our current president has to the power of two?\n",
|
|
"Desired action plan: Search(how many kids the president has?), Calculator(4^2)\n",
|
|
"Candidate action plan: Search(who is current president?), Search(how many kids Joe Biden has?), Calculator(4*4)\n",
|
|
"Satisfactory? Yes\n",
|
|
"Explanation: The Candidate plan reaches the same result as the Desired plan with one step broken down into two. \n",
|
|
"\n",
|
|
"Question: how long does it take to drive from Boston to New York?\n",
|
|
"Desired action plan: Search(distance from Boston to New York), Search(speed limit Boston to NewYork), Calculator(190.04/40)\n",
|
|
"Candidate action plan: Search(driving time from Boston to New York)\n",
|
|
"Satisfactory? Yes.\n",
|
|
"Explanation: The Candidate plan uses a tool to answer the question directly, rather than breaking it down like the Desired plan.\n",
|
|
" \n",
|
|
"Question: who is dua lipa's boyfriend? what is his age raised to the .43 power?\n",
|
|
"Desired action plan: Search(Dua Lipa's boyfriend), Search(Romain Gravas age), Calculator(41^.43)\n",
|
|
"Candidate action plan: Search(Dua Lipa's boyfriend), Search(Isaac Carew age), Calculator(36^.43)\n",
|
|
"Satisfactory?\n",
|
|
"Completion:\n",
|
|
" No.\n",
|
|
"Explanation: The Candidate plan uses a different boyfriend than the Desired plan, so the result would be different.\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"False"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"evaluate_candidate_action_plan(question, desired_action_plan, candidate_action_plan, davinci_003, verbose=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2557d35b",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Evaluates as intended initially, though we'd likely want to break this out further (i.e. a trajectory should not really resolve to T/F).**"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.1"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|