## AutoGPT example finding Winning Marathon Times

* Implementation of https://github.com/Significant-Gravitas/Auto-GPT 
* With LangChain primitives (LLMs, PromptTemplates, VectorStores, Embeddings, Tools)

In [None]:
# !pip install bs4
# !pip install nest_asyncio

In [1]:
# General 
import pandas as pd
from langchain.experimental.autonomous_agents.autogpt.agent import AutoGPT
from langchain.chat_models import ChatOpenAI

from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
import asyncio
import nest_asyncio


# Needed since Jupyter already runs an async event loop
nest_asyncio.apply()

In [2]:
# Chat model used throughout this notebook, by the tools and by the AutoGPT agent.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1.0)

### Set up tools

* We'll set up an AutoGPT with a `search` tool, a `write-file` tool, a `read-file` tool, and a web browsing tool

Define any other `tools` you want to use here

In [3]:
# Tools
from typing import Optional
from langchain.agents import tool
from langchain.tools.file_management.read import ReadFileTool
from langchain.tools.file_management.write import WriteFileTool

@tool
def process_csv(csv_file_path: str, instructions: str, output_path: Optional[str] = None) -> str:
    """Process a CSV by with pandas in a limited REPL. Only use this after writing data to disk as a csv file. Any figures must be saved to disk to be viewed by the human. Instructions should be written in natural language, not code. Assume the dataframe is already loaded."""
    # NOTE(review): the @tool decorator presumably exposes the docstring above
    # to the agent as the tool description, so it is kept verbatim.

    # Step 1: load the CSV; a load failure is reported as a string, not raised.
    try:
        frame = pd.read_csv(csv_file_path)
    except Exception as load_error:
        return f"Error: {load_error}"

    # Step 2: build a pandas agent over the loaded frame using the shared llm.
    pandas_agent = create_pandas_dataframe_agent(llm, frame, max_iterations=30, verbose=True)

    # Step 3: optionally ask the agent to persist its output.
    prompt = instructions
    if output_path is not None:
        prompt = prompt + f" Save output to disk at {output_path}"

    # Step 4: run the agent; failures are again returned as an error string.
    try:
        answer = pandas_agent.run(prompt)
    except Exception as run_error:
        return f"Error: {run_error}"
    return answer


**Web Search Tool**

No need for API Tokens to use this tool, but it will require an optional dependency

In [4]:
# !pip install duckduckgo_search

In [5]:
import json
from duckduckgo_search import ddg

In [6]:
@tool
def web_search(query: str, num_results: int = 8) -> str:
    """Useful for general internet search queries."""
    # Returns a JSON-encoded list of DuckDuckGo results. An empty query or an
    # empty result set both produce the JSON encoding of an empty list.
    hits = ddg(query, max_results=num_results) if query else None
    if not hits:
        return json.dumps([])

    # Pretty-print and keep non-ASCII characters readable.
    return json.dumps(list(hits), ensure_ascii=False, indent=4)

**Browse a web page with Playwright**

In [7]:
# !pip install playwright
# !playwright install

In [8]:
async def async_load_playwright(url: str) -> str:
    """Load a single URL with Playwright and return its text content.

    The page is opened in headless Chromium, its HTML is parsed with
    BeautifulSoup, ``<script>`` and ``<style>`` elements are removed, and the
    remaining text is split into stripped, non-empty chunks joined by newlines.

    Args:
        url: Address of the page to load.

    Returns:
        The cleaned page text, or ``"Error: <message>"`` if navigating to or
        parsing the page raised an ``Exception``.
    """
    # Imported here so these optional dependencies are only required when the
    # function is actually called.
    from bs4 import BeautifulSoup
    from playwright.async_api import async_playwright

    results = ""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            page_source = await page.content()
            soup = BeautifulSoup(page_source, "html.parser")

            # Drop elements whose contents are code/styling rather than text.
            for script in soup(["script", "style"]):
                script.extract()

            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            # Double spaces are treated as separators between phrases.
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            results = "\n".join(chunk for chunk in chunks if chunk)
        except Exception as e:
            results = f"Error: {e}"
        finally:
            # Close the browser on every exit path, including errors that are
            # not Exception subclasses (e.g. task cancellation).
            await browser.close()
    return results

def run_async(coro):
    """Run a coroutine to completion from synchronous code and return its result.

    Args:
        coro: The coroutine (or other awaitable accepted by
            ``loop.run_until_complete``) to execute.

    Returns:
        Whatever the coroutine returns.

    The current thread's event loop is reused when one is available. If the
    thread has no event loop (``asyncio.get_event_loop`` raises
    ``RuntimeError``, as it does in worker threads) or the loop has been
    closed, a new loop is created and registered for the thread.
    """
    try:
        event_loop = asyncio.get_event_loop()
    except RuntimeError:
        # No event loop is set for this thread.
        event_loop = None

    if event_loop is None or event_loop.is_closed():
        event_loop = asyncio.new_event_loop()
        # Register it so later calls from the same thread reuse this loop.
        asyncio.set_event_loop(event_loop)

    return event_loop.run_until_complete(coro)

@tool
def browse_web_page(url: str) -> str:
    """Verbose way to scrape a whole webpage. Likely to cause issues parsing."""
    # Bridge from the synchronous tool interface to the async Playwright loader.
    page_loader = async_load_playwright(url)
    return run_async(page_loader)

**Q&A Over a webpage**

Help the model ask more directed questions of web pages to avoid cluttering its memory

In [9]:
from langchain.tools.base import BaseTool
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import WebBaseLoader
from pydantic import Field
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain, BaseCombineDocumentsChain

def _get_text_splitter():
    """Build the default text splitter used to chunk scraped page text."""
    # A really small chunk size, just to show; lengths are measured with len().
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 20,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)


class WebpageQATool(BaseTool):
    # Tool that scrapes a page, splits it into chunks, and answers a question
    # about it by running a QA chain over windows of chunks.
    name = "query_webpage"
    description = "Browse a webpage and retrieve the information relevant to the question."
    # Splitter applied to the scraped page text (small chunks by default).
    text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=_get_text_splitter)
    # Chain invoked on each window of chunks and again on the combined answers.
    qa_chain: BaseCombineDocumentsChain

    def _run(self, url: str, question: str) -> str:
        """Scrape `url` and answer `question` from its text.

        The page text is split into chunks, the QA chain is run over
        consecutive windows of four chunks, and the per-window answers are
        then fed through the QA chain once more to produce the final result.
        """
        page_text = browse_web_page.run(url)
        source_doc = Document(page_content=page_text, metadata={"source": url})
        web_docs = self.text_splitter.split_documents([source_doc])

        window_size = 4
        results = []
        # TODO: Handle this with a MapReduceChain
        for i in range(0, len(web_docs), window_size):
            window_result = self.qa_chain(
                {"input_documents": web_docs[i:i + window_size], "question": question},
                return_only_outputs=True,
            )
            # `i` is the index of the window's first chunk, not a window count.
            results.append(f"Response from window {i} - {window_result}")

        combined = Document(page_content="\n".join(results), metadata={"source": url})
        # NOTE(review): the chain call's result is returned as-is; it may not
        # be a plain str despite the annotation — confirm against the chain API.
        return self.qa_chain(
            {"input_documents": [combined], "question": question},
            return_only_outputs=True,
        )

    async def _arun(self, url: str, question: str) -> str:
        """Async execution is not supported by this tool."""
        raise NotImplementedError
      

In [10]:
# Instantiate the webpage Q&A tool with a QA-with-sources chain built on `llm`.
query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))

### Set up memory

* The memory here is used for the agent's intermediate steps

In [11]:
# Memory
import faiss
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings
from langchain.tools.human.tool import HumanInputRun

embeddings_model = OpenAIEmbeddings()
# NOTE(review): 1536 presumably matches the size of the vectors produced by
# OpenAIEmbeddings — confirm if the embedding model is changed.
embedding_size = 1536
# FAISS index created with `embedding_size` dimensions.
index = faiss.IndexFlatL2(embedding_size)
# Vector store that embeds queries with `embeddings_model.embed_query`; it is
# given an empty in-memory docstore and an empty dict as its last argument.
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})

### Setup model and AutoGPT

`Model set-up`

In [12]:
# Tools made available to the AutoGPT agent.
tools = [
    web_search,
    WriteFileTool(),
    ReadFileTool(),
    process_csv,
    query_website_tool,
    # HumanInputRun(), # Uncomment to permit the agent to ask the human for help
]

In [13]:
# Assemble the AutoGPT agent from the chat model, the tool list, and a
# retriever over the vector store (created with search_kwargs {"k": 8}).
agent = AutoGPT.from_llm_and_tools(
    ai_name="Tom",
    ai_role="Assistant",
    tools=tools,
    llm=llm,
    memory=vectorstore.as_retriever(search_kwargs={"k": 8}),
    # human_in_the_loop=True, # Set to True if you want to add feedback at each step.
)
# agent.chain.verbose = True

### AutoGPT as a researcher / data munger 

#### Winning Boston Marathon times
 
Let's use AutoGPT as researcher and data munger / cleaner.
  
I spent a lot of time over the years crawling data sources and cleaning data. 

Let's see if AutoGPT can do all of this for us!

Here is the prompt asking for the winning Boston Marathon times from the past 5 years.

In [14]:
# Run the agent with a single goal, passed as a one-element list.
agent.run(["What were the winning boston marathon times for the past 5 years? Generate a table of the names, countries of origin, and times."])



{
    "thoughts": {
        "text": "I need to find the winning Boston Marathon times for the past 5 years.",
        "reasoning": "I'll start by conducting a web search for the requested information.",
        "plan": "- Conduct a web search\n- Query relevant webpage\n- Generate table\n- Save data to file",
        "criticism": "None",
        "speak": "I will begin by conducting a web search to find the past 5 years' Boston Marathon winning times."
    },
    "command": {
        "name": "web_search",
        "args": {
            "query": "winning Boston Marathon times for the past 5 years"
        }
    }
}




{
    "thoughts": {
        "text": "I found several relevant search results, and I will use the query_webpage command on a specific URL to gather the information.",
        "reasoning": "The Boston Athletic Association's official website (www.baa.org) is likely the most accurate source.",
        "plan": "- Query the Boston Athletic Association webpage\n- Filter and parse the data\n- Generate table and save to file",
        "criticism": "None",
        "speak": "I will now query the Boston Athletic Association webpage to retrieve the information on the past 5 years' winning times."
    },
    "command": {
        "name": "query_webpage",
        "args": {
            "url": "https://www.baa.org/races/boston-marathon/results/champions",
            "question": "winning times of the Boston Marathon for the past 5 years"
        }
    }
}




{
    "thoughts": {
        "text": "I have the winning times of the Boston Marathon for the past 5 years. I need to create a table with the names, countries of origin, and times.",
        "reasoning": "I can use the information I've retrieved to generate a CSV file, then process the CSV file to create the table.",
        "plan": "- Save data to a CSV file\n- Process CSV file to generate table",
        "criticism": "None",
        "speak": "I will save the Boston Marathon data to a CSV file and then process it to generate a table."
    },
    "command": {
        "name": "write_file",
        "args": {
            "file_path": "boston_marathon_data.csv",
            "text": "Year,Name,Country,Time\n2022,Evans Chebet,Kenya,2:06:51\n2021,Benson Kipruto,Kenya,2:09:51\n2019,Lawrence Cherono,Kenya,2:07:57\n2018,Yuki Kawauchi,Japan,2:15:58\n2017,Geoffrey Kirui,Kenya,2:09:37"
        }
    }
}




{
    "thoughts": {
        "text": "I have the winning times for the Boston Marathon for the past 5 years. Now, I need to turn it into a table, save it as a CSV, and process it.",
        "reasoning": "Saving data as a CSV will allow for a clean table format and output.",
        "plan": "- Format the data as a CSV\n- Write the CSV to disk\n- Process the CSV and generate a table",
        "criticism": "None",
        "speak": "I will now format the winning times data as a CSV, save it, and process it to generate a table."
    },
    "command": {
        "name": "write_file",
        "args": {
            "file_path": "boston_marathon_winners.csv",
            "text": "Year,Name,Country,Time\n2022,Evans Chebet,Kenya,2:06:51\n2021,Benson Kipruto,Kenya,2:09:51\n2019,Lawrence Cherono,Kenya,2:07:57\n2018,Yuki Kawauchi,Japan,2:15:58\n2017,Geoffrey Kirui,Kenya,2:09:37"
        }
    }
}




{
    "thoughts": {
        "text": "I have the winning times for the past 5 years, now I need to organize the information into a table and save it as a file.",
        "reasoning": "Storing the information in a table file format will make it easier to access and present the data.",
        "plan": "- Save data in a CSV file\n- Generate a table",
        "criticism": "None",
        "speak": "I will save the winning marathon times in a table format as a CSV file and generate a table."
    },
    "command": {
        "name": "write_file",
        "args": {
            "file_path": "boston_marathon_winners.csv",
            "text": "Year,Name,Country,Time\n2022,Evans Chebet,Kenya,2:06:51\n2021,Benson Kipruto,Kenya,2:09:51\n2019,Lawrence Cherono,Kenya,2:07:57\n2018,Yuki Kawauchi,Japan,2:15:58\n2017,Geoffrey Kirui,Kenya,2:09:37"
        }
    }
}




{
    "thoughts": {
        "text": "I previously found the winning times of the Boston Marathon for the past 5 years and saved it as a CSV file. Now, I need to process the CSV to create a table with the names, countries of origin, and times.",
        "reasoning": "Processing the CSV will help generate a clean table to provide the answer.",
        "plan": "- Process the CSV file\n- Generate a table with the required columns",
        "criticism": "None",
        "speak": "I will process the Boston Marathon winners' CSV file to create a table with the names, countries of origin, and times."
    },
    "command": {
        "name": "process_csv",
        "args": {
            "csv_file_path": "boston_marathon_winners.csv",
            "instructions": "Generate a table with columns Year, Name, Country, and Time."
        }
    }
}


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to convert the provided dataframe to a string in Markdown table format.
Action: py



{
    "thoughts": {
        "text": "I have already retrieved the winning times for the past 5 years of the Boston Marathon and created a table. My objective is complete.",
        "reasoning": "I have completed my task on this topic, so I don't need to use any other commands.",
        "plan": "- Inform the user that the task is complete",
        "criticism": "None",
        "speak": "I have found the winning times for the past 5 years of the Boston Marathon and created a table. My task is complete."
    },
    "command": {
        "name": "finish",
        "args": {
            "response": "I have found the winning times for the past 5 years of the Boston Marathon and created a table. My task is complete."
        }
    }
}


'I have found the winning times for the past 5 years of the Boston Marathon and created a table. My task is complete.'