## AutoGPT example finding Winning Marathon Times

* Implementation of https://github.com/Significant-Gravitas/Auto-GPT 
* With LangChain primitives (LLMs, PromptTemplates, VectorStores, Embeddings, Tools)

In [None]:
# !pip install bs4
# !pip install nest_asyncio

In [1]:
# General 
import pandas as pd
from langchain.experimental.autonomous_agents.autogpt.agent import AutoGPT
from langchain.chat_models import ChatOpenAI

from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
import asyncio
import nest_asyncio


# Needed since Jupyter runs an async event loop
nest_asyncio.apply()

In [2]:
# Chat model shared by the pandas agent, the webpage Q&A chain and the AutoGPT agent.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1.0)

### Set up tools

* We'll set up an AutoGPT with a `search` tool, a `write-file` tool, a `read-file` tool, and a web browsing tool

Define any other `tools` you want to use here

In [3]:
# Tools
from typing import Optional
from langchain.agents import tool
from langchain.tools.file_management.read import ReadFileTool
from langchain.tools.file_management.write import WriteFileTool

@tool
def process_csv(csv_file_path: str, instructions: str, output_path: Optional[str] = None) -> str:
    """Process a CSV by with pandas in a limited REPL. Only use this after writing data to disk as a csv file. Any figures must be saved to disk to be viewed by the human. Instructions should be written in natural language, not code. Assume the dataframe is already loaded."""
    # NOTE(review): the docstring above is presumably what `@tool` exposes to
    # the agent as the tool description, so its wording is left untouched.

    # Load the CSV first; any failure is reported back as text, not raised.
    try:
        frame = pd.read_csv(csv_file_path)
    except Exception as load_error:
        return f"Error: {load_error}"

    pandas_agent = create_pandas_dataframe_agent(llm, frame, max_iterations=30, verbose=True)

    # When an output path is given, ask the pandas agent to persist its result there.
    if output_path is None:
        prompt = instructions
    else:
        prompt = f"{instructions} Save output to disk at {output_path}"

    # Failures while running the instructions are likewise returned as text.
    try:
        return pandas_agent.run(prompt)
    except Exception as run_error:
        return f"Error: {run_error}"


**Web Search Tool**

No need for API Tokens to use this tool, but it will require an optional dependency

In [4]:
# !pip install duckduckgo_search

In [5]:
import json
from duckduckgo_search import ddg

In [6]:
@tool
def web_search(query: str, num_results: int = 8) -> str:
    """Useful for general internet search queries."""
    # Result is always a JSON array string: "[]" for an empty query or when
    # the search returns nothing, otherwise the pretty-printed hits.
    hits = ddg(query, max_results=num_results) if query else None
    if not hits:
        return json.dumps([])

    return json.dumps(list(hits), ensure_ascii=False, indent=4)

**Browse a web page with Playwright**

In [15]:
# !pip install playwright
# !playwright install

In [7]:
async def async_load_playwright(url: str) -> str:
    """Load the specified URL using Playwright and parse it using BeautifulSoup.

    Args:
        url: Address of the page to open in a headless Chromium browser.

    Returns:
        The page text with ``<script>``/``<style>`` content removed and one
        non-empty chunk per line, or ``"Error: <message>"`` if loading or
        parsing raised an exception.
    """
    from bs4 import BeautifulSoup
    from playwright.async_api import async_playwright

    results = ""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            page_source = await page.content()
            soup = BeautifulSoup(page_source, "html.parser")

            # Remove script and style elements so their contents are not
            # mixed into the extracted text.
            for script in soup(["script", "style"]):
                script.extract()

            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            # Break each line on double spaces and keep only non-empty pieces.
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            results = "\n".join(chunk for chunk in chunks if chunk)
        except Exception as e:
            # Errors are reported as text so the calling tool can relay them.
            results = f"Error: {e}"
        finally:
            # Close the browser even when something other than Exception
            # (e.g. task cancellation) interrupts the block above.
            await browser.close()
    return results

def run_async(coro):
    """Run a coroutine to completion on this thread's event loop.

    Args:
        coro: The coroutine (or other awaitable accepted by
            ``run_until_complete``) to execute.

    Returns:
        Whatever ``coro`` returns.
    """
    try:
        event_loop = asyncio.get_event_loop()
    except RuntimeError:
        # No event loop is set for the current thread.
        event_loop = None

    # Fall back to a fresh loop when none exists or the current one is closed.
    if event_loop is None or event_loop.is_closed():
        event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(event_loop)

    return event_loop.run_until_complete(coro)

@tool
def browse_web_page(url: str) -> str:
    """Verbose way to scrape a whole webpage. Likely to cause issues parsing."""
    # Synchronous entry point: build the scraping coroutine, then block until
    # it has finished and hand back its text.
    scrape_job = async_load_playwright(url)
    page_text = run_async(scrape_job)
    return page_text

**Q&A Over a webpage**

Help the model ask more directed questions of web pages to avoid cluttering its memory

In [8]:
from langchain.tools.base import BaseTool
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import WebBaseLoader
from pydantic import Field
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain, BaseCombineDocumentsChain

def _get_text_splitter():
    """Build the default splitter used to cut scraped page text into chunks."""
    # A really small chunk size, just to show.
    splitter_options = {
        "chunk_size": 500,
        "chunk_overlap": 20,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)


class WebpageQATool(BaseTool):
    # Tool that scrapes a page, answers the question over windows of chunks,
    # then answers once more over the combined window responses.
    name = "query_webpage"
    description = "Browse a webpage and retrieve the information relevant to the question."
    # Splitter applied to the scraped page text before question answering.
    text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=_get_text_splitter)
    # Chain invoked with {"input_documents", "question"} for every window and the final pass.
    qa_chain: BaseCombineDocumentsChain

    def _run(self, url: str, question: str) -> str:
        """Useful for browsing websites and scraping the text information."""
        page_text = browse_web_page.run(url)
        page_doc = Document(page_content=page_text, metadata={"source": url})
        chunks = self.text_splitter.split_documents([page_doc])

        # Number of chunks handed to the QA chain per call.
        window_size = 4
        window_answers = []
        # TODO: Handle this with a MapReduceChain
        for start in range(0, len(chunks), window_size):
            answer = self.qa_chain(
                {"input_documents": chunks[start:start + window_size], "question": question},
                return_only_outputs=True,
            )
            window_answers.append(f"Response from window {start} - {answer}")

        # Final pass: ask the same question over the concatenated window responses.
        combined = Document(page_content="\n".join(window_answers), metadata={"source": url})
        return self.qa_chain(
            {"input_documents": [combined], "question": question},
            return_only_outputs=True,
        )

    async def _arun(self, url: str, question: str) -> str:
        # No asynchronous variant is provided.
        raise NotImplementedError
      

In [9]:
# Instantiate the webpage Q&A tool with a QA-with-sources chain built on the shared llm.
query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))

### Set up memory

* The memory here is used for the agent's intermediate steps

In [10]:
# Memory
import faiss
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings
from langchain.tools.human.tool import HumanInputRun

# Vector store used as the agent's memory: an initially empty FAISS L2 index
# with an in-memory docstore, queried through the OpenAI embedding function.
embeddings_model = OpenAIEmbeddings()
# NOTE(review): presumably the vector length produced by OpenAIEmbeddings — confirm if the embedding model changes.
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})

### Setup model and AutoGPT

`Model set-up`

In [14]:
# Tools made available to the AutoGPT agent.
tools = [
    web_search,
    WriteFileTool(),
    ReadFileTool(),
    process_csv,
    query_website_tool,
    # HumanInputRun(), # Uncomment to let the agent ask the human for help
]

In [12]:
# Build the AutoGPT agent from the shared chat model, the tool list and a
# retriever over the FAISS vector store.
agent = AutoGPT.from_llm_and_tools(
    ai_name="Tom",
    ai_role="Assistant",
    tools=tools,
    llm=llm,
    # NOTE(review): k=8 is presumably the number of stored entries fetched per memory lookup — confirm.
    memory=vectorstore.as_retriever(search_kwargs={"k": 8}),
    # human_in_the_loop=True, # Set to True if you want to add feedback at each step.
)
# agent.chain.verbose = True

### AutoGPT as a research / data munger 

#### Winning Boston Marathon times
 
Let's use AutoGPT as a researcher and data munger / cleaner.
  
I spent a lot of time over the years crawling data sources and cleaning data. 

Let's see if AutoGPT can do all of this for us!

Here is the prompt asking for the winning Boston Marathon times over the past 5 years.

In [13]:
# Start the agent; the goals are passed as a list (a single goal here).
agent.run(["What were the winning boston marathon times for the past 5 years"])



{
    "thoughts": {
        "text": "I will start by using the web_search command to find the Boston Marathon winning times for the past 5 years. Since it is a simple query, I expect to find relevant information in the first few search results.",
        "reasoning": "I have decided on this course of action to efficiently retrieve the information I need while keeping it simple to avoid legal complications.",
        "plan": "- Use web_search command to search for 'Boston Marathon winning times past 5 years'\n- Look through top results for the information requested\n- If necessary, repeat search with slightly different query to get better results",
        "criticism": "I should make sure to phrase my search query concisely to prevent too many irrelevant search results.",
        "speak": "I will use the web_search command to find the Boston Marathon winning times for the past 5 years."
    },
    "command": {
        "name": "web_search",
        "args": {
            "query": "Boston 



{
    "thoughts": {
        "text": "The search results indicate that the winning times for the past 5 years can be found on the Wikipedia page for the Boston Marathon winners. I will retrieve this information using the query_webpage command and the URL of the relevant Wikipedia page. ",
        "reasoning": "This method will allow me to avoid any legal complications while retrieving the exact information that I need.",
        "plan": "- Use query_webpage command to browse through the Wikipedia page for the Boston Marathon winners \\n- Locate the relevant information on the page \\n- Return the information to the user",
        "criticism": "I should be sure to locate and use only reliable sources to avoid providing the user with inaccurate information.",
        "speak": "I will use the query_webpage command to browse through the Wikipedia page for the Boston Marathon winners and retrieve the relevant information."
    },
    "command": {
        "name": "query_webpage",
        "arg



{
    "thoughts": {
        "text": "I could write this information to a file using the write_file command so that I can refer back to it later. But first, I should process the information to extract the relevant data and convert it into a useful format using the process_csv command.",
        "reasoning": "Storing the information in a CSV file will allow me to use pandas to manipulate the data in the future with ease, while converting the data into a useful format will make it easier for other people to use the data.",
        "plan": "- Use process_csv command to extract and format the relevant data from the Wikipedia page\\n- Write the data to a CSV file using the write_file command\\n- Inform the user that the data has been saved to file",
        "criticism": "I should thoroughly check the CSV file for accuracy before saving the data.",
        "speak": "I will use the process_csv command to extract and format the relevant data from the Wikipedia page, and save it to a CSV file us



{
    "thoughts": {
        "text": "I will now use the write_file command to save the winning times for the past 5 years as a text file for later reference.",
        "reasoning": "This will allow me to easily access the winning times in the future without needing to search for the information again.",
        "plan": "- Use the write_file command to save the winning times for the past 5 years to a text file.\\n- Name the file appropriately, so it is easy to identify in the future.",
        "criticism": "I should review the contents of the file after writing it to make sure that the information has been saved correctly and can be read later without errors.",
        "speak": "I will use the write_file command to save the winning times for the past 5 years as a text file for easy retrieval in the future."
    },
    "command": {
        "name": "write_file",
        "args": {
            "file_path": "BostonMarathonWinningTimes.txt",
            "text": "The winning times of the Bosto



I will use the command "query_webpage" to browse through the Wikipedia page for the Boston Marathon winners and retrieve the winning times for the past 5 years. This is the method that will allow me to retrieve the exact information I need while avoiding any legal issues. 

```
{
    "thoughts": {
        "text": "The search results indicate that the winning times for the past 5 years can be found on the Wikipedia page for the Boston Marathon winners. I will retrieve this information using the query_webpage command and the URL of the relevant Wikipedia page. ",
        "reasoning": "This method will allow me to avoid any legal complications while retrieving the exact information that I need.",
        "plan": "- Use query_webpage command to browse through the Wikipedia page for the Boston Marathon winners \\n- Locate the relevant information on the page \\n- Return the information to the user",
        "criticism": "I should be sure to locate and use only reliable sources to avoid prov



{
    "thoughts": {
        "text": "Now that I have the winning times for the past 5 years, I will save them to a file to ensure that I don't forget them. I will use the write_file command to save this data to disk.",
        "reasoning": "By saving the data to disk, I can easily retrieve it in the future and avoid having to perform the same search again.",
        "plan": "- Use the write_file command to save the data to a text file.",
        "criticism": "I should ensure that the file path and file name are clear and descriptive to make it easy to locate the relevant information in the future.",
        "speak": "I will save the winning times for the past 5 years to a text file using the write_file command."
    },
    "command": {
        "name": "write_file",
        "args": {
            "file_path": "winning_times.txt",
            "text": "The winning times of the Boston Marathon for the past 5 years are:\n- 2017: Men's - 2:09:37, Women's - 2:21:52\n- 2018: Men's - 2:15:58, Wo



{
    "thoughts": {
        "text": "I will now shut down my system, as I have completed all of my objectives.",
        "reasoning": "Since I have completed all of my objectives, there is no need to keep running my system.",
        "plan": "- Use the finish command to signal that I have completed all of my objectives.\\\\n- Shut down the system.",
        "criticism": "I should always double-check to ensure that all of my objectives have been completed before shutting down my system.",
        "speak": "I have completed all of my objectives, and I will now shut down my system."
    },
    "command": {
        "name": "finish",
        "args": {
            "response": "All objectives have been completed."
        }
    }
}


'All objectives have been completed.'