From f79582c5489930d359e256be9e5bdd68427e1cfb Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Fri, 2 Jun 2023 12:56:26 -0400 Subject: [PATCH] Add research example --- docs/modules/chains/examples/research.ipynb | 448 ++++++++++++++++++++ 1 file changed, 448 insertions(+) create mode 100644 docs/modules/chains/examples/research.ipynb diff --git a/docs/modules/chains/examples/research.ipynb b/docs/modules/chains/examples/research.ipynb new file mode 100644 index 00000000000..5499455f5d9 --- /dev/null +++ b/docs/modules/chains/examples/research.ipynb @@ -0,0 +1,448 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d0187afb-f460-431f-aee3-50d68bc33446", + "metadata": {}, + "source": [ + "# Research Chain\n", + "\n", + "This is an experimental research chain to attempt to answer a research question using information on the web.\n", + "\n", + "A research chain is composed of the following components:\n", + "\n", + "1. A searcher that searches for documents using a search engine.\n", + " - The searcher is responsible to return a list of URLs of documents that\n", + " may be relevant to read to be able to answer the question.\n", + "2. A downloader that downloads the documents.\n", + "3. An HTML to markdown parser (hard coded) that converts the HTML to markdown.\n", + " * Conversion to markdown is lossy\n", + " * However, it can significantly reduce the token count of the document\n", + " * Markdown helps to preserve some styling information\n", + " (e.g., bold, italics, links, headers) which is expected to help the reader\n", + " to answer certain kinds of questions correctly.\n", + "4. 
A reader that reads the documents and produces an answer.\n", + "\n", + "## Limitations\n", + "\n", + "* Chain can be potentially long running (Use initialization parameters to control how many options are explored).\n", + "* This research chain only implements a single hop at the moment; i.e.,\n", + " it goes from the questions to a list of URLs to documents to compiling answers.\n", + " Without continuing the crawl, web-sites that require pagination will not be explored fully.\n", + "* The reader chain must match the type of question. For example, the QA refine chain \n", + " isn't good at extracting a list of entries from a long document.\n", + "* Content downloader may get blocked (since it looks like a bot). \n", + " \n", + "## Extending\n", + "\n", + "* Continue crawling documents to discover more relevant pages that were not surfaced by the search engine.\n", + "* Adapt reading strategy based on nature of question.\n", + "* Analyze the query and determine whether the query is a multi-hop query and change search/crawling strategy based on that.\n", + "* Provide smaller pieces to an agent. 
:)\n", + "* Add cheaper strategies for selecting which links should be explored further (e.g., based on tf-idf similarity instead of gpt-4)\n", + "* Add a summarization chain on top of the individually collected answers" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7eb466b8-24fa-4acc-b0ce-06fcfa2fa9c4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chains.research.api import Research\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.llms import OpenAI\n", + "from langchain.chains.question_answering import load_qa_chain\n", + "from langchain.chains.research.fetch import PlaywrightDownloadHandler" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "70474885-0acd-41b2-8050-15dd54f44f1e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "question = \"\"\"\\\n", + "Compile information about Harrison Chase from langchain. \n", + "Ignore if it's a different Harrison Chase. \n", + "Only include information you're certain about.\n", + "\n", + "Include:\n", + "* education history\n", + "* verbal skills\n", + "* work history \n", + "* favorite pets\n", + "* phone number\n", + "* hobbies\n", + "* does he like sports\n", + "* is he a honest person?\n", + "* living location\n", + "* social profiles (e.g., github, twitter, etc.)\n", + "* smoking habits\n", + "* which languages can he code in?\n", + "* does he play sports?\n", + "\n", + "Format your answer in a bullet point format for each sub-question.\n", + "\n", + "If the context text claims that a security verification is required or a different web-browser is \n", + "required to download the page, then please output: \"Download May Have Failed.\"\n", + "\"\"\".strip()" + ] + }, + { + "cell_type": "markdown", + "id": "6613da1c-3349-45f4-9770-19986750d548", + "metadata": {}, + "source": [ + "Instantiate LLMs" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": 
"e74a44b4-2075-4cc6-933e-c769bf3f6002", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "llm = OpenAI(temperature=0, model='text-davinci-003') # Used for the readers and the query generator\n", + "selector_llm = ChatOpenAI(temperature=0, model='gpt-4') # Used for selecting which links to explore" + ] + }, + { + "cell_type": "markdown", + "id": "b243364a-e79b-432d-8035-3de8caf554a8", + "metadata": {}, + "source": [ + "Create a chain that can be used to extract the answer to the question above from a given document.\n", + "\n", + "This chain must be tailored to the task." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2f538062-14e3-49ab-9b25-bc470eb5869c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "qa_chain = load_qa_chain(llm, chain_type='refine')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a96f3ed3-10de-4a85-9e93-a8b78d8bfbb6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "research = Research.from_llms(\n", + " query_generation_llm=llm, \n", + " link_selection_llm=selector_llm, \n", + " underlying_reader_chain=qa_chain, \n", + " top_k_per_search=3, \n", + " max_num_pages_per_doc=30, \n", + " download_handler=PlaywrightDownloadHandler()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3207c696-a72c-4378-b427-7d285f5fdd1c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Retrying langchain.llms.openai.acompletion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.\n", + "Retrying langchain.llms.openai.acompletion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. 
Sorry about that!.\n", + "Retrying langchain.llms.openai.acompletion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.\n", + "Retrying langchain.llms.openai.acompletion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.\n", + "Retrying langchain.llms.openai.acompletion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.\n" + ] + } + ], + "source": [ + "results = await research.acall(inputs={'question': question})" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "843616b3-32d7-49c7-a42b-b0272d71f3ed", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "https://www.linkedin.com/in/harrison-chase-961287118\n", + "\n", + "Download May Have Failed.\n", + "----------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "https://github.com/hwchase17\n", + "\n", + "\n", + "* Education history: Unknown\n", + "* Verbal skills: Unknown\n", + "* Work history: GitHub user since 2015, 3,548 contributions in the last year\n", + "* Favorite pets: Unknown\n", + "* Phone number: Unknown\n", + "* Hobbies: Unknown\n", + "* Does he like sports: Unknown\n", + "* Is he a honest person?: Unknown\n", + "* Living location: Unknown\n", + "* Social profiles: GitHub (https://github.com/hwchase17)\n", + "* Smoking habits: Unknown\n", + "* Which languages can he code in?: Python, TypeScript\n", + "* Does he play sports?: Unknown\n", + 
"----------------------------------------------------------------------------------------------------------------------------------------------------------------\n", + "https://twitter.com/hwchase17?lang=en\n", + "\n", + "\n", + "* Education history: Unknown\n", + "* Verbal skills: Unknown\n", + "* Work history: CEO of LangChainAI, previously RobustHQ and Kensho\n", + "* Favorite pets: Unknown\n", + "* Phone number: Unknown\n", + "* Hobbies: Unknown\n", + "* Does he like sports: Unknown\n", + "* Is he a honest person: Unknown\n", + "* Living location: Unknown\n", + "* Social profiles: Twitter (@hwchase17)\n", + "* Smoking habits: Unknown\n", + "* Which languages can he code in: Unknown\n", + "* Does he play sports: Unknown\n" + ] + } + ], + "source": [ + "for doc in results['docs']:\n", + " print('--'*80)\n", + " print(doc.metadata['source'])\n", + " print(doc.page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "edcaf496-3679-4b58-9baa-6124e1cc3435", + "metadata": {}, + "source": [ + "If useful we can produce another summary!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "76136bff-b7df-4539-9bcb-760fc4449390", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "qa_chain = load_qa_chain(llm, chain_type='stuff')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "b352f3f3-6777-4795-acbb-ed26ecac137d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "summary = await qa_chain.acall(inputs={'input_documents': results['docs'], 'question': question})" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "1c90c305-a89d-42e2-b975-dda039e816b6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "* Education history: Unknown\n", + "* Verbal skills: Unknown\n", + "* Work history: CEO of LangChainAI, previously RobustHQ and Kensho\n", + "* Favorite pets: Unknown\n", + "* Phone number: Unknown\n", + "* Hobbies: Unknown\n", + "* Does he like sports: Unknown\n", + "* Is he a honest person: Unknown\n", + "* Living location: Unknown\n", + "* Social profiles: GitHub (https://github.com/hwchase17), Twitter (@hwchase17)\n", + "* Smoking habits: Unknown\n", + "* Which languages can he code in: Unknown\n", + "* Does he play sports: Unknown\n" + ] + } + ], + "source": [ + "print(summary['output_text'])" + ] + }, + { + "cell_type": "markdown", + "id": "af2adfee-85d3-41af-900a-c594dc01ce16", + "metadata": {}, + "source": [ + "## Under the hood" + ] + }, + { + "cell_type": "markdown", + "id": "c307aa60-7e75-48e8-ba72-b45507ed3fe0", + "metadata": {}, + "source": [ + "A searcher is invoked first to find URLs that are good to explore" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "4e2c369c-8763-458e-9ab8-684466395890", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': \"Compile information about Harrison Chase from langchain. \\nIgnore if it's a different harrison chase. 
\\nOnly include information you're certain about.\\n\\nInclude:\\n* education history\\n* verbal skills\\n* work history \\n* favorite pets\\n* phone number\\n* hobbies\\n* does he like sports\\n* is he a honest person?\\n* living location\\n* social profiles (e.g., github, twitter, etc.)\\n* smoking habits\\n* which languages can he code in?\\n* does he play sports?\",\n", + " 'urls': ['https://www.linkedin.com/in/harrison-chase-961287118',\n", + " 'https://www.youtube.com/watch?v=zaYTXQFR0_s',\n", + " 'https://twitter.com/hwchase17?lang=en']}" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results = await research.searcher.acall(inputs={'question': question})\n", + "results" + ] + }, + { + "cell_type": "markdown", + "id": "24099529-2d3d-4eca-8a1f-45a2539c8842", + "metadata": {}, + "source": [ + "The webpages are downloaded" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "c6f9d1b5-e513-4d8d-b325-32eacbee92b4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "blobs = await research.downloader.adownload(results['urls'])" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "964aae51-6a1f-4f63-a6fe-11c83557888a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'