Kay retriever (#10657)

- **Description**: Adding retrievers for [kay.ai](https://kay.ai) and
SEC filings powered by Kay and Cybersyn. Kay provides context as a
service: it's an API built for RAG.
- **Issue**: N/A
- **Dependencies**: Just added a dep to the
[kay](https://pypi.org/project/kay/) package
- **Tag maintainer**: @baskaryan @hwchase17 Discussed in slack
- **Twitter handle:** [@vishalrohra_](https://twitter.com/vishalrohra_)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Palau 2023-09-25 16:10:13 -04:00 committed by GitHub
parent 5f13668fa0
commit 89ef440c14
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 1729 additions and 1265 deletions

View File

@ -111,7 +111,8 @@
"This is useful for instance when AWS credentials can't be set as environment variables.\n",
"See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured."
],
"metadata": {}
"metadata": {},
"id": "91a7ac07"
},
{
"cell_type": "code",
@ -120,7 +121,8 @@
"source": [
"loader = S3DirectoryLoader(\"testing-hwc\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")"
],
"metadata": {}
"metadata": {},
"id": "f485ec8c"
},
{
"cell_type": "code",
@ -129,7 +131,8 @@
"source": [
"loader.load()"
],
"metadata": {}
"metadata": {},
"id": "c0fa76ae"
}
],
"metadata": {

View File

@ -84,7 +84,8 @@
"source": [
"loader = S3FileLoader(\"testing-hwc\", \"fake.docx\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")"
],
"metadata": {}
"metadata": {},
"id": "43106ee8"
},
{
"cell_type": "code",
@ -93,7 +94,8 @@
"source": [
"loader.load()"
],
"metadata": {}
"metadata": {},
"id": "1764a727"
}
],
"metadata": {
@ -118,4 +120,3 @@
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -147,7 +147,8 @@
],
"metadata": {
"collapsed": false
}
},
"id": "c9bc0693"
},
{
"cell_type": "code",
@ -203,7 +204,8 @@
],
"metadata": {
"collapsed": false
}
},
"id": "a7fa0331"
},
{
"attachments": {},
@ -265,12 +267,12 @@
"evalue": "1 validation error for ChatOpenAI\n__root__\n Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mValidationError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[17], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m llm \u001B[38;5;241m=\u001B[39m \u001B[43mChatOpenAI\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtemperature\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m0\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 2\u001B[0m agent_chain \u001B[38;5;241m=\u001B[39m initialize_agent(\n\u001B[1;32m 3\u001B[0m tools,\n\u001B[1;32m 4\u001B[0m llm,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 7\u001B[0m memory\u001B[38;5;241m=\u001B[39mmemory,\n\u001B[1;32m 8\u001B[0m )\n",
"File \u001B[0;32m~/Documents/projects/langchain/libs/langchain/langchain/load/serializable.py:74\u001B[0m, in \u001B[0;36mSerializable.__init__\u001B[0;34m(self, **kwargs)\u001B[0m\n\u001B[1;32m 73\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs: Any) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m---> 74\u001B[0m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__init__\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 75\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_lc_kwargs \u001B[38;5;241m=\u001B[39m kwargs\n",
"File \u001B[0;32m~/Documents/projects/langchain/.venv/lib/python3.9/site-packages/pydantic/main.py:341\u001B[0m, in \u001B[0;36mpydantic.main.BaseModel.__init__\u001B[0;34m()\u001B[0m\n",
"\u001B[0;31mValidationError\u001B[0m: 1 validation error for ChatOpenAI\n__root__\n Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)"
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[17], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mChatOpenAI\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtemperature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m agent_chain \u001b[38;5;241m=\u001b[39m initialize_agent(\n\u001b[1;32m 3\u001b[0m tools,\n\u001b[1;32m 4\u001b[0m llm,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 7\u001b[0m memory\u001b[38;5;241m=\u001b[39mmemory,\n\u001b[1;32m 8\u001b[0m )\n",
"File \u001b[0;32m~/Documents/projects/langchain/libs/langchain/langchain/load/serializable.py:74\u001b[0m, in \u001b[0;36mSerializable.__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 74\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lc_kwargs \u001b[38;5;241m=\u001b[39m kwargs\n",
"File \u001b[0;32m~/Documents/projects/langchain/.venv/lib/python3.9/site-packages/pydantic/main.py:341\u001b[0m, in \u001b[0;36mpydantic.main.BaseModel.__init__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mValidationError\u001b[0m: 1 validation error for ChatOpenAI\n__root__\n Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)"
]
}
],

View File

@ -0,0 +1,207 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "263f914c-9d67-4316-8b3d-03c3b99ba9d8",
"metadata": {},
"source": [
"Kay.ai\n",
"=\n",
"\n",
"> Data API built for RAG 🕵️ We are curating the world's largest datasets as high-quality embeddings so your AI agents can retrieve context on the fly. Latest models, fast retrieval, and zero infra.\n",
"\n",
"This notebook shows you how to retrieve datasets supported by [Kay](https://kay.ai/). You can currently search SEC Filings and Press Releases of US companies. Visit [kay.ai](https://kay.ai) for the latest data drops. For any questions, join our [discord](https://discord.gg/hAnE4e5T6M) or [tweet at us](https://twitter.com/vishalrohra_)"
]
},
{
"cell_type": "markdown",
"id": "fc507b8e-ea51-417c-93da-42bf998a1195",
"metadata": {},
"source": [
"Installation\n",
"=\n",
"\n",
"First you will need to install the [`kay` package](https://pypi.org/project/kay/). You will also need an API key: you can get one for free at [https://kay.ai](https://kay.ai/). Once you have an API key, you must set it as an environment variable `KAY_API_KEY`.\n",
"\n",
"`KayAiRetriever` has a static `.create()` factory method that takes the following arguments:\n",
"\n",
"* `dataset_id: string` required -- A Kay dataset id. This is a collection of data about a particular entity such as companies, people, or places. For example, try `\"company\"` \n",
"* `data_types: List[string]` optional -- This is a category within a dataset based on its origin or format, such as SEC Filings, Press Releases, or Reports within the “company” dataset. For example, try [\"10-K\", \"10-Q\", \"PressRelease\"] under the “company” dataset. If left empty, Kay will retrieve the most relevant context across all types.\n",
"* `num_contexts: int` optional, defaults to 6 -- The number of document chunks to retrieve on each call to `get_relevant_documents()`"
]
},
{
"cell_type": "markdown",
"id": "c923bea0-585a-4f62-8662-efc167e8d793",
"metadata": {},
"source": [
"Examples\n",
"=\n",
"\n",
"Basic Retriever Usage\n",
"-"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f7b8c99c-0341-4f3c-912f-a11e98f7de71",
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
" ········\n"
]
}
],
"source": [
"# Setup API key\n",
"from getpass import getpass\n",
"KAY_API_KEY = getpass()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b4d4d386-2a6b-4942-863e-9202f5a9f1d6",
"metadata": {},
"outputs": [],
"source": [
"from langchain.retrievers import KayAiRetriever\n",
"import os\n",
"from kay.rag.retrievers import KayRetriever\n",
"os.environ[\"KAY_API_KEY\"] = KAY_API_KEY\n",
"retriever = KayAiRetriever.create(dataset_id=\"company\", data_types=[\"10-K\", \"10-Q\", \"PressRelease\"], num_contexts=3)\n",
"docs = retriever.get_relevant_documents(\"What were the biggest strategy changes and partnerships made by Roku in 2023??\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "04ee2d6b-c2ab-4e15-8a8b-afaf6ef8c0f6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Company Name: ROKU INC\\nCompany Industry: CABLE & OTHER PAY TELEVISION SERVICES\\nArticle Title: Roku and FreeWheel Announce Strategic Partnership to Bring Rokus Leading Ad Tech to FreeWheel Customers\\nText: Additionally, eMarketer Link: https://cts.businesswire.com/ct/CT?id=smartlink&url=https%3A%2F%2Fwww.insiderintelligence.com%2Finsights%2Favod-more-than-50-percent-of-us-digital-video-viewers%2F&esheet=53451144&newsitemid=20230712907788&lan=en-US&anchor=eMarketer&index=4&md5=b64dea72bcf6b6379474462602781d83 projects 57% of U.S. digital video users will stream an advertising-based video on demand (AVOD) service this year.\\nHaving solutions aimed at driving greater interoperability and automation will help accelerate this growth.\\nKey highlights of this collaboration include:\\nStreamlined Integration: Roku has now integrated its demand application programming interface (dAPI) with FreeWheel s TV platform. Roku s demand API gives publishers direct, automatic and real-time access to more advertiser demand. This enhanced integration allows for streamlined ad operation workflows and better inventory quality control, both of which will improve publisher yield and revenue.\\nSeamless Data Targeting: Publishers can now use Roku platform signals to enable advertisers to target audiences and measure campaign performance without relying on cookies. 
Additionally, FreeWheel and Roku will rely on data clean room technology to enable the activation of additional data sets providing better measurement and monetization to publishers and agencies.', metadata={'_additional': {'id': '962b79e0-f9d1-43ae-9f7a-8a9b42bc7a9a'}, 'chunk_type': 'text', 'chunk_years_mentioned': [], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': 'PressRelease', 'data_source_link': 'https://www.nasdaq.com/press-release/roku-and-freewheel-announce-strategic-partnership-to-bring-rokus-leading-ad-tech-to', 'data_source_publish_date': '2023-07-12T00:00:00Z', 'data_source_uid': 'a46f309c-705d-3946-96db-87aa4e73261f', 'title': 'ROKU INC | Roku and FreeWheel Announce Strategic Partnership to Bring Rokus Leading Ad Tech to FreeWheel Customers'}),\n",
" Document(page_content='Company Name: ROKU INC \\n Company Industry: CABLE & OTHER PAY TELEVISION SERVICES \\n Form Title: 10-K 2022-FY \\n Form Section: Risk Factors \\n Text: nd the Note Regarding Forward Looking Statements.This section of this Annual Report generally discusses fiscal years 2022 and 2021 and year to year comparisons between those years.Discussions of fiscal year 2020 and year to year comparisons between fiscal years 2021 and 2020 that are not included in this Annual Report can be found in Management\\'s Discussion and Analysis of Financial Condition and Results of Operations in Part II, Item 7 of our Annual Report for the fiscal year ended December 31, 2021 filed with the SEC on February 18, 2022.Overview Effective as of the fourth quarter of fiscal 2022, we reorganized our reportable segments to better align with management\\'s reporting of information reviewed by the Chief Operating Decision Maker (\"CODM\") for each segment.We renamed our \"player\" segment to \"devices\" which now includes our licensing arrangements with service operators and licensed Roku TV partners in addition to sales of our streaming players, audio products, smart home products and Roku branded TVs that will be designed, made, and sold by us in 2023.Our historical segment information is recast to conform to our new presentation in our financial statements and accompanying notes included in Item 8 of this Annual Report.Our two reportable segments are the platform segment and the devices segment.', metadata={'_additional': {'id': 'a76c5fed-5d63-45a7-b63a-2c30e05140fc'}, 'chunk_type': 'text', 'chunk_years_mentioned': [2020, 2021, 2022, 2023], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': '10-K', 'data_source_link': 'https://www.sec.gov/Archives/edgar/data/1428439/000142843923000007', 'data_source_publish_date': '2022-01-01T00:00:00Z', 'data_source_uid': '0001428439-23-000007', 'title': 'ROKU INC | 10-K 
2022-FY '}),\n",
" Document(page_content='Company Name: ROKU INC \\n Company Industry: CABLE & OTHER PAY TELEVISION SERVICES \\n Form Title: 10-Q 2023-Q1 \\n Form Section: Risk Factors \\n Text: Our current and potential partners include TV brands, cable and satellite companies, and telecommunication providers.Under these license arrangements, we generally have limited or no control over the amount and timing of resources these entities dedicate to the relationship.In the past, our licensed Roku TV partners have failed to meet their forecasts and anticipated market launch dates for distributing Roku TV models, and they may fail to meet their forecasts or such launches in the future.If our licensed Roku TV partners or service operator partners fail to meet their forecasts or such launches for distributing licensed streaming devices or choose to deploy competing streaming solutions within their product lines, our business may be harmed.We depend on a small number of content publishers for a majority of our streaming hours, and if we fail to maintain these relationships, our business could be harmed.*Historically, a small number of content publishers have accounted for a significant portion of the hours streamed on our platform.In the three months ended March 31, 2023, the top three streaming services represented over 50% of all hours streamed in the period.If, for any reason, we cease distributing channels that have historically streamed a large percentage of the aggregate streaming hours on our platform, our streaming hours, our active accounts, or Roku streaming device sales may be adversely affected, and our business may be harmed.', metadata={'_additional': {'id': '2a92b2bb-02a0-4e15-8b64-d7e04078a205'}, 'chunk_type': 'text', 'chunk_years_mentioned': [2023], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': '10-Q', 'data_source_link': 'https://www.sec.gov/Archives/edgar/data/1428439/000142843923000017', 
'data_source_publish_date': '2023-01-01T00:00:00Z', 'data_source_uid': '0001428439-23-000017', 'title': 'ROKU INC | 10-Q 2023-Q1 '})]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "markdown",
"id": "21f6e9e5-478c-4b2c-9d61-f7a84f4d2f8f",
"metadata": {},
"source": [
"Usage in a chain\n",
"-"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d1cba716-ab8d-4518-9196-43f17eb189dc",
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
" ········\n"
]
}
],
"source": [
"OPENAI_API_KEY = getpass()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "79441f1f-fa06-452c-bcd6-160ad0debc6a",
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "0c504bcd-f6e0-4028-a797-b31fb4b6d027",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"\n",
"model = ChatOpenAI(model_name=\"gpt-3.5-turbo\")\n",
"qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "977f158b-38d3-4b5f-9379-7cdd09436327",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-> **Question**: What were the biggest strategy changes and partnerships made by Roku in 2023? \n",
"\n",
"**Answer**: In 2023, Roku made a strategic partnership with FreeWheel to bring Roku's leading ad tech to FreeWheel customers. This partnership aimed to drive greater interoperability and automation in the advertising-based video on demand (AVOD) space. Key highlights of this collaboration include streamlined integration of Roku's demand application programming interface (dAPI) with FreeWheel's TV platform, allowing for better inventory quality control and improved publisher yield and revenue. Additionally, publishers can now use Roku platform signals to enable advertisers to target audiences and measure campaign performance without relying on cookies. This partnership also involves the use of data clean room technology to enable the activation of additional data sets for better measurement and monetization for publishers and agencies. These partnerships and strategies aim to support Roku's growth in the AVOD market. \n",
"\n"
]
}
],
"source": [
"questions = [\n",
" \"What were the biggest strategy changes and partnerships made by Roku in 2023?\"\n",
" # \"Where is Wex making the most money in 2023?\",\n",
"]\n",
"chat_history = []\n",
"\n",
"for question in questions:\n",
" result = qa({\"question\": question, \"chat_history\": chat_history})\n",
" chat_history.append((question, result[\"answer\"]))\n",
" print(f\"-> **Question**: {question} \\n\")\n",
" print(f\"**Answer**: {result['answer']} \\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -81,7 +81,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.9.18"
}
},
"nbformat": 4,

View File

@ -0,0 +1,165 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "263f914c-9d67-4316-8b3d-03c3b99ba9d8",
"metadata": {},
"source": [
"SEC filings data\n",
"=\n",
"\n",
"SEC filings data powered by [Kay.ai](https://kay.ai) and [Cybersyn](https://www.cybersyn.com/).\n",
"\n",
">The SEC filing is a financial statement or other formal document submitted to the U.S. Securities and Exchange Commission (SEC). Public companies, certain insiders, and broker-dealers are required to make regular SEC filings. Investors and financial professionals rely on these filings for information about companies they are evaluating for investment purposes."
]
},
{
"cell_type": "markdown",
"id": "fc507b8e-ea51-417c-93da-42bf998a1195",
"metadata": {},
"source": [
"Setup\n",
"=\n",
"\n",
"First you will need to install the `kay` package. You will also need an API key: you can get one for free at [https://kay.ai](https://kay.ai/). Once you have an API key, you must set it as an environment variable `KAY_API_KEY`.\n",
"\n",
"In this example we're going to use the `KayAiRetriever`. Take a look at the [kay notebook](/docs/integrations/retrievers/kay) for more detailed information on the parameters that it accepts."
]
},
{
"cell_type": "markdown",
"id": "c923bea0-585a-4f62-8662-efc167e8d793",
"metadata": {},
"source": [
"Examples\n",
"=\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f7b8c99c-0341-4f3c-912f-a11e98f7de71",
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
" ········\n",
" ········\n"
]
}
],
"source": [
"# Setup API keys for Kay and OpenAI\n",
"from getpass import getpass\n",
"KAY_API_KEY = getpass()\n",
"OPENAI_API_KEY = getpass()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "04ee2d6b-c2ab-4e15-8a8b-afaf6ef8c0f6",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"KAY_API_KEY\"] = KAY_API_KEY\n",
"os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0c504bcd-f6e0-4028-a797-b31fb4b6d027",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chains import ConversationalRetrievalChain\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.retrievers import KayAiRetriever\n",
"\n",
"model = ChatOpenAI(model_name=\"gpt-3.5-turbo\")\n",
"retriever = KayAiRetriever.create(dataset_id=\"company\", data_types=[\"10-K\", \"10-Q\"], num_contexts=6)\n",
"qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "977f158b-38d3-4b5f-9379-7cdd09436327",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-> **Question**: What are patterns in Nvidia's spend over the past three quarters? \n",
"\n",
"**Answer**: Based on the provided information, here are the patterns in NVIDIA's spend over the past three quarters:\n",
"\n",
"1. Research and Development Expenses:\n",
" - Q3 2022: Increased by 34% compared to Q3 2021.\n",
" - Q1 2023: Increased by 40% compared to Q1 2022.\n",
" - Q2 2022: Increased by 25% compared to Q2 2021.\n",
" \n",
" Overall, research and development expenses have been consistently increasing over the past three quarters.\n",
"\n",
"2. Sales, General and Administrative Expenses:\n",
" - Q3 2022: Increased by 8% compared to Q3 2021.\n",
" - Q1 2023: Increased by 14% compared to Q1 2022.\n",
" - Q2 2022: Decreased by 16% compared to Q2 2021.\n",
" \n",
" The pattern for sales, general and administrative expenses is not as consistent, with some quarters showing an increase and others showing a decrease.\n",
"\n",
"3. Total Operating Expenses:\n",
" - Q3 2022: Increased by 25% compared to Q3 2021.\n",
" - Q1 2023: Increased by 113% compared to Q1 2022.\n",
" - Q2 2022: Increased by 9% compared to Q2 2021.\n",
" \n",
" Total operating expenses have generally been increasing over the past three quarters, with a significant increase in Q1 2023.\n",
"\n",
"Overall, the pattern indicates a consistent increase in research and development expenses and total operating expenses, while sales, general and administrative expenses show some fluctuations. \n",
"\n"
]
}
],
"source": [
"questions = [\n",
" \"What are patterns in Nvidia's spend over the past three quarters?\",\n",
" #\"What are some recent challenges faced by the renewable energy sector?\",\n",
"]\n",
"chat_history = []\n",
"\n",
"for question in questions:\n",
" result = qa({\"question\": question, \"chat_history\": chat_history})\n",
" chat_history.append((question, result[\"answer\"]))\n",
" print(f\"-> **Question**: {question} \\n\")\n",
" print(f\"**Answer**: {result['answer']} \\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -120,7 +120,7 @@
{
"data": {
"text/plain": [
"'English EditionEnglish中文 (Chinese)日本語 (Japanese) More Other Products from WSJBuy Side from WSJWSJ ShopWSJ Wine Other Products from WSJ Search Quotes and Companies Search Quotes and Companies 0.15% 0.03% 0.12% -0.42% 4.102% -0.69% -0.25% -0.15% -1.82% 0.24% 0.19% -1.10% About Evan His Family Reflects His Reporting How You Can Help Write a Message Life in Detention Latest News Get Email Updates Four Americans Released From Iranian Prison The Americans will remain under house arrest until they are '"
"'English EditionEnglish\u4e2d\u6587 (Chinese)\u65e5\u672c\u8a9e (Japanese) More Other Products from WSJBuy Side from WSJWSJ ShopWSJ Wine Other Products from WSJ Search Quotes and Companies Search Quotes and Companies 0.15% 0.03% 0.12% -0.42% 4.102% -0.69% -0.25% -0.15% -1.82% 0.24% 0.19% -1.10% About Evan His Family Reflects His Reporting How You Can Help Write a Message Life in Detention Latest News Get Email Updates Four Americans Released From Iranian Prison The Americans will remain under house arrest until they are '"
]
},
"execution_count": 4,
@ -218,7 +218,7 @@
{
"data": {
"text/plain": [
"\"Skip to main content Skip to navigation\\n\\n<\\n\\n>\\n\\nMenu\\n\\n## ESPN\\n\\n * Search\\n\\n * * scores\\n\\n * NFL\\n * MLB\\n * NBA\\n * NHL\\n * Soccer\\n * NCAAF\\n * \\n\\n * Women's World Cup\\n * LLWS\\n * NCAAM\\n * NCAAW\\n * Sports Betting\\n * Boxing\\n * CFL\\n * NCAA\\n * Cricket\\n * F1\\n * Golf\\n * Horse\\n * MMA\\n * NASCAR\\n * NBA G League\\n * Olympic Sports\\n * PLL\\n * Racing\\n * RN BB\\n * RN FB\\n * Rugby\\n * Tennis\\n * WNBA\\n * WWE\\n * X Games\\n * XFL\\n\\n * More\""
"\"Skip to main content Skip to navigation\\n\\n<\\n\\n>\\n\\nMenu\\n\\n## ESPN\\n\\n * Search\\n\\n * * scores\\n\\n * NFL\\n * MLB\\n * NBA\\n * NHL\\n * Soccer\\n * NCAAF\\n * \u2026\\n\\n * Women's World Cup\\n * LLWS\\n * NCAAM\\n * NCAAW\\n * Sports Betting\\n * Boxing\\n * CFL\\n * NCAA\\n * Cricket\\n * F1\\n * Golf\\n * Horse\\n * MMA\\n * NASCAR\\n * NBA G League\\n * Olympic Sports\\n * PLL\\n * Racing\\n * RN BB\\n * RN FB\\n * Rugby\\n * Tennis\\n * WNBA\\n * WWE\\n * X Games\\n * XFL\\n\\n * More\""
]
},
"execution_count": 7,
@ -338,25 +338,25 @@
" 'as it works to fend off antitrust scrutiny and cut '\n",
" 'costs.',\n",
" 'news_article_title': 'Amazon Cuts Dozens of House Brands'},\n",
" {'news_article_summary': 'President Bidens order comes on top of a slowing '\n",
" {'news_article_summary': 'President Biden\u2019s order comes on top of a slowing '\n",
" 'Chinese economy, Covid lockdowns and rising '\n",
" 'tensions between the two powers.',\n",
" 'news_article_title': 'U.S. Investment Ban on China Poised to Deepen Divide'},\n",
" {'news_article_summary': 'The proposed trial date in the '\n",
" 'election-interference case comes on the same day as '\n",
" 'the former presidents not guilty plea on '\n",
" 'the former president\u2019s not guilty plea on '\n",
" 'additional Mar-a-Lago charges.',\n",
" 'news_article_title': 'Trump Should Be Tried in January, Prosecutors Tell '\n",
" 'Judge'},\n",
" {'news_article_summary': 'The CEO who started in June says the platform has '\n",
" '“an entirely different road map” for the future.',\n",
" '\u201can entirely different road map\u201d for the future.',\n",
" 'news_article_title': 'Yaccarino Says X Is Watching Threads but Has Its Own '\n",
" 'Vision'},\n",
" {'news_article_summary': 'Students foot the bill for flagship state '\n",
" 'universities that pour money into new buildings and '\n",
" 'programs with little pushback.',\n",
" 'news_article_title': 'Colleges Spend Like Theres No Tomorrow. These '\n",
" 'Places Are Just Devouring Money.'},\n",
" 'news_article_title': 'Colleges Spend Like There\u2019s No Tomorrow. \u2018These '\n",
" 'Places Are Just Devouring Money.\u2019'},\n",
" {'news_article_summary': 'Wildfires fanned by hurricane winds have torn '\n",
" 'through parts of the Hawaiian island, devastating '\n",
" 'the popular tourist town of Lahaina.',\n",
@ -371,27 +371,27 @@
" 'rising drug gang violence.',\n",
" 'news_article_title': 'Ecuador Declares State of Emergency After '\n",
" 'Presidential Hopeful Killed'},\n",
" {'news_article_summary': 'This years hurricane season, which typically runs '\n",
" {'news_article_summary': 'This year\u2019s hurricane season, which typically runs '\n",
" 'from June to the end of November, has been '\n",
" 'difficult to predict, climate scientists said.',\n",
" 'news_article_title': 'Atlantic Hurricane Season Prediction Increased to '\n",
" 'Above Normal, NOAA Says'},\n",
" '\u2018Above Normal,\u2019 NOAA Says'},\n",
" {'news_article_summary': 'The NFL is raising the price of its NFL+ streaming '\n",
" 'packages as it adds the NFL Network and RedZone.',\n",
" 'news_article_title': 'NFL to Raise Price of NFL+ Streaming Packages as It '\n",
" 'Adds NFL Network, RedZone'},\n",
" {'news_article_summary': 'Russia is planning a moon mission as part of the '\n",
" 'new space race.',\n",
" 'news_article_title': 'Russias Moon Mission and the New Space Race'},\n",
" {'news_article_summary': 'Tapestrys $8.5 billion acquisition of Capri would '\n",
" 'news_article_title': 'Russia\u2019s Moon Mission and the New Space Race'},\n",
" {'news_article_summary': 'Tapestry\u2019s $8.5 billion acquisition of Capri would '\n",
" 'create a conglomerate with more than $12 billion in '\n",
" 'annual sales, but it would still lack the '\n",
" 'high-wattage labels and diversity that have fueled '\n",
" 'LVMHs success.',\n",
" 'LVMH\u2019s success.',\n",
" 'news_article_title': \"Why the Coach and Kors Marriage Doesn't Scare LVMH\"},\n",
" {'news_article_summary': 'The Supreme Court has blocked Purdue Pharmas $6 '\n",
" {'news_article_summary': 'The Supreme Court has blocked Purdue Pharma\u2019s $6 '\n",
" 'billion Sackler opioid settlement.',\n",
" 'news_article_title': 'Supreme Court Blocks Purdue Pharmas $6 Billion '\n",
" 'news_article_title': 'Supreme Court Blocks Purdue Pharma\u2019s $6 Billion '\n",
" 'Sackler Opioid Settlement'},\n",
" {'news_article_summary': 'The Social Security COLA is expected to rise in '\n",
" '2024, but not by a lot.',\n",
@ -578,7 +578,8 @@
"### Going deeper \n",
"\n",
"* Here's an [app](https://github.com/langchain-ai/web-explorer/tree/main) that wraps this retriever with a lightweight UI."
]
],
"id": "7a940df1"
},
{
"cell_type": "markdown",

View File

@ -30,6 +30,7 @@ from langchain.retrievers.ensemble import EnsembleRetriever
from langchain.retrievers.google_cloud_enterprise_search import (
GoogleCloudEnterpriseSearchRetriever,
)
from langchain.retrievers.kay import KayAiRetriever
from langchain.retrievers.kendra import AmazonKendraRetriever
from langchain.retrievers.knn import KNNRetriever
from langchain.retrievers.llama_index import (
@ -68,6 +69,7 @@ __all__ = [
"ChaindeskRetriever",
"ElasticSearchBM25Retriever",
"GoogleCloudEnterpriseSearchRetriever",
"KayAiRetriever",
"KNNRetriever",
"LlamaIndexGraphRetriever",
"LlamaIndexRetriever",

View File

@ -0,0 +1,59 @@
from __future__ import annotations
from typing import Any, List
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.schema import BaseRetriever, Document
class KayAiRetriever(BaseRetriever):
    """Retriever for Kay.ai datasets.

    To work properly, expects you to have KAY_API_KEY env variable set.
    You can get one for free at https://kay.ai/.
    """

    # Kay client; ``create`` builds a ``kay.rag.retrievers.KayRetriever`` here.
    client: Any
    # Value passed to the client as ``num_context`` on every query.
    num_contexts: int

    @classmethod
    def create(
        cls,
        dataset_id: str,
        data_types: List[str],
        num_contexts: int = 6,
    ) -> KayAiRetriever:
        """Create a KayAiRetriever given a Kay dataset id and a list of datasources.

        Args:
            dataset_id: A dataset id category in Kay, like "company"
            data_types: A list of datasources present within a dataset. For
                "company" the corresponding datasources could be
                ["10-K", "10-Q", "8-K", "PressRelease"].
            num_contexts: The number of documents to retrieve on each query.
                Defaults to 6.

        Returns:
            A retriever wrapping a freshly constructed Kay client.

        Raises:
            ImportError: If the ``kay`` package cannot be imported.
        """
        # Imported lazily so that ``kay`` stays an optional dependency.
        try:
            from kay.rag.retrievers import KayRetriever
        except ImportError as e:
            raise ImportError(
                "Could not import kay python package. Please install it with "
                "`pip install kay`.",
            ) from e

        client = KayRetriever(dataset_id, data_types)
        return cls(client=client, num_contexts=num_contexts)

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Query the Kay client and convert each returned context to a Document.

        The ``chunk_embed_text`` entry of a context becomes the page content and
        the remaining entries become the metadata. Contexts without a
        ``chunk_embed_text`` entry (or with it set to None) are skipped.

        Args:
            query: The query string forwarded to the client.
            run_manager: Callback manager for this run; not used here.

        Returns:
            The documents built from the client's contexts, in the same order.
        """
        ctxs = self.client.query(query=query, num_context=self.num_contexts)
        docs = []
        for ctx in ctxs:
            # Work on a copy so the client's own result dicts are not mutated.
            metadata = dict(ctx)
            page_content = metadata.pop("chunk_embed_text", None)
            if page_content is None:
                continue
            docs.append(Document(page_content=page_content, metadata=metadata))
        return docs

View File

@ -0,0 +1,24 @@
"""Integration test for Kay.ai API Wrapper."""
import pytest
from langchain.retrievers import KayAiRetriever
from langchain.schema import Document
@pytest.mark.requires("kay")
def test_kay_retriever() -> None:
    """Run one query through KayAiRetriever and check the returned documents."""
    expected_count = 3
    kay_retriever = KayAiRetriever.create(
        dataset_id="company",
        data_types=["10-K", "10-Q", "8-K", "PressRelease"],
        num_contexts=expected_count,
    )
    question = (
        "What were the biggest strategy changes and partnerships made by Roku "
        "in 2023?"
    )
    results = kay_retriever.get_relevant_documents(question)

    # One document is expected per requested context.
    assert len(results) == expected_count
    for result in results:
        assert isinstance(result, Document)
        assert result.page_content
        assert result.metadata
        assert len(list(result.metadata.items())) > 0