From 49bdd3b6fe07e1ee29c5f2381ef7a563bef15778 Mon Sep 17 00:00:00 2001 From: Jason Zhang <166434281+jayfish0@users.noreply.github.com> Date: Tue, 11 Mar 2025 18:57:40 -0700 Subject: [PATCH] docs: Add AgentQL provider doc, tool/toolkit doc and documentloader doc (#30144) - **Description:** Added AgentQL docs for the provider page, tools page and documentloader page - **Twitter handle:** @AgentQL Repo: https://github.com/tinyfish-io/agentql-integrations/tree/main/langchain PyPI: https://pypi.org/project/langchain-agentql/ If no one reviews your PR within a few days, please @-mention one of baskaryan, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Chester Curme --- .../document_loaders/agentql.ipynb | 265 ++++ docs/docs/integrations/providers/agentql.mdx | 35 + docs/docs/integrations/tools/agentql.ipynb | 1077 +++++++++++++++++ docs/scripts/tool_feat_table.py | 5 + docs/src/theme/FeatureTables.js | 7 + libs/packages.yml | 3 + 6 files changed, 1392 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/agentql.ipynb create mode 100644 docs/docs/integrations/providers/agentql.mdx create mode 100644 docs/docs/integrations/tools/agentql.ipynb diff --git a/docs/docs/integrations/document_loaders/agentql.ipynb b/docs/docs/integrations/document_loaders/agentql.ipynb new file mode 100644 index 00000000000..0cdfcc39b81 --- /dev/null +++ b/docs/docs/integrations/document_loaders/agentql.ipynb @@ -0,0 +1,265 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "wkUAAcGZNSJ3" + }, + "source": [ + "# AgentQLLoader\n", + "\n", + "[AgentQL](https://www.agentql.com/)'s document loader provides structured data extraction from any web page using an [AgentQL query](https://docs.agentql.com/agentql-query). 
AgentQL can be used across multiple languages and web pages without breaking over time and change.\n", + "\n", + "## Overview\n", + "\n", + "`AgentQLLoader` requires the following two parameters:\n", + "- `url`: The URL of the web page you want to extract data from.\n", + "- `query`: The AgentQL query to execute. Learn more about [how to write an AgentQL query in the docs](https://docs.agentql.com/agentql-query) or test one out in the [AgentQL Playground](https://dev.agentql.com/playground).\n", + "\n", + "The following parameters are optional:\n", + "- `api_key`: Your AgentQL API key from [dev.agentql.com](https://dev.agentql.com). **`Optional`.**\n", + "- `timeout`: The number of seconds to wait for a request before timing out. **Defaults to `900`.**\n", + "- `is_stealth_mode_enabled`: Whether to enable experimental anti-bot evasion strategies. This feature may not work for all websites at all times. Data extraction may take longer to complete with this mode enabled. **Defaults to `False`.**\n", + "- `wait_for`: The number of seconds to wait for the page to load before extracting data. **Defaults to `0`.**\n", + "- `is_scroll_to_bottom_enabled`: Whether to scroll to bottom of the page before extracting data. **Defaults to `False`.**\n", + "- `mode`: `\"standard\"` uses deep data analysis, while `\"fast\"` trades some depth of analysis for speed and is adequate for most use cases. [Learn more about the modes in this guide.](https://docs.agentql.com/accuracy/standard-mode) **Defaults to `\"fast\"`.**\n", + "- `is_screenshot_enabled`: Whether to take a screenshot before extracting data. Returned in 'metadata' as a Base64 string. 
**Defaults to `False`.**\n", + "\n", + "`AgentQLLoader` is implemented with AgentQL's [REST API](https://docs.agentql.com/rest-api/api-reference).\n", + "\n", + "### Integration details\n", + "\n", + "| Class | Package | Local | Serializable | JS support |\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| AgentQLLoader | langchain-agentql | ✅ | ❌ | ❌ |\n", + "\n", + "### Loader features\n", + "| Source | Document Lazy Loading | Native Async Support |\n", + "| :---: | :---: | :---: |\n", + "| AgentQLLoader | ✅ | ❌ |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CaKa2QrnwPXq" + }, + "source": [ + "## Setup\n", + "\n", + "To use the AgentQL Document Loader, you will need to configure the `AGENTQL_API_KEY` environment variable, or use the `api_key` parameter. You can acquire an API key from our [Dev Portal](https://dev.agentql.com)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mZNJvUQBNSJ5" + }, + "source": [ + "### Installation\n", + "\n", + "Install **langchain-agentql**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IblRoJJDNSJ5" + }, + "outputs": [], + "source": [ + "%pip install -qU langchain_agentql" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SNsUT60YvfCm" + }, + "source": [ + "### Set Credentials" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "2D1EN7Egvk1c" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"AGENTQL_API_KEY\"] = \"YOUR_AGENTQL_API_KEY\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D4hnJV_6NSJ5" + }, + "source": [ + "## Initialization\n", + "\n", + "Next, instantiate your loader object:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "oMJdxL_KNSJ5" + }, + "outputs": [], + "source": [ + "from langchain_agentql.document_loaders import AgentQLLoader\n", + "\n", + "loader = AgentQLLoader(\n", + " url=\"https://www.agentql.com/blog\",\n", + " query=\"\"\"\n", + " {\n", + " posts[] {\n", + " title\n", + " url\n", + " date\n", + " author\n", + " }\n", + " }\n", + " \"\"\",\n", + " is_scroll_to_bottom_enabled=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SRxIOx90NSJ5" + }, + "source": [ + "## Load" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bNnnCZ1oNSJ5", + "outputId": "d0eb8cb4-9742-4f0c-80f1-0509a3af1808" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(metadata={'request_id': 'bdb9dbe7-8a7f-427f-bc16-839ccc02cae6', 'generated_query': None, 'screenshot': None}, page_content=\"{'posts': [{'title': 'Launch Week Recap—make the web AI-ready', 'url': 'https://www.agentql.com/blog/2024-launch-week-recap', 'date': 'Nov 18, 2024', 'author': 'Rachel-Lee Nabors'}, {'title': 'Accurate data extraction from PDFs and images with AgentQL', 'url': 
'https://www.agentql.com/blog/accurate-data-extraction-pdfs-images', 'date': 'Feb 1, 2025', 'author': 'Rachel-Lee Nabors'}, {'title': 'Introducing Scheduled Scraping Workflows', 'url': 'https://www.agentql.com/blog/scheduling', 'date': 'Dec 2, 2024', 'author': 'Rachel-Lee Nabors'}, {'title': 'Updates to Our Pricing Model', 'url': 'https://www.agentql.com/blog/2024-pricing-update', 'date': 'Nov 19, 2024', 'author': 'Rachel-Lee Nabors'}, {'title': 'Get data from any page: AgentQL’s REST API Endpoint—Launch week day 5', 'url': 'https://www.agentql.com/blog/data-rest-api', 'date': 'Nov 15, 2024', 'author': 'Rachel-Lee Nabors'}]}\")" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs = loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wtPMNh72NSJ5", + "outputId": "59d529a4-3c22-445c-f5cf-dc7b24168906" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'request_id': 'bdb9dbe7-8a7f-427f-bc16-839ccc02cae6', 'generated_query': None, 'screenshot': None}\n" + ] + } + ], + "source": [ + "print(docs[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7RMuEwl4NSJ5" + }, + "source": [ + "## Lazy Load\n", + "\n", + "`AgentQLLoader` currently only loads one `Document` at a time. 
Therefore, `load()` and `lazy_load()` behave the same:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FIYddZBONSJ5", + "outputId": "c39a7a6d-bc52-4ef9-b36f-e1d138590b79" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'request_id': '06273abd-b2ef-4e15-b0ec-901cba7b4825', 'generated_query': None, 'screenshot': None}, page_content=\"{'posts': [{'title': 'Launch Week Recap—make the web AI-ready', 'url': 'https://www.agentql.com/blog/2024-launch-week-recap', 'date': 'Nov 18, 2024', 'author': 'Rachel-Lee Nabors'}, {'title': 'Accurate data extraction from PDFs and images with AgentQL', 'url': 'https://www.agentql.com/blog/accurate-data-extraction-pdfs-images', 'date': 'Feb 1, 2025', 'author': 'Rachel-Lee Nabors'}, {'title': 'Introducing Scheduled Scraping Workflows', 'url': 'https://www.agentql.com/blog/scheduling', 'date': 'Dec 2, 2024', 'author': 'Rachel-Lee Nabors'}, {'title': 'Updates to Our Pricing Model', 'url': 'https://www.agentql.com/blog/2024-pricing-update', 'date': 'Nov 19, 2024', 'author': 'Rachel-Lee Nabors'}, {'title': 'Get data from any page: AgentQL’s REST API Endpoint—Launch week day 5', 'url': 'https://www.agentql.com/blog/data-rest-api', 'date': 'Nov 15, 2024', 'author': 'Rachel-Lee Nabors'}]}\")]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pages = [doc for doc in loader.lazy_load()]\n", + "pages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For more information on how to use this integration, please refer to the [git repo](https://github.com/tinyfish-io/agentql-integrations/tree/main/langchain) or the [langchain integration documentation](https://docs.agentql.com/integrations/langchain)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/docs/integrations/providers/agentql.mdx b/docs/docs/integrations/providers/agentql.mdx new file mode 100644 index 00000000000..20381e1784e --- /dev/null +++ b/docs/docs/integrations/providers/agentql.mdx @@ -0,0 +1,35 @@ +# AgentQL + +[AgentQL](https://www.agentql.com/) provides web interaction and structured data extraction from any web page using an [AgentQL query](https://docs.agentql.com/agentql-query) or a Natural Language prompt. AgentQL can be used across multiple languages and web pages without breaking over time and change. + +## Installation and Setup + +Install the integration package: + +```bash +pip install langchain-agentql +``` + +## API Key + +Get an API Key from our [Dev Portal](https://dev.agentql.com/) and add it to your environment variables: +```bash +export AGENTQL_API_KEY="your-api-key-here" +``` + +## DocumentLoader +AgentQL's document loader provides structured data extraction from any web page using an AgentQL query. + +```python +from langchain_agentql.document_loaders import AgentQLLoader +``` +See our [document loader documentation and usage example](/docs/integrations/document_loaders/agentql). + +## Tools and Toolkits +AgentQL tools provide web interaction and structured data extraction from any web page using an AgentQL query or a Natural Language prompt. + +```python +from langchain_agentql.tools import ExtractWebDataTool, ExtractWebDataBrowserTool, GetWebElementBrowserTool +from langchain_agentql import AgentQLBrowserToolkit +``` +See our [tools documentation and usage example](/docs/integrations/tools/agentql). 
diff --git a/docs/docs/integrations/tools/agentql.ipynb b/docs/docs/integrations/tools/agentql.ipynb new file mode 100644 index 00000000000..96c3c3797cc --- /dev/null +++ b/docs/docs/integrations/tools/agentql.ipynb @@ -0,0 +1,1077 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a6f91f20", + "metadata": { + "id": "a6f91f20" + }, + "source": [ + "# AgentQL\n", + "\n", + "[AgentQL](https://www.agentql.com/) tools provide web interaction and structured data extraction from any web page using an [AgentQL query](https://docs.agentql.com/agentql-query) or a Natural Language prompt. AgentQL can be used across multiple languages and web pages without breaking over time and change.\n", + "\n", + "## Overview\n", + "\n", + "AgentQL provides the following three tools:\n", + "\n", + "- **`ExtractWebDataTool`** extracts structured data as JSON from a web page given a URL using either an [AgentQL query](https://docs.agentql.com/agentql-query/query-intro) or a Natural Language description of the data.\n", + "\n", + "The following two tools are also bundled as `AgentQLBrowserToolkit` and must be used with a `Playwright` browser or a remote browser instance via Chrome DevTools Protocol (CDP):\n", + "\n", + "- **`ExtractWebDataBrowserTool`** extracts structured data as JSON from the active web page in a browser using either an [AgentQL query](https://docs.agentql.com/agentql-query/query-intro) or a Natural Language description.\n", + "\n", + "- **`GetWebElementBrowserTool`** finds a web element on the active web page in a browser using a Natural Language description and returns its CSS selector for further interaction.\n", + "\n", + "### Integration details\n", + "\n", + "| Class | Package | Serializable | [JS support](https://js.langchain.com/docs/integrations/tools/langchain_agentql) | Package latest |\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| AgentQL | langchain-agentql | ❌ | ❌ | 1.0.0 |\n", + "\n", + "### Tool features\n", + "\n", + "| Tool | Web Data 
Extraction | Web Element Extraction | Use With Local Browser |\n", + "| :--- | :---: | :---: | :---: |\n", + "| ExtractWebDataTool | ✅ | ❌ | ❌ |\n", + "| ExtractWebDataBrowserTool | ✅ | ❌ | ✅ |\n", + "| GetWebElementBrowserTool | ❌ | ✅ | ✅ |" + ] + }, + { + "cell_type": "markdown", + "id": "e0ec39b2", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f85b4089", + "metadata": { + "id": "f85b4089" + }, + "outputs": [], + "source": [ + "%pip install --quiet -U langchain_agentql" + ] + }, + { + "cell_type": "markdown", + "id": "uQvMedxGWeUV", + "metadata": { + "id": "uQvMedxGWeUV" + }, + "source": [ + "To run this notebook, install the `Playwright` browser and configure Jupyter Notebook's `asyncio` loop." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "OMaAdTLYWRfL", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OMaAdTLYWRfL", + "outputId": "e1472145-dea3-45b1-b8d1-875772bfdfe1" + }, + "outputs": [], + "source": [ + "!playwright install\n", + "\n", + "# This import is required only for jupyter notebooks, since they have their own eventloop\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "id": "b15e9266", + "metadata": { + "id": "b15e9266" + }, + "source": [ + "### Credentials\n", + "\n", + "To use the AgentQL tools, you will need to get your own API key from the [AgentQL Dev Portal](https://dev.agentql.com/) and set the `AGENTQL_API_KEY` environment variable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e0b178a2-8816-40ca-b57c-ccdd86dde9c9", + "metadata": { + "id": "e0b178a2-8816-40ca-b57c-ccdd86dde9c9" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"AGENTQL_API_KEY\"] = \"YOUR_AGENTQL_API_KEY\"" + ] + }, + { + "cell_type": "markdown", + "id": "3nk4k9ED8E9Z", + "metadata": { + "id": "3nk4k9ED8E9Z" + }, + "source": [ + "## Instantiation" + ] + }, + { + "cell_type": "markdown", + "id": "1c97218f-f366-479d-8bf7-fe9f2f6df73f", + "metadata": { + "id": "1c97218f-f366-479d-8bf7-fe9f2f6df73f" + }, + "source": [ + "### `ExtractWebDataTool`\n", + "You can instantiate `ExtractWebDataTool` with the following params:\n", + "- `api_key`: Your AgentQL API key from [dev.agentql.com](https://dev.agentql.com). **`Optional`.**\n", + "- `timeout`: The number of seconds to wait for a request before timing out. Increase if data extraction times out. **Defaults to `900`.**\n", + "- `is_stealth_mode_enabled`: Whether to enable experimental anti-bot evasion strategies. This feature may not work for all websites at all times. Data extraction may take longer to complete with this mode enabled. **Defaults to `False`.**\n", + "- `wait_for`: The number of seconds to wait for the page to load before extracting data. **Defaults to `0`.**\n", + "- `is_scroll_to_bottom_enabled`: Whether to scroll to bottom of the page before extracting data. **Defaults to `False`.**\n", + "- `mode`: `\"standard\"` uses deep data analysis, while `\"fast\"` trades some depth of analysis for speed and is adequate for most usecases. [Learn more about the modes in this guide.](https://docs.agentql.com/accuracy/standard-mode) **Defaults to `\"fast\"`.**\n", + "- `is_screenshot_enabled`: Whether to take a screenshot before extracting data. Returned in 'metadata' as a Base64 string. 
**Defaults to `False`.**\n", + "\n", + "`ExtractWebDataTool` is implemented with AgentQL's REST API. You can view more details about the parameters in the [API Reference docs](https://docs.agentql.com/rest-api/api-reference)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8b3ddfe9-ca79-494c-a7ab-1f56d9407a64", + "metadata": { + "id": "8b3ddfe9-ca79-494c-a7ab-1f56d9407a64" + }, + "outputs": [], + "source": [ + "from langchain_agentql.tools import ExtractWebDataTool\n", + "\n", + "extract_web_data_tool = ExtractWebDataTool()" + ] + }, + { + "cell_type": "markdown", + "id": "sSTERzBmCyi3", + "metadata": { + "id": "sSTERzBmCyi3" + }, + "source": [ + "### `ExtractWebDataBrowserTool`\n", + "\n", + "To instantiate **ExtractWebDataBrowserTool**, you need to connect the tool with a browser instance.\n", + "\n", + "You can set the following params:\n", + "- `timeout`: The number of seconds to wait for a request before timing out. Increase if data extraction times out. **Defaults to `900`.**\n", + "- `wait_for_network_idle`: Whether to wait until the network reaches a full idle state before executing. **Defaults to `True`.**\n", + "- `include_hidden`: Whether to take into account visually hidden elements on the page. **Defaults to `True`.**\n", + "- `mode`: `\"standard\"` uses deep data analysis, while `\"fast\"` trades some depth of analysis for speed and is adequate for most use cases. [Learn more about the modes in this guide.](https://docs.agentql.com/accuracy/standard-mode) **Defaults to `\"fast\"`.**\n", + "\n", + "`ExtractWebDataBrowserTool` is implemented with AgentQL's SDK. You can find more details about the parameters and the functions in AgentQL's [API References](https://docs.agentql.com/python-sdk/api-references/agentql-page#querydata)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bnhKlXqHE7Z5", + "metadata": { + "id": "bnhKlXqHE7Z5" + }, + "outputs": [], + "source": [ + "from langchain_agentql.tools import ExtractWebDataBrowserTool\n", + "from langchain_agentql.utils import create_async_playwright_browser\n", + "\n", + "async_browser = await create_async_playwright_browser()\n", + "\n", + "extract_web_data_browser_tool = ExtractWebDataBrowserTool(async_browser=async_browser)" + ] + }, + { + "cell_type": "markdown", + "id": "DJjSQwSaJ4Ml", + "metadata": { + "id": "DJjSQwSaJ4Ml" + }, + "source": [ + "### `GetWebElementBrowserTool`\n", + "\n", + "To instantiate **GetWebElementBrowserTool**, you need to connect the tool with a browser instance.\n", + "\n", + "You can set the following params:\n", + "- `timeout`: The number of seconds to wait for a request before timing out. Increase if data extraction times out. **Defaults to `900`.**\n", + "- `wait_for_network_idle`: Whether to wait until the network reaches a full idle state before executing. **Defaults to `True`.**\n", + "- `include_hidden`: Whether to take into account visually hidden elements on the page. **Defaults to `False`.**\n", + "- `mode`: `\"standard\"` uses deep data analysis, while `\"fast\"` trades some depth of analysis for speed and is adequate for most usecases. [Learn more about the modes in this guide.](https://docs.agentql.com/accuracy/standard-mode) **Defaults to `\"fast\"`.**\n", + "\n", + "`GetWebElementBrowserTool` is implemented with AgentQL's SDK. 
You can find more details about the parameters and the functions in AgentQL's [API References](https://docs.agentql.com/python-sdk/api-references/agentql-page#queryelements)." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "503PRMZ1Lay7", + "metadata": { + "id": "503PRMZ1Lay7" + }, + "outputs": [], + "source": [ + "from langchain_agentql.tools import GetWebElementBrowserTool\n", + "\n", + "extract_web_element_tool = GetWebElementBrowserTool(async_browser=async_browser)" + ] + }, + { + "cell_type": "markdown", + "id": "RVRZn9Dy8Q65", + "metadata": { + "id": "RVRZn9Dy8Q65" + }, + "source": [ + "## Invocation" + ] + }, + { + "cell_type": "markdown", + "id": "74147a1a", + "metadata": { + "id": "74147a1a" + }, + "source": [ + "### `ExtractWebDataTool`\n", + "\n", + "This tool uses AgentQL's REST API under the hood, sending the publicly available web page's URL to AgentQL's endpoint. This will not work with private pages or logged-in sessions. Use `ExtractWebDataBrowserTool` for those use cases.\n", + "\n", + "- `url`: The URL of the web page you want to extract data from.\n", + "- `query`: The AgentQL query to execute. Use AgentQL query if you want to extract precisely structured data. Learn more about [how to write an AgentQL query in the docs](https://docs.agentql.com/agentql-query) or test one out in the [AgentQL Playground](https://dev.agentql.com/playground).\n", + "- `prompt`: A Natural Language description of the data to extract from the page. AgentQL will infer the data’s structure from your prompt. Use `prompt` if you want to extract data defined by free-form language without defining a particular structure. \n", + "\n", + "**Note:** You must define either a `query` or a `prompt` to use AgentQL." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "65310a8b-eb0c-4d9e-a618-4f4abe2414fc", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "65310a8b-eb0c-4d9e-a618-4f4abe2414fc", + "outputId": "48996c37-b61e-487f-a618-719f75afc4db" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': {'posts': [{'title': 'Launch Week Recap—make the web AI-ready',\n", + " 'url': 'https://www.agentql.com/blog/2024-launch-week-recap',\n", + " 'date': 'Nov 18, 2024',\n", + " 'author': 'Rachel-Lee Nabors'},\n", + " {'title': 'Accurate data extraction from PDFs and images with AgentQL',\n", + " 'url': 'https://www.agentql.com/blog/accurate-data-extraction-pdfs-images',\n", + " 'date': 'Feb 1, 2025',\n", + " 'author': 'Rachel-Lee Nabors'},\n", + " {'title': 'Introducing Scheduled Scraping Workflows',\n", + " 'url': 'https://www.agentql.com/blog/scheduling',\n", + " 'date': 'Dec 2, 2024',\n", + " 'author': 'Rachel-Lee Nabors'},\n", + " {'title': 'Updates to Our Pricing Model',\n", + " 'url': 'https://www.agentql.com/blog/2024-pricing-update',\n", + " 'date': 'Nov 19, 2024',\n", + " 'author': 'Rachel-Lee Nabors'},\n", + " {'title': 'Get data from any page: AgentQL’s REST API Endpoint—Launch week day 5',\n", + " 'url': 'https://www.agentql.com/blog/data-rest-api',\n", + " 'date': 'Nov 15, 2024',\n", + " 'author': 'Rachel-Lee Nabors'}]},\n", + " 'metadata': {'request_id': '0dc1f89c-1b6a-46fe-8089-6cd0f082f094',\n", + " 'generated_query': None,\n", + " 'screenshot': None}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can invoke the tool with either a query or a prompt\n", + "\n", + "# extract_web_data_tool.invoke(\n", + "# {\n", + "# \"url\": \"https://www.agentql.com/blog\",\n", + "# \"prompt\": \"the blog posts with title, url, date of post and author\",\n", + "# }\n", + "# )\n", + "\n", + "extract_web_data_tool.invoke(\n", + " {\n", + " \"url\": 
\"https://www.agentql.com/blog\",\n", + " \"query\": \"{ posts[] { title url date author } }\",\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "71zmio-iEEDU", + "metadata": { + "id": "71zmio-iEEDU" + }, + "source": [ + "### `ExtractWebDataBrowserTool`\n", + "- `query`: The AgentQL query to execute. Use AgentQL query if you want to extract precisely structured data. Learn more about [how to write an AgentQL query in the docs](https://docs.agentql.com/agentql-query) or test one out in the [AgentQL Playground](https://dev.agentql.com/playground).\n", + "- `prompt`: A Natural Language description of the data to extract from the page. AgentQL will infer the data’s structure from your prompt. Use `prompt` if you want to extract data defined by free-form language without defining a particular structure. \n", + "\n", + "**Note:** You must define either a `query` or a `prompt` to use AgentQL.\n", + "\n", + "To extract data, first you must navigate to a web page using LangChain's [Playwright](https://python.langchain.com/docs/integrations/tools/playwright/) tool." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "Cp9LxO8MaPN1", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "Cp9LxO8MaPN1", + "outputId": "08b98158-e451-428f-f2c0-4c7eec9924cd" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'Navigating to https://www.agentql.com/blog returned status code 200'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_community.tools.playwright import NavigateTool\n", + "\n", + "navigate_tool = NavigateTool(async_browser=async_browser)\n", + "await navigate_tool.ainvoke({\"url\": \"https://www.agentql.com/blog\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ia5qf2RFEQR9", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ia5qf2RFEQR9", + "outputId": "ee9c1396-82c2-4e94-a79e-730319610033" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/agentql/_core/_utils.py:167: UserWarning: \u001b[31m🚨 The function get_data_by_prompt_experimental is experimental and may not work as expected 🚨\u001b[0m\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "{'blog_posts': [{'title': 'Launch Week Recap—make the web AI-ready',\n", + " 'url': 'https://www.agentql.com/blog/2024-launch-week-recap',\n", + " 'date': 'Nov 18, 2024',\n", + " 'author': 'Rachel-Lee Nabors'},\n", + " {'title': 'Accurate data extraction from PDFs and images with AgentQL',\n", + " 'url': 'https://www.agentql.com/blog/accurate-data-extraction-pdfs-images',\n", + " 'date': 'Feb 1, 2025',\n", + " 'author': 'Rachel-Lee Nabors'},\n", + " {'title': 'Introducing Scheduled Scraping Workflows',\n", + " 'url': 'https://www.agentql.com/blog/scheduling',\n", + " 'date': 'Dec 2, 2024',\n", + " 'author': 
'Rachel-Lee Nabors'},\n", + " {'title': 'Updates to Our Pricing Model',\n", + " 'url': 'https://www.agentql.com/blog/2024-pricing-update',\n", + " 'date': 'Nov 19, 2024',\n", + " 'author': 'Rachel-Lee Nabors'},\n", + " {'title': 'Get data from any page: AgentQL’s REST API Endpoint—Launch week day 5',\n", + " 'url': 'https://www.agentql.com/blog/data-rest-api',\n", + " 'date': 'Nov 15, 2024',\n", + " 'author': 'Rachel-Lee Nabors'}]}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can invoke the tool with either a query or a prompt\n", + "\n", + "# await extract_web_data_browser_tool.ainvoke(\n", + "# {'query': '{ blogs[] { title url date author } }'}\n", + "# )\n", + "\n", + "await extract_web_data_browser_tool.ainvoke(\n", + " {\"prompt\": \"the blog posts with title, url, date of post and author\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "xg8rKW6jEXEI", + "metadata": { + "id": "xg8rKW6jEXEI" + }, + "source": [ + "### `GetWebElementBrowserTool`\n", + "- `prompt`: A Natural Language description of the web element to find on the page." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "XgktygByEnas", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "XgktygByEnas", + "outputId": "23091977-4b37-415a-97f9-8ed1154de495" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "\"[tf623_id='194']\"" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selector = await extract_web_element_tool.ainvoke({\"prompt\": \"Next page button\"})\n", + "selector" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "O8QJ_NyFfcdh", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "O8QJ_NyFfcdh", + "outputId": "ff0033ca-b230-4fc1-b25f-383f7b5e2ca0" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "\"Clicked element '[tf623_id='194']'\"" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_community.tools.playwright import ClickTool\n", + "\n", + "# Disabling 'visible_only' will allow us to click on elements that are not visible on the page\n", + "await ClickTool(async_browser=async_browser, visible_only=False).ainvoke(\n", + " {\"selector\": selector}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "dgsTLAcifoJO", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "dgsTLAcifoJO", + "outputId": "376d0bce-3595-4643-c616-764e4f6bb0ed" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'https://www.agentql.com/blog/page/2'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "from langchain_community.tools.playwright import CurrentWebPageTool\n", + "\n", + "await CurrentWebPageTool(async_browser=async_browser).ainvoke({})" + ] + }, + { + "cell_type": "markdown", + "id": "ed9fda2c", + "metadata": {}, + "source": [ + "## Chaining\n", + "\n", + "You can use AgentQL tools in a chain by first binding one to a [tool-calling model](/docs/how_to/tool_calling/) and then calling it:\n" + ] + }, + { + "cell_type": "markdown", + "id": "7fd5dc6f", + "metadata": {}, + "source": [ + "### Instantiate LLM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef351fb1", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac535776", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import init_chat_model\n", + "\n", + "llm = init_chat_model(model=\"gpt-4o\", model_provider=\"openai\")" + ] + }, + { + "cell_type": "markdown", + "id": "4727f685", + "metadata": {}, + "source": [ + "### Execute Tool Chain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "434b4678", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'messages': [ToolMessage(content='{\"data\": {\"posts\": [{\"title\": \"Launch Week Recap—make the web AI-ready\", \"url\": \"https://www.agentql.com/blog/2024-launch-week-recap\", \"date\": \"Nov 18, 2024\", \"author\": \"Rachel-Lee Nabors\"}, {\"title\": \"Accurate data extraction from PDFs and images with AgentQL\", \"url\": \"https://www.agentql.com/blog/accurate-data-extraction-pdfs-images\", \"date\": \"Feb 1, 2025\", \"author\": \"Rachel-Lee Nabors\"}, {\"title\": \"Introducing Scheduled Scraping Workflows\", \"url\": \"https://www.agentql.com/blog/scheduling\", \"date\": \"Dec 2, 2024\", \"author\": \"Rachel-Lee Nabors\"}, {\"title\": \"Updates to Our Pricing Model\", \"url\": 
\"https://www.agentql.com/blog/2024-pricing-update\", \"date\": \"Nov 19, 2024\", \"author\": \"Rachel-Lee Nabors\"}, {\"title\": \"Get data from any page: AgentQL’s REST API Endpoint—Launch week day 5\", \"url\": \"https://www.agentql.com/blog/data-rest-api\", \"date\": \"Nov 15, 2024\", \"author\": \"Rachel-Lee Nabors\"}]}, \"metadata\": {\"request_id\": \"1a84ed12-d02a-497d-b09d-21fe49342fa3\", \"generated_query\": null, \"screenshot\": null}}', name='extract_web_data_with_rest_api', tool_call_id='call_z4Rl1MpjJZNcbLlq1OCneoMF')]}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnableConfig, chain\n", + "\n", + "prompt = ChatPromptTemplate(\n", + " [\n", + " (\"system\", \"You are a helpful assistant in extracting data from website.\"),\n", + " (\"human\", \"{user_input}\"),\n", + " (\"placeholder\", \"{messages}\"),\n", + " ]\n", + ")\n", + "\n", + "# specifying tool_choice will force the model to call this tool.\n", + "llm_with_tools = llm.bind_tools(\n", + " [extract_web_data_tool], tool_choice=\"extract_web_data_with_rest_api\"\n", + ")\n", + "\n", + "llm_chain = prompt | llm_with_tools\n", + "\n", + "\n", + "@chain\n", + "def tool_chain(user_input: str, config: RunnableConfig):\n", + " input_ = {\"user_input\": user_input}\n", + " ai_msg = llm_chain.invoke(input_, config=config)\n", + " tool_msgs = extract_web_data_tool.batch(ai_msg.tool_calls, config=config)\n", + " return {\"messages\": tool_msgs}\n", + "\n", + "\n", + "tool_chain.invoke(\n", + " \"Extract data from https://www.agentql.com/blog using the following agentql query: { posts[] { title url date author } }\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "-SknEjzZXQWg", + "metadata": { + "id": "-SknEjzZXQWg" + }, + "source": [ + "## Use within an agent\n", + "\n", + "You can use AgentQL tools with an AI Agent using the 
`AgentQLBrowserToolkit`. This toolkit includes `ExtractWebDataBrowserTool` and `GetWebElementBrowserTool`. Here's an example of agentic browser actions that combine AgentQL's toolkit with the Playwright tools." + ] + }, + { + "cell_type": "markdown", + "id": "VLzyKpfAmvv7", + "metadata": { + "id": "VLzyKpfAmvv7" + }, + "source": [ + "### Instantiate Toolkit\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "UGsFDL1atP3y", + "metadata": { + "id": "UGsFDL1atP3y" + }, + "outputs": [], + "source": [ + "from langchain_agentql.utils import create_async_playwright_browser\n", + "\n", + "async_agent_browser = await create_async_playwright_browser()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "nCWN9X118rtF", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nCWN9X118rtF", + "outputId": "0ef8160e-f2a3-4ad5-f53d-8cb0f0d71367" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[ExtractWebDataBrowserTool(async_browser= version=133.0.6943.16>),\n", + " GetWebElementBrowserTool(async_browser= version=133.0.6943.16>)]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_agentql import AgentQLBrowserToolkit\n", + "\n", + "agentql_toolkit = AgentQLBrowserToolkit(async_browser=async_agent_browser)\n", + "agentql_toolkit.get_tools()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "rBYb-I6Tp56C", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rBYb-I6Tp56C", + "outputId": "b1ccaa79-425b-4137-cd4d-bffbc32fc395" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[NavigateTool(async_browser= version=133.0.6943.16>),\n", + " ClickTool(async_browser= version=133.0.6943.16>, visible_only=False)]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_community.tools.playwright import ClickTool, 
NavigateTool\n", + "\n", + "# we hand pick the following tools to allow more precise agentic browser actions\n", + "playwright_toolkit = [\n", + " NavigateTool(async_browser=async_agent_browser),\n", + " ClickTool(async_browser=async_agent_browser, visible_only=False),\n", + "]\n", + "playwright_toolkit" + ] + }, + { + "cell_type": "markdown", + "id": "5c_KiBCHqTjv", + "metadata": { + "id": "5c_KiBCHqTjv" + }, + "source": [ + "### Use with a ReAct Agent\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "HaAPyYr7quau", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HaAPyYr7quau", + "outputId": "2de2496c-06eb-47db-91d3-f2171caf0640" + }, + "outputs": [], + "source": [ + "%pip install --quiet -U langgraph" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "ShcjQ352qRS_", + "metadata": { + "id": "ShcjQ352qRS_" + }, + "outputs": [], + "source": [ + "from langgraph.prebuilt import create_react_agent\n", + "\n", + "# You need to set up an llm, please refer to the chaining section\n", + "agent_executor = create_react_agent(\n", + " llm, agentql_toolkit.get_tools() + playwright_toolkit\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "YEdeprRCq_7E", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YEdeprRCq_7E", + "outputId": "f834b9f3-802d-49d6-c5c3-06c86d6d82e8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "================================\u001b[1m Human Message \u001b[0m=================================\n", + "\n", + "\n", + "Navigate to https://news.ycombinator.com/,\n", + "extract the news titles on the current page,\n", + "show the current page url,\n", + "find the button on the webpage that direct to the next page,\n", + "click on the button,\n", + "show the current page url,\n", + "extract the news title on the current page\n", + "extract the news titles that mention \"AI\" from the two 
pages.\n", + "\n", + "==================================\u001b[1m Ai Message \u001b[0m==================================\n", + "Tool Calls:\n", + " navigate_browser (call_3eY5a0BRwyYj7kaNpAxkquTD)\n", + " Call ID: call_3eY5a0BRwyYj7kaNpAxkquTD\n", + " Args:\n", + " url: https://news.ycombinator.com/\n", + "=================================\u001b[1m Tool Message \u001b[0m=================================\n", + "Name: navigate_browser\n", + "\n", + "Navigating to https://news.ycombinator.com/ returned status code 200\n", + "==================================\u001b[1m Ai Message \u001b[0m==================================\n", + "Tool Calls:\n", + " extract_web_data_from_browser (call_WvRrZKGGo8mq3JewRlaIS5xx)\n", + " Call ID: call_WvRrZKGGo8mq3JewRlaIS5xx\n", + " Args:\n", + " prompt: Extract all the news titles from this page.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/agentql/_core/_utils.py:167: UserWarning: \u001b[31m🚨 The function get_data_by_prompt_experimental is experimental and may not work as expected 🚨\u001b[0m\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=================================\u001b[1m Tool Message \u001b[0m=================================\n", + "Name: extract_web_data_from_browser\n", + "\n", + "{\"news_item\": [{\"title\": \"I Went to SQL Injection Court\"}, {\"title\": \"Framework's first desktop is a strange–but unique–mini ITX gaming PC\"}, {\"title\": \"Hyperspace\"}, {\"title\": \"The XB-70 (2019)\"}, {\"title\": \"How core Git developers configure Git\"}, {\"title\": \"Emergent Misalignment: Narrow finetuning can produce broadly misaligned LLMs [pdf]\"}, {\"title\": \"Hard problems that reduce to document ranking\"}, {\"title\": \"Ggwave: Tiny Data-over-Sound Library\"}, {\"title\": \"Bald eagles are thriving again after near extinction\"}, {\"title\": \"Forum with 2.6M posts being deleted due to UK Online 
Safety Act\"}, {\"title\": \"Launch HN: Browser Use (YC W25) – open-source web agents\"}, {\"title\": \"Part two of Grant Sanderson's video with Terry Tao on the cosmic distance ladder\"}, {\"title\": \"New maps of the chaotic space-time inside black holes\"}, {\"title\": \"Knitting Your Parachute\"}, {\"title\": \"Chicory: A JVM native WebAssembly runtime\"}, {\"title\": \"Low Overhead Allocation Sampling with VMProf in PyPy's GC\"}, {\"title\": \"Sigma BF Camera\"}, {\"title\": \"DeepSearcher: A local open-source Deep Research\"}, {\"title\": \"Xonsh – A Python-powered shell\"}, {\"title\": \"A possible future of Python in the browser\"}, {\"title\": \"Show HN: GoatDB – A lightweight, offline-first, realtime NoDB for Deno and React\"}, {\"title\": \"Embedding Python in Elixir, it's fine\"}, {\"title\": \"The Deep Research problem\"}, {\"title\": \"Why are QR Codes with capital letters smaller than QR codes with lower case?\"}, {\"title\": \"Show HN: My new wiki for Silicon Graphics stuff\"}, {\"title\": \"AI is blurring the line between PMs and engineers?\"}, {\"title\": \"I recreated Shazam's algorithm with Go [video]\"}, {\"title\": \"Dogs may have domesticated themselves because they liked snacks, model suggests\"}, {\"title\": \"Show HN: Txtl – Fast static website of text utilities\"}, {\"title\": \"Have we been wrong about why Mars is red?\"}]}\n", + "==================================\u001b[1m Ai Message \u001b[0m==================================\n", + "Tool Calls:\n", + " get_web_element_from_browser (call_B6jn5ItasceNW7eeb640UhQQ)\n", + " Call ID: call_B6jn5ItasceNW7eeb640UhQQ\n", + " Args:\n", + " prompt: button or link to go to the next page\n", + " extract_web_data_from_browser (call_Wyh2VH76bzrlDozp7gpkVBl7)\n", + " Call ID: call_Wyh2VH76bzrlDozp7gpkVBl7\n", + " Args:\n", + " prompt: Extract the current page URL\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/usr/local/lib/python3.11/dist-packages/agentql/_core/_utils.py:167: UserWarning: \u001b[31m🚨 The function get_data_by_prompt_experimental is experimental and may not work as expected 🚨\u001b[0m\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=================================\u001b[1m Tool Message \u001b[0m=================================\n", + "Name: extract_web_data_from_browser\n", + "\n", + "{\"current_page_url\": \"https://news.ycombinator.com/news\"}\n", + "==================================\u001b[1m Ai Message \u001b[0m==================================\n", + "Tool Calls:\n", + " click_element (call_NLGIW1lLutkZ6k0vqkfGbOD7)\n", + " Call ID: call_NLGIW1lLutkZ6k0vqkfGbOD7\n", + " Args:\n", + " selector: [tf623_id='944']\n", + "=================================\u001b[1m Tool Message \u001b[0m=================================\n", + "Name: click_element\n", + "\n", + "Clicked element '[tf623_id='944']'\n", + "==================================\u001b[1m Ai Message \u001b[0m==================================\n", + "Tool Calls:\n", + " extract_web_data_from_browser (call_QPt8R2hqiSgytUvLcWUUORKF)\n", + " Call ID: call_QPt8R2hqiSgytUvLcWUUORKF\n", + " Args:\n", + " prompt: Extract the current page URL\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/agentql/_core/_utils.py:167: UserWarning: \u001b[31m🚨 The function get_data_by_prompt_experimental is experimental and may not work as expected 🚨\u001b[0m\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=================================\u001b[1m Tool Message \u001b[0m=================================\n", + "Name: extract_web_data_from_browser\n", + "\n", + "{\"current_page_url\": \"https://news.ycombinator.com/news?p=2\"}\n", + "==================================\u001b[1m Ai Message \u001b[0m==================================\n", + "Tool 
Calls:\n", + " extract_web_data_from_browser (call_ZZOPrIfVaVQ1A26j8EGE913W)\n", + " Call ID: call_ZZOPrIfVaVQ1A26j8EGE913W\n", + " Args:\n", + " prompt: Extract all the news titles from this page.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/agentql/_core/_utils.py:167: UserWarning: \u001b[31m🚨 The function get_data_by_prompt_experimental is experimental and may not work as expected 🚨\u001b[0m\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=================================\u001b[1m Tool Message \u001b[0m=================================\n", + "Name: extract_web_data_from_browser\n", + "\n", + "{\"news_item\": [{\"title\": \"'Hey Number 17 '\"}, {\"title\": \"Building and operating a pretty big storage system called S3 (2023)\"}, {\"title\": \"Ghost House – software for automatic inbetweens\"}, {\"title\": \"Ask HN: Former devs who can't get a job, what did you end up doing for work?\"}, {\"title\": \"DeepSeek open source DeepEP – library for MoE training and Inference\"}, {\"title\": \"SETI's hard steps and how to resolve them\"}, {\"title\": \"A Defense of Weird Research\"}, {\"title\": \"DigiCert: Threat of legal action to stifle Bugzilla discourse\"}, {\"title\": \"Show HN: Tach – Visualize and untangle your Python codebase\"}, {\"title\": \"Ask HN: A retrofitted C dialect?\"}, {\"title\": \"“The closer to the train station, the worse the kebab” – a “study”\"}, {\"title\": \"Brewing Clean Water: The metal-remediating benefits of tea preparation\"}, {\"title\": \"Invoker Commands (Explainer)\"}, {\"title\": \"Freelancing: How I found clients, part 1\"}, {\"title\": \"Claude 3.7 Sonnet and Claude Code\"}, {\"title\": \"Clean Code vs. 
A Philosophy Of Software Design\"}, {\"title\": \"Show HN: While the world builds AI Agents, I'm just building calculators\"}, {\"title\": \"History of CAD\"}, {\"title\": \"Fans are better than tech at organizing information online (2019)\"}, {\"title\": \"Some Programming Language Ideas\"}, {\"title\": \"The independent researcher (2018)\"}, {\"title\": \"The best way to use text embeddings portably is with Parquet and Polars\"}, {\"title\": \"Show HN: Prioritize Anything with Stacks\"}, {\"title\": \"Ashby (YC W19) Is Hiring Principal Product Engineers\"}, {\"title\": \"GibberLink [AI-AI Communication]\"}, {\"title\": \"Show HN: I made a site to tell the time in corporate\"}, {\"title\": \"It’s still worth blogging in the age of AI\"}, {\"title\": \"What would happen if we didn't use TCP or UDP?\"}, {\"title\": \"Closing the “green gap”: energy savings from the math of the landscape function\"}, {\"title\": \"Larry Ellison's half-billion-dollar quest to change farming\"}]}\n", + "==================================\u001b[1m Ai Message \u001b[0m==================================\n", + "\n", + "Here's a summary of the actions and results:\n", + "\n", + "### Page 1\n", + "- **URL:** [https://news.ycombinator.com/news](https://news.ycombinator.com/news)\n", + "- **News Titles:**\n", + " 1. I Went to SQL Injection Court\n", + " 2. Framework's first desktop is a strange–but unique–mini ITX gaming PC\n", + " 3. Hyperspace\n", + " 4. The XB-70 (2019)\n", + " 5. How core Git developers configure Git\n", + " 6. Emergent Misalignment: Narrow finetuning can produce broadly misaligned LLMs [pdf]\n", + " 7. Hard problems that reduce to document ranking\n", + " 8. Ggwave: Tiny Data-over-Sound Library\n", + " 9. Bald eagles are thriving again after near extinction\n", + " 10. Forum with 2.6M posts being deleted due to UK Online Safety Act\n", + " 11. Launch HN: Browser Use (YC W25) – open-source web agents\n", + " 12. 
Part two of Grant Sanderson's video with Terry Tao on the cosmic distance ladder\n", + " 13. New maps of the chaotic space-time inside black holes\n", + " 14. Knitting Your Parachute\n", + " 15. Chicory: A JVM native WebAssembly runtime\n", + " 16. Low Overhead Allocation Sampling with VMProf in PyPy's GC\n", + " 17. Sigma BF Camera\n", + " 18. DeepSearcher: A local open-source Deep Research\n", + " 19. Xonsh – A Python-powered shell\n", + " 20. A possible future of Python in the browser\n", + " 21. Show HN: GoatDB – A lightweight, offline-first, realtime NoDB for Deno and React\n", + " 22. Embedding Python in Elixir, it's fine\n", + " 23. The Deep Research problem\n", + " 24. Why are QR Codes with capital letters smaller than QR codes with lower case?\n", + " 25. Show HN: My new wiki for Silicon Graphics stuff\n", + " 26. **AI is blurring the line between PMs and engineers?**\n", + " 27. I recreated Shazam's algorithm with Go [video]\n", + " 28. Dogs may have domesticated themselves because they liked snacks, model suggests\n", + " 29. Show HN: Txtl – Fast static website of text utilities\n", + " 30. Have we been wrong about why Mars is red?\n", + "\n", + "### Page 2\n", + "- **URL:** [https://news.ycombinator.com/news?p=2](https://news.ycombinator.com/news?p=2)\n", + "- **News Titles:**\n", + " 1. 'Hey Number 17'\n", + " 2. Building and operating a pretty big storage system called S3 (2023)\n", + " 3. Ghost House – software for automatic inbetweens\n", + " 4. Ask HN: Former devs who can't get a job, what did you end up doing for work?\n", + " 5. DeepSeek open source DeepEP – library for MoE training and Inference\n", + " 6. SETI's hard steps and how to resolve them\n", + " 7. A Defense of Weird Research\n", + " 8. DigiCert: Threat of legal action to stifle Bugzilla discourse\n", + " 9. Show HN: Tach – Visualize and untangle your Python codebase\n", + " 10. Ask HN: A retrofitted C dialect?\n", + " 11. 
“The closer to the train station, the worse the kebab” – a “study”\n", + " 12. Brewing Clean Water: The metal-remediating benefits of tea preparation\n", + " 13. Invoker Commands (Explainer)\n", + " 14. Freelancing: How I found clients, part 1\n", + " 15. Claude 3.7 Sonnet and Claude Code\n", + " 16. Clean Code vs. A Philosophy Of Software Design\n", + " 17. **Show HN: While the world builds AI Agents, I'm just building calculators**\n", + " 18. History of CAD\n", + " 19. Fans are better than tech at organizing information online (2019)\n", + " 20. Some Programming Language Ideas\n", + " 21. The independent researcher (2018)\n", + " 22. The best way to use text embeddings portably is with Parquet and Polars\n", + " 23. Show HN: Prioritize Anything with Stacks\n", + " 24. Ashby (YC W19) Is Hiring Principal Product Engineers\n", + " 25. **GibberLink [AI-AI Communication]**\n", + " 26. Show HN: I made a site to tell the time in corporate\n", + " 27. **It’s still worth blogging in the age of AI**\n", + " 28. What would happen if we didn't use TCP or UDP?\n", + " 29. Closing the “green gap”: energy savings from the math of the landscape function\n", + " 30. Larry Ellison's half-billion-dollar quest to change farming\n", + "\n", + "### News Titles Mentioning \"AI\":\n", + "1. Page 1: **AI is blurring the line between PMs and engineers?**\n", + "2. 
Page 2:\n", + " - **Show HN: While the world builds AI Agents, I'm just building calculators**\n", + " - **GibberLink [AI-AI Communication]**\n", + " - **It’s still worth blogging in the age of AI**\n" + ] + } + ], + "source": [ + "prompt = \"\"\"\n", + "Navigate to https://news.ycombinator.com/,\n", + "extract the news titles on the current page,\n", + "show the current page url,\n", + "find the button on the webpage that direct to the next page,\n", + "click on the button,\n", + "show the current page url,\n", + "extract the news title on the current page\n", + "extract the news titles that mention \"AI\" from the two pages.\n", + "\"\"\"\n", + "\n", + "events = agent_executor.astream(\n", + " {\"messages\": [(\"user\", prompt)]},\n", + " stream_mode=\"values\",\n", + ")\n", + "async for event in events:\n", + " event[\"messages\"][-1].pretty_print()" + ] + }, + { + "cell_type": "markdown", + "id": "9b3660a4", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For more information on how to use this integration, please refer to the [git repo](https://github.com/tinyfish-io/agentql-integrations/tree/main/langchain) or the [LangChain integration documentation](https://docs.agentql.com/integrations/langchain)." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/scripts/tool_feat_table.py b/docs/scripts/tool_feat_table.py index c92d2344a93..5e5a3498413 100644 --- a/docs/scripts/tool_feat_table.py +++ b/docs/scripts/tool_feat_table.py @@ -147,6 +147,11 @@ WEBBROWSING_TOOL_FEAT_TABLE = { "interactions": True, "pricing": "40 free 
requests/day", }, + "AgentQL Toolkit": { + "link": "/docs/integrations/tools/agentql", + "interactions": True, + "pricing": "Free trial, with pay-as-you-go and flat rate plans after", + }, } DATABASE_TOOL_FEAT_TABLE = { diff --git a/docs/src/theme/FeatureTables.js b/docs/src/theme/FeatureTables.js index 73e9bfaa090..2703760561c 100644 --- a/docs/src/theme/FeatureTables.js +++ b/docs/src/theme/FeatureTables.js @@ -819,6 +819,13 @@ const FEATURE_TABLES = { source: "Platform for running and scaling headless browsers, can be used to scrape/crawl any site", api: "API", apiLink: "https://python.langchain.com/docs/integrations/document_loaders/hyperbrowser/" + }, + { + name: "AgentQL", + link: "agentql", + source: "Web interaction and structured data extraction from any web page using an AgentQL query or a Natural Language prompt", + api: "API", + apiLink: "https://python.langchain.com/docs/integrations/document_loaders/agentql/" } ] }, diff --git a/libs/packages.yml b/libs/packages.yml index a8eea203610..8b807f75a6a 100644 --- a/libs/packages.yml +++ b/libs/packages.yml @@ -513,3 +513,6 @@ packages: - name: langchain-opengradient path: . repo: OpenGradient/og-langchain +- name: langchain-agentql + path: langchain + repo: tinyfish-io/agentql-integrations