docs: update apify integration (#29553)

**Description:** Fixed and updated Apify integration documentation to
use the new [langchain-apify](https://github.com/apify/langchain-apify)
package.
**Twitter handle:** @apify
This commit is contained in:
Jakub Kopecký 2025-02-13 05:02:55 +01:00 committed by GitHub
parent 16fb1f5371
commit c8cb7c25bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 399 additions and 34 deletions

View File

@ -2,7 +2,9 @@
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"id": "xwiDq5fOuoRn"
},
"source": [ "source": [
"# Apify Dataset\n", "# Apify Dataset\n",
"\n", "\n",
@ -20,33 +22,63 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"id": "qRW2-mokuoRp",
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"%pip install --upgrade --quiet apify-client" "%pip install --upgrade --quiet langchain langchain-apify langchain-openai"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"id": "8jRVq16LuoRq"
},
"source": [ "source": [
"First, import `ApifyDatasetLoader` into your source code:" "First, import `ApifyDatasetLoader` into your source code:"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"metadata": {}, "metadata": {
"id": "umXQHqIJuoRq"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from langchain_community.document_loaders import ApifyDatasetLoader\n", "from langchain_apify import ApifyDatasetLoader\n",
"from langchain_core.documents import Document" "from langchain_core.documents import Document"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"id": "NjGwKy59vz1X"
},
"source": [
"Find your [Apify API token](https://console.apify.com/account/integrations) and [OpenAI API key](https://platform.openai.com/account/api-keys) and initialize these into environment variable:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "AvzNtyCxwDdr"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.environ[\"APIFY_API_TOKEN\"] = \"your-apify-api-token\"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"your-openai-api-key\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "d1O-KL48uoRr"
},
"source": [ "source": [
"Then provide a function that maps Apify dataset record fields to LangChain `Document` format.\n", "Then provide a function that maps Apify dataset record fields to LangChain `Document` format.\n",
"\n", "\n",
@ -64,8 +96,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 8,
"metadata": {}, "metadata": {
"id": "m1SpA7XZuoRr"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"loader = ApifyDatasetLoader(\n", "loader = ApifyDatasetLoader(\n",
@ -78,8 +112,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"metadata": {}, "metadata": {
"id": "0hWX7ABsuoRs"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"data = loader.load()" "data = loader.load()"
@ -87,7 +123,9 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"id": "EJCVFVKNuoRs"
},
"source": [ "source": [
"## An example with question answering\n", "## An example with question answering\n",
"\n", "\n",
@ -96,21 +134,26 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 14,
"metadata": {}, "metadata": {
"id": "sNisJKzZuoRt"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from langchain.indexes import VectorstoreIndexCreator\n", "from langchain.indexes import VectorstoreIndexCreator\n",
"from langchain_community.utilities import ApifyWrapper\n", "from langchain_apify import ApifyWrapper\n",
"from langchain_core.documents import Document\n", "from langchain_core.documents import Document\n",
"from langchain_openai import OpenAI\n", "from langchain_core.vectorstores import InMemoryVectorStore\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain_openai.embeddings import OpenAIEmbeddings" "from langchain_openai.embeddings import OpenAIEmbeddings"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 15,
"metadata": {}, "metadata": {
"id": "qcfmnbdDuoRu"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"loader = ApifyDatasetLoader(\n", "loader = ApifyDatasetLoader(\n",
@ -123,27 +166,47 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 16,
"metadata": {}, "metadata": {
"id": "8b0xzKJxuoRv"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"index = VectorstoreIndexCreator(embedding=OpenAIEmbeddings()).from_loaders([loader])" "index = VectorstoreIndexCreator(\n",
" vectorstore_cls=InMemoryVectorStore, embedding=OpenAIEmbeddings()\n",
").from_loaders([loader])"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 17,
"metadata": {}, "metadata": {
"id": "7zPXGsVFwUGA"
},
"outputs": [],
"source": [
"llm = ChatOpenAI(model=\"gpt-4o-mini\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "ecWrdM4guoRv"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"query = \"What is Apify?\"\n", "query = \"What is Apify?\"\n",
"result = index.query_with_sources(query, llm=OpenAI())" "result = index.query_with_sources(query, llm=llm)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": null,
"metadata": {}, "metadata": {
"id": "QH8r44e9uoRv",
"outputId": "361fe050-f75d-4d5a-c327-5e7bd190fba5"
},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
@ -162,6 +225,9 @@
} }
], ],
"metadata": { "metadata": {
"colab": {
"provenance": []
},
"kernelspec": { "kernelspec": {
"display_name": "Python 3 (ipykernel)", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
@ -181,5 +247,5 @@
} }
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 4 "nbformat_minor": 0
} }

View File

@ -14,20 +14,34 @@ blogs, or knowledge bases.
## Installation and Setup ## Installation and Setup
- Install the Apify API client for Python with `pip install apify-client` - Install the LangChain Apify package for Python with:
```bash
pip install langchain-apify
```
- Get your [Apify API token](https://console.apify.com/account/integrations) and either set it as - Get your [Apify API token](https://console.apify.com/account/integrations) and either set it as
an environment variable (`APIFY_API_TOKEN`) or pass it to the `ApifyWrapper` as `apify_api_token` in the constructor. an environment variable (`APIFY_API_TOKEN`) or pass it as `apify_api_token` in the constructor.
## Tool
## Utility You can use the `ApifyActorsTool` to use Apify Actors with agents.
```python
from langchain_apify import ApifyActorsTool
```
See [this notebook](/docs/integrations/tools/apify_actors) for example usage.
For more information on how to use this tool, visit [the Apify integration documentation](https://docs.apify.com/platform/integrations/langgraph).
## Wrapper
You can use the `ApifyWrapper` to run Actors on the Apify platform. You can use the `ApifyWrapper` to run Actors on the Apify platform.
```python ```python
from langchain_community.utilities import ApifyWrapper from langchain_apify import ApifyWrapper
``` ```
For more information on this wrapper, see [the API reference](https://python.langchain.com/api_reference/community/utilities/langchain_community.utilities.apify.ApifyWrapper.html). For more information on how to use this wrapper, see [the Apify integration documentation](https://docs.apify.com/platform/integrations/langchain).
## Document loader ## Document loader
@ -35,7 +49,10 @@ For more information on this wrapper, see [the API reference](https://python.lan
You can also use our `ApifyDatasetLoader` to get data from Apify dataset. You can also use our `ApifyDatasetLoader` to get data from Apify dataset.
```python ```python
from langchain_community.document_loaders import ApifyDatasetLoader from langchain_apify import ApifyDatasetLoader
``` ```
For a more detailed walkthrough of this loader, see [this notebook](/docs/integrations/document_loaders/apify_dataset). For a more detailed walkthrough of this loader, see [this notebook](/docs/integrations/document_loaders/apify_dataset).
Source code for this integration can be found in the [LangChain Apify repository](https://github.com/apify/langchain-apify).

View File

@ -0,0 +1,256 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "_9MNj58sIkGN"
},
"source": [
"# Apify Actor\n",
"\n",
"## Overview\n",
"\n",
">[Apify Actors](https://docs.apify.com/platform/actors) are cloud programs designed for a wide range of web scraping, crawling, and data extraction tasks. These actors facilitate automated data gathering from the web, enabling users to extract, process, and store information efficiently. Actors can be used to perform tasks like scraping e-commerce sites for product details, monitoring price changes, or gathering search engine results. They integrate seamlessly with [Apify Datasets](https://docs.apify.com/platform/storage/dataset), allowing the structured data collected by actors to be stored, managed, and exported in formats like JSON, CSV, or Excel for further analysis or use.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OHLF9t9v9HCb"
},
"source": [
"## Setup\n",
"\n",
"This integration lives in the [langchain-apify](https://pypi.org/project/langchain-apify/) package. The package can be installed using pip.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4DdGmBn5IbXz"
},
"outputs": [],
"source": [
"%pip install langchain-apify"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "rEAwonXqwggR"
},
"source": [
"### Prerequisites\n",
"\n",
"- **Apify account**: Register your free Apify account [here](https://console.apify.com/sign-up).\n",
"- **Apify API token**: Learn how to get your API token in the [Apify documentation](https://docs.apify.com/platform/integrations/api)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9nJOl4MBMkcR"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"os.environ[\"APIFY_API_TOKEN\"] = \"your-apify-api-token\"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"your-openai-api-key\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UfoQxAlCxR9q"
},
"source": [
"## Instantiation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qG9KtXtLM8i7"
},
"source": [
"Here we instantiate the `ApifyActorsTool` to be able to call [RAG Web Browser](https://apify.com/apify/rag-web-browser) Apify Actor. This Actor provides web browsing functionality for AI and LLM applications, similar to the web browsing feature in ChatGPT. Any Actor from the [Apify Store](https://apify.com/store) can be used in this way."
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"id": "cyxeTlPnM4Ya"
},
"outputs": [],
"source": [
"from langchain_apify import ApifyActorsTool\n",
"\n",
"tool = ApifyActorsTool(\"apify/rag-web-browser\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "fGDLvDCqyKWO"
},
"source": [
"## Invocation\n",
"\n",
"The `ApifyActorsTool` takes a single argument, which is `run_input` - a dictionary that is passed as a run input to the Actor. Run input schema documentation can be found in the input section of the Actor details page. See [RAG Web Browser input schema](https://apify.com/apify/rag-web-browser/input-schema).\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nTWy6Hx1yk04"
},
"outputs": [],
"source": [
"tool.invoke({\"run_input\": {\"query\": \"what is apify?\", \"maxResults\": 2}})"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kQsa27hoO58S"
},
"source": [
"## Chaining\n",
"\n",
"We can provide the created tool to an [agent](https://python.langchain.com/docs/tutorials/agents/). When asked to search for information, the agent will call the Apify Actor, which will search the web, and then retrieve the search results.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YySvLskW72Y8"
},
"outputs": [],
"source": [
"%pip install langgraph langchain-openai"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"id": "QEDz07btO5Gi"
},
"outputs": [],
"source": [
"from langchain_core.messages import ToolMessage\n",
"from langchain_openai import ChatOpenAI\n",
"from langgraph.prebuilt import create_react_agent\n",
"\n",
"model = ChatOpenAI(model=\"gpt-4o\")\n",
"tools = [tool]\n",
"graph = create_react_agent(model, tools=tools)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XS1GEyNkQxGu",
"outputId": "195273d7-034c-425b-f3f9-95c0a9fb0c9e"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"================================\u001b[1m Human Message \u001b[0m=================================\n",
"\n",
"search for what is Apify\n",
"==================================\u001b[1m Ai Message \u001b[0m==================================\n",
"Tool Calls:\n",
" apify_actor_apify_rag-web-browser (call_27mjHLzDzwa5ZaHWCMH510lm)\n",
" Call ID: call_27mjHLzDzwa5ZaHWCMH510lm\n",
" Args:\n",
" run_input: {\"run_input\":{\"query\":\"Apify\",\"maxResults\":3,\"outputFormats\":[\"markdown\"]}}\n",
"==================================\u001b[1m Ai Message \u001b[0m==================================\n",
"\n",
"Apify is a comprehensive platform for web scraping, browser automation, and data extraction. It offers a wide array of tools and services that cater to developers and businesses looking to extract data from websites efficiently and effectively. Here's an overview of Apify:\n",
"\n",
"1. **Ecosystem and Tools**:\n",
" - Apify provides an ecosystem where developers can build, deploy, and publish data extraction and web automation tools called Actors.\n",
" - The platform supports various use cases such as extracting data from social media platforms, conducting automated browser-based tasks, and more.\n",
"\n",
"2. **Offerings**:\n",
" - Apify offers over 3,000 ready-made scraping tools and code templates.\n",
" - Users can also build custom solutions or hire Apify's professional services for more tailored data extraction needs.\n",
"\n",
"3. **Technology and Integration**:\n",
" - The platform supports integration with popular tools and services like Zapier, GitHub, Google Sheets, Pinecone, and more.\n",
" - Apify supports open-source tools and technologies such as JavaScript, Python, Puppeteer, Playwright, Selenium, and its own Crawlee library for web crawling and browser automation.\n",
"\n",
"4. **Community and Learning**:\n",
" - Apify hosts a community on Discord where developers can get help and share expertise.\n",
" - It offers educational resources through the Web Scraping Academy to help users become proficient in data scraping and automation.\n",
"\n",
"5. **Enterprise Solutions**:\n",
" - Apify provides enterprise-grade web data extraction solutions with high reliability, 99.95% uptime, and compliance with SOC2, GDPR, and CCPA standards.\n",
"\n",
"For more information, you can visit [Apify's official website](https://apify.com/) or their [GitHub page](https://github.com/apify) which contains their code repositories and further details about their projects.\n"
]
}
],
"source": [
"inputs = {\"messages\": [(\"user\", \"search for what is Apify\")]}\n",
"for s in graph.stream(inputs, stream_mode=\"values\"):\n",
" message = s[\"messages\"][-1]\n",
" # skip tool messages\n",
" if isinstance(message, ToolMessage):\n",
" continue\n",
" message.pretty_print()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "WYXuQIQx8AvG"
},
"source": [
"## API reference\n",
"\n",
"For more information on how to use this integration, see the [git repository](https://github.com/apify/langchain-apify) or the [Apify integration documentation](https://docs.apify.com/platform/integrations/langgraph)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "f1NnMik78oib"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 147 KiB

After

Width:  |  Height:  |  Size: 212 KiB

View File

@ -1,11 +1,22 @@
from typing import Any, Callable, Dict, List from typing import Any, Callable, Dict, List
from langchain_core._api import deprecated
from langchain_core.documents import Document from langchain_core.documents import Document
from pydantic import BaseModel, model_validator from pydantic import BaseModel, model_validator
from langchain_community.document_loaders.base import BaseLoader from langchain_community.document_loaders.base import BaseLoader
@deprecated(
since="0.3.18",
message=(
"This class is deprecated and will be removed in a future version. "
"You can swap to using the `ApifyDatasetLoader`"
" implementation in `langchain_apify` package. "
"See <https://github.com/apify/langchain-apify>"
),
alternative_import="langchain_apify.ApifyDatasetLoader",
)
class ApifyDatasetLoader(BaseLoader, BaseModel): class ApifyDatasetLoader(BaseLoader, BaseModel):
"""Load datasets from `Apify` web scraping, crawling, and data extraction platform. """Load datasets from `Apify` web scraping, crawling, and data extraction platform.

View File

@ -1,5 +1,6 @@
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
from langchain_core._api import deprecated
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env from langchain_core.utils import get_from_dict_or_env
from pydantic import BaseModel, model_validator from pydantic import BaseModel, model_validator
@ -8,6 +9,16 @@ if TYPE_CHECKING:
from langchain_community.document_loaders import ApifyDatasetLoader from langchain_community.document_loaders import ApifyDatasetLoader
@deprecated(
since="0.3.18",
message=(
"This class is deprecated and will be removed in a future version. "
"You can swap to using the `ApifyWrapper`"
" implementation in `langchain_apify` package. "
"See <https://github.com/apify/langchain-apify>"
),
alternative_import="langchain_apify.ApifyWrapper",
)
class ApifyWrapper(BaseModel): class ApifyWrapper(BaseModel):
"""Wrapper around Apify. """Wrapper around Apify.
To use, you should have the ``apify-client`` python package installed, To use, you should have the ``apify-client`` python package installed,

View File

@ -386,6 +386,10 @@ packages:
repo: Nimbleway/langchain-nimble repo: Nimbleway/langchain-nimble
path: . path: .
downloads: 0 downloads: 0
- name: langchain-apify
path: .
repo: apify/langchain-apify
downloads: 204
- name: langfair - name: langfair
repo: cvs-health/langfair repo: cvs-health/langfair
path: . path: .