diff --git a/docs/docs/integrations/document_loaders/scrapfly.ipynb b/docs/docs/integrations/document_loaders/scrapfly.ipynb new file mode 100644 index 00000000000..2625e3d3fb9 --- /dev/null +++ b/docs/docs/integrations/document_loaders/scrapfly.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ScrapFly\n", + "[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It allows extracting web page data as LLM-accessible markdown or text." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Installation\n", + "Install the ScrapFly Python SDK and the required LangChain packages using pip:\n", + "```shell\n", + "pip install scrapfly-sdk langchain langchain-community\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import ScrapflyLoader\n", + "\n", + "scrapfly_loader = ScrapflyLoader(\n", + " [\"https://web-scraping.dev/products\"],\n", + " api_key=\"Your ScrapFly API key\", # Get your API key from https://www.scrapfly.io/\n", + " continue_on_failure=True, # Ignore unprocessable web pages and log their exceptions\n", + ")\n", + "\n", + "# Load documents from URLs as markdown\n", + "documents = scrapfly_loader.load()\n", + "print(documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ScrapflyLoader also allows passing a ScrapeConfig object for customizing the scrape request. See the documentation for the full feature details and their API params: https://scrapfly.io/docs/scrape-api/getting-started" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import ScrapflyLoader\n", + "\n", + "scrapfly_scrape_config = {\n", + " \"asp\": True, # Bypass scraping blocking and antibot solutions, like Cloudflare\n", + " \"render_js\": True, # Enable JavaScript rendering with a cloud headless browser\n", + " \"proxy_pool\": \"public_residential_pool\", # Select a proxy pool (datacenter or residential)\n", + " \"country\": \"us\", # Select a proxy location\n", + " \"auto_scroll\": True, # Auto scroll the page\n", + " \"js\": \"\", # Execute custom JavaScript code by the headless browser\n", + "}\n", + "\n", + "scrapfly_loader = ScrapflyLoader(\n", + " [\"https://web-scraping.dev/products\"],\n", + " api_key=\"Your ScrapFly API key\", # Get your API key from https://www.scrapfly.io/\n", + " continue_on_failure=True, # Ignore unprocessable web pages and log their exceptions\n", + " scrape_config=scrapfly_scrape_config, # Pass the scrape_config object\n", + " scrape_format=\"markdown\", # The scrape result format, either `markdown` (default) or `text`\n", + ")\n", + "\n", + "# Load documents from URLs as markdown\n", + "documents = scrapfly_loader.load()\n", + "print(documents)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 5a5f1e4c343..1977498691d 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -403,6 +403,9 @@ if TYPE_CHECKING: from langchain_community.document_loaders.s3_file import ( S3FileLoader, ) + from langchain_community.document_loaders.scrapfly import ( + ScrapflyLoader, + ) from langchain_community.document_loaders.sharepoint import ( SharePointLoader, ) @@ -654,6 +657,7 @@ _module_lookup = { "RocksetLoader": "langchain_community.document_loaders.rocksetdb", "S3DirectoryLoader": "langchain_community.document_loaders.s3_directory", "S3FileLoader": "langchain_community.document_loaders.s3_file", + "ScrapflyLoader": "langchain_community.document_loaders.scrapfly", "SQLDatabaseLoader": "langchain_community.document_loaders.sql_database", "SRTLoader": "langchain_community.document_loaders.srt", "SeleniumURLLoader": "langchain_community.document_loaders.url_selenium", @@ -854,6 +858,7 @@ __all__ = [ "RocksetLoader", "S3DirectoryLoader", "S3FileLoader", + "ScrapflyLoader", "SQLDatabaseLoader", "SRTLoader", "SeleniumURLLoader", diff --git a/libs/community/langchain_community/document_loaders/scrapfly.py b/libs/community/langchain_community/document_loaders/scrapfly.py new file mode 100644 index 00000000000..b774d46aded --- /dev/null +++ b/libs/community/langchain_community/document_loaders/scrapfly.py @@ -0,0 +1,69 @@ +"""Scrapfly Web Reader.""" +import logging +from typing import Iterator, List, Literal, Optional + +from langchain_core.document_loaders import BaseLoader +from langchain_core.documents import Document +from langchain_core.utils import get_from_env + +logger = logging.getLogger(__file__) + + +class ScrapflyLoader(BaseLoader): + """Turn a URL into LLM-accessible markdown with `Scrapfly.io`. + + For further details, visit: https://scrapfly.io/docs/sdk/python + """ + + def __init__( + self, + urls: List[str], + *, + api_key: Optional[str] = None, + scrape_format: Literal["markdown", "text"] = "markdown", + scrape_config: Optional[dict] = None, + continue_on_failure: bool = True, + ) -> None: + """Initialize client. + + Args: + urls: List of URLs to scrape. + api_key: The Scrapfly API key. If not specified, must have env var + SCRAPFLY_API_KEY set. + scrape_format: Scrape result format, one of "markdown" or "text". + scrape_config: Dictionary of ScrapFly scrape config options. + continue_on_failure: Whether to continue if scraping a URL fails. 
+ """ + try: + from scrapfly import ScrapflyClient + except ImportError: + raise ImportError( + "`scrapfly` package not found, please run `pip install scrapfly-sdk`" + ) + if not urls: + raise ValueError("URLs must be provided.") + api_key = api_key or get_from_env("api_key", "SCRAPFLY_API_KEY") + self.scrapfly = ScrapflyClient(key=api_key) + self.urls = urls + self.scrape_format = scrape_format + self.scrape_config = scrape_config + self.continue_on_failure = continue_on_failure + + def lazy_load(self) -> Iterator[Document]: + from scrapfly import ScrapeConfig + + scrape_config = self.scrape_config if self.scrape_config is not None else {} + for url in self.urls: + try: + response = self.scrapfly.scrape( + ScrapeConfig(url, format=self.scrape_format, **scrape_config) + ) + yield Document( + page_content=response.scrape_result["content"], + metadata={"url": url}, + ) + except Exception as e: + if self.continue_on_failure: + logger.error(f"Error fetching data from {url}, exception: {e}") + else: + raise e diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index a8890aabe0e..0f8628d20e8 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -138,6 +138,7 @@ EXPECTED_ALL = [ "RocksetLoader", "S3DirectoryLoader", "S3FileLoader", + "ScrapflyLoader", "SQLDatabaseLoader", "SRTLoader", "SeleniumURLLoader",