diff --git a/docs/docs/integrations/providers/scrapegraph.mdx b/docs/docs/integrations/providers/scrapegraph.mdx index 93507ef3a88..dffbc063d5f 100644 --- a/docs/docs/integrations/providers/scrapegraph.mdx +++ b/docs/docs/integrations/providers/scrapegraph.mdx @@ -27,8 +27,8 @@ There are four tools available: ```python from langchain_scrapegraph.tools import ( SmartScraperTool, # Extract structured data from websites + SmartCrawlerTool, # Extract data from multiple pages with crawling MarkdownifyTool, # Convert webpages to markdown - LocalScraperTool, # Process local HTML content GetCreditsTool, # Check remaining API credits ) ``` @@ -36,6 +36,6 @@ from langchain_scrapegraph.tools import ( Each tool serves a specific purpose: - `SmartScraperTool`: Extract structured data from websites given a URL, prompt and optional output schema +- `SmartCrawlerTool`: Extract data from multiple pages with advanced crawling options like depth control, page limits, and domain restrictions - `MarkdownifyTool`: Convert any webpage to clean markdown format -- `LocalScraperTool`: Extract structured data from a local HTML file given a prompt and optional output schema - `GetCreditsTool`: Check your remaining ScrapeGraph AI credits diff --git a/docs/docs/integrations/tools/scrapegraph.ipynb b/docs/docs/integrations/tools/scrapegraph.ipynb index e8c0f53e706..3434c9d3e9a 100644 --- a/docs/docs/integrations/tools/scrapegraph.ipynb +++ b/docs/docs/integrations/tools/scrapegraph.ipynb @@ -30,8 +30,8 @@ "| Class | Package | Serializable | JS support | Package latest |\n", "| :--- | :--- | :---: | :---: | :---: |\n", "| [SmartScraperTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", + "| [SmartCrawlerTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", "| [MarkdownifyTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", - "| [LocalScraperTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", "| [GetCreditsTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", "\n", "### Tool features\n", @@ -39,8 +39,8 @@ "| Tool | Purpose | Input | Output |\n", "| :--- | :--- | :--- | :--- |\n", "| SmartScraperTool | Extract structured data from websites | URL + prompt | JSON |\n", + "| SmartCrawlerTool | Extract data from multiple pages with crawling | URL + prompt + crawl options | JSON |\n", "| MarkdownifyTool | Convert webpages to markdown | URL | Markdown text |\n", - "| LocalScraperTool | Extract data from HTML content | HTML + prompt | JSON |\n", "| GetCreditsTool | Check API credits | None | Credit info |\n", "\n", "\n", @@ -122,21 +122,26 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "8b3ddfe9", "metadata": {}, "outputs": [], "source": [ + "from scrapegraph_py.logger import sgai_logger\n", + "import json\n", + "\n", "from langchain_scrapegraph.tools import (\n", " GetCreditsTool,\n", - " LocalScraperTool,\n", " MarkdownifyTool,\n", + " SmartCrawlerTool,\n", " SmartScraperTool,\n", ")\n", "\n", + "sgai_logger.set_logging(level=\"INFO\")\n", + "\n", "smartscraper = SmartScraperTool()\n", + "smartcrawler = SmartCrawlerTool()\n", "markdownify = MarkdownifyTool()\n", - "localscraper = LocalScraperTool()\n", "credits = GetCreditsTool()" ] }, @@ -152,9 +157,23 @@ "Let's try each tool individually:" ] }, + { + "cell_type": "markdown", + "id": "d5a88cf2", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "### SmartCrawler Tool\n", + "\n", + "The SmartCrawlerTool allows you to crawl multiple pages from a website and extract structured data with advanced crawling options like depth control, page limits, and domain restrictions.\n" + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "65310a8b", "metadata": {}, "outputs": [ @@ -189,33 +208,71 @@ "markdown = markdownify.invoke({\"website_url\": \"https://scrapegraphai.com\"})\n", "print(\"\\nMarkdownify Result (first 200 chars):\", markdown[:200])\n", "\n", - "local_html = \"\"\"\n", - "\n", - "
\n", - "We are a technology company focused on AI solutions.
\n", - "Email: contact@example.com
\n", - "Phone: (555) 123-4567
\n", - "