mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-19 19:11:33 +00:00
docs: update ScrapeGraphAI tools (#32026)
It was outdated --------- Co-authored-by: Mason Daugherty <github@mdrxy.com>
This commit is contained in:
parent
d96b75f9d3
commit
26c2c8f70a
@ -27,8 +27,8 @@ There are four tools available:
|
|||||||
```python
|
```python
|
||||||
from langchain_scrapegraph.tools import (
|
from langchain_scrapegraph.tools import (
|
||||||
SmartScraperTool, # Extract structured data from websites
|
SmartScraperTool, # Extract structured data from websites
|
||||||
|
SmartCrawlerTool, # Extract data from multiple pages with crawling
|
||||||
MarkdownifyTool, # Convert webpages to markdown
|
MarkdownifyTool, # Convert webpages to markdown
|
||||||
LocalScraperTool, # Process local HTML content
|
|
||||||
GetCreditsTool, # Check remaining API credits
|
GetCreditsTool, # Check remaining API credits
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
@ -36,6 +36,6 @@ from langchain_scrapegraph.tools import (
|
|||||||
Each tool serves a specific purpose:
|
Each tool serves a specific purpose:
|
||||||
|
|
||||||
- `SmartScraperTool`: Extract structured data from websites given a URL, prompt and optional output schema
|
- `SmartScraperTool`: Extract structured data from websites given a URL, prompt and optional output schema
|
||||||
|
- `SmartCrawlerTool`: Extract data from multiple pages with advanced crawling options like depth control, page limits, and domain restrictions
|
||||||
- `MarkdownifyTool`: Convert any webpage to clean markdown format
|
- `MarkdownifyTool`: Convert any webpage to clean markdown format
|
||||||
- `LocalScraperTool`: Extract structured data from a local HTML file given a prompt and optional output schema
|
|
||||||
- `GetCreditsTool`: Check your remaining ScrapeGraph AI credits
|
- `GetCreditsTool`: Check your remaining ScrapeGraph AI credits
|
||||||
|
@ -30,8 +30,8 @@
|
|||||||
"| Class | Package | Serializable | JS support | Package latest |\n",
|
"| Class | Package | Serializable | JS support | Package latest |\n",
|
||||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||||
"| [SmartScraperTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
"| [SmartScraperTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
||||||
|
"| [SmartCrawlerTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
||||||
"| [MarkdownifyTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
"| [MarkdownifyTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
||||||
"| [LocalScraperTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
|
||||||
"| [GetCreditsTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
"| [GetCreditsTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
||||||
"\n",
|
"\n",
|
||||||
"### Tool features\n",
|
"### Tool features\n",
|
||||||
@ -39,8 +39,8 @@
|
|||||||
"| Tool | Purpose | Input | Output |\n",
|
"| Tool | Purpose | Input | Output |\n",
|
||||||
"| :--- | :--- | :--- | :--- |\n",
|
"| :--- | :--- | :--- | :--- |\n",
|
||||||
"| SmartScraperTool | Extract structured data from websites | URL + prompt | JSON |\n",
|
"| SmartScraperTool | Extract structured data from websites | URL + prompt | JSON |\n",
|
||||||
|
"| SmartCrawlerTool | Extract data from multiple pages with crawling | URL + prompt + crawl options | JSON |\n",
|
||||||
"| MarkdownifyTool | Convert webpages to markdown | URL | Markdown text |\n",
|
"| MarkdownifyTool | Convert webpages to markdown | URL | Markdown text |\n",
|
||||||
"| LocalScraperTool | Extract data from HTML content | HTML + prompt | JSON |\n",
|
|
||||||
"| GetCreditsTool | Check API credits | None | Credit info |\n",
|
"| GetCreditsTool | Check API credits | None | Credit info |\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -122,21 +122,26 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": null,
|
||||||
"id": "8b3ddfe9",
|
"id": "8b3ddfe9",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"from scrapegraph_py.logger import sgai_logger\n",
|
||||||
|
"import json\n",
|
||||||
|
"\n",
|
||||||
"from langchain_scrapegraph.tools import (\n",
|
"from langchain_scrapegraph.tools import (\n",
|
||||||
" GetCreditsTool,\n",
|
" GetCreditsTool,\n",
|
||||||
" LocalScraperTool,\n",
|
|
||||||
" MarkdownifyTool,\n",
|
" MarkdownifyTool,\n",
|
||||||
|
" SmartCrawlerTool,\n",
|
||||||
" SmartScraperTool,\n",
|
" SmartScraperTool,\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"sgai_logger.set_logging(level=\"INFO\")\n",
|
||||||
|
"\n",
|
||||||
"smartscraper = SmartScraperTool()\n",
|
"smartscraper = SmartScraperTool()\n",
|
||||||
|
"smartcrawler = SmartCrawlerTool()\n",
|
||||||
"markdownify = MarkdownifyTool()\n",
|
"markdownify = MarkdownifyTool()\n",
|
||||||
"localscraper = LocalScraperTool()\n",
|
|
||||||
"credits = GetCreditsTool()"
|
"credits = GetCreditsTool()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -152,9 +157,23 @@
|
|||||||
"Let's try each tool individually:"
|
"Let's try each tool individually:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d5a88cf2",
|
||||||
|
"metadata": {
|
||||||
|
"vscode": {
|
||||||
|
"languageId": "raw"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### SmartCrawler Tool\n",
|
||||||
|
"\n",
|
||||||
|
"The SmartCrawlerTool allows you to crawl multiple pages from a website and extract structured data with advanced crawling options like depth control, page limits, and domain restrictions.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": null,
|
||||||
"id": "65310a8b",
|
"id": "65310a8b",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -189,33 +208,71 @@
|
|||||||
"markdown = markdownify.invoke({\"website_url\": \"https://scrapegraphai.com\"})\n",
|
"markdown = markdownify.invoke({\"website_url\": \"https://scrapegraphai.com\"})\n",
|
||||||
"print(\"\\nMarkdownify Result (first 200 chars):\", markdown[:200])\n",
|
"print(\"\\nMarkdownify Result (first 200 chars):\", markdown[:200])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"local_html = \"\"\"\n",
|
"# SmartCrawler\n",
|
||||||
"<html>\n",
|
"url = \"https://scrapegraphai.com/\"\n",
|
||||||
" <body>\n",
|
"prompt = (\n",
|
||||||
" <h1>Company Name</h1>\n",
|
" \"What does the company do? and I need text content from their privacy and terms\"\n",
|
||||||
" <p>We are a technology company focused on AI solutions.</p>\n",
|
")\n",
|
||||||
" <div class=\"contact\">\n",
|
|
||||||
" <p>Email: contact@example.com</p>\n",
|
|
||||||
" <p>Phone: (555) 123-4567</p>\n",
|
|
||||||
" </div>\n",
|
|
||||||
" </body>\n",
|
|
||||||
"</html>\n",
|
|
||||||
"\"\"\"\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# LocalScraper\n",
|
"# Use the tool with crawling parameters\n",
|
||||||
"result_local = localscraper.invoke(\n",
|
"result_crawler = smartcrawler.invoke(\n",
|
||||||
" {\n",
|
" {\n",
|
||||||
" \"user_prompt\": \"Make a summary of the webpage and extract the email and phone number\",\n",
|
" \"url\": url,\n",
|
||||||
" \"website_html\": local_html,\n",
|
" \"prompt\": prompt,\n",
|
||||||
|
" \"cache_website\": True,\n",
|
||||||
|
" \"depth\": 2,\n",
|
||||||
|
" \"max_pages\": 2,\n",
|
||||||
|
" \"same_domain_only\": True,\n",
|
||||||
" }\n",
|
" }\n",
|
||||||
")\n",
|
")\n",
|
||||||
"print(\"LocalScraper Result:\", result_local)\n",
|
"\n",
|
||||||
|
"print(\"\\nSmartCrawler Result:\")\n",
|
||||||
|
"print(json.dumps(result_crawler, indent=2))\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Check credits\n",
|
"# Check credits\n",
|
||||||
"credits_info = credits.invoke({})\n",
|
"credits_info = credits.invoke({})\n",
|
||||||
"print(\"\\nCredits Info:\", credits_info)"
|
"print(\"\\nCredits Info:\", credits_info)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f13fb466",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# SmartCrawler example\n",
|
||||||
|
"from scrapegraph_py.logger import sgai_logger\n",
|
||||||
|
"import json\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain_scrapegraph.tools import SmartCrawlerTool\n",
|
||||||
|
"\n",
|
||||||
|
"sgai_logger.set_logging(level=\"INFO\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Will automatically get SGAI_API_KEY from environment\n",
|
||||||
|
"tool = SmartCrawlerTool()\n",
|
||||||
|
"\n",
|
||||||
|
"# Example: ask what the company does and request text from its privacy and terms pages\n",
|
||||||
|
"url = \"https://scrapegraphai.com/\"\n",
|
||||||
|
"prompt = (\n",
|
||||||
|
" \"What does the company do? and I need text content from their privacy and terms\"\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Use the tool with crawling parameters\n",
|
||||||
|
"result = tool.invoke(\n",
|
||||||
|
" {\n",
|
||||||
|
" \"url\": url,\n",
|
||||||
|
" \"prompt\": prompt,\n",
|
||||||
|
" \"cache_website\": True,\n",
|
||||||
|
" \"depth\": 2,\n",
|
||||||
|
" \"max_pages\": 2,\n",
|
||||||
|
" \"same_domain_only\": True,\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(json.dumps(result, indent=2))"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "d6e73897",
|
"id": "d6e73897",
|
||||||
@ -350,15 +407,21 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"## API reference\n",
|
"## API reference\n",
|
||||||
"\n",
|
"\n",
|
||||||
"For detailed documentation of all ScrapeGraph features and configurations head to the Langchain API reference: https://python.langchain.com/docs/integrations/tools/scrapegraph\n",
|
"For detailed documentation of all ScrapeGraph features and configurations, head to [the LangChain API reference](https://python.langchain.com/docs/integrations/tools/scrapegraph).\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Or to the official SDK repo: https://github.com/ScrapeGraphAI/langchain-scrapegraph"
|
"Or to [the official SDK repo](https://github.com/ScrapeGraphAI/langchain-scrapegraph)."
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d710dad8",
|
||||||
|
"metadata": {},
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "langchain",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -372,7 +435,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.11.9"
|
"version": "3.10.16"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
Loading…
Reference in New Issue
Block a user