diff --git a/docs/docs/integrations/document_loaders/firecrawl.ipynb b/docs/docs/integrations/document_loaders/firecrawl.ipynb new file mode 100644 index 00000000000..d2c8c588e41 --- /dev/null +++ b/docs/docs/integrations/document_loaders/firecrawl.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FireCrawl\n", + "\n", + "[FireCrawl](https://firecrawl.dev/?ref=langchain) crawls and converts any website into LLM-ready data. It crawls all accessible subpages and gives you clean markdown and metadata for each. No sitemap required.\n", + "\n", + "FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: firecrawl-py in /Users/nicolascamara/anaconda3/envs/langchain/lib/python3.9/site-packages (0.0.5)\n", + "Requirement already satisfied: requests in /Users/nicolascamara/anaconda3/envs/langchain/lib/python3.9/site-packages (from firecrawl-py) (2.31.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/nicolascamara/anaconda3/envs/langchain/lib/python3.9/site-packages (from requests->firecrawl-py) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/nicolascamara/anaconda3/envs/langchain/lib/python3.9/site-packages (from requests->firecrawl-py) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/nicolascamara/anaconda3/envs/langchain/lib/python3.9/site-packages (from requests->firecrawl-py) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/nicolascamara/anaconda3/envs/langchain/lib/python3.9/site-packages (from requests->firecrawl-py) 
(2024.2.2)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install firecrawl-py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will need to get your own API key. See https://firecrawl.dev" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import FireCrawlLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "loader = FireCrawlLoader(\n", + " api_key=\"YOUR_API_KEY\", url=\"https://firecrawl.dev\", mode=\"crawl\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='[Skip to content](#skip)\\n\\n[šŸ”„ FireCrawl](/)\\n\\n[Playground](/playground)\\n[Pricing](/pricing)\\n\\n[Log In](/signin)\\n[Log In](/signin)\\n[Sign Up](/signin/signup)\\n\\n![Slack Logo](/images/slack_logo_icon.png)\\n\\nNew message in: #coach-gtm\\n==========================\\n\\n@CoachGTM: Your meeting prep for Pied Piper < > WindFlow Dynamics is ready! Meeting starts in 30 minutes\\n\\nTurn websites into \\n_LLM-ready_ data\\n=====================================\\n\\nCrawl and convert any website into clean markdown\\n\\nTry now (100 free credits)No credit card required\\n\\nA product by\\n\\n[![Mendable Logo](/images/mendable_logo_transparent.png)Mendable](https://mendable.ai)\\n\\n![Mendable Website Image](/mendable-hero-8.png)\\n\\nCrawl, Capture, Clean\\n---------------------\\n\\nWe crawl all accessible subpages and give you clean markdown for each. 
No sitemap required.\\n\\n \\n [\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/\",\\\\\\n \"markdown\": \"## Welcome to Mendable\\\\\\n Mendable empowers teams with AI-driven solutions - \\\\\\n streamlining sales and support.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/features\",\\\\\\n \"markdown\": \"## Features\\\\\\n Discover how Mendable\\'s cutting-edge features can \\\\\\n transform your business operations.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/pricing\",\\\\\\n \"markdown\": \"## Pricing Plans\\\\\\n Choose the perfect plan that fits your business needs.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/about\",\\\\\\n \"markdown\": \"## About Us\\\\\\n \\\\\\n Learn more about Mendable\\'s mission and the \\\\\\n team behind our innovative platform.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/contact\",\\\\\\n \"markdown\": \"## Contact Us\\\\\\n Get in touch with us for any queries or support.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/blog\",\\\\\\n \"markdown\": \"## Blog\\\\\\n Stay updated with the latest news and insights from Mendable.\"\\\\\\n }\\\\\\n ]\\n \\n\\nNote: The markdown has been edited for display purposes.\\n\\nWe handle the hard stuff\\n------------------------\\n\\nReverse proxyies, caching, rate limits, js-blocked content and more...\\n\\n#### Crawling\\n\\nFireCrawl crawls all accessible subpages, even without a sitemap.\\n\\n#### Dynamic content\\n\\nFireCrawl gathers data even if a website uses javascript to render content.\\n\\n#### To Markdown\\n\\nFireCrawl returns clean, well formatted markdown - ready for use in LLM applications\\n\\n#### Continuous updates\\n\\nSchedule syncs with FireCrawl. No cron jobs or orchestration required.\\n\\n#### Caching\\n\\nFireCrawl caches content, so you don\\'t have to wait for a full scrape unless new content exists.\\n\\n#### Built for AI\\n\\nBuilt by LLM engineers, for LLM engineers. 
Giving you clean data the way you want it.\\n\\nPricing Plans\\n=============\\n\\nStarter\\n-------\\n\\n50k credits ($1.00/1k)\\n\\n$50/month\\n\\n* Scrape 50,000 pages\\n* Credits valid for 6 months\\n* 2 simultaneous scrapers\\\\*\\n\\nSubscribe\\n\\nStandard\\n--------\\n\\n500k credits ($0.75/1k)\\n\\n$375/month\\n\\n* Scrape 500,000 pages\\n* Credits valid for 6 months\\n* 4 simultaneous scrapers\\\\*\\n\\nSubscribe\\n\\nScale\\n-----\\n\\n12.5M credits ($0.30/1k)\\n\\n$1,250/month\\n\\n* Scrape 2,500,000 pages\\n* Credits valid for 6 months\\n* 10 simultaneous scrapes\\\\*\\n\\nSubscribe\\n\\n\\\\* a \"scraper\" refers to how many scraper jobs you can simultaneously submit.\\n\\nWhat sites work?\\n----------------\\n\\nFirecrawl is best suited for business websites, docs and help centers.\\n\\nBuisness websites\\n\\nGathering business intelligence or connecting company data to your AI\\n\\nBlogs, Documentation and Help centers\\n\\nGather content from documentation and other textual sources\\n\\nSocial Media\\n\\nComing soon\\n\\n![Feature 01](/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fexample-business-2.b6c6b56a.png&w=1920&q=75)\\n\\n![Feature 02](/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fexample-docs-sites.11eef02d.png&w=1920&q=75)\\n\\nComing Soon\\n-----------\\n\\n[But I want it now!](https://calendly.com/d/cp3d-rvx-58g/mendable-meeting)\\n\\\\* Schedule a meeting\\n\\n![Feature 04](/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fexample-business-2.b6c6b56a.png&w=1920&q=75)\\n\\n![Slack Logo](/images/slack_logo_icon.png)\\n\\nNew message in: #coach-gtm\\n==========================\\n\\n@CoachGTM: Your meeting prep for Pied Piper < > WindFlow Dynamics is ready! 
Meeting starts in 30 minutes\\n\\n[šŸ”„](/)\\n\\nReady to _Build?_\\n-----------------\\n\\n[Meet with us](https://calendly.com/d/cp3d-rvx-58g/mendable-meeting)\\n\\n[Try 100 queries free](/signin)\\n\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\nFAQ\\n---\\n\\nFrequently asked questions about FireCrawl\\n\\nWhat is FireCrawl?\\n\\nFireCrawl is an advanced web crawling and data conversion tool designed to transform any website into clean, LLM-ready markdown. Ideal for AI developers and data scientists, it automates the collection, cleaning, and formatting of web data, streamlining the preparation process for Large Language Model (LLM) applications.\\n\\nHow does FireCrawl handle dynamic content on websites?\\n\\nUnlike traditional web scrapers, FireCrawl is equipped to handle dynamic content rendered with JavaScript. It ensures comprehensive data collection from all accessible subpages, making it a reliable tool for scraping websites that rely heavily on JS for content delivery.\\n\\nCan FireCrawl crawl websites without a sitemap?\\n\\nYes, FireCrawl can access and crawl all accessible subpages of a website, even in the absence of a sitemap. This feature enables users to gather data from a wide array of web sources with minimal setup.\\n\\nWhat formats can FireCrawl convert web data into?\\n\\nFireCrawl specializes in converting web data into clean, well-formatted markdown. This format is particularly suited for LLM applications, offering a structured yet flexible way to represent web content.\\n\\nHow does FireCrawl ensure the cleanliness of the data?\\n\\nFireCrawl employs advanced algorithms to clean and structure the scraped data, removing unnecessary elements and formatting the content into readable markdown. This process ensures that the data is ready for use in LLM applications without further preprocessing.\\n\\nIs FireCrawl suitable for large-scale data scraping projects?\\n\\nAbsolutely. 
FireCrawl offers various pricing plans, including a Scale plan that supports scraping of millions of pages. With features like caching and scheduled syncs, it\\'s designed to efficiently handle large-scale data scraping and continuous updates, making it ideal for enterprises and large projects.\\n\\nWhat measures does FireCrawl take to handle web scraping challenges like rate limits and caching?\\n\\nFireCrawl is built to navigate common web scraping challenges, including reverse proxies, rate limits, and caching. It smartly manages requests and employs caching techniques to minimize bandwidth usage and avoid triggering anti-scraping mechanisms, ensuring reliable data collection.\\n\\nHow can I try FireCrawl?\\n\\nYou can start with FireCrawl by trying our free trial, which includes 100 pages. This trial allows you to experience firsthand how FireCrawl can streamline your data collection and conversion processes. Sign up and begin transforming web content into LLM-ready data today!\\n\\nWho can benefit from using FireCrawl?\\n\\nFireCrawl is tailored for LLM engineers, data scientists, AI researchers, and developers looking to harness web data for training machine learning models, market research, content aggregation, and more. 
It simplifies the data preparation process, allowing professionals to focus on insights and model development.\\n\\n[šŸ”„](/)\\n\\nĀ© A product by Mendable.ai - All rights reserved.\\n\\n[Twitter](https://twitter.com/mendableai)\\n[GitHub](https://github.com/sideguide)\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\nBacked by![Y Combinator Logo](/images/yc.svg)\\n\\n![SOC 2 Type II](/soc2type2badge.png)\\n\\n###### Company\\n\\n* [About us](#0)\\n \\n* [Diversity & Inclusion](#0)\\n \\n* [Blog](#0)\\n \\n* [Careers](#0)\\n \\n* [Financial statements](#0)\\n \\n\\n###### Resources\\n\\n* [Community](#0)\\n \\n* [Terms of service](#0)\\n \\n* [Collaboration features](#0)\\n \\n\\n###### Legals\\n\\n* [Refund policy](#0)\\n \\n* [Terms & Conditions](#0)\\n \\n* [Privacy policy](#0)\\n \\n* [Brand Kit](#0)', metadata={'title': 'Home - FireCrawl', 'description': 'FireCrawl crawls and converts any website into clean markdown.', 'language': None, 'sourceURL': 'https://firecrawl.dev/'}),\n", + " Document(page_content='[Skip to content](#skip)\\n\\n[šŸ”„ FireCrawl](/)\\n\\n[Playground](/playground)\\n[Pricing](/pricing)\\n\\n[Log In](/signin)\\n[Log In](/signin)\\n[Sign Up](/signin/signup)\\n\\nPricing Plans\\n=============\\n\\nStarter\\n-------\\n\\n50k credits ($1.00/1k)\\n\\n$50/month\\n\\n* Scrape 50,000 pages\\n* Credits valid for 6 months\\n* 2 simultaneous scrapers\\\\*\\n\\nSubscribe\\n\\nStandard\\n--------\\n\\n500k credits ($0.75/1k)\\n\\n$375/month\\n\\n* Scrape 500,000 pages\\n* Credits valid for 6 months\\n* 4 simultaneous scrapers\\\\*\\n\\nSubscribe\\n\\nScale\\n-----\\n\\n12.5M credits ($0.30/1k)\\n\\n$1,250/month\\n\\n* Scrape 2,500,000 pages\\n* Credits valid for 6 months\\n* 10 simultaneous scrapes\\\\*\\n\\nSubscribe\\n\\n\\\\* a \"scraper\" refers to how many scraper jobs you can simultaneously submit.\\n\\n[šŸ”„](/)\\n\\nĀ© A product by Mendable.ai - All rights 
reserved.\\n\\n[Twitter](https://twitter.com/mendableai)\\n[GitHub](https://github.com/sideguide)\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\nBacked by![Y Combinator Logo](/images/yc.svg)\\n\\n![SOC 2 Type II](/soc2type2badge.png)\\n\\n###### Company\\n\\n* [About us](#0)\\n \\n* [Diversity & Inclusion](#0)\\n \\n* [Blog](#0)\\n \\n* [Careers](#0)\\n \\n* [Financial statements](#0)\\n \\n\\n###### Resources\\n\\n* [Community](#0)\\n \\n* [Terms of service](#0)\\n \\n* [Collaboration features](#0)\\n \\n\\n###### Legals\\n\\n* [Refund policy](#0)\\n \\n* [Terms & Conditions](#0)\\n \\n* [Privacy policy](#0)\\n \\n* [Brand Kit](#0)', metadata={'title': 'FireCrawl', 'description': 'Turn any website into LLM-ready data.', 'language': None, 'sourceURL': 'https://firecrawl.dev/pricing'})]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modes\n", + "\n", + "- `scrape`: Scrape single url and return the markdown.\n", + "- `crawl`: Crawl the url and all accessible sub pages and return the markdown for each one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "loader = FireCrawlLoader(\n", + " api_key=\"YOUR_API_KEY\",\n", + " url=\"https://firecrawl.dev\",\n", + " mode=\"scrape\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='[Skip to content](#skip)\\n\\n[šŸ”„ FireCrawl](/)\\n\\n[Playground](/playground)\\n[Pricing](/pricing)\\n\\n[Log In](/signin)\\n[Log In](/signin)\\n[Sign Up](/signin/signup)\\n\\n![Slack Logo](/images/slack_logo_icon.png)\\n\\nNew message in: #coach-gtm\\n==========================\\n\\n@CoachGTM: Your meeting prep for Pied Piper < > WindFlow Dynamics is ready! Meeting starts in 30 minutes\\n\\nTurn websites into \\n_LLM-ready_ data\\n=====================================\\n\\nCrawl and convert any website into clean markdown\\n\\nTry now (100 free credits)No credit card required\\n\\nA product by\\n\\n[![Mendable Logo](/images/mendable_logo_transparent.png)Mendable](https://mendable.ai)\\n\\n![Mendable Website Image](/mendable-hero-8.png)\\n\\nCrawl, Capture, Clean\\n---------------------\\n\\nWe crawl all accessible subpages and give you clean markdown for each. 
No sitemap required.\\n\\n \\n [\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/\",\\\\\\n \"markdown\": \"## Welcome to Mendable\\\\\\n Mendable empowers teams with AI-driven solutions - \\\\\\n streamlining sales and support.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/features\",\\\\\\n \"markdown\": \"## Features\\\\\\n Discover how Mendable\\'s cutting-edge features can \\\\\\n transform your business operations.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/pricing\",\\\\\\n \"markdown\": \"## Pricing Plans\\\\\\n Choose the perfect plan that fits your business needs.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/about\",\\\\\\n \"markdown\": \"## About Us\\\\\\n \\\\\\n Learn more about Mendable\\'s mission and the \\\\\\n team behind our innovative platform.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/contact\",\\\\\\n \"markdown\": \"## Contact Us\\\\\\n Get in touch with us for any queries or support.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.mendable.ai/blog\",\\\\\\n \"markdown\": \"## Blog\\\\\\n Stay updated with the latest news and insights from Mendable.\"\\\\\\n }\\\\\\n ]\\n \\n\\nNote: The markdown has been edited for display purposes.\\n\\nWe handle the hard stuff\\n------------------------\\n\\nReverse proxyies, caching, rate limits, js-blocked content and more...\\n\\n#### Crawling\\n\\nFireCrawl crawls all accessible subpages, even without a sitemap.\\n\\n#### Dynamic content\\n\\nFireCrawl gathers data even if a website uses javascript to render content.\\n\\n#### To Markdown\\n\\nFireCrawl returns clean, well formatted markdown - ready for use in LLM applications\\n\\n#### Continuous updates\\n\\nSchedule syncs with FireCrawl. No cron jobs or orchestration required.\\n\\n#### Caching\\n\\nFireCrawl caches content, so you don\\'t have to wait for a full scrape unless new content exists.\\n\\n#### Built for AI\\n\\nBuilt by LLM engineers, for LLM engineers. 
Giving you clean data the way you want it.\\n\\nPricing Plans\\n=============\\n\\nStarter\\n-------\\n\\n50k credits ($1.00/1k)\\n\\n$50/month\\n\\n* Scrape 50,000 pages\\n* Credits valid for 6 months\\n* 2 simultaneous scrapers\\\\*\\n\\nSubscribe\\n\\nStandard\\n--------\\n\\n500k credits ($0.75/1k)\\n\\n$375/month\\n\\n* Scrape 500,000 pages\\n* Credits valid for 6 months\\n* 4 simultaneous scrapers\\\\*\\n\\nSubscribe\\n\\nScale\\n-----\\n\\n12.5M credits ($0.30/1k)\\n\\n$1,250/month\\n\\n* Scrape 2,500,000 pages\\n* Credits valid for 6 months\\n* 10 simultaneous scrapes\\\\*\\n\\nSubscribe\\n\\n\\\\* a \"scraper\" refers to how many scraper jobs you can simultaneously submit.\\n\\nWhat sites work?\\n----------------\\n\\nFirecrawl is best suited for business websites, docs and help centers.\\n\\nBuisness websites\\n\\nGathering business intelligence or connecting company data to your AI\\n\\nBlogs, Documentation and Help centers\\n\\nGather content from documentation and other textual sources\\n\\nSocial Media\\n\\nComing soon\\n\\n![Feature 01](/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fexample-business-2.b6c6b56a.png&w=1920&q=75)\\n\\n![Feature 02](/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fexample-docs-sites.11eef02d.png&w=1920&q=75)\\n\\nComing Soon\\n-----------\\n\\n[But I want it now!](https://calendly.com/d/cp3d-rvx-58g/mendable-meeting)\\n\\\\* Schedule a meeting\\n\\n![Feature 04](/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Fexample-business-2.b6c6b56a.png&w=1920&q=75)\\n\\n![Slack Logo](/images/slack_logo_icon.png)\\n\\nNew message in: #coach-gtm\\n==========================\\n\\n@CoachGTM: Your meeting prep for Pied Piper < > WindFlow Dynamics is ready! 
Meeting starts in 30 minutes\\n\\n[šŸ”„](/)\\n\\nReady to _Build?_\\n-----------------\\n\\n[Meet with us](https://calendly.com/d/cp3d-rvx-58g/mendable-meeting)\\n\\n[Try 100 queries free](/signin)\\n\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\nFAQ\\n---\\n\\nFrequently asked questions about FireCrawl\\n\\nWhat is FireCrawl?\\n\\nFireCrawl is an advanced web crawling and data conversion tool designed to transform any website into clean, LLM-ready markdown. Ideal for AI developers and data scientists, it automates the collection, cleaning, and formatting of web data, streamlining the preparation process for Large Language Model (LLM) applications.\\n\\nHow does FireCrawl handle dynamic content on websites?\\n\\nUnlike traditional web scrapers, FireCrawl is equipped to handle dynamic content rendered with JavaScript. It ensures comprehensive data collection from all accessible subpages, making it a reliable tool for scraping websites that rely heavily on JS for content delivery.\\n\\nCan FireCrawl crawl websites without a sitemap?\\n\\nYes, FireCrawl can access and crawl all accessible subpages of a website, even in the absence of a sitemap. This feature enables users to gather data from a wide array of web sources with minimal setup.\\n\\nWhat formats can FireCrawl convert web data into?\\n\\nFireCrawl specializes in converting web data into clean, well-formatted markdown. This format is particularly suited for LLM applications, offering a structured yet flexible way to represent web content.\\n\\nHow does FireCrawl ensure the cleanliness of the data?\\n\\nFireCrawl employs advanced algorithms to clean and structure the scraped data, removing unnecessary elements and formatting the content into readable markdown. This process ensures that the data is ready for use in LLM applications without further preprocessing.\\n\\nIs FireCrawl suitable for large-scale data scraping projects?\\n\\nAbsolutely. 
FireCrawl offers various pricing plans, including a Scale plan that supports scraping of millions of pages. With features like caching and scheduled syncs, it\\'s designed to efficiently handle large-scale data scraping and continuous updates, making it ideal for enterprises and large projects.\\n\\nWhat measures does FireCrawl take to handle web scraping challenges like rate limits and caching?\\n\\nFireCrawl is built to navigate common web scraping challenges, including reverse proxies, rate limits, and caching. It smartly manages requests and employs caching techniques to minimize bandwidth usage and avoid triggering anti-scraping mechanisms, ensuring reliable data collection.\\n\\nHow can I try FireCrawl?\\n\\nYou can start with FireCrawl by trying our free trial, which includes 100 pages. This trial allows you to experience firsthand how FireCrawl can streamline your data collection and conversion processes. Sign up and begin transforming web content into LLM-ready data today!\\n\\nWho can benefit from using FireCrawl?\\n\\nFireCrawl is tailored for LLM engineers, data scientists, AI researchers, and developers looking to harness web data for training machine learning models, market research, content aggregation, and more. 
It simplifies the data preparation process, allowing professionals to focus on insights and model development.\\n\\n[šŸ”„](/)\\n\\nĀ© A product by Mendable.ai - All rights reserved.\\n\\n[Twitter](https://twitter.com/mendableai)\\n[GitHub](https://github.com/sideguide)\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\nBacked by![Y Combinator Logo](/images/yc.svg)\\n\\n![SOC 2 Type II](/soc2type2badge.png)\\n\\n###### Company\\n\\n* [About us](#0)\\n \\n* [Diversity & Inclusion](#0)\\n \\n* [Blog](#0)\\n \\n* [Careers](#0)\\n \\n* [Financial statements](#0)\\n \\n\\n###### Resources\\n\\n* [Community](#0)\\n \\n* [Terms of service](#0)\\n \\n* [Collaboration features](#0)\\n \\n\\n###### Legals\\n\\n* [Refund policy](#0)\\n \\n* [Terms & Conditions](#0)\\n \\n* [Privacy policy](#0)\\n \\n* [Brand Kit](#0)', metadata={'title': 'Home - FireCrawl', 'description': 'FireCrawl crawls and converts any website into clean markdown.', 'language': None, 'sourceURL': 'https://firecrawl.dev'})]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Crawler Options\n", + "\n", + "You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. 
See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information.\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langchain", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/integrations/document_loaders/web_base.ipynb b/docs/docs/integrations/document_loaders/web_base.ipynb index 371d9108ee8..5362d8d7021 100644 --- a/docs/docs/integrations/document_loaders/web_base.ipynb +++ b/docs/docs/integrations/document_loaders/web_base.ipynb @@ -7,7 +7,9 @@ "source": [ "# WebBaseLoader\n", "\n", - "This covers how to use `WebBaseLoader` to load all text from `HTML` webpages into a document format that we can use downstream. For more custom logic for loading webpages look at some child class examples such as `IMSDbLoader`, `AZLyricsLoader`, and `CollegeConfidentialLoader`" + "This covers how to use `WebBaseLoader` to load all text from `HTML` webpages into a document format that we can use downstream. For more custom logic for loading webpages look at some child class examples such as `IMSDbLoader`, `AZLyricsLoader`, and `CollegeConfidentialLoader`. 
\n", + "\n", + "If you don't want to worry about website crawling, bypassing JS-blocking sites, and data cleaning, consider using `FireCrawlLoader`.\n" ] }, { @@ -277,4 +279,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/docs/docs/modules/data_connection/document_loaders/html.mdx b/docs/docs/modules/data_connection/document_loaders/html.mdx index 9c81f123673..a6d204bd21e 100644 --- a/docs/docs/modules/data_connection/document_loaders/html.mdx +++ b/docs/docs/modules/data_connection/document_loaders/html.mdx @@ -55,6 +55,32 @@ data +## Loading HTML with FireCrawlLoader + +[FireCrawl](https://firecrawl.dev/?ref=langchain) crawls and converts any website into markdown. It crawls all accessible subpages and gives you clean markdown and metadata for each. + +FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. + +### Prerequisite + +You need to have a FireCrawl API key to use this loader. You can get one by signing up at [FireCrawl](https://firecrawl.dev/?ref=langchainpy). + +```python +%pip install --upgrade --quiet langchain langchain-community firecrawl-py + +from langchain_community.document_loaders import FireCrawlLoader + + +loader = FireCrawlLoader( + api_key="YOUR_API_KEY", url="https://firecrawl.dev", mode="crawl" +) + +data = loader.load() +``` + +For more information on how to use FireCrawl, visit [FireCrawl](https://firecrawl.dev/?ref=langchainpy). 
+ + ## Loading HTML with AzureAIDocumentIntelligenceLoader [Azure AI Document Intelligence](https://aka.ms/doc-intelligence) (formerly known as `Azure Form Recognizer`) is machine-learning diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 83170a0b479..6fd37037452 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -187,6 +187,9 @@ if TYPE_CHECKING: from langchain_community.document_loaders.figma import ( FigmaFileLoader, # noqa: F401 ) + from langchain_community.document_loaders.firecrawl import ( + FireCrawlLoader, # noqa: F401 + ) from langchain_community.document_loaders.gcs_directory import ( GCSDirectoryLoader, # noqa: F401 ) @@ -560,6 +563,7 @@ __all__ = [ "FacebookChatLoader", "FaunaLoader", "FigmaFileLoader", + "FireCrawlLoader", "FileSystemBlobLoader", "GCSDirectoryLoader", "GCSFileLoader", @@ -745,6 +749,7 @@ _module_lookup = { "FacebookChatLoader": "langchain_community.document_loaders.facebook_chat", "FaunaLoader": "langchain_community.document_loaders.fauna", "FigmaFileLoader": "langchain_community.document_loaders.figma", + "FireCrawlLoader": "langchain_community.document_loaders.firecrawl", "FileSystemBlobLoader": "langchain_community.document_loaders.blob_loaders", "GCSDirectoryLoader": "langchain_community.document_loaders.gcs_directory", "GCSFileLoader": "langchain_community.document_loaders.gcs_file", diff --git a/libs/community/langchain_community/document_loaders/firecrawl.py b/libs/community/langchain_community/document_loaders/firecrawl.py new file mode 100644 index 00000000000..8cc54f4f593 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/firecrawl.py @@ -0,0 +1,66 @@ +from typing import Iterator, Literal, Optional + +from langchain_core.document_loaders import BaseLoader +from langchain_core.documents import Document +from 
class FireCrawlLoader(BaseLoader):
    """Load web pages as Documents using FireCrawl.

    Must have the Python package `firecrawl-py` installed (it provides the
    `firecrawl` module imported below) and a FireCrawl API key. See
    https://www.firecrawl.dev/ for more.
    """

    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        mode: Literal["crawl", "scrape"] = "crawl",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified it is read from the
                env var FIRECRAWL_API_KEY; the legacy, misspelled
                FIREWALL_API_KEY is still honoured as a fallback. Get an API
                key at https://www.firecrawl.dev/.
            mode: The mode to run the loader in. Default is "crawl".
                 Options include "scrape" (single url) and
                 "crawl" (all accessible sub pages).
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py

        Raises:
            ImportError: If the `firecrawl` module cannot be imported.
            ValueError: If `mode` is neither "crawl" nor "scrape". The
                environment lookup helper is also expected to raise when no
                API key can be found.
        """

        try:
            from firecrawl import FirecrawlApp  # noqa: F401
        except ImportError:
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            )
        if mode not in ("crawl", "scrape"):
            raise ValueError(
                f"Unrecognized mode '{mode}'. Expected one of 'crawl', 'scrape'."
            )
        if not api_key:
            # Function-scope import keeps this block self-contained.
            import os

            # The variable was originally misspelled as FIREWALL_API_KEY. Read
            # the correct name first and only fall back to the legacy one so
            # that environments configured with the old name keep working.
            legacy_key = os.environ.get("FIREWALL_API_KEY") or None
            api_key = get_from_env("api_key", "FIRECRAWL_API_KEY", default=legacy_key)
        self.firecrawl = FirecrawlApp(api_key=api_key)
        self.url = url
        self.mode = mode
        self.params = params

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per page returned by FireCrawl.

        In "scrape" mode the single `scrape_url` result is wrapped in a list;
        in "crawl" mode the result of `crawl_url` is iterated directly. Each
        result's "markdown" entry becomes the page content (empty string when
        absent) and its "metadata" entry becomes the Document metadata (empty
        dict when absent).

        Raises:
            ValueError: If `self.mode` was changed to an unsupported value
                after construction.
        """
        if self.mode == "scrape":
            firecrawl_docs = [self.firecrawl.scrape_url(self.url, params=self.params)]
        elif self.mode == "crawl":
            firecrawl_docs = self.firecrawl.crawl_url(self.url, params=self.params)
        else:
            raise ValueError(
                f"Unrecognized mode '{self.mode}'. Expected one of 'crawl', 'scrape'."
            )
        for doc in firecrawl_docs:
            yield Document(
                page_content=doc.get("markdown", ""),
                metadata=doc.get("metadata", {}),
            )