From fc14f675f1af1e8fee23710f208259da9fcfd778 Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 15 Oct 2024 10:13:28 -0300
Subject: [PATCH] Community: Updated Firecrawl Document Loader to v1 (#26548)

This PR updates the Firecrawl Document Loader to use the recently released
v1 API of Firecrawl.

**Key Updates:**

**Firecrawl v1 Integration:** Updated the document loader to use the new
Firecrawl v1 API for improved performance, reliability, and developer
experience.

**Map Functionality Added:** Introduced the map mode for more flexible
document loading options.

These updates enhance the integration and provide access to the latest
features of Firecrawl.

---------

Co-authored-by: Erick Friis
Co-authored-by: Harrison Chase
---
 .../document_loaders/firecrawl.ipynb          | 223 +++++++++---------
 .../document_loaders/firecrawl.py             | 219 +++++++++++++++--
 2 files changed, 318 insertions(+), 124 deletions(-)

diff --git a/docs/docs/integrations/document_loaders/firecrawl.ipynb b/docs/docs/integrations/document_loaders/firecrawl.ipynb
index 4abf280da8f..a6ec4ff418b 100644
--- a/docs/docs/integrations/document_loaders/firecrawl.ipynb
+++ b/docs/docs/integrations/document_loaders/firecrawl.ipynb
@@ -26,33 +26,32 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Setup\n",
-    "\n",
-    "### Credentials \n",
-    "\n",
-    "You will need to get your own API key. Go to [this page](https://firecrawl.dev) to learn more."
+    "## Setup"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "import getpass\n",
-    "import os\n",
-    "\n",
-    "if \"FIRECRAWL_API_KEY\" not in os.environ:\n",
-    "    os.environ[\"FIRECRAWL_API_KEY\"] = getpass.getpass(\"Enter your Firecrawl API key: \")"
+    "%pip install -qU firecrawl-py langchain_community"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Installation\n",
-    "\n",
-    "You will need to install both the `langchain_community` and `firecrawl-py` pacakges:"
+    "## Usage"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will need to get your own API key. See https://firecrawl.dev"
    ]
   },
   {
@@ -61,42 +60,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install -qU firecrawl-py==0.0.20 langchain_community"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Initialization\n",
-    "\n",
-    "### Modes\n",
-    "\n",
-    "- `scrape`: Scrape single url and return the markdown.\n",
-    "- `crawl`: Crawl the url and all accessible sub pages and return the markdown for each one."
+    "from langchain_community.document_loaders.firecrawl import FireCrawlLoader"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain_community.document_loaders import FireCrawlLoader\n",
-    "\n",
-    "loader = FireCrawlLoader(url=\"https://firecrawl.dev\", mode=\"crawl\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Load"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -111,40 +80,14 @@
     }
    ],
    "source": [
-    "docs = loader.load()\n",
-    "\n",
-    "docs[0]"
+    "loader = FireCrawlLoader(\n",
+    "    api_key=\"YOUR_API_KEY\", url=\"https://firecrawl.dev\", mode=\"scrape\"\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(docs[0].metadata)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Lazy Load\n",
-    "\n",
-    "You can use lazy loading to minimize memory requirements."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -160,39 +103,61 @@
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "8"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "len(pages)"
+    "pages"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Modes\n",
+    "\n",
+    "- `scrape`: Scrape a single URL and return its content as markdown.\n",
+    "- `crawl`: Crawl the URL and all accessible subpages, returning markdown for each page.\n",
+    "- `map`: Map the URL and return a list of semantically related links."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Crawl\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = FireCrawlLoader(\n",
+    "    api_key=\"YOUR_API_KEY\",\n",
+    "    url=\"https://firecrawl.dev\",\n",
+    "    mode=\"crawl\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "pages = loader.load()"
+   ]
+  },
+  {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\n",
-      " Join the waitlist to turn any web\n",
-      "{'ogUrl': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'title': 'Introducing Fire Engine for Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/images/blog/fire-engine-launch.png', 'ogTitle': 'Introducing Fire Engine for Firecrawl', 'sitemap': {'lastmod': '2024-08-06T00:00:00.000Z', 'changefreq': 'weekly'}, 'keywords': 'firecrawl,fireengine,web crawling,dashboard,web scraping,LLM,data extraction', 'sourceURL': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'ogSiteName': 'Firecrawl', 'description': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'ogDescription': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(pages[0].page_content[:100])\n",
     "print(pages[0].metadata)"
    ]
   },
@@ -202,10 +167,54 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Crawler Options\n",
+    "#### Crawl Options\n",
     "\n",
-    "You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information.\n",
-    "\n"
+    "You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Map"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = FireCrawlLoader(api_key=\"YOUR_API_KEY\", url=\"https://firecrawl.dev\", mode=\"map\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Map Options\n",
+    "\n",
+    "You can also pass `params` to the loader. This is a dictionary of options to pass to the map endpoint. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information.\n",
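+    "\n",
+    "For example, a hypothetical map call (the `search` option below is illustrative; check the Firecrawl docs for the exact set of v1 map options):\n",
+    "\n",
+    "```python\n",
+    "loader = FireCrawlLoader(\n",
+    "    api_key=\"YOUR_API_KEY\",\n",
+    "    url=\"https://firecrawl.dev\",\n",
+    "    mode=\"map\",\n",
+    "    params={\"search\": \"blog\"},  # illustrative option, not a documented constant\n",
+    ")\n",
+    "```"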
    ]
   },
   {
@@ -220,7 +229,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "langchain",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/libs/community/langchain_community/document_loaders/firecrawl.py b/libs/community/langchain_community/document_loaders/firecrawl.py
index 467813419ae..4423881dfef 100644
--- a/libs/community/langchain_community/document_loaders/firecrawl.py
+++ b/libs/community/langchain_community/document_loaders/firecrawl.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Iterator, Literal, Optional
 
 from langchain_core.document_loaders import BaseLoader
@@ -48,7 +49,6 @@ class FireCrawlLoader(BaseLoader):
         Join the waitlist to turn any web
         {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
-
     Async load:
         .. code-block:: python
@@ -64,13 +64,169 @@ class FireCrawlLoader(BaseLoader):
 
     """  # noqa: E501
 
+    def legacy_crawler_options_adapter(self, params: dict) -> dict:
+        use_legacy_options = False
+        legacy_keys = [
+            "includes",
+            "excludes",
+            "allowBackwardCrawling",
+            "allowExternalContentLinks",
+            "pageOptions",
+        ]
+        for key in legacy_keys:
+            if params.get(key):
+                use_legacy_options = True
+                break
+
+        if use_legacy_options:
+            warnings.warn(
+                "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
+                DeprecationWarning,
+            )
+            if "includes" in params:
+                if params["includes"]:  # a list of path patterns, not a boolean
+                    params["includePaths"] = params["includes"]
+                del params["includes"]
+
+            if "excludes" in params:
+                if params["excludes"]:  # likewise a list, not a boolean
+                    params["excludePaths"] = params["excludes"]
+                del params["excludes"]
+
+            if "allowBackwardCrawling" in params:
+                if params["allowBackwardCrawling"] is True:
+                    params["allowBackwardLinks"] = params["allowBackwardCrawling"]
+                del params["allowBackwardCrawling"]
+
+            if "allowExternalContentLinks" in params:
+                if params["allowExternalContentLinks"] is True:
+                    params["allowExternalLinks"] = params["allowExternalContentLinks"]
+                del params["allowExternalContentLinks"]
+
+            if "pageOptions" in params:
+                if isinstance(params["pageOptions"], dict):
+                    params["scrapeOptions"] = self.legacy_scrape_options_adapter(
+                        params["pageOptions"]
+                    )
+                del params["pageOptions"]
+
+        return params
+
+    def legacy_scrape_options_adapter(self, params: dict) -> dict:
+        use_legacy_options = False
+        formats = ["markdown"]
+
+        if "extractorOptions" in params:
+            if "mode" in params["extractorOptions"]:
+                if (
+                    params["extractorOptions"]["mode"] == "llm-extraction"
+                    or params["extractorOptions"]["mode"]
+                    == "llm-extraction-from-raw-html"
+                    or params["extractorOptions"]["mode"]
+                    == "llm-extraction-from-markdown"
+                ):
+                    use_legacy_options = True
+                    if "extractionPrompt" in params["extractorOptions"]:
+                        # fall back to the default prompt when the legacy value
+                        # is present but empty
+                        params["prompt"] = (
+                            params["extractorOptions"]["extractionPrompt"]
+                            or "Extract page information based on the schema."
+                        )
+
+                    if "extractionSchema" in params["extractorOptions"]:
+                        if params["extractorOptions"]["extractionSchema"]:
+                            params["schema"] = params["extractorOptions"][
+                                "extractionSchema"
+                            ]
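+
+                    # A legacy "userPrompt" is applied last, so it overrides any
+                    # prompt derived from "extractionPrompt" above.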
+                    if "userPrompt" in params["extractorOptions"]:
+                        if params["extractorOptions"]["userPrompt"]:
+                            params["prompt"] = params["extractorOptions"]["userPrompt"]
+
+                    del params["extractorOptions"]
+
+        scrape_keys = [
+            "includeMarkdown",
+            "includeHtml",
+            "includeRawHtml",
+            "includeExtract",
+            "includeLinks",
+            "screenshot",
+            "fullPageScreenshot",
+            "onlyIncludeTags",
+            "removeTags",
+        ]
+        for key in scrape_keys:
+            if params.get(key):
+                use_legacy_options = True
+                break
+
+        if use_legacy_options:
+            warnings.warn(
+                "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
+                DeprecationWarning,
+            )
+            if "includeMarkdown" in params:
+                if params["includeMarkdown"] is False:
+                    formats.remove("markdown")
+                del params["includeMarkdown"]
+
+            if "includeHtml" in params:
+                if params["includeHtml"] is True:
+                    formats.append("html")
+                del params["includeHtml"]
+
+            if "includeRawHtml" in params:
+                if params["includeRawHtml"] is True:
+                    formats.append("rawHtml")
+                del params["includeRawHtml"]
+
+            if "includeExtract" in params:
+                if params["includeExtract"] is True:
+                    formats.append("extract")
+                del params["includeExtract"]
+
+            if "includeLinks" in params:
+                if params["includeLinks"] is True:
+                    formats.append("links")
+                del params["includeLinks"]
+
+            if "screenshot" in params:
+                if params["screenshot"] is True:
+                    formats.append("screenshot")
+                del params["screenshot"]
+
+            if "fullPageScreenshot" in params:
+                if params["fullPageScreenshot"] is True:
+                    formats.append("screenshot@fullPage")
+                del params["fullPageScreenshot"]
+
+            if "onlyIncludeTags" in params:
+                if params["onlyIncludeTags"]:  # a list of tags, not a boolean
+                    params["includeTags"] = params["onlyIncludeTags"]
+                del params["onlyIncludeTags"]
+
+            if "removeTags" in params:
+                if params["removeTags"]:  # likewise a list, not a boolean
+                    params["excludeTags"] = params["removeTags"]
+                del params["removeTags"]
+
+        if "formats" not in params:
+            params["formats"] = formats
+
+        return params
+
     def __init__(
         self,
         url: str,
         *,
         api_key: Optional[str] = None,
         api_url: Optional[str] = None,
-        mode: Literal["crawl", "scrape"] = "crawl",
+        mode: Literal["crawl", "scrape", "map"] = "crawl",
         params: Optional[dict] = None,
     ):
         """Initialize with API key and url.
@@ -82,8 +238,9 @@ class FireCrawlLoader(BaseLoader):
             api_url: The Firecrawl API URL. If not specified will be read from env var
                 FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev.
             mode: The mode to run the loader in. Default is "crawl".
-                 Options include "scrape" (single url) and
-                 "crawl" (all accessible sub pages).
+                 Options include "scrape" (single url),
+                 "crawl" (all accessible sub pages),
+                 "map" (returns list of links that are semantically related).
             params: The parameters to pass to the Firecrawl API.
                 Examples include crawlerOptions.
                 For more details, visit: https://github.com/mendableai/firecrawl-py
@@ -95,30 +252,58 @@ class FireCrawlLoader(BaseLoader):
             raise ImportError(
                 "`firecrawl` package not found, please run `pip install firecrawl-py`"
             )
-        if mode not in ("crawl", "scrape"):
+        if mode not in ("crawl", "scrape", "search", "map"):
             raise ValueError(
-                f"Unrecognized mode '{mode}'. Expected one of 'crawl', 'scrape'."
+                f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
             )
+
+        if not url:
+            raise ValueError("URL must be provided")
+
         api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
         self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
         self.url = url
         self.mode = mode
-        self.params = params
+        self.params = params or {}
 
     def lazy_load(self) -> Iterator[Document]:
         if self.mode == "scrape":
-            firecrawl_docs = [self.firecrawl.scrape_url(self.url, params=self.params)]
+            firecrawl_docs = [
+                self.firecrawl.scrape_url(
+                    self.url, params=self.legacy_scrape_options_adapter(self.params)
+                )
+            ]
         elif self.mode == "crawl":
-            firecrawl_docs = self.firecrawl.crawl_url(self.url, params=self.params)
+            if not self.url:
+                raise ValueError("URL is required for crawl mode")
+            crawl_response = self.firecrawl.crawl_url(
+                self.url, params=self.legacy_crawler_options_adapter(self.params)
+            )
+            firecrawl_docs = crawl_response.get("data", [])
+        elif self.mode == "map":
+            if not self.url:
+                raise ValueError("URL is required for map mode")
+            firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
+        elif self.mode == "search":
+            raise ValueError(
+                "Search mode is not supported in this version, please downgrade."
+            )
         else:
             raise ValueError(
-                f"Unrecognized mode '{self.mode}'. Expected one of 'crawl', 'scrape'."
+                f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
             )
         for doc in firecrawl_docs:
-            metadata = doc.get("metadata", {})
-            if (self.params is not None) and self.params.get(
-                "extractorOptions", {}
-            ).get("mode") == "llm-extraction":
-                metadata["llm_extraction"] = doc.get("llm_extraction")
-
-            yield Document(page_content=doc.get("markdown", ""), metadata=metadata)
+            if self.mode == "map":
+                page_content = doc
+                metadata = {}
+            else:
+                page_content = (
+                    doc.get("markdown") or doc.get("html") or doc.get("rawHtml", "")
+                )
+                metadata = doc.get("metadata", {})
+                if not page_content:
+                    continue
+            yield Document(
+                page_content=page_content,
+                metadata=metadata,
+            )
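
For reference, a minimal usage sketch of the updated loader (the API key and
URL below are placeholders; setting the FIRECRAWL_API_KEY environment variable
works in place of passing api_key):

.. code-block:: python

    from langchain_community.document_loaders.firecrawl import FireCrawlLoader

    # "scrape" fetches a single page; any legacy v0-style params passed in are
    # translated by legacy_scrape_options_adapter before the request is sent.
    loader = FireCrawlLoader(
        api_key="YOUR_API_KEY",
        url="https://firecrawl.dev",
        mode="scrape",
    )

    # lazy_load yields Documents one at a time; load() collects them into a list
    for doc in loader.lazy_load():
        print(doc.metadata.get("sourceURL"), len(doc.page_content))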