From fc14f675f1af1e8fee23710f208259da9fcfd778 Mon Sep 17 00:00:00 2001
From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 15 Oct 2024 10:13:28 -0300
Subject: [PATCH] Community: Updated Firecrawl Document Loader to v1 (#26548)

This PR updates the Firecrawl Document Loader to use the recently released
v1 API of Firecrawl.

**Key Updates:**

**Firecrawl v1 Integration:** Updated the document loader to use the new
Firecrawl v1 API for improved performance, reliability, and developer
experience.

**Map Functionality Added:** Introduced the map mode for more flexible
document loading options.

These updates enhance the integration and provide access to the latest
features of Firecrawl.

---------

Co-authored-by: Erick Friis
Co-authored-by: Harrison Chase
---
 .../document_loaders/firecrawl.ipynb          | 223 +++++++++---------
 .../document_loaders/firecrawl.py             | 219 +++++++++++++++--
 2 files changed, 318 insertions(+), 124 deletions(-)

diff --git a/docs/docs/integrations/document_loaders/firecrawl.ipynb b/docs/docs/integrations/document_loaders/firecrawl.ipynb
index 4abf280da8f..a6ec4ff418b 100644
--- a/docs/docs/integrations/document_loaders/firecrawl.ipynb
+++ b/docs/docs/integrations/document_loaders/firecrawl.ipynb
@@ -26,33 +26,32 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Setup\n",
-    "\n",
-    "### Credentials \n",
-    "\n",
-    "You will need to get your own API key. Go to [this page](https://firecrawl.dev) to learn more."
+    "## Setup"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
-    "import getpass\n",
-    "import os\n",
-    "\n",
-    "if \"FIRECRAWL_API_KEY\" not in os.environ:\n",
-    "    os.environ[\"FIRECRAWL_API_KEY\"] = getpass.getpass(\"Enter your Firecrawl API key: \")"
+    "%pip install -qU firecrawl-py langchain_community"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Installation\n",
-    "\n",
-    "You will need to install both the `langchain_community` and `firecrawl-py` pacakges:"
+    "## Usage"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will need to get your own API key. See https://firecrawl.dev"
    ]
   },
   {
@@ -61,42 +60,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install -qU firecrawl-py==0.0.20 langchain_community"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Initialization\n",
-    "\n",
-    "### Modes\n",
-    "\n",
-    "- `scrape`: Scrape single url and return the markdown.\n",
-    "- `crawl`: Crawl the url and all accessible sub pages and return the markdown for each one."
+    "from langchain_community.document_loaders.firecrawl import FireCrawlLoader"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain_community.document_loaders import FireCrawlLoader\n",
-    "\n",
-    "loader = FireCrawlLoader(url=\"https://firecrawl.dev\", mode=\"crawl\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Load"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -111,40 +80,14 @@
     }
    ],
    "source": [
-    "docs = loader.load()\n",
-    "\n",
-    "docs[0]"
+    "loader = FireCrawlLoader(\n",
+    "    api_key=\"YOUR_API_KEY\", url=\"https://firecrawl.dev\", mode=\"scrape\"\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(docs[0].metadata)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Lazy Load\n",
-    "\n",
-    "You can use lazy loading to minimize memory requirements."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -160,39 +103,61 @@
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "8"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "len(pages)"
+    "pages"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Modes\n",
+    "\n",
+    "- `scrape`: Scrape a single URL and return its content as markdown.\n",
+    "- `crawl`: Crawl the URL and all accessible subpages, returning markdown for each page.\n",
+    "- `map`: Map the URL and return a list of semantically related links."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Crawl\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = FireCrawlLoader(\n",
+    "    api_key=\"YOUR_API_KEY\",\n",
+    "    url=\"https://firecrawl.dev\",\n",
+    "    mode=\"crawl\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "pages = loader.load()"
+   ]
+  },
+  {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\n",
-      " Join the waitlist to turn any web\n",
-      "{'ogUrl': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'title': 'Introducing Fire Engine for Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/images/blog/fire-engine-launch.png', 'ogTitle': 'Introducing Fire Engine for Firecrawl', 'sitemap': {'lastmod': '2024-08-06T00:00:00.000Z', 'changefreq': 'weekly'}, 'keywords': 'firecrawl,fireengine,web crawling,dashboard,web scraping,LLM,data extraction', 'sourceURL': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'ogSiteName': 'Firecrawl', 'description': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'ogDescription': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(pages[0].page_content[:100])\n",
     "print(pages[0].metadata)"
    ]
   },
@@ -202,10 +167,54 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Crawler Options\n",
+    "#### Crawl Options\n",
     "\n",
-    "You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information.\n",
-    "\n"
+    "You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Map"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = FireCrawlLoader(api_key=\"YOUR_API_KEY\", url=\"https://firecrawl.dev\", mode=\"map\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Map Options\n",
+    "\n",
+    "You can also pass `params` to the loader. This is a dictionary of options to pass to the map endpoint. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information.\n",
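+    "\n",
+    "For example, a hypothetical map call (the `search` option below is illustrative; check the Firecrawl docs for the exact set of v1 map options):\n",
+    "\n",
+    "```python\n",
+    "loader = FireCrawlLoader(\n",
+    "    api_key=\"YOUR_API_KEY\",\n",
+    "    url=\"https://firecrawl.dev\",\n",
+    "    mode=\"map\",\n",
+    "    params={\"search\": \"blog\"},  # illustrative option, not a documented constant\n",
+    ")\n",
+    "```"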
    ]
   },
   {
@@ -220,7 +229,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "langchain",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/libs/community/langchain_community/document_loaders/firecrawl.py b/libs/community/langchain_community/document_loaders/firecrawl.py
index 467813419ae..4423881dfef 100644
--- a/libs/community/langchain_community/document_loaders/firecrawl.py
+++ b/libs/community/langchain_community/document_loaders/firecrawl.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Iterator, Literal, Optional
 
 from langchain_core.document_loaders import BaseLoader
@@ -48,7 +49,6 @@ class FireCrawlLoader(BaseLoader):
         Join the waitlist to turn any web
         {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
-
     Async load:
         .. code-block:: python
@@ -64,13 +64,169 @@ class FireCrawlLoader(BaseLoader):
 
     """  # noqa: E501
 
+    def legacy_crawler_options_adapter(self, params: dict) -> dict:
+        use_legacy_options = False
+        legacy_keys = [
+            "includes",
+            "excludes",
+            "allowBackwardCrawling",
+            "allowExternalContentLinks",
+            "pageOptions",
+        ]
+        for key in legacy_keys:
+            if params.get(key):
+                use_legacy_options = True
+                break
+
+        if use_legacy_options:
+            warnings.warn(
+                "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
+                DeprecationWarning,
+            )
+            if "includes" in params:
+                if params["includes"]:  # a list of path patterns, not a boolean
+                    params["includePaths"] = params["includes"]
+                del params["includes"]
+
+            if "excludes" in params:
+                if params["excludes"]:  # likewise a list, not a boolean
+                    params["excludePaths"] = params["excludes"]
+                del params["excludes"]
+
+            if "allowBackwardCrawling" in params:
+                if params["allowBackwardCrawling"] is True:
+                    params["allowBackwardLinks"] = params["allowBackwardCrawling"]
+                del params["allowBackwardCrawling"]
+
+            if "allowExternalContentLinks" in params:
+                if params["allowExternalContentLinks"] is True:
+                    params["allowExternalLinks"] = params["allowExternalContentLinks"]
+                del params["allowExternalContentLinks"]
+
+            if "pageOptions" in params:
+                if isinstance(params["pageOptions"], dict):
+                    params["scrapeOptions"] = self.legacy_scrape_options_adapter(
+                        params["pageOptions"]
+                    )
+                del params["pageOptions"]
+
+        return params
+
+    def legacy_scrape_options_adapter(self, params: dict) -> dict:
+        use_legacy_options = False
+        formats = ["markdown"]
+
+        if "extractorOptions" in params:
+            if "mode" in params["extractorOptions"]:
+                if (
+                    params["extractorOptions"]["mode"] == "llm-extraction"
+                    or params["extractorOptions"]["mode"]
+                    == "llm-extraction-from-raw-html"
+                    or params["extractorOptions"]["mode"]
+                    == "llm-extraction-from-markdown"
+                ):
+                    use_legacy_options = True
+                    if "extractionPrompt" in params["extractorOptions"]:
+                        # fall back to the default prompt when the legacy value
+                        # is present but empty
+                        params["prompt"] = (
+                            params["extractorOptions"]["extractionPrompt"]
+                            or "Extract page information based on the schema."
+                        )
+
+                    if "extractionSchema" in params["extractorOptions"]:
+                        if params["extractorOptions"]["extractionSchema"]:
+                            params["schema"] = params["extractorOptions"][
+                                "extractionSchema"
+                            ]
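+
+                    # A legacy "userPrompt" is applied last, so it overrides any
+                    # prompt derived from "extractionPrompt" above.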
+                    if "userPrompt" in params["extractorOptions"]:
+                        if params["extractorOptions"]["userPrompt"]:
+                            params["prompt"] = params["extractorOptions"]["userPrompt"]
+
+                    del params["extractorOptions"]
+
+        scrape_keys = [
+            "includeMarkdown",
+            "includeHtml",
+            "includeRawHtml",
+            "includeExtract",
+            "includeLinks",
+            "screenshot",
+            "fullPageScreenshot",
+            "onlyIncludeTags",
+            "removeTags",
+        ]
+        for key in scrape_keys:
+            if params.get(key):
+                use_legacy_options = True
+                break
+
+        if use_legacy_options:
+            warnings.warn(
+                "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
+                DeprecationWarning,
+            )
+            if "includeMarkdown" in params:
+                if params["includeMarkdown"] is False:
+                    formats.remove("markdown")
+                del params["includeMarkdown"]
+
+            if "includeHtml" in params:
+                if params["includeHtml"] is True:
+                    formats.append("html")
+                del params["includeHtml"]
+
+            if "includeRawHtml" in params:
+                if params["includeRawHtml"] is True:
+                    formats.append("rawHtml")
+                del params["includeRawHtml"]
+
+            if "includeExtract" in params:
+                if params["includeExtract"] is True:
+                    formats.append("extract")
+                del params["includeExtract"]
+
+            if "includeLinks" in params:
+                if params["includeLinks"] is True:
+                    formats.append("links")
+                del params["includeLinks"]
+
+            if "screenshot" in params:
+                if params["screenshot"] is True:
+                    formats.append("screenshot")
+                del params["screenshot"]
+
+            if "fullPageScreenshot" in params:
+                if params["fullPageScreenshot"] is True:
+                    formats.append("screenshot@fullPage")
+                del params["fullPageScreenshot"]
+
+            if "onlyIncludeTags" in params:
+                if params["onlyIncludeTags"]:  # a list of tags, not a boolean
+                    params["includeTags"] = params["onlyIncludeTags"]
+                del params["onlyIncludeTags"]
+
+            if "removeTags" in params:
+                if params["removeTags"]:  # likewise a list, not a boolean
+                    params["excludeTags"] = params["removeTags"]
+                del params["removeTags"]
+
+        if "formats" not in params:
+            params["formats"] = formats
+
+        return params
+
     def __init__(
         self,
         url: str,
         *,
         api_key: Optional[str] = None,
         api_url: Optional[str] = None,
-        mode: Literal["crawl", "scrape"] = "crawl",
+        mode: Literal["crawl", "scrape", "map"] = "crawl",
         params: Optional[dict] = None,
     ):
         """Initialize with API key and url.
@@ -82,8 +238,9 @@ class FireCrawlLoader(BaseLoader):
             api_url: The Firecrawl API URL. If not specified will be read from env var
                 FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev.
             mode: The mode to run the loader in. Default is "crawl".
-                 Options include "scrape" (single url) and
-                 "crawl" (all accessible sub pages).
+                 Options include "scrape" (single url),
+                 "crawl" (all accessible sub pages),
+                 "map" (returns list of links that are semantically related).
             params: The parameters to pass to the Firecrawl API.
                 Examples include crawlerOptions.
                 For more details, visit: https://github.com/mendableai/firecrawl-py
@@ -95,30 +252,58 @@ class FireCrawlLoader(BaseLoader):
             raise ImportError(
                 "`firecrawl` package not found, please run `pip install firecrawl-py`"
             )
-        if mode not in ("crawl", "scrape"):
+        if mode not in ("crawl", "scrape", "search", "map"):
             raise ValueError(
-                f"Unrecognized mode '{mode}'. Expected one of 'crawl', 'scrape'."
+                f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
             )
+
+        if not url:
+            raise ValueError("URL must be provided")
+
         api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
         self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
         self.url = url
         self.mode = mode
-        self.params = params
+        self.params = params or {}
 
     def lazy_load(self) -> Iterator[Document]:
         if self.mode == "scrape":
-            firecrawl_docs = [self.firecrawl.scrape_url(self.url, params=self.params)]
+            firecrawl_docs = [
+                self.firecrawl.scrape_url(
+                    self.url, params=self.legacy_scrape_options_adapter(self.params)
+                )
+            ]
         elif self.mode == "crawl":
-            firecrawl_docs = self.firecrawl.crawl_url(self.url, params=self.params)
+            if not self.url:
+                raise ValueError("URL is required for crawl mode")
+            crawl_response = self.firecrawl.crawl_url(
+                self.url, params=self.legacy_crawler_options_adapter(self.params)
+            )
+            firecrawl_docs = crawl_response.get("data", [])
+        elif self.mode == "map":
+            if not self.url:
+                raise ValueError("URL is required for map mode")
+            firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
+        elif self.mode == "search":
+            raise ValueError(
+                "Search mode is not supported in this version, please downgrade."
+            )
         else:
             raise ValueError(
-                f"Unrecognized mode '{self.mode}'. Expected one of 'crawl', 'scrape'."
+                f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
             )
         for doc in firecrawl_docs:
-            metadata = doc.get("metadata", {})
-            if (self.params is not None) and self.params.get(
-                "extractorOptions", {}
-            ).get("mode") == "llm-extraction":
-                metadata["llm_extraction"] = doc.get("llm_extraction")
-
-            yield Document(page_content=doc.get("markdown", ""), metadata=metadata)
+            if self.mode == "map":
+                page_content = doc
+                metadata = {}
+            else:
+                page_content = (
+                    doc.get("markdown") or doc.get("html") or doc.get("rawHtml", "")
+                )
+                metadata = doc.get("metadata", {})
+                if not page_content:
+                    continue
+            yield Document(
+                page_content=page_content,
+                metadata=metadata,
+            )
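
For reference, a minimal usage sketch of the updated loader (the API key and
URL below are placeholders; setting the FIRECRAWL_API_KEY environment variable
works in place of passing api_key):

.. code-block:: python

    from langchain_community.document_loaders.firecrawl import FireCrawlLoader

    # "scrape" fetches a single page; any legacy v0-style params passed in are
    # translated by legacy_scrape_options_adapter before the request is sent.
    loader = FireCrawlLoader(
        api_key="YOUR_API_KEY",
        url="https://firecrawl.dev",
        mode="scrape",
    )

    # lazy_load yields Documents one at a time; load() collects them into a list
    for doc in loader.lazy_load():
        print(doc.metadata.get("sourceURL"), len(doc.page_content))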