mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-20 05:43:55 +00:00
Community: Updated Firecrawl Document Loader to v1 (#26548)
This PR updates the Firecrawl Document Loader to use the recently released V1 API of Firecrawl. **Key Updates:** **Firecrawl V1 Integration:** Updated the document loader to leverage the new Firecrawl V1 API for improved performance, reliability, and developer experience. **Map Functionality Added:** Introduced the map mode for more flexible document loading options. These updates enhance the integration and provide access to the latest features of Firecrawl. --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
8fea07f92e
commit
fc14f675f1
@ -26,33 +26,32 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Setup\n",
|
"## Setup"
|
||||||
"\n",
|
|
||||||
"### Credentials \n",
|
|
||||||
"\n",
|
|
||||||
"You will need to get your own API key. Go to [this page](https://firecrawl.dev) to learn more."
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import getpass\n",
|
"pip install firecrawl-py"
|
||||||
"import os\n",
|
|
||||||
"\n",
|
|
||||||
"if \"FIRECRAWL_API_KEY\" not in os.environ:\n",
|
|
||||||
" os.environ[\"FIRECRAWL_API_KEY\"] = getpass.getpass(\"Enter your Firecrawl API key: \")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"### Installation\n",
|
"## Usage"
|
||||||
"\n",
|
]
|
||||||
"You will need to install both the `langchain_community` and `firecrawl-py` pacakges:"
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"You will need to get your own API key. See https://firecrawl.dev"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -61,42 +60,12 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%pip install -qU firecrawl-py==0.0.20 langchain_community"
|
"from langchain_community.document_loaders.firecrawl import FireCrawlLoader"
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Initialization\n",
|
|
||||||
"\n",
|
|
||||||
"### Modes\n",
|
|
||||||
"\n",
|
|
||||||
"- `scrape`: Scrape single url and return the markdown.\n",
|
|
||||||
"- `crawl`: Crawl the url and all accessible sub pages and return the markdown for each one."
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain_community.document_loaders import FireCrawlLoader\n",
|
|
||||||
"\n",
|
|
||||||
"loader = FireCrawlLoader(url=\"https://firecrawl.dev\", mode=\"crawl\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Load"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -111,40 +80,14 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"docs = loader.load()\n",
|
"loader = FireCrawlLoader(\n",
|
||||||
"\n",
|
" api_key=\"YOUR_API_KEY\", url=\"https://firecrawl.dev\", mode=\"scrape\"\n",
|
||||||
"docs[0]"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": null,
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"print(docs[0].metadata)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Lazy Load\n",
|
|
||||||
"\n",
|
|
||||||
"You can use lazy loading to minimize memory requirements."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -160,39 +103,61 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
"source": [
|
||||||
"data": {
|
"pages"
|
||||||
"text/plain": [
|
|
||||||
"8"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 10,
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"len(pages)"
|
"## Modes\n",
|
||||||
|
"\n",
|
||||||
|
"- `scrape`: Scrape single url and return the markdown.\n",
|
||||||
|
"- `crawl`: Crawl the url and all accessible sub pages and return the markdown for each one.\n",
|
||||||
|
"- `map`: Maps the URL and returns a list of semantically related pages."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Crawl\n",
|
||||||
|
"\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
"source": [
|
||||||
"name": "stdout",
|
"loader = FireCrawlLoader(\n",
|
||||||
"output_type": "stream",
|
" api_key=\"YOUR_API_KEY\",\n",
|
||||||
"text": [
|
" url=\"https://firecrawl.dev\",\n",
|
||||||
"Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\n",
|
" mode=\"crawl\",\n",
|
||||||
" Join the waitlist to turn any web\n",
|
")"
|
||||||
"{'ogUrl': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'title': 'Introducing Fire Engine for Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/images/blog/fire-engine-launch.png', 'ogTitle': 'Introducing Fire Engine for Firecrawl', 'sitemap': {'lastmod': '2024-08-06T00:00:00.000Z', 'changefreq': 'weekly'}, 'keywords': 'firecrawl,fireengine,web crawling,dashboard,web scraping,LLM,data extraction', 'sourceURL': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'ogSiteName': 'Firecrawl', 'description': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'ogDescription': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n"
|
|
||||||
]
|
]
|
||||||
}
|
},
|
||||||
],
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"print(pages[0].page_content[:100])\n",
|
"print(pages[0].page_content[:100])\n",
|
||||||
"print(pages[0].metadata)"
|
"print(pages[0].metadata)"
|
||||||
@ -202,10 +167,54 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Crawler Options\n",
|
"#### Crawl Options\n",
|
||||||
"\n",
|
"\n",
|
||||||
"You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information.\n",
|
"You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information."
|
||||||
"\n"
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Map"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = FireCrawlLoader(api_key=\"YOUR_API_KEY\", url=\"firecrawl.dev\", mode=\"map\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Map Options\n",
|
||||||
|
"\n",
|
||||||
|
"You can also pass `params` to the loader. This is a dictionary of options to pass to the loader. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -220,7 +229,7 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "langchain",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import warnings
|
||||||
from typing import Iterator, Literal, Optional
|
from typing import Iterator, Literal, Optional
|
||||||
|
|
||||||
from langchain_core.document_loaders import BaseLoader
|
from langchain_core.document_loaders import BaseLoader
|
||||||
@ -48,7 +49,6 @@ class FireCrawlLoader(BaseLoader):
|
|||||||
Join the waitlist to turn any web
|
Join the waitlist to turn any web
|
||||||
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
|
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
|
||||||
|
|
||||||
|
|
||||||
Async load:
|
Async load:
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@ -64,13 +64,169 @@ class FireCrawlLoader(BaseLoader):
|
|||||||
|
|
||||||
""" # noqa: E501
|
""" # noqa: E501
|
||||||
|
|
||||||
|
def legacy_crawler_options_adapter(self, params: dict) -> dict:
    """Translate legacy (pre-v1) Firecrawl crawler options to v1 names.

    Args:
        params: Crawl options as supplied by the caller; may mix legacy and
            current keys. The dict is not modified. ``None`` is treated as
            an empty dict.

    Returns:
        A new dict in which ``includes``, ``excludes``,
        ``allowBackwardCrawling`` and ``allowExternalContentLinks`` are
        replaced by ``includePaths``, ``excludePaths``,
        ``allowBackwardLinks`` and ``allowExternalLinks`` respectively, and
        a dict-valued ``pageOptions`` is replaced by ``scrapeOptions``
        (converted with ``legacy_scrape_options_adapter``). Legacy keys
        whose value is falsy are dropped. Every other key passes through
        unchanged.

    Warns:
        DeprecationWarning: If any legacy key is present with a truthy
            value.
    """
    renamed_keys = {
        "includes": "includePaths",
        "excludes": "excludePaths",
        "allowBackwardCrawling": "allowBackwardLinks",
        "allowExternalContentLinks": "allowExternalLinks",
    }

    # Work on a copy so the caller's dict (which is also what the loader
    # keeps in ``self.params``) retains its original contents.
    adapted = dict(params or {})

    if any(adapted.get(key) for key in (*renamed_keys, "pageOptions")):
        warnings.warn(
            "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
            DeprecationWarning,
        )

    for old_key, new_key in renamed_keys.items():
        if old_key not in adapted:
            continue
        value = adapted.pop(old_key)
        # Forward any truthy value: a list of path patterns is truthy but is
        # never the literal ``True``, so an identity check against ``True``
        # would silently discard it.
        if value:
            adapted[new_key] = value

    if "pageOptions" in adapted:
        page_options = adapted.pop("pageOptions")
        if isinstance(page_options, dict):
            # Hand over a copy so the nested dict owned by the caller is not
            # touched by the scrape-options conversion.
            adapted["scrapeOptions"] = self.legacy_scrape_options_adapter(
                dict(page_options)
            )

    return adapted
|
||||||
|
|
||||||
|
def legacy_scrape_options_adapter(self, params: dict) -> dict:
    """Translate legacy (pre-v1) Firecrawl scrape options to v1 names.

    Args:
        params: Scrape options as supplied by the caller; may mix legacy and
            current keys. The dict is not modified. ``None`` is treated as
            an empty dict.

    Returns:
        A new dict in which:

        * ``extractorOptions`` with an LLM extraction ``mode`` is replaced
          by top-level ``prompt`` / ``schema`` entries;
        * the boolean flags ``includeMarkdown``, ``includeHtml``,
          ``includeRawHtml``, ``includeExtract``, ``includeLinks``,
          ``screenshot`` and ``fullPageScreenshot`` are removed and folded
          into a ``formats`` list (only set when the caller did not supply
          ``formats``; ``"markdown"`` is included unless
          ``includeMarkdown`` is ``False``);
        * ``onlyIncludeTags`` / ``removeTags`` are renamed to
          ``includeTags`` / ``excludeTags``.

    Warns:
        DeprecationWarning: If an LLM extraction mode was requested through
            ``extractorOptions`` or any legacy scrape key has a truthy
            value.
    """
    llm_modes = (
        "llm-extraction",
        "llm-extraction-from-raw-html",
        "llm-extraction-from-markdown",
    )
    # Legacy boolean flag -> entry appended to ``formats`` when the flag is True.
    format_flags = {
        "includeHtml": "html",
        "includeRawHtml": "rawHtml",
        "includeExtract": "extract",
        "includeLinks": "links",
        "screenshot": "screenshot",
        "fullPageScreenshot": "screenshot@fullPage",
    }
    renamed_keys = {
        "onlyIncludeTags": "includeTags",
        "removeTags": "excludeTags",
    }
    default_prompt = "Extract page information based on the schema."

    # Work on a copy so the caller's dict is left untouched.
    adapted = dict(params or {})
    use_legacy_options = False
    formats = ["markdown"]

    extractor = adapted.get("extractorOptions")
    if isinstance(extractor, dict) and extractor.get("mode") in llm_modes:
        use_legacy_options = True
        # NOTE(review): prompt/schema are written at the top level of the
        # options, as before - confirm against the Firecrawl v1 scrape API
        # whether they belong under a nested ``extract`` key instead.
        if "extractionPrompt" in extractor:
            # An empty prompt falls back to the generic default; looking the
            # key up with a ``.get`` default cannot do that because the key
            # is already known to be present.
            adapted["prompt"] = extractor["extractionPrompt"] or default_prompt
        if extractor.get("extractionSchema"):
            adapted["schema"] = extractor["extractionSchema"]
        if extractor.get("userPrompt"):
            # A user prompt takes precedence over the extraction prompt.
            adapted["prompt"] = extractor["userPrompt"]
        # NOTE(review): extractorOptions is removed only when an LLM mode was
        # recognised - confirm other modes should be forwarded untouched.
        del adapted["extractorOptions"]

    legacy_scrape_keys = ("includeMarkdown", *format_flags, *renamed_keys)
    if any(adapted.get(key) for key in legacy_scrape_keys):
        use_legacy_options = True

    if use_legacy_options:
        warnings.warn(
            "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
            DeprecationWarning,
        )

    if "includeMarkdown" in adapted:
        if adapted.pop("includeMarkdown") is False:
            formats.remove("markdown")

    for flag, format_name in format_flags.items():
        if flag in adapted and adapted.pop(flag) is True:
            formats.append(format_name)

    for old_key, new_key in renamed_keys.items():
        if old_key not in adapted:
            continue
        value = adapted.pop(old_key)
        # Forward any truthy value: a list of tags is truthy but is never
        # the literal ``True``, so an identity check would drop it.
        if value:
            adapted[new_key] = value

    # An explicit ``formats`` from the caller wins over the derived list.
    if "formats" not in adapted:
        adapted["formats"] = formats

    return adapted
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
*,
|
*,
|
||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
api_url: Optional[str] = None,
|
api_url: Optional[str] = None,
|
||||||
mode: Literal["crawl", "scrape"] = "crawl",
|
mode: Literal["crawl", "scrape", "map"] = "crawl",
|
||||||
params: Optional[dict] = None,
|
params: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with API key and url.
|
"""Initialize with API key and url.
|
||||||
@ -82,8 +238,9 @@ class FireCrawlLoader(BaseLoader):
|
|||||||
api_url: The Firecrawl API URL. If not specified will be read from env var
|
api_url: The Firecrawl API URL. If not specified will be read from env var
|
||||||
FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev.
|
FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev.
|
||||||
mode: The mode to run the loader in. Default is "crawl".
|
mode: The mode to run the loader in. Default is "crawl".
|
||||||
Options include "scrape" (single url) and
|
Options include "scrape" (single url),
|
||||||
"crawl" (all accessible sub pages).
|
"crawl" (all accessible sub pages),
|
||||||
|
"map" (returns list of links that are semantically related).
|
||||||
params: The parameters to pass to the Firecrawl API.
|
params: The parameters to pass to the Firecrawl API.
|
||||||
Examples include crawlerOptions.
|
Examples include crawlerOptions.
|
||||||
For more details, visit: https://github.com/mendableai/firecrawl-py
|
For more details, visit: https://github.com/mendableai/firecrawl-py
|
||||||
@ -95,30 +252,58 @@ class FireCrawlLoader(BaseLoader):
|
|||||||
raise ImportError(
|
raise ImportError(
|
||||||
"`firecrawl` package not found, please run `pip install firecrawl-py`"
|
"`firecrawl` package not found, please run `pip install firecrawl-py`"
|
||||||
)
|
)
|
||||||
if mode not in ("crawl", "scrape"):
|
if mode not in ("crawl", "scrape", "search", "map"):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Unrecognized mode '{mode}'. Expected one of 'crawl', 'scrape'."
|
f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if not url:
|
||||||
|
raise ValueError("Url must be provided")
|
||||||
|
|
||||||
api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
|
api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
|
||||||
self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
|
self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
|
||||||
self.url = url
|
self.url = url
|
||||||
self.mode = mode
|
self.mode = mode
|
||||||
self.params = params
|
self.params = params or {}
|
||||||
|
|
||||||
def lazy_load(self) -> Iterator[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
if self.mode == "scrape":
|
if self.mode == "scrape":
|
||||||
firecrawl_docs = [self.firecrawl.scrape_url(self.url, params=self.params)]
|
firecrawl_docs = [
|
||||||
|
self.firecrawl.scrape_url(
|
||||||
|
self.url, params=self.legacy_scrape_options_adapter(self.params)
|
||||||
|
)
|
||||||
|
]
|
||||||
elif self.mode == "crawl":
|
elif self.mode == "crawl":
|
||||||
firecrawl_docs = self.firecrawl.crawl_url(self.url, params=self.params)
|
if not self.url:
|
||||||
|
raise ValueError("URL is required for crawl mode")
|
||||||
|
crawl_response = self.firecrawl.crawl_url(
|
||||||
|
self.url, params=self.legacy_crawler_options_adapter(self.params)
|
||||||
|
)
|
||||||
|
firecrawl_docs = crawl_response.get("data", [])
|
||||||
|
elif self.mode == "map":
|
||||||
|
if not self.url:
|
||||||
|
raise ValueError("URL is required for map mode")
|
||||||
|
firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
|
||||||
|
elif self.mode == "search":
|
||||||
|
raise ValueError(
|
||||||
|
"Search mode is not supported in this version, please downgrade."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Unrecognized mode '{self.mode}'. Expected one of 'crawl', 'scrape'."
|
f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
|
||||||
)
|
)
|
||||||
for doc in firecrawl_docs:
|
for doc in firecrawl_docs:
|
||||||
|
if self.mode == "map":
|
||||||
|
page_content = doc
|
||||||
|
metadata = {}
|
||||||
|
else:
|
||||||
|
page_content = (
|
||||||
|
doc.get("markdown") or doc.get("html") or doc.get("rawHtml", "")
|
||||||
|
)
|
||||||
metadata = doc.get("metadata", {})
|
metadata = doc.get("metadata", {})
|
||||||
if (self.params is not None) and self.params.get(
|
if not page_content:
|
||||||
"extractorOptions", {}
|
continue
|
||||||
).get("mode") == "llm-extraction":
|
yield Document(
|
||||||
metadata["llm_extraction"] = doc.get("llm_extraction")
|
page_content=page_content,
|
||||||
|
metadata=metadata,
|
||||||
yield Document(page_content=doc.get("markdown", ""), metadata=metadata)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user