From c113682328fff0c80217e96e82c86ee2589c7d21 Mon Sep 17 00:00:00 2001 From: AmosDinh <39965380+AmosDinh@users.noreply.github.com> Date: Sun, 28 Jul 2024 20:30:36 +0200 Subject: [PATCH] community:Add support for specifying document_loaders.firecrawl api url. (#24747) community: Add support for specifying the document_loaders.firecrawl API URL. Add support for specifying the document_loaders.firecrawl API URL. This is mainly to support the [self-hosting](https://github.com/mendableai/firecrawl/blob/main/SELF_HOST.md) option Firecrawl provides. E.g., I can now specify localhost:... as the API URL. The corresponding Firecrawl class already accepts this argument. See here: https://github.com/mendableai/firecrawl/blob/4c9d62f6d3c6cb7bd13590a8149e70dd81d8e282/apps/python-sdk/firecrawl/firecrawl.py#L29 --------- Co-authored-by: Chester Curme --- .../langchain_community/document_loaders/firecrawl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/document_loaders/firecrawl.py b/libs/community/langchain_community/document_loaders/firecrawl.py index da87466d3de..3ff3bb3b1e9 100644 --- a/libs/community/langchain_community/document_loaders/firecrawl.py +++ b/libs/community/langchain_community/document_loaders/firecrawl.py @@ -17,6 +17,7 @@ class FireCrawlLoader(BaseLoader): url: str, *, api_key: Optional[str] = None, + api_url: Optional[str] = None, mode: Literal["crawl", "scrape"] = "crawl", params: Optional[dict] = None, ): @@ -26,6 +27,8 @@ class FireCrawlLoader(BaseLoader): url: The url to be crawled. api_key: The Firecrawl API key. If not specified will be read from env var FIRECRAWL_API_KEY. Get an API key + api_url: The Firecrawl API URL. If not specified will be read from env var + FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev. mode: The mode to run the loader in. Default is "crawl". Options include "scrape" (single url) and "crawl" (all accessible sub pages). 
@@ -45,7 +48,7 @@ class FireCrawlLoader(BaseLoader): f"Unrecognized mode '{mode}'. Expected one of 'crawl', 'scrape'." ) api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY") - self.firecrawl = FirecrawlApp(api_key=api_key) + self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url) self.url = url self.mode = mode self.params = params