community: add 'extract' mode to FireCrawlLoader for structured data extraction (#30242)

**Description:** 
Added an 'extract' mode to FireCrawlLoader that enables structured data
extraction from web pages. This feature lets users extract structured
data from a single URL or from entire websites using Large Language
Models (LLMs).
More parameters and usage details are available in the [firecrawl
docs](https://docs.firecrawl.dev/features/extract-beta).
Currently the loader extracts from only one URL at a time (this depends
on firecrawl's extract method).
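
A minimal usage sketch (the URL, prompt, and schema fields below are illustrative placeholders; the loader arguments match the diff in this commit):

```python
from langchain_community.document_loaders import FireCrawlLoader

# Illustrative prompt and schema; adapt the fields to the data you want extracted.
params = {
    "prompt": "Extract the page title and main contents.",
    "schema": {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "main_contents": {"type": "string"},
        },
        "required": ["title", "main_contents"],
    },
}

loader = FireCrawlLoader(
    url="https://example.com",  # placeholder URL
    api_key="fc-...",           # your Firecrawl API key
    mode="extract",
    params=params,
)

# Extract mode yields a single Document whose page_content is the
# stringified response from Firecrawl's extract endpoint.
docs = loader.load()
print(docs[0].page_content)
```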

**Dependencies:** 
No new dependencies required. Uses existing FireCrawl API capabilities.

---------

Co-authored-by: chbae <chbae@gcsc.co.kr>
Co-authored-by: ccurme <chester.curme@gmail.com>
Bae-ChangHyun authored 2025-03-18 00:15:57 +09:00; committed by GitHub
parent 747efa16ec
commit d8510270ee
2 changed files with 114 additions and 5 deletions


@@ -226,7 +226,7 @@ class FireCrawlLoader(BaseLoader):
         *,
         api_key: Optional[str] = None,
         api_url: Optional[str] = None,
-        mode: Literal["crawl", "scrape", "map"] = "crawl",
+        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
         params: Optional[dict] = None,
     ):
         """Initialize with API key and url.
@@ -241,6 +241,7 @@ class FireCrawlLoader(BaseLoader):
                 Options include "scrape" (single url),
                 "crawl" (all accessible sub pages),
                 "map" (returns list of links that are semantically related).
+                "extract" (extracts structured data from a page).
             params: The parameters to pass to the Firecrawl API.
                 Examples include crawlerOptions.
                 For more details, visit: https://github.com/mendableai/firecrawl-py
@@ -252,9 +253,10 @@ class FireCrawlLoader(BaseLoader):
             raise ImportError(
                 "`firecrawl` package not found, please run `pip install firecrawl-py`"
             )
-        if mode not in ("crawl", "scrape", "search", "map"):
+        if mode not in ("crawl", "scrape", "search", "map", "extract"):
             raise ValueError(
-                f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
+                f"""Invalid mode '{mode}'.
+                Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
             )
 
         if not url:
@@ -284,16 +286,23 @@ class FireCrawlLoader(BaseLoader):
             if not self.url:
                 raise ValueError("URL is required for map mode")
             firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
+        elif self.mode == "extract":
+            if not self.url:
+                raise ValueError("URL is required for extract mode")
+            firecrawl_docs = [
+                str(self.firecrawl.extract([self.url], params=self.params))
+            ]
         elif self.mode == "search":
             raise ValueError(
                 "Search mode is not supported in this version, please downgrade."
             )
         else:
             raise ValueError(
-                f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
+                f"""Invalid mode '{self.mode}'.
+                Allowed: 'crawl', 'scrape', 'map', 'extract'."""
             )
         for doc in firecrawl_docs:
-            if self.mode == "map":
+            if self.mode == "map" or self.mode == "extract":
                 page_content = doc
                 metadata = {}
             else:
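
One consequence of wrapping the response in `str(...)`: the resulting `page_content` is the repr of a Python dict rather than JSON, so `json.loads` will generally fail on it. A minimal sketch of recovering the structured fields from a loaded document, assuming the response contains only literal values (dicts, lists, strings, numbers, booleans):

```python
import ast

# page_content looks like "{'success': True, 'data': {'title': ...}, ...}"
response = ast.literal_eval(docs[0].page_content)  # parse the dict repr
extracted = response.get("data", {})  # the structured fields requested in the schema
```

The second changed file adds a unit test exercising the new mode: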


@@ -0,0 +1,100 @@
+"""Test FireCrawlLoader."""
+
+import sys
+from typing import Generator, List, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders import FireCrawlLoader
+
+
+# Mock the firecrawl module and register it in sys.modules
+@pytest.fixture(autouse=True)
+def mock_firecrawl() -> Generator[Tuple[MagicMock, MagicMock], None, None]:
+    """Mock firecrawl module for all tests."""
+    mock_module = MagicMock()
+    mock_client = MagicMock()
+    # Mock the FirecrawlApp class
+    mock_module.FirecrawlApp.return_value = mock_client
+
+    # Configure the return value of the extract method
+    response_dict = {
+        "success": True,
+        "data": {
+            "title": "extracted title",
+            "main contents": "extracted main contents",
+        },
+        "status": "completed",
+        "expiresAt": "2025-03-12T12:42:09.000Z",
+    }
+    mock_client.extract.return_value = response_dict
+
+    # Insert the mock module into sys.modules
+    sys.modules["firecrawl"] = mock_module
+    yield mock_module, mock_client  # yield so tests can access the mocks if needed
+
+    # Clean up after each test
+    if "firecrawl" in sys.modules:
+        del sys.modules["firecrawl"]
+
+
+class TestFireCrawlLoader:
+    """Test FireCrawlLoader."""
+
+    def test_load_extract_mode(
+        self, mock_firecrawl: Tuple[MagicMock, MagicMock]
+    ) -> List[Document]:
+        """Test loading in extract mode."""
+        # Grab the mocked objects from the fixture
+        _, mock_client = mock_firecrawl
+
+        params = {
+            "prompt": "extract the title and main contents(write your own prompt here)",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string"},
+                    "main contents": {"type": "string"},
+                },
+                "required": ["title", "main contents"],
+            },
+            "enableWebSearch": False,
+            "ignoreSitemap": False,
+            "showSources": False,
+            "scrapeOptions": {
+                "formats": ["markdown"],
+                "onlyMainContent": True,
+                "headers": {},
+                "waitFor": 0,
+                "mobile": False,
+                "skipTlsVerification": False,
+                "timeout": 30000,
+                "removeBase64Images": True,
+                "blockAds": True,
+                "proxy": "basic",
+            },
+        }
+
+        # Create a FireCrawlLoader instance and run it
+        loader = FireCrawlLoader(
+            url="https://example.com", api_key="fake-key", mode="extract", params=params
+        )
+        docs = list(loader.lazy_load())  # invoke the lazy_load method
+
+        # Verify the results
+        assert len(docs) == 1
+        assert isinstance(docs[0].page_content, str)
+
+        # Check that extract was called with the correct arguments
+        mock_client.extract.assert_called_once_with(
+            ["https://example.com"], params=params
+        )
+
+        # The response was converted to a string, so each field should
+        # appear as a substring of page_content
+        assert "extracted title" in docs[0].page_content
+        assert "extracted main contents" in docs[0].page_content
+        assert "success" in docs[0].page_content
+        return docs
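
An illustrative companion test (not part of this commit): the `if not url:` guard in the first diff suggests an empty URL is rejected with a `ValueError` at construction time, before the extract branch is ever reached. Inside `TestFireCrawlLoader` that could be pinned down like so:

```python
def test_extract_mode_requires_url(self) -> None:
    """Illustrative sketch: an empty URL should be rejected up front."""
    # The autouse mock_firecrawl fixture keeps the firecrawl import satisfied.
    with pytest.raises(ValueError):
        FireCrawlLoader(url="", api_key="fake-key", mode="extract")
```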