community: add 'extract' mode to FireCrawlLoader for structured data extraction (#30242)

**Description:** 
Added an 'extract' mode to FireCrawlLoader that enables structured data
extraction from web pages. This feature lets users extract structured
data from a single URL or from entire websites using Large Language
Models (LLMs).
More parameters and usage details are available in the [firecrawl
docs](https://docs.firecrawl.dev/features/extract-beta).
Currently the loader extracts from only one URL at a time (this depends
on firecrawl's extract method).
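
A minimal usage sketch (the URL, prompt, and schema fields below are illustrative placeholders; the loader arguments match the diff in this commit):

```python
from langchain_community.document_loaders import FireCrawlLoader

# Illustrative prompt and schema; adapt the fields to the data you want extracted.
params = {
    "prompt": "Extract the page title and main contents.",
    "schema": {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "main_contents": {"type": "string"},
        },
        "required": ["title", "main_contents"],
    },
}

loader = FireCrawlLoader(
    url="https://example.com",  # placeholder URL
    api_key="fc-...",           # your Firecrawl API key
    mode="extract",
    params=params,
)

# Extract mode yields a single Document whose page_content is the
# stringified response from Firecrawl's extract endpoint.
docs = loader.load()
print(docs[0].page_content)
```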

**Dependencies:** 
No new dependencies required. Uses existing FireCrawl API capabilities.

---------

Co-authored-by: chbae <chbae@gcsc.co.kr>
Co-authored-by: ccurme <chester.curme@gmail.com>
Bae-ChangHyun authored 2025-03-18 00:15:57 +09:00; committed by GitHub
parent 747efa16ec
commit d8510270ee
2 changed files with 114 additions and 5 deletions


@@ -226,7 +226,7 @@ class FireCrawlLoader(BaseLoader):
         *,
         api_key: Optional[str] = None,
         api_url: Optional[str] = None,
-        mode: Literal["crawl", "scrape", "map"] = "crawl",
+        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
         params: Optional[dict] = None,
     ):
         """Initialize with API key and url.
@@ -241,6 +241,7 @@ class FireCrawlLoader(BaseLoader):
                 Options include "scrape" (single url),
                 "crawl" (all accessible sub pages),
                 "map" (returns list of links that are semantically related).
+                "extract" (extracts structured data from a page).
             params: The parameters to pass to the Firecrawl API.
                 Examples include crawlerOptions.
                 For more details, visit: https://github.com/mendableai/firecrawl-py
@@ -252,9 +253,10 @@ class FireCrawlLoader(BaseLoader):
             raise ImportError(
                 "`firecrawl` package not found, please run `pip install firecrawl-py`"
             )
-        if mode not in ("crawl", "scrape", "search", "map"):
+        if mode not in ("crawl", "scrape", "search", "map", "extract"):
             raise ValueError(
-                f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
+                f"""Invalid mode '{mode}'.
+                Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
             )
 
         if not url:
@@ -284,16 +286,23 @@ class FireCrawlLoader(BaseLoader):
             if not self.url:
                 raise ValueError("URL is required for map mode")
             firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
+        elif self.mode == "extract":
+            if not self.url:
+                raise ValueError("URL is required for extract mode")
+            firecrawl_docs = [
+                str(self.firecrawl.extract([self.url], params=self.params))
+            ]
         elif self.mode == "search":
             raise ValueError(
                 "Search mode is not supported in this version, please downgrade."
             )
         else:
             raise ValueError(
-                f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
+                f"""Invalid mode '{self.mode}'.
+                Allowed: 'crawl', 'scrape', 'map', 'extract'."""
             )
         for doc in firecrawl_docs:
-            if self.mode == "map":
+            if self.mode == "map" or self.mode == "extract":
                 page_content = doc
                 metadata = {}
             else:
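
One consequence of wrapping the response in `str(...)`: the resulting `page_content` is the repr of a Python dict rather than JSON, so `json.loads` will generally fail on it. A minimal sketch of recovering the structured fields from a loaded document, assuming the response contains only literal values (dicts, lists, strings, numbers, booleans):

```python
import ast

# page_content looks like "{'success': True, 'data': {'title': ...}, ...}"
response = ast.literal_eval(docs[0].page_content)  # parse the dict repr
extracted = response.get("data", {})  # the structured fields requested in the schema
```

The second changed file adds a unit test exercising the new mode: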


@@ -0,0 +1,100 @@
+"""Test FireCrawlLoader."""
+
+import sys
+from typing import Generator, List, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders import FireCrawlLoader
+
+
+# Mock the firecrawl module and register it in sys.modules
+@pytest.fixture(autouse=True)
+def mock_firecrawl() -> Generator[Tuple[MagicMock, MagicMock], None, None]:
+    """Mock firecrawl module for all tests."""
+    mock_module = MagicMock()
+    mock_client = MagicMock()
+    # Mock the FirecrawlApp class
+    mock_module.FirecrawlApp.return_value = mock_client
+
+    # Configure the return value of the extract method
+    response_dict = {
+        "success": True,
+        "data": {
+            "title": "extracted title",
+            "main contents": "extracted main contents",
+        },
+        "status": "completed",
+        "expiresAt": "2025-03-12T12:42:09.000Z",
+    }
+    mock_client.extract.return_value = response_dict
+
+    # Insert the mock module into sys.modules
+    sys.modules["firecrawl"] = mock_module
+    yield mock_module, mock_client  # yield so tests can access the mocks if needed
+
+    # Clean up after each test
+    if "firecrawl" in sys.modules:
+        del sys.modules["firecrawl"]
+
+
+class TestFireCrawlLoader:
+    """Test FireCrawlLoader."""
+
+    def test_load_extract_mode(
+        self, mock_firecrawl: Tuple[MagicMock, MagicMock]
+    ) -> List[Document]:
+        """Test loading in extract mode."""
+        # Grab the mocked objects from the fixture
+        _, mock_client = mock_firecrawl
+
+        params = {
+            "prompt": "extract the title and main contents(write your own prompt here)",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string"},
+                    "main contents": {"type": "string"},
+                },
+                "required": ["title", "main contents"],
+            },
+            "enableWebSearch": False,
+            "ignoreSitemap": False,
+            "showSources": False,
+            "scrapeOptions": {
+                "formats": ["markdown"],
+                "onlyMainContent": True,
+                "headers": {},
+                "waitFor": 0,
+                "mobile": False,
+                "skipTlsVerification": False,
+                "timeout": 30000,
+                "removeBase64Images": True,
+                "blockAds": True,
+                "proxy": "basic",
+            },
+        }
+
+        # Create a FireCrawlLoader instance and run it
+        loader = FireCrawlLoader(
+            url="https://example.com", api_key="fake-key", mode="extract", params=params
+        )
+        docs = list(loader.lazy_load())  # invoke the lazy_load method
+
+        # Verify the results
+        assert len(docs) == 1
+        assert isinstance(docs[0].page_content, str)
+
+        # Check that extract was called with the correct arguments
+        mock_client.extract.assert_called_once_with(
+            ["https://example.com"], params=params
+        )
+
+        # The response was converted to a string, so each field should
+        # appear as a substring of page_content
+        assert "extracted title" in docs[0].page_content
+        assert "extracted main contents" in docs[0].page_content
+        assert "success" in docs[0].page_content
+        return docs
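
An illustrative companion test (not part of this commit): the `if not url:` guard in the first diff suggests an empty URL is rejected with a `ValueError` at construction time, before the extract branch is ever reached. Inside `TestFireCrawlLoader` that could be pinned down like so:

```python
def test_extract_mode_requires_url(self) -> None:
    """Illustrative sketch: an empty URL should be rejected up front."""
    # The autouse mock_firecrawl fixture keeps the firecrawl import satisfied.
    with pytest.raises(ValueError):
        FireCrawlLoader(url="", api_key="fake-key", mode="extract")
```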