mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-04 14:48:07 +00:00
community: add 'extract' mode to FireCrawlLoader for structured data extraction (#30242)
**Description:** Added an 'extract' mode to FireCrawlLoader that enables structured data extraction from web pages. This feature allows users to extract structured data from a single URL, or from entire websites, using Large Language Models (LLMs). More parameters and usage examples are available in the [firecrawl docs](https://docs.firecrawl.dev/features/extract-beta). Currently only one URL can be extracted at a time (this depends on firecrawl's extract method). **Dependencies:** No new dependencies required. Uses existing FireCrawl API capabilities. --------- Co-authored-by: chbae <chbae@gcsc.co.kr> Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent
747efa16ec
commit
d8510270ee
libs/community
@ -226,7 +226,7 @@ class FireCrawlLoader(BaseLoader):
|
||||
*,
|
||||
api_key: Optional[str] = None,
|
||||
api_url: Optional[str] = None,
|
||||
mode: Literal["crawl", "scrape", "map"] = "crawl",
|
||||
mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
|
||||
params: Optional[dict] = None,
|
||||
):
|
||||
"""Initialize with API key and url.
|
||||
@ -241,6 +241,7 @@ class FireCrawlLoader(BaseLoader):
|
||||
Options include "scrape" (single url),
|
||||
"crawl" (all accessible sub pages),
|
||||
"map" (returns list of links that are semantically related).
|
||||
"extract" (extracts structured data from a page).
|
||||
params: The parameters to pass to the Firecrawl API.
|
||||
Examples include crawlerOptions.
|
||||
For more details, visit: https://github.com/mendableai/firecrawl-py
|
||||
@ -252,9 +253,10 @@ class FireCrawlLoader(BaseLoader):
|
||||
raise ImportError(
|
||||
"`firecrawl` package not found, please run `pip install firecrawl-py`"
|
||||
)
|
||||
if mode not in ("crawl", "scrape", "search", "map"):
|
||||
if mode not in ("crawl", "scrape", "search", "map", "extract"):
|
||||
raise ValueError(
|
||||
f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
|
||||
f"""Invalid mode '{mode}'.
|
||||
Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
|
||||
)
|
||||
|
||||
if not url:
|
||||
@ -284,16 +286,23 @@ class FireCrawlLoader(BaseLoader):
|
||||
if not self.url:
|
||||
raise ValueError("URL is required for map mode")
|
||||
firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
|
||||
elif self.mode == "extract":
|
||||
if not self.url:
|
||||
raise ValueError("URL is required for extract mode")
|
||||
firecrawl_docs = [
|
||||
str(self.firecrawl.extract([self.url], params=self.params))
|
||||
]
|
||||
elif self.mode == "search":
|
||||
raise ValueError(
|
||||
"Search mode is not supported in this version, please downgrade."
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
|
||||
f"""Invalid mode '{self.mode}'.
|
||||
Allowed: 'crawl', 'scrape', 'map', 'extract'."""
|
||||
)
|
||||
for doc in firecrawl_docs:
|
||||
if self.mode == "map":
|
||||
if self.mode == "map" or self.mode == "extract":
|
||||
page_content = doc
|
||||
metadata = {}
|
||||
else:
|
||||
|
@ -0,0 +1,100 @@
|
||||
"""Test FireCrawlLoader."""
|
||||
|
||||
import sys
|
||||
from typing import Generator, List, Tuple
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders import FireCrawlLoader
|
||||
|
||||
|
||||
# Mock the ``firecrawl`` module and register it in ``sys.modules`` so the
# loader can be imported and exercised without the real package installed.
@pytest.fixture(autouse=True)
def mock_firecrawl() -> Generator[Tuple[MagicMock, MagicMock], None, None]:
    """Mock firecrawl module for all tests.

    Yields:
        A ``(mock_module, mock_client)`` pair so tests can inspect calls
        made against the mocked ``FirecrawlApp`` client.
    """
    mock_module = MagicMock()
    mock_client = MagicMock()
    # The loader instantiates ``firecrawl.FirecrawlApp``; hand back our client.
    mock_module.FirecrawlApp.return_value = mock_client

    # Canned return value for the ``extract`` API call.
    response_dict = {
        "success": True,
        "data": {
            "title": "extracted title",
            "main contents": "extracted main contents",
        },
        "status": "completed",
        "expiresAt": "2025-03-12T12:42:09.000Z",
    }
    mock_client.extract.return_value = response_dict

    # Insert the mock so ``import firecrawl`` resolves to it during the test.
    sys.modules["firecrawl"] = mock_module
    yield mock_module, mock_client  # expose the mocks to tests that need them

    # Clean up after each test.
    if "firecrawl" in sys.modules:
        del sys.modules["firecrawl"]
|
||||
|
||||
|
||||
class TestFireCrawlLoader:
    """Test FireCrawlLoader."""

    def test_load_extract_mode(
        self, mock_firecrawl: Tuple[MagicMock, MagicMock]
    ) -> None:
        """Test loading in extract mode.

        Verifies that extract mode yields a single stringified document and
        that the mocked ``extract`` endpoint is called with the expected
        arguments.
        """
        # Grab the mocked client produced by the autouse fixture.
        _, mock_client = mock_firecrawl

        params = {
            "prompt": "extract the title and main contents(write your own prompt here)",
            "schema": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "main contents": {"type": "string"},
                },
                "required": ["title", "main contents"],
            },
            "enableWebSearch": False,
            "ignoreSitemap": False,
            "showSources": False,
            "scrapeOptions": {
                "formats": ["markdown"],
                "onlyMainContent": True,
                "headers": {},
                "waitFor": 0,
                "mobile": False,
                "skipTlsVerification": False,
                "timeout": 30000,
                "removeBase64Images": True,
                "blockAds": True,
                "proxy": "basic",
            },
        }

        # Build a FireCrawlLoader instance and run it against the mocked API.
        loader = FireCrawlLoader(
            url="https://example.com", api_key="fake-key", mode="extract", params=params
        )
        docs = list(loader.lazy_load())  # drive the lazy_load generator

        # Extract mode yields exactly one document whose content is a string.
        assert len(docs) == 1
        assert isinstance(docs[0].page_content, str)

        # The extract endpoint must be called once, with a list of URLs.
        mock_client.extract.assert_called_once_with(
            ["https://example.com"], params=params
        )

        # The response dict is stringified, so each field from the mocked
        # response should appear in the page content.
        assert "extracted title" in docs[0].page_content
        assert "extracted main contents" in docs[0].page_content
        assert "success" in docs[0].page_content
|
Loading…
Reference in New Issue
Block a user