From d8510270eeb7233ac68a759eabfe059e3588fdf6 Mon Sep 17 00:00:00 2001
From: Bae-ChangHyun <48899047+Bae-ChangHyun@users.noreply.github.com>
Date: Tue, 18 Mar 2025 00:15:57 +0900
Subject: [PATCH] community: add 'extract' mode to FireCrawlLoader for
 structured data extraction (#30242)

**Description:** Added an 'extract' mode to FireCrawlLoader that enables
structured data extraction from web pages. This feature allows users to
extract structured data from a single URL, or from entire websites, using
Large Language Models (LLMs). Additional parameters and usage details are
documented in the [Firecrawl docs](https://docs.firecrawl.dev/features/extract-beta).
Note that extraction currently works on only one URL per call (a limitation
of Firecrawl's extract method); see the usage sketch after the patch.

**Dependencies:** No new dependencies required. Uses existing FireCrawl API
capabilities.

---------

Co-authored-by: chbae
Co-authored-by: ccurme
---
 .../document_loaders/firecrawl.py             |  19 +++-
 .../document_loaders/test_firecrawl.py        | 100 ++++++++++++++++++
 2 files changed, 114 insertions(+), 5 deletions(-)
 create mode 100644 libs/community/tests/unit_tests/document_loaders/test_firecrawl.py

diff --git a/libs/community/langchain_community/document_loaders/firecrawl.py b/libs/community/langchain_community/document_loaders/firecrawl.py
index 4423881dfef..12fdbbd5609 100644
--- a/libs/community/langchain_community/document_loaders/firecrawl.py
+++ b/libs/community/langchain_community/document_loaders/firecrawl.py
@@ -226,7 +226,7 @@ class FireCrawlLoader(BaseLoader):
         *,
         api_key: Optional[str] = None,
         api_url: Optional[str] = None,
-        mode: Literal["crawl", "scrape", "map"] = "crawl",
+        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
         params: Optional[dict] = None,
     ):
         """Initialize with API key and url.
@@ -241,6 +241,7 @@ class FireCrawlLoader(BaseLoader):
                 Options include "scrape" (single url),
                 "crawl" (all accessible sub pages),
                 "map" (returns list of links that are semantically related).
+                "extract" (extracts structured data from a page).
             params: The parameters to pass to the Firecrawl API.
                 Examples include crawlerOptions.
                 For more details, visit: https://github.com/mendableai/firecrawl-py
@@ -252,9 +253,10 @@ class FireCrawlLoader(BaseLoader):
             raise ImportError(
                 "`firecrawl` package not found, please run `pip install firecrawl-py`"
             )
-        if mode not in ("crawl", "scrape", "search", "map"):
+        if mode not in ("crawl", "scrape", "search", "map", "extract"):
             raise ValueError(
-                f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
+                f"""Invalid mode '{mode}'.
+                Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
             )

         if not url:
@@ -284,16 +286,23 @@ class FireCrawlLoader(BaseLoader):
             if not self.url:
                 raise ValueError("URL is required for map mode")
             firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
+        elif self.mode == "extract":
+            if not self.url:
+                raise ValueError("URL is required for extract mode")
+            firecrawl_docs = [
+                str(self.firecrawl.extract([self.url], params=self.params))
+            ]
         elif self.mode == "search":
             raise ValueError(
                 "Search mode is not supported in this version, please downgrade."
             )
         else:
             raise ValueError(
-                f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
+                f"""Invalid mode '{self.mode}'.
+                Allowed: 'crawl', 'scrape', 'map', 'extract'."""
             )
         for doc in firecrawl_docs:
-            if self.mode == "map":
+            if self.mode == "map" or self.mode == "extract":
                 page_content = doc
                 metadata = {}
             else:
diff --git a/libs/community/tests/unit_tests/document_loaders/test_firecrawl.py b/libs/community/tests/unit_tests/document_loaders/test_firecrawl.py
new file mode 100644
index 00000000000..11f285498a9
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/test_firecrawl.py
@@ -0,0 +1,100 @@
+"""Test FireCrawlLoader."""
+
+import sys
+from typing import Generator, List, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders import FireCrawlLoader
+
+
+# Mock the firecrawl module and register it in sys.modules
+@pytest.fixture(autouse=True)
+def mock_firecrawl() -> Generator[Tuple[MagicMock, MagicMock], None, None]:
+    """Mock firecrawl module for all tests."""
+    mock_module = MagicMock()
+    mock_client = MagicMock()
+    # Stand in for the FirecrawlApp class
+    mock_module.FirecrawlApp.return_value = mock_client
+
+    # Set the return value of the extract method
+    response_dict = {
+        "success": True,
+        "data": {
+            "title": "extracted title",
+            "main contents": "extracted main contents",
+        },
+        "status": "completed",
+        "expiresAt": "2025-03-12T12:42:09.000Z",
+    }
+    mock_client.extract.return_value = response_dict
+
+    # Insert the mock module into sys.modules
+    sys.modules["firecrawl"] = mock_module
+    yield mock_module, mock_client  # yield so tests can access the mocks
+
+    # Clean up after the test
+    if "firecrawl" in sys.modules:
+        del sys.modules["firecrawl"]
+
+
+class TestFireCrawlLoader:
+    """Test FireCrawlLoader."""
+
+    def test_load_extract_mode(
+        self, mock_firecrawl: Tuple[MagicMock, MagicMock]
+    ) -> List[Document]:
+        """Test loading in extract mode."""
+        # Get the mocked client from the fixture
+        _, mock_client = mock_firecrawl
+
+        params = {
+            "prompt": "extract the title and main contents (write your own prompt here)",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string"},
+                    "main contents": {"type": "string"},
+                },
+                "required": ["title", "main contents"],
+            },
+            "enableWebSearch": False,
+            "ignoreSitemap": False,
+            "showSources": False,
+            "scrapeOptions": {
+                "formats": ["markdown"],
+                "onlyMainContent": True,
+                "headers": {},
+                "waitFor": 0,
+                "mobile": False,
+                "skipTlsVerification": False,
+                "timeout": 30000,
+                "removeBase64Images": True,
+                "blockAds": True,
+                "proxy": "basic",
+            },
+        }
+
+        # Create the FireCrawlLoader instance and run it
+        loader = FireCrawlLoader(
+            url="https://example.com", api_key="fake-key", mode="extract", params=params
+        )
+        docs = list(loader.lazy_load())  # call the lazy_load method
+
+        # Verify the results
+        assert len(docs) == 1
+        assert isinstance(docs[0].page_content, str)
+
+        # Verify that extract was called with the correct arguments
+        mock_client.extract.assert_called_once_with(
+            ["https://example.com"], params=params
+        )
+
+        # The response is stringified, so check that each field appears in it
+        assert "extracted title" in docs[0].page_content
+        assert "extracted main contents" in docs[0].page_content
+        assert "success" in docs[0].page_content
+
+        return docs
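
For reference, here is a minimal usage sketch of the new mode, adapted from the unit test above. The URL, API key, prompt, and schema are illustrative placeholders; the full set of `extract` parameters is documented at the Firecrawl link in the description.

```python
from langchain_community.document_loaders import FireCrawlLoader

# Illustrative values only; a real Firecrawl API key is required.
loader = FireCrawlLoader(
    url="https://example.com",
    api_key="fc-...",  # placeholder key
    mode="extract",
    params={
        "prompt": "Extract the title and main contents of the page.",
        "schema": {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "main contents": {"type": "string"},
            },
            "required": ["title", "main contents"],
        },
    },
)

# Extract mode yields a single Document whose page_content is the
# stringified extract response (see the lazy_load changes above).
docs = loader.load()
print(docs[0].page_content)
```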