1
0
mirror of https://github.com/hwchase17/langchain.git synced 2025-05-04 14:48:07 +00:00

community: add 'extract' mode to FireCrawlLoader for structured data extraction ()

**Description:** 
Added an 'extract' mode to FireCrawlLoader that enables structured data
extraction from web pages. This feature allows users to extract
structured data from a single URL, or from entire websites, using Large
Language Models (LLMs).
You can find more parameters and usage details in the [firecrawl
docs](https://docs.firecrawl.dev/features/extract-beta).
For now, only a single URL can be extracted per call (this depends on
firecrawl's extract method).

**Dependencies:** 
No new dependencies required. Uses existing FireCrawl API capabilities.

---------

Co-authored-by: chbae <chbae@gcsc.co.kr>
Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
Bae-ChangHyun 2025-03-18 00:15:57 +09:00 committed by GitHub
parent 747efa16ec
commit d8510270ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 114 additions and 5 deletions
libs/community
langchain_community/document_loaders
tests/unit_tests/document_loaders

View File

@ -226,7 +226,7 @@ class FireCrawlLoader(BaseLoader):
*,
api_key: Optional[str] = None,
api_url: Optional[str] = None,
mode: Literal["crawl", "scrape", "map"] = "crawl",
mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
params: Optional[dict] = None,
):
"""Initialize with API key and url.
@ -241,6 +241,7 @@ class FireCrawlLoader(BaseLoader):
Options include "scrape" (single url),
"crawl" (all accessible sub pages),
"map" (returns list of links that are semantically related).
"extract" (extracts structured data from a page).
params: The parameters to pass to the Firecrawl API.
Examples include crawlerOptions.
For more details, visit: https://github.com/mendableai/firecrawl-py
@ -252,9 +253,10 @@ class FireCrawlLoader(BaseLoader):
raise ImportError(
"`firecrawl` package not found, please run `pip install firecrawl-py`"
)
if mode not in ("crawl", "scrape", "search", "map"):
if mode not in ("crawl", "scrape", "search", "map", "extract"):
raise ValueError(
f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
f"""Invalid mode '{mode}'.
Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
)
if not url:
@ -284,16 +286,23 @@ class FireCrawlLoader(BaseLoader):
if not self.url:
raise ValueError("URL is required for map mode")
firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
elif self.mode == "extract":
if not self.url:
raise ValueError("URL is required for extract mode")
firecrawl_docs = [
str(self.firecrawl.extract([self.url], params=self.params))
]
elif self.mode == "search":
raise ValueError(
"Search mode is not supported in this version, please downgrade."
)
else:
raise ValueError(
f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
f"""Invalid mode '{self.mode}'.
Allowed: 'crawl', 'scrape', 'map', 'extract'."""
)
for doc in firecrawl_docs:
if self.mode == "map":
if self.mode == "map" or self.mode == "extract":
page_content = doc
metadata = {}
else:

View File

@ -0,0 +1,100 @@
"""Test FireCrawlLoader."""
import sys
from typing import Generator, List, Tuple
from unittest.mock import MagicMock
import pytest
from langchain_core.documents import Document
from langchain_community.document_loaders import FireCrawlLoader
# Mock the firecrawl module and register it in sys.modules
@pytest.fixture(autouse=True)
def mock_firecrawl() -> Generator[Tuple[MagicMock, MagicMock], None, None]:
    """Install a fake ``firecrawl`` module in ``sys.modules`` for every test.

    Any ``FirecrawlApp(...)`` call on the fake module returns a shared mock
    client whose ``extract`` method returns a canned, successful response.
    On teardown the previous ``sys.modules["firecrawl"]`` entry (if any) is
    restored, so a real ``firecrawl`` import is not lost for later tests.

    Yields:
        A ``(mock_module, mock_client)`` tuple so tests can inspect calls.
    """
    mock_module = MagicMock()
    mock_client = MagicMock()
    # Constructing FirecrawlApp on the fake module hands back the mock client.
    mock_module.FirecrawlApp.return_value = mock_client
    # Canned return value for the extract method.
    mock_client.extract.return_value = {
        "success": True,
        "data": {
            "title": "extracted title",
            "main contents": "extracted main contents",
        },
        "status": "completed",
        "expiresAt": "2025-03-12T12:42:09.000Z",
    }

    # Remember whatever was registered before so teardown can put it back;
    # a sentinel distinguishes "absent" from any real module object.
    missing = object()
    previous = sys.modules.get("firecrawl", missing)
    # Insert the fake module into sys.modules.
    sys.modules["firecrawl"] = mock_module
    try:
        # Yield both mocks so tests can access them when needed.
        yield mock_module, mock_client
    finally:
        # Clean up after the test: restore the prior state exactly.
        if previous is missing:
            sys.modules.pop("firecrawl", None)
        else:
            sys.modules["firecrawl"] = previous
class TestFireCrawlLoader:
    """Test FireCrawlLoader."""

    def test_load_extract_mode(
        self, mock_firecrawl: Tuple[MagicMock, MagicMock]
    ) -> None:
        """Test loading in extract mode.

        Checks that the loader calls ``extract`` once with the URL wrapped in
        a list and the given params, and that the stringified response ends
        up as the single document's page content.
        """
        # Grab the mocked client from the fixture.
        _, mock_client = mock_firecrawl
        params = {
            "prompt": "extract the title and main contents(write your own prompt here)",
            "schema": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "main contents": {"type": "string"},
                },
                "required": ["title", "main contents"],
            },
            "enableWebSearch": False,
            "ignoreSitemap": False,
            "showSources": False,
            "scrapeOptions": {
                "formats": ["markdown"],
                "onlyMainContent": True,
                "headers": {},
                "waitFor": 0,
                "mobile": False,
                "skipTlsVerification": False,
                "timeout": 30000,
                "removeBase64Images": True,
                "blockAds": True,
                "proxy": "basic",
            },
        }

        # Create the FireCrawlLoader instance and run it.
        loader = FireCrawlLoader(
            url="https://example.com", api_key="fake-key", mode="extract", params=params
        )
        docs: List[Document] = list(loader.lazy_load())

        # Verify the shape of the result.
        assert len(docs) == 1
        assert isinstance(docs[0], Document)
        assert isinstance(docs[0].page_content, str)

        # Verify that extract was called with the expected arguments.
        mock_client.extract.assert_called_once_with(
            ["https://example.com"], params=params
        )

        # The response is converted to a string, so each field should appear
        # somewhere in the page content.
        assert "extracted title" in docs[0].page_content
        assert "extracted main contents" in docs[0].page_content
        assert "success" in docs[0].page_content
        # Intentionally no return value: pytest test functions must return None.