langchain/libs/community/tests/unit_tests/document_loaders/test_firecrawl.py
Bae-ChangHyun d8510270ee
community: add 'extract' mode to FireCrawlLoader for structured data extraction (#30242)
**Description:** 
Added an 'extract' mode to FireCrawlLoader that enables structured data
extraction from web pages. This feature allows users to extract
structured data from a single URL, or from entire websites, using Large
Language Models (LLMs).
You can find more parameters and usage details in the [firecrawl
docs](https://docs.firecrawl.dev/features/extract-beta).
Currently only a single URL can be extracted at a time (this depends on
firecrawl's extract method).

**Dependencies:** 
No new dependencies required. Uses existing FireCrawl API capabilities.

---------

Co-authored-by: chbae <chbae@gcsc.co.kr>
Co-authored-by: ccurme <chester.curme@gmail.com>
2025-03-17 15:15:57 +00:00

101 lines
3.3 KiB
Python

"""Test FireCrawlLoader."""
import sys
from typing import Generator, List, Tuple
from unittest.mock import MagicMock
import pytest
from langchain_core.documents import Document
from langchain_community.document_loaders import FireCrawlLoader
# Mock the firecrawl module and register it in sys.modules.
@pytest.fixture(autouse=True)
def mock_firecrawl() -> Generator[Tuple[MagicMock, MagicMock], None, None]:
    """Install a mocked ``firecrawl`` module in ``sys.modules`` for every test.

    The mocked module's ``FirecrawlApp`` returns a mock client whose
    ``extract`` method returns a canned response dictionary. On teardown the
    previous ``sys.modules["firecrawl"]`` entry (if any) is restored instead
    of being deleted unconditionally.

    Yields:
        A ``(mock_module, mock_client)`` tuple so tests can inspect the calls
        made against the mocked client.
    """
    mock_module = MagicMock()
    mock_client = MagicMock()
    # Calling ``firecrawl.FirecrawlApp(...)`` hands back the mock client.
    mock_module.FirecrawlApp.return_value = mock_client

    # Canned return value for the ``extract`` method.
    mock_client.extract.return_value = {
        "success": True,
        "data": {
            "title": "extracted title",
            "main contents": "extracted main contents",
        },
        "status": "completed",
        "expiresAt": "2025-03-12T12:42:09.000Z",
    }

    # Remember what was registered before (a private sentinel distinguishes
    # "not present" from any real value) so teardown can put it back.
    missing = object()
    previous = sys.modules.get("firecrawl", missing)

    # Insert the mock module into sys.modules.
    sys.modules["firecrawl"] = mock_module
    try:
        # Yield both mocks so tests can access them when needed.
        yield mock_module, mock_client
    finally:
        # Clean up after the test: restore the prior state exactly.
        if previous is missing:
            sys.modules.pop("firecrawl", None)
        else:
            sys.modules["firecrawl"] = previous
class TestFireCrawlLoader:
    """Test FireCrawlLoader."""

    def test_load_extract_mode(
        self, mock_firecrawl: Tuple[MagicMock, MagicMock]
    ) -> None:
        """Test loading in extract mode.

        Checks that the loader yields exactly one document, that the mocked
        client's ``extract`` method is called once with the URL wrapped in a
        list and the given params, and that the canned response values appear
        in the document's ``page_content``.

        Args:
            mock_firecrawl: The ``(mock_module, mock_client)`` tuple provided
                by the ``mock_firecrawl`` fixture.
        """
        # Grab the mocked client from the fixture.
        _, mock_client = mock_firecrawl
        params = {
            "prompt": "extract the title and main contents(write your own prompt here)",
            "schema": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "main contents": {"type": "string"},
                },
                "required": ["title", "main contents"],
            },
            "enableWebSearch": False,
            "ignoreSitemap": False,
            "showSources": False,
            "scrapeOptions": {
                "formats": ["markdown"],
                "onlyMainContent": True,
                "headers": {},
                "waitFor": 0,
                "mobile": False,
                "skipTlsVerification": False,
                "timeout": 30000,
                "removeBase64Images": True,
                "blockAds": True,
                "proxy": "basic",
            },
        }

        # Create the FireCrawlLoader instance and run it.
        loader = FireCrawlLoader(
            url="https://example.com", api_key="fake-key", mode="extract", params=params
        )
        docs: List[Document] = list(loader.lazy_load())

        # Verify the result.
        assert len(docs) == 1
        assert isinstance(docs[0].page_content, str)

        # Check that the extract method was called with the expected arguments.
        mock_client.extract.assert_called_once_with(
            ["https://example.com"], params=params
        )

        # The response is converted to a string, so each value should appear
        # in the page content.
        assert "extracted title" in docs[0].page_content
        assert "extracted main contents" in docs[0].page_content
        assert "success" in docs[0].page_content
        # Deliberately no return value: pytest flags test functions that
        # return anything other than None.