mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-04 14:48:07 +00:00
community: add 'extract' mode to FireCrawlLoader for structured data extraction (#30242)
**Description:** Added an 'extract' mode to FireCrawlLoader that enables structured data extraction from web pages. This feature allows users to extract structured data from a single URL, or from entire websites, using Large Language Models (LLMs). More parameters and usage examples are available in the [firecrawl docs](https://docs.firecrawl.dev/features/extract-beta). Currently only one URL can be extracted at a time (this depends on firecrawl's extract method). **Dependencies:** No new dependencies required. Uses existing FireCrawl API capabilities. --------- Co-authored-by: chbae <chbae@gcsc.co.kr> Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent
747efa16ec
commit
d8510270ee
libs/community
@ -226,7 +226,7 @@ class FireCrawlLoader(BaseLoader):
|
||||
*,
|
||||
api_key: Optional[str] = None,
|
||||
api_url: Optional[str] = None,
|
||||
mode: Literal["crawl", "scrape", "map"] = "crawl",
|
||||
mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
|
||||
params: Optional[dict] = None,
|
||||
):
|
||||
"""Initialize with API key and url.
|
||||
@ -241,6 +241,7 @@ class FireCrawlLoader(BaseLoader):
|
||||
Options include "scrape" (single url),
|
||||
"crawl" (all accessible sub pages),
|
||||
"map" (returns list of links that are semantically related).
|
||||
"extract" (extracts structured data from a page).
|
||||
params: The parameters to pass to the Firecrawl API.
|
||||
Examples include crawlerOptions.
|
||||
For more details, visit: https://github.com/mendableai/firecrawl-py
|
||||
@ -252,9 +253,10 @@ class FireCrawlLoader(BaseLoader):
|
||||
raise ImportError(
|
||||
"`firecrawl` package not found, please run `pip install firecrawl-py`"
|
||||
)
|
||||
if mode not in ("crawl", "scrape", "search", "map"):
|
||||
if mode not in ("crawl", "scrape", "search", "map", "extract"):
|
||||
raise ValueError(
|
||||
f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
|
||||
f"""Invalid mode '{mode}'.
|
||||
Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
|
||||
)
|
||||
|
||||
if not url:
|
||||
@ -284,16 +286,23 @@ class FireCrawlLoader(BaseLoader):
|
||||
if not self.url:
|
||||
raise ValueError("URL is required for map mode")
|
||||
firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
|
||||
elif self.mode == "extract":
|
||||
if not self.url:
|
||||
raise ValueError("URL is required for extract mode")
|
||||
firecrawl_docs = [
|
||||
str(self.firecrawl.extract([self.url], params=self.params))
|
||||
]
|
||||
elif self.mode == "search":
|
||||
raise ValueError(
|
||||
"Search mode is not supported in this version, please downgrade."
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
|
||||
f"""Invalid mode '{self.mode}'.
|
||||
Allowed: 'crawl', 'scrape', 'map', 'extract'."""
|
||||
)
|
||||
for doc in firecrawl_docs:
|
||||
if self.mode == "map":
|
||||
if self.mode == "map" or self.mode == "extract":
|
||||
page_content = doc
|
||||
metadata = {}
|
||||
else:
|
||||
|
@ -0,0 +1,100 @@
|
||||
"""Test FireCrawlLoader."""
|
||||
|
||||
import sys
|
||||
from typing import Generator, List, Tuple
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders import FireCrawlLoader
|
||||
|
||||
|
||||
# Mock the ``firecrawl`` module and register it in ``sys.modules`` so the
# loader can be imported and exercised without the real package installed.
@pytest.fixture(autouse=True)
def mock_firecrawl() -> Generator[Tuple[MagicMock, MagicMock], None, None]:
    """Mock firecrawl module for all tests.

    Yields:
        A ``(mock_module, mock_client)`` pair so tests can inspect calls
        made against the mocked ``FirecrawlApp`` client.
    """
    mock_module = MagicMock()
    mock_client = MagicMock()
    # The loader instantiates ``firecrawl.FirecrawlApp``; hand back our client.
    mock_module.FirecrawlApp.return_value = mock_client

    # Canned return value for the ``extract`` API call.
    response_dict = {
        "success": True,
        "data": {
            "title": "extracted title",
            "main contents": "extracted main contents",
        },
        "status": "completed",
        "expiresAt": "2025-03-12T12:42:09.000Z",
    }
    mock_client.extract.return_value = response_dict

    # Insert the mock so ``import firecrawl`` resolves to it during the test.
    sys.modules["firecrawl"] = mock_module
    yield mock_module, mock_client  # expose the mocks to tests that need them

    # Clean up after each test.
    if "firecrawl" in sys.modules:
        del sys.modules["firecrawl"]
|
||||
|
||||
|
||||
class TestFireCrawlLoader:
    """Test FireCrawlLoader."""

    def test_load_extract_mode(
        self, mock_firecrawl: Tuple[MagicMock, MagicMock]
    ) -> None:
        """Test loading in extract mode.

        Verifies that extract mode yields a single stringified document and
        that the mocked ``extract`` endpoint is called with the expected
        arguments.
        """
        # Grab the mocked client produced by the autouse fixture.
        _, mock_client = mock_firecrawl

        params = {
            "prompt": "extract the title and main contents(write your own prompt here)",
            "schema": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "main contents": {"type": "string"},
                },
                "required": ["title", "main contents"],
            },
            "enableWebSearch": False,
            "ignoreSitemap": False,
            "showSources": False,
            "scrapeOptions": {
                "formats": ["markdown"],
                "onlyMainContent": True,
                "headers": {},
                "waitFor": 0,
                "mobile": False,
                "skipTlsVerification": False,
                "timeout": 30000,
                "removeBase64Images": True,
                "blockAds": True,
                "proxy": "basic",
            },
        }

        # Build a FireCrawlLoader instance and run it against the mocked API.
        loader = FireCrawlLoader(
            url="https://example.com", api_key="fake-key", mode="extract", params=params
        )
        docs = list(loader.lazy_load())  # drive the lazy_load generator

        # Extract mode yields exactly one document whose content is a string.
        assert len(docs) == 1
        assert isinstance(docs[0].page_content, str)

        # The extract endpoint must be called once, with a list of URLs.
        mock_client.extract.assert_called_once_with(
            ["https://example.com"], params=params
        )

        # The response dict is stringified, so each field from the mocked
        # response should appear in the page content.
        assert "extracted title" in docs[0].page_content
        assert "extracted main contents" in docs[0].page_content
        assert "success" in docs[0].page_content
|
Loading…
Reference in New Issue
Block a user