Mirror of https://github.com/hwchase17/langchain.git (synced 2025-05-08 16:48:49 +00:00)
community: add 'extract' mode to FireCrawlLoader for structured data extraction (#30242)
**Description:** Added an 'extract' mode to FireCrawlLoader that enables structured data extraction from web pages. This feature allows users to extract structured data from a single URL or from entire websites using Large Language Models (LLMs). More parameters and usage details are available in the [firecrawl docs](https://docs.firecrawl.dev/features/extract-beta). Note that extraction currently works on only one URL at a time (this is a limitation of firecrawl's extract method).

**Dependencies:** No new dependencies required. Uses existing FireCrawl API capabilities.

---------

Co-authored-by: chbae <chbae@gcsc.co.kr>
Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
parent 747efa16ec
commit d8510270ee
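For context, here is a minimal usage sketch of the new mode. The URL, API key, and prompt below are illustrative placeholders, and the extraction params mirror those exercised in the unit test added by this commit:

```python
from langchain_community.document_loaders import FireCrawlLoader

# Illustrative extraction parameters only; see the Firecrawl extract docs
# for the full set of supported options.
params = {
    "prompt": "Extract the title and main contents.",  # placeholder prompt
    "schema": {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "main contents": {"type": "string"},
        },
        "required": ["title", "main contents"],
    },
}

loader = FireCrawlLoader(
    url="https://example.com",  # placeholder URL
    api_key="your-api-key",     # placeholder key
    mode="extract",
    params=params,
)
# Extract mode yields a single Document whose page_content is the
# stringified extract response.
docs = loader.load()
```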
@@ -226,7 +226,7 @@ class FireCrawlLoader(BaseLoader):
         *,
         api_key: Optional[str] = None,
         api_url: Optional[str] = None,
-        mode: Literal["crawl", "scrape", "map"] = "crawl",
+        mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
         params: Optional[dict] = None,
     ):
         """Initialize with API key and url.
@@ -241,6 +241,7 @@ class FireCrawlLoader(BaseLoader):
             Options include "scrape" (single url),
             "crawl" (all accessible sub pages),
             "map" (returns list of links that are semantically related).
+            "extract" (extracts structured data from a page).
         params: The parameters to pass to the Firecrawl API.
             Examples include crawlerOptions.
             For more details, visit: https://github.com/mendableai/firecrawl-py
@@ -252,9 +253,10 @@ class FireCrawlLoader(BaseLoader):
             raise ImportError(
                 "`firecrawl` package not found, please run `pip install firecrawl-py`"
             )
-        if mode not in ("crawl", "scrape", "search", "map"):
+        if mode not in ("crawl", "scrape", "search", "map", "extract"):
             raise ValueError(
-                f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
+                f"""Invalid mode '{mode}'.
+                Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
             )

         if not url:
@@ -284,16 +286,23 @@ class FireCrawlLoader(BaseLoader):
             if not self.url:
                 raise ValueError("URL is required for map mode")
             firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
+        elif self.mode == "extract":
+            if not self.url:
+                raise ValueError("URL is required for extract mode")
+            firecrawl_docs = [
+                str(self.firecrawl.extract([self.url], params=self.params))
+            ]
         elif self.mode == "search":
             raise ValueError(
                 "Search mode is not supported in this version, please downgrade."
             )
         else:
             raise ValueError(
-                f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
+                f"""Invalid mode '{self.mode}'.
+                Allowed: 'crawl', 'scrape', 'map', 'extract'."""
             )
         for doc in firecrawl_docs:
-            if self.mode == "map":
+            if self.mode == "map" or self.mode == "extract":
                 page_content = doc
                 metadata = {}
             else:
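One consequence of the implementation above: in extract mode the Firecrawl response is passed through `str()` before being wrapped in a `Document`, so callers who want the structured fields back must re-parse the string. A rough sketch, assuming the response stringifies to a plain Python dict literal (as the mocked response in the test below does):

```python
import ast

# page_content looks like "{'success': True, 'data': {'title': ...}, ...}"
extracted = ast.literal_eval(docs[0].page_content)  # re-parse the stringified dict
structured_data = extracted.get("data", {})         # the extracted fields
```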
@@ -0,0 +1,100 @@
+"""Test FireCrawlLoader."""
+
+import sys
+from typing import Generator, List, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders import FireCrawlLoader
+
+
+# Mock the firecrawl module and register it in sys.modules
+@pytest.fixture(autouse=True)
+def mock_firecrawl() -> Generator[Tuple[MagicMock, MagicMock], None, None]:
+    """Mock firecrawl module for all tests."""
+    mock_module = MagicMock()
+    mock_client = MagicMock()
+    # Return the mock client from the FirecrawlApp class
+    mock_module.FirecrawlApp.return_value = mock_client
+
+    # Configure the return value of the extract method
+    response_dict = {
+        "success": True,
+        "data": {
+            "title": "extracted title",
+            "main contents": "extracted main contents",
+        },
+        "status": "completed",
+        "expiresAt": "2025-03-12T12:42:09.000Z",
+    }
+    mock_client.extract.return_value = response_dict
+
+    # Insert the mock module into sys.modules
+    sys.modules["firecrawl"] = mock_module
+    yield mock_module, mock_client  # yield so tests can access the mocks if needed
+
+    # Clean up after the tests
+    if "firecrawl" in sys.modules:
+        del sys.modules["firecrawl"]
+
+
+class TestFireCrawlLoader:
+    """Test FireCrawlLoader."""
+
+    def test_load_extract_mode(
+        self, mock_firecrawl: Tuple[MagicMock, MagicMock]
+    ) -> List[Document]:
+        """Test loading in extract mode."""
+        # Get the mocked objects from the fixture
+        _, mock_client = mock_firecrawl
+
+        params = {
+            "prompt": "extract the title and main contents(write your own prompt here)",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string"},
+                    "main contents": {"type": "string"},
+                },
+                "required": ["title", "main contents"],
+            },
+            "enableWebSearch": False,
+            "ignoreSitemap": False,
+            "showSources": False,
+            "scrapeOptions": {
+                "formats": ["markdown"],
+                "onlyMainContent": True,
+                "headers": {},
+                "waitFor": 0,
+                "mobile": False,
+                "skipTlsVerification": False,
+                "timeout": 30000,
+                "removeBase64Images": True,
+                "blockAds": True,
+                "proxy": "basic",
+            },
+        }
+
+        # Create the FireCrawlLoader instance and run it
+        loader = FireCrawlLoader(
+            url="https://example.com", api_key="fake-key", mode="extract", params=params
+        )
+        docs = list(loader.lazy_load())  # call the lazy_load method
+
+        # Assertions
+        assert len(docs) == 1
+        assert isinstance(docs[0].page_content, str)
+
+        # Verify that extract was called with the correct arguments
+        mock_client.extract.assert_called_once_with(
+            ["https://example.com"], params=params
+        )
+
+        # The response was converted to a string, so check that each field
+        # appears in the page content
+        assert "extracted title" in docs[0].page_content
+        assert "extracted main contents" in docs[0].page_content
+        assert "success" in docs[0].page_content
+
+        return docs
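A note on the fixture design: because it is `autouse` and registers the mock module in `sys.modules` before the loader runs, the loader's internal `firecrawl` import resolves to the mock, so the test needs neither the real `firecrawl-py` package nor network access.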