mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 16:13:25 +00:00
refactor(document_loaders): introduce PlaywrightEvaluator abstract base class for custom evaluators and add tests
This commit is contained in:
parent
ef7f4aea32
commit
04fcd2d2e0
@ -1,6 +1,7 @@
|
|||||||
"""Loader that uses Playwright to load a page, then uses unstructured to load the html.
|
"""Loader that uses Playwright to load a page, then uses unstructured to load the html.
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -9,8 +10,88 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PlaywrightEvaluator(ABC):
    """Abstract base class for all evaluators.

    Each evaluator should take a page, a browser instance, and a response
    object, process the page as necessary, and return the resulting text.
    Subclasses must implement both the synchronous and the asynchronous
    variant; the class cannot be instantiated until both are provided.
    """

    @abstractmethod
    def evaluate(self, page, browser, response) -> str:
        """Synchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        """

    @abstractmethod
    async def evaluate_async(self, page, browser, response) -> str:
        """Asynchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        """
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
    """Evaluates the page HTML content using the `unstructured` library.

    Before the HTML is read, every visible element matching one of
    ``remove_selectors`` is removed from the live page. The remaining page
    source is partitioned with ``partition_html`` and the string form of each
    resulting element is joined with blank lines.
    """

    def __init__(self, remove_selectors: Optional[List[str]] = None):
        """Initialize the evaluator and check that `unstructured` is installed.

        Args:
            remove_selectors: Optional selectors (passed to ``page.locator``)
                whose visible matches are removed before the text is
                extracted. ``None`` removes nothing.

        Raises:
            ImportError: If the `unstructured` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err

        self.remove_selectors = remove_selectors

    @staticmethod
    def _html_to_text(page_source: str) -> str:
        """Partition raw HTML with `unstructured` and join the elements as text."""
        from unstructured.partition.html import partition_html

        elements = partition_html(text=page_source)
        return "\n\n".join(str(el) for el in elements)

    def evaluate(self, page, browser, response) -> str:
        """Synchronously process the HTML content of the page and return a text string.

        Args:
            page: The page to process.
            browser: The browser instance (unused by this evaluator).
            response: The response from page.goto() (unused by this evaluator).

        Returns:
            The text extracted from the page after the selected elements
            have been removed.
        """
        for selector in self.remove_selectors or []:
            # The locators returned by ``all()`` are positional and are
            # re-resolved against the live DOM on every use. Removing matches
            # front-to-back would shift the remaining ones down and skip
            # elements, so walk the list from the end instead.
            for element in reversed(page.locator(selector).all()):
                if element.is_visible():
                    element.evaluate("element => element.remove()")

        return self._html_to_text(page.content())

    async def evaluate_async(self, page, browser, response) -> str:
        """Asynchronously process the HTML content of the page and return a text string.

        Args:
            page: The page to process.
            browser: The browser instance (unused by this evaluator).
            response: The response from page.goto() (unused by this evaluator).

        Returns:
            The text extracted from the page after the selected elements
            have been removed.
        """
        for selector in self.remove_selectors or []:
            # Same reasoning as in ``evaluate``: remove from the last match
            # backwards so earlier positional locators stay valid.
            for element in reversed(await page.locator(selector).all()):
                if await element.is_visible():
                    await element.evaluate("element => element.remove()")

        return self._html_to_text(await page.content())
|
||||||
|
|
||||||
|
|
||||||
class PlaywrightURLLoader(BaseLoader):
|
class PlaywrightURLLoader(BaseLoader):
|
||||||
"""Loader that uses Playwright and to load a page and unstructured to load the html.
|
"""Loader that uses Playwright and PlaywrightEvaluator to load the html.
|
||||||
This is useful for loading pages that require javascript to render.
|
This is useful for loading pages that require javascript to render.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
@ -25,8 +106,9 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
continue_on_failure: bool = True,
|
continue_on_failure: bool = True,
|
||||||
headless: bool = True,
|
headless: bool = True,
|
||||||
remove_selectors: Optional[List[str]] = None,
|
remove_selectors: Optional[List[str]] = None,
|
||||||
|
evaluator: Optional[PlaywrightEvaluator] = None,
|
||||||
):
|
):
|
||||||
"""Load a list of URLs using Playwright and unstructured."""
|
"""Load a list of URLs using Playwright."""
|
||||||
try:
|
try:
|
||||||
import playwright # noqa:F401
|
import playwright # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -35,66 +117,17 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
"`pip install playwright`"
|
"`pip install playwright`"
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
|
||||||
import unstructured # noqa:F401
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError(
|
|
||||||
"unstructured package not found, please install it with "
|
|
||||||
"`pip install unstructured`"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.urls = urls
|
self.urls = urls
|
||||||
self.continue_on_failure = continue_on_failure
|
self.continue_on_failure = continue_on_failure
|
||||||
self.headless = headless
|
self.headless = headless
|
||||||
self.remove_selectors = remove_selectors
|
|
||||||
|
|
||||||
def _sync_evaluate(self, page, browser, response):
|
if remove_selectors and evaluator:
|
||||||
"""Process a page and return the text content synchronously.
|
raise ValueError(
|
||||||
|
"`remove_selectors` and `evaluator` cannot be both not None"
|
||||||
|
)
|
||||||
|
|
||||||
Args:
|
# Use the provided evaluator, if any, otherwise, use the default UnstructuredHtmlEvaluator.
|
||||||
page: The page to process.
|
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)
|
||||||
browser: The browser instance.
|
|
||||||
response: The response from page.goto().
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
text: The text content of the page.
|
|
||||||
"""
|
|
||||||
from unstructured.partition.html import partition_html
|
|
||||||
|
|
||||||
for selector in self.remove_selectors or []:
|
|
||||||
elements = page.locator(selector).all()
|
|
||||||
for element in elements:
|
|
||||||
if element.is_visible():
|
|
||||||
element.evaluate("element => element.remove()")
|
|
||||||
|
|
||||||
page_source = page.content()
|
|
||||||
elements = partition_html(text=page_source)
|
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
|
||||||
return text
|
|
||||||
|
|
||||||
async def _async_evaluate(self, page, browser, response):
|
|
||||||
"""Process a page and return the text content asynchronously.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
page: The page to process.
|
|
||||||
browser: The browser instance.
|
|
||||||
response: The response from page.goto().
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
text: The text content of the page.
|
|
||||||
"""
|
|
||||||
from unstructured.partition.html import partition_html
|
|
||||||
|
|
||||||
for selector in self.remove_selectors or []:
|
|
||||||
elements = await page.locator(selector).all()
|
|
||||||
for element in elements:
|
|
||||||
if await element.is_visible():
|
|
||||||
await element.evaluate("element => element.remove()")
|
|
||||||
|
|
||||||
page_source = await page.content()
|
|
||||||
elements = partition_html(text=page_source)
|
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
|
||||||
return text
|
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load the specified URLs using Playwright and create Document instances.
|
"""Load the specified URLs using Playwright and create Document instances.
|
||||||
@ -112,7 +145,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
try:
|
try:
|
||||||
page = browser.new_page()
|
page = browser.new_page()
|
||||||
response = page.goto(url)
|
response = page.goto(url)
|
||||||
text = self._sync_evaluate(page, browser, response)
|
text = self.evaluator.evaluate(page, browser, response)
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -142,7 +175,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
try:
|
try:
|
||||||
page = await browser.new_page()
|
page = await browser.new_page()
|
||||||
response = await page.goto(url)
|
response = await page.goto(url)
|
||||||
text = await self._async_evaluate(page, browser, response)
|
text = await self.evaluator.evaluate_async(page, browser, response)
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -4,6 +4,16 @@ import pytest
|
|||||||
from langchain.document_loaders import PlaywrightURLLoader
|
from langchain.document_loaders import PlaywrightURLLoader
from langchain.document_loaders.url_playwright import PlaywrightEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
class TestEvaluator(PlaywrightEvaluator):
    """A simple evaluator for testing purposes.

    Both variants ignore the page, browser and response and return the
    constant string ``"test"``, so tests can tell that the loader used the
    custom evaluator.
    """

    def evaluate(self, page, browser, response) -> str:
        """Return the fixed marker text without inspecting the page."""
        return "test"

    async def evaluate_async(self, page, browser, response) -> str:
        """Return the fixed marker text without inspecting the page."""
        return "test"
|
||||||
|
|
||||||
|
|
||||||
def test_playwright_url_loader() -> None:
|
def test_playwright_url_loader() -> None:
|
||||||
"""Test Playwright URL loader."""
|
"""Test Playwright URL loader."""
|
||||||
urls = [
|
urls = [
|
||||||
@ -39,3 +49,32 @@ async def test_playwright_async_url_loader() -> None:
|
|||||||
)
|
)
|
||||||
docs = await loader.aload()
|
docs = await loader.aload()
|
||||||
assert len(docs) > 0
|
assert len(docs) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_playwright_url_loader_with_custom_evaluator() -> None:
    """Test Playwright URL loader with a custom evaluator.

    Loads a single URL, so exactly one document is expected, and its content
    must be whatever the custom evaluator returned.
    """
    urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]
    loader = PlaywrightURLLoader(
        urls=urls,
        # The constructor parameter is named ``evaluator``.
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )
    docs = loader.load()
    assert len(docs) == 1
    # TestEvaluator.evaluate returns exactly "test".
    assert docs[0].page_content == "test"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_playwright_async_url_loader_with_custom_evaluator() -> None:
    """Test Playwright async URL loader with a custom evaluator.

    Loads a single URL, so exactly one document is expected, and its content
    must be whatever the custom evaluator returned.
    """
    urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]
    loader = PlaywrightURLLoader(
        urls=urls,
        # The constructor parameter is named ``evaluator``.
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )
    docs = await loader.aload()
    # One URL in, one document out.
    assert len(docs) == 1
    assert docs[0].page_content == "test"
|
||||||
|
Loading…
Reference in New Issue
Block a user