refactor(document_loaders): introduce PlaywrightEvaluator abstract base class for custom evaluators and add tests

This commit is contained in:
Youngwook Kim 2023-08-09 14:14:59 +09:00
parent ef7f4aea32
commit 04fcd2d2e0
2 changed files with 131 additions and 59 deletions

View File

@ -1,6 +1,7 @@
"""Loader that uses Playwright to load a page, then uses unstructured to load the html. """Loader that uses Playwright to load a page, then uses unstructured to load the html.
""" """
import logging import logging
from abc import ABC, abstractmethod
from typing import List, Optional from typing import List, Optional
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -9,8 +10,88 @@ from langchain.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class PlaywrightEvaluator(ABC):
    """Interface for turning a loaded Playwright page into text.

    An implementation receives the page, the browser instance, and the
    response object, processes the page as necessary, and returns the
    resulting text. Both a synchronous and an asynchronous entry point
    must be provided by subclasses.
    """

    @abstractmethod
    def evaluate(self, page, browser, response):
        """Process the page synchronously and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        """

    @abstractmethod
    async def evaluate_async(self, page, browser, response):
        """Process the page asynchronously and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        """
class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
    """Evaluates the page HTML content using the `unstructured` library.

    Before the HTML is partitioned, every visible element matching one of
    ``remove_selectors`` is removed from the page.
    """

    def __init__(self, remove_selectors: Optional[List[str]] = None):
        """Initialize the evaluator and check that `unstructured` is installed.

        Args:
            remove_selectors: Optional selectors; visible elements matching
                them are removed from the page before its text is extracted.

        Raises:
            ImportError: If the `unstructured` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err

        self.remove_selectors = remove_selectors

    def evaluate(self, page, browser, response):
        """Synchronously process the page HTML and return a text string.

        Args:
            page: The page to process.
            browser: The browser instance (not used by this evaluator).
            response: The response from page.goto() (not used here).

        Returns:
            The partitioned HTML elements, stringified and joined by blank
            lines.
        """
        from unstructured.partition.html import partition_html

        for selector in self.remove_selectors or []:
            # The locators returned by Locator.all() are resolved by
            # position, so removing a match shifts every later match.
            # Walking the list from last to first keeps the locators that
            # are still to be visited pointing at their original elements.
            for element in reversed(page.locator(selector).all()):
                if element.is_visible():
                    element.evaluate("element => element.remove()")

        page_source = page.content()
        elements = partition_html(text=page_source)
        return "\n\n".join([str(el) for el in elements])

    async def evaluate_async(self, page, browser, response):
        """Asynchronously process the page HTML and return a text string.

        Args:
            page: The page to process.
            browser: The browser instance (not used by this evaluator).
            response: The response from page.goto() (not used here).

        Returns:
            The partitioned HTML elements, stringified and joined by blank
            lines.
        """
        from unstructured.partition.html import partition_html

        for selector in self.remove_selectors or []:
            # Same reverse-order removal as in evaluate(): deleting a match
            # must not shift the position of locators not yet processed.
            for element in reversed(await page.locator(selector).all()):
                if await element.is_visible():
                    await element.evaluate("element => element.remove()")

        page_source = await page.content()
        elements = partition_html(text=page_source)
        return "\n\n".join([str(el) for el in elements])
class PlaywrightURLLoader(BaseLoader): class PlaywrightURLLoader(BaseLoader):
"""Loader that uses Playwright and to load a page and unstructured to load the html. """Loader that uses Playwright and PlaywrightEvaluator to load the html.
This is useful for loading pages that require javascript to render. This is useful for loading pages that require javascript to render.
Attributes: Attributes:
@ -25,8 +106,9 @@ class PlaywrightURLLoader(BaseLoader):
continue_on_failure: bool = True, continue_on_failure: bool = True,
headless: bool = True, headless: bool = True,
remove_selectors: Optional[List[str]] = None, remove_selectors: Optional[List[str]] = None,
evaluator: Optional[PlaywrightEvaluator] = None,
): ):
"""Load a list of URLs using Playwright and unstructured.""" """Load a list of URLs using Playwright."""
try: try:
import playwright # noqa:F401 import playwright # noqa:F401
except ImportError: except ImportError:
@ -35,66 +117,17 @@ class PlaywrightURLLoader(BaseLoader):
"`pip install playwright`" "`pip install playwright`"
) )
try:
import unstructured # noqa:F401
except ImportError:
raise ImportError(
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
self.urls = urls self.urls = urls
self.continue_on_failure = continue_on_failure self.continue_on_failure = continue_on_failure
self.headless = headless self.headless = headless
self.remove_selectors = remove_selectors
def _sync_evaluate(self, page, browser, response): if remove_selectors and evaluator:
"""Process a page and return the text content synchronously. raise ValueError(
"`remove_selectors` and `evaluator` cannot be both not None"
)
Args: # Use the provided evaluator, if any, otherwise, use the default UnstructuredHtmlEvaluator.
page: The page to process. self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)
browser: The browser instance.
response: The response from page.goto().
Returns:
text: The text content of the page.
"""
from unstructured.partition.html import partition_html
for selector in self.remove_selectors or []:
elements = page.locator(selector).all()
for element in elements:
if element.is_visible():
element.evaluate("element => element.remove()")
page_source = page.content()
elements = partition_html(text=page_source)
text = "\n\n".join([str(el) for el in elements])
return text
async def _async_evaluate(self, page, browser, response):
"""Process a page and return the text content asynchronously.
Args:
page: The page to process.
browser: The browser instance.
response: The response from page.goto().
Returns:
text: The text content of the page.
"""
from unstructured.partition.html import partition_html
for selector in self.remove_selectors or []:
elements = await page.locator(selector).all()
for element in elements:
if await element.is_visible():
await element.evaluate("element => element.remove()")
page_source = await page.content()
elements = partition_html(text=page_source)
text = "\n\n".join([str(el) for el in elements])
return text
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load the specified URLs using Playwright and create Document instances. """Load the specified URLs using Playwright and create Document instances.
@ -112,7 +145,7 @@ class PlaywrightURLLoader(BaseLoader):
try: try:
page = browser.new_page() page = browser.new_page()
response = page.goto(url) response = page.goto(url)
text = self._sync_evaluate(page, browser, response) text = self.evaluator.evaluate(page, browser, response)
metadata = {"source": url} metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))
except Exception as e: except Exception as e:
@ -142,7 +175,7 @@ class PlaywrightURLLoader(BaseLoader):
try: try:
page = await browser.new_page() page = await browser.new_page()
response = await page.goto(url) response = await page.goto(url)
text = await self._async_evaluate(page, browser, response) text = await self.evaluator.evaluate_async(page, browser, response)
metadata = {"source": url} metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))
except Exception as e: except Exception as e:

View File

@ -4,6 +4,16 @@ import pytest
from langchain.document_loaders import PlaywrightURLLoader from langchain.document_loaders import PlaywrightURLLoader
# NOTE(review): module path assumed -- confirm where PlaywrightEvaluator is
# defined before merging.
from langchain.document_loaders.url_playwright import PlaywrightEvaluator


class TestEvaluator(PlaywrightEvaluator):
    """A simple evaluator for testing purposes.

    Both entry points ignore their arguments and return the constant
    string "test".
    """

    def evaluate(self, page, browser, response):
        """Return the fixed text "test" without touching the page."""
        return "test"

    async def evaluate_async(self, page, browser, response):
        """Return the fixed text "test" without touching the page."""
        return "test"
def test_playwright_url_loader() -> None: def test_playwright_url_loader() -> None:
"""Test Playwright URL loader.""" """Test Playwright URL loader."""
urls = [ urls = [
@ -39,3 +49,32 @@ async def test_playwright_async_url_loader() -> None:
) )
docs = await loader.aload() docs = await loader.aload()
assert len(docs) > 0 assert len(docs) > 0
def test_playwright_url_loader_with_custom_evaluator() -> None:
    """Test Playwright URL loader with a custom evaluator.

    One URL is loaded, so exactly one document is expected, and its content
    must be whatever the custom evaluator returned.
    """
    urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]
    loader = PlaywrightURLLoader(
        urls=urls,
        # The loader's keyword for a custom evaluator is `evaluator`.
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )
    docs = loader.load()
    assert len(docs) == 1
    assert docs[0].page_content == "test"
@pytest.mark.asyncio
async def test_playwright_async_url_loader_with_custom_evaluator() -> None:
    """Test Playwright async URL loader with a custom evaluator.

    One URL is loaded, so exactly one document is expected, and its content
    must be whatever the custom evaluator returned.
    """
    urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]
    loader = PlaywrightURLLoader(
        urls=urls,
        # The loader's keyword for a custom evaluator is `evaluator`.
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )
    docs = await loader.aload()
    assert len(docs) == 1
    assert docs[0].page_content == "test"