feat(document_loaders): add sync and async page evaluation methods to PlaywrightURLLoader

This commit is contained in:
Youngwook Kim 2023-07-20 11:07:17 +09:00
parent 3f64b8a761
commit 1fa5d94591

View File

@ -48,6 +48,36 @@ class PlaywrightURLLoader(BaseLoader):
self.headless = headless self.headless = headless
self.remove_selectors = remove_selectors self.remove_selectors = remove_selectors
def sync_evaluate_page(self, page) -> str:
    """Process a page and return the text content.

    This method can be overridden to apply custom logic.

    Every visible element matching one of ``self.remove_selectors`` is
    removed from the live page, then the remaining HTML is handed to
    ``partition_html`` and the resulting elements are joined with blank
    lines.

    Args:
        page: A Playwright sync page object exposing ``locator()`` and
            ``content()``.

    Returns:
        The string form of each partitioned element, joined by two
        newlines.
    """
    for selector in self.remove_selectors or []:
        # ``Locator.all()`` returns positional (nth-based) locators that are
        # resolved lazily. Removing matches front-to-back shifts the
        # remaining indices, so every other match would be skipped; walking
        # the list back-to-front keeps the still-pending indices stable.
        for element in reversed(page.locator(selector).all()):
            if element.is_visible():
                element.evaluate("element => element.remove()")

    # NOTE(review): relies on ``partition_html`` being resolvable from this
    # method's scope -- confirm it is imported at module level.
    page_source = page.content()
    elements = partition_html(text=page_source)
    return "\n\n".join(str(el) for el in elements)
async def async_evaluate_page(self, page) -> str:
    """Process a page asynchronously and return the text content.

    This method can be overridden to apply custom logic.

    Every visible element matching one of ``self.remove_selectors`` is
    removed from the live page, then the remaining HTML is handed to
    ``partition_html`` and the resulting elements are joined with blank
    lines.

    Args:
        page: A Playwright async page object exposing ``locator()`` and an
            awaitable ``content()``.

    Returns:
        The string form of each partitioned element, joined by two
        newlines.
    """
    for selector in self.remove_selectors or []:
        # ``Locator.all()`` returns positional (nth-based) locators that are
        # resolved lazily. Removing matches front-to-back shifts the
        # remaining indices, so every other match would be skipped; walking
        # the list back-to-front keeps the still-pending indices stable.
        for element in reversed(await page.locator(selector).all()):
            if await element.is_visible():
                await element.evaluate("element => element.remove()")

    # NOTE(review): relies on ``partition_html`` being resolvable from this
    # method's scope -- confirm it is imported at module level.
    page_source = await page.content()
    elements = partition_html(text=page_source)
    return "\n\n".join(str(el) for el in elements)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load the specified URLs using Playwright and create Document instances. """Load the specified URLs using Playwright and create Document instances.
@ -65,16 +95,7 @@ class PlaywrightURLLoader(BaseLoader):
try: try:
page = browser.new_page() page = browser.new_page()
page.goto(url) page.goto(url)
text = self.sync_evaluate_page(page)
for selector in self.remove_selectors or []:
elements = page.locator(selector).all()
for element in elements:
if element.is_visible():
element.evaluate("element => element.remove()")
page_source = page.content()
elements = partition_html(text=page_source)
text = "\n\n".join([str(el) for el in elements])
metadata = {"source": url} metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))
except Exception as e: except Exception as e:
@ -105,16 +126,7 @@ class PlaywrightURLLoader(BaseLoader):
try: try:
page = await browser.new_page() page = await browser.new_page()
await page.goto(url) await page.goto(url)
text = await self.async_evaluate_page(page)
for selector in self.remove_selectors or []:
elements = await page.locator(selector).all()
for element in elements:
if await element.is_visible():
await element.evaluate("element => element.remove()")
page_source = await page.content()
elements = partition_html(text=page_source)
text = "\n\n".join([str(el) for el in elements])
metadata = {"source": url} metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))
except Exception as e: except Exception as e: