mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 16:13:25 +00:00
feat(document_loaders): add sync and async page evaluation methods to PlaywrightURLLoader
This commit is contained in:
parent
3f64b8a761
commit
1fa5d94591
@ -48,6 +48,36 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
self.headless = headless
|
self.headless = headless
|
||||||
self.remove_selectors = remove_selectors
|
self.remove_selectors = remove_selectors
|
||||||
|
|
||||||
|
def sync_evaluate_page(self, page) -> str:
    """Process a page and return the text content.

    Every visible element matching one of ``self.remove_selectors`` is
    removed from the DOM, then the remaining page HTML is partitioned with
    ``partition_html`` and the string form of each resulting element is
    joined with blank lines.

    This method can be overridden to apply custom logic.

    Args:
        page: A synchronous Playwright page that has already navigated to
            the target URL.

    Returns:
        The extracted text, one partitioned element per paragraph.
    """
    for selector in self.remove_selectors or []:
        matches = page.locator(selector).all()
        # The locators returned by ``all()`` are positional and are resolved
        # again on every call. Removing in forward order would shift the
        # remaining matches down one index and skip every other element, so
        # walk the list from the end: removing a later match (or a nested
        # descendant, which always follows its ancestor in document order)
        # never changes the index of an earlier one.
        for match in reversed(matches):
            if match.is_visible():
                match.evaluate("element => element.remove()")

    page_source = page.content()
    partitioned = partition_html(text=page_source)
    return "\n\n".join(str(el) for el in partitioned)
|
||||||
|
|
||||||
|
async def async_evaluate_page(self, page) -> str:
    """Process a page asynchronously and return the text content.

    Every visible element matching one of ``self.remove_selectors`` is
    removed from the DOM, then the remaining page HTML is partitioned with
    ``partition_html`` and the string form of each resulting element is
    joined with blank lines.

    This method can be overridden to apply custom logic.

    Args:
        page: An asynchronous Playwright page that has already navigated to
            the target URL.

    Returns:
        The extracted text, one partitioned element per paragraph.
    """
    for selector in self.remove_selectors or []:
        matches = await page.locator(selector).all()
        # The locators returned by ``all()`` are positional and are resolved
        # again on every call. Removing in forward order would shift the
        # remaining matches down one index and skip every other element, so
        # walk the list from the end: removing a later match (or a nested
        # descendant, which always follows its ancestor in document order)
        # never changes the index of an earlier one.
        for match in reversed(matches):
            if await match.is_visible():
                await match.evaluate("element => element.remove()")

    page_source = await page.content()
    partitioned = partition_html(text=page_source)
    return "\n\n".join(str(el) for el in partitioned)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load the specified URLs using Playwright and create Document instances.
|
"""Load the specified URLs using Playwright and create Document instances.
|
||||||
|
|
||||||
@ -65,16 +95,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
try:
|
try:
|
||||||
page = browser.new_page()
|
page = browser.new_page()
|
||||||
page.goto(url)
|
page.goto(url)
|
||||||
|
text = self.sync_evaluate_page(page)
|
||||||
for selector in self.remove_selectors or []:
|
|
||||||
elements = page.locator(selector).all()
|
|
||||||
for element in elements:
|
|
||||||
if element.is_visible():
|
|
||||||
element.evaluate("element => element.remove()")
|
|
||||||
|
|
||||||
page_source = page.content()
|
|
||||||
elements = partition_html(text=page_source)
|
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -105,16 +126,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
try:
|
try:
|
||||||
page = await browser.new_page()
|
page = await browser.new_page()
|
||||||
await page.goto(url)
|
await page.goto(url)
|
||||||
|
text = await self.async_evaluate_page(page)
|
||||||
for selector in self.remove_selectors or []:
|
|
||||||
elements = await page.locator(selector).all()
|
|
||||||
for element in elements:
|
|
||||||
if await element.is_visible():
|
|
||||||
await element.evaluate("element => element.remove()")
|
|
||||||
|
|
||||||
page_source = await page.content()
|
|
||||||
elements = partition_html(text=page_source)
|
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
Loading…
Reference in New Issue
Block a user