mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 23:29:21 +00:00
community[minor]: PlaywrightURLLoader can take stored session file (#30152)
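**Description:** Implements an additional, optional `browser_session` parameter on `PlaywrightURLLoader`: the path to a file containing stored Playwright session data (storage state), which is used to initialize the browser context. If the file does not exist, a warning is logged and a fresh browser context is used instead.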
This commit is contained in:
parent
bffa530816
commit
9b687d7fbd
@ -1,8 +1,9 @@
|
|||||||
"""Loader that uses Playwright to load a page, then uses unstructured to parse html."""
|
"""Loader that uses Playwright to load a page, then uses unstructured to parse html."""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import TYPE_CHECKING, AsyncIterator, Dict, Iterator, List, Optional
|
from typing import TYPE_CHECKING, AsyncIterator, Dict, Iterator, List, Optional, Union
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@ -113,6 +114,8 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
headless (bool): If True, the browser will run in headless mode.
|
headless (bool): If True, the browser will run in headless mode.
|
||||||
proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
|
proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
|
||||||
through the specified proxy.
|
through the specified proxy.
|
||||||
|
browser_session (Optional[Union[str, os.PathLike[str]]]): Path to a file with
|
||||||
|
browser session data that can be used to restore the browser session.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
@ -137,6 +140,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
remove_selectors: Optional[List[str]] = None,
|
remove_selectors: Optional[List[str]] = None,
|
||||||
evaluator: Optional[PlaywrightEvaluator] = None,
|
evaluator: Optional[PlaywrightEvaluator] = None,
|
||||||
proxy: Optional[Dict[str, str]] = None,
|
proxy: Optional[Dict[str, str]] = None,
|
||||||
|
browser_session: Optional[Union[str, os.PathLike[str]]] = None,
|
||||||
):
|
):
|
||||||
"""Load a list of URLs using Playwright."""
|
"""Load a list of URLs using Playwright."""
|
||||||
try:
|
try:
|
||||||
@ -151,6 +155,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
self.continue_on_failure = continue_on_failure
|
self.continue_on_failure = continue_on_failure
|
||||||
self.headless = headless
|
self.headless = headless
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
|
self.browser_session = browser_session
|
||||||
|
|
||||||
if remove_selectors and evaluator:
|
if remove_selectors and evaluator:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -170,9 +175,20 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
|
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
||||||
|
context = None
|
||||||
|
|
||||||
|
if self.browser_session:
|
||||||
|
if os.path.exists(self.browser_session):
|
||||||
|
context = browser.new_context(storage_state=self.browser_session)
|
||||||
|
else:
|
||||||
|
logger.warning(f"Session file not found: {self.browser_session}")
|
||||||
|
|
||||||
|
if context is None:
|
||||||
|
context = browser.new_context()
|
||||||
|
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
page = browser.new_page()
|
page = context.new_page()
|
||||||
response = page.goto(url)
|
response = page.goto(url)
|
||||||
if response is None:
|
if response is None:
|
||||||
raise ValueError(f"page.goto() returned None for url {url}")
|
raise ValueError(f"page.goto() returned None for url {url}")
|
||||||
@ -180,6 +196,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
page.wait_for_load_state("load")
|
page.wait_for_load_state("load")
|
||||||
|
|
||||||
text = self.evaluator.evaluate(page, browser, response)
|
text = self.evaluator.evaluate(page, browser, response)
|
||||||
|
page.close()
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
yield Document(page_content=text, metadata=metadata)
|
yield Document(page_content=text, metadata=metadata)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -211,9 +228,22 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
|
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
||||||
|
context = None
|
||||||
|
|
||||||
|
if self.browser_session:
|
||||||
|
if os.path.exists(self.browser_session):
|
||||||
|
context = await browser.new_context(
|
||||||
|
storage_state=self.browser_session
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.warning(f"Session file not found: {self.browser_session}")
|
||||||
|
|
||||||
|
if context is None:
|
||||||
|
context = await browser.new_context()
|
||||||
|
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
page = await browser.new_page()
|
page = await context.new_page()
|
||||||
response = await page.goto(url)
|
response = await page.goto(url)
|
||||||
if response is None:
|
if response is None:
|
||||||
raise ValueError(f"page.goto() returned None for url {url}")
|
raise ValueError(f"page.goto() returned None for url {url}")
|
||||||
@ -221,6 +251,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
await page.wait_for_load_state("load")
|
await page.wait_for_load_state("load")
|
||||||
|
|
||||||
text = await self.evaluator.evaluate_async(page, browser, response)
|
text = await self.evaluator.evaluate_async(page, browser, response)
|
||||||
|
await page.close()
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
yield Document(page_content=text, metadata=metadata)
|
yield Document(page_content=text, metadata=metadata)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
Loading…
Reference in New Issue
Block a user