community[proxy]: Add proxy support to PlaywrightURLLoader (issue #16751) (#16822)

- **Description:** Add an optional `proxy` argument to `PlaywrightURLLoader`
so that the browser loads URLs through the given proxy server (enhancement request #16751).
- **Issue:** [Enhancement: Add Proxy Support to PlaywrightURLLoader
Class](https://github.com/langchain-ai/langchain/issues/16751)
- **Dependencies:** None
- **Twitter handle:** @ootR77013489

---------

Co-authored-by: root <root@ip-172-31-46-160.ap-southeast-1.compute.internal>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
yin1991 2024-02-13 11:48:29 +08:00 committed by GitHub
parent e3b775e035
commit c454dc36fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -2,7 +2,7 @@
""" """
import logging import logging
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional from typing import TYPE_CHECKING, Dict, List, Optional
from langchain_core.documents import Document from langchain_core.documents import Document
@ -111,6 +111,22 @@ class PlaywrightURLLoader(BaseLoader):
urls (List[str]): List of URLs to load. urls (List[str]): List of URLs to load.
continue_on_failure (bool): If True, continue loading other URLs on failure. continue_on_failure (bool): If True, continue loading other URLs on failure.
headless (bool): If True, the browser will run in headless mode. headless (bool): If True, the browser will run in headless mode.
proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
through the specified proxy.
Example:
.. code-block:: python
from langchain_community.document_loaders import PlaywrightURLLoader
urls = ["https://api.ipify.org/?format=json",]
proxy={
"server": "https://xx.xx.xx:15818", # https://<host>:<port>
"username": "username",
"password": "password"
}
loader = PlaywrightURLLoader(urls, proxy=proxy)
data = loader.load()
""" """
def __init__( def __init__(
@ -120,6 +136,7 @@ class PlaywrightURLLoader(BaseLoader):
headless: bool = True, headless: bool = True,
remove_selectors: Optional[List[str]] = None, remove_selectors: Optional[List[str]] = None,
evaluator: Optional[PlaywrightEvaluator] = None, evaluator: Optional[PlaywrightEvaluator] = None,
proxy: Optional[Dict[str, str]] = None,
): ):
"""Load a list of URLs using Playwright.""" """Load a list of URLs using Playwright."""
try: try:
@ -133,6 +150,7 @@ class PlaywrightURLLoader(BaseLoader):
self.urls = urls self.urls = urls
self.continue_on_failure = continue_on_failure self.continue_on_failure = continue_on_failure
self.headless = headless self.headless = headless
self.proxy = proxy
if remove_selectors and evaluator: if remove_selectors and evaluator:
raise ValueError( raise ValueError(
@ -153,7 +171,7 @@ class PlaywrightURLLoader(BaseLoader):
docs: List[Document] = list() docs: List[Document] = list()
with sync_playwright() as p: with sync_playwright() as p:
browser = p.chromium.launch(headless=self.headless) browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
for url in self.urls: for url in self.urls:
try: try:
page = browser.new_page() page = browser.new_page()
@ -186,7 +204,7 @@ class PlaywrightURLLoader(BaseLoader):
docs: List[Document] = list() docs: List[Document] = list()
async with async_playwright() as p: async with async_playwright() as p:
browser = await p.chromium.launch(headless=self.headless) browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
for url in self.urls: for url in self.urls:
try: try:
page = await browser.new_page() page = await browser.new_page()