mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 13:23:35 +00:00
community[proxy]: Enhancement/add proxy support playwrighturlloader 16751 (#16822)
- **Description:** Enhancement/add proxy support playwrighturlloader 16751
- **Issue:** [Enhancement: Add Proxy Support to PlaywrightURLLoader Class](https://github.com/langchain-ai/langchain/issues/16751)
- **Dependencies:**
- **Twitter handle:** @ootR77013489

---------

Co-authored-by: root <root@ip-172-31-46-160.ap-southeast-1.compute.internal>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
e3b775e035
commit
c454dc36fc
@ -2,7 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import TYPE_CHECKING, List, Optional
|
from typing import TYPE_CHECKING, Dict, List, Optional
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@ -111,6 +111,22 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
urls (List[str]): List of URLs to load.
|
urls (List[str]): List of URLs to load.
|
||||||
continue_on_failure (bool): If True, continue loading other URLs on failure.
|
continue_on_failure (bool): If True, continue loading other URLs on failure.
|
||||||
headless (bool): If True, the browser will run in headless mode.
|
headless (bool): If True, the browser will run in headless mode.
|
||||||
|
proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
|
||||||
|
through the specified proxy.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from langchain_community.document_loaders import PlaywrightURLLoader
|
||||||
|
|
||||||
|
urls = ["https://api.ipify.org/?format=json",]
|
||||||
|
proxy={
|
||||||
|
"server": "https://xx.xx.xx:15818", # https://<host>:<port>
|
||||||
|
"username": "username",
|
||||||
|
"password": "password"
|
||||||
|
}
|
||||||
|
loader = PlaywrightURLLoader(urls, proxy=proxy)
|
||||||
|
data = loader.load()
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -120,6 +136,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
headless: bool = True,
|
headless: bool = True,
|
||||||
remove_selectors: Optional[List[str]] = None,
|
remove_selectors: Optional[List[str]] = None,
|
||||||
evaluator: Optional[PlaywrightEvaluator] = None,
|
evaluator: Optional[PlaywrightEvaluator] = None,
|
||||||
|
proxy: Optional[Dict[str, str]] = None,
|
||||||
):
|
):
|
||||||
"""Load a list of URLs using Playwright."""
|
"""Load a list of URLs using Playwright."""
|
||||||
try:
|
try:
|
||||||
@ -133,6 +150,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
self.urls = urls
|
self.urls = urls
|
||||||
self.continue_on_failure = continue_on_failure
|
self.continue_on_failure = continue_on_failure
|
||||||
self.headless = headless
|
self.headless = headless
|
||||||
|
self.proxy = proxy
|
||||||
|
|
||||||
if remove_selectors and evaluator:
|
if remove_selectors and evaluator:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -153,7 +171,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
docs: List[Document] = list()
|
docs: List[Document] = list()
|
||||||
|
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
browser = p.chromium.launch(headless=self.headless)
|
browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
page = browser.new_page()
|
page = browser.new_page()
|
||||||
@ -186,7 +204,7 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
docs: List[Document] = list()
|
docs: List[Document] = list()
|
||||||
|
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browser = await p.chromium.launch(headless=self.headless)
|
browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
page = await browser.new_page()
|
page = await browser.new_page()
|
||||||
|
Loading…
Reference in New Issue
Block a user