mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-08 06:23:20 +00:00
community[minor]: add proxy support to RecursiveUrlLoader (#27364)
**Description**

This PR introduces the `proxies` parameter to the `RecursiveUrlLoader` class, allowing the user to specify proxy servers for requests. This update enables crawling through proxy servers, providing enhanced flexibility for network configurations.

The key changes include:

1. Added an optional `proxies` parameter to the constructor (`__init__`).
2. Updated the documentation to explain the `proxies` parameter usage with an example.
3. Modified the `_get_child_links_recursive` method to pass the `proxies` parameter to the `requests.get` function.

**Sample Usage**

```python
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader

proxies = {
    "http": "http://localhost:1080",
    "https": "http://localhost:1080",
}
url = "https://python.langchain.com/docs/concepts/#langchain-expression-language-lcel"
loader = RecursiveUrlLoader(
    url=url,
    max_depth=1,
    extractor=lambda x: Soup(x, "html.parser").text,
    proxies=proxies,
)
docs = loader.load()
```

---------

Co-authored-by: root <root@thb>
This commit is contained in:
@@ -268,6 +268,7 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
base_url: Optional[str] = None,
|
||||
autoset_encoding: bool = True,
|
||||
encoding: Optional[str] = None,
|
||||
proxies: Optional[dict] = None,
|
||||
) -> None:
|
||||
"""Initialize with URL to crawl and any subdirectories to exclude.
|
||||
|
||||
@@ -313,6 +314,16 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
encoding, unless the `encoding` argument has already been explicitly set.
|
||||
encoding: The encoding of the response. If manually set, the encoding will be
|
||||
set to given value, regardless of the `autoset_encoding` argument.
|
||||
proxies: A dictionary mapping protocol names to the proxy URLs to be used for requests.
|
||||
This allows the crawler to route its requests through specified proxy servers.
|
||||
If None, no proxies will be used and requests will go directly to the target URL.
|
||||
Example usage:
|
||||
.. code-block:: python
|
||||
|
||||
proxies = {
|
||||
"http": "http://10.10.1.10:3128",
|
||||
"https": "https://10.10.1.10:1080",
|
||||
}
|
||||
""" # noqa: E501
|
||||
|
||||
self.url = url
|
||||
@@ -342,6 +353,7 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
self.check_response_status = check_response_status
|
||||
self.continue_on_failure = continue_on_failure
|
||||
self.base_url = base_url if base_url is not None else url
|
||||
self.proxies = proxies
|
||||
|
||||
def _get_child_links_recursive(
|
||||
self, url: str, visited: Set[str], *, depth: int = 0
|
||||
@@ -360,7 +372,9 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
# Get all links that can be accessed from the current URL
|
||||
visited.add(url)
|
||||
try:
|
||||
response = requests.get(url, timeout=self.timeout, headers=self.headers)
|
||||
response = requests.get(
|
||||
url, timeout=self.timeout, headers=self.headers, proxies=self.proxies
|
||||
)
|
||||
|
||||
if self.encoding is not None:
|
||||
response.encoding = self.encoding
|
||||
|
Reference in New Issue
Block a user