diff --git a/docs/docs/security.md b/docs/docs/security.md index e583642c66a..08e841c89a2 100644 --- a/docs/docs/security.md +++ b/docs/docs/security.md @@ -6,7 +6,7 @@ LangChain has a large ecosystem of integrations with various external resources When building such applications developers should remember to follow good security practices: -* [**Limit Permissions**](https://en.wikipedia.org/wiki/Principle_of_least_privilege): Scope permissions specifically to the application's need. Granting broad or excessive permissions can introduce significant security vulnerabilities. To avoid such vulnerabilities, consider using read-only credentials, disallowing access to sensitive resources, using sandboxing techniques (such as running inside a container), etc. as appropriate for your application. +* [**Limit Permissions**](https://en.wikipedia.org/wiki/Principle_of_least_privilege): Scope permissions specifically to the application's need. Granting broad or excessive permissions can introduce significant security vulnerabilities. To avoid such vulnerabilities, consider using read-only credentials, disallowing access to sensitive resources, using sandboxing techniques (such as running inside a container), specifying proxy configurations to control external requests, etc. as appropriate for your application. * **Anticipate Potential Misuse**: Just as humans can err, so can Large Language Models (LLMs). Always assume that any system access or credentials may be used in any way allowed by the permissions they are assigned. For example, if a pair of database credentials allows deleting data, it’s safest to assume that any LLM able to use those credentials may in fact delete data. * [**Defense in Depth**](https://en.wikipedia.org/wiki/Defense_in_depth_(computing)): No security technique is perfect. Fine-tuning and good chain design can reduce, but not eliminate, the odds that a Large Language Model (LLM) may make a mistake. 
It’s best to combine multiple layered security approaches rather than relying on any single layer of defense to ensure security. For example: use both read-only permissions and sandboxing to ensure that LLMs are only able to access data that is explicitly meant for them to use. diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py index d2043444747..ebf119bafc2 100644 --- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py +++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py @@ -268,6 +268,7 @@ class RecursiveUrlLoader(BaseLoader): base_url: Optional[str] = None, autoset_encoding: bool = True, encoding: Optional[str] = None, + proxies: Optional[dict] = None, ) -> None: """Initialize with URL to crawl and any subdirectories to exclude. @@ -313,6 +314,16 @@ class RecursiveUrlLoader(BaseLoader): encoding, unless the `encoding` argument has already been explicitly set. encoding: The encoding of the response. If manually set, the encoding will be set to given value, regardless of the `autoset_encoding` argument. + proxies: A dictionary mapping protocol names to the proxy URLs to be used for requests. + This allows the crawler to route its requests through specified proxy servers. + If None, no proxies are passed explicitly; note that `requests` may still pick up proxy settings from environment variables such as `HTTP_PROXY` / `HTTPS_PROXY`.
+ Example usage: + .. code-block:: python + + proxies = { + "http": "http://10.10.1.10:3128", + "https": "https://10.10.1.10:1080", + } """ # noqa: E501 self.url = url @@ -342,6 +353,7 @@ class RecursiveUrlLoader(BaseLoader): self.check_response_status = check_response_status self.continue_on_failure = continue_on_failure self.base_url = base_url if base_url is not None else url + self.proxies = proxies def _get_child_links_recursive( self, url: str, visited: Set[str], *, depth: int = 0 @@ -360,7 +372,9 @@ class RecursiveUrlLoader(BaseLoader): # Get all links that can be accessed from the current URL visited.add(url) try: - response = requests.get(url, timeout=self.timeout, headers=self.headers) + response = requests.get( + url, timeout=self.timeout, headers=self.headers, proxies=self.proxies + ) if self.encoding is not None: response.encoding = self.encoding