Mirror of https://github.com/hwchase17/langchain.git (synced 2025-07-19 19:11:33 +00:00)
Add security note to recursive url loader (#11934)

Add security note to recursive loader

parent 42dcc502c7
commit 9ecb7240a4
@@ -49,7 +49,36 @@ def _metadata_extractor(raw_html: str, url: str) -> dict:
 
 
 class RecursiveUrlLoader(BaseLoader):
-    """Load all child links from a URL page."""
+    """Load all child links from a URL page.
+
+    **Security Note**: This loader is a crawler that will start crawling
+        at a given URL and then expand to crawl child links recursively.
+
+        Web crawlers should generally NOT be deployed with network access
+        to any internal servers.
+
+        Control access to who can submit crawling requests and what network access
+        the crawler has.
+
+        While crawling, the crawler may encounter malicious URLs that would lead to a
+        server-side request forgery (SSRF) attack.
+
+        To mitigate risks, the crawler by default will only load URLs from the same
+        domain as the start URL (controlled via prevent_outside named argument).
+
+        This will mitigate the risk of SSRF attacks, but will not eliminate it.
+
+        For example, if crawling a host which hosts several sites:
+
+        https://some_host/alice_site/
+        https://some_host/bob_site/
+
+        A malicious URL on Alice's site could cause the crawler to make a malicious
+        GET request to an endpoint on Bob's site. Both sites are hosted on the
+        same host, so such a request would not be prevented by default.
+
+        See https://python.langchain.com/docs/security
+    """
 
     def __init__(
         self,
@@ -60,12 +89,13 @@ class RecursiveUrlLoader(BaseLoader):
         metadata_extractor: Optional[Callable[[str, str], str]] = None,
         exclude_dirs: Optional[Sequence[str]] = (),
         timeout: Optional[int] = 10,
-        prevent_outside: Optional[bool] = True,
+        prevent_outside: bool = True,
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
 
         Args:
             url: The URL to crawl.
             max_depth: The max depth of the recursive loading.
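For context, a minimal usage sketch of the loader with the default same-domain restriction. The start URL and max_depth values here are illustrative, not taken from the commit:

from langchain.document_loaders import RecursiveUrlLoader

# Start crawling at the given URL; with prevent_outside=True (the default),
# links outside the start URL's domain are skipped rather than fetched.
loader = RecursiveUrlLoader(
    url="https://docs.python.org/3.9/",
    max_depth=2,
    prevent_outside=True,
)
docs = loader.load()
print(len(docs))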
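To make the alice_site/bob_site example concrete, here is a hypothetical sketch of a host-level check in the spirit of prevent_outside. This is not the library's actual filtering code; it only illustrates why two sites on the same host are not isolated from each other by a same-domain rule:

from urllib.parse import urlparse

def same_host(start_url: str, candidate: str) -> bool:
    # Hypothetical check: allow only URLs whose host matches the start URL's.
    return urlparse(candidate).netloc == urlparse(start_url).netloc

start = "https://some_host/alice_site/"

# Blocked: a different host, e.g. an internal server.
print(same_host(start, "https://internal.example/admin"))  # False

# Allowed: same host but a different site. This is the residual
# SSRF risk the new docstring describes.
print(same_host(start, "https://some_host/bob_site/endpoint"))  # True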