community[patch]: SitemapLoader restrict depth of parsing sitemap (CVE-2024-2965) (#22903)

This PR restricts the depth to which nested sitemaps are parsed: SitemapLoader now accepts a max_depth argument (default 10) and stops following child sitemap entries once that depth is reached.

Fix for: CVE-2024-2965
Eugene Yurtsev 2024-06-14 13:04:40 -04:00 committed by GitHub
parent 4a77a3ab19
commit 9a877c7adb

@@ -1,6 +1,16 @@
 import itertools
 import re
-from typing import Any, Callable, Generator, Iterable, Iterator, List, Optional, Tuple
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+)
 from urllib.parse import urlparse
 
 from langchain_core.documents import Document
@@ -75,6 +85,7 @@ class SitemapLoader(WebBaseLoader):
         is_local: bool = False,
         continue_on_failure: bool = False,
         restrict_to_same_domain: bool = True,
+        max_depth: int = 10,
         **kwargs: Any,
     ):
         """Initialize with webpage path and optional filter URLs.
@@ -105,6 +116,7 @@ class SitemapLoader(WebBaseLoader):
             restrict_to_same_domain: whether to restrict loading to URLs to the same
                 domain as the sitemap. Attention: This is only applied if the sitemap
                 is not a local file!
+            max_depth: maximum depth to follow sitemap links. Default: 10
         """
 
         if blocksize is not None and blocksize < 1:
@@ -134,17 +146,23 @@ class SitemapLoader(WebBaseLoader):
         self.blocknum = blocknum
         self.is_local = is_local
         self.continue_on_failure = continue_on_failure
+        self.max_depth = max_depth
 
-    def parse_sitemap(self, soup: Any) -> List[dict]:
+    def parse_sitemap(self, soup: Any, *, depth: int = 0) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts.
 
         Args:
             soup: BeautifulSoup object.
+            depth: current depth of the sitemap. Default: 0
 
         Returns:
             List of dicts.
         """
-        els = []
+        if depth >= self.max_depth:
+            return []
+
+        els: List[Dict] = []
+
         for url in soup.find_all("url"):
             loc = url.find("loc")
             if not loc:
@@ -177,9 +195,9 @@ class SitemapLoader(WebBaseLoader):
             loc = sitemap.find("loc")
             if not loc:
                 continue
-            soup_child = self.scrape_all([loc.text], "xml")[0]
 
-            els.extend(self.parse_sitemap(soup_child))
+            soup_child = self.scrape_all([loc.text], "xml")[0]
+            els.extend(self.parse_sitemap(soup_child, depth=depth + 1))
         return els
 
     def lazy_load(self) -> Iterator[Document]:
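
For reference, a minimal usage sketch of the new parameter. The sitemap URL and the max_depth value of 3 are illustrative assumptions; the rest is the loader's existing API as shown in the diff above.

from langchain_community.document_loaders.sitemap import SitemapLoader

# Illustrative sitemap URL; substitute a real sitemap.xml.
loader = SitemapLoader(
    "https://example.com/sitemap.xml",
    max_depth=3,  # nested <sitemap> indexes deeper than 3 levels are ignored
)

# parse_sitemap() is called internally while loading; once the recursion
# reaches max_depth it returns an empty list, so a maliciously deep chain
# of sitemap indexes can no longer trigger unbounded fetching.
docs = list(loader.lazy_load())
print(len(docs))

Leaving max_depth at its default of 10 preserves previous behaviour for ordinary sitemaps while still bounding the recursion.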