Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-19 05:13:46 +00:00
community[patch]: SitemapLoader restrict depth of parsing sitemap (CVE-2024-2965) (#22903)
This PR restricts the recursion depth to which nested sitemaps are parsed. Fix for: CVE-2024-2965
This commit is contained in:
parent 4a77a3ab19
commit 9a877c7adb
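The patch adds a max_depth argument (default: 10) that caps how many levels of nested sitemap indexes parse_sitemap will follow, so a sitemap that references itself, or a long chain of sitemap indexes, can no longer drive unbounded recursion. A minimal usage sketch, assuming the langchain_community package this patch targets (the URL is a placeholder):

    from langchain_community.document_loaders.sitemap import SitemapLoader

    # max_depth caps how many levels of nested <sitemap> index files are
    # followed while the sitemap tree is parsed. Default: 10.
    loader = SitemapLoader(
        web_path="https://example.com/sitemap.xml",
        max_depth=3,
    )
    docs = loader.load()

The full diff follows.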
@@ -1,6 +1,16 @@
 import itertools
 import re
-from typing import Any, Callable, Generator, Iterable, Iterator, List, Optional, Tuple
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+)
 from urllib.parse import urlparse
 
 from langchain_core.documents import Document
@@ -75,6 +85,7 @@ class SitemapLoader(WebBaseLoader):
         is_local: bool = False,
         continue_on_failure: bool = False,
         restrict_to_same_domain: bool = True,
+        max_depth: int = 10,
         **kwargs: Any,
     ):
         """Initialize with webpage path and optional filter URLs.
@@ -105,6 +116,7 @@ class SitemapLoader(WebBaseLoader):
             restrict_to_same_domain: whether to restrict loading to URLs to the same
                 domain as the sitemap. Attention: This is only applied if the sitemap
                 is not a local file!
+            max_depth: maximum depth to follow sitemap links. Default: 10
         """
 
         if blocksize is not None and blocksize < 1:
@@ -134,17 +146,23 @@ class SitemapLoader(WebBaseLoader):
         self.blocknum = blocknum
         self.is_local = is_local
         self.continue_on_failure = continue_on_failure
+        self.max_depth = max_depth
 
-    def parse_sitemap(self, soup: Any) -> List[dict]:
+    def parse_sitemap(self, soup: Any, *, depth: int = 0) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts.
 
         Args:
             soup: BeautifulSoup object.
+            depth: current depth of the sitemap. Default: 0
 
         Returns:
             List of dicts.
         """
-        els = []
+        if depth >= self.max_depth:
+            return []
+
+        els: List[Dict] = []
+
         for url in soup.find_all("url"):
             loc = url.find("loc")
             if not loc:
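The guard added above is the core of the fix: parse_sitemap now bails out once the current depth reaches self.max_depth instead of recursing without bound. A standalone sketch of the same pattern, using hypothetical names (parse, node) rather than the library code, shows that even a self-referencing sitemap index now terminates:

    from typing import Any, Dict, List

    def parse(node: Dict[str, Any], max_depth: int, *, depth: int = 0) -> List[Dict]:
        if depth >= max_depth:  # the CVE-2024-2965 fix: stop instead of recursing forever
            return []
        els: List[Dict] = [{"loc": node["loc"]}]
        for child in node.get("children", []):
            els.extend(parse(child, max_depth, depth=depth + 1))
        return els

    # A sitemap index that references itself: without the guard, this
    # recursion would never terminate.
    cyclic: Dict[str, Any] = {"loc": "https://example.com/sitemap.xml", "children": []}
    cyclic["children"].append(cyclic)
    print(len(parse(cyclic, max_depth=10)))  # 10 -- one entry per level, then the guard trips

The final hunk threads the depth through the recursive call: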
@@ -177,9 +195,9 @@ class SitemapLoader(WebBaseLoader):
             loc = sitemap.find("loc")
             if not loc:
                 continue
-            soup_child = self.scrape_all([loc.text], "xml")[0]
 
-            els.extend(self.parse_sitemap(soup_child))
+            soup_child = self.scrape_all([loc.text], "xml")[0]
+            els.extend(self.parse_sitemap(soup_child, depth=depth + 1))
         return els
 
     def lazy_load(self) -> Iterator[Document]:
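Note that depth is keyword-only (the bare * in the new signature), so existing positional call sites elsewhere in the loader (e.g. in lazy_load) keep working unchanged, while the recursive call opts in explicitly with depth=depth + 1. A tiny sketch of that behavior, with a stub standing in for the real method:

    from typing import Any, List

    def parse_sitemap(soup: Any, *, depth: int = 0) -> List[dict]:
        # Stub with the same signature as the patched method.
        return [{"depth": depth}]

    print(parse_sitemap("<xml/>"))           # [{'depth': 0}] -- old call sites unchanged
    print(parse_sitemap("<xml/>", depth=3))  # [{'depth': 3}] -- explicit opt-in
    try:
        parse_sitemap("<xml/>", 3)           # positional depth is rejected
    except TypeError as err:
        print(err)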