diff --git a/libs/community/langchain_community/document_loaders/sitemap.py b/libs/community/langchain_community/document_loaders/sitemap.py
index 77da27ec172..757a3b182c1 100644
--- a/libs/community/langchain_community/document_loaders/sitemap.py
+++ b/libs/community/langchain_community/document_loaders/sitemap.py
@@ -1,6 +1,16 @@
 import itertools
 import re
-from typing import Any, Callable, Generator, Iterable, Iterator, List, Optional, Tuple
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+)
 from urllib.parse import urlparse
 
 from langchain_core.documents import Document
@@ -75,6 +85,7 @@ class SitemapLoader(WebBaseLoader):
         is_local: bool = False,
         continue_on_failure: bool = False,
         restrict_to_same_domain: bool = True,
+        max_depth: int = 10,
         **kwargs: Any,
     ):
         """Initialize with webpage path and optional filter URLs.
@@ -105,6 +116,7 @@ class SitemapLoader(WebBaseLoader):
             restrict_to_same_domain: whether to restrict loading to URLs to the same
                 domain as the sitemap. Attention: This is only applied if the sitemap
                 is not a local file!
+            max_depth: maximum depth to follow sitemap links. Default: 10
         """
 
         if blocksize is not None and blocksize < 1:
@@ -134,17 +146,23 @@ class SitemapLoader(WebBaseLoader):
         self.blocknum = blocknum
         self.is_local = is_local
         self.continue_on_failure = continue_on_failure
+        self.max_depth = max_depth
 
-    def parse_sitemap(self, soup: Any) -> List[dict]:
+    def parse_sitemap(self, soup: Any, *, depth: int = 0) -> List[dict]:
         """Parse sitemap xml and load into a list of dicts.
 
         Args:
             soup: BeautifulSoup object.
+            depth: current depth of the sitemap. Default: 0
         Returns:
             List of dicts.
         """
-        els = []
+        if depth >= self.max_depth:
+            return []
+
+        els: List[Dict] = []
+
         for url in soup.find_all("url"):
             loc = url.find("loc")
             if not loc:
                 continue
@@ -177,9 +195,9 @@ class SitemapLoader(WebBaseLoader):
             loc = sitemap.find("loc")
             if not loc:
                 continue
-            soup_child = self.scrape_all([loc.text], "xml")[0]
-
-            els.extend(self.parse_sitemap(soup_child))
+            soup_child = self.scrape_all([loc.text], "xml")[0]
+
+            els.extend(self.parse_sitemap(soup_child, depth=depth + 1))
         return els
 
     def lazy_load(self) -> Iterator[Document]:
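
Usage note (not part of the patch; a minimal sketch with a placeholder sitemap URL and an arbitrary depth value):

    from langchain_community.document_loaders.sitemap import SitemapLoader

    # With this change, nested <sitemap> index entries are followed at most
    # `max_depth` levels deep, so a sitemap index that points back at itself
    # (maliciously or by misconfiguration) can no longer recurse without bound.
    loader = SitemapLoader(
        "https://example.com/sitemap.xml",  # placeholder URL
        max_depth=3,  # follow nested sitemap indexes at most 3 levels deep
    )
    docs = loader.load()

Once the limit is reached, parse_sitemap() returns an empty list for that subtree, so loading degrades gracefully rather than overflowing the call stack.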