community[minor]: Implement lazy_load() for SitemapLoader (#18667)

Integration tests: `test_sitemap.py` and `test_docusaurus.py`
This commit is contained in:
Christophe Bornet 2024-03-06 15:15:35 +01:00 committed by GitHub
parent 623dfcc55c
commit ae167fb5b2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,6 +1,6 @@
import itertools
import re
-from typing import Any, Callable, Generator, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Generator, Iterable, Iterator, List, Optional, Tuple
from urllib.parse import urlparse
from langchain_core.documents import Document
@@ -182,7 +182,7 @@ class SitemapLoader(WebBaseLoader):
els.extend(self.parse_sitemap(soup_child))
return els
-def load(self) -> List[Document]:
+def lazy_load(self) -> Iterator[Document]:
"""Load sitemap."""
if self.is_local:
try:
@@ -211,10 +211,8 @@ class SitemapLoader(WebBaseLoader):
results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el])
-return [
-Document(
-page_content=self.parsing_function(results[i]),
-metadata=self.meta_function(els[i], results[i]),
+for i, result in enumerate(results):
+yield Document(
+page_content=self.parsing_function(result),
+metadata=self.meta_function(els[i], result),
 )
-for i in range(len(results))
-]