community[minor]: Implement lazy_load() for SitemapLoader (#18667)

Integration tests: `test_sitemap.py` and `test_docusaurus.py`
This commit is contained in:
Christophe Bornet 2024-03-06 15:15:35 +01:00 committed by GitHub
parent 623dfcc55c
commit ae167fb5b2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,6 +1,6 @@
import itertools import itertools
import re import re
from typing import Any, Callable, Generator, Iterable, List, Optional, Tuple from typing import Any, Callable, Generator, Iterable, Iterator, List, Optional, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
from langchain_core.documents import Document from langchain_core.documents import Document
@@ -182,7 +182,7 @@ class SitemapLoader(WebBaseLoader):
els.extend(self.parse_sitemap(soup_child)) els.extend(self.parse_sitemap(soup_child))
return els return els
def load(self) -> List[Document]: def lazy_load(self) -> Iterator[Document]:
"""Load sitemap.""" """Load sitemap."""
if self.is_local: if self.is_local:
try: try:
@@ -211,10 +211,8 @@ class SitemapLoader(WebBaseLoader):
results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el]) results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el])
return [ for i, result in enumerate(results):
Document( yield Document(
page_content=self.parsing_function(results[i]), page_content=self.parsing_function(result),
metadata=self.meta_function(els[i], results[i]), metadata=self.meta_function(els[i], result),
) )
for i in range(len(results))
]