From ae167fb5b2c5dde525b232e07e64719d0743d2d1 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Wed, 6 Mar 2024 15:15:35 +0100 Subject: [PATCH] community[minor]: Implement lazy_load() for SitemapLoader (#18667) Integration tests: `test_sitemap.py` and `test_docusaurus.py` --- .../document_loaders/sitemap.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/sitemap.py b/libs/community/langchain_community/document_loaders/sitemap.py index fe08acb9bb1..77da27ec172 100644 --- a/libs/community/langchain_community/document_loaders/sitemap.py +++ b/libs/community/langchain_community/document_loaders/sitemap.py @@ -1,6 +1,6 @@ import itertools import re -from typing import Any, Callable, Generator, Iterable, List, Optional, Tuple +from typing import Any, Callable, Generator, Iterable, Iterator, List, Optional, Tuple from urllib.parse import urlparse from langchain_core.documents import Document @@ -182,7 +182,7 @@ class SitemapLoader(WebBaseLoader): els.extend(self.parse_sitemap(soup_child)) return els - def load(self) -> List[Document]: + def lazy_load(self) -> Iterator[Document]: """Load sitemap.""" if self.is_local: try: @@ -211,10 +211,8 @@ class SitemapLoader(WebBaseLoader): results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el]) - return [ - Document( - page_content=self.parsing_function(results[i]), - metadata=self.meta_function(els[i], results[i]), + for i, result in enumerate(results): + yield Document( + page_content=self.parsing_function(result), + metadata=self.meta_function(els[i], result), ) - for i in range(len(results)) - ]