mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 15:19:33 +00:00
community[minor]: Implement lazy_load() for SitemapLoader (#18667)
Relevant integration tests: `test_sitemap.py` and `test_docusaurus.py`.
This commit is contained in:
parent
623dfcc55c
commit
ae167fb5b2
@ -1,6 +1,6 @@
|
|||||||
import itertools
|
import itertools
|
||||||
import re
|
import re
|
||||||
from typing import Any, Callable, Generator, Iterable, List, Optional, Tuple
|
from typing import Any, Callable, Generator, Iterable, Iterator, List, Optional, Tuple
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@ -182,7 +182,7 @@ class SitemapLoader(WebBaseLoader):
|
|||||||
els.extend(self.parse_sitemap(soup_child))
|
els.extend(self.parse_sitemap(soup_child))
|
||||||
return els
|
return els
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load sitemap."""
|
"""Load sitemap."""
|
||||||
if self.is_local:
|
if self.is_local:
|
||||||
try:
|
try:
|
||||||
@ -211,10 +211,8 @@ class SitemapLoader(WebBaseLoader):
|
|||||||
|
|
||||||
results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el])
|
results = self.scrape_all([el["loc"].strip() for el in els if "loc" in el])
|
||||||
|
|
||||||
return [
|
for i, result in enumerate(results):
|
||||||
Document(
|
yield Document(
|
||||||
page_content=self.parsing_function(results[i]),
|
page_content=self.parsing_function(result),
|
||||||
metadata=self.meta_function(els[i], results[i]),
|
metadata=self.meta_function(els[i], result),
|
||||||
)
|
)
|
||||||
for i in range(len(results))
|
|
||||||
]
|
|
||||||
|
Loading…
Reference in New Issue
Block a user