community[minor]: Implement lazy_load() for GitbookLoader (#18670)

Integration test:
`tests/integration_tests/document_loaders/test_gitbook.py`
Christophe Bornet 2024-03-06 15:14:36 +01:00 committed by GitHub
parent 81985b31e6
commit 20794bb889

@@ -1,4 +1,4 @@
-from typing import Any, List, Optional
+from typing import Any, Iterator, List, Optional
 from urllib.parse import urljoin, urlparse

 from langchain_core.documents import Document
@@ -47,23 +47,23 @@ class GitbookLoader(WebBaseLoader):
         self.load_all_paths = load_all_paths
         self.content_selector = content_selector

-    def load(self) -> List[Document]:
+    def lazy_load(self) -> Iterator[Document]:
         """Fetch text from one single GitBook page."""
         if self.load_all_paths:
             soup_info = self.scrape()
             relative_paths = self._get_paths(soup_info)
             urls = [urljoin(self.base_url, path) for path in relative_paths]
             soup_infos = self.scrape_all(urls)
-            _documents = [
-                self._get_document(soup_info, url)
-                for soup_info, url in zip(soup_infos, urls)
-            ]
+            for soup_info, url in zip(soup_infos, urls):
+                doc = self._get_document(soup_info, url)
+                if doc:
+                    yield doc
         else:
             soup_info = self.scrape()
-            _documents = [self._get_document(soup_info, self.web_path)]
-        documents = [d for d in _documents if d]
-        return documents
+            doc = self._get_document(soup_info, self.web_path)
+            if doc:
+                yield doc

     def _get_document(
         self, soup: Any, custom_url: Optional[str] = None
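
With `lazy_load()`, documents are yielded one at a time as pages are scraped, rather than collected into a list up front; the base loader's `load()` can still materialize the iterator into a list, so existing callers keep working. A minimal usage sketch (the URL is a placeholder, and the `source` metadata key is assumed from the loader's existing `_get_document` behavior):

```python
from langchain_community.document_loaders import GitbookLoader

# Placeholder URL: substitute the GitBook site you want to load.
loader = GitbookLoader("https://docs.gitbook.com", load_all_paths=True)

# lazy_load() yields each Document as its page is scraped and filtered,
# so a large site does not have to be held in memory all at once.
for doc in loader.lazy_load():
    print(doc.metadata.get("source"), len(doc.page_content))

# load() still returns List[Document]; it simply collects the iterator.
docs = loader.load()
```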