mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-03 10:12:33 +00:00
community[minor]: Implement lazy_load() for GitbookLoader (#18670)
Integration test: `tests/integration_tests/document_loaders/test_gitbook.py`
This commit is contained in:
parent
81985b31e6
commit
20794bb889
@ -1,4 +1,4 @@
|
|||||||
from typing import Any, List, Optional
|
from typing import Any, Iterator, List, Optional
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
@ -47,23 +47,23 @@ class GitbookLoader(WebBaseLoader):
|
|||||||
self.load_all_paths = load_all_paths
|
self.load_all_paths = load_all_paths
|
||||||
self.content_selector = content_selector
|
self.content_selector = content_selector
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Fetch text from one single GitBook page."""
|
"""Fetch text from one single GitBook page."""
|
||||||
if self.load_all_paths:
|
if self.load_all_paths:
|
||||||
soup_info = self.scrape()
|
soup_info = self.scrape()
|
||||||
relative_paths = self._get_paths(soup_info)
|
relative_paths = self._get_paths(soup_info)
|
||||||
urls = [urljoin(self.base_url, path) for path in relative_paths]
|
urls = [urljoin(self.base_url, path) for path in relative_paths]
|
||||||
soup_infos = self.scrape_all(urls)
|
soup_infos = self.scrape_all(urls)
|
||||||
_documents = [
|
for soup_info, url in zip(soup_infos, urls):
|
||||||
self._get_document(soup_info, url)
|
doc = self._get_document(soup_info, url)
|
||||||
for soup_info, url in zip(soup_infos, urls)
|
if doc:
|
||||||
]
|
yield doc
|
||||||
|
|
||||||
else:
|
else:
|
||||||
soup_info = self.scrape()
|
soup_info = self.scrape()
|
||||||
_documents = [self._get_document(soup_info, self.web_path)]
|
doc = self._get_document(soup_info, self.web_path)
|
||||||
documents = [d for d in _documents if d]
|
if doc:
|
||||||
|
yield doc
|
||||||
return documents
|
|
||||||
|
|
||||||
def _get_document(
|
def _get_document(
|
||||||
self, soup: Any, custom_url: Optional[str] = None
|
self, soup: Any, custom_url: Optional[str] = None
|
||||||
|
Loading…
Reference in New Issue
Block a user