mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 23:29:21 +00:00
Add concurrency to GitbookLoader (#7069)
- Description: Fetch all pages concurrently.
- Dependencies: `scrape_all` -> `fetch_all` -> `_fetch_with_rate_limit` -> `_fetch` (might be broken currently: https://github.com/hwchase17/langchain/pull/6519)
- Tag maintainer: @rlancemartin, @eyurtsev

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
6aa66fd2b0
commit
930e319ca7
@@ -49,17 +49,18 @@ class GitbookLoader(WebBaseLoader):
|
|||||||
if self.load_all_paths:
|
if self.load_all_paths:
|
||||||
soup_info = self.scrape()
|
soup_info = self.scrape()
|
||||||
relative_paths = self._get_paths(soup_info)
|
relative_paths = self._get_paths(soup_info)
|
||||||
documents = []
|
urls = [urljoin(self.base_url, path) for path in relative_paths]
|
||||||
for path in relative_paths:
|
soup_infos = self.scrape_all(urls)
|
||||||
url = urljoin(self.base_url, path)
|
_documents = [
|
||||||
print(f"Fetching text from {url}")
|
self._get_document(soup_info, url)
|
||||||
soup_info = self._scrape(url)
|
for soup_info, url in zip(soup_infos, urls)
|
||||||
documents.append(self._get_document(soup_info, url))
|
]
|
||||||
return [d for d in documents if d]
|
|
||||||
else:
|
else:
|
||||||
soup_info = self.scrape()
|
soup_info = self.scrape()
|
||||||
documents = [self._get_document(soup_info, self.web_path)]
|
_documents = [self._get_document(soup_info, self.web_path)]
|
||||||
return [d for d in documents if d]
|
documents = [d for d in _documents if d]
|
||||||
|
|
||||||
|
return documents
|
||||||
|
|
||||||
def _get_document(
|
def _get_document(
|
||||||
self, soup: Any, custom_url: Optional[str] = None
|
self, soup: Any, custom_url: Optional[str] = None
|
||||||
|
Loading…
Reference in New Issue
Block a user