From 930e319ca7418d9bf8f0788438be051129df689c Mon Sep 17 00:00:00 2001 From: Yevgnen Date: Thu, 6 Jul 2023 08:51:10 +0800 Subject: [PATCH] Add concurrency to GitbookLoader (#7069) - Description: Fetch all pages concurrently. - Dependencies: `scrape_all` -> `fetch_all` -> `_fetch_with_rate_limit` -> `_fetch` (might be broken currently: https://github.com/hwchase17/langchain/pull/6519) - Tag maintainer: @rlancemartin, @eyurtsev --------- Co-authored-by: Harrison Chase --- langchain/document_loaders/gitbook.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/langchain/document_loaders/gitbook.py b/langchain/document_loaders/gitbook.py index aa286da4856..f0bf88d7576 100644 --- a/langchain/document_loaders/gitbook.py +++ b/langchain/document_loaders/gitbook.py @@ -49,17 +49,18 @@ class GitbookLoader(WebBaseLoader): if self.load_all_paths: soup_info = self.scrape() relative_paths = self._get_paths(soup_info) - documents = [] - for path in relative_paths: - url = urljoin(self.base_url, path) - print(f"Fetching text from {url}") - soup_info = self._scrape(url) - documents.append(self._get_document(soup_info, url)) - return [d for d in documents if d] + urls = [urljoin(self.base_url, path) for path in relative_paths] + soup_infos = self.scrape_all(urls) + _documents = [ + self._get_document(soup_info, url) + for soup_info, url in zip(soup_infos, urls) + ] else: soup_info = self.scrape() - documents = [self._get_document(soup_info, self.web_path)] - return [d for d in documents if d] + _documents = [self._get_document(soup_info, self.web_path)] + documents = [d for d in _documents if d] + + return documents def _get_document( self, soup: Any, custom_url: Optional[str] = None