From d98b830e4b965997299d04e2aea769b01b856cee Mon Sep 17 00:00:00 2001 From: "clement.l" Date: Sat, 20 Jul 2024 21:18:02 +0800 Subject: [PATCH] community: add flag to toggle progress bar (#24463) - **Description:** Add a `show_progress` flag (default `True`) to `WebBaseLoader`, `BlackboardLoader`, and `GitbookLoader` to control whether a progress bar is shown while pages are fetched - **Issue:** n/a - **Dependencies:** n/a - **Twitter handle:** n/a --------- Co-authored-by: Chester Curme --- .../document_loaders/blackboard.py | 6 +++++- .../document_loaders/gitbook.py | 8 +++++++- .../document_loaders/web_base.py | 15 +++++++++++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/blackboard.py b/libs/community/langchain_community/document_loaders/blackboard.py index 0141f2af916..d2e6b1e0a4d 100644 --- a/libs/community/langchain_community/document_loaders/blackboard.py +++ b/libs/community/langchain_community/document_loaders/blackboard.py @@ -41,6 +41,7 @@ class BlackboardLoader(WebBaseLoader): basic_auth: Optional[Tuple[str, str]] = None, cookies: Optional[dict] = None, continue_on_failure: bool = False, + show_progress: bool = True, ): """Initialize with blackboard course url. @@ -56,12 +57,15 @@ class BlackboardLoader(WebBaseLoader): occurs loading a url, emitting a warning instead of raising an exception. Setting this to True makes the loader more robust, but also may result in missing data. Default: False + show_progress: whether to show a progress bar while loading. Default: True Raises: ValueError: If blackboard course url is invalid. 
""" super().__init__( - web_paths=(blackboard_course_url), continue_on_failure=continue_on_failure + web_paths=(blackboard_course_url), + continue_on_failure=continue_on_failure, + show_progress=show_progress, ) # Get base url try: diff --git a/libs/community/langchain_community/document_loaders/gitbook.py b/libs/community/langchain_community/document_loaders/gitbook.py index 8fdf7370896..816d68c33f6 100644 --- a/libs/community/langchain_community/document_loaders/gitbook.py +++ b/libs/community/langchain_community/document_loaders/gitbook.py @@ -20,6 +20,7 @@ class GitbookLoader(WebBaseLoader): base_url: Optional[str] = None, content_selector: str = "main", continue_on_failure: bool = False, + show_progress: bool = True, ): """Initialize with web page and whether to load all paths. @@ -36,6 +37,7 @@ class GitbookLoader(WebBaseLoader): occurs loading a url, emitting a warning instead of raising an exception. Setting this to True makes the loader more robust, but also may result in missing data. Default: False + show_progress: whether to show a progress bar while loading. 
Default: True """ self.base_url = base_url or web_page if self.base_url.endswith("/"): @@ -43,7 +45,11 @@ class GitbookLoader(WebBaseLoader): if load_all_paths: # set web_path to the sitemap if we want to crawl all paths web_page = f"{self.base_url}/sitemap.xml" - super().__init__(web_paths=(web_page,), continue_on_failure=continue_on_failure) + super().__init__( + web_paths=(web_page,), + continue_on_failure=continue_on_failure, + show_progress=show_progress, + ) self.load_all_paths = load_all_paths self.content_selector = content_selector diff --git a/libs/community/langchain_community/document_loaders/web_base.py b/libs/community/langchain_community/document_loaders/web_base.py index a086a135ab2..f41c731d09c 100644 --- a/libs/community/langchain_community/document_loaders/web_base.py +++ b/libs/community/langchain_community/document_loaders/web_base.py @@ -58,6 +58,8 @@ class WebBaseLoader(BaseLoader): bs_get_text_kwargs: Optional[Dict[str, Any]] = None, bs_kwargs: Optional[Dict[str, Any]] = None, session: Any = None, + *, + show_progress: bool = True, ) -> None: """Initialize loader. @@ -69,6 +71,7 @@ class WebBaseLoader(BaseLoader): raise_for_status: Raise an exception if http status code denotes an error. bs_get_text_kwargs: kwargs for beatifulsoup4 get_text bs_kwargs: kwargs for beatifulsoup4 web page parsing + show_progress: Show progress bar when loading pages. """ # web_path kept for backwards-compatibility. 
if web_path and web_paths: @@ -91,6 +94,7 @@ class WebBaseLoader(BaseLoader): self.default_parser = default_parser self.requests_kwargs = requests_kwargs or {} self.raise_for_status = raise_for_status + self.show_progress = show_progress self.bs_get_text_kwargs = bs_get_text_kwargs or {} self.bs_kwargs = bs_kwargs or {} if session: @@ -177,11 +181,14 @@ class WebBaseLoader(BaseLoader): task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore)) tasks.append(task) try: - from tqdm.asyncio import tqdm_asyncio + if self.show_progress: + from tqdm.asyncio import tqdm_asyncio - return await tqdm_asyncio.gather( - *tasks, desc="Fetching pages", ascii=True, mininterval=1 - ) + return await tqdm_asyncio.gather( + *tasks, desc="Fetching pages", ascii=True, mininterval=1 + ) + else: + return await asyncio.gather(*tasks) except ImportError: warnings.warn("For better logging of progress, `pip install tqdm`") return await asyncio.gather(*tasks)