community: add flag to toggle progress bar (#24463)

- **Description:** Add a flag to control whether a progress bar is shown while pages are loading
- **Issue:** n/a
- **Dependencies:** n/a
- **Twitter handle:** n/a

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
clement.l 2024-07-20 21:18:02 +08:00 committed by GitHub
parent 6b08a33fa4
commit d98b830e4b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 23 additions and 6 deletions

View File

@ -41,6 +41,7 @@ class BlackboardLoader(WebBaseLoader):
basic_auth: Optional[Tuple[str, str]] = None, basic_auth: Optional[Tuple[str, str]] = None,
cookies: Optional[dict] = None, cookies: Optional[dict] = None,
continue_on_failure: bool = False, continue_on_failure: bool = False,
show_progress: bool = True,
): ):
"""Initialize with blackboard course url. """Initialize with blackboard course url.
@ -56,12 +57,15 @@ class BlackboardLoader(WebBaseLoader):
occurs loading a url, emitting a warning instead of raising an occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but also exception. Setting this to True makes the loader more robust, but also
may result in missing data. Default: False may result in missing data. Default: False
show_progress: whether to show a progress bar while loading. Default: True
Raises: Raises:
ValueError: If blackboard course url is invalid. ValueError: If blackboard course url is invalid.
""" """
super().__init__( super().__init__(
web_paths=(blackboard_course_url), continue_on_failure=continue_on_failure web_paths=(blackboard_course_url),
continue_on_failure=continue_on_failure,
show_progress=show_progress,
) )
# Get base url # Get base url
try: try:

View File

@ -20,6 +20,7 @@ class GitbookLoader(WebBaseLoader):
base_url: Optional[str] = None, base_url: Optional[str] = None,
content_selector: str = "main", content_selector: str = "main",
continue_on_failure: bool = False, continue_on_failure: bool = False,
show_progress: bool = True,
): ):
"""Initialize with web page and whether to load all paths. """Initialize with web page and whether to load all paths.
@ -36,6 +37,7 @@ class GitbookLoader(WebBaseLoader):
occurs loading a url, emitting a warning instead of raising an occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but also exception. Setting this to True makes the loader more robust, but also
may result in missing data. Default: False may result in missing data. Default: False
show_progress: whether to show a progress bar while loading. Default: True
""" """
self.base_url = base_url or web_page self.base_url = base_url or web_page
if self.base_url.endswith("/"): if self.base_url.endswith("/"):
@ -43,7 +45,11 @@ class GitbookLoader(WebBaseLoader):
if load_all_paths: if load_all_paths:
# set web_path to the sitemap if we want to crawl all paths # set web_path to the sitemap if we want to crawl all paths
web_page = f"{self.base_url}/sitemap.xml" web_page = f"{self.base_url}/sitemap.xml"
super().__init__(web_paths=(web_page,), continue_on_failure=continue_on_failure) super().__init__(
web_paths=(web_page,),
continue_on_failure=continue_on_failure,
show_progress=show_progress,
)
self.load_all_paths = load_all_paths self.load_all_paths = load_all_paths
self.content_selector = content_selector self.content_selector = content_selector

View File

@ -58,6 +58,8 @@ class WebBaseLoader(BaseLoader):
bs_get_text_kwargs: Optional[Dict[str, Any]] = None, bs_get_text_kwargs: Optional[Dict[str, Any]] = None,
bs_kwargs: Optional[Dict[str, Any]] = None, bs_kwargs: Optional[Dict[str, Any]] = None,
session: Any = None, session: Any = None,
*,
show_progress: bool = True,
) -> None: ) -> None:
"""Initialize loader. """Initialize loader.
@ -69,6 +71,7 @@ class WebBaseLoader(BaseLoader):
raise_for_status: Raise an exception if http status code denotes an error. raise_for_status: Raise an exception if http status code denotes an error.
bs_get_text_kwargs: kwargs for beatifulsoup4 get_text bs_get_text_kwargs: kwargs for beatifulsoup4 get_text
bs_kwargs: kwargs for beatifulsoup4 web page parsing bs_kwargs: kwargs for beatifulsoup4 web page parsing
show_progress: Show progress bar when loading pages.
""" """
# web_path kept for backwards-compatibility. # web_path kept for backwards-compatibility.
if web_path and web_paths: if web_path and web_paths:
@ -91,6 +94,7 @@ class WebBaseLoader(BaseLoader):
self.default_parser = default_parser self.default_parser = default_parser
self.requests_kwargs = requests_kwargs or {} self.requests_kwargs = requests_kwargs or {}
self.raise_for_status = raise_for_status self.raise_for_status = raise_for_status
self.show_progress = show_progress
self.bs_get_text_kwargs = bs_get_text_kwargs or {} self.bs_get_text_kwargs = bs_get_text_kwargs or {}
self.bs_kwargs = bs_kwargs or {} self.bs_kwargs = bs_kwargs or {}
if session: if session:
@ -177,11 +181,14 @@ class WebBaseLoader(BaseLoader):
task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore)) task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
tasks.append(task) tasks.append(task)
try: try:
if self.show_progress:
from tqdm.asyncio import tqdm_asyncio from tqdm.asyncio import tqdm_asyncio
return await tqdm_asyncio.gather( return await tqdm_asyncio.gather(
*tasks, desc="Fetching pages", ascii=True, mininterval=1 *tasks, desc="Fetching pages", ascii=True, mininterval=1
) )
else:
return await asyncio.gather(*tasks)
except ImportError: except ImportError:
warnings.warn("For better logging of progress, `pip install tqdm`") warnings.warn("For better logging of progress, `pip install tqdm`")
return await asyncio.gather(*tasks) return await asyncio.gather(*tasks)