From 05eec992693082bde92ad5b00c61309a8d069bbb Mon Sep 17 00:00:00 2001 From: Janos Tolgyesi Date: Sun, 25 Jun 2023 21:42:27 +0200 Subject: [PATCH] beautifulsoup get_text kwargs in WebBaseLoader (#6591) # beautifulsoup get_text kwargs in WebBaseLoader - Description: this PR introduces an optional `bs_get_text_kwargs` attribute on `WebBaseLoader`. It can be used to pass keyword arguments to the downstream `BeautifulSoup.get_text` call. The most common use is likely to be passing a custom text separator, as `BSHTMLLoader` also does. - Tag maintainer: @rlancemartin, @eyurtsev - Twitter handle: jtolgyesi --- langchain/document_loaders/web_base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py index 07a10001df2..8ea5d749117 100644 --- a/langchain/document_loaders/web_base.py +++ b/langchain/document_loaders/web_base.py @@ -50,6 +50,9 @@ class WebBaseLoader(BaseLoader): requests_kwargs: Dict[str, Any] = {} """kwargs for requests""" + bs_get_text_kwargs: Dict[str, Any] = {} + """kwargs for beatifulsoup4 get_text""" + def __init__( self, web_path: Union[str, List[str]], @@ -201,7 +204,7 @@ class WebBaseLoader(BaseLoader): """Lazy load text from the url(s) in web_path.""" for path in self.web_paths: soup = self._scrape(path) - text = soup.get_text() + text = soup.get_text(**self.bs_get_text_kwargs) metadata = _build_metadata(soup, path) yield Document(page_content=text, metadata=metadata) @@ -216,7 +219,7 @@ class WebBaseLoader(BaseLoader): docs = [] for i in range(len(results)): soup = results[i] - text = soup.get_text() + text = soup.get_text(**self.bs_get_text_kwargs) metadata = _build_metadata(soup, self.web_paths[i]) docs.append(Document(page_content=text, metadata=metadata))