beautifulsoup get_text kwargs in WebBaseLoader (#6591)

# beautifulsoup get_text kwargs in WebBaseLoader

- Description: this PR introduces an optional `bs_get_text_kwargs`
parameter to the `WebBaseLoader` constructor. It can be used to pass kwargs
to the downstream `BeautifulSoup.get_text` call. The most common usage
might be to pass a custom text separator, as is also done in
`BSHTMLLoader`.
  - Tag maintainer: @rlancemartin, @eyurtsev
  - Twitter handle: jtolgyesi
This commit is contained in:
Janos Tolgyesi 2023-06-25 21:42:27 +02:00 committed by GitHub
parent be68f6f8ce
commit 05eec99269
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -50,6 +50,9 @@ class WebBaseLoader(BaseLoader):
requests_kwargs: Dict[str, Any] = {} requests_kwargs: Dict[str, Any] = {}
"""kwargs for requests""" """kwargs for requests"""
bs_get_text_kwargs: Dict[str, Any] = {}
"""kwargs for beautifulsoup4 get_text"""
def __init__( def __init__(
self, self,
web_path: Union[str, List[str]], web_path: Union[str, List[str]],
@ -201,7 +204,7 @@ class WebBaseLoader(BaseLoader):
"""Lazy load text from the url(s) in web_path.""" """Lazy load text from the url(s) in web_path."""
for path in self.web_paths: for path in self.web_paths:
soup = self._scrape(path) soup = self._scrape(path)
text = soup.get_text() text = soup.get_text(**self.bs_get_text_kwargs)
metadata = _build_metadata(soup, path) metadata = _build_metadata(soup, path)
yield Document(page_content=text, metadata=metadata) yield Document(page_content=text, metadata=metadata)
@ -216,7 +219,7 @@ class WebBaseLoader(BaseLoader):
docs = [] docs = []
for i in range(len(results)): for i in range(len(results)):
soup = results[i] soup = results[i]
text = soup.get_text() text = soup.get_text(**self.bs_get_text_kwargs)
metadata = _build_metadata(soup, self.web_paths[i]) metadata = _build_metadata(soup, self.web_paths[i])
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))