diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py index 07a10001df2..8ea5d749117 100644 --- a/langchain/document_loaders/web_base.py +++ b/langchain/document_loaders/web_base.py @@ -50,6 +50,9 @@ class WebBaseLoader(BaseLoader): requests_kwargs: Dict[str, Any] = {} """kwargs for requests""" + bs_get_text_kwargs: Dict[str, Any] = {} + """kwargs for beautifulsoup4 get_text""" + def __init__( self, web_path: Union[str, List[str]], @@ -201,7 +204,7 @@ class WebBaseLoader(BaseLoader): """Lazy load text from the url(s) in web_path.""" for path in self.web_paths: soup = self._scrape(path) - text = soup.get_text() + text = soup.get_text(**self.bs_get_text_kwargs) metadata = _build_metadata(soup, path) yield Document(page_content=text, metadata=metadata) @@ -216,7 +219,7 @@ class WebBaseLoader(BaseLoader): docs = [] for i in range(len(results)): soup = results[i] - text = soup.get_text() + text = soup.get_text(**self.bs_get_text_kwargs) metadata = _build_metadata(soup, self.web_paths[i]) docs.append(Document(page_content=text, metadata=metadata))