mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 13:27:36 +00:00
beautifulsoup get_text kwargs in WebBaseLoader (#6591)
# beautifulsoup get_text kwargs in WebBaseLoader - Description: this PR introduces an optional `bs_get_text_kwargs` parameter to `WebBaseLoader` constructor. It can be used to pass kwargs to the downstream BeautifulSoup.get_text call. The most common usage might be to pass a custom text separator, as seen also in `BSHTMLLoader`. - Tag maintainer: @rlancemartin, @eyurtsev - Twitter handle: jtolgyesi
This commit is contained in:
parent
be68f6f8ce
commit
05eec99269
@ -50,6 +50,9 @@ class WebBaseLoader(BaseLoader):
|
||||
requests_kwargs: Dict[str, Any] = {}
|
||||
"""kwargs for requests"""
|
||||
|
||||
bs_get_text_kwargs: Dict[str, Any] = {}
|
||||
"""kwargs for beatifulsoup4 get_text"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
web_path: Union[str, List[str]],
|
||||
@ -201,7 +204,7 @@ class WebBaseLoader(BaseLoader):
|
||||
"""Lazy load text from the url(s) in web_path."""
|
||||
for path in self.web_paths:
|
||||
soup = self._scrape(path)
|
||||
text = soup.get_text()
|
||||
text = soup.get_text(**self.bs_get_text_kwargs)
|
||||
metadata = _build_metadata(soup, path)
|
||||
yield Document(page_content=text, metadata=metadata)
|
||||
|
||||
@ -216,7 +219,7 @@ class WebBaseLoader(BaseLoader):
|
||||
docs = []
|
||||
for i in range(len(results)):
|
||||
soup = results[i]
|
||||
text = soup.get_text()
|
||||
text = soup.get_text(**self.bs_get_text_kwargs)
|
||||
metadata = _build_metadata(soup, self.web_paths[i])
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user