mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-10 21:35:08 +00:00
beautifulsoup get_text kwargs in WebBaseLoader (#6591)
# beautifulsoup get_text kwargs in WebBaseLoader
- Description: this PR introduces an optional `bs_get_text_kwargs` parameter to the `WebBaseLoader` constructor. It can be used to pass kwargs to the downstream `BeautifulSoup.get_text` call. The most common usage is probably passing a custom text separator, as is also done in `BSHTMLLoader`.
- Tag maintainer: @rlancemartin, @eyurtsev
- Twitter handle: jtolgyesi
This commit is contained in:
parent
be68f6f8ce
commit
05eec99269
@ -50,6 +50,9 @@ class WebBaseLoader(BaseLoader):
|
|||||||
requests_kwargs: Dict[str, Any] = {}
|
requests_kwargs: Dict[str, Any] = {}
|
||||||
"""kwargs for requests"""
|
"""kwargs for requests"""
|
||||||
|
|
||||||
|
bs_get_text_kwargs: Dict[str, Any] = {}
|
||||||
|
"""kwargs for beautifulsoup4 get_text"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
web_path: Union[str, List[str]],
|
web_path: Union[str, List[str]],
|
||||||
@ -201,7 +204,7 @@ class WebBaseLoader(BaseLoader):
|
|||||||
"""Lazy load text from the url(s) in web_path."""
|
"""Lazy load text from the url(s) in web_path."""
|
||||||
for path in self.web_paths:
|
for path in self.web_paths:
|
||||||
soup = self._scrape(path)
|
soup = self._scrape(path)
|
||||||
text = soup.get_text()
|
text = soup.get_text(**self.bs_get_text_kwargs)
|
||||||
metadata = _build_metadata(soup, path)
|
metadata = _build_metadata(soup, path)
|
||||||
yield Document(page_content=text, metadata=metadata)
|
yield Document(page_content=text, metadata=metadata)
|
||||||
|
|
||||||
@ -216,7 +219,7 @@ class WebBaseLoader(BaseLoader):
|
|||||||
docs = []
|
docs = []
|
||||||
for i in range(len(results)):
|
for i in range(len(results)):
|
||||||
soup = results[i]
|
soup = results[i]
|
||||||
text = soup.get_text()
|
text = soup.get_text(**self.bs_get_text_kwargs)
|
||||||
metadata = _build_metadata(soup, self.web_paths[i])
|
metadata = _build_metadata(soup, self.web_paths[i])
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user