mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-03 03:38:06 +00:00
Make auto-setting the encoding optional, allow explicitly setting it (#10774)
I was trying to use web loaders on some Spanish documentation (e.g. [this site](https://www.fromdoppler.com/es/mailing-tendencias/)), but with the auto-encoding introduced in https://github.com/langchain-ai/langchain/pull/3602 the encoding was detected as "MacRoman" instead of the (correct) "UTF-8". To address this, I've added the ability to disable the auto-encoding, as well as the ability to explicitly tell the loader what encoding to use. - **Description:** Makes auto-setting the encoding optional in `WebBaseLoader`, and introduces an `encoding` option to explicitly set it. - **Dependencies:** N/A - **Tag maintainer:** @hwchase17 - **Twitter handle:** @czue
This commit is contained in:
parent
c68be4eb2b
commit
62603f2664
@ -63,6 +63,8 @@ class WebBaseLoader(BaseLoader):
|
||||
verify_ssl: Optional[bool] = True,
|
||||
proxies: Optional[dict] = None,
|
||||
continue_on_failure: Optional[bool] = False,
|
||||
autoset_encoding: Optional[bool] = True,
|
||||
encoding: Optional[str] = None,
|
||||
):
|
||||
"""Initialize with webpage path."""
|
||||
|
||||
@ -98,7 +100,8 @@ class WebBaseLoader(BaseLoader):
|
||||
self.session.headers = dict(headers)
|
||||
self.session.verify = verify_ssl
|
||||
self.continue_on_failure = continue_on_failure
|
||||
|
||||
self.autoset_encoding = autoset_encoding
|
||||
self.encoding = encoding
|
||||
if proxies:
|
||||
self.session.proxies.update(proxies)
|
||||
|
||||
@ -208,6 +211,10 @@ class WebBaseLoader(BaseLoader):
|
||||
html_doc = self.session.get(url, **self.requests_kwargs)
|
||||
if self.raise_for_status:
|
||||
html_doc.raise_for_status()
|
||||
|
||||
if self.encoding is not None:
|
||||
html_doc.encoding = self.encoding
|
||||
elif self.autoset_encoding:
|
||||
html_doc.encoding = html_doc.apparent_encoding
|
||||
return BeautifulSoup(html_doc.text, parser)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user