mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 04:07:54 +00:00
make auto-setting the encodings optional, allow explicitly setting it (#10774)
I was trying to use web loaders on some Spanish documentation (e.g. [this site](https://www.fromdoppler.com/es/mailing-tendencias/)), but with the auto-encoding introduced in https://github.com/langchain-ai/langchain/pull/3602 the page's encoding was detected as "MacRoman" instead of the (correct) "UTF-8". To address this, I've added the ability to disable the auto-encoding, as well as the ability to explicitly tell the loader what encoding to use. - **Description:** Makes auto-setting the encoding optional in `WebBaseLoader`, and introduces an `encoding` option to explicitly set it. - **Dependencies:** N/A - **Tag maintainer:** @hwchase17 - **Twitter handle:** @czue
This commit is contained in:
parent
c68be4eb2b
commit
62603f2664
@ -63,6 +63,8 @@ class WebBaseLoader(BaseLoader):
|
|||||||
verify_ssl: Optional[bool] = True,
|
verify_ssl: Optional[bool] = True,
|
||||||
proxies: Optional[dict] = None,
|
proxies: Optional[dict] = None,
|
||||||
continue_on_failure: Optional[bool] = False,
|
continue_on_failure: Optional[bool] = False,
|
||||||
|
autoset_encoding: Optional[bool] = True,
|
||||||
|
encoding: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with webpage path."""
|
"""Initialize with webpage path."""
|
||||||
|
|
||||||
@ -98,7 +100,8 @@ class WebBaseLoader(BaseLoader):
|
|||||||
self.session.headers = dict(headers)
|
self.session.headers = dict(headers)
|
||||||
self.session.verify = verify_ssl
|
self.session.verify = verify_ssl
|
||||||
self.continue_on_failure = continue_on_failure
|
self.continue_on_failure = continue_on_failure
|
||||||
|
self.autoset_encoding = autoset_encoding
|
||||||
|
self.encoding = encoding
|
||||||
if proxies:
|
if proxies:
|
||||||
self.session.proxies.update(proxies)
|
self.session.proxies.update(proxies)
|
||||||
|
|
||||||
@ -208,6 +211,10 @@ class WebBaseLoader(BaseLoader):
|
|||||||
html_doc = self.session.get(url, **self.requests_kwargs)
|
html_doc = self.session.get(url, **self.requests_kwargs)
|
||||||
if self.raise_for_status:
|
if self.raise_for_status:
|
||||||
html_doc.raise_for_status()
|
html_doc.raise_for_status()
|
||||||
|
|
||||||
|
if self.encoding is not None:
|
||||||
|
html_doc.encoding = self.encoding
|
||||||
|
elif self.autoset_encoding:
|
||||||
html_doc.encoding = html_doc.apparent_encoding
|
html_doc.encoding = html_doc.apparent_encoding
|
||||||
return BeautifulSoup(html_doc.text, parser)
|
return BeautifulSoup(html_doc.text, parser)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user