mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-06 07:04:01 +00:00
Fix encoding issue in WebBaseLoader (#3602)
Character encoding mismatches occurred when the response header did not include charset information (in my case, on a Japanese web page). I fixed this by setting the response's encoding to its apparent_encoding before the text is read.
This commit is contained in:
parent
be7a8e0824
commit
fa4c35e9e5
@ -169,6 +169,7 @@ class WebBaseLoader(BaseLoader):
|
||||
self._check_parser(parser)
|
||||
|
||||
html_doc = self.session.get(url)
|
||||
html_doc.encoding = html_doc.apparent_encoding
|
||||
return BeautifulSoup(html_doc.text, parser)
|
||||
|
||||
def scrape(self, parser: Union[str, None] = None) -> Any:
|
||||
|
Loading…
Reference in New Issue
Block a user