mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-14 19:09:03 +00:00
Fix encoding issue in WebBaseLoader (#3602)
Character encoding mismatches occurred when the response header did not include charset information (in my case, on a Japanese web page). I solved this issue by setting the response's encoding to its apparent_encoding, so the text is decoded using the encoding detected from the content.
This commit is contained in:
parent
be7a8e0824
commit
fa4c35e9e5
@ -169,6 +169,7 @@ class WebBaseLoader(BaseLoader):
|
|||||||
self._check_parser(parser)
|
self._check_parser(parser)
|
||||||
|
|
||||||
html_doc = self.session.get(url)
|
html_doc = self.session.get(url)
|
||||||
|
html_doc.encoding = html_doc.apparent_encoding
|
||||||
return BeautifulSoup(html_doc.text, parser)
|
return BeautifulSoup(html_doc.text, parser)
|
||||||
|
|
||||||
def scrape(self, parser: Union[str, None] = None) -> Any:
|
def scrape(self, parser: Union[str, None] = None) -> Any:
|
||||||
|
Loading…
Reference in New Issue
Block a user