mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-01 10:54:15 +00:00
community[patch]: Using the right encoding to parse the web page in RecursiveUrlLoader (#20632)
As shown in #13749, `RecursiveUrlLoader` has an encoding issue. This PR fixes it. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
b0b1a67771
commit
f1c3687aa5
@ -98,6 +98,8 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
continue_on_failure: bool = True,
|
||||
*,
|
||||
base_url: Optional[str] = None,
|
||||
autoset_encoding: bool = True,
|
||||
encoding: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Initialize with URL to crawl and any subdirectories to exclude.
|
||||
|
||||
@ -137,6 +139,11 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
continue_on_failure: If True, continue if getting or parsing a link raises
|
||||
an exception. Otherwise, raise the exception.
|
||||
base_url: The base url to check for outside links against.
|
||||
autoset_encoding: Whether to automatically set the encoding of the response.
|
||||
If True, the encoding of the response will be set to the apparent
|
||||
encoding, unless the `encoding` argument has already been explicitly set.
|
||||
encoding: The encoding of the response. If manually set, the encoding will be
|
||||
set to given value, regardless of the `autoset_encoding` argument.
|
||||
""" # noqa: E501
|
||||
|
||||
self.url = url
|
||||
@ -148,6 +155,8 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
if metadata_extractor is not None
|
||||
else _metadata_extractor
|
||||
)
|
||||
self.autoset_encoding = autoset_encoding
|
||||
self.encoding = encoding
|
||||
self.metadata_extractor = _wrap_metadata_extractor(metadata_extractor)
|
||||
self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
|
||||
|
||||
@ -184,6 +193,12 @@ class RecursiveUrlLoader(BaseLoader):
|
||||
visited.add(url)
|
||||
try:
|
||||
response = requests.get(url, timeout=self.timeout, headers=self.headers)
|
||||
|
||||
if self.encoding is not None:
|
||||
response.encoding = self.encoding
|
||||
elif self.autoset_encoding:
|
||||
response.encoding = response.apparent_encoding
|
||||
|
||||
if self.check_response_status and 400 <= response.status_code <= 599:
|
||||
raise ValueError(f"Received HTTP status {response.status_code}")
|
||||
except Exception as e:
|
||||
|
Loading…
Reference in New Issue
Block a user