mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-03 03:38:06 +00:00
community[patch]: Using the right encoding to parse the web page in RecursiveUrlLoader (#20632)
As shown in #13749, `RecursiveUrlLoader` has an encoding issue. This PR fixes it by setting the response encoding before the page is parsed. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
b0b1a67771
commit
f1c3687aa5
@ -98,6 +98,8 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
continue_on_failure: bool = True,
|
continue_on_failure: bool = True,
|
||||||
*,
|
*,
|
||||||
base_url: Optional[str] = None,
|
base_url: Optional[str] = None,
|
||||||
|
autoset_encoding: bool = True,
|
||||||
|
encoding: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with URL to crawl and any subdirectories to exclude.
|
"""Initialize with URL to crawl and any subdirectories to exclude.
|
||||||
|
|
||||||
@ -137,6 +139,11 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
continue_on_failure: If True, continue if getting or parsing a link raises
|
continue_on_failure: If True, continue if getting or parsing a link raises
|
||||||
an exception. Otherwise, raise the exception.
|
an exception. Otherwise, raise the exception.
|
||||||
base_url: The base url to check for outside links against.
|
base_url: The base url to check for outside links against.
|
||||||
|
autoset_encoding: Whether to automatically set the encoding of the response.
|
||||||
|
If True, the encoding of the response will be set to the apparent
|
||||||
|
encoding, unless the `encoding` argument has already been explicitly set.
|
||||||
|
encoding: The encoding of the response. If manually set, the encoding will be
|
||||||
|
set to given value, regardless of the `autoset_encoding` argument.
|
||||||
""" # noqa: E501
|
""" # noqa: E501
|
||||||
|
|
||||||
self.url = url
|
self.url = url
|
||||||
@ -148,6 +155,8 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
if metadata_extractor is not None
|
if metadata_extractor is not None
|
||||||
else _metadata_extractor
|
else _metadata_extractor
|
||||||
)
|
)
|
||||||
|
self.autoset_encoding = autoset_encoding
|
||||||
|
self.encoding = encoding
|
||||||
self.metadata_extractor = _wrap_metadata_extractor(metadata_extractor)
|
self.metadata_extractor = _wrap_metadata_extractor(metadata_extractor)
|
||||||
self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
|
self.exclude_dirs = exclude_dirs if exclude_dirs is not None else ()
|
||||||
|
|
||||||
@ -184,6 +193,12 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
visited.add(url)
|
visited.add(url)
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, timeout=self.timeout, headers=self.headers)
|
response = requests.get(url, timeout=self.timeout, headers=self.headers)
|
||||||
|
|
||||||
|
if self.encoding is not None:
|
||||||
|
response.encoding = self.encoding
|
||||||
|
elif self.autoset_encoding:
|
||||||
|
response.encoding = response.apparent_encoding
|
||||||
|
|
||||||
if self.check_response_status and 400 <= response.status_code <= 599:
|
if self.check_response_status and 400 <= response.status_code <= 599:
|
||||||
raise ValueError(f"Received HTTP status {response.status_code}")
|
raise ValueError(f"Received HTTP status {response.status_code}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
Loading…
Reference in New Issue
Block a user