diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py index 687b02ccd69..b5e9bfabcfa 100644 --- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py +++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py @@ -98,6 +98,8 @@ class RecursiveUrlLoader(BaseLoader): continue_on_failure: bool = True, *, base_url: Optional[str] = None, + autoset_encoding: bool = True, + encoding: Optional[str] = None, ) -> None: """Initialize with URL to crawl and any subdirectories to exclude. @@ -137,6 +139,11 @@ class RecursiveUrlLoader(BaseLoader): continue_on_failure: If True, continue if getting or parsing a link raises an exception. Otherwise, raise the exception. base_url: The base url to check for outside links against. + autoset_encoding: Whether to automatically set the encoding of the response. + If True, the encoding of the response will be set to the apparent + encoding, unless the `encoding` argument has already been explicitly set. + encoding: The encoding of the response. If manually set, the encoding will be + set to the given value, regardless of the `autoset_encoding` argument. 
""" # noqa: E501 self.url = url @@ -148,6 +155,8 @@ class RecursiveUrlLoader(BaseLoader): if metadata_extractor is not None else _metadata_extractor ) + self.autoset_encoding = autoset_encoding + self.encoding = encoding self.metadata_extractor = _wrap_metadata_extractor(metadata_extractor) self.exclude_dirs = exclude_dirs if exclude_dirs is not None else () @@ -184,6 +193,12 @@ class RecursiveUrlLoader(BaseLoader): visited.add(url) try: response = requests.get(url, timeout=self.timeout, headers=self.headers) + + if self.encoding is not None: + response.encoding = self.encoding + elif self.autoset_encoding: + response.encoding = response.apparent_encoding + if self.check_response_status and 400 <= response.status_code <= 599: raise ValueError(f"Received HTTP status {response.status_code}") except Exception as e: