From 50c5bb5607da2bdac4de3555a2aa49ea117d689e Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 8 Jan 2026 20:51:17 +0530 Subject: [PATCH] refactor(core): improve docstrings for HTML link extraction utilities (#34550) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # refactor(core): improve docstrings for HTML link extraction utilities ## Description This PR updates and clarifies the docstrings for `find_all_links` and `extract_sub_links` in `libs/core/langchain_core/utils/html.py`. The previous return-value descriptions were vague (e.g., "all links", "sub links"). They have now been revised to clearly describe the behavior and output of each function: - **find_all_links** → “A list of all links found in the HTML.” - **extract_sub_links** → “A list of absolute paths to sub links.” These improvements make the utilities more understandable and developer-friendly without altering functionality. ## Verification - `ruff check libs/core/langchain_core/utils/html.py`: **Passed** - `pytest libs/core/tests/unit_tests/utils/test_html.py`: **Passed** ## Checklists - PR title follows the required format: `TYPE(SCOPE): DESCRIPTION` - Changes are limited to the `langchain-core` package - `make format`, `make lint`, and `make test` pass --- libs/core/langchain_core/utils/html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/langchain_core/utils/html.py b/libs/core/langchain_core/utils/html.py index 880a2edc00e..b6bbf5cf221 100644 --- a/libs/core/langchain_core/utils/html.py +++ b/libs/core/langchain_core/utils/html.py @@ -43,7 +43,7 @@ def find_all_links( pattern: Regex to use for extracting links from raw HTML. Returns: - all links + A list of all links found in the HTML. """ pattern = pattern or DEFAULT_LINK_REGEX return list(set(re.findall(pattern, raw_html))) @@ -73,7 +73,7 @@ def extract_sub_links( exception. Otherwise, raise the exception. Returns: - sub links. + A list of absolute paths to sub links. """ base_url_to_use = base_url if base_url is not None else url parsed_base_url = urlparse(base_url_to_use)