From 2aca7fcdcfce834d1cd810d6f9d74e62af54244f Mon Sep 17 00:00:00 2001
From: YH
Date: Sat, 27 Apr 2024 11:22:36 +0900
Subject: [PATCH] core[patch]: Enhance link extraction with query parameters (#20259)

**Description**: This update enhances the `extract_sub_links` function within the `langchain_core/utils/html.py` module to include query parameters in the extracted URLs.

**Issue**: N/A

**Dependencies**: No additional dependencies required for this change.

**Twitter handle**: N/A

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
---
 libs/core/langchain_core/utils/html.py        |  2 ++
 libs/core/tests/unit_tests/utils/test_html.py | 24 +++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/libs/core/langchain_core/utils/html.py b/libs/core/langchain_core/utils/html.py
index 837b19ed101..3e41c187b4a 100644
--- a/libs/core/langchain_core/utils/html.py
+++ b/libs/core/langchain_core/utils/html.py
@@ -88,6 +88,8 @@ def extract_sub_links(
                 absolute_path = f"{parsed_url.scheme}:{link}"
             else:
                 absolute_path = urljoin(url, parsed_link.path)
+                if parsed_link.query:
+                    absolute_path += f"?{parsed_link.query}"
             absolute_paths.add(absolute_path)
         except Exception as e:
             if continue_on_failure:
diff --git a/libs/core/tests/unit_tests/utils/test_html.py b/libs/core/tests/unit_tests/utils/test_html.py
index a2c80f6e654..a6332e4b606 100644
--- a/libs/core/tests/unit_tests/utils/test_html.py
+++ b/libs/core/tests/unit_tests/utils/test_html.py
@@ -183,3 +183,27 @@ def test_prevent_outside() -> None:
         )
     )
     assert actual == expected
+
+
+def test_extract_sub_links_with_query() -> None:
+    html = (
+        '<a href="https://foobar.com?query=123">one</a>'
+        '<a href="/hello?query=456">two</a>'
+        '<a href="//foobar.com/how/are/you?query=789">three</a>'
+        '<a href="doing?query=101112"></a>'
+    )
+
+    expected = sorted(
+        [
+            "https://foobar.com?query=123",
+            "https://foobar.com/hello?query=456",
+            "https://foobar.com/how/are/you?query=789",
+            "https://foobar.com/hello/doing?query=101112",
+        ]
+    )
+    actual = sorted(
+        extract_sub_links(
+            html, "https://foobar.com/hello/bill.html", base_url="https://foobar.com"
+        )
+    )
+    assert actual == expected, f"Expected {expected}, but got {actual}"