mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 06:14:37 +00:00
core[patch]: Enhance link extraction with query parameters (#20259)
**Description**: This update enhances the `extract_sub_links` function in the `langchain_core/utils/html.py` module so that extracted URLs include their query parameters.

**Issue**: N/A

**Dependencies**: No additional dependencies are required for this change.

**Twitter handle**: N/A

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
0e917e319b
commit
2aca7fcdcf
@ -88,6 +88,8 @@ def extract_sub_links(
|
||||
absolute_path = f"{parsed_url.scheme}:{link}"
|
||||
else:
|
||||
absolute_path = urljoin(url, parsed_link.path)
|
||||
if parsed_link.query:
|
||||
absolute_path += f"?{parsed_link.query}"
|
||||
absolute_paths.add(absolute_path)
|
||||
except Exception as e:
|
||||
if continue_on_failure:
|
||||
|
@ -183,3 +183,27 @@ def test_prevent_outside() -> None:
|
||||
)
|
||||
)
|
||||
assert actual == expected
|
||||
|
||||
|
||||
def test_extract_sub_links_with_query() -> None:
    """Check that extracted sub-links keep their query strings.

    The page contains one anchor of each form -- an absolute URL, a
    root-relative path, a protocol-relative URL and a document-relative
    path -- and every one of them carries a ``?query=...`` suffix that is
    expected to appear in the corresponding extracted link.
    """
    anchors = [
        '<a href="https://foobar.com?query=123">one</a>',
        '<a href="/hello?query=456">two</a>',
        '<a href="//foobar.com/how/are/you?query=789">three</a>',
        '<a href="doing?query=101112"></a>',
    ]
    html = "".join(anchors)

    # Both sides are sorted so the comparison does not depend on the order
    # in which the links are returned.
    expected = [
        "https://foobar.com?query=123",
        "https://foobar.com/hello?query=456",
        "https://foobar.com/how/are/you?query=789",
        # The document-relative link is resolved against the page URL
        # "https://foobar.com/hello/bill.html".
        "https://foobar.com/hello/doing?query=101112",
    ]
    expected.sort()

    actual = list(
        extract_sub_links(
            html,
            "https://foobar.com/hello/bill.html",
            base_url="https://foobar.com",
        )
    )
    actual.sort()

    assert actual == expected, f"Expected {expected}, but got {actual}"
|
||||
|
Loading…
Reference in New Issue
Block a user