mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 06:39:52 +00:00
core[patch]: Enhance link extraction with query parameters (#20259)
**Description**: This update enhances the `extract_sub_links` function in the `langchain_core/utils/html.py` module so that extracted URLs keep their query parameters.

**Issue**: N/A

**Dependencies**: No additional dependencies are required for this change.

**Twitter handle**: N/A

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
0e917e319b
commit
2aca7fcdcf
@@ -88,6 +88,8 @@ def extract_sub_links(
|
|||||||
absolute_path = f"{parsed_url.scheme}:{link}"
|
absolute_path = f"{parsed_url.scheme}:{link}"
|
||||||
else:
|
else:
|
||||||
absolute_path = urljoin(url, parsed_link.path)
|
absolute_path = urljoin(url, parsed_link.path)
|
||||||
|
if parsed_link.query:
|
||||||
|
absolute_path += f"?{parsed_link.query}"
|
||||||
absolute_paths.add(absolute_path)
|
absolute_paths.add(absolute_path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if continue_on_failure:
|
if continue_on_failure:
|
||||||
|
@@ -183,3 +183,27 @@ def test_prevent_outside() -> None:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
assert actual == expected
|
assert actual == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_sub_links_with_query() -> None:
    """Check that links returned by ``extract_sub_links`` keep their queries.

    The page under test holds four anchors, each carrying a query string:
    an absolute URL, a root-relative path, a protocol-relative URL and a
    document-relative path.
    """
    page_url = "https://foobar.com/hello/bill.html"
    anchors = [
        '<a href="https://foobar.com?query=123">one</a>',
        '<a href="/hello?query=456">two</a>',
        '<a href="//foobar.com/how/are/you?query=789">three</a>',
        '<a href="doing?query=101112"></a>',
    ]
    html = "".join(anchors)

    expected = [
        "https://foobar.com?query=123",
        "https://foobar.com/hello?query=456",
        "https://foobar.com/how/are/you?query=789",
        "https://foobar.com/hello/doing?query=101112",
    ]
    expected.sort()

    # Order of the extracted links is not part of the contract being
    # checked, so both sides are compared in sorted order.
    actual = list(
        extract_sub_links(html, page_url, base_url="https://foobar.com")
    )
    actual.sort()

    assert actual == expected, f"Expected {expected}, but got {actual}"
|
||||||
|
Loading…
Reference in New Issue
Block a user