mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-15 01:13:48 +00:00
Fix extract_sub_links bug: apply the prevent_outside filter to all resolved links, not only absolute http links (#10855)
This commit is contained in:
parent
7dec2d399b
commit
de0a02f507
@@ -59,11 +59,12 @@ def extract_sub_links(
|
|||||||
for link in all_links:
|
for link in all_links:
|
||||||
# Some may be absolute links like https://to/path
|
# Some may be absolute links like https://to/path
|
||||||
if link.startswith("http"):
|
if link.startswith("http"):
|
||||||
if not prevent_outside or link.startswith(base_url):
|
absolute_paths.add(link)
|
||||||
absolute_paths.add(link)
|
|
||||||
# Some may have omitted the protocol like //to/path
|
# Some may have omitted the protocol like //to/path
|
||||||
elif link.startswith("//"):
|
elif link.startswith("//"):
|
||||||
absolute_paths.add(f"{urlparse(base_url).scheme}:{link}")
|
absolute_paths.add(f"{urlparse(base_url).scheme}:{link}")
|
||||||
else:
|
else:
|
||||||
absolute_paths.add(urljoin(base_url, link))
|
absolute_paths.add(urljoin(base_url, link))
|
||||||
|
if prevent_outside:
|
||||||
|
return [p for p in absolute_paths if p.startswith(base_url)]
|
||||||
return list(absolute_paths)
|
return list(absolute_paths)
|
||||||
|
@@ -86,13 +86,8 @@ def test_extract_sub_links() -> None:
|
|||||||
actual = sorted(extract_sub_links(html, "https://foobar.com"))
|
actual = sorted(extract_sub_links(html, "https://foobar.com"))
|
||||||
assert actual == expected
|
assert actual == expected
|
||||||
|
|
||||||
actual = sorted(extract_sub_links(html, "https://foobar.com/hello"))
|
actual = extract_sub_links(html, "https://foobar.com/hello")
|
||||||
expected = sorted(
|
expected = ["https://foobar.com/hello"]
|
||||||
[
|
|
||||||
"https://foobar.com/hello",
|
|
||||||
"https://foobar.com/how/are/you/doing",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
assert actual == expected
|
assert actual == expected
|
||||||
|
|
||||||
actual = sorted(
|
actual = sorted(
|
||||||
|
Loading…
Reference in New Issue
Block a user