From 25a6790e1a0b9b3b56066bdee3efddd634cac11f Mon Sep 17 00:00:00 2001 From: zysoong Date: Wed, 28 Aug 2024 10:02:40 +0200 Subject: [PATCH] community[patch]: Minor Improvement of extract hyperlinks tool output (#25728) **Description:** Make each hyperlink appear only once in the extract_hyperlinks tool output. (For some websites, the output contains meaningless '#' hyperlinks multiple times, which consumes context-window tokens without any benefit.) **Issue:** None **Dependencies:** None --- .../tools/playwright/extract_hyperlinks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/tools/playwright/extract_hyperlinks.py b/libs/community/langchain_community/tools/playwright/extract_hyperlinks.py index 3cac3c496b3..0dcd3fd2e06 100644 --- a/libs/community/langchain_community/tools/playwright/extract_hyperlinks.py +++ b/libs/community/langchain_community/tools/playwright/extract_hyperlinks.py @@ -63,8 +63,9 @@ class ExtractHyperlinksTool(BaseBrowserTool): links = [urljoin(base_url, anchor.get("href", "")) for anchor in anchors] else: links = [anchor.get("href", "") for anchor in anchors] - # Return the list of links as a JSON string - return json.dumps(links) + # Return the list of links as a JSON string. Duplicated link + # only appears once in the list + return json.dumps(list(set(links))) def _run( self,