community[minor]: add document transformer for extracting links (#24186)

- **Description:** Add a DocumentTransformer for executing one or more `LinkExtractor`s and adding the extracted links to each document. - **Issue:** n/a - **Dependencies:** none --------- Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
2025-09-24 20:09:01 +00:00 · 2024-07-22 19:01:21 -07:00
parent 3c4652c906
commit 5ac936a284
5 changed files with 174 additions and 8 deletions
--- a/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py
+++ b/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py
@@ -0,0 +1,92 @@
+from typing import Set
+
+from langchain_core.documents import Document
+from langchain_core.graph_vectorstores.links import Link, get_links
+
+from langchain_community.graph_vectorstores.extractors import (
+    LinkExtractor,
+    LinkExtractorTransformer,
+)
+
+TEXT1 = "Text1"
+TEXT2 = "Text2"
+
+
+class FakeKeywordExtractor(LinkExtractor[Document]):
+    """Test double emitting bidirectional "fakekw" links for known texts."""
+
+    def extract_one(self, input: Document) -> Set[Link]:
+        # Unrecognised page content yields no keywords and thus no links.
+        tags_by_text = {TEXT1: ("a", "b"), TEXT2: ("b", "c")}
+        tags = tags_by_text.get(input.page_content, ())
+
+        return {Link.bidir(kind="fakekw", tag=tag) for tag in tags}
+
+
+class FakeHyperlinkExtractor(LinkExtractor[Document]):
+    """Test double returning fixed "fakehref" links; rejects unknown texts."""
+
+    def extract_one(self, input: Document) -> Set[Link]:
+        text = input.page_content
+        if text not in (TEXT1, TEXT2):
+            raise ValueError(
+                f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
+            )
+        # Both texts share the outgoing text3 link; TEXT1 also points at text2.
+        links = {Link.outgoing(kind="fakehref", tag="http://text3")}
+        if text == TEXT1:
+            links.add(Link.incoming(kind="fakehref", tag="http://text1"))
+            links.add(Link.outgoing(kind="fakehref", tag="http://text2"))
+        else:
+            links.add(Link.incoming(kind="fakehref", tag="http://text2"))
+        return links
+
+
+def test_one_extractor() -> None:
+    """Links from a single extractor are attached to every document."""
+    documents = [Document(TEXT1), Document(TEXT2)]
+    transformer = LinkExtractorTransformer([FakeKeywordExtractor()])
+    results = transformer.transform_documents(documents)
+
+    # TEXT1 carries keywords "a" and "b".
+    links_for_text1 = set(get_links(results[0]))
+    assert links_for_text1 == {
+        Link.bidir(kind="fakekw", tag="a"),
+        Link.bidir(kind="fakekw", tag="b"),
+    }
+
+    # TEXT2 carries keywords "b" and "c".
+    links_for_text2 = set(get_links(results[1]))
+    assert links_for_text2 == {
+        Link.bidir(kind="fakekw", tag="b"),
+        Link.bidir(kind="fakekw", tag="c"),
+    }
+
+
+def test_multiple_extractors() -> None:
+    """Links from all configured extractors are merged on each document."""
+    documents = [Document(TEXT1), Document(TEXT2)]
+    transformer = LinkExtractorTransformer(
+        [FakeKeywordExtractor(), FakeHyperlinkExtractor()]
+    )
+
+    results = transformer.transform_documents(documents)
+
+    # TEXT1: keywords a/b plus its incoming URL and two outgoing URLs.
+    links_for_text1 = set(get_links(results[0]))
+    assert links_for_text1 == {
+        Link.bidir(kind="fakekw", tag="a"),
+        Link.bidir(kind="fakekw", tag="b"),
+        Link.incoming(kind="fakehref", tag="http://text1"),
+        Link.outgoing(kind="fakehref", tag="http://text2"),
+        Link.outgoing(kind="fakehref", tag="http://text3"),
+    }
+
+    # TEXT2: keywords b/c plus its incoming URL and one outgoing URL.
+    links_for_text2 = set(get_links(results[1]))
+    assert links_for_text2 == {
+        Link.bidir(kind="fakekw", tag="b"),
+        Link.bidir(kind="fakekw", tag="c"),
+        Link.incoming(kind="fakehref", tag="http://text2"),
+        Link.outgoing(kind="fakehref", tag="http://text3"),
+    }