From 5ac936a284934ca47b0c81021ec7dcb0fa905a5b Mon Sep 17 00:00:00 2001 From: Ben Chambers <35960+bjchambers@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:01:21 -0700 Subject: [PATCH] community[minor]: add document transformer for extracting links (#24186) - **Description:** Add a DocumentTransformer for executing one or more `LinkExtractor`s and adding the extracted links to each document. - **Issue:** n/a - **Dependencies:** none --------- Co-authored-by: Eugene Yurtsev --- .../graph_vectorstores/extractors/__init__.py | 17 ++-- .../extractors/link_extractor.py | 2 +- .../extractors/link_extractor_transformer.py | 43 +++++++++ .../test_link_extractor_transformer.py | 92 +++++++++++++++++++ .../graph_vectorstores/links.py | 28 +++++- 5 files changed, 174 insertions(+), 8 deletions(-) create mode 100644 libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py create mode 100644 libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py index a78eb3807be..5acd8000038 100644 --- a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py +++ b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py @@ -6,20 +6,24 @@ from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor HierarchyInput, HierarchyLinkExtractor, ) -from langchain_community.graph_vectorstores.extractors.html_link_extractor import ( - HtmlInput, - HtmlLinkExtractor, -) from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import ( KeybertInput, KeybertLinkExtractor, ) -from langchain_community.graph_vectorstores.extractors.link_extractor import ( + +from .html_link_extractor import ( + HtmlInput, + HtmlLinkExtractor, +) +from .link_extractor import ( LinkExtractor, ) -from 
langchain_community.graph_vectorstores.extractors.link_extractor_adapter import ( +from .link_extractor_adapter import ( LinkExtractorAdapter, ) +from .link_extractor_transformer import ( + LinkExtractorTransformer, +) __all__ = [ "GLiNERInput", @@ -34,4 +38,5 @@ __all__ = [ "LinkExtractor", "LinkExtractorAdapter", "LinkExtractorAdapter", + "LinkExtractorTransformer", ] diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py index 2eaa94bd86b..619ba2a6d13 100644 --- a/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py +++ b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor.py @@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]): """Interface for extracting links (incoming, outgoing, bidirectional).""" @abstractmethod - def extract_one(self, input: InputT) -> set[Link]: # noqa: A002 + def extract_one(self, input: InputT) -> Set[Link]: """Add edges from each `input` to the corresponding documents. Args: diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py new file mode 100644 index 00000000000..52e4347d45b --- /dev/null +++ b/libs/community/langchain_community/graph_vectorstores/extractors/link_extractor_transformer.py @@ -0,0 +1,43 @@ +from typing import Any, Iterable, Sequence + +from langchain_core.documents import Document +from langchain_core.documents.transformers import BaseDocumentTransformer +from langchain_core.graph_vectorstores.links import copy_with_links + +from langchain_community.graph_vectorstores.extractors.link_extractor import ( + LinkExtractor, +) + + +class LinkExtractorTransformer(BaseDocumentTransformer): + """DocumentTransformer for applying one or more LinkExtractors. + + Example: + .. 
code-block:: python + + extract_links = LinkExtractorTransformer([ + HtmlLinkExtractor().as_document_extractor(), + ]) + extract_links.transform_documents(docs) + """ + + def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]): + """Create a DocumentTransformer which adds extracted links to each document.""" + self.link_extractors = link_extractors + + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + # Implement `transform_documents` directly, so that LinkExtractors which operate + # better in batch (`extract_many`) get a chance to do so. + + # Run each extractor over all documents. + links_per_extractor = [e.extract_many(documents) for e in self.link_extractors] + + # Transpose the list of lists to pair each document with the tuple of links. + links_per_document = zip(*links_per_extractor) + + return [ + copy_with_links(document, *links) + for document, links in zip(documents, links_per_document) + ] diff --git a/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py b/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py new file mode 100644 index 00000000000..c791c759417 --- /dev/null +++ b/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_link_extractor_transformer.py @@ -0,0 +1,92 @@ +from typing import Set + +from langchain_core.documents import Document +from langchain_core.graph_vectorstores.links import Link, get_links + +from langchain_community.graph_vectorstores.extractors import ( + LinkExtractor, + LinkExtractorTransformer, +) + +TEXT1 = "Text1" +TEXT2 = "Text2" + + +class FakeKeywordExtractor(LinkExtractor[Document]): + def extract_one(self, input: Document) -> Set[Link]: + kws: Set[str] = set() + if input.page_content == TEXT1: + kws = {"a", "b"} + elif input.page_content == TEXT2: + kws = {"b", "c"} + + return {Link.bidir(kind="fakekw", tag=kw) for kw in kws} + + +class 
FakeHyperlinkExtractor(LinkExtractor[Document]): + def extract_one(self, input: Document) -> Set[Link]: + if input.page_content == TEXT1: + return { + Link.incoming(kind="fakehref", tag="http://text1"), + Link.outgoing(kind="fakehref", tag="http://text2"), + Link.outgoing(kind="fakehref", tag="http://text3"), + } + elif input.page_content == TEXT2: + return { + Link.incoming(kind="fakehref", tag="http://text2"), + Link.outgoing(kind="fakehref", tag="http://text3"), + } + else: + raise ValueError( + f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'" + ) + + +def test_one_extractor() -> None: + transformer = LinkExtractorTransformer( + [ + FakeKeywordExtractor(), + ] + ) + doc1 = Document(TEXT1) + doc2 = Document(TEXT2) + results = transformer.transform_documents([doc1, doc2]) + + assert set(get_links(results[0])) == { + Link.bidir(kind="fakekw", tag="a"), + Link.bidir(kind="fakekw", tag="b"), + } + + assert set(get_links(results[1])) == { + Link.bidir(kind="fakekw", tag="b"), + Link.bidir(kind="fakekw", tag="c"), + } + + +def test_multiple_extractors() -> None: + transformer = LinkExtractorTransformer( + [ + FakeKeywordExtractor(), + FakeHyperlinkExtractor(), + ] + ) + + doc1 = Document(TEXT1) + doc2 = Document(TEXT2) + + results = transformer.transform_documents([doc1, doc2]) + + assert set(get_links(results[0])) == { + Link.bidir(kind="fakekw", tag="a"), + Link.bidir(kind="fakekw", tag="b"), + Link.incoming(kind="fakehref", tag="http://text1"), + Link.outgoing(kind="fakehref", tag="http://text2"), + Link.outgoing(kind="fakehref", tag="http://text3"), + } + + assert set(get_links(results[1])) == { + Link.bidir(kind="fakekw", tag="b"), + Link.bidir(kind="fakekw", tag="c"), + Link.incoming(kind="fakehref", tag="http://text2"), + Link.outgoing(kind="fakehref", tag="http://text3"), + } diff --git a/libs/core/langchain_core/graph_vectorstores/links.py b/libs/core/langchain_core/graph_vectorstores/links.py index 9da58a39276..11c95986ee9 100644 --- 
a/libs/core/langchain_core/graph_vectorstores/links.py +++ b/libs/core/langchain_core/graph_vectorstores/links.py @@ -12,7 +12,7 @@ class Link: """ kind: str - """The kind of link. Allows different extractors to use the same tag name without + """The kind of link. Allows different extractors to use the same tag name without creating collisions between extractors. For example “keyword” vs “url”.""" direction: Literal["in", "out", "bidir"] """The direction of the link.""" @@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None: links_in_metadata.extend(link) else: links_in_metadata.append(link) + + +def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document: + """Return a document with the given links added. + + Args: + doc: The document to add the links to. + *links: The links to add to the document. + + Returns: + A document with a shallow-copy of the metadata with the links added. + """ + new_links = set(get_links(doc)) + for link in links: + if isinstance(link, Iterable): + new_links.update(link) + else: + new_links.add(link) + + return Document( + page_content=doc.page_content, + metadata={ + **doc.metadata, + METADATA_LINKS_KEY: list(new_links), + }, + )