mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-28 09:28:48 +00:00
community[minor]: add document transformer for extracting links (#24186)
- **Description:** Add a DocumentTransformer for executing one or more `LinkExtractor`s and adding the extracted links to each document. - **Issue:** n/a - **Dependencies:** none --------- Co-authored-by: Eugene Yurtsev <eugene@langchain.dev>
This commit is contained in:
parent
3c4652c906
commit
5ac936a284
@ -6,20 +6,24 @@ from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor
|
|||||||
HierarchyInput,
|
HierarchyInput,
|
||||||
HierarchyLinkExtractor,
|
HierarchyLinkExtractor,
|
||||||
)
|
)
|
||||||
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
|
|
||||||
HtmlInput,
|
|
||||||
HtmlLinkExtractor,
|
|
||||||
)
|
|
||||||
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
|
from langchain_community.graph_vectorstores.extractors.keybert_link_extractor import (
|
||||||
KeybertInput,
|
KeybertInput,
|
||||||
KeybertLinkExtractor,
|
KeybertLinkExtractor,
|
||||||
)
|
)
|
||||||
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
|
||||||
|
from .html_link_extractor import (
|
||||||
|
HtmlInput,
|
||||||
|
HtmlLinkExtractor,
|
||||||
|
)
|
||||||
|
from .link_extractor import (
|
||||||
LinkExtractor,
|
LinkExtractor,
|
||||||
)
|
)
|
||||||
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
from .link_extractor_adapter import (
|
||||||
LinkExtractorAdapter,
|
LinkExtractorAdapter,
|
||||||
)
|
)
|
||||||
|
from .link_extractor_transformer import (
|
||||||
|
LinkExtractorTransformer,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"GLiNERInput",
|
"GLiNERInput",
|
||||||
@ -34,4 +38,5 @@ __all__ = [
|
|||||||
"LinkExtractor",
|
"LinkExtractor",
|
||||||
"LinkExtractorAdapter",
|
"LinkExtractorAdapter",
|
||||||
"LinkExtractorAdapter",
|
"LinkExtractorAdapter",
|
||||||
|
"LinkExtractorTransformer",
|
||||||
]
|
]
|
||||||
|
@ -14,7 +14,7 @@ class LinkExtractor(ABC, Generic[InputT]):
|
|||||||
"""Interface for extracting links (incoming, outgoing, bidirectional)."""
|
"""Interface for extracting links (incoming, outgoing, bidirectional)."""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def extract_one(self, input: InputT) -> set[Link]: # noqa: A002
|
def extract_one(self, input: InputT) -> Set[Link]:
|
||||||
"""Add edges from each `input` to the corresponding documents.
|
"""Add edges from each `input` to the corresponding documents.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -0,0 +1,43 @@
|
|||||||
|
from typing import Any, Iterable, Sequence
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.documents.transformers import BaseDocumentTransformer
|
||||||
|
from langchain_core.graph_vectorstores.links import copy_with_links
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Example:
        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Iterable[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document.

        Args:
            link_extractors: The extractors to apply. Materialized into a list so
                that a generator argument survives repeated ``transform_documents``
                calls (iterating a raw generator twice would yield nothing the
                second time).
        """
        self.link_extractors = list(link_extractors)

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Return copies of `documents` with links from every extractor added.

        Implements `transform_documents` directly (rather than per-document), so
        that LinkExtractors which operate better in batch (`extract_many`) get a
        chance to do so.
        """
        # With no extractors there is nothing to add; return the documents
        # unchanged. (Without this guard, zip(*[]) below yields nothing and
        # every document would be silently dropped from the result.)
        if not self.link_extractors:
            return documents

        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        # Transpose the list of lists to pair each document with its tuple of
        # link sets (one per extractor).
        links_per_document = zip(*links_per_extractor)

        return [
            copy_with_links(document, *links)
            for document, links in zip(documents, links_per_document)
        ]
|
@ -0,0 +1,92 @@
|
|||||||
|
from typing import Set
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.graph_vectorstores.links import Link, get_links
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors import (
|
||||||
|
LinkExtractor,
|
||||||
|
LinkExtractorTransformer,
|
||||||
|
)
|
||||||
|
|
||||||
|
TEXT1 = "Text1"
|
||||||
|
TEXT2 = "Text2"
|
||||||
|
|
||||||
|
|
||||||
|
class FakeKeywordExtractor(LinkExtractor[Document]):
    """Test double that emits fixed bidirectional keyword links per document text."""

    def extract_one(self, input: Document) -> Set[Link]:
        if input.page_content == TEXT1:
            keywords = {"a", "b"}
        elif input.page_content == TEXT2:
            keywords = {"b", "c"}
        else:
            keywords = set()

        return {Link.bidir(kind="fakekw", tag=keyword) for keyword in keywords}
|
||||||
|
|
||||||
|
|
||||||
|
class FakeHyperlinkExtractor(LinkExtractor[Document]):
    """Test double that emits fixed hyperlink links for the two known texts."""

    def extract_one(self, input: Document) -> Set[Link]:
        # Guard-clause style: return early for each known text, raise otherwise.
        if input.page_content == TEXT1:
            return {
                Link.incoming(kind="fakehref", tag="http://text1"),
                Link.outgoing(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        if input.page_content == TEXT2:
            return {
                Link.incoming(kind="fakehref", tag="http://text2"),
                Link.outgoing(kind="fakehref", tag="http://text3"),
            }
        raise ValueError(
            f"Unsupported input for FakeHyperlinkExtractor: '{input.page_content}'"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_one_extractor() -> None:
    """A single keyword extractor adds its links to each transformed document."""
    transformer = LinkExtractorTransformer([FakeKeywordExtractor()])
    docs = [Document(TEXT1), Document(TEXT2)]
    results = transformer.transform_documents(docs)

    expected_first = {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
    }
    expected_second = {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
    }
    assert set(get_links(results[0])) == expected_first
    assert set(get_links(results[1])) == expected_second
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_extractors() -> None:
    """Links from every configured extractor are merged onto each document."""
    transformer = LinkExtractorTransformer(
        [FakeKeywordExtractor(), FakeHyperlinkExtractor()]
    )

    docs = [Document(TEXT1), Document(TEXT2)]
    results = transformer.transform_documents(docs)

    expected_first = {
        Link.bidir(kind="fakekw", tag="a"),
        Link.bidir(kind="fakekw", tag="b"),
        Link.incoming(kind="fakehref", tag="http://text1"),
        Link.outgoing(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }
    assert set(get_links(results[0])) == expected_first

    expected_second = {
        Link.bidir(kind="fakekw", tag="b"),
        Link.bidir(kind="fakekw", tag="c"),
        Link.incoming(kind="fakehref", tag="http://text2"),
        Link.outgoing(kind="fakehref", tag="http://text3"),
    }
    assert set(get_links(results[1])) == expected_second
|
@ -12,7 +12,7 @@ class Link:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
kind: str
|
kind: str
|
||||||
"""The kind of link. Allows different extractors to use the same tag name without
|
"""The kind of link. Allows different extractors to use the same tag name without
|
||||||
creating collisions between extractors. For example “keyword” vs “url”."""
|
creating collisions between extractors. For example “keyword” vs “url”."""
|
||||||
direction: Literal["in", "out", "bidir"]
|
direction: Literal["in", "out", "bidir"]
|
||||||
"""The direction of the link."""
|
"""The direction of the link."""
|
||||||
@ -66,3 +66,29 @@ def add_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> None:
|
|||||||
links_in_metadata.extend(link)
|
links_in_metadata.extend(link)
|
||||||
else:
|
else:
|
||||||
links_in_metadata.append(link)
|
links_in_metadata.append(link)
|
||||||
|
|
||||||
|
|
||||||
|
def copy_with_links(doc: Document, *links: Union[Link, Iterable[Link]]) -> Document:
    """Return a document with the given links added.

    Args:
        doc: The document to add the links to.
        *links: The links to add to the document.

    Returns:
        A document with a shallow-copy of the metadata with the links added.
    """
    # Start from the document's existing links, then fold in the new ones.
    combined = set(get_links(doc))
    for item in links:
        if isinstance(item, Iterable):
            combined.update(item)
        else:
            combined.add(item)

    merged_metadata = {**doc.metadata, METADATA_LINKS_KEY: list(combined)}
    return Document(page_content=doc.page_content, metadata=merged_metadata)
|
||||||
|
Loading…
Reference in New Issue
Block a user