Merge pull request #24315

* community: Add Hierarchy link extractor * add example * lint
2025-09-17 23:41:46 +00:00 · 2024-07-19 06:42:26 -07:00
parent c3308f31bc
commit 242b085be7
3 changed files with 196 additions and 0 deletions
--- a/libs/community/langchain_community/graph_vectorstores/extractors/init.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/init.py
@@ -1,3 +1,7 @@
+from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
+    HierarchyInput,
+    HierarchyLinkExtractor,
+)
 from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
    HtmlInput,
    HtmlLinkExtractor,
@@ -12,6 +16,8 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
 __all__ = [
    "LinkExtractor",
    "LinkExtractorAdapter",
+    "HierarchyInput",
+    "HierarchyLinkExtractor",
    "HtmlInput",
    "HtmlLinkExtractor",
 ]
--- a/libs/community/langchain_community/graph_vectorstores/extractors/hierarchy_link_extractor.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/hierarchy_link_extractor.py
@@ -0,0 +1,106 @@
+from typing import Callable, List, Set
+
+from langchain_core.documents import Document
+from langchain_core.graph_vectorstores.links import Link
+
+from langchain_community.graph_vectorstores.extractors.link_extractor import (
+    LinkExtractor,
+)
+from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
+    LinkExtractorAdapter,
+)
+
+# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
+HierarchyInput = List[str]
+
+_PARENT: str = "p:"
+_CHILD: str = "c:"
+_SIBLING: str = "s:"
+
+
+class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
+    def __init__(
+        self,
+        *,
+        kind: str = "hierarchy",
+        parent_links: bool = True,
+        child_links: bool = False,
+        sibling_links: bool = False,
+    ):
+        """Extract links from a document hierarchy.
+
+        Example:
+            .. code_block: python
+                # Given three paths (in this case, within the "Root" document):
+                h1 = ["Root", "H1"]
+                h1a = ["Root", "H1", "a"]
+                h1b = ["Root", "H1", "b"]
+
+                # Parent links `h1a` and `h1b` to `h1`.
+                # Child links `h1` to `h1a` and `h1b`.
+                # Sibling links `h1a` and `h1b` together (both directions).
+
+        Example use with documents:
+            .. code_block: python
+                transformer = LinkExtractorTransformer([
+                    HierarchyLinkExtractor().as_document_extractor(
+                        # Assumes the "path" to each document is in the metadata.
+                        # Could split strings, etc.
+                        lambda doc: doc.metadata.get("path", [])
+                    )
+                ])
+                linked = transformer.transform_documents(docs)
+
+        Args:
+            kind: Kind of links to produce with this extractor.
+            parent_links: Link from a section to its parent.
+            child_links: Link from a section to its children.
+            sibling_links: Link from a section to other sections with the same parent.
+        """
+        self._kind = kind
+        self._parent_links = parent_links
+        self._child_links = child_links
+        self._sibling_links = sibling_links
+
+    def as_document_extractor(
+        self, hierarchy: Callable[[Document], HierarchyInput]
+    ) -> LinkExtractor[Document]:
+        """Create a LinkExtractor from `Document`.
+
+        Args:
+            hierarchy: Function that returns the path for the given document.
+
+        Returns:
+            A `LinkExtractor[Document]` suitable for application to `Documents` directly
+            or with `LinkExtractorTransformer`.
+        """
+        return LinkExtractorAdapter(underlying=self, transform=hierarchy)
+
+    def extract_one(
+        self,
+        input: HierarchyInput,
+    ) -> Set[Link]:
+        this_path = "/".join(input)
+        parent_path = None
+
+        links = set()
+        if self._parent_links:
+            # This is linked from everything with this parent path.
+            links.add(Link.incoming(kind=self._kind, tag=_PARENT + this_path))
+        if self._child_links:
+            # This is linked to every child with this as it's "parent" path.
+            links.add(Link.outgoing(kind=self._kind, tag=_CHILD + this_path))
+
+        if len(input) >= 1:
+            parent_path = "/".join(input[0:-1])
+            if self._parent_links and len(input) > 1:
+                # This is linked to the nodes with the given parent path.
+                links.add(Link.outgoing(kind=self._kind, tag=_PARENT + parent_path))
+            if self._child_links and len(input) > 1:
+                # This is linked from every node with the given parent path.
+                links.add(Link.incoming(kind=self._kind, tag=_CHILD + parent_path))
+            if self._sibling_links:
+                # This is a sibling of everything with the same parent.
+                links.add(Link.bidir(kind=self._kind, tag=_SIBLING + parent_path))
+
+        return links