diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py
index b9dd5644703..46448699293 100644
--- a/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/__init__.py
@@ -1,3 +1,7 @@
+from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
+    HierarchyInput,
+    HierarchyLinkExtractor,
+)
 from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
     HtmlInput,
     HtmlLinkExtractor,
@@ -12,6 +16,8 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
 __all__ = [
     "LinkExtractor",
     "LinkExtractorAdapter",
+    "HierarchyInput",
+    "HierarchyLinkExtractor",
     "HtmlInput",
     "HtmlLinkExtractor",
 ]
diff --git a/libs/community/langchain_community/graph_vectorstores/extractors/hierarchy_link_extractor.py b/libs/community/langchain_community/graph_vectorstores/extractors/hierarchy_link_extractor.py
new file mode 100644
index 00000000000..f3449effe44
--- /dev/null
+++ b/libs/community/langchain_community/graph_vectorstores/extractors/hierarchy_link_extractor.py
@@ -0,0 +1,119 @@
+from typing import Callable, List, Set
+
+from langchain_core.documents import Document
+from langchain_core.graph_vectorstores.links import Link
+
+from langchain_community.graph_vectorstores.extractors.link_extractor import (
+    LinkExtractor,
+)
+from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
+    LinkExtractorAdapter,
+)
+
+# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
+HierarchyInput = List[str]
+
+_PARENT: str = "p:"
+_CHILD: str = "c:"
+_SIBLING: str = "s:"
+
+
+class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
+    def __init__(
+        self,
+        *,
+        kind: str = "hierarchy",
+        parent_links: bool = True,
+        child_links: bool = False,
+        sibling_links: bool = False,
+    ):
+        """Extract links from a document hierarchy.
+
+        Example:
+            .. code-block:: python
+
+                # Given three paths (in this case, within the "Root" document):
+                h1 = ["Root", "H1"]
+                h1a = ["Root", "H1", "a"]
+                h1b = ["Root", "H1", "b"]
+
+                # Parent links `h1a` and `h1b` to `h1`.
+                # Child links `h1` to `h1a` and `h1b`.
+                # Sibling links `h1a` and `h1b` together (both directions).
+
+        Example use with documents:
+            .. code-block:: python
+
+                transformer = LinkExtractorTransformer([
+                    HierarchyLinkExtractor().as_document_extractor(
+                        # Assumes the "path" to each document is in the metadata.
+                        # Could split strings, etc.
+                        lambda doc: doc.metadata.get("path", [])
+                    )
+                ])
+                linked = transformer.transform_documents(docs)
+
+        Args:
+            kind: Kind of links to produce with this extractor.
+            parent_links: Link from a section to its parent.
+            child_links: Link from a section to its children.
+            sibling_links: Link from a section to other sections with the same parent.
+        """
+        self._kind = kind
+        self._parent_links = parent_links
+        self._child_links = child_links
+        self._sibling_links = sibling_links
+
+    def as_document_extractor(
+        self, hierarchy: Callable[[Document], HierarchyInput]
+    ) -> LinkExtractor[Document]:
+        """Create a LinkExtractor from `Document`.
+
+        Args:
+            hierarchy: Function that returns the path for the given document.
+
+        Returns:
+            A `LinkExtractor[Document]` suitable for application to `Documents` directly
+            or with `LinkExtractorTransformer`.
+        """
+        return LinkExtractorAdapter(underlying=self, transform=hierarchy)
+
+    def extract_one(
+        self,
+        input: HierarchyInput,
+    ) -> Set[Link]:
+        """Build the hierarchy links for a single path.
+
+        Tags are the "/"-joined path prefixed by the link type, so the outgoing
+        tag of a node matches the incoming tag of the node it points at.
+
+        Args:
+            input: Path segments from the root down to this node.
+
+        Returns:
+            The set of links enabled on this extractor for the given path.
+        """
+        this_path = "/".join(input)
+        parent_path = None
+
+        links = set()
+        if self._parent_links:
+            # This is linked from everything with this parent path.
+            links.add(Link.incoming(kind=self._kind, tag=_PARENT + this_path))
+        if self._child_links:
+            # This is linked to every child with this as its "parent" path.
+            links.add(Link.outgoing(kind=self._kind, tag=_CHILD + this_path))
+
+        if len(input) >= 1:
+            parent_path = "/".join(input[0:-1])
+            if self._parent_links and len(input) > 1:
+                # This is linked to the nodes with the given parent path.
+                links.add(Link.outgoing(kind=self._kind, tag=_PARENT + parent_path))
+            if self._child_links and len(input) > 1:
+                # This is linked from every node with the given parent path.
+                links.add(Link.incoming(kind=self._kind, tag=_CHILD + parent_path))
+            if self._sibling_links:
+                # This is a sibling of everything with the same parent.
+                links.add(Link.bidir(kind=self._kind, tag=_SIBLING + parent_path))
+
+        return links
diff --git a/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_hierarchy_link_extractor.py b/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_hierarchy_link_extractor.py
new file mode 100644
index 00000000000..109583f6e3a
--- /dev/null
+++ b/libs/community/tests/unit_tests/graph_vectorstores/extractors/test_hierarchy_link_extractor.py
@@ -0,0 +1,84 @@
+from langchain_core.graph_vectorstores.links import Link
+
+from langchain_community.graph_vectorstores.extractors import HierarchyLinkExtractor
+
+PATH_1 = ["Root", "H1", "h2"]
+
+PATH_2 = ["Root", "H1"]
+
+PATH_3 = ["Root"]
+
+
+def test_up_only() -> None:
+    extractor = HierarchyLinkExtractor()
+
+    assert extractor.extract_one(PATH_1) == {
+        # Path1 links up to Root/H1
+        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
+        # Path1 is linked to by stuff under Root/H1/h2
+        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
+    }
+
+    assert extractor.extract_one(PATH_2) == {
+        # Path2 links up to Root
+        Link.outgoing(kind="hierarchy", tag="p:Root"),
+        # Path2 is linked to by stuff under Root/H1
+        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
+    }
+
+    assert extractor.extract_one(PATH_3) == {
+        # Path3 is linked to by stuff under Root
+        Link.incoming(kind="hierarchy", tag="p:Root"),
+    }
+
+
+def test_up_and_down() -> None:
+    extractor = HierarchyLinkExtractor(child_links=True)
+
+    assert extractor.extract_one(PATH_1) == {
+        # Path1 links up to Root/H1
+        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
+        # Path1 is linked to by stuff under Root/H1/h2
+        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
+        # Path1 links down to things under Root/H1/h2.
+        Link.outgoing(kind="hierarchy", tag="c:Root/H1/h2"),
+        # Path1 is linked down to by Root/H1
+        Link.incoming(kind="hierarchy", tag="c:Root/H1"),
+    }
+
+    assert extractor.extract_one(PATH_2) == {
+        # Path2 links up to Root
+        Link.outgoing(kind="hierarchy", tag="p:Root"),
+        # Path2 is linked to by stuff under Root/H1
+        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
+        # Path2 links down to things under Root/H1.
+        Link.outgoing(kind="hierarchy", tag="c:Root/H1"),
+        # Path2 is linked down to by Root
+        Link.incoming(kind="hierarchy", tag="c:Root"),
+    }
+
+    assert extractor.extract_one(PATH_3) == {
+        # Path3 is linked to by stuff under Root
+        Link.incoming(kind="hierarchy", tag="p:Root"),
+        # Path3 links down to things under Root.
+        Link.outgoing(kind="hierarchy", tag="c:Root"),
+    }
+
+
+def test_sibling() -> None:
+    extractor = HierarchyLinkExtractor(sibling_links=True, parent_links=False)
+
+    assert extractor.extract_one(PATH_1) == {
+        # Path1 links with anything else in Root/H1
+        Link.bidir(kind="hierarchy", tag="s:Root/H1"),
+    }
+
+    assert extractor.extract_one(PATH_2) == {
+        # Path2 links with anything else in Root
+        Link.bidir(kind="hierarchy", tag="s:Root"),
+    }
+
+    assert extractor.extract_one(PATH_3) == {
+        # Path3 links with anything else at the top level
+        Link.bidir(kind="hierarchy", tag="s:"),
+    }