mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 14:18:52 +00:00
Merge pull request #24315
* community: Add Hierarchy link extractor * add example * lint
This commit is contained in:
parent
c3308f31bc
commit
242b085be7
@ -1,3 +1,7 @@
|
|||||||
|
from langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor import (
|
||||||
|
HierarchyInput,
|
||||||
|
HierarchyLinkExtractor,
|
||||||
|
)
|
||||||
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
|
from langchain_community.graph_vectorstores.extractors.html_link_extractor import (
|
||||||
HtmlInput,
|
HtmlInput,
|
||||||
HtmlLinkExtractor,
|
HtmlLinkExtractor,
|
||||||
@ -12,6 +16,8 @@ from langchain_community.graph_vectorstores.extractors.link_extractor_adapter im
|
|||||||
__all__ = [
|
__all__ = [
|
||||||
"LinkExtractor",
|
"LinkExtractor",
|
||||||
"LinkExtractorAdapter",
|
"LinkExtractorAdapter",
|
||||||
|
"HierarchyInput",
|
||||||
|
"HierarchyLinkExtractor",
|
||||||
"HtmlInput",
|
"HtmlInput",
|
||||||
"HtmlLinkExtractor",
|
"HtmlLinkExtractor",
|
||||||
]
|
]
|
||||||
|
@ -0,0 +1,106 @@
|
|||||||
|
from typing import Callable, List, Set
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.graph_vectorstores.links import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor import (
|
||||||
|
LinkExtractor,
|
||||||
|
)
|
||||||
|
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
|
||||||
|
LinkExtractorAdapter,
|
||||||
|
)
|
||||||
|
|
||||||
|
# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
# A plain assignment is used as the alias instead. The value is the path to a
# node, given as a list of string components.
HierarchyInput = List[str]

# Prefixes prepended to the joined path to form a link tag. They keep parent,
# child and sibling links apart even when they share a `kind` and a path.
_PARENT: str = "p:"
_CHILD: str = "c:"
_SIBLING: str = "s:"
|
||||||
|
|
||||||
|
|
||||||
|
class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
    """Produce links between nodes based on their path in a hierarchy.

    The input is a path given as a list of components (for example
    ``["Root", "H1", "a"]``). Components are joined with ``"/"`` and combined
    with a short prefix to form link tags, so that a node's outgoing tag lines
    up with the incoming tag of the node it should be connected to.
    """

    def __init__(
        self,
        *,
        kind: str = "hierarchy",
        parent_links: bool = True,
        child_links: bool = False,
        sibling_links: bool = False,
    ) -> None:
        """Extract links from a document hierarchy.

        Example:

            .. code-block:: python

                # Given three paths (in this case, within the "Root" document):
                h1 = ["Root", "H1"]
                h1a = ["Root", "H1", "a"]
                h1b = ["Root", "H1", "b"]

                # Parent links `h1a` and `h1b` to `h1`.
                # Child links `h1` to `h1a` and `h1b`.
                # Sibling links `h1a` and `h1b` together (both directions).

        Example use with documents:

            .. code-block:: python

                transformer = LinkExtractorTransformer([
                    HierarchyLinkExtractor().as_document_extractor(
                        # Assumes the "path" to each document is in the metadata.
                        # Could split strings, etc.
                        lambda doc: doc.metadata.get("path", [])
                    )
                ])
                linked = transformer.transform_documents(docs)

        Args:
            kind: Kind of links to produce with this extractor.
            parent_links: Link from a section to its parent.
            child_links: Link from a section to its children.
            sibling_links: Link from a section to other sections with the same
                parent.
        """
        self._kind = kind
        self._parent_links = parent_links
        self._child_links = child_links
        self._sibling_links = sibling_links

    def as_document_extractor(
        self, hierarchy: Callable[[Document], HierarchyInput]
    ) -> LinkExtractor[Document]:
        """Create a LinkExtractor from `Document`.

        Args:
            hierarchy: Function that returns the path for the given document.

        Returns:
            A `LinkExtractor[Document]` suitable for application to `Documents`
            directly or with `LinkExtractorTransformer`.
        """
        return LinkExtractorAdapter(underlying=self, transform=hierarchy)

    def extract_one(
        self,
        input: HierarchyInput,
    ) -> Set[Link]:
        """Build the links for a single path.

        Tags are ``"p:"``, ``"c:"`` or ``"s:"`` followed by a ``"/"``-joined
        path. Links keyed on the node's own path are always emitted when the
        corresponding option is on (including for an empty path). Links keyed
        on the parent path need at least two components for parent/child
        links and at least one component for sibling links.

        Args:
            input: Path components of the node, from the root down.

        Returns:
            The set of links enabled by the constructor options.
        """
        this_path = "/".join(input)

        links: Set[Link] = set()
        if self._parent_links:
            # This is linked from everything with this parent path.
            links.add(Link.incoming(kind=self._kind, tag=_PARENT + this_path))
        if self._child_links:
            # This is linked to every child with this as its "parent" path.
            links.add(Link.outgoing(kind=self._kind, tag=_CHILD + this_path))

        if input:
            # For a single-component path the parent path is the empty string.
            parent_path = "/".join(input[:-1])
            has_parent = len(input) > 1
            if self._parent_links and has_parent:
                # This is linked to the nodes with the given parent path.
                links.add(Link.outgoing(kind=self._kind, tag=_PARENT + parent_path))
            if self._child_links and has_parent:
                # This is linked from every node with the given parent path.
                links.add(Link.incoming(kind=self._kind, tag=_CHILD + parent_path))
            if self._sibling_links:
                # This is a sibling of everything with the same parent.
                links.add(Link.bidir(kind=self._kind, tag=_SIBLING + parent_path))

        return links
|
@ -0,0 +1,84 @@
|
|||||||
|
from langchain_core.graph_vectorstores.links import Link
|
||||||
|
|
||||||
|
from langchain_community.graph_vectorstores.extractors import HierarchyLinkExtractor
|
||||||
|
|
||||||
|
# Three nested fixture paths, deepest first: each one is the previous path
# with its last component removed.
PATH_1 = ["Root", "H1", "h2"]

PATH_2 = ["Root", "H1"]

PATH_3 = ["Root"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_up_only() -> None:
    """With default options only parent links (tag prefix ``p:``) are produced."""
    extractor = HierarchyLinkExtractor()

    assert extractor.extract_one(PATH_1) == {
        # Path1 links up to Root/H1
        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
        # Path1 is linked to by stuff under Root/H1/h2
        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links up to Root
        Link.outgoing(kind="hierarchy", tag="p:Root"),
        # Path2 is linked to by stuff under Root/H1
        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 has no parent, so it is only linked to by stuff under Root
        Link.incoming(kind="hierarchy", tag="p:Root"),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_up_and_down() -> None:
    """With ``child_links=True`` both ``p:`` and ``c:`` tagged links are produced."""
    extractor = HierarchyLinkExtractor(child_links=True)

    assert extractor.extract_one(PATH_1) == {
        # Path1 links up to Root/H1
        Link.outgoing(kind="hierarchy", tag="p:Root/H1"),
        # Path1 is linked to by stuff under Root/H1/h2
        Link.incoming(kind="hierarchy", tag="p:Root/H1/h2"),
        # Path1 links down to things under Root/H1/h2.
        Link.outgoing(kind="hierarchy", tag="c:Root/H1/h2"),
        # Path1 is linked down to by Root/H1
        Link.incoming(kind="hierarchy", tag="c:Root/H1"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links up to Root
        Link.outgoing(kind="hierarchy", tag="p:Root"),
        # Path2 is linked to by stuff under Root/H1
        Link.incoming(kind="hierarchy", tag="p:Root/H1"),
        # Path2 links down to things under Root/H1.
        Link.outgoing(kind="hierarchy", tag="c:Root/H1"),
        # Path2 is linked down to by Root
        Link.incoming(kind="hierarchy", tag="c:Root"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 is linked to by stuff under Root
        Link.incoming(kind="hierarchy", tag="p:Root"),
        # Path3 links down to things under Root.
        Link.outgoing(kind="hierarchy", tag="c:Root"),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_sibling() -> None:
    """With only ``sibling_links`` on, one ``s:`` link keyed on the parent path is produced."""
    extractor = HierarchyLinkExtractor(sibling_links=True, parent_links=False)

    assert extractor.extract_one(PATH_1) == {
        # Path1 links with anything else in Root/H1
        Link.bidir(kind="hierarchy", tag="s:Root/H1"),
    }

    assert extractor.extract_one(PATH_2) == {
        # Path2 links with anything else in Root
        Link.bidir(kind="hierarchy", tag="s:Root"),
    }

    assert extractor.extract_one(PATH_3) == {
        # Path3 links with anything else at the top level (empty parent path)
        Link.bidir(kind="hierarchy", tag="s:"),
    }
|
Loading…
Reference in New Issue
Block a user