[community]: Render documents to graphviz (#24830)

- **Description:** Adds a helper that renders documents with the
GraphVectorStore metadata fields to Graphviz for visualization. This is
helpful for understanding and debugging.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Ben Chambers 2024-12-13 18:02:09 -08:00 committed by GitHub
parent fc8006121f
commit 008efada2c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 236 additions and 0 deletions

View File

@ -29,6 +29,7 @@ gliner>=0.2.7
google-cloud-documentai>=2.20.1,<3
gql>=3.4.1,<4
gradientai>=1.4.0,<2
graphviz>=0.20.3,<0.21
hdbcli>=2.19.21,<3
hologres-vector==0.0.6
html2text>=2020.1.16

View File

@ -0,0 +1,122 @@
import re
from typing import TYPE_CHECKING, Dict, Iterable, Optional, Tuple
from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_community.graph_vectorstores.links import get_links
if TYPE_CHECKING:
import graphviz
def _escape_id(id: str) -> str:
return id.replace(":", "_")
# Lookup table from a link's `direction` value ("in", "out" or "bidir") to the
# value passed as the `dir` attribute of the Graphviz edge that
# `render_graphviz` draws from a document node to a tag node.
_EDGE_DIRECTION = {
    "in": "back",
    "out": "forward",
    "bidir": "both",
}
_WORD_RE = re.compile("\s*\S+")
def _split_prefix(s: str, max_chars: int = 50) -> str:
words = _WORD_RE.finditer(s)
split = min(len(s), max_chars)
for word in words:
if word.end(0) > max_chars:
break
split = word.end(0)
if split == len(s):
return s
else:
return f"{s[0:split]}..."
@beta()
def render_graphviz(
    documents: Iterable[Document],
    engine: Optional[str] = None,
    node_color: Optional[str] = None,
    node_colors: Optional[Dict[str, Optional[str]]] = None,
    skip_tags: Iterable[Tuple[str, str]] = (),
) -> "graphviz.Digraph":
    """Render a collection of GraphVectorStore documents to GraphViz format.

    Every document becomes a note-shaped node labelled with its ID and a
    shortened prefix of its content. Every distinct ``(kind, tag)`` link
    becomes a separate tag node, and each document is connected to the tag
    nodes of its links with an edge whose direction reflects the link
    direction.

    Args:
        documents: The documents to render.
        engine: GraphViz layout engine to use. `None` uses the default.
        node_color: Default node color.
        node_colors: Dictionary specifying colors of specific nodes. Useful for
            emphasizing nodes that were selected by MMR, or differ from other
            results. An entry mapped to `None` disables the default color for
            that node.
        skip_tags: Set of tags to skip when rendering the graph. Specified as
            tuples containing the kind and tag.

    Returns:
        The "graphviz.Digraph" representing the nodes. May be printed to source,
        or rendered using `dot`.

    Raises:
        ImportError: If the `graphviz` Python package is not installed.
        ValueError: If a document has no ID.

    Note:
        To render the generated DOT source code, you also need to install Graphviz_
        (`download page <https://www.graphviz.org/download/>`_,
        `archived versions <https://www2.graphviz.org/Archive/stable/>`_,
        `installation procedure for Windows <https://forum.graphviz.org/t/new-simplified-installation-procedure-on-windows/224>`_).

        Document IDs are used as node identifiers after replacing ``":"`` with
        ``"_"``, so IDs that only differ in those characters share one node.
    """
    try:
        import graphviz
    except ImportError as err:
        # Chain the original error so the underlying import failure stays
        # visible in the traceback.
        raise ImportError(
            "Could not import graphviz python package. "
            "Please install it with `pip install graphviz`."
        ) from err

    if node_colors is None:
        node_colors = {}

    graph = graphviz.Digraph(engine=engine)
    graph.attr(rankdir="LR")
    graph.attr("node", style="filled")

    # Materialize once so membership tests work for any iterable.
    skipped = set(skip_tags)
    # Maps each (kind, tag) pair seen so far to the ID of its tag node.
    tags: Dict[Tuple[str, str], str] = {}

    for document in documents:
        doc_id = document.id
        if doc_id is None:
            raise ValueError(f"Illegal graph document without ID: {document}")
        escaped_id = _escape_id(doc_id)

        # A key that is present wins even when its value is None, which lets a
        # caller switch off the default color for an individual node.
        color = node_colors.get(doc_id, node_color)

        node_label = "\n".join(
            [
                graphviz.escape(doc_id),
                graphviz.escape(_split_prefix(document.page_content)),
            ]
        )
        graph.node(
            escaped_id,
            label=node_label,
            shape="note",
            fillcolor=color,
            tooltip=graphviz.escape(document.page_content),
        )

        for link in get_links(document):
            tag_key = (link.kind, link.tag)
            if tag_key in skipped:
                continue

            tag_id = tags.get(tag_key)
            if tag_id is None:
                # First time this tag is seen: create its node.
                tag_id = f"tag_{len(tags)}"
                tags[tag_key] = tag_id
                graph.node(tag_id, label=graphviz.escape(f"{link.kind}:{link.tag}"))

            graph.edge(escaped_id, tag_id, dir=_EDGE_DIRECTION[link.direction])
    return graph

View File

@ -0,0 +1,113 @@
import pytest
from langchain_core.documents import Document
from langchain_community.graph_vectorstores.links import METADATA_LINKS_KEY, Link
from langchain_community.graph_vectorstores.visualize import render_graphviz
@pytest.mark.requires("graphviz")
def test_visualize_simple_graph() -> None:
    """Check the DOT source produced by `render_graphviz` for two documents.

    Covers the default rendering, the `engine` argument, per-node colors,
    a default color with a per-node `None` override, and `skip_tags`.
    """
    first = Document(
        id="a",
        page_content="some content",
        metadata={
            METADATA_LINKS_KEY: [
                Link.incoming("href", "a"),
                Link.bidir("kw", "foo"),
            ]
        },
    )
    second = Document(
        id="b",
        page_content="<some\n more content>",
        metadata={
            METADATA_LINKS_KEY: [
                Link.incoming("href", "b"),
                Link.outgoing("href", "a"),
                Link.bidir("kw", "foo"),
                Link.bidir("kw", "bar"),
            ]
        },
    )
    docs = [first, second]

    # Default rendering: no colors, every tag shown.
    default_source = render_graphviz(docs).source
    expected_default = (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )
    assert default_source == expected_default

    # The layout engine is passed through to the resulting graph.
    with_engine = render_graphviz(docs, engine="fdp")
    assert with_engine.engine == "fdp"

    # A color given for a single node only affects that node.
    single_color_source = render_graphviz(docs, node_colors={"a": "gold"}).source
    expected_single_color = (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" fillcolor=gold '
        'shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )
    assert single_color_source == expected_single_color

    # A default color applies to every node except one overridden with None.
    override_source = render_graphviz(
        docs, node_color="gold", node_colors={"a": None}
    ).source
    expected_override = (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" fillcolor=gold '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )
    assert override_source == expected_override

    # Skipped tags get neither a node nor edges, and later tags are renumbered.
    skipped_source = render_graphviz(docs, skip_tags=[("kw", "foo")]).source
    expected_skipped = (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_1 [label="href:b"]\n'
        "\tb -> tag_1 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        '\ttag_2 [label="kw:bar"]\n'
        "\tb -> tag_2 [dir=both]\n"
        "}\n"
    )
    assert skipped_source == expected_skipped