experimental[minor]: Add LLM graph transformer (#18733)

Add a class that constructs knowledge graphs based on text using an LLM.
2025-08-20 09:57:32 +00:00 · 2024-03-08 05:52:53 +01:00 · 2024-03-08 05:52:53 +01:00 · c8c592d3f1
commit c8c592d3f1
parent 3ecb903d49
2 changed files with 257 additions and 0 deletions
--- a/libs/experimental/langchain_experimental/graph_transformers/init.py
+++ b/libs/experimental/langchain_experimental/graph_transformers/init.py
@ -1,5 +1,7 @@
 from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
+from langchain_experimental.graph_transformers.llm import LLMGraphTransformer

 __all__ = [
    "DiffbotGraphTransformer",
+    "LLMGraphTransformer",
 ]
--- a/libs/experimental/langchain_experimental/graph_transformers/llm.py
+++ b/libs/experimental/langchain_experimental/graph_transformers/llm.py
@ -0,0 +1,255 @@
+from typing import Any, List, Optional, Sequence
+
+from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
+from langchain_core.documents import Document
+from langchain_core.language_models import BaseLanguageModel
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.pydantic_v1 import BaseModel, Field
+
+system_prompt = (
+    "# Knowledge Graph Instructions for GPT-4\n"
+    "## 1. Overview\n"
+    "You are a top-tier algorithm designed for extracting information in structured "
+    "formats to build a knowledge graph.\n"
+    "Try to capture as much information from the text as possible without "
+    "sacrifing accuracy. Do not add any information that is not explicitly "
+    "mentioned in the text\n"
+    "- **Nodes** represent entities and concepts.\n"
+    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
+    "accessible for a vast audience.\n"
+    "## 2. Labeling Nodes\n"
+    "- **Consistency**: Ensure you use available types for node labels.\n"
+    "Ensure you use basic or elementary types for node labels.\n"
+    "- For example, when you identify an entity representing a person, "
+    "always label it as **'person'**. Avoid using more specific terms "
+    "like 'mathematician' or 'scientist'"
+    "  - **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
+    "names or human-readable identifiers found in the text.\n"
+    "- **Relationships** represent connections between entities or concepts.\n"
+    "Ensure consistency and generality in relationship types when constructing "
+    "knowledge graphs. Instead of using specific and momentary types "
+    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
+    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
+    "## 3. Coreference Resolution\n"
+    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
+    "ensure consistency.\n"
+    'If an entity, such as "John Doe", is mentioned multiple times in the text '
+    'but is referred to by different names or pronouns (e.g., "Joe", "he"),'
+    "always use the most complete identifier for that entity throughout the "
+    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
+    "Remember, the knowledge graph should be coherent and easily understandable, "
+    "so maintaining consistency in entity references is crucial.\n"
+    "## 4. Strict Compliance\n"
+    "Adhere to the rules strictly. Non-compliance will result in termination."
+)
+
+default_prompt = ChatPromptTemplate.from_messages(
+    [
+        (
+            "system",
+            system_prompt,
+        ),
+        (
+            "human",
+            (
+                "Use the given format to extract information from the "
+                "following input: {input}"
+            ),
+        ),
+        (
+            "human",
+            "Tip: Make sure to answer in the correct format and do not include any ",
+        ),
+    ]
+)
+
+
+def optional_enum_field(
+    enum_values: Optional[List[str]] = None,
+    description: Optional[str] = None,
+    **field_kwargs: Any,
+) -> Any:
+    """Utility function to conditionally create a field with an enum constraint."""
+    if enum_values:
+        return Field(
+            ...,
+            enum=enum_values,
+            description=f"{description}. Available options are {enum_values}",
+            **field_kwargs,
+        )
+    else:
+        return Field(..., description=description, **field_kwargs)
+
+
+def create_simple_model(
+    node_labels: Optional[List[str]] = None, rel_types: Optional[List[str]] = None
+) -> Any:
+    """
+    Simple model allows to limit node and/or relationship types.
+    Doesn't have any node or relationship properties.
+    """
+
+    class SimpleNode(BaseModel):
+        """Represents a node in a graph with associated properties."""
+
+        id: str = Field(description="A unique identifier for the node.")
+        type: str = optional_enum_field(
+            node_labels, description="The type or label of the node."
+        )
+
+    class SimpleRelationship(BaseModel):
+        """Represents a directed relationship between two nodes in a graph."""
+
+        source: SimpleNode = Field(description="The source node of the relationship.")
+        target: SimpleNode = Field(description="The target node of the relationship.")
+        type: str = optional_enum_field(
+            rel_types, description="The type of the relationship."
+        )
+
+    class DynamicGraph(BaseModel):
+        """Represents a graph document consisting of nodes and relationships."""
+
+        nodes: Optional[List[SimpleNode]] = Field(description="List of nodes")
+        relationships: Optional[List[SimpleRelationship]] = Field(
+            description="List of relationships"
+        )
+
+    return DynamicGraph
+
+
+def map_to_base_node(node: Any) -> Node:
+    """Map the SimpleNode to the base Node."""
+    return Node(id=node.id.title(), type=node.type.capitalize())
+
+
+def map_to_base_relationship(rel: Any) -> Relationship:
+    """Map the SimpleRelationship to the base Relationship."""
+    source = map_to_base_node(rel.source)
+    target = map_to_base_node(rel.target)
+    return Relationship(
+        source=source, target=target, type=rel.type.replace(" ", "_").upper()
+    )
+
+
+class LLMGraphTransformer:
+    """
+    A class designed to transform documents into graph-based documents using a LLM.
+    It allows specifying constraints on the types of nodes and relationships to include
+    in the output graph. The class doesn't support neither extract and node or
+    relationship properties
+
+    Args:
+        llm (BaseLanguageModel): An instance of a language model supporting structured
+        output. allowed_nodes (List[str], optional): Specifies which node types are
+        allowed in the graph. Defaults to an empty list, allowing all node types.
+        allowed_relationships (List[str], optional): Specifies which relationship types
+        are allowed in the graph. Defaults to an empty list, allowing all relationship
+        types.
+        prompt (Optional[ChatPromptTemplate], optional): The prompt to pass to the to
+        the LLM with additional instructions.
+        strict_mode (bool, optional): Determines whether the transformer should apply
+        filtering to strictly adhere to `allowed_nodes` and `allowed_relationships`.
+        Defaults to True.
+    Example:
+        .. code-block:: python
+            from langchain_experimental.graph_transformers import LLMGraphTransformer
+            from langchain_core.documents import Document
+            from langchain_openai import ChatOpenAI
+
+            llm=ChatOpenAI(temperature=0)
+            transformer = LLMGraphTransformer(
+                llm=llm,
+                allowed_nodes=["Person", "Organization"])
+
+            doc = Document(page_content="Elon Musk is suing OpenAI")
+            graph_documents = transformer.convert_to_graph_documents([doc])
+    """
+
+    def __init__(
+        self,
+        llm: BaseLanguageModel,
+        allowed_nodes: List[str] = [],
+        allowed_relationships: List[str] = [],
+        prompt: Optional[ChatPromptTemplate] = default_prompt,
+        strict_mode: bool = True,
+    ) -> None:
+        if not hasattr(llm, "with_structured_output"):
+            raise ValueError(
+                "The specified LLM does not support the 'with_structured_output'. "
+                "Please ensure you are using an LLM that supports this feature."
+            )
+        self.allowed_nodes = allowed_nodes
+        self.allowed_relationships = allowed_relationships
+        self.strict_mode = strict_mode
+
+        # Define chain
+        schema = create_simple_model(allowed_nodes, allowed_relationships)
+        structured_llm = llm.with_structured_output(schema)
+        self.chain = prompt | structured_llm
+
+    def process_response(self, document: Document) -> GraphDocument:
+        """
+        Processes a single document, transforming it into a graph document using
+        an LLM based on the model's schema and constraints.
+        """
+        text = document.page_content
+        raw_schema = self.chain.invoke({"input": text})
+        if raw_schema.nodes:
+            nodes = [map_to_base_node(node) for node in raw_schema.nodes]
+        else:
+            nodes = []
+        if raw_schema.relationships:
+            relationships = [
+                map_to_base_relationship(rel) for rel in raw_schema.relationships
+            ]
+        else:
+            relationships = []
+
+        # Strict mode filtering
+        if self.strict_mode and (self.allowed_nodes or self.allowed_relationships):
+            if self.allowed_relationships and self.allowed_nodes:
+                nodes = [node for node in nodes if node.type in self.allowed_nodes]
+                relationships = [
+                    rel
+                    for rel in relationships
+                    if rel.type in self.allowed_relationships
+                    and rel.source.type in self.allowed_nodes
+                    and rel.target.type in self.allowed_nodes
+                ]
+            elif self.allowed_nodes and not self.allowed_relationships:
+                nodes = [node for node in nodes if node.type in self.allowed_nodes]
+                relationships = [
+                    rel
+                    for rel in relationships
+                    if rel.source.type in self.allowed_nodes
+                    and rel.target.type in self.allowed_nodes
+                ]
+            if self.allowed_relationships and not self.allowed_nodes:
+                relationships = [
+                    rel
+                    for rel in relationships
+                    if rel.type in self.allowed_relationships
+                ]
+
+        graph_document = GraphDocument(
+            nodes=nodes, relationships=relationships, source=document
+        )
+        return graph_document
+
+    def convert_to_graph_documents(
+        self, documents: Sequence[Document]
+    ) -> List[GraphDocument]:
+        """Convert a sequence of documents into graph documents.
+
+        Args:
+            documents (Sequence[Document]): The original documents.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Sequence[GraphDocument]: The transformed documents as graphs.
+        """
+        results = []
+        for document in documents:
+            graph_document = self.process_response(document)
+            results.append(graph_document)
+        return results