Filtering graph schema for Cypher generation (#10577)

Sometimes you don't want the LLM to be aware of the whole graph schema and instead want it to ignore parts of the graph when constructing Cypher statements.
2025-10-14 05:19:27 +00:00 · 2023-09-25 23:14:15 +02:00
parent 89ef440c14
commit 0625ab7a9e
7 changed files with 368 additions and 27 deletions
--- a/docs/extras/use_cases/more/graph/graph_cypher_qa.ipynb
+++ b/docs/extras/use_cases/more/graph/graph_cypher_qa.ipynb
@@ -135,7 +135,7 @@
    }
   ],
   "source": [
-    "print(graph.get_schema)"
+    "print(graph.schema)"
   ]
  },
  {
@@ -510,13 +510,54 @@
    "chain.run(\"Who played in Top Gun?\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eefea16b-508f-4552-8942-9d5063ed7d37",
   "metadata": {},
   "source": [
    "# Ignore specified node and relationship types\n",
    "You can use `include_types` or `exclude_types` to ignore parts of the graph schema when generating Cypher statements."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
-   "id": "48ff7cf8-18a3-43d7-8cb1-c1b91744608d",
+   "id": "a20fa21e-fb85-41c4-aac0-53fb25e34604",
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
    "chain = GraphCypherQAChain.from_llm(\n",
    "     graph=graph,\n",
    "     cypher_llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo\"),\n",
    "     qa_llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-16k\"),\n",
    "     verbose=True,\n",
    "     exclude_types=['Movie']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "3ad7f6b8-543e-46e4-a3b2-40fa3e66e895",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Node properties are the following: \n",
      " {'Actor': [{'property': 'name', 'type': 'STRING'}]}\n",
      "Relationships properties are the following: \n",
      " {}\n",
      "Relationships are: \n",
      "[]\n"
     ]
    }
   ],
   "source": [
    "# Inspect graph schema\n",
    "print(chain.graph_schema)"
   ]
  }
 ],
 "metadata": {
--- a/docs/extras/use_cases/more/graph/graph_memgraph_qa.ipynb
+++ b/docs/extras/use_cases/more/graph/graph_memgraph_qa.ipynb
@@ -187,7 +187,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "print(graph.get_schema)"
+    "print(graph.schema)"
   ]
  },
  {
@@ -687,7 +687,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.8.8"
  }
 },
 "nbformat": 4,
--- a/libs/langchain/langchain/chains/graph_qa/cypher.py
+++ b/libs/langchain/langchain/chains/graph_qa/cypher.py
@@ -34,12 +34,54 @@ def extract_cypher(text: str) -> str:
    return matches[0] if matches else text
 def construct_schema(
    structured_schema: Dict[str, Any],
    include_types: List[str],
    exclude_types: List[str],
 ) -> str:
    """Render a textual graph schema restricted to the permitted types.

    Args:
        structured_schema: Mapping that may hold ``node_props`` and
            ``rel_props`` (dicts keyed by node label / relationship type) and
            ``relationships`` (a list of dicts with ``start``, ``type`` and
            ``end`` keys). Missing sections are treated as empty.
        include_types: When non-empty, only these names are kept and
            ``exclude_types`` is not consulted.
        exclude_types: Names to drop when ``include_types`` is empty.

    Returns:
        The schema description string handed to the Cypher generation prompt.
    """

    def is_allowed(name: str) -> bool:
        # An allow-list, when given, takes precedence over the deny-list.
        if include_types:
            return name in include_types
        return name not in exclude_types

    kept_node_props = {}
    for label, props in structured_schema.get("node_props", {}).items():
        if is_allowed(label):
            kept_node_props[label] = props

    kept_rel_props = {}
    for rel_type, props in structured_schema.get("rel_props", {}).items():
        if is_allowed(rel_type):
            kept_rel_props[rel_type] = props

    # A relationship survives only if both endpoint labels and its own type do.
    rel_patterns = []
    for rel in structured_schema.get("relationships", []):
        if (
            is_allowed(rel["start"])
            and is_allowed(rel["end"])
            and is_allowed(rel["type"])
        ):
            rel_patterns.append(
                f"(:{rel['start']})-[:{rel['type']}]->(:{rel['end']})"
            )

    header = (
        f"Node properties are the following: \n {kept_node_props}\n"
        f"Relationships properties are the following: \n {kept_rel_props}"
        "\nRelationships are: \n"
    )
    return header + str(rel_patterns)
 class GraphCypherQAChain(Chain):
    """Chain for question-answering against a graph by generating Cypher statements."""
    graph: Neo4jGraph = Field(exclude=True)
    cypher_generation_chain: LLMChain
    qa_chain: LLMChain
    graph_schema: str
    input_key: str = "query"  #: :meta private:
    output_key: str = "result"  #: :meta private:
    top_k: int = 10
@@ -79,6 +121,8 @@ class GraphCypherQAChain(Chain):
        cypher_prompt: BasePromptTemplate = CYPHER_GENERATION_PROMPT,
        cypher_llm: Optional[BaseLanguageModel] = None,
        qa_llm: Optional[BaseLanguageModel] = None,
        exclude_types: List[str] = [],
        include_types: List[str] = [],
        **kwargs: Any,
    ) -> GraphCypherQAChain:
        """Initialize from LLM."""
@@ -96,7 +140,18 @@ class GraphCypherQAChain(Chain):
        qa_chain = LLMChain(llm=qa_llm or llm, prompt=qa_prompt)
        cypher_generation_chain = LLMChain(llm=cypher_llm or llm, prompt=cypher_prompt)
        if exclude_types and include_types:
            raise ValueError(
                "Either `exclude_types` or `include_types` "
                "can be provided, but not both"
            )
        graph_schema = construct_schema(
            kwargs["graph"].structured_schema, include_types, exclude_types
        )
        return cls(
            graph_schema=graph_schema,
            qa_chain=qa_chain,
            cypher_generation_chain=cypher_generation_chain,
            **kwargs,
@@ -115,7 +170,7 @@ class GraphCypherQAChain(Chain):
        intermediate_steps: List = []
        generated_cypher = self.cypher_generation_chain.run(
-            {"question": question, "schema": self.graph.get_schema}, callbacks=callbacks
+            {"question": question, "schema": self.graph_schema}, callbacks=callbacks
        )
        # Extract Cypher code if it is wrapped in backticks
--- a/libs/langchain/langchain/graphs/memgraph_graph.py
+++ b/libs/langchain/langchain/graphs/memgraph_graph.py
@@ -6,6 +6,12 @@ YIELD *
 RETURN *
 """
 RAW_SCHEMA_QUERY = """
 CALL llm_util.schema("raw")
 YIELD *
 RETURN *
 """
 class MemgraphGraph(Neo4jGraph):
    """Memgraph wrapper for graph operations."""
@@ -24,3 +30,7 @@ class MemgraphGraph(Neo4jGraph):
        db_schema = self.query(SCHEMA_QUERY)[0].get("schema")
        assert db_schema is not None
        self.schema = db_schema
        db_structured_schema = self.query(RAW_SCHEMA_QUERY)[0].get("schema")
        assert db_structured_schema is not None
        self.structured_schema = db_structured_schema
--- a/libs/langchain/langchain/graphs/neo4j_graph.py
+++ b/libs/langchain/langchain/graphs/neo4j_graph.py
@@ -24,7 +24,7 @@ CALL apoc.meta.data()
 YIELD label, other, elementType, type, property
 WHERE type = "RELATIONSHIP" AND elementType = "node"
 UNWIND other AS other_node
-RETURN "(:" + label + ")-[:" + property + "]->(:" + toString(other_node) + ")" AS output
+RETURN {start: label, type: property, end: toString(other_node)} AS output
 """
@@ -45,7 +45,8 @@ class Neo4jGraph:
        self._driver = neo4j.GraphDatabase.driver(url, auth=(username, password))
        self._database = database
-        self.schema = ""
+        self.schema: str = ""
        self.structured_schema: Dict[str, Any] = {}
        # Verify connection
        try:
            self._driver.verify_connectivity()
@@ -69,11 +70,6 @@ class Neo4jGraph:
                "'apoc.meta.data()' is allowed in Neo4j configuration "
            )
    @property
    def get_schema(self) -> str:
        """Returns the schema of the Neo4j database"""
        return self.schema
    def query(self, query: str, params: dict = {}) -> List[Dict[str, Any]]:
        """Query Neo4j database."""
        from neo4j.exceptions import CypherSyntaxError
@@ -89,17 +85,22 @@ class Neo4jGraph:
        """
        Refreshes the Neo4j graph schema information.
        """
-        node_properties = self.query(node_properties_query)
+        node_properties = [el["output"] for el in self.query(node_properties_query)]
-        relationships_properties = self.query(rel_properties_query)
+        rel_properties = [el["output"] for el in self.query(rel_properties_query)]
-        relationships = self.query(rel_query)
+        relationships = [el["output"] for el in self.query(rel_query)]
        self.structured_schema = {
            "node_props": {el["labels"]: el["properties"] for el in node_properties},
            "rel_props": {el["type"]: el["properties"] for el in rel_properties},
            "relationships": relationships,
        }
        self.schema = f"""
        Node properties are the following:
-        {[el['output'] for el in node_properties]}
+        {node_properties}
        Relationship properties are the following:
-        {[el['output'] for el in relationships_properties]}
+        {rel_properties}
        The relationships are the following:
-        {[el['output'] for el in relationships]}
+        {[f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" for el in relationships]}
        """
    def add_graph_documents(
--- a/libs/langchain/tests/integration_tests/chains/test_graph_database.py
+++ b/libs/langchain/tests/integration_tests/chains/test_graph_database.py
@@ -211,21 +211,32 @@ def test_cypher_return_correct_schema() -> None:
    expected_node_properties = [
        {
-            "properties": [{"property": "property_a", "type": "STRING"}],
+            "output": {
-            "labels": "LabelA",
+                "properties": [{"property": "property_a", "type": "STRING"}],
                "labels": "LabelA",
            }
        }
    ]
    expected_relationships_properties = [
-        {"type": "REL_TYPE", "properties": [{"property": "rel_prop", "type": "STRING"}]}
+        {
            "output": {
                "type": "REL_TYPE",
                "properties": [{"property": "rel_prop", "type": "STRING"}],
            }
        }
    ]
    expected_relationships = [
-        "(:LabelA)-[:REL_TYPE]->(:LabelB)",
+        {"output": {"start": "LabelA", "type": "REL_TYPE", "end": "LabelB"}},
-        "(:LabelA)-[:REL_TYPE]->(:LabelC)",
+        {"output": {"start": "LabelA", "type": "REL_TYPE", "end": "LabelC"}},
    ]
    assert node_properties == expected_node_properties
    assert relationships_properties == expected_relationships_properties
-    assert relationships == expected_relationships
+    # Order is not guaranteed with Neo4j returns
    assert (
        sorted(relationships, key=lambda x: x["output"]["end"])
        == expected_relationships
    )
 def test_cypher_save_load() -> None:
@@ -252,3 +263,122 @@ def test_cypher_save_load() -> None:
    qa_loaded = load_chain(FILE_PATH, graph=graph)
    assert qa_loaded == chain
 def test_exclude_types() -> None:
    """Verify that `exclude_types` removes the listed types from the chain schema."""
    neo4j_url = os.environ.get("NEO4J_URL")
    neo4j_user = os.environ.get("NEO4J_USERNAME")
    neo4j_password = os.environ.get("NEO4J_PASSWORD")
    assert neo4j_url is not None
    assert neo4j_user is not None
    assert neo4j_password is not None

    graph = Neo4jGraph(url=neo4j_url, username=neo4j_user, password=neo4j_password)
    # Start from an empty database so only the fixture below shapes the schema
    graph.query("MATCH (n) DETACH DELETE n")
    # Fixture: (:Actor)-[:ACTED_IN]->(:Movie)<-[:DIRECTED]-(:Person)
    seed_statement = (
        "CREATE (a:Actor {name:'Bruce Willis'})"
        "-[:ACTED_IN]->(:Movie {title: 'Pulp Fiction'})"
        "<-[:DIRECTED]-(p:Person {name:'John'})"
    )
    graph.query(seed_statement)
    # Re-read the schema so it reflects the fixture
    graph.refresh_schema()

    chain = GraphCypherQAChain.from_llm(
        OpenAI(temperature=0),
        graph=graph,
        exclude_types=["Person", "DIRECTED"],
    )

    # With Person and DIRECTED excluded, only the Actor-Movie pattern remains
    assert chain.graph_schema == (
        "Node properties are the following: \n"
        " {'Movie': [{'property': 'title', 'type': 'STRING'}], "
        "'Actor': [{'property': 'name', 'type': 'STRING'}]}\n"
        "Relationships properties are the following: \n"
        " {}\nRelationships are: \n"
        "['(:Actor)-[:ACTED_IN]->(:Movie)']"
    )
 def test_include_types() -> None:
    """Verify that `include_types` keeps only the listed types in the chain schema."""
    neo4j_url = os.environ.get("NEO4J_URL")
    neo4j_user = os.environ.get("NEO4J_USERNAME")
    neo4j_password = os.environ.get("NEO4J_PASSWORD")
    assert neo4j_url is not None
    assert neo4j_user is not None
    assert neo4j_password is not None

    graph = Neo4jGraph(url=neo4j_url, username=neo4j_user, password=neo4j_password)
    # Start from an empty database so only the fixture below shapes the schema
    graph.query("MATCH (n) DETACH DELETE n")
    # Fixture: (:Actor)-[:ACTED_IN]->(:Movie)<-[:DIRECTED]-(:Person)
    seed_statement = (
        "CREATE (a:Actor {name:'Bruce Willis'})"
        "-[:ACTED_IN]->(:Movie {title: 'Pulp Fiction'})"
        "<-[:DIRECTED]-(p:Person {name:'John'})"
    )
    graph.query(seed_statement)
    # Re-read the schema so it reflects the fixture
    graph.refresh_schema()

    chain = GraphCypherQAChain.from_llm(
        OpenAI(temperature=0),
        graph=graph,
        include_types=["Movie", "Actor", "ACTED_IN"],
    )

    # Both endpoint labels and the relationship type are included, so it is kept
    assert chain.graph_schema == (
        "Node properties are the following: \n"
        " {'Movie': [{'property': 'title', 'type': 'STRING'}], "
        "'Actor': [{'property': 'name', 'type': 'STRING'}]}\n"
        "Relationships properties are the following: \n"
        " {}\nRelationships are: \n"
        "['(:Actor)-[:ACTED_IN]->(:Movie)']"
    )
 def test_include_types2() -> None:
    """Verify a relationship is dropped when one endpoint label is not included."""
    neo4j_url = os.environ.get("NEO4J_URL")
    neo4j_user = os.environ.get("NEO4J_USERNAME")
    neo4j_password = os.environ.get("NEO4J_PASSWORD")
    assert neo4j_url is not None
    assert neo4j_user is not None
    assert neo4j_password is not None

    graph = Neo4jGraph(url=neo4j_url, username=neo4j_user, password=neo4j_password)
    # Start from an empty database so only the fixture below shapes the schema
    graph.query("MATCH (n) DETACH DELETE n")
    # Fixture: (:Actor)-[:ACTED_IN]->(:Movie)<-[:DIRECTED]-(:Person)
    seed_statement = (
        "CREATE (a:Actor {name:'Bruce Willis'})"
        "-[:ACTED_IN]->(:Movie {title: 'Pulp Fiction'})"
        "<-[:DIRECTED]-(p:Person {name:'John'})"
    )
    graph.query(seed_statement)
    # Re-read the schema so it reflects the fixture
    graph.refresh_schema()

    chain = GraphCypherQAChain.from_llm(
        OpenAI(temperature=0),
        graph=graph,
        include_types=["Movie", "ACTED_IN"],
    )

    # Actor is not included, so ACTED_IN loses its start node and is filtered out
    assert chain.graph_schema == (
        "Node properties are the following: \n"
        " {'Movie': [{'property': 'title', 'type': 'STRING'}]}\n"
        "Relationships properties are the following: \n"
        " {}\nRelationships are: \n"
        "[]"
    )
--- a/libs/langchain/tests/unit_tests/chains/test_graph_qa.py
+++ b/libs/langchain/tests/unit_tests/chains/test_graph_qa.py
@@ -1,4 +1,4 @@
-from langchain.chains.graph_qa.cypher import extract_cypher
+from langchain.chains.graph_qa.cypher import construct_schema, extract_cypher
 def test_no_backticks() -> None:
@@ -13,3 +13,107 @@ def test_backticks() -> None:
    query = "You can use the following query: ```MATCH (n) RETURN n```"
    output = extract_cypher(query)
    assert output == "MATCH (n) RETURN n"
 def test_exclude_types() -> None:
    """Excluded labels and relationship types are absent from the rendered schema."""
    schema = {
        "node_props": {
            "Movie": [{"property": "title", "type": "STRING"}],
            "Actor": [{"property": "name", "type": "STRING"}],
            "Person": [{"property": "name", "type": "STRING"}],
        },
        "rel_props": {},
        "relationships": [
            {"start": "Actor", "end": "Movie", "type": "ACTED_IN"},
            {"start": "Person", "end": "Movie", "type": "DIRECTED"},
        ],
    }

    rendered = construct_schema(schema, [], ["Person", "DIRECTED"])

    # Person and DIRECTED are dropped; the Actor-Movie relationship survives
    assert rendered == (
        "Node properties are the following: \n"
        " {'Movie': [{'property': 'title', 'type': 'STRING'}], "
        "'Actor': [{'property': 'name', 'type': 'STRING'}]}\n"
        "Relationships properties are the following: \n"
        " {}\nRelationships are: \n"
        "['(:Actor)-[:ACTED_IN]->(:Movie)']"
    )
 def test_include_types() -> None:
    """Only included labels and relationship types appear in the rendered schema."""
    schema = {
        "node_props": {
            "Movie": [{"property": "title", "type": "STRING"}],
            "Actor": [{"property": "name", "type": "STRING"}],
            "Person": [{"property": "name", "type": "STRING"}],
        },
        "rel_props": {},
        "relationships": [
            {"start": "Actor", "end": "Movie", "type": "ACTED_IN"},
            {"start": "Person", "end": "Movie", "type": "DIRECTED"},
        ],
    }

    rendered = construct_schema(schema, ["Movie", "Actor", "ACTED_IN"], [])

    # Both endpoints and the type of ACTED_IN are included, so it is kept
    assert rendered == (
        "Node properties are the following: \n"
        " {'Movie': [{'property': 'title', 'type': 'STRING'}], "
        "'Actor': [{'property': 'name', 'type': 'STRING'}]}\n"
        "Relationships properties are the following: \n"
        " {}\nRelationships are: \n"
        "['(:Actor)-[:ACTED_IN]->(:Movie)']"
    )
 def test_include_types2() -> None:
    """A relationship is dropped when its type is missing from `include_types`."""
    schema = {
        "node_props": {
            "Movie": [{"property": "title", "type": "STRING"}],
            "Actor": [{"property": "name", "type": "STRING"}],
            "Person": [{"property": "name", "type": "STRING"}],
        },
        "rel_props": {},
        "relationships": [
            {"start": "Actor", "end": "Movie", "type": "ACTED_IN"},
            {"start": "Person", "end": "Movie", "type": "DIRECTED"},
        ],
    }

    rendered = construct_schema(schema, ["Movie", "Actor"], [])

    # Both node labels are kept, but ACTED_IN itself is not included
    assert rendered == (
        "Node properties are the following: \n"
        " {'Movie': [{'property': 'title', 'type': 'STRING'}], "
        "'Actor': [{'property': 'name', 'type': 'STRING'}]}\n"
        "Relationships properties are the following: \n"
        " {}\nRelationships are: \n"
        "[]"
    )
 def test_include_types3() -> None:
    """A relationship is dropped when one of its endpoint labels is not included.

    This covers the case where the relationship type itself is listed in
    ``include_types`` but its start label is not, which the other unit tests
    in this module do not exercise.
    """
    structured_schema = {
        "node_props": {
            "Movie": [{"property": "title", "type": "STRING"}],
            "Actor": [{"property": "name", "type": "STRING"}],
            "Person": [{"property": "name", "type": "STRING"}],
        },
        "rel_props": {},
        "relationships": [
            {"start": "Actor", "end": "Movie", "type": "ACTED_IN"},
            {"start": "Person", "end": "Movie", "type": "DIRECTED"},
        ],
    }
    # ACTED_IN is included, but its start label Actor is not
    include_types = ["Movie", "ACTED_IN"]
    output = construct_schema(structured_schema, include_types, [])
    expected_schema = (
        "Node properties are the following: \n"
        " {'Movie': [{'property': 'title', 'type': 'STRING'}]}\n"
        "Relationships properties are the following: \n"
        " {}\nRelationships are: \n"
        "[]"
    )
    assert output == expected_schema