community[patch]: Neo4j enhanced schema (#20983)

Scan the database for example values and provide them to an LLM for
better inference of Text2cypher
This commit is contained in:
Tomaz Bratanic 2024-04-29 16:45:55 +02:00 committed by GitHub
parent dc70c23a11
commit 67428c4052
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 423 additions and 92 deletions

View File

@ -21,7 +21,7 @@
"id": "dbc0ee68", "id": "dbc0ee68",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Settin up\n", "## Setting up\n",
"\n", "\n",
"You will need to have a running `Neo4j` instance. One option is to create a [free Neo4j database instance in their Aura cloud service](https://neo4j.com/cloud/platform/aura-graph-database/). You can also run the database locally using the [Neo4j Desktop application](https://neo4j.com/download/), or running a docker container.\n", "You will need to have a running `Neo4j` instance. One option is to create a [free Neo4j database instance in their Aura cloud service](https://neo4j.com/cloud/platform/aura-graph-database/). You can also run the database locally using the [Neo4j Desktop application](https://neo4j.com/download/), or running a docker container.\n",
"You can run a local docker container by executing the following script:\n", "You can run a local docker container by executing the following script:\n",
@ -31,7 +31,7 @@
" --name neo4j \\\n", " --name neo4j \\\n",
" -p 7474:7474 -p 7687:7687 \\\n", " -p 7474:7474 -p 7687:7687 \\\n",
" -d \\\n", " -d \\\n",
" -e NEO4J_AUTH=neo4j/pleaseletmein \\\n", " -e NEO4J_AUTH=neo4j/password \\\n",
" -e NEO4J_PLUGINS=\\[\\\"apoc\\\"\\] \\\n", " -e NEO4J_PLUGINS=\\[\\\"apoc\\\"\\] \\\n",
" neo4j:latest\n", " neo4j:latest\n",
"```\n", "```\n",
@ -58,9 +58,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"graph = Neo4jGraph(\n", "graph = Neo4jGraph(url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"password\")"
" url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"pleaseletmein\"\n",
")"
] ]
}, },
{ {
@ -93,7 +91,7 @@
"source": [ "source": [
"graph.query(\n", "graph.query(\n",
" \"\"\"\n", " \"\"\"\n",
"MERGE (m:Movie {name:\"Top Gun\"})\n", "MERGE (m:Movie {name:\"Top Gun\", runtime: 120})\n",
"WITH m\n", "WITH m\n",
"UNWIND [\"Tom Cruise\", \"Val Kilmer\", \"Anthony Edwards\", \"Meg Ryan\"] AS actor\n", "UNWIND [\"Tom Cruise\", \"Val Kilmer\", \"Anthony Edwards\", \"Meg Ryan\"] AS actor\n",
"MERGE (a:Actor {name:actor})\n", "MERGE (a:Actor {name:actor})\n",
@ -131,11 +129,12 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Node properties are the following:\n", "Node properties:\n",
"Movie {name: STRING},Actor {name: STRING}\n", "Movie {runtime: INTEGER, name: STRING}\n",
"Relationship properties are the following:\n", "Actor {name: STRING}\n",
"Relationship properties:\n",
"\n", "\n",
"The relationships are the following:\n", "The relationships:\n",
"(:Actor)-[:ACTED_IN]->(:Movie)\n" "(:Actor)-[:ACTED_IN]->(:Movie)\n"
] ]
} }
@ -144,6 +143,48 @@
"print(graph.schema)" "print(graph.schema)"
] ]
}, },
{
"cell_type": "markdown",
"id": "3d88f516-2e60-4da4-b25f-dad5801fe133",
"metadata": {},
"source": [
"## Enhanced schema information\n",
"Choosing the enhanced schema version enables the system to automatically scan the database for example values and calculate some distribution metrics. For example, if a node property has no more than 10 distinct values, all possible values are returned in the schema. Otherwise, only a single example value is returned per node and relationship property."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c8233976-1ca7-4f8f-af20-e8fb3e081fdd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node properties:\n",
"- **Movie**\n",
" - `runtime: INTEGER` Min: 120, Max: 120\n",
" - `name: STRING` Available options: ['Top Gun']\n",
"- **Actor**\n",
" - `name: STRING` Available options: ['Tom Cruise', 'Val Kilmer', 'Anthony Edwards', 'Meg Ryan']\n",
"Relationship properties:\n",
"\n",
"The relationships:\n",
"(:Actor)-[:ACTED_IN]->(:Movie)\n"
]
}
],
"source": [
"enhanced_graph = Neo4jGraph(\n",
" url=\"bolt://localhost:7687\",\n",
" username=\"neo4j\",\n",
" password=\"password\",\n",
" enhanced_schema=True,\n",
")\n",
"print(enhanced_graph.schema)"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "68a3c677", "id": "68a3c677",
@ -156,7 +197,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 7,
"id": "7476ce98", "id": "7476ce98",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -168,7 +209,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 8,
"id": "ef8ee27b", "id": "ef8ee27b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -180,10 +221,11 @@
"\n", "\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n", "Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n", "RETURN a.name\u001b[0m\n",
"Full Context:\n", "Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
"\n", "\n",
"\u001b[1m> Finished chain.\u001b[0m\n" "\u001b[1m> Finished chain.\u001b[0m\n"
] ]
@ -191,16 +233,17 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'" "{'query': 'Who played in Top Gun?',\n",
" 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.'}"
] ]
}, },
"execution_count": 7, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"chain.run(\"Who played in Top Gun?\")" "chain.invoke({\"query\": \"Who played in Top Gun?\"})"
] ]
}, },
{ {
@ -215,7 +258,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 9,
"id": "df230946", "id": "df230946",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -227,7 +270,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 10,
"id": "3f1600ee", "id": "3f1600ee",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -239,10 +282,11 @@
"\n", "\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n", "Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n", "RETURN a.name\u001b[0m\n",
"Full Context:\n", "Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}]\u001b[0m\n", "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\n", "\n",
"\u001b[1m> Finished chain.\u001b[0m\n" "\u001b[1m> Finished chain.\u001b[0m\n"
] ]
@ -250,16 +294,17 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'Tom Cruise and Val Kilmer played in Top Gun.'" "{'query': 'Who played in Top Gun?',\n",
" 'result': 'Anthony Edwards, Meg Ryan played in Top Gun.'}"
] ]
}, },
"execution_count": 9, "execution_count": 10,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"chain.run(\"Who played in Top Gun?\")" "chain.invoke({\"query\": \"Who played in Top Gun?\"})"
] ]
}, },
{ {
@ -273,7 +318,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 11,
"id": "e412f36b", "id": "e412f36b",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -285,7 +330,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 12,
"id": "4f4699dc", "id": "4f4699dc",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -297,19 +342,20 @@
"\n", "\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n", "Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n", "RETURN a.name\u001b[0m\n",
"Full Context:\n", "Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
"\n", "\n",
"\u001b[1m> Finished chain.\u001b[0m\n", "\u001b[1m> Finished chain.\u001b[0m\n",
"Intermediate steps: [{'query': \"MATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\\nRETURN a.name\"}, {'context': [{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]}]\n", "Intermediate steps: [{'query': \"MATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\\nWHERE m.name = 'Top Gun'\\nRETURN a.name\"}, {'context': [{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]}]\n",
"Final answer: Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.\n" "Final answer: Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.\n"
] ]
} }
], ],
"source": [ "source": [
"result = chain(\"Who played in Top Gun?\")\n", "result = chain.invoke({\"query\": \"Who played in Top Gun?\"})\n",
"print(f\"Intermediate steps: {result['intermediate_steps']}\")\n", "print(f\"Intermediate steps: {result['intermediate_steps']}\")\n",
"print(f\"Final answer: {result['result']}\")" "print(f\"Final answer: {result['result']}\")"
] ]
@ -325,7 +371,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 13,
"id": "2d3acf10", "id": "2d3acf10",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -337,7 +383,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 14,
"id": "b0a9d143", "id": "b0a9d143",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -349,7 +395,8 @@
"\n", "\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n", "Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n", "RETURN a.name\u001b[0m\n",
"\n", "\n",
"\u001b[1m> Finished chain.\u001b[0m\n" "\u001b[1m> Finished chain.\u001b[0m\n"
@ -358,19 +405,20 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[{'a.name': 'Tom Cruise'},\n", "{'query': 'Who played in Top Gun?',\n",
" {'a.name': 'Val Kilmer'},\n", " 'result': [{'a.name': 'Anthony Edwards'},\n",
" {'a.name': 'Anthony Edwards'},\n", " {'a.name': 'Meg Ryan'},\n",
" {'a.name': 'Meg Ryan'}]" " {'a.name': 'Val Kilmer'},\n",
" {'a.name': 'Tom Cruise'}]}"
] ]
}, },
"execution_count": 13, "execution_count": 14,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"chain.run(\"Who played in Top Gun?\")" "chain.invoke({\"query\": \"Who played in Top Gun?\"})"
] ]
}, },
{ {
@ -384,7 +432,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 15,
"id": "59baeb88-adfa-4c26-8334-fcbff3a98efb", "id": "59baeb88-adfa-4c26-8334-fcbff3a98efb",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -422,7 +470,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 16,
"id": "47c64027-cf42-493a-9c76-2d10ba753728", "id": "47c64027-cf42-493a-9c76-2d10ba753728",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -434,7 +482,7 @@
"\n", "\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n", "Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (m:Movie {name:\"Top Gun\"})<-[:ACTED_IN]-(:Actor)\n", "\u001b[32;1m\u001b[1;3mMATCH (:Movie {name:\"Top Gun\"})<-[:ACTED_IN]-()\n",
"RETURN count(*) AS numberOfActors\u001b[0m\n", "RETURN count(*) AS numberOfActors\u001b[0m\n",
"Full Context:\n", "Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'numberOfActors': 4}]\u001b[0m\n", "\u001b[32;1m\u001b[1;3m[{'numberOfActors': 4}]\u001b[0m\n",
@ -445,16 +493,17 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'Four people played in Top Gun.'" "{'query': 'How many people played in Top Gun?',\n",
" 'result': 'There were 4 actors who played in Top Gun.'}"
] ]
}, },
"execution_count": 15, "execution_count": 16,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"chain.run(\"How many people played in Top Gun?\")" "chain.invoke({\"query\": \"How many people played in Top Gun?\"})"
] ]
}, },
{ {
@ -468,7 +517,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 17,
"id": "6f9becc2-f579-45bf-9b50-2ce02bde92da", "id": "6f9becc2-f579-45bf-9b50-2ce02bde92da",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -483,7 +532,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 18,
"id": "ff18e3e3-3402-4683-aec4-a19898f23ca1", "id": "ff18e3e3-3402-4683-aec4-a19898f23ca1",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -495,10 +544,11 @@
"\n", "\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n", "Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n", "RETURN a.name\u001b[0m\n",
"Full Context:\n", "Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
"\n", "\n",
"\u001b[1m> Finished chain.\u001b[0m\n" "\u001b[1m> Finished chain.\u001b[0m\n"
] ]
@ -506,16 +556,17 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'" "{'query': 'Who played in Top Gun?',\n",
" 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, and Tom Cruise played in Top Gun.'}"
] ]
}, },
"execution_count": 17, "execution_count": 18,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"chain.run(\"Who played in Top Gun?\")" "chain.invoke({\"query\": \"Who played in Top Gun?\"})"
] ]
}, },
{ {
@ -530,7 +581,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 19,
"id": "a20fa21e-fb85-41c4-aac0-53fb25e34604", "id": "a20fa21e-fb85-41c4-aac0-53fb25e34604",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -546,7 +597,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 20,
"id": "3ad7f6b8-543e-46e4-a3b2-40fa3e66e895", "id": "3ad7f6b8-543e-46e4-a3b2-40fa3e66e895",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -579,7 +630,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 21,
"id": "53665d03-7afd-433c-bdd5-750127bfb152", "id": "53665d03-7afd-433c-bdd5-750127bfb152",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -594,7 +645,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 22,
"id": "19e1a591-9c10-4d7b-aa36-a5e1b778a97b", "id": "19e1a591-9c10-4d7b-aa36-a5e1b778a97b",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -606,10 +657,11 @@
"\n", "\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n", "Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n", "\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie)\n",
"WHERE m.name = 'Top Gun'\n",
"RETURN a.name\u001b[0m\n", "RETURN a.name\u001b[0m\n",
"Full Context:\n", "Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n", "\u001b[32;1m\u001b[1;3m[{'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Tom Cruise'}]\u001b[0m\n",
"\n", "\n",
"\u001b[1m> Finished chain.\u001b[0m\n" "\u001b[1m> Finished chain.\u001b[0m\n"
] ]
@ -617,16 +669,17 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'" "{'query': 'Who played in Top Gun?',\n",
" 'result': 'Anthony Edwards, Meg Ryan, Val Kilmer, Tom Cruise played in Top Gun.'}"
] ]
}, },
"execution_count": 21, "execution_count": 22,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"chain.run(\"Who played in Top Gun?\")" "chain.invoke({\"query\": \"Who played in Top Gun?\"})"
] ]
}, },
{ {
@ -654,7 +707,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.12" "version": "3.9.18"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -9,6 +9,11 @@ from langchain_community.graphs.graph_store import GraphStore
BASE_ENTITY_LABEL = "__Entity__" BASE_ENTITY_LABEL = "__Entity__"
EXCLUDED_LABELS = ["_Bloom_Perspective_", "_Bloom_Scene_"] EXCLUDED_LABELS = ["_Bloom_Perspective_", "_Bloom_Scene_"]
EXCLUDED_RELS = ["_Bloom_HAS_SCENE_"] EXCLUDED_RELS = ["_Bloom_HAS_SCENE_"]
# Labels / relationship types with fewer elements than this are scanned
# exhaustively when the enhanced schema is built; at or above it only a small
# sample is inspected.
EXHAUSTIVE_SEARCH_LIMIT = 10000
# List properties whose minimum size exceeds this are treated as embeddings and
# left out of the enhanced schema text.
# NOTE(review): this constant was hoisted out of value_sanitize, which
# presumably still uses it as its list cut-off -- confirm.
LIST_LIMIT = 128
# Threshold for returning all available prop values in graph schema
DISTINCT_VALUE_LIMIT = 10
# Newline held in a name so it can be used inside f-string expressions
# (presumably because a backslash there is a syntax error before Python 3.12).
NL = "\n"
node_properties_query = """ node_properties_query = """
CALL apoc.meta.data() CALL apoc.meta.data()
@ -56,7 +61,6 @@ def value_sanitize(d: Any) -> Any:
results, can occupy significant context space and detract from results, can occupy significant context space and detract from
the LLM's performance by introducing unnecessary noise and cost. the LLM's performance by introducing unnecessary noise and cost.
""" """
LIST_LIMIT = 128
if isinstance(d, dict): if isinstance(d, dict):
new_dict = {} new_dict = {}
for key, value in d.items(): for key, value in d.items():
@ -135,6 +139,223 @@ def _get_rel_import_query(baseEntityLabel: bool) -> str:
) )
def _enhanced_schema_cypher(
label_or_type: str,
properties: List[Dict[str, Any]],
exhaustive: bool,
is_relationship: bool = False,
) -> str:
if is_relationship:
match_clause = f"MATCH ()-[n:{label_or_type}]->()"
else:
match_clause = f"MATCH (n:{label_or_type})"
with_clauses = []
return_clauses = []
output_dict = {}
if exhaustive:
for prop in properties:
prop_name = prop["property"]
prop_type = prop["type"]
if prop_type == "STRING":
with_clauses.append(
(
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
f"AS `{prop_name}_values`"
)
)
return_clauses.append(
(
f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
f" distinct_count: size(`{prop_name}_values`)"
)
)
elif prop_type in ["INTEGER", "FLOAT", "DATE"]:
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
with_clauses.append(
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
)
return_clauses.append(
(
f"min: toString(`{prop_name}_min`), "
f"max: toString(`{prop_name}_max`), "
f"distinct_count: `{prop_name}_distinct`"
)
)
elif prop_type == "LIST":
with_clauses.append(
(
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
)
)
return_clauses.append(
f"min_size: `{prop_name}_size_min`, "
f"max_size: `{prop_name}_size_max`"
)
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
else:
# Just sample 5 random nodes
match_clause += " WITH n LIMIT 5"
for prop in properties:
prop_name = prop["property"]
prop_type = prop["type"]
if prop_type == "STRING":
with_clauses.append(
(
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
f"AS `{prop_name}_values`"
)
)
return_clauses.append(f"values: `{prop_name}_values`")
elif prop_type in ["INTEGER", "FLOAT", "DATE"]:
with_clauses.append(
f"collect(distinct toString(n.`{prop_name}`)) "
f"AS `{prop_name}_values`"
)
return_clauses.append(f"values: `{prop_name}_values`")
elif prop_type == "LIST":
with_clauses.append(
(
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
)
)
return_clauses.append(
f"min_size: `{prop_name}_size_min`,max_size: `{prop_name}_size_max`"
)
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
with_clause = "WITH " + ",\n ".join(with_clauses)
return_clause = (
"RETURN {"
+ ", ".join(f"{k}: {v}" for k, v in output_dict.items())
+ "} AS output"
)
# Combine all parts of the Cypher query
cypher_query = "\n".join([match_clause, with_clause, return_clause])
return cypher_query
def _format_schema(schema: Dict, is_enhanced: bool) -> str:
formatted_node_props = []
formatted_rel_props = []
if is_enhanced:
# Enhanced formatting for nodes
for node_type, properties in schema["node_props"].items():
formatted_node_props.append(f"- **{node_type}**")
for prop in properties:
example = ""
if prop["type"] == "STRING":
if prop.get("distinct_count", 11) > DISTINCT_VALUE_LIMIT:
example = (
f'Example: "{prop["values"][0].replace(NL, " ")}"'
if prop["values"]
else ""
)
else: # If less than 10 possible values return all
example = (
(
"Available options: "
f'{[el.replace(NL, " ") for el in prop["values"]]}'
)
if prop["values"]
else ""
)
elif prop["type"] in ["INTEGER", "FLOAT", "DATE"]:
if prop.get("min") is not None:
example = f'Min: {prop["min"]}, Max: {prop["max"]}'
else:
example = (
f'Example: "{prop["values"][0]}"' if prop["values"] else ""
)
elif prop["type"] == "LIST":
# Skip embeddings
if prop["min_size"] > LIST_LIMIT:
continue
example = (
f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}'
)
formatted_node_props.append(
f" - `{prop['property']}: {prop['type']}` {example}"
)
# Enhanced formatting for relationships
for rel_type, properties in schema["rel_props"].items():
formatted_rel_props.append(f"- **{rel_type}**")
for prop in properties:
example = ""
if prop["type"] == "STRING":
if prop.get("distinct_count", 11) > DISTINCT_VALUE_LIMIT:
example = (
f'Example: "{prop["values"][0].replace(NL, " ")}"'
if prop["values"]
else ""
)
else: # If less than 10 possible values return all
example = (
(
"Available options: "
f'{[el.replace(NL, " ") for el in prop["values"]]}'
)
if prop["values"]
else ""
)
elif prop["type"] in ["INTEGER", "FLOAT", "DATE"]:
if prop.get("min"): # If we have min/max
example = f'Min: {prop["min"]}, Max: {prop["max"]}'
else: # return a single value
example = (
f'Example: "{prop["values"][0]}"' if prop["values"] else ""
)
elif prop["type"] == "LIST":
# Skip embeddings
if prop["min_size"] > LIST_LIMIT:
continue
example = (
f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}'
)
formatted_rel_props.append(
f" - `{prop['property']}: {prop['type']}` {example}"
)
else:
# Format node properties
for label, props in schema["node_props"].items():
props_str = ", ".join(
[f"{prop['property']}: {prop['type']}" for prop in props]
)
formatted_node_props.append(f"{label} {{{props_str}}}")
# Format relationship properties using structured_schema
for type, props in schema["rel_props"].items():
props_str = ", ".join(
[f"{prop['property']}: {prop['type']}" for prop in props]
)
formatted_rel_props.append(f"{type} {{{props_str}}}")
# Format relationships
formatted_rels = [
f"(:{el['start']})-[:{el['type']}]->(:{el['end']})"
for el in schema["relationships"]
]
return "\n".join(
[
"Node properties:",
"\n".join(formatted_node_props),
"Relationship properties:",
"\n".join(formatted_rel_props),
"The relationships:",
"\n".join(formatted_rels),
]
)
class Neo4jGraph(GraphStore): class Neo4jGraph(GraphStore):
"""Neo4j database wrapper for various graph operations. """Neo4j database wrapper for various graph operations.
@ -151,6 +372,8 @@ class Neo4jGraph(GraphStore):
embedding-like properties from database responses. Default is False. embedding-like properties from database responses. Default is False.
refresh_schema (bool): A flag whether to refresh schema information refresh_schema (bool): A flag whether to refresh schema information
at initialization. Default is True. at initialization. Default is True.
enhanced_schema (bool): A flag whether to scan the database for
example values and use them in the graph schema. Default is False.
driver_config (Dict): Configuration passed to Neo4j Driver. driver_config (Dict): Configuration passed to Neo4j Driver.
*Security note*: Make sure that the database connection uses credentials *Security note*: Make sure that the database connection uses credentials
@ -176,6 +399,7 @@ class Neo4jGraph(GraphStore):
refresh_schema: bool = True, refresh_schema: bool = True,
*, *,
driver_config: Optional[Dict] = None, driver_config: Optional[Dict] = None,
enhanced_schema: bool = False,
) -> None: ) -> None:
"""Create a new Neo4j graph wrapper instance.""" """Create a new Neo4j graph wrapper instance."""
try: try:
@ -203,6 +427,7 @@ class Neo4jGraph(GraphStore):
self._database = database self._database = database
self.timeout = timeout self.timeout = timeout
self.sanitize = sanitize self.sanitize = sanitize
self._enhanced_schema = enhanced_schema
self.schema: str = "" self.schema: str = ""
self.structured_schema: Dict[str, Any] = {} self.structured_schema: Dict[str, Any] = {}
# Verify connection # Verify connection
@ -300,37 +525,48 @@ class Neo4jGraph(GraphStore):
"metadata": {"constraint": constraint, "index": index}, "metadata": {"constraint": constraint, "index": index},
} }
# Format node properties if self._enhanced_schema:
formatted_node_props = [] schema_counts = self.query(
for el in node_properties: "CALL apoc.meta.graphSample() YIELD nodes, relationships "
props_str = ", ".join( "RETURN nodes, [rel in relationships | {name:apoc.any.property"
[f"{prop['property']}: {prop['type']}" for prop in el["properties"]] "(rel, 'type'), count: apoc.any.property(rel, 'count')}]"
" AS relationships"
) )
formatted_node_props.append(f"{el['labels']} {{{props_str}}}") # Update node info
for node in schema_counts[0]["nodes"]:
# Skip bloom labels
if node["name"] in EXCLUDED_LABELS:
continue
node_props = self.structured_schema["node_props"][node["name"]]
enhanced_cypher = _enhanced_schema_cypher(
node["name"], node_props, node["count"] < EXHAUSTIVE_SEARCH_LIMIT
)
enhanced_info = self.query(enhanced_cypher)[0]["output"]
for prop in node_props:
if prop["property"] in enhanced_info:
prop.update(enhanced_info[prop["property"]])
# Update rel info
for rel in schema_counts[0]["relationships"]:
# Skip bloom labels
if rel["name"] in EXCLUDED_RELS:
continue
rel_props = self.structured_schema["rel_props"].get(rel["name"])
if not rel_props:
continue
enhanced_cypher = _enhanced_schema_cypher(
rel["name"],
rel_props,
rel["count"] < EXHAUSTIVE_SEARCH_LIMIT,
is_relationship=True,
)
enhanced_info = self.query(enhanced_cypher)[0]["output"]
for prop in rel_props:
if prop["property"] in enhanced_info:
prop.update(enhanced_info[prop["property"]])
# Format relationship properties schema = _format_schema(self.structured_schema, self._enhanced_schema)
formatted_rel_props = []
for el in rel_properties:
props_str = ", ".join(
[f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
)
formatted_rel_props.append(f"{el['type']} {{{props_str}}}")
# Format relationships self.schema = schema
formatted_rels = [
f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" for el in relationships
]
self.schema = "\n".join(
[
"Node properties are the following:",
",".join(formatted_node_props),
"Relationship properties are the following:",
",".join(formatted_rel_props),
"The relationships are the following:",
",".join(formatted_rels),
]
)
def add_graph_documents( def add_graph_documents(
self, self,

View File

@ -291,3 +291,45 @@ def test_driver_config() -> None:
driver_config={"max_connection_pool_size": 1}, driver_config={"max_connection_pool_size": 1},
) )
graph.query("RETURN 'foo'") graph.query("RETURN 'foo'")
def test_enhanced_schema() -> None:
    """Test that the enhanced schema adds example values to node properties."""
    url = os.environ.get("NEO4J_URI")
    username = os.environ.get("NEO4J_USERNAME")
    password = os.environ.get("NEO4J_PASSWORD")
    assert url is not None
    assert username is not None
    assert password is not None

    graph = Neo4jGraph(
        url=url, username=username, password=password, enhanced_schema=True
    )
    # Start from an empty database so only the test data shapes the schema
    graph.query("MATCH (n) DETACH DELETE n")
    graph.add_graph_documents(test_data)
    graph.refresh_schema()

    # Each label holds one node whose id equals the label name
    expected = {
        "node_props": {
            label: [
                {
                    "property": "id",
                    "type": "STRING",
                    "values": [label],
                    "distinct_count": 1,
                }
            ]
            for label in ("foo", "bar")
        },
        "rel_props": {},
        "relationships": [{"start": "foo", "type": "REL", "end": "bar"}],
    }
    # The metadata portion (constraints / indexes) is not part of the check
    graph.structured_schema.pop("metadata")
    assert graph.structured_schema == expected