Add indexed properties to neo4j enhanced schema (#21335)

This commit is contained in:
Tomaz Bratanic 2024-05-06 23:28:34 +02:00 committed by GitHub
parent a6cdf6572f
commit ac14f171ac
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -142,123 +142,6 @@ def _get_rel_import_query(baseEntityLabel: bool) -> str:
) )
def _enhanced_schema_cypher(
label_or_type: str,
properties: List[Dict[str, Any]],
exhaustive: bool,
is_relationship: bool = False,
) -> str:
if is_relationship:
match_clause = f"MATCH ()-[n:{label_or_type}]->()"
else:
match_clause = f"MATCH (n:{label_or_type})"
with_clauses = []
return_clauses = []
output_dict = {}
if exhaustive:
for prop in properties:
prop_name = prop["property"]
prop_type = prop["type"]
if prop_type == "STRING":
with_clauses.append(
(
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
f"AS `{prop_name}_values`"
)
)
return_clauses.append(
(
f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
f" distinct_count: size(`{prop_name}_values`)"
)
)
elif prop_type in [
"INTEGER",
"FLOAT",
"DATE",
"DATE_TIME",
"LOCAL_DATE_TIME",
]:
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
with_clauses.append(
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
)
return_clauses.append(
(
f"min: toString(`{prop_name}_min`), "
f"max: toString(`{prop_name}_max`), "
f"distinct_count: `{prop_name}_distinct`"
)
)
elif prop_type == "LIST":
with_clauses.append(
(
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
)
)
return_clauses.append(
f"min_size: `{prop_name}_size_min`, "
f"max_size: `{prop_name}_size_max`"
)
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
continue
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
else:
# Just sample 5 random nodes
match_clause += " WITH n LIMIT 5"
for prop in properties:
prop_name = prop["property"]
prop_type = prop["type"]
if prop_type == "STRING":
with_clauses.append(
(
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
f"AS `{prop_name}_values`"
)
)
return_clauses.append(f"values: `{prop_name}_values`")
elif prop_type in [
"INTEGER",
"FLOAT",
"DATE",
"DATE_TIME",
"LOCAL_DATE_TIME",
]:
with_clauses.append(
f"collect(distinct toString(n.`{prop_name}`)) "
f"AS `{prop_name}_values`"
)
return_clauses.append(f"values: `{prop_name}_values`")
elif prop_type == "LIST":
with_clauses.append(
(
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
)
)
return_clauses.append(
f"min_size: `{prop_name}_size_min`,max_size: `{prop_name}_size_max`"
)
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
continue
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
with_clause = "WITH " + ",\n ".join(with_clauses)
return_clause = (
"RETURN {"
+ ", ".join(f"`{k}`: {v}" for k, v in output_dict.items())
+ "} AS output"
)
# Combine all parts of the Cypher query
cypher_query = "\n".join([match_clause, with_clause, return_clause])
return cypher_query
def _format_schema(schema: Dict, is_enhanced: bool) -> str: def _format_schema(schema: Dict, is_enhanced: bool) -> str:
formatted_node_props = [] formatted_node_props = []
formatted_rel_props = [] formatted_rel_props = []
@ -296,17 +179,19 @@ def _format_schema(schema: Dict, is_enhanced: bool) -> str:
example = f'Min: {prop["min"]}, Max: {prop["max"]}' example = f'Min: {prop["min"]}, Max: {prop["max"]}'
else: else:
example = ( example = (
f'Example: "{prop["values"][0]}"' if prop["values"] else "" f'Example: "{prop["values"][0]}"'
if prop.get("values")
else ""
) )
elif prop["type"] == "LIST": elif prop["type"] == "LIST":
# Skip embeddings # Skip embeddings
if prop["min_size"] > LIST_LIMIT: if not prop.get("min_size") or prop["min_size"] > LIST_LIMIT:
continue continue
example = ( example = (
f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}' f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}'
) )
formatted_node_props.append( formatted_node_props.append(
f" - `{prop['property']}`: {prop['type']}` {example}" f" - `{prop['property']}`: {prop['type']} {example}"
) )
# Enhanced formatting for relationships # Enhanced formatting for relationships
@ -541,7 +426,11 @@ class Neo4jGraph(GraphStore):
# Get constraints & indexes # Get constraints & indexes
try: try:
constraint = self.query("SHOW CONSTRAINTS") constraint = self.query("SHOW CONSTRAINTS")
index = self.query("SHOW INDEXES YIELD *") index = self.query(
"CALL apoc.schema.nodes() YIELD label, properties, type, size, "
"valuesSelectivity WHERE type = 'RANGE' RETURN *, "
"size * valuesSelectivity as distinctValues"
)
except ( except (
ClientError ClientError
): # Read-only user might not have access to schema information ): # Read-only user might not have access to schema information
@ -554,7 +443,6 @@ class Neo4jGraph(GraphStore):
"relationships": relationships, "relationships": relationships,
"metadata": {"constraint": constraint, "index": index}, "metadata": {"constraint": constraint, "index": index},
} }
if self._enhanced_schema: if self._enhanced_schema:
schema_counts = self.query( schema_counts = self.query(
"CALL apoc.meta.graphSample() YIELD nodes, relationships " "CALL apoc.meta.graphSample() YIELD nodes, relationships "
@ -570,7 +458,7 @@ class Neo4jGraph(GraphStore):
node_props = self.structured_schema["node_props"].get(node["name"]) node_props = self.structured_schema["node_props"].get(node["name"])
if not node_props: # The node has no properties if not node_props: # The node has no properties
continue continue
enhanced_cypher = _enhanced_schema_cypher( enhanced_cypher = self._enhanced_schema_cypher(
node["name"], node_props, node["count"] < EXHAUSTIVE_SEARCH_LIMIT node["name"], node_props, node["count"] < EXHAUSTIVE_SEARCH_LIMIT
) )
enhanced_info = self.query(enhanced_cypher)[0]["output"] enhanced_info = self.query(enhanced_cypher)[0]["output"]
@ -585,7 +473,7 @@ class Neo4jGraph(GraphStore):
rel_props = self.structured_schema["rel_props"].get(rel["name"]) rel_props = self.structured_schema["rel_props"].get(rel["name"])
if not rel_props: # The rel has no properties if not rel_props: # The rel has no properties
continue continue
enhanced_cypher = _enhanced_schema_cypher( enhanced_cypher = self._enhanced_schema_cypher(
rel["name"], rel["name"],
rel_props, rel_props,
rel["count"] < EXHAUSTIVE_SEARCH_LIMIT, rel["count"] < EXHAUSTIVE_SEARCH_LIMIT,
@ -676,3 +564,167 @@ class Neo4jGraph(GraphStore):
] ]
}, },
) )
def _enhanced_schema_cypher(
self,
label_or_type: str,
properties: List[Dict[str, Any]],
exhaustive: bool,
is_relationship: bool = False,
) -> str:
if is_relationship:
match_clause = f"MATCH ()-[n:{label_or_type}]->()"
else:
match_clause = f"MATCH (n:{label_or_type})"
with_clauses = []
return_clauses = []
output_dict = {}
if exhaustive:
for prop in properties:
prop_name = prop["property"]
prop_type = prop["type"]
if prop_type == "STRING":
with_clauses.append(
(
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
f"AS `{prop_name}_values`"
)
)
return_clauses.append(
(
f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
f" distinct_count: size(`{prop_name}_values`)"
)
)
elif prop_type in [
"INTEGER",
"FLOAT",
"DATE",
"DATE_TIME",
"LOCAL_DATE_TIME",
]:
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
with_clauses.append(
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
)
return_clauses.append(
(
f"min: toString(`{prop_name}_min`), "
f"max: toString(`{prop_name}_max`), "
f"distinct_count: `{prop_name}_distinct`"
)
)
elif prop_type == "LIST":
with_clauses.append(
(
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
)
)
return_clauses.append(
f"min_size: `{prop_name}_size_min`, "
f"max_size: `{prop_name}_size_max`"
)
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
continue
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
else:
# Just sample 5 random nodes
match_clause += " WITH n LIMIT 5"
for prop in properties:
prop_name = prop["property"]
prop_type = prop["type"]
# Check if indexed property, we can still do exhaustive
prop_index = [
el
for el in self.structured_schema["metadata"]["index"]
if el["label"] == label_or_type
and el["properties"] == [prop_name]
and el["type"] == "RANGE"
]
if prop_type == "STRING":
if (
prop_index
and prop_index[0].get("size") > 0
and prop_index[0].get("distinctValues") <= DISTINCT_VALUE_LIMIT
):
distinct_values = self.query(
f"CALL apoc.schema.properties.distinct("
f"'{label_or_type}', '{prop_name}') YIELD value"
)[0]["value"]
return_clauses.append(
(
f"values: {distinct_values},"
f" distinct_count: {len(distinct_values)}"
)
)
else:
with_clauses.append(
(
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
f"AS `{prop_name}_values`"
)
)
return_clauses.append(f"values: `{prop_name}_values`")
elif prop_type in [
"INTEGER",
"FLOAT",
"DATE",
"DATE_TIME",
"LOCAL_DATE_TIME",
]:
if not prop_index:
with_clauses.append(
f"collect(distinct toString(n.`{prop_name}`)) "
f"AS `{prop_name}_values`"
)
return_clauses.append(f"values: `{prop_name}_values`")
else:
with_clauses.append(
f"min(n.`{prop_name}`) AS `{prop_name}_min`"
)
with_clauses.append(
f"max(n.`{prop_name}`) AS `{prop_name}_max`"
)
with_clauses.append(
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
)
return_clauses.append(
(
f"min: toString(`{prop_name}_min`), "
f"max: toString(`{prop_name}_max`), "
f"distinct_count: `{prop_name}_distinct`"
)
)
elif prop_type == "LIST":
with_clauses.append(
(
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
)
)
return_clauses.append(
(
f"min_size: `{prop_name}_size_min`, "
f"max_size: `{prop_name}_size_max`"
)
)
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
continue
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
with_clause = "WITH " + ",\n ".join(with_clauses)
return_clause = (
"RETURN {"
+ ", ".join(f"`{k}`: {v}" for k, v in output_dict.items())
+ "} AS output"
)
# Combine all parts of the Cypher query
cypher_query = "\n".join([match_clause, with_clause, return_clause])
return cypher_query