mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 00:48:45 +00:00
Add indexed properties to neo4j enhanced schema (#21335)
This commit is contained in:
parent
a6cdf6572f
commit
ac14f171ac
@ -142,123 +142,6 @@ def _get_rel_import_query(baseEntityLabel: bool) -> str:
|
||||
)
|
||||
|
||||
|
||||
def _enhanced_schema_cypher(
|
||||
label_or_type: str,
|
||||
properties: List[Dict[str, Any]],
|
||||
exhaustive: bool,
|
||||
is_relationship: bool = False,
|
||||
) -> str:
|
||||
if is_relationship:
|
||||
match_clause = f"MATCH ()-[n:{label_or_type}]->()"
|
||||
else:
|
||||
match_clause = f"MATCH (n:{label_or_type})"
|
||||
|
||||
with_clauses = []
|
||||
return_clauses = []
|
||||
output_dict = {}
|
||||
if exhaustive:
|
||||
for prop in properties:
|
||||
prop_name = prop["property"]
|
||||
prop_type = prop["type"]
|
||||
if prop_type == "STRING":
|
||||
with_clauses.append(
|
||||
(
|
||||
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
|
||||
f"AS `{prop_name}_values`"
|
||||
)
|
||||
)
|
||||
return_clauses.append(
|
||||
(
|
||||
f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
|
||||
f" distinct_count: size(`{prop_name}_values`)"
|
||||
)
|
||||
)
|
||||
elif prop_type in [
|
||||
"INTEGER",
|
||||
"FLOAT",
|
||||
"DATE",
|
||||
"DATE_TIME",
|
||||
"LOCAL_DATE_TIME",
|
||||
]:
|
||||
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
|
||||
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
|
||||
with_clauses.append(
|
||||
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
|
||||
)
|
||||
return_clauses.append(
|
||||
(
|
||||
f"min: toString(`{prop_name}_min`), "
|
||||
f"max: toString(`{prop_name}_max`), "
|
||||
f"distinct_count: `{prop_name}_distinct`"
|
||||
)
|
||||
)
|
||||
elif prop_type == "LIST":
|
||||
with_clauses.append(
|
||||
(
|
||||
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
|
||||
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
|
||||
)
|
||||
)
|
||||
return_clauses.append(
|
||||
f"min_size: `{prop_name}_size_min`, "
|
||||
f"max_size: `{prop_name}_size_max`"
|
||||
)
|
||||
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
|
||||
continue
|
||||
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
|
||||
else:
|
||||
# Just sample 5 random nodes
|
||||
match_clause += " WITH n LIMIT 5"
|
||||
for prop in properties:
|
||||
prop_name = prop["property"]
|
||||
prop_type = prop["type"]
|
||||
if prop_type == "STRING":
|
||||
with_clauses.append(
|
||||
(
|
||||
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
|
||||
f"AS `{prop_name}_values`"
|
||||
)
|
||||
)
|
||||
return_clauses.append(f"values: `{prop_name}_values`")
|
||||
elif prop_type in [
|
||||
"INTEGER",
|
||||
"FLOAT",
|
||||
"DATE",
|
||||
"DATE_TIME",
|
||||
"LOCAL_DATE_TIME",
|
||||
]:
|
||||
with_clauses.append(
|
||||
f"collect(distinct toString(n.`{prop_name}`)) "
|
||||
f"AS `{prop_name}_values`"
|
||||
)
|
||||
return_clauses.append(f"values: `{prop_name}_values`")
|
||||
elif prop_type == "LIST":
|
||||
with_clauses.append(
|
||||
(
|
||||
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
|
||||
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
|
||||
)
|
||||
)
|
||||
return_clauses.append(
|
||||
f"min_size: `{prop_name}_size_min`,max_size: `{prop_name}_size_max`"
|
||||
)
|
||||
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
|
||||
continue
|
||||
|
||||
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
|
||||
|
||||
with_clause = "WITH " + ",\n ".join(with_clauses)
|
||||
return_clause = (
|
||||
"RETURN {"
|
||||
+ ", ".join(f"`{k}`: {v}" for k, v in output_dict.items())
|
||||
+ "} AS output"
|
||||
)
|
||||
|
||||
# Combine all parts of the Cypher query
|
||||
cypher_query = "\n".join([match_clause, with_clause, return_clause])
|
||||
return cypher_query
|
||||
|
||||
|
||||
def _format_schema(schema: Dict, is_enhanced: bool) -> str:
|
||||
formatted_node_props = []
|
||||
formatted_rel_props = []
|
||||
@ -296,17 +179,19 @@ def _format_schema(schema: Dict, is_enhanced: bool) -> str:
|
||||
example = f'Min: {prop["min"]}, Max: {prop["max"]}'
|
||||
else:
|
||||
example = (
|
||||
f'Example: "{prop["values"][0]}"' if prop["values"] else ""
|
||||
f'Example: "{prop["values"][0]}"'
|
||||
if prop.get("values")
|
||||
else ""
|
||||
)
|
||||
elif prop["type"] == "LIST":
|
||||
# Skip embeddings
|
||||
if prop["min_size"] > LIST_LIMIT:
|
||||
if not prop.get("min_size") or prop["min_size"] > LIST_LIMIT:
|
||||
continue
|
||||
example = (
|
||||
f'Min Size: {prop["min_size"]}, Max Size: {prop["max_size"]}'
|
||||
)
|
||||
formatted_node_props.append(
|
||||
f" - `{prop['property']}`: {prop['type']}` {example}"
|
||||
f" - `{prop['property']}`: {prop['type']} {example}"
|
||||
)
|
||||
|
||||
# Enhanced formatting for relationships
|
||||
@ -541,7 +426,11 @@ class Neo4jGraph(GraphStore):
|
||||
# Get constraints & indexes
|
||||
try:
|
||||
constraint = self.query("SHOW CONSTRAINTS")
|
||||
index = self.query("SHOW INDEXES YIELD *")
|
||||
index = self.query(
|
||||
"CALL apoc.schema.nodes() YIELD label, properties, type, size, "
|
||||
"valuesSelectivity WHERE type = 'RANGE' RETURN *, "
|
||||
"size * valuesSelectivity as distinctValues"
|
||||
)
|
||||
except (
|
||||
ClientError
|
||||
): # Read-only user might not have access to schema information
|
||||
@ -554,7 +443,6 @@ class Neo4jGraph(GraphStore):
|
||||
"relationships": relationships,
|
||||
"metadata": {"constraint": constraint, "index": index},
|
||||
}
|
||||
|
||||
if self._enhanced_schema:
|
||||
schema_counts = self.query(
|
||||
"CALL apoc.meta.graphSample() YIELD nodes, relationships "
|
||||
@ -570,7 +458,7 @@ class Neo4jGraph(GraphStore):
|
||||
node_props = self.structured_schema["node_props"].get(node["name"])
|
||||
if not node_props: # The node has no properties
|
||||
continue
|
||||
enhanced_cypher = _enhanced_schema_cypher(
|
||||
enhanced_cypher = self._enhanced_schema_cypher(
|
||||
node["name"], node_props, node["count"] < EXHAUSTIVE_SEARCH_LIMIT
|
||||
)
|
||||
enhanced_info = self.query(enhanced_cypher)[0]["output"]
|
||||
@ -585,7 +473,7 @@ class Neo4jGraph(GraphStore):
|
||||
rel_props = self.structured_schema["rel_props"].get(rel["name"])
|
||||
if not rel_props: # The rel has no properties
|
||||
continue
|
||||
enhanced_cypher = _enhanced_schema_cypher(
|
||||
enhanced_cypher = self._enhanced_schema_cypher(
|
||||
rel["name"],
|
||||
rel_props,
|
||||
rel["count"] < EXHAUSTIVE_SEARCH_LIMIT,
|
||||
@ -676,3 +564,167 @@ class Neo4jGraph(GraphStore):
|
||||
]
|
||||
},
|
||||
)
|
||||
|
||||
def _enhanced_schema_cypher(
|
||||
self,
|
||||
label_or_type: str,
|
||||
properties: List[Dict[str, Any]],
|
||||
exhaustive: bool,
|
||||
is_relationship: bool = False,
|
||||
) -> str:
|
||||
if is_relationship:
|
||||
match_clause = f"MATCH ()-[n:{label_or_type}]->()"
|
||||
else:
|
||||
match_clause = f"MATCH (n:{label_or_type})"
|
||||
|
||||
with_clauses = []
|
||||
return_clauses = []
|
||||
output_dict = {}
|
||||
if exhaustive:
|
||||
for prop in properties:
|
||||
prop_name = prop["property"]
|
||||
prop_type = prop["type"]
|
||||
if prop_type == "STRING":
|
||||
with_clauses.append(
|
||||
(
|
||||
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
|
||||
f"AS `{prop_name}_values`"
|
||||
)
|
||||
)
|
||||
return_clauses.append(
|
||||
(
|
||||
f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"
|
||||
f" distinct_count: size(`{prop_name}_values`)"
|
||||
)
|
||||
)
|
||||
elif prop_type in [
|
||||
"INTEGER",
|
||||
"FLOAT",
|
||||
"DATE",
|
||||
"DATE_TIME",
|
||||
"LOCAL_DATE_TIME",
|
||||
]:
|
||||
with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")
|
||||
with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")
|
||||
with_clauses.append(
|
||||
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
|
||||
)
|
||||
return_clauses.append(
|
||||
(
|
||||
f"min: toString(`{prop_name}_min`), "
|
||||
f"max: toString(`{prop_name}_max`), "
|
||||
f"distinct_count: `{prop_name}_distinct`"
|
||||
)
|
||||
)
|
||||
elif prop_type == "LIST":
|
||||
with_clauses.append(
|
||||
(
|
||||
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
|
||||
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
|
||||
)
|
||||
)
|
||||
return_clauses.append(
|
||||
f"min_size: `{prop_name}_size_min`, "
|
||||
f"max_size: `{prop_name}_size_max`"
|
||||
)
|
||||
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
|
||||
continue
|
||||
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
|
||||
else:
|
||||
# Just sample 5 random nodes
|
||||
match_clause += " WITH n LIMIT 5"
|
||||
for prop in properties:
|
||||
prop_name = prop["property"]
|
||||
prop_type = prop["type"]
|
||||
|
||||
# Check if indexed property, we can still do exhaustive
|
||||
prop_index = [
|
||||
el
|
||||
for el in self.structured_schema["metadata"]["index"]
|
||||
if el["label"] == label_or_type
|
||||
and el["properties"] == [prop_name]
|
||||
and el["type"] == "RANGE"
|
||||
]
|
||||
if prop_type == "STRING":
|
||||
if (
|
||||
prop_index
|
||||
and prop_index[0].get("size") > 0
|
||||
and prop_index[0].get("distinctValues") <= DISTINCT_VALUE_LIMIT
|
||||
):
|
||||
distinct_values = self.query(
|
||||
f"CALL apoc.schema.properties.distinct("
|
||||
f"'{label_or_type}', '{prop_name}') YIELD value"
|
||||
)[0]["value"]
|
||||
return_clauses.append(
|
||||
(
|
||||
f"values: {distinct_values},"
|
||||
f" distinct_count: {len(distinct_values)}"
|
||||
)
|
||||
)
|
||||
else:
|
||||
with_clauses.append(
|
||||
(
|
||||
f"collect(distinct substring(n.`{prop_name}`, 0, 50)) "
|
||||
f"AS `{prop_name}_values`"
|
||||
)
|
||||
)
|
||||
return_clauses.append(f"values: `{prop_name}_values`")
|
||||
elif prop_type in [
|
||||
"INTEGER",
|
||||
"FLOAT",
|
||||
"DATE",
|
||||
"DATE_TIME",
|
||||
"LOCAL_DATE_TIME",
|
||||
]:
|
||||
if not prop_index:
|
||||
with_clauses.append(
|
||||
f"collect(distinct toString(n.`{prop_name}`)) "
|
||||
f"AS `{prop_name}_values`"
|
||||
)
|
||||
return_clauses.append(f"values: `{prop_name}_values`")
|
||||
else:
|
||||
with_clauses.append(
|
||||
f"min(n.`{prop_name}`) AS `{prop_name}_min`"
|
||||
)
|
||||
with_clauses.append(
|
||||
f"max(n.`{prop_name}`) AS `{prop_name}_max`"
|
||||
)
|
||||
with_clauses.append(
|
||||
f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`"
|
||||
)
|
||||
return_clauses.append(
|
||||
(
|
||||
f"min: toString(`{prop_name}_min`), "
|
||||
f"max: toString(`{prop_name}_max`), "
|
||||
f"distinct_count: `{prop_name}_distinct`"
|
||||
)
|
||||
)
|
||||
|
||||
elif prop_type == "LIST":
|
||||
with_clauses.append(
|
||||
(
|
||||
f"min(size(n.`{prop_name}`)) AS `{prop_name}_size_min`, "
|
||||
f"max(size(n.`{prop_name}`)) AS `{prop_name}_size_max`"
|
||||
)
|
||||
)
|
||||
return_clauses.append(
|
||||
(
|
||||
f"min_size: `{prop_name}_size_min`, "
|
||||
f"max_size: `{prop_name}_size_max`"
|
||||
)
|
||||
)
|
||||
elif prop_type in ["BOOLEAN", "POINT", "DURATION"]:
|
||||
continue
|
||||
|
||||
output_dict[prop_name] = "{" + return_clauses.pop() + "}"
|
||||
|
||||
with_clause = "WITH " + ",\n ".join(with_clauses)
|
||||
return_clause = (
|
||||
"RETURN {"
|
||||
+ ", ".join(f"`{k}`: {v}" for k, v in output_dict.items())
|
||||
+ "} AS output"
|
||||
)
|
||||
|
||||
# Combine all parts of the Cypher query
|
||||
cypher_query = "\n".join([match_clause, with_clause, return_clause])
|
||||
return cypher_query
|
||||
|
Loading…
Reference in New Issue
Block a user