From 3a1d05394d543294e181b84d513073701517bc2e Mon Sep 17 00:00:00 2001 From: German Martin Date: Tue, 17 Dec 2024 11:21:59 -0300 Subject: [PATCH] community: Apache AGE wrapper. Ensure Node Uniqueness by ID. (#28759) **Description:** The Apache AGE graph integration incorrectly handled node merging, allowing duplicate nodes with different IDs but the same type and other properties. Unlike [Neo4j](https://github.com/langchain-ai/langchain/blob/cdf62021569dd7f02b35679b46ee6abe92f02cb7/libs/community/langchain_community/graphs/neo4j_graph.py#L47), [Memgraph](https://github.com/langchain-ai/langchain/blob/cdf62021569dd7f02b35679b46ee6abe92f02cb7/libs/community/langchain_community/graphs/memgraph_graph.py#L50), [Kuzu](https://github.com/langchain-ai/langchain/blob/cdf62021569dd7f02b35679b46ee6abe92f02cb7/libs/community/langchain_community/graphs/kuzu_graph.py#L253), and [Gremlin](https://github.com/langchain-ai/langchain/blob/cdf62021569dd7f02b35679b46ee6abe92f02cb7/libs/community/langchain_community/graphs/gremlin_graph.py#L165), it did not use the node ID as the primary identifier for merging. This inconsistency caused data integrity issues and unexpected behavior when users expected updates to specific nodes by ID. **Solution:** This PR modifies the `node_insert_query` to `MERGE` nodes based on label and ID *only* and then update their properties with `SET`, aligning the behavior with other graph database integrations. The `_format_properties` method was also modified to handle ID overrides. **Impact:** This fix ensures data integrity by preventing duplicate nodes and provides consistent behavior across graph database integrations. 
--- libs/community/langchain_community/graphs/age_graph.py | 6 ++++-- .../tests/integration_tests/graphs/test_age_graph.py | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/graphs/age_graph.py b/libs/community/langchain_community/graphs/age_graph.py index 4fba85731ad..434491253c6 100644 --- a/libs/community/langchain_community/graphs/age_graph.py +++ b/libs/community/langchain_community/graphs/age_graph.py @@ -697,8 +697,9 @@ class AGEGraph(GraphStore): # query for inserting nodes node_insert_query = ( """ - MERGE (n:`{label}` {properties}) - """ + MERGE (n:`{label}` {{`id`: "{id}"}}) + SET n = {properties} + """ if not include_source else """ MERGE (n:`{label}` {properties}) @@ -735,6 +736,7 @@ class AGEGraph(GraphStore): query = node_insert_query.format( label=AGEGraph.clean_graph_labels(node.type), properties=self._format_properties(node.properties), + id=node.id, ) self.query(query) diff --git a/libs/community/tests/integration_tests/graphs/test_age_graph.py b/libs/community/tests/integration_tests/graphs/test_age_graph.py index 383ddb62f8d..32c2fab33cb 100644 --- a/libs/community/tests/integration_tests/graphs/test_age_graph.py +++ b/libs/community/tests/integration_tests/graphs/test_age_graph.py @@ -10,7 +10,11 @@ from langchain_community.graphs.graph_document import GraphDocument, Node, Relat test_data = [ GraphDocument( - nodes=[Node(id="foo", type="foo"), Node(id="bar", type="bar")], + nodes=[ + Node(id="foo", type="foo"), + Node(id="bar", type="bar"), + Node(id="foo", type="foo", properties={"property_a": "a"}), + ], relationships=[ Relationship( source=Node(id="foo", type="foo"),