From 4bfe888717983c487fbc5c195147befe11574ef1 Mon Sep 17 00:00:00 2001
From: Tomaz Bratanic <bratanic.tomaz@gmail.com>
Date: Fri, 8 Mar 2024 04:21:52 +0100
Subject: [PATCH] community[patch]: Fix neo4j sanitizing values (#18750)

Fix value sanitization for dictionaries that appear inside deeply nested lists.
---
 .../langchain_community/graphs/neo4j_graph.py | 53 +++++++++++--------
 .../unit_tests/graphs/test_neo4j_graph.py     |  9 ++++
 2 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/libs/community/langchain_community/graphs/neo4j_graph.py b/libs/community/langchain_community/graphs/neo4j_graph.py
index afb32e17a69..d4c89987203 100644
--- a/libs/community/langchain_community/graphs/neo4j_graph.py
+++ b/libs/community/langchain_community/graphs/neo4j_graph.py
@@ -46,36 +46,45 @@ include_docs_query = (
 )
 
 
-def value_sanitize(d: Dict[str, Any]) -> Dict[str, Any]:
-    """Sanitize the input dictionary.
+def value_sanitize(d: Any) -> Any:
+    """Sanitize the input dictionary or list.
 
-    Sanitizes the input dictionary by removing embedding-like values,
+    Sanitizes the input by removing embedding-like values,
     lists with more than 128 elements, that are mostly irrelevant for
     generating answers in a LLM context. These properties, if left in
     results, can occupy significant context space and detract from
     the LLM's performance by introducing unnecessary noise and cost.
     """
     LIST_LIMIT = 128
-    # Create a new dictionary to avoid changing size during iteration
-    new_dict = {}
-    for key, value in d.items():
-        if isinstance(value, dict):
-            # Recurse to handle nested dictionaries
-            new_dict[key] = value_sanitize(value)
-        elif isinstance(value, list):
-            # check if it has less than LIST_LIMIT values
-            if len(value) < LIST_LIMIT:
-                # if value is a list, check if it contains dictionaries to clean
-                cleaned_list = []
-                for item in value:
-                    if isinstance(item, dict):
-                        cleaned_list.append(value_sanitize(item))
-                    else:
-                        cleaned_list.append(item)
-                new_dict[key] = cleaned_list  # type: ignore[assignment]
+    if isinstance(d, dict):
+        new_dict = {}
+        for key, value in d.items():
+            if isinstance(value, dict):
+                sanitized_value = value_sanitize(value)
+                if (
+                    sanitized_value is not None
+                ):  # Check if the sanitized value is not None
+                    new_dict[key] = sanitized_value
+            elif isinstance(value, list):
+                if len(value) < LIST_LIMIT:
+                    sanitized_value = value_sanitize(value)
+                    if (
+                        sanitized_value is not None
+                    ):  # Check if the sanitized value is not None
+                        new_dict[key] = sanitized_value
+                # Do not include the key if the list is oversized
+            else:
+                new_dict[key] = value
+        return new_dict
+    elif isinstance(d, list):
+        if len(d) < LIST_LIMIT:
+            return [
+                value_sanitize(item) for item in d if value_sanitize(item) is not None
+            ]
         else:
-            new_dict[key] = value
-    return new_dict
+            return None
+    else:
+        return d
 
 
 def _get_node_import_query(baseEntityLabel: bool, include_source: bool) -> str:
diff --git a/libs/community/tests/unit_tests/graphs/test_neo4j_graph.py b/libs/community/tests/unit_tests/graphs/test_neo4j_graph.py
index eebef74ef16..a274fd4768b 100644
--- a/libs/community/tests/unit_tests/graphs/test_neo4j_graph.py
+++ b/libs/community/tests/unit_tests/graphs/test_neo4j_graph.py
@@ -30,3 +30,12 @@ def test_value_sanitize_with_dict_in_list():  # type: ignore[no-untyped-def]
     input_dict = {"key1": "value1", "oversized_list": [1, 2, {"key": oversized_list}]}
     expected_output = {"key1": "value1", "oversized_list": [1, 2, {}]}
     assert value_sanitize(input_dict) == expected_output
+
+
+def test_value_sanitize_with_dict_in_nested_list():  # type: ignore[no-untyped-def]
+    input_dict = {
+        "key1": "value1",
+        "deeply_nested_lists": [[[[{"final_nested_key": list(range(200))}]]]],
+    }
+    expected_output = {"key1": "value1", "deeply_nested_lists": [[[[{}]]]]}
+    assert value_sanitize(input_dict) == expected_output