community[patch]: Fix neo4j sanitizing values (#18750)

Fixing sanitization for when deeply nested lists appear
This commit is contained in:
Tomaz Bratanic
2024-03-08 04:21:52 +01:00
committed by GitHub
parent 7f504c1f81
commit 4bfe888717
2 changed files with 40 additions and 22 deletions

View File

@@ -46,36 +46,45 @@ include_docs_query = (
) )
def value_sanitize(d: Any) -> Any:
    """Sanitize the input dictionary or list.

    Recursively removes embedding-like values (lists with 128 or more
    elements) that are mostly irrelevant for generating answers in an LLM
    context. These properties, if left in results, can occupy significant
    context space and detract from the LLM's performance by introducing
    unnecessary noise and cost.

    Args:
        d: A dictionary, a list, or any other value.

    Returns:
        * for a dictionary: a new dictionary in which oversized lists are
          omitted together with their keys and nested containers are
          sanitized;
        * for a list: ``None`` when the list itself is oversized, otherwise
          a new list of sanitized elements with every element that
          sanitizes to ``None`` left out;
        * for anything else: the value itself, unchanged.
    """
    LIST_LIMIT = 128
    if isinstance(d, dict):
        new_dict = {}
        for key, value in d.items():
            if isinstance(value, (dict, list)):
                sanitized_value = value_sanitize(value)
                # An oversized list sanitizes to None: drop the key entirely.
                if sanitized_value is not None:
                    new_dict[key] = sanitized_value
            else:
                # Scalars (including an explicit None) are kept as they are.
                new_dict[key] = value
        return new_dict
    if isinstance(d, list):
        if len(d) >= LIST_LIMIT:
            return None
        # Sanitize each element exactly once; sanitizing in both the filter
        # and the output expression doubles the work at every nesting level.
        return [item for item in map(value_sanitize, d) if item is not None]
    return d
def _get_node_import_query(baseEntityLabel: bool, include_source: bool) -> str: def _get_node_import_query(baseEntityLabel: bool, include_source: bool) -> str:

View File

@@ -30,3 +30,12 @@ def test_value_sanitize_with_dict_in_list(): # type: ignore[no-untyped-def]
input_dict = {"key1": "value1", "oversized_list": [1, 2, {"key": oversized_list}]} input_dict = {"key1": "value1", "oversized_list": [1, 2, {"key": oversized_list}]}
expected_output = {"key1": "value1", "oversized_list": [1, 2, {}]} expected_output = {"key1": "value1", "oversized_list": [1, 2, {}]}
assert value_sanitize(input_dict) == expected_output assert value_sanitize(input_dict) == expected_output
def test_value_sanitize_with_dict_in_nested_list():  # type: ignore[no-untyped-def]
    """A 200-element list buried four lists deep inside a dict is removed.

    Only the offending key disappears; the surrounding list nesting and the
    now-empty dict must survive sanitization.
    """
    long_values = list(range(200))
    payload = {
        "key1": "value1",
        "deeply_nested_lists": [[[[{"final_nested_key": long_values}]]]],
    }
    sanitized = value_sanitize(payload)
    assert sanitized == {"key1": "value1", "deeply_nested_lists": [[[[{}]]]]}