fix(text-splitters): prevent silent data loss for empty dict values in RecursiveJsonSplitter (#35079)

This commit is contained in:
Mohammad Mohtashim
2026-03-29 06:27:53 +05:00
committed by GitHub
parent aba72f7229
commit eb28ae1b20
2 changed files with 107 additions and 3 deletions

View File

@@ -91,7 +91,7 @@ class RecursiveJsonSplitter:
"""Split json into maximum size dictionaries while preserving structure."""
current_path = current_path or []
chunks = chunks if chunks is not None else [{}]
if isinstance(data, dict):
if isinstance(data, dict) and data:
for key, value in data.items():
new_path = [*current_path, key]
chunk_size = self._json_size(chunks[-1])
@@ -108,8 +108,8 @@ class RecursiveJsonSplitter:
# Iterate
self._json_split(value, new_path, chunks)
else:
# handle single item
# Handle leaf values and empty dicts
elif current_path:
self._set_nested_dict(chunks[-1], current_path, data)
return chunks

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
import json
import random
import re
import string
@@ -3253,6 +3254,109 @@ def test_split_json_many_calls() -> None:
assert chunk1 == chunk1_output
def test_split_json_with_empty_dict_values() -> None:
    """Check that a top-level key mapped to ``{}`` is kept in the split output."""
    payload: dict[str, Any] = {
        "a": "hello",
        "b": {},
        "c": "world",
    }

    pieces = RecursiveJsonSplitter(max_chunk_size=300).split_json(payload)

    # Fold every chunk back into one dict so the result can be compared
    # against the full input in a single assertion.
    combined: dict[str, Any] = {}
    for piece in pieces:
        combined.update(piece)

    assert combined == {"a": "hello", "b": {}, "c": "world"}
def test_split_json_with_nested_empty_dicts() -> None:
    """Check that an empty dict nested one level down is kept after splitting."""
    payload: dict[str, Any] = {
        "level1": {
            "level2a": {},
            "level2b": "value",
        }
    }
    expected = {"level1": {"level2a": {}, "level2b": "value"}}

    json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
    pieces = json_splitter.split_json(payload)

    # Shallow merge of the chunks: a later chunk replaces an earlier one's
    # value for the same top-level key.
    combined: dict[str, Any] = {}
    for piece in pieces:
        combined.update(piece)

    assert combined == expected
def test_split_json_empty_dict_only() -> None:
    """Check that splitting an empty top-level dict yields no chunks.

    The input holds nothing to distribute, so the expected result is an
    empty list.
    """
    empty_payload: dict[str, Any] = {}

    result = RecursiveJsonSplitter(max_chunk_size=300).split_json(empty_payload)

    assert result == []
def test_split_json_mixed_empty_and_nonempty_dicts() -> None:
    """Check a structure that mixes empty dicts, populated dicts and a string."""
    payload: dict[str, Any] = {
        "config": {},
        "metadata": {"author": "test", "tags": {}},
        "content": "some text",
    }

    pieces = RecursiveJsonSplitter(max_chunk_size=300).split_json(payload)

    # Merge chunks one level deep: when both the accumulated value and the
    # incoming value are dicts they are combined, otherwise the incoming
    # value replaces whatever was there.
    combined: dict[str, Any] = {}
    for piece in pieces:
        for key, incoming in piece.items():
            existing = combined.get(key)
            if isinstance(existing, dict) and isinstance(incoming, dict):
                existing.update(incoming)
            else:
                combined[key] = incoming

    assert combined["config"] == {}
    assert combined["metadata"] == {"author": "test", "tags": {}}
    assert combined["content"] == "some text"
def test_split_json_empty_dict_value_in_large_payload() -> None:
    """Check that an ``{}`` value is still present once a payload is chunked."""
    size_limit = 200
    payload: dict[str, Any] = {
        "key0": "x" * 50,
        "empty": {},
        "key1": "y" * 50,
        "nested": {f"k{i}": f"v{i}" for i in range(20)},
    }

    pieces = RecursiveJsonSplitter(max_chunk_size=size_limit).split_json(payload)

    # Each serialized chunk must stay under the limit plus a 5% allowance.
    for piece in pieces:
        assert len(json.dumps(piece)) < size_limit * 1.05

    def holds_empty(mapping: dict[str, Any]) -> bool:
        """Return True when *mapping* maps the key "empty" to an empty dict."""
        return "empty" in mapping and mapping["empty"] == {}

    # Look for "empty": {} at the top of any chunk, or one level below it.
    empty_survived = any(
        holds_empty(piece)
        or any(
            isinstance(child, dict) and holds_empty(child)
            for child in piece.values()
        )
        for piece in pieces
    )

    assert empty_survived, "Empty dict value was lost during splitting"
def test_powershell_code_splitter_short_code() -> None:
splitter = RecursiveCharacterTextSplitter.from_language(
Language.POWERSHELL, chunk_size=60, chunk_overlap=0