mirror of
https://github.com/hwchase17/langchain.git
synced 2026-04-03 19:04:23 +00:00
fix(text-splitters): prevent silent data loss for empty dict values in RecursiveJsonSplitter (#35079)
This commit is contained in:
committed by
GitHub
parent
aba72f7229
commit
eb28ae1b20
@@ -91,7 +91,7 @@ class RecursiveJsonSplitter:
|
||||
"""Split json into maximum size dictionaries while preserving structure."""
|
||||
current_path = current_path or []
|
||||
chunks = chunks if chunks is not None else [{}]
|
||||
if isinstance(data, dict):
|
||||
if isinstance(data, dict) and data:
|
||||
for key, value in data.items():
|
||||
new_path = [*current_path, key]
|
||||
chunk_size = self._json_size(chunks[-1])
|
||||
@@ -108,8 +108,8 @@ class RecursiveJsonSplitter:
|
||||
|
||||
# Iterate
|
||||
self._json_split(value, new_path, chunks)
|
||||
else:
|
||||
# handle single item
|
||||
# Handle leaf values and empty dicts
|
||||
elif current_path:
|
||||
self._set_nested_dict(chunks[-1], current_path, data)
|
||||
return chunks
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
@@ -3253,6 +3254,109 @@ def test_split_json_many_calls() -> None:
|
||||
assert chunk1 == chunk1_output
|
||||
|
||||
|
||||
def test_split_json_with_empty_dict_values() -> None:
    """Check that a key mapped to ``{}`` survives splitting instead of vanishing."""
    json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
    payload: dict[str, Any] = {"a": "hello", "b": {}, "c": "world"}

    # Fold every emitted chunk back into one flat dict for comparison.
    recombined: dict[str, Any] = {}
    for piece in json_splitter.split_json(payload):
        recombined.update(piece)

    assert recombined == {"a": "hello", "b": {}, "c": "world"}
|
||||
|
||||
|
||||
def test_split_json_with_nested_empty_dicts() -> None:
    """Check that an empty dict nested under another key is kept after splitting."""
    json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
    payload: dict[str, Any] = {"level1": {"level2a": {}, "level2b": "value"}}

    # Shallow merge: a later chunk's top-level key replaces an earlier one.
    recombined: dict[str, Any] = {}
    for piece in json_splitter.split_json(payload):
        recombined.update(piece)

    assert recombined == {"level1": {"level2a": {}, "level2b": "value"}}
|
||||
|
||||
|
||||
def test_split_json_empty_dict_only() -> None:
    """Test splitting a JSON document that is just an empty top-level dict.

    There is nothing to split, so the splitter is expected to return no chunks.
    """
    json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
    payload: dict[str, Any] = {}

    result = json_splitter.split_json(payload)

    # The expected result is an empty list, not a list holding one empty chunk.
    assert result == []
|
||||
|
||||
|
||||
def test_split_json_mixed_empty_and_nonempty_dicts() -> None:
    """Test a realistic payload that mixes empty and populated nested dicts."""
    json_splitter = RecursiveJsonSplitter(max_chunk_size=300)
    payload: dict[str, Any] = {
        "config": {},
        "metadata": {"author": "test", "tags": {}},
        "content": "some text",
    }

    pieces = json_splitter.split_json(payload)

    # Rebuild the document: when the same top-level key shows up in several
    # chunks as a dict, combine those dicts; otherwise the latest value wins.
    recombined: dict[str, Any] = {}
    for piece in pieces:
        for name, value in piece.items():
            existing = recombined.get(name)
            if isinstance(existing, dict) and isinstance(value, dict):
                existing.update(value)
            else:
                recombined[name] = value

    assert recombined["config"] == {}
    assert recombined["metadata"] == {"author": "test", "tags": {}}
    assert recombined["content"] == "some text"
|
||||
|
||||
|
||||
def test_split_json_empty_dict_value_in_large_payload() -> None:
    """Test that an empty dict value survives chunking in a larger payload.

    Two things are checked on the chunks returned by ``split_json``: each
    chunk serializes to less than ``max_chunk`` plus 5% slack, and the
    ``"empty": {}`` entry is still present in at least one chunk.
    """
    max_chunk = 200
    splitter = RecursiveJsonSplitter(max_chunk_size=max_chunk)

    data: dict[str, Any] = {
        "key0": "x" * 50,
        "empty": {},
        "key1": "y" * 50,
        "nested": {f"k{i}": f"v{i}" for i in range(20)},
    }
    chunks = splitter.split_json(data)

    # Verify all chunks are within size limits
    for chunk in chunks:
        assert len(json.dumps(chunk)) < max_chunk * 1.05

    def has_empty_entry(mapping: dict[str, Any]) -> bool:
        """Return True when *mapping* holds the key "empty" mapped to ``{}``."""
        return "empty" in mapping and mapping["empty"] == {}

    # Look for "empty": {} at the top level of a chunk or exactly one level
    # below it (deeper nesting is not inspected). any() stops at the first
    # match, so no chunk is scanned after the entry has been found.
    found_empty = any(
        has_empty_entry(chunk)
        or any(has_empty_entry(v) for v in chunk.values() if isinstance(v, dict))
        for chunk in chunks
    )
    assert found_empty, "Empty dict value was lost during splitting"
|
||||
|
||||
|
||||
def test_powershell_code_splitter_short_code() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.POWERSHELL, chunk_size=60, chunk_overlap=0
|
||||
|
||||
Reference in New Issue
Block a user