Preserve empty dict values in RecursiveJsonSplitter._json_split.

An empty dict reached through the recursive split path used to be iterated
zero times and therefore never stored. It is now treated like a leaf value
and written at its key path. Data with no key path is not stored at all.

Note: the index lines are carried over from the original patch and do not
describe the post-images of this revised version.

diff --git a/libs/text-splitters/langchain_text_splitters/json.py b/libs/text-splitters/langchain_text_splitters/json.py
index 1162aa0aa13..2847a8fe53c 100644
--- a/libs/text-splitters/langchain_text_splitters/json.py
+++ b/libs/text-splitters/langchain_text_splitters/json.py
@@ -91,7 +91,7 @@ class RecursiveJsonSplitter:
         """Split json into maximum size dictionaries while preserving structure."""
         current_path = current_path or []
         chunks = chunks if chunks is not None else [{}]
-        if isinstance(data, dict):
+        if isinstance(data, dict) and data:
             for key, value in data.items():
                 new_path = [*current_path, key]
                 chunk_size = self._json_size(chunks[-1])
@@ -108,8 +108,9 @@ class RecursiveJsonSplitter:
 
                     # Iterate
                     self._json_split(value, new_path, chunks)
-        else:
-            # handle single item
+        # Handle leaf values and empty dicts
+        elif current_path:
+            # Without a path there is no key to store top-level data under.
             self._set_nested_dict(chunks[-1], current_path, data)
         return chunks
 
diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
index a074c25b7f8..c7d7e2c480d 100644
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import json
 import random
 import re
 import string
@@ -3253,6 +3254,122 @@ def test_split_json_many_calls() -> None:
     assert chunk1 == chunk1_output
 
 
+def _merge_json_chunks(chunks: list[dict[str, Any]]) -> dict[str, Any]:
+    """Deep-merge split chunks back into a single dict.
+
+    Nested dicts spread over several chunks are merged key by key rather than
+    overwritten. Fresh dicts are built at every level, so the chunks passed in
+    are never mutated.
+    """
+    merged: dict[str, Any] = {}
+    for chunk in chunks:
+        for key, value in chunk.items():
+            if isinstance(value, dict):
+                existing = merged.get(key)
+                base = [existing] if isinstance(existing, dict) else []
+                merged[key] = _merge_json_chunks([*base, value])
+            else:
+                merged[key] = value
+    return merged
+
+
+def test_split_json_with_empty_dict_values() -> None:
+    """Test that empty dicts in JSON values are preserved, not dropped."""
+    splitter = RecursiveJsonSplitter(max_chunk_size=300)
+
+    data: dict[str, Any] = {
+        "a": "hello",
+        "b": {},
+        "c": "world",
+    }
+    merged = _merge_json_chunks(splitter.split_json(data))
+
+    assert merged == {"a": "hello", "b": {}, "c": "world"}
+
+
+def test_split_json_with_nested_empty_dicts() -> None:
+    """Test that nested empty dicts are preserved."""
+    splitter = RecursiveJsonSplitter(max_chunk_size=300)
+
+    data: dict[str, Any] = {
+        "level1": {
+            "level2a": {},
+            "level2b": "value",
+        }
+    }
+    merged = _merge_json_chunks(splitter.split_json(data))
+
+    assert merged == {"level1": {"level2a": {}, "level2b": "value"}}
+
+
+def test_split_json_empty_dict_only() -> None:
+    """Test that an empty top-level dict produces no chunks."""
+    splitter = RecursiveJsonSplitter(max_chunk_size=300)
+
+    data: dict[str, Any] = {}
+    chunks = splitter.split_json(data)
+    # There is nothing to split, so the result should be an empty list.
+    assert chunks == []
+
+
+def test_split_json_mixed_empty_and_nonempty_dicts() -> None:
+    """Test a realistic structure mixing empty and non-empty nested dicts."""
+    splitter = RecursiveJsonSplitter(max_chunk_size=300)
+
+    data: dict[str, Any] = {
+        "config": {},
+        "metadata": {"author": "test", "tags": {}},
+        "content": "some text",
+    }
+    merged = _merge_json_chunks(splitter.split_json(data))
+
+    assert merged["config"] == {}
+    assert merged["metadata"] == {"author": "test", "tags": {}}
+    assert merged["content"] == "some text"
+
+
+def test_split_json_empty_dict_at_chunk_boundary() -> None:
+    """Test that an empty dict survives when it must go into a new chunk.
+
+    The payload is sized so that the empty dict should not fit in the chunk
+    holding ``filler`` and is therefore handled by the recursive split path
+    rather than stored directly (assumes sizes are JSON string lengths).
+    """
+    splitter = RecursiveJsonSplitter(max_chunk_size=60)
+
+    data: dict[str, Any] = {
+        "filler": "x" * 40,
+        "empty": {},
+    }
+    merged = _merge_json_chunks(splitter.split_json(data))
+
+    assert merged == {"filler": "x" * 40, "empty": {}}
+
+
+def test_split_json_empty_dict_value_in_large_payload() -> None:
+    """Test that empty dict values survive chunking in a larger payload."""
+    max_chunk = 200
+    splitter = RecursiveJsonSplitter(max_chunk_size=max_chunk)
+
+    data: dict[str, Any] = {
+        "key0": "x" * 50,
+        "empty": {},
+        "key1": "y" * 50,
+        "nested": {f"k{i}": f"v{i}" for i in range(20)},
+    }
+    chunks = splitter.split_json(data)
+
+    # Verify all chunks are within size limits
+    for chunk in chunks:
+        assert len(json.dumps(chunk)) < max_chunk * 1.05
+
+    # The splitter preserves key paths, so the empty dict must still be found
+    # under its original top-level key in one of the chunks.
+    assert any(chunk.get("empty") == {} for chunk in chunks), (
+        "Empty dict value was lost during splitting"
+    )
+
+
 def test_powershell_code_splitter_short_code() -> None:
     splitter = RecursiveCharacterTextSplitter.from_language(
         Language.POWERSHELL, chunk_size=60, chunk_overlap=0