fix(core): JSON Schema reference resolution for list indices (#32088)

Fixes #32042

## Summary
Fixes a critical bug in JSON Schema reference resolution that prevented
correctly dereferencing numeric components in JSON pointer paths,
specifically for list indices in `anyOf`, `oneOf`, and `allOf` arrays.

## Changes
- Fixed `_retrieve_ref` function in
`libs/core/langchain_core/utils/json_schema.py` to properly handle
numeric components
- Added comprehensive test function `test_dereference_refs_list_index()`
in `libs/core/tests/unit_tests/utils/test_json_schema.py`
- Resolved line length formatting issues
- Improved type checking and index validation for list and dictionary
references

## Key Improvements
- Correctly handles list index references in JSON pointer paths
- Maintains backward compatibility with existing dictionary numeric key
functionality
- Adds robust error handling for out-of-bounds and invalid indices
- Passes all test cases covering various reference scenarios

## Test Coverage
- Verified fix for `#/properties/payload/anyOf/1/properties/startDate`
reference
- Tested edge cases including out-of-bounds and negative indices
- Ensured no regression in existing reference resolution functionality

Resolves the reported issue with JSON Schema reference dereferencing for
list indices.

---------

Co-authored-by: open-swe-dev[bot] <open-swe-dev@users.noreply.github.com>
Co-authored-by: Mason Daugherty <github@mdrxy.com>
Co-authored-by: Mason Daugherty <mason@langchain.dev>
This commit is contained in:
open-swe[bot]
2025-07-17 15:54:38 -04:00
committed by GitHub
parent 6d449df8bb
commit 5da986c3f6
3 changed files with 244 additions and 61 deletions

View File

@@ -21,8 +21,15 @@ def _retrieve_ref(path: str, schema: dict) -> dict:
for component in components[1:]:
if component in out:
out = out[component]
elif component.isdigit() and int(component) in out:
out = out[int(component)]
elif component.isdigit():
index = int(component)
if (isinstance(out, list) and 0 <= index < len(out)) or (
isinstance(out, dict) and index in out
):
out = out[index]
else:
msg = f"Reference '{path}' not found."
raise KeyError(msg)
else:
msg = f"Reference '{path}' not found."
raise KeyError(msg)
@@ -32,66 +39,66 @@ def _retrieve_ref(path: str, schema: dict) -> dict:
def _dereference_refs_helper(
obj: Any,
full_schema: dict[str, Any],
processed_refs: Optional[set[str]],
skip_keys: Sequence[str],
processed_refs: Optional[set[str]] = None,
shallow_refs: bool, # noqa: FBT001
) -> Any:
"""Inline every pure {'$ref':...}.
But:
- if shallow_refs=True: only break cycles, do not inline nested refs
- if shallow_refs=False: deep-inline all nested refs
Also skip recursion under any key in skip_keys.
"""
if processed_refs is None:
processed_refs = set()
# 1) Pure $ref node?
if isinstance(obj, dict) and set(obj.keys()) == {"$ref"}:
ref_path = obj["$ref"]
# cycle?
if ref_path in processed_refs:
return {}
processed_refs.add(ref_path)
# grab + copy the target
target = deepcopy(_retrieve_ref(ref_path, full_schema))
# deep inlining: recurse into everything
result = _dereference_refs_helper(
target, full_schema, processed_refs, skip_keys, shallow_refs
)
processed_refs.remove(ref_path)
return result
# 2) Not a pure-$ref: recurse, skipping any keys in skip_keys
if isinstance(obj, dict):
obj_out = {}
out: dict[str, Any] = {}
for k, v in obj.items():
if k in skip_keys:
obj_out[k] = v
elif k == "$ref":
if v in processed_refs:
continue
processed_refs.add(v)
ref = _retrieve_ref(v, full_schema)
full_ref = _dereference_refs_helper(
ref, full_schema, skip_keys, processed_refs
)
processed_refs.remove(v)
return full_ref
elif isinstance(v, (list, dict)):
obj_out[k] = _dereference_refs_helper(
v, full_schema, skip_keys, processed_refs
# do not recurse under this key
out[k] = deepcopy(v)
elif isinstance(v, (dict, list)):
out[k] = _dereference_refs_helper(
v, full_schema, processed_refs, skip_keys, shallow_refs
)
else:
obj_out[k] = v
return obj_out
out[k] = v
return out
if isinstance(obj, list):
return [
_dereference_refs_helper(el, full_schema, skip_keys, processed_refs)
for el in obj
_dereference_refs_helper(
item, full_schema, processed_refs, skip_keys, shallow_refs
)
for item in obj
]
return obj
def _infer_skip_keys(
obj: Any, full_schema: dict, processed_refs: Optional[set[str]] = None
) -> list[str]:
if processed_refs is None:
processed_refs = set()
keys = []
if isinstance(obj, dict):
for k, v in obj.items():
if k == "$ref":
if v in processed_refs:
continue
processed_refs.add(v)
ref = _retrieve_ref(v, full_schema)
keys.append(v.split("/")[1])
keys += _infer_skip_keys(ref, full_schema, processed_refs)
elif isinstance(v, (list, dict)):
keys += _infer_skip_keys(v, full_schema, processed_refs)
elif isinstance(obj, list):
for el in obj:
keys += _infer_skip_keys(el, full_schema, processed_refs)
return keys
def dereference_refs(
schema_obj: dict,
*,
@@ -101,17 +108,15 @@ def dereference_refs(
"""Try to substitute $refs in JSON Schema.
Args:
schema_obj: The schema object to dereference.
full_schema: The full schema object. Defaults to None.
skip_keys: The keys to skip. Defaults to None.
Returns:
The dereferenced schema object.
schema_obj: The fragment to dereference.
full_schema: The complete schema (defaults to schema_obj).
skip_keys:
- If None (the default), we skip recursion under '$defs' *and* only
shallow-inline refs.
- If provided (even as an empty list), we will recurse under every key and
deep-inline all refs.
"""
full_schema = full_schema or schema_obj
skip_keys = (
skip_keys
if skip_keys is not None
else _infer_skip_keys(schema_obj, full_schema)
)
return _dereference_refs_helper(schema_obj, full_schema, skip_keys)
full = full_schema or schema_obj
keys_to_skip = list(skip_keys) if skip_keys is not None else ["$defs"]
shallow = skip_keys is None
return _dereference_refs_helper(schema_obj, full, None, keys_to_skip, shallow)

View File

@@ -264,3 +264,183 @@ def test_dereference_refs_cyclical_refs() -> None:
}
actual = dereference_refs(schema)
assert actual == expected
def test_dereference_refs_list_index() -> None:
"""Test dereferencing refs that use list indices (e.g., anyOf/1)."""
# Test case from the issue report - anyOf array with numeric index reference
schema = {
"type": "object",
"properties": {
"payload": {
"anyOf": [
{ # variant 0
"type": "object",
"properties": {"kind": {"type": "string", "const": "ONE"}},
},
{ # variant 1
"type": "object",
"properties": {
"kind": {"type": "string", "const": "TWO"},
"startDate": {
"type": "string",
"pattern": r"^\d{4}-\d{2}-\d{2}$",
},
"endDate": {
"$ref": (
"#/properties/payload/anyOf/1/properties/startDate"
)
},
},
},
]
}
},
}
expected = {
"type": "object",
"properties": {
"payload": {
"anyOf": [
{ # variant 0
"type": "object",
"properties": {"kind": {"type": "string", "const": "ONE"}},
},
{ # variant 1
"type": "object",
"properties": {
"kind": {"type": "string", "const": "TWO"},
"startDate": {
"type": "string",
"pattern": r"^\d{4}-\d{2}-\d{2}$",
},
"endDate": {
"type": "string",
"pattern": r"^\d{4}-\d{2}-\d{2}$",
},
},
},
]
}
},
}
actual = dereference_refs(schema)
assert actual == expected
# Test oneOf array with numeric index reference
schema_oneof = {
"type": "object",
"properties": {
"data": {
"oneOf": [
{"type": "string"},
{"type": "number"},
{
"type": "object",
"properties": {"value": {"$ref": "#/properties/data/oneOf/1"}},
},
]
}
},
}
expected_oneof = {
"type": "object",
"properties": {
"data": {
"oneOf": [
{"type": "string"},
{"type": "number"},
{"type": "object", "properties": {"value": {"type": "number"}}},
]
}
},
}
actual_oneof = dereference_refs(schema_oneof)
assert actual_oneof == expected_oneof
# Test allOf array with numeric index reference
schema_allof = {
"type": "object",
"allOf": [
{"properties": {"name": {"type": "string"}}},
{"properties": {"age": {"type": "number"}}},
],
"properties": {"copy_name": {"$ref": "#/allOf/0/properties/name"}},
}
expected_allof = {
"type": "object",
"allOf": [
{"properties": {"name": {"type": "string"}}},
{"properties": {"age": {"type": "number"}}},
],
"properties": {"copy_name": {"type": "string"}},
}
actual_allof = dereference_refs(schema_allof)
assert actual_allof == expected_allof
# Test edge case: out-of-bounds index should raise KeyError
schema_invalid = {
"type": "object",
"properties": {
"data": {"anyOf": [{"type": "string"}]},
"invalid": {"$ref": "#/properties/data/anyOf/5"}, # Index 5 doesn't exist
},
}
with pytest.raises(
KeyError, match="Reference '#/properties/data/anyOf/5' not found"
):
dereference_refs(schema_invalid)
# Test edge case: negative index should raise KeyError
schema_negative = {
"type": "object",
"properties": {
"data": {"anyOf": [{"type": "string"}]},
"invalid": {"$ref": "#/properties/data/anyOf/-1"}, # Negative index
},
}
with pytest.raises(
KeyError, match="Reference '#/properties/data/anyOf/-1' not found"
):
dereference_refs(schema_negative)
# Test that existing dictionary-based numeric key functionality still works
schema_dict_key = {
"type": "object",
"properties": {
"error_400": {"$ref": "#/$defs/400"},
},
"$defs": {
400: {
"type": "object",
"properties": {"description": "Bad Request"},
},
},
}
expected_dict_key = {
"type": "object",
"properties": {
"error_400": {
"type": "object",
"properties": {"description": "Bad Request"},
},
},
"$defs": {
400: {
"type": "object",
"properties": {"description": "Bad Request"},
},
},
}
actual_dict_key = dereference_refs(schema_dict_key)
assert actual_dict_key == expected_dict_key

View File

@@ -1,13 +1,11 @@
from langchain_core.utils.json_schema import (
_dereference_refs_helper,
_infer_skip_keys,
_retrieve_ref,
dereference_refs,
)
__all__ = [
"_dereference_refs_helper",
"_infer_skip_keys",
"_retrieve_ref",
"dereference_refs",
]