community[minor]: Fix json._validate_metadata_func() (#22842)

JSONLoader, in _validate_metadata_func(), checks the consistency of the
_metadata_func() function. To do this, it invokes it and makes sure it
receives a dictionary in response. However, this validation call does
not match the real calls made later (as shown on line 100): it passes an
empty dict as the second argument, whereas real calls pass the
additional fields (such as `source` and `seq_num`). This generates
errors if, for example, the function is like this:
```python
        def generate_metadata(json_node:Dict[str,Any],kwargs:Dict[str,Any]) -> Dict[str,Any]:
             return {
                "source": url,
                "row": kwargs['seq_num'],
                "question":json_node.get("question"),
            }
        loader = JSONLoader(
            file_path=file_path,
            content_key="answer",
            jq_schema='.[]',
            metadata_func=generate_metadata,
            text_content=False)
```
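To avoid this, the verification must call the function the same way the
real calls do. This patch does just that: the up-front validation is
removed, and the returned type is checked on each real call instead.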

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Philippe PRADOS 2024-12-13 22:24:20 +01:00 committed by GitHub
parent 67fd554512
commit f3fb5a9c68
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 17 additions and 17 deletions

View File

@ -157,8 +157,6 @@ class JSONLoader(BaseLoader):
# and prevent the user from getting a cryptic error later on. # and prevent the user from getting a cryptic error later on.
if self._content_key is not None: if self._content_key is not None:
self._validate_content_key(data) self._validate_content_key(data)
if self._metadata_func is not None:
self._validate_metadata_func(data)
for i, sample in enumerate(data, index + 1): for i, sample in enumerate(data, index + 1):
text = self._get_text(sample=sample) text = self._get_text(sample=sample)
@ -178,7 +176,7 @@ class JSONLoader(BaseLoader):
else: else:
content = sample content = sample
if self._text_content and not isinstance(content, str): if self._text_content and not isinstance(content, str) and content is not None:
raise ValueError( raise ValueError(
f"Expected page_content is string, got {type(content)} instead. \ f"Expected page_content is string, got {type(content)} instead. \
Set `text_content=False` if the desired input for \ Set `text_content=False` if the desired input for \
@ -203,7 +201,13 @@ class JSONLoader(BaseLoader):
:return: :return:
""" """
if self._metadata_func is not None: if self._metadata_func is not None:
return self._metadata_func(sample, additional_fields) result = self._metadata_func(sample, additional_fields)
if not isinstance(result, dict):
raise ValueError(
f"Expected the metadata_func to return a dict but got \
`{type(result)}`"
)
return result
else: else:
return additional_fields return additional_fields
@ -233,15 +237,3 @@ class JSONLoader(BaseLoader):
f"Expected the jq schema to result in a list of objects (dict) \ f"Expected the jq schema to result in a list of objects (dict) \
with the key `{self._content_key}` which should be parsable by jq" with the key `{self._content_key}` which should be parsable by jq"
) )
def _validate_metadata_func(self, data: Any) -> None:
"""Check if the metadata_func output is valid"""
sample = data.first()
if self._metadata_func is not None:
sample_metadata = self._metadata_func(sample, {})
if not isinstance(sample_metadata, dict):
raise ValueError(
f"Expected the metadata_func to return a dict but got \
`{type(sample_metadata)}`"
)

View File

@ -1,12 +1,20 @@
from pathlib import Path from pathlib import Path
from typing import Dict
from langchain_community.document_loaders import JSONLoader from langchain_community.document_loaders import JSONLoader
def call_back(sample: Dict, additional_fields: Dict) -> Dict:
metadata = additional_fields.copy()
metadata["source"] += f"#seq_num={metadata['seq_num']}"
return metadata
def test_json_loader() -> None: def test_json_loader() -> None:
"""Test unstructured loader.""" """Test unstructured loader."""
file_path = Path(__file__).parent.parent / "examples/example.json" file_path = Path(__file__).parent.parent / "examples/example.json"
loader = JSONLoader(str(file_path), ".messages[].content")
loader = JSONLoader(file_path, ".messages[].content", metadata_func=call_back)
docs = loader.load() docs = loader.load()
# Check that the correct number of documents are loaded. # Check that the correct number of documents are loaded.