mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-21 22:29:51 +00:00
community[minor]: Fix json._validate_metadata_func() (#22842)
JSONLoader, in `_validate_metadata_func()`, checks the consistency of the user-supplied `_metadata_func()` callback. To do this, it invokes the callback once and makes sure it returns a dictionary. However, this validation call does not match how the callback is invoked later during loading (as shown on line 100): validation passes an empty dict as the second argument, whereas the real calls pass the populated additional fields. This generates errors (for example a `KeyError`) if the callback reads from that second argument, like this:

```python
def generate_metadata(json_node:Dict[str,Any],kwargs:Dict[str,Any]) -> Dict[str,Any]:
    return {
        "source": url,
        "row": kwargs['seq_num'],
        "question":json_node.get("question"),
    }

loader = JSONLoader(
    file_path=file_path,
    content_key="answer",
    jq_schema='.[]',
    metadata_func=generate_metadata,
    text_content=False)
```

To avoid this, the verification must call the callback the same way the real calls do. This patch does just that: it removes the separate up-front validation and instead checks the return type of each real `_metadata_func()` call.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
67fd554512
commit
f3fb5a9c68
@ -157,8 +157,6 @@ class JSONLoader(BaseLoader):
|
||||
# and prevent the user from getting a cryptic error later on.
|
||||
if self._content_key is not None:
|
||||
self._validate_content_key(data)
|
||||
if self._metadata_func is not None:
|
||||
self._validate_metadata_func(data)
|
||||
|
||||
for i, sample in enumerate(data, index + 1):
|
||||
text = self._get_text(sample=sample)
|
||||
@ -178,7 +176,7 @@ class JSONLoader(BaseLoader):
|
||||
else:
|
||||
content = sample
|
||||
|
||||
if self._text_content and not isinstance(content, str):
|
||||
if self._text_content and not isinstance(content, str) and content is not None:
|
||||
raise ValueError(
|
||||
f"Expected page_content is string, got {type(content)} instead. \
|
||||
Set `text_content=False` if the desired input for \
|
||||
@ -203,7 +201,13 @@ class JSONLoader(BaseLoader):
|
||||
:return:
|
||||
"""
|
||||
if self._metadata_func is not None:
|
||||
return self._metadata_func(sample, additional_fields)
|
||||
result = self._metadata_func(sample, additional_fields)
|
||||
if not isinstance(result, dict):
|
||||
raise ValueError(
|
||||
f"Expected the metadata_func to return a dict but got \
|
||||
`{type(result)}`"
|
||||
)
|
||||
return result
|
||||
else:
|
||||
return additional_fields
|
||||
|
||||
@ -233,15 +237,3 @@ class JSONLoader(BaseLoader):
|
||||
f"Expected the jq schema to result in a list of objects (dict) \
|
||||
with the key `{self._content_key}` which should be parsable by jq"
|
||||
)
|
||||
|
||||
def _validate_metadata_func(self, data: Any) -> None:
|
||||
"""Check if the metadata_func output is valid"""
|
||||
|
||||
sample = data.first()
|
||||
if self._metadata_func is not None:
|
||||
sample_metadata = self._metadata_func(sample, {})
|
||||
if not isinstance(sample_metadata, dict):
|
||||
raise ValueError(
|
||||
f"Expected the metadata_func to return a dict but got \
|
||||
`{type(sample_metadata)}`"
|
||||
)
|
||||
|
@ -1,12 +1,20 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
from langchain_community.document_loaders import JSONLoader
|
||||
|
||||
|
||||
def call_back(sample: Dict, additional_fields: Dict) -> Dict:
|
||||
metadata = additional_fields.copy()
|
||||
metadata["source"] += f"#seq_num={metadata['seq_num']}"
|
||||
return metadata
|
||||
|
||||
|
||||
def test_json_loader() -> None:
|
||||
"""Test unstructured loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/example.json"
|
||||
loader = JSONLoader(str(file_path), ".messages[].content")
|
||||
|
||||
loader = JSONLoader(file_path, ".messages[].content", metadata_func=call_back)
|
||||
docs = loader.load()
|
||||
|
||||
# Check that the correct number of documents are loaded.
|
||||
|
Loading…
Reference in New Issue
Block a user