mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 06:39:52 +00:00
community[minor]: Fix json._validate_metadata_func() (#22842)
`JSONLoader`, in `_validate_metadata_func()`, checks the consistency of the `_metadata_func()` function. To do this, it invokes the function once and makes sure it receives a dictionary in response. However, this validation call does not pass the same arguments as the later, real calls (as shown on line 100): it passes an empty dict where the real calls pass the additional fields. This generates errors if, for example, the function reads one of those fields, like this:

```python
def generate_metadata(json_node:Dict[str,Any],kwargs:Dict[str,Any]) -> Dict[str,Any]:
    return {
        "source": url,
        "row": kwargs['seq_num'],
        "question":json_node.get("question"),
    }

loader = JSONLoader(
    file_path=file_path,
    content_key="answer",
    jq_schema='.[]',
    metadata_func=generate_metadata,
    text_content=False)
```

To avoid this, the verification must call the function the way the specification says it will be called. This patch does just that, by validating the result of the real call instead of making a separate validation call.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
67fd554512
commit
f3fb5a9c68
@ -157,8 +157,6 @@ class JSONLoader(BaseLoader):
|
|||||||
# and prevent the user from getting a cryptic error later on.
|
# and prevent the user from getting a cryptic error later on.
|
||||||
if self._content_key is not None:
|
if self._content_key is not None:
|
||||||
self._validate_content_key(data)
|
self._validate_content_key(data)
|
||||||
if self._metadata_func is not None:
|
|
||||||
self._validate_metadata_func(data)
|
|
||||||
|
|
||||||
for i, sample in enumerate(data, index + 1):
|
for i, sample in enumerate(data, index + 1):
|
||||||
text = self._get_text(sample=sample)
|
text = self._get_text(sample=sample)
|
||||||
@ -178,7 +176,7 @@ class JSONLoader(BaseLoader):
|
|||||||
else:
|
else:
|
||||||
content = sample
|
content = sample
|
||||||
|
|
||||||
if self._text_content and not isinstance(content, str):
|
if self._text_content and not isinstance(content, str) and content is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Expected page_content is string, got {type(content)} instead. \
|
f"Expected page_content is string, got {type(content)} instead. \
|
||||||
Set `text_content=False` if the desired input for \
|
Set `text_content=False` if the desired input for \
|
||||||
@ -203,7 +201,13 @@ class JSONLoader(BaseLoader):
|
|||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
if self._metadata_func is not None:
|
if self._metadata_func is not None:
|
||||||
return self._metadata_func(sample, additional_fields)
|
result = self._metadata_func(sample, additional_fields)
|
||||||
|
if not isinstance(result, dict):
|
||||||
|
raise ValueError(
|
||||||
|
f"Expected the metadata_func to return a dict but got \
|
||||||
|
`{type(result)}`"
|
||||||
|
)
|
||||||
|
return result
|
||||||
else:
|
else:
|
||||||
return additional_fields
|
return additional_fields
|
||||||
|
|
||||||
@ -233,15 +237,3 @@ class JSONLoader(BaseLoader):
|
|||||||
f"Expected the jq schema to result in a list of objects (dict) \
|
f"Expected the jq schema to result in a list of objects (dict) \
|
||||||
with the key `{self._content_key}` which should be parsable by jq"
|
with the key `{self._content_key}` which should be parsable by jq"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _validate_metadata_func(self, data: Any) -> None:
|
|
||||||
"""Check if the metadata_func output is valid"""
|
|
||||||
|
|
||||||
sample = data.first()
|
|
||||||
if self._metadata_func is not None:
|
|
||||||
sample_metadata = self._metadata_func(sample, {})
|
|
||||||
if not isinstance(sample_metadata, dict):
|
|
||||||
raise ValueError(
|
|
||||||
f"Expected the metadata_func to return a dict but got \
|
|
||||||
`{type(sample_metadata)}`"
|
|
||||||
)
|
|
||||||
|
@ -1,12 +1,20 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
from langchain_community.document_loaders import JSONLoader
|
from langchain_community.document_loaders import JSONLoader
|
||||||
|
|
||||||
|
|
||||||
|
def call_back(sample: Dict, additional_fields: Dict) -> Dict:
|
||||||
|
metadata = additional_fields.copy()
|
||||||
|
metadata["source"] += f"#seq_num={metadata['seq_num']}"
|
||||||
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
def test_json_loader() -> None:
|
def test_json_loader() -> None:
|
||||||
"""Test unstructured loader."""
|
"""Test unstructured loader."""
|
||||||
file_path = Path(__file__).parent.parent / "examples/example.json"
|
file_path = Path(__file__).parent.parent / "examples/example.json"
|
||||||
loader = JSONLoader(str(file_path), ".messages[].content")
|
|
||||||
|
loader = JSONLoader(file_path, ".messages[].content", metadata_func=call_back)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
# Check that the correct number of documents are loaded.
|
# Check that the correct number of documents are loaded.
|
||||||
|
Loading…
Reference in New Issue
Block a user