From f3fb5a9c687eafc138b4739636cb8594ebfd73db Mon Sep 17 00:00:00 2001 From: Philippe PRADOS Date: Fri, 13 Dec 2024 22:24:20 +0100 Subject: [PATCH] community[minor]: Fix json._validate_metadata_func() (#22842) JSONLoader, in _validate_metadata_func(), checks the consistency of the _metadata_func() function. To do this, it invokes the function once and makes sure it receives a dictionary in response. However, this validation call does not match how the function is invoked later: it passes an empty dict as the second argument instead of the populated additional fields, as shown on line 100. This raises errors if, for example, the function reads a key from its second argument, like this: ```python def generate_metadata(json_node:Dict[str,Any],kwargs:Dict[str,Any]) -> Dict[str,Any]: return { "source": url, "row": kwargs['seq_num'], "question":json_node.get("question"), } loader = JSONLoader( file_path=file_path, content_key="answer", jq_schema='.[]', metadata_func=generate_metadata, text_content=False) ``` To avoid this, the validation must call the function with the same arguments as a real call. This patch does just that: it removes the separate up-front check and instead verifies the return type at the point where the function is actually called. --------- Co-authored-by: Eugene Yurtsev --- .../document_loaders/json_loader.py | 24 +++++++------------ .../document_loaders/test_json_loader.py | 10 +++++++- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/json_loader.py b/libs/community/langchain_community/document_loaders/json_loader.py index ce86b9850e7..08d5823e2a2 100644 --- a/libs/community/langchain_community/document_loaders/json_loader.py +++ b/libs/community/langchain_community/document_loaders/json_loader.py @@ -157,8 +157,6 @@ class JSONLoader(BaseLoader): # and prevent the user from getting a cryptic error later on. 
if self._content_key is not None: self._validate_content_key(data) - if self._metadata_func is not None: - self._validate_metadata_func(data) for i, sample in enumerate(data, index + 1): text = self._get_text(sample=sample) @@ -178,7 +176,7 @@ class JSONLoader(BaseLoader): else: content = sample - if self._text_content and not isinstance(content, str): + if self._text_content and not isinstance(content, str) and content is not None: raise ValueError( f"Expected page_content is string, got {type(content)} instead. \ Set `text_content=False` if the desired input for \ @@ -203,7 +201,13 @@ class JSONLoader(BaseLoader): :return: """ if self._metadata_func is not None: - return self._metadata_func(sample, additional_fields) + result = self._metadata_func(sample, additional_fields) + if not isinstance(result, dict): + raise ValueError( + f"Expected the metadata_func to return a dict but got \ + `{type(result)}`" + ) + return result else: return additional_fields @@ -233,15 +237,3 @@ class JSONLoader(BaseLoader): f"Expected the jq schema to result in a list of objects (dict) \ with the key `{self._content_key}` which should be parsable by jq" ) - - def _validate_metadata_func(self, data: Any) -> None: - """Check if the metadata_func output is valid""" - - sample = data.first() - if self._metadata_func is not None: - sample_metadata = self._metadata_func(sample, {}) - if not isinstance(sample_metadata, dict): - raise ValueError( - f"Expected the metadata_func to return a dict but got \ - `{type(sample_metadata)}`" - ) diff --git a/libs/community/tests/integration_tests/document_loaders/test_json_loader.py b/libs/community/tests/integration_tests/document_loaders/test_json_loader.py index 8f85d9b0191..620cb16b4a6 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_json_loader.py +++ b/libs/community/tests/integration_tests/document_loaders/test_json_loader.py @@ -1,12 +1,20 @@ from pathlib import Path +from typing import Dict from 
langchain_community.document_loaders import JSONLoader +def call_back(sample: Dict, additional_fields: Dict) -> Dict: + metadata = additional_fields.copy() + metadata["source"] += f"#seq_num={metadata['seq_num']}" + return metadata + + def test_json_loader() -> None: """Test unstructured loader.""" file_path = Path(__file__).parent.parent / "examples/example.json" - loader = JSONLoader(str(file_path), ".messages[].content") + + loader = JSONLoader(file_path, ".messages[].content", metadata_func=call_back) docs = loader.load() # Check that the correct number of documents are loaded.