community[minor]: use jq schema for content_key in json_loader (#18003)

### Description Changed the value specified for `content_key` in JSONLoader from a single key to a value based on jq schema. I created a [similar PR](https://github.com/langchain-ai/langchain/pull/11255) before, but it had several conflicts because of the architectural changes associated with the stable version release, so I re-created this PR to fit the new architecture. ### Why For JSON data like the following, to use `.data[].attributes.message` for page_content and `.data[].id` or `.data[].attributes.tags`, etc. for metadata, the `content_key` must also be able to parse the JSON structure. <details> <summary>sample json data</summary> ```json { "data": [ { "attributes": { "message": "message1", "tags": [ "tag1" ] }, "id": "1" }, { "attributes": { "message": "message2", "tags": [ "tag2" ] }, "id": "2" } ] } ``` </details> <details> <summary>sample code</summary> ```python def metadata_func(record: dict, metadata: dict) -> dict: metadata["source"] = None metadata["id"] = record.get("id") metadata["tags"] = record["attributes"].get("tags") return metadata sample_file = "sample1.json" loader = JSONLoader( file_path=sample_file, jq_schema=".data[]", content_key=".attributes.message", ## content_key is parsable into jq schema is_content_key_jq_parsable=True, ## this is added parameter metadata_func=metadata_func ) data = loader.load() data ``` </details> ### Dependencies none ### Twitter handle [kzk_maeda](https://twitter.com/kzk_maeda)
2025-09-15 22:44:36 +00:00 · 2024-03-06 08:51:24 +09:00
parent f4bb33bbf3
commit 60c5d964a8
3 changed files with 204 additions and 4 deletions
--- a/libs/community/tests/unit_tests/document_loaders/test_json_loader.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_json_loader.py
@@ -319,3 +319,123 @@ def test_json_meta_02(
    result = loader.load()

    assert result == expected_docs
+
+
+@pytest.mark.parametrize(  # three ways to select the same "text" values
+    "params",
+    (
+        {"jq_schema": ".[].text"},  # content picked by jq_schema alone
+        {"jq_schema": ".[]", "content_key": "text"},  # content_key, no jq flag
+        {
+            "jq_schema": ".[]",
+            "content_key": ".text",  # content_key written as a jq expression
+            "is_content_key_jq_parsable": True,
+        },
+    ),
+)
+def test_load_json_with_jq_parsable_content_key(
+    params: Dict, mocker: MockerFixture
+) -> None:
+    file_path = "/workspaces/langchain/test.json"  # Path.open is mocked below
+    expected_docs = [  # identical for every parametrized case
+        Document(
+            page_content="value1",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="value2",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+    ]
+
+    mocker.patch(  # Path.open now returns the inline JSON below
+        "pathlib.Path.open",
+        return_value=io.StringIO(
+            """
+            [{"text": "value1"}, {"text": "value2"}]
+            """
+        ),
+    )
+
+    loader = JSONLoader(file_path=file_path, json_lines=True, **params)
+    result = loader.load()
+
+    assert result == expected_docs  # same docs whichever params were used
+
+
+def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"  # Path.open is mocked below
+    expected_docs = [  # page_content is each item's attributes.message
+        Document(
+            page_content="message1",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="message2",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+    ]
+
+    mocker.patch(  # Path.open now returns the inline JSON below
+        "pathlib.Path.open",
+        return_value=io.StringIO(
+            """
+            {"data": [
+                    {"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},
+                    {"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}
+            """
+        ),
+    )
+
+    loader = JSONLoader(
+        file_path=file_path,
+        jq_schema=".data[]",  # one record per element of "data"
+        content_key=".attributes.message",  # nested path inside each record
+        is_content_key_jq_parsable=True,  # treat content_key as a jq expression
+    )
+    result = loader.load()
+
+    assert result == expected_docs  # only the nested message becomes content
+
+
+def test_load_json_with_nested_jq_parsable_content_key_with_metadata(
+    mocker: MockerFixture,
+) -> None:
+    file_path = "/workspaces/langchain/test.json"  # Path.open is mocked below
+    expected_docs = [  # metadata also carries id and tags from each record
+        Document(
+            page_content="message1",
+            metadata={"source": file_path, "seq_num": 1, "id": "1", "tags": ["tag1"]},
+        ),
+        Document(
+            page_content="message2",
+            metadata={"source": file_path, "seq_num": 2, "id": "2", "tags": ["tag2"]},
+        ),
+    ]
+
+    mocker.patch(  # Path.open now returns the inline JSON below
+        "pathlib.Path.open",
+        return_value=io.StringIO(
+            """
+            {"data": [
+                    {"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},
+                    {"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}
+            """
+        ),
+    )
+
+    def _metadata_func(record: dict, metadata: dict) -> dict:  # adds id, tags
+        metadata["id"] = record.get("id")  # top-level "id" of the record
+        metadata["tags"] = record["attributes"].get("tags")  # under attributes
+        return metadata
+
+    loader = JSONLoader(
+        file_path=file_path,
+        jq_schema=".data[]",  # one record per element of "data"
+        content_key=".attributes.message",  # nested path inside each record
+        is_content_key_jq_parsable=True,  # treat content_key as a jq expression
+        metadata_func=_metadata_func,  # receives the whole record, not the text
+    )
+    result = loader.load()
+
+    assert result == expected_docs  # content and metadata both checked