community[minor]: use jq schema for content_key in json_loader (#18003)

### Description
Changed the value specified for `content_key` in JSONLoader from a
single key to a value based on jq schema.
I created a [similar
PR](https://github.com/langchain-ai/langchain/pull/11255) before, but it
has several conflicts because of the architectural change associated
with the stable version release, so I re-created this PR to fit the new architecture.

### Why
For JSON data like the following, to use `.data[].attributes.message`
as page_content and `.data[].id` or
`.data[].attributes.tags`, etc., as metadata, the `content_key` must also
be able to parse the JSON structure.

<details>
<summary>sample json data</summary>

```json
{
  "data": [
    {
      "attributes": {
        "message": "message1",
        "tags": [
          "tag1"
        ]
      },
      "id": "1"
    },
    {
      "attributes": {
        "message": "message2",
        "tags": [
          "tag2"
        ]
      },
      "id": "2"
    }
  ]
}
```

</details>

<details>
<summary>sample code</summary>

```python
def metadata_func(record: dict, metadata: dict) -> dict:
    """Fill the document metadata from a single JSON record.

    Args:
        record: One JSON object handed over by the loader; in this sample
            it is expected to carry an ``id`` key and an ``attributes``
            mapping (see the sample JSON data).
        metadata: The metadata dict to update; it is mutated in place.

    Returns:
        The same ``metadata`` dict with ``source``, ``id`` and ``tags`` set.
    """

    # Overwrite ``source`` with None for this sample.
    metadata["source"] = None
    # ``id`` sits at the top level of the record; ``tags`` is nested
    # under ``attributes``. Both fall back to None when absent.
    metadata["id"] = record.get("id")
    metadata["tags"] = record["attributes"].get("tags")

    return metadata

# Load the sample file, selecting each element of ``data`` as one record.
sample_file = "sample1.json"
loader = JSONLoader(
    file_path=sample_file,
    jq_schema=".data[]",
    content_key=".attributes.message", # content_key is written as a jq expression here
    is_content_key_jq_parsable=True, # parameter added by this change
    metadata_func=metadata_func
)

data = loader.load()
# Bare expression: displays the loaded documents in an interactive session.
data
```

</details>

### Dependencies
none

### Twitter handle
[kzk_maeda](https://twitter.com/kzk_maeda)
This commit is contained in:
Kazuki Maeda
2024-03-06 08:51:24 +09:00
committed by GitHub
parent f4bb33bbf3
commit 60c5d964a8
3 changed files with 204 additions and 4 deletions

View File

@@ -319,3 +319,123 @@ def test_json_meta_02(
result = loader.load()
assert result == expected_docs
@pytest.mark.parametrize(
    "params",
    (
        # Content selected entirely by the jq schema.
        {"jq_schema": ".[].text"},
        # Content selected by a plain (non-jq) key on each record.
        {"jq_schema": ".[]", "content_key": "text"},
        # Content selected by a jq expression applied to each record.
        {
            "jq_schema": ".[]",
            "content_key": ".text",
            "is_content_key_jq_parsable": True,
        },
    ),
)
def test_load_json_with_jq_parsable_content_key(
    params: Dict, mocker: MockerFixture
) -> None:
    """Every parameter set must produce the same two documents."""
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content=text,
            metadata={"source": file_path, "seq_num": seq_num},
        )
        for seq_num, text in enumerate(("value1", "value2"), start=1)
    ]

    # Serve the fixture from memory instead of touching the filesystem.
    raw_json = '\n[{"text": "value1"}, {"text": "value2"}]\n'
    mocker.patch("pathlib.Path.open", return_value=io.StringIO(raw_json))

    documents = JSONLoader(file_path=file_path, json_lines=True, **params).load()

    assert documents == expected_docs
def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) -> None:
    """A nested jq ``content_key`` picks the message out of each record."""
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content=message,
            metadata={"source": file_path, "seq_num": seq_num},
        )
        for seq_num, message in enumerate(("message1", "message2"), start=1)
    ]

    # Serve the fixture from memory instead of touching the filesystem.
    raw_json = (
        '\n{"data": [\n'
        '{"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},\n'
        '{"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}\n'
    )
    mocker.patch("pathlib.Path.open", return_value=io.StringIO(raw_json))

    loader_kwargs = {
        "file_path": file_path,
        "jq_schema": ".data[]",
        "content_key": ".attributes.message",
        "is_content_key_jq_parsable": True,
    }
    documents = JSONLoader(**loader_kwargs).load()

    assert documents == expected_docs
def test_load_json_with_nested_jq_parsable_content_key_with_metadata(
    mocker: MockerFixture,
) -> None:
    """A nested jq ``content_key`` works together with a ``metadata_func``."""
    file_path = "/workspaces/langchain/test.json"
    # (page content, record id, tags) for each expected document, in order.
    expected_rows = (
        ("message1", "1", ["tag1"]),
        ("message2", "2", ["tag2"]),
    )
    expected_docs = [
        Document(
            page_content=message,
            metadata={
                "source": file_path,
                "seq_num": seq_num,
                "id": record_id,
                "tags": tags,
            },
        )
        for seq_num, (message, record_id, tags) in enumerate(expected_rows, start=1)
    ]

    # Serve the fixture from memory instead of touching the filesystem.
    raw_json = (
        '\n{"data": [\n'
        '{"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},\n'
        '{"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}\n'
    )
    mocker.patch("pathlib.Path.open", return_value=io.StringIO(raw_json))

    def _metadata_func(record: dict, metadata: dict) -> dict:
        """Copy the record's id and nested tags into the metadata dict."""
        metadata["id"] = record.get("id")
        attributes = record["attributes"]
        metadata["tags"] = attributes.get("tags")
        return metadata

    loader_kwargs = {
        "file_path": file_path,
        "jq_schema": ".data[]",
        "content_key": ".attributes.message",
        "is_content_key_jq_parsable": True,
        "metadata_func": _metadata_func,
    }
    documents = JSONLoader(**loader_kwargs).load()

    assert documents == expected_docs