mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-09 21:08:59 +00:00
community[minor]: use jq schema for content_key in json_loader (#18003)
### Description Changed the value specified for `content_key` in JSONLoader from a single key to a value based on a jq schema. I created a [similar PR](https://github.com/langchain-ai/langchain/pull/11255) before, but it has several conflicts because of the architectural changes associated with the stable version release, so I re-created this PR to fit the new architecture. ### Why For JSON data like the following, where `.data[].attributes.message` is used for page_content and `.data[].id` or `.data[].attributes.tags`, etc., are used for metadata, the `content_key` must also be able to parse the JSON structure. <details> <summary>sample json data</summary> ```json { "data": [ { "attributes": { "message": "message1", "tags": [ "tag1" ] }, "id": "1" }, { "attributes": { "message": "message2", "tags": [ "tag2" ] }, "id": "2" } ] } ``` </details> <details> <summary>sample code</summary> ```python def metadata_func(record: dict, metadata: dict) -> dict: metadata["source"] = None metadata["id"] = record.get("id") metadata["tags"] = record["attributes"].get("tags") return metadata sample_file = "sample1.json" loader = JSONLoader( file_path=sample_file, jq_schema=".data[]", content_key=".attributes.message", ## content_key is parsable into jq schema is_content_key_jq_parsable=True, ## this is added parameter metadata_func=metadata_func ) data = loader.load() data ``` </details> ### Dependencies none ### Twitter handle [kzk_maeda](https://twitter.com/kzk_maeda)
This commit is contained in:
parent
f4bb33bbf3
commit
60c5d964a8
@ -199,6 +199,58 @@ pprint(data)
|
||||
|
||||
</CodeOutputBlock>
|
||||
|
||||
### JSON file with jq schema `content_key`
|
||||
|
||||
To load documents from a JSON file using a `content_key` that is itself a jq schema, set `is_content_key_jq_parsable=True`.
|
||||
Ensure that `content_key` is a valid jq expression that can be applied to each object produced by `jq_schema`.
|
||||
|
||||
```python
|
||||
file_path = './sample.json'
|
||||
pprint(Path(file_path).read_text())
|
||||
```
|
||||
|
||||
<CodeOutputBlock lang="python">
|
||||
|
||||
```json
|
||||
{"data": [
|
||||
{"attributes": {
|
||||
"message": "message1",
|
||||
"tags": [
|
||||
"tag1"]},
|
||||
"id": "1"},
|
||||
{"attributes": {
|
||||
"message": "message2",
|
||||
"tags": [
|
||||
"tag2"]},
|
||||
"id": "2"}]}
|
||||
```
|
||||
|
||||
</CodeOutputBlock>
|
||||
|
||||
|
||||
```python
|
||||
loader = JSONLoader(
|
||||
file_path=file_path,
|
||||
jq_schema=".data[]",
|
||||
content_key=".attributes.message",
|
||||
is_content_key_jq_parsable=True,
|
||||
)
|
||||
|
||||
data = loader.load()
|
||||
```
|
||||
|
||||
```python
|
||||
pprint(data)
|
||||
```
|
||||
|
||||
<CodeOutputBlock lang="python">
|
||||
|
||||
```
|
||||
[Document(page_content='message1', metadata={'source': '/path/to/sample.json', 'seq_num': 1}),
|
||||
Document(page_content='message2', metadata={'source': '/path/to/sample.json', 'seq_num': 2})]
|
||||
```
|
||||
|
||||
</CodeOutputBlock>
|
||||
|
||||
## Extracting metadata
|
||||
|
||||
|
@ -21,6 +21,7 @@ class JSONLoader(BaseLoader):
|
||||
file_path: Union[str, Path],
|
||||
jq_schema: str,
|
||||
content_key: Optional[str] = None,
|
||||
is_content_key_jq_parsable: Optional[bool] = False,
|
||||
metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
|
||||
text_content: bool = True,
|
||||
json_lines: bool = False,
|
||||
@ -31,8 +32,16 @@ class JSONLoader(BaseLoader):
|
||||
file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
|
||||
jq_schema (str): The jq schema to use to extract the data or text from
|
||||
the JSON.
|
||||
content_key (str): The key to use to extract the content from the JSON if
|
||||
the jq_schema results to a list of objects (dict).
|
||||
content_key (str): The key to use to extract the content from
|
||||
the JSON if the jq_schema results to a list of objects (dict).
|
||||
If is_content_key_jq_parsable is True, this has to be a jq compatible
|
||||
schema. If is_content_key_jq_parsable is False, this should be a simple
|
||||
string key.
|
||||
is_content_key_jq_parsable (bool): A flag to determine if
|
||||
content_key is parsable by jq or not. If True, content_key is
|
||||
treated as a jq schema and compiled accordingly. If False or if
|
||||
content_key is None, content_key is used as a simple string.
|
||||
Default is False.
|
||||
metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
|
||||
object extracted by the jq_schema and the default metadata and returns
|
||||
a dict of the updated metadata.
|
||||
@ -43,6 +52,8 @@ class JSONLoader(BaseLoader):
|
||||
"""
|
||||
try:
|
||||
import jq # noqa:F401
|
||||
|
||||
self.jq = jq
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"jq package not found, please install it with `pip install jq`"
|
||||
@ -50,6 +61,7 @@ class JSONLoader(BaseLoader):
|
||||
|
||||
self.file_path = Path(file_path).resolve()
|
||||
self._jq_schema = jq.compile(jq_schema)
|
||||
self._is_content_key_jq_parsable = is_content_key_jq_parsable
|
||||
self._content_key = content_key
|
||||
self._metadata_func = metadata_func
|
||||
self._text_content = text_content
|
||||
@ -90,7 +102,11 @@ class JSONLoader(BaseLoader):
|
||||
def _get_text(self, sample: Any) -> str:
|
||||
"""Convert sample to string format"""
|
||||
if self._content_key is not None:
|
||||
content = sample.get(self._content_key)
|
||||
if self._is_content_key_jq_parsable:
|
||||
compiled_content_key = self.jq.compile(self._content_key)
|
||||
content = compiled_content_key.input(sample).first()
|
||||
else:
|
||||
content = sample[self._content_key]
|
||||
else:
|
||||
content = sample
|
||||
|
||||
@ -125,6 +141,7 @@ class JSONLoader(BaseLoader):
|
||||
|
||||
def _validate_content_key(self, data: Any) -> None:
|
||||
"""Check if a content key is valid"""
|
||||
|
||||
sample = data.first()
|
||||
if not isinstance(sample, dict):
|
||||
raise ValueError(
|
||||
@ -132,11 +149,22 @@ class JSONLoader(BaseLoader):
|
||||
so sample must be a dict but got `{type(sample)}`"
|
||||
)
|
||||
|
||||
if sample.get(self._content_key) is None:
|
||||
if (
|
||||
not self._is_content_key_jq_parsable
|
||||
and sample.get(self._content_key) is None
|
||||
):
|
||||
raise ValueError(
|
||||
f"Expected the jq schema to result in a list of objects (dict) \
|
||||
with the key `{self._content_key}`"
|
||||
)
|
||||
if (
|
||||
self._is_content_key_jq_parsable
|
||||
and self.jq.compile(self._content_key).input(sample).text() is None
|
||||
):
|
||||
raise ValueError(
|
||||
f"Expected the jq schema to result in a list of objects (dict) \
|
||||
with the key `{self._content_key}` which should be parsable by jq"
|
||||
)
|
||||
|
||||
def _validate_metadata_func(self, data: Any) -> None:
|
||||
"""Check if the metadata_func output is valid"""
|
||||
|
@ -319,3 +319,123 @@ def test_json_meta_02(
|
||||
result = loader.load()
|
||||
|
||||
assert result == expected_docs
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"params",
|
||||
(
|
||||
{"jq_schema": ".[].text"},
|
||||
{"jq_schema": ".[]", "content_key": "text"},
|
||||
{
|
||||
"jq_schema": ".[]",
|
||||
"content_key": ".text",
|
||||
"is_content_key_jq_parsable": True,
|
||||
},
|
||||
),
|
||||
)
|
||||
def test_load_json_with_jq_parsable_content_key(
|
||||
params: Dict, mocker: MockerFixture
|
||||
) -> None:
|
||||
file_path = "/workspaces/langchain/test.json"
|
||||
expected_docs = [
|
||||
Document(
|
||||
page_content="value1",
|
||||
metadata={"source": file_path, "seq_num": 1},
|
||||
),
|
||||
Document(
|
||||
page_content="value2",
|
||||
metadata={"source": file_path, "seq_num": 2},
|
||||
),
|
||||
]
|
||||
|
||||
mocker.patch(
|
||||
"pathlib.Path.open",
|
||||
return_value=io.StringIO(
|
||||
"""
|
||||
[{"text": "value1"}, {"text": "value2"}]
|
||||
"""
|
||||
),
|
||||
)
|
||||
|
||||
loader = JSONLoader(file_path=file_path, json_lines=True, **params)
|
||||
result = loader.load()
|
||||
|
||||
assert result == expected_docs
|
||||
|
||||
|
||||
def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) -> None:
|
||||
file_path = "/workspaces/langchain/test.json"
|
||||
expected_docs = [
|
||||
Document(
|
||||
page_content="message1",
|
||||
metadata={"source": file_path, "seq_num": 1},
|
||||
),
|
||||
Document(
|
||||
page_content="message2",
|
||||
metadata={"source": file_path, "seq_num": 2},
|
||||
),
|
||||
]
|
||||
|
||||
mocker.patch(
|
||||
"pathlib.Path.open",
|
||||
return_value=io.StringIO(
|
||||
"""
|
||||
{"data": [
|
||||
{"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},
|
||||
{"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}
|
||||
"""
|
||||
),
|
||||
)
|
||||
|
||||
loader = JSONLoader(
|
||||
file_path=file_path,
|
||||
jq_schema=".data[]",
|
||||
content_key=".attributes.message",
|
||||
is_content_key_jq_parsable=True,
|
||||
)
|
||||
result = loader.load()
|
||||
|
||||
assert result == expected_docs
|
||||
|
||||
|
||||
def test_load_json_with_nested_jq_parsable_content_key_with_metadata(
|
||||
mocker: MockerFixture,
|
||||
) -> None:
|
||||
file_path = "/workspaces/langchain/test.json"
|
||||
expected_docs = [
|
||||
Document(
|
||||
page_content="message1",
|
||||
metadata={"source": file_path, "seq_num": 1, "id": "1", "tags": ["tag1"]},
|
||||
),
|
||||
Document(
|
||||
page_content="message2",
|
||||
metadata={"source": file_path, "seq_num": 2, "id": "2", "tags": ["tag2"]},
|
||||
),
|
||||
]
|
||||
|
||||
mocker.patch(
|
||||
"pathlib.Path.open",
|
||||
return_value=io.StringIO(
|
||||
"""
|
||||
{"data": [
|
||||
{"attributes": {"message": "message1","tags": ["tag1"]},"id": "1"},
|
||||
{"attributes": {"message": "message2","tags": ["tag2"]},"id": "2"}]}
|
||||
"""
|
||||
),
|
||||
)
|
||||
|
||||
def _metadata_func(record: dict, metadata: dict) -> dict:
|
||||
metadata["id"] = record.get("id")
|
||||
metadata["tags"] = record["attributes"].get("tags")
|
||||
return metadata
|
||||
|
||||
loader = JSONLoader(
|
||||
file_path=file_path,
|
||||
jq_schema=".data[]",
|
||||
content_key=".attributes.message",
|
||||
is_content_key_jq_parsable=True,
|
||||
metadata_func=_metadata_func,
|
||||
)
|
||||
result = loader.load()
|
||||
|
||||
assert result == expected_docs
|
||||
|
Loading…
Reference in New Issue
Block a user