mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-05 14:43:08 +00:00
Bugfix/jsonloader metadata (#9793)
Hi, - Description: - Solves the issue #6478. - Includes some additional rework on the `JSONLoader` class: - Getting metadata is decoupled from `_get_text` - Validating metadata_func is now performed by `_validate_metadata_func`, instead of `_validate_content_key` - Issue: #6478 - Dependencies: NA - Tag maintainer: @hwchase17
This commit is contained in:
parent
7d1b0fbe79
commit
33f43cc1b0
@ -76,24 +76,20 @@ class JSONLoader(BaseLoader):
|
|||||||
# and prevent the user from getting a cryptic error later on.
|
# and prevent the user from getting a cryptic error later on.
|
||||||
if self._content_key is not None:
|
if self._content_key is not None:
|
||||||
self._validate_content_key(data)
|
self._validate_content_key(data)
|
||||||
|
if self._metadata_func is not None:
|
||||||
|
self._validate_metadata_func(data)
|
||||||
|
|
||||||
for i, sample in enumerate(data, len(docs) + 1):
|
for i, sample in enumerate(data, len(docs) + 1):
|
||||||
metadata = dict(
|
text = self._get_text(sample=sample)
|
||||||
source=str(self.file_path),
|
metadata = self._get_metadata(
|
||||||
seq_num=i,
|
sample=sample, source=str(self.file_path), seq_num=i
|
||||||
)
|
)
|
||||||
text = self._get_text(sample=sample, metadata=metadata)
|
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
|
|
||||||
def _get_text(self, sample: Any, metadata: dict) -> str:
|
def _get_text(self, sample: Any) -> str:
|
||||||
"""Convert sample to string format"""
|
"""Convert sample to string format"""
|
||||||
if self._content_key is not None:
|
if self._content_key is not None:
|
||||||
content = sample.get(self._content_key)
|
content = sample.get(self._content_key)
|
||||||
if self._metadata_func is not None:
|
|
||||||
# We pass in the metadata dict to the metadata_func
|
|
||||||
# so that the user can customize the default metadata
|
|
||||||
# based on the content of the JSON object.
|
|
||||||
metadata = self._metadata_func(sample, metadata)
|
|
||||||
else:
|
else:
|
||||||
content = sample
|
content = sample
|
||||||
|
|
||||||
@ -112,6 +108,20 @@ class JSONLoader(BaseLoader):
|
|||||||
else:
|
else:
|
||||||
return str(content) if content is not None else ""
|
return str(content) if content is not None else ""
|
||||||
|
|
||||||
|
def _get_metadata(
|
||||||
|
self, sample: Dict[str, Any], **additional_fields: Any
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Return a metadata dictionary based on the existence of metadata_func
|
||||||
|
:param sample: single data payload
|
||||||
|
:param additional_fields: key-word arguments to be added as metadata values
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
if self._metadata_func is not None:
|
||||||
|
return self._metadata_func(sample, additional_fields)
|
||||||
|
else:
|
||||||
|
return additional_fields
|
||||||
|
|
||||||
def _validate_content_key(self, data: Any) -> None:
|
def _validate_content_key(self, data: Any) -> None:
|
||||||
"""Check if a content key is valid"""
|
"""Check if a content key is valid"""
|
||||||
sample = data.first()
|
sample = data.first()
|
||||||
@ -127,6 +137,10 @@ class JSONLoader(BaseLoader):
|
|||||||
with the key `{self._content_key}`"
|
with the key `{self._content_key}`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _validate_metadata_func(self, data: Any) -> None:
|
||||||
|
"""Check if the metadata_func output is valid"""
|
||||||
|
|
||||||
|
sample = data.first()
|
||||||
if self._metadata_func is not None:
|
if self._metadata_func is not None:
|
||||||
sample_metadata = self._metadata_func(sample, {})
|
sample_metadata = self._metadata_func(sample, {})
|
||||||
if not isinstance(sample_metadata, dict):
|
if not isinstance(sample_metadata, dict):
|
||||||
|
@ -244,7 +244,7 @@ def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
|
|||||||
),
|
),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
def test_json_meta(
|
def test_json_meta_01(
|
||||||
patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
|
patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
|
||||||
) -> None:
|
) -> None:
|
||||||
mocker.patch("builtins.open", mocker.mock_open())
|
mocker.patch("builtins.open", mocker.mock_open())
|
||||||
@ -270,3 +270,52 @@ def test_json_meta(
|
|||||||
result = loader.load()
|
result = loader.load()
|
||||||
|
|
||||||
assert result == expected_docs
|
assert result == expected_docs
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"patch_func,patch_func_value,kwargs",
|
||||||
|
(
|
||||||
|
# JSON content.
|
||||||
|
(
|
||||||
|
"pathlib.Path.read_text",
|
||||||
|
'[{"text": "value1"}, {"text": "value2"}]',
|
||||||
|
{"jq_schema": ".[]", "content_key": "text"},
|
||||||
|
),
|
||||||
|
# JSON Lines content.
|
||||||
|
(
|
||||||
|
"pathlib.Path.open",
|
||||||
|
io.StringIO(
|
||||||
|
"""
|
||||||
|
{"text": "value1"}
|
||||||
|
{"text": "value2"}
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
{"jq_schema": ".", "content_key": "text", "json_lines": True},
|
||||||
|
),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
def test_json_meta_02(
|
||||||
|
patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
|
||||||
|
) -> None:
|
||||||
|
mocker.patch("builtins.open", mocker.mock_open())
|
||||||
|
mocker.patch(patch_func, return_value=patch_func_value)
|
||||||
|
|
||||||
|
file_path = "/workspaces/langchain/test.json"
|
||||||
|
expected_docs = [
|
||||||
|
Document(
|
||||||
|
page_content="value1",
|
||||||
|
metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"},
|
||||||
|
),
|
||||||
|
Document(
|
||||||
|
page_content="value2",
|
||||||
|
metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
def metadata_func(record: Dict, metadata: Dict) -> Dict:
|
||||||
|
return {**metadata, "x": f"{record['text']}-meta"}
|
||||||
|
|
||||||
|
loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs)
|
||||||
|
result = loader.load()
|
||||||
|
|
||||||
|
assert result == expected_docs
|
||||||
|
Loading…
Reference in New Issue
Block a user