community: fixes json loader not getting texts with json standard (#27327)

This PR fixes JSONLoader._get_text not converting objects to JSON strings
correctly.
If an object is serializable but is not a dict, JSONLoader uses Python's
built-in str() function to convert it to a string. The resulting string
may not follow the JSON standard. For example, a list will be converted
to a string with single quotes, and if json.loads tries to load this
string, it will raise an error.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Botong Zhu 2024-12-13 03:33:45 +08:00 committed by GitHub
parent 4149c0dd8d
commit 13c3c4a210
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 14 additions and 13 deletions

View File

@ -188,7 +188,7 @@ class JSONLoader(BaseLoader):
# In case the text is None, set it to an empty string
elif isinstance(content, str):
return content
elif isinstance(content, dict):
elif isinstance(content, (dict, list)):
return json.dumps(content) if content else ""
else:
return str(content) if content is not None else ""

View File

@ -1,4 +1,5 @@
import io
from pathlib import Path
from typing import Any, Dict
import pytest
@ -12,7 +13,7 @@ pytestmark = pytest.mark.requires("jq")
def test_load_valid_string_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
@ -37,7 +38,7 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
def test_load_valid_dict_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content='{"text": "value1"}',
@ -64,7 +65,7 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
def test_load_valid_bool_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="False",
@ -93,7 +94,7 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="99",
@ -122,7 +123,7 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
def test_load_invalid_test_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
mocker.patch("builtins.open", mocker.mock_open())
mocker.patch(
@ -139,7 +140,7 @@ def test_load_invalid_test_content(mocker: MockerFixture) -> None:
def test_load_jsonlines(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
@ -177,7 +178,7 @@ def test_load_jsonlines(mocker: MockerFixture) -> None:
),
)
def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
@ -250,7 +251,7 @@ def test_json_meta_01(
mocker.patch("builtins.open", mocker.mock_open())
mocker.patch(patch_func, return_value=patch_func_value)
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
@ -300,7 +301,7 @@ def test_json_meta_02(
mocker.patch("builtins.open", mocker.mock_open())
mocker.patch(patch_func, return_value=patch_func_value)
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
@ -336,7 +337,7 @@ def test_json_meta_02(
def test_load_json_with_jq_parsable_content_key(
params: Dict, mocker: MockerFixture
) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="value1",
@ -364,7 +365,7 @@ def test_load_json_with_jq_parsable_content_key(
def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="message1",
@ -401,7 +402,7 @@ def test_load_json_with_nested_jq_parsable_content_key(mocker: MockerFixture) ->
def test_load_json_with_nested_jq_parsable_content_key_with_metadata(
mocker: MockerFixture,
) -> None:
file_path = "/workspaces/langchain/test.json"
file_path = str(Path("/workspaces/langchain/test.json").resolve())
expected_docs = [
Document(
page_content="message1",