mirror of https://github.com/hwchase17/langchain.git
synced 2025-06-18 21:09:00 +00:00

Harrison/json loader fix (#4686)

Co-authored-by: Triet Le <112841660+triet-lq-holistics@users.noreply.github.com>

This commit is contained in:
parent ed8207b2fb
commit cdc20d1203
langchain/document_loaders/json_loader.py

@@ -1,7 +1,7 @@
 """Loader that loads data from JSON."""
 import json
 from pathlib import Path
-from typing import Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -23,6 +23,7 @@ class JSONLoader(BaseLoader):
         jq_schema: str,
         content_key: Optional[str] = None,
         metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
+        text_content: bool = True,
     ):
         """Initialize the JSONLoader.
 
@@ -35,6 +36,8 @@ class JSONLoader(BaseLoader):
             metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
                 object extracted by the jq_schema and the default metadata and returns
                 a dict of the updated metadata.
+            text_content (bool): Boolean flag to indicate whether the content is in
+                string format, defaults to True.
         """
         try:
             import jq  # noqa:F401
@@ -47,16 +50,58 @@ class JSONLoader(BaseLoader):
         self._jq_schema = jq.compile(jq_schema)
         self._content_key = content_key
         self._metadata_func = metadata_func
+        self._text_content = text_content
 
     def load(self) -> List[Document]:
         """Load and return documents from the JSON file."""
 
         data = self._jq_schema.input(json.loads(self.file_path.read_text()))
 
         # Perform some validation
         # This is not a perfect validation, but it should catch most cases
         # and prevent the user from getting a cryptic error later on.
         if self._content_key is not None:
+            self._validate_content_key(data)
+
+        docs = []
+        for i, sample in enumerate(data, 1):
+            metadata = dict(
+                source=str(self.file_path),
+                seq_num=i,
+            )
+            text = self._get_text(sample=sample, metadata=metadata)
+            docs.append(Document(page_content=text, metadata=metadata))
+
+        return docs
+
+    def _get_text(self, sample: Any, metadata: dict) -> str:
+        """Convert sample to string format"""
+        if self._content_key is not None:
+            content = sample.get(self._content_key)
+            if self._metadata_func is not None:
+                # We pass in the metadata dict to the metadata_func
+                # so that the user can customize the default metadata
+                # based on the content of the JSON object.
+                metadata = self._metadata_func(sample, metadata)
+        else:
+            content = sample
+
+        if self._text_content and not isinstance(content, str):
+            raise ValueError(
+                f"Expected page_content is string, got {type(content)} instead. \
+                    Set `text_content=False` if the desired input for \
+                    `page_content` is not a string"
+            )
+
+        # In case the text is None, set it to an empty string
+        elif isinstance(content, str):
+            return content
+        elif isinstance(content, dict):
+            return json.dumps(content) if content else ""
+        else:
+            return str(content) if content is not None else ""
+
+    def _validate_content_key(self, data: Any) -> None:
+        """Check if content key is valid"""
         sample = data.first()
         if not isinstance(sample, dict):
             raise ValueError(
@@ -77,28 +122,3 @@ class JSONLoader(BaseLoader):
                 f"Expected the metadata_func to return a dict but got \
                     `{type(sample_metadata)}`"
             )
-
-        docs = []
-
-        for i, sample in enumerate(data, 1):
-            metadata = dict(
-                source=str(self.file_path),
-                seq_num=i,
-            )
-
-            if self._content_key is not None:
-                text = sample.get(self._content_key)
-                if self._metadata_func is not None:
-                    # We pass in the metadata dict to the metadata_func
-                    # so that the user can customize the default metadata
-                    # based on the content of the JSON object.
-                    metadata = self._metadata_func(sample, metadata)
-            else:
-                text = sample
-
-            # In case the text is None, set it to an empty string
-            text = text or ""
-
-            docs.append(Document(page_content=text, metadata=metadata))
-
-        return docs
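For orientation, here is a minimal usage sketch of the behavior this diff introduces; the file name, JSON contents, and metadata_func below are hypothetical examples, not part of the commit:

    import json
    from pathlib import Path

    from langchain.document_loaders.json_loader import JSONLoader

    # Hypothetical input file: a JSON array of objects.
    Path("data.json").write_text(
        json.dumps([{"title": "first", "views": 10}, {"title": "second", "views": 20}])
    )

    # With the new default text_content=True, a jq result that is not a string
    # raises ValueError instead of being silently coerced via `text or ""`.
    # text_content=False opts in to conversion: dicts via json.dumps,
    # bools/numbers via str, and None becomes "".
    loader = JSONLoader(file_path="data.json", jq_schema=".[]", text_content=False)
    docs = loader.load()
    # docs[0].page_content == '{"title": "first", "views": 10}'
    # docs[0].metadata == {"source": "data.json", "seq_num": 1}

    # metadata_func (pre-existing, unchanged by this commit) receives each jq
    # result plus the default metadata and returns the updated dict; note it
    # only runs when content_key is set.
    def add_views(record: dict, metadata: dict) -> dict:
        metadata["views"] = record["views"]
        return metadata

    loader = JSONLoader(
        file_path="data.json",
        jq_schema=".[]",
        content_key="title",
        metadata_func=add_views,
    )
    docs = loader.load()
    # docs[0].page_content == "first"; docs[0].metadata["views"] == 10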
poetry.lock (generated): 14 changed lines

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -9994,18 +9994,18 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "hnswlib", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "protobuf", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
+all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"]
-azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
+azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
 cohere = ["cohere"]
 embeddings = ["sentence-transformers"]
-extended-testing = ["pdfminer-six", "pypdf", "tqdm"]
+extended-testing = ["pypdf", "pdfminer-six", "tqdm", "jq"]
-hnswlib = ["docarray", "hnswlib", "protobuf"]
+hnswlib = ["docarray", "protobuf", "hnswlib"]
 in-memory-store = ["docarray"]
-llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
+llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
 openai = ["openai", "tiktoken"]
 qdrant = ["qdrant-client"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "6d5c4aa06539e6f7c7531c30d73cbf08fbdea75486bf4b81c106b9e678a13b45"
+content-hash = "42b518704c39bc25c6da05f81a9488a9a6fecfd7784b3c9915d30127ce384a63"
pyproject.toml

@@ -171,7 +171,7 @@ azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
 all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"]
 # An extra used to be able to add extended testing.
 extended_testing = [
-    "pypdf", "pdfminer.six", "tqdm"
+    "pypdf", "pdfminer.six", "tqdm", "jq"
 ]
 
 [tool.ruff]
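The `jq` entries added to the extras above refer to the optional jq binding on PyPI that the loader compiles schemas with. A small sketch of the API calls the loader relies on (the input values here are illustrative):

    import jq  # optional dependency; install separately if needed

    program = jq.compile(".[].text")             # same call as JSONLoader.__init__
    results = program.input([{"text": "a"}, {"text": "b"}])
    assert results.first() == "a"                # what _validate_content_key uses
    assert list(results) == ["a", "b"]           # load() iterates the same object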
tests/unit_tests/document_loader/test_json_loader.py (new file): 123 lines

@@ -0,0 +1,123 @@
+import pytest
+from pytest import raises
+from pytest_mock import MockerFixture
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.json_loader import JSONLoader
+
+
+@pytest.mark.requires("jq")
+def test_load_valid_string_content(mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="value1",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="value2",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+    ]
+    mocker.patch("builtins.open", mocker.mock_open())
+    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
+    mock_csv_reader.return_value = '[{"text": "value1"}, {"text": "value2"}]'
+
+    loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True)
+    result = loader.load()
+
+    assert result == expected_docs
+
+
+@pytest.mark.requires("jq")
+def test_load_valid_dict_content(mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content='{"text": "value1"}',
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content='{"text": "value2"}',
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+    ]
+    mocker.patch("builtins.open", mocker.mock_open())
+    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
+    mock_csv_reader.return_value = """
+    [{"text": "value1"}, {"text": "value2"}]
+    """
+
+    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
+    result = loader.load()
+
+    assert result == expected_docs
+
+
+@pytest.mark.requires("jq")
+def test_load_valid_bool_content(mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="False",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="True",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+    ]
+    mocker.patch("builtins.open", mocker.mock_open())
+    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
+    mock_csv_reader.return_value = """
+    [
+        {"flag": false}, {"flag": true}
+    ]
+    """
+
+    loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False)
+    result = loader.load()
+
+    assert result == expected_docs
+
+
+@pytest.mark.requires("jq")
+def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="99",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="99.5",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+    ]
+    mocker.patch("builtins.open", mocker.mock_open())
+    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
+    mock_csv_reader.return_value = """
+    [
+        {"num": 99}, {"num": 99.5}
+    ]
+    """
+
+    loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False)
+    result = loader.load()
+
+    assert result == expected_docs
+
+
+@pytest.mark.requires("jq")
+def test_load_invalid_test_content(mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    mocker.patch("builtins.open", mocker.mock_open())
+    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
+    mock_csv_reader.return_value = """
+    [{"text": "value1"}, {"text": "value2"}]
+    """
+
+    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True)
+
+    with raises(ValueError):
+        loader.load()
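To run the new file locally, a standard pytest invocation should work; note that requires("jq") is the repo's marker convention for tests that need an optional package, so the jq binding must be installed for these tests to run rather than be skipped:

    pytest tests/unit_tests/document_loader/test_json_loader.py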