diff --git a/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx b/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx
index ecd849bfbe4..8cc600170b2 100644
--- a/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx
+++ b/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx
@@ -2,6 +2,8 @@
>[JSON (JavaScript Object Notation)](https://en.wikipedia.org/wiki/JSON) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute–value pairs and arrays (or other serializable values).
+>[JSON Lines](https://jsonlines.org/) is a file format where each line is a valid JSON value.
+
import Example from "@snippets/modules/data_connection/document_loaders/how_to/json.mdx"
diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/example_data/facebook_chat_messages.jsonl b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/facebook_chat_messages.jsonl
new file mode 100644
index 00000000000..215d2bbaa40
--- /dev/null
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/facebook_chat_messages.jsonl
@@ -0,0 +1,3 @@
+{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}
+{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no worries! Bye"}
+{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im sorry it was my mistake, the blue one is not for sale"}
diff --git a/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx b/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx
index 307720ae3e4..7b56867047b 100644
--- a/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx
+++ b/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx
@@ -78,11 +78,14 @@ pprint(data)
+
## Using `JSONLoader`
Suppose we are interested in extracting the values under the `content` field within the `messages` key of the JSON data. This can easily be done through the `JSONLoader` as shown below.
+### JSON file
+
```python
loader = JSONLoader(
file_path='./example_data/facebook_chat.json',
@@ -114,6 +117,81 @@ pprint(data)
+
+### JSON Lines file
+
+To load documents from a JSON Lines file, pass `json_lines=True` and specify a
+`jq_schema` that extracts `page_content` from a single JSON object (one per line).
+
+```python
+file_path = './example_data/facebook_chat_messages.jsonl'
+pprint(Path(file_path).read_text())
+```
+
+
+
+```
+ ('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
+ '{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
+ 'worries! Bye"}\n'
+ '{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
+ 'sorry it was my mistake, the blue one is not for sale"}\n')
+```
+
+
+
+
+```python
+loader = JSONLoader(
+ file_path='./example_data/facebook_chat_messages.jsonl',
+ jq_schema='.content',
+ json_lines=True)
+
+data = loader.load()
+```
+
+```python
+pprint(data)
+```
+
+
+
+```
+ [Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
+ Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
+ Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
+```
+
+
+
+
+Another option is to set `jq_schema='.'` and provide `content_key`:
+
+```python
+loader = JSONLoader(
+ file_path='./example_data/facebook_chat_messages.jsonl',
+ jq_schema='.',
+ content_key='sender_name',
+ json_lines=True)
+
+data = loader.load()
+```
+
+```python
+pprint(data)
+```
+
+
+
+```
+ [Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
+ Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
+ Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
+```
+
+
+
+
## Extracting metadata
Generally, we want to include metadata available in the JSON file into the documents that we create from the content.
diff --git a/langchain/document_loaders/json_loader.py b/langchain/document_loaders/json_loader.py
index c31b9a4848f..9e793798e38 100644
--- a/langchain/document_loaders/json_loader.py
+++ b/langchain/document_loaders/json_loader.py
@@ -23,11 +23,12 @@ class JSONLoader(BaseLoader):
content_key: Optional[str] = None,
metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
text_content: bool = True,
+ json_lines: bool = False,
):
"""Initialize the JSONLoader.
Args:
- file_path (Union[str, Path]): The path to the JSON file.
+ file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
jq_schema (str): The jq schema to use to extract the data or text from
the JSON.
content_key (str): The key to use to extract the content from the JSON if
@@ -35,8 +36,10 @@ class JSONLoader(BaseLoader):
metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
object extracted by the jq_schema and the default metadata and returns
a dict of the updated metadata.
- text_content (bool): Boolean flag to indicates whether the content is in
- string format, default to True
+ text_content (bool): Boolean flag to indicate whether the content is in
+ string format, default to True.
+ json_lines (bool): Boolean flag to indicate whether the input is in
+ JSON Lines format.
"""
try:
import jq # noqa:F401
@@ -50,10 +53,24 @@ class JSONLoader(BaseLoader):
self._content_key = content_key
self._metadata_func = metadata_func
self._text_content = text_content
+ self._json_lines = json_lines
def load(self) -> List[Document]:
"""Load and return documents from the JSON file."""
- data = self._jq_schema.input(json.loads(self.file_path.read_text()))
+ docs: List[Document] = []
+ if self._json_lines:
+ with self.file_path.open(encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ self._parse(line, docs)
+ else:
+ self._parse(self.file_path.read_text(), docs)
+ return docs
+
+ def _parse(self, content: str, docs: List[Document]) -> None:
+ """Convert given content to documents."""
+ data = self._jq_schema.input(json.loads(content))
# Perform some validation
# This is not a perfect validation, but it should catch most cases
@@ -61,8 +78,7 @@ class JSONLoader(BaseLoader):
if self._content_key is not None:
self._validate_content_key(data)
- docs = []
- for i, sample in enumerate(data, 1):
+ for i, sample in enumerate(data, len(docs) + 1):
metadata = dict(
source=str(self.file_path),
seq_num=i,
@@ -70,8 +86,6 @@ class JSONLoader(BaseLoader):
text = self._get_text(sample=sample, metadata=metadata)
docs.append(Document(page_content=text, metadata=metadata))
- return docs
-
def _get_text(self, sample: Any, metadata: dict) -> str:
"""Convert sample to string format"""
if self._content_key is not None:
diff --git a/tests/unit_tests/document_loaders/test_json_loader.py b/tests/unit_tests/document_loaders/test_json_loader.py
index 31739d4dfe1..19fb90e0c90 100644
--- a/tests/unit_tests/document_loaders/test_json_loader.py
+++ b/tests/unit_tests/document_loaders/test_json_loader.py
@@ -1,3 +1,6 @@
+import io
+from typing import Any, Dict
+
import pytest
from pytest import raises
from pytest_mock import MockerFixture
@@ -5,8 +8,9 @@ from pytest_mock import MockerFixture
from langchain.docstore.document import Document
from langchain.document_loaders.json_loader import JSONLoader
+pytestmark = pytest.mark.requires("jq")
+
-@pytest.mark.requires("jq")
def test_load_valid_string_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
expected_docs = [
@@ -19,9 +23,12 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
metadata={"source": file_path, "seq_num": 2},
),
]
+
mocker.patch("builtins.open", mocker.mock_open())
- mock_csv_reader = mocker.patch("pathlib.Path.read_text")
- mock_csv_reader.return_value = '[{"text": "value1"}, {"text": "value2"}]'
+ mocker.patch(
+ "pathlib.Path.read_text",
+ return_value='[{"text": "value1"}, {"text": "value2"}]',
+ )
loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True)
result = loader.load()
@@ -29,7 +36,6 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
assert result == expected_docs
-@pytest.mark.requires("jq")
def test_load_valid_dict_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
expected_docs = [
@@ -42,11 +48,14 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
metadata={"source": file_path, "seq_num": 2},
),
]
+
mocker.patch("builtins.open", mocker.mock_open())
- mock_csv_reader = mocker.patch("pathlib.Path.read_text")
- mock_csv_reader.return_value = """
- [{"text": "value1"}, {"text": "value2"}]
- """
+ mocker.patch(
+ "pathlib.Path.read_text",
+ return_value="""
+ [{"text": "value1"}, {"text": "value2"}]
+ """,
+ )
loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
result = loader.load()
@@ -54,7 +63,6 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
assert result == expected_docs
-@pytest.mark.requires("jq")
def test_load_valid_bool_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
expected_docs = [
@@ -67,13 +75,16 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
metadata={"source": file_path, "seq_num": 2},
),
]
+
mocker.patch("builtins.open", mocker.mock_open())
- mock_csv_reader = mocker.patch("pathlib.Path.read_text")
- mock_csv_reader.return_value = """
- [
- {"flag": false}, {"flag": true}
- ]
- """
+ mocker.patch(
+ "pathlib.Path.read_text",
+ return_value="""
+ [
+ {"flag": false}, {"flag": true}
+ ]
+ """,
+ )
loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False)
result = loader.load()
@@ -81,7 +92,6 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
assert result == expected_docs
-@pytest.mark.requires("jq")
def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
expected_docs = [
@@ -94,13 +104,16 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
metadata={"source": file_path, "seq_num": 2},
),
]
+
mocker.patch("builtins.open", mocker.mock_open())
- mock_csv_reader = mocker.patch("pathlib.Path.read_text")
- mock_csv_reader.return_value = """
- [
- {"num": 99}, {"num": 99.5}
- ]
- """
+ mocker.patch(
+ "pathlib.Path.read_text",
+ return_value="""
+ [
+ {"num": 99}, {"num": 99.5}
+ ]
+ """,
+ )
loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False)
result = loader.load()
@@ -108,16 +121,152 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
assert result == expected_docs
-@pytest.mark.requires("jq")
def test_load_invalid_test_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
+
mocker.patch("builtins.open", mocker.mock_open())
- mock_csv_reader = mocker.patch("pathlib.Path.read_text")
- mock_csv_reader.return_value = """
- [{"text": "value1"}, {"text": "value2"}]
- """
+ mocker.patch(
+ "pathlib.Path.read_text",
+ return_value="""
+ [{"text": "value1"}, {"text": "value2"}]
+ """,
+ )
loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True)
with raises(ValueError):
loader.load()
+
+
+def test_load_jsonlines(mocker: MockerFixture) -> None:
+ file_path = "/workspaces/langchain/test.json"
+ expected_docs = [
+ Document(
+ page_content="value1",
+ metadata={"source": file_path, "seq_num": 1},
+ ),
+ Document(
+ page_content="value2",
+ metadata={"source": file_path, "seq_num": 2},
+ ),
+ ]
+
+ mocker.patch(
+ "pathlib.Path.open",
+ return_value=io.StringIO(
+ """
+ {"text": "value1"}
+ {"text": "value2"}
+ """
+ ),
+ )
+
+ loader = JSONLoader(
+ file_path=file_path, jq_schema=".", content_key="text", json_lines=True
+ )
+ result = loader.load()
+
+ assert result == expected_docs
+
+
+@pytest.mark.parametrize(
+ "params",
+ (
+ {"jq_schema": ".[].text"},
+ {"jq_schema": ".[]", "content_key": "text"},
+ ),
+)
+def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None:
+ file_path = "/workspaces/langchain/test.json"
+ expected_docs = [
+ Document(
+ page_content="value1",
+ metadata={"source": file_path, "seq_num": 1},
+ ),
+ Document(
+ page_content="value2",
+ metadata={"source": file_path, "seq_num": 2},
+ ),
+ Document(
+ page_content="value3",
+ metadata={"source": file_path, "seq_num": 3},
+ ),
+ Document(
+ page_content="value4",
+ metadata={"source": file_path, "seq_num": 4},
+ ),
+ ]
+
+ mocker.patch(
+ "pathlib.Path.open",
+ return_value=io.StringIO(
+ """
+ [{"text": "value1"}, {"text": "value2"}]
+ [{"text": "value3"}, {"text": "value4"}]
+ """
+ ),
+ )
+
+ loader = JSONLoader(file_path=file_path, json_lines=True, **params)
+ result = loader.load()
+
+ assert result == expected_docs
+
+
+def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
+ mocker.patch("pathlib.Path.open", return_value=io.StringIO(""))
+
+ loader = JSONLoader(file_path="file_path", jq_schema=".[].text", json_lines=True)
+ result = loader.load()
+
+ assert result == []
+
+
+@pytest.mark.parametrize(
+ "patch_func,patch_func_value,kwargs",
+ (
+ # JSON content.
+ (
+ "pathlib.Path.read_text",
+ '[{"text": "value1"}, {"text": "value2"}]',
+ {"jq_schema": ".[]", "content_key": "text"},
+ ),
+ # JSON Lines content.
+ (
+ "pathlib.Path.open",
+ io.StringIO(
+ """
+ {"text": "value1"}
+ {"text": "value2"}
+ """
+ ),
+ {"jq_schema": ".", "content_key": "text", "json_lines": True},
+ ),
+ ),
+)
+def test_json_meta(
+ patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
+) -> None:
+ mocker.patch("builtins.open", mocker.mock_open())
+ mocker.patch(patch_func, return_value=patch_func_value)
+
+ file_path = "/workspaces/langchain/test.json"
+ expected_docs = [
+ Document(
+ page_content="value1",
+ metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"},
+ ),
+ Document(
+ page_content="value2",
+ metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"},
+ ),
+ ]
+
+ def metadata_func(record: Dict, metadata: Dict) -> Dict:
+ metadata["x"] = f"{record['text']}-meta"
+ return metadata
+
+ loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs)
+ result = loader.load()
+
+ assert result == expected_docs