Add JSON Lines support to JSONLoader (#6913)
**Description**: The JSON Lines format is used by services such as OpenAI and Hugging Face. It is also a convenient alternative to CSV. This PR adds JSON Lines support to `JSONLoader` and updates the related tests. **Tag maintainer**: @rlancemartin, @eyurtsev. P.S. I was not able to build the docs locally, so I didn't update the related section.
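For reference, the gist of the new option in use (mirroring the docs example added below; the file path is illustrative):

```python
from langchain.document_loaders import JSONLoader

# With json_lines=True the loader reads one JSON value per line and applies
# jq_schema to each parsed line, producing one document per line.
loader = JSONLoader(
    file_path="./example_data/facebook_chat_messages.jsonl",  # illustrative path
    jq_schema=".content",
    json_lines=True,
)
docs = loader.load()
```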
parent 153b56d19b
commit 6d15854cda
@@ -2,6 +2,8 @@

>[JSON (JavaScript Object Notation)](https://en.wikipedia.org/wiki/JSON) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute–value pairs and arrays (or other serializable values).

>[JSON Lines](https://jsonlines.org/) is a file format where each line is a valid JSON value.

import Example from "@snippets/modules/data_connection/document_loaders/how_to/json.mdx"

<Example/>
@@ -0,0 +1,3 @@
{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}
{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no worries! Bye"}
{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im sorry it was my mistake, the blue one is not for sale"}
@@ -78,11 +78,14 @@ pprint(data)

</CodeOutputBlock>


## Using `JSONLoader`

Suppose we are interested in extracting the values under the `content` field within the `messages` key of the JSON data. This can easily be done through the `JSONLoader` as shown below.


### JSON file

```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat.json',
@@ -114,6 +117,81 @@ pprint(data)

</CodeOutputBlock>


### JSON Lines file

If you want to load documents from a JSON Lines file, you pass `json_lines=True`
and specify `jq_schema` to extract `page_content` from a single JSON object.

```python
file_path = './example_data/facebook_chat_messages.jsonl'
pprint(Path(file_path).read_text())
```

<CodeOutputBlock lang="python">

```
('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
 '{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
 'worries! Bye"}\n'
 '{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
 'sorry it was my mistake, the blue one is not for sale"}\n')
```

</CodeOutputBlock>


```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.content',
    json_lines=True)

data = loader.load()
```

```python
pprint(data)
```

<CodeOutputBlock lang="python">

```
[Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
 Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
 Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
```

</CodeOutputBlock>


Another option is to set `jq_schema='.'` and provide `content_key`:

```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.',
    content_key='sender_name',
    json_lines=True)

data = loader.load()
```

```python
pprint(data)
```

<CodeOutputBlock lang="python">

```
[Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
 Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
 Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
```

</CodeOutputBlock>


## Extracting metadata

Generally, we want to include metadata available in the JSON file in the documents that we create from the content.
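For instance, a `metadata_func` receives the record extracted by `jq_schema` together with the default metadata and returns the updated dict. A minimal sketch using the chat fields from the JSON Lines file above (illustrative, not the exact example from the full docs page):

```python
# Illustrative sketch: copy the sender and timestamp of each chat record
# into the document metadata. metadata_func receives the record extracted
# by jq_schema and the default metadata ({"source": ..., "seq_num": ...}).
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["sender_name"] = record.get("sender_name")
    metadata["timestamp_ms"] = record.get("timestamp_ms")
    return metadata


loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.',
    content_key='content',
    metadata_func=metadata_func,
    json_lines=True)

data = loader.load()
```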
@@ -23,11 +23,12 @@ class JSONLoader(BaseLoader):
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
+        json_lines: bool = False,
    ):
        """Initialize the JSONLoader.

        Args:
-            file_path (Union[str, Path]): The path to the JSON file.
+            file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
            jq_schema (str): The jq schema to use to extract the data or text from
                the JSON.
            content_key (str): The key to use to extract the content from the JSON if
@@ -35,8 +36,10 @@ class JSONLoader(BaseLoader):
            metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
                object extracted by the jq_schema and the default metadata and returns
                a dict of the updated metadata.
-            text_content (bool): Boolean flag to indicates whether the content is in
-                string format, default to True
+            text_content (bool): Boolean flag to indicate whether the content is in
+                string format, default to True.
+            json_lines (bool): Boolean flag to indicate whether the input is in
+                JSON Lines format.
        """
        try:
            import jq  # noqa:F401
@@ -50,10 +53,24 @@ class JSONLoader(BaseLoader):
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content
+        self._json_lines = json_lines

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file."""
-        data = self._jq_schema.input(json.loads(self.file_path.read_text()))
+        docs: List[Document] = []
+        if self._json_lines:
+            with self.file_path.open(encoding="utf-8") as f:
+                for line in f:
+                    line = line.strip()
+                    if line:
+                        self._parse(line, docs)
+        else:
+            self._parse(self.file_path.read_text(), docs)
+        return docs
+
+    def _parse(self, content: str, docs: List[Document]) -> None:
+        """Convert given content to documents."""
+        data = self._jq_schema.input(json.loads(content))

        # Perform some validation
        # This is not a perfect validation, but it should catch most cases
@@ -61,8 +78,7 @@ class JSONLoader(BaseLoader):
        if self._content_key is not None:
            self._validate_content_key(data)

-        docs = []
-        for i, sample in enumerate(data, 1):
+        for i, sample in enumerate(data, len(docs) + 1):
            metadata = dict(
                source=str(self.file_path),
                seq_num=i,
@@ -70,8 +86,6 @@ class JSONLoader(BaseLoader):
            text = self._get_text(sample=sample, metadata=metadata)
            docs.append(Document(page_content=text, metadata=metadata))

-        return docs
-
    def _get_text(self, sample: Any, metadata: dict) -> str:
        """Convert sample to string format"""
        if self._content_key is not None:
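A note on the `enumerate(data, len(docs) + 1)` change above: because `_parse` is now called once per JSON Lines entry, starting the counter at `len(docs) + 1` keeps `seq_num` increasing across lines instead of restarting at 1 for every line. A toy sketch of the effect (not the library code):

```python
import json

# Toy sketch: each "line" is parsed separately, but enumeration starts at
# len(docs) + 1, so the sequence number continues where the previous line
# left off.
docs = []
for line in ['[{"x": 1}]', '[{"x": 2}, {"x": 3}]']:
    data = json.loads(line)  # stand-in for the jq_schema extraction step
    for i, sample in enumerate(data, len(docs) + 1):
        docs.append((i, sample))

print(docs)  # [(1, {'x': 1}), (2, {'x': 2}), (3, {'x': 3})]
```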
@@ -1,3 +1,6 @@
+import io
+from typing import Any, Dict
+
import pytest
from pytest import raises
from pytest_mock import MockerFixture
@@ -5,8 +8,9 @@ from pytest_mock import MockerFixture
from langchain.docstore.document import Document
from langchain.document_loaders.json_loader import JSONLoader

+pytestmark = pytest.mark.requires("jq")


-@pytest.mark.requires("jq")
def test_load_valid_string_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@@ -19,9 +23,12 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = '[{"text": "value1"}, {"text": "value2"}]'
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value='[{"text": "value1"}, {"text": "value2"}]',
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True)
    result = loader.load()
@@ -29,7 +36,6 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


-@pytest.mark.requires("jq")
def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@@ -42,11 +48,14 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = """
-        [{"text": "value1"}, {"text": "value2"}]
-    """
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value="""
+        [{"text": "value1"}, {"text": "value2"}]
+        """,
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
    result = loader.load()
@@ -54,7 +63,6 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


-@pytest.mark.requires("jq")
def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@@ -67,13 +75,16 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = """
-        [
-            {"flag": false}, {"flag": true}
-        ]
-    """
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value="""
+        [
+            {"flag": false}, {"flag": true}
+        ]
+        """,
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False)
    result = loader.load()
@@ -81,7 +92,6 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


-@pytest.mark.requires("jq")
def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@@ -94,13 +104,16 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = """
-        [
-            {"num": 99}, {"num": 99.5}
-        ]
-    """
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value="""
+        [
+            {"num": 99}, {"num": 99.5}
+        ]
+        """,
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False)
    result = loader.load()
@@ -108,16 +121,152 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


-@pytest.mark.requires("jq")
def test_load_invalid_test_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"

    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = """
-        [{"text": "value1"}, {"text": "value2"}]
-    """
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value="""
+        [{"text": "value1"}, {"text": "value2"}]
+        """,
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True)

    with raises(ValueError):
        loader.load()


+def test_load_jsonlines(mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="value1",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="value2",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+    ]

+    mocker.patch(
+        "pathlib.Path.open",
+        return_value=io.StringIO(
+            """
+            {"text": "value1"}
+            {"text": "value2"}
+            """
+        ),
+    )

+    loader = JSONLoader(
+        file_path=file_path, jq_schema=".", content_key="text", json_lines=True
+    )
+    result = loader.load()

+    assert result == expected_docs


+@pytest.mark.parametrize(
+    "params",
+    (
+        {"jq_schema": ".[].text"},
+        {"jq_schema": ".[]", "content_key": "text"},
+    ),
+)
+def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="value1",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="value2",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+        Document(
+            page_content="value3",
+            metadata={"source": file_path, "seq_num": 3},
+        ),
+        Document(
+            page_content="value4",
+            metadata={"source": file_path, "seq_num": 4},
+        ),
+    ]

+    mocker.patch(
+        "pathlib.Path.open",
+        return_value=io.StringIO(
+            """
+            [{"text": "value1"}, {"text": "value2"}]
+            [{"text": "value3"}, {"text": "value4"}]
+            """
+        ),
+    )

+    loader = JSONLoader(file_path=file_path, json_lines=True, **params)
+    result = loader.load()

+    assert result == expected_docs


+def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
+    mocker.patch("pathlib.Path.open", return_value=io.StringIO(""))

+    loader = JSONLoader(file_path="file_path", jq_schema=".[].text", json_lines=True)
+    result = loader.load()

+    assert result == []


+@pytest.mark.parametrize(
+    "patch_func,patch_func_value,kwargs",
+    (
+        # JSON content.
+        (
+            "pathlib.Path.read_text",
+            '[{"text": "value1"}, {"text": "value2"}]',
+            {"jq_schema": ".[]", "content_key": "text"},
+        ),
+        # JSON Lines content.
+        (
+            "pathlib.Path.open",
+            io.StringIO(
+                """
+                {"text": "value1"}
+                {"text": "value2"}
+                """
+            ),
+            {"jq_schema": ".", "content_key": "text", "json_lines": True},
+        ),
+    ),
+)
+def test_json_meta(
+    patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
+) -> None:
+    mocker.patch("builtins.open", mocker.mock_open())
+    mocker.patch(patch_func, return_value=patch_func_value)

+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="value1",
+            metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"},
+        ),
+        Document(
+            page_content="value2",
+            metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"},
+        ),
+    ]

+    def metadata_func(record: Dict, metadata: Dict) -> Dict:
+        metadata["x"] = f"{record['text']}-meta"
+        return metadata

+    loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs)
+    result = loader.load()

+    assert result == expected_docs