diff --git a/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx b/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx index ecd849bfbe4..8cc600170b2 100644 --- a/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx +++ b/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx @@ -2,6 +2,8 @@ >[JSON (JavaScript Object Notation)](https://en.wikipedia.org/wiki/JSON) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute–value pairs and arrays (or other serializable values). +>[JSON Lines](https://jsonlines.org/) is a file format where each line is a valid JSON value. + import Example from "@snippets/modules/data_connection/document_loaders/how_to/json.mdx" diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/example_data/facebook_chat_messages.jsonl b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/facebook_chat_messages.jsonl new file mode 100644 index 00000000000..215d2bbaa40 --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/facebook_chat_messages.jsonl @@ -0,0 +1,3 @@ +{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"} +{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no worries! 
Bye"} +{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im sorry it was my mistake, the blue one is not for sale"} diff --git a/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx b/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx index 307720ae3e4..7b56867047b 100644 --- a/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx +++ b/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx @@ -78,11 +78,14 @@ pprint(data) + ## Using `JSONLoader` Suppose we are interested in extracting the values under the `content` field within the `messages` key of the JSON data. This can easily be done through the `JSONLoader` as shown below. +### JSON file + ```python loader = JSONLoader( file_path='./example_data/facebook_chat.json', @@ -114,6 +117,81 @@ pprint(data) + +### JSON Lines file + +If you want to load documents from a JSON Lines file, you pass `json_lines=True` +and specify `jq_schema` to extract `page_content` from a single JSON object. + +```python +file_path = './example_data/facebook_chat_messages.jsonl' +pprint(Path(file_path).read_text()) +``` + + + +``` + ('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n' + '{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no ' + 'worries! Bye"}\n' + '{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im ' + 'sorry it was my mistake, the blue one is not for sale"}\n') +``` + + + + +```python +loader = JSONLoader( + file_path='./example_data/facebook_chat_messages.jsonl', + jq_schema='.content', + json_lines=True) + +data = loader.load() +``` + +```python +pprint(data) +``` + + + +``` + [Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}), + Document(page_content='Oh no worries! 
Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}), + Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})] +``` + + + + +Another option is to set `jq_schema='.'` and provide `content_key`: + +```python +loader = JSONLoader( + file_path='./example_data/facebook_chat_messages.jsonl', + jq_schema='.', + content_key='sender_name', + json_lines=True) + +data = loader.load() +``` + +```python +pprint(data) +``` + + + +``` + [Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}), + Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}), + Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})] +``` + + + + ## Extracting metadata Generally, we want to include metadata available in the JSON file into the documents that we create from the content. diff --git a/langchain/document_loaders/json_loader.py b/langchain/document_loaders/json_loader.py index c31b9a4848f..9e793798e38 100644 --- a/langchain/document_loaders/json_loader.py +++ b/langchain/document_loaders/json_loader.py @@ -23,11 +23,12 @@ class JSONLoader(BaseLoader): content_key: Optional[str] = None, metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None, text_content: bool = True, + json_lines: bool = False, ): """Initialize the JSONLoader. Args: - file_path (Union[str, Path]): The path to the JSON file. + file_path (Union[str, Path]): The path to the JSON or JSON Lines file. 
jq_schema (str): The jq schema to use to extract the data or text from the JSON. content_key (str): The key to use to extract the content from the JSON if @@ -35,8 +36,10 @@ class JSONLoader(BaseLoader): metadata_func (Callable[Dict, Dict]): A function that takes in the JSON object extracted by the jq_schema and the default metadata and returns a dict of the updated metadata. - text_content (bool): Boolean flag to indicates whether the content is in - string format, default to True + text_content (bool): Boolean flag to indicate whether the content is in + string format, default to True. + json_lines (bool): Boolean flag to indicate whether the input is in + JSON Lines format. """ try: import jq # noqa:F401 @@ -50,10 +53,24 @@ class JSONLoader(BaseLoader): self._content_key = content_key self._metadata_func = metadata_func self._text_content = text_content + self._json_lines = json_lines def load(self) -> List[Document]: """Load and return documents from the JSON file.""" - data = self._jq_schema.input(json.loads(self.file_path.read_text())) + docs: List[Document] = [] + if self._json_lines: + with self.file_path.open(encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + self._parse(line, docs) + else: + self._parse(self.file_path.read_text(), docs) + return docs + + def _parse(self, content: str, docs: List[Document]) -> None: + """Convert given content to documents.""" + data = self._jq_schema.input(json.loads(content)) # Perform some validation # This is not a perfect validation, but it should catch most cases @@ -61,8 +78,7 @@ class JSONLoader(BaseLoader): if self._content_key is not None: self._validate_content_key(data) - docs = [] - for i, sample in enumerate(data, 1): + for i, sample in enumerate(data, len(docs) + 1): metadata = dict( source=str(self.file_path), seq_num=i, @@ -70,8 +86,6 @@ class JSONLoader(BaseLoader): text = self._get_text(sample=sample, metadata=metadata) docs.append(Document(page_content=text, metadata=metadata)) - 
return docs - def _get_text(self, sample: Any, metadata: dict) -> str: """Convert sample to string format""" if self._content_key is not None: diff --git a/tests/unit_tests/document_loaders/test_json_loader.py b/tests/unit_tests/document_loaders/test_json_loader.py index 31739d4dfe1..19fb90e0c90 100644 --- a/tests/unit_tests/document_loaders/test_json_loader.py +++ b/tests/unit_tests/document_loaders/test_json_loader.py @@ -1,3 +1,6 @@ +import io +from typing import Any, Dict + import pytest from pytest import raises from pytest_mock import MockerFixture @@ -5,8 +8,9 @@ from pytest_mock import MockerFixture from langchain.docstore.document import Document from langchain.document_loaders.json_loader import JSONLoader +pytestmark = pytest.mark.requires("jq") + -@pytest.mark.requires("jq") def test_load_valid_string_content(mocker: MockerFixture) -> None: file_path = "/workspaces/langchain/test.json" expected_docs = [ @@ -19,9 +23,12 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None: metadata={"source": file_path, "seq_num": 2}, ), ] + mocker.patch("builtins.open", mocker.mock_open()) - mock_csv_reader = mocker.patch("pathlib.Path.read_text") - mock_csv_reader.return_value = '[{"text": "value1"}, {"text": "value2"}]' + mocker.patch( + "pathlib.Path.read_text", + return_value='[{"text": "value1"}, {"text": "value2"}]', + ) loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True) result = loader.load() @@ -29,7 +36,6 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None: assert result == expected_docs -@pytest.mark.requires("jq") def test_load_valid_dict_content(mocker: MockerFixture) -> None: file_path = "/workspaces/langchain/test.json" expected_docs = [ @@ -42,11 +48,14 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None: metadata={"source": file_path, "seq_num": 2}, ), ] + mocker.patch("builtins.open", mocker.mock_open()) - mock_csv_reader = mocker.patch("pathlib.Path.read_text") - 
mock_csv_reader.return_value = """ - [{"text": "value1"}, {"text": "value2"}] - """ + mocker.patch( + "pathlib.Path.read_text", + return_value=""" + [{"text": "value1"}, {"text": "value2"}] + """, + ) loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False) result = loader.load() @@ -54,7 +63,6 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None: assert result == expected_docs -@pytest.mark.requires("jq") def test_load_valid_bool_content(mocker: MockerFixture) -> None: file_path = "/workspaces/langchain/test.json" expected_docs = [ @@ -67,13 +75,16 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None: metadata={"source": file_path, "seq_num": 2}, ), ] + mocker.patch("builtins.open", mocker.mock_open()) - mock_csv_reader = mocker.patch("pathlib.Path.read_text") - mock_csv_reader.return_value = """ - [ - {"flag": false}, {"flag": true} - ] - """ + mocker.patch( + "pathlib.Path.read_text", + return_value=""" + [ + {"flag": false}, {"flag": true} + ] + """, + ) loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False) result = loader.load() @@ -81,7 +92,6 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None: assert result == expected_docs -@pytest.mark.requires("jq") def test_load_valid_numeric_content(mocker: MockerFixture) -> None: file_path = "/workspaces/langchain/test.json" expected_docs = [ @@ -94,13 +104,16 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None: metadata={"source": file_path, "seq_num": 2}, ), ] + mocker.patch("builtins.open", mocker.mock_open()) - mock_csv_reader = mocker.patch("pathlib.Path.read_text") - mock_csv_reader.return_value = """ - [ - {"num": 99}, {"num": 99.5} - ] - """ + mocker.patch( + "pathlib.Path.read_text", + return_value=""" + [ + {"num": 99}, {"num": 99.5} + ] + """, + ) loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False) result = loader.load() @@ -108,16 +121,152 @@ def 
test_load_valid_numeric_content(mocker: MockerFixture) -> None: assert result == expected_docs -@pytest.mark.requires("jq") def test_load_invalid_test_content(mocker: MockerFixture) -> None: file_path = "/workspaces/langchain/test.json" + mocker.patch("builtins.open", mocker.mock_open()) - mock_csv_reader = mocker.patch("pathlib.Path.read_text") - mock_csv_reader.return_value = """ - [{"text": "value1"}, {"text": "value2"}] - """ + mocker.patch( + "pathlib.Path.read_text", + return_value=""" + [{"text": "value1"}, {"text": "value2"}] + """, + ) loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True) with raises(ValueError): loader.load() + + +def test_load_jsonlines(mocker: MockerFixture) -> None: + file_path = "/workspaces/langchain/test.json" + expected_docs = [ + Document( + page_content="value1", + metadata={"source": file_path, "seq_num": 1}, + ), + Document( + page_content="value2", + metadata={"source": file_path, "seq_num": 2}, + ), + ] + + mocker.patch( + "pathlib.Path.open", + return_value=io.StringIO( + """ + {"text": "value1"} + {"text": "value2"} + """ + ), + ) + + loader = JSONLoader( + file_path=file_path, jq_schema=".", content_key="text", json_lines=True + ) + result = loader.load() + + assert result == expected_docs + + +@pytest.mark.parametrize( + "params", + ( + {"jq_schema": ".[].text"}, + {"jq_schema": ".[]", "content_key": "text"}, + ), +) +def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None: + file_path = "/workspaces/langchain/test.json" + expected_docs = [ + Document( + page_content="value1", + metadata={"source": file_path, "seq_num": 1}, + ), + Document( + page_content="value2", + metadata={"source": file_path, "seq_num": 2}, + ), + Document( + page_content="value3", + metadata={"source": file_path, "seq_num": 3}, + ), + Document( + page_content="value4", + metadata={"source": file_path, "seq_num": 4}, + ), + ] + + mocker.patch( + "pathlib.Path.open", + return_value=io.StringIO( + """ + 
[{"text": "value1"}, {"text": "value2"}] + [{"text": "value3"}, {"text": "value4"}] + """ + ), + ) + + loader = JSONLoader(file_path=file_path, json_lines=True, **params) + result = loader.load() + + assert result == expected_docs + + +def test_load_empty_jsonlines(mocker: MockerFixture) -> None: + mocker.patch("pathlib.Path.open", return_value=io.StringIO("")) + + loader = JSONLoader(file_path="file_path", jq_schema=".[].text", json_lines=True) + result = loader.load() + + assert result == [] + + +@pytest.mark.parametrize( + "patch_func,patch_func_value,kwargs", + ( + # JSON content. + ( + "pathlib.Path.read_text", + '[{"text": "value1"}, {"text": "value2"}]', + {"jq_schema": ".[]", "content_key": "text"}, + ), + # JSON Lines content. + ( + "pathlib.Path.open", + io.StringIO( + """ + {"text": "value1"} + {"text": "value2"} + """ + ), + {"jq_schema": ".", "content_key": "text", "json_lines": True}, + ), + ), +) +def test_json_meta( + patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture +) -> None: + mocker.patch("builtins.open", mocker.mock_open()) + mocker.patch(patch_func, return_value=patch_func_value) + + file_path = "/workspaces/langchain/test.json" + expected_docs = [ + Document( + page_content="value1", + metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"}, + ), + Document( + page_content="value2", + metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"}, + ), + ] + + def metadata_func(record: Dict, metadata: Dict) -> Dict: + metadata["x"] = f"{record['text']}-meta" + return metadata + + loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs) + result = loader.load() + + assert result == expected_docs