community: Implement lazy_load() for JSONLoader (#18643)

Covered by `tests/unit_tests/document_loaders/test_json_loader.py`
This commit is contained in:
Christophe Bornet 2024-03-08 19:58:17 +01:00 committed by GitHub
parent a88f62ec3c
commit ead2a74806
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,6 +1,6 @@
import json import json
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union from typing import Any, Callable, Dict, Iterator, Optional, Union
from langchain_core.documents import Document from langchain_core.documents import Document
@@ -67,20 +67,23 @@ class JSONLoader(BaseLoader):
self._text_content = text_content self._text_content = text_content
self._json_lines = json_lines self._json_lines = json_lines
def load(self) -> List[Document]: def lazy_load(self) -> Iterator[Document]:
"""Load and return documents from the JSON file.""" """Load and return documents from the JSON file."""
docs: List[Document] = [] index = 0
if self._json_lines: if self._json_lines:
with self.file_path.open(encoding="utf-8") as f: with self.file_path.open(encoding="utf-8") as f:
for line in f: for line in f:
line = line.strip() line = line.strip()
if line: if line:
self._parse(line, docs) for doc in self._parse(line, index):
yield doc
index += 1
else: else:
self._parse(self.file_path.read_text(encoding="utf-8"), docs) for doc in self._parse(self.file_path.read_text(encoding="utf-8"), index):
return docs yield doc
index += 1
def _parse(self, content: str, docs: List[Document]) -> None: def _parse(self, content: str, index: int) -> Iterator[Document]:
"""Convert given content to documents.""" """Convert given content to documents."""
data = self._jq_schema.input(json.loads(content)) data = self._jq_schema.input(json.loads(content))
@@ -92,12 +95,12 @@ class JSONLoader(BaseLoader):
if self._metadata_func is not None: if self._metadata_func is not None:
self._validate_metadata_func(data) self._validate_metadata_func(data)
for i, sample in enumerate(data, len(docs) + 1): for i, sample in enumerate(data, index + 1):
text = self._get_text(sample=sample) text = self._get_text(sample=sample)
metadata = self._get_metadata( metadata = self._get_metadata(
sample=sample, source=str(self.file_path), seq_num=i sample=sample, source=str(self.file_path), seq_num=i
) )
docs.append(Document(page_content=text, metadata=metadata)) yield Document(page_content=text, metadata=metadata)
def _get_text(self, sample: Any) -> str: def _get_text(self, sample: Any) -> str:
"""Convert sample to string format""" """Convert sample to string format"""