mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 07:09:31 +00:00
community: Implement lazy_load() for JSONLoader (#18643)
This change is covered by `tests/unit_tests/document_loaders/test_json_loader.py`.
This commit is contained in:
parent
a88f62ec3c
commit
ead2a74806
@@ -1,6 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, Dict, List, Optional, Union
|
from typing import Any, Callable, Dict, Iterator, Optional, Union
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@@ -67,20 +67,23 @@ class JSONLoader(BaseLoader):
|
|||||||
self._text_content = text_content
|
self._text_content = text_content
|
||||||
self._json_lines = json_lines
|
self._json_lines = json_lines
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load and return documents from the JSON file."""
|
"""Load and return documents from the JSON file."""
|
||||||
docs: List[Document] = []
|
index = 0
|
||||||
if self._json_lines:
|
if self._json_lines:
|
||||||
with self.file_path.open(encoding="utf-8") as f:
|
with self.file_path.open(encoding="utf-8") as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line:
|
if line:
|
||||||
self._parse(line, docs)
|
for doc in self._parse(line, index):
|
||||||
|
yield doc
|
||||||
|
index += 1
|
||||||
else:
|
else:
|
||||||
self._parse(self.file_path.read_text(encoding="utf-8"), docs)
|
for doc in self._parse(self.file_path.read_text(encoding="utf-8"), index):
|
||||||
return docs
|
yield doc
|
||||||
|
index += 1
|
||||||
|
|
||||||
def _parse(self, content: str, docs: List[Document]) -> None:
|
def _parse(self, content: str, index: int) -> Iterator[Document]:
|
||||||
"""Convert given content to documents."""
|
"""Convert given content to documents."""
|
||||||
data = self._jq_schema.input(json.loads(content))
|
data = self._jq_schema.input(json.loads(content))
|
||||||
|
|
||||||
@@ -92,12 +95,12 @@ class JSONLoader(BaseLoader):
|
|||||||
if self._metadata_func is not None:
|
if self._metadata_func is not None:
|
||||||
self._validate_metadata_func(data)
|
self._validate_metadata_func(data)
|
||||||
|
|
||||||
for i, sample in enumerate(data, len(docs) + 1):
|
for i, sample in enumerate(data, index + 1):
|
||||||
text = self._get_text(sample=sample)
|
text = self._get_text(sample=sample)
|
||||||
metadata = self._get_metadata(
|
metadata = self._get_metadata(
|
||||||
sample=sample, source=str(self.file_path), seq_num=i
|
sample=sample, source=str(self.file_path), seq_num=i
|
||||||
)
|
)
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
yield Document(page_content=text, metadata=metadata)
|
||||||
|
|
||||||
def _get_text(self, sample: Any) -> str:
|
def _get_text(self, sample: Any) -> str:
|
||||||
"""Convert sample to string format"""
|
"""Convert sample to string format"""
|
||||||
|
Loading…
Reference in New Issue
Block a user