From ead2a7480604666d2367ede835253b638af6ac5f Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Fri, 8 Mar 2024 19:58:17 +0100 Subject: [PATCH] community: Implement lazy_load() for JSONLoader (#18643) Covered by `tests/unit_tests/document_loaders/test_json_loader.py` --- .../document_loaders/json_loader.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/json_loader.py b/libs/community/langchain_community/document_loaders/json_loader.py index f828873073d..5acd6c8e657 100644 --- a/libs/community/langchain_community/document_loaders/json_loader.py +++ b/libs/community/langchain_community/document_loaders/json_loader.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, Iterator, Optional, Union from langchain_core.documents import Document @@ -67,20 +67,23 @@ class JSONLoader(BaseLoader): self._text_content = text_content self._json_lines = json_lines - def load(self) -> List[Document]: + def lazy_load(self) -> Iterator[Document]: """Load and return documents from the JSON file.""" - docs: List[Document] = [] + index = 0 if self._json_lines: with self.file_path.open(encoding="utf-8") as f: for line in f: line = line.strip() if line: - self._parse(line, docs) + for doc in self._parse(line, index): + yield doc + index += 1 else: - self._parse(self.file_path.read_text(encoding="utf-8"), docs) - return docs + for doc in self._parse(self.file_path.read_text(encoding="utf-8"), index): + yield doc + index += 1 - def _parse(self, content: str, docs: List[Document]) -> None: + def _parse(self, content: str, index: int) -> Iterator[Document]: """Convert given content to documents.""" data = self._jq_schema.input(json.loads(content)) @@ -92,12 +95,12 @@ class JSONLoader(BaseLoader): if self._metadata_func is not None: self._validate_metadata_func(data) - for i, sample in enumerate(data, len(docs) + 1): + for i, sample in enumerate(data, index + 1): text = self._get_text(sample=sample) metadata = self._get_metadata( sample=sample, source=str(self.file_path), seq_num=i ) - docs.append(Document(page_content=text, metadata=metadata)) + yield Document(page_content=text, metadata=metadata) def _get_text(self, sample: Any) -> str: """Convert sample to string format"""