Hugging Face Loader: Add lazy load (#4799)

# Add lazy load to HF datasets loader

Unfortunately, there are no tests as far as I can tell. I verified the code manually.
This commit is contained in:
Eugene Yurtsev 2023-05-17 12:04:23 -04:00 committed by GitHub
parent a63ab7ded1
commit 2d20a1196e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,5 +1,5 @@
"""Loader that loads HuggingFace datasets.""" """Loader that loads HuggingFace datasets."""
from typing import List, Mapping, Optional, Sequence, Union from typing import Iterator, List, Mapping, Optional, Sequence, Union
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@ -23,8 +23,7 @@ class HuggingFaceDatasetLoader(BaseLoader):
use_auth_token: Optional[Union[bool, str]] = None, use_auth_token: Optional[Union[bool, str]] = None,
num_proc: Optional[int] = None, num_proc: Optional[int] = None,
): ):
""" """Initialize the HuggingFaceDatasetLoader.
Initialize the HuggingFaceDatasetLoader.
Args: Args:
path: Path or name of the dataset. path: Path or name of the dataset.
@ -50,8 +49,10 @@ class HuggingFaceDatasetLoader(BaseLoader):
self.use_auth_token = use_auth_token self.use_auth_token = use_auth_token
self.num_proc = num_proc self.num_proc = num_proc
def load(self) -> List[Document]: def lazy_load(
"""Load documents.""" self,
) -> Iterator[Document]:
"""Load documents lazily."""
try: try:
from datasets import load_dataset from datasets import load_dataset
except ImportError: except ImportError:
@ -72,13 +73,15 @@ class HuggingFaceDatasetLoader(BaseLoader):
num_proc=self.num_proc, num_proc=self.num_proc,
) )
docs = [ yield from (
Document( Document(
page_content=row.pop(self.page_content_column), page_content=row.pop(self.page_content_column),
metadata=row, metadata=row,
) )
for key in dataset.keys() for key in dataset.keys()
for row in dataset[key] for row in dataset[key]
] )
return docs def load(self) -> List[Document]:
"""Load documents."""
return list(self.lazy_load())