mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-06 13:18:12 +00:00
Hugging Face Loader: Add lazy load (#4799)
# Add lazy load to HF datasets loader. Unfortunately, there are no tests as far as I can tell. Verified code manually.
This commit is contained in:
parent
a63ab7ded1
commit
2d20a1196e
@ -1,5 +1,5 @@
|
|||||||
"""Loader that loads HuggingFace datasets."""
|
"""Loader that loads HuggingFace datasets."""
|
||||||
from typing import List, Mapping, Optional, Sequence, Union
|
from typing import Iterator, List, Mapping, Optional, Sequence, Union
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
@ -23,8 +23,7 @@ class HuggingFaceDatasetLoader(BaseLoader):
|
|||||||
use_auth_token: Optional[Union[bool, str]] = None,
|
use_auth_token: Optional[Union[bool, str]] = None,
|
||||||
num_proc: Optional[int] = None,
|
num_proc: Optional[int] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""Initialize the HuggingFaceDatasetLoader.
|
||||||
Initialize the HuggingFaceDatasetLoader.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
path: Path or name of the dataset.
|
path: Path or name of the dataset.
|
||||||
@ -50,8 +49,10 @@ class HuggingFaceDatasetLoader(BaseLoader):
|
|||||||
self.use_auth_token = use_auth_token
|
self.use_auth_token = use_auth_token
|
||||||
self.num_proc = num_proc
|
self.num_proc = num_proc
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(
|
||||||
"""Load documents."""
|
self,
|
||||||
|
) -> Iterator[Document]:
|
||||||
|
"""Load documents lazily."""
|
||||||
try:
|
try:
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -72,13 +73,15 @@ class HuggingFaceDatasetLoader(BaseLoader):
|
|||||||
num_proc=self.num_proc,
|
num_proc=self.num_proc,
|
||||||
)
|
)
|
||||||
|
|
||||||
docs = [
|
yield from (
|
||||||
Document(
|
Document(
|
||||||
page_content=row.pop(self.page_content_column),
|
page_content=row.pop(self.page_content_column),
|
||||||
metadata=row,
|
metadata=row,
|
||||||
)
|
)
|
||||||
for key in dataset.keys()
|
for key in dataset.keys()
|
||||||
for row in dataset[key]
|
for row in dataset[key]
|
||||||
]
|
)
|
||||||
|
|
||||||
return docs
|
def load(self) -> List[Document]:
|
||||||
|
"""Load documents."""
|
||||||
|
return list(self.lazy_load())
|
||||||
|
Loading…
Reference in New Issue
Block a user