[community] Added PebbloTextLoader for loading text data in PebbloSafeLoader (#26582)

- **Description:** Added PebbloTextLoader for loading text in
PebbloSafeLoader.
- Since PebbloSafeLoader wraps document loaders, this new loader enables
direct loading of text into Documents using PebbloSafeLoader.
- **Issue:** NA
- **Dependencies:** NA
- [x] **Tests**: Added/Updated tests
This commit is contained in:
Rajendra Kadam
2024-09-19 19:29:04 +05:30
committed by GitHub
parent 55b641b761
commit 60dc19da30
4 changed files with 113 additions and 1 deletions

View File

@@ -4,7 +4,7 @@ import logging
import os
import uuid
from importlib.metadata import version
from typing import Dict, Iterator, List, Optional
from typing import Any, Dict, Iterable, Iterator, List, Optional
from langchain_core.documents import Document
@@ -271,3 +271,67 @@ class PebbloSafeLoader(BaseLoader):
doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
"pb_checksum", None
)
class PebbloTextLoader(BaseLoader):
"""
Loader for text data.
Since PebbloSafeLoader is a wrapper around document loaders, this loader is
used to load text data directly into Documents.
"""
def __init__(
self,
texts: Iterable[str],
*,
source: Optional[str] = None,
ids: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
metadatas: Optional[List[Dict[str, Any]]] = None,
) -> None:
"""
Args:
texts: Iterable of text data.
source: Source of the text data.
Optional. Defaults to None.
ids: List of unique identifiers for each text.
Optional. Defaults to None.
metadata: Metadata for all texts.
Optional. Defaults to None.
metadatas: List of metadata for each text.
Optional. Defaults to None.
"""
self.texts = texts
self.source = source
self.ids = ids
self.metadata = metadata
self.metadatas = metadatas
def lazy_load(self) -> Iterator[Document]:
"""
Lazy load text data into Documents.
Returns:
Iterator of Documents
"""
for i, text in enumerate(self.texts):
_id = None
metadata = self.metadata or {}
if self.metadatas and i < len(self.metadatas) and self.metadatas[i]:
metadata.update(self.metadatas[i])
if self.ids and i < len(self.ids):
_id = self.ids[i]
yield Document(id=_id, page_content=text, metadata=metadata)
def load(self) -> List[Document]:
"""
Load text data into Documents.
Returns:
List of Documents
"""
documents = []
for doc in self.lazy_load():
documents.append(doc)
return documents