mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 18:08:36 +00:00
[community] Added PebbloTextLoader for loading text data in PebbloSafeLoader (#26582)
- **Description:** Added PebbloTextLoader for loading text in PebbloSafeLoader. - Since PebbloSafeLoader wraps document loaders, this new loader enables direct loading of text into Documents using PebbloSafeLoader. - **Issue:** NA - **Dependencies:** NA - [x] **Tests**: Added/Updated tests
This commit is contained in:
parent
55b641b761
commit
60dc19da30
@ -359,6 +359,7 @@ if TYPE_CHECKING:
|
|||||||
)
|
)
|
||||||
from langchain_community.document_loaders.pebblo import (
|
from langchain_community.document_loaders.pebblo import (
|
||||||
PebbloSafeLoader,
|
PebbloSafeLoader,
|
||||||
|
PebbloTextLoader,
|
||||||
)
|
)
|
||||||
from langchain_community.document_loaders.polars_dataframe import (
|
from langchain_community.document_loaders.polars_dataframe import (
|
||||||
PolarsDataFrameLoader,
|
PolarsDataFrameLoader,
|
||||||
@ -650,6 +651,7 @@ _module_lookup = {
|
|||||||
"PDFPlumberLoader": "langchain_community.document_loaders.pdf",
|
"PDFPlumberLoader": "langchain_community.document_loaders.pdf",
|
||||||
"PagedPDFSplitter": "langchain_community.document_loaders.pdf",
|
"PagedPDFSplitter": "langchain_community.document_loaders.pdf",
|
||||||
"PebbloSafeLoader": "langchain_community.document_loaders.pebblo",
|
"PebbloSafeLoader": "langchain_community.document_loaders.pebblo",
|
||||||
|
"PebbloTextLoader": "langchain_community.document_loaders.pebblo",
|
||||||
"PlaywrightURLLoader": "langchain_community.document_loaders.url_playwright",
|
"PlaywrightURLLoader": "langchain_community.document_loaders.url_playwright",
|
||||||
"PolarsDataFrameLoader": "langchain_community.document_loaders.polars_dataframe",
|
"PolarsDataFrameLoader": "langchain_community.document_loaders.polars_dataframe",
|
||||||
"PsychicLoader": "langchain_community.document_loaders.psychic",
|
"PsychicLoader": "langchain_community.document_loaders.psychic",
|
||||||
@ -855,6 +857,7 @@ __all__ = [
|
|||||||
"PDFPlumberLoader",
|
"PDFPlumberLoader",
|
||||||
"PagedPDFSplitter",
|
"PagedPDFSplitter",
|
||||||
"PebbloSafeLoader",
|
"PebbloSafeLoader",
|
||||||
|
"PebbloTextLoader",
|
||||||
"PlaywrightURLLoader",
|
"PlaywrightURLLoader",
|
||||||
"PolarsDataFrameLoader",
|
"PolarsDataFrameLoader",
|
||||||
"PsychicLoader",
|
"PsychicLoader",
|
||||||
|
@ -4,7 +4,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
from importlib.metadata import version
|
from importlib.metadata import version
|
||||||
from typing import Dict, Iterator, List, Optional
|
from typing import Any, Dict, Iterable, Iterator, List, Optional
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@ -271,3 +271,67 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
|
doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
|
||||||
"pb_checksum", None
|
"pb_checksum", None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class PebbloTextLoader(BaseLoader):
    """
    Loader for text data.

    Since PebbloSafeLoader is a wrapper around document loaders, this loader is
    used to load text data directly into Documents.
    """

    def __init__(
        self,
        texts: Iterable[str],
        *,
        source: Optional[str] = None,
        ids: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        metadatas: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """
        Args:
            texts: Iterable of text data.
            source: Source of the text data. Stored on the instance as
                ``source``; this class does not read it itself.
                Optional. Defaults to None.
            ids: List of unique identifiers for each text, matched to
                ``texts`` by position.
                Optional. Defaults to None.
            metadata: Metadata for all texts.
                Optional. Defaults to None.
            metadatas: List of metadata for each text, matched to ``texts``
                by position. Entries override keys from ``metadata``.
                Optional. Defaults to None.
        """
        self.texts = texts
        self.source = source
        self.ids = ids
        self.metadata = metadata
        self.metadatas = metadatas

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazy load text data into Documents.

        Each Document gets its own metadata dict: the shared ``metadata``
        (if any) overlaid with the positional entry from ``metadatas`` (if
        present and non-empty). Texts beyond the end of ``ids`` or
        ``metadatas`` get ``id=None`` / only the shared metadata.

        Returns:
            Iterator of Documents
        """
        for i, text in enumerate(self.texts):
            # Start from a fresh copy for every document. Updating
            # ``self.metadata`` in place would modify the caller's dict and
            # carry per-text keys over into every later document.
            metadata: Dict[str, Any] = dict(self.metadata) if self.metadata else {}
            if self.metadatas and i < len(self.metadatas) and self.metadatas[i]:
                metadata.update(self.metadatas[i])

            _id = self.ids[i] if self.ids and i < len(self.ids) else None
            yield Document(id=_id, page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """
        Load text data into Documents.

        Returns:
            List of Documents
        """
        return list(self.lazy_load())
|
||||||
|
@ -55,6 +55,7 @@ EXPECTED_ALL = [
|
|||||||
"DedocFileLoader",
|
"DedocFileLoader",
|
||||||
"DedocPDFLoader",
|
"DedocPDFLoader",
|
||||||
"PebbloSafeLoader",
|
"PebbloSafeLoader",
|
||||||
|
"PebbloTextLoader",
|
||||||
"DiffbotLoader",
|
"DiffbotLoader",
|
||||||
"DirectoryLoader",
|
"DirectoryLoader",
|
||||||
"DiscordChatLoader",
|
"DiscordChatLoader",
|
||||||
|
@ -25,6 +25,11 @@ def test_pebblo_import() -> None:
|
|||||||
from langchain_community.document_loaders import PebbloSafeLoader # noqa: F401
|
from langchain_community.document_loaders import PebbloSafeLoader # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
def test_pebblo_text_loader_import() -> None:
    """Check that PebbloTextLoader is importable from document_loaders."""
    # The import itself is the assertion: it raises if the name is missing.
    from langchain_community.document_loaders import PebbloTextLoader  # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
def test_empty_filebased_loader(mocker: MockerFixture) -> None:
|
def test_empty_filebased_loader(mocker: MockerFixture) -> None:
|
||||||
"""Test basic file based csv loader."""
|
"""Test basic file based csv loader."""
|
||||||
# Setup
|
# Setup
|
||||||
@ -146,3 +151,42 @@ def test_pebblo_safe_loader_api_key() -> None:
|
|||||||
# Assert
|
# Assert
|
||||||
assert loader.pb_client.api_key == api_key
|
assert loader.pb_client.api_key == api_key
|
||||||
assert loader.pb_client.classifier_location == "local"
|
assert loader.pb_client.classifier_location == "local"
|
||||||
|
|
||||||
|
|
||||||
|
def test_pebblo_text_loader(mocker: MockerFixture) -> None:
    """
    Check that in-memory text wrapped in PebbloTextLoader can be loaded
    through PebbloSafeLoader.
    """
    from langchain_community.document_loaders import PebbloSafeLoader, PebbloTextLoader

    # Arrange: replace requests.get / requests.post with canned 200 responses.
    mocker.patch.multiple(
        "requests",
        get=MockResponse(json_data={"data": ""}, status_code=200),
        post=MockResponse(json_data={"data": ""}, status_code=200),
    )
    sample_text = "This is a test text."
    sample_source = "fake_source"
    wanted = Document(
        page_content=sample_text,
        metadata={"full_path": sample_source, "pb_checksum": None},
    )

    # Act: wrap the text loader in the safe loader and load everything.
    text_loader = PebbloTextLoader([sample_text], source=sample_source)
    safe_loader = PebbloSafeLoader(
        text_loader,
        "dummy_app_name",
        "dummy_owner",
        "dummy_description",
    )
    loaded = safe_loader.load()

    # Assert: exactly one document, carrying the source path and no checksum.
    assert loaded == [wanted]
|
||||||
|
Loading…
Reference in New Issue
Block a user