From c172611647ee137df085a0f23f3add0d3a4d8ffd Mon Sep 17 00:00:00 2001 From: Rahul Triptahi Date: Tue, 30 Apr 2024 03:11:09 +0530 Subject: [PATCH] community[patch]: Add classifier_url argument in PebbloSafeLoader and documentation update. (#21030) Description: Add classifier_url argument in PebbloSafeLoader. Documentation: Updated PebbloSafeLoader documentation with above change and new links for pebblo github pages. --------- Signed-off-by: Rahul Tripathi Co-authored-by: Rahul Tripathi --- docs/docs/integrations/document_loaders/pebblo.ipynb | 8 +++++--- .../langchain_community/document_loaders/pebblo.py | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/docs/integrations/document_loaders/pebblo.ipynb b/docs/docs/integrations/document_loaders/pebblo.ipynb index 177a11fbab9..ef73792ae0d 100644 --- a/docs/docs/integrations/document_loaders/pebblo.ipynb +++ b/docs/docs/integrations/document_loaders/pebblo.ipynb @@ -6,17 +6,19 @@ "source": [ "# Pebblo Safe DocumentLoader\n", "\n", - "> [Pebblo](https://github.com/daxa-ai/pebblo) enables developers to safely load data and promote their Gen AI app to deployment without worrying about the organization’s compliance and security requirements. The project identifies semantic topics and entities found in the loaded data and summarizes them on the UI or a PDF report.\n", + "> [Pebblo](https://daxa-ai.github.io/pebblo/) enables developers to safely load data and promote their Gen AI app to deployment without worrying about the organization’s compliance and security requirements. The project identifies semantic topics and entities found in the loaded data and summarizes them on the UI or a PDF report.\n", "\n", "Pebblo has two components.\n", "\n", "1. Pebblo Safe DocumentLoader for Langchain\n", - "1. Pebblo Daemon\n", + "1. 
Pebblo Server\n", "\n", - "This document describes how to augment your existing Langchain DocumentLoader with Pebblo Safe DocumentLoader to get deep data visibility on the types of Topics and Entities ingested into the Gen-AI Langchain application. For details on `Pebblo Daemon` see this [pebblo daemon](https://daxa-ai.github.io/pebblo-docs/daemon.html) document.\n", + "This document describes how to augment your existing Langchain DocumentLoader with Pebblo Safe DocumentLoader to get deep data visibility on the types of Topics and Entities ingested into the Gen-AI Langchain application. For details on `Pebblo Server` see this [pebblo server](https://daxa-ai.github.io/pebblo/daemon) document.\n", "\n", "Pebblo Safeloader enables safe data ingestion for Langchain `DocumentLoader`. This is done by wrapping the document loader call with `Pebblo Safe DocumentLoader`.\n", "\n", + "Note: To use a Pebblo server running at a URL other than Pebblo's default (localhost:8000), set the `PEBBLO_CLASSIFIER_URL` env variable to the correct URL. This is configurable using the `classifier_url` keyword argument as well. 
Ref: [server-configurations](https://daxa-ai.github.io/pebblo/config)\n", + "\n", "#### How to Pebblo enable Document Loading?\n", "\n", "Assume a Langchain RAG application snippet using `CSVLoader` to read a CSV document for inference.\n", diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index a0cc2eb4115..26f1979e61a 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -45,6 +45,7 @@ class PebbloSafeLoader(BaseLoader): description: str = "", api_key: Optional[str] = None, load_semantic: bool = False, + classifier_url: Optional[str] = None, ): if not name or not isinstance(name, str): raise NameError("Must specify a valid name.") @@ -63,6 +64,7 @@ class PebbloSafeLoader(BaseLoader): self.source_type = get_loader_type(loader_name) self.source_path_size = self.get_source_size(self.source_path) self.source_aggregate_size = 0 + self.classifier_url = classifier_url or CLASSIFIER_URL self.loader_details = { "loader": loader_name, "source_path": self.source_path, @@ -210,7 +212,7 @@ class PebbloSafeLoader(BaseLoader): self.source_aggregate_size ) payload = Doc(**payload).dict(exclude_unset=True) - load_doc_url = f"{CLASSIFIER_URL}{LOADER_DOC_URL}" + load_doc_url = f"{self.classifier_url}{LOADER_DOC_URL}" classified_docs = [] try: pebblo_resp = requests.post( @@ -296,7 +298,7 @@ class PebbloSafeLoader(BaseLoader): "Content-Type": "application/json", } payload = self.app.dict(exclude_unset=True) - app_discover_url = f"{CLASSIFIER_URL}{APP_DISCOVER_URL}" + app_discover_url = f"{self.classifier_url}{APP_DISCOVER_URL}" try: pebblo_resp = requests.post( app_discover_url, headers=headers, json=payload, timeout=20