mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 16:43:35 +00:00
community[minor]: Add support for Pebblo cloud_api_key in PebbloSafeLoader (#19855)
**Description**: _PebbloSafeLoader_: Add support for pebblo's cloud api-key in PebbloSafeLoader - This Pull request enables PebbloSafeLoader to accept pebblo's cloud api-key and send the semantic classification data to pebblo cloud. **Documentation**: Updated **Unit test**: Added **Issue**: NA **Dependencies**: - None **Twitter handle**: @rahul_tripathi2 Signed-off-by: Rahul Tripathi <rauhl.psit.ec@gmail.com> Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
This commit is contained in:
parent
34a24d4df6
commit
820b713086
@ -62,6 +62,35 @@
|
|||||||
"documents = loader.load()\n",
|
"documents = loader.load()\n",
|
||||||
"print(documents)"
|
"print(documents)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Send semantic topics and identities to Pebblo cloud server\n",
|
||||||
|
"\n",
|
||||||
|
"To send semantic data to pebblo-cloud, pass the api-key to PebbloSafeLoader as an argument or, alternatively, put the api-key in the `PEBBLO_API_KEY` environment variable."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders.csv_loader import CSVLoader\n",
|
||||||
|
"from langchain_community.document_loaders import PebbloSafeLoader\n",
|
||||||
|
"\n",
|
||||||
|
"loader = PebbloSafeLoader(\n",
|
||||||
|
" CSVLoader(\"data/corp_sens_data.csv\"),\n",
|
||||||
|
" name=\"acme-corp-rag-1\", # App name (Mandatory)\n",
|
||||||
|
" owner=\"Joe Smith\", # Owner (Optional)\n",
|
||||||
|
" description=\"Support productivity RAG application\", # Description (Optional)\n",
|
||||||
|
" api_key=\"my-api-key\", # API key (Optional, can be set in the environment variable PEBBLO_API_KEY)\n",
|
||||||
|
")\n",
|
||||||
|
"documents = loader.load()\n",
|
||||||
|
"print(documents)"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -1,17 +1,21 @@
|
|||||||
"""Pebblo's safe dataloader is a wrapper for document loaders"""
|
"""Pebblo's safe dataloader is a wrapper for document loaders"""
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from typing import Any, Dict, Iterator, List
|
from typing import Any, Dict, Iterator, List, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
from langchain_community.document_loaders.base import BaseLoader
|
from langchain_community.document_loaders.base import BaseLoader
|
||||||
from langchain_community.utilities.pebblo import (
|
from langchain_community.utilities.pebblo import (
|
||||||
|
APP_DISCOVER_URL,
|
||||||
CLASSIFIER_URL,
|
CLASSIFIER_URL,
|
||||||
|
LOADER_DOC_URL,
|
||||||
|
PEBBLO_CLOUD_URL,
|
||||||
PLUGIN_VERSION,
|
PLUGIN_VERSION,
|
||||||
App,
|
App,
|
||||||
Doc,
|
Doc,
|
||||||
@ -38,10 +42,12 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
name: str,
|
name: str,
|
||||||
owner: str = "",
|
owner: str = "",
|
||||||
description: str = "",
|
description: str = "",
|
||||||
|
api_key: Optional[str] = None,
|
||||||
):
|
):
|
||||||
if not name or not isinstance(name, str):
|
if not name or not isinstance(name, str):
|
||||||
raise NameError("Must specify a valid name.")
|
raise NameError("Must specify a valid name.")
|
||||||
self.app_name = name
|
self.app_name = name
|
||||||
|
self.api_key = os.environ.get("PEBBLO_API_KEY") or api_key
|
||||||
self.load_id = str(uuid.uuid4())
|
self.load_id = str(uuid.uuid4())
|
||||||
self.loader = langchain_loader
|
self.loader = langchain_loader
|
||||||
self.owner = owner
|
self.owner = owner
|
||||||
@ -114,8 +120,9 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
def set_loader_sent(cls) -> None:
|
def set_loader_sent(cls) -> None:
|
||||||
cls._loader_sent = True
|
cls._loader_sent = True
|
||||||
|
|
||||||
def _send_loader_doc(self, loading_end: bool = False) -> None:
|
def _send_loader_doc(self, loading_end: bool = False) -> list:
|
||||||
"""Send documents fetched from loader to pebblo-server. Internal method.
|
"""Send documents fetched from loader to pebblo-server. Then send
|
||||||
|
classified documents to Daxa cloud (if api_key is present). Internal method.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
loading_end (bool, optional): Flag indicating the halt of data
|
loading_end (bool, optional): Flag indicating the halt of data
|
||||||
@ -163,28 +170,67 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
if "loader_details" in payload:
|
if "loader_details" in payload:
|
||||||
payload["loader_details"]["source_aggr_size"] = self.source_aggr_size
|
payload["loader_details"]["source_aggr_size"] = self.source_aggr_size
|
||||||
payload = Doc(**payload).dict(exclude_unset=True)
|
payload = Doc(**payload).dict(exclude_unset=True)
|
||||||
load_doc_url = f"{CLASSIFIER_URL}/v1/loader/doc"
|
load_doc_url = f"{CLASSIFIER_URL}{LOADER_DOC_URL}"
|
||||||
|
classified_docs = []
|
||||||
try:
|
try:
|
||||||
resp = requests.post(
|
pebblo_resp = requests.post(
|
||||||
load_doc_url, headers=headers, json=payload, timeout=20
|
load_doc_url, headers=headers, json=payload, timeout=300
|
||||||
)
|
)
|
||||||
if resp.status_code not in [HTTPStatus.OK, HTTPStatus.BAD_GATEWAY]:
|
classified_docs = json.loads(pebblo_resp.text).get("docs", None)
|
||||||
|
if pebblo_resp.status_code not in [HTTPStatus.OK, HTTPStatus.BAD_GATEWAY]:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Received unexpected HTTP response code: {resp.status_code}"
|
"Received unexpected HTTP response code: %s",
|
||||||
|
pebblo_resp.status_code,
|
||||||
)
|
)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"send_loader_doc: request \
|
"send_loader_doc[local]: request url %s, body %s len %s\
|
||||||
url {resp.request.url}, \
|
response status %s body %s",
|
||||||
body {str(resp.request.body)[:999]} \
|
pebblo_resp.request.url,
|
||||||
len {len(resp.request.body if resp.request.body else [])} \
|
str(pebblo_resp.request.body),
|
||||||
response status{resp.status_code} body {resp.json()}"
|
str(len(pebblo_resp.request.body if pebblo_resp.request.body else [])),
|
||||||
|
str(pebblo_resp.status_code),
|
||||||
|
pebblo_resp.json(),
|
||||||
)
|
)
|
||||||
except requests.exceptions.RequestException:
|
except requests.exceptions.RequestException:
|
||||||
logger.warning("Unable to reach pebblo server.")
|
logger.warning("Unable to reach pebblo server.")
|
||||||
except Exception:
|
except Exception as e:
|
||||||
logger.warning("An Exception caught in _send_loader_doc.")
|
logger.warning("An Exception caught in _send_loader_doc: %s", e)
|
||||||
|
|
||||||
|
if self.api_key:
|
||||||
|
if not classified_docs:
|
||||||
|
logger.warning("No classified docs to send to pebblo-cloud.")
|
||||||
|
return classified_docs
|
||||||
|
try:
|
||||||
|
payload["docs"] = classified_docs
|
||||||
|
payload["classified"] = True
|
||||||
|
headers.update({"x-api-key": self.api_key})
|
||||||
|
pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{LOADER_DOC_URL}"
|
||||||
|
pebblo_cloud_response = requests.post(
|
||||||
|
pebblo_cloud_url, headers=headers, json=payload, timeout=20
|
||||||
|
)
|
||||||
|
logger.debug(
|
||||||
|
"send_loader_doc[cloud]: request url %s, body %s len %s\
|
||||||
|
response status %s body %s",
|
||||||
|
pebblo_cloud_response.request.url,
|
||||||
|
str(pebblo_cloud_response.request.body),
|
||||||
|
str(
|
||||||
|
len(
|
||||||
|
pebblo_cloud_response.request.body
|
||||||
|
if pebblo_cloud_response.request.body
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
),
|
||||||
|
str(pebblo_cloud_response.status_code),
|
||||||
|
pebblo_cloud_response.json(),
|
||||||
|
)
|
||||||
|
except requests.exceptions.RequestException:
|
||||||
|
logger.warning("Unable to reach Pebblo cloud server.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("An Exception caught in _send_loader_doc: %s", e)
|
||||||
|
|
||||||
if loading_end is True:
|
if loading_end is True:
|
||||||
PebbloSafeLoader.set_loader_sent()
|
PebbloSafeLoader.set_loader_sent()
|
||||||
|
return classified_docs
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def calculate_content_size(page_content: str) -> int:
|
def calculate_content_size(page_content: str) -> int:
|
||||||
@ -206,32 +252,64 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
|
|
||||||
def _send_discover(self) -> None:
|
def _send_discover(self) -> None:
|
||||||
"""Send app discovery payload to pebblo-server. Internal method."""
|
"""Send app discovery payload to pebblo-server. Internal method."""
|
||||||
headers = {"Accept": "application/json", "Content-Type": "application/json"}
|
headers = {
|
||||||
|
"Accept": "application/json",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
}
|
||||||
payload = self.app.dict(exclude_unset=True)
|
payload = self.app.dict(exclude_unset=True)
|
||||||
app_discover_url = f"{CLASSIFIER_URL}/v1/app/discover"
|
app_discover_url = f"{CLASSIFIER_URL}{APP_DISCOVER_URL}"
|
||||||
try:
|
try:
|
||||||
resp = requests.post(
|
pebblo_resp = requests.post(
|
||||||
app_discover_url, headers=headers, json=payload, timeout=20
|
app_discover_url, headers=headers, json=payload, timeout=20
|
||||||
)
|
)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"send_discover: request \
|
"send_discover[local]: request url %s, body %s len %s\
|
||||||
url {resp.request.url}, \
|
response status %s body %s",
|
||||||
headers {resp.request.headers}, \
|
pebblo_resp.request.url,
|
||||||
body {str(resp.request.body)[:999]} \
|
str(pebblo_resp.request.body),
|
||||||
len {len(resp.request.body if resp.request.body else [])} \
|
str(len(pebblo_resp.request.body if pebblo_resp.request.body else [])),
|
||||||
response status{resp.status_code} body {resp.json()}"
|
str(pebblo_resp.status_code),
|
||||||
|
pebblo_resp.json(),
|
||||||
)
|
)
|
||||||
if resp.status_code in [HTTPStatus.OK, HTTPStatus.BAD_GATEWAY]:
|
if pebblo_resp.status_code in [HTTPStatus.OK, HTTPStatus.BAD_GATEWAY]:
|
||||||
PebbloSafeLoader.set_discover_sent()
|
PebbloSafeLoader.set_discover_sent()
|
||||||
else:
|
else:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Received unexpected HTTP response code: {resp.status_code}"
|
f"Received unexpected HTTP response code: {pebblo_resp.status_code}"
|
||||||
)
|
)
|
||||||
except requests.exceptions.RequestException:
|
except requests.exceptions.RequestException:
|
||||||
logger.warning("Unable to reach pebblo server.")
|
logger.warning("Unable to reach pebblo server.")
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.warning("An Exception caught in _send_discover.")
|
logger.warning("An Exception caught in _send_discover.")
|
||||||
|
|
||||||
|
if self.api_key:
|
||||||
|
try:
|
||||||
|
headers.update({"x-api-key": self.api_key})
|
||||||
|
pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{APP_DISCOVER_URL}"
|
||||||
|
pebblo_cloud_response = requests.post(
|
||||||
|
pebblo_cloud_url, headers=headers, json=payload, timeout=20
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"send_discover[cloud]: request url %s, body %s len %s\
|
||||||
|
response status %s body %s",
|
||||||
|
pebblo_cloud_response.request.url,
|
||||||
|
str(pebblo_cloud_response.request.body),
|
||||||
|
str(
|
||||||
|
len(
|
||||||
|
pebblo_cloud_response.request.body
|
||||||
|
if pebblo_cloud_response.request.body
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
),
|
||||||
|
str(pebblo_cloud_response.status_code),
|
||||||
|
pebblo_cloud_response.json(),
|
||||||
|
)
|
||||||
|
except requests.exceptions.RequestException:
|
||||||
|
logger.warning("Unable to reach Pebblo cloud server.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("An Exception caught in _send_discover: %s", e)
|
||||||
|
|
||||||
def _get_app_details(self) -> App:
|
def _get_app_details(self) -> App:
|
||||||
"""Fetch app details. Internal method.
|
"""Fetch app details. Internal method.
|
||||||
|
|
||||||
|
@ -13,8 +13,12 @@ from langchain_community.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
PLUGIN_VERSION = "0.1.0"
|
PLUGIN_VERSION = "0.1.1"
|
||||||
CLASSIFIER_URL = os.getenv("PEBBLO_CLASSIFIER_URL", "http://localhost:8000")
|
CLASSIFIER_URL = os.getenv("PEBBLO_CLASSIFIER_URL", "http://localhost:8000")
|
||||||
|
PEBBLO_CLOUD_URL = os.getenv("PEBBLO_CLOUD_URL", "https://api.daxa.ai")
|
||||||
|
|
||||||
|
LOADER_DOC_URL = "/v1/loader/doc"
|
||||||
|
APP_DISCOVER_URL = "/v1/app/discover"
|
||||||
|
|
||||||
# Supported loaders for Pebblo safe data loading
|
# Supported loaders for Pebblo safe data loading
|
||||||
file_loader = [
|
file_loader = [
|
||||||
|
@ -112,3 +112,23 @@ def test_pdf_lazy_load(mocker: MockerFixture) -> None:
|
|||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(result) == 2
|
assert len(result) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_pebblo_safe_loader_api_key() -> None:
|
||||||
|
# Setup
|
||||||
|
from langchain_community.document_loaders import PebbloSafeLoader
|
||||||
|
|
||||||
|
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_empty.csv")
|
||||||
|
api_key = "dummy_api_key"
|
||||||
|
|
||||||
|
# Exercise
|
||||||
|
loader = PebbloSafeLoader(
|
||||||
|
CSVLoader(file_path=file_path),
|
||||||
|
"dummy_app_name",
|
||||||
|
"dummy_owner",
|
||||||
|
"dummy_description",
|
||||||
|
api_key=api_key,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert loader.api_key == api_key
|
||||||
|
Loading…
Reference in New Issue
Block a user