From 7e5a9c317ff7961ecb57c2175c415621ff408c54 Mon Sep 17 00:00:00 2001 From: Rajendra Kadam Date: Wed, 25 Sep 2024 19:03:06 +0530 Subject: [PATCH] community[minor]: [Pebblo] Enhance PebbloSafeLoader to take anonymize flag (#26812) - **Description:** The flag is named `anonymize_snippets`. When set to true, the Pebblo server will anonymize snippets by redacting all personally identifiable information (PII) from the snippets going into VectorDB and the generated reports - **Issue:** NA - **Dependencies:** NA - **docs**: Updated --- .../document_loaders/pebblo.ipynb | 33 +++++++++++++++++++ .../document_loaders/pebblo.py | 2 ++ .../langchain_community/utilities/pebblo.py | 7 ++++ 3 files changed, 42 insertions(+) diff --git a/docs/docs/integrations/document_loaders/pebblo.ipynb b/docs/docs/integrations/document_loaders/pebblo.ipynb index 125a9744f35..255ea75531e 100644 --- a/docs/docs/integrations/document_loaders/pebblo.ipynb +++ b/docs/docs/integrations/document_loaders/pebblo.ipynb @@ -124,6 +124,39 @@ { "cell_type": "markdown", "metadata": {}, + "source": [ + "### Anonymize the snippets to redact all PII details\n", + "\n", + "Set `anonymize_snippets` to `True` to anonymize all personally identifiable information (PII) from the snippets going into VectorDB and the generated reports.\n", + "\n", + "> Note: The _Pebblo Entity Classifier_ effectively identifies personally identifiable information (PII) and is continuously evolving. 
While its recall is not yet 100%, it is steadily improving.\n", + "> For more details, please refer to the [_Pebblo Entity Classifier docs_](https://daxa-ai.github.io/pebblo/entityclassifier/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import CSVLoader, PebbloSafeLoader\n", + "\n", + "loader = PebbloSafeLoader(\n", + " CSVLoader(\"data/corp_sens_data.csv\"),\n", + " name=\"acme-corp-rag-1\", # App name (Mandatory)\n", + " owner=\"Joe Smith\", # Owner (Optional)\n", + " description=\"Support productivity RAG application\", # Description (Optional)\n", + " anonymize_snippets=True, # Whether to anonymize snippets going into VectorDB and the generated reports (Optional, default=False)\n", + ")\n", + "documents = loader.load()\n", + "print(documents[0].metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [] } ], diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index 8d3f54e342f..e176f3036bd 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -45,6 +45,7 @@ class PebbloSafeLoader(BaseLoader): classifier_url: Optional[str] = None, *, classifier_location: str = "local", + anonymize_snippets: bool = False, ): if not name or not isinstance(name, str): raise NameError("Must specify a valid name.") @@ -78,6 +79,7 @@ class PebbloSafeLoader(BaseLoader): api_key=api_key, classifier_location=classifier_location, classifier_url=classifier_url, + anonymize_snippets=anonymize_snippets, ) self.pb_client.send_loader_discover(self.app) diff --git a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py index d68ec6e9ff7..65d5d207e1f 100644 --- a/libs/community/langchain_community/utilities/pebblo.py 
+++ b/libs/community/langchain_community/utilities/pebblo.py @@ -154,6 +154,8 @@ class Doc(BaseModel): """Owner of the source of the loader.""" classifier_location: str """Location of the classifier.""" + anonymize_snippets: bool + """Whether to anonymize snippets going into VectorDB and the generated reports""" def get_full_path(path: str) -> str: @@ -424,6 +426,8 @@ class PebbloLoaderAPIWrapper(BaseModel): """URL of the Pebblo Classifier""" cloud_url: Optional[str] """URL of the Pebblo Cloud""" + anonymize_snippets: bool = False + """Whether to anonymize snippets going into VectorDB and the generated reports""" def __init__(self, **kwargs: Any): """Validate that api key in environment.""" @@ -522,6 +526,8 @@ class PebbloLoaderAPIWrapper(BaseModel): # If local classifier is used add the classified information # and remove doc content self.update_doc_data(payload["docs"], classified_docs) + # Remove the anonymize_snippets key from payload + payload.pop("anonymize_snippets", None) self.send_docs_to_pebblo_cloud(payload) elif self.classifier_location == "pebblo-cloud": logger.warning("API key is missing for sending docs to Pebblo cloud.") @@ -599,6 +605,7 @@ class PebbloLoaderAPIWrapper(BaseModel): "loading_end": "false", "source_owner": source_owner, "classifier_location": self.classifier_location, + "anonymize_snippets": self.anonymize_snippets, } if loading_end is True: payload["loading_end"] = "true"