mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-31 10:23:18 +00:00
community[minor]: [Pebblo] Enhance PebbloSafeLoader to take anonymize flag (#26812)
- **Description:** The flag is named `anonymize_snippets`. When set to true, the Pebblo server will anonymize snippets by redacting all personally identifiable information (PII) from the snippets going into VectorDB and the generated reports - **Issue:** NA - **Dependencies:** NA - **docs**: Updated
This commit is contained in:
@@ -124,6 +124,39 @@
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Anonymize the snippets to redact all PII details\n",
|
||||
"\n",
|
||||
"Set `anonymize_snippets` to `True` to anonymize all personally identifiable information (PII) from the snippets going into VectorDB and the generated reports.\n",
|
||||
"\n",
|
||||
"> Note: The _Pebblo Entity Classifier_ effectively identifies personally identifiable information (PII) and is continuously evolving. While its recall is not yet 100%, it is steadily improving.\n",
|
||||
"> For more details, please refer to the [_Pebblo Entity Classifier docs_](https://daxa-ai.github.io/pebblo/entityclassifier/)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_community.document_loaders import CSVLoader, PebbloSafeLoader\n",
|
||||
"\n",
|
||||
"loader = PebbloSafeLoader(\n",
|
||||
" CSVLoader(\"data/corp_sens_data.csv\"),\n",
|
||||
" name=\"acme-corp-rag-1\", # App name (Mandatory)\n",
|
||||
" owner=\"Joe Smith\", # Owner (Optional)\n",
|
||||
" description=\"Support productivity RAG application\", # Description (Optional)\n",
|
||||
" anonymize_snippets=True, # Whether to anonymize entities in the PDF Report (Optional, default=False)\n",
|
||||
")\n",
|
||||
"documents = loader.load()\n",
|
||||
"print(documents[0].metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
|
@@ -45,6 +45,7 @@ class PebbloSafeLoader(BaseLoader):
|
||||
classifier_url: Optional[str] = None,
|
||||
*,
|
||||
classifier_location: str = "local",
|
||||
anonymize_snippets: bool = False,
|
||||
):
|
||||
if not name or not isinstance(name, str):
|
||||
raise NameError("Must specify a valid name.")
|
||||
@@ -78,6 +79,7 @@ class PebbloSafeLoader(BaseLoader):
|
||||
api_key=api_key,
|
||||
classifier_location=classifier_location,
|
||||
classifier_url=classifier_url,
|
||||
anonymize_snippets=anonymize_snippets,
|
||||
)
|
||||
self.pb_client.send_loader_discover(self.app)
|
||||
|
||||
|
@@ -154,6 +154,8 @@ class Doc(BaseModel):
|
||||
"""Owner of the source of the loader."""
|
||||
classifier_location: str
|
||||
"""Location of the classifier."""
|
||||
anonymize_snippets: bool
|
||||
"""Whether to anonymize snippets going into VectorDB and the generated reports"""
|
||||
|
||||
|
||||
def get_full_path(path: str) -> str:
|
||||
@@ -424,6 +426,8 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
||||
"""URL of the Pebblo Classifier"""
|
||||
cloud_url: Optional[str]
|
||||
"""URL of the Pebblo Cloud"""
|
||||
anonymize_snippets: bool = False
|
||||
"""Whether to anonymize snippets going into VectorDB and the generated reports"""
|
||||
|
||||
def __init__(self, **kwargs: Any):
|
||||
"""Validate that api key in environment."""
|
||||
@@ -522,6 +526,8 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
||||
# If local classifier is used add the classified information
|
||||
# and remove doc content
|
||||
self.update_doc_data(payload["docs"], classified_docs)
|
||||
# Remove the anonymize_snippets key from payload
|
||||
payload.pop("anonymize_snippets", None)
|
||||
self.send_docs_to_pebblo_cloud(payload)
|
||||
elif self.classifier_location == "pebblo-cloud":
|
||||
logger.warning("API key is missing for sending docs to Pebblo cloud.")
|
||||
@@ -599,6 +605,7 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
||||
"loading_end": "false",
|
||||
"source_owner": source_owner,
|
||||
"classifier_location": self.classifier_location,
|
||||
"anonymize_snippets": self.anonymize_snippets,
|
||||
}
|
||||
if loading_end is True:
|
||||
payload["loading_end"] = "true"
|
||||
|
Reference in New Issue
Block a user