mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-01 11:02:37 +00:00
community[minor]: [Pebblo] Enhance PebbloSafeLoader to take anonymize flag (#26812)
- **Description:** The flag is named `anonymize_snippets`. When set to `True`, the Pebblo server anonymizes snippets by redacting all personally identifiable information (PII) from the snippets going into VectorDB and the generated reports. - **Issue:** NA - **Dependencies:** NA - **docs**: Updated
This commit is contained in:
@@ -124,6 +124,39 @@
|
|||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Anonymize the snippets to redact all PII details\n",
|
||||||
|
"\n",
|
||||||
|
"Set `anonymize_snippets` to `True` to redact all personally identifiable information (PII) from the snippets going into VectorDB and the generated reports.\n",
|
||||||
|
"\n",
|
||||||
|
"> Note: The _Pebblo Entity Classifier_ effectively identifies personally identifiable information (PII) and is continuously evolving. While its recall is not yet 100%, it is steadily improving.\n",
|
||||||
|
"> For more details, please refer to the [_Pebblo Entity Classifier docs_](https://daxa-ai.github.io/pebblo/entityclassifier/)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain_community.document_loaders import CSVLoader, PebbloSafeLoader\n",
|
||||||
|
"\n",
|
||||||
|
"loader = PebbloSafeLoader(\n",
|
||||||
|
" CSVLoader(\"data/corp_sens_data.csv\"),\n",
|
||||||
|
" name=\"acme-corp-rag-1\", # App name (Mandatory)\n",
|
||||||
|
" owner=\"Joe Smith\", # Owner (Optional)\n",
|
||||||
|
" description=\"Support productivity RAG application\", # Description (Optional)\n",
|
||||||
|
" anonymize_snippets=True, # Whether to anonymize PII in snippets going into VectorDB and the generated reports (Optional, default=False)\n",
|
||||||
|
")\n",
|
||||||
|
"documents = loader.load()\n",
|
||||||
|
"print(documents[0].metadata)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@@ -45,6 +45,7 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
classifier_url: Optional[str] = None,
|
classifier_url: Optional[str] = None,
|
||||||
*,
|
*,
|
||||||
classifier_location: str = "local",
|
classifier_location: str = "local",
|
||||||
|
anonymize_snippets: bool = False,
|
||||||
):
|
):
|
||||||
if not name or not isinstance(name, str):
|
if not name or not isinstance(name, str):
|
||||||
raise NameError("Must specify a valid name.")
|
raise NameError("Must specify a valid name.")
|
||||||
@@ -78,6 +79,7 @@ class PebbloSafeLoader(BaseLoader):
|
|||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
classifier_location=classifier_location,
|
classifier_location=classifier_location,
|
||||||
classifier_url=classifier_url,
|
classifier_url=classifier_url,
|
||||||
|
anonymize_snippets=anonymize_snippets,
|
||||||
)
|
)
|
||||||
self.pb_client.send_loader_discover(self.app)
|
self.pb_client.send_loader_discover(self.app)
|
||||||
|
|
||||||
|
@@ -154,6 +154,8 @@ class Doc(BaseModel):
|
|||||||
"""Owner of the source of the loader."""
|
"""Owner of the source of the loader."""
|
||||||
classifier_location: str
|
classifier_location: str
|
||||||
"""Location of the classifier."""
|
"""Location of the classifier."""
|
||||||
|
anonymize_snippets: bool
|
||||||
|
"""Whether to anonymize snippets going into VectorDB and the generated reports"""
|
||||||
|
|
||||||
|
|
||||||
def get_full_path(path: str) -> str:
|
def get_full_path(path: str) -> str:
|
||||||
@@ -424,6 +426,8 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
|||||||
"""URL of the Pebblo Classifier"""
|
"""URL of the Pebblo Classifier"""
|
||||||
cloud_url: Optional[str]
|
cloud_url: Optional[str]
|
||||||
"""URL of the Pebblo Cloud"""
|
"""URL of the Pebblo Cloud"""
|
||||||
|
anonymize_snippets: bool = False
|
||||||
|
"""Whether to anonymize snippets going into VectorDB and the generated reports"""
|
||||||
|
|
||||||
def __init__(self, **kwargs: Any):
|
def __init__(self, **kwargs: Any):
|
||||||
"""Validate that api key in environment."""
|
"""Validate that api key in environment."""
|
||||||
@@ -522,6 +526,8 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
|||||||
# If local classifier is used add the classified information
|
# If local classifier is used add the classified information
|
||||||
# and remove doc content
|
# and remove doc content
|
||||||
self.update_doc_data(payload["docs"], classified_docs)
|
self.update_doc_data(payload["docs"], classified_docs)
|
||||||
|
# Remove the anonymize_snippets key from payload
|
||||||
|
payload.pop("anonymize_snippets", None)
|
||||||
self.send_docs_to_pebblo_cloud(payload)
|
self.send_docs_to_pebblo_cloud(payload)
|
||||||
elif self.classifier_location == "pebblo-cloud":
|
elif self.classifier_location == "pebblo-cloud":
|
||||||
logger.warning("API key is missing for sending docs to Pebblo cloud.")
|
logger.warning("API key is missing for sending docs to Pebblo cloud.")
|
||||||
@@ -599,6 +605,7 @@ class PebbloLoaderAPIWrapper(BaseModel):
|
|||||||
"loading_end": "false",
|
"loading_end": "false",
|
||||||
"source_owner": source_owner,
|
"source_owner": source_owner,
|
||||||
"classifier_location": self.classifier_location,
|
"classifier_location": self.classifier_location,
|
||||||
|
"anonymize_snippets": self.anonymize_snippets,
|
||||||
}
|
}
|
||||||
if loading_end is True:
|
if loading_end is True:
|
||||||
payload["loading_end"] = "true"
|
payload["loading_end"] = "true"
|
||||||
|
Reference in New Issue
Block a user