community[minor]: [Pebblo] Enhance PebbloSafeLoader to take anonymize flag (#26812)

- **Description:** The flag is named `anonymize_snippets`. When set to
`True`, the Pebblo server anonymizes snippets by redacting all
personally identifiable information (PII) from the snippets going into
VectorDB and the generated reports.
- **Issue:** NA
- **Dependencies:** NA
- **docs**: Updated
This commit is contained in:
Rajendra Kadam
2024-09-25 19:03:06 +05:30
committed by GitHub
parent 92003b3724
commit 7e5a9c317f
3 changed files with 42 additions and 0 deletions

View File

@@ -154,6 +154,8 @@ class Doc(BaseModel):
"""Owner of the source of the loader."""
classifier_location: str
"""Location of the classifier."""
anonymize_snippets: bool
"""Whether to anonymize snippets going into VectorDB and the generated reports"""
def get_full_path(path: str) -> str:
@@ -424,6 +426,8 @@ class PebbloLoaderAPIWrapper(BaseModel):
"""URL of the Pebblo Classifier"""
cloud_url: Optional[str]
"""URL of the Pebblo Cloud"""
anonymize_snippets: bool = False
"""Whether to anonymize snippets going into VectorDB and the generated reports"""
def __init__(self, **kwargs: Any):
"""Validate that api key in environment."""
@@ -522,6 +526,8 @@ class PebbloLoaderAPIWrapper(BaseModel):
# If local classifier is used add the classified information
# and remove doc content
self.update_doc_data(payload["docs"], classified_docs)
# Remove the anonymize_snippets key from payload
payload.pop("anonymize_snippets", None)
self.send_docs_to_pebblo_cloud(payload)
elif self.classifier_location == "pebblo-cloud":
logger.warning("API key is missing for sending docs to Pebblo cloud.")
@@ -599,6 +605,7 @@ class PebbloLoaderAPIWrapper(BaseModel):
"loading_end": "false",
"source_owner": source_owner,
"classifier_location": self.classifier_location,
"anonymize_snippets": self.anonymize_snippets,
}
if loading_end is True:
payload["loading_end"] = "true"