langchain-experimental: Add allow_list support in experimental/data_anonymizer (#11597)

- **Description:** Add `allow_list` support to the langchain experimental
data-anonymizer package, so that values in the list are left un-anonymized
  - **Issue:** no
  - **Dependencies:** no
  - **Tag maintainer:** @hwchase17
  - **Twitter handle:**
This commit is contained in:
Suresh Kumar Ponnusamy
2023-10-12 03:20:41 +05:30
committed by GitHub
parent 2363c02cf3
commit 70f7558db2
4 changed files with 55 additions and 6 deletions

View File

@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Callable, Optional from typing import Callable, List, Optional
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import ( from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
@@ -16,12 +16,19 @@ class AnonymizerBase(ABC):
wrapping the behavior for all methods in a base class. wrapping the behavior for all methods in a base class.
""" """
def anonymize(self, text: str, language: Optional[str] = None) -> str: def anonymize(
self,
text: str,
language: Optional[str] = None,
allow_list: Optional[List[str]] = None,
) -> str:
"""Anonymize text""" """Anonymize text"""
return self._anonymize(text, language) return self._anonymize(text, language, allow_list)
@abstractmethod @abstractmethod
def _anonymize(self, text: str, language: Optional[str]) -> str: def _anonymize(
self, text: str, language: Optional[str], allow_list: Optional[List[str]] = None
) -> str:
"""Abstract method to anonymize text""" """Abstract method to anonymize text"""

View File

@@ -139,7 +139,12 @@ class PresidioAnonymizerBase(AnonymizerBase):
class PresidioAnonymizer(PresidioAnonymizerBase): class PresidioAnonymizer(PresidioAnonymizerBase):
def _anonymize(self, text: str, language: Optional[str] = None) -> str: def _anonymize(
self,
text: str,
language: Optional[str] = None,
allow_list: Optional[List[str]] = None,
) -> str:
"""Anonymize text. """Anonymize text.
Each PII entity is replaced with a fake value. Each PII entity is replaced with a fake value.
Each time fake values will be different, as they are generated randomly. Each time fake values will be different, as they are generated randomly.
@@ -172,6 +177,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
text, text,
entities=self.analyzed_fields, entities=self.analyzed_fields,
language=language, language=language,
allow_list=allow_list,
) )
filtered_analyzer_results = ( filtered_analyzer_results = (
@@ -226,7 +232,12 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
for key, inner_dict in self.deanonymizer_mapping.items() for key, inner_dict in self.deanonymizer_mapping.items()
} }
def _anonymize(self, text: str, language: Optional[str] = None) -> str: def _anonymize(
self,
text: str,
language: Optional[str] = None,
allow_list: Optional[List[str]] = None,
) -> str:
"""Anonymize text. """Anonymize text.
Each PII entity is replaced with a fake value. Each PII entity is replaced with a fake value.
Each time fake values will be different, as they are generated randomly. Each time fake values will be different, as they are generated randomly.
@@ -261,6 +272,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
text, text,
entities=self.analyzed_fields, entities=self.analyzed_fields,
language=language, language=language,
allow_list=allow_list,
) )
filtered_analyzer_results = ( filtered_analyzer_results = (

View File

@@ -36,6 +36,21 @@ def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:
assert ("John Doe" in anonymized_text) == should_contain assert ("John Doe" in anonymized_text) == should_contain
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@pytest.mark.parametrize(
    "analyzed_fields,should_contain",
    [(["PERSON"], True), (["PHONE_NUMBER"], True), (None, True)],
)
def test_anonymize_allow_list(analyzed_fields: List[str], should_contain: bool) -> None:
    """Test that a value passed in ``allow_list`` is kept in the anonymized text.

    "John Doe" is supplied as the only ``allow_list`` entry, so it is expected
    to remain in the output for every ``analyzed_fields`` configuration, which
    is why ``should_contain`` is ``True`` in all parametrized cases.

    Args:
        analyzed_fields: Entity types handed to ``PresidioAnonymizer``;
            ``None`` is passed through unchanged to the constructor.
        should_contain: Whether "John Doe" must still be present in the
            anonymized text.
    """
    # Imported lazily so collection does not fail when the optional
    # dependencies named in ``pytest.mark.requires`` are not installed.
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    text = "Hello, my name is John Doe."
    anonymizer = PresidioAnonymizer(analyzed_fields=analyzed_fields)
    anonymized_text = anonymizer.anonymize(text, allow_list=["John Doe"])
    # NOTE(review): the ["PHONE_NUMBER"] case presumably keeps the name even
    # without allow_list, so it may not exercise the new parameter - confirm.
    assert ("John Doe" in anonymized_text) == should_contain
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_multiple() -> None: def test_anonymize_multiple() -> None:
"""Test anonymizing multiple items in a sentence""" """Test anonymizing multiple items in a sentence"""

View File

@@ -37,6 +37,21 @@ def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:
assert ("John Doe" in anonymized_text) == should_contain assert ("John Doe" in anonymized_text) == should_contain
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@pytest.mark.parametrize(
    "analyzed_fields,should_contain",
    [(["PERSON"], True), (["PHONE_NUMBER"], True), (None, True)],
)
def test_anonymize_allow_list(analyzed_fields: List[str], should_contain: bool) -> None:
    """Test that a value passed in ``allow_list`` is kept in the anonymized text.

    "John Doe" is supplied as the only ``allow_list`` entry, so it is expected
    to remain in the output for every ``analyzed_fields`` configuration, which
    is why ``should_contain`` is ``True`` in all parametrized cases.

    Args:
        analyzed_fields: Entity types handed to
            ``PresidioReversibleAnonymizer``; ``None`` is passed through
            unchanged to the constructor.
        should_contain: Whether "John Doe" must still be present in the
            anonymized text.
    """
    # Imported lazily so collection does not fail when the optional
    # dependencies named in ``pytest.mark.requires`` are not installed.
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    text = "Hello, my name is John Doe."
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=analyzed_fields)
    anonymized_text = anonymizer.anonymize(text, allow_list=["John Doe"])
    # NOTE(review): the ["PHONE_NUMBER"] case presumably keeps the name even
    # without allow_list, so it may not exercise the new parameter - confirm.
    assert ("John Doe" in anonymized_text) == should_contain
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_multiple() -> None: def test_anonymize_multiple() -> None:
"""Test anonymizing multiple items in a sentence""" """Test anonymizing multiple items in a sentence"""