diff --git a/libs/experimental/langchain_experimental/data_anonymizer/base.py b/libs/experimental/langchain_experimental/data_anonymizer/base.py index 6092ec8fb74..b4f75b42563 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/base.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Callable, Optional +from typing import Callable, List, Optional from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import ( @@ -16,12 +16,19 @@ class AnonymizerBase(ABC): wrapping the behavior for all methods in a base class. """ - def anonymize(self, text: str, language: Optional[str] = None) -> str: + def anonymize( + self, + text: str, + language: Optional[str] = None, + allow_list: Optional[List[str]] = None, + ) -> str: """Anonymize text""" - return self._anonymize(text, language) + return self._anonymize(text, language, allow_list) @abstractmethod - def _anonymize(self, text: str, language: Optional[str]) -> str: + def _anonymize( + self, text: str, language: Optional[str], allow_list: Optional[List[str]] = None + ) -> str: """Abstract method to anonymize text""" diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index b10841b4d81..7b39b8435da 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -139,7 +139,12 @@ class PresidioAnonymizerBase(AnonymizerBase): class PresidioAnonymizer(PresidioAnonymizerBase): - def _anonymize(self, text: str, language: Optional[str] = None) -> str: + def _anonymize( + self, + text: str, + language: Optional[str] = None, + allow_list: Optional[List[str]] = None, + ) -> str: """Anonymize text. 
Each PII entity is replaced with a fake value. Each time fake values will be different, as they are generated randomly. @@ -172,6 +177,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase): text, entities=self.analyzed_fields, language=language, + allow_list=allow_list, ) filtered_analyzer_results = ( @@ -226,7 +232,12 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB for key, inner_dict in self.deanonymizer_mapping.items() } - def _anonymize(self, text: str, language: Optional[str] = None) -> str: + def _anonymize( + self, + text: str, + language: Optional[str] = None, + allow_list: Optional[List[str]] = None, + ) -> str: """Anonymize text. Each PII entity is replaced with a fake value. Each time fake values will be different, as they are generated randomly. @@ -261,6 +272,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB text, entities=self.analyzed_fields, language=language, + allow_list=allow_list, ) filtered_analyzer_results = ( diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py index c600dae923e..fa7e7d23aab 100644 --- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -36,6 +36,21 @@ def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None: assert ("John Doe" in anonymized_text) == should_contain +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +@pytest.mark.parametrize( + "analyzed_fields,should_contain", + [(["PERSON"], True), (["PHONE_NUMBER"], True), (None, True)], +) +def test_anonymize_allow_list(analyzed_fields: List[str], should_contain: bool) -> None: + """Test that an allow-listed name is not anonymized""" + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + text = "Hello, my name is John Doe."
+ anonymizer = PresidioAnonymizer(analyzed_fields=analyzed_fields) + anonymized_text = anonymizer.anonymize(text, allow_list=["John Doe"]) + assert ("John Doe" in anonymized_text) == should_contain + + @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") def test_anonymize_multiple() -> None: """Test anonymizing multiple items in a sentence""" diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py index 0a30afa054b..3fd1ae35d2a 100644 --- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -37,6 +37,21 @@ def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None: assert ("John Doe" in anonymized_text) == should_contain +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +@pytest.mark.parametrize( + "analyzed_fields,should_contain", + [(["PERSON"], True), (["PHONE_NUMBER"], True), (None, True)], +) +def test_anonymize_allow_list(analyzed_fields: List[str], should_contain: bool) -> None: + """Test that an allow-listed name is not anonymized""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = "Hello, my name is John Doe." + anonymizer = PresidioReversibleAnonymizer(analyzed_fields=analyzed_fields) + anonymized_text = anonymizer.anonymize(text, allow_list=["John Doe"]) + assert ("John Doe" in anonymized_text) == should_contain + + @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") def test_anonymize_multiple() -> None: """Test anonymizing multiple items in a sentence"""