Mirror of https://github.com/hwchase17/langchain.git (synced 2025-09-01 02:50:47 +00:00)
langchain-experimental: Add allow_list support in experimental/data_anonymizer (#11597)
- **Description:** Add `allow_list` support to the langchain-experimental data-anonymizer package
- **Issue:** none
- **Dependencies:** none
- **Tag maintainer:** @hwchase17
- **Twitter handle:** (not provided)
This commit is contained in:
committed by GitHub
parent 2363c02cf3
commit 70f7558db2
@@ -1,5 +1,5 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Callable, Optional
|
from typing import Callable, List, Optional
|
||||||
|
|
||||||
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
|
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
|
||||||
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
||||||
@@ -16,12 +16,19 @@ class AnonymizerBase(ABC):
|
|||||||
wrapping the behavior for all methods in a base class.
|
wrapping the behavior for all methods in a base class.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def anonymize(self, text: str, language: Optional[str] = None) -> str:
|
def anonymize(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
language: Optional[str] = None,
|
||||||
|
allow_list: Optional[List[str]] = None,
|
||||||
|
) -> str:
|
||||||
"""Anonymize text"""
|
"""Anonymize text"""
|
||||||
return self._anonymize(text, language)
|
return self._anonymize(text, language, allow_list)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _anonymize(self, text: str, language: Optional[str]) -> str:
|
def _anonymize(
|
||||||
|
self, text: str, language: Optional[str], allow_list: Optional[List[str]] = None
|
||||||
|
) -> str:
|
||||||
"""Abstract method to anonymize text"""
|
"""Abstract method to anonymize text"""
|
||||||
|
|
||||||
|
|
||||||
|
@@ -139,7 +139,12 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
|||||||
|
|
||||||
|
|
||||||
class PresidioAnonymizer(PresidioAnonymizerBase):
|
class PresidioAnonymizer(PresidioAnonymizerBase):
|
||||||
def _anonymize(self, text: str, language: Optional[str] = None) -> str:
|
def _anonymize(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
language: Optional[str] = None,
|
||||||
|
allow_list: Optional[List[str]] = None,
|
||||||
|
) -> str:
|
||||||
"""Anonymize text.
|
"""Anonymize text.
|
||||||
Each PII entity is replaced with a fake value.
|
Each PII entity is replaced with a fake value.
|
||||||
Each time fake values will be different, as they are generated randomly.
|
Each time fake values will be different, as they are generated randomly.
|
||||||
@@ -172,6 +177,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
|||||||
text,
|
text,
|
||||||
entities=self.analyzed_fields,
|
entities=self.analyzed_fields,
|
||||||
language=language,
|
language=language,
|
||||||
|
allow_list=allow_list,
|
||||||
)
|
)
|
||||||
|
|
||||||
filtered_analyzer_results = (
|
filtered_analyzer_results = (
|
||||||
@@ -226,7 +232,12 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
|||||||
for key, inner_dict in self.deanonymizer_mapping.items()
|
for key, inner_dict in self.deanonymizer_mapping.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
def _anonymize(self, text: str, language: Optional[str] = None) -> str:
|
def _anonymize(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
language: Optional[str] = None,
|
||||||
|
allow_list: Optional[List[str]] = None,
|
||||||
|
) -> str:
|
||||||
"""Anonymize text.
|
"""Anonymize text.
|
||||||
Each PII entity is replaced with a fake value.
|
Each PII entity is replaced with a fake value.
|
||||||
Each time fake values will be different, as they are generated randomly.
|
Each time fake values will be different, as they are generated randomly.
|
||||||
@@ -261,6 +272,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
|||||||
text,
|
text,
|
||||||
entities=self.analyzed_fields,
|
entities=self.analyzed_fields,
|
||||||
language=language,
|
language=language,
|
||||||
|
allow_list=allow_list,
|
||||||
)
|
)
|
||||||
|
|
||||||
filtered_analyzer_results = (
|
filtered_analyzer_results = (
|
||||||
|
@@ -36,6 +36,21 @@ def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:
|
|||||||
assert ("John Doe" in anonymized_text) == should_contain
|
assert ("John Doe" in anonymized_text) == should_contain
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"analyzed_fields,should_contain",
|
||||||
|
[(["PERSON"], True), (["PHONE_NUMBER"], True), (None, True)],
|
||||||
|
)
|
||||||
|
def test_anonymize_allow_list(analyzed_fields: List[str], should_contain: bool) -> None:
|
||||||
|
"""Test anonymizing a name in a simple sentence"""
|
||||||
|
from langchain_experimental.data_anonymizer import PresidioAnonymizer
|
||||||
|
|
||||||
|
text = "Hello, my name is John Doe."
|
||||||
|
anonymizer = PresidioAnonymizer(analyzed_fields=analyzed_fields)
|
||||||
|
anonymized_text = anonymizer.anonymize(text, allow_list=["John Doe"])
|
||||||
|
assert ("John Doe" in anonymized_text) == should_contain
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
|
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
|
||||||
def test_anonymize_multiple() -> None:
|
def test_anonymize_multiple() -> None:
|
||||||
"""Test anonymizing multiple items in a sentence"""
|
"""Test anonymizing multiple items in a sentence"""
|
||||||
|
@@ -37,6 +37,21 @@ def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:
|
|||||||
assert ("John Doe" in anonymized_text) == should_contain
|
assert ("John Doe" in anonymized_text) == should_contain
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"analyzed_fields,should_contain",
|
||||||
|
[(["PERSON"], True), (["PHONE_NUMBER"], True), (None, True)],
|
||||||
|
)
|
||||||
|
def test_anonymize_allow_list(analyzed_fields: List[str], should_contain: bool) -> None:
|
||||||
|
"""Test anonymizing a name in a simple sentence"""
|
||||||
|
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
|
||||||
|
|
||||||
|
text = "Hello, my name is John Doe."
|
||||||
|
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=analyzed_fields)
|
||||||
|
anonymized_text = anonymizer.anonymize(text, allow_list=["John Doe"])
|
||||||
|
assert ("John Doe" in anonymized_text) == should_contain
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
|
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
|
||||||
def test_anonymize_multiple() -> None:
|
def test_anonymize_multiple() -> None:
|
||||||
"""Test anonymizing multiple items in a sentence"""
|
"""Test anonymizing multiple items in a sentence"""
|
||||||
|
Reference in New Issue
Block a user