Better deanonymizer matching strategy (#11557)

@baskaryan, @hwchase17
This commit is contained in:
maks-operlejn-ds
2023-10-09 20:10:29 +02:00
committed by GitHub
parent a992b9670d
commit 4d62def9ff
5 changed files with 893 additions and 510 deletions

View File

@@ -126,3 +126,76 @@ def test_non_faker_values() -> None:
anonymizer = PresidioAnonymizer(add_default_faker_operators=False)
anonymized_text = anonymizer.anonymize(text)
assert anonymized_text == expected_result
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_exact_matching_strategy() -> None:
    """
    Verify that exact matching restores every mapped value.

    Each placeholder from the mapping appears verbatim in the input text,
    so after deanonymization every corresponding real value is expected to
    be present in the output.
    """
    from langchain_experimental.data_anonymizer import (
        deanonymizer_matching_strategies as dms,
    )

    # Entity type -> {placeholder found in the text: real value to restore}.
    mapping = {
        "PERSON": {"Maria Lynch": "Slim Shady"},
        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
    }

    # Collect the real values up front, in mapping order, so the checks
    # below do not depend on anything the strategy does to the mapping.
    expected_values = [
        real_value
        for replacements in mapping.values()
        for real_value in replacements.values()
    ]

    anonymized_text = (
        "Are you Maria Lynch? I found your card with number 213186379402654. "
        "Is this your phone number: 7344131647? "
        "Is this your email address: wdavis@example.net"
    )

    restored_text = dms.exact_matching_strategy(anonymized_text, mapping)

    for real_value in expected_values:
        assert real_value in restored_text
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_best_matching_strategy() -> None:
    """
    Test combined exact and fuzzy matching strategy for deanonymization.

    Most placeholders in the input text are deliberately altered (extra
    middle initial, different digit grouping) so they no longer match the
    mapping keys verbatim; one is left untouched to cover the exact-match
    path. Every real value from the mapping must still appear in the
    deanonymized output.
    """
    from langchain_experimental.data_anonymizer import (
        deanonymizer_matching_strategies as dms,
    )

    # Entity type -> {placeholder: real value to restore}.
    deanonymizer_mapping = {
        "PERSON": {"Maria Lynch": "Slim Shady"},
        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
    }

    # Placeholders as they appear in the text below, compared to the mapping:
    # - "Maria Lynch" -> "Maria K. Lynch"
    # - "7344131647" -> "734-413-1647"
    # - "213186379402654" -> "2131 8637 9402 654"
    # - "wdavis@example.net" -> unchanged, to test the exact match
    text = (
        "Are you Maria K. Lynch? I found your card with number 2131 8637 9402 654. "
        # Trailing space keeps the two sentences separated once the adjacent
        # literals are concatenated.
        "Is this your phone number: 734-413-1647? "
        "Is this your email address: wdavis@example.net"
    )

    deanonymized_text = dms.combined_exact_fuzzy_matching_strategy(
        text, deanonymizer_mapping
    )

    for original_value in [
        "Slim Shady",
        "313-666-7440",
        "real.slim.shady@gmail.com",
        "4916 0387 9536 0861",
    ]:
        assert original_value in deanonymized_text