mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-11 13:55:03 +00:00
parent
a992b9670d
commit
4d62def9ff
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,12 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
|
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
|
||||||
|
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
||||||
|
exact_matching_strategy,
|
||||||
|
)
|
||||||
|
|
||||||
|
DEFAULT_DEANONYMIZER_MATCHING_STRATEGY = exact_matching_strategy
|
||||||
|
|
||||||
|
|
||||||
class AnonymizerBase(ABC):
|
class AnonymizerBase(ABC):
|
||||||
@ -23,10 +30,20 @@ class ReversibleAnonymizerBase(AnonymizerBase):
|
|||||||
Base abstract class for reversible anonymizers.
|
Base abstract class for reversible anonymizers.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def deanonymize(self, text: str) -> str:
|
def deanonymize(
|
||||||
|
self,
|
||||||
|
text_to_deanonymize: str,
|
||||||
|
deanonymizer_matching_strategy: Callable[
|
||||||
|
[str, MappingDataType], str
|
||||||
|
] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
|
||||||
|
) -> str:
|
||||||
"""Deanonymize text"""
|
"""Deanonymize text"""
|
||||||
return self._deanonymize(text)
|
return self._deanonymize(text_to_deanonymize, deanonymizer_matching_strategy)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _deanonymize(self, text: str) -> str:
|
def _deanonymize(
|
||||||
|
self,
|
||||||
|
text_to_deanonymize: str,
|
||||||
|
deanonymizer_matching_strategy: Callable[[str, MappingDataType], str],
|
||||||
|
) -> str:
|
||||||
"""Abstract method to deanonymize text"""
|
"""Abstract method to deanonymize text"""
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
from langchain_experimental.data_anonymizer.presidio import MappingDataType
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
|
||||||
|
|
||||||
|
|
||||||
def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
|
def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
|
||||||
"""
|
"""
|
||||||
Default matching strategy for deanonymization.
|
Exact matching strategy for deanonymization.
|
||||||
It replaces all the anonymized entities with the original ones.
|
It replaces all the anonymized entities with the original ones.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -15,3 +18,168 @@ def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType)
|
|||||||
for anonymized, original in deanonymizer_mapping[entity_type].items():
|
for anonymized, original in deanonymizer_mapping[entity_type].items():
|
||||||
text = text.replace(anonymized, original)
|
text = text.replace(anonymized, original)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def case_insensitive_matching_strategy(
    text: str, deanonymizer_mapping: "MappingDataType"
) -> str:
    """
    Case insensitive matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones
    irrespective of their letter case.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones

    Examples of matching:
        keanu reeves -> Keanu Reeves
        JOHN F. KENNEDY -> John F. Kennedy
    """
    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            # The anonymized value is literal text, not a regex pattern:
            # values such as phone numbers or emails may contain regex
            # metacharacters (`+`, `(`, `.`), so escape before matching.
            # Replace through a callable so any backslashes in `original`
            # are inserted literally instead of being read as group refs.
            text = re.sub(
                re.escape(anonymized),
                lambda _match: original,  # bound per-iteration, called immediately
                text,
                flags=re.IGNORECASE,
            )
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: "MappingDataType", max_l_dist: int = 3
) -> str:
    """
    Fuzzy matching strategy for deanonymization.
    It uses fuzzy matching to find the position of the anonymized entity in the text.
    It replaces all the anonymized entities with the original ones.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
            text segment to consider it a match

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    try:
        from fuzzysearch import find_near_matches
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzysearch, please install with "
            "`pip install fuzzysearch`."
        ) from e

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            matches = find_near_matches(anonymized, text, max_l_dist=max_l_dist)
            # Rebuild the text by splicing the original value over every
            # matched span while keeping the unmatched stretches in between.
            pieces = []
            cursor = 0
            for match in matches:
                pieces.append(text[cursor : match.start])
                pieces.append(original)
                cursor = match.end
            pieces.append(text[cursor:])
            text = "".join(pieces)

    return text
|
||||||
|
|
||||||
|
|
||||||
|
def combined_exact_fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: "MappingDataType", max_l_dist: int = 3
) -> str:
    """
    RECOMMENDED STRATEGY.
    Combined exact and fuzzy matching strategy for deanonymization.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
            text segment to consider it a match

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    # Cheap exact pass first, then a fuzzy pass over whatever it missed.
    exact_pass = exact_matching_strategy(text, deanonymizer_mapping)
    return fuzzy_matching_strategy(exact_pass, deanonymizer_mapping, max_l_dist)
|
||||||
|
|
||||||
|
|
||||||
|
def ngram_fuzzy_matching_strategy(
    text: str,
    deanonymizer_mapping: "MappingDataType",
    fuzzy_threshold: int = 85,
    use_variable_length: bool = True,
) -> str:
    """
    N-gram fuzzy matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones.
    It generates n-grams of (roughly) the same word length as the anonymized
    entity from the text and uses fuzzy matching to locate the entity.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        fuzzy_threshold: fuzzy matching threshold
        use_variable_length: whether to use (n-1, n, n+1)-grams or just n-grams
    """

    def generate_ngrams(words_list: List[str], n: int) -> list:
        """Generate n-grams from a list of words"""
        return [
            " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))
        ]

    try:
        from fuzzywuzzy import fuzz
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzywuzzy, please install with "
            "`pip install fuzzywuzzy`."
        ) from e

    words = text.split()
    planned: list = []  # (start word index, n-gram length, original value)
    claimed: set = set()  # word indices already covered by an earlier match

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            entity_len = len(anonymized.split())

            if use_variable_length:
                candidate_lengths = [entity_len - 1, entity_len, entity_len + 1]
            else:
                candidate_lengths = [entity_len]

            for n in candidate_lengths:
                if n <= 0:  # skip non-positive gram lengths
                    continue
                for start, segment in enumerate(generate_ngrams(words, n)):
                    if start in claimed:
                        continue
                    if fuzz.ratio(anonymized.lower(), segment.lower()) > fuzzy_threshold:
                        planned.append((start, n, original))
                        # Claim the matched span so later candidates can't reuse it
                        claimed.update(range(start, start + n))

    # Apply from the end of the text backwards so splicing a replacement
    # never shifts the indices of replacements still to be applied.
    for start, length, replacement in sorted(
        planned, key=lambda item: item[0], reverse=True
    ):
        words[start : start + length] = replacement.split()

    return " ".join(words)
|
||||||
|
@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
|
|||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from langchain_experimental.data_anonymizer.base import (
|
from langchain_experimental.data_anonymizer.base import (
|
||||||
|
DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
|
||||||
AnonymizerBase,
|
AnonymizerBase,
|
||||||
ReversibleAnonymizerBase,
|
ReversibleAnonymizerBase,
|
||||||
)
|
)
|
||||||
@ -16,7 +17,7 @@ from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
|
|||||||
create_anonymizer_mapping,
|
create_anonymizer_mapping,
|
||||||
)
|
)
|
||||||
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
||||||
default_matching_strategy,
|
exact_matching_strategy,
|
||||||
)
|
)
|
||||||
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
|
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
|
||||||
get_pseudoanonymizer_mapping,
|
get_pseudoanonymizer_mapping,
|
||||||
@ -190,7 +191,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
|||||||
filtered_analyzer_results,
|
filtered_analyzer_results,
|
||||||
anonymizer_results,
|
anonymizer_results,
|
||||||
)
|
)
|
||||||
return default_matching_strategy(text, anonymizer_mapping)
|
return exact_matching_strategy(text, anonymizer_mapping)
|
||||||
|
|
||||||
|
|
||||||
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
|
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
|
||||||
@ -282,14 +283,14 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
|||||||
)
|
)
|
||||||
self._deanonymizer_mapping.update(new_deanonymizer_mapping)
|
self._deanonymizer_mapping.update(new_deanonymizer_mapping)
|
||||||
|
|
||||||
return default_matching_strategy(text, self.anonymizer_mapping)
|
return exact_matching_strategy(text, self.anonymizer_mapping)
|
||||||
|
|
||||||
def _deanonymize(
|
def _deanonymize(
|
||||||
self,
|
self,
|
||||||
text_to_deanonymize: str,
|
text_to_deanonymize: str,
|
||||||
deanonymizer_matching_strategy: Callable[
|
deanonymizer_matching_strategy: Callable[
|
||||||
[str, MappingDataType], str
|
[str, MappingDataType], str
|
||||||
] = default_matching_strategy,
|
] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Deanonymize text.
|
"""Deanonymize text.
|
||||||
Each anonymized entity is replaced with its original value.
|
Each anonymized entity is replaced with its original value.
|
||||||
|
@ -126,3 +126,76 @@ def test_non_faker_values() -> None:
|
|||||||
anonymizer = PresidioAnonymizer(add_default_faker_operators=False)
|
anonymizer = PresidioAnonymizer(add_default_faker_operators=False)
|
||||||
anonymized_text = anonymizer.anonymize(text)
|
anonymized_text = anonymizer.anonymize(text)
|
||||||
assert anonymized_text == expected_result
|
assert anonymized_text == expected_result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_exact_matching_strategy() -> None:
    """
    Test exact matching strategy for deanonymization.
    """
    from langchain_experimental.data_anonymizer import (
        deanonymizer_matching_strategies as dms,
    )

    deanonymizer_mapping = {
        "PERSON": {"Maria Lynch": "Slim Shady"},
        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
    }

    text = (
        "Are you Maria Lynch? I found your card with number 213186379402654. "
        "Is this your phone number: 7344131647? "
        "Is this your email address: wdavis@example.net"
    )

    deanonymized_text = dms.exact_matching_strategy(text, deanonymizer_mapping)

    # Every original value from the mapping must reappear in the output.
    expected_values = (
        "Slim Shady",
        "313-666-7440",
        "real.slim.shady@gmail.com",
        "4916 0387 9536 0861",
    )
    for expected in expected_values:
        assert expected in deanonymized_text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_best_matching_strategy() -> None:
    """
    Test combined exact-fuzzy matching strategy for deanonymization.
    """
    # NOTE: the original docstring said "exact matching strategy", but this
    # test exercises combined_exact_fuzzy_matching_strategy.
    from langchain_experimental.data_anonymizer import (
        deanonymizer_matching_strategies as dms,
    )

    deanonymizer_mapping = {
        "PERSON": {"Maria Lynch": "Slim Shady"},
        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
    }

    # Changed some values:
    # - "Maria Lynch" -> "Maria K. Lynch"
    # - "7344131647" -> "734-413-1647"
    # - "213186379402654" -> "2131 8637 9402 654"
    # - "wdavis@example.net" -> the same to test exact match
    text = (
        "Are you Maria K. Lynch? I found your card with number 2131 8637 9402 654. "
        "Is this your phone number: 734-413-1647?"
        "Is this your email address: wdavis@example.net"
    )

    deanonymized_text = dms.combined_exact_fuzzy_matching_strategy(
        text, deanonymizer_mapping
    )

    # Each original value must be recovered despite the perturbed input.
    for original_value in [
        "Slim Shady",
        "313-666-7440",
        "real.slim.shady@gmail.com",
        "4916 0387 9536 0861",
    ]:
        assert original_value in deanonymized_text
|
||||||
|
Loading…
Reference in New Issue
Block a user