mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-08 04:25:46 +00:00
parent
a992b9670d
commit
4d62def9ff
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,12 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
|
||||
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
||||
exact_matching_strategy,
|
||||
)
|
||||
|
||||
DEFAULT_DEANONYMIZER_MATCHING_STRATEGY = exact_matching_strategy
|
||||
|
||||
|
||||
class AnonymizerBase(ABC):
|
||||
@ -23,10 +30,20 @@ class ReversibleAnonymizerBase(AnonymizerBase):
|
||||
Base abstract class for reversible anonymizers.
|
||||
"""
|
||||
|
||||
def deanonymize(self, text: str) -> str:
|
||||
def deanonymize(
|
||||
self,
|
||||
text_to_deanonymize: str,
|
||||
deanonymizer_matching_strategy: Callable[
|
||||
[str, MappingDataType], str
|
||||
] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
|
||||
) -> str:
|
||||
"""Deanonymize text"""
|
||||
return self._deanonymize(text)
|
||||
return self._deanonymize(text_to_deanonymize, deanonymizer_matching_strategy)
|
||||
|
||||
@abstractmethod
|
||||
def _deanonymize(self, text: str) -> str:
|
||||
def _deanonymize(
|
||||
self,
|
||||
text_to_deanonymize: str,
|
||||
deanonymizer_matching_strategy: Callable[[str, MappingDataType], str],
|
||||
) -> str:
|
||||
"""Abstract method to deanonymize text"""
|
||||
|
@ -1,9 +1,12 @@
|
||||
from langchain_experimental.data_anonymizer.presidio import MappingDataType
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
|
||||
|
||||
|
||||
def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
|
||||
def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
|
||||
"""
|
||||
Default matching strategy for deanonymization.
|
||||
Exact matching strategy for deanonymization.
|
||||
It replaces all the anonymized entities with the original ones.
|
||||
|
||||
Args:
|
||||
@ -15,3 +18,168 @@ def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType)
|
||||
for anonymized, original in deanonymizer_mapping[entity_type].items():
|
||||
text = text.replace(anonymized, original)
|
||||
return text
|
||||
|
||||
|
||||
def case_insensitive_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType
) -> str:
    """
    Case insensitive matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones
        irrespective of their letter case.

    Anonymized values are matched literally: characters that are special in
    regular expressions (e.g. ``.``, ``+``, ``(``) have no special meaning, and
    the original values are inserted verbatim (backslashes are not treated as
    escape sequences).

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones

    Returns:
        The text with every case-insensitive occurrence of an anonymized value
        replaced by its original value.

    Examples of matching:
        keanu reeves -> Keanu Reeves
        JOHN F. KENNEDY -> John F. Kennedy
    """

    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            # Escape the anonymized value so it is matched as plain text rather
            # than interpreted as a regular expression.
            pattern = re.escape(anonymized)
            # A callable replacement is inserted as-is, whereas a replacement
            # *string* would have its backslash escapes / group references
            # processed by `re.sub`.
            text = re.sub(
                pattern,
                lambda _match, replacement=original: replacement,
                text,
                flags=re.IGNORECASE,
            )
    return text
|
||||
|
||||
|
||||
def fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """
    Fuzzy matching strategy for deanonymization.
    It uses fuzzy matching to find the position of the anonymized entity in the text.
    It replaces all the anonymized entities with the original ones.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
            text segment to consider it a match

    Returns:
        The text with every fuzzy match of an anonymized value replaced by its
        original value.

    Raises:
        ImportError: if the optional `fuzzysearch` package is not installed.

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """

    try:
        from fuzzysearch import find_near_matches
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzysearch, please install with "
            "`pip install fuzzysearch`."
        ) from e

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            matches = find_near_matches(anonymized, text, max_l_dist=max_l_dist)
            if not matches:
                # Nothing to replace for this entity; keep the text untouched
                continue

            # Collect the pieces and join once instead of growing a string
            # with repeated `+=`.
            pieces = []
            last_end = 0
            for m in matches:
                # keep the text that isn't part of a match
                pieces.append(text[last_end : m.start])
                # substitute the matched segment with the original value
                pieces.append(original)
                last_end = m.end
            # keep the remaining text after the last match
            pieces.append(text[last_end:])
            # Later entities are searched in the already-updated text
            text = "".join(pieces)

    return text
|
||||
|
||||
|
||||
def combined_exact_fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """
    RECOMMENDED STRATEGY.
    Combined exact and fuzzy matching strategy for deanonymization.

    The exact matching strategy is applied first; the fuzzy matching strategy
    is then applied to its output.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
            text segment to consider it a match (forwarded to the fuzzy pass)

    Returns:
        The text after both replacement passes.

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    # First pass: literal replacements
    after_exact_pass = exact_matching_strategy(text, deanonymizer_mapping)
    # Second pass: fuzzy replacements on whatever the exact pass left behind
    return fuzzy_matching_strategy(after_exact_pass, deanonymizer_mapping, max_l_dist)
|
||||
|
||||
|
||||
def ngram_fuzzy_matching_strategy(
    text: str,
    deanonymizer_mapping: MappingDataType,
    fuzzy_threshold: int = 85,
    use_variable_length: bool = True,
) -> str:
    """
    N-gram fuzzy matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones.
    It uses fuzzy matching to find the position of the anonymized entity in the text.
    It generates n-grams of the same length as the anonymized entity from the text and
    uses fuzzy matching to find the position of the anonymized entity in the text.

    The text is split on whitespace and re-joined with single spaces, so the
    original spacing of the input is not preserved.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        fuzzy_threshold: fuzzy matching threshold; an n-gram matches when its
            `fuzz.ratio` score is strictly greater than this value
        use_variable_length: whether to use (n-1, n, n+1)-grams or just n-grams

    Returns:
        The text with every matched n-gram replaced by the original value.

    Raises:
        ImportError: if the optional `fuzzywuzzy` package is not installed.
    """

    def generate_ngrams(words_list: List[str], n: int) -> list:
        """Generate n-grams from a list of words"""
        return [
            " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))
        ]

    try:
        from fuzzywuzzy import fuzz
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzywuzzy, please install with "
            "`pip install fuzzywuzzy`."
        ) from e

    text_words = text.split()
    replacements = []
    # claimed[k] is True once word k belongs to an accepted match
    claimed: List[bool] = [False] * len(text_words)

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            anonymized_lower = anonymized.lower()
            word_count = len(anonymized.split())

            if use_variable_length:
                gram_lengths = [word_count - 1, word_count, word_count + 1]
            else:
                gram_lengths = [word_count]

            for n in gram_lengths:
                if n <= 0:  # Take only positive values
                    continue
                for i, segment in enumerate(generate_ngrams(text_words, n)):
                    # Reject the n-gram if ANY of its words is already part of
                    # an accepted match. Checking only the start index would
                    # let overlapping replacements through, and applying those
                    # corrupts the neighbouring words.
                    if any(claimed[i : i + n]):
                        continue
                    if fuzz.ratio(anonymized_lower, segment.lower()) > fuzzy_threshold:
                        replacements.append((i, n, original))
                        # Mark every word of the matched segment as claimed
                        claimed[i : i + n] = [True] * n

    # Sort replacements by index in reverse order
    replacements.sort(key=lambda x: x[0], reverse=True)

    # Apply replacements in reverse order to not affect subsequent indices
    for start, length, replacement in replacements:
        text_words[start : start + length] = replacement.split()

    return " ".join(text_words)
|
||||
|
@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
|
||||
import yaml
|
||||
|
||||
from langchain_experimental.data_anonymizer.base import (
|
||||
DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
|
||||
AnonymizerBase,
|
||||
ReversibleAnonymizerBase,
|
||||
)
|
||||
@ -16,7 +17,7 @@ from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
|
||||
create_anonymizer_mapping,
|
||||
)
|
||||
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
||||
default_matching_strategy,
|
||||
exact_matching_strategy,
|
||||
)
|
||||
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
|
||||
get_pseudoanonymizer_mapping,
|
||||
@ -190,7 +191,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
||||
filtered_analyzer_results,
|
||||
anonymizer_results,
|
||||
)
|
||||
return default_matching_strategy(text, anonymizer_mapping)
|
||||
return exact_matching_strategy(text, anonymizer_mapping)
|
||||
|
||||
|
||||
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
|
||||
@ -282,14 +283,14 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
||||
)
|
||||
self._deanonymizer_mapping.update(new_deanonymizer_mapping)
|
||||
|
||||
return default_matching_strategy(text, self.anonymizer_mapping)
|
||||
return exact_matching_strategy(text, self.anonymizer_mapping)
|
||||
|
||||
def _deanonymize(
|
||||
self,
|
||||
text_to_deanonymize: str,
|
||||
deanonymizer_matching_strategy: Callable[
|
||||
[str, MappingDataType], str
|
||||
] = default_matching_strategy,
|
||||
] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
|
||||
) -> str:
|
||||
"""Deanonymize text.
|
||||
Each anonymized entity is replaced with its original value.
|
||||
|
@ -126,3 +126,76 @@ def test_non_faker_values() -> None:
|
||||
anonymizer = PresidioAnonymizer(add_default_faker_operators=False)
|
||||
anonymized_text = anonymizer.anonymize(text)
|
||||
assert anonymized_text == expected_result
|
||||
|
||||
|
||||
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_exact_matching_strategy() -> None:
    """
    Check that the exact matching strategy restores every original value.
    """
    from langchain_experimental.data_anonymizer import (
        deanonymizer_matching_strategies as dms,
    )

    # Anonymized value -> original value, grouped by entity type
    mapping = {
        "PERSON": {"Maria Lynch": "Slim Shady"},
        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
    }

    # Every anonymized value appears verbatim in the input
    anonymized_text = (
        "Are you Maria Lynch? I found your card with number 213186379402654. "
        "Is this your phone number: 7344131647? "
        "Is this your email address: wdavis@example.net"
    )

    restored_text = dms.exact_matching_strategy(anonymized_text, mapping)

    expected_originals = (
        "Slim Shady",
        "313-666-7440",
        "real.slim.shady@gmail.com",
        "4916 0387 9536 0861",
    )
    for expected in expected_originals:
        assert expected in restored_text
|
||||
|
||||
|
||||
# `fuzzysearch` is listed because the combined strategy runs the fuzzy matching
# pass, which imports it at call time.
@pytest.mark.requires(
    "presidio_analyzer", "presidio_anonymizer", "faker", "fuzzysearch"
)
def test_best_matching_strategy() -> None:
    """
    Test combined exact and fuzzy matching strategy for deanonymization.
    """
    from langchain_experimental.data_anonymizer import (
        deanonymizer_matching_strategies as dms,
    )

    deanonymizer_mapping = {
        "PERSON": {"Maria Lynch": "Slim Shady"},
        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
    }

    # Changed some values:
    # - "Maria Lynch" -> "Maria K. Lynch"
    # - "7344131647" -> "734-413-1647"
    # - "213186379402654" -> "2131 8637 9402 654"
    # - "wdavis@example.net" -> the same to test exact match
    text = (
        "Are you Maria K. Lynch? I found your card with number 2131 8637 9402 654. "
        # trailing space keeps this sentence from being glued to the next one
        "Is this your phone number: 734-413-1647? "
        "Is this your email address: wdavis@example.net"
    )

    deanonymized_text = dms.combined_exact_fuzzy_matching_strategy(
        text, deanonymizer_mapping
    )

    for original_value in [
        "Slim Shady",
        "313-666-7440",
        "real.slim.shady@gmail.com",
        "4916 0387 9536 0861",
    ]:
        assert original_value in deanonymized_text
|
||||
|
Loading…
Reference in New Issue
Block a user