Multilingual anonymization (#10327)

### Description

Add support for multiple languages to the Anonymizer.

PII detection in Microsoft Presidio relies on several components: in
addition to the usual pattern matching (e.g. using regex), the analyzer
uses a Named Entity Recognition (NER) model to extract entities such
as:
- `PERSON`
- `LOCATION`
- `DATE_TIME`
- `NRP`
- `ORGANIZATION`


[[Source]](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)
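
For illustration, here is a minimal sketch of how the Presidio analyzer surfaces these entities on its own (not part of this PR's diff; it assumes `presidio-analyzer` and the default English spaCy model `en_core_web_lg` are installed):

```python
# Illustrative only: run Presidio's analyzer directly to detect PII entities.
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
results = analyzer.analyze(
    text="John Smith moved to Berlin in May 2021.",
    entities=["PERSON", "LOCATION", "DATE_TIME"],
    language="en",
)
for result in results:
    # Each result carries the entity type, the character span and a confidence score.
    print(result.entity_type, result.start, result.end, result.score)
```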

To handle NER in specific languages, we use dedicated models from the
`spaCy` library, which offers a wide selection of models covering many
languages and sizes. This is not a hard requirement, however: alternative
frameworks such as
[Stanza](https://microsoft.github.io/presidio/analyzer/nlp_engines/spacy_stanza/)
or
[transformers](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/)
can be plugged in when necessary.
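
For example, a minimal usage sketch of the multi-language configuration introduced in this PR (it mirrors the `DEFAULT_LANGUAGES_CONFIG` structure in the diff below; the German model and the sample sentences are only illustrative, and the spaCy models have to be downloaded separately, e.g. `python -m spacy download de_core_news_md`):

```python
# Illustrative usage of the new `languages_config` parameter (see the diff below).
from langchain_experimental.data_anonymizer import PresidioAnonymizer

nlp_config = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "de", "model_name": "de_core_news_md"},
    ],
}

anonymizer = PresidioAnonymizer(languages_config=nlp_config)

# English is listed first, so it is used as the main language when none is given.
print(anonymizer.anonymize("My name is John Smith."))
# For other configured languages, pass the language code explicitly.
print(anonymizer.anonymize("Ich heiße Johann Schmidt.", language="de"))
```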

### Future works

- **automatic language detection** - instead of passing the language as
a parameter to `anonymizer.anonymize`, we could detect the language(s)
beforehand and then use the corresponding NER model. We have discussed
this internally and @mateusz-wosinski-ds will look into a standalone
language detection tool/chain for LangChain 😄 (a rough sketch of the
idea follows below)
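
To make the idea concrete, a rough, hypothetical sketch (not part of this PR) combining the third-party `langdetect` package with the multi-language anonymizer; the helper name `anonymize_autodetect` and the fallback behaviour are assumptions for illustration:

```python
# Hypothetical sketch: detect the language first, then anonymize with the matching model.
from langdetect import detect  # third-party package, not a LangChain dependency

from langchain_experimental.data_anonymizer import PresidioAnonymizer

anonymizer = PresidioAnonymizer(
    languages_config={
        "nlp_engine_name": "spacy",
        "models": [
            {"lang_code": "en", "model_name": "en_core_web_lg"},
            {"lang_code": "es", "model_name": "es_core_news_md"},
        ],
    }
)


def anonymize_autodetect(text: str) -> str:
    """Detect the language of the text, then anonymize it with the matching NER model."""
    language = detect(text)  # returns an ISO 639-1 code such as "en" or "es"
    if language not in anonymizer.supported_languages:
        language = None  # fall back to the main (first) configured language
    return anonymizer.anonymize(text, language=language)
```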

### Twitter handle
@deepsense_ai / @MaksOpp

### Tag maintainer
@baskaryan @hwchase17 @hinthornw
Commit 274c3dc3a8 (parent a9eb7c6cfc), authored by maks-operlejn-ds on 2023-09-07 23:42:24 +02:00, committed by GitHub.
7 changed files with 1053 additions and 475 deletions.


```diff
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import Optional


 class AnonymizerBase(ABC):
@@ -8,12 +9,12 @@ class AnonymizerBase(ABC):
     wrapping the behavior for all methods in a base class.
     """

-    def anonymize(self, text: str) -> str:
+    def anonymize(self, text: str, language: Optional[str] = None) -> str:
         """Anonymize text"""
-        return self._anonymize(text)
+        return self._anonymize(text, language)

     @abstractmethod
-    def _anonymize(self, text: str) -> str:
+    def _anonymize(self, text: str, language: Optional[str]) -> str:
         """Abstract method to anonymize text"""
```


```diff
@@ -27,8 +27,8 @@ def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callab
             fake.random_choices(string.ascii_lowercase + string.digits, length=26)
         ),
         "IP_ADDRESS": lambda _: fake.ipv4_public(),
-        "LOCATION": lambda _: fake.address(),
-        "DATE_TIME": lambda _: fake.iso8601(),
+        "LOCATION": lambda _: fake.city(),
+        "DATE_TIME": lambda _: fake.date(),
         "NRP": lambda _: str(fake.random_number(digits=8, fix_len=True)),
         "MEDICAL_LICENSE": lambda _: fake.bothify(text="??######").upper(),
         "URL": lambda _: fake.url(),
```


```diff
@@ -24,6 +24,8 @@ from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
 try:
     from presidio_analyzer import AnalyzerEngine
+    from presidio_analyzer.nlp_engine import NlpEngineProvider
 except ImportError as e:
     raise ImportError(
         "Could not import presidio_analyzer, please install with "
@@ -44,12 +46,29 @@ if TYPE_CHECKING:
     from presidio_analyzer import EntityRecognizer, RecognizerResult
     from presidio_anonymizer.entities import EngineResult

+# Configuring Anonymizer for multiple languages
+# Detailed description and examples can be found here:
+# langchain/docs/extras/guides/privacy/multi_language_anonymization.ipynb
+DEFAULT_LANGUAGES_CONFIG = {
+    # You can also use Stanza or transformers library.
+    # See https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
+    "nlp_engine_name": "spacy",
+    "models": [
+        {"lang_code": "en", "model_name": "en_core_web_lg"},
+        # {"lang_code": "de", "model_name": "de_core_news_md"},
+        # {"lang_code": "es", "model_name": "es_core_news_md"},
+        # ...
+        # List of available models: https://spacy.io/usage/models
+    ],
+}

 class PresidioAnonymizerBase(AnonymizerBase):
     def __init__(
         self,
         analyzed_fields: Optional[List[str]] = None,
         operators: Optional[Dict[str, OperatorConfig]] = None,
+        languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
         faker_seed: Optional[int] = None,
     ):
         """
@@ -60,6 +79,11 @@ class PresidioAnonymizerBase(AnonymizerBase):
                 Operators allow for custom anonymization of detected PII.
                 Learn more:
                 https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
+            languages_config: Configuration for the NLP engine.
+                First language in the list will be used as the main language
+                in self.anonymize(...) when no language is specified.
+                Learn more:
+                https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
             faker_seed: Seed used to initialize faker.
                 Defaults to None, in which case faker will be seeded randomly
                 and provide random values.
@@ -81,7 +105,15 @@ class PresidioAnonymizerBase(AnonymizerBase):
                 ).items()
             }
         )
-        self._analyzer = AnalyzerEngine()
+        provider = NlpEngineProvider(nlp_configuration=languages_config)
+        nlp_engine = provider.create_engine()
+
+        self.supported_languages = list(nlp_engine.nlp.keys())
+
+        self._analyzer = AnalyzerEngine(
+            supported_languages=self.supported_languages, nlp_engine=nlp_engine
+        )
         self._anonymizer = AnonymizerEngine()

     def add_recognizer(self, recognizer: EntityRecognizer) -> None:
@@ -103,18 +135,31 @@ class PresidioAnonymizerBase(AnonymizerBase):
 class PresidioAnonymizer(PresidioAnonymizerBase):
-    def _anonymize(self, text: str) -> str:
+    def _anonymize(self, text: str, language: Optional[str] = None) -> str:
         """Anonymize text.
         Each PII entity is replaced with a fake value.
         Each time fake values will be different, as they are generated randomly.

         Args:
             text: text to anonymize
+            language: language to use for analysis of PII
+                If None, the first (main) language in the list
+                of languages specified in the configuration will be used.
         """
+        if language is None:
+            language = self.supported_languages[0]
+
+        if language not in self.supported_languages:
+            raise ValueError(
+                f"Language '{language}' is not supported. "
+                f"Supported languages are: {self.supported_languages}. "
+                "Change your language configuration file to add more languages."
+            )
+
         results = self._analyzer.analyze(
             text,
             entities=self.analyzed_fields,
-            language="en",
+            language=language,
         )

         return self._anonymizer.anonymize(
@@ -129,9 +174,10 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
         self,
         analyzed_fields: Optional[List[str]] = None,
         operators: Optional[Dict[str, OperatorConfig]] = None,
+        languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
         faker_seed: Optional[int] = None,
     ):
-        super().__init__(analyzed_fields, operators, faker_seed)
+        super().__init__(analyzed_fields, operators, languages_config, faker_seed)
         self._deanonymizer_mapping = DeanonymizerMapping()

     @property
@@ -191,7 +237,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
         self._deanonymizer_mapping.update(new_deanonymizer_mapping)

-    def _anonymize(self, text: str) -> str:
+    def _anonymize(self, text: str, language: Optional[str] = None) -> str:
         """Anonymize text.
         Each PII entity is replaced with a fake value.
         Each time fake values will be different, as they are generated randomly.
@@ -200,11 +246,24 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
         Args:
             text: text to anonymize
+            language: language to use for analysis of PII
+                If None, the first (main) language in the list
+                of languages specified in the configuration will be used.
         """
+        if language is None:
+            language = self.supported_languages[0]
+
+        if language not in self.supported_languages:
+            raise ValueError(
+                f"Language '{language}' is not supported. "
+                f"Supported languages are: {self.supported_languages}. "
+                "Change your language configuration file to add more languages."
+            )
+
         analyzer_results = self._analyzer.analyze(
             text,
             entities=self.analyzed_fields,
-            language="en",
+            language=language,
         )

         filtered_analyzer_results = (
```