Multilingual anonymization (#10327)

### Description

Add support for multiple languages to the Anonymizer.

PII detection in Microsoft Presidio relies on several components: in
addition to the usual pattern matching (e.g. using regex), the analyzer
uses a Named Entity Recognition (NER) model to extract entities such
as:
- `PERSON`
- `LOCATION`
- `DATE_TIME`
- `NRP`
- `ORGANIZATION`


[[Source]](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)
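
For illustration, here is a minimal sketch of how the Presidio analyzer surfaces these entities on its own (not part of this PR's diff; it assumes `presidio-analyzer` and the default English spaCy model `en_core_web_lg` are installed):

```python
# Illustrative only: run Presidio's analyzer directly to detect PII entities.
from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
results = analyzer.analyze(
    text="John Smith moved to Berlin in May 2021.",
    entities=["PERSON", "LOCATION", "DATE_TIME"],
    language="en",
)
for result in results:
    # Each result carries the entity type, the character span and a confidence score.
    print(result.entity_type, result.start, result.end, result.score)
```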

To handle NER in specific languages, we use dedicated models from the
`spaCy` library, which offers a wide selection of models covering many
languages and sizes. This is not a hard requirement, however: alternative
frameworks such as
[Stanza](https://microsoft.github.io/presidio/analyzer/nlp_engines/spacy_stanza/)
or
[transformers](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/)
can be plugged in when necessary.
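
For example, a minimal usage sketch of the multi-language configuration introduced in this PR (it mirrors the `DEFAULT_LANGUAGES_CONFIG` structure in the diff below; the German model and the sample sentences are only illustrative, and the spaCy models have to be downloaded separately, e.g. `python -m spacy download de_core_news_md`):

```python
# Illustrative usage of the new `languages_config` parameter (see the diff below).
from langchain_experimental.data_anonymizer import PresidioAnonymizer

nlp_config = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "de", "model_name": "de_core_news_md"},
    ],
}

anonymizer = PresidioAnonymizer(languages_config=nlp_config)

# English is listed first, so it is used as the main language when none is given.
print(anonymizer.anonymize("My name is John Smith."))
# For other configured languages, pass the language code explicitly.
print(anonymizer.anonymize("Ich heiße Johann Schmidt.", language="de"))
```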

### Future works

- **automatic language detection** - instead of passing the language as
a parameter to `anonymizer.anonymize`, we could detect the language(s)
beforehand and then use the corresponding NER model. We have discussed
this internally and @mateusz-wosinski-ds will look into a standalone
language detection tool/chain for LangChain 😄 (a rough sketch of the
idea follows below)
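
To make the idea concrete, a rough, hypothetical sketch (not part of this PR) combining the third-party `langdetect` package with the multi-language anonymizer; the helper name `anonymize_autodetect` and the fallback behaviour are assumptions for illustration:

```python
# Hypothetical sketch: detect the language first, then anonymize with the matching model.
from langdetect import detect  # third-party package, not a LangChain dependency

from langchain_experimental.data_anonymizer import PresidioAnonymizer

anonymizer = PresidioAnonymizer(
    languages_config={
        "nlp_engine_name": "spacy",
        "models": [
            {"lang_code": "en", "model_name": "en_core_web_lg"},
            {"lang_code": "es", "model_name": "es_core_news_md"},
        ],
    }
)


def anonymize_autodetect(text: str) -> str:
    """Detect the language of the text, then anonymize it with the matching NER model."""
    language = detect(text)  # returns an ISO 639-1 code such as "en" or "es"
    if language not in anonymizer.supported_languages:
        language = None  # fall back to the main (first) configured language
    return anonymizer.anonymize(text, language=language)
```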

### Twitter handle
@deepsense_ai / @MaksOpp

### Tag maintainer
@baskaryan @hwchase17 @hinthornw
Commit 274c3dc3a8 (parent a9eb7c6cfc), authored by maks-operlejn-ds on 2023-09-07 23:42:24 +02:00, committed by GitHub.
7 changed files with 1053 additions and 475 deletions.


```diff
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import Optional


 class AnonymizerBase(ABC):
@@ -8,12 +9,12 @@ class AnonymizerBase(ABC):
     wrapping the behavior for all methods in a base class.
     """

-    def anonymize(self, text: str) -> str:
+    def anonymize(self, text: str, language: Optional[str] = None) -> str:
         """Anonymize text"""
-        return self._anonymize(text)
+        return self._anonymize(text, language)

     @abstractmethod
-    def _anonymize(self, text: str) -> str:
+    def _anonymize(self, text: str, language: Optional[str]) -> str:
         """Abstract method to anonymize text"""
```


```diff
@@ -27,8 +27,8 @@ def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callab
             fake.random_choices(string.ascii_lowercase + string.digits, length=26)
         ),
         "IP_ADDRESS": lambda _: fake.ipv4_public(),
-        "LOCATION": lambda _: fake.address(),
-        "DATE_TIME": lambda _: fake.iso8601(),
+        "LOCATION": lambda _: fake.city(),
+        "DATE_TIME": lambda _: fake.date(),
         "NRP": lambda _: str(fake.random_number(digits=8, fix_len=True)),
         "MEDICAL_LICENSE": lambda _: fake.bothify(text="??######").upper(),
         "URL": lambda _: fake.url(),
```


```diff
@@ -24,6 +24,8 @@ from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
 try:
     from presidio_analyzer import AnalyzerEngine
+    from presidio_analyzer.nlp_engine import NlpEngineProvider
 except ImportError as e:
     raise ImportError(
         "Could not import presidio_analyzer, please install with "
@@ -44,12 +46,29 @@ if TYPE_CHECKING:
     from presidio_analyzer import EntityRecognizer, RecognizerResult
     from presidio_anonymizer.entities import EngineResult

+# Configuring Anonymizer for multiple languages
+# Detailed description and examples can be found here:
+# langchain/docs/extras/guides/privacy/multi_language_anonymization.ipynb
+DEFAULT_LANGUAGES_CONFIG = {
+    # You can also use Stanza or transformers library.
+    # See https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
+    "nlp_engine_name": "spacy",
+    "models": [
+        {"lang_code": "en", "model_name": "en_core_web_lg"},
+        # {"lang_code": "de", "model_name": "de_core_news_md"},
+        # {"lang_code": "es", "model_name": "es_core_news_md"},
+        # ...
+        # List of available models: https://spacy.io/usage/models
+    ],
+}

 class PresidioAnonymizerBase(AnonymizerBase):
     def __init__(
         self,
         analyzed_fields: Optional[List[str]] = None,
         operators: Optional[Dict[str, OperatorConfig]] = None,
+        languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
         faker_seed: Optional[int] = None,
     ):
         """
@@ -60,6 +79,11 @@ class PresidioAnonymizerBase(AnonymizerBase):
                 Operators allow for custom anonymization of detected PII.
                 Learn more:
                 https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
+            languages_config: Configuration for the NLP engine.
+                First language in the list will be used as the main language
+                in self.anonymize(...) when no language is specified.
+                Learn more:
+                https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
             faker_seed: Seed used to initialize faker.
                 Defaults to None, in which case faker will be seeded randomly
                 and provide random values.
@@ -81,7 +105,15 @@ class PresidioAnonymizerBase(AnonymizerBase):
                 ).items()
             }
         )
-        self._analyzer = AnalyzerEngine()
+        provider = NlpEngineProvider(nlp_configuration=languages_config)
+        nlp_engine = provider.create_engine()
+
+        self.supported_languages = list(nlp_engine.nlp.keys())
+
+        self._analyzer = AnalyzerEngine(
+            supported_languages=self.supported_languages, nlp_engine=nlp_engine
+        )
         self._anonymizer = AnonymizerEngine()

     def add_recognizer(self, recognizer: EntityRecognizer) -> None:
@@ -103,18 +135,31 @@ class PresidioAnonymizerBase(AnonymizerBase):
 class PresidioAnonymizer(PresidioAnonymizerBase):
-    def _anonymize(self, text: str) -> str:
+    def _anonymize(self, text: str, language: Optional[str] = None) -> str:
         """Anonymize text.
         Each PII entity is replaced with a fake value.
         Each time fake values will be different, as they are generated randomly.

         Args:
             text: text to anonymize
+            language: language to use for analysis of PII
+                If None, the first (main) language in the list
+                of languages specified in the configuration will be used.
         """
+        if language is None:
+            language = self.supported_languages[0]
+
+        if language not in self.supported_languages:
+            raise ValueError(
+                f"Language '{language}' is not supported. "
+                f"Supported languages are: {self.supported_languages}. "
+                "Change your language configuration file to add more languages."
+            )
+
         results = self._analyzer.analyze(
             text,
             entities=self.analyzed_fields,
-            language="en",
+            language=language,
         )

         return self._anonymizer.anonymize(
@@ -129,9 +174,10 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
         self,
         analyzed_fields: Optional[List[str]] = None,
         operators: Optional[Dict[str, OperatorConfig]] = None,
+        languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
         faker_seed: Optional[int] = None,
     ):
-        super().__init__(analyzed_fields, operators, faker_seed)
+        super().__init__(analyzed_fields, operators, languages_config, faker_seed)
         self._deanonymizer_mapping = DeanonymizerMapping()

     @property
@@ -191,7 +237,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
         self._deanonymizer_mapping.update(new_deanonymizer_mapping)

-    def _anonymize(self, text: str) -> str:
+    def _anonymize(self, text: str, language: Optional[str] = None) -> str:
         """Anonymize text.
         Each PII entity is replaced with a fake value.
         Each time fake values will be different, as they are generated randomly.
@@ -200,11 +246,24 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
         Args:
             text: text to anonymize
+            language: language to use for analysis of PII
+                If None, the first (main) language in the list
+                of languages specified in the configuration will be used.
         """
+        if language is None:
+            language = self.supported_languages[0]
+
+        if language not in self.supported_languages:
+            raise ValueError(
+                f"Language '{language}' is not supported. "
+                f"Supported languages are: {self.supported_languages}. "
+                "Change your language configuration file to add more languages."
+            )
+
         analyzer_results = self._analyzer.analyze(
             text,
             entities=self.analyzed_fields,
-            language="en",
+            language=language,
         )

         filtered_analyzer_results = (
```