mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-09 15:03:21 +00:00
Add multilingual data anon chain (#10346)
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class AnonymizerBase(ABC):
|
||||
@@ -8,12 +9,12 @@ class AnonymizerBase(ABC):
|
||||
wrapping the behavior for all methods in a base class.
|
||||
"""
|
||||
|
||||
def anonymize(self, text: str) -> str:
|
||||
def anonymize(self, text: str, language: Optional[str] = None) -> str:
|
||||
"""Anonymize text"""
|
||||
return self._anonymize(text)
|
||||
return self._anonymize(text, language)
|
||||
|
||||
@abstractmethod
|
||||
def _anonymize(self, text: str) -> str:
|
||||
def _anonymize(self, text: str, language: Optional[str]) -> str:
|
||||
"""Abstract method to anonymize text"""
|
||||
|
||||
|
||||
|
@@ -27,8 +27,8 @@ def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callab
|
||||
fake.random_choices(string.ascii_lowercase + string.digits, length=26)
|
||||
),
|
||||
"IP_ADDRESS": lambda _: fake.ipv4_public(),
|
||||
"LOCATION": lambda _: fake.address(),
|
||||
"DATE_TIME": lambda _: fake.iso8601(),
|
||||
"LOCATION": lambda _: fake.city(),
|
||||
"DATE_TIME": lambda _: fake.date(),
|
||||
"NRP": lambda _: str(fake.random_number(digits=8, fix_len=True)),
|
||||
"MEDICAL_LICENSE": lambda _: fake.bothify(text="??######").upper(),
|
||||
"URL": lambda _: fake.url(),
|
||||
|
@@ -24,6 +24,8 @@ from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
|
||||
|
||||
try:
|
||||
from presidio_analyzer import AnalyzerEngine
|
||||
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
||||
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"Could not import presidio_analyzer, please install with "
|
||||
@@ -44,12 +46,29 @@ if TYPE_CHECKING:
|
||||
from presidio_analyzer import EntityRecognizer, RecognizerResult
|
||||
from presidio_anonymizer.entities import EngineResult
|
||||
|
||||
# Configuring Anonymizer for multiple languages
|
||||
# Detailed description and examples can be found here:
|
||||
# langchain/docs/extras/guides/privacy/multi_language_anonymization.ipynb
|
||||
DEFAULT_LANGUAGES_CONFIG = {
|
||||
# You can also use the Stanza or transformers libraries as the NLP engine.
|
||||
# See https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
|
||||
"nlp_engine_name": "spacy",
|
||||
"models": [
|
||||
{"lang_code": "en", "model_name": "en_core_web_lg"},
|
||||
# {"lang_code": "de", "model_name": "de_core_news_md"},
|
||||
# {"lang_code": "es", "model_name": "es_core_news_md"},
|
||||
# ...
|
||||
# List of available models: https://spacy.io/usage/models
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
class PresidioAnonymizerBase(AnonymizerBase):
|
||||
def __init__(
|
||||
self,
|
||||
analyzed_fields: Optional[List[str]] = None,
|
||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
||||
faker_seed: Optional[int] = None,
|
||||
):
|
||||
"""
|
||||
@@ -60,6 +79,11 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
||||
Operators allow for custom anonymization of detected PII.
|
||||
Learn more:
|
||||
https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
|
||||
languages_config: Configuration for the NLP engine.
|
||||
The first language in the list will be used as the main language
|
||||
in self.anonymize(...) when no language is specified.
|
||||
Learn more:
|
||||
https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
|
||||
faker_seed: Seed used to initialize faker.
|
||||
Defaults to None, in which case faker will be seeded randomly
|
||||
and provide random values.
|
||||
@@ -81,7 +105,15 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
||||
).items()
|
||||
}
|
||||
)
|
||||
self._analyzer = AnalyzerEngine()
|
||||
|
||||
provider = NlpEngineProvider(nlp_configuration=languages_config)
|
||||
nlp_engine = provider.create_engine()
|
||||
|
||||
self.supported_languages = list(nlp_engine.nlp.keys())
|
||||
|
||||
self._analyzer = AnalyzerEngine(
|
||||
supported_languages=self.supported_languages, nlp_engine=nlp_engine
|
||||
)
|
||||
self._anonymizer = AnonymizerEngine()
|
||||
|
||||
def add_recognizer(self, recognizer: EntityRecognizer) -> None:
|
||||
@@ -103,18 +135,31 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
||||
|
||||
|
||||
class PresidioAnonymizer(PresidioAnonymizerBase):
|
||||
def _anonymize(self, text: str) -> str:
|
||||
def _anonymize(self, text: str, language: Optional[str] = None) -> str:
|
||||
"""Anonymize text.
|
||||
Each PII entity is replaced with a fake value.
|
||||
Each time fake values will be different, as they are generated randomly.
|
||||
|
||||
Args:
|
||||
text: text to anonymize
|
||||
language: language to use for analysis of PII.
|
||||
If None, the first (main) language in the list
|
||||
of languages specified in the configuration will be used.
|
||||
"""
|
||||
if language is None:
|
||||
language = self.supported_languages[0]
|
||||
|
||||
if language not in self.supported_languages:
|
||||
raise ValueError(
|
||||
f"Language '{language}' is not supported. "
|
||||
f"Supported languages are: {self.supported_languages}. "
|
||||
"Change your language configuration file to add more languages."
|
||||
)
|
||||
|
||||
results = self._analyzer.analyze(
|
||||
text,
|
||||
entities=self.analyzed_fields,
|
||||
language="en",
|
||||
language=language,
|
||||
)
|
||||
|
||||
return self._anonymizer.anonymize(
|
||||
@@ -129,9 +174,10 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
||||
self,
|
||||
analyzed_fields: Optional[List[str]] = None,
|
||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
||||
faker_seed: Optional[int] = None,
|
||||
):
|
||||
super().__init__(analyzed_fields, operators, faker_seed)
|
||||
super().__init__(analyzed_fields, operators, languages_config, faker_seed)
|
||||
self._deanonymizer_mapping = DeanonymizerMapping()
|
||||
|
||||
@property
|
||||
@@ -191,7 +237,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
||||
|
||||
self._deanonymizer_mapping.update(new_deanonymizer_mapping)
|
||||
|
||||
def _anonymize(self, text: str) -> str:
|
||||
def _anonymize(self, text: str, language: Optional[str] = None) -> str:
|
||||
"""Anonymize text.
|
||||
Each PII entity is replaced with a fake value.
|
||||
Each time fake values will be different, as they are generated randomly.
|
||||
@@ -200,11 +246,24 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
||||
|
||||
Args:
|
||||
text: text to anonymize
|
||||
language: language to use for analysis of PII.
|
||||
If None, the first (main) language in the list
|
||||
of languages specified in the configuration will be used.
|
||||
"""
|
||||
if language is None:
|
||||
language = self.supported_languages[0]
|
||||
|
||||
if language not in self.supported_languages:
|
||||
raise ValueError(
|
||||
f"Language '{language}' is not supported. "
|
||||
f"Supported languages are: {self.supported_languages}. "
|
||||
"Change your language configuration file to add more languages."
|
||||
)
|
||||
|
||||
analyzer_results = self._analyzer.analyze(
|
||||
text,
|
||||
entities=self.analyzed_fields,
|
||||
language="en",
|
||||
language=language,
|
||||
)
|
||||
|
||||
filtered_analyzer_results = (
|
||||
|
Reference in New Issue
Block a user