mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-24 07:35:18 +00:00
experimental[patch]: missing resolution strategy in anonymization (#16653)
- **Description:** Presidio-based anonymizers do not work because `_remove_conflicts_and_get_text_manipulation_data` is called without a conflict resolution strategy. This PR fixes the issue by adding an optional `conflict_resolution` argument to the anonymizers and forwarding it to that call. In addition, it removes some mutable default arguments (an antipattern): `languages_config` now defaults to `None` and falls back to `DEFAULT_LANGUAGES_CONFIG` inside the constructor. To reproduce the issue, run the very first cell of this [notebook](https://python.langchain.com/docs/guides/privacy/2/) from LangChain's documentation. <!-- Thank you for contributing to LangChain! Please title your PR "<package>: <description>", where <package> is whichever of langchain, community, core, experimental, etc. is being modified. Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes if applicable, - **Dependencies:** any dependencies required for this change, - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` from the root of the package you've modified to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://python.langchain.com/docs/contributing/ If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. -->
This commit is contained in:
parent
8e44363ec9
commit
1bc8d9a943
@ -27,7 +27,7 @@ if TYPE_CHECKING:
|
|||||||
from presidio_analyzer import AnalyzerEngine, EntityRecognizer
|
from presidio_analyzer import AnalyzerEngine, EntityRecognizer
|
||||||
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
||||||
from presidio_anonymizer import AnonymizerEngine
|
from presidio_anonymizer import AnonymizerEngine
|
||||||
from presidio_anonymizer.entities import OperatorConfig
|
from presidio_anonymizer.entities import ConflictResolutionStrategy, OperatorConfig
|
||||||
|
|
||||||
|
|
||||||
def _import_analyzer_engine() -> "AnalyzerEngine":
|
def _import_analyzer_engine() -> "AnalyzerEngine":
|
||||||
@ -102,7 +102,7 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
|||||||
self,
|
self,
|
||||||
analyzed_fields: Optional[List[str]] = None,
|
analyzed_fields: Optional[List[str]] = None,
|
||||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
languages_config: Optional[Dict] = None,
|
||||||
add_default_faker_operators: bool = True,
|
add_default_faker_operators: bool = True,
|
||||||
faker_seed: Optional[int] = None,
|
faker_seed: Optional[int] = None,
|
||||||
):
|
):
|
||||||
@ -123,6 +123,8 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
|||||||
Defaults to None, in which case faker will be seeded randomly
|
Defaults to None, in which case faker will be seeded randomly
|
||||||
and provide random values.
|
and provide random values.
|
||||||
"""
|
"""
|
||||||
|
if languages_config is None:
|
||||||
|
languages_config = DEFAULT_LANGUAGES_CONFIG
|
||||||
OperatorConfig = _import_operator_config()
|
OperatorConfig = _import_operator_config()
|
||||||
AnalyzerEngine = _import_analyzer_engine()
|
AnalyzerEngine = _import_analyzer_engine()
|
||||||
NlpEngineProvider = _import_nlp_engine_provider()
|
NlpEngineProvider = _import_nlp_engine_provider()
|
||||||
@ -183,6 +185,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
|||||||
text: str,
|
text: str,
|
||||||
language: Optional[str] = None,
|
language: Optional[str] = None,
|
||||||
allow_list: Optional[List[str]] = None,
|
allow_list: Optional[List[str]] = None,
|
||||||
|
conflict_resolution: Optional[ConflictResolutionStrategy] = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Anonymize text.
|
"""Anonymize text.
|
||||||
Each PII entity is replaced with a fake value.
|
Each PII entity is replaced with a fake value.
|
||||||
@ -204,8 +207,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
|||||||
"""
|
"""
|
||||||
if language is None:
|
if language is None:
|
||||||
language = self.supported_languages[0]
|
language = self.supported_languages[0]
|
||||||
|
elif language not in self.supported_languages:
|
||||||
if language not in self.supported_languages:
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Language '{language}' is not supported. "
|
f"Language '{language}' is not supported. "
|
||||||
f"Supported languages are: {self.supported_languages}. "
|
f"Supported languages are: {self.supported_languages}. "
|
||||||
@ -237,7 +239,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
|||||||
|
|
||||||
filtered_analyzer_results = (
|
filtered_analyzer_results = (
|
||||||
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
|
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
|
||||||
analyzer_results
|
analyzer_results, conflict_resolution
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -260,10 +262,12 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
|||||||
self,
|
self,
|
||||||
analyzed_fields: Optional[List[str]] = None,
|
analyzed_fields: Optional[List[str]] = None,
|
||||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
languages_config: Optional[Dict] = None,
|
||||||
add_default_faker_operators: bool = True,
|
add_default_faker_operators: bool = True,
|
||||||
faker_seed: Optional[int] = None,
|
faker_seed: Optional[int] = None,
|
||||||
):
|
):
|
||||||
|
if languages_config is None:
|
||||||
|
languages_config = DEFAULT_LANGUAGES_CONFIG
|
||||||
super().__init__(
|
super().__init__(
|
||||||
analyzed_fields,
|
analyzed_fields,
|
||||||
operators,
|
operators,
|
||||||
@ -292,6 +296,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
|||||||
text: str,
|
text: str,
|
||||||
language: Optional[str] = None,
|
language: Optional[str] = None,
|
||||||
allow_list: Optional[List[str]] = None,
|
allow_list: Optional[List[str]] = None,
|
||||||
|
conflict_resolution: Optional[ConflictResolutionStrategy] = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Anonymize text.
|
"""Anonymize text.
|
||||||
Each PII entity is replaced with a fake value.
|
Each PII entity is replaced with a fake value.
|
||||||
@ -348,7 +353,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
|||||||
|
|
||||||
filtered_analyzer_results = (
|
filtered_analyzer_results = (
|
||||||
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
|
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
|
||||||
analyzer_results
|
analyzer_results, conflict_resolution
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
242
libs/experimental/poetry.lock
generated
242
libs/experimental/poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -12,8 +12,8 @@ repository = "https://github.com/langchain-ai/langchain"
|
|||||||
python = ">=3.8.1,<4.0"
|
python = ">=3.8.1,<4.0"
|
||||||
langchain-core = "^0.1.7"
|
langchain-core = "^0.1.7"
|
||||||
langchain = "^0.1"
|
langchain = "^0.1"
|
||||||
presidio-anonymizer = {version = "^2.2.33", optional = true}
|
presidio-anonymizer = {version = "^2.2.352", optional = true}
|
||||||
presidio-analyzer = {version = "^2.2.33", optional = true}
|
presidio-analyzer = {version = "^2.2.352", optional = true}
|
||||||
faker = {version = "^19.3.1", optional = true}
|
faker = {version = "^19.3.1", optional = true}
|
||||||
vowpal-wabbit-next = {version = "0.6.0", optional = true}
|
vowpal-wabbit-next = {version = "0.6.0", optional = true}
|
||||||
sentence-transformers = {version = "^2", optional = true}
|
sentence-transformers = {version = "^2", optional = true}
|
||||||
|
Loading…
Reference in New Issue
Block a user