mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 14:49:29 +00:00
experimental[patch]: missing resolution strategy in anonymization (#16653)
- **Description:** Presidio-based anonymizers were failing because `_remove_conflicts_and_get_text_manipulation_data` was being called without a conflict resolution strategy. This PR fixes the issue by passing a `conflict_resolution` argument through to that call. In addition, it removes some mutable default arguments (an antipattern). To reproduce the issue, run the first cell of this [notebook](https://python.langchain.com/docs/guides/privacy/2/) from LangChain's documentation. <!-- Thank you for contributing to LangChain! Please title your PR "<package>: <description>", where <package> is whichever of langchain, community, core, experimental, etc. is being modified. Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes if applicable, - **Dependencies:** any dependencies required for this change, - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` from the root of the package you've modified to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://python.langchain.com/docs/contributing/ If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. -->
This commit is contained in:
parent
8e44363ec9
commit
1bc8d9a943
@ -27,7 +27,7 @@ if TYPE_CHECKING:
|
||||
from presidio_analyzer import AnalyzerEngine, EntityRecognizer
|
||||
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
||||
from presidio_anonymizer import AnonymizerEngine
|
||||
from presidio_anonymizer.entities import OperatorConfig
|
||||
from presidio_anonymizer.entities import ConflictResolutionStrategy, OperatorConfig
|
||||
|
||||
|
||||
def _import_analyzer_engine() -> "AnalyzerEngine":
|
||||
@ -102,7 +102,7 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
||||
self,
|
||||
analyzed_fields: Optional[List[str]] = None,
|
||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
||||
languages_config: Optional[Dict] = None,
|
||||
add_default_faker_operators: bool = True,
|
||||
faker_seed: Optional[int] = None,
|
||||
):
|
||||
@ -123,6 +123,8 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
||||
Defaults to None, in which case faker will be seeded randomly
|
||||
and provide random values.
|
||||
"""
|
||||
if languages_config is None:
|
||||
languages_config = DEFAULT_LANGUAGES_CONFIG
|
||||
OperatorConfig = _import_operator_config()
|
||||
AnalyzerEngine = _import_analyzer_engine()
|
||||
NlpEngineProvider = _import_nlp_engine_provider()
|
||||
@ -183,6 +185,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
||||
text: str,
|
||||
language: Optional[str] = None,
|
||||
allow_list: Optional[List[str]] = None,
|
||||
conflict_resolution: Optional[ConflictResolutionStrategy] = None,
|
||||
) -> str:
|
||||
"""Anonymize text.
|
||||
Each PII entity is replaced with a fake value.
|
||||
@ -204,8 +207,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
||||
"""
|
||||
if language is None:
|
||||
language = self.supported_languages[0]
|
||||
|
||||
if language not in self.supported_languages:
|
||||
elif language not in self.supported_languages:
|
||||
raise ValueError(
|
||||
f"Language '{language}' is not supported. "
|
||||
f"Supported languages are: {self.supported_languages}. "
|
||||
@ -237,7 +239,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
||||
|
||||
filtered_analyzer_results = (
|
||||
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
|
||||
analyzer_results
|
||||
analyzer_results, conflict_resolution
|
||||
)
|
||||
)
|
||||
|
||||
@ -260,10 +262,12 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
||||
self,
|
||||
analyzed_fields: Optional[List[str]] = None,
|
||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
||||
languages_config: Optional[Dict] = None,
|
||||
add_default_faker_operators: bool = True,
|
||||
faker_seed: Optional[int] = None,
|
||||
):
|
||||
if languages_config is None:
|
||||
languages_config = DEFAULT_LANGUAGES_CONFIG
|
||||
super().__init__(
|
||||
analyzed_fields,
|
||||
operators,
|
||||
@ -292,6 +296,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
||||
text: str,
|
||||
language: Optional[str] = None,
|
||||
allow_list: Optional[List[str]] = None,
|
||||
conflict_resolution: Optional[ConflictResolutionStrategy] = None,
|
||||
) -> str:
|
||||
"""Anonymize text.
|
||||
Each PII entity is replaced with a fake value.
|
||||
@ -348,7 +353,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
||||
|
||||
filtered_analyzer_results = (
|
||||
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
|
||||
analyzer_results
|
||||
analyzer_results, conflict_resolution
|
||||
)
|
||||
)
|
||||
|
||||
|
242
libs/experimental/poetry.lock
generated
242
libs/experimental/poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -12,8 +12,8 @@ repository = "https://github.com/langchain-ai/langchain"
|
||||
python = ">=3.8.1,<4.0"
|
||||
langchain-core = "^0.1.7"
|
||||
langchain = "^0.1"
|
||||
presidio-anonymizer = {version = "^2.2.33", optional = true}
|
||||
presidio-analyzer = {version = "^2.2.33", optional = true}
|
||||
presidio-anonymizer = {version = "^2.2.352", optional = true}
|
||||
presidio-analyzer = {version = "^2.2.352", optional = true}
|
||||
faker = {version = "^19.3.1", optional = true}
|
||||
vowpal-wabbit-next = {version = "0.6.0", optional = true}
|
||||
sentence-transformers = {version = "^2", optional = true}
|
||||
|
Loading…
Reference in New Issue
Block a user