experimental: docstrings update (#18048)

Added missing docstrings. Formatted docstrings to a consistent format.
This commit is contained in:
Leonid Ganeline
2024-02-23 18:24:16 -08:00
committed by GitHub
parent 56b955fc31
commit 3f6bf852ea
61 changed files with 316 additions and 102 deletions

View File

@@ -10,8 +10,8 @@ DEFAULT_DEANONYMIZER_MATCHING_STRATEGY = exact_matching_strategy
class AnonymizerBase(ABC):
"""
Base abstract class for anonymizers.
"""Base abstract class for anonymizers.
It is public and non-virtual because it allows
wrapping the behavior for all methods in a base class.
"""
@@ -22,7 +22,8 @@ class AnonymizerBase(ABC):
language: Optional[str] = None,
allow_list: Optional[List[str]] = None,
) -> str:
"""Anonymize text"""
"""Anonymize text."""
return self._anonymize(text, language, allow_list)
@abstractmethod

View File

@@ -11,7 +11,7 @@ MappingDataType = Dict[str, Dict[str, str]]
def format_duplicated_operator(operator_name: str, count: int) -> str:
"""Format the operator name with the count"""
"""Format the operator name with the count."""
clean_operator_name = re.sub(r"[<>]", "", operator_name)
clean_operator_name = re.sub(r"_\d+$", "", clean_operator_name)
@@ -24,17 +24,20 @@ def format_duplicated_operator(operator_name: str, count: int) -> str:
@dataclass
class DeanonymizerMapping:
"""Deanonymizer mapping."""
mapping: MappingDataType = field(
default_factory=lambda: defaultdict(lambda: defaultdict(str))
)
@property
def data(self) -> MappingDataType:
"""Return the deanonymizer mapping"""
"""Return the deanonymizer mapping."""
return {k: dict(v) for k, v in self.mapping.items()}
def update(self, new_mapping: MappingDataType) -> None:
"""Update the deanonymizer mapping with new values
"""Update the deanonymizer mapping with new values.
Duplicated values will not be added
If there are multiple entities of the same type, the mapping will
include a count to differentiate them. For example, if there are
@@ -67,7 +70,8 @@ def create_anonymizer_mapping(
anonymizer_results: "EngineResult",
is_reversed: bool = False,
) -> MappingDataType:
"""Creates or updates the mapping used to anonymize and/or deanonymize text.
"""Create or update the mapping used to anonymize and/or
deanonymize a text.
This method exploits the results returned by the
analysis and anonymization processes.

View File

@@ -5,8 +5,8 @@ from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingD
def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
"""
Exact matching strategy for deanonymization.
"""Exact matching strategy for deanonymization.
It replaces all the anonymized entities with the original ones.
Args:
@@ -23,8 +23,8 @@ def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) ->
def case_insensitive_matching_strategy(
text: str, deanonymizer_mapping: MappingDataType
) -> str:
"""
Case insensitive matching strategy for deanonymization.
"""Case insensitive matching strategy for deanonymization.
It replaces all the anonymized entities with the original ones
irrespective of their letter case.
@@ -48,8 +48,8 @@ def case_insensitive_matching_strategy(
def fuzzy_matching_strategy(
text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
"""
Fuzzy matching strategy for deanonymization.
"""Fuzzy matching strategy for deanonymization.
It uses fuzzy matching to find the position of the anonymized entity in the text.
It replaces all the anonymized entities with the original ones.
@@ -93,9 +93,9 @@ def fuzzy_matching_strategy(
def combined_exact_fuzzy_matching_strategy(
text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
"""
RECOMMENDED STRATEGY.
Combined exact and fuzzy matching strategy for deanonymization.
"""Combined exact and fuzzy matching strategy for deanonymization.
It is a RECOMMENDED STRATEGY.
Args:
text: text to deanonymize
@@ -118,8 +118,8 @@ def ngram_fuzzy_matching_strategy(
fuzzy_threshold: int = 85,
use_variable_length: bool = True,
) -> str:
"""
N-gram fuzzy matching strategy for deanonymization.
"""N-gram fuzzy matching strategy for deanonymization.
It replaces all the anonymized entities with the original ones.
It uses fuzzy matching to find the position of the anonymized entity in the text.
It generates n-grams of the same length as the anonymized entity from the text and

View File

@@ -3,6 +3,8 @@ from typing import Callable, Dict, Optional
def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callable]:
"""Get a mapping of entities to pseudo anonymize them."""
try:
from faker import Faker
except ImportError as e:

View File

@@ -98,6 +98,11 @@ DEFAULT_LANGUAGES_CONFIG = {
class PresidioAnonymizerBase(AnonymizerBase):
"""Base Anonymizer using Microsoft Presidio.
See more: https://microsoft.github.io/presidio/
"""
def __init__(
self,
analyzed_fields: Optional[List[str]] = None,
@@ -180,6 +185,8 @@ class PresidioAnonymizerBase(AnonymizerBase):
class PresidioAnonymizer(PresidioAnonymizerBase):
"""Anonymizer using Microsoft Presidio."""
def _anonymize(
self,
text: str,
@@ -258,6 +265,8 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
"""Reversible Anonymizer using Microsoft Presidio."""
def __init__(
self,
analyzed_fields: Optional[List[str]] = None,