diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb index 63ba8931a60..ed91304e36e 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -75,7 +75,7 @@ "'Me llamo Sofía'" ] }, - "execution_count": 3, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -93,16 +93,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'Bridget Kirk soy Sally Knight'" + "'Kari Lopez soy Mary Walker'" ] }, - "execution_count": 4, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -131,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -157,15 +157,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Me llamo Michelle Smith\n", - "Yo soy Rachel Wright\n" + "Me llamo Christopher Smith\n", + "Yo soy Joseph Jenkins\n" ] } ], @@ -190,14 +190,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "My name is Ronnie Ayala\n" + "My name is Shawna Bennett\n" ] } ], @@ -205,6 +205,218 @@ "print(anonymizer.anonymize(\"My name is John\"))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage with other frameworks\n", + "\n", + "### Language 
detection\n", + "\n", + "One of the drawbacks of the presented approach is that we have to pass the **language** of the input text directly. However, there is a remedy for that - *language detection* libraries.\n", + "\n", + "We recommend using one of the following frameworks:\n", + "- fasttext (recommended)\n", + "- langdetect\n", + "\n", + "From our experience, *fasttext* performs a bit better, but you should verify it on your use case." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# ! pip install fasttext langdetect" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### langdetect" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import langdetect\n", + "from langchain.schema import runnable\n", + "\n", + "def detect_language(text: str) -> dict:\n", + " language = langdetect.detect(text)\n", + " print(language)\n", + " return {\"text\": text, \"language\": language}\n", + "\n", + "\n", + "chain = (\n", + " runnable.RunnableLambda(detect_language)\n", + " | (lambda x: anonymizer.anonymize(x[\"text\"], language=x[\"language\"]))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "es\n" + ] + }, + { + "data": { + "text/plain": [ + "'Me llamo Michael Perez III'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"Me llamo Sofía\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "en\n" + ] + }, + { + "data": { + "text/plain": [ + "'My name is Ronald Bennett'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"chain.invoke(\"My name is John Doe\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### fasttext" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You need to download the fasttext model first from https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" + ] + } + ], + "source": [ + "import fasttext\n", + "\n", + "model = fasttext.load_model(\"lid.176.ftz\")\n", + "def detect_language(text: str) -> dict:\n", + " language = model.predict(text)[0][0].replace('__label__', '')\n", + " print(language)\n", + " return {\"text\": text, \"language\": language}\n", + "\n", + "chain = (\n", + " runnable.RunnableLambda(detect_language)\n", + " | (lambda x: anonymizer.anonymize(x[\"text\"], language=x[\"language\"]))\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "es\n" + ] + }, + { + "data": { + "text/plain": [ + "'Yo soy Angela Werner'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"Yo soy Sofía\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "en\n" + ] + }, + { + "data": { + "text/plain": [ + "'My name is Carlos Newton'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"My name is John Doe\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This way you only need to initialize the model with the 
engines corresponding to the relevant languages, but using the tool is fully automated." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -485,15 +697,6 @@ "source": [ "In many cases, even the larger models from spaCy will not be sufficient - there are already other, more complex and better methods of detecting named entities, based on transformers. You can read more about this [here](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/)." ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Future works\n", - "\n", - "- **automatic language detection** - instead of passing the language as a parameter in `anonymizer.anonymize`, we could detect the language/s beforehand and then use the corresponding NER model." - ] } ], "metadata": { @@ -512,7 +715,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.9.16" } }, "nbformat": 4,