mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-28 17:38:36 +00:00
Data anonymizer notebook nit (#10062)
This commit is contained in:
parent
19400ba253
commit
8d66b00c73
@ -28,12 +28,12 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Install necessary packages\n",
|
"# Install necessary packages\n",
|
||||||
"# ! pip install langchain langchain-experimental openai\n",
|
"# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n",
|
||||||
"# ! python -m spacy download en_core_web_lg"
|
"# ! python -m spacy download en_core_web_lg"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -47,16 +47,16 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"'My name is Marie Santos, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
|
"'My name is Mrs. Rachel Chen DDS, call me at 849-829-7628x073 or email me at christopherfrey@example.org'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 2,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -64,6 +64,92 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"from langchain_experimental.data_anonymizer import PresidioAnonymizer\n",
|
"from langchain_experimental.data_anonymizer import PresidioAnonymizer\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"anonymizer = PresidioAnonymizer()\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.anonymize(\n",
|
||||||
|
" \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Using with LangChain Expression Language\n",
|
||||||
|
"\n",
|
||||||
|
"With LCEL we can easily chain together anonymization with the rest of our application."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Set env var OPENAI_API_KEY or load from a .env file:\n",
|
||||||
|
"# import dotenv\n",
|
||||||
|
"\n",
|
||||||
|
"# dotenv.load_dotenv()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='You can find our super secret data at https://www.ross.com/', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.schema.runnable import RunnablePassthrough\n",
|
||||||
|
"\n",
|
||||||
|
"template = \"\"\"According to this text, where can you find our super secret data?\n",
|
||||||
|
"\n",
|
||||||
|
"{anonymized_text}\n",
|
||||||
|
"\n",
|
||||||
|
"Answer:\"\"\"\n",
|
||||||
|
"prompt = PromptTemplate.from_template(template)\n",
|
||||||
|
"llm = ChatOpenAI()\n",
|
||||||
|
"\n",
|
||||||
|
"chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n",
|
||||||
|
"chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Customization\n",
|
||||||
|
"We can specify ``analyzed_fields`` to only anonymize particular types of data."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'My name is Gabrielle Edwards, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
"anonymizer = PresidioAnonymizer(analyzed_fields=[\"PERSON\"])\n",
|
"anonymizer = PresidioAnonymizer(analyzed_fields=[\"PERSON\"])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"anonymizer.anonymize(\n",
|
"anonymizer.anonymize(\n",
|
||||||
@ -75,7 +161,6 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"\\\n",
|
|
||||||
"As can be observed, the name was correctly identified and replaced with another. The `analyzed_fields` attribute is responsible for what values are to be detected and substituted. We can add *PHONE_NUMBER* to the list:"
|
"As can be observed, the name was correctly identified and replaced with another. The `analyzed_fields` attribute is responsible for what values are to be detected and substituted. We can add *PHONE_NUMBER* to the list:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -331,125 +416,6 @@
|
|||||||
"anonymizer.anonymize(\"My polish phone number is 666555444\")"
|
"anonymizer.anonymize(\"My polish phone number is 666555444\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"\\\n",
|
|
||||||
"Finally, it is worth showing how to implement anonymizer as a chain. Since anonymization is based on string operations, we can use `TransformChain` for this:"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 13,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"{'text': 'You can find our super secret data at https://supersecretdata.com',\n",
|
|
||||||
" 'anonymized_text': 'You can find our super secret data at https://www.fox.org/'}"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 13,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from langchain.chains.transform import TransformChain\n",
|
|
||||||
"\n",
|
|
||||||
"anonymizer = PresidioAnonymizer()\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def anonymize_func(inputs: dict) -> dict:\n",
|
|
||||||
" text = inputs[\"text\"]\n",
|
|
||||||
" return {\"anonymized_text\": anonymizer.anonymize(text)}\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"anonymize_chain = TransformChain(\n",
|
|
||||||
" input_variables=[\"text\"],\n",
|
|
||||||
" output_variables=[\"anonymized_text\"],\n",
|
|
||||||
" transform=anonymize_func,\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"anonymize_chain(\"You can find our super secret data at https://supersecretdata.com\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"\\\n",
|
|
||||||
"Later, you can, for example, use such anonymization as part of chain sequence. We will use `LangChain Expression Language` ([learn more here](https://python.langchain.com/docs/guides/expression_language/)) for composing these chains together, as shown below:"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 14,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"True"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 14,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# ! pip install openai\n",
|
|
||||||
"\n",
|
|
||||||
"# Set env var OPENAI_API_KEY or load from a .env file:\n",
|
|
||||||
"import dotenv\n",
|
|
||||||
"\n",
|
|
||||||
"dotenv.load_dotenv()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 17,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"{'anonymized_text': StringPromptValue(text='According to this text, where can you find our super secret data?\\n\\nYou can find our super secret data at https://evans-summers.info/\\n\\nAnswer:'),\n",
|
|
||||||
" 'text': ' https://evans-summers.info/'}"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 17,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from operator import itemgetter\n",
|
|
||||||
"from langchain.prompts.prompt import PromptTemplate\n",
|
|
||||||
"from langchain.chains.llm import LLMChain\n",
|
|
||||||
"from langchain.llms.openai import OpenAI\n",
|
|
||||||
"\n",
|
|
||||||
"template = \"\"\"According to this text, where can you find our super secret data?\n",
|
|
||||||
"\n",
|
|
||||||
"{anonymized_text}\n",
|
|
||||||
"\n",
|
|
||||||
"Answer:\"\"\"\n",
|
|
||||||
"prompt = PromptTemplate(input_variables=[\"anonymized_text\"], template=template)\n",
|
|
||||||
"llm_chain = LLMChain(llm=OpenAI(), prompt=prompt)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"chain = (\n",
|
|
||||||
" anonymize_chain\n",
|
|
||||||
" | {\"anonymized_text\": itemgetter(\"anonymized_text\")}\n",
|
|
||||||
" | prompt\n",
|
|
||||||
" | llm_chain\n",
|
|
||||||
")\n",
|
|
||||||
"chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
Loading…
Reference in New Issue
Block a user