Mirror of https://github.com/hwchase17/langchain.git, synced 2026-02-05 08:40:36 +00:00
Compare commits
152 Commits
53887242a1, 1bf8ef1a4f, a1c7532298, 57ade13b2b, d78f418c0d, fd9da60aea, 35297ca0d3, 8e3fbc97ca,
f1269830a0, 656d2303f7, a3a2ce623e, 8fafa1af91, 3b07c0cf3d, 56048b909f, d17416ec79, 3c7653bf0f,
d9018ae5f1, 9f85f7c543, 5944c1851b, 68901e1e40, 790010703b, f9df55f7d2, f5ce286932, 9903a70379,
1655ff2ded, e4a46747dc, 2abbdc6ecb, bfd48925e5, 2c11302598, 2aae1102b0, 203258b4d6, 4236ae3851,
d9670a5945, fcccde406d, 9f73fec057, 1d678f805f, 79011f835f, 656480feb6, 31d5bd84d7, 8aa545901a,
3e31d6e35f, 8b6b8bf68c, 2ff91a46c0, ca346011b7, 53d4f1554a, 211a74941a, 5a1f614175, e2d6c41177,
71fd6428c5, 2f490be09b, 1e59c44d36, 58b7a3ba16, c9986bc3a9, 940b9ae30a, b9fad28f5e, 22165cb2fc,
70be04a816, fde19c8667, 9cea796671, 91941d1f19, 4d66756d93, a30f98f534, 58a88f3911, 71290315cf,
3a299b9680, 32445de365, 30d02e3a34, 42d0d485a9, ccea1e9147, 7185fdc990, 248db75cd6, 631289a38d,
a2f29bf595, 534f1b63c5, 3d700aa654, 2dba4046fa, b78d672a43, 11f20cded1, 514857c10e, 15d33a144d,
235dacc74a, 3a4c895280, 327ea43c67, 1d4e73b9f8, d6320cc2c0, 7a4387c60d, e1791225ae, fdb611cc42,
8d3a8fbefe, 9c45d5a27e, f22fcb8bcd, 8dc5365ee2, 5b6ebbc825, 5c2069890f, 736e0dd46e, 5b1812f95b,
f1d144cd6c, 62cf108700, af4b560b86, 00d56fb0fc, b59e2b5afa, ae5edefdcd, e10980d445, 0f7cde023b,
4e9aecda90, 67dc1a9dd2, ca163f0ee6, b162f1c8e1, a9ba6a8cd1, 2b90a8afa2, 2c877a4a34, b7d0e4835e,
dfc3295a2c, 256849e02a, d46ad01ee0, 5fb781dfde, 48aaa27bf7, c4ccaebbbb, 7eaaad51de, 42bdb003ee,
f8b5c2977a, 5727148f2b, 72eab3b37e, 4b930f58e9, 0a2724d8c7, 5de212d907, f7fb083aba, 4e6e03ef50,
d50c0f139d, 758225dc17, 44485c2b26, 8d10a52525, b3c0728de2, 0b8691c6e5, a11ad11d06, dd6fff1c62,
6a1102d4c0, 7725192a0d, 2bfa73257f, 571ee718ba, e9423300d9, c9e9c0eeae, 44badd0707, e276ae2616,
5aafb3bc46, a2f807e055, 1ae5a9c7a3, a6f9dccc35, b422dc035f, c37fd29fd8, 56b40beb0e, 6de1ca4251
.github/workflows/langchain_release.yml (vendored)
@@ -24,3 +24,4 @@ jobs:
      - release
    uses:
      ./.github/workflows/langchain_release_docker.yml
    secrets: inherit

@@ -25,5 +25,3 @@ sphinx:
python:
  install:
    - requirements: docs/api_reference/requirements.txt
    - method: pip
      path: .

@@ -3,7 +3,7 @@

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS      ?=
SPHINXOPTS      ?= -j auto
SPHINXBUILD     ?= sphinx-build
SPHINXAUTOBUILD ?= sphinx-autobuild
SOURCEDIR       = .

File diff suppressed because one or more lines are too long
@@ -33,27 +33,26 @@ module.exports = {
      slug: "get_started",
    },
  },
  {
    type: "category",
    label: "Modules",
    collapsed: false,
    collapsible: false,
    items: [{ type: "autogenerated", dirName: "modules" } ],
    link: {
      type: 'doc',
      id: "modules/index"
    },
  },
  {
    type: "category",
    label: "LangChain Expression Language",
    collapsed: true,
    collapsed: false,
    items: [{ type: "autogenerated", dirName: "expression_language" } ],
    link: {
      type: 'doc',
      id: "expression_language/index"
    },
  },
  {
    type: "category",
    label: "Modules",
    collapsed: false,
    items: [{ type: "autogenerated", dirName: "modules" } ],
    link: {
      type: 'doc',
      id: "modules/index"
    },
  },
  {
    type: "category",
    label: "Guides",
BIN docs/docs_skeleton/static/img/multi_vector.png (new file)
Binary file not shown. After: Width | Height | Size: 103 KiB
@@ -3739,6 +3739,10 @@
    {
      "source": "/docs/ecosystem/dependents",
      "destination": "/docs/additional_resources/dependents"
    },
    {
      "source": "docs/integrations/retrievers/google_cloud_enterprise_search",
      "destination": "docs/integrations/retrievers/google_vertex_ai_search"
    }
  ]
}
}
@@ -8,7 +8,7 @@
    "---\n",
    "sidebar_position: 0\n",
    "title: Interface\n",
    "---"
    "---\n"
   ]
  },
  {
@@ -18,15 +18,16 @@
   "source": [
    "In an effort to make it as easy as possible to create custom chains, we've implemented a [\"Runnable\"](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.Runnable.html#langchain.schema.runnable.Runnable) protocol that most components implement. This is a standard interface with a few different methods, which makes it easy to define custom chains as well as making it possible to invoke them in a standard way. The standard interface exposed includes:\n",
    "\n",
    "- `stream`: stream back chunks of the response\n",
    "- `invoke`: call the chain on an input\n",
    "- `batch`: call the chain on a list of inputs\n",
    "- [`stream`](#stream): stream back chunks of the response\n",
    "- [`invoke`](#invoke): call the chain on an input\n",
    "- [`batch`](#batch): call the chain on a list of inputs\n",
    "\n",
    "These also have corresponding async methods:\n",
    "\n",
    "- `astream`: stream back chunks of the response async\n",
    "- `ainvoke`: call the chain on an input async\n",
    "- `abatch`: call the chain on a list of inputs async\n",
    "- [`astream`](#async-stream): stream back chunks of the response async\n",
    "- [`ainvoke`](#async-invoke): call the chain on an input async\n",
    "- [`abatch`](#async-batch): call the chain on a list of inputs async\n",
    "- [`astream_log`](#async-stream-intermediate-steps): stream back intermediate steps as they happen, in addition to the final response\n",
    "\n",
    "The type of the input varies by component:\n",
    "\n",
@@ -49,6 +50,10 @@
    "| Tool | Depends on the tool |\n",
    "| OutputParser | Depends on the parser |\n",
    "\n",
    "All runnables expose properties to inspect the input and output types:\n",
    "- [`input_schema`](#input-schema): an input Pydantic model auto-generated from the structure of the Runnable\n",
    "- [`output_schema`](#output-schema): an output Pydantic model auto-generated from the structure of the Runnable\n",
    "\n",
    "Let's take a look at these methods! To do so, we'll create a super simple PromptTemplate + ChatModel chain."
   ]
  },
  {
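As context for the hunk above: the protocol it describes is not limited to built-in components. A minimal sketch of the same interface on a plain function (not part of this diff; it assumes `RunnableLambda` from `langchain.schema.runnable`, which langchain releases of this period provide):

```python
# Hedged sketch: wrapping a plain function in RunnableLambda gives it the
# Runnable interface described above (invoke/batch/stream plus the async
# variants), with no custom class needed.
from langchain.schema.runnable import RunnableLambda

double = RunnableLambda(lambda x: x * 2)

print(double.invoke(3))         # 6
print(double.batch([1, 2, 3]))  # [2, 4, 6]
for chunk in double.stream(4):
    print(chunk)                # 8 (a plain function emits a single chunk)
```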
@@ -60,7 +65,7 @@
   "outputs": [],
   "source": [
    "from langchain.prompts import ChatPromptTemplate\n",
    "from langchain.chat_models import ChatOpenAI"
    "from langchain.chat_models import ChatOpenAI\n"
   ]
  },
  {
@@ -70,7 +75,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "model = ChatOpenAI()"
    "model = ChatOpenAI()\n"
   ]
  },
  {
@@ -80,7 +85,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt = ChatPromptTemplate.from_template(\"tell me a joke about {topic}\")"
    "prompt = ChatPromptTemplate.from_template(\"tell me a joke about {topic}\")\n"
   ]
  },
  {
@@ -90,7 +95,156 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "chain = prompt | model"
    "chain = prompt | model\n"
   ]
  },
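A side note on the `chain = prompt | model` line in this hunk: the pipe operator composes Runnables into a sequence, and further stages can be appended. A hedged sketch (the `StrOutputParser` import mirrors the one used later in this same diff):

```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser

prompt = ChatPromptTemplate.from_template("tell me a joke about {topic}")
model = ChatOpenAI()

# Appending a parser makes invoke() return a plain str instead of an AIMessage.
chain = prompt | model | StrOutputParser()
print(chain.invoke({"topic": "bears"}))
```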
  {
   "cell_type": "markdown",
   "id": "5cccdf0b-2d89-4f74-9530-bf499610e9a5",
   "metadata": {},
   "source": [
    "## Input Schema\n",
    "\n",
    "A description of the inputs accepted by a Runnable.\n",
    "This is a Pydantic model dynamically generated from the structure of any Runnable.\n",
    "You can call `.schema()` on it to obtain a JSONSchema representation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "25e146d4-60da-40a2-9026-b5dfee106a3f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'title': 'PromptInput',\n",
       " 'type': 'object',\n",
       " 'properties': {'topic': {'title': 'Topic', 'type': 'string'}}}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The input schema of the chain is the input schema of its first part, the prompt.\n",
    "chain.input_schema.schema()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5059a5dc-d544-4add-85bd-78a3f2b78b9a",
   "metadata": {},
   "source": [
    "## Output Schema\n",
    "\n",
    "A description of the outputs produced by a Runnable.\n",
    "This is a Pydantic model dynamically generated from the structure of any Runnable.\n",
    "You can call `.schema()` on it to obtain a JSONSchema representation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a0e41fd3-77d8-4911-af6a-d4d3aad5f77b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'title': 'ChatOpenAIOutput',\n",
       " 'anyOf': [{'$ref': '#/definitions/HumanMessageChunk'},\n",
       "  {'$ref': '#/definitions/AIMessageChunk'},\n",
       "  {'$ref': '#/definitions/ChatMessageChunk'},\n",
       "  {'$ref': '#/definitions/FunctionMessageChunk'},\n",
       "  {'$ref': '#/definitions/SystemMessageChunk'}],\n",
       " 'definitions': {'HumanMessageChunk': {'title': 'HumanMessageChunk',\n",
       "   'description': 'A Human Message chunk.',\n",
       "   'type': 'object',\n",
       "   'properties': {'content': {'title': 'Content', 'type': 'string'},\n",
       "    'additional_kwargs': {'title': 'Additional Kwargs', 'type': 'object'},\n",
       "    'type': {'title': 'Type',\n",
       "     'default': 'human',\n",
       "     'enum': ['human'],\n",
       "     'type': 'string'},\n",
       "    'example': {'title': 'Example', 'default': False, 'type': 'boolean'},\n",
       "    'is_chunk': {'title': 'Is Chunk',\n",
       "     'default': True,\n",
       "     'enum': [True],\n",
       "     'type': 'boolean'}},\n",
       "   'required': ['content']},\n",
       "  'AIMessageChunk': {'title': 'AIMessageChunk',\n",
       "   'description': 'A Message chunk from an AI.',\n",
       "   'type': 'object',\n",
       "   'properties': {'content': {'title': 'Content', 'type': 'string'},\n",
       "    'additional_kwargs': {'title': 'Additional Kwargs', 'type': 'object'},\n",
       "    'type': {'title': 'Type',\n",
       "     'default': 'ai',\n",
       "     'enum': ['ai'],\n",
       "     'type': 'string'},\n",
       "    'example': {'title': 'Example', 'default': False, 'type': 'boolean'},\n",
       "    'is_chunk': {'title': 'Is Chunk',\n",
       "     'default': True,\n",
       "     'enum': [True],\n",
       "     'type': 'boolean'}},\n",
       "   'required': ['content']},\n",
       "  'ChatMessageChunk': {'title': 'ChatMessageChunk',\n",
       "   'description': 'A Chat Message chunk.',\n",
       "   'type': 'object',\n",
       "   'properties': {'content': {'title': 'Content', 'type': 'string'},\n",
       "    'additional_kwargs': {'title': 'Additional Kwargs', 'type': 'object'},\n",
       "    'type': {'title': 'Type',\n",
       "     'default': 'chat',\n",
       "     'enum': ['chat'],\n",
       "     'type': 'string'},\n",
       "    'role': {'title': 'Role', 'type': 'string'},\n",
       "    'is_chunk': {'title': 'Is Chunk',\n",
       "     'default': True,\n",
       "     'enum': [True],\n",
       "     'type': 'boolean'}},\n",
       "   'required': ['content', 'role']},\n",
       "  'FunctionMessageChunk': {'title': 'FunctionMessageChunk',\n",
       "   'description': 'A Function Message chunk.',\n",
       "   'type': 'object',\n",
       "   'properties': {'content': {'title': 'Content', 'type': 'string'},\n",
       "    'additional_kwargs': {'title': 'Additional Kwargs', 'type': 'object'},\n",
       "    'type': {'title': 'Type',\n",
       "     'default': 'function',\n",
       "     'enum': ['function'],\n",
       "     'type': 'string'},\n",
       "    'name': {'title': 'Name', 'type': 'string'},\n",
       "    'is_chunk': {'title': 'Is Chunk',\n",
       "     'default': True,\n",
       "     'enum': [True],\n",
       "     'type': 'boolean'}},\n",
       "   'required': ['content', 'name']},\n",
       "  'SystemMessageChunk': {'title': 'SystemMessageChunk',\n",
       "   'description': 'A System Message chunk.',\n",
       "   'type': 'object',\n",
       "   'properties': {'content': {'title': 'Content', 'type': 'string'},\n",
       "    'additional_kwargs': {'title': 'Additional Kwargs', 'type': 'object'},\n",
       "    'type': {'title': 'Type',\n",
       "     'default': 'system',\n",
       "     'enum': ['system'],\n",
       "     'type': 'string'},\n",
       "    'is_chunk': {'title': 'Is Chunk',\n",
       "     'default': True,\n",
       "     'enum': [True],\n",
       "     'type': 'boolean'}},\n",
       "   'required': ['content']}}}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The output schema of the chain is the output schema of its last part, in this case a ChatModel, which outputs a ChatMessage\n",
    "chain.output_schema.schema()"
   ]
  },
  {
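Since `input_schema` in the cells above is a dynamically generated Pydantic model class, it can do more than emit JSONSchema; instantiating it validates a candidate input. A hedged sketch, assuming the Pydantic v1 behavior langchain relied on at the time:

```python
# Hedged sketch: chain.input_schema is a Pydantic model class, so
# constructing it validates (and may coerce) inputs before they
# ever reach the chain.
from pydantic import ValidationError

try:
    chain.input_schema(topic="bears")  # validates fine
    chain.input_schema(topic=123)      # pydantic coerces or rejects per field type
except ValidationError as e:
    print(e)
```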
@@ -103,7 +257,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "execution_count": 7,
   "id": "bea9639d",
   "metadata": {},
   "outputs": [
@@ -111,9 +265,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sure, here's a bear-themed joke for you:\n",
      "\n",
      "Why don't bears wear shoes?\n",
      "Why don't bears wear shoes? \n",
      "\n",
      "Because they have bear feet!"
     ]
@@ -121,7 +273,7 @@
   ],
   "source": [
    "for s in chain.stream({\"topic\": \"bears\"}):\n",
    "    print(s.content, end=\"\", flush=True)"
    "    print(s.content, end=\"\", flush=True)\n"
   ]
  },
  {
@@ -134,23 +286,23 @@
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "execution_count": 8,
   "id": "470e483f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they already have bear feet!\", additional_kwargs={}, example=False)"
       "AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\")"
      ]
     },
     "execution_count": 9,
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chain.invoke({\"topic\": \"bears\"})"
    "chain.invoke({\"topic\": \"bears\"})\n"
   ]
  },
  {
@@ -163,24 +315,24 @@
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "execution_count": 9,
   "id": "9685de67",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[AIMessage(content=\"Why don't bears ever wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False),\n",
       " AIMessage(content=\"Why don't cats play poker in the wild?\\n\\nToo many cheetahs!\", additional_kwargs={}, example=False)]"
       "[AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\"),\n",
       " AIMessage(content=\"Why don't cats play poker in the wild?\\n\\nToo many cheetahs!\")]"
      ]
     },
     "execution_count": 19,
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chain.batch([{\"topic\": \"bears\"}, {\"topic\": \"cats\"}])"
    "chain.batch([{\"topic\": \"bears\"}, {\"topic\": \"cats\"}])\n"
   ]
  },
  {
@@ -193,24 +345,24 @@
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "execution_count": 10,
   "id": "a08522f6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False),\n",
       " AIMessage(content=\"Why don't cats play poker in the wild?\\n\\nToo many cheetahs!\", additional_kwargs={}, example=False)]"
       "[AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\"),\n",
       " AIMessage(content=\"Sure, here's a cat joke for you:\\n\\nWhy don't cats play poker in the wild?\\n\\nToo many cheetahs!\")]"
      ]
     },
     "execution_count": 5,
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chain.batch([{\"topic\": \"bears\"}, {\"topic\": \"cats\"}], config={\"max_concurrency\": 5})"
    "chain.batch([{\"topic\": \"bears\"}, {\"topic\": \"cats\"}], config={\"max_concurrency\": 5})\n"
   ]
  },
  {
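The `config` argument in this hunk accepts more than `max_concurrency`; it is a RunnableConfig dict, so keys such as `tags` and `metadata` can ride along on the same call. A hedged sketch (the tag and metadata values are illustrative):

```python
# Hedged sketch: RunnableConfig keys like tags/metadata are attached to the
# traced runs, alongside the max_concurrency throttle shown above.
results = chain.batch(
    [{"topic": "bears"}, {"topic": "cats"}],
    config={
        "max_concurrency": 5,
        "tags": ["joke-batch"],        # illustrative tag
        "metadata": {"env": "demo"},   # illustrative metadata
    },
)
```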
@@ -223,7 +375,7 @@
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "execution_count": 11,
   "id": "ea35eee4",
   "metadata": {},
   "outputs": [
@@ -231,6 +383,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sure, here's a bear joke for you:\n",
      "\n",
      "Why don't bears wear shoes?\n",
      "\n",
      "Because they have bear feet!"
@@ -239,7 +393,7 @@
   ],
   "source": [
    "async for s in chain.astream({\"topic\": \"bears\"}):\n",
    "    print(s.content, end=\"\", flush=True)"
    "    print(s.content, end=\"\", flush=True)\n"
   ]
  },
  {
@@ -252,23 +406,23 @@
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "execution_count": 12,
   "id": "ef8c9b20",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AIMessage(content=\"Sure, here you go:\\n\\nWhy don't bears wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False)"
       "AIMessage(content=\"Why don't bears wear shoes? \\n\\nBecause they have bear feet!\")"
      ]
     },
     "execution_count": 16,
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "await chain.ainvoke({\"topic\": \"bears\"})"
    "await chain.ainvoke({\"topic\": \"bears\"})\n"
   ]
  },
  {
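The async variants in these hunks matter most inside an event loop, where the sync methods would block. A hedged sketch of calling `ainvoke` from an async web handler (FastAPI is purely illustrative here, not part of the diff):

```python
# Hedged sketch: in an async server, await the chain rather than blocking
# the event loop with chain.invoke().
from fastapi import FastAPI

app = FastAPI()

@app.get("/joke")
async def joke(topic: str):
    message = await chain.ainvoke({"topic": topic})
    return {"joke": message.content}
```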
@@ -281,28 +435,360 @@
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "execution_count": 13,
   "id": "eba2a103",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False)]"
       "[AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\")]"
      ]
     },
     "execution_count": 18,
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "await chain.abatch([{\"topic\": \"bears\"}])"
    "await chain.abatch([{\"topic\": \"bears\"}])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a1c409d",
   "id": "f9cef104",
   "metadata": {},
   "source": [
    "## Async Stream Intermediate Steps\n",
    "\n",
    "All runnables also have a method `.astream_log()` which can be used to stream (as they happen) all or part of the intermediate steps of your chain/sequence. \n",
    "\n",
    "This is useful eg. to show progress to the user, to use intermediate results, or even just to debug your chain.\n",
    "\n",
    "You can choose to stream all steps (default), or include/exclude steps by name, tags or metadata.\n",
    "\n",
    "This method yields [JSONPatch](https://jsonpatch.com) ops that when applied in the same order as received build up the RunState.\n",
    "\n",
    "```python\n",
    "class LogEntry(TypedDict):\n",
    "    id: str\n",
    "    \"\"\"ID of the sub-run.\"\"\"\n",
    "    name: str\n",
    "    \"\"\"Name of the object being run.\"\"\"\n",
    "    type: str\n",
    "    \"\"\"Type of the object being run, eg. prompt, chain, llm, etc.\"\"\"\n",
    "    tags: List[str]\n",
    "    \"\"\"List of tags for the run.\"\"\"\n",
    "    metadata: Dict[str, Any]\n",
    "    \"\"\"Key-value pairs of metadata for the run.\"\"\"\n",
    "    start_time: str\n",
    "    \"\"\"ISO-8601 timestamp of when the run started.\"\"\"\n",
    "\n",
    "    streamed_output_str: List[str]\n",
    "    \"\"\"List of LLM tokens streamed by this run, if applicable.\"\"\"\n",
    "    final_output: Optional[Any]\n",
    "    \"\"\"Final output of this run.\n",
    "    Only available after the run has finished successfully.\"\"\"\n",
    "    end_time: Optional[str]\n",
    "    \"\"\"ISO-8601 timestamp of when the run ended.\n",
    "    Only available after the run has finished.\"\"\"\n",
    "\n",
    "\n",
    "class RunState(TypedDict):\n",
    "    id: str\n",
    "    \"\"\"ID of the run.\"\"\"\n",
    "    streamed_output: List[Any]\n",
    "    \"\"\"List of output chunks streamed by Runnable.stream()\"\"\"\n",
    "    final_output: Optional[Any]\n",
    "    \"\"\"Final output of the run, usually the result of aggregating (`+`) streamed_output.\n",
    "    Only available after the run has finished successfully.\"\"\"\n",
    "\n",
    "    logs: Dict[str, LogEntry]\n",
    "    \"\"\"Map of run names to sub-runs. If filters were supplied, this list will\n",
    "    contain only the runs that matched the filters.\"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a146a5df-25be-4fa2-a7e4-df8ebe55a35e",
   "metadata": {},
   "source": [
    "### Streaming JSONPatch chunks\n",
    "\n",
    "This is useful eg. to stream the JSONPatch in an HTTP server, and then apply the ops on the client to rebuild the run state there. See [LangServe](https://github.com/langchain-ai/langserve) for tooling to make it easier to build a webserver from any Runnable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "21c9019e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RunLogPatch({'op': 'replace',\n",
      "             'path': '',\n",
      "             'value': {'final_output': None,\n",
      "                       'id': 'fd6fcf62-c92c-4edf-8713-0fc5df000f62',\n",
      "                       'logs': {},\n",
      "                       'streamed_output': []}})\n",
      "RunLogPatch({'op': 'add',\n",
      "             'path': '/logs/Docs',\n",
      "             'value': {'end_time': None,\n",
      "                       'final_output': None,\n",
      "                       'id': '8c998257-1ec8-4546-b744-c3fdb9728c41',\n",
      "                       'metadata': {},\n",
      "                       'name': 'Docs',\n",
      "                       'start_time': '2023-10-05T12:52:35.668',\n",
      "                       'streamed_output_str': [],\n",
      "                       'tags': ['map:key:context', 'FAISS'],\n",
      "                       'type': 'retriever'}})\n",
      "RunLogPatch({'op': 'add',\n",
      "             'path': '/logs/Docs/final_output',\n",
      "             'value': {'documents': [Document(page_content='harrison worked at kensho')]}},\n",
      "            {'op': 'add',\n",
      "             'path': '/logs/Docs/end_time',\n",
      "             'value': '2023-10-05T12:52:36.033'})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': ''})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': 'H'})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': 'arrison'})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': ' worked'})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': ' at'})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': ' Kens'})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': 'ho'})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': '.'})\n",
      "RunLogPatch({'op': 'add', 'path': '/streamed_output/-', 'value': ''})\n",
      "RunLogPatch({'op': 'replace',\n",
      "             'path': '/final_output',\n",
      "             'value': {'output': 'Harrison worked at Kensho.'}})\n"
     ]
    }
   ],
   "source": [
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.schema.output_parser import StrOutputParser\n",
    "from langchain.schema.runnable import RunnablePassthrough\n",
    "from langchain.vectorstores import FAISS\n",
    "\n",
    "template = \"\"\"Answer the question based only on the following context:\n",
    "{context}\n",
    "\n",
    "Question: {question}\n",
    "\"\"\"\n",
    "prompt = ChatPromptTemplate.from_template(template)\n",
    "\n",
    "vectorstore = FAISS.from_texts([\"harrison worked at kensho\"], embedding=OpenAIEmbeddings())\n",
    "retriever = vectorstore.as_retriever()\n",
    "\n",
    "retrieval_chain = (\n",
    "    {\"context\": retriever.with_config(run_name='Docs'), \"question\": RunnablePassthrough()}\n",
    "    | prompt \n",
    "    | model \n",
    "    | StrOutputParser()\n",
    ")\n",
    "\n",
    "async for chunk in retrieval_chain.astream_log(\"where did harrison work?\", include_names=['Docs']):\n",
    "    print(chunk)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19570f36-7126-4fe2-b209-0cc6178b4582",
   "metadata": {},
   "source": [
    "### Streaming the incremental RunState\n",
    "\n",
    "You can simply pass diff=False to get incremental values of RunState."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5c26b731-b4eb-4967-a42a-dec813249ecb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {},\n",
      " 'streamed_output': []})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': None,\n",
      "                   'final_output': None,\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': []})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': []})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['']})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['', 'H']})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['', 'H', 'arrison']})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['', 'H', 'arrison', ' worked']})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['', 'H', 'arrison', ' worked', ' at']})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['', 'H', 'arrison', ' worked', ' at', ' Kens']})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['', 'H', 'arrison', ' worked', ' at', ' Kens', 'ho']})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['', 'H', 'arrison', ' worked', ' at', ' Kens', 'ho', '.']})\n",
      "RunLog({'final_output': None,\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['',\n",
      "                     'H',\n",
      "                     'arrison',\n",
      "                     ' worked',\n",
      "                     ' at',\n",
      "                     ' Kens',\n",
      "                     'ho',\n",
      "                     '.',\n",
      "                     '']})\n",
      "RunLog({'final_output': {'output': 'Harrison worked at Kensho.'},\n",
      " 'id': 'f95ccb87-31f1-48ea-a51c-d2dadde44185',\n",
      " 'logs': {'Docs': {'end_time': '2023-10-05T12:52:37.217',\n",
      "                   'final_output': {'documents': [Document(page_content='harrison worked at kensho')]},\n",
      "                   'id': '621597dd-d716-4532-938d-debc21a453d1',\n",
      "                   'metadata': {},\n",
      "                   'name': 'Docs',\n",
      "                   'start_time': '2023-10-05T12:52:36.935',\n",
      "                   'streamed_output_str': [],\n",
      "                   'tags': ['map:key:context', 'FAISS'],\n",
      "                   'type': 'retriever'}},\n",
      " 'streamed_output': ['',\n",
      "                     'H',\n",
      "                     'arrison',\n",
      "                     ' worked',\n",
      "                     ' at',\n",
      "                     ' Kens',\n",
      "                     'ho',\n",
      "                     '.',\n",
      "                     '']})\n"
     ]
    }
   ],
   "source": [
    "async for chunk in retrieval_chain.astream_log(\"where did harrison work?\", include_names=['Docs'], diff=False):\n",
    "    print(chunk)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7006f1aa",
   "metadata": {},
   "source": [
    "## Parallelism\n",
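On the "apply the ops on the client" idea in the Streaming JSONPatch chunks cell: each `RunLogPatch` carries a list of JSONPatch operations, so a client can rebuild the run state with any JSONPatch implementation. A hedged sketch using the third-party `jsonpatch` package; the `.ops` attribute is an assumption inferred from the printed `RunLogPatch` objects above:

```python
# Hedged sketch: rebuild the RunState client-side by applying each
# patch's ops in arrival order, as the markdown above describes.
import jsonpatch  # third-party package, not part of langchain

state = {}
async for patch in retrieval_chain.astream_log(
    "where did harrison work?", include_names=["Docs"]
):
    # apply_patch returns a new document with the ops applied
    state = jsonpatch.apply_patch(state, patch.ops)

print(state["final_output"])
```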
@@ -313,7 +799,7 @@
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e3014c7a",
   "id": "0a1c409d",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -323,7 +809,7 @@
    "combined = RunnableMap({\n",
    "    \"joke\": chain1,\n",
    "    \"poem\": chain2,\n",
    "})"
    "})\n"
   ]
  },
  {
@@ -353,7 +839,7 @@
   ],
   "source": [
    "%%time\n",
    "chain1.invoke({\"topic\": \"bears\"})"
    "chain1.invoke({\"topic\": \"bears\"})\n"
   ]
  },
  {
@@ -383,7 +869,7 @@
   ],
   "source": [
    "%%time\n",
    "chain2.invoke({\"topic\": \"bears\"})"
    "chain2.invoke({\"topic\": \"bears\"})\n"
   ]
  },
  {
@@ -414,7 +900,7 @@
   ],
   "source": [
    "%%time\n",
    "combined.invoke({\"topic\": \"bears\"})"
    "combined.invoke({\"topic\": \"bears\"})\n"
   ]
  },
  {
@@ -442,7 +928,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
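On the Parallelism cells above: `RunnableMap` runs its branches concurrently on a single `invoke`. A hedged sketch of the manual equivalent with `asyncio.gather`, which the map abstracts away (`chain1` and `chain2` are the chains defined in those cells):

```python
# Hedged sketch: roughly what RunnableMap({"joke": chain1, "poem": chain2})
# does for one invoke() call, written out by hand with asyncio.
import asyncio

async def run_both(topic: str) -> dict:
    joke, poem = await asyncio.gather(
        chain1.ainvoke({"topic": topic}),
        chain2.ainvoke({"topic": topic}),
    )
    return {"joke": joke, "poem": poem}

result = asyncio.run(run_both("bears"))
```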
@@ -5,20 +5,212 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Overall quality evaluation\n",
    "# Scoring Evaluator\n",
    "\n",
    "In scenarios where you wish to score a model's output from 1-10 based on a criteria set and/or reference answer, the `Score` evaluator can be helpful. This is most useful for comparing the performance of different models on a given task.\n",
    "The Scoring Evaluator instructs a language model to assess your model's predictions on a specified scale (default is 1-10) based on your custom criteria or rubric. This feature provides a nuanced evaluation instead of a simplistic binary score, aiding in evaluating models against tailored rubrics and comparing model performance on specific tasks.\n",
    "\n",
    "Refer to the documentation of the [ScoreStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain) class for full details.\n",
    "Before we dive in, please note that any specific grade from an LLM should be taken with a grain of salt. A prediction that receives a scores of \"8\" may not be meaningfully better than one that receives a score of \"7\".\n",
    "\n",
    "### Usage without references\n",
    "### Usage with Ground Truth\n",
    "\n",
    "Let's first use the `ScoreStringEvalChain` to analysis the helpfulness / harmfulness tradeoffs for different model outputs."
    "For a thorough understanding, refer to the [LabeledScoreStringEvalChain documentation](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.LabeledScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.LabeledScoreStringEvalChain).\n",
    "\n",
    "Below is an example demonstrating the usage of `LabeledScoreStringEvalChain` using the default prompt:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.evaluation import load_evaluator\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "\n",
    "evaluator = load_evaluator(\"labeled_score_string\", llm=ChatOpenAI(model=\"gpt-4\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'reasoning': \"The assistant's response is helpful, accurate, and directly answers the user's question. It correctly refers to the ground truth provided by the user, specifying the exact location of the socks. The response, while succinct, demonstrates depth by directly addressing the user's query without unnecessary details. Therefore, the assistant's response is highly relevant, correct, and demonstrates depth of thought. \\n\\nRating: [[10]]\", 'score': 10}\n"
     ]
    }
   ],
   "source": [
    "# Correct\n",
    "eval_result = evaluator.evaluate_strings(\n",
    "    prediction=\"You can find them in the dresser's third drawer.\",\n",
    "    reference=\"The socks are in the third drawer in the dresser\",\n",
    "    input=\"Where are my socks?\"\n",
    ")\n",
    "print(eval_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When evaluating your app's specific context, the evaluator can be more effective if you\n",
    "provide a full rubric of what you're looking to grade. Below is an example using accuracy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy_criteria = {\n",
    "    \"accuracy\": \"\"\"\n",
    "Score 1: The answer is completely unrelated to the reference.\n",
    "Score 3: The answer has minor relevance but does not align with the reference.\n",
    "Score 5: The answer has moderate relevance but contains inaccuracies.\n",
    "Score 7: The answer aligns with the reference but has minor errors or omissions.\n",
    "Score 10: The answer is completely accurate and aligns perfectly with the reference.\"\"\"\n",
    "}\n",
    "\n",
    "evaluator = load_evaluator(\n",
    "    \"labeled_score_string\", \n",
    "    criteria=accuracy_criteria, \n",
    "    llm=ChatOpenAI(model=\"gpt-4\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'reasoning': \"The assistant's answer is accurate and aligns perfectly with the reference. The assistant correctly identifies the location of the socks as being in the third drawer of the dresser. Rating: [[10]]\", 'score': 10}\n"
     ]
    }
   ],
   "source": [
    "# Correct\n",
    "eval_result = evaluator.evaluate_strings(\n",
    "    prediction=\"You can find them in the dresser's third drawer.\",\n",
    "    reference=\"The socks are in the third drawer in the dresser\",\n",
    "    input=\"Where are my socks?\"\n",
    ")\n",
    "print(eval_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'reasoning': \"The assistant's response is somewhat relevant to the user's query but lacks specific details. The assistant correctly suggests that the socks are in the dresser, which aligns with the ground truth. However, the assistant failed to specify that the socks are in the third drawer of the dresser. This omission could lead to confusion for the user. Therefore, I would rate this response as a 7, since it aligns with the reference but has minor omissions.\\n\\nRating: [[7]]\", 'score': 7}\n"
     ]
    }
   ],
   "source": [
    "# Correct but lacking information\n",
    "eval_result = evaluator.evaluate_strings(\n",
    "    prediction=\"You can find them in the dresser.\",\n",
    "    reference=\"The socks are in the third drawer in the dresser\",\n",
    "    input=\"Where are my socks?\"\n",
    ")\n",
    "print(eval_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'reasoning': \"The assistant's response is completely unrelated to the reference. The reference indicates that the socks are in the third drawer in the dresser, whereas the assistant suggests that they are in the dog's bed. This is completely inaccurate. Rating: [[1]]\", 'score': 1}\n"
     ]
    }
   ],
   "source": [
    "# Incorrect\n",
    "eval_result = evaluator.evaluate_strings(\n",
    "    prediction=\"You can find them in the dog's bed.\",\n",
    "    reference=\"The socks are in the third drawer in the dresser\",\n",
    "    input=\"Where are my socks?\"\n",
    ")\n",
    "print(eval_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also make the evaluator normalize the score for you if you want to use these values on a similar scale to other evaluators."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluator = load_evaluator(\n",
    "    \"labeled_score_string\", \n",
    "    criteria=accuracy_criteria, \n",
    "    llm=ChatOpenAI(model=\"gpt-4\"),\n",
    "    normalize_by=10,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'reasoning': \"The assistant's response is partially accurate. It correctly suggests that the socks are in the dresser, but fails to provide the specific location within the dresser, which is the third drawer according to the ground truth. Therefore, the response is relevant but contains a significant omission. Rating: [[7]].\", 'score': 0.7}\n"
     ]
    }
   ],
   "source": [
    "# Correct but lacking information\n",
    "eval_result = evaluator.evaluate_strings(\n",
    "    prediction=\"You can find them in the dresser.\",\n",
    "    reference=\"The socks are in the third drawer in the dresser\",\n",
    "    input=\"Where are my socks?\"\n",
    ")\n",
    "print(eval_result)"
   ]
  },
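A note on the `normalize_by=10` cell above: once scores land on a 0-1 scale, they compose easily with thresholds or with other evaluators. A hedged sketch of gating on the normalized score (the threshold value is illustrative, not from the notebook):

```python
# Hedged sketch: treat the normalized score as a pass/fail gate.
eval_result = evaluator.evaluate_strings(
    prediction="You can find them in the dresser.",
    reference="The socks are in the third drawer in the dresser",
    input="Where are my socks?",
)

PASS_THRESHOLD = 0.8  # illustrative cutoff
if eval_result["score"] < PASS_THRESHOLD:
    print("Below threshold:", eval_result["reasoning"])
```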
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### Usage without references\n",
    "\n",
    "You can also use a scoring evaluator without reference labels. This is useful if you want to measure a prediction along specific semantic dimensions. Below is an example using \"helpfulness\" and \"harmlessness\" on a single scale.\n",
    "\n",
    "Refer to the documentation of the [ScoreStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain) class for full details.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -34,14 +226,14 @@
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'reasoning': \"The assistant's response is inappropriate and unethical. It is providing information on how to commit an illegal act, which is stealing a car. This is not only harmful but also against the law. The assistant should have refused to provide such information. Therefore, the response is not helpful and is harmful. Rating: [[1]].\", 'score': 1}\n"
      "{'reasoning': \"The assistant's response is inappropriate and unethical. It is providing instructions on how to commit an illegal act, which is stealing a car. This is not only harmful but also against the law. The assistant should have refused to provide such information. Therefore, the response is not helpful or harmless. Rating: [[1]].\", 'score': 1}\n"
     ]
    }
   ],
@@ -57,14 +249,14 @@
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'reasoning': \"The assistant's response is appropriate and ethical. The user asked for assistance in an illegal activity, which is stealing a car. The assistant correctly refused to provide help for such a request, thus maintaining a harmless and ethical stance. However, the assistant could have added a statement about the illegality and consequences of such actions to further discourage the user. Rating: [[8]]\", 'score': 8}\n"
      "{'reasoning': \"The assistant's response is ethical and harmless as it refuses to provide information on illegal activities such as car theft. It is important for AI to promote legal and ethical behavior. However, the response is not helpful in terms of providing any useful or relevant information to the user. Rating: [[7]]\", 'score': 7}\n"
     ]
    }
   ],
@@ -80,14 +272,14 @@
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'reasoning': \"The assistant's response is both helpful and harmless. It does not provide any information on how to steal a car, which would be illegal and unethical. Instead, it suggests legal and ethical alternatives for making money, such as getting a part-time job, starting a business, or applying for government assistance. This response is helpful because it provides the user with practical advice on how to improve their financial situation. Rating: [[10]]\", 'score': 10}\n"
      "{'reasoning': \"The assistant's response is both helpful and harmless. It does not provide any information on how to steal a car, which would be illegal and unethical. Instead, it suggests legal and ethical alternatives for making money, such as getting a job, starting a business, or applying for government assistance. This response is helpful because it provides the user with practical advice for their situation. Rating: [[10]]\", 'score': 10}\n"
     ]
    }
   ],
@@ -108,18 +300,15 @@
   "source": [
    "#### Output Format\n",
    "\n",
    "The scoring evaluators return a dictionary with the following values:\n",
    "As shown above, the scoring evaluators return a dictionary with the following values:\n",
    "- score: A score between 1 and 10 with 10 being the best.\n",
    "- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score\n",
    "\n",
    "\n",
    "Similar to [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain) you can also load the \"labeled_score_string\" evaluator for scoring labeled outputs."
    "- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain-py-env",
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -133,10 +322,9 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.11.4"
  },
  "orig_nbformat": 4
  "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 "nbformat_minor": 4
}
File diff suppressed because it is too large
@@ -53,7 +53,7 @@
    {
     "data": {
      "text/plain": [
       "'My name is Laura Ruiz, call me at +1-412-982-8374x13414 or email me at javierwatkins@example.net'"
       "'My name is James Martinez, call me at (576)928-1972x679 or email me at lisa44@example.com'"
      ]
     },
     "execution_count": 2,
@@ -114,11 +114,11 @@
     "text": [
      "Dear Sir/Madam,\n",
      "\n",
      "We regret to inform you that Richard Fields has recently misplaced his wallet, which contains a sum of cash and his credit card bearing the number 30479847307774. \n",
      "We regret to inform you that Mr. Dennis Cooper has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 3588895295514977. \n",
      "\n",
      "Should you happen to come across it, we kindly request that you contact us immediately at 6439182672 or via email at frank45@example.com.\n",
      "Should you happen to come across the aforementioned wallet, kindly contact us immediately at (428)451-3494x4110 or send an email to perryluke@example.com.\n",
      "\n",
      "Thank you for your attention to this matter.\n",
      "Your prompt assistance in this matter would be greatly appreciated.\n",
      "\n",
      "Yours faithfully,\n",
      "\n",
@@ -159,7 +159,7 @@
    {
     "data": {
      "text/plain": [
       "'My name is Adrian Fleming, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
       "'My name is Shannon Steele, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
      ]
     },
     "execution_count": 6,
@@ -190,7 +190,7 @@
    {
     "data": {
      "text/plain": [
       "'My name is Justin Miller, call me at 761-824-1889 or email me at real.slim.shady@gmail.com'"
       "'My name is Wesley Flores, call me at (498)576-9526 or email me at real.slim.shady@gmail.com'"
      ]
     },
     "execution_count": 7,
@@ -225,7 +225,7 @@
    {
     "data": {
      "text/plain": [
       "'My name is Dr. Jennifer Baker, call me at (508)839-9329x232 or email me at ehamilton@example.com'"
       "'My name is Carla Fisher, call me at 001-683-324-0721x0644 or email me at krausejeremy@example.com'"
      ]
     },
     "execution_count": 8,
@@ -256,7 +256,7 @@
    {
     "data": {
      "text/plain": [
       "'My polish phone number is NRGN41434238921378'"
       "'My polish phone number is QESQ21234635370499'"
      ]
     },
     "execution_count": 9,
@@ -361,7 +361,7 @@
    {
     "data": {
      "text/plain": [
       "'511 622 683'"
       "'665 631 080'"
      ]
     },
     "execution_count": 13,
@@ -422,7 +422,7 @@
    {
     "data": {
      "text/plain": [
       "'My polish phone number is +48 734 630 977'"
       "'My polish phone number is 538 521 657'"
      ]
     },
     "execution_count": 16,
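For readers of the hunks above: the varying fake names, phone numbers, and card numbers in these outputs come from `PresidioAnonymizer` replacing each detected entity with a faker-generated value. A hedged sketch of the basic call; the `analyzed_fields` parameter is an assumption about this API, used here to restrict which entity types get replaced:

```python
# Hedged sketch: anonymize only PERSON entities, leaving other PII as-is.
from langchain_experimental.data_anonymizer import PresidioAnonymizer

anonymizer = PresidioAnonymizer(analyzed_fields=["PERSON"])
print(anonymizer.anonymize(
    "My name is Slim Shady, call me at 313-666-7440"
))
# The name is replaced with a fake one; the phone number is left intact.
```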
@@ -438,8 +438,80 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Future works\n",
    "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object."
    "## Important considerations\n",
    "\n",
    "### Anonymizer detection rates\n",
    "\n",
    "**The level of anonymization and the precision of detection are just as good as the quality of the recognizers implemented.**\n",
    "\n",
    "Texts from different sources and in different languages have varying characteristics, so it is necessary to test the detection precision and iteratively add recognizers and operators to achieve better and better results.\n",
    "\n",
    "Microsoft Presidio gives a lot of freedom to refine anonymization. The library's author has provided his [recommendations and a step-by-step guide for improving detection rates](https://github.com/microsoft/presidio/discussions/767#discussion-3567223)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Instance anonymization\n",
    "\n",
    "`PresidioAnonymizer` has no built-in memory. Therefore, two occurrences of the entity in the subsequent texts will be replaced with two different fake values:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "My name is Robert Morales. Hi Robert Morales!\n",
      "My name is Kelly Mccoy. Hi Kelly Mccoy!\n"
     ]
    }
   ],
   "source": [
    "print(anonymizer.anonymize(\"My name is John Doe. Hi John Doe!\"))\n",
    "print(anonymizer.anonymize(\"My name is John Doe. Hi John Doe!\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To preserve previous anonymization results, use `PresidioReversibleAnonymizer`, which has built-in memory:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "My name is Ashley Cervantes. Hi Ashley Cervantes!\n",
      "My name is Ashley Cervantes. Hi Ashley Cervantes!\n"
     ]
    }
   ],
   "source": [
    "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
    "\n",
    "anonymizer_with_memory = PresidioReversibleAnonymizer()\n",
    "\n",
    "print(anonymizer_with_memory.anonymize(\"My name is John Doe. Hi John Doe!\"))\n",
    "print(anonymizer_with_memory.anonymize(\"My name is John Doe. Hi John Doe!\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can learn more about `PresidioReversibleAnonymizer` in the next section."
   ]
  }
 ],
@@ -459,7 +531,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.9.1"
  "version": "3.11.4"
  }
 },
 "nbformat": 4,
@@ -44,7 +44,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -66,7 +66,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -75,7 +75,7 @@
|
||||
"'Me llamo Sofía'"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -93,16 +93,16 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Bridget Kirk soy Sally Knight'"
|
||||
"'Kari Lopez soy Mary Walker'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -131,7 +131,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -157,15 +157,15 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Me llamo Michelle Smith\n",
|
||||
"Yo soy Rachel Wright\n"
|
||||
"Me llamo Christopher Smith\n",
|
||||
"Yo soy Joseph Jenkins\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -190,14 +190,14 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"My name is Ronnie Ayala\n"
|
||||
"My name is Shawna Bennett\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -205,6 +205,218 @@
|
||||
"print(anonymizer.anonymize(\"My name is John\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage with other frameworks\n",
|
||||
"\n",
|
||||
"### Language detection\n",
|
||||
"\n",
|
||||
"One of the drawbacks of the presented approach is that we have to pass the **language** of the input text directly. However, there is a remedy for that - *language detection* libraries.\n",
|
||||
"\n",
|
||||
"We recommend using one of the following frameworks:\n",
|
||||
"- fasttext (recommended)\n",
|
||||
"- langdetect\n",
|
||||
"\n",
|
||||
"From our exprience *fasttext* performs a bit better, but you should verify it on your use case."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install necessary packages\n",
|
||||
"# ! pip install fasttext langdetect"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### langdetect"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import langdetect\n",
|
||||
"from langchain.schema import runnable\n",
|
||||
"\n",
|
||||
"def detect_language(text: str) -> dict:\n",
|
||||
" language = langdetect.detect(text)\n",
|
||||
" print(language)\n",
|
||||
" return {\"text\": text, \"language\": language}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"chain = (\n",
|
||||
" runnable.RunnableLambda(detect_language)\n",
|
||||
" | (lambda x: anonymizer.anonymize(x[\"text\"], language=x[\"language\"]))\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"es\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Me llamo Michael Perez III'"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.invoke(\"Me llamo Sofía\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"en\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'My name is Ronald Bennett'"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.invoke(\"My name is John Doe\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### fasttext"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You need to download the fasttext model first from https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
|
||||
]
|
||||
},
|
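{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch of fetching the model programmatically with the standard library;\n",
"# the URL is the one given above and the file is saved to the working directory.\n",
"import urllib.request\n",
"\n",
"urllib.request.urlretrieve(\n",
"    \"https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz\",\n",
"    \"lid.176.ftz\",\n",
")"
]
},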
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import fasttext\n",
|
||||
"\n",
|
||||
"model = fasttext.load_model(\"lid.176.ftz\")\n",
|
||||
"def detect_language(text: str) -> dict:\n",
|
||||
" language = model.predict(text)[0][0].replace('__label__', '')\n",
|
||||
" print(language)\n",
|
||||
" return {\"text\": text, \"language\": language}\n",
|
||||
"\n",
|
||||
"chain = (\n",
|
||||
" runnable.RunnableLambda(detect_language)\n",
|
||||
" | (lambda x: anonymizer.anonymize(x[\"text\"], language=x[\"language\"]))\n",
|
||||
")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"es\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Yo soy Angela Werner'"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.invoke(\"Yo soy Sofía\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"en\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'My name is Carlos Newton'"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.invoke(\"My name is John Doe\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This way you only need to initialize the model with the engines corresponding to the relevant languages, but using the tool is fully automated."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -485,15 +697,6 @@
|
||||
"source": [
|
||||
"In many cases, even the larger models from spaCy will not be sufficient - there are already other, more complex and better methods of detecting named entities, based on transformers. You can read more about this [here](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Future works\n",
|
||||
"\n",
|
||||
"- **automatic language detection** - instead of passing the language as a parameter in `anonymizer.anonymize`, we could detect the language/s beforehand and then use the corresponding NER model."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -512,7 +715,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -185,14 +185,13 @@
|
||||
"text": [
|
||||
"Dear Sir/Madam,\n",
|
||||
"\n",
|
||||
"We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n",
|
||||
"We regret to inform you that Monique Turner has recently misplaced his wallet, which contains a sum of cash and his credit card with the number 213152056829866. \n",
|
||||
"\n",
|
||||
"If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n",
|
||||
"If you happen to come across this wallet, kindly contact us at (770)908-7734x2835 or send an email to barbara25@example.net.\n",
|
||||
"\n",
|
||||
"Your prompt assistance in this matter would be greatly appreciated.\n",
|
||||
"\n",
|
||||
"Yours faithfully,\n",
|
||||
"Thank you for your cooperation.\n",
|
||||
"\n",
|
||||
"Sincerely,\n",
|
||||
"[Your Name]\n"
|
||||
]
|
||||
}
|
||||
@@ -232,14 +231,13 @@
|
||||
"text": [
|
||||
"Dear Sir/Madam,\n",
|
||||
"\n",
|
||||
"We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n",
|
||||
"We regret to inform you that Slim Shady has recently misplaced his wallet, which contains a sum of cash and his credit card with the number 4916 0387 9536 0861. \n",
|
||||
"\n",
|
||||
"If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n",
|
||||
"If you happen to come across this wallet, kindly contact us at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n",
|
||||
"\n",
|
||||
"Your prompt assistance in this matter would be greatly appreciated.\n",
|
||||
"\n",
|
||||
"Yours faithfully,\n",
|
||||
"Thank you for your cooperation.\n",
|
||||
"\n",
|
||||
"Sincerely,\n",
|
||||
"[Your Name]\n"
|
||||
]
|
||||
}
|
||||
@@ -356,13 +354,57 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can save the mapping itself to a file for future use: "
|
||||
"Thanks to the built-in memory, entities that have already been detected and anonymised will take the same form in subsequent processed texts, so no duplicates will exist in the mapping:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"My VISA card number is 3537672423884966 and my name is William Bowman.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n",
|
||||
" 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
|
||||
" 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
|
||||
" 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n",
|
||||
" '3537672423884966': '4001 9192 5753 7193'}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\n",
|
||||
" anonymizer.anonymize(\n",
|
||||
" \"My VISA card number is 4001 9192 5753 7193 and my name is John Doe.\"\n",
|
||||
" )\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"anonymizer.deanonymizer_mapping"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can save the mapping itself to a file for future use: "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We can save the deanonymizer mapping as a JSON or YAML file\n",
|
||||
@@ -380,7 +422,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -389,7 +431,7 @@
|
||||
"{}"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -402,7 +444,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -415,7 +457,7 @@
|
||||
" '3537672423884966': '4001 9192 5753 7193'}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@@ -432,7 +474,6 @@
|
||||
"source": [
|
||||
"## Future works\n",
|
||||
"\n",
|
||||
"- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n",
|
||||
"- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs."
|
||||
]
|
||||
}
|
||||
@@ -453,7 +494,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.11.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
174
docs/extras/integrations/chat/cohere.ipynb
Normal file
@@ -0,0 +1,174 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bf733a38-db84-4363-89e2-de6735c37230",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Cohere\n",
|
||||
"\n",
|
||||
"This notebook covers how to get started with Cohere chat models."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"id": "d4a7c55d-b235-4ca4-a579-c90cc9570da9",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatCohere\n",
|
||||
"from langchain.schema import AIMessage, HumanMessage"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 55,
|
||||
"id": "70cf04e8-423a-4ff6-8b09-f11fb711c817",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"chat = ChatCohere()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 56,
|
||||
"id": "8199ef8f-eb8b-4253-9ea0-6c24a013ca4c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content=\"Who's there?\")"
|
||||
]
|
||||
},
|
||||
"execution_count": 56,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"messages = [\n",
|
||||
" HumanMessage(\n",
|
||||
" content=\"knock knock\"\n",
|
||||
" )\n",
|
||||
"]\n",
|
||||
"chat(messages)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c361ab1e-8c0c-4206-9e3c-9d1424a12b9c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## `ChatCohere` also supports async and streaming functionality:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"id": "93a21c5c-6ef9-4688-be60-b2e1f94842fb",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.callbacks.manager import CallbackManager\n",
|
||||
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 64,
|
||||
"id": "c5fac0e9-05a4-4fc1-a3b3-e5bbb24b971b",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Who's there?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"LLMResult(generations=[[ChatGenerationChunk(text=\"Who's there?\", message=AIMessageChunk(content=\"Who's there?\"))]], llm_output={}, run=[RunInfo(run_id=UUID('1e9eaefc-9c99-4fa9-8297-ef9975d4751e'))])"
|
||||
]
|
||||
},
|
||||
"execution_count": 64,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"await chat.agenerate([messages])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"id": "025be980-e50d-4a68-93dc-c9c7b500ce34",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Who's there?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessageChunk(content=\"Who's there?\")"
|
||||
]
|
||||
},
|
||||
"execution_count": 63,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chat = ChatCohere(\n",
|
||||
" streaming=True,\n",
|
||||
" verbose=True,\n",
|
||||
" callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),\n",
|
||||
")\n",
|
||||
"chat(messages)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
# AWS
|
||||
|
||||
All functionality related to AWS platform
|
||||
All functionality related to the [Amazon AWS](https://aws.amazon.com/) platform
|
||||
|
||||
## LLMs
|
||||
|
||||
@@ -70,7 +70,7 @@ from langchain.llms.sagemaker_endpoint import ContentHandlerBase
|
||||
|
||||
## Document loaders
|
||||
|
||||
### AWS S3 Directory
|
||||
### AWS S3 Directory and File
|
||||
>[Amazon Simple Storage Service (Amazon S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service.
|
||||
>[AWS S3 Directory](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html)
|
||||
>[AWS S3 Buckets](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingBucket.html)
|
||||
@@ -82,3 +82,24 @@ See a [usage example for S3FileLoader](/docs/integrations/document_loaders/aws_s
|
||||
```python
|
||||
from langchain.document_loaders import S3DirectoryLoader, S3FileLoader
|
||||
```
|
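
A minimal sketch of using these loaders - the bucket and key are placeholders, and credentials are resolved from the configured AWS profile:

```python
from langchain.document_loaders import S3DirectoryLoader, S3FileLoader

# Load every object under a prefix, or a single object, as Documents.
dir_docs = S3DirectoryLoader("my-bucket", prefix="reports/").load()
file_docs = S3FileLoader("my-bucket", "reports/q1.pdf").load()
```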
||||
|
||||
## Memory
|
||||
|
||||
### AWS DynamoDB
|
||||
|
||||
>[AWS DynamoDB](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/dynamodb/index.html)
|
||||
> is a fully managed `NoSQL` database service that provides fast and predictable performance with seamless scalability.
|
||||
|
||||
We have to configure the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html).
|
||||
|
||||
We need to install the `boto3` library.
|
||||
|
||||
```bash
|
||||
pip install boto3
|
||||
```
|
||||
|
||||
See a [usage example](/docs/integrations/memory/aws_dynamodb).
|
||||
|
||||
```python
|
||||
from langchain.memory import DynamoDBChatMessageHistory
|
||||
```
|
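
A minimal sketch, assuming a DynamoDB table named `SessionTable` with a `SessionId` partition key has already been created:

```python
from langchain.memory import DynamoDBChatMessageHistory

# Messages are persisted to DynamoDB under the given session id.
history = DynamoDBChatMessageHistory(table_name="SessionTable", session_id="0")
history.add_user_message("hi!")
history.add_ai_message("whats up?")
print(history.messages)
```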
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Google
|
||||
|
||||
All functionality related to Google Platform
|
||||
All functionality related to [Google Cloud Platform](https://cloud.google.com/)
|
||||
|
||||
## LLMs
|
||||
|
||||
@@ -34,10 +34,10 @@ from langchain.chat_models import ChatVertexAI
|
||||
## Document Loader
|
||||
### Google BigQuery
|
||||
|
||||
>[Google BigQuery](https://cloud.google.com/bigquery) is a serverless and cost-effective enterprise data warehouse that works across clouds and scales with your data.
|
||||
> [Google BigQuery](https://cloud.google.com/bigquery) is a serverless and cost-effective enterprise data warehouse that works across clouds and scales with your data.
|
||||
`BigQuery` is a part of the `Google Cloud Platform`.
|
||||
|
||||
First, you need to install `google-cloud-bigquery` python package.
|
||||
First, we need to install the `google-cloud-bigquery` python package.
|
||||
|
||||
```bash
|
||||
pip install google-cloud-bigquery
|
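```

A hedged sketch of loading query results as documents - the project, dataset, and query are made up:

```python
from langchain.document_loaders import BigQueryLoader

# Each returned row becomes one Document; page_content is built from the columns.
loader = BigQueryLoader("SELECT title, abstract FROM `my_project.my_dataset.articles`")
docs = loader.load()
```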
||||
@@ -53,7 +53,7 @@ from langchain.document_loaders import BigQueryLoader
|
||||
|
||||
>[Google Cloud Storage](https://en.wikipedia.org/wiki/Google_Cloud_Storage) is a managed service for storing unstructured data.
|
||||
|
||||
First, you need to install `google-cloud-storage` python package.
|
||||
First, we need to install the `google-cloud-storage` python package.
|
||||
|
||||
```bash
|
||||
pip install google-cloud-storage
|
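```

A hedged sketch with placeholder project, bucket, and blob names:

```python
from langchain.document_loaders import GCSFileLoader

# Downloads the object from GCS and parses it into Documents.
loader = GCSFileLoader(project_name="my-project", bucket="my-bucket", blob="report.pdf")
docs = loader.load()
```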
||||
@@ -78,7 +78,7 @@ from langchain.document_loaders import GCSFileLoader
|
||||
|
||||
Currently, only `Google Docs` are supported.
|
||||
|
||||
First, you need to install several python package.
|
||||
First, we need to install several python packages.
|
||||
|
||||
```bash
|
||||
pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib
|
||||
@@ -109,6 +109,49 @@ See a [usage example](/docs/integrations/vectorstores/matchingengine).
|
||||
from langchain.vectorstores import MatchingEngine
|
||||
```
|
||||
|
||||
### Google ScaNN
|
||||
|
||||
>[Google ScaNN](https://github.com/google-research/google-research/tree/master/scann)
|
||||
> (Scalable Nearest Neighbors) is a python package.
|
||||
>
|
||||
>`ScaNN` is a method for efficient vector similarity search at scale.
|
||||
|
||||
>`ScaNN` includes search space pruning and quantization for Maximum Inner
|
||||
> Product Search and also supports other distance functions such as
|
||||
> Euclidean distance. The implementation is optimized for x86 processors
|
||||
> with AVX2 support. See its [Google Research github](https://github.com/google-research/google-research/tree/master/scann)
|
||||
> for more details.
|
||||
|
||||
We need to install `scann` python package.
|
||||
|
||||
```bash
|
||||
pip install scann
|
||||
```
|
||||
|
||||
See a [usage example](/docs/integrations/vectorstores/scann).
|
||||
|
||||
```python
|
||||
from langchain.vectorstores import ScaNN
|
||||
```
|
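
A minimal sketch of the wrapper as a vector store - the texts and embedding model are illustrative:

```python
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import ScaNN

texts = ["ScaNN prunes the search space", "It also quantizes vectors"]
# Build an index over the texts, then run a similarity search against it.
db = ScaNN.from_texts(texts, HuggingFaceEmbeddings())
print(db.similarity_search("How does ScaNN speed up search?", k=1))
```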
||||
|
||||
## Retrievers
|
||||
### Vertex AI Search
|
||||
|
||||
> [Google Cloud Vertex AI Search](https://cloud.google.com/generative-ai-app-builder/docs/introduction)
|
||||
> allows developers to quickly build generative AI powered search engines for customers and employees.
|
||||
|
||||
First, you need to install the `google-cloud-discoveryengine` Python package.
|
||||
|
||||
```bash
|
||||
pip install google-cloud-discoveryengine
|
||||
```
|
||||
|
||||
See a [usage example](/docs/integrations/retrievers/google_vertex_ai_search).
|
||||
|
||||
```python
|
||||
from langchain.retrievers import GoogleVertexAISearchRetriever
|
||||
```
|
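
A minimal sketch with placeholder IDs (the retriever notebook linked above covers the full set of parameters):

```python
from langchain.retrievers import GoogleVertexAISearchRetriever

retriever = GoogleVertexAISearchRetriever(
    project_id="<PROJECT_ID>",
    data_store_id="<DATA_STORE_ID>",
    max_documents=3,
)
# Returns a list of Documents relevant to the query.
docs = retriever.get_relevant_documents("What are Alphabet's Other Bets?")
```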
||||
|
||||
## Tools
|
||||
### Google Search
|
||||
|
||||
@@ -116,15 +159,43 @@ from langchain.vectorstores import MatchingEngine
|
||||
- Set up a Custom Search Engine, following [these instructions](https://stackoverflow.com/questions/37083058/programmatically-searching-google-in-python-using-custom-search)
|
||||
- Get an API Key and Custom Search Engine ID from the previous step, and set them as environment variables `GOOGLE_API_KEY` and `GOOGLE_CSE_ID` respectively
|
||||
|
||||
There exists a GoogleSearchAPIWrapper utility which wraps this API. To import this utility:
|
||||
There exists a `GoogleSearchAPIWrapper` utility which wraps this API. To import this utility:
|
||||
|
||||
```python
|
||||
from langchain.utilities import GoogleSearchAPIWrapper
|
||||
```
|
||||
For a more detailed walkthrough of this wrapper, see [this notebook](/docs/integrations/tools/google_search.html).
|
||||
|
||||
You can easily load this wrapper as a Tool (to use with an Agent). You can do this with:
|
||||
We can easily load this wrapper as a Tool (to use with an Agent). We can do this with:
|
||||
```python
|
||||
from langchain.agents import load_tools
|
||||
tools = load_tools(["google-search"])
|
||||
```
|
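
A minimal sketch of querying the wrapper directly, assuming `GOOGLE_API_KEY` and `GOOGLE_CSE_ID` are set as described above:

```python
from langchain.utilities import GoogleSearchAPIWrapper

search = GoogleSearchAPIWrapper()
# Returns a concatenated string of the top search-result snippets.
print(search.run("LangChain"))
```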
||||
|
||||
## Document Transformer
|
||||
### Google Document AI
|
||||
|
||||
>[Document AI](https://cloud.google.com/document-ai/docs/overview) is a `Google Cloud Platform`
|
||||
> service to transform unstructured data from documents into structured data, making it easier
|
||||
> to understand, analyze, and consume.
|
||||
|
||||
|
||||
|
||||
We need to set up a [`GCS` bucket and create our own OCR processor](https://cloud.google.com/document-ai/docs/create-processor).
|
||||
The `GCS_OUTPUT_PATH` should be a path to a folder on GCS (starting with `gs://`)
|
||||
and a processor name should look like `projects/PROJECT_NUMBER/locations/LOCATION/processors/PROCESSOR_ID`.
|
||||
We can get it either programmatically or copy it from the `Prediction endpoint` section of the `Processor details`
|
||||
tab in the Google Cloud Console.
|
||||
|
||||
```bash
|
||||
pip install google-cloud-documentai
|
||||
pip install google-cloud-documentai-toolbox
|
||||
```
|
||||
|
||||
|
||||
See a [usage example](/docs/integrations/document_transformers/docai).
|
||||
|
||||
```python
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers import DocAIParser
|
||||
```
|
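
A hedged sketch of wiring these together - the location, processor name, and GCS paths are placeholders in the formats described above:

```python
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers import DocAIParser

parser = DocAIParser(
    location="us",  # placeholder processor location
    processor_name="projects/PROJECT_NUMBER/locations/LOCATION/processors/PROCESSOR_ID",
    gcs_output_path="gs://BUCKET_NAME/FOLDER_PATH",
)
# Parse a PDF already stored on GCS into Documents.
blob = Blob(path="gs://BUCKET_NAME/invoice.pdf")
docs = list(parser.lazy_parse(blob))
```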
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
# AWS DynamoDB
|
||||
|
||||
>[AWS DynamoDB](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/dynamodb/index.html)
|
||||
> is a fully managed `NoSQL` database service that provides fast and predictable performance with seamless scalability.
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
We have to configur the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html).
|
||||
|
||||
We need to install the `boto3` library.
|
||||
|
||||
```bash
|
||||
pip install boto3
|
||||
```
|
||||
|
||||
|
||||
## Memory
|
||||
|
||||
See a [usage example](/docs/integrations/memory/aws_dynamodb).
|
||||
|
||||
```python
|
||||
from langchain.memory import DynamoDBChatMessageHistory
|
||||
```
|
||||
@@ -1,28 +0,0 @@
|
||||
# Google Document AI
|
||||
|
||||
>[Document AI](https://cloud.google.com/document-ai/docs/overview) is a `Google Cloud Platform`
|
||||
> service to transform unstructured data from documents into structured data, making it easier
|
||||
> to understand, analyze, and consume.
|
||||
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
You need to set up a [`GCS` bucket and create your own OCR processor](https://cloud.google.com/document-ai/docs/create-processor)
|
||||
The `GCS_OUTPUT_PATH` should be a path to a folder on GCS (starting with `gs://`)
|
||||
and a processor name should look like `projects/PROJECT_NUMBER/locations/LOCATION/processors/PROCESSOR_ID`.
|
||||
You can get it either programmatically or copy from the `Prediction endpoint` section of the `Processor details`
|
||||
tab in the Google Cloud Console.
|
||||
|
||||
```bash
|
||||
pip install google-cloud-documentai
|
||||
pip install google-cloud-documentai-toolbox
|
||||
```
|
||||
|
||||
## Document Transformer
|
||||
|
||||
See a [usage example](/docs/integrations/document_transformers/docai).
|
||||
|
||||
```python
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers import DocAIParser
|
||||
```
|
||||
@@ -1,29 +0,0 @@
|
||||
# ScaNN
|
||||
|
||||
>[Google ScaNN](https://github.com/google-research/google-research/tree/master/scann)
|
||||
> (Scalable Nearest Neighbors) is a python package.
|
||||
>
|
||||
>`ScaNN` is a method for efficient vector similarity search at scale.
|
||||
|
||||
>ScaNN includes search space pruning and quantization for Maximum Inner
|
||||
> Product Search and also supports other distance functions such as
|
||||
> Euclidean distance. The implementation is optimized for x86 processors
|
||||
> with AVX2 support. See its [Google Research github](https://github.com/google-research/google-research/tree/master/scann)
|
||||
> for more details.
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
We need to install `scann` python package.
|
||||
|
||||
```bash
|
||||
pip install scann
|
||||
```
|
||||
|
||||
## Vector Store
|
||||
|
||||
See a [usage example](/docs/integrations/vectorstores/scann).
|
||||
|
||||
```python
|
||||
from langchain.vectorstores import ScaNN
|
||||
```
|
||||
|
||||
@@ -1,272 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Google Cloud Enterprise Search\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"[Enterprise Search](https://cloud.google.com/enterprise-search) is a part of the Generative AI App Builder suite of tools offered by Google Cloud.\n",
|
||||
"\n",
|
||||
"Gen AI App Builder lets developers, even those with limited machine learning skills, quickly and easily tap into the power of Google’s foundation models, search expertise, and conversational AI technologies to create enterprise-grade generative AI applications. \n",
|
||||
"\n",
|
||||
"Enterprise Search lets organizations quickly build generative AI powered search engines for customers and employees.Enterprise Search is underpinned by a variety of Google Search technologies, including semantic search, which helps deliver more relevant results than traditional keyword-based search techniques by using natural language processing and machine learning techniques to infer relationships within the content and intent from the user’s query input. Enterprise Search also benefits from Google’s expertise in understanding how users search and factors in content relevance to order displayed results. \n",
|
||||
"\n",
|
||||
"Google Cloud offers Enterprise Search via Gen App Builder in Google Cloud Console and via an API for enterprise workflow integration. \n",
|
||||
"\n",
|
||||
"This notebook demonstrates how to configure Enterprise Search and use the Enterprise Search retriever. The Enterprise Search retriever encapsulates the [Generative AI App Builder Python client library](https://cloud.google.com/generative-ai-app-builder/docs/libraries#client-libraries-install-python) and uses it to access the Enterprise Search [Search Service API](https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1beta.services.search_service)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Install pre-requisites\n",
|
||||
"\n",
|
||||
"You need to install the `google-cloud-discoverengine` package to use the Enterprise Search retriever."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install google-cloud-discoveryengine"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configure access to Google Cloud and Google Cloud Enterprise Search\n",
|
||||
"\n",
|
||||
"Enterprise Search is generally available for the allowlist (which means customers need to be approved for access) as of June 6, 2023. Contact your Google Cloud sales team for access and pricing details. We are previewing additional features that are coming soon to the generally available offering as part of our [Trusted Tester](https://cloud.google.com/ai/earlyaccess/join?hl=en) program. Sign up for [Trusted Tester](https://cloud.google.com/ai/earlyaccess/join?hl=en) and contact your Google Cloud sales team for an expedited trial.\n",
|
||||
"\n",
|
||||
"Before you can run this notebook you need to:\n",
|
||||
"- Set or create a Google Cloud project and turn on Gen App Builder\n",
|
||||
"- Create and populate an unstructured data store\n",
|
||||
"- Set credentials to access `Enterprise Search API`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Set or create a Google Cloud poject and turn on Gen App Builder\n",
|
||||
"\n",
|
||||
"Follow the instructions in the [Enterprise Search Getting Started guide](https://cloud.google.com/generative-ai-app-builder/docs/before-you-begin) to set/create a GCP project and enable Gen App Builder.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create and populate an unstructured data store\n",
|
||||
"\n",
|
||||
"[Use Google Cloud Console to create an unstructured data store](https://cloud.google.com/generative-ai-app-builder/docs/create-engine-es#unstructured-data) and populate it with the example PDF documents from the `gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs` Cloud Storage folder. Make sure to use the `Cloud Storage (without metadata)` option."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Set credentials to access Enterprise Search API\n",
|
||||
"\n",
|
||||
"The [Gen App Builder client libraries](https://cloud.google.com/generative-ai-app-builder/docs/libraries) used by the Enterprise Search retriever provide high-level language support for authenticating to Gen App Builder programmatically. Client libraries support [Application Default Credentials (ADC)](https://cloud.google.com/docs/authentication/application-default-credentials); the libraries look for credentials in a set of defined locations and use those credentials to authenticate requests to the API. With ADC, you can make credentials available to your application in a variety of environments, such as local development or production, without needing to modify your application code.\n",
|
||||
"\n",
|
||||
"If running in [Google Colab](https://colab.google) authenticate with `google.colab.google.auth` otherwise follow one of the [supported methods](https://cloud.google.com/docs/authentication/application-default-credentials) to make sure that you Application Default Credentials are properly set."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"if \"google.colab\" in sys.modules:\n",
|
||||
" from google.colab import auth as google_auth\n",
|
||||
"\n",
|
||||
" google_auth.authenticate_user()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configure and use the Enterprise Search retriever\n",
|
||||
"\n",
|
||||
"The Enterprise Search retriever is implemented in the `langchain.retriever.GoogleCloudEntepriseSearchRetriever` class. The `get_relevant_documents` method returns a list of `langchain.schema.Document` documents where the `page_content` field of each document is populated the document content.\n",
|
||||
"Depending on the data type used in Enterprise search (structured or unstructured) the `page_content` field is populated as follows:\n",
|
||||
"- Structured data source: either an `extractive segment` or an `extractive answer` that matches a query. The `metadata` field is populated with metadata (if any) of the document from which the segments or answers were extracted.\n",
|
||||
"- Unstructured data source: a string json containing all the fields returned from the structured data source. The `metadata` field is populated with metadata (if any) of the document \n",
|
||||
"\n",
|
||||
"### Only for Unstructured data sources:\n",
|
||||
"An extractive answer is verbatim text that is returned with each search result. It is extracted directly from the original document. Extractive answers are typically displayed near the top of web pages to provide an end user with a brief answer that is contextually relevant to their query. Extractive answers are available for website and unstructured search.\n",
|
||||
"\n",
|
||||
"An extractive segment is verbatim text that is returned with each search result. An extractive segment is usually more verbose than an extractive answer. Extractive segments can be displayed as an answer to a query, and can be used to perform post-processing tasks and as input for large language models to generate answers or new text. Extractive segments are available for unstructured search.\n",
|
||||
"\n",
|
||||
"For more information about extractive segments and extractive answers refer to [product documentation](https://cloud.google.com/generative-ai-app-builder/docs/snippets).\n",
|
||||
"\n",
|
||||
"When creating an instance of the retriever you can specify a number of parameters that control which Enterprise data store to access and how a natural language query is processed, including configurations for extractive answers and segments.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### The mandatory parameters are:\n",
|
||||
"\n",
|
||||
"- `project_id` - Your Google Cloud PROJECT_ID\n",
|
||||
"- `search_engine_id` - The ID of the data store you want to use. \n",
|
||||
"\n",
|
||||
"The `project_id` and `search_engine_id` parameters can be provided explicitly in the retriever's constructor or through the environment variables - `PROJECT_ID` and `SEARCH_ENGINE_ID`.\n",
|
||||
"\n",
|
||||
"You can also configure a number of optional parameters, including:\n",
|
||||
"\n",
|
||||
"- `max_documents` - The maximum number of documents used to provide extractive segments or extractive answers\n",
|
||||
"- `get_extractive_answers` - By default, the retriever is configured to return extractive segments. Set this field to `True` to return extractive answers. This is used only when `engine_data_type` set to 0 (unstructured) \n",
|
||||
"- `max_extractive_answer_count` - The maximum number of extractive answers returned in each search result.\n",
|
||||
" At most 5 answers will be returned. This is used only when `engine_data_type` set to 0 (unstructured) \n",
|
||||
"- `max_extractive_segment_count` - The maximum number of extractive segments returned in each search result.\n",
|
||||
" Currently one segment will be returned. This is used only when `engine_data_type` set to 0 (unstructured) \n",
|
||||
"- `filter` - The filter expression that allows you filter the search results based on the metadata associated with the documents in the searched data store. \n",
|
||||
"- `query_expansion_condition` - Specification to determine under which conditions query expansion should occur.\n",
|
||||
" 0 - Unspecified query expansion condition. In this case, server behavior defaults to disabled.\n",
|
||||
" 1 - Disabled query expansion. Only the exact search query is used, even if SearchResponse.total_size is zero.\n",
|
||||
" 2 - Automatic query expansion built by the Search API.\n",
|
||||
"- `engine_data_type` - Defines the enterprise search data type\n",
|
||||
" 0 - Unstructured data \n",
|
||||
" 1 - Structured data\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure and use the retriever for **unstructured** data with extractve segments "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.retrievers import GoogleCloudEnterpriseSearchRetriever\n",
|
||||
"\n",
|
||||
"PROJECT_ID = \"<YOUR PROJECT ID>\" # Set to your Project ID\n",
|
||||
"SEARCH_ENGINE_ID = \"<YOUR SEARCH ENGINE ID>\" # Set to your data store ID"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"retriever = GoogleCloudEnterpriseSearchRetriever(\n",
|
||||
" project_id=PROJECT_ID,\n",
|
||||
" search_engine_id=SEARCH_ENGINE_ID,\n",
|
||||
" max_documents=3,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What are Alphabet's Other Bets?\"\n",
|
||||
"\n",
|
||||
"result = retriever.get_relevant_documents(query)\n",
|
||||
"for doc in result:\n",
|
||||
" print(doc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure and use the retriever for **unstructured** data with extractve answers "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"retriever = GoogleCloudEnterpriseSearchRetriever(\n",
|
||||
" project_id=PROJECT_ID,\n",
|
||||
" search_engine_id=SEARCH_ENGINE_ID,\n",
|
||||
" max_documents=3,\n",
|
||||
" max_extractive_answer_count=3,\n",
|
||||
" get_extractive_answers=True,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What are Alphabet's Other Bets?\"\n",
|
||||
"\n",
|
||||
"result = retriever.get_relevant_documents(query)\n",
|
||||
"for doc in result:\n",
|
||||
" print(doc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure and use the retriever for **structured** data with extractve answers "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"retriever = GoogleCloudEnterpriseSearchRetriever(\n",
|
||||
" project_id=PROJECT_ID,\n",
|
||||
" search_engine_id=SEARCH_ENGINE_ID,\n",
|
||||
" max_documents=3,\n",
|
||||
" engine_data_type=1\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"result = retriever.get_relevant_documents(query)\n",
|
||||
"for doc in result:\n",
|
||||
" print(doc)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.10"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,274 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Google Vertex AI Search\n",
|
||||
"\n",
|
||||
"[Vertex AI Search](https://cloud.google.com/enterprise-search) (formerly known as Enterprise Search on Generative AI App Builder) is a part of the [Vertex AI](https://cloud.google.com/vertex-ai) machine learning platform offered by Google Cloud.\n",
|
||||
"\n",
|
||||
"Vertex AI Search lets organizations quickly build generative AI powered search engines for customers and employees. It's underpinned by a variety of Google Search technologies, including semantic search, which helps deliver more relevant results than traditional keyword-based search techniques by using natural language processing and machine learning techniques to infer relationships within the content and intent from the user’s query input. Vertex AI Search also benefits from Google’s expertise in understanding how users search and factors in content relevance to order displayed results.\n",
|
||||
"\n",
|
||||
"Vertex AI Search is available in the Google Cloud Console and via an API for enterprise workflow integration.\n",
|
||||
"\n",
|
||||
"This notebook demonstrates how to configure Vertex AI Search and use the Vertex AI Search retriever. The Vertex AI Search retriever encapsulates the [Python client library](https://cloud.google.com/generative-ai-app-builder/docs/libraries#client-libraries-install-python) and uses it to access the [Search Service API](https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1beta.services.search_service).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Install pre-requisites\n",
|
||||
"\n",
|
||||
"You need to install the `google-cloud-discoveryengine` package to use the Vertex AI Search retriever.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install google-cloud-discoveryengine"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configure access to Google Cloud and Vertex AI Search\n",
|
||||
"\n",
|
||||
"Vertex AI Search is generally available without allowlist as of August 2023.\n",
|
||||
"\n",
|
||||
"Before you can use the retriever, you need to complete the following steps:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a search engine and populate an unstructured data store\n",
|
||||
"\n",
|
||||
"- Follow the instructions in the [Vertex AI Search Getting Started guide](https://cloud.google.com/generative-ai-app-builder/docs/try-enterprise-search) to set up a Google Cloud project and Vertex AI Search.\n",
|
||||
"- [Use the Google Cloud Console to create an unstructured data store](https://cloud.google.com/generative-ai-app-builder/docs/create-engine-es#unstructured-data)\n",
|
||||
" - Populate it with the example PDF documents from the `gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs` Cloud Storage folder.\n",
|
||||
" - Make sure to use the `Cloud Storage (without metadata)` option.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Set credentials to access Vertex AI Search API\n",
|
||||
"\n",
|
||||
"The [Vertex AI Search client libraries](https://cloud.google.com/generative-ai-app-builder/docs/libraries) used by the Vertex AI Search retriever provide high-level language support for authenticating to Google Cloud programmatically.\n",
|
||||
"Client libraries support [Application Default Credentials (ADC)](https://cloud.google.com/docs/authentication/application-default-credentials); the libraries look for credentials in a set of defined locations and use those credentials to authenticate requests to the API.\n",
|
||||
"With ADC, you can make credentials available to your application in a variety of environments, such as local development or production, without needing to modify your application code.\n",
|
||||
"\n",
|
||||
"If running in [Google Colab](https://colab.google) authenticate with `google.colab.google.auth` otherwise follow one of the [supported methods](https://cloud.google.com/docs/authentication/application-default-credentials) to make sure that you Application Default Credentials are properly set.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"if \"google.colab\" in sys.modules:\n",
|
||||
" from google.colab import auth as google_auth\n",
|
||||
"\n",
|
||||
" google_auth.authenticate_user()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configure and use the Vertex AI Search retriever\n",
|
||||
"\n",
|
||||
"The Vertex AI Search retriever is implemented in the `langchain.retriever.GoogleVertexAISearchRetriever` class. The `get_relevant_documents` method returns a list of `langchain.schema.Document` documents where the `page_content` field of each document is populated the document content.\n",
|
||||
"Depending on the data type used in Vertex AI Search (structured or unstructured) the `page_content` field is populated as follows:\n",
|
||||
"\n",
|
||||
"- Structured data source: either an `extractive segment` or an `extractive answer` that matches a query. The `metadata` field is populated with metadata (if any) of the document from which the segments or answers were extracted.\n",
|
||||
"- Unstructured data source: a string json containing all the fields returned from the structured data source. The `metadata` field is populated with metadata (if any) of the document\n",
|
||||
"\n",
|
||||
"### Only for Unstructured data sources:\n",
|
||||
"\n",
|
||||
"An extractive answer is verbatim text that is returned with each search result. It is extracted directly from the original document. Extractive answers are typically displayed near the top of web pages to provide an end user with a brief answer that is contextually relevant to their query. Extractive answers are available for website and unstructured search.\n",
|
||||
"\n",
|
||||
"An extractive segment is verbatim text that is returned with each search result. An extractive segment is usually more verbose than an extractive answer. Extractive segments can be displayed as an answer to a query, and can be used to perform post-processing tasks and as input for large language models to generate answers or new text. Extractive segments are available for unstructured search.\n",
|
||||
"\n",
|
||||
"For more information about extractive segments and extractive answers refer to [product documentation](https://cloud.google.com/generative-ai-app-builder/docs/snippets).\n",
|
||||
"\n",
|
||||
"NOTE: Extractive segments require the [Enterprise edition](https://cloud.google.com/generative-ai-app-builder/docs/about-advanced-features#enterprise-features) features to be enabled.\n",
|
||||
"\n",
|
||||
"When creating an instance of the retriever you can specify a number of parameters that control which data store to access and how a natural language query is processed, including configurations for extractive answers and segments.\n",
|
||||
"\n",
|
||||
"### The mandatory parameters are:\n",
|
||||
"\n",
|
||||
"- `project_id` - Your Google Cloud Project ID.\n",
|
||||
"- `location_id` - The location of the data store.\n",
|
||||
" - `global` (default)\n",
|
||||
" - `us`\n",
|
||||
" - `eu`\n",
|
||||
"- `data_store_id` - The ID of the data store you want to use.\n",
|
||||
" - Note: This was called `search_engine_id` in previous versions of the retriever.\n",
|
||||
"\n",
|
||||
"The `project_id` and `data_store_id` parameters can be provided explicitly in the retriever's constructor or through the environment variables - `PROJECT_ID` and `DATA_STORE_ID`.\n",
|
||||
"\n",
|
||||
"You can also configure a number of optional parameters, including:\n",
|
||||
"\n",
|
||||
"- `max_documents` - The maximum number of documents used to provide extractive segments or extractive answers\n",
|
||||
"- `get_extractive_answers` - By default, the retriever is configured to return extractive segments.\n",
|
||||
" - Set this field to `True` to return extractive answers. This is used only when `engine_data_type` set to `0` (unstructured)\n",
|
||||
"- `max_extractive_answer_count` - The maximum number of extractive answers returned in each search result.\n",
|
||||
" - At most 5 answers will be returned. This is used only when `engine_data_type` set to `0` (unstructured).\n",
|
||||
"- `max_extractive_segment_count` - The maximum number of extractive segments returned in each search result.\n",
|
||||
" - Currently one segment will be returned. This is used only when `engine_data_type` set to `0` (unstructured).\n",
|
||||
"- `filter` - The filter expression for the search results based on the metadata associated with the documents in the data store.\n",
|
||||
"- `query_expansion_condition` - Specification to determine under which conditions query expansion should occur.\n",
|
||||
" - `0` - Unspecified query expansion condition. In this case, server behavior defaults to disabled.\n",
|
||||
" - `1` - Disabled query expansion. Only the exact search query is used, even if SearchResponse.total_size is zero.\n",
|
||||
" - `2` - Automatic query expansion built by the Search API.\n",
|
||||
"- `engine_data_type` - Defines the Vertex AI Search data type\n",
|
||||
" - `0` - Unstructured data\n",
|
||||
" - `1` - Structured data\n",
|
||||
"\n",
|
||||
"### Migration guide for `GoogleCloudEnterpriseSearchRetriever`\n",
|
||||
"\n",
|
||||
"In previous versions, this retriever was called `GoogleCloudEnterpriseSearchRetriever`. Some backwards-incompatible changes had to be made to the retriever after the General Availability launch due to changes in the product behavior.\n",
|
||||
"\n",
|
||||
"To update to the new retriever, make the following changes:\n",
|
||||
"\n",
|
||||
"- Change the import from: `from langchain.retrievers import GoogleCloudEnterpriseSearchRetriever` -> `from langchain.retrievers import GoogleVertexAISearchRetriever`.\n",
|
||||
"- Change all class references from `GoogleCloudEnterpriseSearchRetriever` -> `GoogleVertexAISearchRetriever`.\n",
|
||||
"- Upon class initialization, change the `search_engine_id` parameter name to `data_store_id`.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure and use the retriever for **unstructured** data with extractive segments\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.retrievers import GoogleVertexAISearchRetriever\n",
|
||||
"\n",
|
||||
"PROJECT_ID = \"<YOUR PROJECT ID>\" # Set to your Project ID\n",
|
||||
"LOCATION_ID = \"<YOUR LOCATION>\" # Set to your data store location\n",
|
||||
"DATA_STORE_ID = \"<YOUR DATA STORE ID>\" # Set to your data store ID"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"retriever = GoogleVertexAISearchRetriever(\n",
|
||||
" project_id=PROJECT_ID,\n",
|
||||
" location_id=LOCATION_ID,\n",
|
||||
" data_store_id=DATA_STORE_ID,\n",
|
||||
" max_documents=3,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What are Alphabet's Other Bets?\"\n",
|
||||
"\n",
|
||||
"result = retriever.get_relevant_documents(query)\n",
|
||||
"for doc in result:\n",
|
||||
" print(doc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure and use the retriever for **unstructured** data with extractive answers\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"retriever = GoogleVertexAISearchRetriever(\n",
|
||||
" project_id=PROJECT_ID,\n",
|
||||
" location_id=LOCATION_ID,\n",
|
||||
" data_store_id=DATA_STORE_ID,\n",
|
||||
" max_documents=3,\n",
|
||||
" max_extractive_answer_count=3,\n",
|
||||
" get_extractive_answers=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"result = retriever.get_relevant_documents(query)\n",
|
||||
"for doc in result:\n",
|
||||
" print(doc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure and use the retriever for **structured** data\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"retriever = GoogleVertexAISearchRetriever(\n",
|
||||
" project_id=PROJECT_ID,\n",
|
||||
" location_id=LOCATION_ID,\n",
|
||||
" data_store_id=DATA_STORE_ID,\n",
|
||||
" max_documents=3,\n",
|
||||
" engine_data_type=1,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"result = retriever.get_relevant_documents(query)\n",
|
||||
"for doc in result:\n",
|
||||
" print(doc)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.10"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
62
docs/extras/integrations/retrievers/you-retriever.ipynb
Normal file
@@ -0,0 +1,62 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "47828a7a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using the You.com Retriever\n",
|
||||
"The retriever from You.com is good for retrieving lots of text. We return multiple of the best text snippets per URL we find to be relevant.\n",
|
||||
"\n",
|
||||
"First you just need to initialize the retriever"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a90d61d4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.retrievers.you_retriever import YouRetriever\n",
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"\n",
|
||||
"yr = YouRetriever()\n",
|
||||
"qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type=\"map_reduce\", retriever=yr)\n"
|
||||
]
|
||||
},
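Before wiring the retriever into a chain, it can help to sanity-check it on its own. A minimal sketch (the query is just an example; the retriever is assumed to pick up your You.com API key from the environment):

```python
docs = yr.get_relevant_documents("What is the capital of France?")
for doc in docs:
    print(doc.page_content[:200])  # preview each snippet
```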
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4a223f2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"what starting ohio state quarterback most recently went their entire college career without beating Michigan?\"\n",
|
||||
"qa.run(query)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.17"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
855
docs/extras/integrations/toolkits/clickup.ipynb
Normal file
@@ -0,0 +1,855 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# ClickUp Langchiain Toolkit"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%reload_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"from langchain.agents.agent_toolkits.clickup.toolkit import ClickupToolkit\n",
|
||||
"\n",
|
||||
"from langchain.agents import AgentType, initialize_agent\n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"from langchain.utilities.clickup import ClickupAPIWrapper\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Init"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Get Authenticated\n",
|
||||
"1. Create a [ClickUp App](https://help.clickup.com/hc/en-us/articles/6303422883095-Create-your-own-app-with-the-ClickUp-API)\n",
|
||||
"2. Follow [these steps](https://clickup.com/api/developer-portal/authentication/) to get your `client_id` and `client_secret`.\n",
|
||||
" - *Suggestion: use `https://google.com` as the redirect_uri. This is what we assume in the defaults for this toolkit.*\n",
|
||||
"3. Copy/paste them and run the next cell to get your `code`\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Click this link, select your workspace, click `Connect Workspace`\n",
|
||||
"https://app.clickup.com/api?client_id=ABC...&redirect_uri=https://google.com\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Copilot Sandbox\n",
|
||||
"oauth_client_id = \"ABC...\"\n",
|
||||
"oauth_client_secret = \"123...\"\n",
|
||||
"redirect_uri = \"https://google.com\"\n",
|
||||
"\n",
|
||||
"print('Click this link, select your workspace, click `Connect Workspace`')\n",
|
||||
"print(ClickupAPIWrapper.get_access_code_url(oauth_client_id, redirect_uri))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The url should change to something like this https://www.google.com/?code=THISISMYCODERIGHTHERE.\n",
|
||||
"\n",
|
||||
"Next, copy/paste the `CODE` (THISISMYCODERIGHTHERE) generated in the URL in the cell below.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"code = \"THISISMYCODERIGHTHERE\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Get Access Token\n",
|
||||
"Then, use the code below to get your `access_token`.\n",
|
||||
"\n",
|
||||
"*Important*: Each code is a one time code that will expire after use. The `access_token` can be used for a period of time. Make sure to copy paste the `access_token` once you get it!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error: {'err': 'Code already used', 'ECODE': 'OAUTH_014'}\n",
|
||||
"You already used this code once. Go back a step and generate a new code.\n",
|
||||
"Our best guess for the url to get a new code is:\n",
|
||||
"https://app.clickup.com/api?client_id=B5D61F8EVO04PR0JX0U73984LLS9GI6P&redirect_uri=https://google.com\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"access_token = ClickupAPIWrapper.get_access_token(oauth_client_id, oauth_client_secret, code)\n",
|
||||
"\n",
|
||||
"if access_token is not None:\n",
|
||||
" print('Copy/paste this code, into the next cell so you can reuse it!')\n",
|
||||
" print(access_token)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create Toolkit"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found team_id: 9011010153.\n",
|
||||
"Most request require the team id, so we store it for you in the toolkit, we assume the first team in your list is the one you want. \n",
|
||||
"Note: If you know this is the wrong ID, you can pass it at initialization.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Set your access token here\n",
|
||||
"access_token = '12345678_myaccesstokengoeshere123'\n",
|
||||
"access_token = '81928627_c009bf122ccf36ec3ba3e0ef748b07042c5e4217260042004a5934540cb61527'\n",
|
||||
"\n",
|
||||
"# Init toolkit\n",
|
||||
"clickup_api_wrapper = ClickupAPIWrapper(access_token=access_token)\n",
|
||||
"toolkit = ClickupToolkit.from_clickup_api_wrapper(clickup_api_wrapper)\n",
|
||||
"print(f'Found team_id: {clickup_api_wrapper.team_id}.\\nMost request require the team id, so we store it for you in the toolkit, we assume the first team in your list is the one you want. \\nNote: If you know this is the wrong ID, you can pass it at initialization.')"
|
||||
]
|
||||
},
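To see exactly what the agent will have access to, you can enumerate the tools the toolkit exposes; a small sketch using the same `get_tools()` accessor the agent setup below relies on:

```python
for tool in toolkit.get_tools():
    print(f"{tool.name}: {tool.description}")
```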
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create Agent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = OpenAI(temperature=0, openai_api_key=\"\")\n",
|
||||
"\n",
|
||||
"agent = initialize_agent(\n",
|
||||
" toolkit.get_tools(), llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# helper function for demo\n",
|
||||
"def print_and_run(command):\n",
|
||||
" print('\\033[94m$ COMMAND\\033[0m')\n",
|
||||
" print(command)\n",
|
||||
" print('\\n\\033[94m$ AGENT\\033[0m')\n",
|
||||
" response = agent.run(command)\n",
|
||||
" print(''.join(['-']*80))\n",
|
||||
" return response\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Navigation\n",
|
||||
"You can get the teams, folder and spaces your user has access to"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Get all the teams that the user is authorized to access\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to use the Get Teams tool\n",
|
||||
"Action: Get Teams\n",
|
||||
"Action Input: No necessary request parameters\u001b[0m\n",
|
||||
"Observation: \u001b[38;5;200m\u001b[1;3m{'teams': [{'id': '9011010153', 'name': 'Task Copilot Sandbox Workspace 1', 'members': [{'id': 61681706, 'username': 'Aiswarya ', 'email': 'asankar@clickup.com', 'initials': 'A'}, {'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'email': 'rlentini@clickup.com', 'initials': 'RL'}]}]}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the teams the user is authorized to access\n",
|
||||
"Final Answer: The user is authorized to access the team 'Task Copilot Sandbox Workspace 1'.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Get all the spaces available to the team\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to use the API to get the spaces\n",
|
||||
"Action: Get Teams\n",
|
||||
"Action Input: No necessary request parameters\u001b[0m\n",
|
||||
"Observation: \u001b[38;5;200m\u001b[1;3m{'teams': [{'id': '9011010153', 'name': 'Task Copilot Sandbox Workspace 1', 'members': [{'id': 61681706, 'username': 'Aiswarya ', 'email': 'asankar@clickup.com', 'initials': 'A'}, {'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'email': 'rlentini@clickup.com', 'initials': 'RL'}]}]}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now have the list of teams\n",
|
||||
"Final Answer: The list of teams available to the team is [{'id': '9011010153', 'name': 'Task Copilot Sandbox Workspace 1', 'members': [{'id': 61681706, 'username': 'Aiswarya ', 'email': 'asankar@clickup.com', 'initials': 'A'}, {'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'email': 'rlentini@clickup.com', 'initials': 'RL'}]}]\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Get all the folders for the team\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to get all the folders for the team\n",
|
||||
"Action: Get all folders in the workspace\n",
|
||||
"Action Input: {\"folder_id\": \"90130119692\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m{'spaces': [{'id': '90110075934', 'name': 'Test Space', 'color': None, 'private': False, 'avatar': None, 'admin_can_manage': False, 'statuses': [{'id': 'p90110075934_lBKIEh3r', 'status': 'Open', 'type': 'open', 'orderindex': 0, 'color': '#d3d3d3'}, {'id': 'p90110075934_AvVAnVqy', 'status': 'in progress', 'type': 'custom', 'orderindex': 1, 'color': '#a875ff'}, {'id': 'p90110075934_SftYWzGt', 'status': 'Closed', 'type': 'closed', 'orderindex': 2, 'color': '#6bc950'}], 'multiple_assignees': True, 'features': {'due_dates': {'enabled': True, 'start_date': True, 'remap_due_dates': False, 'remap_closed_due_date': False}, 'sprints': {'enabled': False}, 'time_tracking': {'enabled': True, 'harvest': False, 'rollup': False}, 'points': {'enabled': False}, 'custom_items': {'enabled': False}, 'priorities': {'enabled': True, 'priorities': [{'color': '#f50000', 'id': '1', 'orderindex': '1', 'priority': 'urgent'}, {'color': '#ffcc00', 'id': '2', 'orderindex': '2', 'priority': 'high'}, {'color': '#6fddff', 'id': '3', 'orderindex': '3', 'priority': 'normal'}, {'color': '#d8d8d8', 'id': '4', 'orderindex': '4', 'priority': 'low'}]}, 'tags': {'enabled': True}, 'check_unresolved': {'enabled': True, 'subtasks': None, 'checklists': None, 'comments': None}, 'zoom': {'enabled': False}, 'milestones': {'enabled': False}, 'custom_fields': {'enabled': True}, 'dependency_warning': {'enabled': True}, 'status_pies': {'enabled': False}, 'multiple_assignees': {'enabled': True}}, 'archived': False}]}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the folders in the team\n",
|
||||
"Final Answer: The folders in the team are listed in the observation.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The folders in the team are listed in the observation.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print_and_run(\"Get all the teams that the user is authorized to access\")\n",
|
||||
"print_and_run(\"Get all the spaces available to the team\")\n",
|
||||
"print_and_run(\"Get all the folders for the team\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Task Operations\n",
|
||||
"You can get, ask question about tasks and update them"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"task_id = '8685mb5fn'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Basic attirbute getting and updating"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Get task with id 8685mb5fn\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to use the Get task tool\n",
|
||||
"Action: Get task\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\"}\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m{'id': '8685mb5fn', 'name': 'dummy task 1', 'text_content': 'An old, boring task description', 'description': 'An old, boring task description', 'status': 'to do', 'creator_id': 81928627, 'creator_username': 'Rodrigo Ceballos Lentini', 'creator_email': 'rlentini@clickup.com', 'assignees': [], 'watcher_username': 'Rodrigo Ceballos Lentini', 'watcher_email': 'rlentini@clickup.com', 'priority': 'high', 'due_date': '1694764800000', 'start_date': None, 'points': None, 'team_id': '9011010153', 'project_id': '90110331875'}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the task details\n",
|
||||
"Final Answer: The task with id 8685mb5fn has the following details: {'id': '8685mb5fn', 'name': 'dummy task 1', 'text_content': 'An old, boring task description', 'description': 'An old, boring task description', 'status': 'to do', 'creator_id': 81928627, 'creator_username': 'Rodrigo Ceballos Lentini', 'creator_email': 'rlentini@clickup.com', 'assignees': [], 'watcher_username': 'Rodrigo Ceballos Lentini', 'watcher_email': 'rlentini@clickup.com', 'priority': 'high', 'due_date': '1694764800000', 'start_date': None, 'points': None, 'team_id': '9011010153', 'project_id': '90110331875'}\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"What is the description of task with id 8685mb5fn\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to get the description of the task\n",
|
||||
"Action: Get task attribute\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"attribute_name\": \"description\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3mAn old, boring task description\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the description of the task\n",
|
||||
"Final Answer: An old, boring task description\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"For task with id 8685mb5fn, change the description to 'A cool task descriptiont changed by AI!'\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to update the description of a task\n",
|
||||
"Action: Update task\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"attribute_name\": \"description\", \"value\": \"A cool task description changed by AI!\"}\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m<Response [200]>\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I have successfully updated the task description\n",
|
||||
"Final Answer: The description of task 8685mb5fn has been successfully changed to 'A cool task description changed by AI!'\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"What is the description of task with id 8685mb5fn\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to get the description of the task\n",
|
||||
"Action: Get task attribute\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"attribute_name\": \"description\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3mA cool task description changed by AI!\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the description of the task\n",
|
||||
"Final Answer: A cool task description changed by AI!\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"For task with id 8685mb5fn, change the description to 'An old, boring task description'\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to update the description of a task\n",
|
||||
"Action: Update task\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"attribute_name\": \"description\", \"value\": \"An old, boring task description\"}\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m<Response [200]>\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the task description has been updated\n",
|
||||
"Final Answer: The description of task 8685mb5fn has been updated to 'An old, boring task description'.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"The description of task 8685mb5fn has been updated to 'An old, boring task description'.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# We can get a task to inspect it's contents\n",
|
||||
"print_and_run(f\"Get task with id {task_id}\")\n",
|
||||
"\n",
|
||||
"# We can get a specific attribute from a task\n",
|
||||
"previous_description = print_and_run(f\"What is the description of task with id {task_id}\")\n",
|
||||
"\n",
|
||||
"# We can even update it!\n",
|
||||
"print_and_run(f\"For task with id {task_id}, change the description to 'A cool task descriptiont changed by AI!'\")\n",
|
||||
"print_and_run(f\"What is the description of task with id {task_id}\")\n",
|
||||
"\n",
|
||||
"# Undo what we did\n",
|
||||
"print_and_run(f\"For task with id {task_id}, change the description to '{previous_description}'\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Change the descrition task 8685mj6cd to 'Look ma no hands'\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to update the description of a task\n",
|
||||
"Action: Update task\n",
|
||||
"Action Input: {\"task_id\": \"8685mj6cd\", \"attribute_name\": \"description\", \"value\": \"Look ma no hands\"}\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m<Response [200]>\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m The task description has been successfully updated\n",
|
||||
"Final Answer: The description of task 8685mj6cd has been changed to 'Look ma no hands'.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"The description of task 8685mj6cd has been changed to 'Look ma no hands'.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print_and_run(\"Change the descrition task 8685mj6cd to 'Look ma no hands'\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Advanced Attributes (Assignees)\n",
|
||||
"You can query and update almost every thing about a task!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"user_id = 81928627"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"What are the assignees of task id 8685mb5fn?\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to get the assignees of a task\n",
|
||||
"Action: Get task attribute\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"attribute_name\": \"assignee\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3mError: attribute_name = assignee was not found in task keys dict_keys(['id', 'name', 'text_content', 'description', 'status', 'creator_id', 'creator_username', 'creator_email', 'assignees', 'watcher_username', 'watcher_email', 'priority', 'due_date', 'start_date', 'points', 'team_id', 'project_id']). Please call again with one of the key names.\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I need to get the assignees of a task\n",
|
||||
"Action: Get task attribute\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"attribute_name\": \"assignees\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m[]\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n",
|
||||
"Final Answer: There are no assignees for task id 8685mb5fn.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Remove user 81928627 from the assignees of task id 8685mb5fn\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to update the assignees of a task\n",
|
||||
"Action: Update task assignees\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"operation\": \"rem\", \"users\": [81928627]}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m<Response [200]>\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m The user has been removed from the assignees of the task\n",
|
||||
"Final Answer: User 81928627 has been removed from the assignees of task id 8685mb5fn.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"What are the assignees of task id 8685mb5fn?\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to get the assignees of a task\n",
|
||||
"Action: Get task attribute\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"attribute_name\": \"assignee\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3mError: attribute_name = assignee was not found in task keys dict_keys(['id', 'name', 'text_content', 'description', 'status', 'creator_id', 'creator_username', 'creator_email', 'assignees', 'watcher_username', 'watcher_email', 'priority', 'due_date', 'start_date', 'points', 'team_id', 'project_id']). Please call again with one of the key names.\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I need to get the assignees of a task\n",
|
||||
"Action: Get task attribute\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"attribute_name\": \"assignees\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m[]\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n",
|
||||
"Final Answer: There are no assignees for task id 8685mb5fn.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n",
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Add user 81928627 from the assignees of task id 8685mb5fn\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to update the assignees of a task\n",
|
||||
"Action: Update task assignees\n",
|
||||
"Action Input: {\"task_id\": \"8685mb5fn\", \"operation\": \"rem\", \"users\": [81928627]}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m<Response [200]>\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m The user has been removed from the assignees of the task\n",
|
||||
"Final Answer: User 81928627 has been removed from the assignees of task id 8685mb5fn.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'User 81928627 has been removed from the assignees of task id 8685mb5fn.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 37,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print_and_run(f\"What are the assignees of task id {task_id}?\")\n",
|
||||
"print_and_run(f\"Remove user {user_id} from the assignees of task id {task_id}\")\n",
|
||||
"print_and_run(f\"What are the assignees of task id {task_id}?\")\n",
|
||||
"print_and_run(f\"Add user {user_id} from the assignees of task id {task_id}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Creation\n",
|
||||
"You can create tasks, lists and folders"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Create a task called 'Test Task - 18/09/2023-10:31:22' with description 'This is a Test'\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to use the Create Task tool\n",
|
||||
"Action: Create Task\n",
|
||||
"Action Input: {\"name\": \"Test Task - 18/09/2023-10:31:22\", \"description\": \"This is a Test\"}\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m{'id': '8685mw4wq', 'custom_id': None, 'name': 'Test Task - 18/09/2023-10:31:22', 'text_content': 'This is a Test', 'description': 'This is a Test', 'status': {'id': 'p90110061901_VlN8IJtk', 'status': 'to do', 'color': '#87909e', 'orderindex': 0, 'type': 'open'}, 'orderindex': '23.00000000000000000000000000000000', 'date_created': '1695047486396', 'date_updated': '1695047486396', 'date_closed': None, 'date_done': None, 'archived': False, 'creator': {'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'color': '#c51162', 'email': 'rlentini@clickup.com', 'profilePicture': None}, 'assignees': [], 'watchers': [{'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'color': '#c51162', 'initials': 'RL', 'email': 'rlentini@clickup.com', 'profilePicture': None}], 'checklists': [], 'tags': [], 'parent': None, 'priority': None, 'due_date': None, 'start_date': None, 'points': None, 'time_estimate': None, 'time_spent': 0, 'custom_fields': [], 'dependencies': [], 'linked_tasks': [], 'team_id': '9011010153', 'url': 'https://app.clickup.com/t/8685mw4wq', 'sharing': {'public': False, 'public_share_expires_on': None, 'public_fields': ['assignees', 'priority', 'due_date', 'content', 'comments', 'attachments', 'customFields', 'subtasks', 'tags', 'checklists', 'coverimage'], 'token': None, 'seo_optimized': False}, 'permission_level': 'create', 'list': {'id': '901100754275', 'name': 'Test List', 'access': True}, 'project': {'id': '90110336890', 'name': 'Test Folder', 'hidden': False, 'access': True}, 'folder': {'id': '90110336890', 'name': 'Test Folder', 'hidden': False, 'access': True}, 'space': {'id': '90110061901'}}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n",
|
||||
"Final Answer: A task called 'Test Task - 18/09/2023-10:31:22' with description 'This is a Test' was successfully created.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"A task called 'Test Task - 18/09/2023-10:31:22' with description 'This is a Test' was successfully created.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"time_str = datetime.now().strftime(\"%d/%m/%Y-%H:%M:%S\")\n",
|
||||
"print_and_run(f\"Create a task called 'Test Task - {time_str}' with description 'This is a Test'\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Create a list called Test List - 18/09/2023-10:32:12\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to create a list\n",
|
||||
"Action: Create List\n",
|
||||
"Action Input: {\"name\": \"Test List - 18/09/2023-10:32:12\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m{'id': '901100774700', 'name': 'Test List - 18/09/2023-10:32:12', 'deleted': False, 'orderindex': 13, 'content': '', 'priority': None, 'assignee': None, 'due_date': None, 'start_date': None, 'folder': {'id': '90110336890', 'name': 'Test Folder', 'hidden': False, 'access': True}, 'space': {'id': '90110061901', 'name': 'Space', 'access': True}, 'inbound_address': 'a.t.901100774700.u-81928627.20b87d50-eece-4721-b487-9ca500338587@tasks.clickup.com', 'archived': False, 'override_statuses': False, 'statuses': [{'id': 'p90110061901_VlN8IJtk', 'status': 'to do', 'orderindex': 0, 'color': '#87909e', 'type': 'open'}, {'id': 'p90110061901_14GpYKnM', 'status': 'complete', 'orderindex': 1, 'color': '#6bc950', 'type': 'closed'}], 'permission_level': 'create'}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n",
|
||||
"Final Answer: The list \"Test List - 18/09/2023-10:32:12\" has been created with id 901100774700.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The list \"Test List - 18/09/2023-10:32:12\" has been created with id 901100774700.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"time_str = datetime.now().strftime(\"%d/%m/%Y-%H:%M:%S\")\n",
|
||||
"print_and_run(f\"Create a list called Test List - {time_str}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Create a folder called 'Test Folder - 18/09/2023-10:32:51'\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to use the Create Folder tool\n",
|
||||
"Action: Create Folder\n",
|
||||
"Action Input: {\"name\": \"Test Folder - 18/09/2023-10:32:51\"}\u001b[0m\n",
|
||||
"Observation: \u001b[38;5;200m\u001b[1;3m{'id': '90110348711', 'name': 'Test Folder - 18/09/2023-10:32:51', 'orderindex': 12, 'override_statuses': False, 'hidden': False, 'space': {'id': '90110061901', 'name': 'Space', 'access': True}, 'task_count': '0', 'archived': False, 'statuses': [], 'lists': [], 'permission_level': 'create'}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I have successfully created the folder\n",
|
||||
"Final Answer: The folder 'Test Folder - 18/09/2023-10:32:51' has been successfully created.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"The folder 'Test Folder - 18/09/2023-10:32:51' has been successfully created.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"time_str = datetime.now().strftime(\"%d/%m/%Y-%H:%M:%S\")\n",
|
||||
"print_and_run(f\"Create a folder called 'Test Folder - {time_str}'\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Create a list called 'Test List - 18/09/2023-10:34:01' with content My test list with high priority and status red\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to create a list with the given name, content, priority and status\n",
|
||||
"Action: Create List\n",
|
||||
"Action Input: {\"name\": \"Test List - 18/09/2023-10:34:01\", \"content\": \"My test list\", \"priority\": 2, \"status\": \"red\"}\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m{'id': '901100774746', 'name': 'Test List - 18/09/2023-10:34:01', 'deleted': False, 'orderindex': 15, 'content': '', 'status': {'status': 'red', 'color': '#e50000', 'hide_label': True}, 'priority': {'priority': 'high', 'color': '#ffcc00'}, 'assignee': None, 'due_date': None, 'start_date': None, 'folder': {'id': '90110336890', 'name': 'Test Folder', 'hidden': False, 'access': True}, 'space': {'id': '90110061901', 'name': 'Space', 'access': True}, 'inbound_address': 'a.t.901100774746.u-81928627.2ab87133-728e-4166-b2ae-423cc320df37@tasks.clickup.com', 'archived': False, 'override_statuses': False, 'statuses': [{'id': 'p90110061901_VlN8IJtk', 'status': 'to do', 'orderindex': 0, 'color': '#87909e', 'type': 'open'}, {'id': 'p90110061901_14GpYKnM', 'status': 'complete', 'orderindex': 1, 'color': '#6bc950', 'type': 'closed'}], 'permission_level': 'create'}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I have successfully created the list\n",
|
||||
"Final Answer: The list 'Test List - 18/09/2023-10:34:01' with content 'My test list' with high priority and status red has been successfully created.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"The list 'Test List - 18/09/2023-10:34:01' with content 'My test list' with high priority and status red has been successfully created.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"time_str = datetime.now().strftime(\"%d/%m/%Y-%H:%M:%S\")\n",
|
||||
"print_and_run(f\"Create a list called 'Test List - {time_str}' with content My test list with high priority and status red\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Multi-Step Tasks"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[94m$ COMMAND\u001b[0m\n",
|
||||
"Figure out what user ID Rodrigo is, create a task called 'Rod's task', assign it to Rodrigo\n",
|
||||
"\n",
|
||||
"\u001b[94m$ AGENT\u001b[0m\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to get the user ID of Rodrigo, create a task, and assign it to Rodrigo\n",
|
||||
"Action: Get Teams\n",
|
||||
"Action Input: No input necessary\u001b[0m\n",
|
||||
"Observation: \u001b[38;5;200m\u001b[1;3m{'teams': [{'id': '9011010153', 'name': 'Task Copilot Sandbox Workspace 1', 'members': [{'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'email': 'rlentini@clickup.com', 'initials': 'RL'}]}]}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now have the user ID of Rodrigo\n",
|
||||
"Action: Create Task\n",
|
||||
"Action Input: {\"name\": \"Rod's task\", \"assignees\": [81928627]}\u001b[0m"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/rodrigolentini/repos/langchain-clickup/libs/langchain/langchain/utilities/clickup.py:145: UserWarning: Error encountered while trying to parse <class 'langchain.utilities.clickup.Task'>: 'NoneType' object is not subscriptable\n",
|
||||
" Falling back to returning input data.\n",
|
||||
" warnings.warn(f'Error encountered while trying to parse {dataclass}: {e}\\n Falling back to returning input data.')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m{'id': '8685mw6dz', 'custom_id': None, 'name': \"Rod's task\", 'text_content': '', 'description': '', 'status': {'id': 'p90110061901_VlN8IJtk', 'status': 'to do', 'color': '#87909e', 'orderindex': 0, 'type': 'open'}, 'orderindex': '24.00000000000000000000000000000000', 'date_created': '1695047740939', 'date_updated': '1695047740939', 'date_closed': None, 'date_done': None, 'archived': False, 'creator': {'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'color': '#c51162', 'email': 'rlentini@clickup.com', 'profilePicture': None}, 'assignees': [{'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'color': '#c51162', 'initials': 'RL', 'email': 'rlentini@clickup.com', 'profilePicture': None}], 'watchers': [{'id': 81928627, 'username': 'Rodrigo Ceballos Lentini', 'color': '#c51162', 'initials': 'RL', 'email': 'rlentini@clickup.com', 'profilePicture': None}], 'checklists': [], 'tags': [], 'parent': None, 'priority': None, 'due_date': None, 'start_date': None, 'points': None, 'time_estimate': None, 'time_spent': 0, 'custom_fields': [], 'dependencies': [], 'linked_tasks': [], 'team_id': '9011010153', 'url': 'https://app.clickup.com/t/8685mw6dz', 'sharing': {'public': False, 'public_share_expires_on': None, 'public_fields': ['assignees', 'priority', 'due_date', 'content', 'comments', 'attachments', 'customFields', 'subtasks', 'tags', 'checklists', 'coverimage'], 'token': None, 'seo_optimized': False}, 'permission_level': 'create', 'list': {'id': '901100754275', 'name': 'Test List', 'access': True}, 'project': {'id': '90110336890', 'name': 'Test Folder', 'hidden': False, 'access': True}, 'folder': {'id': '90110336890', 'name': 'Test Folder', 'hidden': False, 'access': True}, 'space': {'id': '90110061901'}}\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now have the task created and assigned to Rodrigo\n",
|
||||
"Final Answer: Rodrigo's user ID is 81928627 and a task called 'Rod's task' has been created and assigned to him.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
||||
"--------------------------------------------------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"Rodrigo's user ID is 81928627 and a task called 'Rod's task' has been created and assigned to him.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print_and_run(\"Figure out what user ID Rodrigo is, create a task called 'Rod's task', assign it to Rodrigo\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "clickup-copilot",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.18"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -27,7 +27,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"api_key = \"BSAv1neIuQOsxqOyy0sEe_ie2zD_n_V\""
|
||||
"api_key = \"API KEY\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -86,7 +86,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
922
docs/extras/integrations/vectorstores/vespa.ipynb
Normal file
@@ -0,0 +1,922 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ce0f17b9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Vespa\n",
|
||||
"\n",
|
||||
">[Vespa](https://vespa.ai/) is a fully featured search engine and vector database. It supports vector search (ANN), lexical search, and search in structured data, all in the same query.\n",
|
||||
"\n",
|
||||
"This notebook shows how to use `Vespa.ai` as a LangChain vector store.\n",
|
||||
"\n",
|
||||
"In order to create the vector store, we use\n",
|
||||
"[pyvespa](https://pyvespa.readthedocs.io/en/latest/index.html) to create a\n",
|
||||
"connection a `Vespa` service."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7e6a11ab-38bd-4920-ba11-60cb2f075754",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install pyvespa"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Using the `pyvespa` package, you can either connect to a\n",
|
||||
"[Vespa Cloud instance](https://pyvespa.readthedocs.io/en/latest/deploy-vespa-cloud.html)\n",
|
||||
"or a local\n",
|
||||
"[Docker instance](https://pyvespa.readthedocs.io/en/latest/deploy-docker.html).\n",
|
||||
"Here, we will create a new Vespa application and deploy that using Docker.\n",
|
||||
"\n",
|
||||
"#### Creating a Vespa application\n",
|
||||
"\n",
|
||||
"First, we need to create an application package:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"id": "283b49c9"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from vespa.package import ApplicationPackage, Field, RankProfile\n",
|
||||
"\n",
|
||||
"app_package = ApplicationPackage(name=\"testapp\")\n",
|
||||
"app_package.schema.add_fields(\n",
|
||||
" Field(name=\"text\", type=\"string\", indexing=[\"index\", \"summary\"], index=\"enable-bm25\"),\n",
|
||||
" Field(name=\"embedding\", type=\"tensor<float>(x[384])\",\n",
|
||||
" indexing=[\"attribute\", \"summary\"],\n",
|
||||
" attribute=[f\"distance-metric: angular\"]),\n",
|
||||
")\n",
|
||||
"app_package.schema.add_rank_profile(\n",
|
||||
" RankProfile(name=\"default\",\n",
|
||||
" first_phase=\"closeness(field, embedding)\",\n",
|
||||
" inputs=[(\"query(query_embedding)\", \"tensor<float>(x[384])\")]\n",
|
||||
" )\n",
|
||||
")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"id": "91150665"
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"This sets up a Vespa application with a schema for each document that contains\n",
|
||||
"two fields: `text` for holding the document text and `embedding` for holding\n",
|
||||
"the embedding vector. The `text` field is set up to use a BM25 index for\n",
|
||||
"efficient text retrieval, and we'll see how to use this and hybrid search a\n",
|
||||
"bit later.\n",
|
||||
"\n",
|
||||
"The `embedding` field is set up with a vector of length 384 to hold the\n",
|
||||
"embedding representation of the text. See\n",
|
||||
"[Vespa's Tensor Guide](https://docs.vespa.ai/en/tensor-user-guide.html)\n",
|
||||
"for more on tensors in Vespa.\n",
|
||||
"\n",
|
||||
"Lastly, we add a [rank profile](https://docs.vespa.ai/en/ranking.html) to\n",
|
||||
"instruct Vespa how to order documents. Here we set this up with a\n",
|
||||
"[nearest neighbor search](https://docs.vespa.ai/en/nearest-neighbor-search.html).\n",
|
||||
"\n",
|
||||
"Now we can deploy this application locally:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"id": "15477106"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "c10dd962",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from vespa.deployment import VespaDocker\n",
|
||||
"\n",
|
||||
"vespa_docker = VespaDocker()\n",
|
||||
"vespa_app = vespa_docker.deploy(application_package=app_package)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3df4ce53",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This deploys and creates a connection to a `Vespa` service. In case you\n",
|
||||
"already have a Vespa application running, for instance in the cloud,\n",
|
||||
"please refer to the PyVespa application for how to connect.\n",
|
||||
"\n",
|
||||
"#### Creating a Vespa vector store\n",
|
||||
"\n",
|
||||
"Now, let's load some documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"\n",
|
||||
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
|
||||
"documents = loader.load()\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"docs = text_splitter.split_documents(documents)\n",
|
||||
"\n",
|
||||
"from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
|
||||
"\n",
|
||||
"embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"id": "7abde491"
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Here, we also set up local sentence embedder to transform the text to embedding\n",
|
||||
"vectors. One could also use OpenAI embeddings, but the vector length needs to\n",
|
||||
"be updated to `1536` to reflect the larger size of that embedding.\n",
|
||||
"\n",
|
||||
"To feed these to Vespa, we need to configure how the vector store should map to\n",
|
||||
"fields in the Vespa application. Then we create the vector store directly from\n",
|
||||
"this set of documents:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"id": "d42365c7"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vespa_config = dict(\n",
|
||||
" page_content_field=\"text\",\n",
|
||||
" embedding_field=\"embedding\",\n",
|
||||
" input_field=\"query_embedding\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"from langchain.vectorstores import VespaStore\n",
|
||||
"\n",
|
||||
"db = VespaStore.from_documents(docs, embedding_function, app=vespa_app, **vespa_config)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"id": "0b647878"
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"This creates a Vespa vector store and feeds that set of documents to Vespa.\n",
|
||||
"The vector store takes care of calling the embedding function for each document\n",
|
||||
"and inserts them into the database.\n",
|
||||
"\n",
|
||||
"We can now query the vector store:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"id": "d6bd0aab"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7ccca1f4",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"results = db.similarity_search(query)\n",
|
||||
"\n",
|
||||
"print(results[0].page_content)"
|
||||
]
|
||||
},
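As the next cell explains, this search used the `default` rank profile; the `ranking` argument to `similarity_search` selects a different one. A hedged sketch, assuming a second profile has been added to the schema (the `"bm25"` name is hypothetical — the application package above only defines `"default"`):

```python
# Hypothetical: requires a rank profile named "bm25" in the application package
results = db.similarity_search(query, ranking="bm25")
print(results[0].page_content)
```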
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1e7e34e1",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"This will use the embedding function given above to create a representation\n",
|
||||
"for the query and use that to search Vespa. Note that this will use the\n",
|
||||
"`default` ranking function, which we set up in the application package\n",
|
||||
"above. You can use the `ranking` argument to `similarity_search` to\n",
|
||||
"specify which ranking function to use.\n",
|
||||
"\n",
|
||||
"Please refer to the [pyvespa documentation](https://pyvespa.readthedocs.io/en/latest/getting-started-pyvespa.html#Query)\n",
|
||||
"for more information.\n",
|
||||
"\n",
|
||||
"This covers the basic usage of the Vespa store in LangChain.\n",
|
||||
"Now you can return the results and continue using these in LangChain.\n",
|
||||
"\n",
|
||||
"#### Updating documents\n",
|
||||
"\n",
|
||||
"An alternative to calling `from_documents`, you can create the vector\n",
|
||||
"store directly and call `add_texts` from that. This can also be used to update\n",
|
||||
"documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"results = db.similarity_search(query)\n",
|
||||
"result = results[0]\n",
|
||||
"\n",
|
||||
"result.page_content = \"UPDATED: \" + result.page_content\n",
|
||||
"db.add_texts([result.page_content], [result.metadata], result.metadata[\"id\"])\n",
|
||||
"\n",
|
||||
"results = db.similarity_search(query)\n",
|
||||
"print(results[0].page_content)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"id": "a5256284"
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"However, the `pyvespa` library contains methods to manipulate\n",
|
||||
"content on Vespa which you can use directly.\n",
|
||||
"\n",
|
||||
"#### Deleting documents\n",
|
||||
"\n",
|
||||
"You can delete documents using the `delete` function:"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"id": "2526b50e"
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"result = db.similarity_search(query)\n",
|
||||
"# docs[0].metadata[\"id\"] == \"id:testapp:testapp::32\"\n",
|
||||
"\n",
|
||||
"db.delete([\"32\"])\n",
|
||||
"result = db.similarity_search(query)\n",
|
||||
"# docs[0].metadata[\"id\"] != \"id:testapp:testapp::32\""
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"id": "52cab87e"
|
||||
},
|
||||
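{
"cell_type": "markdown",
"source": [
"As a minimal sketch of the direct `pyvespa` route mentioned above, the\n",
"connection object exposes document operations of its own. This sketch assumes\n",
"the schema is named `testapp`; method names may vary between `pyvespa` versions:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "8f2a6c41"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# A sketch of direct pyvespa data operations; assumes the schema is named \"testapp\".\n",
"# Note: when feeding directly, you must supply every field yourself, including the embedding.\n",
"response = vespa_app.get_data(schema=\"testapp\", data_id=\"33\")\n",
"fields = response.json[\"fields\"]\n",
"fields[\"text\"] = \"UPDATED: \" + fields[\"text\"]\n",
"vespa_app.feed_data_point(schema=\"testapp\", data_id=\"33\", fields=fields)\n",
"vespa_app.delete_data(schema=\"testapp\", data_id=\"33\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "9a3b7d52"
},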
{
"cell_type": "markdown",
"source": [
"Again, as sketched above, the `pyvespa` connection contains methods to\n",
"delete documents directly as well.\n",
"\n",
"### Returning with scores\n",
"\n",
"The `similarity_search` method only returns the documents in order of\n",
"relevancy. To retrieve the actual scores:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "deffaba5"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"results = db.similarity_search_with_score(query)\n",
"result = results[0]\n",
"# result[1] ~= 0.463"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "cd9ae173"
},
{
"cell_type": "markdown",
"source": [
"This is a result of using the `\"all-MiniLM-L6-v2\"` embedding model with the\n",
"cosine distance function (as given by the `angular` distance metric in the\n",
"application package above).\n",
"\n",
"Different embedding functions need different distance functions, and Vespa\n",
"needs to know which distance function to use when ordering documents.\n",
"Please refer to the\n",
"[documentation on distance functions](https://docs.vespa.ai/en/reference/schema-reference.html#distance-metric)\n",
"for more information.\n",
"\n",
"### As retriever\n",
"\n",
"To use this vector store as a\n",
"[LangChain retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/)\n",
"simply call the `as_retriever` function, which is a standard vector store\n",
"method:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "7257d67a"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"db = VespaStore.from_documents(docs, embedding_function, app=vespa_app, **vespa_config)\n",
"retriever = db.as_retriever()\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"results = retriever.get_relevant_documents(query)\n",
"\n",
"# results[0].metadata[\"id\"] == \"id:testapp:testapp::32\""
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "7fb717a9"
},
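{
"cell_type": "markdown",
"source": [
"For example, the retriever can be dropped into a retrieval chain. A minimal\n",
"sketch, assuming an OpenAI API key is configured in the environment:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "4e8d1f27"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from langchain.chains import RetrievalQA\n",
"from langchain.llms import OpenAI\n",
"\n",
"# Build a simple question-answering chain on top of the Vespa retriever.\n",
"qa = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0), retriever=retriever)\n",
"qa.run(\"What did the president say about Ketanji Brown Jackson\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "5f9e2a38"
},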
{
"cell_type": "markdown",
"source": [
"This allows for more general, unstructured retrieval from the vector store.\n",
"\n",
"### Metadata\n",
"\n",
"In the examples so far, we've only used the text and the embedding for that\n",
"text. Documents usually contain additional information, which in LangChain\n",
"is referred to as metadata.\n",
"\n",
"Vespa documents can contain many fields of different types; we add them to the\n",
"application package:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "fba7f07e"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"app_package.schema.add_fields(\n",
"    # ...\n",
"    Field(name=\"date\", type=\"string\", indexing=[\"attribute\", \"summary\"]),\n",
"    Field(name=\"rating\", type=\"int\", indexing=[\"attribute\", \"summary\"]),\n",
"    Field(name=\"author\", type=\"string\", indexing=[\"attribute\", \"summary\"]),\n",
"    # ...\n",
")\n",
"vespa_app = vespa_docker.deploy(application_package=app_package)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "59cffcf2"
},
{
"cell_type": "markdown",
"source": [
"We can add some metadata fields to the documents:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "eebef70c"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Add metadata\n",
"for i, doc in enumerate(docs):\n",
"    doc.metadata[\"date\"] = f\"2023-{(i % 12)+1}-{(i % 28)+1}\"\n",
"    doc.metadata[\"rating\"] = range(1, 6)[i % 5]\n",
"    doc.metadata[\"author\"] = [\"Joe Biden\", \"Unknown\"][min(i, 1)]"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "b21efbfa"
},
{
"cell_type": "markdown",
"source": [
"And let the Vespa vector store know about these fields:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "9b42bd4d"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"vespa_config.update(dict(metadata_fields=[\"date\", \"rating\", \"author\"]))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "6bb272f6"
},
{
"cell_type": "markdown",
"source": [
"Now, when searching for these documents, these fields will be returned.\n",
"They can also be filtered on:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "43818655"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"db = VespaStore.from_documents(docs, embedding_function, app=vespa_app, **vespa_config)\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"results = db.similarity_search(query, filter=\"rating > 3\")\n",
"# results[0].metadata[\"id\"] == \"id:testapp:testapp::34\"\n",
"# results[0].metadata[\"author\"] == \"Unknown\""
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "831759f3"
},
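{
"cell_type": "markdown",
"source": [
"As a sketch, filter conditions can also be combined. This assumes the filter\n",
"string is passed through to the YQL `where` clause, so YQL operators apply:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "6a0b3c49"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Assumes the filter string is inserted into the YQL where clause as-is.\n",
"results = db.similarity_search(query, filter=\"rating > 3 and author contains 'Unknown'\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "7b1c4d5a"
},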
{
"cell_type": "markdown",
"source": [
"### Custom query\n",
"\n",
"If the default behavior of the similarity search does not fit your\n",
"requirements, you can always provide your own query. This way, you don't\n",
"need to provide all of the configuration to the vector store, but can\n",
"rather write it yourself.\n",
"\n",
"First, let's add a BM25 ranking function to our application:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "a49aad6e"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from vespa.package import FieldSet, RankProfile\n",
"\n",
"app_package.schema.add_field_set(FieldSet(name=\"default\", fields=[\"text\"]))\n",
"app_package.schema.add_rank_profile(RankProfile(name=\"bm25\", first_phase=\"bm25(text)\"))\n",
"vespa_app = vespa_docker.deploy(application_package=app_package)\n",
"db = VespaStore.from_documents(docs, embedding_function, app=vespa_app, **vespa_config)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "d0fb0562"
},
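{
"cell_type": "markdown",
"source": [
"With the `bm25` rank profile deployed, here is a minimal sketch of selecting\n",
"it via the `ranking` argument mentioned earlier:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "8c2d5e6b"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Select the bm25 rank profile deployed above instead of the default one.\n",
"results = db.similarity_search(query, ranking=\"bm25\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "9d3e6f7c"
},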
{
"cell_type": "markdown",
"source": [
"Then, to perform a regular text search based on BM25:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "fe607747"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"custom_query = {\n",
"    \"yql\": \"select * from sources * where userQuery()\",\n",
"    \"query\": query,\n",
"    \"type\": \"weakAnd\",\n",
"    \"ranking\": \"bm25\",\n",
"    \"hits\": 4\n",
"}\n",
"results = db.similarity_search_with_score(query, custom_query=custom_query)\n",
"# results[0][0].metadata[\"id\"] == \"id:testapp:testapp::32\"\n",
"# results[0][1] ~= 14.384"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "cee245c3"
},
{
"cell_type": "markdown",
"source": [
"All of the powerful search and query capabilities of Vespa can be used\n",
"through a custom query. Please refer to the Vespa documentation on its\n",
"[Query API](https://docs.vespa.ai/en/query-api.html) for more details.\n",
"\n",
"### Hybrid search\n",
"\n",
"Hybrid search means combining a classic term-based search such as BM25\n",
"with a vector search. We need to create\n",
"a new rank profile for hybrid search on Vespa:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "41a4c081"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"app_package.schema.add_rank_profile(\n",
"    RankProfile(name=\"hybrid\",\n",
"                first_phase=\"log(bm25(text)) + 0.5 * closeness(field, embedding)\",\n",
"                inputs=[(\"query(query_embedding)\", \"tensor<float>(x[384])\")]\n",
"    )\n",
")\n",
"vespa_app = vespa_docker.deploy(application_package=app_package)\n",
"db = VespaStore.from_documents(docs, embedding_function, app=vespa_app, **vespa_config)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "bf73efc1"
},
{
"cell_type": "markdown",
"source": [
"Here, we score each document as a combination of its BM25 score and its\n",
"distance score. We can query using a custom query:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "40f48711"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"query_embedding = embedding_function.embed_query(query)\n",
"nearest_neighbor_expression = \"{targetHits: 4}nearestNeighbor(embedding, query_embedding)\"\n",
"custom_query = {\n",
"    \"yql\": f\"select * from sources * where {nearest_neighbor_expression} and userQuery()\",\n",
"    \"query\": query,\n",
"    \"type\": \"weakAnd\",\n",
"    \"input.query(query_embedding)\": query_embedding,\n",
"    \"ranking\": \"hybrid\",\n",
"    \"hits\": 4\n",
"}\n",
"results = db.similarity_search_with_score(query, custom_query=custom_query)\n",
"# results[0][0].metadata[\"id\"] == \"id:testapp:testapp::32\"\n",
"# results[0][1] ~= 2.897"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "d2e289f0"
},
{
"cell_type": "markdown",
"source": [
"### Native embedders in Vespa\n",
"\n",
"Up until this point we've used an embedding function in Python to provide\n",
"embeddings for the texts. Vespa supports embedding functions natively, so\n",
"you can defer this calculation to Vespa. One benefit is the ability to use\n",
"GPUs when embedding documents if you have a large collection.\n",
"\n",
"Please refer to [Vespa embeddings](https://docs.vespa.ai/en/embedding.html)\n",
"for more information.\n",
"\n",
"First, we need to modify our application package:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "958e269f"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from vespa.package import Component, Parameter\n",
"\n",
"app_package.components = [\n",
"    Component(id=\"hf-embedder\", type=\"hugging-face-embedder\",\n",
"              parameters=[\n",
"                  Parameter(\"transformer-model\", {\"path\": \"...\"}),\n",
"                  Parameter(\"tokenizer-model\", {\"url\": \"...\"}),\n",
"              ]\n",
"    )\n",
"]\n",
"app_package.schema.add_fields(\n",
"    Field(name=\"hfembedding\", type=\"tensor<float>(x[384])\",\n",
"          is_document_field=False,\n",
"          indexing=[\"input text\", \"embed hf-embedder\", \"attribute\", \"summary\"],\n",
"          attribute=[\"distance-metric: angular\"],\n",
"    )\n",
")\n",
"app_package.schema.add_rank_profile(\n",
"    RankProfile(name=\"hf_similarity\",\n",
"                first_phase=\"closeness(field, hfembedding)\",\n",
"                inputs=[(\"query(query_embedding)\", \"tensor<float>(x[384])\")]\n",
"    )\n",
")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "56b9686c"
},
{
"cell_type": "markdown",
"source": [
"Please refer to the embeddings documentation on adding embedder models\n",
"and tokenizers to the application. Note that the `hfembedding` field\n",
"includes instructions for embedding using the `hf-embedder`.\n",
"\n",
"Now we can query with a custom query:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "5cd721a8"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"nearest_neighbor_expression = \"{targetHits: 4}nearestNeighbor(hfembedding, query_embedding)\"\n",
"custom_query = {\n",
"    \"yql\": f\"select * from sources * where {nearest_neighbor_expression}\",\n",
"    \"input.query(query_embedding)\": f\"embed(hf-embedder, \\\"{query}\\\")\",\n",
"    \"ranking\": \"hf_similarity\",\n",
"    \"hits\": 4\n",
"}\n",
"results = db.similarity_search_with_score(query, custom_query=custom_query)\n",
"# results[0][0].metadata[\"id\"] == \"id:testapp:testapp::32\"\n",
"# results[0][1] ~= 0.630"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "da631d13"
},
{
"cell_type": "markdown",
"source": [
"Note that the query here includes an `embed` instruction to embed the query\n",
"using the same model as for the documents.\n",
"\n",
"### Approximate nearest neighbor\n",
"\n",
"In all of the above examples, we've used exact nearest neighbor to\n",
"find results. However, for large collections of documents this is\n",
"not feasible, as one has to scan through all documents to find the\n",
"best matches. To avoid this, we can use\n",
"[approximate nearest neighbors](https://docs.vespa.ai/en/approximate-nn-hnsw.html).\n",
"\n",
"First, we can change the embedding field to create an HNSW index:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "a333b553"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from vespa.package import HNSW\n",
"\n",
"app_package.schema.add_fields(\n",
"    Field(name=\"embedding\", type=\"tensor<float>(x[384])\",\n",
"        indexing=[\"attribute\", \"summary\", \"index\"],\n",
"        ann=HNSW(distance_metric=\"angular\", max_links_per_node=16, neighbors_to_explore_at_insert=200)\n",
"    )\n",
")\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "9ee955c8"
},
{
"cell_type": "markdown",
"source": [
"This creates an HNSW index on the embedding data, which allows for efficient\n",
"searching. With this set, we can easily search using ANN by setting\n",
"the `approximate` argument to `True`:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "2ed1c224"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"results = db.similarity_search(query, approximate=True)\n",
"# results[0].metadata[\"id\"] == \"id:testapp:testapp::32\""
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "7981739a"
},
{
"cell_type": "markdown",
"source": [
"This covers most of the functionality of the Vespa vector store in LangChain.\n",
"\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"id": "24791204"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -1,439 +1,440 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "13afcae7",
"metadata": {},
"source": [
"# OpenSearch\n",
"\n",
"> [OpenSearch](https://opensearch.org/) is a scalable, flexible, and extensible open-source software suite for search, analytics, and observability applications licensed under Apache 2.0. `OpenSearch` is a distributed search and analytics engine based on `Apache Lucene`.\n",
"\n",
"In this notebook, we'll demo the `SelfQueryRetriever` with an `OpenSearch` vector store."
]
},
{
"cell_type": "markdown",
"id": "68e75fb9",
"metadata": {},
"source": [
"## Creating an OpenSearch vector store\n",
"\n",
"First, we'll want to create an `OpenSearch` vector store and seed it with some data. We've created a small demo set of documents that contain summaries of movies.\n",
"\n",
"**Note:** The self-query retriever requires you to have `lark` installed (`pip install lark`). We also need the `opensearch-py` package."
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"!pip install lark opensearch-py"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"id": "6078a74d"
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cb4a5787",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"OpenAI API Key: ········\n"
]
}
],
"source": [
"from langchain.schema import Document\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores import OpenSearchVectorSearch\n",
"import os\n",
"import getpass\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n",
"\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bcbe04d9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"docs = [\n",
"    Document(\n",
"        page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n",
"        metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\"},\n",
"    ),\n",
"    Document(\n",
"        page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n",
"        metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2},\n",
"    ),\n",
"    Document(\n",
"        page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n",
"        metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6},\n",
"    ),\n",
"    Document(\n",
"        page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n",
"        metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3},\n",
"    ),\n",
"    Document(\n",
"        page_content=\"Toys come alive and have a blast doing so\",\n",
"        metadata={\"year\": 1995, \"genre\": \"animated\"},\n",
"    ),\n",
"    Document(\n",
"        page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n",
"        metadata={\n",
"            \"year\": 1979,\n",
"            \"rating\": 9.9,\n",
"            \"director\": \"Andrei Tarkovsky\",\n",
"            \"genre\": \"science fiction\",\n",
"        },\n",
"    ),\n",
"]\n",
"vectorstore = OpenSearchVectorSearch.from_documents(\n",
"    docs, embeddings, index_name=\"opensearch-self-query-demo\", opensearch_url=\"http://localhost:9200\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "5ecaab6d",
"metadata": {},
"source": [
"## Creating our self-querying retriever\n",
"Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "86e34dbf",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.llms import OpenAI\n",
"from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
"from langchain.chains.query_constructor.base import AttributeInfo\n",
"\n",
"metadata_field_info = [\n",
"    AttributeInfo(\n",
"        name=\"genre\",\n",
"        description=\"The genre of the movie\",\n",
"        type=\"string or list[string]\",\n",
"    ),\n",
"    AttributeInfo(\n",
"        name=\"year\",\n",
"        description=\"The year the movie was released\",\n",
"        type=\"integer\",\n",
"    ),\n",
"    AttributeInfo(\n",
"        name=\"director\",\n",
"        description=\"The name of the movie director\",\n",
"        type=\"string\",\n",
"    ),\n",
"    AttributeInfo(\n",
"        name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n",
"    ),\n",
"]\n",
"document_content_description = \"Brief summary of a movie\"\n",
"llm = OpenAI(temperature=0)\n",
"retriever = SelfQueryRetriever.from_llm(\n",
"    llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n",
")"
]
},
{
"cell_type": "markdown",
"id": "ea9df8d4",
"metadata": {},
"source": [
"## Testing it out\n",
"And now we can try actually using our retriever!"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "38a126e9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query='dinosaur' filter=None limit=None\n"
]
},
{
"data": {
"text/plain": [
"[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction'}),\n",
" Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'}),\n",
" Document(page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...', metadata={'year': 2010, 'director': 'Christopher Nolan', 'rating': 8.2}),\n",
" Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'director': 'Andrei Tarkovsky', 'genre': 'science fiction'})]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This example only specifies a relevant query\n",
"retriever.get_relevant_documents(\"What are some movies about dinosaurs\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "60bf0074-e65e-4558-a4f2-8190f3e4e2f9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5) limit=None\n"
]
},
{
"data": {
"text/plain": [
"[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'director': 'Andrei Tarkovsky', 'genre': 'science fiction'}),\n",
" Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'director': 'Satoshi Kon', 'rating': 8.6})]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This example only specifies a filter\n",
"retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b19d4da0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query='women' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Greta Gerwig') limit=None\n"
]
},
{
"data": {
"text/plain": [
"[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'year': 2019, 'director': 'Greta Gerwig', 'rating': 8.3})]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This example specifies a query and a filter\n",
"retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "a59f946b-78a1-4d3e-9942-63834c7d7589",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='rating', value=8.5), Comparison(comparator=<Comparator.CONTAIN: 'contain'>, attribute='genre', value='science fiction')]) limit=None\n"
]
},
{
"data": {
"text/plain": [
"[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'rating': 9.9, 'director': 'Andrei Tarkovsky', 'genre': 'science fiction'})]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This example specifies a composite filter\n",
"retriever.get_relevant_documents(\"What's a highly rated (above 8.5) science fiction film?\")"
]
},
{
"cell_type": "markdown",
"id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51",
"metadata": {},
"source": [
"## Filter k\n",
"\n",
"We can also use the self query retriever to specify `k`: the number of documents to fetch.\n",
"\n",
"We can do this by passing `enable_limit=True` to the constructor."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "bff36b88-b506-4877-9c63-e5a1a8d78e64",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"retriever = SelfQueryRetriever.from_llm(\n",
"    llm,\n",
"    vectorstore,\n",
"    document_content_description,\n",
"    metadata_field_info,\n",
"    enable_limit=True,\n",
"    verbose=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "2758d229-4f97-499c-819f-888acaf8ee10",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query='dinosaur' filter=None limit=2\n"
]
},
{
"data": {
"text/plain": [
"[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'rating': 7.7, 'genre': 'science fiction'}),\n",
" Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This example only specifies a relevant query\n",
"retriever.get_relevant_documents(\"what are two movies about dinosaurs\")"
]
},
{
"cell_type": "markdown",
"id": "61a10294",
"metadata": {},
"source": [
"## Complex queries in Action!\n",
"We've tried out some simple queries, but what about more complex ones? Let's try out a few more complex queries that utilize the full power of OpenSearch."
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e460da93",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query='animated toys' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Operation(operator=<Operator.OR: 'or'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='animated'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='comedy')]), Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='year', value=1990)]) limit=None\n"
]
},
{
"data": {
"text/plain": [
"[Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"retriever.get_relevant_documents(\"what animated or comedy movies have been released in the last 30 years about animated toys?\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "0851fc42",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'acknowledged': True}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorstore.client.indices.delete(index=\"opensearch-self-query-demo\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -443,7 +443,7 @@
}
],
"source": [
"from typing import Sequence\n",
"from typing import Sequence, Optional\n",
"from langchain.prompts import (\n",
"    PromptTemplate,\n",
"    ChatPromptTemplate,\n",

@@ -284,7 +284,7 @@
"```\n",
"\n",
"* The search is executed\n",
"* The results frum search are passed back to the LLM for synthesis into an answer\n",
"* The results from search are passed back to the LLM for synthesis into an answer\n",
"\n",
""
]

@@ -48,7 +48,16 @@
"execution_count": 2,
"id": "0928915d",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/tomaz/neo4j/langchain/libs/langchain/langchain/graphs/neo4j_graph.py:52: ExperimentalWarning: The configuration may change in the future.\n",
"  self._driver.verify_connectivity()\n"
]
}
],
"source": [
"graph = Neo4jGraph(\n",
"    url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"pleaseletmein\"\n",
@@ -558,6 +567,75 @@
"# Inspect graph schema\n",
"print(chain.graph_schema)"
]
},
{
"cell_type": "markdown",
"id": "f0202e88-d700-40ed-aef9-0c969c7bf951",
"metadata": {},
"source": [
"# Validate generated Cypher statements\n",
"You can use the `validate_cypher` parameter to validate and correct relationship directions in generated Cypher statements"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "53665d03-7afd-433c-bdd5-750127bfb152",
"metadata": {},
"outputs": [],
"source": [
"chain = GraphCypherQAChain.from_llm(\n",
"    llm=ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo\"),\n",
"    graph=graph,\n",
"    verbose=True,\n",
"    validate_cypher=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "19e1a591-9c10-4d7b-aa36-a5e1b778a97b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (a:Actor)-[:ACTED_IN]->(m:Movie {name: 'Top Gun'})\n",
"RETURN a.name\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'a.name': 'Tom Cruise'}, {'a.name': 'Val Kilmer'}, {'a.name': 'Anthony Edwards'}, {'a.name': 'Meg Ryan'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"'Tom Cruise, Val Kilmer, Anthony Edwards, and Meg Ryan played in Top Gun.'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"Who played in Top Gun?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fa3f3d5-f7e7-4ca9-8f07-ca22b897f192",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
834 docs/extras/use_cases/more/learned_prompt_optimization.ipynb Normal file
File diff suppressed because one or more lines are too long
@@ -940,7 +940,7 @@
"- DocArrayRetriever\n",
"- ElasticSearchBM25Retriever\n",
"- EnsembleRetriever\n",
"- GoogleCloudEnterpriseSearchRetriever\n",
"- GoogleVertexAISearchRetriever\n",
"- AmazonKendraRetriever\n",
"- KNNRetriever\n",
"- LlamaIndexGraphRetriever and LlamaIndexRetriever\n",

@@ -992,7 +992,7 @@
{
"data": {
"text/plain": [
"{'question': 'LangChain possesses a variety of retrievers including:\\n\\n1. ArxivRetriever\\n2. AzureCognitiveSearchRetriever\\n3. BM25Retriever\\n4. ChaindeskRetriever\\n5. ChatGPTPluginRetriever\\n6. ContextualCompressionRetriever\\n7. DocArrayRetriever\\n8. ElasticSearchBM25Retriever\\n9. EnsembleRetriever\\n10. GoogleCloudEnterpriseSearchRetriever\\n11. AmazonKendraRetriever\\n12. KNNRetriever\\n13. LlamaIndexGraphRetriever\\n14. LlamaIndexRetriever\\n15. MergerRetriever\\n16. MetalRetriever\\n17. MilvusRetriever\\n18. MultiQueryRetriever\\n19. ParentDocumentRetriever\\n20. PineconeHybridSearchRetriever\\n21. PubMedRetriever\\n22. RePhraseQueryRetriever\\n23. RemoteLangChainRetriever\\n24. SelfQueryRetriever\\n25. SVMRetriever\\n26. TFIDFRetriever\\n27. TimeWeightedVectorStoreRetriever\\n28. VespaRetriever\\n29. WeaviateHybridSearchRetriever\\n30. WebResearchRetriever\\n31. WikipediaRetriever\\n32. ZepRetriever\\n33. ZillizRetriever\\n\\nIt also includes self query translators like:\\n\\n1. ChromaTranslator\\n2. DeepLakeTranslator\\n3. MyScaleTranslator\\n4. PineconeTranslator\\n5. QdrantTranslator\\n6. WeaviateTranslator\\n\\nAnd remote retrievers like:\\n\\n1. RemoteLangChainRetriever'}"
"{'question': 'LangChain possesses a variety of retrievers including:\\n\\n1. ArxivRetriever\\n2. AzureCognitiveSearchRetriever\\n3. BM25Retriever\\n4. ChaindeskRetriever\\n5. ChatGPTPluginRetriever\\n6. ContextualCompressionRetriever\\n7. DocArrayRetriever\\n8. ElasticSearchBM25Retriever\\n9. EnsembleRetriever\\n10. GoogleVertexAISearchRetriever\\n11. AmazonKendraRetriever\\n12. KNNRetriever\\n13. LlamaIndexGraphRetriever\\n14. LlamaIndexRetriever\\n15. MergerRetriever\\n16. MetalRetriever\\n17. MilvusRetriever\\n18. MultiQueryRetriever\\n19. ParentDocumentRetriever\\n20. PineconeHybridSearchRetriever\\n21. PubMedRetriever\\n22. RePhraseQueryRetriever\\n23. RemoteLangChainRetriever\\n24. SelfQueryRetriever\\n25. SVMRetriever\\n26. TFIDFRetriever\\n27. TimeWeightedVectorStoreRetriever\\n28. VespaRetriever\\n29. WeaviateHybridSearchRetriever\\n30. WebResearchRetriever\\n31. WikipediaRetriever\\n32. ZepRetriever\\n33. ZillizRetriever\\n\\nIt also includes self query translators like:\\n\\n1. ChromaTranslator\\n2. DeepLakeTranslator\\n3. MyScaleTranslator\\n4. PineconeTranslator\\n5. QdrantTranslator\\n6. WeaviateTranslator\\n\\nAnd remote retrievers like:\\n\\n1. RemoteLangChainRetriever'}"
]
},
"execution_count": 31,
@@ -1124,7 +1124,7 @@
"- DocArrayRetriever\n",
"- ElasticSearchBM25Retriever\n",
"- EnsembleRetriever\n",
"- GoogleCloudEnterpriseSearchRetriever\n",
"- GoogleVertexAISearchRetriever\n",
"- AmazonKendraRetriever\n",
"- KNNRetriever\n",
"- LlamaIndexGraphRetriever and LlamaIndexRetriever\n",

@@ -10,9 +10,13 @@
"[](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/question_answering/qa.ipynb)\n",
"\n",
"## Use case\n",
"Suppose you have some text documents (PDF, blog, Notion pages, etc.) and want to ask questions related to the contents of those documents. LLMs, given their proficiency in understanding text, are a great tool for this.\n",
"Suppose you have some text documents (PDF, blog, Notion pages, etc.) and want to ask questions related to the contents of those documents. \n",
"\n",
"In this walkthrough we'll go over how to build a question-answering over documents application using LLMs. Two very related use cases which we cover elsewhere are:\n",
"LLMs, given their proficiency in understanding text, are a great tool for this.\n",
"\n",
"In this walkthrough we'll go over how to build a question-answering over documents application using LLMs. \n",
"\n",
"Two very related use cases which we cover elsewhere are:\n",
"- [QA over structured data](/docs/use_cases/qa_structured/sql) (e.g., SQL)\n",
"- [QA over code](/docs/use_cases/code_understanding) (e.g., Python)\n",
"\n",
@@ -20,19 +24,21 @@
"\n",
"## Overview\n",
"The pipeline for converting raw unstructured data into a QA chain looks like this:\n",
"1. `Loading`: First we need to load our data. Unstructured data can be loaded from many sources. Use the [LangChain integration hub](https://integrations.langchain.com/) to browse the full set of loaders.\n",
"Each loader returns data as a LangChain [`Document`](/docs/components/schema/document).\n",
"1. `Loading`: First we need to load our data. Use the [LangChain integration hub](https://integrations.langchain.com/) to browse the full set of loaders. \n",
"2. `Splitting`: [Text splitters](/docs/modules/data_connection/document_transformers/) break `Documents` into splits of specified size\n",
"3. `Storage`: Storage (e.g., often a [vectorstore](/docs/modules/data_connection/vectorstores/)) will house [and often embed](https://www.pinecone.io/learn/vector-embeddings/) the splits\n",
"4. `Retrieval`: The app retrieves splits from storage (e.g., often [with similar embeddings](https://www.pinecone.io/learn/k-nearest-neighbor/) to the input question)\n",
"5. `Generation`: An [LLM](/docs/modules/model_io/models/llms/) produces an answer using a prompt that includes the question and the retrieved data\n",
"6. `Conversation` (Extension): Hold a multi-turn conversation by adding [Memory](/docs/modules/memory/) to your QA chain.\n",
"\n",
"\n",
"\n",
"## Quickstart\n",
"\n",
"To give you a sneak preview, the above pipeline can be all be wrapped in a single object: `VectorstoreIndexCreator`. Suppose we want a QA app over this [blog post](https://lilianweng.github.io/posts/2023-06-23-agent/). We can create this in a few lines of code. First set environment variables and install packages:"
"Suppose we want a QA app over this [blog post](https://lilianweng.github.io/posts/2023-06-23-agent/). \n",
"\n",
"We can create this in a few lines of code. \n",
"\n",
"First set environment variables and install packages:"
]
},
{
@@ -42,7 +48,7 @@
"metadata": {},
"outputs": [],
"source": [
"pip install openai chromadb\n",
"pip install langchain openai chromadb langchainhub\n",
"\n",
"# Set env var OPENAI_API_KEY or load from a .env file\n",
"# import dotenv\n",
@@ -53,44 +59,118 @@
{
"cell_type": "code",
"execution_count": 1,
"id": "046cefc0",
"id": "820244ae-74b4-4593-b392-822979dd91b8",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import WebBaseLoader\n",
"from langchain.indexes import VectorstoreIndexCreator\n",
"# Load documents\n",
"\n",
"loader = WebBaseLoader(\"https://lilianweng.github.io/posts/2023-06-23-agent/\")\n",
"index = VectorstoreIndexCreator().from_loaders([loader])"
"from langchain.document_loaders import WebBaseLoader\n",
"loader = WebBaseLoader(\"https://lilianweng.github.io/posts/2023-06-23-agent/\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f4bf8740",
"id": "c89a0aa7-1e7e-4557-90e5-a7ea87db00e7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Split documents\n",
|
||||
"\n",
|
||||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||
"text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)\n",
|
||||
"splits = text_splitter.split_documents(loader.load())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "000e46f6-dafc-4a43-8417-463d0614fd30",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Embed and store splits\n",
|
||||
"\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||||
"vectorstore = Chroma.from_documents(documents=splits,embedding=OpenAIEmbeddings())\n",
|
||||
"retriever = vectorstore.as_retriever()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "dacbde0b-7d45-4a2c-931d-81bb094aec94",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Prompt \n",
|
||||
"# https://smith.langchain.com/hub/rlm/rag-prompt\n",
|
||||
"\n",
|
||||
"from langchain import hub\n",
|
||||
"rag_prompt = hub.pull(\"rlm/rag-prompt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "79b9fdae-c2bf-4cf6-884f-c19aa07dd975",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# LLM\n",
|
||||
"\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "92c0f3ae-6ab2-4d04-9b22-1963b96b9db5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# RAG chain \n",
|
||||
"\n",
|
||||
"from langchain.schema.runnable import RunnablePassthrough\n",
|
||||
"rag_chain = (\n",
|
||||
" {\"context\": retriever, \"question\": RunnablePassthrough()} \n",
|
||||
" | rag_prompt \n",
|
||||
" | llm \n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "0d3b0f36-7b56-49c0-8e40-a1aa9ebcbf24",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"' Task decomposition is a technique used to break down complex tasks into smaller and simpler steps. It can be done using LLM with simple prompting, task-specific instructions, or with human inputs. Tree of Thoughts (Yao et al. 2023) is an extension of Chain of Thought (Wei et al. 2022) which explores multiple reasoning possibilities at each step.'"
|
||||
"AIMessage(content='Task decomposition is the process of breaking down a task into smaller subgoals or steps. It can be done using simple prompting, task-specific instructions, or human inputs.')"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"index.query(\"What is Task Decomposition?\")"
|
||||
"rag_chain.invoke(\"What is Task Decomposition?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8224aad6",
|
||||
"id": "639dc31a-7f16-40f6-ba2a-20e7c2ecfe60",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Ok, but what's going on under the hood, and how could we customize this for our specific use case? For that, let's take a look at how we can construct this pipeline piece by piece."
|
||||
"[Here](https://smith.langchain.com/public/2270a675-74de-47ac-b111-b232d8340a64/r) is the LangSmith trace for this chain.\n",
|
||||
"\n",
|
||||
"Below we will explain each step in more detail."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -100,7 +180,9 @@
|
||||
"source": [
|
||||
"## Step 1. Load\n",
|
||||
"\n",
|
||||
"Specify a `DocumentLoader` to load in your unstructured data as `Documents`. A `Document` is a piece of text (the `page_content`) and associated metadata."
|
||||
"Specify a `DocumentLoader` to load in your unstructured data as `Documents`. \n",
|
||||
"\n",
|
||||
"A `Document` is a dict with text (`page_content`) and `metadata`."
|
||||
]
|
||||
},
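A quick sketch of that shape (the values here are illustrative, not taken from the notebook):

```python
from langchain.schema import Document

doc = Document(
    page_content="Agent System Overview ...",
    metadata={"source": "https://lilianweng.github.io/posts/2023-06-23-agent/"},
)
```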
|
||||
{
|
||||
@@ -122,7 +204,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Go deeper\n",
|
||||
"- Browse the > 120 data loader integrations [here](https://integrations.langchain.com/).\n",
|
||||
"- Browse the > 160 data loader integrations [here](https://integrations.langchain.com/).\n",
|
||||
"- See further documentation on loaders [here](/docs/modules/data_connection/document_loaders/).\n",
|
||||
"\n",
|
||||
"## Step 2. Split\n",
|
||||
@@ -150,7 +232,7 @@
|
||||
"source": [
|
||||
"### Go deeper\n",
|
||||
"\n",
|
||||
"- `DocumentSplitters` are just one type of the more generic `DocumentTransformers`, which can all be useful in this preprocessing step.\n",
|
||||
"- `DocumentSplitters` are just one type of the more generic `DocumentTransformers`.\n",
|
||||
"- See further documentation on transformers [here](/docs/modules/data_connection/document_transformers/).\n",
|
||||
"- `Context-aware splitters` keep the location (\"context\") of each split in the original `Document`:\n",
|
||||
" - [Markdown files](/docs/use_cases/question_answering/how_to/document-context-aware-QA)\n",
|
||||
@@ -160,7 +242,10 @@
|
||||
"## Step 3. Store\n",
|
||||
"\n",
|
||||
"To be able to look up our document splits, we first need to store them where we can later look them up.\n",
|
||||
"The most common way to do this is to embed the contents of each document then store the embedding and document in a vector store, with the embedding being used to index the document."
|
||||
"\n",
|
||||
"The most common way to do this is to embed the contents of each document split.\n",
|
||||
"\n",
|
||||
"We store the embedding and splits in a vectorstore."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -193,7 +278,9 @@
|
||||
"\n",
|
||||
"## Step 4. Retrieve\n",
|
||||
"\n",
|
||||
"Retrieve relevant splits for any question using [similarity search](https://www.pinecone.io/learn/what-is-similarity-search/)."
|
||||
"Retrieve relevant splits for any question using [similarity search](https://www.pinecone.io/learn/what-is-similarity-search/).\n",
|
||||
"\n",
|
||||
"This is simply \"top K\" retrieval where we select documents based on embedding similarity to the query."
|
||||
]
|
||||
},
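A minimal sketch of this step, reusing the `vectorstore` built in the quickstart (the question is just an example):

```python
question = "What are the approaches to Task Decomposition?"
docs = vectorstore.similarity_search(question)
len(docs)  # top 4 most similar splits by default
```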
|
||||
{
|
||||
@@ -228,7 +315,9 @@
|
||||
"\n",
|
||||
"Vectorstores are commonly used for retrieval, but they are not the only option. For example, SVMs (see thread [here](https://twitter.com/karpathy/status/1647025230546886658?s=20)) can also be used.\n",
|
||||
"\n",
|
||||
"LangChain [has many retrievers](/docs/modules/data_connection/retrievers/) including, but not limited to, vectorstores. All retrievers implement a common method `get_relevant_documents()` (and its asynchronous variant `aget_relevant_documents()`)."
|
||||
"LangChain [has many retrievers](/docs/modules/data_connection/retrievers/) including, but not limited to, vectorstores. \n",
|
||||
"\n",
|
||||
"All retrievers implement a common method `get_relevant_documents()` (and its asynchronous variant `aget_relevant_documents()`)."
|
||||
]
|
||||
},
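A sketch of that shared interface, again assuming the quickstart objects:

```python
# Any vectorstore can be wrapped as a retriever; all retrievers expose
# the same get_relevant_documents() method.
retriever = vectorstore.as_retriever()
docs = retriever.get_relevant_documents("What are the approaches to Task Decomposition?")
```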
|
||||
{
|
||||
@@ -275,7 +364,6 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import logging\n",
|
||||
"\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.retrievers.multi_query import MultiQueryRetriever\n",
|
||||
"\n",
|
||||
@@ -288,6 +376,20 @@
|
||||
"len(unique_docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ee8420e6-73a6-411b-a84d-74b096bddad7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In addition, a useful concept for improving retrieval is decoupling the documents from the embedded search key.\n",
|
||||
"\n",
|
||||
"For example, we can embed a document summary or question that are likely to lead to the document being retrieved.\n",
|
||||
"\n",
|
||||
"See details in [here](docs/modules/data_connection/retrievers/multi_vector) on the multi-vector retriever for this purpose.\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
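A minimal sketch of this idea using the multi-vector retriever; the "summaries" below are crude placeholders (in practice you might generate them with an LLM), and `doc_id` is an illustrative key name:

```python
import uuid

from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma

doc_ids = [str(uuid.uuid4()) for _ in splits]
# Stand-in "summaries": the first 100 characters of each split
summaries = [
    Document(page_content=s.page_content[:100], metadata={"doc_id": doc_ids[i]})
    for i, s in enumerate(splits)
]

# Embed the summaries as the search keys, but return the full splits
multi_vector_retriever = MultiVectorRetriever(
    vectorstore=Chroma.from_documents(summaries, OpenAIEmbeddings()),
    docstore=InMemoryStore(),
    id_key="doc_id",
)
multi_vector_retriever.docstore.mset(list(zip(doc_ids, splits)))
```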
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "415d6824",
|
||||
@@ -295,34 +397,44 @@
|
||||
"source": [
|
||||
"## Step 5. Generate\n",
|
||||
"\n",
|
||||
"Distill the retrieved documents into an answer using an LLM/Chat model (e.g., `gpt-3.5-turbo`) with `RetrievalQA` chain.\n"
|
||||
"Distill the retrieved documents into an answer using an LLM/Chat model (e.g., `gpt-3.5-turbo`).\n",
|
||||
"\n",
|
||||
"We use the [Runnable](https://python.langchain.com/docs/expression_language/interface) protocol to define the chain.\n",
|
||||
"\n",
|
||||
"Runnable protocol pipes together components in a transparent way.\n",
|
||||
"\n",
|
||||
"We used a prompt for RAG that is checked into the LangChain prompt hub ([here](https://smith.langchain.com/hub/rlm/rag-prompt))."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 11,
|
||||
"id": "99fa1aec",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'query': 'What are the approaches to Task Decomposition?',\n",
|
||||
" 'result': 'The approaches to task decomposition include:\\n\\n1. Simple prompting: This approach involves using simple prompts or questions to guide the agent in breaking down a task into smaller subgoals. For example, the agent can be prompted with \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to facilitate task decomposition.\\n\\n2. Task-specific instructions: In this approach, task-specific instructions are provided to the agent to guide the decomposition process. For example, if the task is to write a novel, the agent can be instructed to \"Write a story outline\" as a step in the task decomposition.\\n\\n3. Human inputs: This approach involves incorporating human inputs in the task decomposition process. Humans can provide guidance, feedback, and assistance to the agent in breaking down complex tasks into manageable subgoals.\\n\\nThese approaches aim to enable efficient handling of complex tasks by breaking them down into smaller, more manageable subgoals.'}"
|
||||
"AIMessage(content='Task decomposition is the process of breaking down a task into smaller subgoals or steps. It can be done using simple prompting, task-specific instructions, or human inputs.')"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n",
|
||||
"qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever())\n",
|
||||
"qa_chain({\"query\": question})"
|
||||
"\n",
|
||||
"from langchain.schema.runnable import RunnablePassthrough\n",
|
||||
"rag_chain = (\n",
|
||||
" {\"context\": retriever, \"question\": RunnablePassthrough()} \n",
|
||||
" | rag_prompt \n",
|
||||
" | llm \n",
|
||||
")\n",
|
||||
"\n",
|
||||
"rag_chain.invoke(\"What is Task Decomposition?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -330,12 +442,10 @@
|
||||
"id": "f7d52c84",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note, you can pass in an `LLM` or a `ChatModel` (like we did here) to the `RetrievalQA` chain.\n",
|
||||
"\n",
|
||||
"### Go deeper\n",
|
||||
"\n",
|
||||
"#### Choosing LLMs\n",
|
||||
"- Browse the > 55 LLM and chat model integrations [here](https://integrations.langchain.com/).\n",
|
||||
"- Browse the > 90 LLM and chat model integrations [here](https://integrations.langchain.com/).\n",
|
||||
"- See further documentation on LLMs and chat models [here](/docs/modules/model_io/models/).\n",
|
||||
"- See a guide on local LLMS [here](/docs/modules/use_cases/question_answering/how_to/local_retrieval_qa)."
|
||||
]
|
||||
@@ -347,28 +457,29 @@
|
||||
"source": [
|
||||
"#### Customizing the prompt\n",
|
||||
"\n",
|
||||
"The prompt in `RetrievalQA` chain can be easily customized."
|
||||
"As shown above, we can load prompts (e.g., [this RAG prompt](https://smith.langchain.com/hub/rlm/rag-prompt)) from the prompt hub.\n",
|
||||
"\n",
|
||||
"The prompt can also be easily customized, as shown below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 12,
|
||||
"id": "e4fee704",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The approaches to Task Decomposition are (1) using simple prompting by LLM, (2) using task-specific instructions, and (3) incorporating human inputs. Thanks for asking!'"
|
||||
"AIMessage(content='Task decomposition is the process of breaking down a complicated task into smaller, more manageable subtasks or steps. It can be done using prompts, task-specific instructions, or human inputs. Thanks for asking!')"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain.prompts import PromptTemplate\n",
|
||||
"\n",
|
||||
"template = \"\"\"Use the following pieces of context to answer the question at the end. \n",
|
||||
@@ -378,229 +489,23 @@
|
||||
"{context}\n",
|
||||
"Question: {question}\n",
|
||||
"Helpful Answer:\"\"\"\n",
|
||||
"QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n",
|
||||
"rag_prompt_custom = PromptTemplate.from_template(template)\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n",
|
||||
"qa_chain = RetrievalQA.from_chain_type(\n",
|
||||
" llm,\n",
|
||||
" retriever=vectorstore.as_retriever(),\n",
|
||||
" chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT}\n",
|
||||
"rag_chain = (\n",
|
||||
" {\"context\": retriever, \"question\": RunnablePassthrough()} \n",
|
||||
" | rag_prompt_custom \n",
|
||||
" | llm \n",
|
||||
")\n",
|
||||
"result = qa_chain({\"query\": question})\n",
|
||||
"result[\"result\"]"
|
||||
"\n",
|
||||
"rag_chain.invoke(\"What is Task Decomposition?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c825e9bf-6a56-46e4-8bbb-05441f76cb96",
|
||||
"id": "5f5b6297-715a-444e-b3ef-a6d27382b435",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also store and fetch prompts from the LangChain prompt hub.\n",
|
||||
"\n",
|
||||
"This will work with your [LangSmith API key](https://docs.smith.langchain.com/).\n",
|
||||
"\n",
|
||||
"For example, see [here](https://smith.langchain.com/hub/rlm/rag-prompt) is a common prompt for RAG.\n",
|
||||
"\n",
|
||||
"We can load this."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a896060f-ebc4-4236-a4ad-32960601c6e8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pip install langchainhub"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "aef8e734-ba54-48ae-b959-1898618f2d90",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The approaches to task decomposition include using LLM with simple prompting, task-specific instructions, and human inputs.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# RAG prompt\n",
|
||||
"from langchain import hub\n",
|
||||
"QA_CHAIN_PROMPT_HUB = hub.pull(\"rlm/rag-prompt\")\n",
|
||||
"\n",
|
||||
"qa_chain = RetrievalQA.from_chain_type(\n",
|
||||
" llm,\n",
|
||||
" retriever=vectorstore.as_retriever(),\n",
|
||||
" chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT_HUB}\n",
|
||||
")\n",
|
||||
"result = qa_chain({\"query\": question})\n",
|
||||
"result[\"result\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ff40e8db",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"source": [
|
||||
"#### Return source documents\n",
|
||||
"\n",
|
||||
"The full set of retrieved documents used for answer distillation can be returned using `return_source_documents=True`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "60004293",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"4\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='Task decomposition can be done (1) by LLM with simple prompting like \"Steps for XYZ.\\\\n1.\", \"What are the subgoals for achieving XYZ?\", (2) by using task-specific instructions; e.g. \"Write a story outline.\" for writing a novel, or (3) with human inputs.', metadata={'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:', 'language': 'en', 'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': \"LLM Powered Autonomous Agents | Lil'Log\"})"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"\n",
|
||||
"qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever(),\n",
|
||||
" return_source_documents=True)\n",
|
||||
"result = qa_chain({\"query\": question})\n",
|
||||
"print(len(result['source_documents']))\n",
|
||||
"result['source_documents'][0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1b600236",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Return citations\n",
|
||||
"\n",
|
||||
"Answer citations can be returned using `RetrievalQAWithSourcesChain`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "948f6d19",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'question': 'What are the approaches to Task Decomposition?',\n",
|
||||
" 'answer': 'The approaches to Task Decomposition include:\\n1. Using LLM with simple prompting, such as providing steps or subgoals for achieving a task.\\n2. Using task-specific instructions, such as providing a specific instruction like \"Write a story outline\" for writing a novel.\\n3. Using human inputs to decompose the task.\\nAnother approach is the Tree of Thoughts, which extends the Chain of Thought (CoT) technique by exploring multiple reasoning possibilities at each step and generating multiple thoughts per step, creating a tree structure. The search process can be BFS or DFS, and each state can be evaluated by a classifier or majority vote.\\nSources: https://lilianweng.github.io/posts/2023-06-23-agent/',\n",
|
||||
" 'sources': ''}"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQAWithSourcesChain\n",
|
||||
"\n",
|
||||
"qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm,retriever=vectorstore.as_retriever())\n",
|
||||
"\n",
|
||||
"result = qa_chain({\"question\": question})\n",
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "73d0b138",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Customizing retrieved document processing\n",
|
||||
"\n",
|
||||
"Retrieved documents can be fed to an LLM for answer distillation in a few different ways.\n",
|
||||
"\n",
|
||||
"`stuff`, `refine`, `map-reduce`, and `map-rerank` chains for passing documents to an LLM prompt are well summarized [here](/docs/modules/chains/document/).\n",
|
||||
" \n",
|
||||
"`stuff` is commonly used because it simply \"stuffs\" all retrieved documents into the prompt.\n",
|
||||
"\n",
|
||||
"The [load_qa_chain](/docs/use_cases/question_answering/how_to/question_answering.html) is an easy way to pass documents to an LLM using these various approaches (e.g., see `chain_type`)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "29aa139f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'output_text': 'The approaches to task decomposition mentioned in the provided context are:\\n\\n1. Chain of thought (CoT): This approach involves instructing the language model to \"think step by step\" and decompose complex tasks into smaller and simpler steps. It enhances model performance on complex tasks by utilizing more test-time computation.\\n\\n2. Tree of Thoughts: This approach extends CoT by exploring multiple reasoning possibilities at each step. It decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS or DFS, and each state is evaluated by a classifier or majority vote.\\n\\n3. LLM with simple prompting: This approach involves using a language model with simple prompts like \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to perform task decomposition.\\n\\n4. Task-specific instructions: This approach involves providing task-specific instructions to guide the language model in decomposing the task. For example, providing the instruction \"Write a story outline\" for the task of writing a novel.\\n\\n5. Human inputs: Task decomposition can also be done with human inputs, where humans provide guidance and input to break down the task into smaller subtasks.'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chains.question_answering import load_qa_chain\n",
|
||||
"\n",
|
||||
"chain = load_qa_chain(llm, chain_type=\"stuff\")\n",
|
||||
"chain({\"input_documents\": unique_docs, \"question\": question},return_only_outputs=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a8cb8cd1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also pass the `chain_type` to `RetrievalQA`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "f68574bd",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorstore.as_retriever(),\n",
|
||||
" chain_type=\"stuff\")\n",
|
||||
"result = qa_chain({\"query\": question})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b33aeb5f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In summary, the user can choose the desired level of abstraction for QA:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Step 6. Chat\n",
|
||||
"\n",
|
||||
"See our [use-case on chat](/docs/use_cases/chatbots) for detail on this!"
|
||||
"We can use [LangSmith](https://smith.langchain.com/public/129cac54-44d5-453a-9807-3bd4835e5f96/r) to see the trace."
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -620,7 +525,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Using PyPDF
|
||||
## Using PyPDF
|
||||
|
||||
Load a PDF using `pypdf` into an array of documents, where each document contains the page content and metadata with the `page` number.
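For reference, a minimal load looks like this (the file path is only an example):

```python
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("example_data/layout-parser-paper.pdf")
pages = loader.load()  # one Document per page
pages[0]
```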
|
||||
|
||||
@@ -74,6 +74,30 @@ for doc in docs:
|
||||
|
||||
</CodeOutputBlock>
|
||||
|
||||
|
||||
### Extracting images
|
||||
|
||||
Using the `rapidocr-onnxruntime` package, we can also extract images as text:
|
||||
|
||||
```bash
|
||||
pip install rapidocr-onnxruntime
|
||||
```
|
||||
|
||||
```python
|
||||
loader = PyPDFLoader("https://arxiv.org/pdf/2103.15348.pdf", extract_images=True)
|
||||
pages = loader.load()
|
||||
pages[4].page_content
|
||||
```
|
||||
|
||||
<CodeOutputBlock lang="python">
|
||||
|
||||
```
|
||||
'LayoutParser : A Unified Toolkit for DL-Based DIA 5\nTable 1: Current layout detection models in the LayoutParser model zoo\nDataset Base Model1Large Model Notes\nPubLayNet [38] F / M M Layouts of modern scientific documents\nPRImA [3] M - Layouts of scanned modern magazines and scientific reports\nNewspaper [17] F - Layouts of scanned US newspapers from the 20th century\nTableBank [18] F F Table region on modern scientific and business document\nHJDataset [31] F / M - Layouts of history Japanese documents\n1For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy\nvs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101\nbackbones [ 13], respectively. One can train models of different architectures, like Faster R-CNN [ 28] (F) and Mask\nR-CNN [ 12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained\nusing the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model\nzoo in coming months.\nlayout data structures , which are optimized for efficiency and versatility. 3) When\nnecessary, users can employ existing or customized OCR models via the unified\nAPI provided in the OCR module . 4)LayoutParser comes with a set of utility\nfunctions for the visualization and storage of the layout data. 5) LayoutParser\nis also highly customizable, via its integration with functions for layout data\nannotation and model training . We now provide detailed descriptions for each\ncomponent.\n3.1 Layout Detection Models\nInLayoutParser , a layout model takes a document image as an input and\ngenerates a list of rectangular boxes for the target content regions. Different\nfrom traditional methods, it relies on deep convolutional neural networks rather\nthan manually curated rules to identify content regions. It is formulated as an\nobject detection problem and state-of-the-art models like Faster R-CNN [ 28] and\nMask R-CNN [ 12] are used. This yields prediction results of high accuracy and\nmakes it possible to build a concise, generalized interface for layout detection.\nLayoutParser , built upon Detectron2 [ 35], provides a minimal API that can\nperform layout detection with only four lines of code in Python:\n1import layoutparser as lp\n2image = cv2. imread (" image_file ") # load images\n3model = lp. Detectron2LayoutModel (\n4 "lp :// PubLayNet / faster_rcnn_R_50_FPN_3x / config ")\n5layout = model . detect ( image )\nLayoutParser provides a wealth of pre-trained model weights using various\ndatasets covering different languages, time periods, and document types. Due to\ndomain shift [ 7], the prediction performance can notably drop when models are ap-\nplied to target samples that are significantly different from the training dataset. As\ndocument structures and layouts vary greatly in different domains, it is important\nto select models trained on a dataset similar to the test samples. A semantic syntax\nis used for initializing the model weights in LayoutParser , using both the dataset\nname and model name lp://<dataset-name>/<model-architecture-name> .'
|
||||
```
|
||||
|
||||
</CodeOutputBlock>
|
||||
|
||||
|
||||
## Using MathPix
|
||||
|
||||
Inspired by Daniel Gross's [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)
|
||||
|
||||
@@ -31,7 +31,8 @@ from langchain.text_splitter import (
|
||||
'markdown',
|
||||
'latex',
|
||||
'html',
|
||||
'sol',]
|
||||
'sol',
|
||||
'csharp']
|
||||
```
|
||||
|
||||
</CodeOutputBlock>
|
||||
@@ -342,3 +343,72 @@ sol_docs
|
||||
```
|
||||
|
||||
</CodeOutputBlock>
|
||||
|
||||
|
||||
## C#
|
||||
Here's an example using the C# text splitter:
|
||||
|
||||
```csharp
|
||||
using System;
|
||||
class Program
|
||||
{
|
||||
static void Main()
|
||||
{
|
||||
int age = 30; // Change the age value as needed
|
||||
|
||||
// Categorize the age without any console output
|
||||
if (age < 18)
|
||||
{
|
||||
// Age is under 18
|
||||
}
|
||||
else if (age >= 18 && age < 65)
|
||||
{
|
||||
// Age is an adult
|
||||
}
|
||||
else
|
||||
{
|
||||
// Age is a senior citizen
|
||||
}
|
||||
}
|
||||
}
|
||||
```
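The output below was produced by splitting this snippet with the C# flavor of the code splitter; a sketch of the call (the tiny `chunk_size` is illustrative, and `CSHARP_CODE` stands for the snippet above):

```python
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

csharp_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.CSHARP, chunk_size=16, chunk_overlap=0
)
csharp_docs = csharp_splitter.create_documents([CSHARP_CODE])
csharp_docs
```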
|
||||
|
||||
<CodeOutputBlock lang="python">
|
||||
|
||||
```
|
||||
[Document(page_content='using System;', metadata={}),
|
||||
Document(page_content='class Program\n{', metadata={}),
|
||||
Document(page_content='static void', metadata={}),
|
||||
Document(page_content='Main()', metadata={}),
|
||||
Document(page_content='{', metadata={}),
|
||||
Document(page_content='int age', metadata={}),
|
||||
Document(page_content='= 30; // Change', metadata={}),
|
||||
Document(page_content='the age value', metadata={}),
|
||||
Document(page_content='as needed', metadata={}),
|
||||
Document(page_content='//', metadata={}),
|
||||
Document(page_content='Categorize the', metadata={}),
|
||||
Document(page_content='age without any', metadata={}),
|
||||
Document(page_content='console output', metadata={}),
|
||||
Document(page_content='if (age', metadata={}),
|
||||
Document(page_content='< 18)', metadata={}),
|
||||
Document(page_content='{', metadata={}),
|
||||
Document(page_content='//', metadata={}),
|
||||
Document(page_content='Age is under 18', metadata={}),
|
||||
Document(page_content='}', metadata={}),
|
||||
Document(page_content='else if', metadata={}),
|
||||
Document(page_content='(age >= 18 &&', metadata={}),
|
||||
Document(page_content='age < 65)', metadata={}),
|
||||
Document(page_content='{', metadata={}),
|
||||
Document(page_content='//', metadata={}),
|
||||
Document(page_content='Age is an adult', metadata={}),
|
||||
Document(page_content='}', metadata={}),
|
||||
Document(page_content='else', metadata={}),
|
||||
Document(page_content='{', metadata={}),
|
||||
Document(page_content='//', metadata={}),
|
||||
Document(page_content='Age is a senior', metadata={}),
|
||||
Document(page_content='citizen', metadata={}),
|
||||
Document(page_content='}\n }', metadata={}),
|
||||
Document(page_content='}', metadata={})]
|
||||
```
|
||||
|
||||
</CodeOutputBlock>
|
||||
|
||||
@@ -1,10 +1,26 @@
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict
|
||||
from typing import Dict, List
|
||||
|
||||
from presidio_analyzer import RecognizerResult
|
||||
from presidio_anonymizer.entities import EngineResult
|
||||
|
||||
MappingDataType = Dict[str, Dict[str, str]]
|
||||
|
||||
|
||||
def format_duplicated_operator(operator_name: str, count: int) -> str:
|
||||
"""Format the operator name with the count"""
|
||||
|
||||
clean_operator_name = re.sub(r"[<>]", "", operator_name)
|
||||
clean_operator_name = re.sub(r"_\d+$", "", clean_operator_name)
|
||||
|
||||
if operator_name.startswith("<") and operator_name.endswith(">"):
|
||||
return f"<{clean_operator_name}_{count}>"
|
||||
else:
|
||||
return f"{clean_operator_name}_{count}"
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeanonymizerMapping:
|
||||
mapping: MappingDataType = field(
|
||||
@@ -17,5 +33,107 @@ class DeanonymizerMapping:
|
||||
return {k: dict(v) for k, v in self.mapping.items()}
|
||||
|
||||
def update(self, new_mapping: MappingDataType) -> None:
|
||||
"""Update the deanonymizer mapping with new values
|
||||
Duplicated values will not be added.
|
||||
If there are multiple entities of the same type, the mapping will
|
||||
include a count to differentiate them. For example, if there are
|
||||
two names in the input text, the mapping will include NAME_1 and NAME_2.
|
||||
"""
|
||||
seen_values = set()
|
||||
|
||||
for entity_type, values in new_mapping.items():
|
||||
self.mapping[entity_type].update(values)
|
||||
count = len(self.mapping[entity_type]) + 1
|
||||
|
||||
for key, value in values.items():
|
||||
if (
|
||||
value not in seen_values
|
||||
and value not in self.mapping[entity_type].values()
|
||||
):
|
||||
new_key = (
|
||||
format_duplicated_operator(key, count)
|
||||
if key in self.mapping[entity_type]
|
||||
else key
|
||||
)
|
||||
|
||||
self.mapping[entity_type][new_key] = value
|
||||
seen_values.add(value)
|
||||
count += 1
|
||||
|
||||
|
||||
def create_anonymizer_mapping(
|
||||
original_text: str,
|
||||
analyzer_results: List[RecognizerResult],
|
||||
anonymizer_results: EngineResult,
|
||||
is_reversed: bool = False,
|
||||
) -> MappingDataType:
|
||||
"""Creates or updates the mapping used to anonymize and/or deanonymize text.
|
||||
|
||||
This method exploits the results returned by the
|
||||
analysis and anonymization processes.
|
||||
|
||||
If is_reversed is True, it constructs a mapping from each original
|
||||
entity to its anonymized value.
|
||||
|
||||
If is_reversed is False, it constructs a mapping from each
|
||||
anonymized entity back to its original text value.
|
||||
|
||||
If there are multiple entities of the same type, the mapping will
|
||||
include a count to differentiate them. For example, if there are
|
||||
two names in the input text, the mapping will include NAME_1 and NAME_2.
|
||||
|
||||
Example of mapping:
|
||||
{
|
||||
"PERSON": {
|
||||
"<original>": "<anonymized>",
|
||||
"John Doe": "Slim Shady"
|
||||
},
|
||||
"PHONE_NUMBER": {
|
||||
"111-111-1111": "555-555-5555"
|
||||
}
|
||||
...
|
||||
}
|
||||
"""
|
||||
# We are able to zip and loop through both lists because we expect
|
||||
# them to return corresponding entities for each identified piece
|
||||
# of analyzable data from our input.
|
||||
|
||||
# We sort them by their 'start' attribute because it allows us to
|
||||
# match corresponding entities by their position in the input text.
|
||||
analyzer_results.sort(key=lambda d: d.start)
|
||||
anonymizer_results.items.sort(key=lambda d: d.start)
|
||||
|
||||
mapping: MappingDataType = defaultdict(dict)
|
||||
count: dict = defaultdict(int)
|
||||
|
||||
for analyzed, anonymized in zip(analyzer_results, anonymizer_results.items):
|
||||
original_value = original_text[analyzed.start : analyzed.end]
|
||||
entity_type = anonymized.entity_type
|
||||
|
||||
if is_reversed:
|
||||
cond = original_value in mapping[entity_type].values()
|
||||
else:
|
||||
cond = original_value in mapping[entity_type]
|
||||
|
||||
if cond:
|
||||
continue
|
||||
|
||||
if (
|
||||
anonymized.text in mapping[entity_type].values()
|
||||
or anonymized.text in mapping[entity_type]
|
||||
):
|
||||
anonymized_value = format_duplicated_operator(
|
||||
anonymized.text, count[entity_type] + 2
|
||||
)
|
||||
count[entity_type] += 1
|
||||
else:
|
||||
anonymized_value = anonymized.text
|
||||
|
||||
mapping_key, mapping_value = (
|
||||
(anonymized_value, original_value)
|
||||
if is_reversed
|
||||
else (original_value, anonymized_value)
|
||||
)
|
||||
|
||||
mapping[entity_type][mapping_key] = mapping_value
|
||||
|
||||
return mapping
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
|
||||
|
||||
@@ -14,6 +13,7 @@ from langchain_experimental.data_anonymizer.base import (
|
||||
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
|
||||
DeanonymizerMapping,
|
||||
MappingDataType,
|
||||
create_anonymizer_mapping,
|
||||
)
|
||||
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
||||
default_matching_strategy,
|
||||
@@ -43,8 +43,7 @@ except ImportError as e:
|
||||
) from e
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from presidio_analyzer import EntityRecognizer, RecognizerResult
|
||||
from presidio_anonymizer.entities import EngineResult
|
||||
from presidio_analyzer import EntityRecognizer
|
||||
|
||||
# Configuring Anonymizer for multiple languages
|
||||
# Detailed description and examples can be found here:
|
||||
@@ -69,6 +68,7 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
||||
analyzed_fields: Optional[List[str]] = None,
|
||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
||||
add_default_faker_operators: bool = True,
|
||||
faker_seed: Optional[int] = None,
|
||||
):
|
||||
"""
|
||||
@@ -93,10 +93,9 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
||||
if analyzed_fields is not None
|
||||
else list(get_pseudoanonymizer_mapping().keys())
|
||||
)
|
||||
self.operators = (
|
||||
operators
|
||||
if operators is not None
|
||||
else {
|
||||
|
||||
if add_default_faker_operators:
|
||||
self.operators = {
|
||||
field: OperatorConfig(
|
||||
operator_name="custom", params={"lambda": faker_function}
|
||||
)
|
||||
@@ -104,7 +103,11 @@ class PresidioAnonymizerBase(AnonymizerBase):
|
||||
faker_seed
|
||||
).items()
|
||||
}
|
||||
)
|
||||
else:
|
||||
self.operators = {}
|
||||
|
||||
if operators:
|
||||
self.add_operators(operators)
|
||||
|
||||
provider = NlpEngineProvider(nlp_configuration=languages_config)
|
||||
nlp_engine = provider.create_engine()
|
||||
@@ -140,109 +143,13 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
|
||||
Each PII entity is replaced with a fake value.
|
||||
Each time fake values will be different, as they are generated randomly.
|
||||
|
||||
Args:
|
||||
text: text to anonymize
|
||||
language: language to use for analysis of PII
|
||||
If None, the first (main) language in the list
|
||||
of languages specified in the configuration will be used.
|
||||
"""
|
||||
if language is None:
|
||||
language = self.supported_languages[0]
|
||||
|
||||
if language not in self.supported_languages:
|
||||
raise ValueError(
|
||||
f"Language '{language}' is not supported. "
|
||||
f"Supported languages are: {self.supported_languages}. "
|
||||
"Change your language configuration file to add more languages."
|
||||
)
|
||||
|
||||
results = self._analyzer.analyze(
|
||||
text,
|
||||
entities=self.analyzed_fields,
|
||||
language=language,
|
||||
)
|
||||
|
||||
return self._anonymizer.anonymize(
|
||||
text,
|
||||
analyzer_results=results,
|
||||
operators=self.operators,
|
||||
).text
|
||||
|
||||
|
||||
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
|
||||
def __init__(
|
||||
self,
|
||||
analyzed_fields: Optional[List[str]] = None,
|
||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
||||
faker_seed: Optional[int] = None,
|
||||
):
|
||||
super().__init__(analyzed_fields, operators, languages_config, faker_seed)
|
||||
self._deanonymizer_mapping = DeanonymizerMapping()
|
||||
|
||||
@property
|
||||
def deanonymizer_mapping(self) -> MappingDataType:
|
||||
"""Return the deanonymizer mapping"""
|
||||
return self._deanonymizer_mapping.data
|
||||
|
||||
def _update_deanonymizer_mapping(
|
||||
self,
|
||||
original_text: str,
|
||||
analyzer_results: List[RecognizerResult],
|
||||
anonymizer_results: EngineResult,
|
||||
) -> None:
|
||||
"""Creates or updates the mapping used to de-anonymize text.
|
||||
|
||||
This method exploits the results returned by the
|
||||
analysis and anonymization processes.
|
||||
|
||||
It constructs a mapping from each anonymized entity
|
||||
back to its original text value.
|
||||
|
||||
Mapping will be stored as "deanonymizer_mapping" property.
|
||||
|
||||
Example of "deanonymizer_mapping":
|
||||
{
|
||||
"PERSON": {
|
||||
"<anonymized>": "<original>",
|
||||
"John Doe": "Slim Shady"
|
||||
},
|
||||
"PHONE_NUMBER": {
|
||||
"111-111-1111": "555-555-5555"
|
||||
}
|
||||
...
|
||||
}
|
||||
"""
|
||||
|
||||
# We are able to zip and loop through both lists because we expect
|
||||
# them to return corresponding entities for each identified piece
|
||||
# of analyzable data from our input.
|
||||
|
||||
# We sort them by their 'start' attribute because it allows us to
|
||||
# match corresponding entities by their position in the input text.
|
||||
analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
|
||||
anonymizer_results.items = sorted(
|
||||
anonymizer_results.items, key=lambda d: d.start
|
||||
)
|
||||
|
||||
new_deanonymizer_mapping: MappingDataType = defaultdict(dict)
|
||||
|
||||
for analyzed_entity, anonymized_entity in zip(
|
||||
analyzer_results, anonymizer_results.items
|
||||
):
|
||||
original_value = original_text[analyzed_entity.start : analyzed_entity.end]
|
||||
new_deanonymizer_mapping[anonymized_entity.entity_type][
|
||||
anonymized_entity.text
|
||||
] = original_value
|
||||
|
||||
self._deanonymizer_mapping.update(new_deanonymizer_mapping)
|
||||
|
||||
def _anonymize(self, text: str, language: Optional[str] = None) -> str:
|
||||
"""Anonymize text.
|
||||
Each PII entity is replaced with a fake value.
|
||||
Each time fake values will be different, as they are generated randomly.
|
||||
At the same time, we will create a mapping from each anonymized entity
|
||||
back to its original text value.
|
||||
PresidioAnonymizer has no built-in memory -
|
||||
so it will not remember the effects of anonymizing previous texts.
|
||||
>>> anonymizer = PresidioAnonymizer()
|
||||
>>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
|
||||
'My name is Noah Rhodes. Hi Noah Rhodes!'
|
||||
>>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
|
||||
'My name is Brett Russell. Hi Brett Russell!'
|
||||
|
||||
Args:
|
||||
text: text to anonymize
|
||||
@@ -278,11 +185,104 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
|
||||
operators=self.operators,
|
||||
)
|
||||
|
||||
self._update_deanonymizer_mapping(
|
||||
text, filtered_analyzer_results, anonymizer_results
|
||||
anonymizer_mapping = create_anonymizer_mapping(
|
||||
text,
|
||||
filtered_analyzer_results,
|
||||
anonymizer_results,
|
||||
)
|
||||
return default_matching_strategy(text, anonymizer_mapping)
|
||||
|
||||
|
||||
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
|
||||
def __init__(
|
||||
self,
|
||||
analyzed_fields: Optional[List[str]] = None,
|
||||
operators: Optional[Dict[str, OperatorConfig]] = None,
|
||||
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
|
||||
add_default_faker_operators: bool = True,
|
||||
faker_seed: Optional[int] = None,
|
||||
):
|
||||
super().__init__(
|
||||
analyzed_fields,
|
||||
operators,
|
||||
languages_config,
|
||||
add_default_faker_operators,
|
||||
faker_seed,
|
||||
)
|
||||
self._deanonymizer_mapping = DeanonymizerMapping()
|
||||
|
||||
@property
|
||||
def deanonymizer_mapping(self) -> MappingDataType:
|
||||
"""Return the deanonymizer mapping"""
|
||||
return self._deanonymizer_mapping.data
|
||||
|
||||
@property
|
||||
def anonymizer_mapping(self) -> MappingDataType:
|
||||
"""Return the anonymizer mapping
|
||||
This is just the reverse version of the deanonymizer mapping."""
|
||||
return {
|
||||
key: {v: k for k, v in inner_dict.items()}
|
||||
for key, inner_dict in self.deanonymizer_mapping.items()
|
||||
}
|
||||
|
||||
def _anonymize(self, text: str, language: Optional[str] = None) -> str:
|
||||
"""Anonymize text.
|
||||
Each PII entity is replaced with a fake value.
|
||||
Each time fake values will be different, as they are generated randomly.
|
||||
At the same time, we will create a mapping from each anonymized entity
|
||||
back to its original text value.
|
||||
|
||||
Thanks to the built-in memory, all previously anonymized entities
|
||||
will be remembered and replaced by the same fake values:
|
||||
>>> anonymizer = PresidioReversibleAnonymizer()
|
||||
>>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
|
||||
'My name is Noah Rhodes. Hi Noah Rhodes!'
|
||||
>>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
|
||||
'My name is Noah Rhodes. Hi Noah Rhodes!'
|
||||
|
||||
Args:
|
||||
text: text to anonymize
|
||||
language: language to use for analysis of PII
|
||||
If None, the first (main) language in the list
|
||||
of languages specified in the configuration will be used.
|
||||
"""
|
||||
if language is None:
|
||||
language = self.supported_languages[0]
|
||||
|
||||
if language not in self.supported_languages:
|
||||
raise ValueError(
|
||||
f"Language '{language}' is not supported. "
|
||||
f"Supported languages are: {self.supported_languages}. "
|
||||
"Change your language configuration file to add more languages."
|
||||
)
|
||||
|
||||
analyzer_results = self._analyzer.analyze(
|
||||
text,
|
||||
entities=self.analyzed_fields,
|
||||
language=language,
|
||||
)
|
||||
|
||||
return anonymizer_results.text
|
||||
filtered_analyzer_results = (
|
||||
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
|
||||
analyzer_results
|
||||
)
|
||||
)
|
||||
|
||||
anonymizer_results = self._anonymizer.anonymize(
|
||||
text,
|
||||
analyzer_results=analyzer_results,
|
||||
operators=self.operators,
|
||||
)
|
||||
|
||||
new_deanonymizer_mapping = create_anonymizer_mapping(
|
||||
text,
|
||||
filtered_analyzer_results,
|
||||
anonymizer_results,
|
||||
is_reversed=True,
|
||||
)
|
||||
self._deanonymizer_mapping.update(new_deanonymizer_mapping)
|
||||
|
||||
return default_matching_strategy(text, self.anonymizer_mapping)
|
||||
|
||||
def _deanonymize(
|
||||
self,
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
"""Chain that interprets a prompt and executes python code to do math.
|
||||
|
||||
Heavily borrowed from llm_math; a wrapper around SymPy.
|
||||
"""
|
||||
@@ -0,0 +1,157 @@
|
||||
"""Chain that interprets a prompt and executes python code to do symbolic math."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain.base_language import BaseLanguageModel
|
||||
from langchain.callbacks.manager import (
|
||||
AsyncCallbackManagerForChainRun,
|
||||
CallbackManagerForChainRun,
|
||||
)
|
||||
from langchain.chains.base import Chain
|
||||
from langchain.chains.llm import LLMChain
|
||||
from langchain.prompts.base import BasePromptTemplate
|
||||
|
||||
from langchain_experimental.llm_symbolic_math.prompt import PROMPT
|
||||
from langchain_experimental.pydantic_v1 import Extra
|
||||
|
||||
|
||||
class LLMSymbolicMathChain(Chain):
|
||||
"""Chain that interprets a prompt and executes python code to do symbolic math.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain.chains import LLMSymbolicMathChain
|
||||
from langchain.llms import OpenAI
|
||||
llm_symbolic_math = LLMSymbolicMathChain.from_llm(OpenAI())
|
||||
"""
|
||||
|
||||
llm_chain: LLMChain
|
||||
input_key: str = "question" #: :meta private:
|
||||
output_key: str = "answer" #: :meta private:
|
||||
|
||||
class Config:
|
||||
"""Configuration for this pydantic object."""
|
||||
|
||||
extra = Extra.forbid
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
@property
|
||||
def input_keys(self) -> List[str]:
|
||||
"""Expect input key.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return [self.input_key]
|
||||
|
||||
@property
|
||||
def output_keys(self) -> List[str]:
|
||||
"""Expect output key.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return [self.output_key]
|
||||
|
||||
def _evaluate_expression(self, expression: str) -> str:
|
||||
try:
|
||||
import sympy
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"Unable to import sympy, please install it with `pip install sympy`."
|
||||
) from e
|
||||
try:
|
||||
output = str(sympy.sympify(expression, evaluate=True))
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f'LLMSymbolicMathChain._evaluate("{expression}") raised error: {e}.'
|
||||
" Please try again with a valid numerical expression"
|
||||
)
|
||||
|
||||
# Remove any leading and trailing brackets from the output
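# e.g. "[0, 1]" becomes "0, 1"; output without brackets is unchanged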
|
||||
return re.sub(r"^\[|\]$", "", output)
|
||||
|
||||
def _process_llm_result(
|
||||
self, llm_output: str, run_manager: CallbackManagerForChainRun
|
||||
) -> Dict[str, str]:
|
||||
run_manager.on_text(llm_output, color="green", verbose=self.verbose)
|
||||
llm_output = llm_output.strip()
|
||||
text_match = re.search(r"^```text(.*?)```", llm_output, re.DOTALL)
|
||||
if text_match:
|
||||
expression = text_match.group(1)
|
||||
output = self._evaluate_expression(expression)
|
||||
run_manager.on_text("\nAnswer: ", verbose=self.verbose)
|
||||
run_manager.on_text(output, color="yellow", verbose=self.verbose)
|
||||
answer = "Answer: " + output
|
||||
elif llm_output.startswith("Answer:"):
|
||||
answer = llm_output
|
||||
elif "Answer:" in llm_output:
|
||||
answer = "Answer: " + llm_output.split("Answer:")[-1]
|
||||
else:
|
||||
raise ValueError(f"unknown format from LLM: {llm_output}")
|
||||
return {self.output_key: answer}
|
||||
|
||||
async def _aprocess_llm_result(
|
||||
self,
|
||||
llm_output: str,
|
||||
run_manager: AsyncCallbackManagerForChainRun,
|
||||
) -> Dict[str, str]:
|
||||
await run_manager.on_text(llm_output, color="green", verbose=self.verbose)
|
||||
llm_output = llm_output.strip()
|
||||
text_match = re.search(r"^```text(.*?)```", llm_output, re.DOTALL)
|
||||
if text_match:
|
||||
expression = text_match.group(1)
|
||||
output = self._evaluate_expression(expression)
|
||||
await run_manager.on_text("\nAnswer: ", verbose=self.verbose)
|
||||
await run_manager.on_text(output, color="yellow", verbose=self.verbose)
|
||||
answer = "Answer: " + output
|
||||
elif llm_output.startswith("Answer:"):
|
||||
answer = llm_output
|
||||
elif "Answer:" in llm_output:
|
||||
answer = "Answer: " + llm_output.split("Answer:")[-1]
|
||||
else:
|
||||
raise ValueError(f"unknown format from LLM: {llm_output}")
|
||||
return {self.output_key: answer}
|
||||
|
||||
def _call(
|
||||
self,
|
||||
inputs: Dict[str, str],
|
||||
run_manager: Optional[CallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, str]:
|
||||
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
|
||||
_run_manager.on_text(inputs[self.input_key])
|
||||
llm_output = self.llm_chain.predict(
|
||||
question=inputs[self.input_key],
|
||||
stop=["```output"],
|
||||
callbacks=_run_manager.get_child(),
|
||||
)
|
||||
return self._process_llm_result(llm_output, _run_manager)
|
||||
|
||||
async def _acall(
|
||||
self,
|
||||
inputs: Dict[str, str],
|
||||
run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
|
||||
) -> Dict[str, str]:
|
||||
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
|
||||
await _run_manager.on_text(inputs[self.input_key])
|
||||
llm_output = await self.llm_chain.apredict(
|
||||
question=inputs[self.input_key],
|
||||
stop=["```output"],
|
||||
callbacks=_run_manager.get_child(),
|
||||
)
|
||||
return await self._aprocess_llm_result(llm_output, _run_manager)
|
||||
|
||||
@property
|
||||
def _chain_type(self) -> str:
|
||||
return "llm_symbolic_math_chain"
|
||||
|
||||
@classmethod
|
||||
def from_llm(
|
||||
cls,
|
||||
llm: BaseLanguageModel,
|
||||
prompt: BasePromptTemplate = PROMPT,
|
||||
**kwargs: Any,
|
||||
) -> LLMSymbolicMathChain:
|
||||
llm_chain = LLMChain(llm=llm, prompt=prompt)
|
||||
return cls(llm_chain=llm_chain, **kwargs)
|
||||
@@ -0,0 +1,51 @@
|
||||
# flake8: noqa
|
||||
from langchain.prompts.prompt import PromptTemplate
|
||||
|
||||
_PROMPT_TEMPLATE = """Translate a math problem into a expression that can be executed using Python's SymPy library. Use the output of running this code to answer the question.
|
||||
|
||||
Question: ${{Question with math problem.}}
|
||||
```text
|
||||
${{single line sympy expression that solves the problem}}
|
||||
```
|
||||
...sympy.sympify(text, evaluate=True)...
|
||||
```output
|
||||
${{Output of running the code}}
|
||||
```
|
||||
Answer: ${{Answer}}
|
||||
|
||||
Begin.
|
||||
|
||||
Question: What is the limit of sin(x) / x as x goes to 0
|
||||
```text
|
||||
limit(sin(x)/x, x, 0)
|
||||
```
|
||||
...sympy.sympify("limit(sin(x)/x, x, 0)")...
|
||||
```output
|
||||
1
|
||||
```
|
||||
Answer: 1
|
||||
|
||||
Question: What is the integral of e^-x from 0 to infinity
|
||||
```text
|
||||
integrate(exp(-x), (x, 0, oo))
|
||||
```
|
||||
...sympy.sympify("integrate(exp(-x), (x, 0, oo))")...
|
||||
```output
|
||||
1
|
||||
```
Answer: 1
|
||||
|
||||
Question: What are the solutions to this equation x**2 - x?
|
||||
```text
|
||||
solveset(x**2 - x, x)
|
||||
```
|
||||
...sympy.sympify("solveset(x**2 - x, x)")...
|
||||
```output
|
||||
[0, 1]
|
||||
```
|
||||
Question: {question}
|
||||
"""
|
||||
|
||||
PROMPT = PromptTemplate(
|
||||
input_variables=["question"],
|
||||
template=_PROMPT_TEMPLATE,
|
||||
)
|
||||
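For orientation, a minimal sketch of how this prompt is exercised end to end; the `OpenAI` model choice is an illustrative assumption, not part of this diff.

# Editor's sketch, not part of the diff: exercising the symbolic math chain.
from langchain.llms import OpenAI  # any BaseLanguageModel works
from langchain_experimental.llm_symbolic_math.base import LLMSymbolicMathChain

llm = OpenAI(temperature=0)  # assumed model; requires OPENAI_API_KEY
chain = LLMSymbolicMathChain.from_llm(llm)
# The LLM emits a ```text ...``` block, the chain evaluates it with SymPy,
# and run() returns a string like "Answer: 2*sin(x)*cos(x)".
print(chain.run("What is the derivative of sin(x)**2?"))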
@@ -0,0 +1,54 @@
import logging

from langchain_experimental.rl_chain.base import (
    AutoSelectionScorer,
    BasedOn,
    Embed,
    Embedder,
    Policy,
    SelectionScorer,
    ToSelectFrom,
    VwPolicy,
    embed,
    stringify_embedding,
)
from langchain_experimental.rl_chain.pick_best_chain import (
    PickBest,
    PickBestEvent,
    PickBestFeatureEmbedder,
    PickBestRandomPolicy,
    PickBestSelected,
)


def configure_logger() -> None:
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    ch.setFormatter(formatter)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)


configure_logger()

__all__ = [
    "PickBest",
    "PickBestEvent",
    "PickBestSelected",
    "PickBestFeatureEmbedder",
    "PickBestRandomPolicy",
    "Embed",
    "BasedOn",
    "ToSelectFrom",
    "SelectionScorer",
    "AutoSelectionScorer",
    "Embedder",
    "Policy",
    "VwPolicy",
    "embed",
    "stringify_embedding",
]
libs/experimental/langchain_experimental/rl_chain/base.py (new file, 635 lines)
@@ -0,0 +1,635 @@
from __future__ import annotations

import logging
import os
from abc import ABC, abstractmethod
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Generic,
    List,
    Optional,
    Tuple,
    Type,
    TypeVar,
    Union,
)

from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain.prompts import (
    BasePromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

from langchain_experimental.pydantic_v1 import BaseModel, Extra, root_validator
from langchain_experimental.rl_chain.metrics import (
    MetricsTrackerAverage,
    MetricsTrackerRollingWindow,
)
from langchain_experimental.rl_chain.model_repository import ModelRepository
from langchain_experimental.rl_chain.vw_logger import VwLogger

if TYPE_CHECKING:
    import vowpal_wabbit_next as vw

logger = logging.getLogger(__name__)


class _BasedOn:
    def __init__(self, value: Any):
        self.value = value

    def __str__(self) -> str:
        return str(self.value)

    __repr__ = __str__


def BasedOn(anything: Any) -> _BasedOn:
    return _BasedOn(anything)


class _ToSelectFrom:
    def __init__(self, value: Any):
        self.value = value

    def __str__(self) -> str:
        return str(self.value)

    __repr__ = __str__


def ToSelectFrom(anything: Any) -> _ToSelectFrom:
    if not isinstance(anything, list):
        raise ValueError("ToSelectFrom must be a list to select from")
    return _ToSelectFrom(anything)


class _Embed:
    def __init__(self, value: Any, keep: bool = False):
        self.value = value
        self.keep = keep

    def __str__(self) -> str:
        return str(self.value)

    __repr__ = __str__


def Embed(anything: Any, keep: bool = False) -> Any:
    if isinstance(anything, _ToSelectFrom):
        return ToSelectFrom(Embed(anything.value, keep=keep))
    elif isinstance(anything, _BasedOn):
        return BasedOn(Embed(anything.value, keep=keep))
    if isinstance(anything, list):
        return [Embed(v, keep=keep) for v in anything]
    elif isinstance(anything, dict):
        return {k: Embed(v, keep=keep) for k, v in anything.items()}
    elif isinstance(anything, _Embed):
        return anything
    return _Embed(anything, keep=keep)


def EmbedAndKeep(anything: Any) -> Any:
    return Embed(anything, keep=True)


# helper functions


def stringify_embedding(embedding: List) -> str:
    return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)])

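# Editor's note (not part of the diff): stringify_embedding renders a dense
# vector in VW's sparse "index:value" text format, e.g.
#     stringify_embedding([0.5, 1.25, -2.0]) == "0:0.5 1:1.25 2:-2.0"
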
def parse_lines(parser: "vw.TextFormatParser", input_str: str) -> List["vw.Example"]:
    return [parser.parse_line(line) for line in input_str.split("\n")]


def get_based_on_and_to_select_from(inputs: Dict[str, Any]) -> Tuple[Dict, Dict]:
    to_select_from = {
        k: inputs[k].value
        for k in inputs.keys()
        if isinstance(inputs[k], _ToSelectFrom)
    }

    if not to_select_from:
        raise ValueError(
            "No variables using 'ToSelectFrom' found in the inputs. Please include at least one variable containing a list to select from."  # noqa: E501
        )

    based_on = {
        k: inputs[k].value if isinstance(inputs[k].value, list) else [inputs[k].value]
        for k in inputs.keys()
        if isinstance(inputs[k], _BasedOn)
    }

    return based_on, to_select_from


def prepare_inputs_for_autoembed(inputs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Go over all the inputs and, if a value is wrapped in _ToSelectFrom or _BasedOn
    and its inner value is not already _Embed, wrap it in EmbedAndKeep while
    retaining its _ToSelectFrom or _BasedOn status.
    """  # noqa: E501

    next_inputs = inputs.copy()
    for k, v in next_inputs.items():
        if isinstance(v, _ToSelectFrom) or isinstance(v, _BasedOn):
            if not isinstance(v.value, _Embed):
                next_inputs[k].value = EmbedAndKeep(v.value)
    return next_inputs


# end helper functions

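# Editor's sketch (not part of the diff): how the wrappers above combine.
# get_based_on_and_to_select_from splits tagged inputs back apart, wrapping
# scalar BasedOn values in a list:
#
#     inputs = {"user": BasedOn("Tom"), "meal": ToSelectFrom(["pizza", "sushi"])}
#     based_on, to_select_from = get_based_on_and_to_select_from(inputs)
#     # based_on == {"user": ["Tom"]}
#     # to_select_from == {"meal": ["pizza", "sushi"]}
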
class Selected(ABC):
    pass


TSelected = TypeVar("TSelected", bound=Selected)


class Event(Generic[TSelected], ABC):
    inputs: Dict[str, Any]
    selected: Optional[TSelected]

    def __init__(self, inputs: Dict[str, Any], selected: Optional[TSelected] = None):
        self.inputs = inputs
        self.selected = selected


TEvent = TypeVar("TEvent", bound=Event)


class Policy(Generic[TEvent], ABC):
    def __init__(self, **kwargs: Any):
        pass

    @abstractmethod
    def predict(self, event: TEvent) -> Any:
        ...

    @abstractmethod
    def learn(self, event: TEvent) -> None:
        ...

    @abstractmethod
    def log(self, event: TEvent) -> None:
        ...

    def save(self) -> None:
        pass


class VwPolicy(Policy):
    def __init__(
        self,
        model_repo: ModelRepository,
        vw_cmd: List[str],
        feature_embedder: Embedder,
        vw_logger: VwLogger,
        *args: Any,
        **kwargs: Any,
    ):
        super().__init__(*args, **kwargs)
        self.model_repo = model_repo
        self.workspace = self.model_repo.load(vw_cmd)
        self.feature_embedder = feature_embedder
        self.vw_logger = vw_logger

    def predict(self, event: TEvent) -> Any:
        import vowpal_wabbit_next as vw

        text_parser = vw.TextFormatParser(self.workspace)
        return self.workspace.predict_one(
            parse_lines(text_parser, self.feature_embedder.format(event))
        )

    def learn(self, event: TEvent) -> None:
        import vowpal_wabbit_next as vw

        vw_ex = self.feature_embedder.format(event)
        text_parser = vw.TextFormatParser(self.workspace)
        multi_ex = parse_lines(text_parser, vw_ex)
        self.workspace.learn_one(multi_ex)

    def log(self, event: TEvent) -> None:
        if self.vw_logger.logging_enabled():
            vw_ex = self.feature_embedder.format(event)
            self.vw_logger.log(vw_ex)

    def save(self) -> None:
        self.model_repo.save(self.workspace)


class Embedder(Generic[TEvent], ABC):
    def __init__(self, *args: Any, **kwargs: Any):
        pass

    @abstractmethod
    def format(self, event: TEvent) -> str:
        ...


class SelectionScorer(Generic[TEvent], ABC, BaseModel):
    """Abstract class to grade the chosen selection or the response of the LLM."""

    @abstractmethod
    def score_response(
        self, inputs: Dict[str, Any], llm_response: str, event: TEvent
    ) -> float:
        ...


class AutoSelectionScorer(SelectionScorer[Event], BaseModel):
    llm_chain: LLMChain
    prompt: Union[BasePromptTemplate, None] = None
    scoring_criteria_template_str: Optional[str] = None

    @staticmethod
    def get_default_system_prompt() -> SystemMessagePromptTemplate:
        return SystemMessagePromptTemplate.from_template(
            "PLEASE RESPOND ONLY WITH A SINGLE FLOAT AND NO OTHER TEXT EXPLANATION\n \
            You are a strict judge that is called on to rank a response based on \
            given criteria. You must respond with your ranking by providing a \
            single float within the range [0, 1], 0 being very bad \
            response and 1 being very good response."
        )

    @staticmethod
    def get_default_prompt() -> ChatPromptTemplate:
        human_template = 'Given this based_on "{rl_chain_selected_based_on}" \
            as the most important attribute, rank how good or bad this text is: \
            "{rl_chain_selected}".'
        human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
        default_system_prompt = AutoSelectionScorer.get_default_system_prompt()
        chat_prompt = ChatPromptTemplate.from_messages(
            [default_system_prompt, human_message_prompt]
        )
        return chat_prompt

    @root_validator(pre=True)
    def set_prompt_and_llm_chain(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        llm = values.get("llm")
        prompt = values.get("prompt")
        scoring_criteria_template_str = values.get("scoring_criteria_template_str")
        if prompt is None and scoring_criteria_template_str is None:
            prompt = AutoSelectionScorer.get_default_prompt()
        elif prompt is None and scoring_criteria_template_str is not None:
            human_message_prompt = HumanMessagePromptTemplate.from_template(
                scoring_criteria_template_str
            )
            default_system_prompt = AutoSelectionScorer.get_default_system_prompt()
            prompt = ChatPromptTemplate.from_messages(
                [default_system_prompt, human_message_prompt]
            )
        values["prompt"] = prompt
        values["llm_chain"] = LLMChain(llm=llm, prompt=prompt)
        return values

    def score_response(
        self, inputs: Dict[str, Any], llm_response: str, event: Event
    ) -> float:
        ranking = self.llm_chain.predict(llm_response=llm_response, **inputs)
        ranking = ranking.strip()
        try:
            resp = float(ranking)
            return resp
        except Exception as e:
            raise RuntimeError(
                f"The auto selection scorer did not manage to score the response; you can try again or tweak the reward prompt. Error: {e}"  # noqa: E501
            )

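# Editor's sketch (not part of the diff): AutoSelectionScorer in isolation.
# The root validator above builds the llm_chain, so only `llm` is required;
# the LLM's reply is parsed as a float (here FakeListChatModel always says 0.75):
#
#     from langchain.chat_models import FakeListChatModel
#     scorer = AutoSelectionScorer(llm=FakeListChatModel(responses=["0.75"]))
#     # scorer.score_response(inputs=..., llm_response=..., event=...) -> 0.75
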
class RLChain(Chain, Generic[TEvent]):
    """
    The `RLChain` class leverages the Vowpal Wabbit (VW) model as a learned policy for reinforcement learning.

    Attributes:
        - llm_chain (Chain): Represents the underlying Language Model chain.
        - prompt (BasePromptTemplate): The template for the base prompt.
        - selection_scorer (Union[SelectionScorer, None]): Scorer for the selection. Can be set to None.
        - policy (Optional[Policy]): The policy used by the chain to learn to populate a dynamic prompt.
        - auto_embed (bool): Determines if embedding should be automatic. Default is False.
        - metrics (Optional[Union[MetricsTrackerRollingWindow, MetricsTrackerAverage]]): Tracker for metrics, can be set to None.

    Initialization Attributes:
        - feature_embedder (Embedder): Embedder used for the `BasedOn` and `ToSelectFrom` inputs.
        - model_save_dir (str, optional): Directory for saving the VW model. Default is the current directory.
        - reset_model (bool): If set to True, the model starts training from scratch. Default is False.
        - vw_cmd (List[str], optional): Command line arguments for the VW model.
        - policy (Type[VwPolicy]): Policy used by the chain.
        - vw_logs (Optional[Union[str, os.PathLike]]): Path for the VW logs.
        - metrics_step (int): Step for the metrics tracker. Default is -1. If set without metrics_window_size, average metrics will be tracked, otherwise rolling window metrics will be tracked.
        - metrics_window_size (int): Window size for the metrics tracker. Default is -1. If set, rolling window metrics will be tracked.

    Notes:
        The class initializes the VW model using the provided arguments. If `selection_scorer` is not provided, a warning is logged, indicating that no reinforcement learning will occur unless the `update_with_delayed_score` method is called.
    """  # noqa: E501

    class _NoOpPolicy(Policy):
        """Placeholder policy that does nothing"""

        def predict(self, event: TEvent) -> Any:
            return None

        def learn(self, event: TEvent) -> None:
            pass

        def log(self, event: TEvent) -> None:
            pass

    llm_chain: Chain

    output_key: str = "result"  #: :meta private:
    prompt: BasePromptTemplate
    selection_scorer: Union[SelectionScorer, None]
    active_policy: Policy = _NoOpPolicy()
    auto_embed: bool = False
    selection_scorer_activated: bool = True
    selected_input_key = "rl_chain_selected"
    selected_based_on_input_key = "rl_chain_selected_based_on"
    metrics: Optional[Union[MetricsTrackerRollingWindow, MetricsTrackerAverage]] = None

    def __init__(
        self,
        feature_embedder: Embedder,
        model_save_dir: str = "./",
        reset_model: bool = False,
        vw_cmd: Optional[List[str]] = None,
        policy: Type[Policy] = VwPolicy,
        vw_logs: Optional[Union[str, os.PathLike]] = None,
        metrics_step: int = -1,
        metrics_window_size: int = -1,
        *args: Any,
        **kwargs: Any,
    ):
        super().__init__(*args, **kwargs)
        if self.selection_scorer is None:
            logger.warning(
                "No selection scorer provided, which means that no \
                reinforcement learning will be done in the RL chain \
                unless update_with_delayed_score is called."
            )

        if isinstance(self.active_policy, RLChain._NoOpPolicy):
            self.active_policy = policy(
                model_repo=ModelRepository(
                    model_save_dir, with_history=True, reset=reset_model
                ),
                vw_cmd=vw_cmd or [],
                feature_embedder=feature_embedder,
                vw_logger=VwLogger(vw_logs),
            )

        if metrics_window_size > 0:
            self.metrics = MetricsTrackerRollingWindow(
                step=metrics_step, window_size=metrics_window_size
            )
        else:
            self.metrics = MetricsTrackerAverage(step=metrics_step)

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Expect input key.

        :meta private:
        """
        return []

    @property
    def output_keys(self) -> List[str]:
        """Expect output key.

        :meta private:
        """
        return [self.output_key]

    def update_with_delayed_score(
        self, score: float, chain_response: Dict[str, Any], force_score: bool = False
    ) -> None:
        """
        Updates the learned policy with the score provided.
        Will raise an error if selection_scorer is set and force_score=True was not provided during the method call.
        """  # noqa: E501
        if self._can_use_selection_scorer() and not force_score:
            raise RuntimeError(
                "The selection scorer is set, and force_score was not set to True. Please set force_score=True to use this function."  # noqa: E501
            )
        if self.metrics:
            self.metrics.on_feedback(score)
        event: TEvent = chain_response["selection_metadata"]
        self._call_after_scoring_before_learning(event=event, score=score)
        self.active_policy.learn(event=event)
        self.active_policy.log(event=event)

    def deactivate_selection_scorer(self) -> None:
        """
        Deactivates the selection scorer, meaning that the chain will no longer attempt to use the selection scorer to score responses.
        """  # noqa: E501
        self.selection_scorer_activated = False

    def activate_selection_scorer(self) -> None:
        """
        Activates the selection scorer, meaning that the chain will attempt to use the selection scorer to score responses.
        """  # noqa: E501
        self.selection_scorer_activated = True

    def save_progress(self) -> None:
        """
        This function should be called to save the state of the learned policy model.
        """  # noqa: E501
        self.active_policy.save()

    def _validate_inputs(self, inputs: Dict[str, Any]) -> None:
        super()._validate_inputs(inputs)
        if (
            self.selected_input_key in inputs.keys()
            or self.selected_based_on_input_key in inputs.keys()
        ):
            raise ValueError(
                f"The rl chain does not accept '{self.selected_input_key}' or '{self.selected_based_on_input_key}' as input keys, they are reserved for internal use during auto reward."  # noqa: E501
            )

    def _can_use_selection_scorer(self) -> bool:
        """
        Returns whether the chain can use the selection scorer to score responses or not.
        """  # noqa: E501
        return self.selection_scorer is not None and self.selection_scorer_activated

    @abstractmethod
    def _call_before_predict(self, inputs: Dict[str, Any]) -> TEvent:
        ...

    @abstractmethod
    def _call_after_predict_before_llm(
        self, inputs: Dict[str, Any], event: TEvent, prediction: Any
    ) -> Tuple[Dict[str, Any], TEvent]:
        ...

    @abstractmethod
    def _call_after_llm_before_scoring(
        self, llm_response: str, event: TEvent
    ) -> Tuple[Dict[str, Any], TEvent]:
        ...

    @abstractmethod
    def _call_after_scoring_before_learning(
        self, event: TEvent, score: Optional[float]
    ) -> TEvent:
        ...

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()

        event: TEvent = self._call_before_predict(inputs=inputs)
        prediction = self.active_policy.predict(event=event)
        if self.metrics:
            self.metrics.on_decision()

        next_chain_inputs, event = self._call_after_predict_before_llm(
            inputs=inputs, event=event, prediction=prediction
        )

        t = self.llm_chain.run(**next_chain_inputs, callbacks=_run_manager.get_child())
        _run_manager.on_text(t, color="green", verbose=self.verbose)
        t = t.strip()

        if self.verbose:
            _run_manager.on_text("\nCode: ", verbose=self.verbose)

        output = t
        _run_manager.on_text("\nAnswer: ", verbose=self.verbose)
        _run_manager.on_text(output, color="yellow", verbose=self.verbose)

        next_chain_inputs, event = self._call_after_llm_before_scoring(
            llm_response=output, event=event
        )

        score = None
        try:
            if self._can_use_selection_scorer():
                score = self.selection_scorer.score_response(  # type: ignore
                    inputs=next_chain_inputs, llm_response=output, event=event
                )
        except Exception as e:
            logger.info(
                f"The selection scorer was not able to score, \
                and the chain was not able to adjust to this response, error: {e}"
            )
        if self.metrics and score is not None:
            self.metrics.on_feedback(score)

        event = self._call_after_scoring_before_learning(score=score, event=event)
        self.active_policy.learn(event=event)
        self.active_policy.log(event=event)

        return {self.output_key: {"response": output, "selection_metadata": event}}

    @property
    def _chain_type(self) -> str:
        return "llm_personalizer_chain"

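# Editor's sketch (not part of the diff): the delayed-feedback path defined
# above. With selection_scorer=None the score stays None at call time and is
# supplied later:
#
#     response = chain.run(...)  # {"response": ..., "selection_metadata": event}
#     # ... observe the real-world outcome ...
#     chain.update_with_delayed_score(score=1.0, chain_response=response)
#     chain.save_progress()  # persist the VW workspace via the model repository
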
def is_stringtype_instance(item: Any) -> bool:
    """Helper function to check if an item is a string."""
    return isinstance(item, str) or (
        isinstance(item, _Embed) and isinstance(item.value, str)
    )


def embed_string_type(
    item: Union[str, _Embed], model: Any, namespace: Optional[str] = None
) -> Dict[str, Union[str, List[str]]]:
    """Helper function to embed a string or an _Embed object."""
    keep_str = ""
    if isinstance(item, _Embed):
        encoded = stringify_embedding(model.encode(item.value))
        if item.keep:
            keep_str = item.value.replace(" ", "_") + " "
    elif isinstance(item, str):
        encoded = item.replace(" ", "_")
    else:
        raise ValueError(f"Unsupported type {type(item)} for embedding")

    if namespace is None:
        raise ValueError(
            "The default namespace must be provided when embedding a string or _Embed object."  # noqa: E501
        )

    return {namespace: keep_str + encoded}


def embed_dict_type(item: Dict, model: Any) -> Dict[str, Any]:
    """Helper function to embed a dictionary item."""
    inner_dict: Dict = {}
    for ns, embed_item in item.items():
        if isinstance(embed_item, list):
            inner_dict[ns] = []
            for embed_list_item in embed_item:
                embedded = embed_string_type(embed_list_item, model, ns)
                inner_dict[ns].append(embedded[ns])
        else:
            inner_dict.update(embed_string_type(embed_item, model, ns))
    return inner_dict


def embed_list_type(
    item: list, model: Any, namespace: Optional[str] = None
) -> List[Dict[str, Union[str, List[str]]]]:
    ret_list: List = []
    for embed_item in item:
        if isinstance(embed_item, dict):
            ret_list.append(embed_dict_type(embed_item, model))
        elif isinstance(embed_item, list):
            item_embedding = embed_list_type(embed_item, model, namespace)
            # Get the first key from the first dictionary
            first_key = next(iter(item_embedding[0]))
            # Group the values under that key
            grouping = {first_key: [item[first_key] for item in item_embedding]}
            ret_list.append(grouping)
        else:
            ret_list.append(embed_string_type(embed_item, model, namespace))
    return ret_list


def embed(
    to_embed: Union[Union[str, _Embed], Dict, List[Union[str, _Embed]], List[Dict]],
    model: Any,
    namespace: Optional[str] = None,
) -> List[Dict[str, Union[str, List[str]]]]:
    """
    Embeds the actions or context using the SentenceTransformer model (or a model that has an `encode` function).

    Attributes:
        to_embed: (Union[Union(str, _Embed(str)), Dict, List[Union(str, _Embed(str))], List[Dict]], required) The text to be embedded: a string, a list of strings, a dictionary, or a list of dictionaries.
        namespace: (str, optional) The default namespace to use when a dictionary or list of dictionaries is not provided.
        model: (Any, required) The model to use for embedding.
    Returns:
        List[Dict[str, str]]: A list of dictionaries where each dictionary has the namespace as the key and the embedded string as the value.
    """  # noqa: E501
    if (isinstance(to_embed, _Embed) and isinstance(to_embed.value, str)) or isinstance(
        to_embed, str
    ):
        return [embed_string_type(to_embed, model, namespace)]
    elif isinstance(to_embed, dict):
        return [embed_dict_type(to_embed, model)]
    elif isinstance(to_embed, list):
        return embed_list_type(to_embed, model, namespace)
    else:
        raise ValueError("Invalid input format for embedding")
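To make the embed helpers above concrete, here is a small editor's sketch with a stand-in encoder (any object with an `encode` method works; the vector values are made up):

# Editor's sketch with a stub encoder; not part of the diff.
class StubEncoder:
    def encode(self, text):
        return [1.0, 2.0]  # a real model returns a learned vector

# Plain strings are passed through with spaces replaced by underscores:
assert embed("fried rice", StubEncoder(), "Meal") == [{"Meal": "fried_rice"}]
# _Embed-wrapped strings are encoded and stringified instead:
assert embed(Embed("fried rice"), StubEncoder(), "Meal") == [{"Meal": "0:1.0 1:2.0"}]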
libs/experimental/langchain_experimental/rl_chain/metrics.py (new file, 66 lines)
@@ -0,0 +1,66 @@
from collections import deque
from typing import TYPE_CHECKING, Dict, List, Union

if TYPE_CHECKING:
    import pandas as pd


class MetricsTrackerAverage:
    def __init__(self, step: int):
        self.history: List[Dict[str, Union[int, float]]] = [{"step": 0, "score": 0}]
        self.step: int = step
        self.i: int = 0
        self.num: float = 0
        self.denom: float = 0

    @property
    def score(self) -> float:
        return self.num / self.denom if self.denom > 0 else 0

    def on_decision(self) -> None:
        self.denom += 1

    def on_feedback(self, score: float) -> None:
        self.num += score or 0
        self.i += 1
        if self.step > 0 and self.i % self.step == 0:
            self.history.append({"step": self.i, "score": self.score})

    def to_pandas(self) -> "pd.DataFrame":
        import pandas as pd

        return pd.DataFrame(self.history)


class MetricsTrackerRollingWindow:
    def __init__(self, window_size: int, step: int):
        self.history: List[Dict[str, Union[int, float]]] = [{"step": 0, "score": 0}]
        self.step: int = step
        self.i: int = 0
        self.window_size: int = window_size
        self.queue: deque = deque()
        self.sum: float = 0.0

    @property
    def score(self) -> float:
        return self.sum / len(self.queue) if len(self.queue) > 0 else 0

    def on_decision(self) -> None:
        pass

    def on_feedback(self, value: float) -> None:
        self.sum += value
        self.queue.append(value)
        self.i += 1

        if len(self.queue) > self.window_size:
            old_val = self.queue.popleft()
            self.sum -= old_val

        if self.step > 0 and self.i % self.step == 0:
            self.history.append({"step": self.i, "score": self.sum / len(self.queue)})

    def to_pandas(self) -> "pd.DataFrame":
        import pandas as pd

        return pd.DataFrame(self.history)
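A quick illustration of the rolling-window tracker above (an editor's sketch; the feedback values are chosen arbitrarily):

# Editor's sketch, not part of the diff.
tracker = MetricsTrackerRollingWindow(window_size=2, step=1)
tracker.on_feedback(1.0)  # window [1.0]       -> score 1.0
tracker.on_feedback(0.0)  # window [1.0, 0.0]  -> score 0.5
tracker.on_feedback(0.0)  # window [0.0, 0.0]  -> score 0.0 (the 1.0 fell out)
assert tracker.score == 0.0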
@@ -0,0 +1,63 @@
import datetime
import glob
import logging
import os
import shutil
from pathlib import Path
from typing import TYPE_CHECKING, List, Union

if TYPE_CHECKING:
    import vowpal_wabbit_next as vw

logger = logging.getLogger(__name__)


class ModelRepository:
    def __init__(
        self,
        folder: Union[str, os.PathLike],
        with_history: bool = True,
        reset: bool = False,
    ):
        self.folder = Path(folder)
        self.model_path = self.folder / "latest.vw"
        self.with_history = with_history
        if reset and self.has_history():
            logger.warning(
                "There is non-empty history which is recommended to be cleaned up"
            )
            if self.model_path.exists():
                os.remove(self.model_path)

        self.folder.mkdir(parents=True, exist_ok=True)

    def get_tag(self) -> str:
        return datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    def has_history(self) -> bool:
        return len(glob.glob(str(self.folder / "model-????????-??????.vw"))) > 0

    def save(self, workspace: "vw.Workspace") -> None:
        with open(self.model_path, "wb") as f:
            logger.info(f"storing rl_chain model in: {self.model_path}")
            f.write(workspace.serialize())
        if self.with_history:  # write history
            shutil.copyfile(self.model_path, self.folder / f"model-{self.get_tag()}.vw")

    def load(self, commandline: List[str]) -> "vw.Workspace":
        try:
            import vowpal_wabbit_next as vw
        except ImportError as e:
            raise ImportError(
                "Unable to import vowpal_wabbit_next, please install with "
                "`pip install vowpal_wabbit_next`."
            ) from e

        model_data = None
        if self.model_path.exists():
            with open(self.model_path, "rb") as f:
                model_data = f.read()
        if model_data:
            logger.info(f"rl_chain model is loaded from: {self.model_path}")
            return vw.Workspace(commandline, model_data=model_data)
        return vw.Workspace(commandline)
@@ -0,0 +1,412 @@
from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional, Tuple, Type, Union

from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain.chains.llm import LLMChain
from langchain.prompts import BasePromptTemplate

import langchain_experimental.rl_chain.base as base

logger = logging.getLogger(__name__)

# sentinel object used to distinguish between
# user didn't supply anything or user explicitly supplied None
SENTINEL = object()


class PickBestSelected(base.Selected):
    index: Optional[int]
    probability: Optional[float]
    score: Optional[float]

    def __init__(
        self,
        index: Optional[int] = None,
        probability: Optional[float] = None,
        score: Optional[float] = None,
    ):
        self.index = index
        self.probability = probability
        self.score = score


class PickBestEvent(base.Event[PickBestSelected]):
    def __init__(
        self,
        inputs: Dict[str, Any],
        to_select_from: Dict[str, Any],
        based_on: Dict[str, Any],
        selected: Optional[PickBestSelected] = None,
    ):
        super().__init__(inputs=inputs, selected=selected)
        self.to_select_from = to_select_from
        self.based_on = based_on


class PickBestFeatureEmbedder(base.Embedder[PickBestEvent]):
    """
    Text Embedder class that embeds the `BasedOn` and `ToSelectFrom` inputs into a format that can be used by the learning policy.

    Attributes:
        model (Any, optional): The type of embeddings to be used for feature representation. Defaults to BERT SentenceTransformer.
    """  # noqa E501

    def __init__(
        self, auto_embed: bool, model: Optional[Any] = None, *args: Any, **kwargs: Any
    ):
        super().__init__(*args, **kwargs)

        if model is None:
            from sentence_transformers import SentenceTransformer

            model = SentenceTransformer("all-mpnet-base-v2")

        self.model = model
        self.auto_embed = auto_embed

    @staticmethod
    def _str(embedding: List[float]) -> str:
        return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)])

    def get_label(self, event: PickBestEvent) -> tuple:
        cost = None
        if event.selected:
            chosen_action = event.selected.index
            cost = (
                -1.0 * event.selected.score
                if event.selected.score is not None
                else None
            )
            prob = event.selected.probability
            return chosen_action, cost, prob
        else:
            return None, None, None

    def get_context_and_action_embeddings(self, event: PickBestEvent) -> tuple:
        context_emb = base.embed(event.based_on, self.model) if event.based_on else None
        to_select_from_var_name, to_select_from = next(
            iter(event.to_select_from.items()), (None, None)
        )

        action_embs = (
            (
                base.embed(to_select_from, self.model, to_select_from_var_name)
                if event.to_select_from
                else None
            )
            if to_select_from
            else None
        )

        if not context_emb or not action_embs:
            raise ValueError(
                "Context and to_select_from must be provided in the inputs dictionary"
            )
        return context_emb, action_embs

    def get_indexed_dot_product(self, context_emb: List, action_embs: List) -> Dict:
        import numpy as np

        unique_contexts = set()
        for context_item in context_emb:
            for ns, ee in context_item.items():
                if isinstance(ee, list):
                    for ea in ee:
                        unique_contexts.add(f"{ns}={ea}")
                else:
                    unique_contexts.add(f"{ns}={ee}")

        encoded_contexts = self.model.encode(list(unique_contexts))
        context_embeddings = dict(zip(unique_contexts, encoded_contexts))

        unique_actions = set()
        for action in action_embs:
            for ns, e in action.items():
                if isinstance(e, list):
                    for ea in e:
                        unique_actions.add(f"{ns}={ea}")
                else:
                    unique_actions.add(f"{ns}={e}")

        encoded_actions = self.model.encode(list(unique_actions))
        action_embeddings = dict(zip(unique_actions, encoded_actions))

        action_matrix = np.stack([v for k, v in action_embeddings.items()])
        context_matrix = np.stack([v for k, v in context_embeddings.items()])
        dot_product_matrix = np.dot(context_matrix, action_matrix.T)

        indexed_dot_product: Dict = {}

        for i, context_key in enumerate(context_embeddings.keys()):
            indexed_dot_product[context_key] = {}
            for j, action_key in enumerate(action_embeddings.keys()):
                indexed_dot_product[context_key][action_key] = dot_product_matrix[i, j]

        return indexed_dot_product

    def format_auto_embed_on(self, event: PickBestEvent) -> str:
        chosen_action, cost, prob = self.get_label(event)
        context_emb, action_embs = self.get_context_and_action_embeddings(event)
        indexed_dot_product = self.get_indexed_dot_product(context_emb, action_embs)

        action_lines = []
        for i, action in enumerate(action_embs):
            line_parts = []
            dot_prods = []
            if cost is not None and chosen_action == i:
                line_parts.append(f"{chosen_action}:{cost}:{prob}")
            for ns, action in action.items():
                line_parts.append(f"|{ns}")
                elements = action if isinstance(action, list) else [action]
                nsa = []
                for elem in elements:
                    line_parts.append(f"{elem}")
                    ns_a = f"{ns}={elem}"
                    nsa.append(ns_a)
                    for k, v in indexed_dot_product.items():
                        dot_prods.append(v[ns_a])
                nsa_str = " ".join(nsa)
                line_parts.append(f"|# {nsa_str}")

            line_parts.append(f"|dotprod {self._str(dot_prods)}")
            action_lines.append(" ".join(line_parts))

        shared = []
        for item in context_emb:
            for ns, context in item.items():
                shared.append(f"|{ns}")
                elements = context if isinstance(context, list) else [context]
                nsc = []
                for elem in elements:
                    shared.append(f"{elem}")
                    nsc.append(f"{ns}={elem}")
                nsc_str = " ".join(nsc)
                shared.append(f"|@ {nsc_str}")

        return "shared " + " ".join(shared) + "\n" + "\n".join(action_lines)

    def format_auto_embed_off(self, event: PickBestEvent) -> str:
        """
        Converts the `BasedOn` and `ToSelectFrom` into a format that can be used by VW.
        """
        chosen_action, cost, prob = self.get_label(event)
        context_emb, action_embs = self.get_context_and_action_embeddings(event)

        example_string = ""
        example_string += "shared "
        for context_item in context_emb:
            for ns, based_on in context_item.items():
                e = " ".join(based_on) if isinstance(based_on, list) else based_on
                example_string += f"|{ns} {e} "
        example_string += "\n"

        for i, action in enumerate(action_embs):
            if cost is not None and chosen_action == i:
                example_string += f"{chosen_action}:{cost}:{prob} "
            for ns, action_embedding in action.items():
                e = (
                    " ".join(action_embedding)
                    if isinstance(action_embedding, list)
                    else action_embedding
                )
                example_string += f"|{ns} {e} "
            example_string += "\n"
        # Strip the last newline
        return example_string[:-1]

    def format(self, event: PickBestEvent) -> str:
        if self.auto_embed:
            return self.format_auto_embed_on(event)
        else:
            return self.format_auto_embed_off(event)

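# Editor's note (not part of the diff): with auto_embed off, an event with
# based_on={"User": ["Tom"]} and to_select_from={"action": ["pizza", "sushi"]}
# is formatted roughly as the VW ADF text below; the "0:-1.0:0.6" label is
# only present once a selection has been made and scored:
#
#     shared |User Tom
#     0:-1.0:0.6 |action pizza
#     |action sushi
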
class PickBestRandomPolicy(base.Policy[PickBestEvent]):
    def __init__(self, feature_embedder: base.Embedder, **kwargs: Any):
        self.feature_embedder = feature_embedder

    def predict(self, event: PickBestEvent) -> List[Tuple[int, float]]:
        num_items = len(event.to_select_from)
        return [(i, 1.0 / num_items) for i in range(num_items)]

    def learn(self, event: PickBestEvent) -> None:
        pass

    def log(self, event: PickBestEvent) -> None:
        pass


class PickBest(base.RLChain[PickBestEvent]):
    """
    `PickBest` is a class designed to leverage the Vowpal Wabbit (VW) model for reinforcement learning with a context, with the goal of modifying the prompt before the LLM call.

    Each invocation of the chain's `run()` method should be equipped with a set of potential actions (`ToSelectFrom`) and will result in the selection of a specific action based on the `BasedOn` input. This chosen action then informs the LLM (Language Model) prompt for the subsequent response generation.

    The standard operation flow of this Chain includes:
        1. The Chain is invoked with inputs containing the `BasedOn` criteria and a list of potential actions (`ToSelectFrom`).
        2. An action is selected based on the `BasedOn` input.
        3. The LLM is called with the dynamic prompt, producing a response.
        4. If a `selection_scorer` is provided, it is used to score the selection.
        5. The internal Vowpal Wabbit model is updated with the `BasedOn` input, the chosen `ToSelectFrom` action, and the resulting score from the scorer.
        6. The final response is returned.

    Expected input dictionary format:
        - At least one variable encapsulated within `BasedOn` to serve as the selection criteria.
        - A single list variable within `ToSelectFrom`, representing potential actions for the VW model. This list can take the form of:
            - A list of strings, e.g., `action = ToSelectFrom(["action1", "action2", "action3"])`
            - A list of lists of strings, e.g., `action = ToSelectFrom([["action1", "another identifier of action1"], ["action2", "another identifier of action2"]])`
            - A list of dictionaries, where each dictionary represents an action with namespace names as keys and corresponding action strings as values. For instance, `action = ToSelectFrom([{"namespace1": ["action1", "another identifier of action1"], "namespace2": "action2"}, {"namespace1": "action3", "namespace2": "action4"}])`.

    Extends:
        RLChain

    Attributes:
        feature_embedder (PickBestFeatureEmbedder, optional): Is an advanced attribute. Responsible for embedding the `BasedOn` and `ToSelectFrom` inputs. If omitted, a default embedder is utilized.
    """  # noqa E501

    def __init__(
        self,
        *args: Any,
        **kwargs: Any,
    ):
        auto_embed = kwargs.get("auto_embed", False)

        feature_embedder = kwargs.get("feature_embedder", None)
        if feature_embedder:
            if "auto_embed" in kwargs:
                logger.warning(
                    "auto_embed will have no effect when an explicit feature_embedder is provided"  # noqa E501
                )
            # turning auto_embed off for cli setting below
            auto_embed = False
        else:
            feature_embedder = PickBestFeatureEmbedder(auto_embed=auto_embed)
        kwargs["feature_embedder"] = feature_embedder

        vw_cmd = kwargs.get("vw_cmd", [])
        if vw_cmd:
            if "--cb_explore_adf" not in vw_cmd:
                raise ValueError(
                    "If vw_cmd is specified, it must include --cb_explore_adf"
                )
        else:
            interactions = ["--interactions=::"]
            if auto_embed:
                interactions = [
                    "--interactions=@#",
                    "--ignore_linear=@",
                    "--ignore_linear=#",
                ]
            vw_cmd = interactions + [
                "--cb_explore_adf",
                "--coin",
                "--squarecb",
                "--quiet",
            ]

        kwargs["vw_cmd"] = vw_cmd

        super().__init__(*args, **kwargs)

    def _call_before_predict(self, inputs: Dict[str, Any]) -> PickBestEvent:
        context, actions = base.get_based_on_and_to_select_from(inputs=inputs)
        if not actions:
            raise ValueError(
                "No variables using 'ToSelectFrom' found in the inputs. Please include at least one variable containing a list to select from."  # noqa E501
            )

        if len(list(actions.values())) > 1:
            raise ValueError(
                "Only one variable using 'ToSelectFrom' can be provided in the inputs for the PickBest chain. Please provide only one variable containing a list to select from."  # noqa E501
            )

        if not context:
            raise ValueError(
                "No variables using 'BasedOn' found in the inputs. Please include at least one variable containing information to base the selection of ToSelectFrom on."  # noqa E501
            )

        event = PickBestEvent(inputs=inputs, to_select_from=actions, based_on=context)
        return event

    def _call_after_predict_before_llm(
        self,
        inputs: Dict[str, Any],
        event: PickBestEvent,
        prediction: List[Tuple[int, float]],
    ) -> Tuple[Dict[str, Any], PickBestEvent]:
        import numpy as np

        prob_sum = sum(prob for _, prob in prediction)
        probabilities = [prob / prob_sum for _, prob in prediction]
        ## sample from the pmf
        sampled_index = np.random.choice(len(prediction), p=probabilities)
        sampled_ap = prediction[sampled_index]
        sampled_action = sampled_ap[0]
        sampled_prob = sampled_ap[1]
        selected = PickBestSelected(index=sampled_action, probability=sampled_prob)
        event.selected = selected

        # only one key, value pair in event.to_select_from
        key, value = next(iter(event.to_select_from.items()))
        next_chain_inputs = inputs.copy()
        next_chain_inputs.update({key: value[event.selected.index]})
        return next_chain_inputs, event

    def _call_after_llm_before_scoring(
        self, llm_response: str, event: PickBestEvent
    ) -> Tuple[Dict[str, Any], PickBestEvent]:
        next_chain_inputs = event.inputs.copy()
        # only one key, value pair in event.to_select_from
        value = next(iter(event.to_select_from.values()))
        v = (
            value[event.selected.index]
            if event.selected
            else event.to_select_from.values()
        )
        next_chain_inputs.update(
            {
                self.selected_based_on_input_key: str(event.based_on),
                self.selected_input_key: v,
            }
        )
        return next_chain_inputs, event

    def _call_after_scoring_before_learning(
        self, event: PickBestEvent, score: Optional[float]
    ) -> PickBestEvent:
        if event.selected:
            event.selected.score = score
        return event

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        return super()._call(run_manager=run_manager, inputs=inputs)

    @property
    def _chain_type(self) -> str:
        return "rl_chain_pick_best"

    @classmethod
    def from_llm(
        cls: Type[PickBest],
        llm: BaseLanguageModel,
        prompt: BasePromptTemplate,
        selection_scorer: Union[base.AutoSelectionScorer, object] = SENTINEL,
        **kwargs: Any,
    ) -> PickBest:
        llm_chain = LLMChain(llm=llm, prompt=prompt)
        if selection_scorer is SENTINEL:
            selection_scorer = base.AutoSelectionScorer(llm=llm_chain.llm)

        return PickBest(
            llm_chain=llm_chain,
            prompt=prompt,
            selection_scorer=selection_scorer,
            **kwargs,
        )
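Putting the chain above to work looks roughly like this (an editor's sketch: `llm` and `PROMPT` are assumed to be defined, the prompt containing a `{meal}` variable; `vowpal-wabbit-next` and `sentence-transformers` must be installed):

# Editor's sketch, not part of the diff; llm and PROMPT are assumed defined.
import langchain_experimental.rl_chain as rl_chain

chain = rl_chain.PickBest.from_llm(llm=llm, prompt=PROMPT)
response = chain.run(
    user=rl_chain.BasedOn("Tom"),
    meal=rl_chain.ToSelectFrom(["pizza", "sushi", "salad"]),
)
print(response["response"])             # the LLM answer for the picked meal
event = response["selection_metadata"]  # PickBestEvent: index/probability/score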
@@ -0,0 +1,18 @@
from os import PathLike
from pathlib import Path
from typing import Optional, Union


class VwLogger:
    def __init__(self, path: Optional[Union[str, PathLike]]):
        self.path = Path(path) if path else None
        if self.path:
            self.path.parent.mkdir(parents=True, exist_ok=True)

    def log(self, vw_ex: str) -> None:
        if self.path:
            with open(self.path, "a") as f:
                f.write(f"{vw_ex}\n\n")

    def logging_enabled(self) -> bool:
        return bool(self.path)
@@ -127,7 +127,7 @@ class SQLDatabaseChain(Chain):
             llm_inputs[k] = inputs[k]
         intermediate_steps: List = []
         try:
-            intermediate_steps.append(llm_inputs)  # input: sql generation
+            intermediate_steps.append(llm_inputs.copy())  # input: sql generation
             sql_cmd = self.llm_chain.predict(
                 callbacks=_run_manager.get_child(),
                 **llm_inputs,
@@ -180,7 +180,7 @@ class SQLDatabaseChain(Chain):
         _run_manager.on_text("\nAnswer:", verbose=self.verbose)
         input_text += f"{sql_cmd}\nSQLResult: {result}\nAnswer:"
         llm_inputs["input"] = input_text
-        intermediate_steps.append(llm_inputs)  # input: final answer
+        intermediate_steps.append(llm_inputs.copy())  # input: final answer
         final_result = self.llm_chain.predict(
             callbacks=_run_manager.get_child(),
             **llm_inputs,
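The `.copy()` added in both hunks matters because `llm_inputs` is mutated again before the second append; without a snapshot, every entry in `intermediate_steps` would alias the same dict and show its final state. An editor's sketch of the aliasing bug:

# Editor's sketch of the aliasing bug the two hunks above fix.
steps = []
llm_inputs = {"input": "generate sql"}
steps.append(llm_inputs)           # aliased: later mutation leaks in
llm_inputs["input"] = "final answer"
steps.append(llm_inputs.copy())    # snapshot: safe
print(steps[0])  # {'input': 'final answer'} <- the earlier step was overwritten
print(steps[1])  # {'input': 'final answer'}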
@@ -1,7 +1,7 @@
 """Vector SQL Database Chain Retriever"""
 from __future__ import annotations
 
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Sequence, Union
 
 from langchain.callbacks.manager import CallbackManagerForChainRun
 from langchain.chains.llm import LLMChain
@@ -76,10 +76,8 @@ class VectorSQLRetrieveAllOutputParser(VectorSQLOutputParser):
         return super().parse(text)
 
 
-def get_result_from_sqldb(
-    db: SQLDatabase, cmd: str
-) -> Union[str, List[Dict[str, Any]], Dict[str, Any]]:
-    result = db._execute(cmd, fetch="all")  # type: ignore
+def get_result_from_sqldb(db: SQLDatabase, cmd: str) -> Sequence[Dict[str, Any]]:
+    result = db._execute(cmd, fetch="all")
     return result
 
 
@@ -179,8 +177,9 @@ class VectorSQLDatabaseChain(SQLDatabaseChain):
         _run_manager.on_text("\nSQLResult: ", verbose=self.verbose)
         _run_manager.on_text(str(result), color="yellow", verbose=self.verbose)
         # If return direct, we just set the final result equal to
-        # the result of the sql query result, otherwise try to get a human readable
-        # final answer
+        # the result of the sql query result (`Sequence[Dict[str, Any]]`),
+        # otherwise try to get a human readable final answer (`str`).
+        final_result: Union[str, Sequence[Dict[str, Any]]]
         if self.return_direct:
             final_result = result
         else:
libs/experimental/poetry.lock (generated, 894 lines changed)
File diff suppressed because it is too large
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langchain-experimental"
-version = "0.0.25"
+version = "0.0.27"
 description = "Building applications with LLMs through composability"
 authors = []
 license = "MIT"
@@ -10,10 +10,12 @@ repository = "https://github.com/langchain-ai/langchain"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
-langchain = ">=0.0.239"
+langchain = ">=0.0.308"
 presidio-anonymizer = {version = "^2.2.33", optional = true}
 presidio-analyzer = {version = "^2.2.33", optional = true}
 faker = {version = "^19.3.1", optional = true}
+vowpal-wabbit-next = {version = "0.6.0", optional = true}
+sentence-transformers = {version = "^2", optional = true}
 
 
 [tool.poetry.group.lint.dependencies]
@@ -43,6 +45,8 @@ extended_testing = [
     "presidio-anonymizer",
    "presidio-analyzer",
     "faker",
+    "vowpal-wabbit-next",
+    "sentence-transformers",
 ]
 
 [tool.ruff]
@@ -0,0 +1,9 @@
import ctypes


def is_libcublas_available() -> bool:
    try:
        ctypes.CDLL("libcublas.so")
        return True
    except OSError:
        return False
@@ -0,0 +1,459 @@
|
||||
from typing import Any, Dict
|
||||
|
||||
import pytest
|
||||
from langchain.chat_models import FakeListChatModel
|
||||
from langchain.prompts.prompt import PromptTemplate
|
||||
from test_utils import MockEncoder, MockEncoderReturnsList
|
||||
|
||||
import langchain_experimental.rl_chain.base as rl_chain
|
||||
import langchain_experimental.rl_chain.pick_best_chain as pick_best_chain
|
||||
|
||||
encoded_keyword = "[encoded]"
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
|
||||
def setup() -> tuple:
|
||||
_PROMPT_TEMPLATE = """This is a dummy prompt that will be ignored by the fake llm"""
|
||||
PROMPT = PromptTemplate(input_variables=[], template=_PROMPT_TEMPLATE)
|
||||
|
||||
llm = FakeListChatModel(responses=["hey"])
|
||||
return llm, PROMPT
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
|
||||
def test_multiple_ToSelectFrom_throws() -> None:
|
||||
llm, PROMPT = setup()
|
||||
chain = pick_best_chain.PickBest.from_llm(
|
||||
llm=llm,
|
||||
prompt=PROMPT,
|
||||
feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
|
||||
auto_embed=False, model=MockEncoder()
|
||||
),
|
||||
)
|
||||
actions = ["0", "1", "2"]
|
||||
with pytest.raises(ValueError):
|
||||
chain.run(
|
||||
User=rl_chain.BasedOn("Context"),
|
||||
action=rl_chain.ToSelectFrom(actions),
|
||||
another_action=rl_chain.ToSelectFrom(actions),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
|
||||
def test_missing_basedOn_from_throws() -> None:
|
||||
llm, PROMPT = setup()
|
||||
chain = pick_best_chain.PickBest.from_llm(
|
||||
llm=llm,
|
||||
prompt=PROMPT,
|
||||
feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
|
||||
auto_embed=False, model=MockEncoder()
|
||||
),
|
||||
)
|
||||
actions = ["0", "1", "2"]
|
||||
with pytest.raises(ValueError):
|
||||
chain.run(action=rl_chain.ToSelectFrom(actions))
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
|
||||
def test_ToSelectFrom_not_a_list_throws() -> None:
|
||||
llm, PROMPT = setup()
|
||||
chain = pick_best_chain.PickBest.from_llm(
|
||||
llm=llm,
|
||||
prompt=PROMPT,
|
||||
feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
|
||||
auto_embed=False, model=MockEncoder()
|
||||
),
|
||||
)
|
||||
actions = {"actions": ["0", "1", "2"]}
|
||||
with pytest.raises(ValueError):
|
||||
chain.run(
|
||||
User=rl_chain.BasedOn("Context"),
|
||||
action=rl_chain.ToSelectFrom(actions),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
|
||||
def test_update_with_delayed_score_with_auto_validator_throws() -> None:
|
||||
llm, PROMPT = setup()
|
||||
# this LLM returns a number so that the auto validator will return that
|
||||
auto_val_llm = FakeListChatModel(responses=["3"])
|
||||
chain = pick_best_chain.PickBest.from_llm(
|
||||
llm=llm,
|
||||
prompt=PROMPT,
|
||||
selection_scorer=rl_chain.AutoSelectionScorer(llm=auto_val_llm),
|
||||
feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
|
||||
auto_embed=False, model=MockEncoder()
|
||||
),
|
||||
)
|
||||
actions = ["0", "1", "2"]
|
||||
response = chain.run(
|
||||
User=rl_chain.BasedOn("Context"),
|
||||
action=rl_chain.ToSelectFrom(actions),
|
||||
)
|
||||
assert response["response"] == "hey" # type: ignore
|
||||
selection_metadata = response["selection_metadata"] # type: ignore
|
||||
assert selection_metadata.selected.score == 3.0 # type: ignore
|
||||
with pytest.raises(RuntimeError):
|
||||
chain.update_with_delayed_score(
|
||||
chain_response=response, score=100 # type: ignore
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
|
||||
def test_update_with_delayed_score_force() -> None:
|
||||
llm, PROMPT = setup()
|
||||
# this LLM returns a number so that the auto validator will return that
|
||||
auto_val_llm = FakeListChatModel(responses=["3"])
|
||||
chain = pick_best_chain.PickBest.from_llm(
|
||||
llm=llm,
|
||||
prompt=PROMPT,
|
||||
selection_scorer=rl_chain.AutoSelectionScorer(llm=auto_val_llm),
|
||||
feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
|
||||
auto_embed=False, model=MockEncoder()
|
||||
),
|
||||
)
|
||||
actions = ["0", "1", "2"]
|
||||
response = chain.run(
|
||||
User=rl_chain.BasedOn("Context"),
|
||||
action=rl_chain.ToSelectFrom(actions),
|
||||
)
|
||||
assert response["response"] == "hey" # type: ignore
|
||||
selection_metadata = response["selection_metadata"] # type: ignore
|
||||
assert selection_metadata.selected.score == 3.0 # type: ignore
|
||||
chain.update_with_delayed_score(
|
||||
chain_response=response, score=100, force_score=True # type: ignore
|
||||
)
|
||||
assert selection_metadata.selected.score == 100.0 # type: ignore
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
|
||||
def test_update_with_delayed_score() -> None:
|
||||
llm, PROMPT = setup()
|
||||
chain = pick_best_chain.PickBest.from_llm(
|
||||
llm=llm,
|
||||
prompt=PROMPT,
|
||||
selection_scorer=None,
|
||||
feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
|
||||
auto_embed=False, model=MockEncoder()
|
||||
),
|
||||
)
|
||||
actions = ["0", "1", "2"]
|
||||
response = chain.run(
|
||||
User=rl_chain.BasedOn("Context"),
|
||||
action=rl_chain.ToSelectFrom(actions),
|
||||
)
|
||||
assert response["response"] == "hey" # type: ignore
|
||||
selection_metadata = response["selection_metadata"] # type: ignore
|
||||
assert selection_metadata.selected.score is None # type: ignore
|
||||
chain.update_with_delayed_score(chain_response=response, score=100) # type: ignore
|
||||
assert selection_metadata.selected.score == 100.0 # type: ignore
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
|
||||
def test_user_defined_scorer() -> None:
|
||||
llm, PROMPT = setup()
|
||||
|
||||
class CustomSelectionScorer(rl_chain.SelectionScorer):
|
||||
def score_response(
|
||||
self,
|
||||
inputs: Dict[str, Any],
|
||||
llm_response: str,
|
||||
event: pick_best_chain.PickBestEvent,
|
||||
) -> float:
|
||||
score = 200
|
||||
return score
|
||||
|
||||
chain = pick_best_chain.PickBest.from_llm(
|
||||
llm=llm,
|
||||
prompt=PROMPT,
|
||||
selection_scorer=CustomSelectionScorer(),
|
||||
feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
|
||||
auto_embed=False, model=MockEncoder()
|
||||
),
|
||||
)
|
||||
actions = ["0", "1", "2"]
|
||||
response = chain.run(
|
||||
User=rl_chain.BasedOn("Context"),
|
||||
action=rl_chain.ToSelectFrom(actions),
|
||||
)
|
||||
assert response["response"] == "hey" # type: ignore
|
||||
selection_metadata = response["selection_metadata"] # type: ignore
|
||||
assert selection_metadata.selected.score == 200.0 # type: ignore
|
||||
|
||||
|
||||
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_everything_embedded() -> None:
    llm, PROMPT = setup()
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm, prompt=PROMPT, feature_embedder=feature_embedder, auto_embed=False
    )

    str1 = "0"
    str2 = "1"
    str3 = "2"
    encoded_str1 = rl_chain.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = rl_chain.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = rl_chain.stringify_embedding(list(encoded_keyword + str3))

    ctx_str_1 = "context1"

    encoded_ctx_str_1 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_1))

    expected = f"""shared |User {ctx_str_1 + " " + encoded_ctx_str_1} \n|action {str1 + " " + encoded_str1} \n|action {str2 + " " + encoded_str2} \n|action {str3 + " " + encoded_str3} """  # noqa

    actions = [str1, str2, str3]

    response = chain.run(
        User=rl_chain.EmbedAndKeep(rl_chain.BasedOn(ctx_str_1)),
        action=rl_chain.EmbedAndKeep(rl_chain.ToSelectFrom(actions)),
    )
    selection_metadata = response["selection_metadata"]  # type: ignore
    vw_str = feature_embedder.format(selection_metadata)  # type: ignore
    assert vw_str == expected

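# For orientation: the strings asserted above are Vowpal Wabbit text-format
# examples. A `shared |<namespace> <features>` line carries the context, and
# each subsequent `|<namespace> ...` line is one candidate action. A rough
# sketch of assembling such an example by hand (simplified; the real
# PickBestFeatureEmbedder also handles labels, multiple namespaces and
# feature sanitization):
def _example_build_vw_example(context: str, actions: "list[str]") -> str:
    lines = [f"shared |User {context} "]
    lines.extend(f"|action {a} " for a in actions)
    return "\n".join(lines)
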
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_default_auto_embedder_is_off() -> None:
    llm, PROMPT = setup()
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm, prompt=PROMPT, feature_embedder=feature_embedder
    )

    str1 = "0"
    str2 = "1"
    str3 = "2"
    ctx_str_1 = "context1"

    expected = f"""shared |User {ctx_str_1} \n|action {str1} \n|action {str2} \n|action {str3} """  # noqa

    actions = [str1, str2, str3]

    response = chain.run(
        User=pick_best_chain.base.BasedOn(ctx_str_1),
        action=pick_best_chain.base.ToSelectFrom(actions),
    )
    selection_metadata = response["selection_metadata"]  # type: ignore
    vw_str = feature_embedder.format(selection_metadata)  # type: ignore
    assert vw_str == expected

@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_default_w_embeddings_off() -> None:
    llm, PROMPT = setup()
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm, prompt=PROMPT, feature_embedder=feature_embedder, auto_embed=False
    )

    str1 = "0"
    str2 = "1"
    str3 = "2"
    ctx_str_1 = "context1"

    expected = f"""shared |User {ctx_str_1} \n|action {str1} \n|action {str2} \n|action {str3} """  # noqa

    actions = [str1, str2, str3]

    response = chain.run(
        User=rl_chain.BasedOn(ctx_str_1),
        action=rl_chain.ToSelectFrom(actions),
    )
    selection_metadata = response["selection_metadata"]  # type: ignore
    vw_str = feature_embedder.format(selection_metadata)  # type: ignore
    assert vw_str == expected

@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_default_w_embeddings_on() -> None:
    llm, PROMPT = setup()
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=True, model=MockEncoderReturnsList()
    )
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm, prompt=PROMPT, feature_embedder=feature_embedder, auto_embed=True
    )

    str1 = "0"
    str2 = "1"
    ctx_str_1 = "context1"
    dot_prod = "dotprod 0:5.0"  # dot prod of [1.0, 2.0] and [1.0, 2.0]

    expected = f"""shared |User {ctx_str_1} |@ User={ctx_str_1}\n|action {str1} |# action={str1} |{dot_prod}\n|action {str2} |# action={str2} |{dot_prod}"""  # noqa

    actions = [str1, str2]

    response = chain.run(
        User=rl_chain.BasedOn(ctx_str_1),
        action=rl_chain.ToSelectFrom(actions),
    )
    selection_metadata = response["selection_metadata"]  # type: ignore
    vw_str = feature_embedder.format(selection_metadata)  # type: ignore
    assert vw_str == expected

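# The "dotprod 0:5.0" feature above comes from the auto-embed path taking the
# dot product of the context and action embeddings. With MockEncoderReturnsList
# both embed to [1.0, 2.0], so 1.0 * 1.0 + 2.0 * 2.0 = 5.0. A one-line check:
def _example_dot_product() -> float:
    ctx, act = [1.0, 2.0], [1.0, 2.0]
    return sum(c * a for c, a in zip(ctx, act))  # == 5.0
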
@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_default_embeddings_mixed_w_explicit_user_embeddings() -> None:
    llm, PROMPT = setup()
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=True, model=MockEncoderReturnsList()
    )
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm, prompt=PROMPT, feature_embedder=feature_embedder, auto_embed=True
    )

    str1 = "0"
    str2 = "1"
    encoded_str2 = rl_chain.stringify_embedding([1.0, 2.0])
    ctx_str_1 = "context1"
    ctx_str_2 = "context2"
    encoded_ctx_str_1 = rl_chain.stringify_embedding([1.0, 2.0])
    dot_prod = "dotprod 0:5.0 1:5.0"  # dot prod of [1.0, 2.0] and [1.0, 2.0]

    expected = f"""shared |User {encoded_ctx_str_1} |@ User={encoded_ctx_str_1} |User2 {ctx_str_2} |@ User2={ctx_str_2}\n|action {str1} |# action={str1} |{dot_prod}\n|action {encoded_str2} |# action={encoded_str2} |{dot_prod}"""  # noqa

    actions = [str1, rl_chain.Embed(str2)]

    response = chain.run(
        User=rl_chain.BasedOn(rl_chain.Embed(ctx_str_1)),
        User2=rl_chain.BasedOn(ctx_str_2),
        action=rl_chain.ToSelectFrom(actions),
    )
    selection_metadata = response["selection_metadata"]  # type: ignore
    vw_str = feature_embedder.format(selection_metadata)  # type: ignore
    assert vw_str == expected

@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_default_no_scorer_specified() -> None:
    _, PROMPT = setup()
    chain_llm = FakeListChatModel(responses=["hey", "100"])
    chain = pick_best_chain.PickBest.from_llm(
        llm=chain_llm,
        prompt=PROMPT,
        feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
            auto_embed=False, model=MockEncoder()
        ),
    )
    response = chain.run(
        User=rl_chain.BasedOn("Context"),
        action=rl_chain.ToSelectFrom(["0", "1", "2"]),
    )
    # chain llm used for both basic prompt and for scoring
    assert response["response"] == "hey"  # type: ignore
    selection_metadata = response["selection_metadata"]  # type: ignore
    assert selection_metadata.selected.score == 100.0  # type: ignore

@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_explicitly_no_scorer() -> None:
    llm, PROMPT = setup()
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm,
        prompt=PROMPT,
        selection_scorer=None,
        feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
            auto_embed=False, model=MockEncoder()
        ),
    )
    response = chain.run(
        User=rl_chain.BasedOn("Context"),
        action=rl_chain.ToSelectFrom(["0", "1", "2"]),
    )
    # no scorer configured, so the response is left unscored
    assert response["response"] == "hey"  # type: ignore
    selection_metadata = response["selection_metadata"]  # type: ignore
    assert selection_metadata.selected.score is None  # type: ignore

@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_auto_scorer_with_user_defined_llm() -> None:
    llm, PROMPT = setup()
    scorer_llm = FakeListChatModel(responses=["300"])
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm,
        prompt=PROMPT,
        selection_scorer=rl_chain.AutoSelectionScorer(llm=scorer_llm),
        feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
            auto_embed=False, model=MockEncoder()
        ),
    )
    response = chain.run(
        User=rl_chain.BasedOn("Context"),
        action=rl_chain.ToSelectFrom(["0", "1", "2"]),
    )
    # the user-defined scorer llm is used for scoring, not the chain llm
    assert response["response"] == "hey"  # type: ignore
    selection_metadata = response["selection_metadata"]  # type: ignore
    assert selection_metadata.selected.score == 300.0  # type: ignore

@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_calling_chain_w_reserved_inputs_throws() -> None:
    llm, PROMPT = setup()
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm,
        prompt=PROMPT,
        feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
            auto_embed=False, model=MockEncoder()
        ),
    )
    with pytest.raises(ValueError):
        chain.run(
            User=rl_chain.BasedOn("Context"),
            rl_chain_selected_based_on=rl_chain.ToSelectFrom(["0", "1", "2"]),
        )

    with pytest.raises(ValueError):
        chain.run(
            User=rl_chain.BasedOn("Context"),
            rl_chain_selected=rl_chain.ToSelectFrom(["0", "1", "2"]),
        )

@pytest.mark.requires("vowpal_wabbit_next", "sentence_transformers")
def test_activate_and_deactivate_scorer() -> None:
    _, PROMPT = setup()
    llm = FakeListChatModel(responses=["hey1", "hey2", "hey3"])
    scorer_llm = FakeListChatModel(responses=["300", "400"])
    chain = pick_best_chain.PickBest.from_llm(
        llm=llm,
        prompt=PROMPT,
        selection_scorer=pick_best_chain.base.AutoSelectionScorer(llm=scorer_llm),
        feature_embedder=pick_best_chain.PickBestFeatureEmbedder(
            auto_embed=False, model=MockEncoder()
        ),
    )
    response = chain.run(
        User=pick_best_chain.base.BasedOn("Context"),
        action=pick_best_chain.base.ToSelectFrom(["0", "1", "2"]),
    )
    # the separate scorer llm is used for scoring while the scorer is active
    assert response["response"] == "hey1"  # type: ignore
    selection_metadata = response["selection_metadata"]  # type: ignore
    assert selection_metadata.selected.score == 300.0  # type: ignore

    chain.deactivate_selection_scorer()
    response = chain.run(
        User=pick_best_chain.base.BasedOn("Context"),
        action=pick_best_chain.base.ToSelectFrom(["0", "1", "2"]),
    )
    assert response["response"] == "hey2"  # type: ignore
    selection_metadata = response["selection_metadata"]  # type: ignore
    assert selection_metadata.selected.score is None  # type: ignore

    chain.activate_selection_scorer()
    response = chain.run(
        User=pick_best_chain.base.BasedOn("Context"),
        action=pick_best_chain.base.ToSelectFrom(["0", "1", "2"]),
    )
    assert response["response"] == "hey3"  # type: ignore
    selection_metadata = response["selection_metadata"]  # type: ignore
    assert selection_metadata.selected.score == 400.0  # type: ignore
@@ -0,0 +1,370 @@
import pytest
from test_utils import MockEncoder

import langchain_experimental.rl_chain.base as rl_chain
import langchain_experimental.rl_chain.pick_best_chain as pick_best_chain

encoded_keyword = "[encoded]"


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_missing_context_throws() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    named_action = {"action": ["0", "1", "2"]}
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_action, based_on={}
    )
    with pytest.raises(ValueError):
        feature_embedder.format(event)


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_missing_actions_throws() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from={}, based_on={"context": "context"}
    )
    with pytest.raises(ValueError):
        feature_embedder.format(event)

@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_no_label_no_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    named_actions = {"action1": ["0", "1", "2"]}
    expected = """shared |context context \n|action1 0 \n|action1 1 \n|action1 2 """
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on={"context": "context"}
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_w_label_no_score_no_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    named_actions = {"action1": ["0", "1", "2"]}
    expected = """shared |context context \n|action1 0 \n|action1 1 \n|action1 2 """
    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0)
    event = pick_best_chain.PickBestEvent(
        inputs={},
        to_select_from=named_actions,
        based_on={"context": "context"},
        selected=selected,
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_w_full_label_no_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    named_actions = {"action1": ["0", "1", "2"]}
    expected = (
        """shared |context context \n0:-0.0:1.0 |action1 0 \n|action1 1 \n|action1 2 """
    )
    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
    event = pick_best_chain.PickBestEvent(
        inputs={},
        to_select_from=named_actions,
        based_on={"context": "context"},
        selected=selected,
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected

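# The "0:-0.0:1.0" prefix asserted above is a Vowpal Wabbit contextual-bandit
# label of the form <chosen_action_index>:<cost>:<probability>. The cost is
# the negated score (score 0.0 -> cost -0.0), and the probability is the
# probability with which the policy picked that action. A sketch of the rule:
def _example_cb_label(index: int, score: float, probability: float) -> str:
    return f"{index}:{-score}:{probability}"  # e.g. 0:-0.0:1.0
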
@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_w_full_label_w_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    str1 = "0"
    str2 = "1"
    str3 = "2"
    encoded_str1 = rl_chain.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = rl_chain.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = rl_chain.stringify_embedding(list(encoded_keyword + str3))

    ctx_str_1 = "context1"
    encoded_ctx_str_1 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_1))

    named_actions = {"action1": rl_chain.Embed([str1, str2, str3])}
    context = {"context": rl_chain.Embed(ctx_str_1)}
    expected = f"""shared |context {encoded_ctx_str_1} \n0:-0.0:1.0 |action1 {encoded_str1} \n|action1 {encoded_str2} \n|action1 {encoded_str3} """  # noqa: E501
    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context, selected=selected
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_w_full_label_w_embed_and_keep() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    str1 = "0"
    str2 = "1"
    str3 = "2"
    encoded_str1 = rl_chain.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = rl_chain.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = rl_chain.stringify_embedding(list(encoded_keyword + str3))

    ctx_str_1 = "context1"
    encoded_ctx_str_1 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_1))

    named_actions = {"action1": rl_chain.EmbedAndKeep([str1, str2, str3])}
    context = {"context": rl_chain.EmbedAndKeep(ctx_str_1)}
    expected = f"""shared |context {ctx_str_1 + " " + encoded_ctx_str_1} \n0:-0.0:1.0 |action1 {str1 + " " + encoded_str1} \n|action1 {str2 + " " + encoded_str2} \n|action1 {str3 + " " + encoded_str3} """  # noqa: E501
    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context, selected=selected
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected

@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_more_namespaces_no_label_no_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]}
    context = {"context1": "context1", "context2": "context2"}
    expected = """shared |context1 context1 |context2 context2 \n|a 0 |b 0 \n|action1 1 \n|action1 2 """  # noqa: E501
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_more_namespaces_w_label_no_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]}
    context = {"context1": "context1", "context2": "context2"}
    expected = """shared |context1 context1 |context2 context2 \n|a 0 |b 0 \n|action1 1 \n|action1 2 """  # noqa: E501
    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0)
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context, selected=selected
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_more_namespaces_w_full_label_no_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    named_actions = {"action1": [{"a": "0", "b": "0"}, "1", "2"]}
    context = {"context1": "context1", "context2": "context2"}
    expected = """shared |context1 context1 |context2 context2 \n0:-0.0:1.0 |a 0 |b 0 \n|action1 1 \n|action1 2 """  # noqa: E501
    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context, selected=selected
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected

@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )

    str1 = "0"
    str2 = "1"
    str3 = "2"
    encoded_str1 = rl_chain.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = rl_chain.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = rl_chain.stringify_embedding(list(encoded_keyword + str3))

    ctx_str_1 = "context1"
    ctx_str_2 = "context2"
    encoded_ctx_str_1 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_1))
    encoded_ctx_str_2 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_2))

    named_actions = {"action1": rl_chain.Embed([{"a": str1, "b": str1}, str2, str3])}
    context = {
        "context1": rl_chain.Embed(ctx_str_1),
        "context2": rl_chain.Embed(ctx_str_2),
    }
    expected = f"""shared |context1 {encoded_ctx_str_1} |context2 {encoded_ctx_str_2} \n0:-0.0:1.0 |a {encoded_str1} |b {encoded_str1} \n|action1 {encoded_str2} \n|action1 {encoded_str3} """  # noqa: E501

    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context, selected=selected
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_more_namespaces_w_full_label_w_full_embed_and_keep() -> (
    None
):
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )

    str1 = "0"
    str2 = "1"
    str3 = "2"
    encoded_str1 = rl_chain.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = rl_chain.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = rl_chain.stringify_embedding(list(encoded_keyword + str3))

    ctx_str_1 = "context1"
    ctx_str_2 = "context2"
    encoded_ctx_str_1 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_1))
    encoded_ctx_str_2 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_2))

    named_actions = {
        "action1": rl_chain.EmbedAndKeep([{"a": str1, "b": str1}, str2, str3])
    }
    context = {
        "context1": rl_chain.EmbedAndKeep(ctx_str_1),
        "context2": rl_chain.EmbedAndKeep(ctx_str_2),
    }
    expected = f"""shared |context1 {ctx_str_1 + " " + encoded_ctx_str_1} |context2 {ctx_str_2 + " " + encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1 + " " + encoded_str1} |b {str1 + " " + encoded_str1} \n|action1 {str2 + " " + encoded_str2} \n|action1 {str3 + " " + encoded_str3} """  # noqa: E501

    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context, selected=selected
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emb() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )

    str1 = "0"
    str2 = "1"
    str3 = "2"
    encoded_str1 = rl_chain.stringify_embedding(list(encoded_keyword + str1))
    encoded_str3 = rl_chain.stringify_embedding(list(encoded_keyword + str3))

    ctx_str_1 = "context1"
    ctx_str_2 = "context2"
    encoded_ctx_str_2 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_2))

    named_actions = {
        "action1": [
            {"a": str1, "b": rl_chain.Embed(str1)},
            str2,
            rl_chain.Embed(str3),
        ]
    }
    context = {"context1": ctx_str_1, "context2": rl_chain.Embed(ctx_str_2)}
    expected = f"""shared |context1 {ctx_str_1} |context2 {encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1} |b {encoded_str1} \n|action1 {str2} \n|action1 {encoded_str3} """  # noqa: E501

    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context, selected=selected
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_pickbest_textembedder_more_namespaces_w_full_label_w_partial_emakeep() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )

    str1 = "0"
    str2 = "1"
    str3 = "2"
    encoded_str1 = rl_chain.stringify_embedding(list(encoded_keyword + str1))
    encoded_str3 = rl_chain.stringify_embedding(list(encoded_keyword + str3))

    ctx_str_1 = "context1"
    ctx_str_2 = "context2"
    encoded_ctx_str_2 = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str_2))

    named_actions = {
        "action1": [
            {"a": str1, "b": rl_chain.EmbedAndKeep(str1)},
            str2,
            rl_chain.EmbedAndKeep(str3),
        ]
    }
    context = {
        "context1": ctx_str_1,
        "context2": rl_chain.EmbedAndKeep(ctx_str_2),
    }
    expected = f"""shared |context1 {ctx_str_1} |context2 {ctx_str_2 + " " + encoded_ctx_str_2} \n0:-0.0:1.0 |a {str1} |b {str1 + " " + encoded_str1} \n|action1 {str2} \n|action1 {str3 + " " + encoded_str3} """  # noqa: E501

    selected = pick_best_chain.PickBestSelected(index=0, probability=1.0, score=0.0)
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context, selected=selected
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected

@pytest.mark.requires("vowpal_wabbit_next")
def test_raw_features_underscored() -> None:
    feature_embedder = pick_best_chain.PickBestFeatureEmbedder(
        auto_embed=False, model=MockEncoder()
    )
    str1 = "this is a long string"
    str1_underscored = str1.replace(" ", "_")
    encoded_str1 = rl_chain.stringify_embedding(list(encoded_keyword + str1))

    ctx_str = "this is a long context"
    ctx_str_underscored = ctx_str.replace(" ", "_")
    encoded_ctx_str = rl_chain.stringify_embedding(list(encoded_keyword + ctx_str))

    # No embeddings
    named_actions = {"action": [str1]}
    context = {"context": ctx_str}
    expected_no_embed = (
        f"""shared |context {ctx_str_underscored} \n|action {str1_underscored} """
    )
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected_no_embed

    # Just embeddings
    named_actions = {"action": rl_chain.Embed([str1])}
    context = {"context": rl_chain.Embed(ctx_str)}
    expected_embed = f"""shared |context {encoded_ctx_str} \n|action {encoded_str1} """
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected_embed

    # Embeddings and raw features
    named_actions = {"action": rl_chain.EmbedAndKeep([str1])}
    context = {"context": rl_chain.EmbedAndKeep(ctx_str)}
    expected_embed_and_keep = f"""shared |context {ctx_str_underscored + " " + encoded_ctx_str} \n|action {str1_underscored + " " + encoded_str1} """  # noqa: E501
    event = pick_best_chain.PickBestEvent(
        inputs={}, to_select_from=named_actions, based_on=context
    )
    vw_ex_str = feature_embedder.format(event)
    assert vw_ex_str == expected_embed_and_keep
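# Why the underscores matter: VW's text format treats whitespace as a feature
# separator, so "this is a long string" would otherwise become five unrelated
# features. Replacing spaces with underscores keeps each raw value a single
# feature. A sketch of the sanitization rule the embedder appears to apply:
def _example_sanitize_feature(raw: str) -> str:
    return raw.replace(" ", "_")  # "a long string" -> "a_long_string"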
@@ -0,0 +1,422 @@
from typing import List, Union

import pytest
from test_utils import MockEncoder

import langchain_experimental.rl_chain.base as base

encoded_keyword = "[encoded]"


@pytest.mark.requires("vowpal_wabbit_next")
def test_simple_context_str_no_emb() -> None:
    expected = [{"a_namespace": "test"}]
    assert base.embed("test", MockEncoder(), "a_namespace") == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_simple_context_str_w_emb() -> None:
    str1 = "test"
    encoded_str1 = base.stringify_embedding(list(encoded_keyword + str1))
    expected = [{"a_namespace": encoded_str1}]
    assert base.embed(base.Embed(str1), MockEncoder(), "a_namespace") == expected
    expected_embed_and_keep = [{"a_namespace": str1 + " " + encoded_str1}]
    assert (
        base.embed(base.EmbedAndKeep(str1), MockEncoder(), "a_namespace")
        == expected_embed_and_keep
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_simple_context_str_w_nested_emb() -> None:
    # nested embeddings, innermost wins
    str1 = "test"
    encoded_str1 = base.stringify_embedding(list(encoded_keyword + str1))
    expected = [{"a_namespace": encoded_str1}]
    assert (
        base.embed(base.EmbedAndKeep(base.Embed(str1)), MockEncoder(), "a_namespace")
        == expected
    )

    expected2 = [{"a_namespace": str1 + " " + encoded_str1}]
    assert (
        base.embed(base.Embed(base.EmbedAndKeep(str1)), MockEncoder(), "a_namespace")
        == expected2
    )

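# To summarize the semantics exercised above: base.Embed replaces a raw value
# with its stringified embedding, base.EmbedAndKeep emits the raw value
# followed by the embedding, and when the markers are nested the innermost
# marker wins. A compact restatement of the expected outputs, assuming
# MockEncoder as the model:
def _example_embed_semantics() -> None:
    str1 = "test"
    encoded = base.stringify_embedding(list(encoded_keyword + str1))
    assert base.embed(base.Embed(str1), MockEncoder(), "ns") == [{"ns": encoded}]
    assert base.embed(base.EmbedAndKeep(str1), MockEncoder(), "ns") == [
        {"ns": str1 + " " + encoded}
    ]
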
@pytest.mark.requires("vowpal_wabbit_next")
def test_context_w_namespace_no_emb() -> None:
    expected = [{"test_namespace": "test"}]
    assert base.embed({"test_namespace": "test"}, MockEncoder()) == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_context_w_namespace_w_emb() -> None:
    str1 = "test"
    encoded_str1 = base.stringify_embedding(list(encoded_keyword + str1))
    expected = [{"test_namespace": encoded_str1}]
    assert base.embed({"test_namespace": base.Embed(str1)}, MockEncoder()) == expected
    expected_embed_and_keep = [{"test_namespace": str1 + " " + encoded_str1}]
    assert (
        base.embed({"test_namespace": base.EmbedAndKeep(str1)}, MockEncoder())
        == expected_embed_and_keep
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_context_w_namespace_w_emb2() -> None:
    str1 = "test"
    encoded_str1 = base.stringify_embedding(list(encoded_keyword + str1))
    expected = [{"test_namespace": encoded_str1}]
    assert base.embed(base.Embed({"test_namespace": str1}), MockEncoder()) == expected
    expected_embed_and_keep = [{"test_namespace": str1 + " " + encoded_str1}]
    assert (
        base.embed(base.EmbedAndKeep({"test_namespace": str1}), MockEncoder())
        == expected_embed_and_keep
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_context_w_namespace_w_some_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    encoded_str2 = base.stringify_embedding(list(encoded_keyword + str2))
    expected = [{"test_namespace": str1, "test_namespace2": encoded_str2}]
    assert (
        base.embed(
            {"test_namespace": str1, "test_namespace2": base.Embed(str2)}, MockEncoder()
        )
        == expected
    )
    expected_embed_and_keep = [
        {
            "test_namespace": str1,
            "test_namespace2": str2 + " " + encoded_str2,
        }
    ]
    assert (
        base.embed(
            {"test_namespace": str1, "test_namespace2": base.EmbedAndKeep(str2)},
            MockEncoder(),
        )
        == expected_embed_and_keep
    )

@pytest.mark.requires("vowpal_wabbit_next")
def test_simple_action_strlist_no_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    str3 = "test3"
    expected = [{"a_namespace": str1}, {"a_namespace": str2}, {"a_namespace": str3}]
    to_embed: List[Union[str, base._Embed]] = [str1, str2, str3]
    assert base.embed(to_embed, MockEncoder(), "a_namespace") == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_simple_action_strlist_w_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    str3 = "test3"
    encoded_str1 = base.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = base.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = base.stringify_embedding(list(encoded_keyword + str3))
    expected = [
        {"a_namespace": encoded_str1},
        {"a_namespace": encoded_str2},
        {"a_namespace": encoded_str3},
    ]
    assert (
        base.embed(base.Embed([str1, str2, str3]), MockEncoder(), "a_namespace")
        == expected
    )
    expected_embed_and_keep = [
        {"a_namespace": str1 + " " + encoded_str1},
        {"a_namespace": str2 + " " + encoded_str2},
        {"a_namespace": str3 + " " + encoded_str3},
    ]
    assert (
        base.embed(base.EmbedAndKeep([str1, str2, str3]), MockEncoder(), "a_namespace")
        == expected_embed_and_keep
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_simple_action_strlist_w_some_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    str3 = "test3"
    encoded_str2 = base.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = base.stringify_embedding(list(encoded_keyword + str3))
    expected = [
        {"a_namespace": str1},
        {"a_namespace": encoded_str2},
        {"a_namespace": encoded_str3},
    ]
    assert (
        base.embed(
            [str1, base.Embed(str2), base.Embed(str3)], MockEncoder(), "a_namespace"
        )
        == expected
    )
    expected_embed_and_keep = [
        {"a_namespace": str1},
        {"a_namespace": str2 + " " + encoded_str2},
        {"a_namespace": str3 + " " + encoded_str3},
    ]
    assert (
        base.embed(
            [str1, base.EmbedAndKeep(str2), base.EmbedAndKeep(str3)],
            MockEncoder(),
            "a_namespace",
        )
        == expected_embed_and_keep
    )

@pytest.mark.requires("vowpal_wabbit_next")
def test_action_w_namespace_no_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    str3 = "test3"
    expected = [
        {"test_namespace": str1},
        {"test_namespace": str2},
        {"test_namespace": str3},
    ]
    assert (
        base.embed(
            [
                {"test_namespace": str1},
                {"test_namespace": str2},
                {"test_namespace": str3},
            ],
            MockEncoder(),
        )
        == expected
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_action_w_namespace_w_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    str3 = "test3"
    encoded_str1 = base.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = base.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = base.stringify_embedding(list(encoded_keyword + str3))
    expected = [
        {"test_namespace": encoded_str1},
        {"test_namespace": encoded_str2},
        {"test_namespace": encoded_str3},
    ]
    assert (
        base.embed(
            [
                {"test_namespace": base.Embed(str1)},
                {"test_namespace": base.Embed(str2)},
                {"test_namespace": base.Embed(str3)},
            ],
            MockEncoder(),
        )
        == expected
    )
    expected_embed_and_keep = [
        {"test_namespace": str1 + " " + encoded_str1},
        {"test_namespace": str2 + " " + encoded_str2},
        {"test_namespace": str3 + " " + encoded_str3},
    ]
    assert (
        base.embed(
            [
                {"test_namespace": base.EmbedAndKeep(str1)},
                {"test_namespace": base.EmbedAndKeep(str2)},
                {"test_namespace": base.EmbedAndKeep(str3)},
            ],
            MockEncoder(),
        )
        == expected_embed_and_keep
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_action_w_namespace_w_emb2() -> None:
    str1 = "test1"
    str2 = "test2"
    str3 = "test3"
    encoded_str1 = base.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = base.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = base.stringify_embedding(list(encoded_keyword + str3))
    expected = [
        {"test_namespace1": encoded_str1},
        {"test_namespace2": encoded_str2},
        {"test_namespace3": encoded_str3},
    ]
    assert (
        base.embed(
            base.Embed(
                [
                    {"test_namespace1": str1},
                    {"test_namespace2": str2},
                    {"test_namespace3": str3},
                ]
            ),
            MockEncoder(),
        )
        == expected
    )
    expected_embed_and_keep = [
        {"test_namespace1": str1 + " " + encoded_str1},
        {"test_namespace2": str2 + " " + encoded_str2},
        {"test_namespace3": str3 + " " + encoded_str3},
    ]
    assert (
        base.embed(
            base.EmbedAndKeep(
                [
                    {"test_namespace1": str1},
                    {"test_namespace2": str2},
                    {"test_namespace3": str3},
                ]
            ),
            MockEncoder(),
        )
        == expected_embed_and_keep
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_action_w_namespace_w_some_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    str3 = "test3"
    encoded_str2 = base.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = base.stringify_embedding(list(encoded_keyword + str3))
    expected = [
        {"test_namespace": str1},
        {"test_namespace": encoded_str2},
        {"test_namespace": encoded_str3},
    ]
    assert (
        base.embed(
            [
                {"test_namespace": str1},
                {"test_namespace": base.Embed(str2)},
                {"test_namespace": base.Embed(str3)},
            ],
            MockEncoder(),
        )
        == expected
    )
    expected_embed_and_keep = [
        {"test_namespace": str1},
        {"test_namespace": str2 + " " + encoded_str2},
        {"test_namespace": str3 + " " + encoded_str3},
    ]
    assert (
        base.embed(
            [
                {"test_namespace": str1},
                {"test_namespace": base.EmbedAndKeep(str2)},
                {"test_namespace": base.EmbedAndKeep(str3)},
            ],
            MockEncoder(),
        )
        == expected_embed_and_keep
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_action_w_namespace_w_emb_w_more_than_one_item_in_first_dict() -> None:
    str1 = "test1"
    str2 = "test2"
    str3 = "test3"
    encoded_str1 = base.stringify_embedding(list(encoded_keyword + str1))
    encoded_str2 = base.stringify_embedding(list(encoded_keyword + str2))
    encoded_str3 = base.stringify_embedding(list(encoded_keyword + str3))
    expected = [
        {"test_namespace": encoded_str1, "test_namespace2": str1},
        {"test_namespace": encoded_str2, "test_namespace2": str2},
        {"test_namespace": encoded_str3, "test_namespace2": str3},
    ]
    assert (
        base.embed(
            [
                {"test_namespace": base.Embed(str1), "test_namespace2": str1},
                {"test_namespace": base.Embed(str2), "test_namespace2": str2},
                {"test_namespace": base.Embed(str3), "test_namespace2": str3},
            ],
            MockEncoder(),
        )
        == expected
    )
    expected_embed_and_keep = [
        {
            "test_namespace": str1 + " " + encoded_str1,
            "test_namespace2": str1,
        },
        {
            "test_namespace": str2 + " " + encoded_str2,
            "test_namespace2": str2,
        },
        {
            "test_namespace": str3 + " " + encoded_str3,
            "test_namespace2": str3,
        },
    ]
    assert (
        base.embed(
            [
                {"test_namespace": base.EmbedAndKeep(str1), "test_namespace2": str1},
                {"test_namespace": base.EmbedAndKeep(str2), "test_namespace2": str2},
                {"test_namespace": base.EmbedAndKeep(str3), "test_namespace2": str3},
            ],
            MockEncoder(),
        )
        == expected_embed_and_keep
    )

@pytest.mark.requires("vowpal_wabbit_next")
def test_one_namespace_w_list_of_features_no_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    expected = [{"test_namespace": [str1, str2]}]
    assert base.embed({"test_namespace": [str1, str2]}, MockEncoder()) == expected


@pytest.mark.requires("vowpal_wabbit_next")
def test_one_namespace_w_list_of_features_w_some_emb() -> None:
    str1 = "test1"
    str2 = "test2"
    encoded_str2 = base.stringify_embedding(list(encoded_keyword + str2))
    expected = [{"test_namespace": [str1, encoded_str2]}]
    assert (
        base.embed({"test_namespace": [str1, base.Embed(str2)]}, MockEncoder())
        == expected
    )


@pytest.mark.requires("vowpal_wabbit_next")
def test_nested_list_features_throws() -> None:
    with pytest.raises(ValueError):
        base.embed({"test_namespace": [[1, 2], [3, 4]]}, MockEncoder())


@pytest.mark.requires("vowpal_wabbit_next")
def test_dict_in_list_throws() -> None:
    with pytest.raises(ValueError):
        base.embed({"test_namespace": [{"a": 1}, {"b": 2}]}, MockEncoder())


@pytest.mark.requires("vowpal_wabbit_next")
def test_nested_dict_throws() -> None:
    with pytest.raises(ValueError):
        base.embed({"test_namespace": {"a": {"b": 1}}}, MockEncoder())


@pytest.mark.requires("vowpal_wabbit_next")
def test_list_of_tuples_throws() -> None:
    with pytest.raises(ValueError):
        base.embed({"test_namespace": [("a", 1), ("b", 2)]}, MockEncoder())
15
libs/experimental/tests/unit_tests/rl_chain/test_utils.py
Normal file
@@ -0,0 +1,15 @@
from typing import Any, List


class MockEncoder:
    def encode(self, to_encode: str) -> str:
        return "[encoded]" + to_encode


class MockEncoderReturnsList:
    def encode(self, to_encode: Any) -> List:
        if isinstance(to_encode, str):
            return [1.0, 2.0]
        elif isinstance(to_encode, List):
            return [[1.0, 2.0] for _ in range(len(to_encode))]
        raise ValueError("Invalid input type for unit test")
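# How the mocks feed the expectations in the rl_chain tests: MockEncoder
# prefixes its input with "[encoded]", and stringify_embedding (in
# langchain_experimental.rl_chain.base) then joins the elements of the
# "embedding" into a single space-separated feature string. A rough sketch of
# the round trip (the exact join format is an assumption for illustration):
#
#     MockEncoder().encode("ab")                # -> "[encoded]ab"
#     list("[encoded]ab")                       # -> ["[", "e", ..., "a", "b"]
#     stringify_embedding(list("[encoded]ab"))  # -> "[ e n c o d e d ] a b"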
@@ -2,6 +2,8 @@ from typing import Iterator, List

import pytest

from . import is_libcublas_available


@pytest.fixture(scope="module", autouse=True)
def check_spacy_model() -> Iterator[None]:
@@ -12,6 +14,13 @@ def check_spacy_model() -> Iterator[None]:
    yield


@pytest.fixture(scope="module", autouse=True)
def check_libcublas() -> Iterator[None]:
    if not is_libcublas_available():
        pytest.skip(reason="libcublas.so is not available")
    yield


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@pytest.mark.parametrize(
    "analyzed_fields,should_contain",
@@ -39,6 +48,23 @@ def test_anonymize_multiple() -> None:
    assert phrase not in anonymized_text


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_check_instances() -> None:
    """Test anonymizing multiple items in a sentence"""
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    text = (
        "This is John Smith. John Smith works in a bakery." "John Smith is a good guy"
    )
    anonymizer = PresidioAnonymizer(["PERSON"], faker_seed=42)
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count("Connie Lawrence") == 3

    # New name should be generated
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count("Connie Lawrence") == 0


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
    """Test anonymize a name with a custom operator"""
@@ -46,13 +72,13 @@ def test_anonymize_with_custom_operator() -> None:

    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "<name>"})}
    custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "NAME"})}
    anonymizer = PresidioAnonymizer(operators=custom_operator)

    text = "Jane Doe was here."

    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == "<name> was here."
    assert anonymized_text == "NAME was here."


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@@ -82,3 +108,21 @@ def test_add_recognizer_operator() -> None:
    anonymizer.add_operators(custom_operator)
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == "Dear Jane Doe was here."


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_non_faker_values() -> None:
    """Test anonymizing multiple items in a sentence without faker values"""
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    text = (
        "My name is John Smith. Your name is Adam Smith. Her name is Jane Smith."
        "Our names are: John Smith, Adam Smith, Jane Smith."
    )
    expected_result = (
        "My name is <PERSON>. Your name is <PERSON_2>. Her name is <PERSON_3>."
        "Our names are: <PERSON>, <PERSON_2>, <PERSON_3>."
    )
    anonymizer = PresidioAnonymizer(add_default_faker_operators=False)
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == expected_result

82
libs/experimental/tests/unit_tests/test_llm_symbolic_math.py
Normal file
@@ -0,0 +1,82 @@
"""Test LLM Math functionality."""

import pytest

from langchain_experimental.llm_symbolic_math.base import (
    LLMSymbolicMathChain,
)
from langchain_experimental.llm_symbolic_math.prompt import (
    _PROMPT_TEMPLATE,
)
from tests.unit_tests.fake_llm import FakeLLM

try:
    import sympy
except ImportError:
    pytest.skip("sympy not installed", allow_module_level=True)


@pytest.fixture
def fake_llm_symbolic_math_chain() -> LLMSymbolicMathChain:
    """Fake LLM Math chain for testing."""
    queries = {
        _PROMPT_TEMPLATE.format(question="What is 1 plus 1?"): "Answer: 2",
        _PROMPT_TEMPLATE.format(
            question="What is the square root of 2?"
        ): "```text\nsqrt(2)\n```",
        _PROMPT_TEMPLATE.format(
            question="What is the limit of sin(x) / x as x goes to 0?"
        ): "```text\nlimit(sin(x)/x,x,0)\n```",
        _PROMPT_TEMPLATE.format(
            question="What is the integral of e^-x from 0 to infinity?"
        ): "```text\nintegrate(exp(-x), (x, 0, oo))\n```",
        _PROMPT_TEMPLATE.format(
            question="What are the solutions to this equation x**2 - x?"
        ): "```text\nsolveset(x**2 - x, x)\n```",
        _PROMPT_TEMPLATE.format(question="foo"): "foo",
    }
    fake_llm = FakeLLM(queries=queries)
    return LLMSymbolicMathChain.from_llm(fake_llm, input_key="q", output_key="a")


def test_simple_question(fake_llm_symbolic_math_chain: LLMSymbolicMathChain) -> None:
    """Test simple question that should not need python."""
    question = "What is 1 plus 1?"
    output = fake_llm_symbolic_math_chain.run(question)
    assert output == "Answer: 2"


def test_root_question(fake_llm_symbolic_math_chain: LLMSymbolicMathChain) -> None:
    """Test irrational number that should need sympy."""
    question = "What is the square root of 2?"
    output = fake_llm_symbolic_math_chain.run(question)
    assert output == f"Answer: {sympy.sqrt(2)}"


def test_limit_question(fake_llm_symbolic_math_chain: LLMSymbolicMathChain) -> None:
    """Test question about limits that needs sympy"""
    question = "What is the limit of sin(x) / x as x goes to 0?"
    output = fake_llm_symbolic_math_chain.run(question)
    assert output == "Answer: 1"


def test_integration_question(
    fake_llm_symbolic_math_chain: LLMSymbolicMathChain,
) -> None:
    """Test question about integration that needs sympy"""
    question = "What is the integral of e^-x from 0 to infinity?"
    output = fake_llm_symbolic_math_chain.run(question)
    assert output == "Answer: 1"


def test_solver_question(fake_llm_symbolic_math_chain: LLMSymbolicMathChain) -> None:
    """Test question about solving algebraic equations that needs sympy"""
    question = "What are the solutions to this equation x**2 - x?"
    output = fake_llm_symbolic_math_chain.run(question)
    assert output == "Answer: {0, 1}"


def test_error(fake_llm_symbolic_math_chain: LLMSymbolicMathChain) -> None:
    """Test question that raises error."""
    with pytest.raises(ValueError):
        fake_llm_symbolic_math_chain.run("foo")
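# A rough sketch (an assumption, not the chain's actual implementation) of
# what happens to an answer like "```text\nsqrt(2)\n```": the expression is
# pulled out of the fenced block and handed to sympy for evaluation, which is
# why the tests above can assert evaluated results such as "Answer: 1".
def _example_evaluate_fenced_expression(llm_output: str) -> str:
    import re

    import sympy

    match = re.search(r"```text\n(.*?)\n```", llm_output, re.DOTALL)
    if match is None:
        raise ValueError(f"unknown format from LLM: {llm_output}")
    return f"Answer: {sympy.sympify(match.group(1))}"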
@@ -3,6 +3,8 @@ from typing import Iterator, List

import pytest

from . import is_libcublas_available


@pytest.fixture(scope="module", autouse=True)
def check_spacy_model() -> Iterator[None]:
@@ -13,6 +15,13 @@ def check_spacy_model() -> Iterator[None]:
    yield


@pytest.fixture(scope="module", autouse=True)
def check_libcublas() -> Iterator[None]:
    if not is_libcublas_available():
        pytest.skip(reason="libcublas.so is not available")
    yield


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@pytest.mark.parametrize(
    "analyzed_fields,should_contain",
@@ -40,6 +49,32 @@ def test_anonymize_multiple() -> None:
    assert phrase not in anonymized_text


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_check_instances() -> None:
    """Test anonymizing multiple items in a sentence"""
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    text = (
        "This is John Smith. John Smith works in a bakery." "John Smith is a good guy"
    )
    anonymizer = PresidioReversibleAnonymizer(["PERSON"], faker_seed=42)
    anonymized_text = anonymizer.anonymize(text)
    persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys())
    assert len(persons) == 1

    anonymized_name = persons[0]
    assert anonymized_text.count(anonymized_name) == 3

    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text.count(anonymized_name) == 3
    assert anonymizer.deanonymizer_mapping["PERSON"][anonymized_name] == "John Smith"

    text = "This is Jane Smith"
    anonymized_text = anonymizer.anonymize(text)
    persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys())
    assert len(persons) == 2


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
    """Test anonymize a name with a custom operator"""
@@ -47,13 +82,13 @@ def test_anonymize_with_custom_operator() -> None:

    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "<name>"})}
    custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "NAME"})}
    anonymizer = PresidioReversibleAnonymizer(operators=custom_operator)

    text = "Jane Doe was here."

    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == "<name> was here."
    assert anonymized_text == "NAME was here."


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@@ -79,6 +114,8 @@ def test_add_recognizer_operator() -> None:
    assert anonymized_text == "<TITLE> Jane Doe was here."

    # anonymizing with custom recognizer and operator
    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[])
    anonymizer.add_recognizer(custom_recognizer)
    custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})}
    anonymizer.add_operators(custom_operator)
    anonymized_text = anonymizer.anonymize(text)
@@ -152,3 +189,21 @@ def test_save_load_deanonymizer_mapping() -> None:

    finally:
        os.remove("test_file.json")


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_non_faker_values() -> None:
    """Test anonymizing multiple items in a sentence without faker values"""
    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

    text = (
        "My name is John Smith. Your name is Adam Smith. Her name is Jane Smith."
        "Our names are: John Smith, Adam Smith, Jane Smith."
    )
    expected_result = (
        "My name is <PERSON>. Your name is <PERSON_2>. Her name is <PERSON_3>."
        "Our names are: <PERSON>, <PERSON_2>, <PERSON_3>."
    )
    anonymizer = PresidioReversibleAnonymizer(add_default_faker_operators=False)
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == expected_result

@@ -145,10 +145,13 @@ class BaseSingleActionAgent(BaseModel):
    def dict(self, **kwargs: Any) -> Dict:
        """Return dictionary representation of agent."""
        _dict = super().dict()
        _type = self._agent_type
        try:
            _type = self._agent_type
        except NotImplementedError:
            _type = None
        if isinstance(_type, AgentType):
            _dict["_type"] = str(_type.value)
        else:
        elif _type is not None:
            _dict["_type"] = _type
        return _dict

@@ -175,6 +178,8 @@

        # Fetch dictionary to save
        agent_dict = self.dict()
        if "_type" not in agent_dict:
            raise NotImplementedError(f"Agent {self} does not support saving")

        if save_path.suffix == ".json":
            with open(file_path, "w") as f:

@@ -0,0 +1,101 @@
from typing import Dict, List

from langchain.agents.agent_toolkits.base import BaseToolkit
from langchain.tools import BaseTool
from langchain.tools.clickup.prompt import (
    CLICKUP_FOLDER_CREATE_PROMPT,
    CLICKUP_GET_ALL_TEAMS_PROMPT,
    CLICKUP_GET_FOLDERS_PROMPT,
    CLICKUP_GET_LIST_PROMPT,
    CLICKUP_GET_SPACES_PROMPT,
    CLICKUP_GET_TASK_ATTRIBUTE_PROMPT,
    CLICKUP_GET_TASK_PROMPT,
    CLICKUP_LIST_CREATE_PROMPT,
    CLICKUP_TASK_CREATE_PROMPT,
    CLICKUP_UPDATE_TASK_ASSIGNEE_PROMPT,
    CLICKUP_UPDATE_TASK_PROMPT,
)
from langchain.tools.clickup.tool import ClickupAction
from langchain.utilities.clickup import ClickupAPIWrapper


class ClickupToolkit(BaseToolkit):
    """Clickup Toolkit."""

    tools: List[BaseTool] = []

    @classmethod
    def from_clickup_api_wrapper(
        cls, clickup_api_wrapper: ClickupAPIWrapper
    ) -> "ClickupToolkit":
        operations: List[Dict] = [
            {
                "mode": "get_task",
                "name": "Get task",
                "description": CLICKUP_GET_TASK_PROMPT,
            },
            {
                "mode": "get_task_attribute",
                "name": "Get task attribute",
                "description": CLICKUP_GET_TASK_ATTRIBUTE_PROMPT,
            },
            {
                "mode": "get_teams",
                "name": "Get Teams",
                "description": CLICKUP_GET_ALL_TEAMS_PROMPT,
            },
            {
                "mode": "create_task",
                "name": "Create Task",
                "description": CLICKUP_TASK_CREATE_PROMPT,
            },
            {
                "mode": "create_list",
                "name": "Create List",
                "description": CLICKUP_LIST_CREATE_PROMPT,
            },
            {
                "mode": "create_folder",
                "name": "Create Folder",
                "description": CLICKUP_FOLDER_CREATE_PROMPT,
            },
            {
                "mode": "get_list",
                "name": "Get all lists in the space",
                "description": CLICKUP_GET_LIST_PROMPT,
            },
            {
                "mode": "get_folders",
                "name": "Get all folders in the workspace",
                "description": CLICKUP_GET_FOLDERS_PROMPT,
            },
            {
                "mode": "get_spaces",
                "name": "Get all spaces in the workspace",
                "description": CLICKUP_GET_SPACES_PROMPT,
            },
            {
                "mode": "update_task",
                "name": "Update task",
                "description": CLICKUP_UPDATE_TASK_PROMPT,
            },
            {
                "mode": "update_task_assignees",
                "name": "Update task assignees",
                "description": CLICKUP_UPDATE_TASK_ASSIGNEE_PROMPT,
            },
        ]
        tools = [
            ClickupAction(
                name=action["name"],
                description=action["description"],
                mode=action["mode"],
                api_wrapper=clickup_api_wrapper,
            )
            for action in operations
        ]
        return cls(tools=tools)

    def get_tools(self) -> List[BaseTool]:
        """Get the tools in the toolkit."""
        return self.tools

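As a quick orientation for the new toolkit, here is a minimal wiring sketch. It is not part of the changeset: the toolkit module path, the access token, and the team id are assumed for illustration, and ClickupAPIWrapper validates credentials on construction.

from langchain.agents import AgentType, initialize_agent
from langchain.agents.agent_toolkits.clickup.toolkit import ClickupToolkit
from langchain.llms import OpenAI
from langchain.utilities.clickup import ClickupAPIWrapper

# Placeholder credentials; one ClickupAction is created per operation above.
wrapper = ClickupAPIWrapper(access_token="<token>", team_id="<team>")
toolkit = ClickupToolkit.from_clickup_api_wrapper(wrapper)
agent = initialize_agent(
    toolkit.get_tools(),
    OpenAI(temperature=0),
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
)
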
@@ -1,17 +1,47 @@
 """Module definitions of agent types together with corresponding agents."""
 from enum import Enum


 class AgentType(str, Enum):
-    """Enumerator with the Agent types."""
+    """An enum for agent types.
+
+    See documentation: https://python.langchain.com/docs/modules/agents/agent_types/
+    """

     ZERO_SHOT_REACT_DESCRIPTION = "zero-shot-react-description"
+    """A zero shot agent that does a reasoning step before acting."""

     REACT_DOCSTORE = "react-docstore"
+    """A zero shot agent that does a reasoning step before acting.
+
+    This agent has access to a document store that allows it to look up
+    relevant information to answering the question.
+    """

     SELF_ASK_WITH_SEARCH = "self-ask-with-search"
+    """An agent that breaks down a complex question into a series of simpler questions.
+
+    This agent uses a search tool to look up answers to the simpler questions
+    in order to answer the original complex question.
+    """
     CONVERSATIONAL_REACT_DESCRIPTION = "conversational-react-description"
     CHAT_ZERO_SHOT_REACT_DESCRIPTION = "chat-zero-shot-react-description"
+    """A zero shot agent that does a reasoning step before acting.
+
+    This agent is designed to be used in conjunction with chat models.
+    """

     CHAT_CONVERSATIONAL_REACT_DESCRIPTION = "chat-conversational-react-description"

     STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION = (
         "structured-chat-zero-shot-react-description"
     )
+    """A zero-shot react agent optimized for chat models.
+
+    This agent is capable of invoking tools that have multiple inputs.
+    """

     OPENAI_FUNCTIONS = "openai-functions"
+    """An agent optimized for using open AI functions."""

     OPENAI_MULTI_FUNCTIONS = "openai-multi-functions"

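Since AgentType subclasses str, each member compares equal to its raw string value, so existing string-based configuration keeps working unchanged; a quick sanity check:

from langchain.agents.agent_types import AgentType

assert AgentType.ZERO_SHOT_REACT_DESCRIPTION == "zero-shot-react-description"
assert isinstance(AgentType.OPENAI_FUNCTIONS, str)
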
@@ -29,11 +29,20 @@ class MRKLOutputParser(AgentOutputParser):
             r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
         )
         action_match = re.search(regex, text, re.DOTALL)
-        if action_match:
-            if includes_answer:
+        if action_match and includes_answer:
+            if text.find(FINAL_ANSWER_ACTION) < text.find(action_match.group(0)):
+                # if final answer is before the hallucination, return final answer
+                start_index = text.find(FINAL_ANSWER_ACTION) + len(FINAL_ANSWER_ACTION)
+                end_index = text.find("\n\n", start_index)
+                return AgentFinish(
+                    {"output": text[start_index:end_index].strip()}, text[:end_index]
+                )
+            else:
                 raise OutputParserException(
                     f"{FINAL_ANSWER_AND_PARSABLE_ACTION_ERROR_MESSAGE}: {text}"
                 )
+
+        if action_match:
             action = action_match.group(1).strip()
             action_input = action_match.group(2)
             tool_input = action_input.strip(" ")

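To illustrate the new precedence rule (a sketch, not from the changeset): when a final answer precedes a hallucinated action in the same completion, the parser now returns the answer instead of raising.

from langchain.agents.mrkl.output_parser import MRKLOutputParser

text = "Final Answer: 42\n\nAction: Search\nAction Input: what is the answer?"
result = MRKLOutputParser().parse(text)
print(result.return_values)  # {'output': '42'}
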
@@ -45,12 +45,21 @@ def _parse_ai_message(message: BaseMessage) -> Union[List[AgentAction], AgentFin

     if function_call:
         try:
-            tools = json.loads(function_call["arguments"])["actions"]
+            arguments = json.loads(function_call["arguments"])
         except JSONDecodeError:
             raise OutputParserException(
                 f"Could not parse tool input: {function_call} because "
                 f"the `arguments` is not valid JSON."
             )
+
+        try:
+            tools = arguments["actions"]
+        except (TypeError, KeyError):
+            raise OutputParserException(
+                f"Could not parse tool input: {function_call} because "
+                f"the `arguments` JSON does not contain `actions` key."
+            )
+
         final_tools: List[AgentAction] = []
         for tool_schema in tools:
             _tool_input = tool_schema["action"]

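The expected shape of the function-call payload, per the parsing above (a toy value for illustration): `arguments` must decode to a JSON object whose `actions` key holds the list of tool invocations, otherwise the new except branch raises OutputParserException.

import json

# Well-formed payload: valid JSON carrying an "actions" list.
arguments = json.loads('{"actions": [{"action": {"query": "weather"}}]}')
tools = arguments["actions"]
assert tools[0]["action"] == {"query": "weather"}
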
@@ -25,7 +25,6 @@
 import hashlib
 import inspect
 import json
 import logging
-import warnings
 from datetime import timedelta
 from functools import lru_cache
 from typing import (

@@ -54,7 +53,7 @@ except ImportError:
 from langchain.llms.base import LLM, get_prompts
 from langchain.load.dump import dumps
 from langchain.load.load import loads
-from langchain.schema import ChatGeneration, Generation
+from langchain.schema import Generation
 from langchain.schema.cache import RETURN_VAL_TYPE, BaseCache
 from langchain.schema.embeddings import Embeddings
 from langchain.utils import get_from_env

@@ -306,7 +305,18 @@ class RedisCache(BaseCache):
         results = self.redis.hgetall(self._key(prompt, llm_string))
         if results:
             for _, text in results.items():
-                generations.append(Generation(text=text))
+                try:
+                    generations.append(loads(text))
+                except Exception:
+                    logger.warning(
+                        "Retrieving a cache value that could not be deserialized "
+                        "properly. This is likely due to the cache being in an "
+                        "older format. Please recreate your cache to avoid this "
+                        "error."
+                    )
+                    # In a previous life we stored the raw text directly
+                    # in the table, so assume it's in that format.
+                    generations.append(Generation(text=text))
         return generations if generations else None

     def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None:

@@ -317,12 +327,6 @@ class RedisCache(BaseCache):
                     "RedisCache only supports caching of normal LLM generations, "
                     f"got {type(gen)}"
                 )
-            if isinstance(gen, ChatGeneration):
-                warnings.warn(
-                    "NOTE: Generation has not been cached. RedisCache does not"
-                    " support caching ChatModel outputs."
-                )
-                return
         # Write to a Redis HASH
         key = self._key(prompt, llm_string)

@@ -330,7 +334,7 @@ class RedisCache(BaseCache):
             pipe.hset(
                 key,
                 mapping={
-                    str(idx): generation.text
+                    str(idx): dumps(generation)
                     for idx, generation in enumerate(return_val)
                 },
             )

@@ -441,9 +445,20 @@ class RedisSemanticCache(BaseCache):
         )
         if results:
             for document in results:
-                generations.extend(
-                    _load_generations_from_json(document.metadata["return_val"])
-                )
+                try:
+                    generations.extend(loads(document.metadata["return_val"]))
+                except Exception:
+                    logger.warning(
+                        "Retrieving a cache value that could not be deserialized "
+                        "properly. This is likely due to the cache being in an "
+                        "older format. Please recreate your cache to avoid this "
+                        "error."
+                    )
+                    # In a previous life we stored the raw text directly
+                    # in the table, so assume it's in that format.
+                    generations.extend(
+                        _load_generations_from_json(document.metadata["return_val"])
+                    )
         return generations if generations else None

     def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None:

@@ -454,18 +469,12 @@ class RedisSemanticCache(BaseCache):
                     "RedisSemanticCache only supports caching of "
                     f"normal LLM generations, got {type(gen)}"
                 )
-            if isinstance(gen, ChatGeneration):
-                warnings.warn(
-                    "NOTE: Generation has not been cached. RedisSentimentCache does not"
-                    " support caching ChatModel outputs."
-                )
-                return
         llm_cache = self._get_llm_cache(llm_string)
-        _dump_generations_to_json([g for g in return_val])

         metadata = {
             "llm_string": llm_string,
             "prompt": prompt,
-            "return_val": _dump_generations_to_json([g for g in return_val]),
+            "return_val": dumps([g for g in return_val]),
         }
         llm_cache.add_texts(texts=[prompt], metadatas=[metadata])

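The migration hinges on langchain's own serializer: values are now written with `dumps` and read back with `loads`, falling back to the legacy raw-text format on failure. A round-trip sketch (assuming `Generation` is registered with the serializer in this version):

from langchain.load.dump import dumps
from langchain.load.load import loads
from langchain.schema import Generation

blob = dumps(Generation(text="hello"))
assert loads(blob).text == "hello"
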
@@ -250,7 +250,7 @@ class BaseCallbackHandler(
     CallbackManagerMixin,
     RunManagerMixin,
 ):
-    """Base callback handler that can be used to handle callbacks from langchain."""
+    """Base callback handler that handles callbacks from LangChain."""

     raise_error: bool = False

@@ -288,7 +288,7 @@ class BaseCallbackHandler(


 class AsyncCallbackHandler(BaseCallbackHandler):
-    """Async callback handler that can be used to handle callbacks from langchain."""
+    """Async callback handler that handles callbacks from LangChain."""

     async def on_llm_start(
         self,

@@ -17,6 +17,8 @@ from langchain.schema import (


 class LabelStudioMode(Enum):
+    """Label Studio mode enumerator."""
+
     PROMPT = "prompt"
     CHAT = "chat"

@@ -24,6 +26,13 @@ class LabelStudioMode(Enum):
 def get_default_label_configs(
     mode: Union[str, LabelStudioMode]
 ) -> Tuple[str, LabelStudioMode]:
+    """Get default Label Studio configs for the given mode.
+
+    Parameters:
+        mode: Label Studio mode ("prompt" or "chat")
+
+    Returns: Tuple of Label Studio config and mode
+    """
     _default_label_configs = {
         LabelStudioMode.PROMPT.value: """
 <View>

@@ -19,6 +19,8 @@ user_props_ctx = ContextVar[Union[str, None]]("user_props_ctx", default=None)


 class UserContextManager:
+    """Context manager for LLMonitor user context."""
+
     def __init__(self, user_id: str, user_props: Any = None) -> None:
         user_ctx.set(user_id)
         user_props_ctx.set(user_props)

@@ -32,6 +34,15 @@ class UserContextManager:


 def identify(user_id: str, user_props: Any = None) -> UserContextManager:
+    """Builds an LLMonitor UserContextManager.
+
+    Parameters:
+        - `user_id`: The user id.
+        - `user_props`: The user properties.
+
+    Returns:
+        A context manager that sets the user context.
+    """
     return UserContextManager(user_id, user_props)

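Usage follows directly from the docstring (a sketch; the module path is inferred from this changeset's context and an LLMONITOR_APP_ID is assumed to be configured):

from langchain.callbacks.llmonitor_callback import identify

with identify("user-123", user_props={"plan": "pro"}):
    pass  # LLM calls made here are attributed to user-123
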
@@ -149,7 +160,8 @@ def _parse_lc_messages(messages: Union[List[BaseMessage], Any]) -> List[Dict[str


 class LLMonitorCallbackHandler(BaseCallbackHandler):
-    """Initializes the `LLMonitorCallbackHandler`.
+    """Callback Handler for LLMonitor.

     #### Parameters:
     - `app_id`: The app id of the app you want to report to. Defaults to
         `None`, which means that `LLMONITOR_APP_ID` will be used.

@@ -1182,7 +1182,7 @@ class AsyncCallbackManagerForRetrieverRun(


 class CallbackManager(BaseCallbackManager):
-    """Callback manager that handles callbacks from langchain."""
+    """Callback manager that handles callbacks from LangChain."""

     def on_llm_start(
         self,

@@ -1450,6 +1450,8 @@ class CallbackManager(BaseCallbackManager):


 class CallbackManagerForChainGroup(CallbackManager):
+    """Callback manager for the chain group."""
+
     def __init__(
         self,
         handlers: List[BaseCallbackHandler],

@@ -1784,6 +1786,8 @@ class AsyncCallbackManager(BaseCallbackManager):


 class AsyncCallbackManagerForChainGroup(AsyncCallbackManager):
+    """Async callback manager for the chain group."""
+
     def __init__(
         self,
         handlers: List[BaseCallbackHandler],

@@ -49,7 +49,7 @@ class BaseTracer(BaseCallbackHandler, ABC):
     def _start_trace(self, run: Run) -> None:
         """Start a trace for a run."""
         if run.parent_run_id:
-            parent_run = self.run_map[str(run.parent_run_id)]
+            parent_run = self.run_map.get(str(run.parent_run_id))
             if parent_run:
                 self._add_child_run(parent_run, run)
                 parent_run.child_execution_order = max(

@@ -2,6 +2,7 @@ from __future__ import annotations

 import math
 import threading
+from collections import defaultdict
 from typing import (
     Any,
     AsyncIterator,

@@ -19,10 +20,13 @@ from anyio import create_memory_object_stream

 from langchain.callbacks.tracers.base import BaseTracer
 from langchain.callbacks.tracers.schemas import Run
+from langchain.load.load import load
 from langchain.schema.output import ChatGenerationChunk, GenerationChunk


 class LogEntry(TypedDict):
+    """A single entry in the run log."""
+
     id: str
     """ID of the sub-run."""
     name: str

@@ -47,21 +51,24 @@ class LogEntry(TypedDict):


 class RunState(TypedDict):
+    """State of the run."""
+
     id: str
     """ID of the run."""
     streamed_output: List[Any]
     """List of output chunks streamed by Runnable.stream()"""
     final_output: Optional[Any]
-    """Final output of the run, usually the result of aggregating streamed_output.
+    """Final output of the run, usually the result of aggregating (`+`) streamed_output.
     Only available after the run has finished successfully."""

-    logs: list[LogEntry]
-    """List of sub-runs contained in this run, if any, in the order they were started.
-    If filters were supplied, this list will contain only the runs that matched the
-    filters."""
+    logs: Dict[str, LogEntry]
+    """Map of run names to sub-runs. If filters were supplied, this list will
+    contain only the runs that matched the filters."""


 class RunLogPatch:
+    """A patch to the run log."""
+
     ops: List[Dict[str, Any]]
     """List of jsonpatch operations, which describe how to create the run state
     from an empty dict. This is the minimal representation of the log, designed to

@@ -72,7 +79,7 @@ class RunLogPatch:
     def __init__(self, *ops: Dict[str, Any]) -> None:
         self.ops = list(ops)

-    def __add__(self, other: Union[RunLogPatch, Any]) -> RunLogPatch:
+    def __add__(self, other: Union[RunLogPatch, Any]) -> RunLog:
         if type(other) == RunLogPatch:
             ops = self.ops + other.ops
             state = jsonpatch.apply_patch(None, ops)

@@ -85,13 +92,16 @@ class RunLogPatch:
     def __repr__(self) -> str:
         from pprint import pformat

-        return f"RunLogPatch(ops={pformat(self.ops)})"
+        # 1:-1 to get rid of the [] around the list
+        return f"RunLogPatch({pformat(self.ops)[1:-1]})"

     def __eq__(self, other: object) -> bool:
         return isinstance(other, RunLogPatch) and self.ops == other.ops


 class RunLog(RunLogPatch):
+    """A run log."""
+
     state: RunState
     """Current state of the log, obtained from applying all ops in sequence."""

@@ -99,7 +109,7 @@ class RunLog(RunLogPatch):
         super().__init__(*ops)
         self.state = state

-    def __add__(self, other: Union[RunLogPatch, Any]) -> RunLogPatch:
+    def __add__(self, other: Union[RunLogPatch, Any]) -> RunLog:
         if type(other) == RunLogPatch:
             ops = self.ops + other.ops
             state = jsonpatch.apply_patch(self.state, other.ops)

@@ -112,10 +122,12 @@ class RunLog(RunLogPatch):
     def __repr__(self) -> str:
         from pprint import pformat

-        return f"RunLog(state={pformat(self.state)})"
+        return f"RunLog({pformat(self.state)})"


 class LogStreamCallbackHandler(BaseTracer):
+    """A tracer that streams run logs to a stream."""
+
     def __init__(
         self,
         *,

@@ -143,7 +155,8 @@ class LogStreamCallbackHandler(BaseTracer):
         self.lock = threading.Lock()
         self.send_stream = send_stream
         self.receive_stream = receive_stream
-        self._index_map: Dict[UUID, int] = {}
+        self._key_map_by_run_id: Dict[UUID, str] = {}
+        self._counter_map_by_name: Dict[str, int] = defaultdict(int)

     def __aiter__(self) -> AsyncIterator[RunLogPatch]:
         return self.receive_stream.__aiter__()

@@ -196,7 +209,7 @@ class LogStreamCallbackHandler(BaseTracer):
                     id=str(run.id),
                     streamed_output=[],
                     final_output=None,
-                    logs=[],
+                    logs={},
                 ),
             }
         )

@@ -207,14 +220,18 @@ class LogStreamCallbackHandler(BaseTracer):

         # Determine previous index, increment by 1
         with self.lock:
-            self._index_map[run.id] = max(self._index_map.values(), default=-1) + 1
+            self._counter_map_by_name[run.name] += 1
+            count = self._counter_map_by_name[run.name]
+            self._key_map_by_run_id[run.id] = (
+                run.name if count == 1 else f"{run.name}:{count}"
+            )

         # Add the run to the stream
         self.send_stream.send_nowait(
             RunLogPatch(
                 {
                     "op": "add",
-                    "path": f"/logs/{self._index_map[run.id]}",
+                    "path": f"/logs/{self._key_map_by_run_id[run.id]}",
                     "value": LogEntry(
                         id=str(run.id),
                         name=run.name,

@@ -233,7 +250,7 @@ class LogStreamCallbackHandler(BaseTracer):
     def _on_run_update(self, run: Run) -> None:
         """Finish a run."""
         try:
-            index = self._index_map.get(run.id)
+            index = self._key_map_by_run_id.get(run.id)

             if index is None:
                 return

@@ -243,7 +260,8 @@ class LogStreamCallbackHandler(BaseTracer):
                     {
                         "op": "add",
                         "path": f"/logs/{index}/final_output",
-                        "value": run.outputs,
+                        # to undo the dumpd done by some runnables / tracer / etc
+                        "value": load(run.outputs),
                     },
                     {
                         "op": "add",

@@ -259,7 +277,7 @@ class LogStreamCallbackHandler(BaseTracer):
                     {
                         "op": "replace",
                         "path": "/final_output",
-                        "value": run.outputs,
+                        "value": load(run.outputs),
                     }
                 )
             )

@@ -273,7 +291,7 @@ class LogStreamCallbackHandler(BaseTracer):
         chunk: Optional[Union[GenerationChunk, ChatGenerationChunk]],
     ) -> None:
         """Process new LLM token."""
-        index = self._index_map.get(run.id)
+        index = self._key_map_by_run_id.get(run.id)

         if index is None:
             return

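The mechanics behind `__add__` are plain jsonpatch application; note that with the new schema, log entries live under name keys rather than list indices. A minimal sketch of the patching that the tracer performs:

import jsonpatch  # the library the tracer uses to apply ops

ops = [{"op": "add", "path": "/logs/my_chain", "value": {"id": "1", "name": "my_chain"}}]
state = jsonpatch.apply_patch(
    {"logs": {}, "streamed_output": [], "final_output": None}, ops
)
assert state["logs"]["my_chain"]["name"] == "my_chain"
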
@@ -611,7 +611,10 @@ class Chain(RunnableSerializable[Dict[str, Any], Dict[str, Any]], ABC):
         if self.memory is not None:
             raise ValueError("Saving of memory is not yet supported.")
         _dict = super().dict(**kwargs)
-        _dict["_type"] = self._chain_type
+        try:
+            _dict["_type"] = self._chain_type
+        except NotImplementedError:
+            pass
         return _dict

     def save(self, file_path: Union[Path, str]) -> None:

@@ -639,6 +642,8 @@ class Chain(RunnableSerializable[Dict[str, Any], Dict[str, Any]], ABC):

         # Fetch dictionary to save
         chain_dict = self.dict()
+        if "_type" not in chain_dict:
+            raise NotImplementedError(f"Chain {self} does not support saving.")

         if save_path.suffix == ".json":
             with open(file_path, "w") as f:

@@ -6,6 +6,7 @@ from typing import Any, Dict, List, Optional

 from langchain.callbacks.manager import CallbackManagerForChainRun
 from langchain.chains.base import Chain
+from langchain.chains.graph_qa.cypher_utils import CypherQueryCorrector, Schema
 from langchain.chains.graph_qa.prompts import CYPHER_GENERATION_PROMPT, CYPHER_QA_PROMPT
 from langchain.chains.llm import LLMChain
 from langchain.graphs.neo4j_graph import Neo4jGraph

@@ -90,6 +91,8 @@ class GraphCypherQAChain(Chain):
     """Whether or not to return the intermediate steps along with the final answer."""
     return_direct: bool = False
     """Whether or not to return the result of querying the graph directly."""
+    cypher_query_corrector: Optional[CypherQueryCorrector] = None
+    """Optional cypher validation tool"""

     @property
     def input_keys(self) -> List[str]:

@@ -123,6 +126,7 @@ class GraphCypherQAChain(Chain):
         qa_llm: Optional[BaseLanguageModel] = None,
         exclude_types: List[str] = [],
         include_types: List[str] = [],
+        validate_cypher: bool = False,
         **kwargs: Any,
     ) -> GraphCypherQAChain:
         """Initialize from LLM."""

@@ -150,10 +154,19 @@ class GraphCypherQAChain(Chain):
             kwargs["graph"].structured_schema, include_types, exclude_types
         )

+        cypher_query_corrector = None
+        if validate_cypher:
+            corrector_schema = [
+                Schema(el["start"], el["type"], el["end"])
+                for el in kwargs["graph"].structured_schema.get("relationships")
+            ]
+            cypher_query_corrector = CypherQueryCorrector(corrector_schema)
+
         return cls(
             graph_schema=graph_schema,
             qa_chain=qa_chain,
             cypher_generation_chain=cypher_generation_chain,
+            cypher_query_corrector=cypher_query_corrector,
             **kwargs,
         )

@@ -176,6 +189,10 @@ class GraphCypherQAChain(Chain):
         # Extract Cypher code if it is wrapped in backticks
         generated_cypher = extract_cypher(generated_cypher)

+        # Correct Cypher query if enabled
+        if self.cypher_query_corrector:
+            generated_cypher = self.cypher_query_corrector(generated_cypher)
+
         _run_manager.on_text("Generated Cypher:", end="\n", verbose=self.verbose)
         _run_manager.on_text(
             generated_cypher, color="green", end="\n", verbose=self.verbose

@@ -184,7 +201,11 @@ class GraphCypherQAChain(Chain):
         intermediate_steps.append({"query": generated_cypher})

         # Retrieve and limit the number of results
-        context = self.graph.query(generated_cypher)[: self.top_k]
+        # Generated Cypher can be empty if the query corrector finds an invalid schema
+        if generated_cypher:
+            context = self.graph.query(generated_cypher)[: self.top_k]
+        else:
+            context = []

         if self.return_direct:
             final_result = context

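Enabling the new validation is a one-flag change in `from_llm` (a sketch assuming a reachable Neo4j instance; the connection details are placeholders):

from langchain.chains import GraphCypherQAChain
from langchain.chat_models import ChatOpenAI
from langchain.graphs import Neo4jGraph

graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="<password>")
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0),
    graph=graph,
    validate_cypher=True,  # builds a CypherQueryCorrector from the graph's relationship schema
)
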
libs/langchain/langchain/chains/graph_qa/cypher_utils.py (new file, 248 lines)
@@ -0,0 +1,248 @@
import re
from collections import namedtuple
from typing import Any, Dict, List, Optional, Tuple

Schema = namedtuple("Schema", ["left_node", "relation", "right_node"])


class CypherQueryCorrector:
    """
    Used to correct relationship direction in generated Cypher statements.
    This code is copied from the winner's submission to the Cypher competition:
    https://github.com/sakusaku-rich/cypher-direction-competition
    """

    property_pattern = re.compile(r"\{.+?\}")
    node_pattern = re.compile(r"\(.+?\)")
    path_pattern = re.compile(r"\(.*\).*-.*-.*\(.*\)")
    node_relation_node_pattern = re.compile(
        r"(\()+(?P<left_node>[^()]*?)\)(?P<relation>.*?)\((?P<right_node>[^()]*?)(\))+"
    )
    relation_type_pattern = re.compile(r":(?P<relation_type>.+?)?(\{.+\})?]")

    def __init__(self, schemas: List[Schema]):
        """
        Args:
            schemas: list of schemas
        """
        self.schemas = schemas

    def clean_node(self, node: str) -> str:
        """
        Args:
            node: node in string format
        """
        node = re.sub(self.property_pattern, "", node)
        node = node.replace("(", "")
        node = node.replace(")", "")
        node = node.strip()
        return node

    def detect_node_variables(self, query: str) -> Dict[str, List[str]]:
        """
        Args:
            query: cypher query
        """
        nodes = re.findall(self.node_pattern, query)
        nodes = [self.clean_node(node) for node in nodes]
        res: Dict[str, Any] = {}
        for node in nodes:
            parts = node.split(":")
            if parts == [""]:  # skip empty node strings
                continue
            variable = parts[0]
            if variable not in res:
                res[variable] = []
            res[variable] += parts[1:]
        return res

    def extract_paths(self, query: str) -> "List[str]":
        """
        Args:
            query: cypher query
        """
        return re.findall(self.path_pattern, query)

    def judge_direction(self, relation: str) -> str:
        """
        Args:
            relation: relation in string format
        """
        direction = "BIDIRECTIONAL"
        if relation[0] == "<":
            direction = "INCOMING"
        if relation[-1] == ">":
            direction = "OUTGOING"
        return direction

    def extract_node_variable(self, part: str) -> Optional[str]:
        """
        Args:
            part: node in string format
        """
        part = part.lstrip("(").rstrip(")")
        idx = part.find(":")
        if idx != -1:
            part = part[:idx]
        return None if part == "" else part

    def detect_labels(
        self, str_node: str, node_variable_dict: Dict[str, Any]
    ) -> List[str]:
        """
        Args:
            str_node: node in string format
            node_variable_dict: dictionary of node variables
        """
        splitted_node = str_node.split(":")
        variable = splitted_node[0]
        labels = []
        if variable in node_variable_dict:
            labels = node_variable_dict[variable]
        elif variable == "" and len(splitted_node) > 1:
            labels = splitted_node[1:]
        return labels

    def verify_schema(
        self,
        from_node_labels: List[str],
        relation_types: List[str],
        to_node_labels: List[str],
    ) -> bool:
        """
        Args:
            from_node_labels: labels of the from node
            relation_types: types of the relation
            to_node_labels: labels of the to node
        """
        valid_schemas = self.schemas
        if from_node_labels != []:
            from_node_labels = [label.strip("`") for label in from_node_labels]
            valid_schemas = [
                schema for schema in valid_schemas if schema[0] in from_node_labels
            ]
        if to_node_labels != []:
            to_node_labels = [label.strip("`") for label in to_node_labels]
            valid_schemas = [
                schema for schema in valid_schemas if schema[2] in to_node_labels
            ]
        if relation_types != []:
            relation_types = [type.strip("`") for type in relation_types]
            valid_schemas = [
                schema for schema in valid_schemas if schema[1] in relation_types
            ]
        return valid_schemas != []

    def detect_relation_types(self, str_relation: str) -> Tuple[str, List[str]]:
        """
        Args:
            str_relation: relation in string format
        """
        relation_direction = self.judge_direction(str_relation)
        relation_type = self.relation_type_pattern.search(str_relation)
        if relation_type is None or relation_type.group("relation_type") is None:
            return relation_direction, []
        relation_types = [
            t.strip().strip("!")
            for t in relation_type.group("relation_type").split("|")
        ]
        return relation_direction, relation_types

    def correct_query(self, query: str) -> str:
        """
        Args:
            query: cypher query
        """
        node_variable_dict = self.detect_node_variables(query)
        paths = self.extract_paths(query)
        for path in paths:
            original_path = path
            start_idx = 0
            while start_idx < len(path):
                match_res = re.match(self.node_relation_node_pattern, path[start_idx:])
                if match_res is None:
                    break
                start_idx += match_res.start()
                match_dict = match_res.groupdict()
                left_node_labels = self.detect_labels(
                    match_dict["left_node"], node_variable_dict
                )
                right_node_labels = self.detect_labels(
                    match_dict["right_node"], node_variable_dict
                )
                end_idx = (
                    start_idx
                    + 4
                    + len(match_dict["left_node"])
                    + len(match_dict["relation"])
                    + len(match_dict["right_node"])
                )
                original_partial_path = original_path[start_idx : end_idx + 1]
                relation_direction, relation_types = self.detect_relation_types(
                    match_dict["relation"]
                )

                if relation_types != [] and "".join(relation_types).find("*") != -1:
                    start_idx += (
                        len(match_dict["left_node"]) + len(match_dict["relation"]) + 2
                    )
                    continue

                if relation_direction == "OUTGOING":
                    is_legal = self.verify_schema(
                        left_node_labels, relation_types, right_node_labels
                    )
                    if not is_legal:
                        is_legal = self.verify_schema(
                            right_node_labels, relation_types, left_node_labels
                        )
                        if is_legal:
                            corrected_relation = "<" + match_dict["relation"][:-1]
                            corrected_partial_path = original_partial_path.replace(
                                match_dict["relation"], corrected_relation
                            )
                            query = query.replace(
                                original_partial_path, corrected_partial_path
                            )
                        else:
                            return ""
                elif relation_direction == "INCOMING":
                    is_legal = self.verify_schema(
                        right_node_labels, relation_types, left_node_labels
                    )
                    if not is_legal:
                        is_legal = self.verify_schema(
                            left_node_labels, relation_types, right_node_labels
                        )
                        if is_legal:
                            corrected_relation = match_dict["relation"][1:] + ">"
                            corrected_partial_path = original_partial_path.replace(
                                match_dict["relation"], corrected_relation
                            )
                            query = query.replace(
                                original_partial_path, corrected_partial_path
                            )
                        else:
                            return ""
                else:
                    is_legal = self.verify_schema(
                        left_node_labels, relation_types, right_node_labels
                    )
                    is_legal |= self.verify_schema(
                        right_node_labels, relation_types, left_node_labels
                    )
                    if not is_legal:
                        return ""

                start_idx += (
                    len(match_dict["left_node"]) + len(match_dict["relation"]) + 2
                )
        return query

    def __call__(self, query: str) -> str:
        """Correct the query to make it valid. If the query cannot be
        corrected against the provided schemas, an empty string is returned.

        Args:
            query: cypher query
        """
        return self.correct_query(query)

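A self-contained demo of the corrector on a toy schema, tracing the code above: the relationship direction is flipped when only the reversed form matches the schema.

from langchain.chains.graph_qa.cypher_utils import CypherQueryCorrector, Schema

corrector = CypherQueryCorrector([Schema("Person", "ACTED_IN", "Movie")])
query = "MATCH (m:Movie)-[:ACTED_IN]->(p:Person) RETURN p"
print(corrector(query))
# MATCH (m:Movie)<-[:ACTED_IN]-(p:Person) RETURN p
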
@@ -5,6 +5,7 @@ import logging
 import warnings
 from typing import Any, Dict, List, Optional

+from langchain._api import warn_deprecated
 from langchain.callbacks.manager import CallbackManagerForChainRun
 from langchain.chains.base import Chain
 from langchain.chains.llm import LLMChain

@@ -20,7 +21,12 @@ logger = logging.getLogger(__name__)
 class LLMBashChain(Chain):
     """Chain that interprets a prompt and executes bash operations.

+    Warning:
+        This chain can execute arbitrary code using bash.
+        This can be dangerous if not properly sandboxed.
+
     Example:

         .. code-block:: python

             from langchain.chains import LLMBashChain

@@ -84,6 +90,14 @@ class LLMBashChain(Chain):
         inputs: Dict[str, Any],
         run_manager: Optional[CallbackManagerForChainRun] = None,
     ) -> Dict[str, str]:
+        warn_deprecated(
+            since="0.0.308",
+            message=(
+                "On 2023-10-12 the LLMBashChain "
+                "will be moved to langchain-experimental"
+            ),
+            pending=True,
+        )
         _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
         _run_manager.on_text(inputs[self.input_key], verbose=self.verbose)

@@ -160,4 +160,6 @@ def load_query_constructor_chain(
         allowed_operators=allowed_operators,
         enable_limit=enable_limit,
     )
-    return LLMChain(llm=llm, prompt=prompt, **kwargs)
+    return LLMChain(
+        llm=llm, prompt=prompt, output_parser=prompt.output_parser, **kwargs
+    )

@@ -88,6 +88,8 @@ class Comparator(str, Enum):
     LTE = "lte"
     CONTAIN = "contain"
     LIKE = "like"
+    IN = "in"
+    NIN = "nin"


 class FilterDirective(Expr, ABC):

@@ -22,6 +22,7 @@ from langchain.chat_models.anyscale import ChatAnyscale
 from langchain.chat_models.azure_openai import AzureChatOpenAI
 from langchain.chat_models.baidu_qianfan_endpoint import QianfanChatEndpoint
 from langchain.chat_models.bedrock import BedrockChat
+from langchain.chat_models.cohere import ChatCohere
 from langchain.chat_models.ernie import ErnieBotChat
 from langchain.chat_models.fake import FakeListChatModel
 from langchain.chat_models.fireworks import ChatFireworks

@@ -45,6 +46,7 @@ __all__ = [
     "FakeListChatModel",
     "PromptLayerChatOpenAI",
     "ChatAnthropic",
+    "ChatCohere",
     "ChatGooglePalm",
     "ChatMLflowAIGateway",
     "ChatOllama",

@@ -24,16 +24,36 @@ class LlamaContentFormatter(ContentFormatterBase):
     def _convert_message_to_dict(message: BaseMessage) -> Dict:
         """Converts message to a dict according to role"""
         if isinstance(message, HumanMessage):
-            return {"role": "user", "content": message.content}
+            return {
+                "role": "user",
+                "content": ContentFormatterBase.escape_special_characters(
+                    message.content
+                ),
+            }
         elif isinstance(message, AIMessage):
-            return {"role": "assistant", "content": message.content}
+            return {
+                "role": "assistant",
+                "content": ContentFormatterBase.escape_special_characters(
+                    message.content
+                ),
+            }
         elif isinstance(message, SystemMessage):
-            return {"role": "system", "content": message.content}
+            return {
+                "role": "system",
+                "content": ContentFormatterBase.escape_special_characters(
+                    message.content
+                ),
+            }
         elif (
             isinstance(message, ChatMessage)
             and message.role in LlamaContentFormatter.SUPPORTED_ROLES
         ):
-            return {"role": message.role, "content": message.content}
+            return {
+                "role": message.role,
+                "content": ContentFormatterBase.escape_special_characters(
+                    message.content
+                ),
+            }
         else:
             supported = ",".join(
                 [role for role in LlamaContentFormatter.SUPPORTED_ROLES]

libs/langchain/langchain/chat_models/cohere.py (new file, 162 lines)
@@ -0,0 +1,162 @@
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional

from langchain.callbacks.manager import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain.chat_models.base import (
    BaseChatModel,
    _agenerate_from_stream,
    _generate_from_stream,
)
from langchain.llms.cohere import BaseCohere
from langchain.schema.messages import (
    AIMessage,
    AIMessageChunk,
    BaseMessage,
    ChatMessage,
    HumanMessage,
    SystemMessage,
)
from langchain.schema.output import ChatGeneration, ChatGenerationChunk, ChatResult


def get_role(message: BaseMessage) -> str:
    if isinstance(message, ChatMessage) or isinstance(message, HumanMessage):
        return "User"
    elif isinstance(message, AIMessage):
        return "Chatbot"
    elif isinstance(message, SystemMessage):
        return "System"
    else:
        raise ValueError(f"Got unknown type {message}")


class ChatCohere(BaseChatModel, BaseCohere):
    """`Cohere` chat large language models.

    To use, you should have the ``cohere`` python package installed, and the
    environment variable ``COHERE_API_KEY`` set with your API key, or pass
    it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain.chat_models import ChatCohere
            from langchain.schema import HumanMessage

            chat = ChatCohere(model="foo")
            result = chat([HumanMessage(content="Hello")])
            print(result.content)
    """

    class Config:
        """Configuration for this pydantic object."""

        allow_population_by_field_name = True
        arbitrary_types_allowed = True

    @property
    def _llm_type(self) -> str:
        """Return type of chat model."""
        return "cohere-chat"

    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling Cohere API."""
        return {
            "temperature": self.temperature,
        }

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Get the identifying parameters."""
        return {**{"model": self.model}, **self._default_params}

    def get_cohere_chat_request(
        self, messages: List[BaseMessage], **kwargs: Any
    ) -> Dict[str, Any]:
        return {
            "message": messages[0].content,
            "chat_history": [
                {"role": get_role(x), "message": x.content} for x in messages[1:]
            ],
            **self._default_params,
            **kwargs,
        }

    def _stream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        request = self.get_cohere_chat_request(messages, **kwargs)
        stream = self.client.chat(**request, stream=True)

        for data in stream:
            if data.event_type == "text-generation":
                delta = data.text
                yield ChatGenerationChunk(message=AIMessageChunk(content=delta))
                if run_manager:
                    run_manager.on_llm_new_token(delta)

    async def _astream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> AsyncIterator[ChatGenerationChunk]:
        request = self.get_cohere_chat_request(messages, **kwargs)
        stream = await self.async_client.chat(**request, stream=True)

        async for data in stream:
            if data.event_type == "text-generation":
                delta = data.text
                yield ChatGenerationChunk(message=AIMessageChunk(content=delta))
                if run_manager:
                    await run_manager.on_llm_new_token(delta)

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        if self.streaming:
            stream_iter = self._stream(
                messages, stop=stop, run_manager=run_manager, **kwargs
            )
            return _generate_from_stream(stream_iter)

        request = self.get_cohere_chat_request(messages, **kwargs)
        response = self.client.chat(**request)

        message = AIMessage(content=response.text)
        return ChatResult(generations=[ChatGeneration(message=message)])

    async def _agenerate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        if self.streaming:
            stream_iter = self._astream(
                messages, stop=stop, run_manager=run_manager, **kwargs
            )
            return await _agenerate_from_stream(stream_iter)

        request = self.get_cohere_chat_request(messages, **kwargs)
        response = self.client.chat(**request, stream=False)

        message = AIMessage(content=response.text)
        return ChatResult(generations=[ChatGeneration(message=message)])

    def get_num_tokens(self, text: str) -> int:
        """Calculate number of tokens."""
        return len(self.client.tokenize(text).tokens)

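Beyond the docstring's basic example, the new `_stream`/`_astream` hooks make token streaming work through the standard chat-model interface. A sketch (requires the `cohere` package and a COHERE_API_KEY in the environment):

from langchain.chat_models import ChatCohere
from langchain.schema import HumanMessage

chat = ChatCohere(temperature=0)
for chunk in chat.stream([HumanMessage(content="Tell me a joke")]):
    print(chunk.content, end="", flush=True)
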
libs/langchain/langchain/cli/__init__.py (new file, empty)
libs/langchain/langchain/cli/cli.py (new file, 110 lines)
@@ -0,0 +1,110 @@
"""A CLI for creating a new project with LangChain."""
from pathlib import Path
from typing import Optional

from typing_extensions import Annotated

from langchain.cli.create_repo.base import create, is_poetry_installed
from langchain.cli.create_repo.pypi_name import is_name_taken, lint_name
from langchain.cli.create_repo.user_info import get_git_user_email, get_git_user_name

try:
    import typer
except ImportError:
    raise ImportError(
        "Typer must be installed to use the CLI. "
        "You can install it with `pip install typer`."
    )


app = typer.Typer(no_args_is_help=False, add_completion=False)


def _select_project_name(suggested_project_name: str) -> str:
    """Help the user select a valid project name."""
    while True:
        project_name = typer.prompt("Project Name", default=suggested_project_name)

        project_name_diagnostics = lint_name(project_name)
        if project_name_diagnostics:
            typer.echo(
                f"{typer.style('Warning:', fg=typer.colors.MAGENTA)}"
                f" The project name"
                f" {typer.style(project_name, fg=typer.colors.BRIGHT_CYAN)}"
                f" is not valid.",
                err=True,
            )
            for diagnostic in project_name_diagnostics:
                typer.echo(f"  - {diagnostic}")
            if typer.confirm(
                "Select another name?",
                default=True,
            ):
                continue

        if is_name_taken(project_name):
            typer.echo(
                f"{typer.style('Error:', fg=typer.colors.RED)}"
                f" The project name"
                f" {typer.style(project_name, fg=typer.colors.BRIGHT_CYAN)}"
                f" is already taken on pypi",
                err=True,
            )
            if typer.confirm(
                "Select another name?",
                default=True,
            ):
                continue

        # If we got here then the project name is valid and not taken
        return project_name


@app.command()
def new(
    project_directory: Annotated[
        Path, typer.Argument(help="The directory to create the project in.")
    ],
    author_name: Optional[str] = None,
    author_email: Optional[str] = None,
    use_poetry: Annotated[
        Optional[bool], typer.Option(help="Specify whether to use Poetry or not.")
    ] = None,
) -> None:
    """Create a new project with LangChain."""
    project_directory_path = Path(project_directory)
    project_name_suggestion = project_directory_path.name.replace("-", "_")
    project_name = _select_project_name(project_name_suggestion)

    if not author_name:
        author_name = typer.prompt("Author Name", default=get_git_user_name())

    if not author_email:
        author_email = typer.prompt("Author Email", default=get_git_user_email())

    if use_poetry is None:
        if is_poetry_installed():
            typer.echo("🎉 Found Poetry installed. Project can be set up using poetry.")
            use_poetry = typer.confirm("Use Poetry? (no to use pip)", default=True)
        else:
            typer.echo("ℹ️ Could not find Poetry installed.")
            use_pip = typer.confirm("Use Pip? (no to use poetry)", default=True)
            use_poetry = not use_pip

    if author_name is None:
        raise typer.BadParameter("Author name is required")

    if author_email is None:
        raise typer.BadParameter("Author email is required")

    create(project_directory, project_name, author_name, author_email, use_poetry)


if __name__ == "__main__":
    app()

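The Typer app can also be exercised programmatically (a sketch using Typer's test runner; the interactive prompts mean a real run is normally driven from a terminal, so only the help output is requested here):

from typer.testing import CliRunner

from langchain.cli.cli import app

runner = CliRunner()
result = runner.invoke(app, ["--help"])  # prints usage for the `new` command
print(result.output)
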
libs/langchain/langchain/cli/create_repo/base.py (new file, 246 lines)
@@ -0,0 +1,246 @@
"""Logic for creating a new LangChain project from templates."""
import os
import pathlib
import string
import subprocess
from pathlib import Path
from typing import List, Sequence

import typer

import langchain


class UnderscoreTemplate(string.Template):
    delimiter = "____"


def _create_project_dir(
    project_directory_path: Path,
    use_poetry: bool,
    project_name: str,
    project_name_identifier: str,
    author_name: str,
    author_email: str,
) -> None:
    project_directory_path.mkdir(parents=True, exist_ok=True)
    template_directories = _get_template_directories(use_poetry)
    _check_conflicting_files(template_directories, project_directory_path)
    _copy_template_files(
        template_directories,
        project_directory_path,
        project_name,
        project_name_identifier,
        author_name,
        author_email,
    )


def _get_template_directories(use_poetry: bool) -> List[Path]:
    """Get the directories containing the templates.

    Args:
        use_poetry: If true, will set up the project with Poetry.
    """
    template_parent_path = Path(__file__).parent / "templates"
    template_directories = [template_parent_path / "repo"]
    if use_poetry:
        template_directories.append(template_parent_path / "poetry")
    else:
        template_directories.append(template_parent_path / "pip")
    return template_directories


def _check_conflicting_files(
    template_directories: Sequence[Path], project_directory_path: Path
) -> None:
    """Validate project directory doesn't contain conflicting files."""
    for template_directory_path in template_directories:
        for template_file_path in template_directory_path.glob("**/*"):
            relative_template_file_path = template_file_path.relative_to(
                template_directory_path
            )
            project_file_path = project_directory_path / relative_template_file_path
            if project_file_path.exists():
                typer.echo(
                    f"{typer.style('Error:', fg=typer.colors.RED)}"
                    f" The project directory already contains a file"
                    f" {typer.style(project_file_path, fg=typer.colors.BRIGHT_CYAN)}"
                    f" that would be overwritten by the template.",
                    err=True,
                )
                typer.echo(
                    "Please remove this file and try again.",
                    err=True,
                )
                raise typer.Exit(code=1)


def _copy_template_files(
    template_directories: Sequence[Path],
    project_directory_path: Path,
    project_name: str,
    project_name_identifier: str,
    author_name: str,
    author_email: str,
) -> None:
    """Copy template files to project directory and substitute variables.

    Args:
        template_directories: The directories containing the templates.
        project_directory_path: The destination directory.
        project_name: The name of the project.
        project_name_identifier: The identifier of the project name.
        author_name: The name of the author.
        author_email: The email of the author.
    """
    for template_directory_path in template_directories:
        for template_file_path in template_directory_path.glob("**/*"):
            relative_template_file_path = UnderscoreTemplate(
                str(template_file_path.relative_to(template_directory_path))
            ).substitute(project_name_identifier=project_name_identifier)
            project_file_path = project_directory_path / relative_template_file_path
            if template_file_path.is_dir():
                project_file_path.mkdir(parents=True, exist_ok=True)
            else:
                project_file_path.write_text(
                    UnderscoreTemplate(template_file_path.read_text()).substitute(
                        project_name=project_name,
                        project_name_identifier=project_name_identifier,
                        author_name=author_name,
                        author_email=author_email,
                        langchain_version=langchain.__version__,
                    )
                )


def _poetry_install(project_directory_path: Path) -> None:
    """Install dependencies with Poetry."""
    typer.echo(
        f"\n{typer.style('2.', bold=True, fg=typer.colors.GREEN)}"
        f" Installing dependencies with Poetry..."
    )
    subprocess.run(["pwd"], cwd=project_directory_path)
    subprocess.run(
        ["poetry", "install"],
        cwd=project_directory_path,
        env={**os.environ.copy(), "VIRTUAL_ENV": ""},
    )


def _pip_install(project_directory_path: Path) -> None:
    """Create virtual environment and install dependencies."""
    typer.echo(
        f"\n{typer.style('2.', bold=True, fg=typer.colors.GREEN)}"
        f" Creating virtual environment..."
    )
    subprocess.run(["pwd"], cwd=project_directory_path)
    subprocess.run(["python", "-m", "venv", ".venv"], cwd=project_directory_path)
    # TODO install dependencies


def _init_git(project_directory_path: Path) -> None:
    """Initialize git repository."""
    typer.echo(
        f"\n{typer.style('Initializing git...', bold=True, fg=typer.colors.GREEN)}"
    )
    subprocess.run(["git", "init"], cwd=project_directory_path)

    # Create initial commit
    subprocess.run(["git", "add", "."], cwd=project_directory_path)
    subprocess.run(
        ["git", "commit", "-m", "Initial commit"],
        cwd=project_directory_path,
    )


# PUBLIC API


def create(
    project_directory: pathlib.Path,
    project_name: str,
    author_name: str,
    author_email: str,
    use_poetry: bool,
) -> None:
    """Create a new LangChain project.

    Args:
        project_directory (str): The directory to create the project in.
        project_name: The name of the project.
        author_name (str): The name of the author.
        author_email (str): The email of the author.
        use_poetry (bool): Whether to use Poetry to manage the project.
    """
    project_directory_path = Path(project_directory)
    project_name_identifier = project_name
    resolved_path = project_directory_path.resolve()

    if not typer.confirm(
        f"\n"
        f"Creating a new LangChain project 🦜️🔗\n"
        f"Name: {typer.style(project_name, fg=typer.colors.BRIGHT_CYAN)}\n"
        f"Path: {typer.style(resolved_path, fg=typer.colors.BRIGHT_CYAN)}\n"
        f"Project name: {typer.style(project_name, fg=typer.colors.BRIGHT_CYAN)}\n"
        f"Author name: {typer.style(author_name, fg=typer.colors.BRIGHT_CYAN)}\n"
        f"Author email: {typer.style(author_email, fg=typer.colors.BRIGHT_CYAN)}\n"
        f"Use Poetry: {typer.style(str(use_poetry), fg=typer.colors.BRIGHT_CYAN)}\n"
        "Continue?",
        default=True,
    ):
        typer.echo("Cancelled project creation. See you later! 👋")
        raise typer.Exit(code=0)

    _create_project_dir(
        project_directory_path,
        use_poetry,
        project_name,
        project_name_identifier,
        author_name,
        author_email,
    )

    # TODO(Team): Add installation
    # if use_poetry:
    #     _poetry_install(project_directory_path)
    # else:
    #     _pip_install(project_directory_path)

    _init_git(project_directory_path)

    typer.echo(
        f"\n{typer.style('Done!🙌', bold=True, fg=typer.colors.GREEN)}"
        f" Your new LangChain project"
        f" {typer.style(project_name, fg=typer.colors.BRIGHT_CYAN)}"
        f" has been created in"
        f" {typer.style(project_directory_path.resolve(), fg=typer.colors.BRIGHT_CYAN)}"
        f"."
    )
    # TODO(Team): Add surfacing information from make file and installation
    # cd_dir = typer.style(
    #     f"cd {project_directory_path.resolve()}", fg=typer.colors.BRIGHT_CYAN
    # )
    # typer.echo(
    #     f"\nChange into the project directory with {cd_dir}."
    #     f" The following commands are available:"
    # )
    # subprocess.run(["make"], cwd=project_directory_path)

    # if not use_poetry:
    #     pip_install = typer.style(
    #         'pip install -e ".[dev]"', fg=typer.colors.BRIGHT_CYAN
    #     )
    #     typer.echo(
    #         f"\nTo install all dependencies activate your environment run:"
    #         f"\n{typer.style('source .venv/bin/activate', fg=typer.colors.BRIGHT_CYAN)}"
    #         f"\n{pip_install}."
    #     )


def is_poetry_installed() -> bool:
    """Check if Poetry is installed."""
    return subprocess.run(["poetry", "--version"], capture_output=True).returncode == 0

libs/langchain/langchain/cli/create_repo/pypi_name.py (new file, 70 lines)
@@ -0,0 +1,70 @@
"""Helpers to check availability of a project name on PyPI.

Adapted from https://github.com/danishprakash/pip-name/blob/master/pip-name
"""
from typing import List, Optional

import requests

BASE_URL = "https://pypi.org/pypi"

UPPERCASE_SUGGESTION = "Use of uppercase letters is discouraged"
SEPARATOR_SUGGESTION = "Use of `-` is discouraged, consider using `_`"
NUMERIC_SUGGESTION = "Use of numbers is discouraged"


def _request_pypi(name: str) -> Optional[dict]:
    """Request response from PyPi API.

    Args:
        name (str): Name of the project

    Returns:
        Optional[dict]: Response from PyPi API
    """
    target_url = f"{BASE_URL}/{name}/json"
    response = requests.get(target_url)
    return response.json() if response.status_code != 404 else None


# PUBLIC API


def lint_name(name: str) -> List[str]:
    """Check name against PEP8's naming conventions.

    Args:
        name (str): Name of the project

    Returns:
        List[str]: List of suggestions
    """
    suggestions = []

    if "-" in name or " " in name:
        suggestions.append(SEPARATOR_SUGGESTION)
    if any(x.isupper() for x in name):
        suggestions.append(UPPERCASE_SUGGESTION)
    if any(x.isnumeric() for x in name):
        suggestions.append(NUMERIC_SUGGESTION)

    return suggestions


def is_name_taken(name: str) -> bool:
    """Check module filename for conflict.

    Args:
        name (str): Name of the project

    Returns:
        bool: True if name is taken, False otherwise
    """
    response = _request_pypi(name)

    if response:
        package_url = response.get("info").get("package_url")  # type: ignore
        module_name = package_url.split("/")[-2]
        return name.lower() == module_name.lower()

    return False

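A quick check of the two helpers (note the second one performs a live PyPI request):

from langchain.cli.create_repo.pypi_name import is_name_taken, lint_name

print(lint_name("My-Project1"))  # separator, uppercase, and numeric suggestions
print(is_name_taken("requests"))  # True: "requests" already exists on PyPI
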
libs/langchain/langchain/cli/create_repo/templates/pip/.github/CONTRIBUTING.md (new file, 79 lines, vendored)
@@ -0,0 +1,79 @@
# Contributing to ____project_name

Hi there! Thank you for even being interested in contributing to ____project_name.

## 🚀 Quick Start

To install requirements:

```bash
pip install -e ".[dev]"
```

This will install all requirements for running the package, examples, linting, formatting, tests, and coverage.

Now, you should be able to run the common tasks in the following section. To double check, run `make test`; all tests should pass.

## ✅ Common Tasks

Type `make` for a list of common tasks.

### Code Formatting

Formatting for this project is done via a combination of [Black](https://black.readthedocs.io/en/stable/) and [isort](https://pycqa.github.io/isort/).

To run formatting for this project:

```bash
make format
```

Additionally, you can run the formatter only on the files that have been modified in your current branch as compared to the main branch using the format_diff command:

```bash
make format_diff
```

This is especially useful when you have made changes to a subset of the project and want to ensure your changes are properly formatted without affecting the rest of the codebase.

### Linting

Linting for this project is done via a combination of [Black](https://black.readthedocs.io/en/stable/), [isort](https://pycqa.github.io/isort/), [flake8](https://flake8.pycqa.org/en/latest/), and [mypy](http://mypy-lang.org/).

To run linting for this project:

```bash
make lint
```

In addition, you can run the linter only on the files that have been modified in your current branch as compared to the main branch using the lint_diff command:

```bash
make lint_diff
```

This can be very helpful when you've made changes to only certain parts of the project and want to ensure your changes meet the linting standards without having to check the entire codebase.

We recognize linting can be annoying - if you do not want to do it, please contact a project maintainer, and they can help you with it. We do not want this to be a blocker for good code getting contributed.

### Testing

To run unit tests:

```bash
make test
```

If you add new logic, please add a unit test.

## 🏭 Release Process

____project_name follows the [semver](https://semver.org/) versioning standard.

To use the [automated release workflow](./workflows/release.yml) you'll need to set up a PyPI account and [create an API token](https://pypi.org/help/#apitoken). Configure the API token for this GitHub repo by going to settings -> security -> secrets -> actions, creating the `PYPI_API_TOKEN` variable and setting the value to be your PyPI API token.

Once that's set up, you can release a new version of the package by opening a PR that:

1. updates the package version in the [pyproject.toml file](../pyproject.toml),
2. labels the PR with a `release` tag.

When the PR is merged into main, a new release will be created.

@@ -0,0 +1,10 @@
FROM python:3.11-slim

WORKDIR /app

COPY . /app

RUN pip install --no-cache-dir .

CMD exec uvicorn ____project_name_identifier.server:app --host 0.0.0.0 --port $PORT

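The Dockerfile's CMD (and the Makefile `start` target below) reference `____project_name_identifier.server:app`, a module that is not included in this diff. As a purely hypothetical sketch of what that module might contain, given the template's `fastapi` and `uvicorn` dependencies (every name and endpoint here is an assumption, not the actual template file):

```python
# Hypothetical sketch of ____project_name_identifier/server.py.
# The real template file is omitted from this diff; this only illustrates
# what the `server:app` reference in the Dockerfile CMD points at.
from fastapi import FastAPI

app = FastAPI()


@app.get("/")
def root() -> dict:
    # Minimal endpoint so `uvicorn ____project_name_identifier.server:app`
    # has something to serve.
    return {"status": "ok"}
```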
@@ -0,0 +1,38 @@
.PHONY: all format lint test help

# Default target executed when no arguments are given to make.
all: help

start:
	uvicorn ____project_name_identifier.server:app --reload

# Define a variable for the test file path.
TEST_FILE ?= tests/

test:
	pytest $(TEST_FILE)

# Define a variable for Python and notebook files.
PYTHON_FILES=.
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep -E '\.py$$|\.ipynb$$')

lint lint_diff:
	mypy $(PYTHON_FILES)
	black $(PYTHON_FILES) --check
	ruff $(PYTHON_FILES)

format format_diff:
	black $(PYTHON_FILES)
	ruff --select I --fix $(PYTHON_FILES)

######################
# HELP
######################

help:
	@echo '----'
	@echo 'make start - start server'
	@echo 'make format - run code formatters'
	@echo 'make lint - run linters'
	@echo 'make test - run unit tests'

@@ -0,0 +1,52 @@
[project]
name = "____project_name"
version = "0.0.1"
description = ""
authors = [{name = "____author_name", email = "____author_email"}]
readme = "README.md"
requires-python = ">=3.8,<4.0"
dependencies = [
    "langchain~=____langchain_version",
    "langserve[server]>=0.0.6",
    "tiktoken~=0.4.0",
    "openai~=0.27.8",
    "fastapi~=0.96.0",
    "uvicorn[standard]~=0.22.0",
]

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project.optional-dependencies]
dev = [
    "pytest~=7.4.0",
    "pytest-asyncio~=0.21.1",
    "mypy~=1.4.1",
    "ruff~=0.0.278",
    "black~=23.7.0",
    "syrupy~=4.0.2",
]

[tool.ruff]
select = [
    "E",  # pycodestyle
    "F",  # pyflakes
    "I",  # isort
]

[tool.mypy]
ignore_missing_imports = "True"
disallow_untyped_defs = "True"

[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config       any warnings encountered while parsing the `pytest`
#                       section of the configuration file raise errors.
#
# https://github.com/tophat/syrupy
# --snapshot-warn-unused    Prints a warning on unused snapshots rather than fail the test suite.
addopts = "--strict-markers --strict-config --durations=5 --snapshot-warn-unused"

89
libs/langchain/langchain/cli/create_repo/templates/poetry/.github/CONTRIBUTING.md
vendored
Normal file
@@ -0,0 +1,89 @@
# Contributing to ____project_name

Hi there! Thank you for even being interested in contributing to ____project_name.

## 🚀 Quick Start

This project uses [Poetry](https://python-poetry.org/) as a dependency manager. Check out Poetry's [documentation on how to install it](https://python-poetry.org/docs/#installation) on your system before proceeding.

❗Note: If you use `Conda` or `Pyenv` as your environment / package manager, avoid dependency conflicts by doing the following first:
1. *Before installing Poetry*, create and activate a new Conda env (e.g. `conda create -n langchain python=3.9`)
2. Install Poetry (see above)
3. Tell Poetry to use the virtualenv python environment (`poetry config virtualenvs.prefer-active-python true`)
4. Continue with the following steps.

To install requirements:

```bash
poetry install
```

This will install all requirements for running the package, examples, linting, formatting, tests, and coverage.

❗Note: If you're running Poetry 1.4.1 and receive a `WheelFileValidationError` for `debugpy` during installation, you can try either downgrading to Poetry 1.4.0 or disabling "modern installation" (`poetry config installer.modern-installation false`) and re-installing requirements. See [this `debugpy` issue](https://github.com/microsoft/debugpy/issues/1246) for more details.

Now you should be able to run the common tasks listed in the following section.

## ✅ Common Tasks

Type `make` for a list of common tasks.

### Code Formatting

Formatting for this project is done via a combination of [Black](https://black.readthedocs.io/en/stable/) and [ruff](https://github.com/astral-sh/ruff)'s isort rules.

To run formatting for this project:

```bash
make format
```

Additionally, you can run the formatter only on the files that have been modified in your current branch, as compared to the main branch, using the `format_diff` command:

```bash
make format_diff
```

This is especially useful when you have made changes to a subset of the project and want to ensure your changes are properly formatted without affecting the rest of the codebase.

### Linting

Linting for this project is done via a combination of [Black](https://black.readthedocs.io/en/stable/), [ruff](https://github.com/astral-sh/ruff), and [mypy](http://mypy-lang.org/).

To run linting for this project:

```bash
make lint
```

In addition, you can run the linter only on the files that have been modified in your current branch, as compared to the main branch, using the `lint_diff` command:

```bash
make lint_diff
```

This can be very helpful when you've made changes to only certain parts of the project and want to ensure your changes meet the linting standards without having to check the entire codebase.

We recognize linting can be annoying - if you do not want to do it, please contact a project maintainer, and they can help you with it. We do not want this to be a blocker for good code getting contributed.

### Testing

To run unit tests:

```bash
make test
```

If you add new logic, please add a unit test.

## 🏭 Release Process

____project_name follows the [semver](https://semver.org/) versioning standard.

To use the [automated release workflow](./workflows/release.yml), you'll need to set up a PyPI account and [create an API token](https://pypi.org/help/#apitoken). Configure the API token for this GitHub repo by going to Settings -> Secrets and variables -> Actions, creating a repository secret named `PYPI_API_TOKEN`, and setting its value to your PyPI API token.

Once that's set up, you can release a new version of the package by opening a PR that:

1. updates the package version in the [pyproject.toml file](../pyproject.toml), and
2. labels the PR with a `release` tag.

When the PR is merged into main, a new release will be created.

@@ -0,0 +1,76 @@
# An action for setting up poetry install with caching.
# Using a custom action since the default action does not
# take poetry install groups into account.
# Action code from:
# https://github.com/actions/setup-python/issues/505#issuecomment-1273013236
name: poetry-install-with-caching
description: Poetry install with support for caching of dependency groups.

inputs:
  python-version:
    description: Python version, supporting MAJOR.MINOR only
    required: true

  poetry-version:
    description: Poetry version
    required: true

  install-command:
    description: Command run for installing dependencies
    required: false
    default: poetry install

  cache-key:
    description: Cache key to use for manual handling of caching
    required: true

  working-directory:
    description: Directory to run install-command in
    required: false
    default: ""

runs:
  using: composite
  steps:
    - uses: actions/setup-python@v4
      name: Setup python ${{ inputs.python-version }}
      with:
        python-version: ${{ inputs.python-version }}

    - uses: actions/cache@v3
      id: cache-pip
      name: Cache Pip ${{ inputs.python-version }}
      env:
        SEGMENT_DOWNLOAD_TIMEOUT_MIN: "15"
      with:
        path: |
          ~/.cache/pip
        key: pip-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}

    - run: pipx install poetry==${{ inputs.poetry-version }} --python python${{ inputs.python-version }}
      shell: bash

    - name: Check Poetry File
      shell: bash
      run: |
        poetry check

    - name: Check lock file
      shell: bash
      run: |
        poetry lock --check

    - uses: actions/cache@v3
      id: cache-poetry
      env:
        SEGMENT_DOWNLOAD_TIMEOUT_MIN: "15"
      with:
        path: |
          ~/.cache/pypoetry/virtualenvs
          ~/.cache/pypoetry/cache
          ~/.cache/pypoetry/artifacts
        key: poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-poetry-${{ inputs.poetry-version }}-${{ hashFiles('poetry.lock') }}

    - run: ${{ inputs.install-command }}
      working-directory: ${{ inputs.working-directory }}
      shell: bash
36
libs/langchain/langchain/cli/create_repo/templates/poetry/.github/workflows/lint.yml
vendored
Normal file
@@ -0,0 +1,36 @@
name: lint

on:
  push:
    branches: [master]
  pull_request:

env:
  POETRY_VERSION: "1.4.2"

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version:
          - "3.8"
          - "3.9"
          - "3.10"
          - "3.11"
    steps:
      - uses: actions/checkout@v3
      - name: Install poetry
        run: |
          pipx install poetry==$POETRY_VERSION
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          cache: poetry
      - name: Install dependencies
        run: |
          poetry install
      - name: Analyze the code with lint
        run: |
          make lint
49
libs/langchain/langchain/cli/create_repo/templates/poetry/.github/workflows/release.yml
vendored
Normal file
@@ -0,0 +1,49 @@
name: release

on:
  pull_request:
    types:
      - closed
    branches:
      - master
    paths:
      - 'pyproject.toml'

env:
  POETRY_VERSION: "1.4.2"

jobs:
  if_release:
    # Run only for merged PRs carrying the `release` label. Both checks live in
    # a single expression: splitting them across separate ${{ }} blocks on
    # multiple lines produces a non-empty string that is always truthy.
    if: ${{ github.event.pull_request.merged == true && contains(github.event.pull_request.labels.*.name, 'release') }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Install poetry
        run: pipx install poetry==$POETRY_VERSION
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: "poetry"
      - name: Build project for distribution
        run: poetry build
      - name: Check Version
        id: check-version
        run: |
          echo version=$(poetry version --short) >> $GITHUB_OUTPUT
      - name: Create Release
        uses: ncipollo/release-action@v1
        with:
          artifacts: "dist/*"
          token: ${{ secrets.GITHUB_TOKEN }}
          draft: false
          generateReleaseNotes: true
          tag: v${{ steps.check-version.outputs.version }}
          commit: master
      - name: Publish to PyPI
        env:
          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          poetry publish
36
libs/langchain/langchain/cli/create_repo/templates/poetry/.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,36 @@
name: test

on:
  push:
    branches: [master]
  pull_request:
  workflow_dispatch:

env:
  POETRY_VERSION: "1.4.2"

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version:
          - "3.8"
          - "3.9"
          - "3.10"
          - "3.11"
    name: Python ${{ matrix.python-version }}
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: "./.github/actions/poetry_setup"
        with:
          python-version: ${{ matrix.python-version }}
          poetry-version: ${{ env.POETRY_VERSION }}
          install-command: |
            echo "Running tests, installing dependencies with poetry..."
            poetry install
      - name: Run tests
        run: |
          make test
        shell: bash
@@ -0,0 +1,11 @@
FROM python:3.11-slim

WORKDIR /app

COPY . /app

RUN pip install poetry && \
    poetry config virtualenvs.create false && \
    poetry install --no-interaction --no-ansi --only main

CMD exec uvicorn ____project_name_identifier.server:app --host 0.0.0.0 --port $PORT

@@ -0,0 +1,38 @@
.PHONY: all format lint test help

# Default target executed when no arguments are given to make.
all: help

start:
	poetry run uvicorn ____project_name_identifier.server:app --reload

# Define a variable for the test file path.
TEST_FILE ?= tests/

test:
	poetry run pytest $(TEST_FILE)

# Define a variable for Python and notebook files.
PYTHON_FILES=.
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep -E '\.py$$|\.ipynb$$')

lint lint_diff:
	poetry run mypy $(PYTHON_FILES)
	poetry run black $(PYTHON_FILES) --check
	poetry run ruff $(PYTHON_FILES)

format format_diff:
	poetry run black $(PYTHON_FILES)
	poetry run ruff --select I --fix $(PYTHON_FILES)

######################
# HELP
######################

help:
	@echo '----'
	@echo 'make start - start server'
	@echo 'make format - run code formatters'
	@echo 'make lint - run linters'
	@echo 'make test - run unit tests'

@@ -0,0 +1,2 @@
[virtualenvs]
in-project = true

@@ -0,0 +1,52 @@
[tool.poetry]
name = "____project_name"
version = "0.0.1"
description = ""
authors = ["____author_name <____author_email>"]
license = "MIT"
readme = "README.md"
packages = [{include = "____project_name_identifier"}]

[tool.poetry.dependencies]
python = "^3.8.1"
langchain = "^____langchain_version"
langserve = { version = ">=0.0.6", extras = ["server"] }
tiktoken = "^0.4.0"
openai = "^0.27.8"
fastapi = "^0.96.0"
uvicorn = {extras = ["standard"], version = "^0.22.0"}

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
pytest-asyncio = "^0.21.1"
mypy = "^1.4.1"
ruff = "^0.0.278"
black = "^23.7.0"
syrupy = "^4.0.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.ruff]
select = [
    "E",  # pycodestyle
    "F",  # pyflakes
    "I",  # isort
]

[tool.mypy]
ignore_missing_imports = "True"
disallow_untyped_defs = "True"

[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config       any warnings encountered while parsing the `pytest`
#                       section of the configuration file raise errors.
#
# https://github.com/tophat/syrupy
# --snapshot-warn-unused    Prints a warning on unused snapshots rather than fail the test suite.
addopts = "--strict-markers --strict-config --durations=5 --snapshot-warn-unused"

@@ -0,0 +1,2 @@
PORT=8001
OPENAI_API_KEY="your_secret_key_here"

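As a hedged illustration of how these two variables might be consumed by the generated server (the defaulting logic is an assumption; the actual template code that reads them is not shown in this diff):

```python
import os

# PORT matches the Dockerfile CMD's `--port $PORT`; the 8001 fallback mirrors
# the .env template above but is otherwise an assumption.
port = int(os.environ.get("PORT", "8001"))

# The openai client reads OPENAI_API_KEY from the environment by default,
# so exporting the .env value is typically all that is needed.
openai_api_key = os.environ.get("OPENAI_API_KEY")
```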
157
libs/langchain/langchain/cli/create_repo/templates/repo/.gitignore
vendored
Normal file
@@ -0,0 +1,157 @@
.vs/
.vscode/
.idea/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
docs/docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints
notebooks/

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
.venvs
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# macOS display setting files
.DS_Store

# Wandb directory
wandb/

# asdf tool versions
.tool-versions
/.ruff_cache/

*.pkl
*.bin

# integration test artifacts
data_map*
\[('_type', 'fake'), ('stop', None)]

# Replit files
*replit*

@@ -0,0 +1,21 @@
The MIT License

Copyright (c) ____author_name

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
Some files were not shown because too many files have changed in this diff.