mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-30 19:49:09 +00:00
513 lines
13 KiB
Plaintext
513 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "raw",
|
|
"metadata": {},
|
|
"source": [
|
|
"---\n",
|
|
"sidebar_label: Kinetica\n",
|
|
"---"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Kinetica Vectorstore API\n",
|
|
"\n",
|
|
">[Kinetica](https://www.kinetica.com/) is a database with integrated support for vector similarity search.\n",
|
|
"\n",
|
|
"It supports:\n",
|
|
"- exact and approximate nearest neighbor search\n",
|
|
"- L2 distance, inner product, and cosine distance\n",
|
|
"\n",
|
|
"This notebook shows how to use the Kinetica vector store (`Kinetica`)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"This notebook requires a running Kinetica instance, which can easily be set up by following the [installation instructions](https://www.kinetica.com/developer-edition/)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Pip install necessary package\n",
|
|
"%pip install --upgrade --quiet langchain-openai langchain-community\n",
|
|
"%pip install gpudb>=7.2.2.0 \n",
|
|
"%pip install --upgrade --quiet tiktoken"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import getpass\n",
|
|
"import os\n",
|
|
"\n",
|
|
"if \"OPENAI_API_KEY\" not in os.environ:\n",
|
|
" os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"False"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"## Loading Environment Variables\n",
|
|
"from dotenv import load_dotenv\n",
|
|
"\n",
|
|
"load_dotenv()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain_community.document_loaders import TextLoader\n",
|
|
"from langchain_community.vectorstores import (\n",
|
|
" Kinetica,\n",
|
|
" KineticaSettings,\n",
|
|
")\n",
|
|
"from langchain_openai import OpenAIEmbeddings"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"embeddings = OpenAIEmbeddings(model=\"text-embedding-3-large\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Kinetica needs the connection to the database.\n",
|
|
"# This is how to set it up.\n",
|
|
"HOST = os.getenv(\"KINETICA_HOST\", \"http://127.0.0.1:9191\")\n",
|
|
"USERNAME = os.getenv(\"KINETICA_USERNAME\", \"\")\n",
|
|
"PASSWORD = os.getenv(\"KINETICA_PASSWORD\", \"\")\n",
|
|
"OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\", \"\")\n",
|
|
"\n",
|
|
"\n",
|
|
"def create_config() -> KineticaSettings:\n",
|
|
" return KineticaSettings(host=HOST, username=USERNAME, password=PASSWORD)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from uuid import uuid4\n",
|
|
"\n",
|
|
"from langchain_core.documents import Document\n",
|
|
"\n",
|
|
"document_1 = Document(\n",
|
|
" page_content=\"I had chocolate chip pancakes and scrambled eggs for breakfast this morning.\",\n",
|
|
" metadata={\"source\": \"tweet\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_2 = Document(\n",
|
|
" page_content=\"The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.\",\n",
|
|
" metadata={\"source\": \"news\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_3 = Document(\n",
|
|
" page_content=\"Building an exciting new project with LangChain - come check it out!\",\n",
|
|
" metadata={\"source\": \"tweet\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_4 = Document(\n",
|
|
" page_content=\"Robbers broke into the city bank and stole $1 million in cash.\",\n",
|
|
" metadata={\"source\": \"news\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_5 = Document(\n",
|
|
" page_content=\"Wow! That was an amazing movie. I can't wait to see it again.\",\n",
|
|
" metadata={\"source\": \"tweet\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_6 = Document(\n",
|
|
" page_content=\"Is the new iPhone worth the price? Read this review to find out.\",\n",
|
|
" metadata={\"source\": \"website\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_7 = Document(\n",
|
|
" page_content=\"The top 10 soccer players in the world right now.\",\n",
|
|
" metadata={\"source\": \"website\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_8 = Document(\n",
|
|
" page_content=\"LangGraph is the best framework for building stateful, agentic applications!\",\n",
|
|
" metadata={\"source\": \"tweet\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_9 = Document(\n",
|
|
" page_content=\"The stock market is down 500 points today due to fears of a recession.\",\n",
|
|
" metadata={\"source\": \"news\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"document_10 = Document(\n",
|
|
" page_content=\"I have a bad feeling I am going to get deleted :(\",\n",
|
|
" metadata={\"source\": \"tweet\"},\n",
|
|
")\n",
|
|
"\n",
|
|
"documents = [\n",
|
|
" document_1,\n",
|
|
" document_2,\n",
|
|
" document_3,\n",
|
|
" document_4,\n",
|
|
" document_5,\n",
|
|
" document_6,\n",
|
|
" document_7,\n",
|
|
" document_8,\n",
|
|
" document_9,\n",
|
|
" document_10,\n",
|
|
"]\n",
|
|
"uuids = [str(uuid4()) for _ in range(len(documents))]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Similarity Search with Euclidean Distance (Default)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['05e5a484-0273-49d1-90eb-1276baca31de',\n",
|
|
" 'd98b808f-dc0b-4328-bdbf-88f6b2ab6040',\n",
|
|
" 'ba0968d4-e344-4285-ae0f-f5199b56f9d6',\n",
|
|
" 'a25393b8-6539-45b5-993e-ea16d01941ec',\n",
|
|
" '804a37e3-1278-4b60-8b02-36b159ee8c1a',\n",
|
|
" '9688b594-3dc6-41d2-a937-babf8ff24c2f',\n",
|
|
" '40f7b8fe-67c7-489a-a5a5-7d3965e33bba',\n",
|
|
" 'b4fc1376-c113-41e9-8f16-f9320517bedd',\n",
|
|
" '4d94d089-fdde-442b-84ab-36d9fe0670c8',\n",
|
|
" '66fdb79d-49ce-4b06-901a-fda6271baf2a']"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# The Kinetica Module will try to create a table with the name of the collection.\n",
|
|
"# So, make sure that the collection name is unique and the user has the permission to create a table.\n",
|
|
"\n",
|
|
"COLLECTION_NAME = \"langchain_example\"\n",
|
|
"connection = create_config()\n",
|
|
"\n",
|
|
"db = Kinetica(\n",
|
|
" connection,\n",
|
|
" embeddings,\n",
|
|
" collection_name=COLLECTION_NAME,\n",
|
|
")\n",
|
|
"\n",
|
|
"db.add_documents(documents=documents, ids=uuids)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
|
"# docs_with_score = db.similarity_search_with_score(query)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"Similarity Search\n",
|
|
"* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]\n",
|
|
"* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]\n",
|
|
"\n",
|
|
"Similarity search with score\n",
|
|
"* [SIM=0.945397] The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print()\n",
|
|
"print(\"Similarity Search\")\n",
|
|
"results = db.similarity_search(\n",
|
|
" \"LangChain provides abstractions to make working with LLMs easy\",\n",
|
|
" k=2,\n",
|
|
" filter={\"source\": \"tweet\"},\n",
|
|
")\n",
|
|
"for res in results:\n",
|
|
" print(f\"* {res.page_content} [{res.metadata}]\")\n",
|
|
"\n",
|
|
"print()\n",
|
|
"print(\"Similarity search with score\")\n",
|
|
"results = db.similarity_search_with_score(\n",
|
|
" \"Will it be hot tomorrow?\", k=1, filter={\"source\": \"news\"}\n",
|
|
")\n",
|
|
"for res, score in results:\n",
|
|
" print(f\"* [SIM={score:3f}] {res.page_content} [{res.metadata}]\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Working with vectorstore\n",
|
|
"\n",
|
|
"Above, we created a vectorstore from scratch. However, we often want to work with an existing vectorstore.\n",
|
|
"In order to do that, we can initialize it directly."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"store = Kinetica(\n",
|
|
" collection_name=COLLECTION_NAME,\n",
|
|
" config=connection,\n",
|
|
" embedding_function=embeddings,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Add documents\n",
|
|
"We can add documents to the existing vectorstore."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['68c4c679-c4d9-4f2d-bf01-f6c4f2181503']"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"store.add_documents([Document(page_content=\"foo\")])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"docs_with_score = db.similarity_search_with_score(\"foo\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(Document(metadata={}, page_content='foo'), 0.0015394920483231544)"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"docs_with_score[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(Document(metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),\n",
|
|
" 1.2609431743621826)"
|
|
]
|
|
},
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"docs_with_score[1]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Overriding a vectorstore\n",
|
|
"\n",
|
|
"If you have an existing collection, you can override it by calling `from_documents` with `pre_delete_collection=True`."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"db = Kinetica.from_documents(\n",
|
|
" documents=documents,\n",
|
|
" embedding=embeddings,\n",
|
|
" collection_name=COLLECTION_NAME,\n",
|
|
" config=connection,\n",
|
|
" pre_delete_collection=True,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"docs_with_score = db.similarity_search_with_score(\"foo\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(Document(metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),\n",
|
|
" 1.260920763015747)"
|
|
]
|
|
},
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"docs_with_score[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Using a VectorStore as a Retriever"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"retriever = store.as_retriever()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tags=['Kinetica', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.kinetica.Kinetica object at 0x7a48142b2230> search_kwargs={}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(retriever)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|