Langchain vectorstore integration with Kinetica (#18102)
- **Description:** New vectorstore integration with the Kinetica database
- **Issue:** -
- **Dependencies:** the Kinetica Python API (`pip install gpudb==7.2.0.1`)
- **Tag maintainer:** @baskaryan, @hwchase17
- **Twitter handle:** -

Co-authored-by: Chad Juliano <cjuliano@kinetica.com>
This commit is contained in:
parent
1e8ab83d7b
commit
9b8f6455b1
581
docs/docs/integrations/vectorstores/kinetica.ipynb
Normal file
@ -0,0 +1,581 @@
{
"cells": [
{
"cell_type": "raw",
"metadata": {},
"source": [
"---\n",
"sidebar_label: Kinetica\n",
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Kinetica Vectorstore API\n",
"\n",
">[Kinetica](https://www.kinetica.com/) is a database with integrated support for vector similarity search.\n",
"\n",
"It supports:\n",
"- exact and approximate nearest neighbor search\n",
"- L2 distance, inner product, and cosine distance\n",
"\n",
"This notebook shows how to use the Kinetica vector store (`Kinetica`)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook needs a running Kinetica instance, which can easily be set up using the instructions given here - [installation instructions](https://www.kinetica.com/developer-edition/)."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: gpudb==7.2.0.0b in /home/anindyam/kinetica/kinetica-github/langchain/libs/langchain/.venv/lib/python3.8/site-packages (7.2.0.0b0)\n",
"Requirement already satisfied: future in /home/anindyam/kinetica/kinetica-github/langchain/libs/langchain/.venv/lib/python3.8/site-packages (from gpudb==7.2.0.0b) (0.18.3)\n",
"Requirement already satisfied: pyzmq in /home/anindyam/kinetica/kinetica-github/langchain/libs/langchain/.venv/lib/python3.8/site-packages (from gpudb==7.2.0.0b) (25.1.2)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Pip install necessary packages\n",
"%pip install --upgrade --quiet langchain-openai\n",
"%pip install gpudb==7.2.0.1\n",
"%pip install --upgrade --quiet tiktoken"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We want to use `OpenAIEmbeddings`, so we need to get the OpenAI API key."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Loading Environment Variables\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"from langchain.docstore.document import Document\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_community.vectorstores import (\n",
"    DistanceStrategy,\n",
"    Kinetica,\n",
"    KineticaSettings,\n",
")\n",
"from langchain_openai import OpenAIEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Kinetica needs the connection to the database.\n",
"# This is how to set it up.\n",
"HOST = os.getenv(\"KINETICA_HOST\", \"http://127.0.0.1:9191\")\n",
"USERNAME = os.getenv(\"KINETICA_USERNAME\", \"\")\n",
"PASSWORD = os.getenv(\"KINETICA_PASSWORD\", \"\")\n",
"OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\", \"\")\n",
"\n",
"\n",
"def create_config() -> KineticaSettings:\n",
"    return KineticaSettings(host=HOST, username=USERNAME, password=PASSWORD)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Similarity Search with Euclidean Distance (Default)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# The Kinetica Module will try to create a table with the name of the collection.\n",
"# So, make sure that the collection name is unique and the user has the permission to create a table.\n",
"\n",
"COLLECTION_NAME = \"state_of_the_union_test\"\n",
"connection = create_config()\n",
"\n",
"db = Kinetica.from_documents(\n",
"    embedding=embeddings,\n",
"    documents=docs,\n",
"    collection_name=COLLECTION_NAME,\n",
"    config=connection,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs_with_score = db.similarity_search_with_score(query)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Score: 0.6077010035514832\n",
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n",
"--------------------------------------------------------------------------------\n",
"--------------------------------------------------------------------------------\n",
"Score: 0.6077010035514832\n",
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n",
"--------------------------------------------------------------------------------\n",
"--------------------------------------------------------------------------------\n",
"Score: 0.6596046090126038\n",
"A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n",
"\n",
"And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n",
"\n",
"We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n",
"\n",
"We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n",
"\n",
"We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n",
"\n",
"We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n",
"--------------------------------------------------------------------------------\n",
"--------------------------------------------------------------------------------\n",
"Score: 0.6597143411636353\n",
"A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n",
"\n",
"And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n",
"\n",
"We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n",
"\n",
"We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n",
"\n",
"We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n",
"\n",
"We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"for doc, score in docs_with_score:\n",
"    print(\"-\" * 80)\n",
"    print(\"Score: \", score)\n",
"    print(doc.page_content)\n",
"    print(\"-\" * 80)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Maximal Marginal Relevance Search (MMR)\n",
"Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"docs_with_score = db.max_marginal_relevance_search_with_score(query)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Score: 0.6077010035514832\n",
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n",
"--------------------------------------------------------------------------------\n",
"--------------------------------------------------------------------------------\n",
"Score: 0.6852865219116211\n",
"It is going to transform America and put us on a path to win the economic competition of the 21st Century that we face with the rest of the world—particularly with China. \n",
"\n",
"As I’ve told Xi Jinping, it is never a good bet to bet against the American people. \n",
"\n",
"We’ll create good jobs for millions of Americans, modernizing roads, airports, ports, and waterways all across America. \n",
"\n",
"And we’ll do it all to withstand the devastating effects of the climate crisis and promote environmental justice. \n",
"\n",
"We’ll build a national network of 500,000 electric vehicle charging stations, begin to replace poisonous lead pipes—so every child—and every American—has clean water to drink at home and at school, provide affordable high-speed internet for every American—urban, suburban, rural, and tribal communities. \n",
"\n",
"4,000 projects have already been announced. \n",
"\n",
"And tonight, I’m announcing that this year we will start fixing over 65,000 miles of highway and 1,500 bridges in disrepair.\n",
"--------------------------------------------------------------------------------\n",
"--------------------------------------------------------------------------------\n",
"Score: 0.6866700053215027\n",
"We can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n",
"\n",
"I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n",
"\n",
"They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n",
"\n",
"Officer Mora was 27 years old. \n",
"\n",
"Officer Rivera was 22. \n",
"\n",
"Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n",
"\n",
"I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n",
"\n",
"I’ve worked on these issues a long time. \n",
"\n",
"I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety.\n",
"--------------------------------------------------------------------------------\n",
"--------------------------------------------------------------------------------\n",
"Score: 0.6936529278755188\n",
"But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n",
"\n",
"Danielle says Heath was a fighter to the very end. \n",
"\n",
"He didn’t know how to stop fighting, and neither did she. \n",
"\n",
"Through her pain she found purpose to demand we do better. \n",
"\n",
"Tonight, Danielle—we are. \n",
"\n",
"The VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. \n",
"\n",
"And tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers. \n",
"\n",
"I’m also calling on Congress: pass a law to make sure veterans devastated by toxic exposures in Iraq and Afghanistan finally get the benefits and comprehensive health care they deserve. \n",
"\n",
"And fourth, let’s end cancer as we know it. \n",
"\n",
"This is personal to me and Jill, to Kamala, and to so many of you. \n",
"\n",
"Cancer is the #2 cause of death in America–second only to heart disease.\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"for doc, score in docs_with_score:\n",
"    print(\"-\" * 80)\n",
"    print(\"Score: \", score)\n",
"    print(doc.page_content)\n",
"    print(\"-\" * 80)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Working with vectorstore\n",
"\n",
"Above, we created a vectorstore from scratch. However, we often want to work with an existing vectorstore.\n",
"In order to do that, we can initialize it directly."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"store = Kinetica(\n",
"    collection_name=COLLECTION_NAME,\n",
"    config=connection,\n",
"    embedding_function=embeddings,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add documents\n",
"We can add documents to the existing vectorstore."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['b94dc67c-ce7e-11ee-b8cb-b940b0e45762']"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"store.add_documents([Document(page_content=\"foo\")])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"docs_with_score = db.similarity_search_with_score(\"foo\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Document(page_content='foo'), 0.0)"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs_with_score[0]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../modules/state_of_the_union.txt'}),\n",
" 0.6946534514427185)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs_with_score[1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Overriding a vectorstore\n",
"\n",
"If you have an existing collection, you can override it by calling `from_documents` and setting `pre_delete_collection=True`."
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"db = Kinetica.from_documents(\n",
"    documents=docs,\n",
"    embedding=embeddings,\n",
"    collection_name=COLLECTION_NAME,\n",
"    config=connection,\n",
"    pre_delete_collection=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"docs_with_score = db.similarity_search_with_score(\"foo\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../modules/state_of_the_union.txt'}),\n",
" 0.6946534514427185)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs_with_score[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using a VectorStore as a Retriever"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"retriever = store.as_retriever()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tags=['Kinetica', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.kinetica.Kinetica object at 0x7f1644375e20>\n"
]
}
],
"source": [
"print(retriever)"
]
},
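{
"cell_type": "markdown",
"metadata": {},
"source": [
"The retriever can now fetch documents for a query. A minimal sketch, assuming the `query` defined earlier in this notebook:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: retrieve the documents most relevant to the earlier query\n",
"relevant_docs = retriever.get_relevant_documents(query)\n",
"print(relevant_docs[0].page_content)"
]
}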
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
libs/community/langchain_community/vectorstores/__init__.py
@ -216,6 +216,24 @@ def _import_hanavector() -> Any:
    return HanaDB


def _import_kinetica() -> Any:
    from langchain_community.vectorstores.kinetica import Kinetica

    return Kinetica


def _import_kinetica_settings() -> Any:
    from langchain_community.vectorstores.kinetica import KineticaSettings

    return KineticaSettings


def _import_distance_strategy() -> Any:
    from langchain_community.vectorstores.kinetica import DistanceStrategy

    return DistanceStrategy


def _import_hologres() -> Any:
    from langchain_community.vectorstores.hologres import Hologres

@ -553,6 +571,12 @@ def __getattr__(name: str) -> Any:
        return _import_hologres()
    elif name == "KDBAI":
        return _import_kdbai()
    elif name == "DistanceStrategy":
        return _import_distance_strategy()
    elif name == "KineticaSettings":
        return _import_kinetica_settings()
    elif name == "Kinetica":
        return _import_kinetica()
    elif name == "LanceDB":
        return _import_lancedb()
    elif name == "LLMRails":
@ -673,6 +697,9 @@ __all__ = [
    "HanaDB",
    "Hologres",
    "KDBAI",
    "DistanceStrategy",
    "Kinetica",
    "KineticaSettings",
    "LanceDB",
    "LLMRails",
    "Marqo",
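With these hooks in place, the Kinetica classes resolve through the package's lazy `__getattr__`. A minimal usage sketch (assuming `gpudb` is installed and a Kinetica instance is reachable):

    # Lazy import: these names trigger _import_kinetica() and friends above
    from langchain_community.vectorstores import (
        DistanceStrategy,
        Kinetica,
        KineticaSettings,
    )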
919
libs/community/langchain_community/vectorstores/kinetica.py
Normal file
@ -0,0 +1,919 @@
from __future__ import annotations

import asyncio
import enum
import json
import logging
import struct
import uuid
from collections import OrderedDict
from enum import Enum
from functools import partial
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseSettings
from langchain_core.vectorstores import VectorStore

from langchain_community.vectorstores.utils import maximal_marginal_relevance


class DistanceStrategy(str, enum.Enum):
    """Enumerator of the Distance strategies."""

    EUCLIDEAN = "l2"
    COSINE = "cosine"
    MAX_INNER_PRODUCT = "inner"


def _results_to_docs(docs_and_scores: Any) -> List[Document]:
    """Return docs from docs and scores."""
    return [doc for doc, _ in docs_and_scores]


class Dimension(int, Enum):
    """Some default dimensions for known embeddings."""

    OPENAI = 1536


DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.EUCLIDEAN

_LANGCHAIN_DEFAULT_SCHEMA_NAME = "langchain"  ## Default Kinetica schema name
_LANGCHAIN_DEFAULT_COLLECTION_NAME = (
    "langchain_kinetica_embeddings"  ## Default Kinetica table name
)


class KineticaSettings(BaseSettings):
    """`Kinetica` client configuration.

    Attributes:
        host (str) : A URL to connect to the Kinetica backend.
                     Defaults to 'http://127.0.0.1'.
        port (int) : URL port to connect with HTTP. Defaults to 9191.
        username (str) : Username to login. Defaults to None.
        password (str) : Password to login. Defaults to None.
        database (str) : Schema name in which to find the table.
                         Defaults to 'langchain'.
        table (str) : Table name to operate on.
                      Defaults to 'langchain_kinetica_embeddings'.
        metric (str) : Metric to compute distance, supported are
                       ('l2', 'cosine', 'inner'). Defaults to 'l2'.
    """

    host: str = "http://127.0.0.1"
    port: int = 9191

    username: Optional[str] = None
    password: Optional[str] = None

    database: str = _LANGCHAIN_DEFAULT_SCHEMA_NAME
    table: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME
    metric: str = DEFAULT_DISTANCE_STRATEGY.value

    def __getitem__(self, item: str) -> Any:
        return getattr(self, item)

    class Config:
        env_file = ".env"
        env_prefix = "kinetica_"
        env_file_encoding = "utf-8"
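    # Configuration sketch (hypothetical values): because of the "kinetica_"
    # env prefix above, setting KINETICA_HOST, KINETICA_USERNAME and
    # KINETICA_PASSWORD in the environment (or in a .env file) lets a bare
    # KineticaSettings() pick up the connection details automatically.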


class Kinetica(VectorStore):
    """`Kinetica` vector store.

    To use, you should have the ``gpudb`` python package installed.

    Args:
        config: Kinetica connection settings class.
        embedding_function: Any embedding function implementing
            `langchain.embeddings.base.Embeddings` interface.
        collection_name: The name of the collection (table) to use.
            (default: langchain_kinetica_embeddings)
            NOTE: The table will be created when initializing the store
            (if it does not exist), so make sure the user has the right
            permissions to create tables.
        distance_strategy: The distance strategy to use. (default: EUCLIDEAN)
        pre_delete_collection: If True, will delete the collection if it exists.
            (default: False). Useful for testing.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Kinetica, KineticaSettings
            from langchain_community.embeddings.openai import OpenAIEmbeddings

            kinetica_settings = KineticaSettings(
                host="http://127.0.0.1", username="", password=""
            )
            COLLECTION_NAME = "kinetica_store"
            embeddings = OpenAIEmbeddings()
            vectorstore = Kinetica.from_documents(
                documents=docs,
                embedding=embeddings,
                collection_name=COLLECTION_NAME,
                config=kinetica_settings,
            )
    """

    def __init__(
        self,
        config: KineticaSettings,
        embedding_function: Embeddings,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        schema_name: str = _LANGCHAIN_DEFAULT_SCHEMA_NAME,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        pre_delete_collection: bool = False,
        logger: Optional[logging.Logger] = None,
        relevance_score_fn: Optional[Callable[[float], float]] = None,
    ) -> None:
        """Constructor for the Kinetica class

        Args:
            config (KineticaSettings): a `KineticaSettings` instance
            embedding_function (Embeddings): embedding function to use
            collection_name (str, optional): the Kinetica table name.
                Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
            schema_name (str, optional): the Kinetica schema name.
                Defaults to _LANGCHAIN_DEFAULT_SCHEMA_NAME.
            distance_strategy (DistanceStrategy, optional): the distance strategy
                to use for similarity search.
                Defaults to DEFAULT_DISTANCE_STRATEGY.
            pre_delete_collection (bool, optional): whether to delete the table
                if it already exists. Defaults to False.
            logger (Optional[logging.Logger], optional): logger to use.
                Defaults to None.
        """

        self._config = config
        self.embedding_function = embedding_function
        self.collection_name = collection_name
        self.schema_name = schema_name
        self._distance_strategy = distance_strategy
        self.pre_delete_collection = pre_delete_collection
        self.logger = logger or logging.getLogger(__name__)
        self.override_relevance_score_fn = relevance_score_fn
        self._db = self.__get_db(self._config)

    def __post_init__(self, dimensions: int) -> None:
        """
        Initialize the store.
        """
        try:
            from gpudb import GPUdbTable
        except ImportError:
            raise ImportError(
                "Could not import Kinetica python API. "
                "Please install it with `pip install gpudb==7.2.0.1`."
            )

        self.dimensions = dimensions
        dimension_field = f"vector({dimensions})"

        if self.pre_delete_collection:
            self.delete_schema()

        self.table_name = self.collection_name
        if self.schema_name is not None and len(self.schema_name) > 0:
            self.table_name = f"{self.schema_name}.{self.collection_name}"

        # Column layout: raw text, the packed embedding bytes, the JSON
        # metadata, and a UUID identifier.
        self.table_schema = [
            ["text", "string"],
            ["embedding", "bytes", dimension_field],
            ["metadata", "string", "json"],
            ["id", "string", "uuid"],
        ]

        self.create_schema()
        self.EmbeddingStore: GPUdbTable = self.create_tables_if_not_exists()

    def __get_db(self, config: KineticaSettings) -> Any:
        try:
            from gpudb import GPUdb
        except ImportError:
            raise ImportError(
                "Could not import Kinetica python API. "
                "Please install it with `pip install gpudb==7.2.0.1`."
            )

        options = GPUdb.Options()
        options.username = config.username
        options.password = config.password
        options.skip_ssl_cert_verification = True
        return GPUdb(host=config.host, options=options)

    @property
    def embeddings(self) -> Embeddings:
        return self.embedding_function

    @classmethod
    def __from(
        cls,
        config: KineticaSettings,
        texts: List[str],
        embeddings: List[List[float]],
        embedding: Embeddings,
        dimensions: int,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        pre_delete_collection: bool = False,
        logger: Optional[logging.Logger] = None,
        **kwargs: Any,
    ) -> Kinetica:
        """Class method to assist in constructing the `Kinetica` store instance
        using different combinations of parameters

        Args:
            config (KineticaSettings): a `KineticaSettings` instance
            texts (List[str]): The list of texts to generate embeddings for and store
            embeddings (List[List[float]]): List of embeddings
            embedding (Embeddings): the Embedding function
            dimensions (int): The number of dimensions the embeddings have
            metadatas (Optional[List[dict]], optional): List of JSON data associated
                with each text. Defaults to None.
            ids (Optional[List[str]], optional): List of unique IDs (UUID by default)
                associated with each text. Defaults to None.
            collection_name (str, optional): Kinetica collection (table) name.
                Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
            distance_strategy (DistanceStrategy, optional): Not used for now.
                Defaults to DEFAULT_DISTANCE_STRATEGY.
            pre_delete_collection (bool, optional): Whether to delete the Kinetica
                schema or not. Defaults to False.
            logger (Optional[logging.Logger], optional): Logger to use for logging at
                different levels. Defaults to None.

        Returns:
            Kinetica: An instance of Kinetica class
        """
        if ids is None:
            ids = [str(uuid.uuid1()) for _ in texts]

        if not metadatas:
            metadatas = [{} for _ in texts]

        store = cls(
            config=config,
            collection_name=collection_name,
            embedding_function=embedding,
            # dimensions=dimensions,
            distance_strategy=distance_strategy,
            pre_delete_collection=pre_delete_collection,
            logger=logger,
            **kwargs,
        )

        store.__post_init__(dimensions)

        store.add_embeddings(
            texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
        )

        return store

    def create_tables_if_not_exists(self) -> Any:
        """Create the table to store the texts and embeddings"""

        try:
            from gpudb import GPUdbTable
        except ImportError:
            raise ImportError(
                "Could not import Kinetica python API. "
                "Please install it with `pip install gpudb==7.2.0.1`."
            )
        return GPUdbTable(
            _type=self.table_schema,
            name=self.table_name,
            db=self._db,
            options={"is_replicated": "true"},
        )

    def drop_tables(self) -> None:
        """Delete the table"""
        self._db.clear_table(
            f"{self.table_name}", options={"no_error_if_not_exists": "true"}
        )

    def create_schema(self) -> None:
        """Create a new Kinetica schema"""
        self._db.create_schema(self.schema_name)

    def delete_schema(self) -> None:
        """Delete a Kinetica schema with cascade set to `true`
        This method will delete a schema with all tables in it.
        """
        self.logger.debug("Trying to delete collection")
        self._db.drop_schema(
            self.schema_name, {"no_error_if_not_exists": "true", "cascade": "true"}
        )

    def add_embeddings(
        self,
        texts: Iterable[str],
        embeddings: List[List[float]],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add embeddings to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            embeddings: List of list of embedding vectors.
            metadatas: List of metadatas associated with the texts.
            ids: List of ids for the text embedding pairs
            kwargs: vectorstore specific parameters
        """
        if ids is None:
            ids = [str(uuid.uuid1()) for _ in texts]

        if not metadatas:
            metadatas = [{} for _ in texts]

        records = []
        for text, embedding, metadata, id in zip(texts, embeddings, metadatas, ids):
            # Pack the embedding as raw float32 bytes for the vector column
            buf = struct.pack("%sf" % self.dimensions, *embedding)
            records.append([text, buf, json.dumps(metadata), id])

        self.EmbeddingStore.insert_records(records)

        return ids
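    # Usage sketch (illustrative values, assumes the store was initialized
    # with 3-dimensional embeddings); each record mirrors the table schema
    # declared in __post_init__:
    #
    #   ids = store.add_embeddings(
    #       texts=["hello"],
    #       embeddings=[[0.1, 0.2, 0.3]],
    #       metadatas=[{"source": "demo"}],
    #   )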

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas (JSON data) associated with the texts.
            ids: List of IDs (UUID) for the texts supplied; will be generated if None
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        embeddings = self.embedding_function.embed_documents(list(texts))
        self.dimensions = len(embeddings[0])
        if not hasattr(self, "EmbeddingStore"):
            self.__post_init__(self.dimensions)
        return self.add_embeddings(
            texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
        )

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Run similarity search with Kinetica with distance.

        Args:
            query (str): Query text to search for.
            k (int): Number of results to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

        Returns:
            List of Documents most similar to the query.
        """
        embedding = self.embedding_function.embed_query(text=query)
        return self.similarity_search_by_vector(
            embedding=embedding,
            k=k,
            filter=filter,
        )

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

        Returns:
            List of Documents most similar to the query and score for each
        """
        embedding = self.embedding_function.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k, filter=filter
        )
        return docs

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[dict] = None,
    ) -> List[Tuple[Document, float]]:
        resp: Dict = self.__query_collection(embedding, k, filter)

        # The response is column-oriented; zip the columns back into rows.
        records: OrderedDict = resp["records"]
        results = list(zip(*list(records.values())))

        return self._results_to_docs_and_scores(results)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

        Returns:
            List of Documents most similar to the query vector.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k, filter=filter
        )
        return [doc for doc, _ in docs_and_scores]

    def _results_to_docs_and_scores(
        self, results: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs and scores from results."""
        # Each result row is (text, metadata JSON, distance, embedding);
        # only the first three fields are used here.
        docs = [
            (
                Document(
                    page_content=result[0],
                    metadata=json.loads(result[1]),
                ),
                result[2] if self.embedding_function is not None else None,
            )
            for result in results
        ]
        return docs

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """
        The 'correct' relevance function
        may differ depending on a few things, including:
        - the distance / similarity metric used by the VectorStore
        - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
        - embedding dimensionality
        - etc.
        """
        if self.override_relevance_score_fn is not None:
            return self.override_relevance_score_fn

        # Default strategy is to rely on distance strategy provided
        # in vectorstore constructor
        if self._distance_strategy == DistanceStrategy.COSINE:
            return self._cosine_relevance_score_fn
        elif self._distance_strategy == DistanceStrategy.EUCLIDEAN:
            return self._euclidean_relevance_score_fn
        elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
            return self._max_inner_product_relevance_score_fn
        else:
            raise ValueError(
                "No supported normalization function"
                f" for distance_strategy of {self._distance_strategy}."
                " Consider providing relevance_score_fn to Kinetica constructor."
            )

    @property
    def distance_strategy(self) -> str:
        if self._distance_strategy == DistanceStrategy.EUCLIDEAN:
            return "l2_distance"
        elif self._distance_strategy == DistanceStrategy.COSINE:
            return "cosine_distance"
        elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
            return "dot_product"
        else:
            raise ValueError(
                f"Got unexpected value for distance: {self._distance_strategy}. "
                f"Should be one of {', '.join([ds.value for ds in DistanceStrategy])}."
            )

    def __query_collection(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Dict[str, str]] = None,
    ) -> Dict:
        """Query the collection."""
        # if filter is not None:
        #     filter_clauses = []
        #     for key, value in filter.items():
        #         IN = "in"
        #         if isinstance(value, dict) and IN in map(str.lower, value):
        #             value_case_insensitive = {
        #                 k.lower(): v for k, v in value.items()
        #             }
        #             filter_by_metadata = self.EmbeddingStore.cmetadata[
        #                 key
        #             ].astext.in_(value_case_insensitive[IN])
        #             filter_clauses.append(filter_by_metadata)
        #         else:
        #             filter_by_metadata = self.EmbeddingStore.cmetadata[
        #                 key
        #             ].astext == str(value)
        #             filter_clauses.append(filter_by_metadata)

        json_filter = json.dumps(filter) if filter is not None else None
        where_clause = (
            f" where '{json_filter}' = JSON(metadata) "
            if json_filter is not None
            else ""
        )

        embedding_str = "[" + ",".join([str(x) for x in embedding]) + "]"

        dist_strategy = self.distance_strategy

        query_string = f"""
            SELECT text, metadata, {dist_strategy}(embedding, '{embedding_str}')
                as distance, embedding
            FROM {self.table_name}
            {where_clause}
            ORDER BY distance asc NULLS LAST
            LIMIT {k}
        """

        self.logger.debug(query_string)
        resp = self._db.execute_sql_and_decode(query_string)
        self.logger.debug(resp)
        return resp
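    # Illustrative rendering of the SQL emitted by __query_collection
    # (EUCLIDEAN strategy, k=4, no metadata filter; embedding literal
    # truncated):
    #
    #   SELECT text, metadata, l2_distance(embedding, '[0.12,0.34,...]')
    #       as distance, embedding
    #   FROM langchain.langchain_kinetica_embeddings
    #   ORDER BY distance asc NULLS LAST
    #   LIMIT 4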
|
||||
def max_marginal_relevance_search_with_score_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs selected using the maximal marginal relevance with score
|
||||
to embedding vector.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k (int): Number of Documents to return. Defaults to 4.
|
||||
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
||||
Defaults to 20.
|
||||
lambda_mult (float): Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[Tuple[Document, float]]: List of Documents selected by maximal marginal
|
||||
relevance to the query and score for each.
|
||||
"""
|
||||
resp = self.__query_collection(embedding=embedding, k=fetch_k, filter=filter)
|
||||
records: OrderedDict = resp["records"]
|
||||
results = list(zip(*list(records.values())))
|
||||
|
||||
embedding_list = [
|
||||
struct.unpack("%sf" % self.dimensions, embedding)
|
||||
for embedding in records["embedding"]
|
||||
]
|
||||
|
||||
mmr_selected = maximal_marginal_relevance(
|
||||
np.array(embedding, dtype=np.float32),
|
||||
embedding_list,
|
||||
k=k,
|
||||
lambda_mult=lambda_mult,
|
||||
)
|
||||
|
||||
candidates = self._results_to_docs_and_scores(results)
|
||||
|
||||
return [r for i, r in enumerate(candidates) if i in mmr_selected]
|
||||
|
||||
def max_marginal_relevance_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
query (str): Text to look up documents similar to.
|
||||
k (int): Number of Documents to return. Defaults to 4.
|
||||
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
||||
Defaults to 20.
|
||||
lambda_mult (float): Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[Document]: List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
embedding = self.embedding_function.embed_query(query)
|
||||
return self.max_marginal_relevance_search_by_vector(
|
||||
embedding,
|
||||
k=k,
|
||||
fetch_k=fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
filter=filter,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def max_marginal_relevance_search_with_score(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filter: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs selected using the maximal marginal relevance with score.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
query (str): Text to look up documents similar to.
|
||||
k (int): Number of Documents to return. Defaults to 4.
|
||||
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
||||
Defaults to 20.
|
||||
lambda_mult (float): Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[Tuple[Document, float]]: List of Documents selected by maximal marginal
|
||||
relevance to the query and score for each.
|
||||
"""
|
||||
embedding = self.embedding_function.embed_query(query)
|
||||
docs = self.max_marginal_relevance_search_with_score_by_vector(
|
||||
embedding=embedding,
|
||||
k=k,
|
||||
fetch_k=fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
filter=filter,
|
||||
**kwargs,
|
||||
)
|
||||
return docs
|
||||
|
||||
def max_marginal_relevance_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance
|
||||
to embedding vector.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
embedding (str): Text to look up documents similar to.
|
||||
k (int): Number of Documents to return. Defaults to 4.
|
||||
fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
|
||||
Defaults to 20.
|
||||
lambda_mult (float): Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[Document]: List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
docs_and_scores = self.max_marginal_relevance_search_with_score_by_vector(
|
||||
embedding,
|
||||
k=k,
|
||||
fetch_k=fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
filter=filter,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
return _results_to_docs(docs_and_scores)
|
||||
|
||||
async def amax_marginal_relevance_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filter: Optional[Dict[str, str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance."""
|
||||
|
||||
# This is a temporary workaround to make the similarity search
|
||||
# asynchronous. The proper solution is to make the similarity search
|
||||
# asynchronous in the vector store implementations.
|
||||
func = partial(
|
||||
self.max_marginal_relevance_search_by_vector,
|
||||
embedding,
|
||||
k=k,
|
||||
fetch_k=fetch_k,
|
||||
lambda_mult=lambda_mult,
|
||||
filter=filter,
|
||||
**kwargs,
|
||||
)
|
||||
return await asyncio.get_event_loop().run_in_executor(None, func)
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
cls: Type[Kinetica],
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
config: KineticaSettings = KineticaSettings(),
|
||||
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
|
||||
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
|
||||
ids: Optional[List[str]] = None,
|
||||
pre_delete_collection: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> Kinetica:
|
||||
"""Adds the texts passed in to the vector store and returns it
|
||||
|
||||
Args:
|
||||
cls (Type[Kinetica]): Kinetica class
|
||||
texts (List[str]): A list of texts for which the embeddings are generated
|
||||
embedding (Embeddings): List of embeddings
|
||||
metadatas (Optional[List[dict]], optional): List of dicts, JSON
|
||||
describing the texts/documents. Defaults to None.
|
||||
config (KineticaSettings): a `KineticaSettings` instance
|
||||
collection_name (str, optional): Kinetica schema name.
|
||||
Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
|
||||
distance_strategy (DistanceStrategy, optional): Distance strategy
|
||||
e.g., l2, cosine etc.. Defaults to DEFAULT_DISTANCE_STRATEGY.
|
||||
ids (Optional[List[str]], optional): A list of UUIDs for each
|
||||
text/document. Defaults to None.
|
||||
pre_delete_collection (bool, optional): Indicates whether the Kinetica
|
||||
schema is to be deleted or not. Defaults to False.
|
||||
|
||||
Returns:
|
||||
Kinetica: a `Kinetica` instance
|
||||
"""
|
||||
|
||||
if len(texts) == 0:
|
||||
raise ValueError("texts is empty")
|
||||
|
||||
try:
|
||||
first_embedding = embedding.embed_documents(texts[0:1])
|
||||
except NotImplementedError:
|
||||
first_embedding = [embedding.embed_query(texts[0])]
|
||||
|
||||
dimensions = len(first_embedding[0])
|
||||
embeddings = embedding.embed_documents(list(texts))
|
||||
|
||||
kinetica_store = cls.__from(
|
||||
texts=texts,
|
||||
embeddings=embeddings,
|
||||
embedding=embedding,
|
||||
dimensions=dimensions,
|
||||
config=config,
|
||||
metadatas=metadatas,
|
||||
ids=ids,
|
||||
collection_name=collection_name,
|
||||
distance_strategy=distance_strategy,
|
||||
pre_delete_collection=pre_delete_collection,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
return kinetica_store
|
||||
|
||||
    @classmethod
    def from_embeddings(
        cls: Type[Kinetica],
        text_embeddings: List[Tuple[str, List[float]]],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        config: KineticaSettings = KineticaSettings(),
        dimensions: int = Dimension.OPENAI,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        ids: Optional[List[str]] = None,
        pre_delete_collection: bool = False,
        **kwargs: Any,
    ) -> Kinetica:
        """Add the embeddings passed in to the vector store and return it.

        Args:
            cls (Type[Kinetica]): Kinetica class
            text_embeddings (List[Tuple[str, List[float]]]): A list of texts
                paired with their precomputed embeddings
            embedding (Embeddings): The embedding function used for later queries
            metadatas (Optional[List[dict]], optional): List of dicts, JSON describing
                the texts/documents. Defaults to None.
            config (KineticaSettings): a `KineticaSettings` instance
            dimensions (int, optional): Dimension of the vector data; if not passed, a
                default is used. Defaults to Dimension.OPENAI.
            collection_name (str, optional): Kinetica schema name.
                Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
            distance_strategy (DistanceStrategy, optional): Distance strategy,
                e.g., l2, cosine. Defaults to DEFAULT_DISTANCE_STRATEGY.
            ids (Optional[List[str]], optional): A list of UUIDs for each text/document.
                Defaults to None.
            pre_delete_collection (bool, optional): Whether the existing
                Kinetica schema is to be deleted first. Defaults to False.

        Returns:
            Kinetica: a `Kinetica` instance
        """
        texts = [t[0] for t in text_embeddings]
        embeddings = [t[1] for t in text_embeddings]
        # Note: the declared `dimensions` argument is recomputed here from the
        # first precomputed embedding.
        dimensions = len(embeddings[0])

        return cls.__from(
            texts=texts,
            embeddings=embeddings,
            embedding=embedding,
            dimensions=dimensions,
            config=config,
            metadatas=metadatas,
            ids=ids,
            collection_name=collection_name,
            distance_strategy=distance_strategy,
            pre_delete_collection=pre_delete_collection,
            **kwargs,
        )

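    # Illustrative sketch (added commentary, not part of the original diff):
    # useful when the vectors were computed ahead of time. `my_embeddings` is a
    # hypothetical `Embeddings` implementation used only for later queries.
    #
    #     texts = ["foo", "bar"]
    #     pairs = list(zip(texts, my_embeddings.embed_documents(texts)))
    #     store = Kinetica.from_embeddings(
    #         text_embeddings=pairs,
    #         embedding=my_embeddings,
    #         config=KineticaSettings(host="http://127.0.0.1:9191"),
    #     )
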
    @classmethod
    def from_documents(
        cls: Type[Kinetica],
        documents: List[Document],
        embedding: Embeddings,
        config: KineticaSettings = KineticaSettings(),
        metadatas: Optional[List[dict]] = None,
        collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        ids: Optional[List[str]] = None,
        pre_delete_collection: bool = False,
        **kwargs: Any,
    ) -> Kinetica:
        """Add the list of `Document`s passed in to the vector store and return it.

        Args:
            cls (Type[Kinetica]): Kinetica class
            documents (List[Document]): A list of documents whose page content is
                embedded and whose metadata is stored alongside it
            embedding (Embeddings): The embedding function used to embed the documents
            config (KineticaSettings): a `KineticaSettings` instance
            metadatas (Optional[List[dict]], optional): List of dicts, JSON describing
                the texts/documents. Ignored here; metadata is taken from the
                documents themselves. Defaults to None.
            collection_name (str, optional): Kinetica schema name.
                Defaults to _LANGCHAIN_DEFAULT_COLLECTION_NAME.
            distance_strategy (DistanceStrategy, optional): Distance strategy,
                e.g., l2, cosine. Defaults to DEFAULT_DISTANCE_STRATEGY.
            ids (Optional[List[str]], optional): A list of UUIDs for each text/document.
                Defaults to None.
            pre_delete_collection (bool, optional): Whether the existing Kinetica
                schema is to be deleted first. Defaults to False.

        Returns:
            Kinetica: a `Kinetica` instance
        """
        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]

        return cls.from_texts(
            texts=texts,
            embedding=embedding,
            metadatas=metadatas,
            config=config,
            collection_name=collection_name,
            distance_strategy=distance_strategy,
            ids=ids,
            pre_delete_collection=pre_delete_collection,
            **kwargs,
        )

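    # Illustrative sketch (added commentary, not part of the original diff):
    # `from_documents` is a thin wrapper that splits each `Document` into text
    # and metadata before delegating to `from_texts`. `my_embeddings` is a
    # hypothetical placeholder.
    #
    #     docs = [Document(page_content="foo", metadata={"page": "0"})]
    #     store = Kinetica.from_documents(docs, embedding=my_embeddings)
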
@ -0,0 +1,300 @@
import os
from typing import List

import pytest
from langchain_core.documents import Document

from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import (
    DistanceStrategy,
    Kinetica,
    KineticaSettings,
)
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

DIMENSIONS = 3
HOST = os.getenv("KINETICA_HOST", "http://127.0.0.1:9191")
USERNAME = os.getenv("KINETICA_USERNAME", "")
PASSWORD = os.getenv("KINETICA_PASSWORD", "")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")


class FakeEmbeddingsWithAdaDimension(FakeEmbeddings):
    """Fake embeddings functionality for testing."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return simple embeddings."""
        return [[float(1.0)] * (DIMENSIONS - 1) + [float(i)] for i in range(len(texts))]

    def embed_query(self, text: str) -> List[float]:
        """Return simple embeddings."""
        return [float(1.0)] * (DIMENSIONS - 1) + [float(0.0)]


@pytest.fixture
def create_config() -> KineticaSettings:
    return KineticaSettings(host=HOST, username=USERNAME, password=PASSWORD)

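# Note on the fixture geometry (added commentary, not part of the original
# diff): for texts ["foo", "bar", "baz"] the fake embeddings are [1, 1, 0],
# [1, 1, 1] and [1, 1, 2], while every query embeds to [1, 1, 0]. The L2
# distances from a query to foo/bar/baz are therefore 0, 1 and 2, which is
# what the distance and relevance assertions below rely on.
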
@pytest.mark.requires("gpudb")
def test_kinetica(create_config: KineticaSettings) -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"text": text} for text in texts]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        collection_name="test_kinetica",
        pre_delete_collection=True,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output[0].page_content == "foo"


@pytest.mark.requires("gpudb")
def test_kinetica_embeddings(create_config: KineticaSettings) -> None:
    """Test end to end construction with embeddings and search."""
    texts = ["foo", "bar", "baz"]
    text_embeddings = FakeEmbeddingsWithAdaDimension().embed_documents(texts)
    text_embedding_pairs = list(zip(texts, text_embeddings))
    docsearch = Kinetica.from_embeddings(
        config=create_config,
        text_embeddings=text_embedding_pairs,
        embedding=FakeEmbeddingsWithAdaDimension(),
        collection_name="test_kinetica_embeddings",
        pre_delete_collection=False,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]


@pytest.mark.requires("gpudb")
def test_kinetica_with_metadatas(create_config: KineticaSettings) -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        collection_name="test_kinetica_with_metadatas",
        pre_delete_collection=False,
    )

    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo", metadata={"page": "0"})]


@pytest.mark.requires("gpudb")
def test_kinetica_with_metadatas_with_scores(create_config: KineticaSettings) -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        collection_name="test_kinetica_with_metadatas_with_scores",
        pre_delete_collection=False,
    )

    output = docsearch.similarity_search_with_score("foo", k=1)
    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]


@pytest.mark.requires("gpudb")
def test_kinetica_with_filter_match(create_config: KineticaSettings) -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        collection_name="test_kinetica_with_filter_match",
        pre_delete_collection=False,
    )

    output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "0"})
    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]


@pytest.mark.requires("gpudb")
def test_kinetica_with_filter_distant_match(create_config: KineticaSettings) -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        collection_name="test_kinetica_with_filter_distant_match",
        pre_delete_collection=False,
    )

    output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "2"})
    assert output == [(Document(page_content="baz", metadata={"page": "2"}), 2.0)]


@pytest.mark.skip(reason="Filter condition has IN clause")
@pytest.mark.requires("gpudb")
def test_kinetica_with_filter_in_set(create_config: KineticaSettings) -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        collection_name="test_kinetica_with_filter_in_set",
        pre_delete_collection=False,
    )

    output = docsearch.similarity_search_with_score(
        "foo", k=2, filter={"page": {"IN": ["0", "2"]}}
    )
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), 0.0),
        (Document(page_content="baz", metadata={"page": "2"}), 0.0013003906671379406),
    ]


@pytest.mark.requires("gpudb")
def test_kinetica_relevance_score(create_config: KineticaSettings) -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        collection_name="test_kinetica_relevance_score",
        pre_delete_collection=False,
    )

    output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), 1.0),
        (Document(page_content="bar", metadata={"page": "1"}), 0.29289321881345254),
        (Document(page_content="baz", metadata={"page": "2"}), -0.4142135623730949),
    ]


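# Added commentary (not part of the original diff): with Euclidean distance,
# LangChain's default relevance normalization is 1 - distance / sqrt(2), so
# the fixture distances 0, 1 and 2 map to 1.0, 1 - 1/sqrt(2) ≈ 0.2929 and
# 1 - 2/sqrt(2) ≈ -0.4142, matching the expected scores above.
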
@pytest.mark.requires("openai", "gpudb")
def test_kinetica_max_marginal_relevance_search(
    create_config: KineticaSettings,
) -> None:
    """Test end to end construction and search."""
    openai = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
    texts = ["foo", "bar", "baz"]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        embedding=openai,
        distance_strategy=DistanceStrategy.COSINE,
        collection_name="test_kinetica_max_marginal_relevance_search",
        pre_delete_collection=False,
    )

    output = docsearch.max_marginal_relevance_search("foo", k=1, fetch_k=3)
    assert output == [Document(page_content="foo")]


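# Added commentary (not part of the original diff): maximal marginal relevance
# re-ranks the fetch_k candidates by trading off query similarity against
# similarity to already-selected results; lambda_mult=1 favors pure relevance,
# lambda_mult=0 favors pure diversity.
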
@pytest.mark.requires("gpudb")
def test_kinetica_max_marginal_relevance_search_with_score(
    create_config: KineticaSettings,
) -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        embedding=FakeEmbeddingsWithAdaDimension(),
        distance_strategy=DistanceStrategy.EUCLIDEAN,
        collection_name="test_kinetica_max_marginal_relevance_search_with_score",
        pre_delete_collection=False,
    )

    output = docsearch.max_marginal_relevance_search_with_score("foo", k=1, fetch_k=3)
    assert output == [(Document(page_content="foo"), 0.0)]


@pytest.mark.requires("openai", "gpudb")
def test_kinetica_with_openai_embeddings(create_config: KineticaSettings) -> None:
    """Test end to end construction and search."""
    if OPENAI_API_KEY == "":
        pytest.skip("OPENAI_API_KEY is not set; skipping OpenAI-backed test")

    openai = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
    texts = ["foo", "bar", "baz"]
    metadatas = [{"text": text} for text in texts]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=openai,
        collection_name="kinetica_openai_test",
        pre_delete_collection=False,
    )

    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo", metadata={"text": "foo"})]


@pytest.mark.requires("gpudb")
def test_kinetica_retriever_search_threshold(create_config: KineticaSettings) -> None:
    """Test using retriever for searching with threshold."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        distance_strategy=DistanceStrategy.EUCLIDEAN,
        collection_name="test_kinetica_retriever_search_threshold",
        pre_delete_collection=False,
    )

    retriever = docsearch.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 3, "score_threshold": 0.999},
    )
    output = retriever.get_relevant_documents("summer")
    assert output == [
        Document(page_content="foo", metadata={"page": "0"}),
    ]


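# Added commentary (not part of the original diff): with the 0.999 threshold
# only "foo" survives, since its normalized relevance is 1.0 while "bar" and
# "baz" score roughly 0.29 and -0.41 under the default Euclidean
# normalization.
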
@pytest.mark.requires("gpudb")
def test_kinetica_retriever_search_threshold_custom_normalization_fn(
    create_config: KineticaSettings,
) -> None:
    """Test searching with threshold and custom normalization function."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Kinetica.from_texts(
        config=create_config,
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddingsWithAdaDimension(),
        distance_strategy=DistanceStrategy.EUCLIDEAN,
        collection_name="test_kinetica_retriever_search_threshold_custom_normalization_fn",
        pre_delete_collection=False,
        relevance_score_fn=lambda d: d * 0,
    )

    retriever = docsearch.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 3, "score_threshold": 0.5},
    )
    output = retriever.get_relevant_documents("foo")
    assert output == []
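# Added commentary (not part of the original diff): the custom
# relevance_score_fn maps every distance to 0, so no document can clear the
# 0.5 threshold and the retriever correctly returns an empty list.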
@ -10,5 +10,7 @@ def test_all_imports() -> None:
            "AlibabaCloudOpenSearchSettings",
            "ClickhouseSettings",
            "MyScaleSettings",
            "DistanceStrategy",
            "KineticaSettings",
        ]:
            assert issubclass(getattr(vectorstores, cls), VectorStore)
@ -21,6 +21,7 @@ _EXPECTED = [
    "DatabricksVectorSearch",
    "DeepLake",
    "Dingo",
    "DistanceStrategy",
    "DocArrayHnswSearch",
    "DocArrayInMemorySearch",
    "ElasticKnnSearch",
@ -31,6 +32,8 @@ _EXPECTED = [
    "HanaDB",
    "Hologres",
    "KDBAI",
    "Kinetica",
    "KineticaSettings",
    "LanceDB",
    "Lantern",
    "LLMRails",