community: Add support for SAP HANA Vector hnsw index creation (#27884)

**Issue:** Added support for creating indexes in the SAP HANA Vector
engine.
 
**Changes**: 
1. Introduced a new function `create_hnsw_index` in `hanavector.py` that
enables the creation of indexes for SAP HANA Vector.
2. Added integration tests for the index creation function to ensure
functionality.
3. Updated the documentation to reflect the new index creation feature,
including examples and output from the notebook.
4. Fixed the operator issue in the `_process_filter_object` function and
changed the array argument to a placeholder in the similarity search SQL
statement.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
cinqisap 2024-12-06 00:29:08 +01:00 committed by GitHub
parent 28f8d436f6
commit 482e8a7855
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 681 additions and 58 deletions

View File

@ -22,7 +22,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
@ -41,7 +41,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 1,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2023-09-09T08:02:16.802456Z", "end_time": "2023-09-09T08:02:16.802456Z",
@ -64,7 +64,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 9,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2023-09-09T08:02:28.174088Z", "end_time": "2023-09-09T08:02:28.174088Z",
@ -73,8 +73,10 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"from dotenv import load_dotenv\n",
"from hdbcli import dbapi\n", "from hdbcli import dbapi\n",
"\n", "\n",
"load_dotenv()\n",
"# Use connection settings from the environment\n", "# Use connection settings from the environment\n",
"connection = dbapi.connect(\n", "connection = dbapi.connect(\n",
" address=os.environ.get(\"HANA_DB_ADDRESS\"),\n", " address=os.environ.get(\"HANA_DB_ADDRESS\"),\n",
@ -102,14 +104,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 10,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2023-09-09T08:02:25.452472Z", "end_time": "2023-09-09T08:02:25.452472Z",
"start_time": "2023-09-09T08:02:25.441563Z" "start_time": "2023-09-09T08:02:25.441563Z"
} }
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of document chunks: 88\n"
]
}
],
"source": [ "source": [
"from langchain_community.document_loaders import TextLoader\n", "from langchain_community.document_loaders import TextLoader\n",
"from langchain_community.vectorstores.hanavector import HanaDB\n", "from langchain_community.vectorstores.hanavector import HanaDB\n",
@ -134,7 +144,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 11,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2023-09-09T08:04:16.696625Z", "end_time": "2023-09-09T08:04:16.696625Z",
@ -157,9 +167,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# Delete already existing documents from the table\n", "# Delete already existing documents from the table\n",
"db.delete(filter={})\n", "db.delete(filter={})\n",
@ -178,9 +199,24 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n",
"--------------------------------------------------------------------------------\n",
"As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n",
"\n",
"While it often appears that we never agree, that isnt true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice.\n"
]
}
],
"source": [ "source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = db.similarity_search(query, k=2)\n", "docs = db.similarity_search(query, k=2)\n",
@ -199,9 +235,24 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n",
"--------------------------------------------------------------------------------\n",
"As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n",
"\n",
"While it often appears that we never agree, that isnt true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice.\n"
]
}
],
"source": [ "source": [
"from langchain_community.vectorstores.utils import DistanceStrategy\n", "from langchain_community.vectorstores.utils import DistanceStrategy\n",
"\n", "\n",
@ -235,7 +286,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 15,
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
"end_time": "2023-09-09T08:05:23.276819Z", "end_time": "2023-09-09T08:05:23.276819Z",
@ -246,7 +297,24 @@
"outputs_hidden": false "outputs_hidden": false
} }
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n",
"--------------------------------------------------------------------------------\n",
"Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \n",
"\n",
"In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight. \n",
"\n",
"Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world.\n"
]
}
],
"source": [ "source": [
"docs = db.max_marginal_relevance_search(query, k=2, fetch_k=20)\n", "docs = db.max_marginal_relevance_search(query, k=2, fetch_k=20)\n",
"for doc in docs:\n", "for doc in docs:\n",
@ -254,6 +322,86 @@
" print(doc.page_content)" " print(doc.page_content)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating an HNSW Vector Index\n",
"\n",
"A vector index can significantly speed up top-k nearest neighbor queries for vectors. Users can create a Hierarchical Navigable Small World (HNSW) vector index using the `create_hnsw_index` function.\n",
"\n",
"For more information about creating an index at the database level, please refer to the [official documentation](https://help.sap.com/docs/hana-cloud-database/sap-hana-cloud-sap-hana-database-vector-engine-guide/create-vector-index-statement-data-definition).\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n",
"--------------------------------------------------------------------------------\n",
"Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \n",
"\n",
"In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight. \n",
"\n",
"Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world.\n"
]
}
],
"source": [
"# HanaDB instance uses cosine similarity as default:\n",
"db_cosine = HanaDB(\n",
" embedding=embeddings, connection=connection, table_name=\"STATE_OF_THE_UNION\"\n",
")\n",
"\n",
"# Attempting to create the HNSW index with default parameters\n",
"db_cosine.create_hnsw_index() # If no other parameters are specified, the default values will be used\n",
"# Default values: m=64, ef_construction=128, ef_search=200\n",
"# The default index name will be: STATE_OF_THE_UNION_COSINE_SIMILARITY_IDX (verify this naming pattern in HanaDB class)\n",
"\n",
"\n",
"# Creating a HanaDB instance with L2 distance as the similarity function and defined values\n",
"db_l2 = HanaDB(\n",
" embedding=embeddings,\n",
" connection=connection,\n",
" table_name=\"STATE_OF_THE_UNION\",\n",
" distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, # Specify L2 distance\n",
")\n",
"\n",
"# This will create an index based on L2 distance strategy.\n",
"db_l2.create_hnsw_index(\n",
" index_name=\"STATE_OF_THE_UNION_L2_index\",\n",
" m=100, # Max number of neighbors per graph node (valid range: 4 to 1000)\n",
" ef_construction=200, # Max number of candidates during graph construction (valid range: 1 to 100000)\n",
" ef_search=500, # Min number of candidates during the search (valid range: 1 to 100000)\n",
")\n",
"\n",
"# Use L2 index to perform MMR\n",
"docs = db_l2.max_marginal_relevance_search(query, k=2, fetch_k=20)\n",
"for doc in docs:\n",
" print(\"-\" * 80)\n",
" print(doc.page_content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"**Key Points**:\n",
"- **Similarity Function**: The similarity function for the index is **cosine similarity** by default. If you want to use a different similarity function (e.g., `L2` distance), you need to specify it when initializing the `HanaDB` instance.\n",
"- **Default Parameters**: In the `create_hnsw_index` function, if the user does not provide custom values for parameters like `m`, `ef_construction`, or `ef_search`, the default values (e.g., `m=64`, `ef_construction=128`, `ef_search=200`) will be used automatically. These values ensure the index is created with reasonable performance without requiring user intervention.\n",
"\n"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -263,9 +411,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 19,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"db = HanaDB(\n", "db = HanaDB(\n",
" connection=connection, embedding=embeddings, table_name=\"LANGCHAIN_DEMO_BASIC\"\n", " connection=connection, embedding=embeddings, table_name=\"LANGCHAIN_DEMO_BASIC\"\n",
@ -284,9 +443,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 20,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"docs = [Document(page_content=\"Some text\"), Document(page_content=\"Other docs\")]\n", "docs = [Document(page_content=\"Some text\"), Document(page_content=\"Other docs\")]\n",
"db.add_documents(docs)" "db.add_documents(docs)"
@ -301,9 +471,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 21,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"docs = [\n", "docs = [\n",
" Document(\n", " Document(\n",
@ -327,9 +508,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 22,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"foo\n",
"{'start': 100, 'end': 150, 'doc_name': 'foo.txt', 'quality': 'bad'}\n"
]
}
],
"source": [ "source": [
"docs = db.similarity_search(\"foobar\", k=2, filter={\"quality\": \"bad\"})\n", "docs = db.similarity_search(\"foobar\", k=2, filter={\"quality\": \"bad\"})\n",
"# With filtering on \"quality\"==\"bad\", only one document should be returned\n", "# With filtering on \"quality\"==\"bad\", only one document should be returned\n",
@ -348,9 +539,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 23,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
}
],
"source": [ "source": [
"db.delete(filter={\"quality\": \"bad\"})\n", "db.delete(filter={\"quality\": \"bad\"})\n",
"\n", "\n",
@ -385,7 +584,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 24,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -433,9 +632,30 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 25,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filter: {'id': {'$ne': 1}}\n",
"{'name': 'bob', 'is_active': False, 'id': 2, 'height': 5.7}\n",
"{'name': 'jane', 'is_active': True, 'id': 3, 'height': 2.4}\n",
"Filter: {'id': {'$gt': 1}}\n",
"{'name': 'bob', 'is_active': False, 'id': 2, 'height': 5.7}\n",
"{'name': 'jane', 'is_active': True, 'id': 3, 'height': 2.4}\n",
"Filter: {'id': {'$gte': 1}}\n",
"{'name': 'adam', 'is_active': True, 'id': 1, 'height': 10.0}\n",
"{'name': 'bob', 'is_active': False, 'id': 2, 'height': 5.7}\n",
"{'name': 'jane', 'is_active': True, 'id': 3, 'height': 2.4}\n",
"Filter: {'id': {'$lt': 1}}\n",
"<empty result>\n",
"Filter: {'id': {'$lte': 1}}\n",
"{'name': 'adam', 'is_active': True, 'id': 1, 'height': 10.0}\n"
]
}
],
"source": [ "source": [
"advanced_filter = {\"id\": {\"$ne\": 1}}\n", "advanced_filter = {\"id\": {\"$ne\": 1}}\n",
"print(f\"Filter: {advanced_filter}\")\n", "print(f\"Filter: {advanced_filter}\")\n",
@ -467,9 +687,24 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 26,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filter: {'id': {'$between': (1, 2)}}\n",
"{'name': 'adam', 'is_active': True, 'id': 1, 'height': 10.0}\n",
"{'name': 'bob', 'is_active': False, 'id': 2, 'height': 5.7}\n",
"Filter: {'name': {'$in': ['adam', 'bob']}}\n",
"{'name': 'adam', 'is_active': True, 'id': 1, 'height': 10.0}\n",
"{'name': 'bob', 'is_active': False, 'id': 2, 'height': 5.7}\n",
"Filter: {'name': {'$nin': ['adam', 'bob']}}\n",
"{'name': 'jane', 'is_active': True, 'id': 3, 'height': 2.4}\n"
]
}
],
"source": [ "source": [
"advanced_filter = {\"id\": {\"$between\": (1, 2)}}\n", "advanced_filter = {\"id\": {\"$between\": (1, 2)}}\n",
"print(f\"Filter: {advanced_filter}\")\n", "print(f\"Filter: {advanced_filter}\")\n",
@ -493,9 +728,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filter: {'name': {'$like': 'a%'}}\n",
"{'name': 'adam', 'is_active': True, 'id': 1, 'height': 10.0}\n",
"Filter: {'name': {'$like': '%a%'}}\n",
"{'name': 'adam', 'is_active': True, 'id': 1, 'height': 10.0}\n",
"{'name': 'jane', 'is_active': True, 'id': 3, 'height': 2.4}\n"
]
}
],
"source": [ "source": [
"advanced_filter = {\"name\": {\"$like\": \"a%\"}}\n", "advanced_filter = {\"name\": {\"$like\": \"a%\"}}\n",
"print(f\"Filter: {advanced_filter}\")\n", "print(f\"Filter: {advanced_filter}\")\n",
@ -515,9 +762,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 28,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filter: {'$or': [{'id': 1}, {'name': 'bob'}]}\n",
"{'name': 'adam', 'is_active': True, 'id': 1, 'height': 10.0}\n",
"{'name': 'bob', 'is_active': False, 'id': 2, 'height': 5.7}\n",
"Filter: {'$and': [{'id': 1}, {'id': 2}]}\n",
"<empty result>\n",
"Filter: {'$or': [{'id': 1}, {'id': 2}, {'id': 3}]}\n",
"{'name': 'adam', 'is_active': True, 'id': 1, 'height': 10.0}\n",
"{'name': 'bob', 'is_active': False, 'id': 2, 'height': 5.7}\n",
"{'name': 'jane', 'is_active': True, 'id': 3, 'height': 2.4}\n"
]
}
],
"source": [ "source": [
"advanced_filter = {\"$or\": [{\"id\": 1}, {\"name\": \"bob\"}]}\n", "advanced_filter = {\"$or\": [{\"id\": 1}, {\"name\": \"bob\"}]}\n",
"print(f\"Filter: {advanced_filter}\")\n", "print(f\"Filter: {advanced_filter}\")\n",
@ -541,7 +804,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -574,7 +837,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 30,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -635,9 +898,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 32,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Answer from LLM:\n",
"================\n",
"The United States has set up joint patrols with Mexico and Guatemala to catch more human traffickers. This collaboration is part of the efforts to address immigration issues and secure the borders in the region.\n",
"================\n",
"Number of used source document chunks: 5\n"
]
}
],
"source": [ "source": [
"question = \"What about Mexico and Guatemala?\"\n", "question = \"What about Mexico and Guatemala?\"\n",
"\n", "\n",
@ -679,9 +954,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 34,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Answer from LLM:\n",
"================\n",
"Mexico and Guatemala are involved in joint patrols to catch human traffickers.\n"
]
}
],
"source": [ "source": [
"question = \"What about other countries?\"\n", "question = \"What about other countries?\"\n",
"\n", "\n",
@ -711,9 +996,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 35,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# Access the vector DB with a new table\n", "# Access the vector DB with a new table\n",
"db = HanaDB(\n", "db = HanaDB(\n",
@ -742,9 +1038,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 36,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('VEC_META', 'NCLOB')\n",
"('VEC_TEXT', 'NCLOB')\n",
"('VEC_VECTOR', 'REAL_VECTOR')\n"
]
}
],
"source": [ "source": [
"cur = connection.cursor()\n", "cur = connection.cursor()\n",
"cur.execute(\n", "cur.execute(\n",
@ -795,12 +1101,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 39,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n",
"Some other text\n",
"{\"start\": 400, \"end\": 450, \"doc_name\": \"other.txt\"}\n",
"<memory at 0x7f5edcb18d00>\n"
]
}
],
"source": [ "source": [
"# Create a new table \"MY_OWN_TABLE\" with three \"standard\" columns and one additional column\n", "# Create a new table \"MY_OWN_TABLE_ADD\" with three \"standard\" columns and one additional column\n",
"my_own_table_name = \"MY_OWN_TABLE\"\n", "my_own_table_name = \"MY_OWN_TABLE_ADD\"\n",
"cur = connection.cursor()\n", "cur = connection.cursor()\n",
"cur.execute(\n", "cur.execute(\n",
" (\n", " (\n",
@ -851,9 +1168,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 40,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Some other text\n",
"--------------------------------------------------------------------------------\n",
"Some more text\n"
]
}
],
"source": [ "source": [
"docs = [\n", "docs = [\n",
" Document(\n", " Document(\n",
@ -886,9 +1214,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 41,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filters on this value are very performant\n",
"Some other text\n",
"{\"start\": 400, \"end\": 450, \"doc_name\": \"other.txt\", \"CUSTOMTEXT\": \"Filters on this value are very performant\"}\n",
"<memory at 0x7f5edcb193c0>\n"
]
}
],
"source": [ "source": [
"# Create a new table \"PERFORMANT_CUSTOMTEXT_FILTER\" with three \"standard\" columns and one additional column\n", "# Create a new table \"PERFORMANT_CUSTOMTEXT_FILTER\" with three \"standard\" columns and one additional column\n",
"my_own_table_name = \"PERFORMANT_CUSTOMTEXT_FILTER\"\n", "my_own_table_name = \"PERFORMANT_CUSTOMTEXT_FILTER\"\n",
@ -952,9 +1291,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 42,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Some other text\n",
"--------------------------------------------------------------------------------\n",
"Some more text\n"
]
}
],
"source": [ "source": [
"docs = [\n", "docs = [\n",
" Document(\n", " Document(\n",
@ -994,7 +1344,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.9" "version": "3.10.14"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -256,6 +256,89 @@ class HanaDB(VectorStore):
return metadata, special_metadata return metadata, special_metadata
def create_hnsw_index(
    self,
    m: Optional[int] = None,  # Optional M parameter
    ef_construction: Optional[int] = None,  # Optional efConstruction parameter
    ef_search: Optional[int] = None,  # Optional efSearch parameter
    index_name: Optional[str] = None,  # Optional custom index name
) -> None:
    """Create an HNSW vector index on this table's vector column.

    Any build/search parameter left as ``None`` falls back to the
    database-side default; values outside the valid range raise a
    ``ValueError`` before any SQL is sent. The index is always created
    in ONLINE mode.

    Args:
        m: (Optional) Maximum number of neighbors per graph node
            (Valid Range: [4, 1000])
        ef_construction: (Optional) Maximal candidates to consider when
            building the graph (Valid Range: [1, 100000])
        ef_search: (Optional) Minimum candidates for top-k-nearest
            neighbor queries (Valid Range: [1, 100000])
        index_name: (Optional) Custom index name. Defaults to
            <table_name>_<distance_strategy>_idx
    """
    similarity = HANA_DISTANCE_FUNCTION[self.distance_strategy][0]
    # Fall back to the conventional <table>_<function>_idx name when the
    # caller did not supply one; user-supplied names are sanitized.
    if index_name:
        index_name = HanaDB._sanitize_name(index_name)
    else:
        index_name = f"{self.table_name}_{similarity}_idx"

    def _checked(value: int, low: int, high: int, label: str) -> int:
        # Sanitize then range-check a single index parameter.
        value = HanaDB._sanitize_int(value)
        if not (low <= value <= high):
            raise ValueError(f"{label} must be in the range [{low}, {high}]")
        return value

    build_params: dict = {}
    search_params: dict = {}
    if m is not None:
        build_params["M"] = _checked(m, 4, 1000, "M")
    if ef_construction is not None:
        build_params["efConstruction"] = _checked(
            ef_construction, 1, 100000, "efConstruction"
        )
    if ef_search is not None:
        search_params["efSearch"] = _checked(ef_search, 1, 100000, "efSearch")

    # Assemble the CREATE statement; configuration clauses are emitted
    # only when the corresponding parameters were supplied.
    sql_parts = [
        f'CREATE HNSW VECTOR INDEX {index_name} ON "{self.table_name}" ',
        f'("{self.vector_column}") ',
        f"SIMILARITY FUNCTION {similarity} ",
    ]
    if build_params:
        sql_parts.append(f"BUILD CONFIGURATION '{json.dumps(build_params)}' ")
    if search_params:
        sql_parts.append(f"SEARCH CONFIGURATION '{json.dumps(search_params)}' ")
    # Always create the index in ONLINE mode.
    sql_parts.append("ONLINE ")

    cur = self.connection.cursor()
    try:
        cur.execute("".join(sql_parts))
    finally:
        cur.close()
def add_texts( # type: ignore[override] def add_texts( # type: ignore[override]
self, self,
texts: Iterable[str], texts: Iterable[str],
@ -418,18 +501,18 @@ class HanaDB(VectorStore):
k = HanaDB._sanitize_int(k) k = HanaDB._sanitize_int(k)
embedding = HanaDB._sanitize_list_float(embedding) embedding = HanaDB._sanitize_list_float(embedding)
distance_func_name = HANA_DISTANCE_FUNCTION[self.distance_strategy][0] distance_func_name = HANA_DISTANCE_FUNCTION[self.distance_strategy][0]
embedding_as_str = ",".join(map(str, embedding)) embedding_as_str = "[" + ",".join(map(str, embedding)) + "]"
sql_str = ( sql_str = (
f"SELECT TOP {k}" f"SELECT TOP {k}"
f' "{self.content_column}", ' # row[0] f' "{self.content_column}", ' # row[0]
f' "{self.metadata_column}", ' # row[1] f' "{self.metadata_column}", ' # row[1]
f' TO_NVARCHAR("{self.vector_column}"), ' # row[2] f' TO_NVARCHAR("{self.vector_column}"), ' # row[2]
f' {distance_func_name}("{self.vector_column}", TO_REAL_VECTOR ' f' {distance_func_name}("{self.vector_column}", TO_REAL_VECTOR (?)) AS CS '
f" (ARRAY({embedding_as_str}))) AS CS " # row[3]
f'FROM "{self.table_name}"' f'FROM "{self.table_name}"'
) )
order_str = f" order by CS {HANA_DISTANCE_FUNCTION[self.distance_strategy][1]}" order_str = f" order by CS {HANA_DISTANCE_FUNCTION[self.distance_strategy][1]}"
where_str, query_tuple = self._create_where_by_filter(filter) where_str, query_tuple = self._create_where_by_filter(filter)
query_tuple = (embedding_as_str,) + tuple(query_tuple)
sql_str = sql_str + where_str sql_str = sql_str + where_str
sql_str = sql_str + order_str sql_str = sql_str + order_str
try: try:
@ -512,7 +595,7 @@ class HanaDB(VectorStore):
where_str_logical, where_str_logical,
query_tuple_logical, query_tuple_logical,
) = self._process_filter_object(logical_operand) ) = self._process_filter_object(logical_operand)
where_str += where_str_logical where_str += "(" + where_str_logical + ")"
query_tuple += query_tuple_logical query_tuple += query_tuple_logical
continue continue

View File

@ -1432,3 +1432,193 @@ def test_preexisting_specific_columns_for_returned_metadata_completeness(
assert docs[0].metadata["quality"] == "good" assert docs[0].metadata["quality"] == "good"
assert docs[0].metadata["ready"] assert docs[0].metadata["ready"]
assert "NonExisting" not in docs[0].metadata.keys() assert "NonExisting" not in docs[0].metadata.keys()
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_with_default_values(texts: List[str]) -> None:
    """Index creation with database-default parameters succeeds and the
    table remains fully searchable afterwards."""
    table_name = "TEST_TABLE_HNSW_INDEX_DEFAULT"
    # Start from a clean slate in case a previous run left the table behind.
    drop_table(test_setup.conn, table_name)
    vector_db = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )
    try:
        vector_db.create_hnsw_index()
    except Exception as e:
        pytest.fail(f"Failed to create HNSW index: {e}")
    # MMR over the indexed column still returns the exact text first and a
    # different chunk second.
    results = vector_db.max_marginal_relevance_search(texts[0], k=2, fetch_k=20)
    assert len(results) == 2
    assert results[0].page_content == texts[0]
    assert results[1].page_content != texts[0]
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_with_defined_values(texts: List[str]) -> None:
    """Index creation with explicit name and tuning parameters succeeds
    under the Euclidean distance strategy."""
    table_name = "TEST_TABLE_HNSW_INDEX_DEFINED"
    # Remove any leftover table from earlier runs.
    drop_table(test_setup.conn, table_name)
    vector_db = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )
    try:
        vector_db.create_hnsw_index(
            index_name="my_L2_index", ef_search=500, m=100, ef_construction=200
        )
    except Exception as e:
        pytest.fail(f"Failed to create HNSW index with defined values: {e}")
    # Searching through the freshly built index still behaves correctly.
    results = vector_db.max_marginal_relevance_search(texts[0], k=2, fetch_k=20)
    assert len(results) == 2
    assert results[0].page_content == texts[0]
    assert results[1].page_content != texts[0]
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_after_initialization(texts: List[str]) -> None:
    """An index created on an empty table keeps working once documents
    are added afterwards."""
    table_name = "TEST_TABLE_HNSW_INDEX_AFTER_INIT"
    drop_table(test_setup.conn, table_name)
    # Construct the store without inserting any documents yet.
    vector_db = HanaDB(
        connection=test_setup.conn,
        embedding=embedding,
        table_name=table_name,
    )
    # Build the index on the still-empty table.
    vector_db.create_hnsw_index(
        index_name="index_pre_add", ef_search=400, m=50, ef_construction=150
    )
    # Documents inserted after index creation must be indexed as well.
    vector_db.add_texts(texts=texts)
    results = vector_db.similarity_search(texts[0], k=3)
    assert len(results) == 3
    assert results[0].page_content == texts[0]
    assert results[1].page_content != texts[0]
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_duplicate_hnsw_index_creation(texts: List[str]) -> None:
    """Creating a second HNSW index on the same vector column is rejected
    by the database."""
    table_name = "TEST_TABLE_HNSW_DUPLICATE_INDEX"
    # Remove any leftover table from earlier runs.
    drop_table(test_setup.conn, table_name)
    vector_db = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )
    # First creation succeeds.
    vector_db.create_hnsw_index(
        index_name="index_cosine",
        ef_search=300,
        m=80,
        ef_construction=100,
    )
    # A duplicate index on the same column must raise.
    with pytest.raises(Exception):
        vector_db.create_hnsw_index(ef_search=300, m=80, ef_construction=100)
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_invalid_m_value(texts: List[str]) -> None:
    """`m` values outside [4, 1000] are rejected with a ValueError."""
    table_name = "TEST_TABLE_HNSW_INVALID_M"
    drop_table(test_setup.conn, table_name)
    vector_db = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )
    # Below the lower bound of the valid range.
    with pytest.raises(ValueError):
        vector_db.create_hnsw_index(m=3)
    # Above the upper bound of the valid range.
    with pytest.raises(ValueError):
        vector_db.create_hnsw_index(m=1001)
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_invalid_ef_construction(texts: List[str]) -> None:
    """`ef_construction` values outside [1, 100000] are rejected with a
    ValueError."""
    table_name = "TEST_TABLE_HNSW_INVALID_EF_CONSTRUCTION"
    drop_table(test_setup.conn, table_name)
    vector_db = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )
    # Below the lower bound of the valid range.
    with pytest.raises(ValueError):
        vector_db.create_hnsw_index(ef_construction=0)
    # Above the upper bound of the valid range.
    with pytest.raises(ValueError):
        vector_db.create_hnsw_index(ef_construction=100001)
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_invalid_ef_search(texts: List[str]) -> None:
    """`ef_search` values outside [1, 100000] are rejected with a
    ValueError."""
    table_name = "TEST_TABLE_HNSW_INVALID_EF_SEARCH"
    drop_table(test_setup.conn, table_name)
    vector_db = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )
    # Below the lower bound of the valid range.
    with pytest.raises(ValueError):
        vector_db.create_hnsw_index(ef_search=0)
    # Above the upper bound of the valid range.
    with pytest.raises(ValueError):
        vector_db.create_hnsw_index(ef_search=100001)