community: Add support for SAP HANA Vector hnsw index creation (#27884)

**Issue:** Added support for creating indexes in the SAP HANA Vector
engine.
 
**Changes**: 
1. Introduced a new function `create_hnsw_index` in `hanavector.py` that
enables the creation of indexes for SAP HANA Vector.
2. Added integration tests for the index creation function to ensure
functionality.
3. Updated the documentation to reflect the new index creation feature,
including examples and output from the notebook.
4. Fixed the operator issue in the `_process_filter_object` function and
changed the array argument to a placeholder in the similarity search SQL
statement.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
cinqisap
2024-12-06 00:29:08 +01:00
committed by GitHub
parent 28f8d436f6
commit 482e8a7855
3 changed files with 681 additions and 58 deletions

View File

@@ -256,6 +256,89 @@ class HanaDB(VectorStore):
return metadata, special_metadata
def create_hnsw_index(
    self,
    m: Optional[int] = None,  # Optional M parameter
    ef_construction: Optional[int] = None,  # Optional efConstruction parameter
    ef_search: Optional[int] = None,  # Optional efSearch parameter
    index_name: Optional[str] = None,  # Optional custom index name
) -> None:
    """
    Create an HNSW vector index on this store's table and vector column.

    Build and search settings are only written into the statement when the
    corresponding argument is given; otherwise the clause is omitted entirely.
    Each supplied value is range-checked here before any SQL is executed.
    The statement always ends with the ONLINE keyword.

    Args:
        m: (Optional) Maximum number of neighbors per graph node
            (Valid Range: [4, 1000])
        ef_construction: (Optional) Maximal candidates to consider when
            building the graph (Valid Range: [1, 100000])
        ef_search: (Optional) Minimum candidates for top-k-nearest neighbor
            queries (Valid Range: [1, 100000])
        index_name: (Optional) Custom index name. Defaults to
            <table_name>_<distance_function_name>_idx

    Raises:
        ValueError: If a supplied parameter lies outside its valid range.
    """
    # The similarity function name is used both in the default index name
    # and in the SIMILARITY FUNCTION clause.
    similarity_function = HANA_DISTANCE_FUNCTION[self.distance_strategy][0]
    fallback_name = f"{self.table_name}_{similarity_function}_idx"

    # Only a caller-supplied name goes through the sanitizer.
    if index_name:
        index_name = HanaDB._sanitize_name(index_name)
    else:
        index_name = fallback_name

    build_options = {}
    search_options = {}

    # Graph construction parameters.
    if m is not None:
        m = HanaDB._sanitize_int(m)
        if m < 4 or m > 1000:
            raise ValueError("M must be in the range [4, 1000]")
        build_options["M"] = m

    if ef_construction is not None:
        ef_construction = HanaDB._sanitize_int(ef_construction)
        if ef_construction < 1 or ef_construction > 100000:
            raise ValueError("efConstruction must be in the range [1, 100000]")
        build_options["efConstruction"] = ef_construction

    # Query-time parameter.
    if ef_search is not None:
        ef_search = HanaDB._sanitize_int(ef_search)
        if ef_search < 1 or ef_search > 100000:
            raise ValueError("efSearch must be in the range [1, 100000]")
        search_options["efSearch"] = ef_search

    # Assemble the statement piece by piece; every piece carries its own
    # trailing space so the parts can simply be concatenated.
    statement_parts = [
        f'CREATE HNSW VECTOR INDEX {index_name} ON "{self.table_name}" ',
        f'("{self.vector_column}") ',
        f"SIMILARITY FUNCTION {similarity_function} ",
    ]
    if build_options:
        statement_parts.append(
            f"BUILD CONFIGURATION '{json.dumps(build_options)}' "
        )
    if search_options:
        statement_parts.append(
            f"SEARCH CONFIGURATION '{json.dumps(search_options)}' "
        )
    statement_parts.append("ONLINE ")
    sql_str = "".join(statement_parts)

    cursor = self.connection.cursor()
    try:
        cursor.execute(sql_str)
    finally:
        # Release the cursor whether or not the statement succeeded.
        cursor.close()
def add_texts( # type: ignore[override]
self,
texts: Iterable[str],
@@ -418,18 +501,18 @@ class HanaDB(VectorStore):
k = HanaDB._sanitize_int(k)
embedding = HanaDB._sanitize_list_float(embedding)
distance_func_name = HANA_DISTANCE_FUNCTION[self.distance_strategy][0]
embedding_as_str = ",".join(map(str, embedding))
embedding_as_str = "[" + ",".join(map(str, embedding)) + "]"
sql_str = (
f"SELECT TOP {k}"
f' "{self.content_column}", ' # row[0]
f' "{self.metadata_column}", ' # row[1]
f' TO_NVARCHAR("{self.vector_column}"), ' # row[2]
f' {distance_func_name}("{self.vector_column}", TO_REAL_VECTOR '
f" (ARRAY({embedding_as_str}))) AS CS " # row[3]
f' {distance_func_name}("{self.vector_column}", TO_REAL_VECTOR (?)) AS CS '
f'FROM "{self.table_name}"'
)
order_str = f" order by CS {HANA_DISTANCE_FUNCTION[self.distance_strategy][1]}"
where_str, query_tuple = self._create_where_by_filter(filter)
query_tuple = (embedding_as_str,) + tuple(query_tuple)
sql_str = sql_str + where_str
sql_str = sql_str + order_str
try:
@@ -512,7 +595,7 @@ class HanaDB(VectorStore):
where_str_logical,
query_tuple_logical,
) = self._process_filter_object(logical_operand)
where_str += where_str_logical
where_str += "(" + where_str_logical + ")"
query_tuple += query_tuple_logical
continue

View File

@@ -1432,3 +1432,193 @@ def test_preexisting_specific_columns_for_returned_metadata_completeness(
assert docs[0].metadata["quality"] == "good"
assert docs[0].metadata["ready"]
assert "NonExisting" not in docs[0].metadata.keys()
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_with_default_values(texts: List[str]) -> None:
    """Creating an HNSW index with no arguments succeeds and MMR search works."""
    table_name = "TEST_TABLE_HNSW_INDEX_DEFAULT"

    # Start from a clean slate in case an earlier run left the table behind.
    drop_table(test_setup.conn, table_name)

    # Populate a fresh table with the fixture texts.
    store = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )

    # Index creation without any parameters must not raise.
    try:
        store.create_hnsw_index()
    except Exception as e:
        pytest.fail(f"Failed to create HNSW index: {e}")

    # Search after index creation and check the expected documents come back.
    hits = store.max_marginal_relevance_search(texts[0], k=2, fetch_k=20)
    assert len(hits) == 2
    assert hits[0].page_content == texts[0]
    assert hits[1].page_content != texts[0]
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_with_defined_values(texts: List[str]) -> None:
    """Creating an HNSW index with explicit parameters succeeds on a
    Euclidean-distance store, and MMR search works afterwards."""
    table_name = "TEST_TABLE_HNSW_INDEX_DEFINED"

    # Start from a clean slate in case an earlier run left the table behind.
    drop_table(test_setup.conn, table_name)

    # Populate a fresh table that uses the Euclidean distance strategy.
    store = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )

    # Index creation with a custom name and in-range parameters must not raise.
    try:
        store.create_hnsw_index(
            index_name="my_L2_index",
            ef_search=500,
            m=100,
            ef_construction=200,
        )
    except Exception as e:
        pytest.fail(f"Failed to create HNSW index with defined values: {e}")

    # Search after index creation and check the expected documents come back.
    hits = store.max_marginal_relevance_search(texts[0], k=2, fetch_k=20)
    assert len(hits) == 2
    assert hits[0].page_content == texts[0]
    assert hits[1].page_content != texts[0]
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_after_initialization(texts: List[str]) -> None:
    """An HNSW index can be created before any documents are added, and
    similarity search works once texts are inserted afterwards."""
    table_name = "TEST_TABLE_HNSW_INDEX_AFTER_INIT"
    drop_table(test_setup.conn, table_name)

    # Construct the store directly; no documents are inserted at this point.
    store = HanaDB(
        connection=test_setup.conn,
        embedding=embedding,
        table_name=table_name,
    )

    # Create the index first ...
    store.create_hnsw_index(
        index_name="index_pre_add",
        ef_search=400,
        m=50,
        ef_construction=150,
    )

    # ... and only then add the texts.
    store.add_texts(texts=texts)

    # The query text itself must rank first among the three results.
    hits = store.similarity_search(texts[0], k=3)
    assert len(hits) == 3
    assert hits[0].page_content == texts[0]
    assert hits[1].page_content != texts[0]
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_duplicate_hnsw_index_creation(texts: List[str]) -> None:
    """A second create_hnsw_index call on the same store is expected to raise."""
    table_name = "TEST_TABLE_HNSW_DUPLICATE_INDEX"

    # Start from a clean slate in case an earlier run left the table behind.
    drop_table(test_setup.conn, table_name)

    # Populate a fresh table with the fixture texts.
    store = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )

    # First index: explicit name, succeeds.
    store.create_hnsw_index(
        index_name="index_cosine",
        ef_search=300,
        m=80,
        ef_construction=100,
    )

    # Second index on the same store (default name this time) must fail.
    with pytest.raises(Exception):
        store.create_hnsw_index(ef_search=300, m=80, ef_construction=100)
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_invalid_m_value(texts: List[str]) -> None:
    """Out-of-range `m` values are rejected with ValueError."""
    table_name = "TEST_TABLE_HNSW_INVALID_M"

    # Cleanup: drop the table if it exists
    drop_table(test_setup.conn, table_name)

    # Populate a fresh table with the fixture texts.
    store = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )

    # 3 is just below the lower bound, 1001 just above the upper bound.
    for bad_m in (3, 1001):
        with pytest.raises(ValueError):
            store.create_hnsw_index(m=bad_m)
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_invalid_ef_construction(texts: List[str]) -> None:
    """Out-of-range `ef_construction` values are rejected with ValueError."""
    table_name = "TEST_TABLE_HNSW_INVALID_EF_CONSTRUCTION"

    # Cleanup: drop the table if it exists
    drop_table(test_setup.conn, table_name)

    # Populate a fresh table with the fixture texts.
    store = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )

    # 0 is just below the lower bound, 100001 just above the upper bound.
    for bad_ef_construction in (0, 100001):
        with pytest.raises(ValueError):
            store.create_hnsw_index(ef_construction=bad_ef_construction)
@pytest.mark.skipif(not hanadb_installed, reason="hanadb not installed")
def test_create_hnsw_index_invalid_ef_search(texts: List[str]) -> None:
    """Out-of-range `ef_search` values are rejected with ValueError."""
    table_name = "TEST_TABLE_HNSW_INVALID_EF_SEARCH"

    # Cleanup: drop the table if it exists
    drop_table(test_setup.conn, table_name)

    # Populate a fresh table with the fixture texts.
    store = HanaDB.from_texts(
        connection=test_setup.conn,
        texts=texts,
        embedding=embedding,
        table_name=table_name,
    )

    # 0 is just below the lower bound, 100001 just above the upper bound.
    for bad_ef_search in (0, 100001):
        with pytest.raises(ValueError):
            store.create_hnsw_index(ef_search=bad_ef_search)