From ce23f8293abcdbb21b867e5198691ef822823c66 Mon Sep 17 00:00:00 2001 From: Mark Needham Date: Mon, 22 Apr 2024 18:46:37 +0100 Subject: [PATCH] Community patch clickhouse make it possible to not specify index (#20460) Vector indexes in ClickHouse are experimental at the moment and can sometimes break/change behaviour. So this PR makes it possible to say that you don't want to specify an index type. Any queries against the embedding column will be brute force/linear scan, but that gives reasonable performance for small-medium dataset sizes. --------- Co-authored-by: Erick Friis Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> --- .../vectorstores/clickhouse.py | 71 ++++++++++++++----- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/clickhouse.py b/libs/community/langchain_community/vectorstores/clickhouse.py index bc541084547..e2083deb47e 100644 --- a/libs/community/langchain_community/vectorstores/clickhouse.py +++ b/libs/community/langchain_community/vectorstores/clickhouse.py @@ -72,7 +72,7 @@ class ClickhouseSettings(BaseSettings): username: Optional[str] = None password: Optional[str] = None - index_type: str = "annoy" + index_type: Optional[str] = "annoy" # Annoy supports L2Distance and cosineDistance. 
def _schema(self, dim: int, index_params: Optional[str] = "") -> str:
    """Build the ``CREATE TABLE IF NOT EXISTS`` statement for the store's table.

    The column names, database and table are read from ``self.config``.
    When ``self.config.index_type`` is truthy, an ``INDEX vec_idx`` clause of
    that type is added on the embedding column; otherwise no index clause is
    emitted, so queries against the embedding column fall back to a
    brute-force / linear scan.

    :param dim: dimension of the embeddings, enforced through the
        ``cons_vec_len`` length constraint on the embedding column.
    :param index_params: already-rendered, comma-separated arguments for the
        index type. ``None`` and ``""`` both render as an empty argument
        list. Ignored when no index type is configured.
    :return: the DDL statement as a string.
    """
    config = self.config
    columns = config.column_map
    embedding = columns["embedding"]

    # Column and constraint definitions shared by the indexed and the
    # non-indexed table; keeping a single list stops the two variants
    # from drifting apart.
    definitions = [
        f"{columns['id']} Nullable(String)",
        f"{columns['document']} Nullable(String)",
        f"{embedding} Array(Float32)",
        f"{columns['metadata']} JSON",
        f"{columns['uuid']} UUID DEFAULT generateUUIDv4()",
        f"CONSTRAINT cons_vec_len CHECK length({embedding}) = {dim}",
    ]

    if config.index_type:
        # `index_params` is Optional: interpolating None verbatim would
        # produce `TYPE <index>(None)`, which is not valid DDL.
        definitions.append(
            f"INDEX vec_idx {embedding} TYPE "
            f"{config.index_type}({index_params or ''}) GRANULARITY 1000"
        )

    body = ",\n    ".join(definitions)
    # NOTE(review): the sort key is the literal `uuid`, not
    # columns['uuid'] -- kept as in the original; confirm whether a
    # remapped uuid column is meant to be supported.
    # The SETTINGS clause is emitted for both variants so the table layout
    # does not depend on whether an index was requested.
    return (
        f"CREATE TABLE IF NOT EXISTS {config.database}.{config.table}(\n"
        f"    {body}\n"
        f") ENGINE = MergeTree ORDER BY uuid SETTINGS index_granularity = 8192"
    )