mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-29 01:48:57 +00:00
community (patch): ClickHouse: make it possible to not specify an index (#20460)
Vector indexes in ClickHouse are experimental at the moment and can sometimes break or change behaviour, so this PR makes it possible to opt out of specifying an index type (e.g. `index_type=None`). When no index type is set, any query against the embedding column is a brute-force linear scan, which still gives reasonable performance for small-to-medium dataset sizes. --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
parent
c010ec8b71
commit
ce23f8293a
@ -72,7 +72,7 @@ class ClickhouseSettings(BaseSettings):
|
|||||||
username: Optional[str] = None
|
username: Optional[str] = None
|
||||||
password: Optional[str] = None
|
password: Optional[str] = None
|
||||||
|
|
||||||
index_type: str = "annoy"
|
index_type: Optional[str] = "annoy"
|
||||||
# Annoy supports L2Distance and cosineDistance.
|
# Annoy supports L2Distance and cosineDistance.
|
||||||
index_param: Optional[Union[List, Dict]] = ["'L2Distance'", 100]
|
index_param: Optional[Union[List, Dict]] = ["'L2Distance'", 100]
|
||||||
index_query_params: Dict[str, str] = {}
|
index_query_params: Dict[str, str] = {}
|
||||||
@ -172,23 +172,15 @@ class Clickhouse(VectorStore):
|
|||||||
else ""
|
else ""
|
||||||
)
|
)
|
||||||
if isinstance(self.config.index_param, Dict)
|
if isinstance(self.config.index_param, Dict)
|
||||||
else ",".join([str(p) for p in self.config.index_param])
|
else (
|
||||||
if isinstance(self.config.index_param, List)
|
",".join([str(p) for p in self.config.index_param])
|
||||||
else self.config.index_param
|
if isinstance(self.config.index_param, List)
|
||||||
|
else self.config.index_param
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.schema = f"""\
|
self.schema = self._schema(dim, index_params)
|
||||||
CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}(
|
|
||||||
{self.config.column_map['id']} Nullable(String),
|
|
||||||
{self.config.column_map['document']} Nullable(String),
|
|
||||||
{self.config.column_map['embedding']} Array(Float32),
|
|
||||||
{self.config.column_map['metadata']} JSON,
|
|
||||||
{self.config.column_map['uuid']} UUID DEFAULT generateUUIDv4(),
|
|
||||||
CONSTRAINT cons_vec_len CHECK length({self.config.column_map['embedding']}) = {dim},
|
|
||||||
INDEX vec_idx {self.config.column_map['embedding']} TYPE \
|
|
||||||
{self.config.index_type}({index_params}) GRANULARITY 1000
|
|
||||||
) ENGINE = MergeTree ORDER BY uuid SETTINGS index_granularity = 8192\
|
|
||||||
"""
|
|
||||||
self.dim = dim
|
self.dim = dim
|
||||||
self.BS = "\\"
|
self.BS = "\\"
|
||||||
self.must_escape = ("\\", "'")
|
self.must_escape = ("\\", "'")
|
||||||
@ -205,10 +197,53 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}(
|
|||||||
)
|
)
|
||||||
# Enable JSON type
|
# Enable JSON type
|
||||||
self.client.command("SET allow_experimental_object_type=1")
|
self.client.command("SET allow_experimental_object_type=1")
|
||||||
# Enable index
|
if self.config.index_type:
|
||||||
self.client.command(f"SET allow_experimental_{self.config.index_type}_index=1")
|
# Enable index
|
||||||
|
self.client.command(
|
||||||
|
f"SET allow_experimental_{self.config.index_type}_index=1"
|
||||||
|
)
|
||||||
self.client.command(self.schema)
|
self.client.command(self.schema)
|
||||||
|
|
||||||
|
def _schema(self, dim: int, index_params: Optional[str] = "") -> str:
    """Create table schema

    :param dim: dimension of embeddings
    :param index_params: parameters used for index, already rendered as the
        comma-separated text that goes between the parentheses of the index
        type. ``None`` is treated like an empty string.
    :return: a ``CREATE TABLE IF NOT EXISTS`` statement

    The statement depends on the value of `self.config.index_type`.
    If an index type is specified, an ``INDEX vec_idx`` clause on the
    embedding column is added together with
    ``SETTINGS index_granularity = 8192``; otherwise no index is created.
    In the case of there being no index, a linear scan will be performed
    when the embedding field is queried.

    The sorting key is taken from ``column_map['uuid']`` so that it always
    names the UUID column this statement actually creates. When that column
    is mapped to ``uuid`` the result is equivalent to the former hard-coded
    ``ORDER BY uuid``.
    """
    cfg = self.config
    columns = cfg.column_map
    embedding_col = columns["embedding"]
    uuid_col = columns["uuid"]

    # Column and constraint definitions shared by both table variants.
    definitions = [
        f"{columns['id']} Nullable(String)",
        f"{columns['document']} Nullable(String)",
        f"{embedding_col} Array(Float32)",
        f"{columns['metadata']} JSON",
        f"{uuid_col} UUID DEFAULT generateUUIDv4()",
        f"CONSTRAINT cons_vec_len CHECK length({embedding_col}) = {dim}",
    ]

    table_settings = ""
    if cfg.index_type:
        # `index_params or ''` keeps a None argument from being rendered as
        # the literal text "None" inside the index definition.
        definitions.append(
            f"INDEX vec_idx {embedding_col} TYPE "
            f"{cfg.index_type}({index_params or ''}) GRANULARITY 1000"
        )
        table_settings = " SETTINGS index_granularity = 8192"

    body = ",\n    ".join(definitions)
    return (
        f"CREATE TABLE IF NOT EXISTS {cfg.database}.{cfg.table}(\n"
        f"    {body}\n"
        f") ENGINE = MergeTree ORDER BY {uuid_col}{table_settings}"
    )
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def embeddings(self) -> Embeddings:
|
def embeddings(self) -> Embeddings:
|
||||||
"""Provides access to the embedding mechanism used by the Clickhouse instance.
|
"""Provides access to the embedding mechanism used by the Clickhouse instance.
|
||||||
|
Loading…
Reference in New Issue
Block a user