Added ClickHouse vector store support

This commit is contained in:
Valery Denisov 2024-04-24 14:47:15 +02:00
parent c1802e7cf0
commit 117a548117
No known key found for this signature in database
5 changed files with 181 additions and 12 deletions

View File

@ -1,7 +1,7 @@
## Vectorstores
PrivateGPT supports [Qdrant](https://qdrant.tech/), [Chroma](https://www.trychroma.com/) and [PGVector](https://github.com/pgvector/pgvector) as vectorstore providers. Qdrant being the default.
PrivateGPT supports [Qdrant](https://qdrant.tech/), [Chroma](https://www.trychroma.com/), [PGVector](https://github.com/pgvector/pgvector) and [ClickHouse](https://github.com/ClickHouse/ClickHouse) as vectorstore providers. Qdrant being the default.
In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`, `chroma` or `postgres`.
In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`, `chroma`, `postgres` or `clickhouse`.
```yaml
vectorstore:
@ -101,3 +101,69 @@ Indexes:
postgres=#
```
The dimensions of the embeddings columns will be set based on the `embedding.embed_dim` value. If the embedding model changes this table may need to be dropped and recreated to avoid a dimension mismatch.
### ClickHouse
To use ClickHouse as the vector store, you need access to a running [ClickHouse](https://github.com/ClickHouse/ClickHouse) server.
To enable ClickHouse, set the `vectorstore.database` property in the `settings.yaml` file to `clickhouse` and install the `vector-stores-clickhouse` extra.
```bash
poetry install --extras vector-stores-clickhouse
```
ClickHouse settings can be configured by setting values to the `clickhouse` property in the `settings.yaml` file.
The available configuration options are:
| Field | Description |
|----------------------|----------------------------------------------------------------|
| **host** | The server hosting the ClickHouse database. Default is `localhost` |
| **port** | The port on which the ClickHouse database is accessible. Default is `8123` |
| **username** | The username for database access. Default is `default` |
| **password** | The password for database access. (Optional) |
| **database** | The specific database to connect to. Default is `__default__` |
| **secure** | Use https/TLS for secure connection to the server. Default is `false` |
| **interface** | The protocol used for the connection, either 'http' or 'https'. (Optional) |
| **settings** | Specific ClickHouse server settings to be used with the session. (Optional) |
| **connect_timeout** | Timeout in seconds for establishing a connection. (Optional) |
| **send_receive_timeout** | Read timeout in seconds for http connection. (Optional) |
| **verify** | Verify the server certificate in secure/https mode. (Optional) |
| **ca_cert** | Path to Certificate Authority root certificate (.pem format). (Optional) |
| **client_cert** | Path to TLS Client certificate (.pem format). (Optional) |
| **client_cert_key** | Path to the private key for the TLS Client certificate. (Optional) |
| **http_proxy** | HTTP proxy address. (Optional) |
| **https_proxy** | HTTPS proxy address. (Optional) |
| **server_host_name** | Server host name to be checked against the TLS certificate. (Optional) |
For example:
```yaml
vectorstore:
database: clickhouse
clickhouse:
host: localhost
port: 8443
username: admin
password: <PASSWORD>
database: embeddings
secure: false
```
The following table will be created in the database:
```
clickhouse-client
:) \d embeddings.llama_index
Table "llama_index"
№ | name | type | default_type | default_expression | comment | codec_expression | ttl_expression
----|-----------|----------------------------------------------|--------------|--------------------|---------|------------------|---------------
1 | id | String | | | | |
2 | doc_id | String | | | | |
3 | text | String | | | | |
4 | vector | Array(Float32) | | | | |
5 | node_info | Tuple(start Nullable(UInt64), end Nullable(UInt64)) | | | | |
6 | metadata | String | | | | |
clickhouse-client
```
The dimensions of the embeddings columns will be set based on the `embedding.embed_dim` value. If the embedding model changes, this table may need to be dropped and recreated to avoid a dimension mismatch.

View File

@ -121,6 +121,25 @@ class VectorStoreComponent:
collection_name="make_this_parameterizable_per_api_call",
), # TODO
)
case "clickhouse":
try:
from llama_index.vector_stores.clickhouse import ClickHouseVectorStore
from clickhouse_connect import get_client
except ImportError as e:
raise ImportError(
"ClickHouse dependencies not found, install with `poetry install --extras vector-stores-clickhouse`"
) from e
if settings.clickhouse is None:
raise ValueError("ClickHouse settings not found. Please provide settings.")
clickhouse_client = get_client(
host=settings.clickhouse.host,
port=settings.clickhouse.port,
username=settings.clickhouse.username,
password=settings.clickhouse.password,
)
self.vector_store = ClickHouseVectorStore(clickhouse_client=clickhouse_client)
case _:
# Should be unreachable
# The settings validator should have caught this

View File

@ -1,4 +1,4 @@
from typing import Literal
from typing import Literal, Optional, Dict, Any, Union
from pydantic import BaseModel, Field
@ -107,7 +107,7 @@ class LLMSettings(BaseModel):
class VectorstoreSettings(BaseModel):
database: Literal["chroma", "qdrant", "postgres"]
database: Literal["chroma", "qdrant", "postgres", "clickhouse"]
class NodeStoreSettings(BaseModel):
@ -323,6 +323,77 @@ class RagSettings(BaseModel):
rerank: RerankSettings
class ClickHouseSettings(BaseModel):
    """Connection settings for the ClickHouse vector store.

    Exposed as the optional ``clickhouse`` entry of the application settings.
    Every field has a default, so an empty ``clickhouse:`` section is valid.
    Fields defaulting to ``None`` are optional and unset by default.

    NOTE(review): the field names presumably mirror the keyword arguments of
    ``clickhouse_connect.get_client`` -- confirm against the driver docs.
    """

    host: str = Field(
        "localhost",
        description="The server hosting the ClickHouse database",
    )
    port: int = Field(
        8123,
        description="The port on which the ClickHouse database is accessible",
    )
    username: str = Field(
        "default",
        description="The username to use to connect to the ClickHouse database",
    )
    password: str = Field(
        "",
        description="The password to use to connect to the ClickHouse database",
    )
    database: str = Field(
        "__default__",
        description="The default database to use for connections",
    )
    # Accepts a real boolean as well as a string value.
    secure: bool | str = Field(
        False,
        description="Use https/TLS for secure connection to the server",
    )
    # Constrained to the two documented values so that a typo is rejected when
    # the settings are loaded instead of surfacing later at connection time.
    interface: Literal["http", "https"] | None = Field(
        None,
        description="Must be either 'http' or 'https'. Determines the protocol to use for the connection",
    )
    settings: dict[str, Any] | None = Field(
        None,
        description="Specific ClickHouse server settings to be used with the session",
    )
    connect_timeout: int | None = Field(
        None,
        description="Timeout in seconds for establishing a connection",
    )
    send_receive_timeout: int | None = Field(
        None,
        description="Read timeout in seconds for http connection",
    )
    verify: bool | None = Field(
        None,
        description="Verify the server certificate in secure/https mode",
    )
    ca_cert: str | None = Field(
        None,
        description="Path to Certificate Authority root certificate (.pem format)",
    )
    client_cert: str | None = Field(
        None,
        description="Path to TLS Client certificate (.pem format)",
    )
    client_cert_key: str | None = Field(
        None,
        description="Path to the private key for the TLS Client certificate",
    )
    http_proxy: str | None = Field(
        None,
        description="HTTP proxy address",
    )
    https_proxy: str | None = Field(
        None,
        description="HTTPS proxy address",
    )
    server_host_name: str | None = Field(
        None,
        description="Server host name to be checked against the TLS certificate",
    )
class PostgresSettings(BaseModel):
host: str = Field(
"localhost",
@ -421,6 +492,7 @@ class Settings(BaseModel):
rag: RagSettings
qdrant: QdrantSettings | None = None
postgres: PostgresSettings | None = None
clickhouse: ClickHouseSettings | None = None
"""

View File

@ -31,12 +31,16 @@ llama-index-embeddings-azure-openai = {version ="^0.1.6", optional = true}
llama-index-vector-stores-qdrant = {version ="^0.1.3", optional = true}
llama-index-vector-stores-chroma = {version ="^0.1.4", optional = true}
llama-index-vector-stores-postgres = {version ="^0.1.2", optional = true}
llama-index-vector-stores-clickhouse = {version ="^0.1.3", optional = true}
llama-index-storage-docstore-postgres = {version ="^0.1.2", optional = true}
llama-index-storage-index-store-postgres = {version ="^0.1.2", optional = true}
# Postgres
psycopg2-binary = {version ="^2.9.9", optional = true}
asyncpg = {version="^0.29.0", optional = true}
# ClickHouse
clickhouse-connect = {version = "^0.7.8", optional = true}
# Optional Sagemaker dependency
boto3 = {version ="^1.34.51", optional = true}
@ -61,6 +65,7 @@ embeddings-openai = ["llama-index-embeddings-openai"]
embeddings-sagemaker = ["boto3"]
embeddings-azopenai = ["llama-index-embeddings-azure-openai"]
vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
vector-stores-clickhouse = ["llama-index-vector-stores-clickhouse", "clickhouse_connect"]
vector-stores-chroma = ["llama-index-vector-stores-chroma"]
vector-stores-postgres = ["llama-index-vector-stores-postgres"]
storage-nodestore-postgres = ["llama-index-storage-docstore-postgres","llama-index-storage-index-store-postgres","psycopg2-binary","asyncpg"]

View File

@ -52,6 +52,13 @@ rag:
model: cross-encoder/ms-marco-MiniLM-L-2-v2
top_n: 1
clickhouse:
host: localhost
port: 8443
username: admin
password: clickhouse
database: embeddings
llamacpp:
prompt_style: "mistral"
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF