From 117a548117881c81af1fa63994a10afde26b7fbb Mon Sep 17 00:00:00 2001 From: Valery Denisov Date: Wed, 24 Apr 2024 14:47:15 +0200 Subject: [PATCH] Added ClickHouse vector store support --- fern/docs/pages/manual/vectordb.mdx | 70 +++++++++++++- .../vector_store/vector_store_component.py | 19 ++++ private_gpt/settings/settings.py | 92 +++++++++++++++++-- pyproject.toml | 5 + settings.yaml | 7 ++ 5 files changed, 181 insertions(+), 12 deletions(-) diff --git a/fern/docs/pages/manual/vectordb.mdx b/fern/docs/pages/manual/vectordb.mdx index c98efea8..b3cd3501 100644 --- a/fern/docs/pages/manual/vectordb.mdx +++ b/fern/docs/pages/manual/vectordb.mdx @@ -1,7 +1,7 @@ ## Vectorstores -PrivateGPT supports [Qdrant](https://qdrant.tech/), [Chroma](https://www.trychroma.com/) and [PGVector](https://github.com/pgvector/pgvector) as vectorstore providers. Qdrant being the default. +PrivateGPT supports [Qdrant](https://qdrant.tech/), [Chroma](https://www.trychroma.com/), [PGVector](https://github.com/pgvector/pgvector) and [ClickHouse](https://github.com/ClickHouse/ClickHouse) as vectorstore providers. Qdrant being the default. -In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`, `chroma` or `postgres`. +In order to select one or the other, set the `vectorstore.database` property in the `settings.yaml` file to `qdrant`, `chroma`, `postgres` or `clickhouse`. ```yaml vectorstore: @@ -101,3 +101,69 @@ Indexes: postgres=# ``` The dimensions of the embeddings columns will be set based on the `embedding.embed_dim` value. If the embedding model changes this table may need to be dropped and recreated to avoid a dimension mismatch. + +### ClickHouse + +To utilize ClickHouse as the vector store, a [ClickHouse](https://github.com/ClickHouse/ClickHouse) database must be employed. 
+ +To enable ClickHouse, set the `vectorstore.database` property in the `settings.yaml` file to `clickhouse` and install the `vector-stores-clickhouse` extra. + +```bash +poetry install --extras vector-stores-clickhouse +``` + +ClickHouse settings can be configured by setting values to the `clickhouse` property in the `settings.yaml` file. + +The available configuration options are: +| Field | Description | +|----------------------|----------------------------------------------------------------| +| **host** | The server hosting the ClickHouse database. Default is `localhost` | +| **port** | The port on which the ClickHouse database is accessible. Default is `8123` | +| **username** | The username for database access. Default is `default` | +| **password** | The password for database access. (Optional) | +| **database** | The specific database to connect to. Default is `__default__` | +| **secure** | Use https/TLS for secure connection to the server. Default is `false` | +| **interface** | The protocol used for the connection, either 'http' or 'https'. (Optional) | +| **settings** | Specific ClickHouse server settings to be used with the session. (Optional) | +| **connect_timeout** | Timeout in seconds for establishing a connection. (Optional) | +| **send_receive_timeout** | Read timeout in seconds for http connection. (Optional) | +| **verify** | Verify the server certificate in secure/https mode. (Optional) | +| **ca_cert** | Path to Certificate Authority root certificate (.pem format). (Optional) | +| **client_cert** | Path to TLS Client certificate (.pem format). (Optional) | +| **client_cert_key** | Path to the private key for the TLS Client certificate. (Optional) | +| **http_proxy** | HTTP proxy address. (Optional) | +| **https_proxy** | HTTPS proxy address. (Optional) | +| **server_host_name** | Server host name to be checked against the TLS certificate. 
(Optional) | + +For example: +```yaml +vectorstore: + database: clickhouse + +clickhouse: + host: localhost + port: 8443 + username: admin + password: + database: embeddings + secure: false +``` + +The following table will be created in the database: +``` +clickhouse-client +:) \d embeddings.llama_index + Table "llama_index" + № | name | type | default_type | default_expression | comment | codec_expression | ttl_expression +----|-----------|----------------------------------------------|--------------|--------------------|---------|------------------|--------------- + 1 | id | String | | | | | + 2 | doc_id | String | | | | | + 3 | text | String | | | | | + 4 | vector | Array(Float32) | | | | | + 5 | node_info | Tuple(start Nullable(UInt64), end Nullable(UInt64)) | | | | | + 6 | metadata | String | | | | | + +clickhouse-client +``` + +The dimensions of the embeddings columns will be set based on the `embedding.embed_dim` value. If the embedding model changes, this table may need to be dropped and recreated to avoid a dimension mismatch. diff --git a/private_gpt/components/vector_store/vector_store_component.py b/private_gpt/components/vector_store/vector_store_component.py index f9932b57..109b3e29 100644 --- a/private_gpt/components/vector_store/vector_store_component.py +++ b/private_gpt/components/vector_store/vector_store_component.py @@ -121,6 +121,25 @@ class VectorStoreComponent: collection_name="make_this_parameterizable_per_api_call", ), # TODO ) + case "clickhouse": + try: + from llama_index.vector_stores.clickhouse import ClickHouseVectorStore + from clickhouse_connect import get_client + except ImportError as e: + raise ImportError( + "ClickHouse dependencies not found, install with `poetry install --extras vector-stores-clickhouse`" + ) from e + + if settings.clickhouse is None: + raise ValueError("ClickHouse settings not found. 
Please provide settings.") + + clickhouse_client = get_client( + host=settings.clickhouse.host, + port=settings.clickhouse.port, + username=settings.clickhouse.username, + password=settings.clickhouse.password, + ) + self.vector_store = ClickHouseVectorStore(clickhouse_client=clickhouse_client) case _: # Should be unreachable # The settings validator should have caught this diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py index 051cfcab..6ec21dc0 100644 --- a/private_gpt/settings/settings.py +++ b/private_gpt/settings/settings.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import Literal, Optional, Dict, Any, Union from pydantic import BaseModel, Field @@ -15,7 +15,7 @@ class CorsSettings(BaseModel): enabled: bool = Field( description="Flag indicating if CORS headers are set or not." - "If set to True, the CORS headers will be set to allow all origins, methods and headers.", + "If set to True, the CORS headers will be set to allow all origins, methods and headers.", default=False, ) allow_credentials: bool = Field( @@ -54,8 +54,8 @@ class AuthSettings(BaseModel): ) secret: str = Field( description="The secret to be used for authentication. " - "It can be any non-blank string. For HTTP basic authentication, " - "this value should be the whole 'Authorization' header that is expected" + "It can be any non-blank string. For HTTP basic authentication, " + "this value should be the whole 'Authorization' header that is expected" ) @@ -76,7 +76,7 @@ class ServerSettings(BaseModel): class DataSettings(BaseModel): local_data_folder: str = Field( description="Path to local storage." - "It will be treated as an absolute path if it starts with /" + "It will be treated as an absolute path if it starts with /" ) @@ -95,10 +95,10 @@ class LLMSettings(BaseModel): tokenizer: str = Field( None, description="The model id of a predefined tokenizer hosted inside a model repo on " - "huggingface.co. 
Valid model ids can be located at the root-level, like " - "`bert-base-uncased`, or namespaced under a user or organization name, " - "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching " - "gpt-3.5-turbo LLM.", + "huggingface.co. Valid model ids can be located at the root-level, like " + "`bert-base-uncased`, or namespaced under a user or organization name, " + "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching " + "gpt-3.5-turbo LLM.", ) temperature: float = Field( 0.1, @@ -107,7 +107,7 @@ class LLMSettings(BaseModel): class VectorstoreSettings(BaseModel): - database: Literal["chroma", "qdrant", "postgres"] + database: Literal["chroma", "qdrant", "postgres", "clickhouse"] class NodeStoreSettings(BaseModel): @@ -323,6 +323,77 @@ class RagSettings(BaseModel): rerank: RerankSettings +class ClickHouseSettings(BaseModel): + host: str = Field( + "localhost", + description="The server hosting the ClickHouse database", + ) + port: int = Field( + 8123, + description="The port on which the ClickHouse database is accessible", + ) + username: str = Field( + "default", + description="The username to use to connect to the ClickHouse database", + ) + password: str = Field( + "", + description="The password to use to connect to the ClickHouse database", + ) + database: str = Field( + "__default__", + description="The default database to use for connections", + ) + secure: Union[bool, str] = Field( + False, + description="Use https/TLS for secure connection to the server", + ) + interface: Optional[str] = Field( + None, + description="Must be either 'http' or 'https'. 
Determines the protocol to use for the connection", + ) + settings: Optional[Dict[str, Any]] = Field( + None, + description="Specific ClickHouse server settings to be used with the session", + ) + connect_timeout: Optional[int] = Field( + None, + description="Timeout in seconds for establishing a connection", + ) + send_receive_timeout: Optional[int] = Field( + None, + description="Read timeout in seconds for http connection", + ) + verify: Optional[bool] = Field( + None, + description="Verify the server certificate in secure/https mode", + ) + ca_cert: Optional[str] = Field( + None, + description="Path to Certificate Authority root certificate (.pem format)", + ) + client_cert: Optional[str] = Field( + None, + description="Path to TLS Client certificate (.pem format)", + ) + client_cert_key: Optional[str] = Field( + None, + description="Path to the private key for the TLS Client certificate", + ) + http_proxy: Optional[str] = Field( + None, + description="HTTP proxy address", + ) + https_proxy: Optional[str] = Field( + None, + description="HTTPS proxy address", + ) + server_host_name: Optional[str] = Field( + None, + description="Server host name to be checked against the TLS certificate", + ) + + class PostgresSettings(BaseModel): host: str = Field( "localhost", @@ -421,6 +492,7 @@ class Settings(BaseModel): rag: RagSettings qdrant: QdrantSettings | None = None postgres: PostgresSettings | None = None + clickhouse: ClickHouseSettings | None = None """ diff --git a/pyproject.toml b/pyproject.toml index 3d6d1dde..c3ab1717 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,12 +31,16 @@ llama-index-embeddings-azure-openai = {version ="^0.1.6", optional = true} llama-index-vector-stores-qdrant = {version ="^0.1.3", optional = true} llama-index-vector-stores-chroma = {version ="^0.1.4", optional = true} llama-index-vector-stores-postgres = {version ="^0.1.2", optional = true} +llama-index-vector-stores-clickhouse = {version ="^0.1.3", optional = true} 
llama-index-storage-docstore-postgres = {version ="^0.1.2", optional = true} llama-index-storage-index-store-postgres = {version ="^0.1.2", optional = true} # Postgres psycopg2-binary = {version ="^2.9.9", optional = true} asyncpg = {version="^0.29.0", optional = true} +# ClickHouse +clickhouse-connect = {version = "^0.7.8", optional = true} + # Optional Sagemaker dependency boto3 = {version ="^1.34.51", optional = true} @@ -61,6 +65,7 @@ embeddings-openai = ["llama-index-embeddings-openai"] embeddings-sagemaker = ["boto3"] embeddings-azopenai = ["llama-index-embeddings-azure-openai"] vector-stores-qdrant = ["llama-index-vector-stores-qdrant"] +vector-stores-clickhouse = ["llama-index-vector-stores-clickhouse", "clickhouse_connect"] vector-stores-chroma = ["llama-index-vector-stores-chroma"] vector-stores-postgres = ["llama-index-vector-stores-postgres"] storage-nodestore-postgres = ["llama-index-storage-docstore-postgres","llama-index-storage-index-store-postgres","psycopg2-binary","asyncpg"] diff --git a/settings.yaml b/settings.yaml index e881a555..7936d0d0 100644 --- a/settings.yaml +++ b/settings.yaml @@ -52,6 +52,13 @@ rag: model: cross-encoder/ms-marco-MiniLM-L-2-v2 top_n: 1 +clickhouse: + host: localhost + port: 8443 + username: admin + password: clickhouse + database: embeddings + llamacpp: prompt_style: "mistral" llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF