#!/usr/bin/env bash
# Copyright (c) 2023 Microsoft Corporation
#
# SPDX-License-Identifier: Apache-2.0

tests_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${tests_dir}/common.bash"
kubernetes_dir="${tests_dir}/integration/kubernetes"
helm_chart_dir="${repo_root_dir}/tools/packaging/kata-deploy/helm-chart/kata-deploy"

AZ_APPID="${AZ_APPID:-}"
AZ_PASSWORD="${AZ_PASSWORD:-}"
AZ_SUBSCRIPTION_ID="${AZ_SUBSCRIPTION_ID:-}"
AZ_TENANT_ID="${AZ_TENANT_ID:-}"
GENPOLICY_PULL_METHOD="${GENPOLICY_PULL_METHOD:-oci-distribution}"
GH_PR_NUMBER="${GH_PR_NUMBER:-}"
HELM_DEFAULT_INSTALLATION="${HELM_DEFAULT_INSTALLATION:-false}"
HELM_AGENT_HTTPS_PROXY="${HELM_AGENT_HTTPS_PROXY:-}"
HELM_AGENT_NO_PROXY="${HELM_AGENT_NO_PROXY:-}"
HELM_ALLOWED_HYPERVISOR_ANNOTATIONS="${HELM_ALLOWED_HYPERVISOR_ANNOTATIONS:-}"
HELM_CREATE_RUNTIME_CLASSES="${HELM_CREATE_RUNTIME_CLASSES:-}"
HELM_CREATE_DEFAULT_RUNTIME_CLASS="${HELM_CREATE_DEFAULT_RUNTIME_CLASS:-}"
HELM_DEBUG="${HELM_DEBUG:-}"
HELM_DEFAULT_SHIM="${HELM_DEFAULT_SHIM:-}"
HELM_HOST_OS="${HELM_HOST_OS:-}"
HELM_IMAGE_REFERENCE="${HELM_IMAGE_REFERENCE:-}"
HELM_IMAGE_TAG="${HELM_IMAGE_TAG:-}"
HELM_K8S_DISTRIBUTION="${HELM_K8S_DISTRIBUTION:-}"
HELM_PULL_TYPE_MAPPING="${HELM_PULL_TYPE_MAPPING:-}"
HELM_SHIMS="${HELM_SHIMS:-}"
HELM_SNAPSHOTTER_HANDLER_MAPPING="${HELM_SNAPSHOTTER_HANDLER_MAPPING:-}"
KATA_DEPLOY_WAIT_TIMEOUT="${KATA_DEPLOY_WAIT_TIMEOUT:-600}"
KATA_HOST_OS="${KATA_HOST_OS:-}"
KUBERNETES="${KUBERNETES:-}"
K8S_TEST_HOST_TYPE="${K8S_TEST_HOST_TYPE:-small}"
TEST_CLUSTER_NAMESPACE="${TEST_CLUSTER_NAMESPACE:-}"

function _print_instance_type() {
    case "${K8S_TEST_HOST_TYPE}" in
        small)
            echo "Standard_D2s_v5"
            ;;
        normal)
            echo "Standard_D4s_v5"
            ;;
        *)
            echo "Unknown instance type '${K8S_TEST_HOST_TYPE}'" >&2
            exit 1
            ;;
    esac
}

# Print the cluster name set by $AKS_NAME or generated out of runtime
# metadata (e.g. pull request number, commit SHA, etc).
#
function _print_cluster_name() {
    local test_type="${1:-k8s}"
    local short_sha
    local metadata

    if [[ -n "${AKS_NAME:-}" ]]; then
        echo "${AKS_NAME}"
    else
        short_sha="$(git rev-parse --short=12 HEAD)"
        metadata="${GH_PR_NUMBER}-${short_sha}-${KATA_HYPERVISOR}-${KATA_HOST_OS}-amd64-${K8S_TEST_HOST_TYPE:0:1}-${GENPOLICY_PULL_METHOD:0:1}"
        # Compute the SHA1 digest of the metadata part to keep the name
        # under the 63-character limit imposed by AKS
        echo "${test_type}-$(sha1sum <<< "${metadata}" | cut -d' ' -f1)"
    fi
}

function _print_rg_name() {
    test_type="${1:-k8s}"

    echo "${AZ_RG:-"kataCI-$(_print_cluster_name "${test_type}")"}"
}
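# Naming sketch (hypothetical values, for illustration only): with GH_PR_NUMBER=1234,
# KATA_HYPERVISOR=qemu, KATA_HOST_OS=ubuntu, K8S_TEST_HOST_TYPE=small and
# GENPOLICY_PULL_METHOD=oci-distribution, the metadata string above becomes
# "1234-<short_sha>-qemu-ubuntu-amd64-s-o", so:
#   _print_cluster_name k8s  ->  k8s-<sha1 of the metadata>
#   _print_rg_name k8s       ->  kataCI-k8s-<sha1 of the metadata>  (unless AZ_RG is set)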
# Enable the HTTP application routing add-on to AKS.
# Use with ingress to expose a service API externally.
#
function enable_cluster_http_application_routing() {
    local test_type="${1:-k8s}"
    local cluster_name
    local rg

    rg="$(_print_rg_name "${test_type}")"
    cluster_name="$(_print_cluster_name "${test_type}")"

    az aks enable-addons -g "${rg}" -n "${cluster_name}" \
        --addons http_application_routing
}

function install_azure_cli() {
    curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
    # TODO: Remove the version spec when this bug is fixed:
    # https://github.com/Azure/azure-cli/issues/31345
    az extension add --name aks-preview --version 14.0.0b3
}

function login_azure() {
    az login \
        --service-principal \
        -u "${AZ_APPID}" \
        -p "${AZ_PASSWORD}" \
        --tenant "${AZ_TENANT_ID}"

    # Switch to the Kata Containers subscription
    az account set --subscription "${AZ_SUBSCRIPTION_ID}"
}

function create_cluster() {
    test_type="${1:-k8s}"
    local short_sha
    local tags
    local rg

    # First ensure it didn't fail to get cleaned up from a previous run.
    delete_cluster "${test_type}" || true

    rg="$(_print_rg_name "${test_type}")"
    short_sha="$(git rev-parse --short=12 HEAD)"
    tags=("GH_PR_NUMBER=${GH_PR_NUMBER:-}" \
        "SHORT_SHA=${short_sha}" \
        "KATA_HYPERVISOR=${KATA_HYPERVISOR}" \
        "KATA_HOST_OS=${KATA_HOST_OS:-}" \
        "K8S_TEST_HOST_TYPE=${K8S_TEST_HOST_TYPE:0:1}" \
        "GENPOLICY_PULL_METHOD=${GENPOLICY_PULL_METHOD:0:1}")

    az group create \
        -l eastus \
        -n "${rg}"

    # Adding a double quote on the last line ends up causing issues
    # in the cbl-mariner installation. Because of that, let's just
    # disable the warning for this specific case.
    # shellcheck disable=SC2046
    az aks create \
        -g "${rg}" \
        --node-resource-group "node-${rg}" \
        -n "$(_print_cluster_name "${test_type}")" \
        -s "$(_print_instance_type)" \
        --node-count 1 \
        --generate-ssh-keys \
        --tags "${tags[@]}"
}

function install_bats() {
    # Install bats from the Ubuntu noble repository.
    sudo apt install -y software-properties-common
    sudo add-apt-repository 'deb http://archive.ubuntu.com/ubuntu/ noble universe'
    sudo apt install -y bats
    sudo add-apt-repository --remove 'deb http://archive.ubuntu.com/ubuntu/ noble universe'
}

function install_kubectl() {
    sudo az aks install-cli
}

# Install the kustomize tool in /usr/local/bin if it doesn't exist on
# the system yet.
#
function install_kustomize() {
    local arch
    local checksum
    local version

    if command -v kustomize >/dev/null; then
        return
    fi

    ensure_yq
    version=$(get_from_kata_deps ".externals.kustomize.version")
    arch=$(arch_to_golang)
    checksum=$(get_from_kata_deps ".externals.kustomize.checksum.${arch}")

    local tarball="kustomize_${version}_linux_${arch}.tar.gz"
    curl -Lf -o "${tarball}" "https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize/${version}/${tarball}"

    local rc=0
    echo "${checksum} ${tarball}" | sha256sum -c || rc=$?
    [[ ${rc} -eq 0 ]] && sudo tar -xvzf "${tarball}" -C /usr/local/bin || rc=$?
    rm -f "${tarball}"
    [[ ${rc} -eq 0 ]]
}

function get_cluster_credentials() {
    test_type="${1:-k8s}"

    az aks get-credentials \
        --overwrite-existing \
        -g "$(_print_rg_name "${test_type}")" \
        -n "$(_print_cluster_name "${test_type}")"
}
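# Typical AKS lifecycle when driven from CI (illustrative sketch only; this
# file just provides the helpers and does not call them in this order itself):
#   install_azure_cli
#   login_azure
#   create_cluster "k8s"
#   get_cluster_credentials "k8s"
#   ... run the tests ...
#   delete_cluster "k8s"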
# Get the AKS DNS zone name of HTTP application routing.
#
# Note: if the HTTP application routing add-on isn't installed in the cluster
# then it will return an empty string.
#
function get_cluster_specific_dns_zone() {
    local test_type="${1:-k8s}"
    local cluster_name
    local rg
    local q="addonProfiles.httpApplicationRouting.config.HTTPApplicationRoutingZoneName"

    rg="$(_print_rg_name "${test_type}")"
    cluster_name="$(_print_cluster_name "${test_type}")"

    az aks show -g "${rg}" -n "${cluster_name}" --query "${q}" | tr -d \"
}

function delete_cluster() {
    test_type="${1:-k8s}"
    local rg
    rg="$(_print_rg_name "${test_type}")"

    if [[ "$(az group exists -g "${rg}")" == "true" ]]; then
        az group delete -g "${rg}" --yes
    fi
}

function delete_cluster_kcli() {
    CLUSTER_NAME="${CLUSTER_NAME:-kata-k8s}"
    kcli delete -y kube "${CLUSTER_NAME}"
}

function get_nodes_and_pods_info() {
    kubectl debug "$(kubectl get nodes -o name)" -it --image=quay.io/kata-containers/kata-debug:latest || true
    kubectl get pods -o name | grep node-debugger | xargs kubectl delete || true
}

function deploy_k0s() {
    url=$(get_from_kata_deps ".externals.k0s.url")

    k0s_version_param=""
    version=$(get_from_kata_deps ".externals.k0s.version")
    if [[ -n "${version}" ]]; then
        k0s_version_param="K0S_VERSION=${version}"
    fi

    # The version parameter is intentionally left unquoted: when it's empty it
    # must expand to nothing instead of an empty-string "command" for sudo.
    # shellcheck disable=SC2086
    curl -sSLf "${url}" | sudo ${k0s_version_param} sh

    # In this case we explicitly want word splitting when calling k0s
    # with extra parameters.
    # shellcheck disable=SC2086
    sudo k0s install controller --single ${KUBERNETES_EXTRA_PARAMS:-}

    # kube-router decided to use :8080 for its metrics, and this seems
    # to be a change that affected k0s 1.30.0+, leading to the kube-router
    # pod crashing all the time and nothing actually being able to start
    # after that.
    #
    # Due to this issue, let's simply use a different port (:9999) and
    # move on with our tests.
    sudo mkdir -p /etc/k0s
    k0s config create | sudo tee /etc/k0s/k0s.yaml
    sudo sed -i -e "s/metricsPort: 8080/metricsPort: 9999/g" /etc/k0s/k0s.yaml

    sudo k0s start

    # This is an arbitrary value that came up from local tests
    sleep 120s

    # Download the kubectl binary into /usr/bin so we can avoid depending
    # on the `k0s kubectl` command
    ARCH=$(arch_to_golang)

    kubectl_version=$(sudo k0s kubectl version 2>/dev/null | grep "Client Version" | sed -e 's/Client Version: //')
    sudo curl -fL --progress-bar -o /usr/bin/kubectl https://dl.k8s.io/release/"${kubectl_version}"/bin/linux/"${ARCH}"/kubectl
    sudo chmod +x /usr/bin/kubectl

    mkdir -p ~/.kube
    sudo cp /var/lib/k0s/pki/admin.conf ~/.kube/config
    sudo chown "${USER}":"${USER}" ~/.kube/config
}
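# Note on the kubectl download above (illustrative example, versions are not
# pinned here): `k0s kubectl version` prints a line such as
#   Client Version: v1.30.1
# so ${kubectl_version} resolves to "v1.30.1" and the binary is fetched from
#   https://dl.k8s.io/release/v1.30.1/bin/linux/amd64/kubectl
# deploy_k3s below follows the same pattern, additionally stripping the
# "+k3sN" suffix that k3s appends to the reported version.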
function deploy_k3s() {
    curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644

    # This is an arbitrary value that came up from local tests
    sleep 120s

    # Download the kubectl binary into /usr/bin and remove /usr/local/bin/kubectl
    #
    # We need to do this to avoid hitting issues like:
    # ```sh
    # error: open /etc/rancher/k3s/k3s.yaml.lock: permission denied
    # ```
    # Which happens basically because k3s links `/usr/local/bin/kubectl`
    # to `/usr/local/bin/k3s`, and that does extra stuff that vanilla
    # `kubectl` doesn't do.
    ARCH=$(arch_to_golang)

    kubectl_version=$(/usr/local/bin/k3s kubectl version --client=true 2>/dev/null | grep "Client Version" | sed -e 's/Client Version: //' -e 's/+k3s[0-9]\+//')
    sudo curl -fL --progress-bar -o /usr/bin/kubectl https://dl.k8s.io/release/"${kubectl_version}"/bin/linux/"${ARCH}"/kubectl
    sudo chmod +x /usr/bin/kubectl
    sudo rm -rf /usr/local/bin/kubectl

    mkdir -p ~/.kube
    cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
}

function create_cluster_kcli() {
    CLUSTER_NAME="${CLUSTER_NAME:-kata-k8s}"

    delete_cluster_kcli || true

    kcli create kube "${KUBE_TYPE:-generic}" \
        -P domain="kata.com" \
        -P pool="${LIBVIRT_POOL:-default}" \
        -P ctlplanes="${CLUSTER_CONTROL_NODES:-1}" \
        -P workers="${CLUSTER_WORKERS:-1}" \
        -P network="${LIBVIRT_NETWORK:-default}" \
        -P image="${CLUSTER_IMAGE:-ubuntu2204}" \
        -P sdn=flannel \
        -P nfs=false \
        -P disk_size="${CLUSTER_DISK_SIZE:-20}" \
        "${CLUSTER_NAME}"

    export KUBECONFIG="${HOME}/.kcli/clusters/${CLUSTER_NAME}/auth/kubeconfig"

    local cmd="kubectl get nodes | grep '.*worker.*\<Ready\>'"
    echo "Wait for at least one worker to be Ready"
    if ! waitForProcess "330" "30" "${cmd}"; then
        echo "ERROR: worker nodes not ready."
        kubectl get nodes
        return 1
    fi

    # Ensure that system pods are running or completed.
    cmd="[ \$(kubectl get pods -A --no-headers | grep -v 'Running\|Completed' | wc -l) -eq 0 ]"
    echo "Wait for system pods to be Running or Completed"
    if ! waitForProcess "90" "30" "${cmd}"; then
        echo "ERROR: not all pods are Running or Completed."
        kubectl get pods -A
        return 1
    fi
}

function deploy_rke2() {
    curl -sfL https://get.rke2.io | sudo sh -

    sudo systemctl enable --now rke2-server.service

    # This is an arbitrary value that came up from local tests
    sleep 120s

    # Link the kubectl binary into /usr/local/bin
    sudo ln -sf /var/lib/rancher/rke2/bin/kubectl /usr/local/bin/kubectl

    mkdir -p ~/.kube
    sudo cp /etc/rancher/rke2/rke2.yaml ~/.kube/config
    sudo chown "${USER}":"${USER}" ~/.kube/config
}

function deploy_microk8s() {
    sudo snap install microk8s --classic
    sudo usermod -a -G microk8s "${USER}"

    mkdir -p ~/.kube
    # As we want to call microk8s with sudo, we're safe to ignore SC2024 here
    # shellcheck disable=SC2024
    sudo microk8s kubectl config view --raw > ~/.kube/config
    sudo chown "${USER}":"${USER}" ~/.kube/config

    # These are arbitrary values
    sudo microk8s status --wait-ready --timeout 300

    # install kubectl
    ARCH=$(arch_to_golang)
    kubectl_version=$(sudo microk8s version | grep -oe 'v[0-9]\+\(\.[0-9]\+\)*')
    sudo curl -fL --progress-bar -o /usr/bin/kubectl https://dl.k8s.io/release/"${kubectl_version}"/bin/linux/"${ARCH}"/kubectl
    sudo chmod +x /usr/bin/kubectl
    sudo rm -rf /usr/local/bin/kubectl
}
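# For reference (illustrative, exact wording depends on the MicroK8s release):
# `microk8s version` prints something like "MicroK8s v1.30.1 revision 6789",
# so the grep in deploy_microk8s extracts "v1.30.1" as the kubectl version
# to download.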
function _get_k0s_kubernetes_version_for_crio() {
    # k0s version will look like:
    # v1.27.5+k0s.0
    #
    # The CRI-O repo for such a version of Kubernetes expects something like:
    # 1.27
    k0s_version=$(get_from_kata_deps ".externals.k0s.version")

    # Remove everything after the second '.'
    crio_version=${k0s_version%\.*+*}
    # Remove the 'v'
    crio_version=${crio_version#v}

    echo "${crio_version}"
}

function setup_crio() {
    # Get the CRI-O version to be installed depending on the version of the
    # "k8s distro" that we are using
    case "${KUBERNETES}" in
        k0s) crio_version=$(_get_k0s_kubernetes_version_for_crio) ;;
        *) >&2 echo "${KUBERNETES} flavour is not supported with CRI-O"; exit 2 ;;
    esac

    install_crio "${crio_version}"
}

function deploy_k8s() {
    echo "::group::Deploying ${KUBERNETES}"

    case "${KUBERNETES}" in
        k0s) deploy_k0s ;;
        k3s) deploy_k3s ;;
        rke2) deploy_rke2 ;;
        microk8s) deploy_microk8s ;;
        *) >&2 echo "${KUBERNETES} flavour is not supported"; exit 2 ;;
    esac

    echo "::endgroup::"
}

function set_test_cluster_namespace() {
    # Delete any spurious tests namespace that was left behind
    kubectl delete namespace "${TEST_CLUSTER_NAMESPACE}" &> /dev/null || true

    # Create a new namespace for the tests and switch to it
    kubectl apply -f "${kubernetes_dir}/runtimeclass_workloads/tests-namespace.yaml"
    kubectl config set-context --current --namespace="${TEST_CLUSTER_NAMESPACE}"
}

function set_default_cluster_namespace() {
    kubectl config set-context --current --namespace=default
}

function delete_test_cluster_namespace() {
    kubectl delete namespace "${TEST_CLUSTER_NAMESPACE}"
    set_default_cluster_namespace
}

function delete_test_runners() {
    echo "Delete test scripts"
    local scripts_names=( "run_kubernetes_tests.sh" "bats" )
    for script_name in "${scripts_names[@]}"; do
        pids=$(pgrep -f "${script_name}")
        if [[ -n "${pids}" ]]; then
            echo "${pids}" | xargs sudo kill -SIGTERM >/dev/null 2>&1 || true
        fi
    done
}
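# Typical per-run namespace handling (sketch; the namespace name below is
# illustrative, the real value comes from TEST_CLUSTER_NAMESPACE and the
# tests-namespace.yaml manifest):
#   export TEST_CLUSTER_NAMESPACE="kata-containers-k8s-tests"
#   set_test_cluster_namespace
#   ... run the bats suites ...
#   delete_test_runners            # stop any leftover test runner processes
#   delete_test_cluster_namespace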
function helm_helper() {
    local max_tries
    local interval
    local i
    local values_yaml

    ensure_yq
    ensure_helm

    values_yaml=$(mktemp -t values_yaml.XXXXXX)

    if [[ -z "${HELM_IMAGE_REFERENCE}" ]]; then
        die "HELM_IMAGE_REFERENCE environment variable cannot be empty."
    fi
    yq -i ".image.reference = \"${HELM_IMAGE_REFERENCE}\"" "${values_yaml}"

    if [[ -z "${HELM_IMAGE_TAG}" ]]; then
        die "HELM_IMAGE_TAG environment variable cannot be empty."
    fi
    yq -i ".image.tag = \"${HELM_IMAGE_TAG}\"" "${values_yaml}"

    [[ -n "${HELM_K8S_DISTRIBUTION}" ]] && yq -i ".k8sDistribution = \"${HELM_K8S_DISTRIBUTION}\"" "${values_yaml}"

    if [[ "${HELM_DEFAULT_INSTALLATION}" = "false" ]]; then
        [[ -n "${HELM_DEBUG}" ]] && yq -i ".env.debug = \"${HELM_DEBUG}\"" "${values_yaml}"
        [[ -n "${HELM_SHIMS}" ]] && yq -i ".env.shims = \"${HELM_SHIMS}\"" "${values_yaml}"
        [[ -n "${HELM_DEFAULT_SHIM}" ]] && yq -i ".env.defaultShim = \"${HELM_DEFAULT_SHIM}\"" "${values_yaml}"
        [[ -n "${HELM_CREATE_RUNTIME_CLASSES}" ]] && yq -i ".env.createRuntimeClasses = \"${HELM_CREATE_RUNTIME_CLASSES}\"" "${values_yaml}"
        [[ -n "${HELM_CREATE_DEFAULT_RUNTIME_CLASS}" ]] && yq -i ".env.createDefaultRuntimeClass = \"${HELM_CREATE_DEFAULT_RUNTIME_CLASS}\"" "${values_yaml}"
        [[ -n "${HELM_ALLOWED_HYPERVISOR_ANNOTATIONS}" ]] && yq -i ".env.allowedHypervisorAnnotations = \"${HELM_ALLOWED_HYPERVISOR_ANNOTATIONS}\"" "${values_yaml}"
        [[ -n "${HELM_SNAPSHOTTER_HANDLER_MAPPING}" ]] && yq -i ".env.snapshotterHandlerMapping = \"${HELM_SNAPSHOTTER_HANDLER_MAPPING}\"" "${values_yaml}"
        [[ -n "${HELM_AGENT_HTTPS_PROXY}" ]] && yq -i ".env.agentHttpsProxy = \"${HELM_AGENT_HTTPS_PROXY}\"" "${values_yaml}"
        [[ -n "${HELM_AGENT_NO_PROXY}" ]] && yq -i ".env.agentNoProxy = \"${HELM_AGENT_NO_PROXY}\"" "${values_yaml}"
        [[ -n "${HELM_PULL_TYPE_MAPPING}" ]] && yq -i ".env.pullTypeMapping = \"${HELM_PULL_TYPE_MAPPING}\"" "${values_yaml}"
        [[ -n "${HELM_HOST_OS}" ]] && yq -i ".env.hostOS = \"${HELM_HOST_OS}\"" "${values_yaml}"
    fi

    echo "::group::Final kata-deploy manifests used in the test"
    cat "${values_yaml}"
    echo ""
    helm template "${helm_chart_dir}" --values "${values_yaml}" --namespace kube-system
    [[ "$(yq .image.reference "${values_yaml}")" = "${HELM_IMAGE_REFERENCE}" ]] || die "Failed to set image reference"
    [[ "$(yq .image.tag "${values_yaml}")" = "${HELM_IMAGE_TAG}" ]] || die "Failed to set image tag"
    echo "::endgroup::"

    max_tries=3
    interval=10
    i=0

    # Retry loop for helm install to prevent transient failures due to an
    # instantly unreachable cluster
    set +e # Disable immediate exit on failure
    while true; do
        helm upgrade --install kata-deploy "${helm_chart_dir}" --values "${values_yaml}" --namespace kube-system --debug
        ret=${?}
        if [[ ${ret} -eq 0 ]]; then
            echo "Helm install succeeded!"
            break
        fi
        i=$((i+1))
        if [[ ${i} -lt ${max_tries} ]]; then
            echo "Retrying after ${interval} seconds (Attempt ${i} of $((max_tries - 1)))"
        else
            break
        fi
        sleep "${interval}"
    done
    set -e # Re-enable immediate exit on failure

    if [[ ${i} -eq ${max_tries} ]]; then
        die "Failed to deploy kata-deploy after ${max_tries} tries"
    fi

    # `helm install --wait` does not take effect on single replicas and maxUnavailable=1 DaemonSets
    # like kata-deploy on CI. So wait for pods being Running in the "traditional" way.
    local cmd
    cmd="kubectl -n kube-system get -l name=kata-deploy pod 2>/dev/null | grep '\<Running\>'"
    waitForProcess "${KATA_DEPLOY_WAIT_TIMEOUT}" 10 "${cmd}"

    # FIXME: This is needed as the kata-deploy pod will be set to "Ready"
    # when it starts running, which may cause issues like not having the
    # node properly labeled or the artefacts properly deployed when the
    # tests actually start running.
    sleep 60s

    echo "::group::kata-deploy logs"
    kubectl_retry -n kube-system logs --tail=100 -l name=kata-deploy
    echo "::endgroup::"

    echo "::group::Runtime classes"
    kubectl_retry get runtimeclass
    echo "::endgroup::"
}
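# Minimal helm_helper invocation sketch (values are illustrative):
#   HELM_IMAGE_REFERENCE="quay.io/kata-containers/kata-deploy" \
#   HELM_IMAGE_TAG="latest" \
#   HELM_K8S_DISTRIBUTION="k3s" \
#   helm_helper
# Only the image reference and tag are mandatory (the function dies without
# them); the env.* overrides (shims, debug, proxies, ...) are only written to
# the generated values file when HELM_DEFAULT_INSTALLATION is "false".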