helm: Add a post-delete hook

Instead of using a lifecycle.preStop hook, as done when we're using
the helm chart, let's add a post-delete hook to take care of
properly cleaning up the node when uninstalling kata-deploy.

The reason why the lifecycle.preStop hook would never work in our case is
simply because each helm chart operation follows the Kubernetes
"declarative" approach, meaning that an operation won't wait for its
previous operation to successfully finish before being called, leading
to us trying to access content that's defined by our RBAC, in an
operation that was started before our RBAC was deleted, but having the
RBAC being deleted before the operation actually started.

Unfortunately this hook brings in some code duplication, mainly related
to the RBAC parts, but that's not new as the same happens with our
daemonset.

Signed-off-by: Fabiano Fidêncio <fabiano.fidencio@intel.com>
This commit is contained in:
Fabiano Fidêncio 2024-08-20 10:03:10 +02:00 committed by Fabiano Fidêncio
parent 3b23d62635
commit 64ccb1645d
3 changed files with 145 additions and 5 deletions

View File

@ -22,10 +22,6 @@ spec:
- name: kube-kata
image: {{ .Values.image.reference }}:{{ default .Chart.AppVersion .Values.image.tag }}
imagePullPolicy: {{ .Values.imagePullPolicy }}
lifecycle:
preStop:
exec:
command: ["bash", "-c", "/opt/kata-artifacts/scripts/kata-deploy.sh cleanup"]
command: ["bash", "-c", "/opt/kata-artifacts/scripts/kata-deploy.sh install"]
env:
- name: NODE_NAME

View File

@ -0,0 +1,116 @@
# ServiceAccount used by the post-delete cleanup Job defined below.
# It exists only for the lifetime of the hook: "before-hook-creation"
# deletes any leftover from a previous failed run, "hook-succeeded"
# removes it once the cleanup Job finishes successfully.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ .Chart.Name }}-sa-cleanup
  namespace: {{ .Release.Namespace }}
  annotations:
    # Runs as part of `helm uninstall`; weight "-3" creates it first,
    # before the ClusterRole (-2), ClusterRoleBinding (-1) and Job (0).
    "helm.sh/hook": post-delete
    "helm.sh/hook-weight": "-3"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
---
# ClusterRole granting the cleanup Job the permissions the
# kata-deploy.sh cleanup path needs.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ .Chart.Name }}-role-cleanup
  annotations:
    # Weight "-2": created after the ServiceAccount (-3), before the
    # ClusterRoleBinding (-1) and the Job (0).
    "helm.sh/hook": post-delete
    "helm.sh/hook-weight": "-2"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
rules:
  # get/patch on nodes: the cleanup script removes the
  # katacontainers.io/kata-runtime label from the node.
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "patch"]
  # Full control over runtimeclasses so the kata runtime classes can be
  # deleted during cleanup.
  - apiGroups: ["node.k8s.io"]
    resources: ["runtimeclasses"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
---
# Binds the cleanup ClusterRole to the cleanup ServiceAccount, so the
# hook Job's pod gets the node/runtimeclass permissions it needs.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ .Chart.Name }}-rb-cleanup
  annotations:
    # Weight "-1": created after the ServiceAccount (-3) and the
    # ClusterRole (-2), right before the Job (0).
    "helm.sh/hook": post-delete
    "helm.sh/hook-weight": "-1"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ .Chart.Name }}-role-cleanup
subjects:
  - kind: ServiceAccount
    name: {{ .Chart.Name }}-sa-cleanup
    namespace: {{ .Release.Namespace }}
---
# Post-delete hook Job: runs the kata-deploy cleanup script on the node
# during `helm uninstall`. Weight "0" orders it after the hook
# ServiceAccount/ClusterRole/ClusterRoleBinding (weights -3..-1), so its
# RBAC is guaranteed to exist when the pod starts.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ .Chart.Name }}-cleanup
  namespace: {{ .Release.Namespace }}
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-weight": "0"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
  template:
    metadata:
      labels:
        role: cleanup
    spec:
      serviceAccountName: {{ .Chart.Name }}-sa-cleanup
      # NOTE(review): presumably needed so the script can reach host
      # services (host_systemctl restarts the container runtime) — confirm.
      hostPID: true
      containers:
        - name: kube-kata-cleanup
          image: {{ .Values.image.reference }}:{{ default .Chart.AppVersion .Values.image.tag }}
          # Fix: honour the user-configured pull policy, as the daemonset
          # does, instead of hard-coding IfNotPresent.
          imagePullPolicy: {{ .Values.imagePullPolicy }}
          command: ["bash", "-c", "/opt/kata-artifacts/scripts/kata-deploy.sh cleanup"]
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: DEBUG
              value: {{ .Values.env.debug | quote }}
            - name: SHIMS
              value: {{ .Values.env.shims | quote }}
            - name: DEFAULT_SHIM
              value: {{ .Values.env.defaultShim | quote }}
            - name: CREATE_RUNTIMECLASSES
              value: {{ .Values.env.createRuntimeClasses | quote }}
            - name: CREATE_DEFAULT_RUNTIMECLASS
              value: {{ .Values.env.createDefaultRuntimeClass | quote }}
            - name: ALLOWED_HYPERVISOR_ANNOTATIONS
              value: {{ .Values.env.allowedHypervisorAnnotations | quote }}
            - name: SNAPSHOTTER_HANDLER_MAPPING
              value: {{ .Values.env.snapshotterHandlerMapping | quote }}
            - name: AGENT_HTTPS_PROXY
              value: {{ .Values.env.agentHttpsProxy | quote }}
            - name: AGENT_NO_PROXY
              value: {{ .Values.env.agentNoProxy | quote }}
            - name: PULL_TYPE_MAPPING
              value: {{ .Values.env.pullTypeMapping | quote }}
            # Tells kata-deploy.sh it runs as the helm post-delete hook:
            # the script then removes the node label up front, restarts the
            # runtime and exits 0 instead of sleeping forever.
            - name: HELM_POST_DELETE_HOOK
              value: "true"
            {{- with .Values.env.hostOS }}
            - name: HOST_OS
              value: {{ . | quote }}
            {{- end }}
          securityContext:
            # The cleanup script edits host runtime configuration under
            # /etc/{crio,containerd} and /host.
            privileged: true
          volumeMounts:
            - name: crio-conf
              mountPath: /etc/crio/
            - name: containerd-conf
              mountPath: /etc/containerd/
            - name: host
              mountPath: /host/
      volumes:
        - name: crio-conf
          hostPath:
            path: /etc/crio/
        - name: containerd-conf
          hostPath:
            path: '{{- template "containerdConfPath" .Values }}'
        - name: host
          hostPath:
            path: /
      # Hook Jobs must not restart: a failed cleanup surfaces as a failed
      # hook rather than retrying forever.
      restartPolicy: Never

View File

@ -48,6 +48,8 @@ fi
# doubled here as well, as: `/host//opt/kata`
host_install_dir="/host${dest_dir}"
HELM_POST_DELETE_HOOK="${HELM_POST_DELETE_HOOK:-"false"}"
# If we fail for any reason a message will be displayed
die() {
msg="$*"
@ -560,6 +562,16 @@ function cleanup_cri_runtime() {
;;
esac
[ "${HELM_POST_DELETE_HOOK}" == "false" ] && return
# Only run this code in the HELM_POST_DELETE_HOOK
if [ "$1" == "k0s-worker" ] || [ "$1" == "k0s-controller" ]; then
# do nothing, k0s will automatically unload the config on the fly
:
else
host_systemctl daemon-reload
host_systemctl restart "$1"
fi
}
function cleanup_crio() {
@ -659,6 +671,7 @@ function main() {
echo "* AGENT_NO_PROXY: ${AGENT_NO_PROXY}"
echo "* PULL_TYPE_MAPPING: ${PULL_TYPE_MAPPING}"
echo "* INSTALLATION_PREFIX: ${INSTALLATION_PREFIX}"
echo "* HELM_POST_DELETE_HOOK: ${HELM_POST_DELETE_HOOK}"
# script requires that user is root
euid=$(id -u)
@ -716,9 +729,24 @@ function main() {
containerd_conf_file="${containerd_conf_tmpl_file}"
fi
if [ "${HELM_POST_DELETE_HOOK}" == "true" ]; then
# Remove the label as the first thing, so we ensure no more kata-containers
# pods would be scheduled here.
kubectl label node "$NODE_NAME" katacontainers.io/kata-runtime-
fi
cleanup_cri_runtime "$runtime"
if [ "${HELM_POST_DELETE_HOOK}" == "false" ]; then
# The Confidential Containers operator relies on this label
kubectl label node "$NODE_NAME" --overwrite katacontainers.io/kata-runtime=cleanup
fi
remove_artifacts
if [ "${HELM_POST_DELETE_HOOK}" == "true" ]; then
# After everything was cleaned up, there's no reason to continue
# and sleep forever. Let's just return success..
exit 0
fi
;;
reset)
reset_runtime $runtime