# Procedures for upgrading Kubernetes clusters, rotating certificates, draining nodes, and performing routine maintenance with minimal service disruption.
# Kubernetes version skew policy:
# kube-apiserver: N
# kube-controller-manager: N or N-1
# kube-scheduler: N or N-1
# kubelet: N down to N-3 (only down to N-2 for kubelets older than v1.25); never newer than kube-apiserver
# kubectl: N-1, N, or N+1
# Upgrade order:
# 1. etcd (if major version change)
# 2. kube-apiserver
# 3. kube-controller-manager + kube-scheduler
# 4. kubelet on each node (one at a time or rolling)
# 5. kubectl
# Never skip minor versions: 1.27 → 1.28 → 1.29 (not 1.27 → 1.29)
# Step 1: Upgrade kubeadm on the first control plane node
# v1.29 packages are published on pkgs.k8s.io, where revisions look like
# 1.29.0-1.1 rather than the legacy 1.29.0-00, so the revision is globbed.
# Kubernetes packages are normally pinned with `apt-mark hold`; release the pin
# for the install and restore it afterwards so routine `apt-get upgrade` runs
# cannot move the cluster to an unplanned version.
apt-mark unhold kubeadm
apt-get update && apt-get install -y kubeadm='1.29.0-*'
apt-mark hold kubeadm
kubeadm version
# Step 2: Preview upgrade plan
kubeadm upgrade plan
# Shows: current version, available versions, component upgrade steps
# Step 3: Apply upgrade (first control plane node only)
kubeadm upgrade apply v1.29.0
# This upgrades: kube-apiserver, kube-controller-manager, kube-scheduler and,
# by default (see the --etcd-upgrade flag), the local etcd static pod
# Does NOT upgrade: kubelet, kubectl (separate steps)
# Step 4: Upgrade kubelet and kubectl on control plane
# Recommended: drain this node first (kubectl drain NODE --ignore-daemonsets)
# and uncordon it once the kubelet has restarted.
apt-mark unhold kubelet kubectl
apt-get install -y kubelet='1.29.0-*' kubectl='1.29.0-*'
apt-mark hold kubelet kubectl
systemctl daemon-reload && systemctl restart kubelet
# Step 5: Upgrade additional control plane nodes
# For each additional control plane node (upgrade its kubeadm package first,
# as in Step 1, then repeat Step 4 there afterwards):
kubeadm upgrade node # (not 'upgrade apply' — that's only for first CP)
# Step 6: Drain and upgrade each worker node (repeat per worker)
kubectl drain worker-1 --ignore-daemonsets --delete-emptydir-data
# SSH to worker-1:
# The revision is globbed because v1.29 packages on pkgs.k8s.io use revisions
# such as 1.29.0-1.1, not the legacy 1.29.0-00. Held packages must be released
# before apt will change them; the hold is restored afterwards.
apt-mark unhold kubeadm kubelet kubectl
apt-get update && apt-get install -y kubeadm='1.29.0-*' kubelet='1.29.0-*' kubectl='1.29.0-*'
apt-mark hold kubeadm kubelet kubectl
kubeadm upgrade node
systemctl daemon-reload && systemctl restart kubelet
# Back on control plane:
kubectl uncordon worker-1
# Step 7: Verify all nodes are on new version
kubectl get nodes
# EKS upgrade: AWS manages the control plane; worker nodes are handled separately
# Step 1: Request the control plane version change
aws eks update-cluster-version --name my-cluster --kubernetes-version 1.29
# Block until the cluster reports ACTIVE again:
aws eks wait cluster-active --name my-cluster
# Step 2: Move the managed node group to the same version
aws eks update-nodegroup-version --cluster-name my-cluster \
  --nodegroup-name workers --kubernetes-version 1.29
# Managed node groups: EKS drains and replaces the nodes automatically
# Step 3: Self-managed node groups (if any) are rotated by hand
# Drain node → terminate → new node launches with new AMI → uncordon
# Step 4: Bring the add-ons (kube-proxy, CoreDNS, vpc-cni) up to date
aws eks update-addon --cluster-name my-cluster --addon-name kube-proxy \
  --addon-version v1.29.0-eksbuild.1
aws eks update-addon \
  --cluster-name my-cluster \
  --addon-name coredns \
  --addon-version v1.11.1-eksbuild.4
aws eks update-addon \
  --cluster-name my-cluster \
  --addon-name vpc-cni \
  --addon-version v1.16.0-eksbuild.1
# Step 5: Confirm client/server versions and node status
kubectl version
kubectl get nodes
# Pre-upgrade checklist
# 1. Check release notes for breaking changes
# https://kubernetes.io/releases/ — read CHANGELOG for target version
# 2. Check API deprecations
kubectl deprecations --k8s-version v1.29.0
# OR use pluto:
pluto detect-all-in-cluster --target-versions k8s=v1.29.0
# 3. Validate add-on compatibility
# cert-manager, Istio, Prometheus Operator, Argo CD — check their K8s version support matrix
# 4. Take etcd backup
# Look up the first etcd static pod; abort here if none is found so the
# following commands are not run with an empty pod name.
ETCD_POD=$(kubectl get pod -n kube-system -l component=etcd \
  -o jsonpath='{.items[0].metadata.name}')
: "${ETCD_POD:?no pod with label component=etcd found in kube-system}"
kubectl exec -n kube-system "$ETCD_POD" -- etcdctl \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key \
  snapshot save /tmp/pre-upgrade-etcd.db
# kubectl cp needs a `tar` binary inside the container and fails without one.
# NOTE(review): confirm the etcd image in use ships tar; if it does not, save
# the snapshot to a path that is mounted from the host instead and copy it
# from the node.
kubectl cp "kube-system/$ETCD_POD:/tmp/pre-upgrade-etcd.db" ./etcd-pre-upgrade.db
# 5. Check all PodDisruptionBudgets
kubectl get pdb -A
# 6. Check cluster is healthy
kubectl get nodes
kubectl get pods -n kube-system
kubectl cluster-info
# Check certificate expiration
kubeadm certs check-expiration
# NAME EXPIRES RESIDUAL TIME
# admin.conf Jan 15, 2026 364d
# apiserver Jan 15, 2026 364d
# etcd-healthcheck-client Jan 15, 2026 364d
# Renew all certificates (kubeadm clusters — annual or on expiry)
kubeadm certs renew all
# Restart static pods to pick up new certificates
# Touching the manifests is NOT sufficient: the file content is unchanged, so
# the kubelet sees the same pod definition and leaves the containers running
# with the old certificates. Remove each manifest, wait longer than the
# kubelet's fileCheckFrequency (20s by default), then put it back.
# The control plane on this node is down while the manifests are absent.
for m in kube-apiserver kube-controller-manager kube-scheduler etcd; do
  mv "/etc/kubernetes/manifests/$m.yaml" /tmp/
done
sleep 30
for m in kube-apiserver kube-controller-manager kube-scheduler etcd; do
  mv "/tmp/$m.yaml" /etc/kubernetes/manifests/
done
# Update kubeconfig
mkdir -p ~/.kube
cp /etc/kubernetes/admin.conf ~/.kube/config
# OR refresh user credentials:
# Written to a scratch file first so a failed kubeadm run cannot leave an
# empty ~/.kube/config behind (a plain `>` truncates the target up front).
# NOTE(review): without --org the generated client certificate carries no
# group, so this user needs its own RBAC bindings — confirm before relying on it.
kubeadm kubeconfig user --client-name admin > ~/.kube/config.new \
  && mv ~/.kube/config.new ~/.kube/config
# cert-manager certificates (auto-renewed — see certificate management playbook)
kubectl get certificate -A
# Set CERT_NAME and CERT_NAMESPACE to the certificate to inspect. Literal
# <name>/<ns> placeholders would be parsed by the shell as redirections.
kubectl describe certificate "${CERT_NAME:?set CERT_NAME}" \
  -n "${CERT_NAMESPACE:?set CERT_NAMESPACE}"
# Scheduled node maintenance (kernel update, hardware replacement)
# Step 1: Cordon node (prevent new pods)
kubectl cordon worker-5
# Step 2: Drain node (evict all evictable pods)
kubectl drain worker-5 \
  --ignore-daemonsets \
  --delete-emptydir-data \
  --timeout=300s
# If PDB is blocking eviction:
kubectl get pdb -A
# Options:
# a) Wait for other pods to become ready
# b) Temporarily scale up the affected deployment
# c) Last resort: --disable-eviction=true (bypasses PDB — risky)
# Step 3: Perform maintenance
# SSH to node, apply kernel patches, etc.
# Pin the Kubernetes packages first so the blanket upgrade below cannot move
# kubelet/kubeadm/kubectl to a different version as a side effect.
apt-mark hold kubelet kubeadm kubectl
apt-get update && apt-get upgrade -y
reboot
# Step 4: Verify node is back
kubectl get node worker-5
# Should show Ready
# Step 5: Uncordon
kubectl uncordon worker-5
# Step 6: Verify pods rescheduled
# Set NAMESPACE to the namespace to inspect (a literal <ns> placeholder is a
# shell syntax error). The field selector matches the node name exactly,
# whereas grepping for "worker-5" would also match e.g. worker-50.
kubectl get pods -o wide -n "${NAMESPACE:?set NAMESPACE}" \
  --field-selector spec.nodeName=worker-5
# Replace all nodes in a node group (new AMI, instance type change)
# Option 1: Rolling replacement (zero downtime)
# The node list is captured once up front, so replacement nodes that join
# during the rollout are not themselves drained.
# The loop stops at the first failure: a node whose drain did not complete
# (e.g. blocked by a PodDisruptionBudget) must not be terminated, and no
# further node should be drained while replacement capacity is not Ready.
# A node left cordoned by an aborted run needs `kubectl uncordon`.
mapfile -t worker_nodes < <(kubectl get nodes -l nodegroup=workers -o name)
for node in "${worker_nodes[@]}"; do
  echo "Draining $node"
  if ! kubectl drain "$node" --ignore-daemonsets --delete-emptydir-data --timeout=300s; then
    echo "Drain of $node failed; stopping before any instance is terminated" >&2
    break
  fi
  # Terminate node (cloud will provision replacement)
  # providerID looks like aws:///<zone>/<instance-id>; field 5 is the id.
  INSTANCE_ID=$(kubectl get "$node" -o jsonpath='{.spec.providerID}' | cut -d'/' -f5)
  if [[ "$INSTANCE_ID" != i-* ]]; then
    echo "Could not determine EC2 instance id for $node (got '$INSTANCE_ID')" >&2
    break
  fi
  if ! aws ec2 terminate-instances --instance-ids "$INSTANCE_ID"; then
    echo "Failed to terminate $INSTANCE_ID for $node" >&2
    break
  fi
  # Wait for replacement
  sleep 120
  if ! kubectl wait --for=condition=Ready node -l nodegroup=workers \
    --timeout=300s; then
    echo "Nodes in group 'workers' not Ready after replacing $node; stopping" >&2
    break
  fi
done
# Option 2: let the Auto Scaling group perform the rolling replacement
aws autoscaling start-instance-refresh --auto-scaling-group-name workers-asg \
  --preferences '{"MinHealthyPercentage":80,"InstanceWarmup":300}'
# Check on the progress of the refresh
aws autoscaling describe-instance-refreshes --auto-scaling-group-name workers-asg
# Communicate maintenance windows via Slack/email:
# - 48h advance notice for planned maintenance
# - 15 min notice for emergency patching
# - Post in #platform-announcements channel
# Freeze period: stop Argo CD from syncing while maintenance is in progress
# Turn off automated sync for the application
kubectl patch application payments-api --namespace argocd --type merge \
  --patch '{"spec":{"syncPolicy":{"automated":null}}}'
# Turn automated sync (with prune and self-heal) back on once maintenance is over
kubectl patch application payments-api --namespace argocd --type merge \
  --patch '{"spec":{"syncPolicy":{"automated":{"prune":true,"selfHeal":true}}}}'
# Flag the node as under maintenance (custom label consumed by monitoring)
kubectl label node worker-5 maintenance=true
# Drop the label again when maintenance is finished:
kubectl label node worker-5 maintenance-
# Run after every upgrade
echo "=== Node versions ===" && kubectl get nodes
echo "=== Control plane ===" && kubectl get pods -n kube-system -l tier=control-plane
echo "=== Add-ons ===" && kubectl get pods -n kube-system
# Lists only pods whose STATUS is neither Running nor Completed (header kept).
echo "=== Workload health ===" && kubectl get pods -A | grep -Ev 'Running|Completed'
echo "=== Recent events ===" && kubectl get events -A --sort-by='.lastTimestamp' | grep -v Normal | tail -20
# Smoke test key workflows
# --restart=Never makes these one-shot pods: kubectl run otherwise creates the
# pod with restartPolicy Always, so the container would be restarted as soon
# as the command exits instead of the pod completing.
kubectl run smoke-test --image=nginx --restart=Never --rm -it -- nginx -v
kubectl run dns-test --image=nicolaka/netshoot --restart=Never --rm -it -- \
  nslookup kubernetes.default.svc.cluster.local
# Verify HPA is working
kubectl get hpa -A
# Verify cert-manager
kubectl get certificate -A | grep -v True