Control Plane Issues
Overview
Diagnosis and resolution of control plane failures — API server unreachable, etcd errors, controller-manager and scheduler problems, and leader election failures.
Control Plane Health Check
# Quick health check
kubectl get componentstatuses # deprecated but still works on most clusters
# Or via individual component pods:
kubectl get pods -n kube-system -l tier=control-plane
# Check API server health endpoint (from within cluster)
kubectl run debug --image=curlimages/curl --rm -it -- \
curl -k https://kubernetes.default.svc/healthz
# From a node:
curl -k https://localhost:6443/healthz
curl -k https://localhost:6443/readyz
curl -k https://localhost:6443/livez
# Check all sub-healthchecks
curl -k "https://localhost:6443/readyz?verbose"
# [+]ping ok
# [+]log ok
# [+]etcd ok
# [+]informer-sync ok
# ...
# Managed K8s (EKS/GKE/AKS): check cloud console for control plane status
# EKS:
aws eks describe-cluster --name <cluster> --query 'cluster.status'
API Server Unreachable
# kubectl: "The connection to the server was refused"
# OR: "Unable to connect to the server: dial tcp: connection refused"
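# Check if API server process is running (on control plane node)
# kubectl needs a working API server, so when it is unreachable run these
# directly on the control plane node (e.g. over SSH):
crictl ps -a --name kube-apiserver
pgrep -a kube-apiserver
# If the API server is still partially reachable, a node debug pod also works:
kubectl debug node/control-plane-1 -it --image=ubuntu -- \
ps aux | grep kube-apiserver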
# Check API server pod
kubectl get pod -n kube-system kube-apiserver-control-plane-1
# If not running: static pod may have failed to start
# Check kubelet logs (API server is a static pod managed by kubelet)
journalctl -u kubelet --since "30 minutes ago" | grep -i apiserver
# Check static pod manifest
cat /etc/kubernetes/manifests/kube-apiserver.yaml
# Verify: no syntax errors, correct flags
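# Check API server logs (if pod is running but misbehaving)
kubectl logs -n kube-system kube-apiserver-control-plane-1 --tail=100
# If kubectl cannot reach the API server, read the container log on the node instead:
crictl logs --tail=100 "$(crictl ps -a --name kube-apiserver -q | head -n 1)"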
# Common error: "Error while dialing dial tcp: connection refused" → etcd unreachable
# Check certificates
kubeadm certs check-expiration
# Expired cert → API server won't start
# Fix: kubeadm certs renew all
# Check port is listening
ss -tlnp | grep 6443
# Check etcd health (API server can't start without etcd)
# See etcd section below
kube-controller-manager Issues
# Symptoms: Deployments not scaling, PVCs not being bound,
# garbage collection not happening
kubectl get pod -n kube-system -l component=kube-controller-manager
kubectl logs -n kube-system kube-controller-manager-<node> --tail=100
# Check leader election (only one replica should be active)
kubectl get lease kube-controller-manager -n kube-system \
-o jsonpath='{.spec.holderIdentity}'
# Common issues:
# 1. Leader election thrashing (high leaseTransitions)
kubectl get lease kube-controller-manager -n kube-system \
-o jsonpath='{.spec.leaseTransitions}'
# Cause: network partition, slow etcd, too-tight renewDeadline
# Fix: increase leaseDuration/renewDeadline, check etcd latency
# 2. Controller not reconciling specific resource
kubectl logs -n kube-system kube-controller-manager-<node> | \
grep -i "failed\|error" | grep deployment | head -20
# 3. Cloud controller manager issues (node joining, LB provisioning)
kubectl logs -n kube-system -l app=cloud-controller-manager --tail=100
# 4. Certificate controller — certificates not being rotated
kubectl logs -n kube-system kube-controller-manager-<node> | grep cert
kube-scheduler Issues
# Symptoms: pods stuck Pending indefinitely (not just "no resources")
kubectl get pod -n kube-system -l component=kube-scheduler
kubectl logs -n kube-system kube-scheduler-<node> --tail=100
# Check scheduler is elected leader
kubectl get lease kube-scheduler -n kube-system \
-o jsonpath='{.spec.holderIdentity}'
# Common issues:
# 1. Scheduler profile misconfiguration
kubectl get pod -n kube-system kube-scheduler-<node> -o yaml | \
grep -A5 -- "--config"
cat /etc/kubernetes/scheduler-config.yaml
# 2. Pod topology spread constraints being too strict
kubectl describe pod <pending-pod> -n <ns>
# "0/5 nodes are available: 5 node(s) didn't satisfy pod topology spread constraints"
# Fix: increase maxSkew (a larger value is more permissive), or change whenUnsatisfiable from DoNotSchedule to ScheduleAnyway
# 3. Custom scheduler plugin crash
kubectl logs -n kube-system kube-scheduler-<node> | grep panic
# 4. Scheduler not watching new pods (watch cache issue)
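# Force restart (static pod — move the manifest out of the directory, wait for kubelet to stop the pod, then move it back):
# mv /etc/kubernetes/manifests/kube-scheduler.yaml /tmp/
# sleep 20   # kubelet's fileCheckFrequency defaults to 20s
# mv /tmp/kube-scheduler.yaml /etc/kubernetes/manifests/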
etcd Issues
# Check etcd health
ETCD_POD=$(kubectl get pod -n kube-system -l component=etcd \
-o jsonpath='{.items[0].metadata.name}')
kubectl exec -n kube-system $ETCD_POD -- etcdctl \
--endpoints=https://127.0.0.1:2379 \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt \
--key=/etc/kubernetes/pki/etcd/healthcheck-client.key \
endpoint health --cluster
# Check etcd latency (should be < 100ms for p99)
kubectl exec -n kube-system $ETCD_POD -- etcdctl \
--endpoints=https://127.0.0.1:2379 \
--cacert=... --cert=... --key=... \
check perf
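# etcd disk performance (etcd needs fast fsync)
# Run this on the control plane node itself: the etcd container image does not ship fio.
# Use a scratch directory on the same disk as the etcd data dir and remove it afterwards.
mkdir -p /var/lib/etcd/fio-test
fio --rw=write --ioengine=sync --fdatasync=1 --directory=/var/lib/etcd/fio-test \
--size=22m --bs=2300 --name=etcd-test
rm -rf /var/lib/etcd/fio-test
# fdatasync p99 latency should be < 10ms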
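# Check etcd metrics
# Run on the control plane node: the etcd image does not include curl, and on kubeadm
# clusters etcd serves metrics on the host's loopback interface (port 2381).
curl -s http://127.0.0.1:2381/metrics | grep etcd_disk_wal_fsync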
# High etcd latency causes:
# 1. Disk I/O saturation → move etcd to dedicated SSD
# 2. Too many revisions → compact and defrag
# Compact etcd (reduce stored revisions)
CURRENT_REVISION=$(kubectl exec -n kube-system $ETCD_POD -- etcdctl \
--endpoints=https://127.0.0.1:2379 --cacert=... --cert=... --key=... \
endpoint status -w json | jq '.[0].Status.header.revision')
kubectl exec -n kube-system $ETCD_POD -- etcdctl \
--endpoints=https://127.0.0.1:2379 --cacert=... --cert=... --key=... \
compact $CURRENT_REVISION
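# Defragment etcd (reclaim disk space after compaction)
# Defrag blocks reads and writes on the member while it runs and only affects the
# endpoint it is sent to — run it on one member at a time, ideally the leader last.
kubectl exec -n kube-system $ETCD_POD -- etcdctl \
--endpoints=https://127.0.0.1:2379 --cacert=... --cert=... --key=... \
defrag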
etcd Quorum Loss
etcd requires a majority of members — floor(n/2) + 1 — to be healthy for writes.
3-member cluster: needs 2 healthy members
5-member cluster: needs 3 healthy members
Quorum lost → etcd rejects all writes → API server returns 503
Recovery (3-member cluster, 2 members dead):
1. Identify which member is still alive
2. Restore from most recent snapshot
3. Re-initialize cluster from single member (disaster recovery mode)
NEVER do this without a verified snapshot backup.
# Check etcd member list
kubectl exec -n kube-system $ETCD_POD -- etcdctl \
--endpoints=https://127.0.0.1:2379 --cacert=... --cert=... --key=... \
member list
# Remove a dead member (before replacing it)
kubectl exec -n kube-system $ETCD_POD -- etcdctl \
--endpoints=https://127.0.0.1:2379 --cacert=... --cert=... --key=... \
member remove <member-id>
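# Take etcd snapshot
kubectl exec -n kube-system $ETCD_POD -- etcdctl \
--endpoints=https://127.0.0.1:2379 --cacert=... --cert=... --key=... \
snapshot save /var/lib/etcd/snapshot.db
# On kubeadm clusters /var/lib/etcd is a hostPath volume, so the snapshot is also
# available at /var/lib/etcd/snapshot.db on the control plane node.
# kubectl cp requires tar inside the container; if the etcd image lacks it,
# copy the file from the node instead (e.g. with scp).
kubectl cp kube-system/$ETCD_POD:/var/lib/etcd/snapshot.db ./etcd-backup.db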
# Restore from snapshot (follow kubeadm disaster recovery docs)
Webhook Breaking the Control Plane
# Symptoms: kubectl apply returns 500, API server logs show webhook errors
# Common pattern: Kyverno/OPA webhook fails → blocks all resource creation
# Find failing webhooks
kubectl logs -n kube-system kube-apiserver-<node> | grep -i webhook | tail -20
# Test which webhook is blocking
kubectl apply --dry-run=server -f test-pod.yaml
# Error message shows which webhook configuration is rejecting
# Emergency: disable all webhooks temporarily
kubectl get mutatingwebhookconfigurations
kubectl get validatingwebhookconfigurations
# Disable by changing failurePolicy to Ignore (reversible)
kubectl patch mutatingwebhookconfiguration <name> \
--type=json -p='[{"op":"replace","path":"/webhooks/0/failurePolicy","value":"Ignore"}]'
# Or delete the webhook configuration entirely (will be recreated by operator)
kubectl delete mutatingwebhookconfiguration <name>
# Check if Kyverno/OPA is running
kubectl get pods -n kyverno
kubectl get pods -n gatekeeper-system
Certificate Expiry
# Check all certificate expiration dates
kubeadm certs check-expiration
# NAME EXPIRES RESIDUAL TIME
# admin.conf Jan 15, 2026 10:00 UTC 364d
# apiserver Jan 15, 2026 10:00 UTC 364d
# ...
# Renew all certificates (kubeadm clusters)
kubeadm certs renew all
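# Restart static pods to pick up new certificates
# kubelet restarts a static pod only when the content of its manifest changes.
# Renewing certificates does not change the manifest, and `touch` only updates the
# timestamp, so neither triggers a restart. Move the manifest out, wait for kubelet
# to stop the pod, then move it back:
mv /etc/kubernetes/manifests/kube-apiserver.yaml /tmp/
sleep 20   # kubelet's fileCheckFrequency defaults to 20s
mv /tmp/kube-apiserver.yaml /etc/kubernetes/manifests/
# Repeat for kube-controller-manager.yaml, kube-scheduler.yaml and etcd.yaml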
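# Update kubeconfig files with new certs
# `kubeadm certs renew all` already renews the client certificate embedded in
# /etc/kubernetes/admin.conf; only copies of it (e.g. ~/.kube/config) need refreshing.
# Do NOT redirect `kubeadm kubeconfig user --client-name admin` into admin.conf:
# that credential carries no admin group, so it would replace the cluster-admin kubeconfig.
cp /etc/kubernetes/admin.conf "$HOME/.kube/config"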
# EKS: certificates are managed by AWS; expiry is for add-on/custom certs only
# cert-manager: see certificate management playbook
Related
- 11 — Leader Election — how controller-manager/scheduler elect leaders
- 10 — etcd Issues — etcd-specific deep-dive
- 07 — Certificate Management — certificate rotation