Control Plane Issues

Overview

Diagnosis and resolution of control plane failures — API server unreachable, etcd errors, controller-manager and scheduler problems, and leader election failures.

Control Plane Health Check

# Quick health check
kubectl get componentstatuses   # deprecated (since v1.19) but still works on most clusters
# Or via individual component pods:
kubectl get pods -n kube-system -l tier=control-plane

# Check API server health endpoint (from within cluster).
# --restart=Never makes this a one-shot pod; without it kubectl run uses
# restartPolicy=Always, which is meant for long-running containers and can
# restart curl (or lose its output) once the command exits.
kubectl run debug --image=curlimages/curl --restart=Never --rm -it -- \
  curl -k https://kubernetes.default.svc/healthz

# From a control plane node (-k skips TLS verification; acceptable for a
# localhost probe, do not copy this into automation that leaves the node):
curl -k https://localhost:6443/healthz
curl -k https://localhost:6443/readyz
curl -k https://localhost:6443/livez

# Check all sub-healthchecks
curl -k "https://localhost:6443/readyz?verbose"
# [+]ping ok
# [+]log ok
# [+]etcd ok
# [+]informer-sync ok
# ...

# Managed K8s (EKS/GKE/AKS): check cloud console for control plane status
# EKS:
aws eks describe-cluster --name <cluster> --query 'cluster.status'

API Server Unreachable

# kubectl: "The connection to the server was refused"
# OR: "Unable to connect to the server: dial tcp: connection refused"
#
# NOTE: every kubectl subcommand (get, logs, debug, ...) goes through the API
# server, so kubectl cannot diagnose an API server that refuses connections.
# Run the node-local commands below over SSH on the control plane node. The
# kubectl variants only help when the API is still partially reachable
# (e.g. another API server replica behind the load balancer is serving).

# Check if API server process is running (on control plane node)
pgrep -af kube-apiserver

# Check API server container through the container runtime (no API needed)
crictl ps -a --name kube-apiserver
# If not running / constantly exiting: static pod may have failed to start

# Same check via the API, only if it is reachable:
kubectl get pod -n kube-system kube-apiserver-control-plane-1

# Check kubelet logs (API server is a static pod managed by kubelet)
journalctl -u kubelet --since "30 minutes ago" | grep -i apiserver

# Check static pod manifest
cat /etc/kubernetes/manifests/kube-apiserver.yaml
# Verify: no syntax errors, correct flags

# Check API server logs
# Node-local: take the first container crictl lists for kube-apiserver
# (-a includes exited containers, so crash output is still readable).
apiserver_cid=$(crictl ps -a --name kube-apiserver -q | head -n 1)
crictl logs --tail=100 "$apiserver_cid"
# Via the API (pod is running but misbehaving, API still answers):
kubectl logs -n kube-system kube-apiserver-control-plane-1 --tail=100
# Common error: "Error while dialing dial tcp: connection refused" → etcd unreachable

# Check certificates
kubeadm certs check-expiration
# Expired cert → API server won't start
# Fix: kubeadm certs renew all

# Check port is listening
ss -tlnp | grep 6443

# Check etcd health (API server can't start without etcd)
# See etcd section below

kube-controller-manager Issues

# Typical symptoms: Deployments stop scaling, PVCs stay unbound,
#                   garbage collection stalls

# Pod state and recent log output of the controller-manager
kubectl -n kube-system get pod --selector=component=kube-controller-manager
kubectl -n kube-system logs kube-controller-manager-<node> --tail=100

# Who currently holds the leader lease (exactly one replica should be active)
kubectl -n kube-system get lease kube-controller-manager \
  --output=jsonpath='{.spec.holderIdentity}'

# Frequent problems:
# 1. Leadership keeps changing hands (leaderTransitions climbing quickly)
kubectl -n kube-system get lease kube-controller-manager \
  --output=jsonpath='{.spec.leaderTransitions}'
# Likely causes: network partition, slow etcd, renewDeadline set too tight
# Remedy: raise leaseDuration/renewDeadline and look at etcd latency

# 2. One resource type is not being reconciled
kubectl -n kube-system logs kube-controller-manager-<node> | \
  grep -i "failed\|error" | grep deployment | head -20

# 3. Cloud controller manager trouble (nodes joining, load balancer provisioning)
kubectl -n kube-system logs --selector=app=cloud-controller-manager --tail=100

# 4. Certificate controller is not rotating certificates
kubectl -n kube-system logs kube-controller-manager-<node> | grep cert

kube-scheduler Issues

# Symptoms: pods stuck Pending indefinitely (not just "no resources")

kubectl get pod -n kube-system -l component=kube-scheduler
kubectl logs -n kube-system kube-scheduler-<node> --tail=100

# Check scheduler is elected leader
kubectl get lease kube-scheduler -n kube-system \
  -o jsonpath='{.spec.holderIdentity}'

# Common issues:
# 1. Scheduler profile misconfiguration
kubectl get pod -n kube-system kube-scheduler-<node> -o yaml | \
  grep -A5 -- "--config"
# Then inspect whichever file --config points at (run on the control plane
# node; the path below is only an example, use the one printed above):
cat /etc/kubernetes/scheduler-config.yaml

# 2. Pod topology spread constraints being too strict
kubectl describe pod <pending-pod> -n <ns>
# "0/5 nodes are available: 5 node(s) didn't satisfy pod topology spread constraints"
# Fix: relax the constraint — INCREASE maxSkew (it is the maximum allowed
#      imbalance, so a larger value is more permissive), or change
#      whenUnsatisfiable from DoNotSchedule to ScheduleAnyway

# 3. Custom scheduler plugin crash
kubectl logs -n kube-system kube-scheduler-<node> | grep panic

# 4. Scheduler not watching new pods (watch cache issue)
# Force restart (static pod — move the manifest out of the manifests directory
# so kubelet stops the pod, then move it back so kubelet starts it again).
# Run on the control plane node:
# mv /etc/kubernetes/manifests/kube-scheduler.yaml /tmp/
# sleep 10
# mv /tmp/kube-scheduler.yaml /etc/kubernetes/manifests/

etcd Issues

# Locate the etcd static pod and define the connection flags once, so every
# etcdctl call below is copy-paste runnable. Paths are the kubeadm defaults.
ETCD_POD=$(kubectl get pod -n kube-system -l component=etcd \
  -o jsonpath='{.items[0].metadata.name}')
ETCD_FLAGS=(
  --endpoints=https://127.0.0.1:2379
  --cacert=/etc/kubernetes/pki/etcd/ca.crt
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key
)

# Check etcd health
kubectl exec -n kube-system "$ETCD_POD" -- etcdctl "${ETCD_FLAGS[@]}" \
  endpoint health --cluster

# Check etcd latency (should be < 100ms for p99)
kubectl exec -n kube-system "$ETCD_POD" -- etcdctl "${ETCD_FLAGS[@]}" \
  check perf

# etcd disk performance (etcd needs fast fsync)
# Run this ON THE CONTROL PLANE NODE, not via kubectl exec: the etcd container
# image is minimal and normally does not ship fio. /var/lib/etcd is the
# kubeadm default data directory, so this measures the same disk etcd uses.
fio --rw=write --ioengine=sync --fdatasync=1 --directory=/var/lib/etcd \
  --size=22m --bs=2300 --name=etcd-test
# p99 fdatasync latency should be < 10ms
# Remove the scratch file fio leaves behind in the data directory:
rm -f -- /var/lib/etcd/etcd-test.*

# Check etcd metrics
# Also run on the control plane node (the etcd image normally has no curl).
# kubeadm exposes etcd metrics over plain HTTP on 127.0.0.1:2381.
curl -s http://127.0.0.1:2381/metrics | grep etcd_disk_wal_fsync

# High etcd latency causes:
# 1. Disk I/O saturation → move etcd to dedicated SSD
# 2. Too many revisions → compact and defrag

# Compact etcd (reduce stored revisions)
# jq runs locally on the JSON that kubectl exec streams back.
CURRENT_REVISION=$(kubectl exec -n kube-system "$ETCD_POD" -- etcdctl \
  "${ETCD_FLAGS[@]}" endpoint status -w json | jq '.[0].Status.header.revision')
kubectl exec -n kube-system "$ETCD_POD" -- etcdctl "${ETCD_FLAGS[@]}" \
  compact "$CURRENT_REVISION"

# Defragment etcd (reclaim disk space after compaction)
# This only defragments the member behind --endpoints, and that member is
# blocked while it runs — repeat on each member, one at a time.
kubectl exec -n kube-system "$ETCD_POD" -- etcdctl "${ETCD_FLAGS[@]}" \
  defrag

etcd Quorum Loss

etcd requires a majority of members — floor(n/2) + 1 — to be healthy for writes.
3-member cluster: needs 2 healthy members
5-member cluster: needs 3 healthy members

Quorum lost → etcd rejects all writes → API server returns 503

Recovery (3-member cluster, 2 members dead):
  1. Identify which member is still alive
  2. Restore from most recent snapshot
  3. Re-initialize cluster from single member (disaster recovery mode)

NEVER do this without a verified snapshot backup.

# Connection flags for etcdctl (kubeadm default certificate paths).
# $ETCD_POD is the etcd pod name looked up in the "etcd Issues" section.
ETCD_FLAGS=(
  --endpoints=https://127.0.0.1:2379
  --cacert=/etc/kubernetes/pki/etcd/ca.crt
  --cert=/etc/kubernetes/pki/etcd/healthcheck-client.crt
  --key=/etc/kubernetes/pki/etcd/healthcheck-client.key
)

# Check etcd member list
kubectl exec -n kube-system "$ETCD_POD" -- etcdctl "${ETCD_FLAGS[@]}" \
  member list

# Remove a dead member (before replacing it)
kubectl exec -n kube-system "$ETCD_POD" -- etcdctl "${ETCD_FLAGS[@]}" \
  member remove <member-id>

# Take etcd snapshot
kubectl exec -n kube-system "$ETCD_POD" -- etcdctl "${ETCD_FLAGS[@]}" \
  snapshot save /var/lib/etcd/snapshot.db

# Copy the snapshot off the node.
# Do not use `kubectl cp` here: it requires a `tar` binary inside the
# container, which the minimal etcd image normally lacks. On kubeadm clusters
# /var/lib/etcd is a hostPath mount, so the file is already on the node's
# filesystem — fetch it from there (the file is root-owned):
scp control-plane-1:/var/lib/etcd/snapshot.db ./etcd-backup.db
# Afterwards remove the copy left in the etcd data directory (on the node):
# rm -f -- /var/lib/etcd/snapshot.db

# Restore from snapshot (follow kubeadm disaster recovery docs)

Webhook Breaking the Control Plane

# Symptoms: kubectl apply returns 500, API server logs show webhook errors
# Common pattern: Kyverno/OPA webhook fails → blocks all resource creation

# Find failing webhooks
kubectl logs -n kube-system kube-apiserver-<node> | grep -i webhook | tail -20

# Test which webhook is blocking
kubectl apply --dry-run=server -f test-pod.yaml
# Error message shows which webhook configuration is rejecting

# Emergency: list all webhook configurations, then disable the failing one(s)
kubectl get mutatingwebhookconfigurations
kubectl get validatingwebhookconfigurations

# Always save a copy first so the change can be reverted exactly
kubectl get mutatingwebhookconfiguration <name> -o yaml > webhook-backup.yaml

# Disable by changing failurePolicy to Ignore (reversible).
# This patches only the FIRST entry of .webhooks:
kubectl patch mutatingwebhookconfiguration <name> \
  --type=json -p='[{"op":"replace","path":"/webhooks/0/failurePolicy","value":"Ignore"}]'

# A configuration can hold several webhook entries; to switch every entry:
kubectl get mutatingwebhookconfiguration <name> -o json \
  | jq '.webhooks[].failurePolicy = "Ignore"' \
  | kubectl replace -f -
# The same commands work for validatingwebhookconfiguration objects.
# Note: the owning controller may reconcile the object and revert the change.

# Or delete the webhook configuration entirely.
# Many operators recreate it on their next reconcile, but that is not
# guaranteed — keep the backup above so it can be re-applied by hand.
kubectl delete mutatingwebhookconfiguration <name>

# Check if Kyverno/OPA is running
kubectl get pods -n kyverno
kubectl get pods -n gatekeeper-system

Certificate Expiry

# Check all certificate expiration dates
kubeadm certs check-expiration
# NAME                                EXPIRES                  RESIDUAL TIME
# admin.conf                          Jan 15, 2026 10:00 UTC   364d
# apiserver                           Jan 15, 2026 10:00 UTC   364d
# ...

# Renew all certificates (kubeadm clusters).
# This also renews the client certificates embedded in admin.conf,
# controller-manager.conf and scheduler.conf.
kubeadm certs renew all

# Restart static pods to pick up new certificates.
# `touch` is NOT enough: it leaves the manifest content unchanged, so kubelet
# sees nothing to act on. Move each manifest out of the manifests directory,
# wait longer than kubelet's fileCheckFrequency (default 20s) so the pod is
# stopped, then move it back so kubelet recreates it.
# One component at a time; expect a brief API outage on a single control
# plane node. The parking directory is persistent (not /tmp) so a manifest
# is not lost if the node reboots mid-procedure.
parked=/etc/kubernetes/manifests.parked
mkdir -p -- "$parked"
for component in etcd kube-apiserver kube-controller-manager kube-scheduler; do
  mv -- "/etc/kubernetes/manifests/${component}.yaml" "$parked/"
  sleep 25
  mv -- "$parked/${component}.yaml" /etc/kubernetes/manifests/
  sleep 25
done

# Update your kubeconfig with the renewed admin credentials.
# Do NOT regenerate /etc/kubernetes/admin.conf with `kubeadm kubeconfig user`:
# redirecting into admin.conf truncates it before kubeadm runs, and the
# generated user does not carry the cluster-admin group membership.
cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config"
chown "$(id -u):$(id -g)" "$HOME/.kube/config"

# EKS: certificates are managed by AWS; expiry is for add-on/custom certs only
# cert-manager: see certificate management playbook

11 — Leader Election — how controller-manager/scheduler elect leaders
10 — etcd Issues — etcd-specific deep-dive
07 — Certificate Management — certificate rotation

Overview

Control Plane Health Check

API Server Unreachable

kube-controller-manager Issues

kube-scheduler Issues

etcd Issues

etcd Quorum Loss

Webhook Breaking the Control Plane

Certificate Expiry

Related