Overview

Diagnosis and resolution of Kubernetes network failures — from Service connectivity and DNS resolution through NetworkPolicy blocks to CNI plugin problems.

Network Failure Decision Tree

Cannot reach Service X
        │
        ▼
Is the Service DNS name resolving?
kubectl run debug --image=nicolaka/netshoot --rm -it -- \
  nslookup payments-api.production.svc.cluster.local
┌───────────────────────────────────────────────────────────┐
│ NXDOMAIN / timeout                                        │ Resolves
└─── DNS issue → see DNS section ──────────────────────────┘    │
                                                                 ▼
                                              Does the Service have endpoints?
                                              kubectl get endpoints payments-api -n production
                                           ┌──────────────────────────────────────────────┐
                                           │ <none> or empty                              │ Has IPs
                                           └─── Pod not Ready → see Pod Failures ─────────┘   │
                                                                                               ▼
                                                                              Can you reach pod IP directly?
                                                                              kubectl exec debug -- \
                                                                                curl <pod-ip>:8080
                                                                           ┌──────────────────────────────┐
                                                                           │ No                           │ Yes
                                                                           └── NetworkPolicy block?       └── Service/kube-proxy issue
                                                                               CNI issue?

Service Connectivity Debugging

# Step 1: Check Service exists and is correct type
kubectl get svc payments-api -n production -o yaml
# Check: spec.selector matches pod labels
# Check: spec.ports[].targetPort matches the containerPort the pod listens on

# Step 2: Verify pod labels match selector
# (-o json keeps the selector parseable by jq on every kubectl version)
SVC_SELECTOR=$(kubectl get svc payments-api -n production -o json \
  | jq -r '.spec.selector | to_entries | map(.key+"="+.value) | join(",")')
kubectl get pods -n production -l "$SVC_SELECTOR"

# Step 3: Check endpoints
kubectl get endpoints payments-api -n production
kubectl describe endpoints payments-api -n production
# If empty: no pods match selector + are Ready

# Step 4: Test from within the cluster
kubectl run netshoot --image=nicolaka/netshoot --rm -it -n production -- \
  curl -v http://payments-api.production.svc.cluster.local:8080/healthz

# Step 5: Check kube-proxy iptables rules (iptables mode)
kubectl get pod -n kube-system -l k8s-app=kube-proxy
# On node:
iptables -t nat -L KUBE-SERVICES | grep payments-api
iptables -t nat -L KUBE-SVC-XXXXX  # Service chain
iptables -t nat -L KUBE-SEP-XXXXX  # Endpoint chain per pod

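# Step 6: Cilium (eBPF mode) — check Service map
# The map lists frontend IP:port, not Service names, so match on the ClusterIP
CILIUM_POD=$(kubectl get pod -n kube-system -l k8s-app=cilium \
  -o jsonpath='{.items[0].metadata.name}')
SVC_IP=$(kubectl get svc payments-api -n production \
  -o jsonpath='{.spec.clusterIP}')
kubectl exec -n kube-system "$CILIUM_POD" -- \
  cilium service list | grep -F "$SVC_IP:"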

NetworkPolicy Blocking Traffic

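# Symptoms: connection timeout despite pod being Ready
# (policy drops are usually silent; "connection refused" more often means
#  nothing is listening on the target port)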

# Step 1: Check if any NetworkPolicy exists in namespace
kubectl get networkpolicy -n production

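# Step 2: Check if there's a default-deny
# Lists policies with an empty podSelector (they apply to every pod in the
# namespace); inspect each hit — it is a default-deny only if it has no allow rules
kubectl get networkpolicy -n production -o json | \
  jq '.items[] | select(.spec.podSelector == {}) | .metadata.name'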

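# Step 3: Use Cilium policy trace to simulate traffic decision
# (agent CLI — run it inside a Cilium pod; pods are given as namespace:pod-name)
CILIUM_POD=$(kubectl get pod -n kube-system -l k8s-app=cilium \
  -o jsonpath='{.items[0].metadata.name}')
kubectl exec -n kube-system "$CILIUM_POD" -- cilium policy trace \
  --src-k8s-pod production:frontend-xxx \
  --dst-k8s-pod production:payments-api-xxx \
  --dport 8080/TCP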

# Step 4: Check Hubble for dropped packets
hubble observe --namespace production --verdict DROPPED \
  --from-pod frontend-xxx

# Step 5: Verify policy allows intended traffic
kubectl describe networkpolicy payments-api-allow -n production
# Confirm: ingress from correct podSelector, correct port

# Step 6: Test with policy temporarily disabled (staging only)
# Delete the restricting policy, test, restore
kubectl delete networkpolicy default-deny-ingress -n staging  # STAGING ONLY
# ... test ...
kubectl apply -f policies/default-deny-ingress.yaml

Pod-to-Pod Connectivity

# Test direct pod-to-pod (bypassing Service)
POD_A_IP=$(kubectl get pod frontend-xxx -n production -o jsonpath='{.status.podIP}')
kubectl exec -n production payments-api-xxx -- \
  curl -v http://$POD_A_IP:8080/

# Test cross-namespace
kubectl exec -n frontend frontend-xxx -- \
  curl http://payments-api.production.svc.cluster.local:8080/

# ICMP ping test (if allowed)
# Run from the other pod: POD_A_IP is frontend-xxx's own address
kubectl exec -n production payments-api-xxx -- \
  ping -c 3 "$POD_A_IP"

# Traceroute
kubectl run netshoot --image=nicolaka/netshoot --rm -it -- \
  traceroute $POD_A_IP

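# Check if traffic leaves node (tcpdump — requires privilege)
# netshoot ships tcpdump (the stock ubuntu image does not); if the capture is
# denied, retry with --profile=sysadmin on kubectl versions that support it
kubectl debug node/<node-name> -it --image=nicolaka/netshoot -- \
  tcpdump -i any -n host "$POD_A_IP"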

Node-to-Pod Connectivity

# Check overlay network (VXLAN/Geneve) is set up correctly
# Get pod IP and its node
kubectl get pod payments-api-xxx -n production \
  -o jsonpath='{.status.podIP} {.spec.nodeName}'

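# Check routing on node
# (node debug pods share the host network namespace; netshoot provides `ip`,
#  which the stock ubuntu image lacks)
kubectl debug node/<node-name> -it --image=nicolaka/netshoot -- \
  ip route show | grep <pod-cidr>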

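# For Cilium: check BPF endpoint and tunnel maps
# (`cilium bpf lb list` shows Service load-balancing entries, not pod routing;
#  use the Cilium agent pod that runs on the node under test)
kubectl exec -n kube-system "$CILIUM_POD" -- \
  cilium bpf endpoint list   # local pod IPs known to the datapath
kubectl exec -n kube-system "$CILIUM_POD" -- \
  cilium bpf tunnel list     # remote pod CIDR → node mappings (overlay mode)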

# For Calico: check IPAM and route
# (calicoctl node status must run on the node itself, as root)
sudo calicoctl node status
calicoctl ipam show --show-blocks
# Check: Routing daemon (BGP peer) state, IPAM allocations

# Cross-node pod test
kubectl run sender --image=nicolaka/netshoot --rm -it \
  --overrides='{"spec":{"nodeName":"node-1"}}' \
  -- curl http://<pod-ip-on-node-2>:8080/

Service Type LoadBalancer Issues

# External LoadBalancer not reachable

# Step 1: Check LB is provisioned
kubectl get svc payments-api -n production
# EXTERNAL-IP should be an IP or hostname, not <pending>

# <pending> for too long → cloud controller issue
kubectl logs -n kube-system -l app=cloud-controller-manager --tail=50
# AWS: check if nodes have correct IAM role for ELB provisioning

# Step 2: Check Security Groups (AWS)
# The LB security group must allow inbound on the port
# The node security group must allow inbound from LB SG

# Step 3: Check health check config
# AWS NLB: target group health check port = NodePort
kubectl get svc payments-api -n production \
  -o jsonpath='{.spec.ports[0].nodePort}'

# Step 4: Verify externalTrafficPolicy
kubectl get svc payments-api -n production \
  -o jsonpath='{.spec.externalTrafficPolicy}'
# Local = traffic only to nodes with the pod → LB must do proper health checks
# Cluster = traffic to any node, SNAT applied (default)

CNI Plugin Issues

# Pod stuck ContainerCreating — CNI failed to set up network

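# Check kubelet logs on the node
# The node's root filesystem is mounted at /host inside the debug pod;
# chroot into it so journalctl reads the host journal
kubectl debug node/<node> -it --image=ubuntu -- \
  chroot /host journalctl -u kubelet --since "5 minutes ago" | grep -i cni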

# Cilium CNI errors
kubectl logs -n kube-system -l k8s-app=cilium --tail=100 | grep -i error

# Check if Cilium pod is on the node
kubectl get pod -n kube-system -l k8s-app=cilium \
  --field-selector spec.nodeName=<node>

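# CNI binary missing
# (node filesystem is mounted under /host in the debug pod)
kubectl debug node/<node> -it --image=ubuntu -- \
  ls /host/opt/cni/bin/

# CNI config corrupted
# (file name varies by Cilium version, e.g. 05-cilium.conf or 05-cilium.conflist)
kubectl debug node/<node> -it --image=ubuntu -- \
  cat /host/etc/cni/net.d/05-cilium.conf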

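# IP address exhaustion (Cilium)
# (`cilium ip list` prints the IP→identity cache; IPAM usage is in `cilium status`)
kubectl exec -n kube-system "$CILIUM_POD" -- \
  cilium status | grep "IPAM"
# Output looks like: "IPAM: IPv4: 254/254 allocated from 10.0.1.0/24"
# If allocated equals the total: node has run out of pod CIDRs
# Fix: increase pod CIDR pool in CiliumNode, or reduce pods-per-node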

# AWS VPC CNI — ENI warm pool exhausted
kubectl logs -n kube-system -l k8s-app=aws-node -c aws-node | \
  grep "warm pool"
# Fix: set WARM_IP_TARGET or MINIMUM_IP_TARGET env on aws-node DaemonSet

Ingress / Reverse Proxy Issues

# 502 Bad Gateway from NGINX Ingress
# → Backend pod unreachable or returning 5xx

# Check NGINX ingress controller logs
kubectl logs -n ingress-nginx -l app.kubernetes.io/name=ingress-nginx \
  --tail=100 | grep -E "error|upstream"

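# Check upstream address NGINX is sending to
# (ingress-nginx balances backends dynamically in Lua, so pod addresses do not
#  appear in nginx.conf; query the controller's debug tool instead)
kubectl exec -n ingress-nginx <nginx-pod> -- /dbg backends list
kubectl exec -n ingress-nginx <nginx-pod> -- \
  /dbg backends get <namespace>-<backend-svc>-<port>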

# Check Service and Endpoints for the backend
kubectl get endpoints <backend-svc> -n production

# 504 Gateway Timeout
# → Backend is taking > proxy_read_timeout to respond
# Fix: increase timeout annotation on Ingress
# nginx.ingress.kubernetes.io/proxy-read-timeout: "120"

# CORS issues — NGINX adding wrong headers
# Check Ingress annotations for cors config
kubectl describe ingress <ing> -n production | grep cors

# TLS issues — see Ingress Issues playbook

Packet Capture

# Capture traffic on a specific pod interface

# Method 1: kubectl debug ephemeral container with tcpdump
kubectl debug -it <pod> -n <ns> \
  --image=nicolaka/netshoot --target=<container> \
  -- tcpdump -i eth0 -n port 8080 -w /tmp/capture.pcap

# Method 2: tcpdump on node for pod's veth
# Get pod's virtual interface on node
# (netshoot provides both `ip` and `tcpdump`; the stock ubuntu image has neither)
kubectl debug node/<node> -it --image=nicolaka/netshoot -- bash
ip link show | grep -A1 "lxc"  # Cilium veth prefix
# Then: tcpdump -i lxcXXXXXX -n

# Method 3: Hubble (Cilium) — no privilege needed
hubble observe --namespace production -f \
  --from-pod frontend-xxx --to-pod payments-api-xxx

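# Export pcap from pod
# A capture taken through `kubectl debug` is stored in the ephemeral debug
# container, not the app container: pass its name with -c (kubectl debug prints
# it, e.g. debugger-xxxxx) and copy while that container is still running
kubectl cp <ns>/<pod>:/tmp/capture.pcap ./capture.pcap -c <debug-container>
# Open with: wireshark ./capture.pcap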