Overview

Diagnosis and resolution of Kubernetes ingress failures — 502/503/504 errors, TLS certificate problems, routing mismatches, and NGINX ingress controller issues.

HTTP Status Code Reference for Ingress

502 Bad Gateway        → Ingress controller can't reach upstream pod
                         (pod not ready, wrong port, backend crashing)

503 Service Unavailable → No healthy backends available
                          (all pods failing readiness, endpoints empty)

504 Gateway Timeout    → Upstream pod took too long to respond
                          (exceeds proxy_read_timeout, default 60s)

404 Not Found          → Ingress rule doesn't match the request path/host
                         (path regex wrong, host missing, trailing slash)

400 Bad Request        → Usually plain HTTP request sent to an HTTPS port
                         (e.g. HTTPS backend without backend-protocol: "HTTPS")
                         OR malformed request headers

ERR_SSL_PROTOCOL_ERROR → TLS certificate not being served
                         (cert not yet ready, wrong secret name)

502 / 503 — Backend Unavailable

# Step 1: Check if pods are ready
kubectl get endpoints <service-name> -n <ns>
# Empty endpoints → no pods are Ready

kubectl get pods -n <ns> -l app=<app> 
# Check READY column: 1/1 = ready, 0/1 = not ready

# Step 2: Check NGINX ingress logs
kubectl logs -n ingress-nginx \
  -l app.kubernetes.io/name=ingress-nginx --tail=100 | \
  grep -E "error|upstream"
# "upstream timed out (110: Connection timed out)" → pod unresponsive
# "connect() failed (111: Connection refused)" → nothing listening on port

# Step 3: Verify service port matches container port
kubectl get svc <service-name> -n <ns> -o yaml | grep -A5 ports
kubectl get ingress <ingress-name> -n <ns> -o yaml | grep -A10 backend

# Step 4: Test upstream directly from ingress pod
INGRESS_POD=$(kubectl get pod -n ingress-nginx \
  -l app.kubernetes.io/name=ingress-nginx \
  -o jsonpath='{.items[0].metadata.name}')
POD_IP=$(kubectl get endpoints <svc> -n <ns> \
  -o jsonpath='{.subsets[0].addresses[0].ip}')
kubectl exec -n ingress-nginx $INGRESS_POD -- \
  curl -v http://$POD_IP:<port>/healthz

# Step 5: Check if NetworkPolicy blocks ingress → pod traffic
kubectl get networkpolicy -n <ns>
# ingress controller pods need to be allowed to reach app pods
# Add ingress rule allowing from ingress-nginx namespace

Ingress Rule Matching

# 404 from NGINX — rule doesn't match

# Check the Ingress object
kubectl describe ingress payments-ingress -n production
# Look for: Rules section, backend service, paths

# Common path matching issues:
# NGINX ingress uses path type:
#   Exact:  /api  → only matches exactly /api (not /api/)
#   Prefix: /api  → matches /api, /api/, /api/v1, etc.

# Check pathType
kubectl get ingress payments-ingress -n production -o yaml | grep -A5 path

# Regex paths (nginx.ingress.kubernetes.io/use-regex: "true")
# Test regex: kubectl exec -n ingress-nginx $INGRESS_POD -- nginx -T | grep location

# Host matching
# If host is www.example.com but request is for example.com → no match
kubectl get ingress -n production -o json | \
  jq -r '.items[].spec.rules[].host'

# Debug: check what NGINX actually configured
kubectl exec -n ingress-nginx $INGRESS_POD -- \
  cat /etc/nginx/nginx.conf | grep -A20 "server_name payments"

# Check NGINX upstream config
kubectl exec -n ingress-nginx $INGRESS_POD -- \
  nginx -T 2>/dev/null | grep -B5 -A15 "upstream"

TLS Certificate Issues

# ERR_SSL_PROTOCOL_ERROR or certificate warning in browser

# Step 1: Check TLS secret exists in the correct namespace
kubectl get ingress payments-ingress -n production -o yaml | grep tls -A5
# spec.tls[0].secretName: payments-tls

kubectl get secret payments-tls -n production
# Should be type: kubernetes.io/tls

# Step 2: Check cert-manager issued the certificate
kubectl get certificate -n production
kubectl describe certificate payments-cert -n production
# Conditions: Ready = True
# OR: "Failed" with reason (ACME challenge failed, etc.)

# Step 3: Check Certificate content
kubectl get secret payments-tls -n production \
  -o jsonpath='{.data.tls\.crt}' | base64 -d | \
  openssl x509 -noout -text | grep -E "Subject:|Not After:|DNS:"

# Step 4: Check if cert-manager is running
kubectl get pods -n cert-manager
kubectl logs -n cert-manager -l app=cert-manager --tail=50

# Step 5: Check ACME challenge
kubectl describe challenge -n production
kubectl describe order -n production
# Common: HTTP-01 challenge solver pod not reachable (firewall or DNS issue)
# Check: /.well-known/acme-challenge/ is reachable from internet

# Step 6: Test TLS from outside
openssl s_client -connect payments.example.com:443 -servername payments.example.com
# Look for: certificate chain, Not After date

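# Fix: force cert renewal (requires the cert-manager CLI, cmctl)
cmctl renew payments-cert -n production
# Note: the cert-manager.io/issue-temporary-certificate annotation does NOT
# renew anything; it only serves a temporary self-signed cert while the real
# one is still being issued.
# OR (without cmctl): delete the Secret so cert-manager re-issues it
kubectl delete secret payments-tls -n production
# OR: recreate the Certificate. Delete the Secret too, otherwise the existing
# valid Secret is reused and no new ACME order is created.
kubectl delete certificate payments-cert -n production
kubectl delete secret payments-tls -n production
kubectl apply -f payments-cert.yaml   # recreate triggers new ACME order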

NGINX Ingress Controller Issues

# Check NGINX ingress controller is running
kubectl get pods -n ingress-nginx
kubectl describe deployment ingress-nginx-controller -n ingress-nginx

# Check NGINX ingress logs for startup errors
kubectl logs -n ingress-nginx \
  -l app.kubernetes.io/name=ingress-nginx --tail=100 --previous

# Check NGINX configuration is valid
kubectl exec -n ingress-nginx $INGRESS_POD -- nginx -t

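# Check NGINX ingress controller version
kubectl exec -n ingress-nginx $INGRESS_POD -- /nginx-ingress-controller --version
# Underlying NGINX binary version only:
kubectl exec -n ingress-nginx $INGRESS_POD -- nginx -v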

# Reload NGINX config (happens automatically on Ingress change)
# Force reload:
kubectl rollout restart deployment ingress-nginx-controller -n ingress-nginx

# Check Ingress class
kubectl get ingressclass
kubectl get ingress -n production -o json | \
  jq '.items[].spec.ingressClassName'
# Must match an existing IngressClass or use default

# Multiple ingress controllers: ensure correct class annotation
kubectl annotate ingress payments-ingress -n production \
  kubernetes.io/ingress.class=nginx   # legacy
# OR in spec:
# ingressClassName: nginx

Common NGINX Ingress Annotations

# Timeout tuning (504 Gateway Timeout)
nginx.ingress.kubernetes.io/proxy-connect-timeout: "30"
nginx.ingress.kubernetes.io/proxy-send-timeout: "120"
nginx.ingress.kubernetes.io/proxy-read-timeout: "120"

# Body size (413 Request Entity Too Large)
nginx.ingress.kubernetes.io/proxy-body-size: "50m"

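# WebSocket support (101 Switching Protocols)
# Works out of the box; proxy-http-version already defaults to "1.1".
# Raise the timeouts so idle WebSocket connections are not closed after 60s:
nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"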

# CORS
nginx.ingress.kubernetes.io/enable-cors: "true"
nginx.ingress.kubernetes.io/cors-allow-origin: "https://app.example.com"
nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS"

# Rate limiting
nginx.ingress.kubernetes.io/limit-rps: "100"
nginx.ingress.kubernetes.io/limit-connections: "10"

# Auth
nginx.ingress.kubernetes.io/auth-url: "https://oauth2-proxy.example.com/oauth2/auth"
nginx.ingress.kubernetes.io/auth-signin: "https://oauth2-proxy.example.com/oauth2/start"

# TLS redirect
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"

# Custom error pages
nginx.ingress.kubernetes.io/custom-http-errors: "404,503"
nginx.ingress.kubernetes.io/default-backend: error-pages-service

# Affinity (session sticky)
nginx.ingress.kubernetes.io/affinity: "cookie"
nginx.ingress.kubernetes.io/session-cookie-name: "route"
nginx.ingress.kubernetes.io/session-cookie-expires: "172800"

AWS ALB Ingress Issues (aws-load-balancer-controller)

# ALB not created / health checks failing

# Check controller logs
kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller \
  --tail=100

# Check Ingress annotations
kubectl get ingress payments-ingress -n production -o yaml | grep alb

# Required annotation for ALB:
# kubernetes.io/ingress.class: alb
# alb.ingress.kubernetes.io/scheme: internet-facing | internal
# alb.ingress.kubernetes.io/target-type: ip | instance

# ALB health check default: HTTP GET / expecting 200
# If app returns 401 on /:
alb.ingress.kubernetes.io/healthcheck-path: /healthz
alb.ingress.kubernetes.io/success-codes: "200,401"

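# Check ALB target group health
INGRESS_HOSTNAME=$(kubectl get ingress payments-ingress -n production \
  -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
# Resolve the ALB by its DNS name, then list its target groups
ALB_ARN=$(aws elbv2 describe-load-balancers \
  --query "LoadBalancers[?DNSName=='${INGRESS_HOSTNAME}'].LoadBalancerArn" \
  --output text)
# describe-target-health accepts exactly one ARN, hence xargs -n1
aws elbv2 describe-target-groups --load-balancer-arn "$ALB_ARN" \
  --query 'TargetGroups[].TargetGroupArn' --output text | \
  xargs -n1 aws elbv2 describe-target-health --target-group-arn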

# Security group for ALB must allow inbound 443/80 from internet
# Node security group must allow inbound from ALB security group on NodePort range

Debugging End-to-End

# Full end-to-end test sequence
DOMAIN="payments.example.com"

# 1. DNS resolves
dig +short $DOMAIN
# OR:
nslookup $DOMAIN

# 2. HTTPS connectivity
curl -v https://$DOMAIN/healthz 2>&1 | grep -E "< HTTP|SSL|certificate"

# 3. Response headers
curl -I https://$DOMAIN/healthz

# 4. Check from inside cluster
kubectl run curl --image=curlimages/curl --rm -it -- \
  curl -v http://payments-api.production.svc.cluster.local:8080/healthz

# 5. Check NGINX access log
kubectl logs -n ingress-nginx \
  -l app.kubernetes.io/name=ingress-nginx --tail=200 | \
  grep $DOMAIN | tail -20

# 6. Performance test
# The image's entrypoint is already /hey, so pass only its arguments
kubectl run hey --image=williamyeh/hey --rm -it --restart=Never -- \
  -n 1000 -c 50 https://$DOMAIN/healthz