diff --git a/.gitea/workflows/deploy.yaml b/.gitea/workflows/deploy.yaml
index de3e17f..0c3e81b 100644
--- a/.gitea/workflows/deploy.yaml
+++ b/.gitea/workflows/deploy.yaml
@@ -153,29 +153,106 @@ jobs:
             fi
           "
 
-      # ── Step 6: Health Check ──────────────────
+      # ── Step 6: Health Check (backoff) ────────
+      # Fibonacci backoff: waits of 1s, 2s, 3s, 5s, 8s, 13s (~32s total),
+      # one before each of the 6 attempts, so no time is spent sleeping
+      # after the final attempt has already failed.
+      # Why: cold-start containers need variable warmup time;
+      # fixed 5s intervals either wait too long or give up too early.
       - name: Health Check
         run: |
-          sleep 5
           echo "🏥 Health check..."
-          for i in 1 2 3 4 5 6; do
+          RETRY=0
+          MAX=6
+          WAIT=1   # delay before the upcoming attempt
+          NEXT=2   # delay before the attempt after that
+          sleep "$WAIT"
+          while [ "$RETRY" -lt "$MAX" ]; do
+            RETRY=$((RETRY + 1))
             if curl -sf --max-time 10 https://nexus.noveria.net/health; then
               echo ""
-              echo "✅ Health check passed"
-              break
+              echo "✅ Health check passed (attempt $RETRY/$MAX)"
+              exit 0
             fi
-            echo "⏳ Retry $i/6..."
-            sleep 5
+            # Out of attempts: fall through to the failure report without
+            # a pointless final sleep.
+            if [ "$RETRY" -ge "$MAX" ]; then
+              break
+            fi
+            # Advance the Fibonacci pair: (1,2) → (2,3) → (3,5) → (5,8) → (8,13).
+            SUM=$((WAIT + NEXT))
+            WAIT=$NEXT
+            NEXT=$SUM
+            echo "⏳ Attempt $RETRY/$MAX failed, waiting ${WAIT}s..."
+            sleep "$WAIT"
           done
+          echo "❌ Health check failed after $MAX attempts"
+          exit 1
 
-      # ── Step 7: Smoke test ────────────────────
+      # ── Step 7: Smoke test (multi-endpoint) ───
+      # Tests multiple endpoints to catch partial failures.
+      # Why: a single /dashboard check can miss backend-only outages;
+      # testing /api/swagger confirms the API layer is healthy too.
       - name: Verify (smoke test)
         run: |
           echo "🔍 Smoke test..."
-          HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" https://nexus.noveria.net/dashboard)
-          echo "Dashboard: HTTP $HTTP_CODE"
-          if [ "$HTTP_CODE" != "200" ]; then
-            echo "❌ Dashboard not reachable!"
+          PASS=0
+          FAIL=0
+          BASE="https://nexus.noveria.net"
+
+          # check PATH LABEL [EXPECTED]
+          # Requests ${BASE}PATH, prints one result row, and bumps PASS or
+          # FAIL depending on whether the status equals EXPECTED (default 200).
+          check() {
+            local path="$1" label="$2" expected="${3:-200}"
+            local code
+            # Assigned separately from `local`, and curl's exit status is
+            # tolerated: on a connection error curl still prints 000, which
+            # must count as a failed endpoint instead of aborting the step.
+            code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "${BASE}${path}") || true
+            printf " %-25s HTTP %s" "${label}:" "${code}"
+            if [ "$code" = "$expected" ]; then
+              echo " ✅"
+              PASS=$((PASS + 1))
+            else
+              echo " ❌ (expected $expected)"
+              FAIL=$((FAIL + 1))
+            fi
+          }
+
+          check "/dashboard" "Dashboard" 200
+          check "/health" "Health API" 200
+          check "/api/swagger" "API Swagger" 200
+
+          echo ""
+          echo "Results: $PASS passed, $FAIL failed"
+          if [ "$FAIL" -gt 0 ]; then
+            echo "❌ Smoke test failed!"
             exit 1
           fi
           echo "✅ Deployment verified"
+
+      # ── Step 8: Rollback hint ────────────────
+      # On any failure, prints ready-to-paste commands for a quick manual
+      # rollback, including a git command that resolves the previous tag.
+      # Why: reduces MTTR (mean time to recovery) by sparing the operator
+      # from looking up the rollback tag manually.
+      - name: Rollback hint
+        if: failure()
+        run: |
+          echo ""
+          echo "🔙 ─── Rollback Instructions ─── 🔙"
+          echo ""
+          echo " # 1. Checkout previous version:"
+          # \$(...) is escaped on purpose: the command is printed verbatim and
+          # resolved where the operator runs it. HEAD~1 skips a tag sitting on
+          # the commit that just failed, so the hint never points back at it.
+          echo " git checkout tags/\$(git describe --tags --abbrev=0 HEAD~1 2>/dev/null || echo 'unknown')"
+          echo ""
+          echo " # 2. Redeploy:"
+          echo " cd /opt/openclaw/data/openclaw/workspace/nexus"
+          echo " docker compose up -d --force-recreate"
+          echo ""
+          echo " # 3. Or trigger rollback via Gitea:"
+          echo " Trigger 'Deploy to Production' workflow with the previous tag"
+          echo ""