diff --git a/.gitea/workflows/deploy-prod.yml b/.gitea/workflows/deploy-prod.yml
index 58409f4..78adfe0 100644
--- a/.gitea/workflows/deploy-prod.yml
+++ b/.gitea/workflows/deploy-prod.yml
@@ -38,12 +38,19 @@ jobs:

       - name: Verify Vault Cluster Health
         run: |
-          SEALED=$(docker run --rm --network iklimco-net hashicorp/vault:2.0.1 \
-            sh -c "VAULT_ADDR=https://vault.iklim.co:8200 VAULT_SKIP_VERIFY=true vault status 2>/dev/null" \
-            | awk '/^Sealed/{print $2}' || echo "true")
-          if [ "$SEALED" = "false" ]; then
-            echo "Vault cluster is unsealed and healthy"
+          # Fire 9 requests to the shared alias (load-balanced across all 3 nodes).
+          # Fail closed: every request must answer exactly "Sealed false"; sealed, unreachable or empty replies count as failures.
+          SEALED_COUNT=0
+          for i in $(seq 1 9); do
+            SEALED=$(docker run --rm --network iklimco-net hashicorp/vault:2.0.1 \
+              sh -c "VAULT_ADDR=https://vault.iklim.co:8200 VAULT_SKIP_VERIFY=true vault status 2>/dev/null" \
+              | awk '/^Sealed/{print $2}' || true)
+            [ "$SEALED" = "false" ] || SEALED_COUNT=$((SEALED_COUNT+1))
+            sleep 1
+          done
+          if [ "$SEALED_COUNT" -eq 0 ]; then
+            echo "Vault cluster is fully unsealed and healthy (9/9 checks passed)"
           else
-            echo "ERROR: Vault cluster is sealed or unreachable"
+            echo "ERROR: $SEALED_COUNT/9 checks returned sealed or unreachable"
             exit 1
           fi
diff --git a/docker-stack-vault.yml b/docker-stack-vault.yml
index 719981c..798c604 100644
--- a/docker-stack-vault.yml
+++ b/docker-stack-vault.yml
@@ -11,13 +11,18 @@ services:
     image: hashicorp/vault:2.0.1
     cap_add:
       - IPC_LOCK
-    # Overriding the default entrypoint to manipulate configuration strictly in RAM
+    # hostname uses the service slot number (stable across restarts) so that node_id,
+    # api_addr, and cluster_addr remain consistent after every container restart.
+    # vault-N.iklim.co is covered by the *.iklim.co wildcard cert (TLS works).
+    hostname: "vault-{{.Task.Slot}}.iklim.co"
     entrypoint: ["sh", "-c"]
-    # 1. Resolves HOSTNAME_PLACEHOLDER via sed entirely in RAM (/dev/shm) — no secret touches disk
+    # 1. Substitutes HOSTNAME_PLACEHOLDER with $HOSTNAME (vault-N.iklim.co) in RAM (/dev/shm)
     # 2. Starts vault server in background
     # 3. Registers SIGTERM/SIGINT trap for graceful shutdown
     # 4. Polls vault status; exit code 1 = not yet ready, 0 or 2 = vault is responding
-    # 5. Auto-unseals using vault_unseal_key Docker secret (no-op if key is wrong or file missing)
+    # 5. Retry-unseal loop: attempts unseal every 2s for up to 3 min.
+    #    On initial bootstrap peers have empty Raft storage and cannot unseal until they
+    #    join the cluster; the loop keeps retrying so they unseal as soon as Raft join succeeds.
     # 6. Waits for vault to exit and propagates exit code to Docker
     command: >
       "cat /vault/config/vault.json | sed \"s/HOSTNAME_PLACEHOLDER/$$HOSTNAME/g\" > /dev/shm/vault.json;
@@ -26,7 +31,7 @@ services:
       trap 'kill -TERM $$VAULT_PID; wait $$VAULT_PID' TERM INT;
       export VAULT_ADDR='https://127.0.0.1:8200' VAULT_SKIP_VERIFY='true';
       for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do vault status > /dev/null 2>&1; [ $$? -ne 1 ] && break; sleep 2; done;
-      vault operator unseal $$(cat /run/secrets/vault_unseal_key 2>/dev/null) > /dev/null 2>&1 || true;
+      i=0; while [ $$i -lt 90 ]; do vault status > /dev/null 2>&1 && break; vault operator unseal $$(cat /run/secrets/vault_unseal_key 2>/dev/null) > /dev/null 2>&1 || true; sleep 2; i=$$(($$i+1)); done;
       wait $$VAULT_PID"
     networks:
       iklimco-net:
diff --git a/vault-bootstrap.sh b/vault-bootstrap.sh
index 789ca95..be0d762 100755
--- a/vault-bootstrap.sh
+++ b/vault-bootstrap.sh
@@ -56,6 +56,31 @@ run_vault() {
       sh -c "VAULT_ADDR=https://vault.iklim.co:8200 VAULT_SKIP_VERIFY=true $cmd"
   fi
 }
+
+# Run a vault CLI command targeting a specific node by its node_id (= STABLE_ID =
+# the api_addr hostname set inside the container). Used for direct per-peer unseal.
+run_vault_on() {
+  local node_id="$1"; shift   # first argument: target host; the remaining arguments form the command
+  local cmd="$*"
+  [ -n "$VAULT_TOKEN" ] && cmd="VAULT_TOKEN=$VAULT_TOKEN $cmd"   # forward the token only when one is set
+  docker run --rm -i --network iklimco-net hashicorp/vault:2.0.1 \
+    sh -c "VAULT_ADDR=https://${node_id}:8200 VAULT_SKIP_VERIFY=true $cmd"
+}
+
+# Send N requests (default 9) to the shared alias from ONE container; returns 0 only when ALL answer Sealed: false.
+# Fail closed: a sealed, unreachable or empty reply counts as a failure; the exit status is the failure count.
+check_cluster_unsealed() {
+  local n="${1:-9}"
+  docker run --rm --network iklimco-net hashicorp/vault:2.0.1 sh -c "
+    failed=0; i=0
+    while [ \$i -lt $n ]; do
+      s=\$(VAULT_ADDR=https://vault.iklim.co:8200 VAULT_SKIP_VERIFY=true vault status 2>/dev/null | awk '/^Sealed/{print \$2}')
+      [ \"\$s\" = 'false' ] || failed=\$((failed+1))
+      i=\$((i+1)); [ \$i -lt $n ] && sleep 1
+    done
+    exit \$failed
+  "
+}
 # ─────────────────────────────────────────────────────────────────────

 # ━━━ ADIM 0 — On kosullar ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -87,7 +112,10 @@ step "ADIM 3 — Vault cluster bekleniyor"
 wait_service_running "${STACK_NAME}_vault" 3 300
 sleep 10

-# ━━━ ADIM 4 — Vault durum kontrolu ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# ━━━ ADIM 4 — Vault durum kontrolu (erken cikis) ━━━━━━━━━━━━━━━━━━━
+# Early-exit requires the ENTIRE cluster to be unsealed. We fire N requests to
+# the shared alias (load-balanced) and all must return Sealed: false. A single
+# healthy node is not sufficient evidence that all 3 nodes are unsealed.
 step "ADIM 4 — Vault durum kontrolu"
 VAULT_STATUS_OUT=$(run_vault "vault status 2>/dev/null" || true)
 VAULT_INITIALIZED=$(echo "$VAULT_STATUS_OUT" | awk '/^Initialized/{print $2}')
@@ -95,12 +123,17 @@ VAULT_SEALED=$(echo "$VAULT_STATUS_OUT" | awk '/^Sealed/{print $2}')
 info "Initialized: ${VAULT_INITIALIZED:-unknown}, Sealed: ${VAULT_SEALED:-unknown}"

 if [ "$VAULT_INITIALIZED" = "true" ] && [ "$VAULT_SEALED" = "false" ]; then
-  ok "Vault zaten initialize edilmis ve unsealed"
-  echo
-  echo "════════════════════════════════════════════════"
-  echo " BOOTSTRAP TAMAMLANDI (Vault saglıklı)"
-  echo "════════════════════════════════════════════════"
-  exit 0
+  info "En az 1 node saglikli — cluster geneli kontrol ediliyor (9 istek)..."
+  if check_cluster_unsealed 9; then
+    ok "Vault cluster tamamen unsealed ve saglikli"
+    echo
+    echo "════════════════════════════════════════════════"
+    echo " BOOTSTRAP TAMAMLANDI (Vault saglıklı)"
+    echo "════════════════════════════════════════════════"
+    exit 0
+  else
+    info "Bazi node'lar hala sealed — bootstrap devam ediyor..."
+  fi
 fi

 # ━━━ ADIM 5 — Vault initialize (gerekirse) ━━━━━━━━━━━━━━━━━━━━━━━━━
@@ -137,22 +170,93 @@ echo "$UNSEAL_KEY" | docker secret create vault_unseal_key - >/dev/null
 docker service update --secret-add vault_unseal_key "${STACK_NAME}_vault" >/dev/null
 ok "vault_unseal_key gercek degerle guncellendi"

-# ━━━ ADIM 7 — Unseal dogrula ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-step "ADIM 7 — Vault unseal dogrulaniyor"
-info "Rolling restart tamamlanmasi ve unseal bekleniyor (30s)..."
-sleep 30
+# ━━━ ADIM 6b — Leader unseal ve peer node'lar ━━━━━━━━━━━━━━━━━━━━━━
+# After rolling restart:
+#   - The node that ran 'vault operator init' has Raft data; its entrypoint retry
+#     loop will unseal it and it becomes the Raft leader.
+#   - Peer nodes start with EMPTY Raft storage. They cannot unseal until they join
+#     the Raft cluster (chicken-and-egg). The entrypoint retry loop keeps trying
+#     every 2s; once they join Raft they become Initialized=true and the next
+#     unseal attempt succeeds.
+#   - We also try to unseal peers explicitly by node_id (= STABLE_ID = api_addr
+#     hostname). This requires the node_id to be resolvable on the overlay network.
+#     If it is not, the explicit attempt is silently skipped and the entrypoint
+#     retry loop handles it instead (worst case: ~60s extra wait).
+step "ADIM 6b — Raft leader bekleniyor ve peer node'lar unsealing"
+info "Rolling restart sonrasi Raft leader unseal bekleniyor (max 3 dakika)..."

-UNSEALED=0
-for i in $(seq 1 12); do
+LEADER_UP=0
+for i in $(seq 1 36); do
   STATUS=$(run_vault "vault status 2>/dev/null" | awk '/^Sealed/{print $2}' || echo "true")
   if [ "$STATUS" = "false" ]; then
-    ok "Vault cluster unsealed"
+    ok "Raft leader unsealed"
+    LEADER_UP=1
+    break
+  fi
+  echo " ${i}/36 — Sealed: ${STATUS:-unknown}, 5s bekleniyor..."
+  sleep 5
+done
+[ "$LEADER_UP" -eq 1 ] || fail "Raft leader 3 dakika icinde unseal olmadi"
+
+ROOT_TOKEN=$(awk '/^Initial Root Token:/{print $NF}' "$MAIN_INIT_FILE")
+[ -n "$ROOT_TOKEN" ] || fail "Root token '$MAIN_INIT_FILE' dosyasinda bulunamadi"
+VAULT_TOKEN="$ROOT_TOKEN"
+
+# Wait for all peers to join the Raft cluster (retry_join retries every ~30s).
+info "Raft cluster olusmasi bekleniyor (3 peer, max 3 dakika)..."
+ALL_JOINED=0
+for i in $(seq 1 36); do
+  PEER_COUNT=$(run_vault "vault operator raft list-peers 2>/dev/null" \
+    | awk 'NR>2 && /[a-zA-Z0-9]/{c++} END{print c+0}' || true)
+  if [ "${PEER_COUNT:-0}" -ge 3 ]; then
+    ok "Raft cluster tam: ${PEER_COUNT}/3 peer"
+    ALL_JOINED=1
+    break
+  fi
+  echo " ${i}/36 — Raft peers: ${PEER_COUNT:-0}/3, 5s bekleniyor..."
+  sleep 5
+done
+[ "$ALL_JOINED" -eq 1 ] || fail "Raft cluster 3 dakika icinde tam olusmadi"
+
+# Explicitly unseal each non-leader peer via its node_id on the overlay network.
+# node_id equals STABLE_ID (the api_addr hostname configured in vault-template-v2.json).
+# Best-effort: if the hostname is not resolvable, the entrypoint retry loop handles it. Peers = rows whose State column is not "leader".
+info "Peer node'lar individually unsealing (best-effort)..."
+PEER_HOSTS=$(run_vault "vault operator raft list-peers 2>/dev/null" \
+  | awk 'NR>2 && /[a-zA-Z0-9]/ && $3!="leader"{print $1}' || true)
+for peer_host in $PEER_HOSTS; do
+  info " Unsealing peer: $peer_host"
+  if run_vault_on "$peer_host" "vault operator unseal $UNSEAL_KEY" > /dev/null 2>&1; then
+    ok " $peer_host: unseal komutu gonderildi"
+  else
+    info " $peer_host: direct unseal basarisiz (overlay DNS resolve edilemedi — entrypoint loop devam ediyor)"
+  fi
+done
+
+# ━━━ ADIM 7 — Tum node'lar unsealed mi? ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# Fire 9 requests to the shared alias with 1s sleep between each. With 3 nodes
+# and any reasonable load-balancing the probability of hitting all 3 is very high.
+# All 9 must return Sealed: false. We retry for up to 4 minutes to give the
+# entrypoint retry loop time to finish for nodes that joined Raft late.
+step "ADIM 7 — Vault cluster tam unseal dogrulaniyor"
+info "Entrypoint retry loop tamamlanmasi bekleniyor (max 4 dakika)..."
+
+UNSEALED=0
+for i in $(seq 1 24); do
+  if check_cluster_unsealed 9; then
+    ok "Vault cluster tamamen unsealed (9/9 kontrol basarili)"
     UNSEALED=1
     break
   fi
-  [ "$i" -eq 12 ] && break
-  echo " ${i}/12 — Sealed: $STATUS, retrying in 5s..."
-  sleep 5
+  echo " ${i}/24 — Cluster henuz tam saglikli degil, 10s bekleniyor..."
+  # Re-attempt explicit peer unseal on every iteration in case hostname became
+  # resolvable after Raft catch-up (containers may still be starting up).
+  PEER_HOSTS=$(run_vault "vault operator raft list-peers 2>/dev/null" \
+    | awk 'NR>2 && /[a-zA-Z0-9]/ && $3!="leader"{print $1}' || true)
+  for peer_host in $PEER_HOSTS; do
+    run_vault_on "$peer_host" "vault operator unseal $UNSEAL_KEY" > /dev/null 2>&1 || true
+  done
+  sleep 10
 done
 [ "$UNSEALED" -eq 1 ] || fail "Vault cluster unseal olmadi — 'docker service logs ${STACK_NAME}_vault' ile loglari kontrol edin"