Root cause: Docker Swarm assigns a new random container ID as $HOSTNAME on every
task restart, making node_id, api_addr, and cluster_addr change with each restart.
Vault could not recognize its own Raft data → cluster never reformed after restart.
Fixes:
- docker-stack-vault.yml: add hostname: "vault-{{.Task.Slot}}.iklim.co" so each
replica gets a stable, slot-based hostname covered by the *.iklim.co wildcard cert.
Replace STABLE_ID/NODE_ID_PLACEHOLDER logic with a single HOSTNAME_PLACEHOLDER sed.
Replace single unseal attempt with a retry loop (90×2s) so peer nodes unseal as
soon as they join Raft, without needing external intervention.
- vault-bootstrap.sh: add ADIM 6b — after rolling restart, wait for Raft leader to
unseal, wait for all peers to join Raft (vault operator raft list-peers), then
attempt explicit per-peer unseal via overlay network (best-effort).
ADIM 4 early-exit now fires N requests to the shared alias; all must return
Sealed: false before declaring the cluster healthy.
ADIM 7 polls up to 4 minutes via check_cluster_unsealed (9 shared-alias requests)
and retries peer unseal on each iteration.
- deploy-prod.yml: health check now fires 9 requests to the shared alias; all must
return Sealed: false (single-node check was masking partially-sealed clusters).
271 lines
14 KiB
Bash
Executable File
271 lines
14 KiB
Bash
Executable File
#!/bin/bash
# vault-bootstrap.sh — Vault HA cluster bootstrap (Shamir seal, Docker secret)
# Node-agnostic: uses docker exec for local replicas, falls back to the overlay
# network via docker run when no local replica is found on this Swarm manager.
#
# Environment:
#   SKIP_DEPLOY  set to "true" to skip 'docker stack deploy' (default: false)
set -euo pipefail

# ─── Configuration ───────────────────────────────────────────────────
STACK_NAME="iklimco"
# The stack file is expected next to this script.
STACK_FILE="$(cd "$(dirname "$0")" && pwd)/docker-stack-vault.yml"
OUT_DIR="/tmp/vault-bootstrap"
SKIP_DEPLOY="${SKIP_DEPLOY:-false}"
# ─────────────────────────────────────────────────────────────────────

# The init output stored below contains the unseal key and the root token, so
# everything this script creates must be readable by the invoking user only.
umask 077
mkdir -p "$OUT_DIR"
chmod 700 "$OUT_DIR"
MAIN_INIT_FILE="$OUT_DIR/main-vault-init.txt"

# ─── Logging ─────────────────────────────────────────────────────────
# step  — print a timestamped banner for a new bootstrap phase (stdout).
# ok    — print a success line (stdout).
# info  — print a progress line (stdout).
# fail  — print an error to stderr and terminate the script with status 1.
step() {
  echo
  echo "════════════════════════════════════════════════"
  echo " [$(date '+%H:%M:%S')] $*"
  echo "════════════════════════════════════════════════"
}
ok() { echo " [OK] $*"; }
info() { echo " --> $*"; }
# The blank separator line goes to stderr together with the message so that
# stdout stays free of diagnostics.
fail() {
  {
    echo
    echo " [HATA] $*"
  } >&2
  exit 1
}
# Report the line number of any command that fails unexpectedly.
trap '{ echo; echo " [HATA] Script satir $LINENO'"'"'de beklenmedik sekilde sonlandi"; } >&2' ERR
# ─────────────────────────────────────────────────────────────────────

# ─── Helpers ─────────────────────────────────────────────────────────
# Block until a Swarm service has at least the expected number of tasks whose
# current state starts with "Running".
# Arguments:
#   $1 - service name
#   $2 - number of running tasks required
#   $3 - timeout in seconds (default 180); polled every 5 seconds
# Returns: 0 once enough tasks are running; calls fail() on timeout.
wait_service_running() {
  local svc="$1" expected="$2" timeout="${3:-180}" elapsed=0
  local running
  info "Bekleniyor: $svc ($expected running task)..."
  while [ "$elapsed" -lt "$timeout" ]; do
    # grep -c prints 0 and exits non-zero when nothing matches; '|| true' keeps
    # that from aborting the script under 'set -e' / pipefail.
    running=$(docker service ps "$svc" \
      --filter "desired-state=running" \
      --format '{{.CurrentState}}' 2>/dev/null \
      | grep -c "^Running" || true)
    if [ "$running" -ge "$expected" ]; then
      ok "$svc hazir: $running/$expected"
      return 0
    fi
    sleep 5
    elapsed=$((elapsed + 5))
    echo " ${elapsed}s/${timeout}s — running: $running/$expected"
  done
  fail "$svc $timeout saniye icinde hazir olmadi"
}

# Run a vault CLI command — uses docker exec if a vault replica is on this node,
# otherwise falls back to the overlay network via docker run.
# Globals:   STACK_NAME (read), VAULT_TOKEN (read; prepended to the command
#            when non-empty)
# Arguments: the command line, joined into a single string and evaluated by
#            'sh -c' inside the container (so callers may include redirections)
# Outputs:   whatever the vault command prints
# Returns:   exit status of docker exec / docker run
VAULT_TOKEN=""
run_vault() {
  local cmd="$*"
  if [ -n "$VAULT_TOKEN" ]; then
    cmd="VAULT_TOKEN=$VAULT_TOKEN $cmd"
  fi
  local cid
  # First local container whose name matches "<stack>_vault."; empty if none.
  cid=$(docker ps -q -f "name=${STACK_NAME}_vault\." | head -1 || true)
  if [ -n "$cid" ]; then
    # Local replica: talk to it over loopback inside its own container.
    docker exec -i "$cid" sh -c "VAULT_ADDR=https://127.0.0.1:8200 VAULT_SKIP_VERIFY=true $cmd"
  else
    # No local replica: use a throwaway client container on the overlay
    # network and address the cluster through the shared alias.
    docker run --rm -i --network iklimco-net hashicorp/vault:2.0.1 \
      sh -c "VAULT_ADDR=https://vault.iklim.co:8200 VAULT_SKIP_VERIFY=true $cmd"
  fi
}

# Run a vault CLI command against one specific node. Used for direct per-peer
# unseal.
# NOTE(review): the node id is used verbatim as the host part of VAULT_ADDR, so
# it must be a hostname resolvable on the overlay network (presumably the
# node's stable api_addr hostname — confirm against the stack file).
# Globals:   VAULT_TOKEN (read; prepended to the command when non-empty)
# Arguments: $1 - node id / hostname; remaining args - command line, joined
#            into a single string and evaluated by 'sh -c' in the container
# Returns:   exit status of docker run
run_vault_on() {
  local node_id="$1"
  shift
  local cmd="$*"
  if [ -n "$VAULT_TOKEN" ]; then
    cmd="VAULT_TOKEN=$VAULT_TOKEN $cmd"
  fi
  docker run --rm -i --network iklimco-net hashicorp/vault:2.0.1 \
    sh -c "VAULT_ADDR=https://${node_id}:8200 VAULT_SKIP_VERIFY=true $cmd"
}

# Send N requests to the shared alias; returns 0 only when ALL return Sealed: false.
# Runs everything inside a single docker container to avoid 9 separate startups.
# Arguments: $1 - number of requests (default 9); one second pause between them
# Returns:   the number of requests that did NOT report "Sealed false"
#            (0 = every request saw an unsealed node); non-zero as well when
#            the client container itself cannot be started.
check_cluster_unsealed() {
  local n="${1:-9}"
  # The script below runs in the container's sh, which has no pipefail: the
  # pipeline status is awk's, so a failed 'vault status' simply yields an empty
  # string. Anything other than the literal "false" (sealed, unreachable, DNS
  # failure, unparsable output) must therefore count as "not unsealed".
  docker run --rm --network iklimco-net hashicorp/vault:2.0.1 sh -c "
    sealed=0; i=0
    while [ \$i -lt $n ]; do
      s=\$(VAULT_ADDR=https://vault.iklim.co:8200 VAULT_SKIP_VERIFY=true vault status 2>/dev/null | awk '/^Sealed/{print \$2}')
      [ \"\$s\" = 'false' ] || sealed=\$((sealed+1))
      i=\$((i+1)); [ \$i -lt $n ] && sleep 1
    done
    exit \$sealed
  "
}
# ─────────────────────────────────────────────────────────────────────

# ━━━ STEP 0 — Prerequisites ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# 'docker node ls' only succeeds on a Swarm manager; the stack file must exist.
step "ADIM 0 — On kosullar kontrol ediliyor"
docker node ls &>/dev/null || fail "Swarm manager node gerekli"
[ -f "$STACK_FILE" ] || fail "Stack dosyasi bulunamadi: $STACK_FILE"
ok "On kosullar tamam"

# ━━━ STEP 1 — Placeholder secret ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# The vault service mounts the vault_unseal_key secret, so it must exist before
# the stack is deployed. A placeholder value is created here when it is absent.
step "ADIM 1 — vault_unseal_key kontrol ediliyor"
# Capture the listing first and match the whole line: a prefix match would
# treat e.g. "vault_unseal_key_old" as the real secret, and 'grep -q' inside a
# pipeline can make the pipeline fail under pipefail when it exits early.
EXISTING_SECRETS=$(docker secret ls --format '{{.Name}}')
if grep -Fxq 'vault_unseal_key' <<<"$EXISTING_SECRETS"; then
  info "vault_unseal_key mevcut, atlaniyor"
else
  echo "bootstrap" | docker secret create vault_unseal_key - >/dev/null
  ok "vault_unseal_key (placeholder) olusturuldu"
fi

# ━━━ STEP 2 — Stack deploy ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Deploys the stack file unless the caller set SKIP_DEPLOY=true.
step "ADIM 2 — Stack deploy"
if [ "$SKIP_DEPLOY" = "true" ]; then
  info "SKIP_DEPLOY=true — atlaniyor"
else
  docker stack deploy --with-registry-auth -c "$STACK_FILE" "$STACK_NAME"
  ok "Stack deploy edildi"
fi

# ━━━ STEP 3 — Wait for the Vault cluster ━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Wait up to 300s for 3 running tasks, then pause 10s before the first
# status query.
step "ADIM 3 — Vault cluster bekleniyor"
wait_service_running "${STACK_NAME}_vault" 3 300
sleep 10

# ━━━ STEP 4 — Vault status check (early exit) ━━━━━━━━━━━━━━━━━━━━━━
# Early-exit requires the ENTIRE cluster to be unsealed. We fire N requests to
# the shared alias (load-balanced) and all must return Sealed: false. A single
# healthy node is not sufficient evidence that all 3 nodes are unsealed.
step "ADIM 4 — Vault durum kontrolu"
# 'vault status' exits non-zero for a sealed node, hence '|| true'; the fields
# stay empty when no status could be read at all.
VAULT_STATUS_OUT=$(run_vault "vault status 2>/dev/null" || true)
VAULT_INITIALIZED=$(echo "$VAULT_STATUS_OUT" | awk '/^Initialized/{print $2}')
VAULT_SEALED=$(echo "$VAULT_STATUS_OUT" | awk '/^Sealed/{print $2}')
info "Initialized: ${VAULT_INITIALIZED:-unknown}, Sealed: ${VAULT_SEALED:-unknown}"

if [ "$VAULT_INITIALIZED" = "true" ] && [ "$VAULT_SEALED" = "false" ]; then
  info "En az 1 node saglikli — cluster geneli kontrol ediliyor (9 istek)..."
  if check_cluster_unsealed 9; then
    ok "Vault cluster tamamen unsealed ve saglikli"
    echo
    echo "════════════════════════════════════════════════"
    echo " BOOTSTRAP TAMAMLANDI (Vault saglıklı)"
    echo "════════════════════════════════════════════════"
    exit 0
  else
    info "Bazi node'lar hala sealed — bootstrap devam ediyor..."
  fi
fi

# ━━━ STEP 5 — Initialize Vault (if needed) ━━━━━━━━━━━━━━━━━━━━━━━━━
step "ADIM 5 — Vault initialize / unseal key hazirlaniyor"
if [ "$VAULT_INITIALIZED" = "true" ]; then
  # Vault is sealed but initialized. This happens when the vault_unseal_key Docker secret
  # contains the wrong value (e.g., placeholder was never replaced). Provide the init file
  # so the real key can be extracted and pushed to the secret.
  info "Vault sealed ama initialize edilmis — mevcut init dosyasi kullanilacak"
  if [ ! -f "$MAIN_INIT_FILE" ] || ! grep -q "Unseal Key 1" "$MAIN_INIT_FILE"; then
    # Real newlines: echo (used by fail) does not expand "\n" escapes.
    fail "Init dosyasi eksik: $MAIN_INIT_FILE
Unseal Key'i manuel olarak su formatta dosyaya ekleyin:
 Unseal Key 1: <gercek-key>"
  fi
  ok "Init dosyasi mevcut"
else
  info "Vault initialize ediliyor..."
  # Write to a temporary file and move it into place only after init succeeded.
  # Piping straight into "$MAIN_INIT_FILE" would truncate an existing init file
  # (possibly the only copy of the unseal key) even when init fails, e.g.
  # because Vault was merely unreachable during the status check.
  # NOTE: tee also echoes the unseal key and root token to stdout, as before.
  INIT_TMP=$(mktemp "$OUT_DIR/main-vault-init.XXXXXX")
  if run_vault "vault operator init -key-shares=1 -key-threshold=1" | tee "$INIT_TMP"; then
    mv -f "$INIT_TMP" "$MAIN_INIT_FILE"
  else
    rm -f "$INIT_TMP"
    fail "Vault init basarisiz — onceki init dosyasi degistirilmedi: $MAIN_INIT_FILE"
  fi
  ok "Vault init tamamlandi: $MAIN_INIT_FILE"
fi

# ━━━ STEP 6 — Update the vault_unseal_key Docker secret ━━━━━━━━━━━━
# Two-step update (delete + recreate with the same name) keeps the secret name
# consistent with the stack file so future 'docker stack deploy' runs do not
# trigger a service restart or revert to the placeholder.
step "ADIM 6 — vault_unseal_key Docker secret guncelleniyor"
# Last field of the "Unseal Key 1:" line of the init output.
UNSEAL_KEY=$(awk '/Unseal Key 1:/{print $NF}' "$MAIN_INIT_FILE")
[ -n "$UNSEAL_KEY" ] || fail "Unseal key '$MAIN_INIT_FILE' dosyasinda bulunamadi"

# A secret cannot be removed while a service still references it, so detach it
# from the service first (this causes the first rolling restart).
info "Eski secret servis uzerinden kaldiriliyor (rolling restart 1/2)..."
docker service update --secret-rm vault_unseal_key "${STACK_NAME}_vault" >/dev/null
sleep 5
# Best-effort removal; a failure here surfaces in the 'secret create' below.
docker secret rm vault_unseal_key || true

info "Gercek unseal key ile secret yeniden olusturuluyor (rolling restart 2/2)..."
echo "$UNSEAL_KEY" | docker secret create vault_unseal_key - >/dev/null
docker service update --secret-add vault_unseal_key "${STACK_NAME}_vault" >/dev/null
ok "vault_unseal_key gercek degerle guncellendi"

# ━━━ STEP 6b — Leader unseal and peer nodes ━━━━━━━━━━━━━━━━━━━━━━━━
# After rolling restart:
# - The node that ran 'vault operator init' has Raft data; its entrypoint retry
#   loop will unseal it and it becomes the Raft leader.
# - Peer nodes start with EMPTY Raft storage. They cannot unseal until they join
#   the Raft cluster (chicken-and-egg). The entrypoint retry loop keeps trying
#   every 2s; once they join Raft they become Initialized=true and the next
#   unseal attempt succeeds.
# - We also try to unseal peers explicitly by node_id, used as the hostname.
#   This requires the node_id to be resolvable on the overlay network.
#   If it is not, the explicit attempt is silently skipped and the entrypoint
#   retry loop handles it instead (worst case: ~60s extra wait).
step "ADIM 6b — Raft leader bekleniyor ve peer node'lar unsealing"
info "Rolling restart sonrasi Raft leader unseal bekleniyor (max 3 dakika)..."

LEADER_UP=0
for i in $(seq 1 36); do
  # 'vault status' exits non-zero while sealed, which fails the pipeline under
  # pipefail even though awk already printed the value. '|| true' only swallows
  # that status; an '|| echo' fallback would append a second value here.
  STATUS=$(run_vault "vault status 2>/dev/null" | awk '/^Sealed/{print $2}' || true)
  if [ "$STATUS" = "false" ]; then
    ok "Raft leader unsealed"
    LEADER_UP=1
    break
  fi
  echo " ${i}/36 — Sealed: ${STATUS:-unknown}, 5s bekleniyor..."
  sleep 5
done
[ "$LEADER_UP" -eq 1 ] || fail "Raft leader 3 dakika icinde unseal olmadi"

# From here on run_vault / run_vault_on send the root token with every command
# ('raft list-peers' is issued with it).
ROOT_TOKEN=$(awk '/^Initial Root Token:/{print $NF}' "$MAIN_INIT_FILE")
[ -n "$ROOT_TOKEN" ] || fail "Root token '$MAIN_INIT_FILE' dosyasinda bulunamadi"
VAULT_TOKEN="$ROOT_TOKEN"

# Wait for all peers to join the Raft cluster (retry_join retries every ~30s).
info "Raft cluster olusmasi bekleniyor (3 peer, max 3 dakika)..."
ALL_JOINED=0
for i in $(seq 1 36); do
  # Count the non-empty rows after the two header lines of the peer table.
  PEER_COUNT=$(run_vault "vault operator raft list-peers 2>/dev/null" \
    | awk 'NR>2 && /[a-zA-Z0-9]/{c++} END{print c+0}' || true)
  if [ "${PEER_COUNT:-0}" -ge 3 ]; then
    ok "Raft cluster tam: ${PEER_COUNT}/3 peer"
    ALL_JOINED=1
    break
  fi
  echo " ${i}/36 — Raft peers: ${PEER_COUNT:-0}/3, 5s bekleniyor..."
  sleep 5
done
[ "$ALL_JOINED" -eq 1 ] || fail "Raft cluster 3 dakika icinde tam olusmaadi"

# Explicitly unseal each non-leader peer via its node_id on the overlay network.
# The first column of the peer table is used as the hostname.
# Best-effort: if the hostname is not resolvable, the entrypoint retry loop handles it.
info "Peer node'lar individually unsealing (best-effort)..."
PEER_HOSTS=$(run_vault "vault operator raft list-peers 2>/dev/null" \
  | awk 'NR>2 && /[a-zA-Z0-9]/ && !/leader/{print $1}' || true)
# Unquoted on purpose: one word per peer.
for peer_host in $PEER_HOSTS; do
  info " Unsealing peer: $peer_host"
  if run_vault_on "$peer_host" "vault operator unseal $UNSEAL_KEY" > /dev/null 2>&1; then
    ok " $peer_host: unseal komutu gonderildi"
  else
    info " $peer_host: direct unseal basarisiz (overlay DNS resolve edilemedi — entrypoint loop devam ediyor)"
  fi
done

# ━━━ STEP 7 — Are all nodes unsealed? ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Fire 9 requests to the shared alias with 1s sleep between each. With 3 nodes
# and any reasonable load-balancing the probability of hitting all 3 is very high.
# All 9 must return Sealed: false. We make up to 24 attempts with a 10s pause
# between them (4 minutes of pauses, plus roughly 8s of probing per attempt) to
# give the entrypoint retry loop time to finish for nodes that joined Raft late.
step "ADIM 7 — Vault cluster tam unseal dogrulaniyor"
info "Entrypoint retry loop tamamlanmasi bekleniyor (max 4 dakika)..."

UNSEALED=0
for i in $(seq 1 24); do
  if check_cluster_unsealed 9; then
    ok "Vault cluster tamamen unsealed (9/9 kontrol basarili)"
    UNSEALED=1
    break
  fi
  echo " ${i}/24 — Cluster henuz tam saglikli degil, 10s bekleniyor..."
  # Re-attempt explicit peer unseal on every iteration in case hostname became
  # resolvable after Raft catch-up (containers may still be starting up).
  PEER_HOSTS=$(run_vault "vault operator raft list-peers 2>/dev/null" \
    | awk 'NR>2 && /[a-zA-Z0-9]/ && !/leader/{print $1}' || true)
  # Unquoted on purpose: one word per peer. Failures are ignored (best-effort).
  for peer_host in $PEER_HOSTS; do
    run_vault_on "$peer_host" "vault operator unseal $UNSEAL_KEY" > /dev/null 2>&1 || true
  done
  sleep 10
done

[ "$UNSEALED" -eq 1 ] || fail "Vault cluster unseal olmadi — 'docker service logs ${STACK_NAME}_vault' ile loglari kontrol edin"

# Final banner: reminds the operator to back up the init output and remove it
# from the production host.
echo
echo "════════════════════════════════════════════════"
echo " BOOTSTRAP TAMAMLANDI"
echo " Init cikti: $MAIN_INIT_FILE"
echo " ONEMLI: Bu dosyayi guvenli yere yedekle ve"
echo " produksiyon ortamindan sil!"
echo "════════════════════════════════════════════════"