VaultTest/vault-bootstrap.sh
Murat ÖZDEMİR 483bd40cc4 docs(vault): Document Shamir HA unseal and localize bootstrap script
Updates README.md with the new Shamir-based auto-unseal process using Docker secrets,
eliminating the need for a separate transit vault.

Adds `failover_scenarios.md` to detail the Vault cluster's resilience under
various failure conditions.

Translates `vault-bootstrap.sh` messages and step titles from Turkish to English,
and aligns its execution flow with the updated documentation.
2026-06-10 19:04:48 +03:00

271 lines
14 KiB
Bash
Executable File

#!/bin/bash
# vault-bootstrap.sh — Vault HA cluster bootstrap (Shamir seal, Docker secret)
# Node-agnostic: uses docker exec for local replicas, falls back to the overlay
# network via docker run when no local replica is found on this Swarm manager.
set -euo pipefail
# ─── Configuration ───────────────────────────────────────────────────
# Name passed to 'docker stack deploy'; also the prefix of the vault service name.
STACK_NAME="iklimco"
# The stack file is looked up next to this script, independent of the caller's cwd.
STACK_FILE="$(cd "$(dirname -- "$0")" && pwd)/docker-stack-vault.yml"
# Fixed path (not mktemp) on purpose: a re-run must be able to find the init
# file written by an earlier run.
OUT_DIR="/tmp/vault-bootstrap"
# Export SKIP_DEPLOY=true to skip the 'docker stack deploy' step.
SKIP_DEPLOY="${SKIP_DEPLOY:-false}"
# ─────────────────────────────────────────────────────────────────────
# The init file kept in OUT_DIR contains the unseal key and the root token, so
# the directory must not be readable by other users of this host. chmod also
# fails (and we abort) if the directory was pre-created by a different user.
mkdir -p "$OUT_DIR"
chmod 700 "$OUT_DIR" || {
  echo " [ERROR] Cannot restrict permissions on $OUT_DIR (owned by another user?)" >&2
  exit 1
}
MAIN_INIT_FILE="$OUT_DIR/main-vault-init.txt"
# ─── Logging ─────────────────────────────────────────────────────────
# Print a blank line followed by a timestamped banner announcing a step.
# Arguments: $* - step title
step() {
  local bar="════════════════════════════════════════════════"
  echo
  echo "$bar"
  echo " [$(date '+%H:%M:%S')] $*"
  echo "$bar"
}

# Report a completed action on stdout.
ok() { echo " [OK] $*"; }

# Report progress / informational detail on stdout.
info() { echo " --> $*"; }

# Print an error report and terminate the script with exit status 1.
# The complete report, including the blank separator line, is written to
# stderr so that stdout stays clean when it is redirected or captured.
# Arguments: $* - error message
fail() {
  echo >&2
  echo " [ERROR] $*" >&2
  exit 1
}

# Report the line of any command whose failure is not handled explicitly.
# The trap only prints; terminating the script is left to 'set -e'.
trap 'echo >&2; echo " [ERROR] Script terminated unexpectedly at line $LINENO" >&2' ERR
# ─────────────────────────────────────────────────────────────────────
# ─── Helpers ─────────────────────────────────────────────────────────
# Block until a Swarm service reports enough running tasks.
# Arguments:
#   $1 - service name
#   $2 - number of tasks that must be in the "Running" state
#   $3 - timeout in seconds (default: 180)
# Outputs:  progress lines on stdout
# Returns:  0 once enough tasks are running; otherwise calls fail(), which
#           terminates the script.
wait_service_running() {
  local svc="$1" expected="$2" timeout="${3:-180}" elapsed=0
  # Keep the counter local so it does not leak into the script's globals.
  local running
  info "Waiting for: $svc ($expected running task)..."
  while [ "$elapsed" -lt "$timeout" ]; do
    # grep -c exits 1 when it counts zero lines (it still prints "0"), and the
    # docker call may fail while the service is being created; '|| true' keeps
    # both cases from aborting the script under 'set -e' / pipefail.
    running=$(docker service ps "$svc" \
        --filter "desired-state=running" \
        --format '{{.CurrentState}}' 2>/dev/null \
      | grep -c "^Running" || true)
    running=${running:-0}
    if [ "$running" -ge "$expected" ]; then
      ok "$svc ready: $running/$expected"
      return 0
    fi
    sleep 5
    elapsed=$((elapsed + 5))
    echo " ${elapsed}s/${timeout}s — running: $running/$expected"
  done
  fail "$svc did not become ready within $timeout seconds"
}
# Run a vault CLI command — uses docker exec if a vault replica container is
# found on this node, otherwise falls back to the overlay network via docker run.
#
# Globals:   STACK_NAME (read), VAULT_TOKEN (read)
# Arguments: $* - shell command line, executed with 'sh -c' inside the container
# Outputs:   stdout/stderr of the command
# Returns:   exit status of the docker exec / docker run invocation
#
# VAULT_ADDR and VAULT_SKIP_VERIFY are handed over with '-e' instead of being
# prepended to the command string. The token is passed BY NAME ('-e VAULT_TOKEN'):
# docker then reads the value from its own environment, so the token neither
# shows up in process listings nor gets re-parsed by the shell in the container.
VAULT_TOKEN=""
run_vault() {
  local cmd="$*"
  local cid
  local -a env_args=(-e "VAULT_SKIP_VERIFY=true")
  # Only forward the token once one has been set; before that the CLI must run
  # without VAULT_TOKEN, exactly as if the variable did not exist.
  if [ -n "$VAULT_TOKEN" ]; then
    env_args+=(-e VAULT_TOKEN)
  fi
  # First local container whose name matches "<stack>_vault."; empty when this
  # node hosts no replica.
  cid=$(docker ps -q -f "name=${STACK_NAME}_vault\." | head -1 || true)
  if [ -n "$cid" ]; then
    VAULT_TOKEN="$VAULT_TOKEN" docker exec -i \
      -e "VAULT_ADDR=https://127.0.0.1:8200" "${env_args[@]}" \
      "$cid" sh -c "$cmd"
  else
    VAULT_TOKEN="$VAULT_TOKEN" docker run --rm -i --network iklimco-net \
      -e "VAULT_ADDR=https://vault.iklim.co:8200" "${env_args[@]}" \
      hashicorp/vault:2.0.1 sh -c "$cmd"
  fi
}
# Run a vault CLI command targeting a specific node by its node_id (= STABLE_ID =
# the api_addr hostname set inside the container). Used for direct per-peer unseal.
#
# Globals:   VAULT_TOKEN (read)
# Arguments: $1 - node_id, used as the hostname in VAULT_ADDR
#            remaining - shell command line, executed with 'sh -c'
# Returns:   exit status of the docker run invocation
#
# Settings travel as '-e' variables rather than as a prefix of the command
# string; the token is passed by name so that its value stays out of the
# process arguments and is not re-parsed by the shell inside the container.
run_vault_on() {
  local node_id="$1"; shift
  local cmd="$*"
  local -a env_args=(
    -e "VAULT_ADDR=https://${node_id}:8200"
    -e "VAULT_SKIP_VERIFY=true"
  )
  # Forward the token only when one is set.
  if [ -n "$VAULT_TOKEN" ]; then
    env_args+=(-e VAULT_TOKEN)
  fi
  VAULT_TOKEN="$VAULT_TOKEN" docker run --rm -i --network iklimco-net \
    "${env_args[@]}" \
    hashicorp/vault:2.0.1 sh -c "$cmd"
}
# Send N requests to the shared alias; returns 0 only when ALL return Sealed: false.
# Runs everything inside a single docker container to avoid N separate startups.
#
# Arguments: $1 - number of status requests (default: 9)
# Returns:   number of requests that did NOT report "Sealed false"
#            (0 = every probe hit an unsealed node); a failure of docker run
#            itself also yields a non-zero status.
#
# A probe counts as sealed unless the answer is literally "false". The shell in
# the container has no pipefail, so a failed or unreachable 'vault status' just
# produces an empty value — that must not be mistaken for a healthy node.
check_cluster_unsealed() {
  local n="${1:-9}"
  # The script is single-quoted (nothing is expanded by the outer shell);
  # the request count is handed over as positional parameter $1.
  docker run --rm --network iklimco-net hashicorp/vault:2.0.1 sh -c '
    n=$1
    sealed=0
    i=0
    while [ "$i" -lt "$n" ]; do
      s=$(VAULT_ADDR=https://vault.iklim.co:8200 VAULT_SKIP_VERIFY=true vault status 2>/dev/null \
        | awk "/^Sealed/{print \$2}")
      [ "$s" = "false" ] || sealed=$((sealed+1))
      i=$((i+1))
      # pause between probes, but not after the last one
      if [ "$i" -lt "$n" ]; then sleep 1; fi
    done
    exit "$sealed"
  ' sh "$n"
}
# ─────────────────────────────────────────────────────────────────────
# ━━━ STEP 0 — Prerequisites ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
step "STEP 0 — Checking prerequisites"
# A working 'docker node ls' is taken as the sign that this host can act as a
# Swarm manager; its output is irrelevant, only the exit status is used.
if ! docker node ls >/dev/null 2>&1; then
  fail "Swarm manager node is required"
fi
# The stack definition must exist as a regular file before anything is deployed.
if [ ! -f "$STACK_FILE" ]; then
  fail "Stack file not found: $STACK_FILE"
fi
ok "Prerequisites completed"
# ━━━ STEP 1 — Placeholder secret ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
step "STEP 1 — Checking vault_unseal_key"
# List first, match afterwards:
#  - 'grep -q' inside a pipeline may exit before docker has finished writing,
#    and under pipefail the resulting SIGPIPE would turn a match into a failure;
#  - the match is exact (-F -x), so a secret such as 'vault_unseal_key_old'
#    is not mistaken for the real one.
EXISTING_SECRETS=$(docker secret ls --format '{{.Name}}') \
  || fail "Could not list Docker secrets"
if grep -Fxq 'vault_unseal_key' <<<"$EXISTING_SECRETS"; then
  info "vault_unseal_key exists, skipping"
else
  # Placeholder value only; STEP 6 replaces it with the real unseal key.
  echo "bootstrap" | docker secret create vault_unseal_key - >/dev/null
  ok "vault_unseal_key (placeholder) created"
fi
# ━━━ STEP 2 — Stack deploy ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
step "STEP 2 — Stack deploy"
# Only the exact value "true" disables the deployment; anything else deploys.
case "$SKIP_DEPLOY" in
  true)
    info "SKIP_DEPLOY=true — skipping"
    ;;
  *)
    docker stack deploy --with-registry-auth -c "$STACK_FILE" "$STACK_NAME"
    ok "Stack deployed"
    ;;
esac
# ━━━ STEP 3 — Waiting for Vault cluster ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
step "STEP 3 — Waiting for Vault cluster"
# Require 3 tasks of the vault service in the Running state, waiting up to 300s.
wait_service_running \
  "${STACK_NAME}_vault" \
  3 \
  300
# Fixed extra pause before the first status query
# (presumably gives the Vault processes time to start listening — TODO confirm).
sleep 10
# ━━━ STEP 4 — Vault status check (early exit) ━━━━━━━━━━━━━━━━━━━
# Early-exit requires the ENTIRE cluster to be unsealed. We fire N requests to
# the shared alias (load-balanced) and all must return Sealed: false. A single
# healthy node is not sufficient evidence that all 3 nodes are unsealed.
step "STEP 4 — Vault status check"
# A failing status command is tolerated here; the fields parsed from its output
# simply stay empty and are reported as "unknown".
VAULT_STATUS_OUT=$(run_vault "vault status 2>/dev/null" || true)
VAULT_INITIALIZED=$(awk '/^Initialized/{print $2}' <<<"$VAULT_STATUS_OUT")
VAULT_SEALED=$(awk '/^Sealed/{print $2}' <<<"$VAULT_STATUS_OUT")
info "Initialized: ${VAULT_INITIALIZED:-unknown}, Sealed: ${VAULT_SEALED:-unknown}"
if [[ "$VAULT_INITIALIZED" == "true" && "$VAULT_SEALED" == "false" ]]; then
  info "At least 1 node healthy — checking cluster-wide (9 requests)..."
  if ! check_cluster_unsealed 9; then
    # Not every probe came back unsealed: carry on with the remaining steps.
    info "Some nodes are still sealed — bootstrap continuing..."
  else
    # Nothing left to do — report success and stop right here.
    ok "Vault cluster fully unsealed and healthy"
    printf '%s\n' \
      "" \
      "════════════════════════════════════════════════" \
      " BOOTSTRAP COMPLETED (Vault healthy)" \
      "════════════════════════════════════════════════"
    exit 0
  fi
fi
# ━━━ STEP 5 — Vault initialize (if needed) ━━━━━━━━━━━━━━━━━━━━━━━━━
step "STEP 5 — Initializing Vault / preparing unseal key"
if [ "$VAULT_INITIALIZED" = "true" ]; then
  # Vault is sealed but initialized. This happens when the vault_unseal_key Docker secret
  # contains the wrong value (e.g., placeholder was never replaced). Provide the init file
  # so the real key can be extracted and pushed to the secret.
  info "Vault is sealed but initialized — using existing init file"
  if [ ! -f "$MAIN_INIT_FILE" ] || ! grep -q "Unseal Key 1" "$MAIN_INIT_FILE"; then
    # Real line breaks: echo does not interpret "\n", so the hint would
    # otherwise be printed as one line with literal backslash-n sequences.
    fail "Init file missing: $MAIN_INIT_FILE
 Manually add the Unseal Key to the file in this format:
   Unseal Key 1: <real-key>
 (a later step additionally reads a line 'Initial Root Token: <root-token>')"
  fi
  ok "Init file exists"
else
  info "Initializing Vault..."
  # Never write straight into MAIN_INIT_FILE: tee truncates its target before
  # the command has produced anything, so a failing init (for instance when the
  # status query above could not tell that Vault is already initialized) would
  # wipe the only stored copy of the unseal key. Collect the output in a
  # temporary file (mktemp creates it readable by the owner only) and move it
  # into place after the init succeeded and produced a key.
  INIT_TMP_FILE=$(mktemp "${MAIN_INIT_FILE}.XXXXXX") \
    || fail "Could not create a temporary file next to $MAIN_INIT_FILE"
  if run_vault "vault operator init -key-shares=1 -key-threshold=1" | tee "$INIT_TMP_FILE" \
    && grep -q "Unseal Key 1" "$INIT_TMP_FILE"; then
    mv -f "$INIT_TMP_FILE" "$MAIN_INIT_FILE"
  else
    rm -f "$INIT_TMP_FILE"
    fail "vault operator init failed — an existing $MAIN_INIT_FILE was left untouched"
  fi
  ok "Vault init completed: $MAIN_INIT_FILE"
fi
# ━━━ STEP 6 — Update vault_unseal_key Docker secret ━━━━━━━━━━━━━━
# Two-step update (delete + recreate with the same name) keeps the secret name
# consistent with the stack file so future 'docker stack deploy' runs do not
# trigger a service restart or revert to the placeholder.
step "STEP 6 — Updating vault_unseal_key Docker secret"
# The key is the last field of the "Unseal Key 1:" line of the init file.
UNSEAL_KEY=$(awk '/Unseal Key 1:/{print $NF}' "$MAIN_INIT_FILE")
if [ -z "$UNSEAL_KEY" ]; then
  fail "Unseal key not found in '$MAIN_INIT_FILE' file"
fi

# Phase 1: detach the secret from the service, then delete it.
info "Removing old secret from service (rolling restart 1/2)..."
docker service update \
  --secret-rm vault_unseal_key \
  "${STACK_NAME}_vault" >/dev/null
sleep 5
# A failed removal is deliberately ignored here
# (presumably to cover the case where the secret does not exist — TODO confirm).
docker secret rm vault_unseal_key || true

# Phase 2: create the secret again under the same name and attach it.
# The key is fed through stdin, so it never appears as a command argument.
info "Recreating secret with real unseal key (rolling restart 2/2)..."
printf '%s\n' "$UNSEAL_KEY" | docker secret create vault_unseal_key - >/dev/null
docker service update \
  --secret-add vault_unseal_key \
  "${STACK_NAME}_vault" >/dev/null
ok "vault_unseal_key updated with real value"
# ━━━ STEP 6b — Leader unseal and peer nodes ━━━━━━━━━━━━━━━━━━━━━━
# After rolling restart:
# - The node that ran 'vault operator init' has Raft data; its entrypoint retry
# loop will unseal it and it becomes the Raft leader.
# - Peer nodes start with EMPTY Raft storage. They cannot unseal until they join
# the Raft cluster (chicken-and-egg). The entrypoint retry loop keeps trying
# every 2s; once they join Raft they become Initialized=true and the next
# unseal attempt succeeds.
# - We also try to unseal peers explicitly by node_id (= STABLE_ID = api_addr
# hostname). This requires the node_id to be resolvable on the overlay network.
# If it is not, the explicit attempt is silently skipped and the entrypoint
# retry loop handles it instead (worst case: ~60s extra wait).
step "STEP 6b — Waiting for Raft leader and unsealing peer nodes"
info "Waiting for Raft leader unseal after rolling restart (max 3 minutes)..."
LEADER_UP=0
for i in $(seq 1 36); do
  # Capture the status output first and parse it afterwards. 'vault status'
  # exits non-zero while the node is sealed or unreachable; combining the
  # pipeline with '|| echo true' under pipefail appended a second "true" to the
  # value awk had already printed.
  STATUS_OUT=$(run_vault "vault status 2>/dev/null" || true)
  STATUS=$(awk '/^Sealed/{print $2; exit}' <<<"$STATUS_OUT")
  # No usable answer is treated as "still sealed".
  STATUS=${STATUS:-true}
  if [ "$STATUS" = "false" ]; then
    ok "Raft leader unsealed"
    LEADER_UP=1
    break
  fi
  echo " ${i}/36 — Sealed: ${STATUS}, waiting 5s..."
  sleep 5
done
[ "$LEADER_UP" -eq 1 ] || fail "Raft leader did not unseal within 3 minutes"

# From here on the vault helpers authenticate with the root token taken from
# the init file.
ROOT_TOKEN=$(awk '/^Initial Root Token:/{print $NF}' "$MAIN_INIT_FILE")
[ -n "$ROOT_TOKEN" ] || fail "Root token not found in '$MAIN_INIT_FILE' file"
VAULT_TOKEN="$ROOT_TOKEN"

# Wait for all peers to join the Raft cluster (retry_join retries every ~30s).
info "Waiting for Raft cluster formation (3 peers, max 3 minutes)..."
ALL_JOINED=0
for i in $(seq 1 36); do
  # The first two output lines are the table header; every further non-blank
  # line is counted as one peer. A failed query simply counts as 0 peers.
  PEERS_OUT=$(run_vault "vault operator raft list-peers 2>/dev/null" || true)
  PEER_COUNT=$(awk 'NR>2 && /[a-zA-Z0-9]/{c++} END{print c+0}' <<<"$PEERS_OUT")
  if [ "${PEER_COUNT:-0}" -ge 3 ]; then
    ok "Raft cluster complete: ${PEER_COUNT}/3 peers"
    ALL_JOINED=1
    break
  fi
  echo " ${i}/36 — Raft peers: ${PEER_COUNT:-0}/3, waiting 5s..."
  sleep 5
done
[ "$ALL_JOINED" -eq 1 ] || fail "Raft cluster did not form within 3 minutes"

# Explicitly unseal each non-leader peer via its node_id on the overlay network.
# node_id equals STABLE_ID (the api_addr hostname configured in vault-template-v2.json).
# Best-effort: if the hostname is not resolvable, the entrypoint retry loop handles it.
info "Unsealing peer nodes individually (best-effort)..."
# Select peers by the State column (3rd field) instead of searching the whole
# line for "leader", which would also drop a follower whose node id or address
# happens to contain that word.
PEERS_OUT=$(run_vault "vault operator raft list-peers 2>/dev/null" || true)
PEER_HOSTS=$(awk 'NR>2 && /[a-zA-Z0-9]/ && $3 != "leader"{print $1}' <<<"$PEERS_OUT")
# Word-splitting of PEER_HOSTS is intended (one node id per word). Do not turn
# this into 'while read': run_vault_on starts docker with -i, which would
# consume the loop's stdin.
for peer_host in $PEER_HOSTS; do
  info " Unsealing peer: $peer_host"
  if run_vault_on "$peer_host" "vault operator unseal $UNSEAL_KEY" > /dev/null 2>&1; then
    ok " $peer_host: unseal command sent"
  else
    info " $peer_host: direct unseal failed (overlay DNS could not be resolved — entrypoint loop continuing)"
  fi
done
# ━━━ STEP 7 — Are all nodes unsealed? ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Fire 9 requests to the shared alias with 1s sleep between each. With 3 nodes
# and any reasonable load-balancing the probability of hitting all 3 is very high.
# All 9 must return Sealed: false. We retry for up to 4 minutes to give the
# entrypoint retry loop time to finish for nodes that joined Raft late.
step "STEP 7 — Verifying full Vault cluster unseal"
info "Waiting for entrypoint retry loop completion (max 4 minutes)..."
UNSEALED=0
# Up to 24 rounds. Each round sleeps 10s; the 9-probe check adds its own run
# time on top (it pauses 1s between probes), so the loop can take longer in
# wall-clock time than the 4 minutes of pure sleeping.
for i in $(seq 1 24); do
  if check_cluster_unsealed 9; then
    ok "Vault cluster fully unsealed (9/9 checks successful)"
    UNSEALED=1
    break
  fi
  echo " ${i}/24 — Cluster not fully healthy yet, waiting 10s..."
  # Re-attempt explicit peer unseal on every iteration in case hostname became
  # resolvable after Raft catch-up (containers may still be starting up).
  # Peers are selected by the State column (3rd field) rather than by searching
  # the whole line for "leader", so a follower whose node id or address
  # contains that word is not skipped. A failed query yields no peers.
  PEERS_OUT=$(run_vault "vault operator raft list-peers 2>/dev/null" || true)
  PEER_HOSTS=$(awk 'NR>2 && /[a-zA-Z0-9]/ && $3 != "leader"{print $1}' <<<"$PEERS_OUT")
  # Intentional word-splitting: one node id per word. Failures are ignored
  # because this is only a best-effort nudge.
  for peer_host in $PEER_HOSTS; do
    run_vault_on "$peer_host" "vault operator unseal $UNSEAL_KEY" > /dev/null 2>&1 || true
  done
  sleep 10
done
[ "$UNSEALED" -eq 1 ] || fail "Vault cluster did not unseal — check logs with 'docker service logs ${STACK_NAME}_vault'"
echo
echo "════════════════════════════════════════════════"
echo " BOOTSTRAP COMPLETED"
echo " Init output: $MAIN_INIT_FILE"
echo " IMPORTANT: Back up this file to a safe place and"
echo " delete it from the production environment!"
echo "════════════════════════════════════════════════"