VaultTest/docker-stack-vault.yml
Murat ÖZDEMİR 392a015b8d fix(vault): Stable Raft cluster formation and reliable multi-node unseal on Docker Swarm
Root cause: Docker Swarm assigns a new random container ID as $HOSTNAME on every
task restart, making node_id, api_addr, and cluster_addr change with each restart.
Vault could not recognize its own Raft data → cluster never reformed after restart.

Fixes:
- docker-stack-vault.yml: add hostname: "vault-{{.Task.Slot}}.iklim.co" so each
  replica gets a stable, slot-based hostname covered by the *.iklim.co wildcard cert.
  Replace STABLE_ID/NODE_ID_PLACEHOLDER logic with a single HOSTNAME_PLACEHOLDER sed.
  Replace single unseal attempt with a retry loop (90×2s) so peer nodes unseal as
  soon as they join Raft, without needing external intervention.
- vault-bootstrap.sh: add ADIM 6b — after rolling restart, wait for Raft leader to
  unseal, wait for all peers to join Raft (vault operator raft list-peers), then
  attempt explicit per-peer unseal via overlay network (best-effort).
  ADIM 4 early-exit now fires N requests to the shared alias; all must return
  Sealed: false before declaring the cluster healthy.
  ADIM 7 polls up to 4 minutes via check_cluster_unsealed (9 shared-alias requests)
  and retries peer unseal on each iteration.
- deploy-prod.yml: health check now fires 9 requests to the shared alias; all must
  return Sealed: false (single-node check was masking partially-sealed clusters).
2026-06-10 18:17:59 +03:00

97 lines
4.2 KiB
YAML

version: '3.8'
services:
  # -------------------------------------------------------------------------
  # VAULT CLUSTER (3 NODES - RAFT HIGH AVAILABILITY)
  # -------------------------------------------------------------------------
  # Bootstrap: vault_unseal_key is auto-managed by vault-bootstrap.sh.
  # The script creates a placeholder on first deploy and replaces it with the
  # real Shamir unseal key after 'vault operator init'.
  vault:
    image: hashicorp/vault:2.0.1
    cap_add:
      - IPC_LOCK
    # hostname uses the service slot number (stable across restarts) so that
    # node_id, api_addr, and cluster_addr remain consistent after every
    # container restart.
    # vault-N.iklim.co is covered by the *.iklim.co wildcard cert (TLS works).
    hostname: "vault-{{.Task.Slot}}.iklim.co"
    entrypoint: ["sh", "-c"]
    # The command is one double-quoted string handed to `sh -c`. `$$` is the
    # Compose escape for a literal `$`. Every script line below must keep the
    # SAME indentation: the folded scalar joins equally-indented lines with
    # single spaces.
    #
    # 1. Substitutes HOSTNAME_PLACEHOLDER with $HOSTNAME (vault-N.iklim.co) and
    #    writes the rendered config to RAM (/dev/shm).
    # 2. Starts vault server in the background.
    # 3. Registers a SIGTERM/SIGINT trap: forwards TERM to vault, waits for it,
    #    then EXITS with vault's status. Without the explicit exit the script
    #    would resume the polling loops against a dead vault after the trap.
    # 4. Polls vault status for up to ~30s; exit code 1 = not yet ready,
    #    0 or 2 = vault is responding. Stops early if the vault process died.
    # 5. Retry-unseal loop: attempts unseal every 2s for up to 3 min.
    #    On initial bootstrap peers have empty Raft storage and cannot unseal
    #    until they join the cluster; the loop keeps retrying so they unseal as
    #    soon as Raft join succeeds. The loop also ends as soon as the vault
    #    process is gone, so a crashed vault is restarted by Swarm right away.
    # 6. Waits for vault to exit and propagates its exit code to Docker.
    command: >
      "sed \"s/HOSTNAME_PLACEHOLDER/$$HOSTNAME/g\" /vault/config/vault.json > /dev/shm/vault.json;
      vault server -config=/dev/shm/vault.json &
      VAULT_PID=$$!;
      trap 'kill -TERM $$VAULT_PID 2>/dev/null; wait $$VAULT_PID; exit $$?' TERM INT;
      export VAULT_ADDR='https://127.0.0.1:8200' VAULT_SKIP_VERIFY='true';
      for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
      kill -0 $$VAULT_PID 2>/dev/null || break;
      vault status > /dev/null 2>&1;
      [ $$? -ne 1 ] && break;
      sleep 2;
      done;
      i=0;
      while [ $$i -lt 90 ] && kill -0 $$VAULT_PID 2>/dev/null; do
      vault status > /dev/null 2>&1 && break;
      vault operator unseal $$(cat /run/secrets/vault_unseal_key 2>/dev/null) > /dev/null 2>&1 || true;
      sleep 2;
      i=$$(($$i+1));
      done;
      wait $$VAULT_PID"
    networks:
      iklimco-net:
        aliases:
          # Shared alias: resolves to the vault replicas on the overlay network.
          - vault.iklim.co
    environment:
      VAULT_ADDR: "https://127.0.0.1:8200"
      # PHASE 1 (Bootstrap): Keep it "true" so Vault CLI can operate on
      # self-signed/temporary certificates.
      # PHASE 2 (Strict SSL): set this to "false" once you switch your
      # configuration to v2.
      # NOTE(review): the config source below is already vault_template_v2
      # while this is still "true", and the command above re-exports
      # VAULT_SKIP_VERIFY='true' for its own CLI calls - confirm this is the
      # intended end state.
      VAULT_SKIP_VERIFY: "true"
    volumes:
      # Persistent volume for primary Raft storage
      - vault-data-vl:/vault/file
      # Persistent volume for audit and operational logs
      - vault-logs-vl:/vault/logs
      # Read-only mount of the host directory where the real wildcard
      # certificates are managed
      - /opt/iklimco/ssl:/vault/certs:ro
    configs:
      # PHASE 1: Deploy with 'vault_template_v1' (TLS validation skipped for
      # easy initialization)
      # PHASE 2: Update this source to 'vault_template_v2' to enforce strict
      # wildcard SSL verification
      - source: vault_template_v2
        target: /vault/config/vault.json
        # Octal file mode: read-only for owner, group and others.
        mode: 0444
    secrets:
      - vault_unseal_key
    deploy:
      mode: replicated
      replicas: 3
      placement:
        # High Availability: each Vault node runs on a separate physical Swarm
        # worker
        max_replicas_per_node: 1
        constraints:
          - node.labels.type == service
      restart_policy:
        condition: any
        delay: 5s
configs:
  # Phase 1 template - bootstrap / initial discovery, no TLS verification.
  vault_template_v1:
    file: './vault-template-v1.json'
  # Phase 2 template - production safe, strict wildcard SSL enforcement.
  vault_template_v2:
    file: './vault-template-v2.json'
volumes:
  # Named volumes with default settings; mounted by the vault service at
  # /vault/file and /vault/logs respectively.
  vault-data-vl: {}
  vault-logs-vl: {}
secrets:
  # Created outside this stack by vault-bootstrap.sh: a placeholder on first
  # deploy, replaced with the real unseal key after init.
  vault_unseal_key:
    external: true
networks:
  # Pre-existing network, not created by this stack (external).
  iklimco-net:
    external: true