Root cause: Docker Swarm assigns a new random container ID as $HOSTNAME on every
task restart, making node_id, api_addr, and cluster_addr change with each restart.
Vault could not recognize its own Raft data → cluster never reformed after restart.
Fixes:
- docker-stack-vault.yml: add hostname: "vault-{{.Task.Slot}}.iklim.co" so each
replica gets a stable, slot-based hostname covered by the *.iklim.co wildcard cert.
Replace STABLE_ID/NODE_ID_PLACEHOLDER logic with a single HOSTNAME_PLACEHOLDER sed.
Replace single unseal attempt with a retry loop (90×2s) so peer nodes unseal as
soon as they join Raft, without needing external intervention.
- vault-bootstrap.sh: add ADIM 6b — after rolling restart, wait for Raft leader to
unseal, wait for all peers to join Raft (vault operator raft list-peers), then
attempt explicit per-peer unseal via overlay network (best-effort).
ADIM 4 early-exit now fires N requests to the shared alias; all must return
Sealed: false before declaring the cluster healthy.
ADIM 7 polls up to 4 minutes via check_cluster_unsealed (9 shared-alias requests)
and retries peer unseal on each iteration.
- deploy-prod.yml: health check now fires 9 requests to the shared alias; all must
return Sealed: false (single-node check was masking partially-sealed clusters).
97 lines
4.2 KiB
YAML
97 lines
4.2 KiB
YAML
version: '3.8'

services:
  # -------------------------------------------------------------------------
  # VAULT CLUSTER (3 NODES - RAFT HIGH AVAILABILITY)
  # -------------------------------------------------------------------------
  # Bootstrap: vault_unseal_key is auto-managed by vault-bootstrap.sh.
  # The script creates a placeholder on first deploy and replaces it with the
  # real Shamir unseal key after 'vault operator init'.
  vault:
    image: hashicorp/vault:2.0.1
    cap_add:
      - IPC_LOCK
    # The hostname is derived from the service slot number (stable across restarts)
    # so that node_id, api_addr, and cluster_addr stay the same after every
    # container restart. vault-N.iklim.co is covered by the *.iklim.co wildcard
    # cert, so TLS keeps working.
    hostname: "vault-{{.Task.Slot}}.iklim.co"
    entrypoint: ["sh", "-c"]
    # Startup wrapper, passed to `sh -c` as one quoted argument. `$$` is the Compose
    # escape for a literal `$`; the folded scalar joins the lines with spaces.
    #   1. Renders /vault/config/vault.json into RAM (/dev/shm/vault.json), replacing
    #      HOSTNAME_PLACEHOLDER with $HOSTNAME (vault-N.iklim.co).
    #   2. Starts the vault server in the background and records its PID.
    #   3. Registers a TERM/INT trap that forwards TERM to Vault, waits for it, and
    #      then exits the wrapper with Vault's status. Without the explicit exit the
    #      script would resume the loops below against a stopped Vault.
    #   4. Polls `vault status` up to 15 times, 2s apart; exit code 1 = not yet ready,
    #      0 or 2 = Vault is responding.
    #   5. Retry-unseal loop: up to 90 attempts, 2s apart (about 3 min). On initial
    #      bootstrap peers have empty Raft storage and cannot unseal until they join
    #      the cluster; the loop keeps retrying so they unseal as soon as the Raft
    #      join succeeds. Unseal failures are deliberately ignored (best effort).
    #      Both loops stop early when the Vault process is no longer running, so a
    #      crashed server is restarted by Swarm without waiting out the timers.
    #   6. Waits for Vault to exit and propagates its exit code to Docker.
    command: >
      "sed \"s/HOSTNAME_PLACEHOLDER/$$HOSTNAME/g\" /vault/config/vault.json > /dev/shm/vault.json;
      vault server -config=/dev/shm/vault.json &
      VAULT_PID=$$!;
      trap 'kill -TERM $$VAULT_PID; wait $$VAULT_PID; exit $$?' TERM INT;
      export VAULT_ADDR='https://127.0.0.1:8200' VAULT_SKIP_VERIFY='true';
      for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do kill -0 $$VAULT_PID 2>/dev/null || break; vault status > /dev/null 2>&1; [ $$? -ne 1 ] && break; sleep 2; done;
      i=0; while [ $$i -lt 90 ] && kill -0 $$VAULT_PID 2>/dev/null; do vault status > /dev/null 2>&1 && break; vault operator unseal $$(cat /run/secrets/vault_unseal_key 2>/dev/null) > /dev/null 2>&1 || true; sleep 2; i=$$((i+1)); done;
      wait $$VAULT_PID"
    networks:
      iklimco-net:
        aliases:
          # Shared alias that resolves to the Vault replicas on the overlay network.
          - vault.iklim.co
    environment:
      VAULT_ADDR: "https://127.0.0.1:8200"
      # PHASE 1 (Bootstrap): keep this "true" so the Vault CLI can operate on
      # self-signed/temporary certificates.
      # PHASE 2 (Strict SSL): set this to "false" once the configuration is switched to v2.
      # NOTE(review): vault_template_v2 is already mounted below while this is still
      # "true", and the startup command exports VAULT_SKIP_VERIFY='true' itself for
      # its loopback (127.0.0.1) calls - confirm which phase is intended here.
      VAULT_SKIP_VERIFY: "true"
    volumes:
      # Persistent volume for primary Raft storage
      - vault-data-vl:/vault/file
      # Persistent volume for audit and operational logs
      - vault-logs-vl:/vault/logs
      # Read-only mount of the host directory where the real wildcard certificates are managed
      - /opt/iklimco/ssl:/vault/certs:ro
    configs:
      # PHASE 1: deploy with 'vault_template_v1' (TLS validation skipped for easy initialization)
      # PHASE 2: update this source to 'vault_template_v2' to enforce strict wildcard SSL verification
      - source: vault_template_v2
        target: /vault/config/vault.json
        # Octal file mode (read-only for everyone); Compose expects an integer here,
        # so it is intentionally left unquoted.
        mode: 0444
    secrets:
      - vault_unseal_key
    deploy:
      mode: replicated
      replicas: 3
      placement:
        # High Availability: each Vault node runs on a separate physical Swarm worker
        max_replicas_per_node: 1
        constraints:
          - node.labels.type == service
      restart_policy:
        condition: any
        delay: 5s

configs:
  # Phase 1 template: bootstrap / initial discovery (no TLS verification).
  vault_template_v1:
    file: './vault-template-v1.json'

  # Phase 2 template: production safe (strict wildcard SSL enforcement).
  vault_template_v2:
    file: './vault-template-v2.json'

volumes:
  # Named volumes with default settings; the vault service mounts them at
  # /vault/file (Raft storage) and /vault/logs (audit and operational logs).
  vault-data-vl: {}
  vault-logs-vl: {}

secrets:
  # Not created by this stack: vault-bootstrap.sh manages it, writing a placeholder
  # on first deploy and swapping in the real unseal key after init.
  vault_unseal_key:
    external: true

networks:
  # Pre-existing network; it must be created before this stack is deployed.
  iklimco-net:
    external: true