feat(health-agent): add monitors.yml with env-aware node IP mapping from Ansible inventory

2026-06-25 18:59:14 +03:00 · 2026-06-25 18:59:14 +03:00 · f742bfdd11
commit f742bfdd11
parent a2e8997711
19 changed files with 1131 additions and 0 deletions
--- a/health-agent/.env.example
+++ b/health-agent/.env.example
@ -0,0 +1,10 @@
+ENV=prod
+CLUSTER_SIZE_ETCD=3
+CLUSTER_SIZE_PATRONI=3
+CLUSTER_SIZE_MONGODB=3
+CLUSTER_SIZE_RABBITMQ=3
+CLUSTER_SIZE_VAULT=3
+REDIS_MODE=sentinel
+EXTERNAL_DOMAIN=iklim.co
+EXTERNAL_SUBDOMAIN_SUFFIX=
+UK_PUSH_URL_BASE=https://status.iklim.co/api/push
--- a/health-agent/.env.setup.example
+++ b/health-agent/.env.setup.example
@ -0,0 +1,5 @@
+UK_URL=http://uptime-kuma:3001
+UK_API_KEY=your_api_key_here
+UK_SLACK_WEBHOOK_HIGH=https://hooks.slack.com/services/...
+UK_SLACK_WEBHOOK_MEDIUM=https://hooks.slack.com/services/...
+UK_SLACK_WEBHOOK_LOW=https://hooks.slack.com/services/...
--- a/health-agent/Dockerfile
+++ b/health-agent/Dockerfile
@ -0,0 +1,16 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
+
+COPY pyproject.toml ./
+COPY src/ ./src/
+RUN pip install --no-cache-dir .
+ENV PYTHONPATH=/app/src
+
+RUN useradd -m appuser
+# Keeping as root to be able to access /var/run/docker.sock cleanly, unless specifically configured with groups.
+# USER appuser
+
+CMD ["python", "src/health_agent/main.py"]
--- a/health-agent/config/monitors.yml
+++ b/health-agent/config/monitors.yml
@ -0,0 +1,196 @@
+version: "1"
+project: "iklim"
+domain:
+  base: "iklim.co"
+nodes:
+  prod:
+    service:
+      - name: iklim-app-01
+        ip: "178.104.210.41"
+      - name: iklim-app-02
+        ip: "178.105.69.1"
+      - name: iklim-app-03
+        ip: "178.104.219.3"
+    db:
+      - name: iklim-db-01
+        ip: "159.69.117.158"
+      - name: iklim-db-02
+        ip: "178.104.219.162"
+      - name: iklim-db-03
+        ip: "159.69.115.105"
+  test:
+    service:
+      - name: iklim-app-01
+        ip: "167.235.194.61"
+    db:
+      - name: iklim-db-01
+        ip: "167.235.205.93"
+tags:
+  - external
+  - internal
+  - high
+  - medium
+  - low
+  - database
+  - gateway
+  - infrastructure
+  - observability
+notifications:
+  slack-high:
+    type: slack
+    webhook_env: UK_SLACK_WEBHOOK_HIGH
+  slack-medium:
+    type: slack
+    webhook_env: UK_SLACK_WEBHOOK_MEDIUM
+  slack-low:
+    type: slack
+    webhook_env: UK_SLACK_WEBHOOK_LOW
+groups:
+  - name: "Altyapı"
+    status_page: "iklim-{env}-ops"
+    notifications: [slack-high]
+    tags: [internal, infrastructure]
+    children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS]
+  - name: "Veri Katmanı"
+    status_page: "iklim-{env}-ops"
+    notifications: [slack-high]
+    tags: [internal, database]
+    children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET]
+  - name: "Gateway & Mesajlaşma"
+    status_page: "iklim-{env}-ops"
+    notifications: [slack-high]
+    tags: [internal, gateway]
+    children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL]
+  - name: "Dış Erişilebilirlik - Kritik"
+    status_page: "iklim-{env}-ops"
+    notifications: [slack-high]
+    tags: [external, high]
+    children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03]
+  - name: "Dış Erişilebilirlik - Genel"
+    status_page: "iklim-{env}-ops"
+    notifications: [slack-medium]
+    tags: [external, medium]
+    children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03]
+  - name: "Gözlemlenebilirlik"
+    status_page: "iklim-{env}-tools"
+    notifications: [slack-low]
+    tags: [internal, observability]
+    children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW]
+push_monitors:
+  - name: SWARM-CLUSTER
+    interval: 60
+    heartbeat_retries: 1
+    tags: [internal, infrastructure, high]
+    restart_threshold: 1
+  - name: VAULT-CLUSTER
+    interval: 60
+    heartbeat_retries: 1
+    tags: [internal, infrastructure, high]
+    restart_threshold: 1
+  - name: ETCD-CLUSTER
+    interval: 60
+    heartbeat_retries: 1
+    tags: [internal, database, high]
+    restart_threshold: 1
+  - name: PATRONI-CLUSTER
+    interval: 60
+    heartbeat_retries: 1
+    tags: [internal, database, high]
+    restart_threshold: 1
+  - name: MONGODB-REPLICASET
+    interval: 120
+    heartbeat_retries: 1
+    tags: [internal, database, high]
+    restart_threshold: 1
+  - name: APISIX-GATEWAY
+    interval: 60
+    heartbeat_retries: 1
+    tags: [internal, gateway, high]
+    restart_threshold: 1
+  - name: RABBITMQ-CLUSTER
+    interval: 60
+    heartbeat_retries: 1
+    tags: [internal, gateway, medium]
+    restart_threshold: 3
+  - name: REDIS-SENTINEL
+    interval: 60
+    heartbeat_retries: 1
+    tags: [internal, database, medium]
+    restart_threshold: 3
+  - name: SWAG-TLS
+    interval: 3600
+    heartbeat_retries: 1
+    tags: [internal, infrastructure, medium]
+    restart_threshold: 3
+  - name: STORAGEBOX-MOUNT
+    interval: 300
+    heartbeat_retries: 1
+    tags: [internal, infrastructure, medium]
+    restart_threshold: 1
+  - name: PROMETHEUS
+    interval: 120
+    heartbeat_retries: 1
+    tags: [internal, observability, low]
+    restart_threshold: 5
+  - name: GRAFANA
+    interval: 120
+    heartbeat_retries: 1
+    tags: [internal, observability, low]
+    restart_threshold: 5
+  - name: PORTAINER
+    interval: 120
+    heartbeat_retries: 1
+    tags: [internal, observability, low]
+    restart_threshold: 5
+  - name: LOKI
+    interval: 120
+    heartbeat_retries: 1
+    tags: [internal, observability, low]
+    restart_threshold: 5
+http_monitors:
+  - name: EXT-HTTPS-API
+    url: "https://api{suffix}.{domain}/actuator/health"
+    accepted_statuscodes: ["200"]
+    interval: 60
+  - name: EXT-HTTPS-GRAFANA
+    url: "https://grafana{suffix}.{domain}/api/health"
+    accepted_statuscodes: ["200"]
+    interval: 60
+  - name: EXT-HTTPS-PORTAINER
+    url: "https://portainer{suffix}.{domain}"
+    accepted_statuscodes: ["200", "401", "403"]
+    interval: 120
+  - name: EXT-HTTPS-APIGW
+    url: "https://apigw{suffix}.{domain}"
+    accepted_statuscodes: ["200", "401", "403"]
+    interval: 120
+dns_monitors:
+  - name: EXT-DNS-API
+    hostname: "api{suffix}.{domain}"
+    dns_resolve_type: A
+    interval: 60
+  - name: EXT-DNS-ROOT
+    hostname: "{domain}"
+    dns_resolve_type: A
+    interval: 60
+ping_monitors:
+  interval: 60
+  max_retries: 1
+status_pages:
+  - slug: "iklim-{env}-status"
+    title: "iklim.co API Durumu"
+    public: true
+    groups: ["Dış Erişilebilirlik - Kritik"]
+  - slug: "iklim-{env}-ops"
+    title: "iklim.co [{env}] Altyapı"
+    public: false
+    groups:
+      - "Altyapı"
+      - "Veri Katmanı"
+      - "Gateway & Mesajlaşma"
+      - "Dış Erişilebilirlik - Kritik"
+      - "Dış Erişilebilirlik - Genel"
+  - slug: "iklim-{env}-tools"
+    title: "iklim.co [{env}] Araçlar"
+    public: false
+    groups: ["Gözlemlenebilirlik"]
--- a/health-agent/pyproject.toml
+++ b/health-agent/pyproject.toml
@ -0,0 +1,22 @@
+[project]
+name = "health-agent"
+version = "0.1.0"
+description = "iklim.co Monitoring Health Agent"
+requires-python = ">=3.12"
+dependencies = [
+    "requests",
+    "docker",
+    "python-dotenv",
+    "pyyaml",
+    "redis",
+    "pymongo",
+    "uptime-kuma-api",
+    "cryptography",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/health_agent"]
--- a/health-agent/scripts/setup_uptime_kuma.py
+++ b/health-agent/scripts/setup_uptime_kuma.py
@ -0,0 +1,138 @@
+import os
+import argparse
+import yaml
+import logging
+from uptime_kuma_api import UptimeKumaApi, MonitorType
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger("uk-setup")
+
+def format_str(text, env_name, project):
+    if not isinstance(text, str):
+        return text
+    return text.replace("{env}", env_name).replace("{project}", project)
+
+def setup_uptime_kuma(dry_run=False, only=None):
+    env_name = os.getenv("ENV", "test")
+    
+    config_path = os.path.join(os.path.dirname(__file__), "..", "config", "monitors.yml")
+    with open(config_path, "r") as f:
+        config = yaml.safe_load(f)
+        
+    project = config.get("project", "iklim")
+    
+    kuma_url = os.getenv("UK_URL", "http://localhost:3001")
+    kuma_user = os.getenv("UK_USER", "admin")
+    kuma_pass = os.getenv("UK_PASS", "admin")
+    
+    api = None
+    if not dry_run:
+        logger.info(f"Connecting to Uptime Kuma at {kuma_url}...")
+        try:
+            api = UptimeKumaApi(kuma_url)
+            api.login(kuma_user, kuma_pass)
+        except Exception as e:
+            logger.error(f"Login failed: {e}")
+            return
+            
+    existing_monitors = {}
+    if api:
+        try:
+            for m in api.get_monitors():
+                existing_monitors[m['name']] = m
+        except Exception as e:
+            logger.error(f"Failed to get monitors: {e}")
+            
+    # 1. Process Groups
+    group_map = {}
+    for g in config.get("groups", []):
+        raw_name = g["name"]
+        formatted_name = f"{project} [{env_name}] {raw_name}"
+        
+        logger.info(f"Processing group: {formatted_name}")
+        if not dry_run:
+            if formatted_name not in existing_monitors:
+                logger.info(f"Creating group monitor: {formatted_name}")
+                res = api.add_monitor(type=MonitorType.GROUP, name=formatted_name)
+                group_map[raw_name] = res['monitorID']
+            else:
+                group_map[raw_name] = existing_monitors[formatted_name]['id']
+
+    tokens = {}
+    
+    # 2. Push Monitors
+    for pm in config.get("push_monitors", []):
+        m_name = pm["name"]
+        if only and m_name != only:
+            continue
+            
+        m_interval = pm.get("interval", 60)
+        
+        parent_group_id = None
+        for g in config.get("groups", []):
+            if m_name in g.get("children", []):
+                parent_group_id = group_map.get(g["name"])
+                break
+                
+        logger.info(f"Processing push monitor: {m_name}")
+        if not dry_run:
+            if m_name in existing_monitors:
+                logger.info(f"Monitor {m_name} already exists.")
+                m_id = existing_monitors[m_name]['id']
+                token = existing_monitors[m_name]['pushToken']
+                tokens[m_name] = token
+                
+                if parent_group_id and existing_monitors[m_name].get('parent') != parent_group_id:
+                    api.edit_monitor(m_id, parent=parent_group_id)
+            else:
+                logger.info(f"Creating push monitor: {m_name}")
+                result = api.add_monitor(
+                    type=MonitorType.PUSH,
+                    name=m_name,
+                    interval=m_interval,
+                    parent=parent_group_id
+                )
+                m_id = result['monitorID']
+                
+                # Fetch again to get pushToken
+                for m in api.get_monitors():
+                    if m['id'] == m_id:
+                        tokens[m_name] = m['pushToken']
+                        break
+        else:
+            tokens[m_name] = "dummy_token_dry_run"
+            
+    # 3. Process Status Pages
+    for sp in config.get("status_pages", []):
+        slug = format_str(sp["slug"], env_name, project)
+        title = format_str(sp["title"], env_name, project)
+        logger.info(f"Processing status page: {title} (slug: {slug})")
+        if not dry_run:
+            try:
+                pages = api.get_status_pages()
+                exists = any(p['slug'] == slug for p in pages)
+                if not exists:
+                    logger.info(f"Creating status page: {slug}")
+                    api.add_status_page(slug, title)
+            except Exception as e:
+                logger.warning(f"Status page ops failed: {e}")
+                
+    # 4. Write tokens to uk_tokens.yml
+    token_file = os.path.join(os.path.dirname(__file__), "..", "config", "uk_tokens.yml")
+    if not dry_run:
+        with open(token_file, "w") as f:
+            yaml.dump(tokens, f)
+        logger.info(f"Saved push tokens to {token_file}")
+    else:
+        logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}")
+        
+    if api:
+        api.disconnect()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Setup Uptime Kuma monitors")
+    parser.add_argument("--dry-run", action="store_true", help="Print actions without making changes")
+    parser.add_argument("--only", type=str, help="Only process a specific monitor by name")
+    args = parser.parse_args()
+    
+    setup_uptime_kuma(dry_run=args.dry_run, only=args.only)
--- a/health-agent/src/health_agent/checks/filesystem.py
+++ b/health-agent/src/health_agent/checks/filesystem.py
@ -0,0 +1,36 @@
+import os
+import time
+import logging
+from health_agent.uptime_kuma import push
+
+logger = logging.getLogger(__name__)
+
+def check_storagebox_mount():
+    start_t = time.time()
+    
+    storagebox_path = os.getenv("STORAGEBOX_PATH", "/mnt/storagebox")
+    expected_files = [
+        "patroni/patroni.yml",
+        "ssl/STAR.iklim.co.full.crt"
+    ]
+    
+    missing_files = []
+    
+    if not os.path.exists(storagebox_path):
+        ping_ms = int((time.time() - start_t) * 1000)
+        push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms)
+        return
+        
+    for rel_path in expected_files:
+        full_path = os.path.join(storagebox_path, rel_path)
+        if not os.path.exists(full_path):
+            missing_files.append(rel_path)
+            
+    ping_ms = int((time.time() - start_t) * 1000)
+    
+    if missing_files:
+        msg = f"mount exists but missing: {', '.join(missing_files)}"
+        push("STORAGEBOX-MOUNT", "down", msg, ping_ms)
+    else:
+        msg = f"{storagebox_path} OK | all critical files present"
+        push("STORAGEBOX-MOUNT", "up", msg, ping_ms)
--- a/health-agent/src/health_agent/checks/http.py
+++ b/health-agent/src/health_agent/checks/http.py
@ -0,0 +1,196 @@
+import os
+import time
+import logging
+import requests
+from requests.auth import HTTPBasicAuth
+from health_agent.uptime_kuma import push
+import urllib3
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+logger = logging.getLogger(__name__)
+
+def http_check(url, expected_status=None, auth=None, verify_ssl=True, timeout=5, headers=None):
+    start_time = time.time()
+    try:
+        resp = requests.get(url, auth=auth, verify=verify_ssl, timeout=timeout, headers=headers)
+        ping_ms = int((time.time() - start_time) * 1000)
+        
+        if expected_status:
+            if isinstance(expected_status, list):
+                is_ok = resp.status_code in expected_status
+            else:
+                is_ok = resp.status_code == expected_status
+        else:
+            is_ok = resp.status_code < 400
+            
+        return is_ok, resp, ping_ms, None
+    except Exception as e:
+        ping_ms = int((time.time() - start_time) * 1000)
+        return False, None, ping_ms, str(e)
+
+def check_patroni_cluster():
+    nodes = ["patroni-01", "patroni-02", "patroni-03"]
+    cluster_data = None
+    error_msg = "All Patroni nodes unreachable"
+    start_t = time.time()
+    
+    for node in nodes:
+        url = f"http://{node}:8008/cluster"
+        ok, resp, _, err = http_check(url, timeout=3)
+        if ok and resp:
+            cluster_data = resp.json()
+            break
+        elif err:
+            error_msg = f"{node} error: {err}"
+            
+    ping_ms = int((time.time() - start_t) * 1000)
+    
+    if not cluster_data:
+        push("PATRONI-CLUSTER", "down", error_msg, ping_ms)
+        return
+        
+    members = cluster_data.get("members", [])
+    leader = None
+    replicas = []
+    
+    for m in members:
+        if m.get("role") == "leader":
+            leader = m.get("name")
+        else:
+            lag = m.get("lag", 0)
+            name = m.get("name")
+            state = m.get("state")
+            replicas.append((name, lag, state))
+            
+    if not leader:
+        down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
+        msg = f"no leader detected | " + " ".join(down_nodes)
+        push("PATRONI-CLUSTER", "down", msg, ping_ms)
+    else:
+        lag_strs = []
+        for name, lag, state in replicas:
+            lag_mb = lag / (1024*1024) if isinstance(lag, (int, float)) else 0
+            lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
+            
+        msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
+        push("PATRONI-CLUSTER", "up", msg, ping_ms)
+
+def check_rabbitmq_cluster():
+    url = "http://rabbitmq:15672/api/healthchecks/node"
+    user = os.getenv("RABBITMQ_USER", "guest")
+    password = os.getenv("RABBITMQ_PASS", "guest")
+    auth = HTTPBasicAuth(user, password)
+    
+    ok, resp, ping_ms, err = http_check(url, auth=auth)
+    
+    if ok:
+        ok2, resp2, _, _ = http_check("http://rabbitmq:15672/api/nodes", auth=auth)
+        nodes_running = 0
+        total_nodes = 3
+        
+        if ok2 and resp2:
+            data = resp2.json()
+            nodes_running = len([n for n in data if n.get("running")])
+            total_nodes = len(data)
+            
+            alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")]
+            if alarms:
+                msg = f"disk/mem alarm active on {','.join(alarms)}"
+                push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
+                return
+                
+        msg = f"{nodes_running}/{total_nodes} nodes running"
+        push("RABBITMQ-CLUSTER", "up", msg, ping_ms)
+    else:
+        msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}"
+        push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
+
+def check_apisix():
+    url = "http://apisix:9180/apisix/admin/routes"
+    api_key = os.getenv("APISIX_ADMIN_KEY", "")
+    headers = {"X-API-KEY": api_key} if api_key else {}
+    ok, resp, ping_ms, err = http_check(url, headers=headers)
+    
+    if ok:
+        push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms)
+    else:
+        push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
+
+def check_vault():
+    nodes = ["vault-1", "vault-2", "vault-3"]
+    domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
+    unsealed_count = 0
+    total = len(nodes)
+    max_ping = 0
+    errors = []
+    
+    start_t = time.time()
+    for node in nodes:
+        url = f"https://{node}.{domain}:8200/v1/sys/health"
+        ok, resp, ms, err = http_check(url, verify_ssl=False, expected_status=[200, 429, 473])
+        max_ping = max(max_ping, ms)
+        
+        if resp:
+            data = resp.json()
+            if not data.get("sealed"):
+                unsealed_count += 1
+            else:
+                errors.append(f"{node} SEALED")
+        else:
+            errors.append(f"{node} unreachable")
+            
+    ping_ms = int((time.time() - start_t) * 1000)
+    
+    if unsealed_count == total:
+        msg = f"{unsealed_count}/{total} unsealed"
+        push("VAULT-CLUSTER", "up", msg, ping_ms)
+    else:
+        msg = " | ".join(errors) if errors else "Vault checks failed"
+        push("VAULT-CLUSTER", "down", msg, ping_ms)
+
+def check_prometheus():
+    url = "http://prometheus:9090/-/healthy"
+    ok, resp, ping_ms, err = http_check(url)
+    if ok:
+        push("PROMETHEUS", "up", "healthy", ping_ms)
+    else:
+        push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms)
+
+def check_grafana():
+    url = "http://grafana:3000/api/health"
+    ok, resp, ping_ms, err = http_check(url)
+    if ok and resp:
+        data = resp.json()
+        db_status = data.get("database", "unknown")
+        if db_status == "ok":
+            push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms)
+        else:
+            push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms)
+    else:
+        push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms)
+
+def check_portainer():
+    url = "http://portainer:9000/api/system/status"
+    ok, resp, ping_ms, err = http_check(url)
+    if ok:
+        push("PORTAINER", "up", "running", ping_ms)
+    else:
+        push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms)
+
+def check_loki():
+    url = "http://loki:3100/ready"
+    ok, resp, ping_ms, err = http_check(url)
+    if ok:
+        push("LOKI", "up", "ready", ping_ms)
+    else:
+        push("LOKI", "down", f"loki unreachable: {err}", ping_ms)
+
+def run_all_http_checks():
+    check_patroni_cluster()
+    check_rabbitmq_cluster()
+    check_apisix()
+    check_vault()
+    check_prometheus()
+    check_grafana()
+    check_portainer()
+    check_loki()
--- a/health-agent/src/health_agent/checks/mongodb.py
+++ b/health-agent/src/health_agent/checks/mongodb.py
@ -0,0 +1,57 @@
+import os
+import time
+import logging
+from pymongo import MongoClient
+from health_agent.uptime_kuma import push
+
+logger = logging.getLogger(__name__)
+
+def check_mongodb():
+    start_t = time.time()
+    
+    mongo_uri = os.getenv("MONGO_URI", "mongodb://mongodb-01:27017,mongodb-02:27017,mongodb-03:27017/?replicaSet=rs0")
+    cluster_size = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
+    
+    try:
+        with MongoClient(mongo_uri, serverSelectionTimeoutMS=3000) as client:
+            status = client.admin.command('replSetGetStatus')
+        
+        members = status.get('members', [])
+        
+        primary = None
+        secondaries = []
+        
+        for m in members:
+            state_str = m.get('stateStr', '')
+            name = m.get('name', 'unknown')
+            
+            if state_str == 'PRIMARY':
+                primary = name
+            elif state_str == 'SECONDARY':
+                secondaries.append((name, state_str))
+            else:
+                secondaries.append((name, state_str))
+                
+        ping_ms = int((time.time() - start_t) * 1000)
+        
+        if cluster_size == 1:
+            push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
+            return
+
+        if primary:
+            sec_strs = [f"{s[0]} ({s[1]})" for s in secondaries]
+            msg = f"PRIMARY: {primary} | secondaries: {' '.join(sec_strs)}"
+            
+            unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
+            if unhealthy_secs:
+                msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
+                push("MONGODB-REPLICASET", "down", msg, ping_ms)
+            else:
+                push("MONGODB-REPLICASET", "up", msg, ping_ms)
+        else:
+            msg = "no PRIMARY | quorum lost"
+            push("MONGODB-REPLICASET", "down", msg, ping_ms)
+            
+    except Exception as e:
+        ping_ms = int((time.time() - start_t) * 1000)
+        push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)
--- a/health-agent/src/health_agent/checks/redis_sentinel.py
+++ b/health-agent/src/health_agent/checks/redis_sentinel.py
@ -0,0 +1,43 @@
+import os
+import time
+import logging
+from redis.sentinel import Sentinel
+from health_agent.uptime_kuma import push
+
+logger = logging.getLogger(__name__)
+
+def check_redis_sentinel():
+    start_t = time.time()
+    
+    hosts = os.getenv("REDIS_SENTINEL_HOSTS", "redis-sentinel-01,redis-sentinel-02,redis-sentinel-03")
+    sentinel_nodes = [(h.strip(), 26379) for h in hosts.split(",")]
+    
+    master_name = os.getenv("REDIS_MASTER_NAME", "prod-master")
+    password = os.getenv("REDIS_PASSWORD", None)
+    redis_mode = os.getenv("REDIS_MODE", "sentinel")
+    
+    if redis_mode != "sentinel":
+        push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
+        return
+
+    try:
+        sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password)
+        
+        # Master ping
+        master = sentinel.master_for(master_name, socket_timeout=3, password=password)
+        master.ping()
+        master_ip, master_port = sentinel.discover_master(master_name)
+        master.connection_pool.disconnect()
+        
+        # Get replicas count
+        slaves = sentinel.discover_slaves(master_name)
+        replicas_count = len(slaves)
+        
+        ping_ms = int((time.time() - start_t) * 1000)
+        
+        msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
+        push("REDIS-SENTINEL", "up", msg, ping_ms)
+        
+    except Exception as e:
+        ping_ms = int((time.time() - start_t) * 1000)
+        push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
--- a/health-agent/src/health_agent/checks/swarm.py
+++ b/health-agent/src/health_agent/checks/swarm.py
@ -0,0 +1,49 @@
+import time
+import docker
+import logging
+from health_agent.uptime_kuma import push
+
+logger = logging.getLogger(__name__)
+
+def check_swarm_cluster():
+    start_time = time.time()
+    try:
+        client = docker.from_env()
+        nodes = client.nodes.list()
+        
+        ready_nodes = []
+        managers = []
+        
+        for node in nodes:
+            spec = node.attrs.get('Spec', {})
+            status = node.attrs.get('Status', {})
+            manager_status = node.attrs.get('ManagerStatus', {})
+            
+            node_name = spec.get('Name', node.id)
+            is_ready = status.get('State') == 'ready'
+            is_manager = spec.get('Role') == 'manager'
+            
+            if is_ready:
+                ready_nodes.append(node_name)
+                
+            if is_manager:
+                reachability = manager_status.get('Reachability')
+                if reachability == 'reachable':
+                    managers.append(node_name)
+                    
+        total_nodes = len(nodes)
+        ready_count = len(ready_nodes)
+        
+        ping_ms = int((time.time() - start_time) * 1000)
+        
+        if ready_count == total_nodes:
+            msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
+            push("SWARM-CLUSTER", "up", msg, ping_ms)
+        else:
+            msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
+            push("SWARM-CLUSTER", "down", msg, ping_ms)
+            
+    except Exception as e:
+        ping_ms = int((time.time() - start_time) * 1000)
+        logger.error(f"Swarm check failed: {e}")
+        push("SWARM-CLUSTER", "down", str(e), ping_ms)
--- a/health-agent/src/health_agent/checks/tcp.py
+++ b/health-agent/src/health_agent/checks/tcp.py
@ -0,0 +1,77 @@
+import socket
+import time
+import logging
+import requests
+from health_agent.uptime_kuma import push
+from health_agent.checks.http import http_check
+
+logger = logging.getLogger(__name__)
+
+def tcp_check(host, port, timeout=3):
+    start_time = time.time()
+    try:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        sock.settimeout(timeout)
+        result = sock.connect_ex((host, port))
+        sock.close()
+        ping_ms = int((time.time() - start_time) * 1000)
+        if result == 0:
+            return True, ping_ms, None
+        else:
+            return False, ping_ms, f"Port {port} is closed or unreachable"
+    except Exception as e:
+        ping_ms = int((time.time() - start_time) * 1000)
+        return False, ping_ms, str(e)
+
+def check_etcd_cluster():
+    nodes = ["etcd-01", "etcd-02", "etcd-03"]
+    start_t = time.time()
+    
+    healthy_count = 0
+    leader = None
+    errors = []
+    
+    for node in nodes:
+        # 1. TCP Check on 2379
+        tcp_ok, ms, tcp_err = tcp_check(node, 2379)
+        if not tcp_ok:
+            errors.append(f"{node} port 2379 unreachable")
+            continue
+            
+        # 2. HTTP Health check
+        url = f"http://{node}:2379/health"
+        http_ok, resp, ms, http_err = http_check(url, timeout=3)
+        
+        if http_ok and resp:
+            data = resp.json()
+            if data.get("health") == "true":
+                healthy_count += 1
+            else:
+                errors.append(f"{node} unhealthy")
+        else:
+            errors.append(f"{node} health endpoint unreachable")
+            
+        # 3. Leader check from /v3/maintenance/status
+        if not leader and tcp_ok:
+            status_url = f"http://{node}:2379/v3/maintenance/status"
+            try:
+                r = requests.post(status_url, json={}, timeout=3)
+                if r.status_code == 200:
+                    status_data = r.json()
+                    leader_id = status_data.get("leader")
+                    header_member_id = status_data.get("header", {}).get("member_id")
+                    if leader_id and leader_id == header_member_id:
+                        leader = node
+            except Exception:
+                pass
+
+    ping_ms = int((time.time() - start_t) * 1000)
+    
+    if healthy_count == len(nodes):
+        leader_info = f" | leader: {leader}" if leader else ""
+        msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
+        push("ETCD-CLUSTER", "up", msg, ping_ms)
+    else:
+        quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else ""
+        msg = " | ".join(errors) + quorum_msg
+        push("ETCD-CLUSTER", "down", msg, ping_ms)
--- a/health-agent/src/health_agent/checks/tls.py
+++ b/health-agent/src/health_agent/checks/tls.py
@ -0,0 +1,62 @@
+import os
+import time
+import logging
+import requests
+from datetime import datetime, timezone
+from health_agent.uptime_kuma import push
+from cryptography import x509
+from cryptography.hazmat.backends import default_backend
+
+logger = logging.getLogger(__name__)
+
+def check_swag_tls():
+    start_t = time.time()
+    cert_path = "/mnt/storagebox/ssl/STAR.iklim.co.full.crt"
+    domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
+    suffix = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
+    target_url = f"https://api{suffix}.{domain}/actuator/health"
+
+    msg_parts = []
+    is_down = False
+    
+    # 1. Check cert file
+    if not os.path.exists(cert_path):
+        is_down = True
+        msg_parts.append("cert file missing on storagebox")
+    else:
+        try:
+            with open(cert_path, "rb") as f:
+                cert_data = f.read()
+            cert = x509.load_pem_x509_certificate(cert_data, default_backend())
+            not_valid_after = cert.not_valid_after_utc
+            now = datetime.now(timezone.utc)
+            days_left = (not_valid_after - now).days
+            
+            if days_left < 14:
+                is_down = True
+                msg_parts.append(f"cert expires in {days_left} days")
+            else:
+                msg_parts.append(f"cert valid until {not_valid_after.strftime('%Y-%m-%d')} ({days_left} days)")
+        except Exception as e:
+            is_down = True
+            msg_parts.append(f"cert parse error: {e}")
+            
+    # 2. Check external HTTPS reachable
+    try:
+        r = requests.get(target_url, timeout=5, verify=False)
+        if r.status_code < 500:
+            msg_parts.append("HTTPS reachable")
+        else:
+            is_down = True
+            msg_parts.append(f"HTTPS returned {r.status_code}")
+    except Exception as e:
+        is_down = True
+        msg_parts.append(f"HTTPS unreachable")
+        
+    ping_ms = int((time.time() - start_t) * 1000)
+    msg = " | ".join(msg_parts)
+    
+    if is_down:
+        push("SWAG-TLS", "down", msg, ping_ms)
+    else:
+        push("SWAG-TLS", "up", msg, ping_ms)
--- a/health-agent/src/health_agent/config.py
+++ b/health-agent/src/health_agent/config.py
@ -0,0 +1,25 @@
+import os
+import yaml
+from pathlib import Path
+from dotenv import load_dotenv
+
+load_dotenv()
+
+ENV = os.getenv("ENV", "prod")
+CLUSTER_SIZE_ETCD = int(os.getenv("CLUSTER_SIZE_ETCD", "3"))
+CLUSTER_SIZE_PATRONI = int(os.getenv("CLUSTER_SIZE_PATRONI", "3"))
+CLUSTER_SIZE_MONGODB = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
+CLUSTER_SIZE_RABBITMQ = int(os.getenv("CLUSTER_SIZE_RABBITMQ", "3"))
+CLUSTER_SIZE_VAULT = int(os.getenv("CLUSTER_SIZE_VAULT", "3"))
+REDIS_MODE = os.getenv("REDIS_MODE", "sentinel")
+EXTERNAL_DOMAIN = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
+EXTERNAL_SUBDOMAIN_SUFFIX = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
+
+def load_uk_tokens():
+    token_file = Path("config/generated/uk_tokens.yml")
+    if not token_file.exists():
+        return {}
+    with open(token_file, "r") as f:
+        return yaml.safe_load(f) or {}
+
+UK_TOKENS = load_uk_tokens()
--- a/health-agent/src/health_agent/events/docker_events.py
+++ b/health-agent/src/health_agent/events/docker_events.py
@ -0,0 +1,56 @@
+import os
+import docker
+import threading
+import logging
+import time
+from health_agent.slack import notify
+
+logger = logging.getLogger(__name__)
+
+def parse_and_notify(event):
+    attrs = event.get('Actor', {}).get('Attributes', {})
+    container_name = attrs.get('name', 'unknown')
+    exit_code = attrs.get('exitCode', '0')
+    
+    if exit_code == '0':
+        return
+        
+    is_oom = (exit_code == '137')
+    
+    env = os.getenv("ENV", "test").upper()
+    webhook_env_name = f"SLACK_WEBHOOK_IKLIM_{env}_OPS"
+    
+    priority = "High" if is_oom else "Medium"
+    title = f"[Health Agent / Events] Container Crashed ({container_name})"
+    
+    detail = f"Container: {container_name}\nExit Code: {exit_code}"
+    if is_oom:
+        detail += "\nReason: OOM Killed (Out Of Memory) or SIGKILL"
+        
+    notify(
+        webhook_env=webhook_env_name,
+        source="health-agent-events",
+        priority=priority,
+        title=title,
+        detail=detail
+    )
+
+def event_listener_loop():
+    while True:
+        try:
+            client = docker.from_env()
+            logger.info("Starting Docker event listener...")
+            filters = {"type": "container", "event": "die"}
+            for event in client.events(decode=True, filters=filters):
+                try:
+                    parse_and_notify(event)
+                except Exception as e:
+                    logger.error(f"Error parsing event: {e}", exc_info=True)
+        except Exception as e:
+            logger.error(f"Docker event listener error: {e}. Reconnecting in 10s...", exc_info=True)
+            time.sleep(10)
+
+def start_docker_event_listener():
+    thread = threading.Thread(target=event_listener_loop, daemon=True)
+    thread.start()
+    return thread
--- a/health-agent/src/health_agent/main.py
+++ b/health-agent/src/health_agent/main.py
@ -0,0 +1,75 @@
+import time
+import logging
+from health_agent.checks import swarm
+from health_agent.checks.http import run_all_http_checks
+from health_agent.checks.tcp import check_etcd_cluster
+from health_agent.checks.tls import check_swag_tls
+from health_agent.checks.redis_sentinel import check_redis_sentinel
+from health_agent.checks.mongodb import check_mongodb
+from health_agent.checks.filesystem import check_storagebox_mount
+from health_agent.events.docker_events import start_docker_event_listener
+import json
+
+class JSONFormatter(logging.Formatter):
+    def format(self, record):
+        log_obj = {
+            "time": self.formatTime(record, self.datefmt),
+            "level": record.levelname,
+            "logger": record.name,
+            "msg": record.getMessage()
+        }
+        for attr in ['check', 'status', 'ping_ms', 'source', 'error']:
+            if hasattr(record, attr):
+                log_obj[attr] = getattr(record, attr)
+        if record.exc_info:
+            log_obj['exc_info'] = self.formatException(record.exc_info)
+        return json.dumps(log_obj)
+
+handler = logging.StreamHandler()
+handler.setFormatter(JSONFormatter())
+logging.basicConfig(level=logging.INFO, handlers=[handler])
+logger = logging.getLogger("main")
+
+def run_checks():
+    logger.info("Running health checks...")
+    try:
+        swarm.check_swarm_cluster()
+    except Exception as e:
+        logger.error(f"Error checking Swarm cluster: {e}")
+        
+    try:
+        run_all_http_checks()
+    except Exception as e:
+        logger.error(f"Error running HTTP checks: {e}")
+
+    try:
+        check_etcd_cluster()
+    except Exception as e:
+        logger.error(f"Error running etcd checks: {e}")
+        
+    try:
+        check_swag_tls()
+    except Exception as e:
+        logger.error(f"Error running TLS checks: {e}")
+        
+    try:
+        check_redis_sentinel()
+    except Exception as e:
+        logger.error(f"Error running Redis checks: {e}")
+        
+    try:
+        check_mongodb()
+    except Exception as e:
+        logger.error(f"Error running MongoDB checks: {e}")
+        
+    try:
+        check_storagebox_mount()
+    except Exception as e:
+        logger.error(f"Error running filesystem checks: {e}")
+
+if __name__ == "__main__":
+    logger.info("Starting health-agent...")
+    start_docker_event_listener()
+    while True:
+        run_checks()
+        time.sleep(60)
--- a/health-agent/src/health_agent/slack.py
+++ b/health-agent/src/health_agent/slack.py
@ -0,0 +1,22 @@
+import os
+import requests
+import logging
+
+logger = logging.getLogger(__name__)
+
+def notify(webhook_env: str, source: str, priority: str, title: str, detail: str):
+    webhook_url = os.getenv(webhook_env)
+    if not webhook_url:
+        logger.warning(f"Slack webhook url not found for {webhook_env}")
+        return
+    
+    payload = {
+        "text": f"*{title}*\n*Source:* {source}\n*Priority:* {priority}\n```\n{detail}\n```"
+    }
+    
+    try:
+        response = requests.post(webhook_url, json=payload, timeout=5)
+        response.raise_for_status()
+        logger.info(f"Sent Slack notification to {webhook_env}")
+    except Exception as e:
+        logger.error(f"Failed to send Slack notification: {e}")
--- a/health-agent/src/health_agent/state.py
+++ b/health-agent/src/health_agent/state.py
@ -0,0 +1,19 @@
+import json
+import os
+from pathlib import Path
+
+STATE_FILE = Path("config/generated/state.json")
+
+def load_state():
+    if not STATE_FILE.exists():
+        return {}
+    try:
+        with open(STATE_FILE, "r") as f:
+            return json.load(f)
+    except Exception:
+        return {}
+
+def save_state(state):
+    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
+    with open(STATE_FILE, "w") as f:
+        json.dump(state, f)
--- a/health-agent/src/health_agent/uptime_kuma.py
+++ b/health-agent/src/health_agent/uptime_kuma.py
@ -0,0 +1,27 @@
+import os
+import requests
+import logging
+from health_agent.config import UK_TOKENS
+
+logger = logging.getLogger(__name__)
+UK_PUSH_URL_BASE = os.getenv("UK_PUSH_URL_BASE", "https://status.iklim.co/api/push")
+
+def push(monitor_name: str, status: str, msg: str, ping_ms: int):
+    token = UK_TOKENS.get(monitor_name)
+    if not token:
+        logger.warning(f"No token found for monitor {monitor_name}")
+        return
+    
+    url = f"{UK_PUSH_URL_BASE}/{token}"
+    params = {
+        "status": status,
+        "msg": msg,
+        "ping": int(ping_ms)
+    }
+    
+    try:
+        response = requests.get(url, params=params, timeout=10)
+        response.raise_for_status()
+        logger.info(f"Pushed {monitor_name} status={status}", extra={"check": monitor_name, "status": status, "msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"})
+    except Exception as e:
+        logger.error(f"Failed to push {monitor_name}: {e}", extra={"check": monitor_name, "status": "push_failed", "error": str(e), "source": "uptime_kuma"})