feat(health-agent): add monitors.yml with env-aware node IP mapping from Ansible inventory

2026-06-25 18:59:14 +03:00 · 2026-06-25 18:59:14 +03:00 · f742bfdd11
commit f742bfdd11
parent a2e8997711
19 changed files with 1131 additions and 0 deletions
--- a/health-agent/.env.example
+++ b/health-agent/.env.example
@ -0,0 +1,10 @@
 # Deployment environment. health_agent/config.py defaults to "prod" when unset,
 # while scripts/setup_uptime_kuma.py and the Docker event listener default to "test".
 ENV=prod
 # Expected member count per cluster; config.py parses each value with int().
 CLUSTER_SIZE_ETCD=3
 CLUSTER_SIZE_PATRONI=3
 CLUSTER_SIZE_MONGODB=3
 CLUSTER_SIZE_RABBITMQ=3
 CLUSTER_SIZE_VAULT=3
 # Any value other than "sentinel" makes the Redis check report "standalone mode (skipped)".
 REDIS_MODE=sentinel
 # Domain and optional subdomain suffix used to build URLs such as https://api{suffix}.{domain}.
 EXTERNAL_DOMAIN=iklim.co
 EXTERNAL_SUBDOMAIN_SUFFIX=
 # NOTE(review): presumably the Uptime Kuma push endpoint prefix; its consumer is not visible in this change - confirm.
 UK_PUSH_URL_BASE=https://status.iklim.co/api/push
--- a/health-agent/.env.setup.example
+++ b/health-agent/.env.setup.example
@ -0,0 +1,5 @@
 # Base URL of the Uptime Kuma instance (the setup script falls back to http://localhost:3001).
 UK_URL=http://uptime-kuma:3001
 # Credentials used by scripts/setup_uptime_kuma.py to log in; it falls back to admin/admin when unset.
 UK_USER=admin
 UK_PASS=change_me
 # NOTE(review): not read by scripts/setup_uptime_kuma.py as shown in this change - confirm it is still needed.
 UK_API_KEY=your_api_key_here
 # Slack webhooks referenced by name from notifications.*.webhook_env in config/monitors.yml.
 UK_SLACK_WEBHOOK_HIGH=https://hooks.slack.com/services/...
 UK_SLACK_WEBHOOK_MEDIUM=https://hooks.slack.com/services/...
 UK_SLACK_WEBHOOK_LOW=https://hooks.slack.com/services/...
--- a/health-agent/Dockerfile
+++ b/health-agent/Dockerfile
@ -0,0 +1,16 @@
 # Slim Python 3.12 base image (pyproject.toml requires Python >= 3.12).
 FROM python:3.12-slim
 WORKDIR /app
 # curl is the only extra OS package; the apt lists are removed in the same layer.
 RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
 # Copy the project metadata and sources, then install the package and its dependencies.
 COPY pyproject.toml ./
 COPY src/ ./src/
 RUN pip install --no-cache-dir .
 # Keeps health_agent importable when main.py is started by file path (see CMD).
 ENV PYTHONPATH=/app/src
 # The account is created but never activated: the USER instruction below is commented out.
 RUN useradd -m appuser
 # Keeping as root to be able to access /var/run/docker.sock cleanly, unless specifically configured with groups.
 # USER appuser
 # NOTE(review): config/ is not copied into the image, yet health_agent/config.py reads
 # config/generated/uk_tokens.yml relative to the working directory - presumably mounted at runtime; confirm.
 CMD ["python", "src/health_agent/main.py"]
--- a/health-agent/config/monitors.yml
+++ b/health-agent/config/monitors.yml
@ -0,0 +1,196 @@
 # Schema version of this monitor definition file (quoted so it stays a string).
 version: "1"
 # Project name; scripts/setup_uptime_kuma.py prefixes group monitor names with it.
 project: "iklim"
 # Base domain of the deployment.
 # NOTE(review): presumably the value substituted for {domain} in the monitors below - the consumer is not visible here; confirm.
 domain:
  base: "iklim.co"
 # Host inventory per environment (prod / test), split into "service" and "db" nodes.
 # Per the commit subject these IPs mirror the Ansible inventory; keep both in sync.
 # NOTE(review): presumably the source for the EXT-PING-APPnn / EXT-PING-DBnn monitors
 # referenced under groups - the generating code is not visible here; confirm.
 nodes:
  prod:
    service:
      - name: iklim-app-01
        ip: "178.104.210.41"
      - name: iklim-app-02
        ip: "178.105.69.1"
      - name: iklim-app-03
        ip: "178.104.219.3"
    db:
      - name: iklim-db-01
        ip: "159.69.117.158"
      - name: iklim-db-02
        ip: "178.104.219.162"
      - name: iklim-db-03
        ip: "159.69.115.105"
  # The test environment has a single service node and a single db node.
  test:
    service:
      - name: iklim-app-01
        ip: "167.235.194.61"
    db:
      - name: iklim-db-01
        ip: "167.235.205.93"
 # Vocabulary of tag names used by the groups and push_monitors sections below:
 # reachability (external / internal), severity (high / medium / low) and
 # functional area (database / gateway / infrastructure / observability).
 tags:
  - external
  - internal
  - high
  - medium
  - low
  - database
  - gateway
  - infrastructure
  - observability
 # Notification channels, one Slack channel per severity.
 # webhook_env holds the NAME of an environment variable (listed in .env.setup.example),
 # not the webhook URL itself, so no secret is stored in this file.
 notifications:
  slack-high:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_HIGH
  slack-medium:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_MEDIUM
  slack-low:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_LOW
 # Monitor groups. scripts/setup_uptime_kuma.py creates one group monitor per entry,
 # named "<project> [<env>] <name>", and uses "children" to pick the parent group of
 # each push monitor. The names are Turkish display strings; status_pages.groups
 # refers to them verbatim, so change both places together.
 # NOTE(review): status_page, notifications and tags are not read by the setup script
 # shown in this change - confirm which component applies them.
 # NOTE(review): the EXT-PING-* children have no entry in this file; presumably they are
 # generated from the nodes section - confirm.
 groups:
  - name: "Altyapı"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, infrastructure]
    children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS]
  - name: "Veri Katmanı"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, database]
    children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET]
  - name: "Gateway & Mesajlaşma"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, gateway]
    children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL]
  - name: "Dış Erişilebilirlik - Kritik"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [external, high]
    children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03]
  - name: "Dış Erişilebilirlik - Genel"
    status_page: "iklim-{env}-ops"
    notifications: [slack-medium]
    tags: [external, medium]
    children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03]
  - name: "Gözlemlenebilirlik"
    status_page: "iklim-{env}-tools"
    notifications: [slack-low]
    tags: [internal, observability]
    children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW]
 # Push (heartbeat) monitors. Each name must match the monitor name the health agent
 # passes to push(), e.g. push("SWARM-CLUSTER", ...), and the names listed in groups.children.
 # scripts/setup_uptime_kuma.py reads "name" and "interval" (default 60) when creating them.
 # NOTE(review): interval is presumably in seconds; heartbeat_retries, tags and
 # restart_threshold are not consumed by the setup script shown in this change - confirm their consumer.
 push_monitors:
  - name: SWARM-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: VAULT-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: ETCD-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: PATRONI-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: MONGODB-REPLICASET
    interval: 120
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: APISIX-GATEWAY
    interval: 60
    heartbeat_retries: 1
    tags: [internal, gateway, high]
    restart_threshold: 1
  - name: RABBITMQ-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, gateway, medium]
    restart_threshold: 3
  # Tagged "database" although it is a child of the "Gateway & Mesajlaşma" group.
  - name: REDIS-SENTINEL
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, medium]
    restart_threshold: 3
  # Certificate / HTTPS check; it has the longest interval in this list.
  - name: SWAG-TLS
    interval: 3600
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 3
  - name: STORAGEBOX-MOUNT
    interval: 300
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 1
  # Observability tooling: low severity, higher restart_threshold.
  - name: PROMETHEUS
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: GRAFANA
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: PORTAINER
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: LOKI
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
 # External HTTPS monitors. Status codes are quoted so they stay strings.
 # NOTE(review): {suffix} and {domain} presumably map to EXTERNAL_SUBDOMAIN_SUFFIX and
 # EXTERNAL_DOMAIN, the way checks/tls.py builds https://api{suffix}.{domain}/actuator/health;
 # the code that expands this section is not visible in this change - confirm.
 http_monitors:
  - name: EXT-HTTPS-API
    url: "https://api{suffix}.{domain}/actuator/health"
    accepted_statuscodes: ["200"]
    interval: 60
  - name: EXT-HTTPS-GRAFANA
    url: "https://grafana{suffix}.{domain}/api/health"
    accepted_statuscodes: ["200"]
    interval: 60
  # 401 / 403 are accepted here: the endpoint answering at all counts as reachable.
  - name: EXT-HTTPS-PORTAINER
    url: "https://portainer{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
  - name: EXT-HTTPS-APIGW
    url: "https://apigw{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
 # DNS resolution monitors (A records) for the API host and the bare domain.
 # They use the same {suffix} / {domain} placeholders as http_monitors.
 dns_monitors:
  - name: EXT-DNS-API
    hostname: "api{suffix}.{domain}"
    dns_resolve_type: A
    interval: 60
  - name: EXT-DNS-ROOT
    hostname: "{domain}"
    dns_resolve_type: A
    interval: 60
 # Unlike the other *_monitors sections this is a mapping of shared settings, not a list.
 # NOTE(review): presumably applied to ping monitors generated per host in "nodes" - confirm.
 ping_monitors:
  interval: 60
  max_retries: 1
 # Status pages. scripts/setup_uptime_kuma.py expands {env} / {project} in slug and title
 # and creates a page when no page with that slug exists yet.
 # NOTE(review): "public" and "groups" are not applied by the setup script shown in this change - confirm.
 status_pages:
  # The only public page.
  # NOTE(review): it lists "Dış Erişilebilirlik - Kritik", whose own status_page value
  # under groups is "iklim-{env}-ops" - confirm which of the two settings wins.
  - slug: "iklim-{env}-status"
    title: "iklim.co API Durumu"
    public: true
    groups: ["Dış Erişilebilirlik - Kritik"]
  - slug: "iklim-{env}-ops"
    title: "iklim.co [{env}] Altyapı"
    public: false
    groups:
      - "Altyapı"
      - "Veri Katmanı"
      - "Gateway & Mesajlaşma"
      - "Dış Erişilebilirlik - Kritik"
      - "Dış Erişilebilirlik - Genel"
  - slug: "iklim-{env}-tools"
    title: "iklim.co [{env}] Araçlar"
    public: false
    groups: ["Gözlemlenebilirlik"]
--- a/health-agent/pyproject.toml
+++ b/health-agent/pyproject.toml
@ -0,0 +1,22 @@
 # Package metadata for the health agent; the Dockerfile installs it with "pip install .".
 [project]
 name = "health-agent"
 version = "0.1.0"
 description = "iklim.co Monitoring Health Agent"
 requires-python = ">=3.12"
 # Runtime dependencies. None of them is version-pinned, so builds pick up the latest release.
 dependencies = [
    "requests",
    "docker",
    "python-dotenv",
    "pyyaml",
    "redis",
    "pymongo",
    "uptime-kuma-api",
    "cryptography",
 ]
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 # Only the health_agent package under src/ goes into the wheel.
 [tool.hatch.build.targets.wheel]
 packages = ["src/health_agent"]
--- a/health-agent/scripts/setup_uptime_kuma.py
+++ b/health-agent/scripts/setup_uptime_kuma.py
@ -0,0 +1,138 @@
 import os
 import argparse
 import yaml
 import logging
 from uptime_kuma_api import UptimeKumaApi, MonitorType
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger("uk-setup")
 def format_str(text, env_name, project):
    """Expand the ``{env}`` and ``{project}`` placeholders in *text*.

    Args:
        text: Template value; anything that is not a ``str`` is returned unchanged.
        env_name: Replacement for every ``{env}`` occurrence.
        project: Replacement for every ``{project}`` occurrence.

    Returns:
        The expanded string, or *text* itself when it is not a string.
    """
    if isinstance(text, str):
        # {env} is expanded first, then {project} on the intermediate result.
        with_env = text.replace("{env}", env_name)
        return with_env.replace("{project}", project)
    return text
 def setup_uptime_kuma(dry_run=False, only=None):
    """Create the Uptime Kuma groups, push monitors and status pages from monitors.yml.

    Reads ``../config/monitors.yml`` relative to this script, logs in to the
    instance given by ``UK_URL`` with ``UK_USER`` / ``UK_PASS`` and makes sure
    every configured group, push monitor and status page exists. The push
    tokens of the processed monitors are written to
    ``../config/generated/uk_tokens.yml``, the path health_agent/config.py
    loads them from.

    Args:
        dry_run: When true nothing is sent to Uptime Kuma and no file is
            written; the planned actions are only logged.
        only: Optional push monitor name; when given, all other push monitors
            are skipped and their stored tokens are left untouched.

    Returns:
        None. Login or monitor-listing failures are logged and end the run.
    """
    env_name = os.getenv("ENV", "test")
    config_dir = os.path.join(os.path.dirname(__file__), "..", "config")
    config_path = os.path.join(config_dir, "monitors.yml")
    # monitors.yml contains non-ASCII group names; do not depend on the locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    project = config.get("project", "iklim")
    groups = config.get("groups", [])
    kuma_url = os.getenv("UK_URL", "http://localhost:3001")
    kuma_user = os.getenv("UK_USER", "admin")
    kuma_pass = os.getenv("UK_PASS", "admin")
    api = None
    try:
        if not dry_run:
            logger.info(f"Connecting to Uptime Kuma at {kuma_url}...")
            try:
                api = UptimeKumaApi(kuma_url)
                api.login(kuma_user, kuma_pass)
            except Exception as e:
                logger.error(f"Login failed: {e}")
                return
        existing_monitors = {}
        if api is not None:
            try:
                for m in api.get_monitors():
                    existing_monitors[m['name']] = m
            except Exception as e:
                # Without the current monitor list every "already exists?" test
                # would be false and the run would create duplicates, so stop.
                logger.error(f"Failed to get monitors: {e}")
                return
        # 1. Process Groups
        group_map = {}
        for g in groups:
            raw_name = g["name"]
            formatted_name = f"{project} [{env_name}] {raw_name}"
            logger.info(f"Processing group: {formatted_name}")
            if not dry_run:
                if formatted_name not in existing_monitors:
                    logger.info(f"Creating group monitor: {formatted_name}")
                    res = api.add_monitor(type=MonitorType.GROUP, name=formatted_name)
                    group_map[raw_name] = res['monitorID']
                else:
                    group_map[raw_name] = existing_monitors[formatted_name]['id']
        tokens = {}
        # 2. Push Monitors
        for pm in config.get("push_monitors", []):
            m_name = pm["name"]
            if only and m_name != only:
                continue
            m_interval = pm.get("interval", 60)
            # The first group listing this monitor as a child becomes its parent.
            parent_group_id = None
            for g in groups:
                if m_name in g.get("children", []):
                    parent_group_id = group_map.get(g["name"])
                    break
            logger.info(f"Processing push monitor: {m_name}")
            if dry_run:
                tokens[m_name] = "dummy_token_dry_run"
                continue
            if m_name in existing_monitors:
                logger.info(f"Monitor {m_name} already exists.")
                current = existing_monitors[m_name]
                tokens[m_name] = current['pushToken']
                if parent_group_id and current.get('parent') != parent_group_id:
                    api.edit_monitor(current['id'], parent=parent_group_id)
            else:
                logger.info(f"Creating push monitor: {m_name}")
                result = api.add_monitor(
                    type=MonitorType.PUSH,
                    name=m_name,
                    interval=m_interval,
                    parent=parent_group_id
                )
                m_id = result['monitorID']
                # Fetch again to get pushToken
                for m in api.get_monitors():
                    if m['id'] == m_id:
                        tokens[m_name] = m['pushToken']
                        break
                else:
                    logger.warning(f"No push token found for new monitor {m_name}")
        # 3. Process Status Pages
        for sp in config.get("status_pages", []):
            slug = format_str(sp["slug"], env_name, project)
            title = format_str(sp["title"], env_name, project)
            logger.info(f"Processing status page: {title} (slug: {slug})")
            if not dry_run:
                try:
                    pages = api.get_status_pages()
                    exists = any(p['slug'] == slug for p in pages)
                    if not exists:
                        logger.info(f"Creating status page: {slug}")
                        api.add_status_page(slug, title)
                except Exception as e:
                    logger.warning(f"Status page ops failed: {e}")
        # 4. Write tokens where health_agent/config.py reads them
        token_file = os.path.join(config_dir, "generated", "uk_tokens.yml")
        if not dry_run:
            if only and os.path.exists(token_file):
                # A partial run must not drop the tokens of the monitors it skipped.
                with open(token_file, "r") as f:
                    previous = yaml.safe_load(f) or {}
                if isinstance(previous, dict):
                    tokens = {**previous, **tokens}
            os.makedirs(os.path.dirname(token_file), exist_ok=True)
            with open(token_file, "w") as f:
                yaml.dump(tokens, f)
            logger.info(f"Saved push tokens to {token_file}")
        else:
            logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}")
    finally:
        # Always release the connection, including on early returns and errors.
        if api is not None:
            api.disconnect()
 if __name__ == "__main__":
    # Command-line entry point: parse the two optional flags and run the setup.
    cli = argparse.ArgumentParser(description="Setup Uptime Kuma monitors")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Print actions without making changes",
    )
    cli.add_argument(
        "--only",
        type=str,
        help="Only process a specific monitor by name",
    )
    options = cli.parse_args()
    setup_uptime_kuma(only=options.only, dry_run=options.dry_run)
--- a/health-agent/src/health_agent/checks/filesystem.py
+++ b/health-agent/src/health_agent/checks/filesystem.py
@ -0,0 +1,36 @@
 import os
 import time
 import logging
 from health_agent.uptime_kuma import push
 logger = logging.getLogger(__name__)
 def check_storagebox_mount():
    """Report whether the storage box directory and its critical files exist.

    The directory comes from ``STORAGEBOX_PATH`` (default ``/mnt/storagebox``).
    Pushes "down" to the STORAGEBOX-MOUNT monitor when the directory or any
    expected file is missing, otherwise "up". Returns nothing.
    """
    started = time.time()

    def elapsed_ms():
        # Milliseconds since the check started, truncated to an int.
        return int((time.time() - started) * 1000)

    root = os.getenv("STORAGEBOX_PATH", "/mnt/storagebox")
    required = ("patroni/patroni.yml", "ssl/STAR.iklim.co.full.crt")
    if os.path.exists(root):
        absent = [rel for rel in required if not os.path.exists(os.path.join(root, rel))]
        took = elapsed_ms()
        if absent:
            push("STORAGEBOX-MOUNT", "down", f"mount exists but missing: {', '.join(absent)}", took)
        else:
            push("STORAGEBOX-MOUNT", "up", f"{root} OK | all critical files present", took)
    else:
        push("STORAGEBOX-MOUNT", "down", f"{root} not found", elapsed_ms())
--- a/health-agent/src/health_agent/checks/http.py
+++ b/health-agent/src/health_agent/checks/http.py
@ -0,0 +1,196 @@
 import os
 import time
 import logging
 import requests
 from requests.auth import HTTPBasicAuth
 from health_agent.uptime_kuma import push
 import urllib3
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 logger = logging.getLogger(__name__)
 def http_check(url, expected_status=None, auth=None, verify_ssl=True, timeout=5, headers=None):
    """Issue a GET request and judge the response status.

    Args:
        url: Address to request.
        expected_status: A single status code, or a list/tuple/set of accepted
            codes. When falsy, any status below 400 counts as success.
        auth: Optional ``requests`` auth object.
        verify_ssl: Passed to ``requests.get`` as ``verify``.
        timeout: Request timeout in seconds.
        headers: Optional request headers.

    Returns:
        ``(is_ok, response, ping_ms, error)``. ``response`` is ``None`` and
        ``error`` holds the exception text when the request raised; otherwise
        ``error`` is ``None``. Callers must test ``response is not None``
        rather than its truthiness: a ``requests.Response`` is falsy for
        4xx/5xx statuses.
    """
    start_time = time.time()
    try:
        resp = requests.get(url, auth=auth, verify=verify_ssl, timeout=timeout, headers=headers)
        ping_ms = int((time.time() - start_time) * 1000)
        if not expected_status:
            is_ok = resp.status_code < 400
        elif isinstance(expected_status, (list, tuple, set, frozenset)):
            # Any collection of codes is a membership test, not only a list.
            is_ok = resp.status_code in expected_status
        else:
            is_ok = resp.status_code == expected_status
        return is_ok, resp, ping_ms, None
    except Exception as e:
        ping_ms = int((time.time() - start_time) * 1000)
        return False, None, ping_ms, str(e)
 def check_patroni_cluster():
    """Query Patroni's ``/cluster`` endpoint and push the PATRONI-CLUSTER state.

    The nodes are tried in order until one returns a usable JSON document.
    "down" is pushed when no node answers or when no member has the role
    ``leader``; otherwise "up" with the leader name and each replica's lag.
    Returns nothing.
    """
    nodes = ["patroni-01", "patroni-02", "patroni-03"]
    cluster_data = None
    error_msg = "All Patroni nodes unreachable"
    start_t = time.time()
    for node in nodes:
        url = f"http://{node}:8008/cluster"
        ok, resp, _, err = http_check(url, timeout=3)
        if ok and resp is not None:
            try:
                payload = resp.json()
            except ValueError as e:
                # A non-JSON body must not abort the whole check; try the next node.
                error_msg = f"{node} error: invalid JSON ({e})"
                continue
            if isinstance(payload, dict) and payload:
                cluster_data = payload
                break
            error_msg = f"{node} error: unexpected /cluster payload"
        elif err:
            error_msg = f"{node} error: {err}"
        elif resp is not None:
            # The request succeeded but the status was >= 400.
            error_msg = f"{node} error: HTTP {resp.status_code}"
    ping_ms = int((time.time() - start_t) * 1000)
    if not cluster_data:
        push("PATRONI-CLUSTER", "down", error_msg, ping_ms)
        return
    members = cluster_data.get("members") or []
    leader = None
    replicas = []
    for m in members:
        if m.get("role") == "leader":
            leader = m.get("name")
        else:
            replicas.append((m.get("name"), m.get("lag", 0), m.get("state")))
    if not leader:
        down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
        msg = "no leader detected | " + " ".join(down_nodes)
        push("PATRONI-CLUSTER", "down", msg, ping_ms)
    else:
        lag_strs = []
        for name, lag, state in replicas:
            # A non-numeric lag value is shown as 0 MB.
            lag_mb = lag / (1024*1024) if isinstance(lag, (int, float)) else 0
            lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
        msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
        push("PATRONI-CLUSTER", "up", msg, ping_ms)
 def check_rabbitmq_cluster():
    """Check the RabbitMQ management API and push the RABBITMQ-CLUSTER state.

    Credentials come from ``RABBITMQ_USER`` / ``RABBITMQ_PASS`` (default
    guest/guest). "down" is pushed when the node healthcheck fails or any node
    reports a memory or disk alarm; otherwise "up" with the running node count.
    Returns nothing.
    """
    url = "http://rabbitmq:15672/api/healthchecks/node"
    user = os.getenv("RABBITMQ_USER", "guest")
    password = os.getenv("RABBITMQ_PASS", "guest")
    auth = HTTPBasicAuth(user, password)
    ok, resp, ping_ms, err = http_check(url, auth=auth)
    if not ok:
        # A Response is falsy for 4xx/5xx, so compare against None to keep the code.
        if err:
            msg = err
        elif resp is not None:
            msg = f"HTTP {resp.status_code}"
        else:
            msg = "HTTP Unknown"
        push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
        return
    ok2, resp2, _, _ = http_check("http://rabbitmq:15672/api/nodes", auth=auth)
    node_list = None
    if ok2 and resp2 is not None:
        try:
            data = resp2.json()
        except ValueError:
            data = None
        if isinstance(data, list):
            node_list = [n for n in data if isinstance(n, dict)]
    if node_list is None:
        # The healthcheck passed but the node list could not be read; say so
        # instead of reporting a made-up "0/3 nodes running".
        push("RABBITMQ-CLUSTER", "up", "node healthcheck OK | node list unavailable", ping_ms)
        return
    alarms = [n.get("name") for n in node_list if n.get("mem_alarm") or n.get("disk_free_alarm")]
    if alarms:
        msg = f"disk/mem alarm active on {','.join(alarms)}"
        push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
        return
    nodes_running = len([n for n in node_list if n.get("running")])
    # NOTE(review): fewer running nodes than listed is still pushed as "up" - confirm this is intended.
    msg = f"{nodes_running}/{len(node_list)} nodes running"
    push("RABBITMQ-CLUSTER", "up", msg, ping_ms)
 def check_apisix():
    """Probe the APISIX admin API and push the APISIX-GATEWAY state.

    The ``X-API-KEY`` header is sent only when ``APISIX_ADMIN_KEY`` is set.
    Returns nothing.
    """
    admin_key = os.getenv("APISIX_ADMIN_KEY", "")
    request_headers = {}
    if admin_key:
        request_headers["X-API-KEY"] = admin_key
    reachable, response, elapsed, failure = http_check(
        "http://apisix:9180/apisix/admin/routes", headers=request_headers
    )
    if not reachable:
        # Prefer the exception text; fall back to the HTTP status code.
        push("APISIX-GATEWAY", "down", f"admin API unreachable: {failure or response.status_code}", elapsed)
        return
    push("APISIX-GATEWAY", "up", "admin API reachable", elapsed)
 def check_vault():
    """Query ``/v1/sys/health`` on every Vault node and push VAULT-CLUSTER.

    A node counts as unsealed when its health document does not report
    ``sealed``. "up" is pushed only when all nodes are unsealed; otherwise
    "down" with one entry per failing node. TLS verification is disabled for
    these requests. Returns nothing.
    """
    nodes = ["vault-1", "vault-2", "vault-3"]
    domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
    unsealed_count = 0
    total = len(nodes)
    errors = []
    start_t = time.time()
    for node in nodes:
        url = f"https://{node}.{domain}:8200/v1/sys/health"
        _, resp, _, _ = http_check(url, verify_ssl=False, expected_status=[200, 429, 473])
        # Compare against None: a Response is falsy for any status >= 400, and
        # the accepted standby codes 429/473 (and 503 from a sealed node) would
        # otherwise be misreported as "unreachable".
        if resp is None:
            errors.append(f"{node} unreachable")
            continue
        try:
            data = resp.json()
        except ValueError:
            data = None
        if not isinstance(data, dict):
            errors.append(f"{node} invalid health response (HTTP {resp.status_code})")
            continue
        if not data.get("sealed"):
            unsealed_count += 1
        else:
            errors.append(f"{node} SEALED")
    ping_ms = int((time.time() - start_t) * 1000)
    if unsealed_count == total:
        msg = f"{unsealed_count}/{total} unsealed"
        push("VAULT-CLUSTER", "up", msg, ping_ms)
    else:
        msg = " | ".join(errors) if errors else "Vault checks failed"
        push("VAULT-CLUSTER", "down", msg, ping_ms)
 def check_prometheus():
    """Probe Prometheus' ``/-/healthy`` endpoint and push the PROMETHEUS state."""
    url = "http://prometheus:9090/-/healthy"
    ok, resp, ping_ms, err = http_check(url)
    if ok:
        push("PROMETHEUS", "up", "healthy", ping_ms)
        return
    # err is None when the server answered with a status >= 400; report that status.
    reason = err or (f"HTTP {resp.status_code}" if resp is not None else "no response")
    push("PROMETHEUS", "down", f"prometheus unreachable: {reason}", ping_ms)
 def check_grafana():
    """Probe Grafana's ``/api/health`` endpoint and push the GRAFANA state.

    "up" requires a successful response whose ``database`` field is ``"ok"``.
    Returns nothing.
    """
    url = "http://grafana:3000/api/health"
    ok, resp, ping_ms, err = http_check(url)
    if not ok or resp is None:
        # err is None when the server answered with a status >= 400; report that status.
        reason = err or (f"HTTP {resp.status_code}" if resp is not None else "no response")
        push("GRAFANA", "down", f"grafana unreachable: {reason}", ping_ms)
        return
    try:
        data = resp.json()
    except ValueError:
        data = None
    # A body that is not a JSON object is treated like a missing "database" field.
    db_status = data.get("database", "unknown") if isinstance(data, dict) else "unknown"
    if db_status == "ok":
        push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms)
    else:
        push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms)
 def check_portainer():
    """Probe Portainer's ``/api/system/status`` endpoint and push the PORTAINER state."""
    url = "http://portainer:9000/api/system/status"
    ok, resp, ping_ms, err = http_check(url)
    if ok:
        push("PORTAINER", "up", "running", ping_ms)
        return
    # err is None when the server answered with a status >= 400; report that status.
    reason = err or (f"HTTP {resp.status_code}" if resp is not None else "no response")
    push("PORTAINER", "down", f"portainer unreachable: {reason}", ping_ms)
 def check_loki():
    """Probe Loki's ``/ready`` endpoint and push the LOKI state."""
    url = "http://loki:3100/ready"
    ok, resp, ping_ms, err = http_check(url)
    if ok:
        push("LOKI", "up", "ready", ping_ms)
        return
    # err is None when the server answered with a status >= 400; report that status.
    reason = err or (f"HTTP {resp.status_code}" if resp is not None else "no response")
    push("LOKI", "down", f"loki unreachable: {reason}", ping_ms)
 def run_all_http_checks():
    """Run every HTTP-based check in a fixed order.

    Each check runs in isolation: an unexpected exception in one of them is
    logged and the remaining checks still run.
    """
    checks = (
        check_patroni_cluster,
        check_rabbitmq_cluster,
        check_apisix,
        check_vault,
        check_prometheus,
        check_grafana,
        check_portainer,
        check_loki,
    )
    for check in checks:
        try:
            check()
        except Exception:
            logger.exception(f"{check.__name__} raised; continuing with the remaining checks")
--- a/health-agent/src/health_agent/checks/mongodb.py
+++ b/health-agent/src/health_agent/checks/mongodb.py
@ -0,0 +1,57 @@
 import os
 import time
 import logging
 from pymongo import MongoClient
 from health_agent.uptime_kuma import push
 logger = logging.getLogger(__name__)
 def check_mongodb():
    """Check the MongoDB deployment and push the MONGODB-REPLICASET state.

    With ``CLUSTER_SIZE_MONGODB`` equal to 1 only a ``ping`` is issued, because
    ``replSetGetStatus`` fails on a server that is not part of a replica set.
    Otherwise the replica set status is read: "down" is pushed when there is
    no PRIMARY or when any other member is neither SECONDARY nor ARBITER.
    Connection and command errors are pushed as "down". Returns nothing.
    """
    start_t = time.time()
    mongo_uri = os.getenv("MONGO_URI", "mongodb://mongodb-01:27017,mongodb-02:27017,mongodb-03:27017/?replicaSet=rs0")
    cluster_size = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
    try:
        with MongoClient(mongo_uri, serverSelectionTimeoutMS=3000) as client:
            if cluster_size == 1:
                client.admin.command('ping')
                status = None
            else:
                status = client.admin.command('replSetGetStatus')
        ping_ms = int((time.time() - start_t) * 1000)
        if status is None:
            push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
            return
        primary = None
        secondaries = []
        for m in status.get('members', []):
            state_str = m.get('stateStr', '')
            name = m.get('name', 'unknown')
            if state_str == 'PRIMARY':
                primary = name
            else:
                # Every non-primary member is listed with its state.
                secondaries.append((name, state_str))
        if primary:
            unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
            if unhealthy_secs:
                msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
                push("MONGODB-REPLICASET", "down", msg, ping_ms)
            else:
                sec_strs = [f"{s[0]} ({s[1]})" for s in secondaries]
                msg = f"PRIMARY: {primary} | secondaries: {' '.join(sec_strs)}"
                push("MONGODB-REPLICASET", "up", msg, ping_ms)
        else:
            msg = "no PRIMARY | quorum lost"
            push("MONGODB-REPLICASET", "down", msg, ping_ms)
    except Exception as e:
        ping_ms = int((time.time() - start_t) * 1000)
        push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)
--- a/health-agent/src/health_agent/checks/redis_sentinel.py
+++ b/health-agent/src/health_agent/checks/redis_sentinel.py
@ -0,0 +1,43 @@
 import os
 import time
 import logging
 from redis.sentinel import Sentinel
 from health_agent.uptime_kuma import push
 logger = logging.getLogger(__name__)
 def check_redis_sentinel():
    """Check Redis through its sentinels and push the REDIS-SENTINEL state.

    When ``REDIS_MODE`` is not ``sentinel`` the check is skipped and "up" is
    pushed. Otherwise the master named by ``REDIS_MASTER_NAME`` is resolved
    through the sentinels in ``REDIS_SENTINEL_HOSTS`` (port 26379), pinged,
    and the number of discovered replicas is reported. Any failure is pushed
    as "down". Returns nothing.
    """
    start_t = time.time()
    redis_mode = os.getenv("REDIS_MODE", "sentinel")
    if redis_mode != "sentinel":
        push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
        return
    hosts = os.getenv("REDIS_SENTINEL_HOSTS", "redis-sentinel-01,redis-sentinel-02,redis-sentinel-03")
    # Blank entries (e.g. from a trailing comma) are not valid sentinel hosts.
    sentinel_nodes = [(h.strip(), 26379) for h in hosts.split(",") if h.strip()]
    master_name = os.getenv("REDIS_MASTER_NAME", "prod-master")
    password = os.getenv("REDIS_PASSWORD", None)
    sentinel = None
    master = None
    try:
        sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password)
        # Master ping
        master = sentinel.master_for(master_name, socket_timeout=3, password=password)
        master.ping()
        master_ip, master_port = sentinel.discover_master(master_name)
        # Get replicas count
        slaves = sentinel.discover_slaves(master_name)
        replicas_count = len(slaves)
        ping_ms = int((time.time() - start_t) * 1000)
        msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
        push("REDIS-SENTINEL", "up", msg, ping_ms)
    except Exception as e:
        ping_ms = int((time.time() - start_t) * 1000)
        push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
    finally:
        # A new Sentinel is built on every run, so release all sockets it opened,
        # on the failure path as well.
        pools = []
        if master is not None:
            pools.append(master.connection_pool)
        if sentinel is not None:
            pools.extend(s.connection_pool for s in sentinel.sentinels)
        for pool in pools:
            try:
                pool.disconnect()
            except Exception as e:
                logger.debug(f"Redis connection cleanup failed: {e}")
--- a/health-agent/src/health_agent/checks/swarm.py
+++ b/health-agent/src/health_agent/checks/swarm.py
@ -0,0 +1,49 @@
 import time
 import docker
 import logging
 from health_agent.uptime_kuma import push
 logger = logging.getLogger(__name__)
 def check_swarm_cluster():
    """Inspect the Docker Swarm nodes and push the SWARM-CLUSTER state.

    "up" is pushed when every node reports the state ``ready``; otherwise
    "down" with the ready count and the number of reachable managers. Errors
    talking to the Docker API are logged and pushed as "down". Returns nothing.
    """
    start_time = time.time()
    client = None
    try:
        client = docker.from_env()
        nodes = client.nodes.list()
        ready_nodes = []
        managers = []
        for node in nodes:
            spec = node.attrs.get('Spec', {})
            status = node.attrs.get('Status', {})
            manager_status = node.attrs.get('ManagerStatus', {})
            node_name = spec.get('Name', node.id)
            if status.get('State') == 'ready':
                ready_nodes.append(node_name)
            # Only managers whose reachability is "reachable" are listed.
            if spec.get('Role') == 'manager' and manager_status.get('Reachability') == 'reachable':
                managers.append(node_name)
        total_nodes = len(nodes)
        ready_count = len(ready_nodes)
        ping_ms = int((time.time() - start_time) * 1000)
        if ready_count == total_nodes:
            msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
            push("SWARM-CLUSTER", "up", msg, ping_ms)
        else:
            msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
            push("SWARM-CLUSTER", "down", msg, ping_ms)
    except Exception as e:
        ping_ms = int((time.time() - start_time) * 1000)
        logger.error(f"Swarm check failed: {e}")
        push("SWARM-CLUSTER", "down", str(e), ping_ms)
    finally:
        # A client is created on every run; close it so its connections do not accumulate.
        if client is not None:
            try:
                client.close()
            except Exception as e:
                logger.debug(f"Docker client close failed: {e}")
--- a/health-agent/src/health_agent/checks/tcp.py
+++ b/health-agent/src/health_agent/checks/tcp.py
@ -0,0 +1,77 @@
 import socket
 import time
 import logging
 import requests
 from health_agent.uptime_kuma import push
 from health_agent.checks.http import http_check
 logger = logging.getLogger(__name__)
 def tcp_check(host, port, timeout=3):
    """Try to open a TCP connection to ``host:port``.

    Args:
        host: Host name or IPv4 address.
        port: TCP port number.
        timeout: Socket timeout in seconds.

    Returns:
        ``(ok, ping_ms, error)``: ``ok`` is true when the connection was
        established, ``ping_ms`` is the elapsed time in milliseconds and
        ``error`` is ``None`` on success or a description of the failure.
    """
    start_time = time.time()
    try:
        # The context manager closes the socket even when connect_ex raises
        # (for example on a name resolution failure).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            result = sock.connect_ex((host, port))
        ping_ms = int((time.time() - start_time) * 1000)
        if result == 0:
            return True, ping_ms, None
        return False, ping_ms, f"Port {port} is closed or unreachable"
    except Exception as e:
        ping_ms = int((time.time() - start_time) * 1000)
        return False, ping_ms, str(e)
 def check_etcd_cluster():
    """Check every etcd node and push the ETCD-CLUSTER state.

    For each node: a TCP connect to port 2379, a GET on ``/health`` and, until
    a leader has been found, a POST to ``/v3/maintenance/status`` to see
    whether that node is the leader. "up" is pushed only when all nodes are
    healthy; otherwise "down" with the collected errors. Returns nothing.
    """
    nodes = ["etcd-01", "etcd-02", "etcd-03"]
    start_t = time.time()
    healthy_count = 0
    leader = None
    errors = []
    for node in nodes:
        # 1. TCP Check on 2379
        tcp_ok, _, _ = tcp_check(node, 2379)
        if not tcp_ok:
            errors.append(f"{node} port 2379 unreachable")
            continue
        # 2. HTTP Health check
        url = f"http://{node}:2379/health"
        http_ok, resp, _, _ = http_check(url, timeout=3)
        if http_ok and resp is not None:
            try:
                data = resp.json()
            except ValueError:
                data = None
            health = data.get("health") if isinstance(data, dict) else None
            # The value is compared as the string "true"; a JSON boolean is accepted too.
            if health == "true" or health is True:
                healthy_count += 1
            else:
                errors.append(f"{node} unhealthy")
        else:
            errors.append(f"{node} health endpoint unreachable")
        # 3. Leader check from /v3/maintenance/status
        if not leader:
            status_url = f"http://{node}:2379/v3/maintenance/status"
            try:
                r = requests.post(status_url, json={}, timeout=3)
                if r.status_code == 200:
                    status_data = r.json()
                    if isinstance(status_data, dict):
                        leader_id = status_data.get("leader")
                        header_member_id = (status_data.get("header") or {}).get("member_id")
                        # The node is the leader when it reports its own member id as leader.
                        if leader_id and leader_id == header_member_id:
                            leader = node
            except Exception as e:
                # Leader detection is best-effort and must not fail the health check.
                logger.debug(f"etcd leader lookup on {node} failed: {e}")
    ping_ms = int((time.time() - start_t) * 1000)
    if healthy_count == len(nodes):
        leader_info = f" | leader: {leader}" if leader else ""
        msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
        push("ETCD-CLUSTER", "up", msg, ping_ms)
    else:
        # At least one node is unhealthy in this branch, so the note always applies.
        quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})"
        msg = " | ".join(errors) + quorum_msg
        push("ETCD-CLUSTER", "down", msg, ping_ms)
--- a/health-agent/src/health_agent/checks/tls.py
+++ b/health-agent/src/health_agent/checks/tls.py
@ -0,0 +1,62 @@
 import os
 import time
 import logging
 import requests
 from datetime import datetime, timezone
 from health_agent.uptime_kuma import push
 from cryptography import x509
 from cryptography.hazmat.backends import default_backend
 logger = logging.getLogger(__name__)
 def check_swag_tls():
    """Check the wildcard certificate file and external HTTPS, then push SWAG-TLS.

    The certificate is read from ``ssl/STAR.iklim.co.full.crt`` under
    ``STORAGEBOX_PATH`` (default ``/mnt/storagebox``). "down" is pushed when
    the file is missing or unreadable, when the certificate has expired or
    expires in fewer than 14 days, or when the API health URL is unreachable
    or answers with a 5xx status. Returns nothing.
    """
    start_t = time.time()
    # Same base directory setting as the storage box mount check.
    storagebox_path = os.getenv("STORAGEBOX_PATH", "/mnt/storagebox")
    cert_path = os.path.join(storagebox_path, "ssl/STAR.iklim.co.full.crt")
    domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
    suffix = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
    target_url = f"https://api{suffix}.{domain}/actuator/health"
    msg_parts = []
    is_down = False
    # 1. Check cert file
    if not os.path.exists(cert_path):
        is_down = True
        msg_parts.append("cert file missing on storagebox")
    else:
        try:
            with open(cert_path, "rb") as f:
                cert_data = f.read()
            cert = x509.load_pem_x509_certificate(cert_data, default_backend())
            not_valid_after = cert.not_valid_after_utc
            now = datetime.now(timezone.utc)
            days_left = (not_valid_after - now).days
            if not_valid_after <= now:
                # Avoid a confusing "expires in -N days" message.
                is_down = True
                msg_parts.append(f"cert expired on {not_valid_after.strftime('%Y-%m-%d')}")
            elif days_left < 14:
                is_down = True
                msg_parts.append(f"cert expires in {days_left} days")
            else:
                msg_parts.append(f"cert valid until {not_valid_after.strftime('%Y-%m-%d')} ({days_left} days)")
        except Exception as e:
            is_down = True
            msg_parts.append(f"cert parse error: {e}")
    # 2. Check external HTTPS reachable
    # NOTE(review): verify=False means an invalid served certificate is not detected here - confirm this is intended.
    try:
        r = requests.get(target_url, timeout=5, verify=False)
        if r.status_code < 500:
            msg_parts.append("HTTPS reachable")
        else:
            is_down = True
            msg_parts.append(f"HTTPS returned {r.status_code}")
    except Exception as e:
        is_down = True
        # Keep the pushed message short; the cause goes to the log.
        logger.warning(f"HTTPS check for {target_url} failed: {e}")
        msg_parts.append("HTTPS unreachable")
    ping_ms = int((time.time() - start_t) * 1000)
    msg = " | ".join(msg_parts)
    if is_down:
        push("SWAG-TLS", "down", msg, ping_ms)
    else:
        push("SWAG-TLS", "up", msg, ping_ms)
--- a/health-agent/src/health_agent/config.py
+++ b/health-agent/src/health_agent/config.py
@ -0,0 +1,25 @@
 import os
 import yaml
 from pathlib import Path
 from dotenv import load_dotenv
 # Load variables from a .env file (if any) before the settings below are read.
 load_dotenv()
 # Deployment environment name; defaults to "prod" when ENV is unset.
 ENV = os.getenv("ENV", "prod")
 # Expected member counts per cluster. int() raises ValueError at import time
 # when a variable holds a non-numeric value.
 CLUSTER_SIZE_ETCD = int(os.getenv("CLUSTER_SIZE_ETCD", "3"))
 CLUSTER_SIZE_PATRONI = int(os.getenv("CLUSTER_SIZE_PATRONI", "3"))
 CLUSTER_SIZE_MONGODB = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
 CLUSTER_SIZE_RABBITMQ = int(os.getenv("CLUSTER_SIZE_RABBITMQ", "3"))
 CLUSTER_SIZE_VAULT = int(os.getenv("CLUSTER_SIZE_VAULT", "3"))
 # Redis deployment mode; defaults to "sentinel".
 REDIS_MODE = os.getenv("REDIS_MODE", "sentinel")
 # External domain and optional subdomain suffix.
 EXTERNAL_DOMAIN = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
 EXTERNAL_SUBDOMAIN_SUFFIX = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
 def load_uk_tokens():
    """Load the Uptime Kuma push tokens from the generated token file.

    Paths are relative to the current working directory.
    ``config/generated/uk_tokens.yml`` is preferred; a token file placed
    directly under ``config/`` is accepted as a fallback.

    Returns:
        The mapping stored in the first file found, or an empty dict when no
        file exists, the file is empty, or it does not contain a mapping.
    """
    candidates = (
        Path("config/generated/uk_tokens.yml"),
        Path("config/uk_tokens.yml"),
    )
    for token_file in candidates:
        if not token_file.exists():
            continue
        with open(token_file, "r") as f:
            tokens = yaml.safe_load(f) or {}
        # Anything but a mapping cannot be used as a token table.
        return tokens if isinstance(tokens, dict) else {}
    return {}
 UK_TOKENS = load_uk_tokens()
--- a/health-agent/src/health_agent/events/docker_events.py
+++ b/health-agent/src/health_agent/events/docker_events.py
@ -0,0 +1,56 @@
 import os
 import docker
 import threading
 import logging
 import time
 from health_agent.slack import notify
 logger = logging.getLogger(__name__)
 def parse_and_notify(event):
    """Send a Slack alert for a container "die" event with a non-zero exit code.

    The container name and exit code are read from
    ``event['Actor']['Attributes']``. Events whose exit code is ``'0'`` (or
    absent) are ignored. Exit code ``'137'`` is reported with "High" priority
    and an OOM/SIGKILL reason line; every other code is "Medium".

    The webhook is selected by environment-variable name, built from ``ENV``
    (upper-cased, "test" when unset).
    """
    attributes = event.get('Actor', {}).get('Attributes', {})
    code = attributes.get('exitCode', '0')
    if code == '0':
        # Clean exit: nothing to report.
        return
    name = attributes.get('name', 'unknown')
    detail = f"Container: {name}\nExit Code: {code}"
    if code == '137':
        priority = "High"
        detail += "\nReason: OOM Killed (Out Of Memory) or SIGKILL"
    else:
        priority = "Medium"
    env = os.getenv("ENV", "test").upper()
    notify(
        webhook_env=f"SLACK_WEBHOOK_IKLIM_{env}_OPS",
        source="health-agent-events",
        priority=priority,
        title=f"[Health Agent / Events] Container Crashed ({name})",
        detail=detail,
    )
 def event_listener_loop():
    """Consume Docker container "die" events forever and alert on each one.

    Connects to the Docker daemon, streams events filtered to container
    "die", and passes each to ``parse_and_notify``. A failure while handling
    one event is logged and does not stop the stream. When the stream fails
    or simply ends, the client is closed and a new connection is made after
    10 seconds. This function never returns.
    """
    filters = {"type": "container", "event": "die"}
    while True:
        client = None
        try:
            client = docker.from_env()
            logger.info("Starting Docker event listener...")
            for event in client.events(decode=True, filters=filters):
                try:
                    parse_and_notify(event)
                except Exception as e:
                    logger.error(f"Error parsing event: {e}", exc_info=True)
            # The generator finished without raising (stream closed by the
            # daemon); fall through to the shared delay instead of spinning.
            logger.warning("Docker event stream ended. Reconnecting in 10s...")
        except Exception as e:
            logger.error(f"Docker event listener error: {e}. Reconnecting in 10s...", exc_info=True)
        finally:
            # Release the client's connections before building a new client,
            # otherwise every reconnect leaks one.
            if client is not None:
                try:
                    client.close()
                except Exception as close_error:
                    logger.debug(f"Error closing Docker client: {close_error}")
        time.sleep(10)
 def start_docker_event_listener():
    """Start ``event_listener_loop`` on a daemon thread and return that thread.

    Being a daemon thread, it does not keep the process alive once the main
    thread exits.
    """
    listener = threading.Thread(target=event_listener_loop)
    listener.daemon = True
    listener.start()
    return listener
--- a/health-agent/src/health_agent/main.py
+++ b/health-agent/src/health_agent/main.py
@ -0,0 +1,75 @@
 import time
 import logging
 from health_agent.checks import swarm
 from health_agent.checks.http import run_all_http_checks
 from health_agent.checks.tcp import check_etcd_cluster
 from health_agent.checks.tls import check_swag_tls
 from health_agent.checks.redis_sentinel import check_redis_sentinel
 from health_agent.checks.mongodb import check_mongodb
 from health_agent.checks.filesystem import check_storagebox_mount
 from health_agent.events.docker_events import start_docker_event_listener
 import json
 class JSONFormatter(logging.Formatter):
    """Render each log record as a single-line JSON object.

    Every record carries ``time``, ``level``, ``logger`` and ``msg``. The
    optional attributes listed in ``EXTRA_FIELDS`` (set through ``extra=``)
    are copied when present, and a formatted traceback is added under
    ``exc_info`` when the record has exception information.
    """
    # Optional record attributes that are copied into the JSON output.
    EXTRA_FIELDS = ('check', 'status', 'ping_ms', 'source', 'error')
    def format(self, record):
        """Return ``record`` serialised as a JSON string."""
        log_obj = {
            "time": self.formatTime(record, self.datefmt),
            "level": record.levelname,
            "logger": record.name,
            "msg": record.getMessage()
        }
        for attr in self.EXTRA_FIELDS:
            if hasattr(record, attr):
                log_obj[attr] = getattr(record, attr)
        if record.exc_info:
            log_obj['exc_info'] = self.formatException(record.exc_info)
        # default=str: an extra that is not JSON-serialisable (for example an
        # exception object passed as ``error``) is stringified rather than
        # making the formatter raise and the record be lost.
        return json.dumps(log_obj, default=str)
 # Root logging setup: a single stream handler (created with no explicit
 # stream) that formats every record with JSONFormatter, at INFO level.
 handler = logging.StreamHandler()
 handler.setFormatter(JSONFormatter())
 logging.basicConfig(level=logging.INFO, handlers=[handler])
 # Logger used by this module; named "main" rather than __name__.
 logger = logging.getLogger("main")
 def run_checks():
    """Run every health check once, isolating failures from each other.

    Checks run in a fixed order: Swarm, HTTP, etcd, TLS, Redis, MongoDB,
    filesystem. An exception raised by one check is logged with its
    traceback and the remaining checks still run.
    """
    logger.info("Running health checks...")
    # Each entry pairs the log prefix used on failure with the check to run.
    # The lambdas defer name/attribute lookup until inside the try below, so
    # a lookup failure is contained like any other check error.
    checks = (
        ("Error checking Swarm cluster", lambda: swarm.check_swarm_cluster()),
        ("Error running HTTP checks", lambda: run_all_http_checks()),
        ("Error running etcd checks", lambda: check_etcd_cluster()),
        ("Error running TLS checks", lambda: check_swag_tls()),
        ("Error running Redis checks", lambda: check_redis_sentinel()),
        ("Error running MongoDB checks", lambda: check_mongodb()),
        ("Error running filesystem checks", lambda: check_storagebox_mount()),
    )
    for failure_prefix, check in checks:
        try:
            check()
        except Exception as e:
            # exc_info keeps the traceback, which the JSON formatter emits
            # under "exc_info"; the message alone rarely locates the fault.
            logger.error(f"{failure_prefix}: {e}", exc_info=True)
 # Script entry point: start the Docker event listener, then run the full
 # check suite in an endless loop.
 if __name__ == "__main__":
    logger.info("Starting health-agent...")
    # Started before the blocking loop below so events are watched while
    # checks run.
    start_docker_event_listener()
    while True:
        run_checks()
        # Fixed 60-second pause after each pass, so the real period is
        # 60 seconds plus however long the checks took.
        time.sleep(60)
--- a/health-agent/src/health_agent/slack.py
+++ b/health-agent/src/health_agent/slack.py
@ -0,0 +1,22 @@
 import os
 import requests
 import logging
 logger = logging.getLogger(__name__)
 def notify(webhook_env: str, source: str, priority: str, title: str, detail: str):
    """Post a formatted alert to the Slack webhook named by ``webhook_env``.

    Args:
        webhook_env: name of the environment variable holding the webhook URL.
        source: value shown on the "Source" line.
        priority: value shown on the "Priority" line.
        title: bold headline of the message.
        detail: body text, placed inside a code fence.

    Nothing is raised: a missing webhook is logged as a warning, and a failed
    delivery is logged as an error.
    """
    url = os.environ.get(webhook_env)
    if url:
        body = {
            "text": f"*{title}*\n*Source:* {source}\n*Priority:* {priority}\n```\n{detail}\n```"
        }
        try:
            # A non-2xx reply is turned into an exception and handled below.
            requests.post(url, json=body, timeout=5).raise_for_status()
            logger.info(f"Sent Slack notification to {webhook_env}")
        except Exception as err:
            logger.error(f"Failed to send Slack notification: {err}")
    else:
        logger.warning(f"Slack webhook url not found for {webhook_env}")
--- a/health-agent/src/health_agent/state.py
+++ b/health-agent/src/health_agent/state.py
@ -0,0 +1,19 @@
 import json
 import os
 from pathlib import Path
 # Location of the persisted state, relative to the current working directory.
 STATE_FILE = Path("config/generated/state.json")
 def load_state():
    """Return the persisted state as a dict.

    Loading is best-effort: a missing file, an unreadable file, invalid
    JSON, or valid JSON whose top level is not an object (for example
    ``null`` or a list) all yield an empty dict.
    """
    if not STATE_FILE.exists():
        return {}
    try:
        with open(STATE_FILE, "r") as f:
            state = json.load(f)
    except Exception:
        # Deliberately tolerant: a corrupt state file means starting from
        # empty state rather than failing.
        return {}
    # Every fallback above is a dict, so hold parsed content to the same
    # contract instead of handing callers None or a list.
    return state if isinstance(state, dict) else {}
 def save_state(state):
    """Persist ``state`` as JSON to ``STATE_FILE``, replacing it atomically.

    The parent directory is created when missing. The JSON is written to a
    sibling ``.tmp`` file and then moved over ``STATE_FILE``, so a failure
    during serialisation or writing leaves the previous state file intact
    instead of truncated.

    Raises:
        TypeError: if ``state`` is not JSON-serialisable.
        OSError: if the directory or file cannot be written.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_file = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    try:
        with open(tmp_file, "w") as f:
            json.dump(state, f)
        os.replace(tmp_file, STATE_FILE)
    except BaseException:
        # Do not leave a partial temp file behind; the original error is
        # what the caller needs to see.
        try:
            os.unlink(tmp_file)
        except OSError:
            pass
        raise
--- a/health-agent/src/health_agent/uptime_kuma.py
+++ b/health-agent/src/health_agent/uptime_kuma.py
@ -0,0 +1,27 @@
 import os
 import requests
 import logging
 from health_agent.config import UK_TOKENS
 logger = logging.getLogger(__name__)
 # Base URL of the push endpoint; the per-monitor token is appended to it.
 UK_PUSH_URL_BASE = os.getenv("UK_PUSH_URL_BASE", "https://status.iklim.co/api/push")
 def push(monitor_name: str, status: str, msg: str, ping_ms: int):
    """Report a check result to the push monitor registered as ``monitor_name``.

    Args:
        monitor_name: key used to look up the push token in ``UK_TOKENS``.
        status: status string sent as the ``status`` query parameter.
        msg: text sent as the ``msg`` query parameter.
        ping_ms: check duration, sent as the integer ``ping`` parameter.

    Nothing is raised for delivery problems: a missing token is logged as a
    warning and a failed request is logged as an error.
    """
    token = UK_TOKENS.get(monitor_name)
    if not token:
        logger.warning(f"No token found for monitor {monitor_name}")
        return
    url = f"{UK_PUSH_URL_BASE}/{token}"
    params = {
        "status": status,
        "msg": msg,
        "ping": int(ping_ms)
    }
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
    except Exception as e:
        logger.error(f"Failed to push {monitor_name}: {e}", extra={"check": monitor_name, "status": "push_failed", "error": str(e), "source": "uptime_kuma"})
        return
    # The extra key must not be "msg": that name is a LogRecord attribute and
    # logging raises KeyError on the clash, which previously turned every
    # successful push into a logged "Failed to push" error.
    logger.info(f"Pushed {monitor_name} status={status}", extra={"check": monitor_name, "status": status, "status_msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"})