diff --git a/health-agent/.env.example b/health-agent/.env.example new file mode 100644 index 0000000..7855246 --- /dev/null +++ b/health-agent/.env.example @@ -0,0 +1,10 @@ +ENV=prod +CLUSTER_SIZE_ETCD=3 +CLUSTER_SIZE_PATRONI=3 +CLUSTER_SIZE_MONGODB=3 +CLUSTER_SIZE_RABBITMQ=3 +CLUSTER_SIZE_VAULT=3 +REDIS_MODE=sentinel +EXTERNAL_DOMAIN=iklim.co +EXTERNAL_SUBDOMAIN_SUFFIX= +UK_PUSH_URL_BASE=https://status.iklim.co/api/push diff --git a/health-agent/.env.setup.example b/health-agent/.env.setup.example new file mode 100644 index 0000000..19ec7cd --- /dev/null +++ b/health-agent/.env.setup.example @@ -0,0 +1,5 @@ +UK_URL=http://uptime-kuma:3001 +UK_API_KEY=your_api_key_here +UK_SLACK_WEBHOOK_HIGH=https://hooks.slack.com/services/... +UK_SLACK_WEBHOOK_MEDIUM=https://hooks.slack.com/services/... +UK_SLACK_WEBHOOK_LOW=https://hooks.slack.com/services/... diff --git a/health-agent/Dockerfile b/health-agent/Dockerfile new file mode 100644 index 0000000..14b6927 --- /dev/null +++ b/health-agent/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/* + +COPY pyproject.toml ./ +COPY src/ ./src/ +RUN pip install --no-cache-dir . +ENV PYTHONPATH=/app/src + +RUN useradd -m appuser +# Keeping as root to be able to access /var/run/docker.sock cleanly, unless specifically configured with groups. 
+# USER appuser + +CMD ["python", "src/health_agent/main.py"] diff --git a/health-agent/config/monitors.yml b/health-agent/config/monitors.yml new file mode 100644 index 0000000..dbc724e --- /dev/null +++ b/health-agent/config/monitors.yml @@ -0,0 +1,196 @@ +version: "1" +project: "iklim" +domain: + base: "iklim.co" +nodes: + prod: + service: + - name: iklim-app-01 + ip: "178.104.210.41" + - name: iklim-app-02 + ip: "178.105.69.1" + - name: iklim-app-03 + ip: "178.104.219.3" + db: + - name: iklim-db-01 + ip: "159.69.117.158" + - name: iklim-db-02 + ip: "178.104.219.162" + - name: iklim-db-03 + ip: "159.69.115.105" + test: + service: + - name: iklim-app-01 + ip: "167.235.194.61" + db: + - name: iklim-db-01 + ip: "167.235.205.93" +tags: + - external + - internal + - high + - medium + - low + - database + - gateway + - infrastructure + - observability +notifications: + slack-high: + type: slack + webhook_env: UK_SLACK_WEBHOOK_HIGH + slack-medium: + type: slack + webhook_env: UK_SLACK_WEBHOOK_MEDIUM + slack-low: + type: slack + webhook_env: UK_SLACK_WEBHOOK_LOW +groups: + - name: "Altyapı" + status_page: "iklim-{env}-ops" + notifications: [slack-high] + tags: [internal, infrastructure] + children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS] + - name: "Veri Katmanı" + status_page: "iklim-{env}-ops" + notifications: [slack-high] + tags: [internal, database] + children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET] + - name: "Gateway & Mesajlaşma" + status_page: "iklim-{env}-ops" + notifications: [slack-high] + tags: [internal, gateway] + children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL] + - name: "Dış Erişilebilirlik - Kritik" + status_page: "iklim-{env}-ops" + notifications: [slack-high] + tags: [external, high] + children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03] + - name: "Dış Erişilebilirlik - Genel" + status_page: "iklim-{env}-ops" + notifications: [slack-medium] + tags: [external, 
medium] + children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03] + - name: "Gözlemlenebilirlik" + status_page: "iklim-{env}-tools" + notifications: [slack-low] + tags: [internal, observability] + children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW] +push_monitors: + - name: SWARM-CLUSTER + interval: 60 + heartbeat_retries: 1 + tags: [internal, infrastructure, high] + restart_threshold: 1 + - name: VAULT-CLUSTER + interval: 60 + heartbeat_retries: 1 + tags: [internal, infrastructure, high] + restart_threshold: 1 + - name: ETCD-CLUSTER + interval: 60 + heartbeat_retries: 1 + tags: [internal, database, high] + restart_threshold: 1 + - name: PATRONI-CLUSTER + interval: 60 + heartbeat_retries: 1 + tags: [internal, database, high] + restart_threshold: 1 + - name: MONGODB-REPLICASET + interval: 120 + heartbeat_retries: 1 + tags: [internal, database, high] + restart_threshold: 1 + - name: APISIX-GATEWAY + interval: 60 + heartbeat_retries: 1 + tags: [internal, gateway, high] + restart_threshold: 1 + - name: RABBITMQ-CLUSTER + interval: 60 + heartbeat_retries: 1 + tags: [internal, gateway, medium] + restart_threshold: 3 + - name: REDIS-SENTINEL + interval: 60 + heartbeat_retries: 1 + tags: [internal, database, medium] + restart_threshold: 3 + - name: SWAG-TLS + interval: 3600 + heartbeat_retries: 1 + tags: [internal, infrastructure, medium] + restart_threshold: 3 + - name: STORAGEBOX-MOUNT + interval: 300 + heartbeat_retries: 1 + tags: [internal, infrastructure, medium] + restart_threshold: 1 + - name: PROMETHEUS + interval: 120 + heartbeat_retries: 1 + tags: [internal, observability, low] + restart_threshold: 5 + - name: GRAFANA + interval: 120 + heartbeat_retries: 1 + tags: [internal, observability, low] + restart_threshold: 5 + - name: PORTAINER + interval: 120 + heartbeat_retries: 1 + tags: [internal, observability, low] + restart_threshold: 5 + - name: LOKI + interval: 120 + heartbeat_retries: 1 + tags: [internal, 
observability, low] + restart_threshold: 5 +http_monitors: + - name: EXT-HTTPS-API + url: "https://api{suffix}.{domain}/actuator/health" + accepted_statuscodes: ["200"] + interval: 60 + - name: EXT-HTTPS-GRAFANA + url: "https://grafana{suffix}.{domain}/api/health" + accepted_statuscodes: ["200"] + interval: 60 + - name: EXT-HTTPS-PORTAINER + url: "https://portainer{suffix}.{domain}" + accepted_statuscodes: ["200", "401", "403"] + interval: 120 + - name: EXT-HTTPS-APIGW + url: "https://apigw{suffix}.{domain}" + accepted_statuscodes: ["200", "401", "403"] + interval: 120 +dns_monitors: + - name: EXT-DNS-API + hostname: "api{suffix}.{domain}" + dns_resolve_type: A + interval: 60 + - name: EXT-DNS-ROOT + hostname: "{domain}" + dns_resolve_type: A + interval: 60 +ping_monitors: + interval: 60 + max_retries: 1 +status_pages: + - slug: "iklim-{env}-status" + title: "iklim.co API Durumu" + public: true + groups: ["Dış Erişilebilirlik - Kritik"] + - slug: "iklim-{env}-ops" + title: "iklim.co [{env}] Altyapı" + public: false + groups: + - "Altyapı" + - "Veri Katmanı" + - "Gateway & Mesajlaşma" + - "Dış Erişilebilirlik - Kritik" + - "Dış Erişilebilirlik - Genel" + - slug: "iklim-{env}-tools" + title: "iklim.co [{env}] Araçlar" + public: false + groups: ["Gözlemlenebilirlik"] diff --git a/health-agent/pyproject.toml b/health-agent/pyproject.toml new file mode 100644 index 0000000..a305a2b --- /dev/null +++ b/health-agent/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "health-agent" +version = "0.1.0" +description = "iklim.co Monitoring Health Agent" +requires-python = ">=3.12" +dependencies = [ + "requests", + "docker", + "python-dotenv", + "pyyaml", + "redis", + "pymongo", + "uptime-kuma-api", + "cryptography", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/health_agent"] diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py new file mode 
def format_str(text, env_name, project):
    """Substitute the ``{env}`` and ``{project}`` placeholders in *text*.

    Non-string values are returned unchanged, so optional config fields can
    be passed straight through.
    """
    if not isinstance(text, str):
        return text
    return text.replace("{env}", env_name).replace("{project}", project)


def setup_uptime_kuma(dry_run=False, only=None):
    """Create the groups, push monitors and status pages from monitors.yml.

    Reads ``ENV``, ``UK_URL``, ``UK_USER`` and ``UK_PASS`` from the
    environment and ``config/monitors.yml`` relative to this script.

    Args:
        dry_run: When true, nothing is sent to Uptime Kuma and no file is
            written; the planned actions are only logged.
        only: Optional push-monitor name; when given, every other push
            monitor is skipped.

    Side effects (non dry-run): writes the collected push tokens to
    ``config/generated/uk_tokens.yml``. That is the relative path the agent's
    ``load_uk_tokens`` reads (assuming the agent runs from the health-agent
    directory - TODO confirm the deployment working directory).
    """
    env_name = os.getenv("ENV", "test")

    config_dir = os.path.join(os.path.dirname(__file__), "..", "config")
    config_path = os.path.join(config_dir, "monitors.yml")
    # The config contains non-ASCII group names, so do not depend on the
    # locale's default encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}

    project = config.get("project", "iklim")
    groups = config.get("groups", [])

    kuma_url = os.getenv("UK_URL", "http://localhost:3001")
    kuma_user = os.getenv("UK_USER", "admin")
    kuma_pass = os.getenv("UK_PASS", "admin")

    api = None
    try:
        existing_monitors = {}
        if not dry_run:
            logger.info(f"Connecting to Uptime Kuma at {kuma_url}...")
            try:
                api = UptimeKumaApi(kuma_url)
                api.login(kuma_user, kuma_pass)
            except Exception as e:
                logger.error(f"Login failed: {e}")
                return

            try:
                existing_monitors = {m['name']: m for m in api.get_monitors()}
            except Exception as e:
                # Without the current monitor list every monitor would look
                # new and be created a second time, so stop here.
                logger.error(f"Failed to get monitors: {e}")
                return

        # 1. Process Groups
        group_map = {}
        for g in groups:
            raw_name = g["name"]
            formatted_name = f"{project} [{env_name}] {raw_name}"

            logger.info(f"Processing group: {formatted_name}")
            if dry_run:
                continue
            if formatted_name in existing_monitors:
                group_map[raw_name] = existing_monitors[formatted_name]['id']
            else:
                logger.info(f"Creating group monitor: {formatted_name}")
                res = api.add_monitor(type=MonitorType.GROUP, name=formatted_name)
                group_map[raw_name] = res['monitorID']

        tokens = {}

        # 2. Push Monitors
        for pm in config.get("push_monitors", []):
            m_name = pm["name"]
            if only and m_name != only:
                continue

            m_interval = pm.get("interval", 60)

            # The first group listing this monitor as a child becomes its parent.
            parent_group_id = None
            for g in groups:
                if m_name in g.get("children", []):
                    parent_group_id = group_map.get(g["name"])
                    break

            logger.info(f"Processing push monitor: {m_name}")
            if dry_run:
                tokens[m_name] = "dummy_token_dry_run"
                continue

            current = existing_monitors.get(m_name)
            if current is not None:
                logger.info(f"Monitor {m_name} already exists.")
                tokens[m_name] = current['pushToken']
                if parent_group_id and current.get('parent') != parent_group_id:
                    api.edit_monitor(current['id'], parent=parent_group_id)
                continue

            logger.info(f"Creating push monitor: {m_name}")
            result = api.add_monitor(
                type=MonitorType.PUSH,
                name=m_name,
                interval=m_interval,
                parent=parent_group_id,
            )
            m_id = result['monitorID']

            # Fetch again to get pushToken
            created = next((m for m in api.get_monitors() if m['id'] == m_id), None)
            if created is None:
                logger.warning(f"Could not read push token for {m_name}")
            else:
                tokens[m_name] = created['pushToken']

        # 3. Process Status Pages
        for sp in config.get("status_pages", []):
            slug = format_str(sp["slug"], env_name, project)
            title = format_str(sp["title"], env_name, project)
            logger.info(f"Processing status page: {title} (slug: {slug})")
            if dry_run:
                continue
            try:
                pages = api.get_status_pages()
                if not any(p['slug'] == slug for p in pages):
                    logger.info(f"Creating status page: {slug}")
                    api.add_status_page(slug, title)
            except Exception as e:
                logger.warning(f"Status page ops failed: {e}")

        # 4. Write tokens to config/generated/uk_tokens.yml
        token_dir = os.path.join(config_dir, "generated")
        token_file = os.path.join(token_dir, "uk_tokens.yml")
        if dry_run:
            logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}")
            return

        os.makedirs(token_dir, exist_ok=True)
        merged = {}
        if only and os.path.exists(token_file):
            # A --only run touches a single monitor; keep the tokens already
            # stored for all the others instead of wiping them.
            with open(token_file, "r", encoding="utf-8") as f:
                merged = yaml.safe_load(f) or {}
        merged.update(tokens)
        with open(token_file, "w", encoding="utf-8") as f:
            yaml.dump(merged, f)
        logger.info(f"Saved push tokens to {token_file}")
    finally:
        # Always release the connection, including on login failure or an
        # exception halfway through the run.
        if api is not None:
            api.disconnect()
def status_matches(status_code, expected_status=None):
    """Return True when *status_code* satisfies *expected_status*.

    *expected_status* may be a single code, a collection of codes
    (list, tuple, set or frozenset), or empty/None, in which case any
    status below 400 is accepted.
    """
    if not expected_status:
        return status_code < 400
    if isinstance(expected_status, (list, tuple, set, frozenset)):
        return status_code in expected_status
    return status_code == expected_status


def http_check(url, expected_status=None, auth=None, verify_ssl=True, timeout=5, headers=None):
    """Issue a GET request and classify the result.

    Returns:
        A tuple ``(is_ok, response, ping_ms, error)``:
        * ``is_ok`` - whether the status code matched ``expected_status``
          (see ``status_matches``).
        * ``response`` - the ``requests`` response, or ``None`` when the
          request itself failed. A ``requests`` response is falsy for
          4xx/5xx, so callers must test ``response is not None`` rather
          than its truthiness.
        * ``ping_ms`` - elapsed time in whole milliseconds.
        * ``error`` - ``None`` on a completed request, otherwise the
          exception text.

    This function never raises: any failure is reported through the tuple.
    """
    start_time = time.time()
    try:
        resp = requests.get(url, auth=auth, verify=verify_ssl, timeout=timeout, headers=headers)
    except Exception as e:
        # Deliberately broad: a health probe must report failure, not crash.
        ping_ms = int((time.time() - start_time) * 1000)
        return False, None, ping_ms, str(e)

    ping_ms = int((time.time() - start_time) * 1000)
    return status_matches(resp.status_code, expected_status), resp, ping_ms, None
def check_vault():
    """Probe every Vault node's health endpoint and push VAULT-CLUSTER.

    The cluster is reported "up" only when all nodes answer and report
    ``sealed: false``. Nodes are queried at
    ``https://<node>.<EXTERNAL_DOMAIN>:8200/v1/sys/health`` with certificate
    verification disabled.
    """
    nodes = ["vault-1", "vault-2", "vault-3"]
    domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
    unsealed_count = 0
    total = len(nodes)
    errors = []

    start_t = time.time()
    for node in nodes:
        url = f"https://{node}.{domain}:8200/v1/sys/health"
        _, resp, _, _ = http_check(url, verify_ssl=False, expected_status=[200, 429, 473])

        # A requests response is falsy for 4xx/5xx. Vault answers standby
        # and sealed nodes with such codes, so test for None explicitly or
        # those nodes are misreported as unreachable.
        if resp is None:
            errors.append(f"{node} unreachable")
            continue

        try:
            data = resp.json()
        except ValueError:
            errors.append(f"{node} invalid health response (HTTP {resp.status_code})")
            continue

        sealed = data.get("sealed") if isinstance(data, dict) else None
        if sealed is False:
            unsealed_count += 1
        elif sealed:
            errors.append(f"{node} SEALED")
        else:
            errors.append(f"{node} unexpected health payload (HTTP {resp.status_code})")

    ping_ms = int((time.time() - start_t) * 1000)

    if unsealed_count == total:
        msg = f"{unsealed_count}/{total} unsealed"
        push("VAULT-CLUSTER", "up", msg, ping_ms)
    else:
        msg = " | ".join(errors) if errors else "Vault checks failed"
        push("VAULT-CLUSTER", "down", msg, ping_ms)
def check_mongodb():
    """Query the replica-set status and push MONGODB-REPLICASET.

    Uses ``MONGO_URI`` and ``CLUSTER_SIZE_MONGODB`` from the environment.
    Pushes "up" when a PRIMARY exists and every other member is SECONDARY
    or ARBITER (or when the configured cluster size is 1); otherwise "down".
    Any exception is reported as a failed connection.
    """
    began = time.time()

    uri = os.getenv("MONGO_URI", "mongodb://mongodb-01:27017,mongodb-02:27017,mongodb-03:27017/?replicaSet=rs0")
    expected_size = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))

    try:
        with MongoClient(uri, serverSelectionTimeoutMS=3000) as client:
            repl_status = client.admin.command('replSetGetStatus')

        # Split members into the primary and everything else.
        primary = None
        others = []
        for member in repl_status.get('members', []):
            state = member.get('stateStr', '')
            host = member.get('name', 'unknown')
            if state == 'PRIMARY':
                primary = host
            else:
                others.append((host, state))

        ping_ms = int((time.time() - began) * 1000)

        if expected_size == 1:
            push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
            return

        if not primary:
            push("MONGODB-REPLICASET", "down", "no PRIMARY | quorum lost", ping_ms)
            return

        degraded = [pair for pair in others if pair[1] not in ('SECONDARY', 'ARBITER')]
        if degraded:
            detail = ','.join([host + ':' + state for host, state in degraded])
            push("MONGODB-REPLICASET", "down", f"PRIMARY: {primary} | unhealthy: {detail}", ping_ms)
        else:
            listing = ' '.join(f"{host} ({state})" for host, state in others)
            push("MONGODB-REPLICASET", "up", f"PRIMARY: {primary} | secondaries: {listing}", ping_ms)

    except Exception as e:
        ping_ms = int((time.time() - began) * 1000)
        push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)
def check_redis_sentinel():
    """Ping the Redis master through Sentinel and push REDIS-SENTINEL.

    Environment: ``REDIS_SENTINEL_HOSTS`` (comma separated, port 26379 is
    used for each), ``REDIS_MASTER_NAME``, ``REDIS_PASSWORD`` and
    ``REDIS_MODE``. When the mode is not "sentinel" the check is skipped
    and reported as "up".
    """
    began = time.time()

    if os.getenv("REDIS_MODE", "sentinel") != "sentinel":
        push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - began) * 1000))
        return

    raw_hosts = os.getenv("REDIS_SENTINEL_HOSTS", "redis-sentinel-01,redis-sentinel-02,redis-sentinel-03")
    endpoints = [(item.strip(), 26379) for item in raw_hosts.split(",")]
    master_name = os.getenv("REDIS_MASTER_NAME", "prod-master")
    password = os.getenv("REDIS_PASSWORD", None)

    try:
        sentinel = Sentinel(endpoints, socket_timeout=3, password=password)

        # Master ping
        master_conn = sentinel.master_for(master_name, socket_timeout=3, password=password)
        master_conn.ping()
        master_ip, master_port = sentinel.discover_master(master_name)
        master_conn.connection_pool.disconnect()

        # Get replicas count
        replicas_count = len(sentinel.discover_slaves(master_name))

        ping_ms = int((time.time() - began) * 1000)
        push(
            "REDIS-SENTINEL",
            "up",
            f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK",
            ping_ms,
        )
    except Exception as e:
        ping_ms = int((time.time() - began) * 1000)
        push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
def tcp_check(host, port, timeout=3):
    """Try to open a TCP connection to ``host:port``.

    Returns:
        ``(ok, ping_ms, error)`` where ``error`` is ``None`` on success and
        a short description otherwise. Never raises.
    """
    start_time = time.time()
    try:
        # The context manager closes the socket even when connect_ex raises
        # (for example on a name-resolution failure).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            result = sock.connect_ex((host, port))
    except Exception as e:
        ping_ms = int((time.time() - start_time) * 1000)
        return False, ping_ms, str(e)

    ping_ms = int((time.time() - start_time) * 1000)
    if result == 0:
        return True, ping_ms, None
    return False, ping_ms, f"Port {port} is closed or unreachable"


def check_etcd_cluster():
    """Check every etcd node on port 2379 and push ETCD-CLUSTER.

    For each node: TCP reachability, then ``GET /health``, then a
    best-effort leader probe via ``POST /v3/maintenance/status``. The
    cluster is "up" only when every node reports healthy.
    """
    nodes = ["etcd-01", "etcd-02", "etcd-03"]
    total = len(nodes)
    start_t = time.time()

    healthy_count = 0
    leader = None
    errors = []

    for node in nodes:
        # 1. TCP Check on 2379
        tcp_ok, _, _ = tcp_check(node, 2379)
        if not tcp_ok:
            errors.append(f"{node} port 2379 unreachable")
            continue

        # 2. HTTP Health check
        http_ok, resp, _, _ = http_check(f"http://{node}:2379/health", timeout=3)
        if http_ok and resp is not None:
            try:
                data = resp.json()
            except ValueError:
                # A non-JSON body counts as unhealthy, not a crashed check.
                data = None
            if isinstance(data, dict) and data.get("health") == "true":
                healthy_count += 1
            else:
                errors.append(f"{node} unhealthy")
        else:
            errors.append(f"{node} health endpoint unreachable")

        # 3. Leader check from /v3/maintenance/status (informational only)
        if not leader:
            status_url = f"http://{node}:2379/v3/maintenance/status"
            try:
                r = requests.post(status_url, json={}, timeout=3)
                if r.status_code == 200:
                    status_data = r.json()
                    leader_id = status_data.get("leader")
                    header_member_id = status_data.get("header", {}).get("member_id")
                    # The node is the leader when it names itself as leader.
                    if leader_id and leader_id == header_member_id:
                        leader = node
            except Exception as e:
                logger.debug(f"etcd leader probe failed on {node}: {e}")

    ping_ms = int((time.time() - start_t) * 1000)

    if healthy_count == total:
        leader_info = f" | leader: {leader}" if leader else ""
        msg = f"{healthy_count}/{total} healthy{leader_info}"
        push("ETCD-CLUSTER", "up", msg, ping_ms)
    else:
        # Majority needed for quorum, derived from the node list.
        quorum = total // 2 + 1
        quorum_state = "quorum LOST" if healthy_count < quorum else "quorum at risk"
        msg = " | ".join(errors) + f" | {quorum_state} ({healthy_count}/{total})"
        push("ETCD-CLUSTER", "down", msg, ping_ms)
def check_swag_tls():
    """Check the wildcard certificate file and external HTTPS, push SWAG-TLS.

    Two independent probes are combined into one status:
      1. The certificate at ``<STORAGEBOX_PATH>/ssl/STAR.iklim.co.full.crt``
         must exist, parse, and be valid for at least 14 more days.
      2. ``https://api<suffix>.<domain>/actuator/health`` must answer with a
         status below 500.
    """
    start_t = time.time()
    # Same env override and default as the storagebox mount check, so both
    # checks look at the same mount point.
    storagebox_path = os.getenv("STORAGEBOX_PATH", "/mnt/storagebox")
    cert_path = os.path.join(storagebox_path, "ssl", "STAR.iklim.co.full.crt")
    domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
    suffix = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
    target_url = f"https://api{suffix}.{domain}/actuator/health"

    msg_parts = []
    is_down = False

    # 1. Check cert file
    if not os.path.exists(cert_path):
        is_down = True
        msg_parts.append("cert file missing on storagebox")
    else:
        try:
            with open(cert_path, "rb") as f:
                cert = x509.load_pem_x509_certificate(f.read(), default_backend())
            not_valid_after = cert.not_valid_after_utc
            days_left = (not_valid_after - datetime.now(timezone.utc)).days
            expiry = not_valid_after.strftime('%Y-%m-%d')

            if days_left < 0:
                # Report expiry explicitly instead of "expires in -N days".
                is_down = True
                msg_parts.append(f"cert EXPIRED on {expiry}")
            elif days_left < 14:
                is_down = True
                msg_parts.append(f"cert expires in {days_left} days")
            else:
                msg_parts.append(f"cert valid until {expiry} ({days_left} days)")
        except Exception as e:
            is_down = True
            msg_parts.append(f"cert parse error: {e}")

    # 2. Check external HTTPS reachable
    # NOTE(review): verify=False means an invalid or expired certificate
    # served by the endpoint is NOT detected here; only reachability is
    # tested. Confirm whether verification should be enabled.
    try:
        r = requests.get(target_url, timeout=5, verify=False)
        if r.status_code < 500:
            msg_parts.append("HTTPS reachable")
        else:
            is_down = True
            msg_parts.append(f"HTTPS returned {r.status_code}")
    except Exception as e:
        is_down = True
        msg_parts.append(f"HTTPS unreachable: {e}")

    ping_ms = int((time.time() - start_t) * 1000)
    msg = " | ".join(msg_parts)

    push("SWAG-TLS", "down" if is_down else "up", msg, ping_ms)
def parse_and_notify(event):
    """Send a Slack notification for a container "die" event.

    Events whose ``exitCode`` attribute is "0" (or missing) are ignored.
    Exit code "137" is flagged as a probable OOM kill / SIGKILL and raised
    to High priority; every other non-zero code is Medium.
    """
    attributes = event.get('Actor', {}).get('Attributes', {})
    container_name = attributes.get('name', 'unknown')
    exit_code = attributes.get('exitCode', '0')

    # Clean exits are not crashes.
    if exit_code == '0':
        return

    killed = exit_code == '137'

    env_tag = os.getenv("ENV", "test").upper()

    detail_lines = [f"Container: {container_name}", f"Exit Code: {exit_code}"]
    if killed:
        detail_lines.append("Reason: OOM Killed (Out Of Memory) or SIGKILL")

    notify(
        webhook_env=f"SLACK_WEBHOOK_IKLIM_{env_tag}_OPS",
        source="health-agent-events",
        priority="High" if killed else "Medium",
        title=f"[Health Agent / Events] Container Crashed ({container_name})",
        detail="\n".join(detail_lines),
    )
class JSONFormatter(logging.Formatter):
    """Render each log record as a single-line JSON object.

    The object always carries ``time``, ``level``, ``logger`` and ``msg``.
    The optional record attributes ``check``, ``status``, ``ping_ms``,
    ``source`` and ``error`` are copied when present, and ``exc_info`` holds
    the formatted traceback when the record has exception info.
    """

    def format(self, record):
        payload = {
            "time": self.formatTime(record, self.datefmt),
            "level": record.levelname,
            "logger": record.name,
            "msg": record.getMessage(),
        }

        # Optional structured fields, kept in a fixed order.
        optional_fields = ('check', 'status', 'ping_ms', 'source', 'error')
        payload.update(
            {field: getattr(record, field) for field in optional_fields if hasattr(record, field)}
        )

        if record.exc_info:
            payload['exc_info'] = self.formatException(record.exc_info)

        return json.dumps(payload)
def notify(webhook_env: str, source: str, priority: str, title: str, detail: str) -> None:
    """Post a formatted message to the Slack webhook named by ``webhook_env``.

    This is best-effort: a missing webhook URL is logged as a warning and any
    failure while sending is logged as an error; neither case raises.

    Args:
        webhook_env: Name of the environment variable holding the webhook URL.
        source: Text shown on the "Source" line of the message.
        priority: Text shown on the "Priority" line of the message.
        title: Heading of the message, rendered in bold.
        detail: Body text, rendered inside a code block.
    """
    webhook_url = os.getenv(webhook_env)
    if not webhook_url:
        logger.warning(f"Slack webhook url not found for {webhook_env}")
        return

    payload = {
        "text": f"*{title}*\n*Source:* {source}\n*Priority:* {priority}\n```\n{detail}\n```"
    }

    try:
        response = requests.post(webhook_url, json=payload, timeout=5)
        response.raise_for_status()
    except Exception as exc:
        # The webhook URL is itself the credential, and the text of requests'
        # exceptions embeds the request URL, so log only the exception type
        # and the HTTP status (when a response was received).
        status_code = getattr(getattr(exc, "response", None), "status_code", None)
        logger.error(
            f"Failed to send Slack notification to {webhook_env}: "
            f"{type(exc).__name__} (HTTP status: {status_code})"
        )
    else:
        logger.info(f"Sent Slack notification to {webhook_env}")
def push(monitor_name: str, status: str, msg: str, ping_ms: int) -> None:
    """Report a check result to the push endpoint of monitor ``monitor_name``.

    The monitor's token is looked up in ``UK_TOKENS``; if none is found a
    warning is logged and nothing is sent. HTTP failures are logged as
    errors and are not raised to the caller.

    Args:
        monitor_name: Key into ``UK_TOKENS`` identifying the monitor.
        status: Value sent as the ``status`` query parameter.
        msg: Value sent as the ``msg`` query parameter.
        ping_ms: Value sent, converted to ``int``, as the ``ping`` parameter.
    """
    token = UK_TOKENS.get(monitor_name)
    if not token:
        logger.warning(f"No token found for monitor {monitor_name}")
        return

    url = f"{UK_PUSH_URL_BASE}/{token}"
    params = {
        "status": status,
        "msg": msg,
        "ping": int(ping_ms),
    }

    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
    except Exception as exc:
        # The token is part of the request URL, which requests includes in
        # its exception text; strip it before the text reaches the logs.
        error_text = str(exc).replace(str(token), "<redacted>")
        logger.error(
            f"Failed to push {monitor_name}: {error_text}",
            extra={
                "check": monitor_name,
                "status": "push_failed",
                "error": error_text,
                "source": "uptime_kuma",
            },
        )
        return

    # Logged outside the try block so a logging problem can never be reported
    # as a failed push. The extra key must not be "msg": logging refuses
    # extra keys that clash with LogRecord attributes and raises KeyError.
    logger.info(
        f"Pushed {monitor_name} status={status}",
        extra={
            "check": monitor_name,
            "status": status,
            "push_msg": msg,
            "ping_ms": ping_ms,
            "source": "uptime_kuma",
        },
    )