From 3c2e872bf4baf7a8152a6bcb53cb014b184f4261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 20:52:35 +0300 Subject: [PATCH] refactor(health-agent): rename monitor keys to Title Case With Space Update all hardcoded push monitor names in check files to match the new Title Case With Space format in monitors.yml. The uk_tokens.yml keys are derived from monitor names so the push() calls must match exactly. --- .../src/health_agent/checks/filesystem.py | 6 +-- health-agent/src/health_agent/checks/http.py | 38 +++++++++---------- .../src/health_agent/checks/mongodb.py | 10 ++--- .../src/health_agent/checks/redis_sentinel.py | 6 +-- health-agent/src/health_agent/checks/swarm.py | 6 +-- health-agent/src/health_agent/checks/tcp.py | 4 +- health-agent/src/health_agent/checks/tls.py | 4 +- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/health-agent/src/health_agent/checks/filesystem.py b/health-agent/src/health_agent/checks/filesystem.py index 8742091..b06fdfd 100644 --- a/health-agent/src/health_agent/checks/filesystem.py +++ b/health-agent/src/health_agent/checks/filesystem.py @@ -18,7 +18,7 @@ def check_storagebox_mount(): if not os.path.exists(storagebox_path): ping_ms = int((time.time() - start_t) * 1000) - push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms) + push("Storagebox Mount", "down", f"{storagebox_path} not found", ping_ms) return for rel_path in expected_files: @@ -30,7 +30,7 @@ def check_storagebox_mount(): if missing_files: msg = f"mount exists but missing: {', '.join(missing_files)}" - push("STORAGEBOX-MOUNT", "down", msg, ping_ms) + push("Storagebox Mount", "down", msg, ping_ms) else: msg = f"{storagebox_path} OK | all critical files present" - push("STORAGEBOX-MOUNT", "up", msg, ping_ms) + push("Storagebox Mount", "up", msg, ping_ms) diff --git a/health-agent/src/health_agent/checks/http.py b/health-agent/src/health_agent/checks/http.py index f7001c0..ea9d96b 100644 --- a/health-agent/src/health_agent/checks/http.py +++ b/health-agent/src/health_agent/checks/http.py @@ -54,7 +54,7 @@ def check_patroni_cluster(): ping_ms = int((time.time() - start_t) * 1000) if not cluster_data: - push("PATRONI-CLUSTER", "down", error_msg, ping_ms) + push("Patroni Cluster", "down", error_msg, ping_ms) return members = cluster_data.get("members", []) @@ -73,7 +73,7 @@ def check_patroni_cluster(): if not leader: down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")] msg = f"no leader detected | " + " ".join(down_nodes) - push("PATRONI-CLUSTER", "down", msg, ping_ms) + push("Patroni Cluster", "down", msg, ping_ms) else: lag_strs = [] for name, lag, state in replicas: @@ -81,7 +81,7 @@ def check_patroni_cluster(): lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)") msg = f"leader: {leader} | replicas: " + " ".join(lag_strs) - push("PATRONI-CLUSTER", "up", msg, ping_ms) + push("Patroni Cluster", "up", msg, ping_ms) def check_rabbitmq_cluster(): url = "http://rabbitmq:15672/api/healthchecks/node" @@ -104,14 +104,14 @@ def check_rabbitmq_cluster(): alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")] if alarms: msg = f"disk/mem alarm active on {','.join(alarms)}" - push("RABBITMQ-CLUSTER", "down", msg, ping_ms) + push("Rabbitmq Cluster", "down", msg, ping_ms) return msg = f"{nodes_running}/{total_nodes} nodes running" - push("RABBITMQ-CLUSTER", "up", msg, ping_ms) + push("Rabbitmq Cluster", "up", msg, ping_ms) else: msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}" - push("RABBITMQ-CLUSTER", "down", msg, ping_ms) + push("Rabbitmq Cluster", "down", msg, ping_ms) def check_apisix(): url = "http://apisix:9180/apisix/admin/routes" @@ -120,9 +120,9 @@ def check_apisix(): ok, resp, ping_ms, err = http_check(url, headers=headers) if ok: - push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms) + push("Apisix Gateway", "up", "admin API reachable", ping_ms) else: - push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms) + push("Apisix Gateway", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms) def check_vault(): hosts_env = os.getenv("VAULT_HOSTS", "vault") @@ -152,18 +152,18 @@ def check_vault(): if unsealed_count == total: msg = f"{unsealed_count}/{total} unsealed" - push("VAULT-CLUSTER", "up", msg, ping_ms) + push("Vault Cluster", "up", msg, ping_ms) else: msg = " | ".join(errors) if errors else "Vault checks failed" - push("VAULT-CLUSTER", "down", msg, ping_ms) + push("Vault Cluster", "down", msg, ping_ms) def check_prometheus(): url = "http://prometheus:9090/-/healthy" ok, resp, ping_ms, err = http_check(url) if ok: - push("PROMETHEUS", "up", "healthy", ping_ms) + push("Prometheus", "up", "healthy", ping_ms) else: - push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms) + push("Prometheus", "down", f"prometheus unreachable: {err}", ping_ms) def check_grafana(): url = "http://grafana:3000/api/health" @@ -172,27 +172,27 @@ def check_grafana(): data = resp.json() db_status = data.get("database", "unknown") if db_status == "ok": - push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms) + push("Grafana", "up", f"ok | db: {db_status}", ping_ms) else: - push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms) + push("Grafana", "down", f"db not ok: {db_status}", ping_ms) else: - push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms) + push("Grafana", "down", f"grafana unreachable: {err}", ping_ms) def check_portainer(): url = "http://portainer:9000/api/system/status" ok, resp, ping_ms, err = http_check(url) if ok: - push("PORTAINER", "up", "running", ping_ms) + push("Portainer", "up", "running", ping_ms) else: - push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms) + push("Portainer", "down", f"portainer unreachable: {err}", ping_ms) def check_loki(): url = "http://loki:3100/ready" ok, resp, ping_ms, err = http_check(url) if ok: - push("LOKI", "up", "ready", ping_ms) + push("Loki", "up", "ready", ping_ms) else: - push("LOKI", "down", f"loki unreachable: {err}", ping_ms) + push("Loki", "down", f"loki unreachable: {err}", ping_ms) def run_all_http_checks(): check_patroni_cluster() diff --git a/health-agent/src/health_agent/checks/mongodb.py b/health-agent/src/health_agent/checks/mongodb.py index 593cef6..9d80363 100644 --- a/health-agent/src/health_agent/checks/mongodb.py +++ b/health-agent/src/health_agent/checks/mongodb.py @@ -35,7 +35,7 @@ def check_mongodb(): ping_ms = int((time.time() - start_t) * 1000) if cluster_size == 1: - push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms) + push("Mongodb Replicaset", "up", "standalone mode OK", ping_ms) return if primary: @@ -45,13 +45,13 @@ def check_mongodb(): unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')] if unhealthy_secs: msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}" - push("MONGODB-REPLICASET", "down", msg, ping_ms) + push("Mongodb Replicaset", "down", msg, ping_ms) else: - push("MONGODB-REPLICASET", "up", msg, ping_ms) + push("Mongodb Replicaset", "up", msg, ping_ms) else: msg = "no PRIMARY | quorum lost" - push("MONGODB-REPLICASET", "down", msg, ping_ms) + push("Mongodb Replicaset", "down", msg, ping_ms) except Exception as e: ping_ms = int((time.time() - start_t) * 1000) - push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms) + push("Mongodb Replicaset", "down", f"connection failed: {e}", ping_ms) diff --git a/health-agent/src/health_agent/checks/redis_sentinel.py b/health-agent/src/health_agent/checks/redis_sentinel.py index 0a05ded..1dea54c 100644 --- a/health-agent/src/health_agent/checks/redis_sentinel.py +++ b/health-agent/src/health_agent/checks/redis_sentinel.py @@ -24,7 +24,7 @@ def check_redis_sentinel(): redis_mode = os.getenv("REDIS_MODE", "sentinel") if redis_mode != "sentinel": - push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000)) + push("Redis Sentinel", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000)) return try: @@ -43,8 +43,8 @@ def check_redis_sentinel(): ping_ms = int((time.time() - start_t) * 1000) msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK" - push("REDIS-SENTINEL", "up", msg, ping_ms) + push("Redis Sentinel", "up", msg, ping_ms) except Exception as e: ping_ms = int((time.time() - start_t) * 1000) - push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms) + push("Redis Sentinel", "down", f"quorum FAIL or master unreachable: {e}", ping_ms) diff --git a/health-agent/src/health_agent/checks/swarm.py b/health-agent/src/health_agent/checks/swarm.py index 2c70b29..a9b64b4 100644 --- a/health-agent/src/health_agent/checks/swarm.py +++ b/health-agent/src/health_agent/checks/swarm.py @@ -38,12 +38,12 @@ def check_swarm_cluster(): if ready_count == total_nodes: msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})" - push("SWARM-CLUSTER", "up", msg, ping_ms) + push("Swarm Cluster", "up", msg, ping_ms) else: msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}" - push("SWARM-CLUSTER", "down", msg, ping_ms) + push("Swarm Cluster", "down", msg, ping_ms) except Exception as e: ping_ms = int((time.time() - start_time) * 1000) logger.error(f"Swarm check failed: {e}") - push("SWARM-CLUSTER", "down", str(e), ping_ms) + push("Swarm Cluster", "down", str(e), ping_ms) diff --git a/health-agent/src/health_agent/checks/tcp.py b/health-agent/src/health_agent/checks/tcp.py index c613049..5b00816 100644 --- a/health-agent/src/health_agent/checks/tcp.py +++ b/health-agent/src/health_agent/checks/tcp.py @@ -70,8 +70,8 @@ def check_etcd_cluster(): if healthy_count == len(nodes): leader_info = f" | leader: {leader}" if leader else "" msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}" - push("ETCD-CLUSTER", "up", msg, ping_ms) + push("Etcd Cluster", "up", msg, ping_ms) else: quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else "" msg = " | ".join(errors) + quorum_msg - push("ETCD-CLUSTER", "down", msg, ping_ms) + push("Etcd Cluster", "down", msg, ping_ms) diff --git a/health-agent/src/health_agent/checks/tls.py b/health-agent/src/health_agent/checks/tls.py index 6b5f691..b7cb1ed 100644 --- a/health-agent/src/health_agent/checks/tls.py +++ b/health-agent/src/health_agent/checks/tls.py @@ -57,6 +57,6 @@ def check_swag_tls(): msg = " | ".join(msg_parts) if is_down: - push("SWAG-TLS", "down", msg, ping_ms) + push("Swag Tls", "down", msg, ping_ms) else: - push("SWAG-TLS", "up", msg, ping_ms) + push("Swag Tls", "up", msg, ping_ms)