refactor(health-agent): rename monitor keys to Title Case With Space

Update all hard-coded push monitor names in the check files to match the new Title Case With Space format in monitors.yml. The uk_tokens.yml keys are derived from the monitor names, so the names passed to push() must match them exactly.
This commit is contained in:
Murat ÖZDEMİR 2026-06-26 20:52:35 +03:00
parent bc8b3d0934
commit 3c2e872bf4
7 changed files with 37 additions and 37 deletions

View File

@ -18,7 +18,7 @@ def check_storagebox_mount():
if not os.path.exists(storagebox_path):
ping_ms = int((time.time() - start_t) * 1000)
push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms)
push("Storagebox Mount", "down", f"{storagebox_path} not found", ping_ms)
return
for rel_path in expected_files:
@ -30,7 +30,7 @@ def check_storagebox_mount():
if missing_files:
msg = f"mount exists but missing: {', '.join(missing_files)}"
push("STORAGEBOX-MOUNT", "down", msg, ping_ms)
push("Storagebox Mount", "down", msg, ping_ms)
else:
msg = f"{storagebox_path} OK | all critical files present"
push("STORAGEBOX-MOUNT", "up", msg, ping_ms)
push("Storagebox Mount", "up", msg, ping_ms)

View File

@ -54,7 +54,7 @@ def check_patroni_cluster():
ping_ms = int((time.time() - start_t) * 1000)
if not cluster_data:
push("PATRONI-CLUSTER", "down", error_msg, ping_ms)
push("Patroni Cluster", "down", error_msg, ping_ms)
return
members = cluster_data.get("members", [])
@ -73,7 +73,7 @@ def check_patroni_cluster():
if not leader:
down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
msg = f"no leader detected | " + " ".join(down_nodes)
push("PATRONI-CLUSTER", "down", msg, ping_ms)
push("Patroni Cluster", "down", msg, ping_ms)
else:
lag_strs = []
for name, lag, state in replicas:
@ -81,7 +81,7 @@ def check_patroni_cluster():
lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
push("PATRONI-CLUSTER", "up", msg, ping_ms)
push("Patroni Cluster", "up", msg, ping_ms)
def check_rabbitmq_cluster():
url = "http://rabbitmq:15672/api/healthchecks/node"
@ -104,14 +104,14 @@ def check_rabbitmq_cluster():
alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")]
if alarms:
msg = f"disk/mem alarm active on {','.join(alarms)}"
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
push("Rabbitmq Cluster", "down", msg, ping_ms)
return
msg = f"{nodes_running}/{total_nodes} nodes running"
push("RABBITMQ-CLUSTER", "up", msg, ping_ms)
push("Rabbitmq Cluster", "up", msg, ping_ms)
else:
msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}"
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
push("Rabbitmq Cluster", "down", msg, ping_ms)
def check_apisix():
url = "http://apisix:9180/apisix/admin/routes"
@ -120,9 +120,9 @@ def check_apisix():
ok, resp, ping_ms, err = http_check(url, headers=headers)
if ok:
push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms)
push("Apisix Gateway", "up", "admin API reachable", ping_ms)
else:
push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
push("Apisix Gateway", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
def check_vault():
hosts_env = os.getenv("VAULT_HOSTS", "vault")
@ -152,18 +152,18 @@ def check_vault():
if unsealed_count == total:
msg = f"{unsealed_count}/{total} unsealed"
push("VAULT-CLUSTER", "up", msg, ping_ms)
push("Vault Cluster", "up", msg, ping_ms)
else:
msg = " | ".join(errors) if errors else "Vault checks failed"
push("VAULT-CLUSTER", "down", msg, ping_ms)
push("Vault Cluster", "down", msg, ping_ms)
def check_prometheus():
url = "http://prometheus:9090/-/healthy"
ok, resp, ping_ms, err = http_check(url)
if ok:
push("PROMETHEUS", "up", "healthy", ping_ms)
push("Prometheus", "up", "healthy", ping_ms)
else:
push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms)
push("Prometheus", "down", f"prometheus unreachable: {err}", ping_ms)
def check_grafana():
url = "http://grafana:3000/api/health"
@ -172,27 +172,27 @@ def check_grafana():
data = resp.json()
db_status = data.get("database", "unknown")
if db_status == "ok":
push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms)
push("Grafana", "up", f"ok | db: {db_status}", ping_ms)
else:
push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms)
push("Grafana", "down", f"db not ok: {db_status}", ping_ms)
else:
push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms)
push("Grafana", "down", f"grafana unreachable: {err}", ping_ms)
def check_portainer():
url = "http://portainer:9000/api/system/status"
ok, resp, ping_ms, err = http_check(url)
if ok:
push("PORTAINER", "up", "running", ping_ms)
push("Portainer", "up", "running", ping_ms)
else:
push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms)
push("Portainer", "down", f"portainer unreachable: {err}", ping_ms)
def check_loki():
url = "http://loki:3100/ready"
ok, resp, ping_ms, err = http_check(url)
if ok:
push("LOKI", "up", "ready", ping_ms)
push("Loki", "up", "ready", ping_ms)
else:
push("LOKI", "down", f"loki unreachable: {err}", ping_ms)
push("Loki", "down", f"loki unreachable: {err}", ping_ms)
def run_all_http_checks():
check_patroni_cluster()

View File

@ -35,7 +35,7 @@ def check_mongodb():
ping_ms = int((time.time() - start_t) * 1000)
if cluster_size == 1:
push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
push("Mongodb Replicaset", "up", "standalone mode OK", ping_ms)
return
if primary:
@ -45,13 +45,13 @@ def check_mongodb():
unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
if unhealthy_secs:
msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
push("MONGODB-REPLICASET", "down", msg, ping_ms)
push("Mongodb Replicaset", "down", msg, ping_ms)
else:
push("MONGODB-REPLICASET", "up", msg, ping_ms)
push("Mongodb Replicaset", "up", msg, ping_ms)
else:
msg = "no PRIMARY | quorum lost"
push("MONGODB-REPLICASET", "down", msg, ping_ms)
push("Mongodb Replicaset", "down", msg, ping_ms)
except Exception as e:
ping_ms = int((time.time() - start_t) * 1000)
push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)
push("Mongodb Replicaset", "down", f"connection failed: {e}", ping_ms)

View File

@ -24,7 +24,7 @@ def check_redis_sentinel():
redis_mode = os.getenv("REDIS_MODE", "sentinel")
if redis_mode != "sentinel":
push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
push("Redis Sentinel", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
return
try:
@ -43,8 +43,8 @@ def check_redis_sentinel():
ping_ms = int((time.time() - start_t) * 1000)
msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
push("REDIS-SENTINEL", "up", msg, ping_ms)
push("Redis Sentinel", "up", msg, ping_ms)
except Exception as e:
ping_ms = int((time.time() - start_t) * 1000)
push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
push("Redis Sentinel", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)

View File

@ -38,12 +38,12 @@ def check_swarm_cluster():
if ready_count == total_nodes:
msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
push("SWARM-CLUSTER", "up", msg, ping_ms)
push("Swarm Cluster", "up", msg, ping_ms)
else:
msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
push("SWARM-CLUSTER", "down", msg, ping_ms)
push("Swarm Cluster", "down", msg, ping_ms)
except Exception as e:
ping_ms = int((time.time() - start_time) * 1000)
logger.error(f"Swarm check failed: {e}")
push("SWARM-CLUSTER", "down", str(e), ping_ms)
push("Swarm Cluster", "down", str(e), ping_ms)

View File

@ -70,8 +70,8 @@ def check_etcd_cluster():
if healthy_count == len(nodes):
leader_info = f" | leader: {leader}" if leader else ""
msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
push("ETCD-CLUSTER", "up", msg, ping_ms)
push("Etcd Cluster", "up", msg, ping_ms)
else:
quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else ""
msg = " | ".join(errors) + quorum_msg
push("ETCD-CLUSTER", "down", msg, ping_ms)
push("Etcd Cluster", "down", msg, ping_ms)

View File

@ -57,6 +57,6 @@ def check_swag_tls():
msg = " | ".join(msg_parts)
if is_down:
push("SWAG-TLS", "down", msg, ping_ms)
push("Swag Tls", "down", msg, ping_ms)
else:
push("SWAG-TLS", "up", msg, ping_ms)
push("Swag Tls", "up", msg, ping_ms)