refactor(health-agent): rename monitor keys to Title Case With Space

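Update all hardcoded push monitor names in the check files to match the new Title Case With Space format in monitors.yml (for example, "STORAGEBOX-MOUNT" becomes "Storagebox Mount"). The uk_tokens.yml keys are derived from the monitor names, so the names passed to push() must match them exactly, including the capitalization of names such as "Rabbitmq Cluster" and "Swag Tls".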
This commit is contained in:
Murat ÖZDEMİR 2026-06-26 20:52:35 +03:00
parent bc8b3d0934
commit 3c2e872bf4
7 changed files with 37 additions and 37 deletions

View File

@ -18,7 +18,7 @@ def check_storagebox_mount():
if not os.path.exists(storagebox_path): if not os.path.exists(storagebox_path):
ping_ms = int((time.time() - start_t) * 1000) ping_ms = int((time.time() - start_t) * 1000)
push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms) push("Storagebox Mount", "down", f"{storagebox_path} not found", ping_ms)
return return
for rel_path in expected_files: for rel_path in expected_files:
@ -30,7 +30,7 @@ def check_storagebox_mount():
if missing_files: if missing_files:
msg = f"mount exists but missing: {', '.join(missing_files)}" msg = f"mount exists but missing: {', '.join(missing_files)}"
push("STORAGEBOX-MOUNT", "down", msg, ping_ms) push("Storagebox Mount", "down", msg, ping_ms)
else: else:
msg = f"{storagebox_path} OK | all critical files present" msg = f"{storagebox_path} OK | all critical files present"
push("STORAGEBOX-MOUNT", "up", msg, ping_ms) push("Storagebox Mount", "up", msg, ping_ms)

View File

@ -54,7 +54,7 @@ def check_patroni_cluster():
ping_ms = int((time.time() - start_t) * 1000) ping_ms = int((time.time() - start_t) * 1000)
if not cluster_data: if not cluster_data:
push("PATRONI-CLUSTER", "down", error_msg, ping_ms) push("Patroni Cluster", "down", error_msg, ping_ms)
return return
members = cluster_data.get("members", []) members = cluster_data.get("members", [])
@ -73,7 +73,7 @@ def check_patroni_cluster():
if not leader: if not leader:
down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")] down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
msg = f"no leader detected | " + " ".join(down_nodes) msg = f"no leader detected | " + " ".join(down_nodes)
push("PATRONI-CLUSTER", "down", msg, ping_ms) push("Patroni Cluster", "down", msg, ping_ms)
else: else:
lag_strs = [] lag_strs = []
for name, lag, state in replicas: for name, lag, state in replicas:
@ -81,7 +81,7 @@ def check_patroni_cluster():
lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)") lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
msg = f"leader: {leader} | replicas: " + " ".join(lag_strs) msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
push("PATRONI-CLUSTER", "up", msg, ping_ms) push("Patroni Cluster", "up", msg, ping_ms)
def check_rabbitmq_cluster(): def check_rabbitmq_cluster():
url = "http://rabbitmq:15672/api/healthchecks/node" url = "http://rabbitmq:15672/api/healthchecks/node"
@ -104,14 +104,14 @@ def check_rabbitmq_cluster():
alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")] alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")]
if alarms: if alarms:
msg = f"disk/mem alarm active on {','.join(alarms)}" msg = f"disk/mem alarm active on {','.join(alarms)}"
push("RABBITMQ-CLUSTER", "down", msg, ping_ms) push("Rabbitmq Cluster", "down", msg, ping_ms)
return return
msg = f"{nodes_running}/{total_nodes} nodes running" msg = f"{nodes_running}/{total_nodes} nodes running"
push("RABBITMQ-CLUSTER", "up", msg, ping_ms) push("Rabbitmq Cluster", "up", msg, ping_ms)
else: else:
msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}" msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}"
push("RABBITMQ-CLUSTER", "down", msg, ping_ms) push("Rabbitmq Cluster", "down", msg, ping_ms)
def check_apisix(): def check_apisix():
url = "http://apisix:9180/apisix/admin/routes" url = "http://apisix:9180/apisix/admin/routes"
@ -120,9 +120,9 @@ def check_apisix():
ok, resp, ping_ms, err = http_check(url, headers=headers) ok, resp, ping_ms, err = http_check(url, headers=headers)
if ok: if ok:
push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms) push("Apisix Gateway", "up", "admin API reachable", ping_ms)
else: else:
push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms) push("Apisix Gateway", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
def check_vault(): def check_vault():
hosts_env = os.getenv("VAULT_HOSTS", "vault") hosts_env = os.getenv("VAULT_HOSTS", "vault")
@ -152,18 +152,18 @@ def check_vault():
if unsealed_count == total: if unsealed_count == total:
msg = f"{unsealed_count}/{total} unsealed" msg = f"{unsealed_count}/{total} unsealed"
push("VAULT-CLUSTER", "up", msg, ping_ms) push("Vault Cluster", "up", msg, ping_ms)
else: else:
msg = " | ".join(errors) if errors else "Vault checks failed" msg = " | ".join(errors) if errors else "Vault checks failed"
push("VAULT-CLUSTER", "down", msg, ping_ms) push("Vault Cluster", "down", msg, ping_ms)
def check_prometheus(): def check_prometheus():
url = "http://prometheus:9090/-/healthy" url = "http://prometheus:9090/-/healthy"
ok, resp, ping_ms, err = http_check(url) ok, resp, ping_ms, err = http_check(url)
if ok: if ok:
push("PROMETHEUS", "up", "healthy", ping_ms) push("Prometheus", "up", "healthy", ping_ms)
else: else:
push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms) push("Prometheus", "down", f"prometheus unreachable: {err}", ping_ms)
def check_grafana(): def check_grafana():
url = "http://grafana:3000/api/health" url = "http://grafana:3000/api/health"
@ -172,27 +172,27 @@ def check_grafana():
data = resp.json() data = resp.json()
db_status = data.get("database", "unknown") db_status = data.get("database", "unknown")
if db_status == "ok": if db_status == "ok":
push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms) push("Grafana", "up", f"ok | db: {db_status}", ping_ms)
else: else:
push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms) push("Grafana", "down", f"db not ok: {db_status}", ping_ms)
else: else:
push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms) push("Grafana", "down", f"grafana unreachable: {err}", ping_ms)
def check_portainer(): def check_portainer():
url = "http://portainer:9000/api/system/status" url = "http://portainer:9000/api/system/status"
ok, resp, ping_ms, err = http_check(url) ok, resp, ping_ms, err = http_check(url)
if ok: if ok:
push("PORTAINER", "up", "running", ping_ms) push("Portainer", "up", "running", ping_ms)
else: else:
push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms) push("Portainer", "down", f"portainer unreachable: {err}", ping_ms)
def check_loki(): def check_loki():
url = "http://loki:3100/ready" url = "http://loki:3100/ready"
ok, resp, ping_ms, err = http_check(url) ok, resp, ping_ms, err = http_check(url)
if ok: if ok:
push("LOKI", "up", "ready", ping_ms) push("Loki", "up", "ready", ping_ms)
else: else:
push("LOKI", "down", f"loki unreachable: {err}", ping_ms) push("Loki", "down", f"loki unreachable: {err}", ping_ms)
def run_all_http_checks(): def run_all_http_checks():
check_patroni_cluster() check_patroni_cluster()

View File

@ -35,7 +35,7 @@ def check_mongodb():
ping_ms = int((time.time() - start_t) * 1000) ping_ms = int((time.time() - start_t) * 1000)
if cluster_size == 1: if cluster_size == 1:
push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms) push("Mongodb Replicaset", "up", "standalone mode OK", ping_ms)
return return
if primary: if primary:
@ -45,13 +45,13 @@ def check_mongodb():
unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')] unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
if unhealthy_secs: if unhealthy_secs:
msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}" msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
push("MONGODB-REPLICASET", "down", msg, ping_ms) push("Mongodb Replicaset", "down", msg, ping_ms)
else: else:
push("MONGODB-REPLICASET", "up", msg, ping_ms) push("Mongodb Replicaset", "up", msg, ping_ms)
else: else:
msg = "no PRIMARY | quorum lost" msg = "no PRIMARY | quorum lost"
push("MONGODB-REPLICASET", "down", msg, ping_ms) push("Mongodb Replicaset", "down", msg, ping_ms)
except Exception as e: except Exception as e:
ping_ms = int((time.time() - start_t) * 1000) ping_ms = int((time.time() - start_t) * 1000)
push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms) push("Mongodb Replicaset", "down", f"connection failed: {e}", ping_ms)

View File

@ -24,7 +24,7 @@ def check_redis_sentinel():
redis_mode = os.getenv("REDIS_MODE", "sentinel") redis_mode = os.getenv("REDIS_MODE", "sentinel")
if redis_mode != "sentinel": if redis_mode != "sentinel":
push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000)) push("Redis Sentinel", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
return return
try: try:
@ -43,8 +43,8 @@ def check_redis_sentinel():
ping_ms = int((time.time() - start_t) * 1000) ping_ms = int((time.time() - start_t) * 1000)
msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK" msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
push("REDIS-SENTINEL", "up", msg, ping_ms) push("Redis Sentinel", "up", msg, ping_ms)
except Exception as e: except Exception as e:
ping_ms = int((time.time() - start_t) * 1000) ping_ms = int((time.time() - start_t) * 1000)
push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms) push("Redis Sentinel", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)

View File

@ -38,12 +38,12 @@ def check_swarm_cluster():
if ready_count == total_nodes: if ready_count == total_nodes:
msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})" msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
push("SWARM-CLUSTER", "up", msg, ping_ms) push("Swarm Cluster", "up", msg, ping_ms)
else: else:
msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}" msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
push("SWARM-CLUSTER", "down", msg, ping_ms) push("Swarm Cluster", "down", msg, ping_ms)
except Exception as e: except Exception as e:
ping_ms = int((time.time() - start_time) * 1000) ping_ms = int((time.time() - start_time) * 1000)
logger.error(f"Swarm check failed: {e}") logger.error(f"Swarm check failed: {e}")
push("SWARM-CLUSTER", "down", str(e), ping_ms) push("Swarm Cluster", "down", str(e), ping_ms)

View File

@ -70,8 +70,8 @@ def check_etcd_cluster():
if healthy_count == len(nodes): if healthy_count == len(nodes):
leader_info = f" | leader: {leader}" if leader else "" leader_info = f" | leader: {leader}" if leader else ""
msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}" msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
push("ETCD-CLUSTER", "up", msg, ping_ms) push("Etcd Cluster", "up", msg, ping_ms)
else: else:
quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else "" quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else ""
msg = " | ".join(errors) + quorum_msg msg = " | ".join(errors) + quorum_msg
push("ETCD-CLUSTER", "down", msg, ping_ms) push("Etcd Cluster", "down", msg, ping_ms)

View File

@ -57,6 +57,6 @@ def check_swag_tls():
msg = " | ".join(msg_parts) msg = " | ".join(msg_parts)
if is_down: if is_down:
push("SWAG-TLS", "down", msg, ping_ms) push("Swag Tls", "down", msg, ping_ms)
else: else:
push("SWAG-TLS", "up", msg, ping_ms) push("Swag Tls", "up", msg, ping_ms)