refactor(health-agent): rename monitor keys to Title Case With Space
Update all hard-coded push monitor names in the check files to match the new Title Case With Space format in monitors.yml (e.g. "STORAGEBOX-MOUNT" becomes "Storagebox Mount"). The uk_tokens.yml keys are derived from the monitor names, so the names passed to push() must match them exactly.
This commit is contained in:
parent
bc8b3d0934
commit
3c2e872bf4
@ -18,7 +18,7 @@ def check_storagebox_mount():
|
||||
|
||||
if not os.path.exists(storagebox_path):
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms)
|
||||
push("Storagebox Mount", "down", f"{storagebox_path} not found", ping_ms)
|
||||
return
|
||||
|
||||
for rel_path in expected_files:
|
||||
@ -30,7 +30,7 @@ def check_storagebox_mount():
|
||||
|
||||
if missing_files:
|
||||
msg = f"mount exists but missing: {', '.join(missing_files)}"
|
||||
push("STORAGEBOX-MOUNT", "down", msg, ping_ms)
|
||||
push("Storagebox Mount", "down", msg, ping_ms)
|
||||
else:
|
||||
msg = f"{storagebox_path} OK | all critical files present"
|
||||
push("STORAGEBOX-MOUNT", "up", msg, ping_ms)
|
||||
push("Storagebox Mount", "up", msg, ping_ms)
|
||||
|
||||
@ -54,7 +54,7 @@ def check_patroni_cluster():
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
if not cluster_data:
|
||||
push("PATRONI-CLUSTER", "down", error_msg, ping_ms)
|
||||
push("Patroni Cluster", "down", error_msg, ping_ms)
|
||||
return
|
||||
|
||||
members = cluster_data.get("members", [])
|
||||
@ -73,7 +73,7 @@ def check_patroni_cluster():
|
||||
if not leader:
|
||||
down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
|
||||
msg = f"no leader detected | " + " ".join(down_nodes)
|
||||
push("PATRONI-CLUSTER", "down", msg, ping_ms)
|
||||
push("Patroni Cluster", "down", msg, ping_ms)
|
||||
else:
|
||||
lag_strs = []
|
||||
for name, lag, state in replicas:
|
||||
@ -81,7 +81,7 @@ def check_patroni_cluster():
|
||||
lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
|
||||
|
||||
msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
|
||||
push("PATRONI-CLUSTER", "up", msg, ping_ms)
|
||||
push("Patroni Cluster", "up", msg, ping_ms)
|
||||
|
||||
def check_rabbitmq_cluster():
|
||||
url = "http://rabbitmq:15672/api/healthchecks/node"
|
||||
@ -104,14 +104,14 @@ def check_rabbitmq_cluster():
|
||||
alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")]
|
||||
if alarms:
|
||||
msg = f"disk/mem alarm active on {','.join(alarms)}"
|
||||
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
|
||||
push("Rabbitmq Cluster", "down", msg, ping_ms)
|
||||
return
|
||||
|
||||
msg = f"{nodes_running}/{total_nodes} nodes running"
|
||||
push("RABBITMQ-CLUSTER", "up", msg, ping_ms)
|
||||
push("Rabbitmq Cluster", "up", msg, ping_ms)
|
||||
else:
|
||||
msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}"
|
||||
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
|
||||
push("Rabbitmq Cluster", "down", msg, ping_ms)
|
||||
|
||||
def check_apisix():
|
||||
url = "http://apisix:9180/apisix/admin/routes"
|
||||
@ -120,9 +120,9 @@ def check_apisix():
|
||||
ok, resp, ping_ms, err = http_check(url, headers=headers)
|
||||
|
||||
if ok:
|
||||
push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms)
|
||||
push("Apisix Gateway", "up", "admin API reachable", ping_ms)
|
||||
else:
|
||||
push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
|
||||
push("Apisix Gateway", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
|
||||
|
||||
def check_vault():
|
||||
hosts_env = os.getenv("VAULT_HOSTS", "vault")
|
||||
@ -152,18 +152,18 @@ def check_vault():
|
||||
|
||||
if unsealed_count == total:
|
||||
msg = f"{unsealed_count}/{total} unsealed"
|
||||
push("VAULT-CLUSTER", "up", msg, ping_ms)
|
||||
push("Vault Cluster", "up", msg, ping_ms)
|
||||
else:
|
||||
msg = " | ".join(errors) if errors else "Vault checks failed"
|
||||
push("VAULT-CLUSTER", "down", msg, ping_ms)
|
||||
push("Vault Cluster", "down", msg, ping_ms)
|
||||
|
||||
def check_prometheus():
|
||||
url = "http://prometheus:9090/-/healthy"
|
||||
ok, resp, ping_ms, err = http_check(url)
|
||||
if ok:
|
||||
push("PROMETHEUS", "up", "healthy", ping_ms)
|
||||
push("Prometheus", "up", "healthy", ping_ms)
|
||||
else:
|
||||
push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms)
|
||||
push("Prometheus", "down", f"prometheus unreachable: {err}", ping_ms)
|
||||
|
||||
def check_grafana():
|
||||
url = "http://grafana:3000/api/health"
|
||||
@ -172,27 +172,27 @@ def check_grafana():
|
||||
data = resp.json()
|
||||
db_status = data.get("database", "unknown")
|
||||
if db_status == "ok":
|
||||
push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms)
|
||||
push("Grafana", "up", f"ok | db: {db_status}", ping_ms)
|
||||
else:
|
||||
push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms)
|
||||
push("Grafana", "down", f"db not ok: {db_status}", ping_ms)
|
||||
else:
|
||||
push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms)
|
||||
push("Grafana", "down", f"grafana unreachable: {err}", ping_ms)
|
||||
|
||||
def check_portainer():
|
||||
url = "http://portainer:9000/api/system/status"
|
||||
ok, resp, ping_ms, err = http_check(url)
|
||||
if ok:
|
||||
push("PORTAINER", "up", "running", ping_ms)
|
||||
push("Portainer", "up", "running", ping_ms)
|
||||
else:
|
||||
push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms)
|
||||
push("Portainer", "down", f"portainer unreachable: {err}", ping_ms)
|
||||
|
||||
def check_loki():
|
||||
url = "http://loki:3100/ready"
|
||||
ok, resp, ping_ms, err = http_check(url)
|
||||
if ok:
|
||||
push("LOKI", "up", "ready", ping_ms)
|
||||
push("Loki", "up", "ready", ping_ms)
|
||||
else:
|
||||
push("LOKI", "down", f"loki unreachable: {err}", ping_ms)
|
||||
push("Loki", "down", f"loki unreachable: {err}", ping_ms)
|
||||
|
||||
def run_all_http_checks():
|
||||
check_patroni_cluster()
|
||||
|
||||
@ -35,7 +35,7 @@ def check_mongodb():
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
if cluster_size == 1:
|
||||
push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
|
||||
push("Mongodb Replicaset", "up", "standalone mode OK", ping_ms)
|
||||
return
|
||||
|
||||
if primary:
|
||||
@ -45,13 +45,13 @@ def check_mongodb():
|
||||
unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
|
||||
if unhealthy_secs:
|
||||
msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
|
||||
push("MONGODB-REPLICASET", "down", msg, ping_ms)
|
||||
push("Mongodb Replicaset", "down", msg, ping_ms)
|
||||
else:
|
||||
push("MONGODB-REPLICASET", "up", msg, ping_ms)
|
||||
push("Mongodb Replicaset", "up", msg, ping_ms)
|
||||
else:
|
||||
msg = "no PRIMARY | quorum lost"
|
||||
push("MONGODB-REPLICASET", "down", msg, ping_ms)
|
||||
push("Mongodb Replicaset", "down", msg, ping_ms)
|
||||
|
||||
except Exception as e:
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)
|
||||
push("Mongodb Replicaset", "down", f"connection failed: {e}", ping_ms)
|
||||
|
||||
@ -24,7 +24,7 @@ def check_redis_sentinel():
|
||||
redis_mode = os.getenv("REDIS_MODE", "sentinel")
|
||||
|
||||
if redis_mode != "sentinel":
|
||||
push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
|
||||
push("Redis Sentinel", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
|
||||
return
|
||||
|
||||
try:
|
||||
@ -43,8 +43,8 @@ def check_redis_sentinel():
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
|
||||
push("REDIS-SENTINEL", "up", msg, ping_ms)
|
||||
push("Redis Sentinel", "up", msg, ping_ms)
|
||||
|
||||
except Exception as e:
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
|
||||
push("Redis Sentinel", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
|
||||
|
||||
@ -38,12 +38,12 @@ def check_swarm_cluster():
|
||||
|
||||
if ready_count == total_nodes:
|
||||
msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
|
||||
push("SWARM-CLUSTER", "up", msg, ping_ms)
|
||||
push("Swarm Cluster", "up", msg, ping_ms)
|
||||
else:
|
||||
msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
|
||||
push("SWARM-CLUSTER", "down", msg, ping_ms)
|
||||
push("Swarm Cluster", "down", msg, ping_ms)
|
||||
|
||||
except Exception as e:
|
||||
ping_ms = int((time.time() - start_time) * 1000)
|
||||
logger.error(f"Swarm check failed: {e}")
|
||||
push("SWARM-CLUSTER", "down", str(e), ping_ms)
|
||||
push("Swarm Cluster", "down", str(e), ping_ms)
|
||||
|
||||
@ -70,8 +70,8 @@ def check_etcd_cluster():
|
||||
if healthy_count == len(nodes):
|
||||
leader_info = f" | leader: {leader}" if leader else ""
|
||||
msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
|
||||
push("ETCD-CLUSTER", "up", msg, ping_ms)
|
||||
push("Etcd Cluster", "up", msg, ping_ms)
|
||||
else:
|
||||
quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else ""
|
||||
msg = " | ".join(errors) + quorum_msg
|
||||
push("ETCD-CLUSTER", "down", msg, ping_ms)
|
||||
push("Etcd Cluster", "down", msg, ping_ms)
|
||||
|
||||
@ -57,6 +57,6 @@ def check_swag_tls():
|
||||
msg = " | ".join(msg_parts)
|
||||
|
||||
if is_down:
|
||||
push("SWAG-TLS", "down", msg, ping_ms)
|
||||
push("Swag Tls", "down", msg, ping_ms)
|
||||
else:
|
||||
push("SWAG-TLS", "up", msg, ping_ms)
|
||||
push("Swag Tls", "up", msg, ping_ms)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user