refactor(health-agent): rename monitor keys to Title Case With Space
Update all hardcoded push monitor names in the check files to match the new Title Case With Space format in monitors.yml. The uk_tokens.yml keys are derived from the monitor names, so the names passed to push() must match them exactly.
This commit is contained in:
parent
bc8b3d0934
commit
3c2e872bf4
@ -18,7 +18,7 @@ def check_storagebox_mount():
|
|||||||
|
|
||||||
if not os.path.exists(storagebox_path):
|
if not os.path.exists(storagebox_path):
|
||||||
ping_ms = int((time.time() - start_t) * 1000)
|
ping_ms = int((time.time() - start_t) * 1000)
|
||||||
push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms)
|
push("Storagebox Mount", "down", f"{storagebox_path} not found", ping_ms)
|
||||||
return
|
return
|
||||||
|
|
||||||
for rel_path in expected_files:
|
for rel_path in expected_files:
|
||||||
@ -30,7 +30,7 @@ def check_storagebox_mount():
|
|||||||
|
|
||||||
if missing_files:
|
if missing_files:
|
||||||
msg = f"mount exists but missing: {', '.join(missing_files)}"
|
msg = f"mount exists but missing: {', '.join(missing_files)}"
|
||||||
push("STORAGEBOX-MOUNT", "down", msg, ping_ms)
|
push("Storagebox Mount", "down", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
msg = f"{storagebox_path} OK | all critical files present"
|
msg = f"{storagebox_path} OK | all critical files present"
|
||||||
push("STORAGEBOX-MOUNT", "up", msg, ping_ms)
|
push("Storagebox Mount", "up", msg, ping_ms)
|
||||||
|
|||||||
@ -54,7 +54,7 @@ def check_patroni_cluster():
|
|||||||
ping_ms = int((time.time() - start_t) * 1000)
|
ping_ms = int((time.time() - start_t) * 1000)
|
||||||
|
|
||||||
if not cluster_data:
|
if not cluster_data:
|
||||||
push("PATRONI-CLUSTER", "down", error_msg, ping_ms)
|
push("Patroni Cluster", "down", error_msg, ping_ms)
|
||||||
return
|
return
|
||||||
|
|
||||||
members = cluster_data.get("members", [])
|
members = cluster_data.get("members", [])
|
||||||
@ -73,7 +73,7 @@ def check_patroni_cluster():
|
|||||||
if not leader:
|
if not leader:
|
||||||
down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
|
down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
|
||||||
msg = f"no leader detected | " + " ".join(down_nodes)
|
msg = f"no leader detected | " + " ".join(down_nodes)
|
||||||
push("PATRONI-CLUSTER", "down", msg, ping_ms)
|
push("Patroni Cluster", "down", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
lag_strs = []
|
lag_strs = []
|
||||||
for name, lag, state in replicas:
|
for name, lag, state in replicas:
|
||||||
@ -81,7 +81,7 @@ def check_patroni_cluster():
|
|||||||
lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
|
lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
|
||||||
|
|
||||||
msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
|
msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
|
||||||
push("PATRONI-CLUSTER", "up", msg, ping_ms)
|
push("Patroni Cluster", "up", msg, ping_ms)
|
||||||
|
|
||||||
def check_rabbitmq_cluster():
|
def check_rabbitmq_cluster():
|
||||||
url = "http://rabbitmq:15672/api/healthchecks/node"
|
url = "http://rabbitmq:15672/api/healthchecks/node"
|
||||||
@ -104,14 +104,14 @@ def check_rabbitmq_cluster():
|
|||||||
alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")]
|
alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")]
|
||||||
if alarms:
|
if alarms:
|
||||||
msg = f"disk/mem alarm active on {','.join(alarms)}"
|
msg = f"disk/mem alarm active on {','.join(alarms)}"
|
||||||
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
|
push("Rabbitmq Cluster", "down", msg, ping_ms)
|
||||||
return
|
return
|
||||||
|
|
||||||
msg = f"{nodes_running}/{total_nodes} nodes running"
|
msg = f"{nodes_running}/{total_nodes} nodes running"
|
||||||
push("RABBITMQ-CLUSTER", "up", msg, ping_ms)
|
push("Rabbitmq Cluster", "up", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}"
|
msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}"
|
||||||
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
|
push("Rabbitmq Cluster", "down", msg, ping_ms)
|
||||||
|
|
||||||
def check_apisix():
|
def check_apisix():
|
||||||
url = "http://apisix:9180/apisix/admin/routes"
|
url = "http://apisix:9180/apisix/admin/routes"
|
||||||
@ -120,9 +120,9 @@ def check_apisix():
|
|||||||
ok, resp, ping_ms, err = http_check(url, headers=headers)
|
ok, resp, ping_ms, err = http_check(url, headers=headers)
|
||||||
|
|
||||||
if ok:
|
if ok:
|
||||||
push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms)
|
push("Apisix Gateway", "up", "admin API reachable", ping_ms)
|
||||||
else:
|
else:
|
||||||
push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
|
push("Apisix Gateway", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
|
||||||
|
|
||||||
def check_vault():
|
def check_vault():
|
||||||
hosts_env = os.getenv("VAULT_HOSTS", "vault")
|
hosts_env = os.getenv("VAULT_HOSTS", "vault")
|
||||||
@ -152,18 +152,18 @@ def check_vault():
|
|||||||
|
|
||||||
if unsealed_count == total:
|
if unsealed_count == total:
|
||||||
msg = f"{unsealed_count}/{total} unsealed"
|
msg = f"{unsealed_count}/{total} unsealed"
|
||||||
push("VAULT-CLUSTER", "up", msg, ping_ms)
|
push("Vault Cluster", "up", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
msg = " | ".join(errors) if errors else "Vault checks failed"
|
msg = " | ".join(errors) if errors else "Vault checks failed"
|
||||||
push("VAULT-CLUSTER", "down", msg, ping_ms)
|
push("Vault Cluster", "down", msg, ping_ms)
|
||||||
|
|
||||||
def check_prometheus():
|
def check_prometheus():
|
||||||
url = "http://prometheus:9090/-/healthy"
|
url = "http://prometheus:9090/-/healthy"
|
||||||
ok, resp, ping_ms, err = http_check(url)
|
ok, resp, ping_ms, err = http_check(url)
|
||||||
if ok:
|
if ok:
|
||||||
push("PROMETHEUS", "up", "healthy", ping_ms)
|
push("Prometheus", "up", "healthy", ping_ms)
|
||||||
else:
|
else:
|
||||||
push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms)
|
push("Prometheus", "down", f"prometheus unreachable: {err}", ping_ms)
|
||||||
|
|
||||||
def check_grafana():
|
def check_grafana():
|
||||||
url = "http://grafana:3000/api/health"
|
url = "http://grafana:3000/api/health"
|
||||||
@ -172,27 +172,27 @@ def check_grafana():
|
|||||||
data = resp.json()
|
data = resp.json()
|
||||||
db_status = data.get("database", "unknown")
|
db_status = data.get("database", "unknown")
|
||||||
if db_status == "ok":
|
if db_status == "ok":
|
||||||
push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms)
|
push("Grafana", "up", f"ok | db: {db_status}", ping_ms)
|
||||||
else:
|
else:
|
||||||
push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms)
|
push("Grafana", "down", f"db not ok: {db_status}", ping_ms)
|
||||||
else:
|
else:
|
||||||
push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms)
|
push("Grafana", "down", f"grafana unreachable: {err}", ping_ms)
|
||||||
|
|
||||||
def check_portainer():
|
def check_portainer():
|
||||||
url = "http://portainer:9000/api/system/status"
|
url = "http://portainer:9000/api/system/status"
|
||||||
ok, resp, ping_ms, err = http_check(url)
|
ok, resp, ping_ms, err = http_check(url)
|
||||||
if ok:
|
if ok:
|
||||||
push("PORTAINER", "up", "running", ping_ms)
|
push("Portainer", "up", "running", ping_ms)
|
||||||
else:
|
else:
|
||||||
push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms)
|
push("Portainer", "down", f"portainer unreachable: {err}", ping_ms)
|
||||||
|
|
||||||
def check_loki():
|
def check_loki():
|
||||||
url = "http://loki:3100/ready"
|
url = "http://loki:3100/ready"
|
||||||
ok, resp, ping_ms, err = http_check(url)
|
ok, resp, ping_ms, err = http_check(url)
|
||||||
if ok:
|
if ok:
|
||||||
push("LOKI", "up", "ready", ping_ms)
|
push("Loki", "up", "ready", ping_ms)
|
||||||
else:
|
else:
|
||||||
push("LOKI", "down", f"loki unreachable: {err}", ping_ms)
|
push("Loki", "down", f"loki unreachable: {err}", ping_ms)
|
||||||
|
|
||||||
def run_all_http_checks():
|
def run_all_http_checks():
|
||||||
check_patroni_cluster()
|
check_patroni_cluster()
|
||||||
|
|||||||
@ -35,7 +35,7 @@ def check_mongodb():
|
|||||||
ping_ms = int((time.time() - start_t) * 1000)
|
ping_ms = int((time.time() - start_t) * 1000)
|
||||||
|
|
||||||
if cluster_size == 1:
|
if cluster_size == 1:
|
||||||
push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
|
push("Mongodb Replicaset", "up", "standalone mode OK", ping_ms)
|
||||||
return
|
return
|
||||||
|
|
||||||
if primary:
|
if primary:
|
||||||
@ -45,13 +45,13 @@ def check_mongodb():
|
|||||||
unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
|
unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
|
||||||
if unhealthy_secs:
|
if unhealthy_secs:
|
||||||
msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
|
msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
|
||||||
push("MONGODB-REPLICASET", "down", msg, ping_ms)
|
push("Mongodb Replicaset", "down", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
push("MONGODB-REPLICASET", "up", msg, ping_ms)
|
push("Mongodb Replicaset", "up", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
msg = "no PRIMARY | quorum lost"
|
msg = "no PRIMARY | quorum lost"
|
||||||
push("MONGODB-REPLICASET", "down", msg, ping_ms)
|
push("Mongodb Replicaset", "down", msg, ping_ms)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
ping_ms = int((time.time() - start_t) * 1000)
|
ping_ms = int((time.time() - start_t) * 1000)
|
||||||
push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)
|
push("Mongodb Replicaset", "down", f"connection failed: {e}", ping_ms)
|
||||||
|
|||||||
@ -24,7 +24,7 @@ def check_redis_sentinel():
|
|||||||
redis_mode = os.getenv("REDIS_MODE", "sentinel")
|
redis_mode = os.getenv("REDIS_MODE", "sentinel")
|
||||||
|
|
||||||
if redis_mode != "sentinel":
|
if redis_mode != "sentinel":
|
||||||
push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
|
push("Redis Sentinel", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -43,8 +43,8 @@ def check_redis_sentinel():
|
|||||||
ping_ms = int((time.time() - start_t) * 1000)
|
ping_ms = int((time.time() - start_t) * 1000)
|
||||||
|
|
||||||
msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
|
msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
|
||||||
push("REDIS-SENTINEL", "up", msg, ping_ms)
|
push("Redis Sentinel", "up", msg, ping_ms)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
ping_ms = int((time.time() - start_t) * 1000)
|
ping_ms = int((time.time() - start_t) * 1000)
|
||||||
push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
|
push("Redis Sentinel", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
|
||||||
|
|||||||
@ -38,12 +38,12 @@ def check_swarm_cluster():
|
|||||||
|
|
||||||
if ready_count == total_nodes:
|
if ready_count == total_nodes:
|
||||||
msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
|
msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
|
||||||
push("SWARM-CLUSTER", "up", msg, ping_ms)
|
push("Swarm Cluster", "up", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
|
msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
|
||||||
push("SWARM-CLUSTER", "down", msg, ping_ms)
|
push("Swarm Cluster", "down", msg, ping_ms)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
ping_ms = int((time.time() - start_time) * 1000)
|
ping_ms = int((time.time() - start_time) * 1000)
|
||||||
logger.error(f"Swarm check failed: {e}")
|
logger.error(f"Swarm check failed: {e}")
|
||||||
push("SWARM-CLUSTER", "down", str(e), ping_ms)
|
push("Swarm Cluster", "down", str(e), ping_ms)
|
||||||
|
|||||||
@ -70,8 +70,8 @@ def check_etcd_cluster():
|
|||||||
if healthy_count == len(nodes):
|
if healthy_count == len(nodes):
|
||||||
leader_info = f" | leader: {leader}" if leader else ""
|
leader_info = f" | leader: {leader}" if leader else ""
|
||||||
msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
|
msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
|
||||||
push("ETCD-CLUSTER", "up", msg, ping_ms)
|
push("Etcd Cluster", "up", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else ""
|
quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else ""
|
||||||
msg = " | ".join(errors) + quorum_msg
|
msg = " | ".join(errors) + quorum_msg
|
||||||
push("ETCD-CLUSTER", "down", msg, ping_ms)
|
push("Etcd Cluster", "down", msg, ping_ms)
|
||||||
|
|||||||
@ -57,6 +57,6 @@ def check_swag_tls():
|
|||||||
msg = " | ".join(msg_parts)
|
msg = " | ".join(msg_parts)
|
||||||
|
|
||||||
if is_down:
|
if is_down:
|
||||||
push("SWAG-TLS", "down", msg, ping_ms)
|
push("Swag Tls", "down", msg, ping_ms)
|
||||||
else:
|
else:
|
||||||
push("SWAG-TLS", "up", msg, ping_ms)
|
push("Swag Tls", "up", msg, ping_ms)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user