fix(monitoring): resolve health-agent bugs and flapping monitors

- Vault flapping: Fix resp evaluation on HTTP 429 (test `resp is not None` instead of `if resp`, so an accepted 429 response is still parsed)
- Storagebox block: Move mount check to a daemon thread
- Push monitors: Increase interval to 75s and restore the 60s loop period (sleep 60s minus the time the checks took)
- Redis Sentinel: Fix authentication by passing the password via sentinel_kwargs
- Ext Https Api: Update URL from /actuator/health to /health
This commit is contained in:
Murat ÖZDEMİR 2026-06-26 22:51:15 +03:00
parent b73ae4e5fb
commit 969c4a2301
4 changed files with 21 additions and 14 deletions

View File

@@ -78,22 +78,22 @@ groups:
children: [Prometheus, Grafana, Portainer, Loki, Ext Https Portainer, Ext Https Apigw]
push_monitors:
- name: Swarm Cluster
interval: 60
interval: 75
heartbeat_retries: 1
tags: [internal, infrastructure, high]
restart_threshold: 1
- name: Vault Cluster
interval: 60
interval: 75
heartbeat_retries: 1
tags: [internal, infrastructure, high]
restart_threshold: 1
- name: Etcd Cluster
interval: 60
interval: 75
heartbeat_retries: 1
tags: [internal, database, high]
restart_threshold: 1
- name: Patroni Cluster
interval: 60
interval: 75
heartbeat_retries: 1
tags: [internal, database, high]
restart_threshold: 1
@@ -103,17 +103,17 @@ push_monitors:
tags: [internal, database, high]
restart_threshold: 1
- name: Apisix Gateway
interval: 60
interval: 75
heartbeat_retries: 1
tags: [internal, gateway, high]
restart_threshold: 1
- name: Rabbitmq Cluster
interval: 60
interval: 75
heartbeat_retries: 1
tags: [internal, gateway, medium]
restart_threshold: 3
- name: Redis Sentinel
interval: 60
interval: 75
heartbeat_retries: 1
tags: [internal, database, medium]
restart_threshold: 3
@@ -149,7 +149,7 @@ push_monitors:
restart_threshold: 5
http_monitors:
- name: Ext Https Api
url: "https://api{suffix}.{domain}/actuator/health"
url: "https://api{suffix}.{domain}/health"
accepted_statuscodes: ["200"]
interval: 60
- name: Ext Https Grafana

View File

@@ -139,14 +139,14 @@ def check_vault():
ok, resp, ms, err = http_check(url, expected_status=[200, 429, 473])
max_ping = max(max_ping, ms)
if resp:
if resp is not None:
data = resp.json()
if not data.get("sealed"):
unsealed_count += 1
else:
errors.append(f"{node} SEALED")
else:
errors.append(f"{node} unreachable")
errors.append(f"{node} unreachable: {err}")
ping_ms = int((time.time() - start_t) * 1000)

View File

@@ -28,7 +28,11 @@ def check_redis_sentinel():
return
try:
sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password)
sentinel_kwargs = {"socket_timeout": 3}
if password:
sentinel_kwargs["password"] = password
sentinel = Sentinel(sentinel_nodes, sentinel_kwargs=sentinel_kwargs, socket_timeout=3, password=password)
# Master ping
master = sentinel.master_for(master_name, socket_timeout=3, password=password)

View File

@@ -2,6 +2,7 @@ import argparse
import time
import logging
import json
import threading
from health_agent.checks import swarm
from health_agent.checks.http import run_all_http_checks
from health_agent.checks.tcp import check_etcd_cluster
@@ -65,9 +66,9 @@ def run_checks():
logger.error(f"Error running MongoDB checks: {e}")
try:
check_storagebox_mount()
threading.Thread(target=check_storagebox_mount, daemon=True).start()
except Exception as e:
logger.error(f"Error running filesystem checks: {e}")
logger.error(f"Error starting filesystem check thread: {e}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="iklim.co Health Agent")
@@ -88,5 +89,7 @@ if __name__ == "__main__":
run_checks()
else:
while True:
t_start = time.time()
run_checks()
time.sleep(60)
elapsed = time.time() - t_start
time.sleep(max(0, 60 - elapsed))