fix(monitoring): resolve health-agent bugs and flapping monitors
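- Vault flapping: Test `resp is not None` instead of relying on truthiness so HTTP 429 responses are still evaluated, and include the error detail in the "unreachable" message
- Storagebox block: Move the mount check to a daemon thread so it no longer blocks the remaining checks
- Push monitors: Increase interval to 75s and restore the 60s check cycle (sleep for 60s minus the time spent running the checks)
- Redis Sentinel: Fix authentication by passing the password in sentinel_kwargs
- Ext Https Api: Update URL from /actuator/health to /health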
This commit is contained in:
parent
b73ae4e5fb
commit
969c4a2301
@ -78,22 +78,22 @@ groups:
|
||||
children: [Prometheus, Grafana, Portainer, Loki, Ext Https Portainer, Ext Https Apigw]
|
||||
push_monitors:
|
||||
- name: Swarm Cluster
|
||||
interval: 60
|
||||
interval: 75
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, infrastructure, high]
|
||||
restart_threshold: 1
|
||||
- name: Vault Cluster
|
||||
interval: 60
|
||||
interval: 75
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, infrastructure, high]
|
||||
restart_threshold: 1
|
||||
- name: Etcd Cluster
|
||||
interval: 60
|
||||
interval: 75
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, database, high]
|
||||
restart_threshold: 1
|
||||
- name: Patroni Cluster
|
||||
interval: 60
|
||||
interval: 75
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, database, high]
|
||||
restart_threshold: 1
|
||||
@ -103,17 +103,17 @@ push_monitors:
|
||||
tags: [internal, database, high]
|
||||
restart_threshold: 1
|
||||
- name: Apisix Gateway
|
||||
interval: 60
|
||||
interval: 75
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, gateway, high]
|
||||
restart_threshold: 1
|
||||
- name: Rabbitmq Cluster
|
||||
interval: 60
|
||||
interval: 75
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, gateway, medium]
|
||||
restart_threshold: 3
|
||||
- name: Redis Sentinel
|
||||
interval: 60
|
||||
interval: 75
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, database, medium]
|
||||
restart_threshold: 3
|
||||
@ -149,7 +149,7 @@ push_monitors:
|
||||
restart_threshold: 5
|
||||
http_monitors:
|
||||
- name: Ext Https Api
|
||||
url: "https://api{suffix}.{domain}/actuator/health"
|
||||
url: "https://api{suffix}.{domain}/health"
|
||||
accepted_statuscodes: ["200"]
|
||||
interval: 60
|
||||
- name: Ext Https Grafana
|
||||
|
||||
@ -139,14 +139,14 @@ def check_vault():
|
||||
ok, resp, ms, err = http_check(url, expected_status=[200, 429, 473])
|
||||
max_ping = max(max_ping, ms)
|
||||
|
||||
if resp:
|
||||
if resp is not None:
|
||||
data = resp.json()
|
||||
if not data.get("sealed"):
|
||||
unsealed_count += 1
|
||||
else:
|
||||
errors.append(f"{node} SEALED")
|
||||
else:
|
||||
errors.append(f"{node} unreachable")
|
||||
errors.append(f"{node} unreachable: {err}")
|
||||
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
|
||||
@ -28,7 +28,11 @@ def check_redis_sentinel():
|
||||
return
|
||||
|
||||
try:
|
||||
sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password)
|
||||
sentinel_kwargs = {"socket_timeout": 3}
|
||||
if password:
|
||||
sentinel_kwargs["password"] = password
|
||||
|
||||
sentinel = Sentinel(sentinel_nodes, sentinel_kwargs=sentinel_kwargs, socket_timeout=3, password=password)
|
||||
|
||||
# Master ping
|
||||
master = sentinel.master_for(master_name, socket_timeout=3, password=password)
|
||||
|
||||
@ -2,6 +2,7 @@ import argparse
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import threading
|
||||
from health_agent.checks import swarm
|
||||
from health_agent.checks.http import run_all_http_checks
|
||||
from health_agent.checks.tcp import check_etcd_cluster
|
||||
@ -65,9 +66,9 @@ def run_checks():
|
||||
logger.error(f"Error running MongoDB checks: {e}")
|
||||
|
||||
try:
|
||||
check_storagebox_mount()
|
||||
threading.Thread(target=check_storagebox_mount, daemon=True).start()
|
||||
except Exception as e:
|
||||
logger.error(f"Error running filesystem checks: {e}")
|
||||
logger.error(f"Error starting filesystem check thread: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="iklim.co Health Agent")
|
||||
@ -88,5 +89,7 @@ if __name__ == "__main__":
|
||||
run_checks()
|
||||
else:
|
||||
while True:
|
||||
t_start = time.time()
|
||||
run_checks()
|
||||
time.sleep(60)
|
||||
elapsed = time.time() - t_start
|
||||
time.sleep(max(0, 60 - elapsed))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user