fix(monitoring): resolve health-agent bugs and flapping monitors
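- Vault flapping: Fix `resp` evaluation on HTTP 429 (check `resp is not None` instead of relying on the response's truthiness)
- Storagebox block: Move mount check to a daemon thread
- Push monitors: Increase interval to 75s and restore 60s sleep (the loop now sleeps for 60s minus the time the checks took)
- Redis Sentinel: Fix authentication in sentinel_kwargs
- Ext Https Api: Update URL to /health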
This commit is contained in:
parent
b73ae4e5fb
commit
969c4a2301
@ -78,22 +78,22 @@ groups:
|
|||||||
children: [Prometheus, Grafana, Portainer, Loki, Ext Https Portainer, Ext Https Apigw]
|
children: [Prometheus, Grafana, Portainer, Loki, Ext Https Portainer, Ext Https Apigw]
|
||||||
push_monitors:
|
push_monitors:
|
||||||
- name: Swarm Cluster
|
- name: Swarm Cluster
|
||||||
interval: 60
|
interval: 75
|
||||||
heartbeat_retries: 1
|
heartbeat_retries: 1
|
||||||
tags: [internal, infrastructure, high]
|
tags: [internal, infrastructure, high]
|
||||||
restart_threshold: 1
|
restart_threshold: 1
|
||||||
- name: Vault Cluster
|
- name: Vault Cluster
|
||||||
interval: 60
|
interval: 75
|
||||||
heartbeat_retries: 1
|
heartbeat_retries: 1
|
||||||
tags: [internal, infrastructure, high]
|
tags: [internal, infrastructure, high]
|
||||||
restart_threshold: 1
|
restart_threshold: 1
|
||||||
- name: Etcd Cluster
|
- name: Etcd Cluster
|
||||||
interval: 60
|
interval: 75
|
||||||
heartbeat_retries: 1
|
heartbeat_retries: 1
|
||||||
tags: [internal, database, high]
|
tags: [internal, database, high]
|
||||||
restart_threshold: 1
|
restart_threshold: 1
|
||||||
- name: Patroni Cluster
|
- name: Patroni Cluster
|
||||||
interval: 60
|
interval: 75
|
||||||
heartbeat_retries: 1
|
heartbeat_retries: 1
|
||||||
tags: [internal, database, high]
|
tags: [internal, database, high]
|
||||||
restart_threshold: 1
|
restart_threshold: 1
|
||||||
@ -103,17 +103,17 @@ push_monitors:
|
|||||||
tags: [internal, database, high]
|
tags: [internal, database, high]
|
||||||
restart_threshold: 1
|
restart_threshold: 1
|
||||||
- name: Apisix Gateway
|
- name: Apisix Gateway
|
||||||
interval: 60
|
interval: 75
|
||||||
heartbeat_retries: 1
|
heartbeat_retries: 1
|
||||||
tags: [internal, gateway, high]
|
tags: [internal, gateway, high]
|
||||||
restart_threshold: 1
|
restart_threshold: 1
|
||||||
- name: Rabbitmq Cluster
|
- name: Rabbitmq Cluster
|
||||||
interval: 60
|
interval: 75
|
||||||
heartbeat_retries: 1
|
heartbeat_retries: 1
|
||||||
tags: [internal, gateway, medium]
|
tags: [internal, gateway, medium]
|
||||||
restart_threshold: 3
|
restart_threshold: 3
|
||||||
- name: Redis Sentinel
|
- name: Redis Sentinel
|
||||||
interval: 60
|
interval: 75
|
||||||
heartbeat_retries: 1
|
heartbeat_retries: 1
|
||||||
tags: [internal, database, medium]
|
tags: [internal, database, medium]
|
||||||
restart_threshold: 3
|
restart_threshold: 3
|
||||||
@ -149,7 +149,7 @@ push_monitors:
|
|||||||
restart_threshold: 5
|
restart_threshold: 5
|
||||||
http_monitors:
|
http_monitors:
|
||||||
- name: Ext Https Api
|
- name: Ext Https Api
|
||||||
url: "https://api{suffix}.{domain}/actuator/health"
|
url: "https://api{suffix}.{domain}/health"
|
||||||
accepted_statuscodes: ["200"]
|
accepted_statuscodes: ["200"]
|
||||||
interval: 60
|
interval: 60
|
||||||
- name: Ext Https Grafana
|
- name: Ext Https Grafana
|
||||||
|
|||||||
@ -139,14 +139,14 @@ def check_vault():
|
|||||||
ok, resp, ms, err = http_check(url, expected_status=[200, 429, 473])
|
ok, resp, ms, err = http_check(url, expected_status=[200, 429, 473])
|
||||||
max_ping = max(max_ping, ms)
|
max_ping = max(max_ping, ms)
|
||||||
|
|
||||||
if resp:
|
if resp is not None:
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
if not data.get("sealed"):
|
if not data.get("sealed"):
|
||||||
unsealed_count += 1
|
unsealed_count += 1
|
||||||
else:
|
else:
|
||||||
errors.append(f"{node} SEALED")
|
errors.append(f"{node} SEALED")
|
||||||
else:
|
else:
|
||||||
errors.append(f"{node} unreachable")
|
errors.append(f"{node} unreachable: {err}")
|
||||||
|
|
||||||
ping_ms = int((time.time() - start_t) * 1000)
|
ping_ms = int((time.time() - start_t) * 1000)
|
||||||
|
|
||||||
|
|||||||
@ -28,7 +28,11 @@ def check_redis_sentinel():
|
|||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password)
|
sentinel_kwargs = {"socket_timeout": 3}
|
||||||
|
if password:
|
||||||
|
sentinel_kwargs["password"] = password
|
||||||
|
|
||||||
|
sentinel = Sentinel(sentinel_nodes, sentinel_kwargs=sentinel_kwargs, socket_timeout=3, password=password)
|
||||||
|
|
||||||
# Master ping
|
# Master ping
|
||||||
master = sentinel.master_for(master_name, socket_timeout=3, password=password)
|
master = sentinel.master_for(master_name, socket_timeout=3, password=password)
|
||||||
|
|||||||
@ -2,6 +2,7 @@ import argparse
|
|||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
|
import threading
|
||||||
from health_agent.checks import swarm
|
from health_agent.checks import swarm
|
||||||
from health_agent.checks.http import run_all_http_checks
|
from health_agent.checks.http import run_all_http_checks
|
||||||
from health_agent.checks.tcp import check_etcd_cluster
|
from health_agent.checks.tcp import check_etcd_cluster
|
||||||
@ -65,9 +66,9 @@ def run_checks():
|
|||||||
logger.error(f"Error running MongoDB checks: {e}")
|
logger.error(f"Error running MongoDB checks: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
check_storagebox_mount()
|
threading.Thread(target=check_storagebox_mount, daemon=True).start()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error running filesystem checks: {e}")
|
logger.error(f"Error starting filesystem check thread: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description="iklim.co Health Agent")
|
parser = argparse.ArgumentParser(description="iklim.co Health Agent")
|
||||||
@ -88,5 +89,7 @@ if __name__ == "__main__":
|
|||||||
run_checks()
|
run_checks()
|
||||||
else:
|
else:
|
||||||
while True:
|
while True:
|
||||||
|
t_start = time.time()
|
||||||
run_checks()
|
run_checks()
|
||||||
time.sleep(60)
|
elapsed = time.time() - t_start
|
||||||
|
time.sleep(max(0, 60 - elapsed))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user