diff --git a/health-agent/config/monitors.yml b/health-agent/config/monitors.yml index 3cfe7fd..576d4ee 100644 --- a/health-agent/config/monitors.yml +++ b/health-agent/config/monitors.yml @@ -78,22 +78,22 @@ groups: children: [Prometheus, Grafana, Portainer, Loki, Ext Https Portainer, Ext Https Apigw] push_monitors: - name: Swarm Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, infrastructure, high] restart_threshold: 1 - name: Vault Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, infrastructure, high] restart_threshold: 1 - name: Etcd Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 - name: Patroni Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 @@ -103,17 +103,17 @@ push_monitors: tags: [internal, database, high] restart_threshold: 1 - name: Apisix Gateway - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, gateway, high] restart_threshold: 1 - name: Rabbitmq Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, gateway, medium] restart_threshold: 3 - name: Redis Sentinel - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, database, medium] restart_threshold: 3 @@ -149,7 +149,7 @@ push_monitors: restart_threshold: 5 http_monitors: - name: Ext Https Api - url: "https://api{suffix}.{domain}/actuator/health" + url: "https://api{suffix}.{domain}/health" accepted_statuscodes: ["200"] interval: 60 - name: Ext Https Grafana diff --git a/health-agent/src/health_agent/checks/http.py b/health-agent/src/health_agent/checks/http.py index ea9d96b..5191392 100644 --- a/health-agent/src/health_agent/checks/http.py +++ b/health-agent/src/health_agent/checks/http.py @@ -139,14 +139,14 @@ def check_vault(): ok, resp, ms, err = http_check(url, expected_status=[200, 429, 473]) max_ping = max(max_ping, ms) - if resp: + if resp is not None: data = resp.json() if not data.get("sealed"): unsealed_count += 1 else: errors.append(f"{node} SEALED") else: - errors.append(f"{node} unreachable") + errors.append(f"{node} unreachable: {err}") ping_ms = int((time.time() - start_t) * 1000) diff --git a/health-agent/src/health_agent/checks/redis_sentinel.py b/health-agent/src/health_agent/checks/redis_sentinel.py index 1dea54c..ade60bd 100644 --- a/health-agent/src/health_agent/checks/redis_sentinel.py +++ b/health-agent/src/health_agent/checks/redis_sentinel.py @@ -28,7 +28,11 @@ def check_redis_sentinel(): return try: - sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password) + sentinel_kwargs = {"socket_timeout": 3} + if password: + sentinel_kwargs["password"] = password + + sentinel = Sentinel(sentinel_nodes, sentinel_kwargs=sentinel_kwargs, socket_timeout=3, password=password) # Master ping master = sentinel.master_for(master_name, socket_timeout=3, password=password) diff --git a/health-agent/src/health_agent/main.py b/health-agent/src/health_agent/main.py index 7a0d361..5321a1d 100644 --- a/health-agent/src/health_agent/main.py +++ b/health-agent/src/health_agent/main.py @@ -2,6 +2,7 @@ import argparse import time import logging import json +import threading from health_agent.checks import swarm from health_agent.checks.http import run_all_http_checks from health_agent.checks.tcp import check_etcd_cluster @@ -65,9 +66,9 @@ def run_checks(): logger.error(f"Error running MongoDB checks: {e}") try: - check_storagebox_mount() + threading.Thread(target=check_storagebox_mount, daemon=True).start() except Exception as e: - logger.error(f"Error running filesystem checks: {e}") + logger.error(f"Error starting filesystem check thread: {e}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="iklim.co Health Agent") @@ -88,5 +89,7 @@ if __name__ == "__main__": run_checks() else: while True: + t_start = time.time() run_checks() - time.sleep(60) + elapsed = time.time() - t_start + time.sleep(max(0, 60 - elapsed))