"""Health checks for the etcd cluster, reported through Uptime Kuma."""

import logging
import socket
import time

import requests

from health_agent.checks.http import http_check
from health_agent.uptime_kuma import push

logger = logging.getLogger(__name__)

# Cluster members probed by check_etcd_cluster().
_ETCD_NODES = ("etcd-01", "etcd-02", "etcd-03")
# Port used for the TCP probe and for both HTTP endpoints.
_ETCD_CLIENT_PORT = 2379
# Per-request timeout, in seconds, for every probe against a node.
_ETCD_TIMEOUT = 3


def _elapsed_ms(start):
    """Return whole milliseconds elapsed since *start* (a time.monotonic() value)."""
    return int((time.monotonic() - start) * 1000)


def tcp_check(host, port, timeout=3):
    """Probe a TCP port and report whether a connection could be opened.

    Args:
        host: Hostname or IPv4 address to connect to.
        port: TCP port number.
        timeout: Socket timeout in seconds.

    Returns:
        A ``(ok, ping_ms, error)`` tuple. ``ok`` is True when ``connect_ex``
        returned 0; ``ping_ms`` is the elapsed time in whole milliseconds;
        ``error`` is None on success and a description of the failure
        otherwise.

    This function does not raise: any exception during the probe is
    converted into a failed result.
    """
    # Monotonic clock: the measured duration cannot be skewed by wall-clock
    # adjustments (NTP steps, manual changes).
    start = time.monotonic()
    try:
        # The context manager closes the socket even when settimeout() or
        # connect_ex() raises (e.g. the hostname cannot be resolved).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            result = sock.connect_ex((host, port))
    except Exception as exc:  # deliberate: a probe reports failure, never raises
        return False, _elapsed_ms(start), str(exc)

    ping_ms = _elapsed_ms(start)
    if result == 0:
        return True, ping_ms, None
    return False, ping_ms, f"Port {port} is closed or unreachable"


def _etcd_health_error(node):
    """Query *node*'s ``/health`` endpoint; return None if healthy, else an error string."""
    url = f"http://{node}:{_ETCD_CLIENT_PORT}/health"
    http_ok, resp, _, _ = http_check(url, timeout=_ETCD_TIMEOUT)
    if not (http_ok and resp):
        return f"{node} health endpoint unreachable"

    try:
        data = resp.json()
    except ValueError:
        # A non-JSON body must count as a failed node rather than abort the
        # whole check before anything is pushed.
        return f"{node} health endpoint returned invalid JSON"

    if isinstance(data, dict) and data.get("health") == "true":
        return None
    return f"{node} unhealthy"


def _etcd_is_leader(node):
    """Return True when *node*'s maintenance status names the node itself as leader.

    Best-effort: request failures, non-200 answers and malformed payloads
    all yield False.
    """
    status_url = f"http://{node}:{_ETCD_CLIENT_PORT}/v3/maintenance/status"
    try:
        r = requests.post(status_url, json={}, timeout=_ETCD_TIMEOUT)
        if r.status_code != 200:
            return False
        status_data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Leader detection only enriches the status message, so a failure
        # here is logged instead of failing the cluster check.
        logger.debug("etcd leader lookup failed on %s: %s", node, exc)
        return False

    if not isinstance(status_data, dict):
        return False
    leader_id = status_data.get("leader")
    header = status_data.get("header")
    member_id = header.get("member_id") if isinstance(header, dict) else None
    # The node is the leader when the reported leader id equals the id of
    # the member that answered the request.
    return bool(leader_id) and leader_id == member_id


def check_etcd_cluster():
    """Check every etcd node and push the aggregated state to Uptime Kuma.

    For each node the check runs, in order:

    1. a TCP probe of the client port; on failure the node is recorded as
       unreachable and the remaining steps are skipped for it,
    2. an HTTP GET of ``/health``; the node counts as healthy when the JSON
       body has ``"health": "true"``,
    3. until a leader has been found, a POST to ``/v3/maintenance/status``
       to see whether this node is the leader.

    Pushes ``"up"`` to the ``ETCD-CLUSTER`` monitor when all nodes are
    healthy, otherwise ``"down"`` with the collected errors. The pushed
    ping value is the duration of the whole check in milliseconds.
    """
    nodes = _ETCD_NODES
    start = time.monotonic()
    healthy_count = 0
    leader = None
    errors = []

    for node in nodes:
        # 1. TCP check on the client port.
        tcp_ok, _, _ = tcp_check(node, _ETCD_CLIENT_PORT, timeout=_ETCD_TIMEOUT)
        if not tcp_ok:
            errors.append(f"{node} port {_ETCD_CLIENT_PORT} unreachable")
            continue

        # 2. HTTP health check.
        error = _etcd_health_error(node)
        if error is None:
            healthy_count += 1
        else:
            errors.append(error)

        # 3. Leader check; skipped once a leader is known.
        if leader is None and _etcd_is_leader(node):
            leader = node

    ping_ms = _elapsed_ms(start)
    summary = f"{healthy_count}/{len(nodes)}"

    if healthy_count == len(nodes):
        leader_info = f" | leader: {leader}" if leader else ""
        push("ETCD-CLUSTER", "up", f"{summary} healthy{leader_info}", ping_ms)
        return

    # At least one node is unhealthy on this path, so the quorum warning is
    # always part of the message.
    msg = " | ".join(errors) + f" | quorum at risk ({summary})"
    push("ETCD-CLUSTER", "down", msg, ping_ms)