78 lines
2.6 KiB
Python

import socket
import time
import logging
import requests
from health_agent.uptime_kuma import push
from health_agent.checks.http import http_check
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def tcp_check(host, port, timeout=3):
    """Probe a TCP port and report whether it accepts a connection.

    Args:
        host: Hostname or IPv4 address to connect to (the socket is AF_INET).
        port: TCP port number.
        timeout: Socket timeout, in seconds, applied to the connection attempt.

    Returns:
        A ``(ok, ping_ms, error)`` tuple. ``ok`` is True when the connection
        succeeded; ``ping_ms`` is the elapsed time in whole milliseconds;
        ``error`` is None on success and a message string on failure.
        Exceptions are caught and reported through ``error``.
    """
    # Monotonic clock so the measured latency cannot be skewed (or go
    # negative) when the wall clock is adjusted mid-check.
    started = time.monotonic()
    try:
        # The context manager closes the socket even when connect_ex raises
        # (it still raises for problems such as "host not found" or an
        # out-of-range port), so no descriptor is leaked on the error path.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            result = sock.connect_ex((host, port))
    except Exception as exc:
        ping_ms = int((time.monotonic() - started) * 1000)
        return False, ping_ms, str(exc)

    ping_ms = int((time.monotonic() - started) * 1000)
    # connect_ex returns 0 on success and an errno value otherwise.
    if result == 0:
        return True, ping_ms, None
    return False, ping_ms, f"Port {port} is closed or unreachable"
def _etcd_health_error(node):
    """Return None when *node*'s ``/health`` endpoint reports healthy, else an error message."""
    url = f"http://{node}:2379/health"
    # http_check returns a 4-tuple; only the success flag and the response
    # object are needed here.
    http_ok, resp, _ms, _http_err = http_check(url, timeout=3)
    if not (http_ok and resp):
        return f"{node} health endpoint unreachable"
    try:
        data = resp.json()
    except ValueError:
        # A non-JSON body must not abort the whole cluster check.
        logger.debug("Non-JSON /health response from %s", node, exc_info=True)
        return f"{node} health response invalid"
    # The endpoint is expected to report health as the string "true".
    if isinstance(data, dict) and data.get("health") == "true":
        return None
    return f"{node} unhealthy"


def _etcd_node_is_leader(node):
    """Best-effort check: return True when *node* reports itself as the leader."""
    status_url = f"http://{node}:2379/v3/maintenance/status"
    try:
        r = requests.post(status_url, json={}, timeout=3)
        if r.status_code != 200:
            return False
        status_data = r.json()
        leader_id = status_data.get("leader")
        header_member_id = status_data.get("header", {}).get("member_id")
    except Exception:
        # Leader detection is informational only; never let it fail the
        # check, but leave a trace for debugging instead of swallowing it.
        logger.debug("Leader lookup failed for %s", node, exc_info=True)
        return False
    # The node is the leader when the reported leader id is its own member id.
    return bool(leader_id) and leader_id == header_member_id


def check_etcd_cluster():
    """Check every etcd node and push one aggregate status for the cluster.

    For each node: probe TCP port 2379, query ``/health``, and (until a leader
    has been found) ask ``/v3/maintenance/status`` whether the node is the
    leader. Pushes "up" to the ``ETCD-CLUSTER`` monitor when all nodes are
    healthy, otherwise "down" with the collected per-node errors. The pushed
    ping value is the total elapsed time of all checks in milliseconds.

    Returns:
        None. The result is reported through ``push``.
    """
    nodes = ["etcd-01", "etcd-02", "etcd-03"]
    started = time.monotonic()
    healthy_count = 0
    leader = None
    errors = []

    for node in nodes:
        # 1. TCP check on 2379; skip the HTTP probes when the port is down.
        tcp_ok, _ms, _tcp_err = tcp_check(node, 2379)
        if not tcp_ok:
            errors.append(f"{node} port 2379 unreachable")
            continue

        # 2. HTTP health check.
        health_error = _etcd_health_error(node)
        if health_error is None:
            healthy_count += 1
        else:
            errors.append(health_error)

        # 3. Leader check, only until the first leader has been identified.
        if leader is None and _etcd_node_is_leader(node):
            leader = node

    ping_ms = int((time.monotonic() - started) * 1000)
    total = len(nodes)

    if healthy_count == total:
        leader_info = f" | leader: {leader}" if leader else ""
        msg = f"{healthy_count}/{total} healthy{leader_info}"
        push("ETCD-CLUSTER", "up", msg, ping_ms)
    else:
        # At least one node is not healthy on this path, so the quorum
        # warning always applies.
        quorum_msg = f" | quorum at risk ({healthy_count}/{total})"
        msg = " | ".join(errors) + quorum_msg
        push("ETCD-CLUSTER", "down", msg, ping_ms)