78 lines
2.6 KiB
Python
78 lines
2.6 KiB
Python
import socket
|
|
import time
|
|
import logging
|
|
import requests
|
|
from health_agent.uptime_kuma import push
|
|
from health_agent.checks.http import http_check
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
def tcp_check(host, port, timeout=3):
    """Probe a TCP port and report whether a connection can be established.

    Args:
        host: Hostname or IPv4 address to connect to.
        port: TCP port number.
        timeout: Socket timeout in seconds.

    Returns:
        A ``(ok, ping_ms, error)`` tuple: ``ok`` is True when the connection
        succeeded, ``ping_ms`` is the elapsed time in whole milliseconds, and
        ``error`` is None on success or a human-readable message on failure.
    """
    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
    started = time.monotonic()
    try:
        # The context manager closes the socket even when connect_ex raises
        # (e.g. name resolution failure), which previously leaked the fd.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            result = sock.connect_ex((host, port))
    except Exception as e:
        # Deliberately broad: this probe reports failures to the caller as a
        # result tuple instead of raising.
        ping_ms = int((time.monotonic() - started) * 1000)
        return False, ping_ms, str(e)

    ping_ms = int((time.monotonic() - started) * 1000)
    if result == 0:
        return True, ping_ms, None
    return False, ping_ms, f"Port {port} is closed or unreachable"
|
|
|
|
def _etcd_node_is_leader(node):
    """Return True when *node* reports itself as the cluster leader.

    POSTs an empty JSON object to the node's ``/v3/maintenance/status``
    endpoint and compares the response's ``leader`` field with
    ``header.member_id``. The lookup is best-effort: any failure is logged
    and treated as "not the leader" so it never aborts the cluster check.
    """
    status_url = f"http://{node}:2379/v3/maintenance/status"
    try:
        r = requests.post(status_url, json={}, timeout=3)
        if r.status_code != 200:
            return False
        status_data = r.json()
    except Exception as exc:
        # Leader info is only decoration for the status message; log the
        # failure instead of silently discarding it.
        logger.warning("etcd leader lookup failed for %s: %s", node, exc)
        return False

    if not isinstance(status_data, dict):
        return False
    header = status_data.get("header")
    if not isinstance(header, dict):
        return False
    leader_id = status_data.get("leader")
    return bool(leader_id) and leader_id == header.get("member_id")


def check_etcd_cluster():
    """Check every etcd node and push one aggregated status to Uptime Kuma.

    For each node the check runs, in order:
      1. a TCP probe of port 2379 (nodes that fail are skipped),
      2. an HTTP GET of ``/health``, counted healthy when the JSON body has
         ``"health": "true"``,
      3. a leader lookup, until one node has been identified as leader.

    Pushes "up" under the monitor name ``ETCD-CLUSTER`` when all nodes are
    healthy, otherwise "down" with the collected per-node errors. The total
    elapsed time in milliseconds is pushed as the ping value.
    """
    nodes = ["etcd-01", "etcd-02", "etcd-03"]
    started = time.monotonic()

    healthy_count = 0
    leader = None
    errors = []

    for node in nodes:
        # 1. TCP check on 2379
        tcp_ok, _, _ = tcp_check(node, 2379)
        if not tcp_ok:
            errors.append(f"{node} port 2379 unreachable")
            continue

        # 2. HTTP health check
        url = f"http://{node}:2379/health"
        http_ok, resp, _, _ = http_check(url, timeout=3)

        if http_ok and resp:
            try:
                data = resp.json()
            except ValueError:
                # A non-JSON body means the node is not answering like a
                # healthy etcd member; report it rather than crash the check.
                data = None
            if isinstance(data, dict) and data.get("health") == "true":
                healthy_count += 1
            else:
                errors.append(f"{node} unhealthy")
        else:
            errors.append(f"{node} health endpoint unreachable")

        # 3. Leader check from /v3/maintenance/status (first match wins)
        if leader is None and _etcd_node_is_leader(node):
            leader = node

    ping_ms = int((time.monotonic() - started) * 1000)
    total = len(nodes)

    if healthy_count == total:
        leader_info = f" | leader: {leader}" if leader else ""
        msg = f"{healthy_count}/{total} healthy{leader_info}"
        push("ETCD-CLUSTER", "up", msg, ping_ms)
    else:
        # Reaching this branch means at least one node is not healthy, so the
        # quorum warning is always appended.
        quorum_msg = f" | quorum at risk ({healthy_count}/{total})"
        msg = " | ".join(errors) + quorum_msg
        push("ETCD-CLUSTER", "down", msg, ping_ms)
|