feat(health-agent): add monitors.yml with env-aware node IP mapping from Ansible inventory

This commit is contained in:
Murat ÖZDEMİR 2026-06-25 18:59:14 +03:00
parent a2e8997711
commit f742bfdd11
19 changed files with 1131 additions and 0 deletions

10
health-agent/.env.example Normal file
View File

@ -0,0 +1,10 @@
ENV=prod
CLUSTER_SIZE_ETCD=3
CLUSTER_SIZE_PATRONI=3
CLUSTER_SIZE_MONGODB=3
CLUSTER_SIZE_RABBITMQ=3
CLUSTER_SIZE_VAULT=3
REDIS_MODE=sentinel
EXTERNAL_DOMAIN=iklim.co
EXTERNAL_SUBDOMAIN_SUFFIX=
UK_PUSH_URL_BASE=https://status.iklim.co/api/push

View File

@ -0,0 +1,5 @@
UK_URL=http://uptime-kuma:3001
UK_API_KEY=your_api_key_here
UK_SLACK_WEBHOOK_HIGH=https://hooks.slack.com/services/...
UK_SLACK_WEBHOOK_MEDIUM=https://hooks.slack.com/services/...
UK_SLACK_WEBHOOK_LOW=https://hooks.slack.com/services/...

16
health-agent/Dockerfile Normal file
View File

@ -0,0 +1,16 @@
FROM python:3.12-slim
WORKDIR /app
RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
COPY pyproject.toml ./
COPY src/ ./src/
RUN pip install --no-cache-dir .
ENV PYTHONPATH=/app/src
RUN useradd -m appuser
# Keeping as root to be able to access /var/run/docker.sock cleanly, unless specifically configured with groups.
# USER appuser
CMD ["python", "src/health_agent/main.py"]

View File

@ -0,0 +1,196 @@
version: "1"
project: "iklim"
domain:
base: "iklim.co"
nodes:
prod:
service:
- name: iklim-app-01
ip: "178.104.210.41"
- name: iklim-app-02
ip: "178.105.69.1"
- name: iklim-app-03
ip: "178.104.219.3"
db:
- name: iklim-db-01
ip: "159.69.117.158"
- name: iklim-db-02
ip: "178.104.219.162"
- name: iklim-db-03
ip: "159.69.115.105"
test:
service:
- name: iklim-app-01
ip: "167.235.194.61"
db:
- name: iklim-db-01
ip: "167.235.205.93"
tags:
- external
- internal
- high
- medium
- low
- database
- gateway
- infrastructure
- observability
notifications:
slack-high:
type: slack
webhook_env: UK_SLACK_WEBHOOK_HIGH
slack-medium:
type: slack
webhook_env: UK_SLACK_WEBHOOK_MEDIUM
slack-low:
type: slack
webhook_env: UK_SLACK_WEBHOOK_LOW
groups:
- name: "Altyapı"
status_page: "iklim-{env}-ops"
notifications: [slack-high]
tags: [internal, infrastructure]
children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS]
- name: "Veri Katmanı"
status_page: "iklim-{env}-ops"
notifications: [slack-high]
tags: [internal, database]
children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET]
- name: "Gateway & Mesajlaşma"
status_page: "iklim-{env}-ops"
notifications: [slack-high]
tags: [internal, gateway]
children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL]
- name: "Dış Erişilebilirlik - Kritik"
status_page: "iklim-{env}-ops"
notifications: [slack-high]
tags: [external, high]
children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03]
- name: "Dış Erişilebilirlik - Genel"
status_page: "iklim-{env}-ops"
notifications: [slack-medium]
tags: [external, medium]
children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03]
- name: "Gözlemlenebilirlik"
status_page: "iklim-{env}-tools"
notifications: [slack-low]
tags: [internal, observability]
children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW]
push_monitors:
- name: SWARM-CLUSTER
interval: 60
heartbeat_retries: 1
tags: [internal, infrastructure, high]
restart_threshold: 1
- name: VAULT-CLUSTER
interval: 60
heartbeat_retries: 1
tags: [internal, infrastructure, high]
restart_threshold: 1
- name: ETCD-CLUSTER
interval: 60
heartbeat_retries: 1
tags: [internal, database, high]
restart_threshold: 1
- name: PATRONI-CLUSTER
interval: 60
heartbeat_retries: 1
tags: [internal, database, high]
restart_threshold: 1
- name: MONGODB-REPLICASET
interval: 120
heartbeat_retries: 1
tags: [internal, database, high]
restart_threshold: 1
- name: APISIX-GATEWAY
interval: 60
heartbeat_retries: 1
tags: [internal, gateway, high]
restart_threshold: 1
- name: RABBITMQ-CLUSTER
interval: 60
heartbeat_retries: 1
tags: [internal, gateway, medium]
restart_threshold: 3
- name: REDIS-SENTINEL
interval: 60
heartbeat_retries: 1
tags: [internal, database, medium]
restart_threshold: 3
- name: SWAG-TLS
interval: 3600
heartbeat_retries: 1
tags: [internal, infrastructure, medium]
restart_threshold: 3
- name: STORAGEBOX-MOUNT
interval: 300
heartbeat_retries: 1
tags: [internal, infrastructure, medium]
restart_threshold: 1
- name: PROMETHEUS
interval: 120
heartbeat_retries: 1
tags: [internal, observability, low]
restart_threshold: 5
- name: GRAFANA
interval: 120
heartbeat_retries: 1
tags: [internal, observability, low]
restart_threshold: 5
- name: PORTAINER
interval: 120
heartbeat_retries: 1
tags: [internal, observability, low]
restart_threshold: 5
- name: LOKI
interval: 120
heartbeat_retries: 1
tags: [internal, observability, low]
restart_threshold: 5
http_monitors:
- name: EXT-HTTPS-API
url: "https://api{suffix}.{domain}/actuator/health"
accepted_statuscodes: ["200"]
interval: 60
- name: EXT-HTTPS-GRAFANA
url: "https://grafana{suffix}.{domain}/api/health"
accepted_statuscodes: ["200"]
interval: 60
- name: EXT-HTTPS-PORTAINER
url: "https://portainer{suffix}.{domain}"
accepted_statuscodes: ["200", "401", "403"]
interval: 120
- name: EXT-HTTPS-APIGW
url: "https://apigw{suffix}.{domain}"
accepted_statuscodes: ["200", "401", "403"]
interval: 120
dns_monitors:
- name: EXT-DNS-API
hostname: "api{suffix}.{domain}"
dns_resolve_type: A
interval: 60
- name: EXT-DNS-ROOT
hostname: "{domain}"
dns_resolve_type: A
interval: 60
ping_monitors:
interval: 60
max_retries: 1
status_pages:
- slug: "iklim-{env}-status"
title: "iklim.co API Durumu"
public: true
groups: ["Dış Erişilebilirlik - Kritik"]
- slug: "iklim-{env}-ops"
title: "iklim.co [{env}] Altyapı"
public: false
groups:
- "Altyapı"
- "Veri Katmanı"
- "Gateway & Mesajlaşma"
- "Dış Erişilebilirlik - Kritik"
- "Dış Erişilebilirlik - Genel"
- slug: "iklim-{env}-tools"
title: "iklim.co [{env}] Araçlar"
public: false
groups: ["Gözlemlenebilirlik"]

View File

@ -0,0 +1,22 @@
[project]
name = "health-agent"
version = "0.1.0"
description = "iklim.co Monitoring Health Agent"
requires-python = ">=3.12"
dependencies = [
"requests",
"docker",
"python-dotenv",
"pyyaml",
"redis",
"pymongo",
"uptime-kuma-api",
"cryptography",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src/health_agent"]

View File

@ -0,0 +1,138 @@
import os
import argparse
import yaml
import logging
from uptime_kuma_api import UptimeKumaApi, MonitorType
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("uk-setup")
def format_str(text, env_name, project):
if not isinstance(text, str):
return text
return text.replace("{env}", env_name).replace("{project}", project)
def setup_uptime_kuma(dry_run=False, only=None):
env_name = os.getenv("ENV", "test")
config_path = os.path.join(os.path.dirname(__file__), "..", "config", "monitors.yml")
with open(config_path, "r") as f:
config = yaml.safe_load(f)
project = config.get("project", "iklim")
kuma_url = os.getenv("UK_URL", "http://localhost:3001")
kuma_user = os.getenv("UK_USER", "admin")
kuma_pass = os.getenv("UK_PASS", "admin")
api = None
if not dry_run:
logger.info(f"Connecting to Uptime Kuma at {kuma_url}...")
try:
api = UptimeKumaApi(kuma_url)
api.login(kuma_user, kuma_pass)
except Exception as e:
logger.error(f"Login failed: {e}")
return
existing_monitors = {}
if api:
try:
for m in api.get_monitors():
existing_monitors[m['name']] = m
except Exception as e:
logger.error(f"Failed to get monitors: {e}")
# 1. Process Groups
group_map = {}
for g in config.get("groups", []):
raw_name = g["name"]
formatted_name = f"{project} [{env_name}] {raw_name}"
logger.info(f"Processing group: {formatted_name}")
if not dry_run:
if formatted_name not in existing_monitors:
logger.info(f"Creating group monitor: {formatted_name}")
res = api.add_monitor(type=MonitorType.GROUP, name=formatted_name)
group_map[raw_name] = res['monitorID']
else:
group_map[raw_name] = existing_monitors[formatted_name]['id']
tokens = {}
# 2. Push Monitors
for pm in config.get("push_monitors", []):
m_name = pm["name"]
if only and m_name != only:
continue
m_interval = pm.get("interval", 60)
parent_group_id = None
for g in config.get("groups", []):
if m_name in g.get("children", []):
parent_group_id = group_map.get(g["name"])
break
logger.info(f"Processing push monitor: {m_name}")
if not dry_run:
if m_name in existing_monitors:
logger.info(f"Monitor {m_name} already exists.")
m_id = existing_monitors[m_name]['id']
token = existing_monitors[m_name]['pushToken']
tokens[m_name] = token
if parent_group_id and existing_monitors[m_name].get('parent') != parent_group_id:
api.edit_monitor(m_id, parent=parent_group_id)
else:
logger.info(f"Creating push monitor: {m_name}")
result = api.add_monitor(
type=MonitorType.PUSH,
name=m_name,
interval=m_interval,
parent=parent_group_id
)
m_id = result['monitorID']
# Fetch again to get pushToken
for m in api.get_monitors():
if m['id'] == m_id:
tokens[m_name] = m['pushToken']
break
else:
tokens[m_name] = "dummy_token_dry_run"
# 3. Process Status Pages
for sp in config.get("status_pages", []):
slug = format_str(sp["slug"], env_name, project)
title = format_str(sp["title"], env_name, project)
logger.info(f"Processing status page: {title} (slug: {slug})")
if not dry_run:
try:
pages = api.get_status_pages()
exists = any(p['slug'] == slug for p in pages)
if not exists:
logger.info(f"Creating status page: {slug}")
api.add_status_page(slug, title)
except Exception as e:
logger.warning(f"Status page ops failed: {e}")
# 4. Write tokens to uk_tokens.yml
token_file = os.path.join(os.path.dirname(__file__), "..", "config", "uk_tokens.yml")
if not dry_run:
with open(token_file, "w") as f:
yaml.dump(tokens, f)
logger.info(f"Saved push tokens to {token_file}")
else:
logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}")
if api:
api.disconnect()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Setup Uptime Kuma monitors")
parser.add_argument("--dry-run", action="store_true", help="Print actions without making changes")
parser.add_argument("--only", type=str, help="Only process a specific monitor by name")
args = parser.parse_args()
setup_uptime_kuma(dry_run=args.dry_run, only=args.only)

View File

@ -0,0 +1,36 @@
import os
import time
import logging
from health_agent.uptime_kuma import push
logger = logging.getLogger(__name__)
def check_storagebox_mount():
start_t = time.time()
storagebox_path = os.getenv("STORAGEBOX_PATH", "/mnt/storagebox")
expected_files = [
"patroni/patroni.yml",
"ssl/STAR.iklim.co.full.crt"
]
missing_files = []
if not os.path.exists(storagebox_path):
ping_ms = int((time.time() - start_t) * 1000)
push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms)
return
for rel_path in expected_files:
full_path = os.path.join(storagebox_path, rel_path)
if not os.path.exists(full_path):
missing_files.append(rel_path)
ping_ms = int((time.time() - start_t) * 1000)
if missing_files:
msg = f"mount exists but missing: {', '.join(missing_files)}"
push("STORAGEBOX-MOUNT", "down", msg, ping_ms)
else:
msg = f"{storagebox_path} OK | all critical files present"
push("STORAGEBOX-MOUNT", "up", msg, ping_ms)

View File

@ -0,0 +1,196 @@
import os
import time
import logging
import requests
from requests.auth import HTTPBasicAuth
from health_agent.uptime_kuma import push
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logger = logging.getLogger(__name__)
def http_check(url, expected_status=None, auth=None, verify_ssl=True, timeout=5, headers=None):
start_time = time.time()
try:
resp = requests.get(url, auth=auth, verify=verify_ssl, timeout=timeout, headers=headers)
ping_ms = int((time.time() - start_time) * 1000)
if expected_status:
if isinstance(expected_status, list):
is_ok = resp.status_code in expected_status
else:
is_ok = resp.status_code == expected_status
else:
is_ok = resp.status_code < 400
return is_ok, resp, ping_ms, None
except Exception as e:
ping_ms = int((time.time() - start_time) * 1000)
return False, None, ping_ms, str(e)
def check_patroni_cluster():
nodes = ["patroni-01", "patroni-02", "patroni-03"]
cluster_data = None
error_msg = "All Patroni nodes unreachable"
start_t = time.time()
for node in nodes:
url = f"http://{node}:8008/cluster"
ok, resp, _, err = http_check(url, timeout=3)
if ok and resp:
cluster_data = resp.json()
break
elif err:
error_msg = f"{node} error: {err}"
ping_ms = int((time.time() - start_t) * 1000)
if not cluster_data:
push("PATRONI-CLUSTER", "down", error_msg, ping_ms)
return
members = cluster_data.get("members", [])
leader = None
replicas = []
for m in members:
if m.get("role") == "leader":
leader = m.get("name")
else:
lag = m.get("lag", 0)
name = m.get("name")
state = m.get("state")
replicas.append((name, lag, state))
if not leader:
down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
msg = f"no leader detected | " + " ".join(down_nodes)
push("PATRONI-CLUSTER", "down", msg, ping_ms)
else:
lag_strs = []
for name, lag, state in replicas:
lag_mb = lag / (1024*1024) if isinstance(lag, (int, float)) else 0
lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
push("PATRONI-CLUSTER", "up", msg, ping_ms)
def check_rabbitmq_cluster():
url = "http://rabbitmq:15672/api/healthchecks/node"
user = os.getenv("RABBITMQ_USER", "guest")
password = os.getenv("RABBITMQ_PASS", "guest")
auth = HTTPBasicAuth(user, password)
ok, resp, ping_ms, err = http_check(url, auth=auth)
if ok:
ok2, resp2, _, _ = http_check("http://rabbitmq:15672/api/nodes", auth=auth)
nodes_running = 0
total_nodes = 3
if ok2 and resp2:
data = resp2.json()
nodes_running = len([n for n in data if n.get("running")])
total_nodes = len(data)
alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")]
if alarms:
msg = f"disk/mem alarm active on {','.join(alarms)}"
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
return
msg = f"{nodes_running}/{total_nodes} nodes running"
push("RABBITMQ-CLUSTER", "up", msg, ping_ms)
else:
msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}"
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
def check_apisix():
url = "http://apisix:9180/apisix/admin/routes"
api_key = os.getenv("APISIX_ADMIN_KEY", "")
headers = {"X-API-KEY": api_key} if api_key else {}
ok, resp, ping_ms, err = http_check(url, headers=headers)
if ok:
push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms)
else:
push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
def check_vault():
nodes = ["vault-1", "vault-2", "vault-3"]
domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
unsealed_count = 0
total = len(nodes)
max_ping = 0
errors = []
start_t = time.time()
for node in nodes:
url = f"https://{node}.{domain}:8200/v1/sys/health"
ok, resp, ms, err = http_check(url, verify_ssl=False, expected_status=[200, 429, 473])
max_ping = max(max_ping, ms)
if resp:
data = resp.json()
if not data.get("sealed"):
unsealed_count += 1
else:
errors.append(f"{node} SEALED")
else:
errors.append(f"{node} unreachable")
ping_ms = int((time.time() - start_t) * 1000)
if unsealed_count == total:
msg = f"{unsealed_count}/{total} unsealed"
push("VAULT-CLUSTER", "up", msg, ping_ms)
else:
msg = " | ".join(errors) if errors else "Vault checks failed"
push("VAULT-CLUSTER", "down", msg, ping_ms)
def check_prometheus():
url = "http://prometheus:9090/-/healthy"
ok, resp, ping_ms, err = http_check(url)
if ok:
push("PROMETHEUS", "up", "healthy", ping_ms)
else:
push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms)
def check_grafana():
url = "http://grafana:3000/api/health"
ok, resp, ping_ms, err = http_check(url)
if ok and resp:
data = resp.json()
db_status = data.get("database", "unknown")
if db_status == "ok":
push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms)
else:
push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms)
else:
push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms)
def check_portainer():
url = "http://portainer:9000/api/system/status"
ok, resp, ping_ms, err = http_check(url)
if ok:
push("PORTAINER", "up", "running", ping_ms)
else:
push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms)
def check_loki():
url = "http://loki:3100/ready"
ok, resp, ping_ms, err = http_check(url)
if ok:
push("LOKI", "up", "ready", ping_ms)
else:
push("LOKI", "down", f"loki unreachable: {err}", ping_ms)
def run_all_http_checks():
check_patroni_cluster()
check_rabbitmq_cluster()
check_apisix()
check_vault()
check_prometheus()
check_grafana()
check_portainer()
check_loki()

View File

@ -0,0 +1,57 @@
import os
import time
import logging
from pymongo import MongoClient
from health_agent.uptime_kuma import push
logger = logging.getLogger(__name__)
def check_mongodb():
start_t = time.time()
mongo_uri = os.getenv("MONGO_URI", "mongodb://mongodb-01:27017,mongodb-02:27017,mongodb-03:27017/?replicaSet=rs0")
cluster_size = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
try:
with MongoClient(mongo_uri, serverSelectionTimeoutMS=3000) as client:
status = client.admin.command('replSetGetStatus')
members = status.get('members', [])
primary = None
secondaries = []
for m in members:
state_str = m.get('stateStr', '')
name = m.get('name', 'unknown')
if state_str == 'PRIMARY':
primary = name
elif state_str == 'SECONDARY':
secondaries.append((name, state_str))
else:
secondaries.append((name, state_str))
ping_ms = int((time.time() - start_t) * 1000)
if cluster_size == 1:
push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
return
if primary:
sec_strs = [f"{s[0]} ({s[1]})" for s in secondaries]
msg = f"PRIMARY: {primary} | secondaries: {' '.join(sec_strs)}"
unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
if unhealthy_secs:
msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
push("MONGODB-REPLICASET", "down", msg, ping_ms)
else:
push("MONGODB-REPLICASET", "up", msg, ping_ms)
else:
msg = "no PRIMARY | quorum lost"
push("MONGODB-REPLICASET", "down", msg, ping_ms)
except Exception as e:
ping_ms = int((time.time() - start_t) * 1000)
push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)

View File

@ -0,0 +1,43 @@
import os
import time
import logging
from redis.sentinel import Sentinel
from health_agent.uptime_kuma import push
logger = logging.getLogger(__name__)
def check_redis_sentinel():
start_t = time.time()
hosts = os.getenv("REDIS_SENTINEL_HOSTS", "redis-sentinel-01,redis-sentinel-02,redis-sentinel-03")
sentinel_nodes = [(h.strip(), 26379) for h in hosts.split(",")]
master_name = os.getenv("REDIS_MASTER_NAME", "prod-master")
password = os.getenv("REDIS_PASSWORD", None)
redis_mode = os.getenv("REDIS_MODE", "sentinel")
if redis_mode != "sentinel":
push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
return
try:
sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password)
# Master ping
master = sentinel.master_for(master_name, socket_timeout=3, password=password)
master.ping()
master_ip, master_port = sentinel.discover_master(master_name)
master.connection_pool.disconnect()
# Get replicas count
slaves = sentinel.discover_slaves(master_name)
replicas_count = len(slaves)
ping_ms = int((time.time() - start_t) * 1000)
msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
push("REDIS-SENTINEL", "up", msg, ping_ms)
except Exception as e:
ping_ms = int((time.time() - start_t) * 1000)
push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)

View File

@ -0,0 +1,49 @@
import time
import docker
import logging
from health_agent.uptime_kuma import push
logger = logging.getLogger(__name__)
def check_swarm_cluster():
start_time = time.time()
try:
client = docker.from_env()
nodes = client.nodes.list()
ready_nodes = []
managers = []
for node in nodes:
spec = node.attrs.get('Spec', {})
status = node.attrs.get('Status', {})
manager_status = node.attrs.get('ManagerStatus', {})
node_name = spec.get('Name', node.id)
is_ready = status.get('State') == 'ready'
is_manager = spec.get('Role') == 'manager'
if is_ready:
ready_nodes.append(node_name)
if is_manager:
reachability = manager_status.get('Reachability')
if reachability == 'reachable':
managers.append(node_name)
total_nodes = len(nodes)
ready_count = len(ready_nodes)
ping_ms = int((time.time() - start_time) * 1000)
if ready_count == total_nodes:
msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
push("SWARM-CLUSTER", "up", msg, ping_ms)
else:
msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
push("SWARM-CLUSTER", "down", msg, ping_ms)
except Exception as e:
ping_ms = int((time.time() - start_time) * 1000)
logger.error(f"Swarm check failed: {e}")
push("SWARM-CLUSTER", "down", str(e), ping_ms)

View File

@ -0,0 +1,77 @@
import socket
import time
import logging
import requests
from health_agent.uptime_kuma import push
from health_agent.checks.http import http_check
logger = logging.getLogger(__name__)
def tcp_check(host, port, timeout=3):
start_time = time.time()
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
result = sock.connect_ex((host, port))
sock.close()
ping_ms = int((time.time() - start_time) * 1000)
if result == 0:
return True, ping_ms, None
else:
return False, ping_ms, f"Port {port} is closed or unreachable"
except Exception as e:
ping_ms = int((time.time() - start_time) * 1000)
return False, ping_ms, str(e)
def check_etcd_cluster():
nodes = ["etcd-01", "etcd-02", "etcd-03"]
start_t = time.time()
healthy_count = 0
leader = None
errors = []
for node in nodes:
# 1. TCP Check on 2379
tcp_ok, ms, tcp_err = tcp_check(node, 2379)
if not tcp_ok:
errors.append(f"{node} port 2379 unreachable")
continue
# 2. HTTP Health check
url = f"http://{node}:2379/health"
http_ok, resp, ms, http_err = http_check(url, timeout=3)
if http_ok and resp:
data = resp.json()
if data.get("health") == "true":
healthy_count += 1
else:
errors.append(f"{node} unhealthy")
else:
errors.append(f"{node} health endpoint unreachable")
# 3. Leader check from /v3/maintenance/status
if not leader and tcp_ok:
status_url = f"http://{node}:2379/v3/maintenance/status"
try:
r = requests.post(status_url, json={}, timeout=3)
if r.status_code == 200:
status_data = r.json()
leader_id = status_data.get("leader")
header_member_id = status_data.get("header", {}).get("member_id")
if leader_id and leader_id == header_member_id:
leader = node
except Exception:
pass
ping_ms = int((time.time() - start_t) * 1000)
if healthy_count == len(nodes):
leader_info = f" | leader: {leader}" if leader else ""
msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
push("ETCD-CLUSTER", "up", msg, ping_ms)
else:
quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else ""
msg = " | ".join(errors) + quorum_msg
push("ETCD-CLUSTER", "down", msg, ping_ms)

View File

@ -0,0 +1,62 @@
import os
import time
import logging
import requests
from datetime import datetime, timezone
from health_agent.uptime_kuma import push
from cryptography import x509
from cryptography.hazmat.backends import default_backend
logger = logging.getLogger(__name__)
def check_swag_tls():
start_t = time.time()
cert_path = "/mnt/storagebox/ssl/STAR.iklim.co.full.crt"
domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
suffix = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
target_url = f"https://api{suffix}.{domain}/actuator/health"
msg_parts = []
is_down = False
# 1. Check cert file
if not os.path.exists(cert_path):
is_down = True
msg_parts.append("cert file missing on storagebox")
else:
try:
with open(cert_path, "rb") as f:
cert_data = f.read()
cert = x509.load_pem_x509_certificate(cert_data, default_backend())
not_valid_after = cert.not_valid_after_utc
now = datetime.now(timezone.utc)
days_left = (not_valid_after - now).days
if days_left < 14:
is_down = True
msg_parts.append(f"cert expires in {days_left} days")
else:
msg_parts.append(f"cert valid until {not_valid_after.strftime('%Y-%m-%d')} ({days_left} days)")
except Exception as e:
is_down = True
msg_parts.append(f"cert parse error: {e}")
# 2. Check external HTTPS reachable
try:
r = requests.get(target_url, timeout=5, verify=False)
if r.status_code < 500:
msg_parts.append("HTTPS reachable")
else:
is_down = True
msg_parts.append(f"HTTPS returned {r.status_code}")
except Exception as e:
is_down = True
msg_parts.append(f"HTTPS unreachable")
ping_ms = int((time.time() - start_t) * 1000)
msg = " | ".join(msg_parts)
if is_down:
push("SWAG-TLS", "down", msg, ping_ms)
else:
push("SWAG-TLS", "up", msg, ping_ms)

View File

@ -0,0 +1,25 @@
import os
import yaml
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
ENV = os.getenv("ENV", "prod")
CLUSTER_SIZE_ETCD = int(os.getenv("CLUSTER_SIZE_ETCD", "3"))
CLUSTER_SIZE_PATRONI = int(os.getenv("CLUSTER_SIZE_PATRONI", "3"))
CLUSTER_SIZE_MONGODB = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
CLUSTER_SIZE_RABBITMQ = int(os.getenv("CLUSTER_SIZE_RABBITMQ", "3"))
CLUSTER_SIZE_VAULT = int(os.getenv("CLUSTER_SIZE_VAULT", "3"))
REDIS_MODE = os.getenv("REDIS_MODE", "sentinel")
EXTERNAL_DOMAIN = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
EXTERNAL_SUBDOMAIN_SUFFIX = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
def load_uk_tokens():
token_file = Path("config/generated/uk_tokens.yml")
if not token_file.exists():
return {}
with open(token_file, "r") as f:
return yaml.safe_load(f) or {}
UK_TOKENS = load_uk_tokens()

View File

@ -0,0 +1,56 @@
import os
import docker
import threading
import logging
import time
from health_agent.slack import notify
logger = logging.getLogger(__name__)
def parse_and_notify(event):
attrs = event.get('Actor', {}).get('Attributes', {})
container_name = attrs.get('name', 'unknown')
exit_code = attrs.get('exitCode', '0')
if exit_code == '0':
return
is_oom = (exit_code == '137')
env = os.getenv("ENV", "test").upper()
webhook_env_name = f"SLACK_WEBHOOK_IKLIM_{env}_OPS"
priority = "High" if is_oom else "Medium"
title = f"[Health Agent / Events] Container Crashed ({container_name})"
detail = f"Container: {container_name}\nExit Code: {exit_code}"
if is_oom:
detail += "\nReason: OOM Killed (Out Of Memory) or SIGKILL"
notify(
webhook_env=webhook_env_name,
source="health-agent-events",
priority=priority,
title=title,
detail=detail
)
def event_listener_loop():
while True:
try:
client = docker.from_env()
logger.info("Starting Docker event listener...")
filters = {"type": "container", "event": "die"}
for event in client.events(decode=True, filters=filters):
try:
parse_and_notify(event)
except Exception as e:
logger.error(f"Error parsing event: {e}", exc_info=True)
except Exception as e:
logger.error(f"Docker event listener error: {e}. Reconnecting in 10s...", exc_info=True)
time.sleep(10)
def start_docker_event_listener():
thread = threading.Thread(target=event_listener_loop, daemon=True)
thread.start()
return thread

View File

@ -0,0 +1,75 @@
import time
import logging
from health_agent.checks import swarm
from health_agent.checks.http import run_all_http_checks
from health_agent.checks.tcp import check_etcd_cluster
from health_agent.checks.tls import check_swag_tls
from health_agent.checks.redis_sentinel import check_redis_sentinel
from health_agent.checks.mongodb import check_mongodb
from health_agent.checks.filesystem import check_storagebox_mount
from health_agent.events.docker_events import start_docker_event_listener
import json
class JSONFormatter(logging.Formatter):
def format(self, record):
log_obj = {
"time": self.formatTime(record, self.datefmt),
"level": record.levelname,
"logger": record.name,
"msg": record.getMessage()
}
for attr in ['check', 'status', 'ping_ms', 'source', 'error']:
if hasattr(record, attr):
log_obj[attr] = getattr(record, attr)
if record.exc_info:
log_obj['exc_info'] = self.formatException(record.exc_info)
return json.dumps(log_obj)
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logging.basicConfig(level=logging.INFO, handlers=[handler])
logger = logging.getLogger("main")
def run_checks():
logger.info("Running health checks...")
try:
swarm.check_swarm_cluster()
except Exception as e:
logger.error(f"Error checking Swarm cluster: {e}")
try:
run_all_http_checks()
except Exception as e:
logger.error(f"Error running HTTP checks: {e}")
try:
check_etcd_cluster()
except Exception as e:
logger.error(f"Error running etcd checks: {e}")
try:
check_swag_tls()
except Exception as e:
logger.error(f"Error running TLS checks: {e}")
try:
check_redis_sentinel()
except Exception as e:
logger.error(f"Error running Redis checks: {e}")
try:
check_mongodb()
except Exception as e:
logger.error(f"Error running MongoDB checks: {e}")
try:
check_storagebox_mount()
except Exception as e:
logger.error(f"Error running filesystem checks: {e}")
if __name__ == "__main__":
logger.info("Starting health-agent...")
start_docker_event_listener()
while True:
run_checks()
time.sleep(60)

View File

@ -0,0 +1,22 @@
import os
import requests
import logging
logger = logging.getLogger(__name__)
def notify(webhook_env: str, source: str, priority: str, title: str, detail: str):
webhook_url = os.getenv(webhook_env)
if not webhook_url:
logger.warning(f"Slack webhook url not found for {webhook_env}")
return
payload = {
"text": f"*{title}*\n*Source:* {source}\n*Priority:* {priority}\n```\n{detail}\n```"
}
try:
response = requests.post(webhook_url, json=payload, timeout=5)
response.raise_for_status()
logger.info(f"Sent Slack notification to {webhook_env}")
except Exception as e:
logger.error(f"Failed to send Slack notification: {e}")

View File

@ -0,0 +1,19 @@
import json
import os
from pathlib import Path
STATE_FILE = Path("config/generated/state.json")
def load_state():
if not STATE_FILE.exists():
return {}
try:
with open(STATE_FILE, "r") as f:
return json.load(f)
except Exception:
return {}
def save_state(state):
STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(STATE_FILE, "w") as f:
json.dump(state, f)

View File

@ -0,0 +1,27 @@
import os
import requests
import logging
from health_agent.config import UK_TOKENS
logger = logging.getLogger(__name__)
UK_PUSH_URL_BASE = os.getenv("UK_PUSH_URL_BASE", "https://status.iklim.co/api/push")
def push(monitor_name: str, status: str, msg: str, ping_ms: int):
token = UK_TOKENS.get(monitor_name)
if not token:
logger.warning(f"No token found for monitor {monitor_name}")
return
url = f"{UK_PUSH_URL_BASE}/{token}"
params = {
"status": status,
"msg": msg,
"ping": int(ping_ms)
}
try:
response = requests.get(url, params=params, timeout=10)
response.raise_for_status()
logger.info(f"Pushed {monitor_name} status={status}", extra={"check": monitor_name, "status": status, "msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"})
except Exception as e:
logger.error(f"Failed to push {monitor_name}: {e}", extra={"check": monitor_name, "status": "push_failed", "error": str(e), "source": "uptime_kuma"})