feat(health-agent): add monitors.yml with env-aware node IP mapping from Ansible inventory
This commit is contained in:
parent
a2e8997711
commit
f742bfdd11
10
health-agent/.env.example
Normal file
10
health-agent/.env.example
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
ENV=prod
|
||||||
|
CLUSTER_SIZE_ETCD=3
|
||||||
|
CLUSTER_SIZE_PATRONI=3
|
||||||
|
CLUSTER_SIZE_MONGODB=3
|
||||||
|
CLUSTER_SIZE_RABBITMQ=3
|
||||||
|
CLUSTER_SIZE_VAULT=3
|
||||||
|
REDIS_MODE=sentinel
|
||||||
|
EXTERNAL_DOMAIN=iklim.co
|
||||||
|
EXTERNAL_SUBDOMAIN_SUFFIX=
|
||||||
|
UK_PUSH_URL_BASE=https://status.iklim.co/api/push
|
||||||
5
health-agent/.env.setup.example
Normal file
5
health-agent/.env.setup.example
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
UK_URL=http://uptime-kuma:3001
|
||||||
|
UK_API_KEY=your_api_key_here
|
||||||
|
UK_SLACK_WEBHOOK_HIGH=https://hooks.slack.com/services/...
|
||||||
|
UK_SLACK_WEBHOOK_MEDIUM=https://hooks.slack.com/services/...
|
||||||
|
UK_SLACK_WEBHOOK_LOW=https://hooks.slack.com/services/...
|
||||||
16
health-agent/Dockerfile
Normal file
16
health-agent/Dockerfile
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
FROM python:3.12-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
COPY pyproject.toml ./
|
||||||
|
COPY src/ ./src/
|
||||||
|
RUN pip install --no-cache-dir .
|
||||||
|
ENV PYTHONPATH=/app/src
|
||||||
|
|
||||||
|
RUN useradd -m appuser
|
||||||
|
# The container runs as root so the agent can read /var/run/docker.sock. To drop privileges, add appuser to the group that owns the socket and enable the USER line below.
|
||||||
|
# USER appuser
|
||||||
|
|
||||||
|
CMD ["python", "src/health_agent/main.py"]
|
||||||
196
health-agent/config/monitors.yml
Normal file
196
health-agent/config/monitors.yml
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
version: "1"
|
||||||
|
project: "iklim"
|
||||||
|
domain:
|
||||||
|
base: "iklim.co"
|
||||||
|
nodes:
|
||||||
|
prod:
|
||||||
|
service:
|
||||||
|
- name: iklim-app-01
|
||||||
|
ip: "178.104.210.41"
|
||||||
|
- name: iklim-app-02
|
||||||
|
ip: "178.105.69.1"
|
||||||
|
- name: iklim-app-03
|
||||||
|
ip: "178.104.219.3"
|
||||||
|
db:
|
||||||
|
- name: iklim-db-01
|
||||||
|
ip: "159.69.117.158"
|
||||||
|
- name: iklim-db-02
|
||||||
|
ip: "178.104.219.162"
|
||||||
|
- name: iklim-db-03
|
||||||
|
ip: "159.69.115.105"
|
||||||
|
test:
|
||||||
|
service:
|
||||||
|
- name: iklim-app-01
|
||||||
|
ip: "167.235.194.61"
|
||||||
|
db:
|
||||||
|
- name: iklim-db-01
|
||||||
|
ip: "167.235.205.93"
|
||||||
|
tags:
|
||||||
|
- external
|
||||||
|
- internal
|
||||||
|
- high
|
||||||
|
- medium
|
||||||
|
- low
|
||||||
|
- database
|
||||||
|
- gateway
|
||||||
|
- infrastructure
|
||||||
|
- observability
|
||||||
|
notifications:
|
||||||
|
slack-high:
|
||||||
|
type: slack
|
||||||
|
webhook_env: UK_SLACK_WEBHOOK_HIGH
|
||||||
|
slack-medium:
|
||||||
|
type: slack
|
||||||
|
webhook_env: UK_SLACK_WEBHOOK_MEDIUM
|
||||||
|
slack-low:
|
||||||
|
type: slack
|
||||||
|
webhook_env: UK_SLACK_WEBHOOK_LOW
|
||||||
|
groups:
|
||||||
|
- name: "Altyapı"
|
||||||
|
status_page: "iklim-{env}-ops"
|
||||||
|
notifications: [slack-high]
|
||||||
|
tags: [internal, infrastructure]
|
||||||
|
children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS]
|
||||||
|
- name: "Veri Katmanı"
|
||||||
|
status_page: "iklim-{env}-ops"
|
||||||
|
notifications: [slack-high]
|
||||||
|
tags: [internal, database]
|
||||||
|
children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET]
|
||||||
|
- name: "Gateway & Mesajlaşma"
|
||||||
|
status_page: "iklim-{env}-ops"
|
||||||
|
notifications: [slack-high]
|
||||||
|
tags: [internal, gateway]
|
||||||
|
children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL]
|
||||||
|
- name: "Dış Erişilebilirlik - Kritik"
|
||||||
|
status_page: "iklim-{env}-ops"
|
||||||
|
notifications: [slack-high]
|
||||||
|
tags: [external, high]
|
||||||
|
children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03]
|
||||||
|
- name: "Dış Erişilebilirlik - Genel"
|
||||||
|
status_page: "iklim-{env}-ops"
|
||||||
|
notifications: [slack-medium]
|
||||||
|
tags: [external, medium]
|
||||||
|
children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03]
|
||||||
|
- name: "Gözlemlenebilirlik"
|
||||||
|
status_page: "iklim-{env}-tools"
|
||||||
|
notifications: [slack-low]
|
||||||
|
tags: [internal, observability]
|
||||||
|
children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW]
|
||||||
|
push_monitors:
|
||||||
|
- name: SWARM-CLUSTER
|
||||||
|
interval: 60
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, infrastructure, high]
|
||||||
|
restart_threshold: 1
|
||||||
|
- name: VAULT-CLUSTER
|
||||||
|
interval: 60
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, infrastructure, high]
|
||||||
|
restart_threshold: 1
|
||||||
|
- name: ETCD-CLUSTER
|
||||||
|
interval: 60
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, database, high]
|
||||||
|
restart_threshold: 1
|
||||||
|
- name: PATRONI-CLUSTER
|
||||||
|
interval: 60
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, database, high]
|
||||||
|
restart_threshold: 1
|
||||||
|
- name: MONGODB-REPLICASET
|
||||||
|
interval: 120
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, database, high]
|
||||||
|
restart_threshold: 1
|
||||||
|
- name: APISIX-GATEWAY
|
||||||
|
interval: 60
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, gateway, high]
|
||||||
|
restart_threshold: 1
|
||||||
|
- name: RABBITMQ-CLUSTER
|
||||||
|
interval: 60
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, gateway, medium]
|
||||||
|
restart_threshold: 3
|
||||||
|
- name: REDIS-SENTINEL
|
||||||
|
interval: 60
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, database, medium]
|
||||||
|
restart_threshold: 3
|
||||||
|
- name: SWAG-TLS
|
||||||
|
interval: 3600
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, infrastructure, medium]
|
||||||
|
restart_threshold: 3
|
||||||
|
- name: STORAGEBOX-MOUNT
|
||||||
|
interval: 300
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, infrastructure, medium]
|
||||||
|
restart_threshold: 1
|
||||||
|
- name: PROMETHEUS
|
||||||
|
interval: 120
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, observability, low]
|
||||||
|
restart_threshold: 5
|
||||||
|
- name: GRAFANA
|
||||||
|
interval: 120
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, observability, low]
|
||||||
|
restart_threshold: 5
|
||||||
|
- name: PORTAINER
|
||||||
|
interval: 120
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, observability, low]
|
||||||
|
restart_threshold: 5
|
||||||
|
- name: LOKI
|
||||||
|
interval: 120
|
||||||
|
heartbeat_retries: 1
|
||||||
|
tags: [internal, observability, low]
|
||||||
|
restart_threshold: 5
|
||||||
|
http_monitors:
|
||||||
|
- name: EXT-HTTPS-API
|
||||||
|
url: "https://api{suffix}.{domain}/actuator/health"
|
||||||
|
accepted_statuscodes: ["200"]
|
||||||
|
interval: 60
|
||||||
|
- name: EXT-HTTPS-GRAFANA
|
||||||
|
url: "https://grafana{suffix}.{domain}/api/health"
|
||||||
|
accepted_statuscodes: ["200"]
|
||||||
|
interval: 60
|
||||||
|
- name: EXT-HTTPS-PORTAINER
|
||||||
|
url: "https://portainer{suffix}.{domain}"
|
||||||
|
accepted_statuscodes: ["200", "401", "403"]
|
||||||
|
interval: 120
|
||||||
|
- name: EXT-HTTPS-APIGW
|
||||||
|
url: "https://apigw{suffix}.{domain}"
|
||||||
|
accepted_statuscodes: ["200", "401", "403"]
|
||||||
|
interval: 120
|
||||||
|
dns_monitors:
|
||||||
|
- name: EXT-DNS-API
|
||||||
|
hostname: "api{suffix}.{domain}"
|
||||||
|
dns_resolve_type: A
|
||||||
|
interval: 60
|
||||||
|
- name: EXT-DNS-ROOT
|
||||||
|
hostname: "{domain}"
|
||||||
|
dns_resolve_type: A
|
||||||
|
interval: 60
|
||||||
|
ping_monitors:
|
||||||
|
interval: 60
|
||||||
|
max_retries: 1
|
||||||
|
status_pages:
|
||||||
|
- slug: "iklim-{env}-status"
|
||||||
|
title: "iklim.co API Durumu"
|
||||||
|
public: true
|
||||||
|
groups: ["Dış Erişilebilirlik - Kritik"]
|
||||||
|
- slug: "iklim-{env}-ops"
|
||||||
|
title: "iklim.co [{env}] Altyapı"
|
||||||
|
public: false
|
||||||
|
groups:
|
||||||
|
- "Altyapı"
|
||||||
|
- "Veri Katmanı"
|
||||||
|
- "Gateway & Mesajlaşma"
|
||||||
|
- "Dış Erişilebilirlik - Kritik"
|
||||||
|
- "Dış Erişilebilirlik - Genel"
|
||||||
|
- slug: "iklim-{env}-tools"
|
||||||
|
title: "iklim.co [{env}] Araçlar"
|
||||||
|
public: false
|
||||||
|
groups: ["Gözlemlenebilirlik"]
|
||||||
22
health-agent/pyproject.toml
Normal file
22
health-agent/pyproject.toml
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
[project]
|
||||||
|
name = "health-agent"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "iklim.co Monitoring Health Agent"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"requests",
|
||||||
|
"docker",
|
||||||
|
"python-dotenv",
|
||||||
|
"pyyaml",
|
||||||
|
"redis",
|
||||||
|
"pymongo",
|
||||||
|
"uptime-kuma-api",
|
||||||
|
"cryptography",
|
||||||
|
]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["hatchling"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[tool.hatch.build.targets.wheel]
|
||||||
|
packages = ["src/health_agent"]
|
||||||
138
health-agent/scripts/setup_uptime_kuma.py
Normal file
138
health-agent/scripts/setup_uptime_kuma.py
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import yaml
|
||||||
|
import logging
|
||||||
|
from uptime_kuma_api import UptimeKumaApi, MonitorType
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||||
|
logger = logging.getLogger("uk-setup")
|
||||||
|
|
||||||
|
def format_str(text, env_name, project):
    """Fill the ``{env}`` and ``{project}`` placeholders of *text*.

    Values that are not strings are handed back unchanged, so raw config
    entries can be passed in without a type check at the call site.
    """
    if isinstance(text, str):
        with_env = text.replace("{env}", env_name)
        return with_env.replace("{project}", project)
    return text
|
||||||
|
|
||||||
|
def _load_saved_tokens(token_file):
    """Return the push tokens already stored in *token_file*, or ``{}`` if unusable."""
    try:
        with open(token_file, "r", encoding="utf-8") as f:
            saved = yaml.safe_load(f)
    except FileNotFoundError:
        return {}
    except (OSError, yaml.YAMLError) as e:
        logger.warning(f"Could not read existing token file {token_file}: {e}")
        return {}
    return saved if isinstance(saved, dict) else {}


def setup_uptime_kuma(dry_run=False, only=None):
    """Provision Uptime Kuma from ``config/monitors.yml``.

    Creates the group monitors, the push monitors and the status pages that
    are missing, then stores the push tokens in ``config/uk_tokens.yml``.
    Reads ``ENV`` (default ``test``), ``UK_URL``, ``UK_USER`` and ``UK_PASS``
    from the environment.

    Args:
        dry_run: Only log what would be done; nothing is sent to Uptime Kuma
            and no token file is written.
        only: When set, only the push monitor with exactly this name is
            processed (groups and status pages are still processed).

    Returns:
        None. Login or monitor-listing failures are logged and abort the run.
    """
    env_name = os.getenv("ENV", "test")

    config_path = os.path.join(os.path.dirname(__file__), "..", "config", "monitors.yml")
    # The config holds non-ASCII group names, so do not depend on the locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}

    project = config.get("project", "iklim")
    groups = config.get("groups", [])

    kuma_url = os.getenv("UK_URL", "http://localhost:3001")
    # NOTE(review): .env.setup.example documents UK_API_KEY, but this script logs in
    # with UK_USER / UK_PASS (defaulting to admin/admin) - confirm which one is intended.
    kuma_user = os.getenv("UK_USER", "admin")
    kuma_pass = os.getenv("UK_PASS", "admin")

    api = None
    if not dry_run:
        logger.info(f"Connecting to Uptime Kuma at {kuma_url}...")
        try:
            api = UptimeKumaApi(kuma_url)
            api.login(kuma_user, kuma_pass)
        except Exception as e:
            logger.error(f"Login failed: {e}")
            if api is not None:
                api.disconnect()
            return

    try:
        existing_monitors = {}
        if api is not None:
            try:
                existing_monitors = {m['name']: m for m in api.get_monitors()}
            except Exception as e:
                # Continuing with an empty map would re-create every monitor as a duplicate.
                logger.error(f"Failed to get monitors: {e}")
                return

        # 1. Process Groups
        group_map = {}
        for g in groups:
            raw_name = g["name"]
            formatted_name = f"{project} [{env_name}] {raw_name}"

            logger.info(f"Processing group: {formatted_name}")
            if dry_run:
                continue
            if formatted_name in existing_monitors:
                group_map[raw_name] = existing_monitors[formatted_name]['id']
            else:
                logger.info(f"Creating group monitor: {formatted_name}")
                res = api.add_monitor(type=MonitorType.GROUP, name=formatted_name)
                group_map[raw_name] = res['monitorID']

        # Child monitor name -> name of the first group that lists it.
        parent_of = {}
        for g in groups:
            for child in g.get("children", []):
                parent_of.setdefault(child, g["name"])

        tokens = {}

        # 2. Push Monitors
        for pm in config.get("push_monitors", []):
            m_name = pm["name"]
            if only and m_name != only:
                continue

            m_interval = pm.get("interval", 60)
            parent_group_id = group_map.get(parent_of.get(m_name))

            logger.info(f"Processing push monitor: {m_name}")
            if dry_run:
                tokens[m_name] = "dummy_token_dry_run"
                continue

            if m_name in existing_monitors:
                logger.info(f"Monitor {m_name} already exists.")
                existing = existing_monitors[m_name]
                token = existing.get('pushToken')
                if token:
                    tokens[m_name] = token
                else:
                    logger.warning(f"Monitor {m_name} exists but has no push token; skipping token export.")

                if parent_group_id and existing.get('parent') != parent_group_id:
                    api.edit_monitor(existing['id'], parent=parent_group_id)
            else:
                logger.info(f"Creating push monitor: {m_name}")
                result = api.add_monitor(
                    type=MonitorType.PUSH,
                    name=m_name,
                    interval=m_interval,
                    parent=parent_group_id,
                )
                m_id = result['monitorID']

                # Fetch again to get pushToken
                for m in api.get_monitors():
                    if m['id'] == m_id:
                        tokens[m_name] = m['pushToken']
                        break
                else:
                    logger.warning(f"Push token for new monitor {m_name} was not found.")

        # 3. Process Status Pages
        for sp in config.get("status_pages", []):
            slug = format_str(sp["slug"], env_name, project)
            title = format_str(sp["title"], env_name, project)
            logger.info(f"Processing status page: {title} (slug: {slug})")
            if dry_run:
                continue
            try:
                pages = api.get_status_pages()
                if not any(p['slug'] == slug for p in pages):
                    logger.info(f"Creating status page: {slug}")
                    api.add_status_page(slug, title)
            except Exception as e:
                logger.warning(f"Status page ops failed: {e}")

        # 4. Write tokens to uk_tokens.yml
        token_file = os.path.join(os.path.dirname(__file__), "..", "config", "uk_tokens.yml")
        if dry_run:
            logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}")
        else:
            if only:
                # A partial run must not wipe the tokens of the monitors it skipped.
                merged = _load_saved_tokens(token_file)
                merged.update(tokens)
                tokens = merged
            with open(token_file, "w", encoding="utf-8") as f:
                yaml.dump(tokens, f)
            logger.info(f"Saved push tokens to {token_file}")
    finally:
        # Always release the connection, including on early returns and errors.
        if api is not None:
            api.disconnect()
|
||||||
|
|
||||||
|
def _build_arg_parser():
    """Build the command-line parser for the setup script."""
    cli = argparse.ArgumentParser(description="Setup Uptime Kuma monitors")
    cli.add_argument("--dry-run", action="store_true", help="Print actions without making changes")
    cli.add_argument("--only", type=str, help="Only process a specific monitor by name")
    return cli


if __name__ == "__main__":
    # Script entry point: parse the flags and run the provisioning once.
    options = _build_arg_parser().parse_args()
    setup_uptime_kuma(dry_run=options.dry_run, only=options.only)
|
||||||
36
health-agent/src/health_agent/checks/filesystem.py
Normal file
36
health-agent/src/health_agent/checks/filesystem.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
from health_agent.uptime_kuma import push
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def check_storagebox_mount():
    """Push one STORAGEBOX-MOUNT heartbeat describing the storage box path.

    The status is "down" when the path (``STORAGEBOX_PATH``, default
    ``/mnt/storagebox``) does not exist or when any expected file below it
    is missing, and "up" otherwise.
    """
    began = time.time()

    def elapsed_ms():
        # Milliseconds spent in this check so far.
        return int((time.time() - began) * 1000)

    root = os.getenv("STORAGEBOX_PATH", "/mnt/storagebox")

    if not os.path.exists(root):
        push("STORAGEBOX-MOUNT", "down", f"{root} not found", elapsed_ms())
        return

    required = ("patroni/patroni.yml", "ssl/STAR.iklim.co.full.crt")
    absent = [rel for rel in required if not os.path.exists(os.path.join(root, rel))]
    duration = elapsed_ms()

    if not absent:
        push("STORAGEBOX-MOUNT", "up", f"{root} OK | all critical files present", duration)
        return

    push("STORAGEBOX-MOUNT", "down", f"mount exists but missing: {', '.join(absent)}", duration)
|
||||||
196
health-agent/src/health_agent/checks/http.py
Normal file
196
health-agent/src/health_agent/checks/http.py
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import requests
|
||||||
|
from requests.auth import HTTPBasicAuth
|
||||||
|
from health_agent.uptime_kuma import push
|
||||||
|
import urllib3
|
||||||
|
|
||||||
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def http_check(url, expected_status=None, auth=None, verify_ssl=True, timeout=5, headers=None):
    """Send one GET request and classify the outcome.

    Args:
        url: Address to request.
        expected_status: A single status code, or a list/tuple/set of accepted
            codes. When empty or ``None``, any status below 400 counts as OK.
        auth: Passed through to ``requests.get``.
        verify_ssl: Passed as ``verify`` to ``requests.get``.
        timeout: Request timeout in seconds.
        headers: Optional request headers.

    Returns:
        ``(is_ok, response, ping_ms, error)``. ``response`` is ``None`` and
        ``error`` holds the exception text when the request itself failed;
        otherwise ``error`` is ``None``. Note that a ``requests.Response`` is
        falsy for 4xx/5xx codes, so callers should test ``response is not None``
        rather than the truthiness of the response.
    """
    # monotonic() is immune to wall-clock adjustments while measuring elapsed time.
    start_time = time.monotonic()
    try:
        resp = requests.get(url, auth=auth, verify=verify_ssl, timeout=timeout, headers=headers)
    except Exception as e:
        # Deliberately broad: this helper is the never-raise boundary for the checks.
        ping_ms = int((time.monotonic() - start_time) * 1000)
        return False, None, ping_ms, str(e)

    ping_ms = int((time.monotonic() - start_time) * 1000)

    if not expected_status:
        is_ok = resp.status_code < 400
    elif isinstance(expected_status, (list, tuple, set, frozenset)):
        is_ok = resp.status_code in expected_status
    else:
        is_ok = resp.status_code == expected_status

    return is_ok, resp, ping_ms, None
|
||||||
|
|
||||||
|
def check_patroni_cluster():
    """Query Patroni's ``/cluster`` endpoint and push the PATRONI-CLUSTER heartbeat.

    Nodes ``patroni-01`` .. ``patroni-NN`` are tried in order (NN comes from
    ``CLUSTER_SIZE_PATRONI``, default 3) until one returns a usable answer.
    The status is "up" when a member with role ``leader`` is present, and
    "down" when no node answers or no leader is listed.
    """
    try:
        cluster_size = int(os.getenv("CLUSTER_SIZE_PATRONI", "3"))
    except ValueError:
        cluster_size = 3
    nodes = [f"patroni-{i:02d}" for i in range(1, max(cluster_size, 1) + 1)]

    cluster_data = None
    error_msg = "All Patroni nodes unreachable"
    start_t = time.monotonic()

    for node in nodes:
        ok, resp, _, err = http_check(f"http://{node}:8008/cluster", timeout=3)
        if ok and resp is not None:
            try:
                parsed = resp.json()
            except ValueError as e:
                # A non-JSON body must not abort the check without a heartbeat.
                error_msg = f"{node} error: invalid JSON ({e})"
                continue
            if not isinstance(parsed, dict):
                error_msg = f"{node} error: unexpected /cluster payload"
                continue
            cluster_data = parsed
            break
        if err:
            error_msg = f"{node} error: {err}"
        elif resp is not None:
            error_msg = f"{node} error: HTTP {resp.status_code}"

    ping_ms = int((time.monotonic() - start_t) * 1000)

    if not cluster_data:
        push("PATRONI-CLUSTER", "down", error_msg, ping_ms)
        return

    leader = None
    replicas = []
    for m in cluster_data.get("members", []):
        if m.get("role") == "leader":
            leader = m.get("name")
        else:
            replicas.append((m.get("name"), m.get("lag", 0), m.get("state")))

    if not leader:
        down_nodes = [f"{name} state: {state}" for name, _, state in replicas if state not in ("running", "streaming")]
        msg = "no leader detected | " + " ".join(down_nodes)
        push("PATRONI-CLUSTER", "down", msg, ping_ms)
        return

    lag_strs = []
    for name, lag, _ in replicas:
        if isinstance(lag, (int, float)):
            lag_strs.append(f"{name} (lag:{lag / (1024 * 1024):.0f}MB)")
        else:
            # Do not present a non-numeric lag value as 0MB.
            lag_strs.append(f"{name} (lag:{lag})")

    msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
    push("PATRONI-CLUSTER", "up", msg, ping_ms)
|
||||||
|
|
||||||
|
def check_rabbitmq_cluster():
    """Check the RabbitMQ management API and push the RABBITMQ-CLUSTER heartbeat.

    Uses ``RABBITMQ_USER`` / ``RABBITMQ_PASS`` (default ``guest``/``guest``)
    for basic auth. The status is "down" when the node health check fails or
    when any node reports a memory or disk alarm; otherwise it is "up" with
    the running/total node count from ``/api/nodes``.
    """
    url = "http://rabbitmq:15672/api/healthchecks/node"
    user = os.getenv("RABBITMQ_USER", "guest")
    password = os.getenv("RABBITMQ_PASS", "guest")
    auth = HTTPBasicAuth(user, password)

    ok, resp, ping_ms, err = http_check(url, auth=auth)

    if not ok:
        # A Response is falsy for 4xx/5xx, so compare against None to keep the status code.
        if err:
            msg = err
        elif resp is not None:
            msg = f"HTTP {resp.status_code}"
        else:
            msg = "HTTP Unknown"
        push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
        return

    nodes_running = 0
    total_nodes = 3
    alarms = []

    ok2, resp2, _, _ = http_check("http://rabbitmq:15672/api/nodes", auth=auth)
    if ok2 and resp2 is not None:
        try:
            data = resp2.json()
        except ValueError:
            data = None
        if isinstance(data, list):
            entries = [n for n in data if isinstance(n, dict)]
            nodes_running = len([n for n in entries if n.get("running")])
            total_nodes = len(data)
            alarms = [str(n.get("name")) for n in entries if n.get("mem_alarm") or n.get("disk_free_alarm")]

    if alarms:
        msg = f"disk/mem alarm active on {','.join(alarms)}"
        push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
        return

    msg = f"{nodes_running}/{total_nodes} nodes running"
    push("RABBITMQ-CLUSTER", "up", msg, ping_ms)
|
||||||
|
|
||||||
|
def check_apisix():
    """Probe the APISIX admin routes endpoint and push the APISIX-GATEWAY heartbeat.

    The ``X-API-KEY`` header is only sent when ``APISIX_ADMIN_KEY`` is set.
    """
    admin_key = os.getenv("APISIX_ADMIN_KEY", "")
    request_headers = {}
    if admin_key:
        request_headers["X-API-KEY"] = admin_key

    reachable, response, elapsed, failure = http_check(
        "http://apisix:9180/apisix/admin/routes", headers=request_headers
    )

    if reachable:
        push("APISIX-GATEWAY", "up", "admin API reachable", elapsed)
        return

    push("APISIX-GATEWAY", "down", f"admin API unreachable: {failure or response.status_code}", elapsed)
|
||||||
|
|
||||||
|
def check_vault():
    """Check every Vault node's ``/v1/sys/health`` and push the VAULT-CLUSTER heartbeat.

    Nodes ``vault-1`` .. ``vault-N`` under ``EXTERNAL_DOMAIN`` are queried
    (N from ``CLUSTER_SIZE_VAULT``, default 3). The status is "up" only when
    every node answered and none reports ``sealed``; otherwise "down" with a
    per-node reason.
    """
    try:
        cluster_size = int(os.getenv("CLUSTER_SIZE_VAULT", "3"))
    except ValueError:
        cluster_size = 3
    nodes = [f"vault-{i}" for i in range(1, max(cluster_size, 1) + 1)]
    domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")

    unsealed_count = 0
    total = len(nodes)
    errors = []

    start_t = time.monotonic()
    for node in nodes:
        url = f"https://{node}.{domain}:8200/v1/sys/health"
        _, resp, _, _ = http_check(url, verify_ssl=False, expected_status=[200, 429, 473])

        # Compare against None: a Response is falsy for every 4xx/5xx answer, which
        # would make nodes replying 429/473 (accepted above) look unreachable.
        if resp is None:
            errors.append(f"{node} unreachable")
            continue

        try:
            data = resp.json()
        except ValueError:
            data = None
        if not isinstance(data, dict):
            errors.append(f"{node} invalid health response (HTTP {resp.status_code})")
            continue

        if not data.get("sealed"):
            unsealed_count += 1
        else:
            errors.append(f"{node} SEALED")

    ping_ms = int((time.monotonic() - start_t) * 1000)

    if unsealed_count == total:
        msg = f"{unsealed_count}/{total} unsealed"
        push("VAULT-CLUSTER", "up", msg, ping_ms)
    else:
        msg = " | ".join(errors) if errors else "Vault checks failed"
        push("VAULT-CLUSTER", "down", msg, ping_ms)
|
||||||
|
|
||||||
|
def check_prometheus():
    """Probe Prometheus' ``/-/healthy`` endpoint and push the PROMETHEUS heartbeat."""
    url = "http://prometheus:9090/-/healthy"
    ok, resp, ping_ms, err = http_check(url)
    if ok:
        push("PROMETHEUS", "up", "healthy", ping_ms)
        return

    # err is None when the server answered with an error status; report that
    # status instead of the literal text "None".
    if err:
        detail = err
    elif resp is not None:
        detail = f"HTTP {resp.status_code}"
    else:
        detail = "unknown error"
    push("PROMETHEUS", "down", f"prometheus unreachable: {detail}", ping_ms)
|
||||||
|
|
||||||
|
def check_grafana():
    """Probe Grafana's ``/api/health`` and push the GRAFANA heartbeat.

    The status is "up" only when the endpoint answers successfully and its
    JSON body reports ``database`` as ``"ok"``.
    """
    url = "http://grafana:3000/api/health"
    ok, resp, ping_ms, err = http_check(url)

    if not ok or resp is None:
        # err is None when the server answered with an error status; report that
        # status instead of the literal text "None".
        if err:
            detail = err
        elif resp is not None:
            detail = f"HTTP {resp.status_code}"
        else:
            detail = "unknown error"
        push("GRAFANA", "down", f"grafana unreachable: {detail}", ping_ms)
        return

    try:
        data = resp.json()
    except ValueError:
        # A non-JSON body is reported as an unknown database state, not a crash.
        data = None
    db_status = data.get("database", "unknown") if isinstance(data, dict) else "unknown"

    if db_status == "ok":
        push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms)
    else:
        push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms)
|
||||||
|
|
||||||
|
def check_portainer():
    """Probe Portainer's ``/api/system/status`` and push the PORTAINER heartbeat."""
    url = "http://portainer:9000/api/system/status"
    ok, resp, ping_ms, err = http_check(url)
    if ok:
        push("PORTAINER", "up", "running", ping_ms)
        return

    # err is None when the server answered with an error status; report that
    # status instead of the literal text "None".
    if err:
        detail = err
    elif resp is not None:
        detail = f"HTTP {resp.status_code}"
    else:
        detail = "unknown error"
    push("PORTAINER", "down", f"portainer unreachable: {detail}", ping_ms)
|
||||||
|
|
||||||
|
def check_loki():
    """Probe Loki's ``/ready`` endpoint and push the LOKI heartbeat."""
    url = "http://loki:3100/ready"
    ok, resp, ping_ms, err = http_check(url)
    if ok:
        push("LOKI", "up", "ready", ping_ms)
        return

    # err is None when the server answered with an error status; report that
    # status instead of the literal text "None".
    if err:
        detail = err
    elif resp is not None:
        detail = f"HTTP {resp.status_code}"
    else:
        detail = "unknown error"
    push("LOKI", "down", f"loki unreachable: {detail}", ping_ms)
|
||||||
|
|
||||||
|
def run_all_http_checks():
    """Run every HTTP-based health check once, in a fixed order."""
    ordered_checks = (
        check_patroni_cluster,
        check_rabbitmq_cluster,
        check_apisix,
        check_vault,
        check_prometheus,
        check_grafana,
        check_portainer,
        check_loki,
    )
    for run_check in ordered_checks:
        run_check()
|
||||||
57
health-agent/src/health_agent/checks/mongodb.py
Normal file
57
health-agent/src/health_agent/checks/mongodb.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
from pymongo import MongoClient
|
||||||
|
from health_agent.uptime_kuma import push
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def check_mongodb():
    """Check MongoDB and push the MONGODB-REPLICASET heartbeat.

    Connects with ``MONGO_URI``. When ``CLUSTER_SIZE_MONGODB`` is 1 the server
    is only pinged; otherwise ``replSetGetStatus`` is evaluated: "up" needs a
    PRIMARY and every other member in state SECONDARY or ARBITER.
    """
    start_t = time.monotonic()

    mongo_uri = os.getenv("MONGO_URI", "mongodb://mongodb-01:27017,mongodb-02:27017,mongodb-03:27017/?replicaSet=rs0")
    try:
        cluster_size = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
    except ValueError:
        logger.warning("CLUSTER_SIZE_MONGODB is not an integer; assuming 3")
        cluster_size = 3

    try:
        with MongoClient(mongo_uri, serverSelectionTimeoutMS=3000) as client:
            if cluster_size == 1:
                # replSetGetStatus fails on a server started without a replica set,
                # so a standalone deployment is verified with a plain ping.
                client.admin.command('ping')
                status = None
            else:
                status = client.admin.command('replSetGetStatus')

        if status is None:
            ping_ms = int((time.monotonic() - start_t) * 1000)
            push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
            return

        primary = None
        others = []
        for m in status.get('members', []):
            state_str = m.get('stateStr', '')
            name = m.get('name', 'unknown')
            if state_str == 'PRIMARY':
                primary = name
            else:
                others.append((name, state_str))

        ping_ms = int((time.monotonic() - start_t) * 1000)

        if not primary:
            push("MONGODB-REPLICASET", "down", "no PRIMARY | quorum lost", ping_ms)
            return

        unhealthy = [f"{name}:{state}" for name, state in others if state not in ('SECONDARY', 'ARBITER')]
        if unhealthy:
            msg = f"PRIMARY: {primary} | unhealthy: {','.join(unhealthy)}"
            push("MONGODB-REPLICASET", "down", msg, ping_ms)
        else:
            sec_strs = [f"{name} ({state})" for name, state in others]
            msg = f"PRIMARY: {primary} | secondaries: {' '.join(sec_strs)}"
            push("MONGODB-REPLICASET", "up", msg, ping_ms)

    except Exception as e:
        ping_ms = int((time.monotonic() - start_t) * 1000)
        push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)
|
||||||
43
health-agent/src/health_agent/checks/redis_sentinel.py
Normal file
43
health-agent/src/health_agent/checks/redis_sentinel.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
from redis.sentinel import Sentinel
|
||||||
|
from health_agent.uptime_kuma import push
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def _parse_sentinel_hosts(raw, default_port=26379):
    """Turn ``"host1,host2:26380"`` into a list of ``(host, port)`` tuples, skipping blanks."""
    nodes = []
    for entry in raw.split(","):
        entry = entry.strip()
        if not entry:
            continue
        host, sep, port = entry.rpartition(":")
        if sep and host and ":" not in host and port.isdigit():
            nodes.append((host, int(port)))
        else:
            nodes.append((entry, default_port))
    return nodes


def check_redis_sentinel():
    """Check Redis through Sentinel and push the REDIS-SENTINEL heartbeat.

    Skipped (reported "up") when ``REDIS_MODE`` is not ``sentinel``. Otherwise
    the master named ``REDIS_MASTER_NAME`` is resolved via the sentinels in
    ``REDIS_SENTINEL_HOSTS`` and pinged; any failure is reported as "down".
    """
    start_t = time.monotonic()

    redis_mode = os.getenv("REDIS_MODE", "sentinel")
    if redis_mode != "sentinel":
        push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.monotonic() - start_t) * 1000))
        return

    hosts = os.getenv("REDIS_SENTINEL_HOSTS", "redis-sentinel-01,redis-sentinel-02,redis-sentinel-03")
    sentinel_nodes = _parse_sentinel_hosts(hosts)

    master_name = os.getenv("REDIS_MASTER_NAME", "prod-master")
    password = os.getenv("REDIS_PASSWORD", None)

    master = None
    try:
        sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password)

        # Master ping
        master = sentinel.master_for(master_name, socket_timeout=3, password=password)
        master.ping()
        master_ip, master_port = sentinel.discover_master(master_name)

        # Get replicas count
        replicas_count = len(sentinel.discover_slaves(master_name))

        ping_ms = int((time.monotonic() - start_t) * 1000)

        # NOTE(review): the quorum itself is not queried here; "quorum OK" only
        # reflects that the master could be discovered and pinged - confirm intent.
        msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
        push("REDIS-SENTINEL", "up", msg, ping_ms)

    except Exception as e:
        ping_ms = int((time.monotonic() - start_t) * 1000)
        push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
    finally:
        # Release the master connections on the failure path as well.
        if master is not None:
            master.connection_pool.disconnect()
|
||||||
49
health-agent/src/health_agent/checks/swarm.py
Normal file
49
health-agent/src/health_agent/checks/swarm.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import time
|
||||||
|
import docker
|
||||||
|
import logging
|
||||||
|
from health_agent.uptime_kuma import push
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def check_swarm_cluster():
    """List the Swarm nodes through the Docker API and push the SWARM-CLUSTER heartbeat.

    The status is "up" when every listed node is in state ``ready``; the
    message also names the managers whose reachability is ``reachable``.
    Any exception is logged and reported as "down".
    """
    start_time = time.monotonic()
    client = None
    try:
        client = docker.from_env()
        nodes = client.nodes.list()

        ready_nodes = []
        managers = []

        for node in nodes:
            spec = node.attrs.get('Spec', {})
            status = node.attrs.get('Status', {})
            manager_status = node.attrs.get('ManagerStatus', {})

            node_name = spec.get('Name', node.id)

            if status.get('State') == 'ready':
                ready_nodes.append(node_name)

            if spec.get('Role') == 'manager' and manager_status.get('Reachability') == 'reachable':
                managers.append(node_name)

        total_nodes = len(nodes)
        ready_count = len(ready_nodes)

        ping_ms = int((time.monotonic() - start_time) * 1000)

        if ready_count == total_nodes:
            msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
            push("SWARM-CLUSTER", "up", msg, ping_ms)
        else:
            msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
            push("SWARM-CLUSTER", "down", msg, ping_ms)

    except Exception as e:
        ping_ms = int((time.monotonic() - start_time) * 1000)
        logger.error(f"Swarm check failed: {e}")
        push("SWARM-CLUSTER", "down", str(e), ping_ms)
    finally:
        # A new client is created on every call; close it so connections do not pile up.
        if client is not None:
            client.close()
|
||||||
77
health-agent/src/health_agent/checks/tcp.py
Normal file
77
health-agent/src/health_agent/checks/tcp.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import socket
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import requests
|
||||||
|
from health_agent.uptime_kuma import push
|
||||||
|
from health_agent.checks.http import http_check
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def tcp_check(host, port, timeout=3):
    """Try to open an IPv4 TCP connection to ``host:port``.

    Args:
        host: Hostname or IPv4 address.
        port: TCP port number.
        timeout: Socket timeout in seconds.

    Returns:
        ``(ok, ping_ms, error)`` where ``error`` is ``None`` on success, a
        fixed "closed or unreachable" text when the connect attempt returned
        an error code, or the exception text when the attempt raised.
    """
    # monotonic() is immune to wall-clock adjustments while measuring elapsed time.
    start_time = time.monotonic()
    try:
        # The context manager closes the socket even when connect_ex raises
        # (for example on a name-resolution failure).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            result = sock.connect_ex((host, port))
    except Exception as e:
        ping_ms = int((time.monotonic() - start_time) * 1000)
        return False, ping_ms, str(e)

    ping_ms = int((time.monotonic() - start_time) * 1000)
    if result == 0:
        return True, ping_ms, None
    return False, ping_ms, f"Port {port} is closed or unreachable"
|
||||||
|
|
||||||
|
def check_etcd_cluster(nodes=None):
    """Check every etcd member and push one aggregated ETCD-CLUSTER status.

    For each node the client port 2379 is probed over TCP, then ``/health`` is
    queried over HTTP. The first node whose ``/v3/maintenance/status`` reply
    names itself as leader is reported as the leader. The cluster is pushed as
    "up" only when every node is healthy.

    Args:
        nodes: Optional list of etcd host names. Defaults to the three
            members ``etcd-01`` .. ``etcd-03``.
    """
    if nodes is None:
        nodes = ["etcd-01", "etcd-02", "etcd-03"]
    start_t = time.time()

    if not nodes:
        # An explicitly empty list must not be reported as "0/0 healthy".
        push("ETCD-CLUSTER", "down", "no etcd nodes configured", 0)
        return

    healthy_count = 0
    leader = None
    errors = []

    for node in nodes:
        # 1. TCP check on 2379
        tcp_ok, _, _ = tcp_check(node, 2379)
        if not tcp_ok:
            errors.append(f"{node} port 2379 unreachable")
            continue

        # 2. HTTP health check
        url = f"http://{node}:2379/health"
        http_ok, resp, _, _ = http_check(url, timeout=3)

        if http_ok and resp:
            try:
                data = resp.json()
            except ValueError:
                # A non-JSON body counts as unhealthy instead of aborting the
                # whole check before any status is pushed.
                data = None
            if isinstance(data, dict) and data.get("health") == "true":
                healthy_count += 1
            else:
                errors.append(f"{node} unhealthy")
        else:
            errors.append(f"{node} health endpoint unreachable")

        # 3. Leader check from /v3/maintenance/status (best effort)
        if not leader:
            status_url = f"http://{node}:2379/v3/maintenance/status"
            try:
                r = requests.post(status_url, json={}, timeout=3)
                if r.status_code == 200:
                    status_data = r.json()
                    leader_id = status_data.get("leader")
                    header_member_id = status_data.get("header", {}).get("member_id")
                    # The node is the leader when it reports its own member id.
                    if leader_id and leader_id == header_member_id:
                        leader = node
            except Exception as e:
                # Leader info is only decoration for the message; record the
                # failure instead of dropping it silently.
                logger.debug(f"etcd leader lookup failed on {node}: {e}")

    ping_ms = int((time.time() - start_t) * 1000)
    total = len(nodes)

    if healthy_count == total:
        leader_info = f" | leader: {leader}" if leader else ""
        msg = f"{healthy_count}/{total} healthy{leader_info}"
        push("ETCD-CLUSTER", "up", msg, ping_ms)
    else:
        # In this branch at least one member is unhealthy, so the quorum note
        # always applies (the former "< 3" test was always true here).
        quorum_msg = f" | quorum at risk ({healthy_count}/{total})"
        msg = " | ".join(errors) + quorum_msg
        push("ETCD-CLUSTER", "down", msg, ping_ms)
|
||||||
62
health-agent/src/health_agent/checks/tls.py
Normal file
62
health-agent/src/health_agent/checks/tls.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import requests
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from health_agent.uptime_kuma import push
|
||||||
|
from cryptography import x509
|
||||||
|
from cryptography.hazmat.backends import default_backend
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def _cert_file_status(cert_path, warn_days=14):
    """Inspect the PEM certificate at *cert_path*.

    Returns:
        ``(is_down, message)``. ``is_down`` is True when the file is missing,
        cannot be parsed, has expired, or expires in fewer than *warn_days*
        days.
    """
    if not os.path.exists(cert_path):
        return True, "cert file missing on storagebox"
    try:
        with open(cert_path, "rb") as f:
            cert = x509.load_pem_x509_certificate(f.read(), default_backend())
        not_valid_after = cert.not_valid_after_utc
        days_left = (not_valid_after - datetime.now(timezone.utc)).days
    except Exception as e:
        return True, f"cert parse error: {e}"

    expiry = not_valid_after.strftime('%Y-%m-%d')
    if days_left < 0:
        # Avoid the confusing "expires in -3 days" wording for expired certs.
        return True, f"cert expired on {expiry}"
    if days_left < warn_days:
        return True, f"cert expires in {days_left} days"
    return False, f"cert valid until {expiry} ({days_left} days)"


def _https_status(url):
    """GET *url* and return ``(is_down, message)``; any status below 500 is up."""
    try:
        # NOTE(review): verify=False means an invalid or expired served
        # certificate is NOT detected by this probe; only the on-disk file is
        # validated. Confirm this is intentional.
        r = requests.get(url, timeout=5, verify=False)
    except Exception as e:
        logger.warning(f"HTTPS probe of {url} failed: {e}")
        return True, f"HTTPS unreachable ({type(e).__name__})"
    if r.status_code < 500:
        return False, "HTTPS reachable"
    return True, f"HTTPS returned {r.status_code}"


def check_swag_tls():
    """Check the wildcard certificate file and external HTTPS, then push SWAG-TLS.

    The status is "down" when either the certificate check or the HTTPS probe
    fails; the pushed message joins both results with " | ".
    """
    start_t = time.time()
    cert_path = "/mnt/storagebox/ssl/STAR.iklim.co.full.crt"
    domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
    suffix = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
    target_url = f"https://api{suffix}.{domain}/actuator/health"

    # 1. Check cert file
    cert_down, cert_msg = _cert_file_status(cert_path)

    # 2. Check external HTTPS reachable
    https_down, https_msg = _https_status(target_url)

    ping_ms = int((time.time() - start_t) * 1000)
    msg = " | ".join([cert_msg, https_msg])
    status = "down" if (cert_down or https_down) else "up"
    push("SWAG-TLS", status, msg, ping_ms)
|
||||||
25
health-agent/src/health_agent/config.py
Normal file
25
health-agent/src/health_agent/config.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load variables from a local .env file (if any) before reading the environment.
load_dotenv()


def _env_int(name, default):
    """Return environment variable *name* as an int.

    Raises:
        ValueError: when the value is not an integer; the message names the
            variable so a bad setting is easy to locate.
    """
    raw = os.getenv(name, default)
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name} must be an integer, got {raw!r}"
        ) from err


ENV = os.getenv("ENV", "prod")
CLUSTER_SIZE_ETCD = _env_int("CLUSTER_SIZE_ETCD", "3")
CLUSTER_SIZE_PATRONI = _env_int("CLUSTER_SIZE_PATRONI", "3")
CLUSTER_SIZE_MONGODB = _env_int("CLUSTER_SIZE_MONGODB", "3")
CLUSTER_SIZE_RABBITMQ = _env_int("CLUSTER_SIZE_RABBITMQ", "3")
CLUSTER_SIZE_VAULT = _env_int("CLUSTER_SIZE_VAULT", "3")
REDIS_MODE = os.getenv("REDIS_MODE", "sentinel")
EXTERNAL_DOMAIN = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
EXTERNAL_SUBDOMAIN_SUFFIX = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
|
||||||
|
|
||||||
|
def load_uk_tokens():
    """Load the monitor-name -> push-token mapping from the generated YAML file.

    The path is relative to the current working directory.

    Returns:
        The parsed mapping, or an empty dict when the file is missing or empty.

    Raises:
        ValueError: when the file parses to something other than a mapping,
            which would otherwise only fail later at the first lookup.
    """
    token_file = Path("config/generated/uk_tokens.yml")
    if not token_file.exists():
        return {}
    with open(token_file, "r", encoding="utf-8") as f:
        tokens = yaml.safe_load(f) or {}
    if not isinstance(tokens, dict):
        raise ValueError(
            f"{token_file} must contain a YAML mapping, got {type(tokens).__name__}"
        )
    return tokens
|
||||||
|
|
||||||
|
UK_TOKENS = load_uk_tokens()
|
||||||
56
health-agent/src/health_agent/events/docker_events.py
Normal file
56
health-agent/src/health_agent/events/docker_events.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
import os
|
||||||
|
import docker
|
||||||
|
import threading
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from health_agent.slack import notify
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def parse_and_notify(event):
    """Send a Slack alert for a container "die" event with a non-zero exit code.

    Args:
        event: Decoded Docker event dict. The container name and exit code
            are read from ``event['Actor']['Attributes']``.

    Events with exit code 0 (or without an exit code) are ignored. Exit code
    137 is reported with priority "High", everything else as "Medium".
    """
    # "or {}" also covers keys that are present but null, which the plain
    # .get(key, {}) default does not.
    attrs = (event.get('Actor') or {}).get('Attributes') or {}
    container_name = attrs.get('name', 'unknown')
    # Normalise to str so an integer exit code still matches the comparisons.
    exit_code = str(attrs.get('exitCode', '0'))

    if exit_code == '0':
        return

    is_oom = (exit_code == '137')

    # NOTE(review): the default here is "test" while config.py defaults ENV to
    # "prod"; confirm which default is intended when ENV is unset.
    env = os.getenv("ENV", "test").upper()
    webhook_env_name = f"SLACK_WEBHOOK_IKLIM_{env}_OPS"

    priority = "High" if is_oom else "Medium"
    title = f"[Health Agent / Events] Container Crashed ({container_name})"

    detail = f"Container: {container_name}\nExit Code: {exit_code}"
    if is_oom:
        detail += "\nReason: OOM Killed (Out Of Memory) or SIGKILL"

    notify(
        webhook_env=webhook_env_name,
        source="health-agent-events",
        priority=priority,
        title=title,
        detail=detail,
    )
|
||||||
|
|
||||||
|
def event_listener_loop():
    """Listen for container "die" events forever, reconnecting on failure.

    Each event is handed to ``parse_and_notify``; an error while handling one
    event is logged and does not stop the stream. When the stream breaks or
    ends, the client is closed and a new one is created after 10 seconds.
    """
    filters = {"type": "container", "event": "die"}
    while True:
        client = None
        try:
            client = docker.from_env()
            logger.info("Starting Docker event listener...")
            for event in client.events(decode=True, filters=filters):
                try:
                    parse_and_notify(event)
                except Exception as e:
                    logger.error(f"Error parsing event: {e}", exc_info=True)
            # Reaching this line means the stream ended without an exception.
            logger.warning("Docker event stream ended. Reconnecting in 10s...")
        except Exception as e:
            logger.error(f"Docker event listener error: {e}. Reconnecting in 10s...", exc_info=True)
        finally:
            # Release the previous client's connections before reconnecting.
            if client is not None:
                try:
                    client.close()
                except Exception as e:
                    logger.debug(f"Error closing Docker client: {e}")
        # Back off on every reconnect so a repeatedly ending stream cannot
        # turn this loop into a busy spin.
        time.sleep(10)
|
||||||
|
|
||||||
|
def start_docker_event_listener():
    """Run ``event_listener_loop`` on a daemon thread and return that thread."""
    listener = threading.Thread(daemon=True, target=event_listener_loop)
    listener.start()
    return listener
|
||||||
75
health-agent/src/health_agent/main.py
Normal file
75
health-agent/src/health_agent/main.py
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
import time
|
||||||
|
import logging
|
||||||
|
from health_agent.checks import swarm
|
||||||
|
from health_agent.checks.http import run_all_http_checks
|
||||||
|
from health_agent.checks.tcp import check_etcd_cluster
|
||||||
|
from health_agent.checks.tls import check_swag_tls
|
||||||
|
from health_agent.checks.redis_sentinel import check_redis_sentinel
|
||||||
|
from health_agent.checks.mongodb import check_mongodb
|
||||||
|
from health_agent.checks.filesystem import check_storagebox_mount
|
||||||
|
from health_agent.events.docker_events import start_docker_event_listener
|
||||||
|
import json
|
||||||
|
|
||||||
|
class JSONFormatter(logging.Formatter):
    """Format each log record as a single JSON object string."""

    # Optional record attributes (supplied via ``extra=``) copied to the output.
    EXTRA_FIELDS = ('check', 'status', 'ping_ms', 'source', 'error')

    def format(self, record):
        """Return *record* serialized as JSON.

        The object always has ``time``, ``level``, ``logger`` and ``msg``;
        the attributes in ``EXTRA_FIELDS`` and ``exc_info`` are added when
        present on the record.
        """
        log_obj = {
            "time": self.formatTime(record, self.datefmt),
            "level": record.levelname,
            "logger": record.name,
            "msg": record.getMessage(),
        }
        for attr in self.EXTRA_FIELDS:
            if hasattr(record, attr):
                log_obj[attr] = getattr(record, attr)
        if record.exc_info:
            log_obj['exc_info'] = self.formatException(record.exc_info)
        # default=str: an extra value that is not JSON-serializable (e.g. an
        # exception object) is stringified instead of raising inside logging.
        return json.dumps(log_obj, default=str)
|
||||||
|
|
||||||
|
# Configure the root logger with a single stream handler that emits JSON.
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logging.basicConfig(handlers=[handler], level=logging.INFO)

logger = logging.getLogger("main")
|
||||||
|
|
||||||
|
def run_checks():
    """Run every health check once, isolating failures from each other.

    An exception raised by one check is logged (with traceback) and the
    remaining checks still run.
    """
    logger.info("Running health checks...")
    # (log prefix used on failure, check callable), executed in this order.
    checks = (
        ("Error checking Swarm cluster", swarm.check_swarm_cluster),
        ("Error running HTTP checks", run_all_http_checks),
        ("Error running etcd checks", check_etcd_cluster),
        ("Error running TLS checks", check_swag_tls),
        ("Error running Redis checks", check_redis_sentinel),
        ("Error running MongoDB checks", check_mongodb),
        ("Error running filesystem checks", check_storagebox_mount),
    )
    for failure_prefix, check in checks:
        try:
            check()
        except Exception as e:
            # exc_info keeps the traceback, which the plain message dropped.
            logger.error(f"{failure_prefix}: {e}", exc_info=True)
|
||||||
|
|
||||||
|
def main():
    """Start the Docker event listener, then run all checks every 60 seconds."""
    logger.info("Starting health-agent...")
    start_docker_event_listener()
    while True:
        run_checks()
        time.sleep(60)


if __name__ == "__main__":
    main()
|
||||||
22
health-agent/src/health_agent/slack.py
Normal file
22
health-agent/src/health_agent/slack.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def notify(webhook_env: str, source: str, priority: str, title: str, detail: str):
    """Post a formatted alert to the Slack webhook named by *webhook_env*.

    Args:
        webhook_env: Name of the environment variable holding the webhook URL.
        source: Origin of the alert, shown in the message.
        priority: Priority label, shown in the message.
        title: Bold headline of the message.
        detail: Body text, rendered inside a code block.

    A missing webhook variable is logged as a warning and nothing is sent;
    a failed request is logged as an error and not raised.
    """
    webhook_url = os.environ.get(webhook_env)
    if not webhook_url:
        logger.warning(f"Slack webhook url not found for {webhook_env}")
        return

    text = "\n".join(
        [
            f"*{title}*",
            f"*Source:* {source}",
            f"*Priority:* {priority}",
            "```",
            f"{detail}",
            "```",
        ]
    )

    try:
        requests.post(webhook_url, json={"text": text}, timeout=5).raise_for_status()
    except Exception as e:
        logger.error(f"Failed to send Slack notification: {e}")
    else:
        logger.info(f"Sent Slack notification to {webhook_env}")
|
||||||
19
health-agent/src/health_agent/state.py
Normal file
19
health-agent/src/health_agent/state.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
STATE_FILE = Path("config/generated/state.json")
|
||||||
|
|
||||||
|
def load_state():
    """Return the state stored in ``STATE_FILE``.

    An empty dict is returned when the file does not exist or cannot be read
    or parsed.
    """
    try:
        with open(STATE_FILE, "r") as fh:
            stored = json.load(fh)
    except Exception:
        # Covers the missing-file case as well as unreadable or invalid content.
        return {}
    return stored
|
||||||
|
|
||||||
|
def save_state(state):
    """Write *state* to ``STATE_FILE`` as JSON, creating parent directories.

    The data is written to a temporary sibling file and then moved into place
    with ``os.replace``, so an interrupted or failed write leaves the previous
    state file intact instead of a truncated one.

    Raises:
        TypeError: when *state* is not JSON-serializable.
        OSError: when the file cannot be written.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_file = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    try:
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(state, f)
        os.replace(tmp_file, STATE_FILE)
    except Exception:
        # Do not leave a half-written temp file behind.
        tmp_file.unlink(missing_ok=True)
        raise
|
||||||
27
health-agent/src/health_agent/uptime_kuma.py
Normal file
27
health-agent/src/health_agent/uptime_kuma.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import logging
|
||||||
|
from health_agent.config import UK_TOKENS
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
UK_PUSH_URL_BASE = os.getenv("UK_PUSH_URL_BASE", "https://status.iklim.co/api/push")
|
||||||
|
|
||||||
|
def push(monitor_name: str, status: str, msg: str, ping_ms: int):
    """Report a monitor result to the Uptime Kuma push endpoint.

    Args:
        monitor_name: Key used to look up the push token in ``UK_TOKENS``.
        status: Status string sent as the ``status`` query parameter.
        msg: Message sent as the ``msg`` query parameter.
        ping_ms: Elapsed time in milliseconds, sent as ``ping``.

    A monitor without a token is logged as a warning and skipped; a failed
    request is logged as an error and not raised.
    """
    token = UK_TOKENS.get(monitor_name)
    if not token:
        logger.warning(f"No token found for monitor {monitor_name}")
        return

    url = f"{UK_PUSH_URL_BASE}/{token}"
    params = {
        "status": status,
        "msg": msg,
        "ping": int(ping_ms),
    }

    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
    except Exception as e:
        # requests error text embeds the URL; mask the token before logging.
        error_text = str(e).replace(str(token), "***")
        logger.error(
            f"Failed to push {monitor_name}: {error_text}",
            extra={"check": monitor_name, "status": "push_failed", "error": error_text, "source": "uptime_kuma"},
        )
        return

    # The extra key must not be "msg": that name is a LogRecord attribute and
    # logging raises KeyError when extra tries to overwrite it.
    logger.info(
        f"Pushed {monitor_name} status={status}",
        extra={"check": monitor_name, "status": status, "push_msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"},
    )
|
||||||
Loading…
x
Reference in New Issue
Block a user