feat(health-agent): add monitors.yml with env-aware node IP mapping from Ansible inventory
This commit is contained in:
parent
a2e8997711
commit
f742bfdd11
10
health-agent/.env.example
Normal file
10
health-agent/.env.example
Normal file
@ -0,0 +1,10 @@
|
||||
ENV=prod
|
||||
CLUSTER_SIZE_ETCD=3
|
||||
CLUSTER_SIZE_PATRONI=3
|
||||
CLUSTER_SIZE_MONGODB=3
|
||||
CLUSTER_SIZE_RABBITMQ=3
|
||||
CLUSTER_SIZE_VAULT=3
|
||||
REDIS_MODE=sentinel
|
||||
EXTERNAL_DOMAIN=iklim.co
|
||||
EXTERNAL_SUBDOMAIN_SUFFIX=
|
||||
UK_PUSH_URL_BASE=https://status.iklim.co/api/push
|
||||
5
health-agent/.env.setup.example
Normal file
5
health-agent/.env.setup.example
Normal file
@ -0,0 +1,5 @@
|
||||
UK_URL=http://uptime-kuma:3001
|
||||
UK_API_KEY=your_api_key_here
|
||||
UK_SLACK_WEBHOOK_HIGH=https://hooks.slack.com/services/...
|
||||
UK_SLACK_WEBHOOK_MEDIUM=https://hooks.slack.com/services/...
|
||||
UK_SLACK_WEBHOOK_LOW=https://hooks.slack.com/services/...
|
||||
16
health-agent/Dockerfile
Normal file
16
health-agent/Dockerfile
Normal file
@ -0,0 +1,16 @@
|
||||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY pyproject.toml ./
|
||||
COPY src/ ./src/
|
||||
RUN pip install --no-cache-dir .
|
||||
ENV PYTHONPATH=/app/src
|
||||
|
||||
RUN useradd -m appuser
|
||||
# Keeping as root to be able to access /var/run/docker.sock cleanly, unless specifically configured with groups.
|
||||
# USER appuser
|
||||
|
||||
CMD ["python", "src/health_agent/main.py"]
|
||||
196
health-agent/config/monitors.yml
Normal file
196
health-agent/config/monitors.yml
Normal file
@ -0,0 +1,196 @@
|
||||
version: "1"
|
||||
project: "iklim"
|
||||
domain:
|
||||
base: "iklim.co"
|
||||
nodes:
|
||||
prod:
|
||||
service:
|
||||
- name: iklim-app-01
|
||||
ip: "178.104.210.41"
|
||||
- name: iklim-app-02
|
||||
ip: "178.105.69.1"
|
||||
- name: iklim-app-03
|
||||
ip: "178.104.219.3"
|
||||
db:
|
||||
- name: iklim-db-01
|
||||
ip: "159.69.117.158"
|
||||
- name: iklim-db-02
|
||||
ip: "178.104.219.162"
|
||||
- name: iklim-db-03
|
||||
ip: "159.69.115.105"
|
||||
test:
|
||||
service:
|
||||
- name: iklim-app-01
|
||||
ip: "167.235.194.61"
|
||||
db:
|
||||
- name: iklim-db-01
|
||||
ip: "167.235.205.93"
|
||||
tags:
|
||||
- external
|
||||
- internal
|
||||
- high
|
||||
- medium
|
||||
- low
|
||||
- database
|
||||
- gateway
|
||||
- infrastructure
|
||||
- observability
|
||||
notifications:
|
||||
slack-high:
|
||||
type: slack
|
||||
webhook_env: UK_SLACK_WEBHOOK_HIGH
|
||||
slack-medium:
|
||||
type: slack
|
||||
webhook_env: UK_SLACK_WEBHOOK_MEDIUM
|
||||
slack-low:
|
||||
type: slack
|
||||
webhook_env: UK_SLACK_WEBHOOK_LOW
|
||||
groups:
|
||||
- name: "Altyapı"
|
||||
status_page: "iklim-{env}-ops"
|
||||
notifications: [slack-high]
|
||||
tags: [internal, infrastructure]
|
||||
children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS]
|
||||
- name: "Veri Katmanı"
|
||||
status_page: "iklim-{env}-ops"
|
||||
notifications: [slack-high]
|
||||
tags: [internal, database]
|
||||
children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET]
|
||||
- name: "Gateway & Mesajlaşma"
|
||||
status_page: "iklim-{env}-ops"
|
||||
notifications: [slack-high]
|
||||
tags: [internal, gateway]
|
||||
children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL]
|
||||
- name: "Dış Erişilebilirlik - Kritik"
|
||||
status_page: "iklim-{env}-ops"
|
||||
notifications: [slack-high]
|
||||
tags: [external, high]
|
||||
children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03]
|
||||
- name: "Dış Erişilebilirlik - Genel"
|
||||
status_page: "iklim-{env}-ops"
|
||||
notifications: [slack-medium]
|
||||
tags: [external, medium]
|
||||
children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03]
|
||||
- name: "Gözlemlenebilirlik"
|
||||
status_page: "iklim-{env}-tools"
|
||||
notifications: [slack-low]
|
||||
tags: [internal, observability]
|
||||
children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW]
|
||||
push_monitors:
|
||||
- name: SWARM-CLUSTER
|
||||
interval: 60
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, infrastructure, high]
|
||||
restart_threshold: 1
|
||||
- name: VAULT-CLUSTER
|
||||
interval: 60
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, infrastructure, high]
|
||||
restart_threshold: 1
|
||||
- name: ETCD-CLUSTER
|
||||
interval: 60
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, database, high]
|
||||
restart_threshold: 1
|
||||
- name: PATRONI-CLUSTER
|
||||
interval: 60
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, database, high]
|
||||
restart_threshold: 1
|
||||
- name: MONGODB-REPLICASET
|
||||
interval: 120
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, database, high]
|
||||
restart_threshold: 1
|
||||
- name: APISIX-GATEWAY
|
||||
interval: 60
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, gateway, high]
|
||||
restart_threshold: 1
|
||||
- name: RABBITMQ-CLUSTER
|
||||
interval: 60
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, gateway, medium]
|
||||
restart_threshold: 3
|
||||
- name: REDIS-SENTINEL
|
||||
interval: 60
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, database, medium]
|
||||
restart_threshold: 3
|
||||
- name: SWAG-TLS
|
||||
interval: 3600
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, infrastructure, medium]
|
||||
restart_threshold: 3
|
||||
- name: STORAGEBOX-MOUNT
|
||||
interval: 300
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, infrastructure, medium]
|
||||
restart_threshold: 1
|
||||
- name: PROMETHEUS
|
||||
interval: 120
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, observability, low]
|
||||
restart_threshold: 5
|
||||
- name: GRAFANA
|
||||
interval: 120
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, observability, low]
|
||||
restart_threshold: 5
|
||||
- name: PORTAINER
|
||||
interval: 120
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, observability, low]
|
||||
restart_threshold: 5
|
||||
- name: LOKI
|
||||
interval: 120
|
||||
heartbeat_retries: 1
|
||||
tags: [internal, observability, low]
|
||||
restart_threshold: 5
|
||||
http_monitors:
|
||||
- name: EXT-HTTPS-API
|
||||
url: "https://api{suffix}.{domain}/actuator/health"
|
||||
accepted_statuscodes: ["200"]
|
||||
interval: 60
|
||||
- name: EXT-HTTPS-GRAFANA
|
||||
url: "https://grafana{suffix}.{domain}/api/health"
|
||||
accepted_statuscodes: ["200"]
|
||||
interval: 60
|
||||
- name: EXT-HTTPS-PORTAINER
|
||||
url: "https://portainer{suffix}.{domain}"
|
||||
accepted_statuscodes: ["200", "401", "403"]
|
||||
interval: 120
|
||||
- name: EXT-HTTPS-APIGW
|
||||
url: "https://apigw{suffix}.{domain}"
|
||||
accepted_statuscodes: ["200", "401", "403"]
|
||||
interval: 120
|
||||
dns_monitors:
|
||||
- name: EXT-DNS-API
|
||||
hostname: "api{suffix}.{domain}"
|
||||
dns_resolve_type: A
|
||||
interval: 60
|
||||
- name: EXT-DNS-ROOT
|
||||
hostname: "{domain}"
|
||||
dns_resolve_type: A
|
||||
interval: 60
|
||||
ping_monitors:
|
||||
interval: 60
|
||||
max_retries: 1
|
||||
status_pages:
|
||||
- slug: "iklim-{env}-status"
|
||||
title: "iklim.co API Durumu"
|
||||
public: true
|
||||
groups: ["Dış Erişilebilirlik - Kritik"]
|
||||
- slug: "iklim-{env}-ops"
|
||||
title: "iklim.co [{env}] Altyapı"
|
||||
public: false
|
||||
groups:
|
||||
- "Altyapı"
|
||||
- "Veri Katmanı"
|
||||
- "Gateway & Mesajlaşma"
|
||||
- "Dış Erişilebilirlik - Kritik"
|
||||
- "Dış Erişilebilirlik - Genel"
|
||||
- slug: "iklim-{env}-tools"
|
||||
title: "iklim.co [{env}] Araçlar"
|
||||
public: false
|
||||
groups: ["Gözlemlenebilirlik"]
|
||||
22
health-agent/pyproject.toml
Normal file
22
health-agent/pyproject.toml
Normal file
@ -0,0 +1,22 @@
|
||||
[project]
|
||||
name = "health-agent"
|
||||
version = "0.1.0"
|
||||
description = "iklim.co Monitoring Health Agent"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"requests",
|
||||
"docker",
|
||||
"python-dotenv",
|
||||
"pyyaml",
|
||||
"redis",
|
||||
"pymongo",
|
||||
"uptime-kuma-api",
|
||||
"cryptography",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/health_agent"]
|
||||
138
health-agent/scripts/setup_uptime_kuma.py
Normal file
138
health-agent/scripts/setup_uptime_kuma.py
Normal file
@ -0,0 +1,138 @@
|
||||
import os
|
||||
import argparse
|
||||
import yaml
|
||||
import logging
|
||||
from uptime_kuma_api import UptimeKumaApi, MonitorType
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger("uk-setup")
|
||||
|
||||
def format_str(text, env_name, project):
|
||||
if not isinstance(text, str):
|
||||
return text
|
||||
return text.replace("{env}", env_name).replace("{project}", project)
|
||||
|
||||
def setup_uptime_kuma(dry_run=False, only=None):
|
||||
env_name = os.getenv("ENV", "test")
|
||||
|
||||
config_path = os.path.join(os.path.dirname(__file__), "..", "config", "monitors.yml")
|
||||
with open(config_path, "r") as f:
|
||||
config = yaml.safe_load(f)
|
||||
|
||||
project = config.get("project", "iklim")
|
||||
|
||||
kuma_url = os.getenv("UK_URL", "http://localhost:3001")
|
||||
kuma_user = os.getenv("UK_USER", "admin")
|
||||
kuma_pass = os.getenv("UK_PASS", "admin")
|
||||
|
||||
api = None
|
||||
if not dry_run:
|
||||
logger.info(f"Connecting to Uptime Kuma at {kuma_url}...")
|
||||
try:
|
||||
api = UptimeKumaApi(kuma_url)
|
||||
api.login(kuma_user, kuma_pass)
|
||||
except Exception as e:
|
||||
logger.error(f"Login failed: {e}")
|
||||
return
|
||||
|
||||
existing_monitors = {}
|
||||
if api:
|
||||
try:
|
||||
for m in api.get_monitors():
|
||||
existing_monitors[m['name']] = m
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get monitors: {e}")
|
||||
|
||||
# 1. Process Groups
|
||||
group_map = {}
|
||||
for g in config.get("groups", []):
|
||||
raw_name = g["name"]
|
||||
formatted_name = f"{project} [{env_name}] {raw_name}"
|
||||
|
||||
logger.info(f"Processing group: {formatted_name}")
|
||||
if not dry_run:
|
||||
if formatted_name not in existing_monitors:
|
||||
logger.info(f"Creating group monitor: {formatted_name}")
|
||||
res = api.add_monitor(type=MonitorType.GROUP, name=formatted_name)
|
||||
group_map[raw_name] = res['monitorID']
|
||||
else:
|
||||
group_map[raw_name] = existing_monitors[formatted_name]['id']
|
||||
|
||||
tokens = {}
|
||||
|
||||
# 2. Push Monitors
|
||||
for pm in config.get("push_monitors", []):
|
||||
m_name = pm["name"]
|
||||
if only and m_name != only:
|
||||
continue
|
||||
|
||||
m_interval = pm.get("interval", 60)
|
||||
|
||||
parent_group_id = None
|
||||
for g in config.get("groups", []):
|
||||
if m_name in g.get("children", []):
|
||||
parent_group_id = group_map.get(g["name"])
|
||||
break
|
||||
|
||||
logger.info(f"Processing push monitor: {m_name}")
|
||||
if not dry_run:
|
||||
if m_name in existing_monitors:
|
||||
logger.info(f"Monitor {m_name} already exists.")
|
||||
m_id = existing_monitors[m_name]['id']
|
||||
token = existing_monitors[m_name]['pushToken']
|
||||
tokens[m_name] = token
|
||||
|
||||
if parent_group_id and existing_monitors[m_name].get('parent') != parent_group_id:
|
||||
api.edit_monitor(m_id, parent=parent_group_id)
|
||||
else:
|
||||
logger.info(f"Creating push monitor: {m_name}")
|
||||
result = api.add_monitor(
|
||||
type=MonitorType.PUSH,
|
||||
name=m_name,
|
||||
interval=m_interval,
|
||||
parent=parent_group_id
|
||||
)
|
||||
m_id = result['monitorID']
|
||||
|
||||
# Fetch again to get pushToken
|
||||
for m in api.get_monitors():
|
||||
if m['id'] == m_id:
|
||||
tokens[m_name] = m['pushToken']
|
||||
break
|
||||
else:
|
||||
tokens[m_name] = "dummy_token_dry_run"
|
||||
|
||||
# 3. Process Status Pages
|
||||
for sp in config.get("status_pages", []):
|
||||
slug = format_str(sp["slug"], env_name, project)
|
||||
title = format_str(sp["title"], env_name, project)
|
||||
logger.info(f"Processing status page: {title} (slug: {slug})")
|
||||
if not dry_run:
|
||||
try:
|
||||
pages = api.get_status_pages()
|
||||
exists = any(p['slug'] == slug for p in pages)
|
||||
if not exists:
|
||||
logger.info(f"Creating status page: {slug}")
|
||||
api.add_status_page(slug, title)
|
||||
except Exception as e:
|
||||
logger.warning(f"Status page ops failed: {e}")
|
||||
|
||||
# 4. Write tokens to uk_tokens.yml
|
||||
token_file = os.path.join(os.path.dirname(__file__), "..", "config", "uk_tokens.yml")
|
||||
if not dry_run:
|
||||
with open(token_file, "w") as f:
|
||||
yaml.dump(tokens, f)
|
||||
logger.info(f"Saved push tokens to {token_file}")
|
||||
else:
|
||||
logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}")
|
||||
|
||||
if api:
|
||||
api.disconnect()
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Setup Uptime Kuma monitors")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Print actions without making changes")
|
||||
parser.add_argument("--only", type=str, help="Only process a specific monitor by name")
|
||||
args = parser.parse_args()
|
||||
|
||||
setup_uptime_kuma(dry_run=args.dry_run, only=args.only)
|
||||
36
health-agent/src/health_agent/checks/filesystem.py
Normal file
36
health-agent/src/health_agent/checks/filesystem.py
Normal file
@ -0,0 +1,36 @@
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from health_agent.uptime_kuma import push
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def check_storagebox_mount():
|
||||
start_t = time.time()
|
||||
|
||||
storagebox_path = os.getenv("STORAGEBOX_PATH", "/mnt/storagebox")
|
||||
expected_files = [
|
||||
"patroni/patroni.yml",
|
||||
"ssl/STAR.iklim.co.full.crt"
|
||||
]
|
||||
|
||||
missing_files = []
|
||||
|
||||
if not os.path.exists(storagebox_path):
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms)
|
||||
return
|
||||
|
||||
for rel_path in expected_files:
|
||||
full_path = os.path.join(storagebox_path, rel_path)
|
||||
if not os.path.exists(full_path):
|
||||
missing_files.append(rel_path)
|
||||
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
if missing_files:
|
||||
msg = f"mount exists but missing: {', '.join(missing_files)}"
|
||||
push("STORAGEBOX-MOUNT", "down", msg, ping_ms)
|
||||
else:
|
||||
msg = f"{storagebox_path} OK | all critical files present"
|
||||
push("STORAGEBOX-MOUNT", "up", msg, ping_ms)
|
||||
196
health-agent/src/health_agent/checks/http.py
Normal file
196
health-agent/src/health_agent/checks/http.py
Normal file
@ -0,0 +1,196 @@
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import requests
|
||||
from requests.auth import HTTPBasicAuth
|
||||
from health_agent.uptime_kuma import push
|
||||
import urllib3
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def http_check(url, expected_status=None, auth=None, verify_ssl=True, timeout=5, headers=None):
|
||||
start_time = time.time()
|
||||
try:
|
||||
resp = requests.get(url, auth=auth, verify=verify_ssl, timeout=timeout, headers=headers)
|
||||
ping_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
if expected_status:
|
||||
if isinstance(expected_status, list):
|
||||
is_ok = resp.status_code in expected_status
|
||||
else:
|
||||
is_ok = resp.status_code == expected_status
|
||||
else:
|
||||
is_ok = resp.status_code < 400
|
||||
|
||||
return is_ok, resp, ping_ms, None
|
||||
except Exception as e:
|
||||
ping_ms = int((time.time() - start_time) * 1000)
|
||||
return False, None, ping_ms, str(e)
|
||||
|
||||
def check_patroni_cluster():
|
||||
nodes = ["patroni-01", "patroni-02", "patroni-03"]
|
||||
cluster_data = None
|
||||
error_msg = "All Patroni nodes unreachable"
|
||||
start_t = time.time()
|
||||
|
||||
for node in nodes:
|
||||
url = f"http://{node}:8008/cluster"
|
||||
ok, resp, _, err = http_check(url, timeout=3)
|
||||
if ok and resp:
|
||||
cluster_data = resp.json()
|
||||
break
|
||||
elif err:
|
||||
error_msg = f"{node} error: {err}"
|
||||
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
if not cluster_data:
|
||||
push("PATRONI-CLUSTER", "down", error_msg, ping_ms)
|
||||
return
|
||||
|
||||
members = cluster_data.get("members", [])
|
||||
leader = None
|
||||
replicas = []
|
||||
|
||||
for m in members:
|
||||
if m.get("role") == "leader":
|
||||
leader = m.get("name")
|
||||
else:
|
||||
lag = m.get("lag", 0)
|
||||
name = m.get("name")
|
||||
state = m.get("state")
|
||||
replicas.append((name, lag, state))
|
||||
|
||||
if not leader:
|
||||
down_nodes = [f"{r[0]} state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")]
|
||||
msg = f"no leader detected | " + " ".join(down_nodes)
|
||||
push("PATRONI-CLUSTER", "down", msg, ping_ms)
|
||||
else:
|
||||
lag_strs = []
|
||||
for name, lag, state in replicas:
|
||||
lag_mb = lag / (1024*1024) if isinstance(lag, (int, float)) else 0
|
||||
lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)")
|
||||
|
||||
msg = f"leader: {leader} | replicas: " + " ".join(lag_strs)
|
||||
push("PATRONI-CLUSTER", "up", msg, ping_ms)
|
||||
|
||||
def check_rabbitmq_cluster():
|
||||
url = "http://rabbitmq:15672/api/healthchecks/node"
|
||||
user = os.getenv("RABBITMQ_USER", "guest")
|
||||
password = os.getenv("RABBITMQ_PASS", "guest")
|
||||
auth = HTTPBasicAuth(user, password)
|
||||
|
||||
ok, resp, ping_ms, err = http_check(url, auth=auth)
|
||||
|
||||
if ok:
|
||||
ok2, resp2, _, _ = http_check("http://rabbitmq:15672/api/nodes", auth=auth)
|
||||
nodes_running = 0
|
||||
total_nodes = 3
|
||||
|
||||
if ok2 and resp2:
|
||||
data = resp2.json()
|
||||
nodes_running = len([n for n in data if n.get("running")])
|
||||
total_nodes = len(data)
|
||||
|
||||
alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")]
|
||||
if alarms:
|
||||
msg = f"disk/mem alarm active on {','.join(alarms)}"
|
||||
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
|
||||
return
|
||||
|
||||
msg = f"{nodes_running}/{total_nodes} nodes running"
|
||||
push("RABBITMQ-CLUSTER", "up", msg, ping_ms)
|
||||
else:
|
||||
msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}"
|
||||
push("RABBITMQ-CLUSTER", "down", msg, ping_ms)
|
||||
|
||||
def check_apisix():
|
||||
url = "http://apisix:9180/apisix/admin/routes"
|
||||
api_key = os.getenv("APISIX_ADMIN_KEY", "")
|
||||
headers = {"X-API-KEY": api_key} if api_key else {}
|
||||
ok, resp, ping_ms, err = http_check(url, headers=headers)
|
||||
|
||||
if ok:
|
||||
push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms)
|
||||
else:
|
||||
push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms)
|
||||
|
||||
def check_vault():
|
||||
nodes = ["vault-1", "vault-2", "vault-3"]
|
||||
domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
|
||||
unsealed_count = 0
|
||||
total = len(nodes)
|
||||
max_ping = 0
|
||||
errors = []
|
||||
|
||||
start_t = time.time()
|
||||
for node in nodes:
|
||||
url = f"https://{node}.{domain}:8200/v1/sys/health"
|
||||
ok, resp, ms, err = http_check(url, verify_ssl=False, expected_status=[200, 429, 473])
|
||||
max_ping = max(max_ping, ms)
|
||||
|
||||
if resp:
|
||||
data = resp.json()
|
||||
if not data.get("sealed"):
|
||||
unsealed_count += 1
|
||||
else:
|
||||
errors.append(f"{node} SEALED")
|
||||
else:
|
||||
errors.append(f"{node} unreachable")
|
||||
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
if unsealed_count == total:
|
||||
msg = f"{unsealed_count}/{total} unsealed"
|
||||
push("VAULT-CLUSTER", "up", msg, ping_ms)
|
||||
else:
|
||||
msg = " | ".join(errors) if errors else "Vault checks failed"
|
||||
push("VAULT-CLUSTER", "down", msg, ping_ms)
|
||||
|
||||
def check_prometheus():
|
||||
url = "http://prometheus:9090/-/healthy"
|
||||
ok, resp, ping_ms, err = http_check(url)
|
||||
if ok:
|
||||
push("PROMETHEUS", "up", "healthy", ping_ms)
|
||||
else:
|
||||
push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms)
|
||||
|
||||
def check_grafana():
|
||||
url = "http://grafana:3000/api/health"
|
||||
ok, resp, ping_ms, err = http_check(url)
|
||||
if ok and resp:
|
||||
data = resp.json()
|
||||
db_status = data.get("database", "unknown")
|
||||
if db_status == "ok":
|
||||
push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms)
|
||||
else:
|
||||
push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms)
|
||||
else:
|
||||
push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms)
|
||||
|
||||
def check_portainer():
|
||||
url = "http://portainer:9000/api/system/status"
|
||||
ok, resp, ping_ms, err = http_check(url)
|
||||
if ok:
|
||||
push("PORTAINER", "up", "running", ping_ms)
|
||||
else:
|
||||
push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms)
|
||||
|
||||
def check_loki():
|
||||
url = "http://loki:3100/ready"
|
||||
ok, resp, ping_ms, err = http_check(url)
|
||||
if ok:
|
||||
push("LOKI", "up", "ready", ping_ms)
|
||||
else:
|
||||
push("LOKI", "down", f"loki unreachable: {err}", ping_ms)
|
||||
|
||||
def run_all_http_checks():
|
||||
check_patroni_cluster()
|
||||
check_rabbitmq_cluster()
|
||||
check_apisix()
|
||||
check_vault()
|
||||
check_prometheus()
|
||||
check_grafana()
|
||||
check_portainer()
|
||||
check_loki()
|
||||
57
health-agent/src/health_agent/checks/mongodb.py
Normal file
57
health-agent/src/health_agent/checks/mongodb.py
Normal file
@ -0,0 +1,57 @@
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from pymongo import MongoClient
|
||||
from health_agent.uptime_kuma import push
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def check_mongodb():
|
||||
start_t = time.time()
|
||||
|
||||
mongo_uri = os.getenv("MONGO_URI", "mongodb://mongodb-01:27017,mongodb-02:27017,mongodb-03:27017/?replicaSet=rs0")
|
||||
cluster_size = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
|
||||
|
||||
try:
|
||||
with MongoClient(mongo_uri, serverSelectionTimeoutMS=3000) as client:
|
||||
status = client.admin.command('replSetGetStatus')
|
||||
|
||||
members = status.get('members', [])
|
||||
|
||||
primary = None
|
||||
secondaries = []
|
||||
|
||||
for m in members:
|
||||
state_str = m.get('stateStr', '')
|
||||
name = m.get('name', 'unknown')
|
||||
|
||||
if state_str == 'PRIMARY':
|
||||
primary = name
|
||||
elif state_str == 'SECONDARY':
|
||||
secondaries.append((name, state_str))
|
||||
else:
|
||||
secondaries.append((name, state_str))
|
||||
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
if cluster_size == 1:
|
||||
push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms)
|
||||
return
|
||||
|
||||
if primary:
|
||||
sec_strs = [f"{s[0]} ({s[1]})" for s in secondaries]
|
||||
msg = f"PRIMARY: {primary} | secondaries: {' '.join(sec_strs)}"
|
||||
|
||||
unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')]
|
||||
if unhealthy_secs:
|
||||
msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}"
|
||||
push("MONGODB-REPLICASET", "down", msg, ping_ms)
|
||||
else:
|
||||
push("MONGODB-REPLICASET", "up", msg, ping_ms)
|
||||
else:
|
||||
msg = "no PRIMARY | quorum lost"
|
||||
push("MONGODB-REPLICASET", "down", msg, ping_ms)
|
||||
|
||||
except Exception as e:
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms)
|
||||
43
health-agent/src/health_agent/checks/redis_sentinel.py
Normal file
43
health-agent/src/health_agent/checks/redis_sentinel.py
Normal file
@ -0,0 +1,43 @@
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from redis.sentinel import Sentinel
|
||||
from health_agent.uptime_kuma import push
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def check_redis_sentinel():
|
||||
start_t = time.time()
|
||||
|
||||
hosts = os.getenv("REDIS_SENTINEL_HOSTS", "redis-sentinel-01,redis-sentinel-02,redis-sentinel-03")
|
||||
sentinel_nodes = [(h.strip(), 26379) for h in hosts.split(",")]
|
||||
|
||||
master_name = os.getenv("REDIS_MASTER_NAME", "prod-master")
|
||||
password = os.getenv("REDIS_PASSWORD", None)
|
||||
redis_mode = os.getenv("REDIS_MODE", "sentinel")
|
||||
|
||||
if redis_mode != "sentinel":
|
||||
push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000))
|
||||
return
|
||||
|
||||
try:
|
||||
sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password)
|
||||
|
||||
# Master ping
|
||||
master = sentinel.master_for(master_name, socket_timeout=3, password=password)
|
||||
master.ping()
|
||||
master_ip, master_port = sentinel.discover_master(master_name)
|
||||
master.connection_pool.disconnect()
|
||||
|
||||
# Get replicas count
|
||||
slaves = sentinel.discover_slaves(master_name)
|
||||
replicas_count = len(slaves)
|
||||
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK"
|
||||
push("REDIS-SENTINEL", "up", msg, ping_ms)
|
||||
|
||||
except Exception as e:
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms)
|
||||
49
health-agent/src/health_agent/checks/swarm.py
Normal file
49
health-agent/src/health_agent/checks/swarm.py
Normal file
@ -0,0 +1,49 @@
|
||||
import time
|
||||
import docker
|
||||
import logging
|
||||
from health_agent.uptime_kuma import push
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def check_swarm_cluster():
|
||||
start_time = time.time()
|
||||
try:
|
||||
client = docker.from_env()
|
||||
nodes = client.nodes.list()
|
||||
|
||||
ready_nodes = []
|
||||
managers = []
|
||||
|
||||
for node in nodes:
|
||||
spec = node.attrs.get('Spec', {})
|
||||
status = node.attrs.get('Status', {})
|
||||
manager_status = node.attrs.get('ManagerStatus', {})
|
||||
|
||||
node_name = spec.get('Name', node.id)
|
||||
is_ready = status.get('State') == 'ready'
|
||||
is_manager = spec.get('Role') == 'manager'
|
||||
|
||||
if is_ready:
|
||||
ready_nodes.append(node_name)
|
||||
|
||||
if is_manager:
|
||||
reachability = manager_status.get('Reachability')
|
||||
if reachability == 'reachable':
|
||||
managers.append(node_name)
|
||||
|
||||
total_nodes = len(nodes)
|
||||
ready_count = len(ready_nodes)
|
||||
|
||||
ping_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
if ready_count == total_nodes:
|
||||
msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})"
|
||||
push("SWARM-CLUSTER", "up", msg, ping_ms)
|
||||
else:
|
||||
msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}"
|
||||
push("SWARM-CLUSTER", "down", msg, ping_ms)
|
||||
|
||||
except Exception as e:
|
||||
ping_ms = int((time.time() - start_time) * 1000)
|
||||
logger.error(f"Swarm check failed: {e}")
|
||||
push("SWARM-CLUSTER", "down", str(e), ping_ms)
|
||||
77
health-agent/src/health_agent/checks/tcp.py
Normal file
77
health-agent/src/health_agent/checks/tcp.py
Normal file
@ -0,0 +1,77 @@
|
||||
import socket
|
||||
import time
|
||||
import logging
|
||||
import requests
|
||||
from health_agent.uptime_kuma import push
|
||||
from health_agent.checks.http import http_check
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def tcp_check(host, port, timeout=3):
|
||||
start_time = time.time()
|
||||
try:
|
||||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
sock.settimeout(timeout)
|
||||
result = sock.connect_ex((host, port))
|
||||
sock.close()
|
||||
ping_ms = int((time.time() - start_time) * 1000)
|
||||
if result == 0:
|
||||
return True, ping_ms, None
|
||||
else:
|
||||
return False, ping_ms, f"Port {port} is closed or unreachable"
|
||||
except Exception as e:
|
||||
ping_ms = int((time.time() - start_time) * 1000)
|
||||
return False, ping_ms, str(e)
|
||||
|
||||
def check_etcd_cluster():
|
||||
nodes = ["etcd-01", "etcd-02", "etcd-03"]
|
||||
start_t = time.time()
|
||||
|
||||
healthy_count = 0
|
||||
leader = None
|
||||
errors = []
|
||||
|
||||
for node in nodes:
|
||||
# 1. TCP Check on 2379
|
||||
tcp_ok, ms, tcp_err = tcp_check(node, 2379)
|
||||
if not tcp_ok:
|
||||
errors.append(f"{node} port 2379 unreachable")
|
||||
continue
|
||||
|
||||
# 2. HTTP Health check
|
||||
url = f"http://{node}:2379/health"
|
||||
http_ok, resp, ms, http_err = http_check(url, timeout=3)
|
||||
|
||||
if http_ok and resp:
|
||||
data = resp.json()
|
||||
if data.get("health") == "true":
|
||||
healthy_count += 1
|
||||
else:
|
||||
errors.append(f"{node} unhealthy")
|
||||
else:
|
||||
errors.append(f"{node} health endpoint unreachable")
|
||||
|
||||
# 3. Leader check from /v3/maintenance/status
|
||||
if not leader and tcp_ok:
|
||||
status_url = f"http://{node}:2379/v3/maintenance/status"
|
||||
try:
|
||||
r = requests.post(status_url, json={}, timeout=3)
|
||||
if r.status_code == 200:
|
||||
status_data = r.json()
|
||||
leader_id = status_data.get("leader")
|
||||
header_member_id = status_data.get("header", {}).get("member_id")
|
||||
if leader_id and leader_id == header_member_id:
|
||||
leader = node
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
|
||||
if healthy_count == len(nodes):
|
||||
leader_info = f" | leader: {leader}" if leader else ""
|
||||
msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}"
|
||||
push("ETCD-CLUSTER", "up", msg, ping_ms)
|
||||
else:
|
||||
quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else ""
|
||||
msg = " | ".join(errors) + quorum_msg
|
||||
push("ETCD-CLUSTER", "down", msg, ping_ms)
|
||||
62
health-agent/src/health_agent/checks/tls.py
Normal file
62
health-agent/src/health_agent/checks/tls.py
Normal file
@ -0,0 +1,62 @@
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import requests
|
||||
from datetime import datetime, timezone
|
||||
from health_agent.uptime_kuma import push
|
||||
from cryptography import x509
|
||||
from cryptography.hazmat.backends import default_backend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def check_swag_tls():
|
||||
start_t = time.time()
|
||||
cert_path = "/mnt/storagebox/ssl/STAR.iklim.co.full.crt"
|
||||
domain = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
|
||||
suffix = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
|
||||
target_url = f"https://api{suffix}.{domain}/actuator/health"
|
||||
|
||||
msg_parts = []
|
||||
is_down = False
|
||||
|
||||
# 1. Check cert file
|
||||
if not os.path.exists(cert_path):
|
||||
is_down = True
|
||||
msg_parts.append("cert file missing on storagebox")
|
||||
else:
|
||||
try:
|
||||
with open(cert_path, "rb") as f:
|
||||
cert_data = f.read()
|
||||
cert = x509.load_pem_x509_certificate(cert_data, default_backend())
|
||||
not_valid_after = cert.not_valid_after_utc
|
||||
now = datetime.now(timezone.utc)
|
||||
days_left = (not_valid_after - now).days
|
||||
|
||||
if days_left < 14:
|
||||
is_down = True
|
||||
msg_parts.append(f"cert expires in {days_left} days")
|
||||
else:
|
||||
msg_parts.append(f"cert valid until {not_valid_after.strftime('%Y-%m-%d')} ({days_left} days)")
|
||||
except Exception as e:
|
||||
is_down = True
|
||||
msg_parts.append(f"cert parse error: {e}")
|
||||
|
||||
# 2. Check external HTTPS reachable
|
||||
try:
|
||||
r = requests.get(target_url, timeout=5, verify=False)
|
||||
if r.status_code < 500:
|
||||
msg_parts.append("HTTPS reachable")
|
||||
else:
|
||||
is_down = True
|
||||
msg_parts.append(f"HTTPS returned {r.status_code}")
|
||||
except Exception as e:
|
||||
is_down = True
|
||||
msg_parts.append(f"HTTPS unreachable")
|
||||
|
||||
ping_ms = int((time.time() - start_t) * 1000)
|
||||
msg = " | ".join(msg_parts)
|
||||
|
||||
if is_down:
|
||||
push("SWAG-TLS", "down", msg, ping_ms)
|
||||
else:
|
||||
push("SWAG-TLS", "up", msg, ping_ms)
|
||||
25
health-agent/src/health_agent/config.py
Normal file
25
health-agent/src/health_agent/config.py
Normal file
@ -0,0 +1,25 @@
|
||||
import os
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
ENV = os.getenv("ENV", "prod")
|
||||
CLUSTER_SIZE_ETCD = int(os.getenv("CLUSTER_SIZE_ETCD", "3"))
|
||||
CLUSTER_SIZE_PATRONI = int(os.getenv("CLUSTER_SIZE_PATRONI", "3"))
|
||||
CLUSTER_SIZE_MONGODB = int(os.getenv("CLUSTER_SIZE_MONGODB", "3"))
|
||||
CLUSTER_SIZE_RABBITMQ = int(os.getenv("CLUSTER_SIZE_RABBITMQ", "3"))
|
||||
CLUSTER_SIZE_VAULT = int(os.getenv("CLUSTER_SIZE_VAULT", "3"))
|
||||
REDIS_MODE = os.getenv("REDIS_MODE", "sentinel")
|
||||
EXTERNAL_DOMAIN = os.getenv("EXTERNAL_DOMAIN", "iklim.co")
|
||||
EXTERNAL_SUBDOMAIN_SUFFIX = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "")
|
||||
|
||||
def load_uk_tokens():
|
||||
token_file = Path("config/generated/uk_tokens.yml")
|
||||
if not token_file.exists():
|
||||
return {}
|
||||
with open(token_file, "r") as f:
|
||||
return yaml.safe_load(f) or {}
|
||||
|
||||
UK_TOKENS = load_uk_tokens()
|
||||
56
health-agent/src/health_agent/events/docker_events.py
Normal file
56
health-agent/src/health_agent/events/docker_events.py
Normal file
@ -0,0 +1,56 @@
|
||||
import os
|
||||
import docker
|
||||
import threading
|
||||
import logging
|
||||
import time
|
||||
from health_agent.slack import notify
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def parse_and_notify(event):
|
||||
attrs = event.get('Actor', {}).get('Attributes', {})
|
||||
container_name = attrs.get('name', 'unknown')
|
||||
exit_code = attrs.get('exitCode', '0')
|
||||
|
||||
if exit_code == '0':
|
||||
return
|
||||
|
||||
is_oom = (exit_code == '137')
|
||||
|
||||
env = os.getenv("ENV", "test").upper()
|
||||
webhook_env_name = f"SLACK_WEBHOOK_IKLIM_{env}_OPS"
|
||||
|
||||
priority = "High" if is_oom else "Medium"
|
||||
title = f"[Health Agent / Events] Container Crashed ({container_name})"
|
||||
|
||||
detail = f"Container: {container_name}\nExit Code: {exit_code}"
|
||||
if is_oom:
|
||||
detail += "\nReason: OOM Killed (Out Of Memory) or SIGKILL"
|
||||
|
||||
notify(
|
||||
webhook_env=webhook_env_name,
|
||||
source="health-agent-events",
|
||||
priority=priority,
|
||||
title=title,
|
||||
detail=detail
|
||||
)
|
||||
|
||||
def event_listener_loop():
|
||||
while True:
|
||||
try:
|
||||
client = docker.from_env()
|
||||
logger.info("Starting Docker event listener...")
|
||||
filters = {"type": "container", "event": "die"}
|
||||
for event in client.events(decode=True, filters=filters):
|
||||
try:
|
||||
parse_and_notify(event)
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing event: {e}", exc_info=True)
|
||||
except Exception as e:
|
||||
logger.error(f"Docker event listener error: {e}. Reconnecting in 10s...", exc_info=True)
|
||||
time.sleep(10)
|
||||
|
||||
def start_docker_event_listener():
|
||||
thread = threading.Thread(target=event_listener_loop, daemon=True)
|
||||
thread.start()
|
||||
return thread
|
||||
75
health-agent/src/health_agent/main.py
Normal file
75
health-agent/src/health_agent/main.py
Normal file
@ -0,0 +1,75 @@
|
||||
import time
|
||||
import logging
|
||||
from health_agent.checks import swarm
|
||||
from health_agent.checks.http import run_all_http_checks
|
||||
from health_agent.checks.tcp import check_etcd_cluster
|
||||
from health_agent.checks.tls import check_swag_tls
|
||||
from health_agent.checks.redis_sentinel import check_redis_sentinel
|
||||
from health_agent.checks.mongodb import check_mongodb
|
||||
from health_agent.checks.filesystem import check_storagebox_mount
|
||||
from health_agent.events.docker_events import start_docker_event_listener
|
||||
import json
|
||||
|
||||
class JSONFormatter(logging.Formatter):
|
||||
def format(self, record):
|
||||
log_obj = {
|
||||
"time": self.formatTime(record, self.datefmt),
|
||||
"level": record.levelname,
|
||||
"logger": record.name,
|
||||
"msg": record.getMessage()
|
||||
}
|
||||
for attr in ['check', 'status', 'ping_ms', 'source', 'error']:
|
||||
if hasattr(record, attr):
|
||||
log_obj[attr] = getattr(record, attr)
|
||||
if record.exc_info:
|
||||
log_obj['exc_info'] = self.formatException(record.exc_info)
|
||||
return json.dumps(log_obj)
|
||||
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(JSONFormatter())
|
||||
logging.basicConfig(level=logging.INFO, handlers=[handler])
|
||||
logger = logging.getLogger("main")
|
||||
|
||||
def run_checks():
|
||||
logger.info("Running health checks...")
|
||||
try:
|
||||
swarm.check_swarm_cluster()
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking Swarm cluster: {e}")
|
||||
|
||||
try:
|
||||
run_all_http_checks()
|
||||
except Exception as e:
|
||||
logger.error(f"Error running HTTP checks: {e}")
|
||||
|
||||
try:
|
||||
check_etcd_cluster()
|
||||
except Exception as e:
|
||||
logger.error(f"Error running etcd checks: {e}")
|
||||
|
||||
try:
|
||||
check_swag_tls()
|
||||
except Exception as e:
|
||||
logger.error(f"Error running TLS checks: {e}")
|
||||
|
||||
try:
|
||||
check_redis_sentinel()
|
||||
except Exception as e:
|
||||
logger.error(f"Error running Redis checks: {e}")
|
||||
|
||||
try:
|
||||
check_mongodb()
|
||||
except Exception as e:
|
||||
logger.error(f"Error running MongoDB checks: {e}")
|
||||
|
||||
try:
|
||||
check_storagebox_mount()
|
||||
except Exception as e:
|
||||
logger.error(f"Error running filesystem checks: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger.info("Starting health-agent...")
|
||||
start_docker_event_listener()
|
||||
while True:
|
||||
run_checks()
|
||||
time.sleep(60)
|
||||
22
health-agent/src/health_agent/slack.py
Normal file
22
health-agent/src/health_agent/slack.py
Normal file
@ -0,0 +1,22 @@
|
||||
import os
|
||||
import requests
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def notify(webhook_env: str, source: str, priority: str, title: str, detail: str):
|
||||
webhook_url = os.getenv(webhook_env)
|
||||
if not webhook_url:
|
||||
logger.warning(f"Slack webhook url not found for {webhook_env}")
|
||||
return
|
||||
|
||||
payload = {
|
||||
"text": f"*{title}*\n*Source:* {source}\n*Priority:* {priority}\n```\n{detail}\n```"
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(webhook_url, json=payload, timeout=5)
|
||||
response.raise_for_status()
|
||||
logger.info(f"Sent Slack notification to {webhook_env}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send Slack notification: {e}")
|
||||
19
health-agent/src/health_agent/state.py
Normal file
19
health-agent/src/health_agent/state.py
Normal file
@ -0,0 +1,19 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
STATE_FILE = Path("config/generated/state.json")
|
||||
|
||||
def load_state():
|
||||
if not STATE_FILE.exists():
|
||||
return {}
|
||||
try:
|
||||
with open(STATE_FILE, "r") as f:
|
||||
return json.load(f)
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
def save_state(state):
|
||||
STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(STATE_FILE, "w") as f:
|
||||
json.dump(state, f)
|
||||
27
health-agent/src/health_agent/uptime_kuma.py
Normal file
27
health-agent/src/health_agent/uptime_kuma.py
Normal file
@ -0,0 +1,27 @@
|
||||
import os
|
||||
import requests
|
||||
import logging
|
||||
from health_agent.config import UK_TOKENS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
UK_PUSH_URL_BASE = os.getenv("UK_PUSH_URL_BASE", "https://status.iklim.co/api/push")
|
||||
|
||||
def push(monitor_name: str, status: str, msg: str, ping_ms: int):
|
||||
token = UK_TOKENS.get(monitor_name)
|
||||
if not token:
|
||||
logger.warning(f"No token found for monitor {monitor_name}")
|
||||
return
|
||||
|
||||
url = f"{UK_PUSH_URL_BASE}/{token}"
|
||||
params = {
|
||||
"status": status,
|
||||
"msg": msg,
|
||||
"ping": int(ping_ms)
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.get(url, params=params, timeout=10)
|
||||
response.raise_for_status()
|
||||
logger.info(f"Pushed {monitor_name} status={status}", extra={"check": monitor_name, "status": status, "msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to push {monitor_name}: {e}", extra={"check": monitor_name, "status": "push_failed", "error": str(e), "source": "uptime_kuma"})
|
||||
Loading…
x
Reference in New Issue
Block a user