fix(health-agent): fix ping maxretries param and status page group lookup

Fix the ping monitor creation error: 'max_retries' is not a valid uptime-kuma-api parameter; the correct name is 'maxretries'. Fix status pages never linking groups: calling get_monitors() again right after add_monitor() races with WebSocket delivery, so newly created groups are missing from the result; use the group_map populated in Section 1 directly instead.
This commit is contained in:
Murat ÖZDEMİR 2026-06-26 21:07:11 +03:00
parent 95dd439a34
commit 8b10653ff4

View File

@ -103,7 +103,7 @@ def setup_uptime_kuma(dry_run=False, only=None):
type=NotificationType.SLACK,
name=notif_name,
isDefault=False,
webhookURL=webhook_url,
slackwebhookURL=webhook_url,
applyExisting=False
)
notification_map[notif_key] = res.get('id')
@ -138,6 +138,7 @@ def setup_uptime_kuma(dry_run=False, only=None):
group_map[raw_name] = existing_monitors[formatted_name]['id']
tokens = {}
new_monitor_ids = {} # m_name -> monitorID for monitors created in this run
# 2. Push Monitors
for pm in config.get("push_monitors", []):
@ -169,15 +170,24 @@ def setup_uptime_kuma(dry_run=False, only=None):
if notif_ids:
kwargs["notification_id_list"] = notif_ids
result = api.add_monitor(**kwargs)
m_id = result['monitorID']
for m in api.get_monitors():
if m['id'] == m_id:
tokens[m_name] = m['pushToken']
break
new_monitor_ids[m_name] = result['monitorID']
else:
tokens[m_name] = "dummy_token_dry_run"
# Fetch push tokens for newly created monitors in one batch call.
# Calling api.get_monitors() per-monitor races with WebSocket event delivery;
# a single call after all creates allows the server state to settle.
if new_monitor_ids and api:
id_to_name = {v: k for k, v in new_monitor_ids.items()}
for m in api.get_monitors():
if m['id'] in id_to_name:
m_name = id_to_name[m['id']]
tokens[m_name] = m.get('pushToken', '')
logger.info(f"Captured push token for {m_name}")
missing = [n for n in new_monitor_ids if n not in tokens]
if missing:
logger.warning(f"Could not capture push token for: {missing}")
# 3. HTTP Monitors
for hm in config.get("http_monitors", []):
m_name = hm["name"]
@ -269,7 +279,7 @@ def setup_uptime_kuma(dry_run=False, only=None):
"name": m_name,
"hostname": ip,
"interval": ping_interval,
"max_retries": ping_retries,
"maxretries": ping_retries,
}
if parent_group_id is not None:
kwargs["parent"] = parent_group_id
@ -299,7 +309,7 @@ def setup_uptime_kuma(dry_run=False, only=None):
"name": m_name,
"hostname": ip,
"interval": ping_interval,
"max_retries": ping_retries,
"maxretries": ping_retries,
}
if parent_group_id is not None:
kwargs["parent"] = parent_group_id
@ -312,13 +322,6 @@ def setup_uptime_kuma(dry_run=False, only=None):
# 6. Status Pages
if api:
all_monitors = {}
try:
for m in api.get_monitors():
all_monitors[m['name']] = m
except Exception as e:
logger.warning(f"Failed to re-fetch monitors for status pages: {e}")
existing_pages = {}
try:
for p in api.get_status_pages():
@ -339,18 +342,18 @@ def setup_uptime_kuma(dry_run=False, only=None):
api.add_status_page(slug, title)
# Each monitors.yml group becomes one display section on the status page.
# The GROUP monitor is added so Uptime Kuma renders it with all its children.
# Use group_map (populated during Section 1) to avoid re-fetching monitors;
# a fresh get_monitors() call after add_monitor() races with WebSocket delivery.
public_group_list = []
for group_raw_name in sp_groups:
group_formatted = f"{project} [{env_name}] {group_raw_name}"
group_monitor = all_monitors.get(group_formatted)
if not group_monitor:
logger.warning(f"Group '{group_formatted}' not found, skipping in status page")
group_id = group_map.get(group_raw_name)
if not group_id:
logger.warning(f"Group '{group_raw_name}' not in group_map, skipping in status page")
continue
public_group_list.append({
"name": group_raw_name,
"weight": len(public_group_list) + 1,
"monitorList": [{"id": group_monitor['id']}]
"monitorList": [{"id": group_id}]
})
if public_group_list: