fix(health-agent): fix ping maxretries param and status page group lookup

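Fix the ping monitor creation error: 'max_retries' is not a valid uptime-kuma-api parameter; the correct name is 'maxretries'. Fix status pages never linking groups: re-fetching get_monitors() after add_monitor() races with WebSocket delivery, so newly created groups were missing from the result; use the group_map populated in Section 1 directly instead. Also pass the Slack webhook as 'slackwebhookURL' rather than 'webhookURL', and fetch push tokens for newly created push monitors with a single get_monitors() call after all creates instead of one call per monitor.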
2026-06-26 21:07:11 +03:00 · 2026-06-26 21:07:11 +03:00 · 8b10653ff4
commit 8b10653ff4
parent 95dd439a34
1 changed files with 25 additions and 22 deletions
--- a/health-agent/scripts/setup_uptime_kuma.py
+++ b/health-agent/scripts/setup_uptime_kuma.py
@@ -103,7 +103,7 @@ def setup_uptime_kuma(dry_run=False, only=None):
                        type=NotificationType.SLACK,
                        name=notif_name,
                        isDefault=False,
-                        webhookURL=webhook_url,
+                        slackwebhookURL=webhook_url,
                        applyExisting=False
                    )
                    notification_map[notif_key] = res.get('id')
@@ -138,6 +138,7 @@ def setup_uptime_kuma(dry_run=False, only=None):
                group_map[raw_name] = existing_monitors[formatted_name]['id']

    tokens = {}
+    new_monitor_ids = {}  # m_name -> monitorID for monitors created in this run

    # 2. Push Monitors
    for pm in config.get("push_monitors", []):
@@ -169,15 +170,24 @@ def setup_uptime_kuma(dry_run=False, only=None):
                if notif_ids:
                    kwargs["notification_id_list"] = notif_ids
                result = api.add_monitor(**kwargs)
-                m_id = result['monitorID']
-
-                for m in api.get_monitors():
-                    if m['id'] == m_id:
-                        tokens[m_name] = m['pushToken']
-                        break
+                new_monitor_ids[m_name] = result['monitorID']
        else:
            tokens[m_name] = "dummy_token_dry_run"

+    # Fetch push tokens for newly created monitors in one batch call.
+    # Calling api.get_monitors() per-monitor races with WebSocket event delivery;
+    # a single call after all creates allows the server state to settle.
+    if new_monitor_ids and api:
+        id_to_name = {v: k for k, v in new_monitor_ids.items()}
+        for m in api.get_monitors():
+            if m['id'] in id_to_name:
+                m_name = id_to_name[m['id']]
+                tokens[m_name] = m.get('pushToken', '')
+                logger.info(f"Captured push token for {m_name}")
+        missing = [n for n in new_monitor_ids if n not in tokens]
+        if missing:
+            logger.warning(f"Could not capture push token for: {missing}")
+
    # 3. HTTP Monitors
    for hm in config.get("http_monitors", []):
        m_name = hm["name"]
@@ -269,7 +279,7 @@ def setup_uptime_kuma(dry_run=False, only=None):
                        "name": m_name,
                        "hostname": ip,
                        "interval": ping_interval,
-                        "max_retries": ping_retries,
+                        "maxretries": ping_retries,
                    }
                    if parent_group_id is not None:
                        kwargs["parent"] = parent_group_id
@@ -299,7 +309,7 @@ def setup_uptime_kuma(dry_run=False, only=None):
                        "name": m_name,
                        "hostname": ip,
                        "interval": ping_interval,
-                        "max_retries": ping_retries,
+                        "maxretries": ping_retries,
                    }
                    if parent_group_id is not None:
                        kwargs["parent"] = parent_group_id
@@ -312,13 +322,6 @@ def setup_uptime_kuma(dry_run=False, only=None):

    # 6. Status Pages
    if api:
-        all_monitors = {}
-        try:
-            for m in api.get_monitors():
-                all_monitors[m['name']] = m
-        except Exception as e:
-            logger.warning(f"Failed to re-fetch monitors for status pages: {e}")
-
        existing_pages = {}
        try:
            for p in api.get_status_pages():
@@ -339,18 +342,18 @@ def setup_uptime_kuma(dry_run=False, only=None):
                    api.add_status_page(slug, title)

                # Each monitors.yml group becomes one display section on the status page.
-                # The GROUP monitor is added so Uptime Kuma renders it with all its children.
+                # Use group_map (populated during Section 1) to avoid re-fetching monitors;
+                # a fresh get_monitors() call after add_monitor() races with WebSocket delivery.
                public_group_list = []
                for group_raw_name in sp_groups:
-                    group_formatted = f"{project} [{env_name}] {group_raw_name}"
-                    group_monitor = all_monitors.get(group_formatted)
-                    if not group_monitor:
-                        logger.warning(f"Group '{group_formatted}' not found, skipping in status page")
+                    group_id = group_map.get(group_raw_name)
+                    if not group_id:
+                        logger.warning(f"Group '{group_raw_name}' not in group_map, skipping in status page")
                        continue
                    public_group_list.append({
                        "name": group_raw_name,
                        "weight": len(public_group_list) + 1,
-                        "monitorList": [{"id": group_monitor['id']}]
+                        "monitorList": [{"id": group_id}]
                    })

                if public_group_list: