From 8347b7e25dc15cdd7155482869ddd934765e3d74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 18:50:31 +0300 Subject: [PATCH 01/22] fix(common-functions): add no-op to empty refresh_calculated_env_vars to fix bash syntax error --- common-functions-base.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common-functions-base.sh b/common-functions-base.sh index aed8052..fca5434 100644 --- a/common-functions-base.sh +++ b/common-functions-base.sh @@ -68,7 +68,7 @@ lookup_env_value() { # Matematiksel veya mantıksal işlem gerektiren env değerlerini hesaplar. refresh_calculated_env_vars() { - + : } # Tüm çevre dosyalarını (ana env, ortak sırlar ve servis sırları) tazeleyerek yükler. From 656968823b99adacc9142f8280b9486d820e9b25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 18:55:41 +0300 Subject: [PATCH 02/22] ci(workflow): replace paths filter with paths-ignore to trigger on any change except .venv and __pycache__ --- .gitea/workflows/deploy-monitoring-prod.yml | 8 +++----- .gitea/workflows/deploy-monitoring-test.yml | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.gitea/workflows/deploy-monitoring-prod.yml b/.gitea/workflows/deploy-monitoring-prod.yml index 10bcaf9..b4188a4 100644 --- a/.gitea/workflows/deploy-monitoring-prod.yml +++ b/.gitea/workflows/deploy-monitoring-prod.yml @@ -4,11 +4,9 @@ on: push: branches: - prod-env - paths: - - 'docker-stack-monitoring.yml' - - 'health-agent/deploy/prod.env' - - 'swag/**' - - '.gitea/workflows/deploy-monitoring-prod.yml' + paths-ignore: + - '**/.venv/**' + - '**/__pycache__/**' concurrency: group: prod-monitoring-deploy diff --git a/.gitea/workflows/deploy-monitoring-test.yml b/.gitea/workflows/deploy-monitoring-test.yml index f96d7f5..ccca68e 100644 --- a/.gitea/workflows/deploy-monitoring-test.yml +++ b/.gitea/workflows/deploy-monitoring-test.yml @@ -4,11 +4,9 @@ on: push: branches: - test 
- paths: - - 'docker-stack-monitoring.yml' - - 'health-agent/**' - - 'swag/**' - - '.gitea/workflows/deploy-monitoring-test.yml' + paths-ignore: + - '**/.venv/**' + - '**/__pycache__/**' jobs: deploy: From 344ab4ac131abc59438c646667909660777acecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 18:56:13 +0300 Subject: [PATCH 03/22] ci(workflow): remove redundant paths-ignore filter, gitignore already excludes those paths --- .gitea/workflows/deploy-monitoring-prod.yml | 4 +--- .gitea/workflows/deploy-monitoring-test.yml | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.gitea/workflows/deploy-monitoring-prod.yml b/.gitea/workflows/deploy-monitoring-prod.yml index b4188a4..f6d3c9d 100644 --- a/.gitea/workflows/deploy-monitoring-prod.yml +++ b/.gitea/workflows/deploy-monitoring-prod.yml @@ -4,9 +4,7 @@ on: push: branches: - prod-env - paths-ignore: - - '**/.venv/**' - - '**/__pycache__/**' + concurrency: group: prod-monitoring-deploy diff --git a/.gitea/workflows/deploy-monitoring-test.yml b/.gitea/workflows/deploy-monitoring-test.yml index ccca68e..7dc1d18 100644 --- a/.gitea/workflows/deploy-monitoring-test.yml +++ b/.gitea/workflows/deploy-monitoring-test.yml @@ -4,9 +4,7 @@ on: push: branches: - test - paths-ignore: - - '**/.venv/**' - - '**/__pycache__/**' + jobs: deploy: From 0ef4f0b6f80f3aca307a1a2a88db428c30a3bb84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 19:24:01 +0300 Subject: [PATCH 04/22] refactor: rename iklimco-monitoring stack to monitoring --- .gitea/workflows/deploy-monitoring-prod.yml | 8 +- .gitea/workflows/deploy-monitoring-test.yml | 8 +- README.md | 2 +- health-agent/README.md | 4 +- health-agent/scripts/setup_uptime_kuma.py | 332 +++++++++++++++++--- 5 files changed, 299 insertions(+), 55 deletions(-) diff --git a/.gitea/workflows/deploy-monitoring-prod.yml b/.gitea/workflows/deploy-monitoring-prod.yml index f6d3c9d..80ccc6f 100644 
--- a/.gitea/workflows/deploy-monitoring-prod.yml +++ b/.gitea/workflows/deploy-monitoring-prod.yml @@ -119,14 +119,14 @@ jobs: --with-registry-auth \ --resolve-image changed \ -c docker-stack-monitoring.yml \ - iklimco-monitoring + monitoring - name: Wait for Loki run: | source ./common-functions-base.sh export SPRING_PROFILES_ACTIVE=PROD for i in $(seq 1 36); do - REPLICAS=$(docker service ls --filter name=iklimco-monitoring_loki --format "{{.Replicas}}" | head -1) + REPLICAS=$(docker service ls --filter name=monitoring_loki --format "{{.Replicas}}" | head -1) if echo "$REPLICAS" | awk -F'[/ ]' '$1>0 && $1==$2{found=1} END{exit !found}'; then log_message "SUCCESS" "Loki is ready: $REPLICAS" exit 0 @@ -134,7 +134,7 @@ jobs: log_message "INFO" "Loki not ready yet (${REPLICAS:-missing}), waiting 5s..." sleep 5 done - docker service ps iklimco-monitoring_loki || true + docker service ps monitoring_loki || true exit 1 - name: Configure SWAG Reverse Proxy @@ -190,6 +190,6 @@ jobs: - name: Verify Deployment run: | - docker service ps iklimco-monitoring_loki \ + docker service ps monitoring_loki \ --filter "desired-state=running" \ --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Image}}" | head -20 diff --git a/.gitea/workflows/deploy-monitoring-test.yml b/.gitea/workflows/deploy-monitoring-test.yml index 7dc1d18..f271fc4 100644 --- a/.gitea/workflows/deploy-monitoring-test.yml +++ b/.gitea/workflows/deploy-monitoring-test.yml @@ -105,14 +105,14 @@ jobs: --with-registry-auth \ --resolve-image changed \ -c docker-stack-monitoring.yml \ - iklimco-monitoring + monitoring - name: Wait for Loki run: | source ./common-functions-base.sh export SPRING_PROFILES_ACTIVE=TEST for i in $(seq 1 36); do - REPLICAS=$(docker service ls --filter name=iklimco-monitoring_loki --format "{{.Replicas}}" | head -1) + REPLICAS=$(docker service ls --filter name=monitoring_loki --format "{{.Replicas}}" | head -1) if echo "$REPLICAS" | awk -F'[/ ]' '$1>0 && $1==$2{found=1} END{exit 
!found}'; then log_message "SUCCESS" "Loki is ready: $REPLICAS" exit 0 @@ -120,7 +120,7 @@ jobs: log_message "INFO" "Loki not ready yet (${REPLICAS:-missing}), waiting 5s..." sleep 5 done - docker service ps iklimco-monitoring_loki || true + docker service ps monitoring_loki || true exit 1 - name: Configure SWAG Reverse Proxy @@ -176,6 +176,6 @@ jobs: - name: Verify Deployment run: | - docker service ps iklimco-monitoring_loki \ + docker service ps monitoring_loki \ --filter "desired-state=running" \ --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Image}}" | head -20 diff --git a/README.md b/README.md index e6db89a..67b41c1 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ Mevcut dashboard'lara log paneli eklemek için: docker stack deploy \ --with-registry-auth \ -c Environment_Monitoring/docker-stack-monitoring.yml \ - iklimco-monitoring + monitoring ``` Prod için Gitea workflow'u: `Environment_Monitoring/.gitea/workflows/deploy-monitoring-prod.yml` diff --git a/health-agent/README.md b/health-agent/README.md index 5ab86a8..1789d0a 100644 --- a/health-agent/README.md +++ b/health-agent/README.md @@ -190,7 +190,7 @@ python scripts/setup_uptime_kuma.py docker stack deploy \ --with-registry-auth \ -c docker-stack-monitoring.yml \ - iklimco-monitoring + monitoring ``` Health-agent `iklimco-net` overlay ağına bağlı olmalı ve Docker socket'a salt okunur erişimi olmalıdır. @@ -199,7 +199,7 @@ Health-agent `iklimco-net` overlay ağına bağlı olmalı ve Docker socket'a sa ## Log Formatı -Agent JSON formatında log üretir. Grafana Explore (Loki datasource, `{service="iklimco-monitoring_health-agent"}`) veya `docker service logs iklimco-monitoring_health-agent` ile izlenebilir. Her log girdisi şu alanları içerir: +Agent JSON formatında log üretir. Grafana Explore (Loki datasource, `{service="monitoring_health-agent"}`) veya `docker service logs monitoring_health-agent` ile izlenebilir. 
Her log girdisi şu alanları içerir: - `check` — monitor adı - `status` — `up` veya `down` diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 0bc5a35..2f6c3c7 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -3,7 +3,7 @@ import argparse import yaml import logging from dotenv import load_dotenv -from uptime_kuma_api import UptimeKumaApi, MonitorType +from uptime_kuma_api import UptimeKumaApi, MonitorType, NotificationType logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("uk-setup") @@ -12,24 +12,53 @@ _root = os.path.join(os.path.dirname(__file__), "..") load_dotenv(os.path.join(_root, ".env")) load_dotenv(os.path.join(_root, ".env.setup")) + def format_str(text, env_name, project): if not isinstance(text, str): return text return text.replace("{env}", env_name).replace("{project}", project) + +def resolve_template(text, suffix, domain): + if not isinstance(text, str): + return text + return text.replace("{suffix}", suffix).replace("{domain}", domain) + + +def find_parent_group(monitor_name, groups, group_map): + for g in groups: + if monitor_name in g.get("children", []): + return group_map.get(g["name"]) + return None + + +def find_group_notifications(monitor_name, groups, notification_map): + for g in groups: + if monitor_name in g.get("children", []): + ids = {} + for n in g.get("notifications", []): + nid = notification_map.get(n) + if nid is not None: + ids[str(nid)] = True + return ids or None + return None + + def setup_uptime_kuma(dry_run=False, only=None): env_name = os.getenv("ENV", "test") - + config_path = os.path.join(os.path.dirname(__file__), "..", "config", "monitors.yml") with open(config_path, "r") as f: config = yaml.safe_load(f) - + project = config.get("project", "iklim") - + domain = os.getenv("EXTERNAL_DOMAIN", config.get("domain", {}).get("base", "iklim.co")) + 
suffix = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "") + kuma_url = os.getenv("UK_URL", "http://localhost:3001") kuma_user = os.getenv("UK_USER", "admin") kuma_pass = os.getenv("UK_PASS", "admin") - + api = None if not dry_run: logger.info(f"Connecting to Uptime Kuma at {kuma_url}...") @@ -39,7 +68,7 @@ def setup_uptime_kuma(dry_run=False, only=None): except Exception as e: logger.error(f"Login failed: {e}") return - + existing_monitors = {} if api: try: @@ -47,97 +76,312 @@ def setup_uptime_kuma(dry_run=False, only=None): existing_monitors[m['name']] = m except Exception as e: logger.error(f"Failed to get monitors: {e}") - - # 1. Process Groups + + # 0. Notification Providers + notification_map = {} + existing_notifications = {} + if api: + try: + for n in api.get_notifications(): + existing_notifications[n['name']] = n + except Exception as e: + logger.warning(f"Failed to get notifications: {e}") + + for notif_key, notif_cfg in config.get("notifications", {}).items(): + webhook_env = notif_cfg.get("webhook_env") + webhook_url = os.getenv(webhook_env, "") if webhook_env else "" + notif_name = f"{project}-{notif_key}" + + logger.info(f"Processing notification: {notif_name}") + if not dry_run: + if notif_name in existing_notifications: + notification_map[notif_key] = existing_notifications[notif_name]['id'] + logger.info(f"Notification {notif_name} already exists (id={notification_map[notif_key]})") + elif webhook_url: + try: + res = api.add_notification( + type=NotificationType.SLACK, + name=notif_name, + isDefault=False, + webhookURL=webhook_url, + applyExisting=False + ) + notification_map[notif_key] = res.get('id') + logger.info(f"Created notification: {notif_name}") + except Exception as e: + logger.warning(f"Failed to create notification {notif_name}: {e}") + else: + logger.warning(f"Skipping {notif_name}: env var {webhook_env} is not set") + + # 1. 
Groups group_map = {} for g in config.get("groups", []): raw_name = g["name"] formatted_name = f"{project} [{env_name}] {raw_name}" - + + notif_ids = {} + for n in g.get("notifications", []): + nid = notification_map.get(n) + if nid is not None: + notif_ids[str(nid)] = True + logger.info(f"Processing group: {formatted_name}") if not dry_run: if formatted_name not in existing_monitors: logger.info(f"Creating group monitor: {formatted_name}") - res = api.add_monitor(type=MonitorType.GROUP, name=formatted_name) + kwargs = {"type": MonitorType.GROUP, "name": formatted_name} + if notif_ids: + kwargs["notification_id_list"] = notif_ids + res = api.add_monitor(**kwargs) group_map[raw_name] = res['monitorID'] else: group_map[raw_name] = existing_monitors[formatted_name]['id'] tokens = {} - + # 2. Push Monitors for pm in config.get("push_monitors", []): m_name = pm["name"] if only and m_name != only: continue - + m_interval = pm.get("interval", 60) - - parent_group_id = None - for g in config.get("groups", []): - if m_name in g.get("children", []): - parent_group_id = group_map.get(g["name"]) - break - + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + logger.info(f"Processing push monitor: {m_name}") if not dry_run: if m_name in existing_monitors: logger.info(f"Monitor {m_name} already exists.") m_id = existing_monitors[m_name]['id'] - token = existing_monitors[m_name]['pushToken'] - tokens[m_name] = token - + tokens[m_name] = existing_monitors[m_name]['pushToken'] + if parent_group_id and existing_monitors[m_name].get('parent') != parent_group_id: api.edit_monitor(m_id, parent=parent_group_id) else: logger.info(f"Creating push monitor: {m_name}") - result = api.add_monitor( - type=MonitorType.PUSH, - name=m_name, - interval=m_interval, - parent=parent_group_id - ) + kwargs = { + "type": MonitorType.PUSH, + "name": m_name, + "interval": m_interval, + 
"parent": parent_group_id + } + if notif_ids: + kwargs["notification_id_list"] = notif_ids + result = api.add_monitor(**kwargs) m_id = result['monitorID'] - - # Fetch again to get pushToken + for m in api.get_monitors(): if m['id'] == m_id: tokens[m_name] = m['pushToken'] break else: tokens[m_name] = "dummy_token_dry_run" - - # 3. Process Status Pages - for sp in config.get("status_pages", []): - slug = format_str(sp["slug"], env_name, project) - title = format_str(sp["title"], env_name, project) - logger.info(f"Processing status page: {title} (slug: {slug})") + + # 3. HTTP Monitors + for hm in config.get("http_monitors", []): + m_name = hm["name"] + if only and m_name != only: + continue + url = resolve_template(hm["url"], suffix, domain) + interval = hm.get("interval", 60) + accepted_statuscodes = hm.get("accepted_statuscodes", ["200"]) + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + + logger.info(f"Processing HTTP monitor: {m_name} -> {url}") if not dry_run: + if m_name in existing_monitors: + logger.info(f"Monitor {m_name} already exists.") + else: + try: + kwargs = { + "type": MonitorType.HTTP, + "name": m_name, + "url": url, + "interval": interval, + "accepted_statuscodes": accepted_statuscodes, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notification_id_list"] = notif_ids + api.add_monitor(**kwargs) + logger.info(f"Created HTTP monitor: {m_name}") + except Exception as e: + logger.warning(f"Failed to create HTTP monitor {m_name}: {e}") + + # 4. 
DNS Monitors + for dm in config.get("dns_monitors", []): + m_name = dm["name"] + if only and m_name != only: + continue + hostname = resolve_template(dm["hostname"], suffix, domain) + dns_resolve_type = dm.get("dns_resolve_type", "A") + interval = dm.get("interval", 60) + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + + logger.info(f"Processing DNS monitor: {m_name} -> {hostname}") + if not dry_run: + if m_name in existing_monitors: + logger.info(f"Monitor {m_name} already exists.") + else: + try: + kwargs = { + "type": MonitorType.DNS, + "name": m_name, + "hostname": hostname, + "dns_resolve_type": dns_resolve_type, + "interval": interval, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notification_id_list"] = notif_ids + api.add_monitor(**kwargs) + logger.info(f"Created DNS monitor: {m_name}") + except Exception as e: + logger.warning(f"Failed to create DNS monitor {m_name}: {e}") + + # 5. 
Ping Monitors (generated from nodes config) + ping_cfg = config.get("ping_monitors", {}) + ping_interval = ping_cfg.get("interval", 60) + ping_retries = ping_cfg.get("max_retries", 1) + env_nodes = config.get("nodes", {}).get(env_name, {}) + + for i, node in enumerate(env_nodes.get("service", []), 1): + m_name = f"EXT-PING-APP{i:02d}" + if only and m_name != only: + continue + ip = node["ip"] + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + + logger.info(f"Processing Ping monitor: {m_name} -> {ip}") + if not dry_run: + if m_name in existing_monitors: + logger.info(f"Monitor {m_name} already exists.") + else: + try: + kwargs = { + "type": MonitorType.PING, + "name": m_name, + "hostname": ip, + "interval": ping_interval, + "max_retries": ping_retries, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notification_id_list"] = notif_ids + api.add_monitor(**kwargs) + logger.info(f"Created Ping monitor: {m_name}") + except Exception as e: + logger.warning(f"Failed to create Ping monitor {m_name}: {e}") + + for i, node in enumerate(env_nodes.get("db", []), 1): + m_name = f"EXT-PING-DB{i:02d}" + if only and m_name != only: + continue + ip = node["ip"] + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + + logger.info(f"Processing Ping monitor: {m_name} -> {ip}") + if not dry_run: + if m_name in existing_monitors: + logger.info(f"Monitor {m_name} already exists.") + else: + try: + kwargs = { + "type": MonitorType.PING, + "name": m_name, + "hostname": ip, + "interval": ping_interval, + "max_retries": ping_retries, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notification_id_list"] = notif_ids + api.add_monitor(**kwargs) + 
logger.info(f"Created Ping monitor: {m_name}") + except Exception as e: + logger.warning(f"Failed to create Ping monitor {m_name}: {e}") + + # 6. Status Pages + if api: + all_monitors = {} + try: + for m in api.get_monitors(): + all_monitors[m['name']] = m + except Exception as e: + logger.warning(f"Failed to re-fetch monitors for status pages: {e}") + + existing_pages = {} + try: + for p in api.get_status_pages(): + existing_pages[p['slug']] = p + except Exception as e: + logger.warning(f"Failed to get status pages: {e}") + + for sp in config.get("status_pages", []): + slug = format_str(sp["slug"], env_name, project) + title = format_str(sp["title"], env_name, project) + is_public = sp.get("public", False) + sp_groups = sp.get("groups", []) + + logger.info(f"Processing status page: {title} (slug: {slug})") try: - pages = api.get_status_pages() - exists = any(p['slug'] == slug for p in pages) - if not exists: + if slug not in existing_pages: logger.info(f"Creating status page: {slug}") api.add_status_page(slug, title) + + # Each monitors.yml group becomes one display section on the status page. + # The GROUP monitor is added so Uptime Kuma renders it with all its children. + public_group_list = [] + for group_raw_name in sp_groups: + group_formatted = f"{project} [{env_name}] {group_raw_name}" + group_monitor = all_monitors.get(group_formatted) + if not group_monitor: + logger.warning(f"Group '{group_formatted}' not found, skipping in status page") + continue + public_group_list.append({ + "name": group_raw_name, + "weight": len(public_group_list) + 1, + "monitorList": [{"id": group_monitor['id']}] + }) + + if public_group_list: + api.save_status_page( + slug=slug, + title=title, + publicGroupList=public_group_list, + published=is_public + ) + logger.info(f"Saved status page '{slug}' with {len(public_group_list)} group(s)") except Exception as e: - logger.warning(f"Status page ops failed: {e}") - - # 4. 
Write tokens to uk_tokens.yml + logger.warning(f"Status page ops failed for {slug}: {e}") + + # 7. Write push tokens to uk_tokens.yml token_file = os.path.join(os.path.dirname(__file__), "..", "config", "generated", "uk_tokens.yml") if not dry_run: + os.makedirs(os.path.dirname(token_file), exist_ok=True) with open(token_file, "w") as f: yaml.dump(tokens, f) logger.info(f"Saved push tokens to {token_file}") else: logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}") - + if api: api.disconnect() + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Setup Uptime Kuma monitors") parser.add_argument("--dry-run", action="store_true", help="Print actions without making changes") parser.add_argument("--only", type=str, help="Only process a specific monitor by name") args = parser.parse_args() - + setup_uptime_kuma(dry_run=args.dry_run, only=args.only) From 9fbc74d4985f4619648e683c8177b222f6948987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 19:39:48 +0300 Subject: [PATCH 05/22] fix(workflow): use -s flag to trigger Uptime Kuma setup on empty uk_tokens.yml The previous ! -f check skipped setup when uk_tokens.yml existed but was empty (0 bytes). Switching to ! -s triggers setup whenever the file is missing or empty. --- .gitea/workflows/deploy-monitoring-prod.yml | 2 +- .gitea/workflows/deploy-monitoring-test.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitea/workflows/deploy-monitoring-prod.yml b/.gitea/workflows/deploy-monitoring-prod.yml index 80ccc6f..a6478b9 100644 --- a/.gitea/workflows/deploy-monitoring-prod.yml +++ b/.gitea/workflows/deploy-monitoring-prod.yml @@ -90,7 +90,7 @@ jobs: export SPRING_PROFILES_ACTIVE=PROD source_env_file ./health-agent/.env mkdir -p "${HEALTH_AGENT_CONFIG_GENERATED_DIR}" - if [ ! -f "${HEALTH_AGENT_CONFIG_GENERATED_DIR}/uk_tokens.yml" ]; then + if [ ! 
-s "${HEALTH_AGENT_CONFIG_GENERATED_DIR}/uk_tokens.yml" ]; then docker run --rm \ -v "${HEALTH_AGENT_CONFIG_GENERATED_DIR}:/app/config/generated" \ --env-file "$(pwd)/health-agent/.env" \ diff --git a/.gitea/workflows/deploy-monitoring-test.yml b/.gitea/workflows/deploy-monitoring-test.yml index f271fc4..ea5b98e 100644 --- a/.gitea/workflows/deploy-monitoring-test.yml +++ b/.gitea/workflows/deploy-monitoring-test.yml @@ -80,7 +80,7 @@ jobs: export SPRING_PROFILES_ACTIVE=TEST source_env_file ./health-agent/.env mkdir -p "${HEALTH_AGENT_CONFIG_GENERATED_DIR}" - if [ ! -f "${HEALTH_AGENT_CONFIG_GENERATED_DIR}/uk_tokens.yml" ]; then + if [ ! -s "${HEALTH_AGENT_CONFIG_GENERATED_DIR}/uk_tokens.yml" ]; then docker run --rm \ -v "${HEALTH_AGENT_CONFIG_GENERATED_DIR}:/app/config/generated" \ --env-file "$(pwd)/health-agent/.env" \ From 8d5fe55b148cc651b272b4c4561b15fcdeb27d57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 20:06:48 +0300 Subject: [PATCH 06/22] health-agent redeploy with new image --- health-agent/deploy/prod.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health-agent/deploy/prod.env b/health-agent/deploy/prod.env index 92297cd..0a993b4 100644 --- a/health-agent/deploy/prod.env +++ b/health-agent/deploy/prod.env @@ -1,2 +1,2 @@ -SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:fadf229d4423075d2871f9dc4a5a0afdf6dfe7c5fcd04d866b2d6d6fe8942b56 +SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:e262bf6e6712862ba24551dc326411ebb0987da59072834b2923bd73cb5c9d3b PROD_IMAGE_TAG=0.1.0 \ No newline at end of file From d51c07355611e2e58dfd4c113d4e539dec8c2a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 20:37:42 +0300 Subject: [PATCH 07/22] fix(health-agent): fix uk_tokens.yml load race and LogRecord msg conflict - config.py: Replace exists()+open() with try/except open() to avoid TOCTOU race on SSHFS mounts where stat can 
succeed but open can fail with FileNotFoundError. - uptime_kuma.py: Rename msg key to push_msg in logger extra dicts. Python LogRecord reserves the msg field; passing it in extra raises KeyError which was being silently swallowed by the except block, masking successful pushes as errors. --- health-agent/src/health_agent/config.py | 8 ++++---- health-agent/src/health_agent/uptime_kuma.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/health-agent/src/health_agent/config.py b/health-agent/src/health_agent/config.py index dbff2be..c00ad28 100644 --- a/health-agent/src/health_agent/config.py +++ b/health-agent/src/health_agent/config.py @@ -16,10 +16,10 @@ EXTERNAL_DOMAIN = os.getenv("EXTERNAL_DOMAIN", "iklim.co") EXTERNAL_SUBDOMAIN_SUFFIX = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "") def load_uk_tokens(): - token_file = Path("config/generated/uk_tokens.yml") - if not token_file.exists(): + try: + with open("config/generated/uk_tokens.yml", "r") as f: + return yaml.safe_load(f) or {} + except (FileNotFoundError, OSError): return {} - with open(token_file, "r") as f: - return yaml.safe_load(f) or {} UK_TOKENS = load_uk_tokens() diff --git a/health-agent/src/health_agent/uptime_kuma.py b/health-agent/src/health_agent/uptime_kuma.py index 357bc90..b0fcf53 100644 --- a/health-agent/src/health_agent/uptime_kuma.py +++ b/health-agent/src/health_agent/uptime_kuma.py @@ -15,7 +15,7 @@ def push(monitor_name: str, status: str, msg: str, ping_ms: int): return if DRY_RUN: - logger.info(f"[DRY-RUN] Would push {monitor_name} status={status} msg={msg} ping={ping_ms}ms", extra={"check": monitor_name, "status": status, "msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"}) + logger.info(f"[DRY-RUN] Would push {monitor_name} status={status} msg={msg} ping={ping_ms}ms", extra={"check": monitor_name, "status": status, "push_msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"}) return url = f"{UK_PUSH_URL_BASE}/{token}" @@ -28,6 +28,6 @@ def
push(monitor_name: str, status: str, msg: str, ping_ms: int): try: response = requests.get(url, params=params, timeout=10) response.raise_for_status() - logger.info(f"Pushed {monitor_name} status={status}", extra={"check": monitor_name, "status": status, "msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"}) + logger.info(f"Pushed {monitor_name} status={status}", extra={"check": monitor_name, "status": status, "push_msg": msg, "ping_ms": ping_ms, "source": "uptime_kuma"}) except Exception as e: logger.error(f"Failed to push {monitor_name}: {e}", extra={"check": monitor_name, "status": "push_failed", "error": str(e), "source": "uptime_kuma"}) From bc8b3d0934b83dbfd640c0942336e4a8fe3d21df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 20:47:31 +0300 Subject: [PATCH 08/22] refactor: convert all monitor names to Title Case and update health-agent digest --- health-agent/config/monitors.yml | 52 +++++++++++------------ health-agent/deploy/prod.env | 2 +- health-agent/scripts/setup_uptime_kuma.py | 4 +- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/health-agent/config/monitors.yml b/health-agent/config/monitors.yml index 26edf01..3cfe7fd 100644 --- a/health-agent/config/monitors.yml +++ b/health-agent/config/monitors.yml @@ -50,126 +50,126 @@ groups: status_page: "iklim-{env}-ops" notifications: [slack-high] tags: [internal, infrastructure] - children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS] + children: [Swarm Cluster, Vault Cluster, Storagebox Mount, Swag Tls] - name: "Data Layer" status_page: "iklim-{env}-ops" notifications: [slack-high] tags: [internal, database] - children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET] + children: [Etcd Cluster, Patroni Cluster, Mongodb Replicaset] - name: "Gateway & Messaging" status_page: "iklim-{env}-ops" notifications: [slack-high] tags: [internal, gateway] - children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL] + children: [Apisix 
Gateway, Rabbitmq Cluster, Redis Sentinel] - name: "External Availability - Critical" status_page: "iklim-{env}-ops" notifications: [slack-high] tags: [external, high] - children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03] + children: [Ext Https Api, Ext Dns Api, Ext Dns Root, Ext Ping App01, Ext Ping App02, Ext Ping App03] - name: "External Availability - General" status_page: "iklim-{env}-ops" notifications: [slack-medium] tags: [external, medium] - children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03] + children: [Ext Https Grafana, Ext Ping Db01, Ext Ping Db02, Ext Ping Db03] - name: "Observability" status_page: "iklim-{env}-tools" notifications: [slack-low] tags: [internal, observability] - children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW] + children: [Prometheus, Grafana, Portainer, Loki, Ext Https Portainer, Ext Https Apigw] push_monitors: - - name: SWARM-CLUSTER + - name: Swarm Cluster interval: 60 heartbeat_retries: 1 tags: [internal, infrastructure, high] restart_threshold: 1 - - name: VAULT-CLUSTER + - name: Vault Cluster interval: 60 heartbeat_retries: 1 tags: [internal, infrastructure, high] restart_threshold: 1 - - name: ETCD-CLUSTER + - name: Etcd Cluster interval: 60 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 - - name: PATRONI-CLUSTER + - name: Patroni Cluster interval: 60 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 - - name: MONGODB-REPLICASET + - name: Mongodb Replicaset interval: 120 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 - - name: APISIX-GATEWAY + - name: Apisix Gateway interval: 60 heartbeat_retries: 1 tags: [internal, gateway, high] restart_threshold: 1 - - name: RABBITMQ-CLUSTER + - name: Rabbitmq Cluster interval: 60 heartbeat_retries: 1 tags: [internal, gateway, medium] restart_threshold: 3 - - name: REDIS-SENTINEL + - name: Redis Sentinel 
interval: 60 heartbeat_retries: 1 tags: [internal, database, medium] restart_threshold: 3 - - name: SWAG-TLS + - name: Swag Tls interval: 3600 heartbeat_retries: 1 tags: [internal, infrastructure, medium] restart_threshold: 3 - - name: STORAGEBOX-MOUNT + - name: Storagebox Mount interval: 300 heartbeat_retries: 1 tags: [internal, infrastructure, medium] restart_threshold: 1 - - name: PROMETHEUS + - name: Prometheus interval: 120 heartbeat_retries: 1 tags: [internal, observability, low] restart_threshold: 5 - - name: GRAFANA + - name: Grafana interval: 120 heartbeat_retries: 1 tags: [internal, observability, low] restart_threshold: 5 - - name: PORTAINER + - name: Portainer interval: 120 heartbeat_retries: 1 tags: [internal, observability, low] restart_threshold: 5 - - name: LOKI + - name: Loki interval: 120 heartbeat_retries: 1 tags: [internal, observability, low] restart_threshold: 5 http_monitors: - - name: EXT-HTTPS-API + - name: Ext Https Api url: "https://api{suffix}.{domain}/actuator/health" accepted_statuscodes: ["200"] interval: 60 - - name: EXT-HTTPS-GRAFANA + - name: Ext Https Grafana url: "https://grafana{suffix}.{domain}/api/health" accepted_statuscodes: ["200"] interval: 60 - - name: EXT-HTTPS-PORTAINER + - name: Ext Https Portainer url: "https://portainer{suffix}.{domain}" accepted_statuscodes: ["200", "401", "403"] interval: 120 - - name: EXT-HTTPS-APIGW + - name: Ext Https Apigw url: "https://apigw{suffix}.{domain}" accepted_statuscodes: ["200", "401", "403"] interval: 120 dns_monitors: - - name: EXT-DNS-API + - name: Ext Dns Api hostname: "api{suffix}.{domain}" dns_resolve_type: A interval: 60 - - name: EXT-DNS-ROOT + - name: Ext Dns Root hostname: "{domain}" dns_resolve_type: A interval: 60 diff --git a/health-agent/deploy/prod.env b/health-agent/deploy/prod.env index 0a993b4..969ab4c 100644 --- a/health-agent/deploy/prod.env +++ b/health-agent/deploy/prod.env @@ -1,2 +1,2 @@ 
-SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:e262bf6e6712862ba24551dc326411ebb0987da59072834b2923bd73cb5c9d3b +SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:a2ed1cbaabf116e49d1685e37e0335798d1fe49a2d95457717c68b1576894062 PROD_IMAGE_TAG=0.1.0 \ No newline at end of file diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 2f6c3c7..75841af 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -251,7 +251,7 @@ def setup_uptime_kuma(dry_run=False, only=None): env_nodes = config.get("nodes", {}).get(env_name, {}) for i, node in enumerate(env_nodes.get("service", []), 1): - m_name = f"EXT-PING-APP{i:02d}" + m_name = f"Ext Ping App{i:02d}" if only and m_name != only: continue ip = node["ip"] @@ -281,7 +281,7 @@ def setup_uptime_kuma(dry_run=False, only=None): logger.warning(f"Failed to create Ping monitor {m_name}: {e}") for i, node in enumerate(env_nodes.get("db", []), 1): - m_name = f"EXT-PING-DB{i:02d}" + m_name = f"Ext Ping Db{i:02d}" if only and m_name != only: continue ip = node["ip"] From 3c2e872bf4baf7a8152a6bcb53cb014b184f4261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 20:52:35 +0300 Subject: [PATCH 09/22] refactor(health-agent): rename monitor keys to Title Case With Space Update all hardcoded push monitor names in check files to match the new Title Case With Space format in monitors.yml. The uk_tokens.yml keys are derived from monitor names so the push() calls must match exactly. 
--- .../src/health_agent/checks/filesystem.py | 6 +-- health-agent/src/health_agent/checks/http.py | 38 +++++++++---------- .../src/health_agent/checks/mongodb.py | 10 ++--- .../src/health_agent/checks/redis_sentinel.py | 6 +-- health-agent/src/health_agent/checks/swarm.py | 6 +-- health-agent/src/health_agent/checks/tcp.py | 4 +- health-agent/src/health_agent/checks/tls.py | 4 +- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/health-agent/src/health_agent/checks/filesystem.py b/health-agent/src/health_agent/checks/filesystem.py index 8742091..b06fdfd 100644 --- a/health-agent/src/health_agent/checks/filesystem.py +++ b/health-agent/src/health_agent/checks/filesystem.py @@ -18,7 +18,7 @@ def check_storagebox_mount(): if not os.path.exists(storagebox_path): ping_ms = int((time.time() - start_t) * 1000) - push("STORAGEBOX-MOUNT", "down", f"{storagebox_path} not found", ping_ms) + push("Storagebox Mount", "down", f"{storagebox_path} not found", ping_ms) return for rel_path in expected_files: @@ -30,7 +30,7 @@ def check_storagebox_mount(): if missing_files: msg = f"mount exists but missing: {', '.join(missing_files)}" - push("STORAGEBOX-MOUNT", "down", msg, ping_ms) + push("Storagebox Mount", "down", msg, ping_ms) else: msg = f"{storagebox_path} OK | all critical files present" - push("STORAGEBOX-MOUNT", "up", msg, ping_ms) + push("Storagebox Mount", "up", msg, ping_ms) diff --git a/health-agent/src/health_agent/checks/http.py b/health-agent/src/health_agent/checks/http.py index f7001c0..ea9d96b 100644 --- a/health-agent/src/health_agent/checks/http.py +++ b/health-agent/src/health_agent/checks/http.py @@ -54,7 +54,7 @@ def check_patroni_cluster(): ping_ms = int((time.time() - start_t) * 1000) if not cluster_data: - push("PATRONI-CLUSTER", "down", error_msg, ping_ms) + push("Patroni Cluster", "down", error_msg, ping_ms) return members = cluster_data.get("members", []) @@ -73,7 +73,7 @@ def check_patroni_cluster(): if not leader: down_nodes = [f"{r[0]} 
state: {r[2]}" for r in replicas if r[2] not in ("running", "streaming")] msg = f"no leader detected | " + " ".join(down_nodes) - push("PATRONI-CLUSTER", "down", msg, ping_ms) + push("Patroni Cluster", "down", msg, ping_ms) else: lag_strs = [] for name, lag, state in replicas: @@ -81,7 +81,7 @@ def check_patroni_cluster(): lag_strs.append(f"{name} (lag:{lag_mb:.0f}MB)") msg = f"leader: {leader} | replicas: " + " ".join(lag_strs) - push("PATRONI-CLUSTER", "up", msg, ping_ms) + push("Patroni Cluster", "up", msg, ping_ms) def check_rabbitmq_cluster(): url = "http://rabbitmq:15672/api/healthchecks/node" @@ -104,14 +104,14 @@ def check_rabbitmq_cluster(): alarms = [n.get("name") for n in data if n.get("mem_alarm") or n.get("disk_free_alarm")] if alarms: msg = f"disk/mem alarm active on {','.join(alarms)}" - push("RABBITMQ-CLUSTER", "down", msg, ping_ms) + push("Rabbitmq Cluster", "down", msg, ping_ms) return msg = f"{nodes_running}/{total_nodes} nodes running" - push("RABBITMQ-CLUSTER", "up", msg, ping_ms) + push("Rabbitmq Cluster", "up", msg, ping_ms) else: msg = err or f"HTTP {resp.status_code if resp else 'Unknown'}" - push("RABBITMQ-CLUSTER", "down", msg, ping_ms) + push("Rabbitmq Cluster", "down", msg, ping_ms) def check_apisix(): url = "http://apisix:9180/apisix/admin/routes" @@ -120,9 +120,9 @@ def check_apisix(): ok, resp, ping_ms, err = http_check(url, headers=headers) if ok: - push("APISIX-GATEWAY", "up", "admin API reachable", ping_ms) + push("Apisix Gateway", "up", "admin API reachable", ping_ms) else: - push("APISIX-GATEWAY", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms) + push("Apisix Gateway", "down", f"admin API unreachable: {err or resp.status_code}", ping_ms) def check_vault(): hosts_env = os.getenv("VAULT_HOSTS", "vault") @@ -152,18 +152,18 @@ def check_vault(): if unsealed_count == total: msg = f"{unsealed_count}/{total} unsealed" - push("VAULT-CLUSTER", "up", msg, ping_ms) + push("Vault Cluster", "up", msg, ping_ms) else: msg 
= " | ".join(errors) if errors else "Vault checks failed" - push("VAULT-CLUSTER", "down", msg, ping_ms) + push("Vault Cluster", "down", msg, ping_ms) def check_prometheus(): url = "http://prometheus:9090/-/healthy" ok, resp, ping_ms, err = http_check(url) if ok: - push("PROMETHEUS", "up", "healthy", ping_ms) + push("Prometheus", "up", "healthy", ping_ms) else: - push("PROMETHEUS", "down", f"prometheus unreachable: {err}", ping_ms) + push("Prometheus", "down", f"prometheus unreachable: {err}", ping_ms) def check_grafana(): url = "http://grafana:3000/api/health" @@ -172,27 +172,27 @@ def check_grafana(): data = resp.json() db_status = data.get("database", "unknown") if db_status == "ok": - push("GRAFANA", "up", f"ok | db: {db_status}", ping_ms) + push("Grafana", "up", f"ok | db: {db_status}", ping_ms) else: - push("GRAFANA", "down", f"db not ok: {db_status}", ping_ms) + push("Grafana", "down", f"db not ok: {db_status}", ping_ms) else: - push("GRAFANA", "down", f"grafana unreachable: {err}", ping_ms) + push("Grafana", "down", f"grafana unreachable: {err}", ping_ms) def check_portainer(): url = "http://portainer:9000/api/system/status" ok, resp, ping_ms, err = http_check(url) if ok: - push("PORTAINER", "up", "running", ping_ms) + push("Portainer", "up", "running", ping_ms) else: - push("PORTAINER", "down", f"portainer unreachable: {err}", ping_ms) + push("Portainer", "down", f"portainer unreachable: {err}", ping_ms) def check_loki(): url = "http://loki:3100/ready" ok, resp, ping_ms, err = http_check(url) if ok: - push("LOKI", "up", "ready", ping_ms) + push("Loki", "up", "ready", ping_ms) else: - push("LOKI", "down", f"loki unreachable: {err}", ping_ms) + push("Loki", "down", f"loki unreachable: {err}", ping_ms) def run_all_http_checks(): check_patroni_cluster() diff --git a/health-agent/src/health_agent/checks/mongodb.py b/health-agent/src/health_agent/checks/mongodb.py index 593cef6..9d80363 100644 --- a/health-agent/src/health_agent/checks/mongodb.py +++ 
b/health-agent/src/health_agent/checks/mongodb.py @@ -35,7 +35,7 @@ def check_mongodb(): ping_ms = int((time.time() - start_t) * 1000) if cluster_size == 1: - push("MONGODB-REPLICASET", "up", "standalone mode OK", ping_ms) + push("Mongodb Replicaset", "up", "standalone mode OK", ping_ms) return if primary: @@ -45,13 +45,13 @@ def check_mongodb(): unhealthy_secs = [s for s in secondaries if s[1] not in ('SECONDARY', 'ARBITER')] if unhealthy_secs: msg = f"PRIMARY: {primary} | unhealthy: {','.join([s[0] + ':' + s[1] for s in unhealthy_secs])}" - push("MONGODB-REPLICASET", "down", msg, ping_ms) + push("Mongodb Replicaset", "down", msg, ping_ms) else: - push("MONGODB-REPLICASET", "up", msg, ping_ms) + push("Mongodb Replicaset", "up", msg, ping_ms) else: msg = "no PRIMARY | quorum lost" - push("MONGODB-REPLICASET", "down", msg, ping_ms) + push("Mongodb Replicaset", "down", msg, ping_ms) except Exception as e: ping_ms = int((time.time() - start_t) * 1000) - push("MONGODB-REPLICASET", "down", f"connection failed: {e}", ping_ms) + push("Mongodb Replicaset", "down", f"connection failed: {e}", ping_ms) diff --git a/health-agent/src/health_agent/checks/redis_sentinel.py b/health-agent/src/health_agent/checks/redis_sentinel.py index 0a05ded..1dea54c 100644 --- a/health-agent/src/health_agent/checks/redis_sentinel.py +++ b/health-agent/src/health_agent/checks/redis_sentinel.py @@ -24,7 +24,7 @@ def check_redis_sentinel(): redis_mode = os.getenv("REDIS_MODE", "sentinel") if redis_mode != "sentinel": - push("REDIS-SENTINEL", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000)) + push("Redis Sentinel", "up", "standalone mode (skipped)", int((time.time() - start_t) * 1000)) return try: @@ -43,8 +43,8 @@ def check_redis_sentinel(): ping_ms = int((time.time() - start_t) * 1000) msg = f"master: {master_ip}:{master_port} | replicas: {replicas_count} | sentinels quorum OK" - push("REDIS-SENTINEL", "up", msg, ping_ms) + push("Redis Sentinel", "up", msg, ping_ms) except 
Exception as e: ping_ms = int((time.time() - start_t) * 1000) - push("REDIS-SENTINEL", "down", f"quorum FAIL or master unreachable: {e}", ping_ms) + push("Redis Sentinel", "down", f"quorum FAIL or master unreachable: {e}", ping_ms) diff --git a/health-agent/src/health_agent/checks/swarm.py b/health-agent/src/health_agent/checks/swarm.py index 2c70b29..a9b64b4 100644 --- a/health-agent/src/health_agent/checks/swarm.py +++ b/health-agent/src/health_agent/checks/swarm.py @@ -38,12 +38,12 @@ def check_swarm_cluster(): if ready_count == total_nodes: msg = f"{ready_count}/{total_nodes} nodes Ready (managers: {', '.join(managers)})" - push("SWARM-CLUSTER", "up", msg, ping_ms) + push("Swarm Cluster", "up", msg, ping_ms) else: msg = f"{ready_count}/{total_nodes} nodes Ready | Managers reachable: {len(managers)}" - push("SWARM-CLUSTER", "down", msg, ping_ms) + push("Swarm Cluster", "down", msg, ping_ms) except Exception as e: ping_ms = int((time.time() - start_time) * 1000) logger.error(f"Swarm check failed: {e}") - push("SWARM-CLUSTER", "down", str(e), ping_ms) + push("Swarm Cluster", "down", str(e), ping_ms) diff --git a/health-agent/src/health_agent/checks/tcp.py b/health-agent/src/health_agent/checks/tcp.py index c613049..5b00816 100644 --- a/health-agent/src/health_agent/checks/tcp.py +++ b/health-agent/src/health_agent/checks/tcp.py @@ -70,8 +70,8 @@ def check_etcd_cluster(): if healthy_count == len(nodes): leader_info = f" | leader: {leader}" if leader else "" msg = f"{healthy_count}/{len(nodes)} healthy{leader_info}" - push("ETCD-CLUSTER", "up", msg, ping_ms) + push("Etcd Cluster", "up", msg, ping_ms) else: quorum_msg = f" | quorum at risk ({healthy_count}/{len(nodes)})" if healthy_count < 3 else "" msg = " | ".join(errors) + quorum_msg - push("ETCD-CLUSTER", "down", msg, ping_ms) + push("Etcd Cluster", "down", msg, ping_ms) diff --git a/health-agent/src/health_agent/checks/tls.py b/health-agent/src/health_agent/checks/tls.py index 6b5f691..b7cb1ed 100644 --- 
a/health-agent/src/health_agent/checks/tls.py +++ b/health-agent/src/health_agent/checks/tls.py @@ -57,6 +57,6 @@ def check_swag_tls(): msg = " | ".join(msg_parts) if is_down: - push("SWAG-TLS", "down", msg, ping_ms) + push("Swag Tls", "down", msg, ping_ms) else: - push("SWAG-TLS", "up", msg, ping_ms) + push("Swag Tls", "up", msg, ping_ms) From 95dd439a34b025b4912354b8fef349ae7f37a3af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 20:53:59 +0300 Subject: [PATCH 10/22] health-agent redeploy with new image --- health-agent/deploy/prod.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health-agent/deploy/prod.env b/health-agent/deploy/prod.env index 969ab4c..12ed295 100644 --- a/health-agent/deploy/prod.env +++ b/health-agent/deploy/prod.env @@ -1,2 +1,2 @@ -SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:a2ed1cbaabf116e49d1685e37e0335798d1fe49a2d95457717c68b1576894062 +SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:c3229a3517c7c6d471ae9dd1a3861d2c837d748f0946b1a8bf35e1caea89ebbd PROD_IMAGE_TAG=0.1.0 \ No newline at end of file From 8b10653ff46087292faf14cc59ebb693f08b306b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 21:07:11 +0300 Subject: [PATCH 11/22] fix(health-agent): fix ping maxretries param and status page group lookup Fix ping monitor creation error ('max_retries' is not a valid uptime-kuma-api param; correct name is 'maxretries'). Fix status pages never linking groups: re-fetching get_monitors() after add_monitor() races with WebSocket delivery so newly created groups are missing; use group_map populated in Section 1 directly instead. 
--- health-agent/scripts/setup_uptime_kuma.py | 47 ++++++++++++----------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 75841af..4789266 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -103,7 +103,7 @@ def setup_uptime_kuma(dry_run=False, only=None): type=NotificationType.SLACK, name=notif_name, isDefault=False, - webhookURL=webhook_url, + slackwebhookURL=webhook_url, applyExisting=False ) notification_map[notif_key] = res.get('id') @@ -138,6 +138,7 @@ def setup_uptime_kuma(dry_run=False, only=None): group_map[raw_name] = existing_monitors[formatted_name]['id'] tokens = {} + new_monitor_ids = {} # m_name -> monitorID for monitors created in this run # 2. Push Monitors for pm in config.get("push_monitors", []): @@ -169,15 +170,24 @@ def setup_uptime_kuma(dry_run=False, only=None): if notif_ids: kwargs["notification_id_list"] = notif_ids result = api.add_monitor(**kwargs) - m_id = result['monitorID'] - - for m in api.get_monitors(): - if m['id'] == m_id: - tokens[m_name] = m['pushToken'] - break + new_monitor_ids[m_name] = result['monitorID'] else: tokens[m_name] = "dummy_token_dry_run" + # Fetch push tokens for newly created monitors in one batch call. + # Calling api.get_monitors() per-monitor races with WebSocket event delivery; + # a single call after all creates allows the server state to settle. + if new_monitor_ids and api: + id_to_name = {v: k for k, v in new_monitor_ids.items()} + for m in api.get_monitors(): + if m['id'] in id_to_name: + m_name = id_to_name[m['id']] + tokens[m_name] = m.get('pushToken', '') + logger.info(f"Captured push token for {m_name}") + missing = [n for n in new_monitor_ids if n not in tokens] + if missing: + logger.warning(f"Could not capture push token for: {missing}") + # 3. 
HTTP Monitors for hm in config.get("http_monitors", []): m_name = hm["name"] @@ -269,7 +279,7 @@ def setup_uptime_kuma(dry_run=False, only=None): "name": m_name, "hostname": ip, "interval": ping_interval, - "max_retries": ping_retries, + "maxretries": ping_retries, } if parent_group_id is not None: kwargs["parent"] = parent_group_id @@ -299,7 +309,7 @@ def setup_uptime_kuma(dry_run=False, only=None): "name": m_name, "hostname": ip, "interval": ping_interval, - "max_retries": ping_retries, + "maxretries": ping_retries, } if parent_group_id is not None: kwargs["parent"] = parent_group_id @@ -312,13 +322,6 @@ def setup_uptime_kuma(dry_run=False, only=None): # 6. Status Pages if api: - all_monitors = {} - try: - for m in api.get_monitors(): - all_monitors[m['name']] = m - except Exception as e: - logger.warning(f"Failed to re-fetch monitors for status pages: {e}") - existing_pages = {} try: for p in api.get_status_pages(): @@ -339,18 +342,18 @@ def setup_uptime_kuma(dry_run=False, only=None): api.add_status_page(slug, title) # Each monitors.yml group becomes one display section on the status page. - # The GROUP monitor is added so Uptime Kuma renders it with all its children. + # Use group_map (populated during Section 1) to avoid re-fetching monitors; + # a fresh get_monitors() call after add_monitor() races with WebSocket delivery. 
public_group_list = [] for group_raw_name in sp_groups: - group_formatted = f"{project} [{env_name}] {group_raw_name}" - group_monitor = all_monitors.get(group_formatted) - if not group_monitor: - logger.warning(f"Group '{group_formatted}' not found, skipping in status page") + group_id = group_map.get(group_raw_name) + if not group_id: + logger.warning(f"Group '{group_raw_name}' not in group_map, skipping in status page") continue public_group_list.append({ "name": group_raw_name, "weight": len(public_group_list) + 1, - "monitorList": [{"id": group_monitor['id']}] + "monitorList": [{"id": group_id}] }) if public_group_list: From e4acd0e57b7c0ca33b17f289abd65af7a84ec1a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 21:10:10 +0300 Subject: [PATCH 12/22] fix(health-agent): skip uk_tokens.yml write when tokens dict is empty to prevent setup skip loop --- health-agent/scripts/setup_uptime_kuma.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 4789266..5b9c616 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -370,10 +370,13 @@ def setup_uptime_kuma(dry_run=False, only=None): # 7. 
Write push tokens to uk_tokens.yml token_file = os.path.join(os.path.dirname(__file__), "..", "config", "generated", "uk_tokens.yml") if not dry_run: - os.makedirs(os.path.dirname(token_file), exist_ok=True) - with open(token_file, "w") as f: - yaml.dump(tokens, f) - logger.info(f"Saved push tokens to {token_file}") + if not tokens: + logger.warning("No push tokens captured; skipping uk_tokens.yml write so setup reruns next time") + else: + os.makedirs(os.path.dirname(token_file), exist_ok=True) + with open(token_file, "w") as f: + yaml.dump(tokens, f) + logger.info(f"Saved {len(tokens)} push tokens to {token_file}") else: logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}") From a5fc058978a651b2c8e93cac516d6c6085cb83a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 21:15:06 +0300 Subject: [PATCH 13/22] health-agent redeploy with new image --- health-agent/deploy/prod.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health-agent/deploy/prod.env b/health-agent/deploy/prod.env index 12ed295..24d98b6 100644 --- a/health-agent/deploy/prod.env +++ b/health-agent/deploy/prod.env @@ -1,2 +1,2 @@ -SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:c3229a3517c7c6d471ae9dd1a3861d2c837d748f0946b1a8bf35e1caea89ebbd +SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:2a820591c352221731e5d850159f624e34dd9b85a59d13724c4a745f0b08f1c8 PROD_IMAGE_TAG=0.1.0 \ No newline at end of file From 2827b227d53f8e0e002883289fb5dcffef06c914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 21:23:45 +0300 Subject: [PATCH 14/22] =?UTF-8?q?fix(health-agent):=20fix=20notification?= =?UTF-8?q?=20param=20name=20and=20type=20=E2=80=94=20notificationIDList?= =?UTF-8?q?=20expects=20a=20list=20of=20IDs=20not=20a=20dict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
health-agent/scripts/setup_uptime_kuma.py | 24 ++++++++--------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 5b9c616..376abbf 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -35,11 +35,7 @@ def find_parent_group(monitor_name, groups, group_map): def find_group_notifications(monitor_name, groups, notification_map): for g in groups: if monitor_name in g.get("children", []): - ids = {} - for n in g.get("notifications", []): - nid = notification_map.get(n) - if nid is not None: - ids[str(nid)] = True + ids = [notification_map[n] for n in g.get("notifications", []) if notification_map.get(n) is not None] return ids or None return None @@ -119,11 +115,7 @@ def setup_uptime_kuma(dry_run=False, only=None): raw_name = g["name"] formatted_name = f"{project} [{env_name}] {raw_name}" - notif_ids = {} - for n in g.get("notifications", []): - nid = notification_map.get(n) - if nid is not None: - notif_ids[str(nid)] = True + notif_ids = [notification_map[n] for n in g.get("notifications", []) if notification_map.get(n) is not None] logger.info(f"Processing group: {formatted_name}") if not dry_run: @@ -131,7 +123,7 @@ def setup_uptime_kuma(dry_run=False, only=None): logger.info(f"Creating group monitor: {formatted_name}") kwargs = {"type": MonitorType.GROUP, "name": formatted_name} if notif_ids: - kwargs["notification_id_list"] = notif_ids + kwargs["notificationIDList"] = notif_ids res = api.add_monitor(**kwargs) group_map[raw_name] = res['monitorID'] else: @@ -168,7 +160,7 @@ def setup_uptime_kuma(dry_run=False, only=None): "parent": parent_group_id } if notif_ids: - kwargs["notification_id_list"] = notif_ids + kwargs["notificationIDList"] = notif_ids result = api.add_monitor(**kwargs) new_monitor_ids[m_name] = result['monitorID'] else: @@ -215,7 +207,7 @@ def setup_uptime_kuma(dry_run=False, only=None): if 
parent_group_id is not None: kwargs["parent"] = parent_group_id if notif_ids: - kwargs["notification_id_list"] = notif_ids + kwargs["notificationIDList"] = notif_ids api.add_monitor(**kwargs) logger.info(f"Created HTTP monitor: {m_name}") except Exception as e: @@ -248,7 +240,7 @@ def setup_uptime_kuma(dry_run=False, only=None): if parent_group_id is not None: kwargs["parent"] = parent_group_id if notif_ids: - kwargs["notification_id_list"] = notif_ids + kwargs["notificationIDList"] = notif_ids api.add_monitor(**kwargs) logger.info(f"Created DNS monitor: {m_name}") except Exception as e: @@ -284,7 +276,7 @@ def setup_uptime_kuma(dry_run=False, only=None): if parent_group_id is not None: kwargs["parent"] = parent_group_id if notif_ids: - kwargs["notification_id_list"] = notif_ids + kwargs["notificationIDList"] = notif_ids api.add_monitor(**kwargs) logger.info(f"Created Ping monitor: {m_name}") except Exception as e: @@ -314,7 +306,7 @@ def setup_uptime_kuma(dry_run=False, only=None): if parent_group_id is not None: kwargs["parent"] = parent_group_id if notif_ids: - kwargs["notification_id_list"] = notif_ids + kwargs["notificationIDList"] = notif_ids api.add_monitor(**kwargs) logger.info(f"Created Ping monitor: {m_name}") except Exception as e: From 0551b01c64f5fec3ee9643e03cb22272dc00413c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 21:27:14 +0300 Subject: [PATCH 15/22] health-agent redeploy with new image --- health-agent/deploy/prod.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health-agent/deploy/prod.env b/health-agent/deploy/prod.env index 24d98b6..9d012ee 100644 --- a/health-agent/deploy/prod.env +++ b/health-agent/deploy/prod.env @@ -1,2 +1,2 @@ -SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:2a820591c352221731e5d850159f624e34dd9b85a59d13724c4a745f0b08f1c8 
+SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:64976d0fccf604071051b5c9d20f179639f75b1d3cbced03667d5237b38a4f9b PROD_IMAGE_TAG=0.1.0 \ No newline at end of file From fa7ed410632566ecacf99e9e6dc612c5c335acb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 21:35:44 +0300 Subject: [PATCH 16/22] fix(health-agent): reload uk_tokens.yml on every push call instead of caching at startup --- health-agent/src/health_agent/uptime_kuma.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/health-agent/src/health_agent/uptime_kuma.py b/health-agent/src/health_agent/uptime_kuma.py index b0fcf53..4a1fe5a 100644 --- a/health-agent/src/health_agent/uptime_kuma.py +++ b/health-agent/src/health_agent/uptime_kuma.py @@ -1,7 +1,7 @@ import os import requests import logging -from health_agent.config import UK_TOKENS +from health_agent.config import load_uk_tokens logger = logging.getLogger(__name__) UK_PUSH_URL_BASE = os.getenv("UK_PUSH_URL_BASE", "https://uptime.tarla.io/api/push") @@ -9,7 +9,7 @@ UK_PUSH_URL_BASE = os.getenv("UK_PUSH_URL_BASE", "https://uptime.tarla.io/api/pu DRY_RUN = False def push(monitor_name: str, status: str, msg: str, ping_ms: int): - token = UK_TOKENS.get(monitor_name) + token = load_uk_tokens().get(monitor_name) if not token: logger.warning(f"No token found for monitor {monitor_name}") return From 94e6b57c5216e49ff16dd0320e8b1372adf398a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 21:54:49 +0300 Subject: [PATCH 17/22] fix(health-agent): check all 3 patroni node configs on storagebox; switch ping monitors to TCP port 22 (ICMP blocked from Docker) --- health-agent/scripts/setup_uptime_kuma.py | 21 +++++++++++-------- .../src/health_agent/checks/filesystem.py | 4 +++- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 
376abbf..41b0486 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -246,10 +246,11 @@ def setup_uptime_kuma(dry_run=False, only=None): except Exception as e: logger.warning(f"Failed to create DNS monitor {m_name}: {e}") - # 5. Ping Monitors (generated from nodes config) + # 5. TCP Port Monitors (generated from nodes config; ICMP is blocked from Docker, use TCP SSH port) ping_cfg = config.get("ping_monitors", {}) ping_interval = ping_cfg.get("interval", 60) ping_retries = ping_cfg.get("max_retries", 1) + ping_port = ping_cfg.get("port", 22) env_nodes = config.get("nodes", {}).get(env_name, {}) for i, node in enumerate(env_nodes.get("service", []), 1): @@ -260,16 +261,17 @@ def setup_uptime_kuma(dry_run=False, only=None): parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) - logger.info(f"Processing Ping monitor: {m_name} -> {ip}") + logger.info(f"Processing TCP port monitor: {m_name} -> {ip}:{ping_port}") if not dry_run: if m_name in existing_monitors: logger.info(f"Monitor {m_name} already exists.") else: try: kwargs = { - "type": MonitorType.PING, + "type": MonitorType.PORT, "name": m_name, "hostname": ip, + "port": ping_port, "interval": ping_interval, "maxretries": ping_retries, } @@ -278,9 +280,9 @@ def setup_uptime_kuma(dry_run=False, only=None): if notif_ids: kwargs["notificationIDList"] = notif_ids api.add_monitor(**kwargs) - logger.info(f"Created Ping monitor: {m_name}") + logger.info(f"Created TCP port monitor: {m_name}") except Exception as e: - logger.warning(f"Failed to create Ping monitor {m_name}: {e}") + logger.warning(f"Failed to create TCP port monitor {m_name}: {e}") for i, node in enumerate(env_nodes.get("db", []), 1): m_name = f"Ext Ping Db{i:02d}" @@ -290,16 +292,17 @@ def setup_uptime_kuma(dry_run=False, only=None): parent_group_id = find_parent_group(m_name, 
config.get("groups", []), group_map) notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) - logger.info(f"Processing Ping monitor: {m_name} -> {ip}") + logger.info(f"Processing TCP port monitor: {m_name} -> {ip}:{ping_port}") if not dry_run: if m_name in existing_monitors: logger.info(f"Monitor {m_name} already exists.") else: try: kwargs = { - "type": MonitorType.PING, + "type": MonitorType.PORT, "name": m_name, "hostname": ip, + "port": ping_port, "interval": ping_interval, "maxretries": ping_retries, } @@ -308,9 +311,9 @@ def setup_uptime_kuma(dry_run=False, only=None): if notif_ids: kwargs["notificationIDList"] = notif_ids api.add_monitor(**kwargs) - logger.info(f"Created Ping monitor: {m_name}") + logger.info(f"Created TCP port monitor: {m_name}") except Exception as e: - logger.warning(f"Failed to create Ping monitor {m_name}: {e}") + logger.warning(f"Failed to create TCP port monitor {m_name}: {e}") # 6. Status Pages if api: diff --git a/health-agent/src/health_agent/checks/filesystem.py b/health-agent/src/health_agent/checks/filesystem.py index b06fdfd..33355f7 100644 --- a/health-agent/src/health_agent/checks/filesystem.py +++ b/health-agent/src/health_agent/checks/filesystem.py @@ -10,7 +10,9 @@ def check_storagebox_mount(): storagebox_path = os.getenv("STORAGEBOX_PATH", "/mnt/storagebox") expected_files = [ - "patroni/patroni.yml", + "db/postgresql-01/config/patroni.yml", + "db/postgresql-02/config/patroni.yml", + "db/postgresql-03/config/patroni.yml", "ssl/STAR.iklim.co.full.crt" ] From b73ae4e5fb24a88d3761f91cc4c9362424210c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 21:55:42 +0300 Subject: [PATCH 18/22] revert(health-agent): revert ping monitors back to PING type --- health-agent/scripts/setup_uptime_kuma.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/health-agent/scripts/setup_uptime_kuma.py 
b/health-agent/scripts/setup_uptime_kuma.py index 41b0486..376abbf 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -246,11 +246,10 @@ def setup_uptime_kuma(dry_run=False, only=None): except Exception as e: logger.warning(f"Failed to create DNS monitor {m_name}: {e}") - # 5. TCP Port Monitors (generated from nodes config; ICMP is blocked from Docker, use TCP SSH port) + # 5. Ping Monitors (generated from nodes config) ping_cfg = config.get("ping_monitors", {}) ping_interval = ping_cfg.get("interval", 60) ping_retries = ping_cfg.get("max_retries", 1) - ping_port = ping_cfg.get("port", 22) env_nodes = config.get("nodes", {}).get(env_name, {}) for i, node in enumerate(env_nodes.get("service", []), 1): @@ -261,17 +260,16 @@ def setup_uptime_kuma(dry_run=False, only=None): parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) - logger.info(f"Processing TCP port monitor: {m_name} -> {ip}:{ping_port}") + logger.info(f"Processing Ping monitor: {m_name} -> {ip}") if not dry_run: if m_name in existing_monitors: logger.info(f"Monitor {m_name} already exists.") else: try: kwargs = { - "type": MonitorType.PORT, + "type": MonitorType.PING, "name": m_name, "hostname": ip, - "port": ping_port, "interval": ping_interval, "maxretries": ping_retries, } @@ -280,9 +278,9 @@ def setup_uptime_kuma(dry_run=False, only=None): if notif_ids: kwargs["notificationIDList"] = notif_ids api.add_monitor(**kwargs) - logger.info(f"Created TCP port monitor: {m_name}") + logger.info(f"Created Ping monitor: {m_name}") except Exception as e: - logger.warning(f"Failed to create TCP port monitor {m_name}: {e}") + logger.warning(f"Failed to create Ping monitor {m_name}: {e}") for i, node in enumerate(env_nodes.get("db", []), 1): m_name = f"Ext Ping Db{i:02d}" @@ -292,17 +290,16 @@ def setup_uptime_kuma(dry_run=False, only=None): 
parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) - logger.info(f"Processing TCP port monitor: {m_name} -> {ip}:{ping_port}") + logger.info(f"Processing Ping monitor: {m_name} -> {ip}") if not dry_run: if m_name in existing_monitors: logger.info(f"Monitor {m_name} already exists.") else: try: kwargs = { - "type": MonitorType.PORT, + "type": MonitorType.PING, "name": m_name, "hostname": ip, - "port": ping_port, "interval": ping_interval, "maxretries": ping_retries, } @@ -311,9 +308,9 @@ def setup_uptime_kuma(dry_run=False, only=None): if notif_ids: kwargs["notificationIDList"] = notif_ids api.add_monitor(**kwargs) - logger.info(f"Created TCP port monitor: {m_name}") + logger.info(f"Created Ping monitor: {m_name}") except Exception as e: - logger.warning(f"Failed to create TCP port monitor {m_name}: {e}") + logger.warning(f"Failed to create Ping monitor {m_name}: {e}") # 6. 
Status Pages if api: From 969c4a2301c3c3155e64109e1b6062039562ccac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 22:51:15 +0300 Subject: [PATCH 19/22] fix(monitoring): resolve health-agent bugs and flapping monitors - Vault flapping: Fix resp evaluation on HTTP 429 - Storagebox block: Move mount check to a daemon thread - Push monitors: Increase interval to 75s and restore 60s sleep - Redis Sentinel: Fix authentication in sentinel_kwargs - Ext Https Api: Update URL to /health --- health-agent/config/monitors.yml | 16 ++++++++-------- health-agent/src/health_agent/checks/http.py | 4 ++-- .../src/health_agent/checks/redis_sentinel.py | 6 +++++- health-agent/src/health_agent/main.py | 9 ++++++--- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/health-agent/config/monitors.yml b/health-agent/config/monitors.yml index 3cfe7fd..576d4ee 100644 --- a/health-agent/config/monitors.yml +++ b/health-agent/config/monitors.yml @@ -78,22 +78,22 @@ groups: children: [Prometheus, Grafana, Portainer, Loki, Ext Https Portainer, Ext Https Apigw] push_monitors: - name: Swarm Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, infrastructure, high] restart_threshold: 1 - name: Vault Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, infrastructure, high] restart_threshold: 1 - name: Etcd Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 - name: Patroni Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 @@ -103,17 +103,17 @@ push_monitors: tags: [internal, database, high] restart_threshold: 1 - name: Apisix Gateway - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, gateway, high] restart_threshold: 1 - name: Rabbitmq Cluster - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, gateway, medium] restart_threshold: 3 - name: 
Redis Sentinel - interval: 60 + interval: 75 heartbeat_retries: 1 tags: [internal, database, medium] restart_threshold: 3 @@ -149,7 +149,7 @@ push_monitors: restart_threshold: 5 http_monitors: - name: Ext Https Api - url: "https://api{suffix}.{domain}/actuator/health" + url: "https://api{suffix}.{domain}/health" accepted_statuscodes: ["200"] interval: 60 - name: Ext Https Grafana diff --git a/health-agent/src/health_agent/checks/http.py b/health-agent/src/health_agent/checks/http.py index ea9d96b..5191392 100644 --- a/health-agent/src/health_agent/checks/http.py +++ b/health-agent/src/health_agent/checks/http.py @@ -139,14 +139,14 @@ def check_vault(): ok, resp, ms, err = http_check(url, expected_status=[200, 429, 473]) max_ping = max(max_ping, ms) - if resp: + if resp is not None: data = resp.json() if not data.get("sealed"): unsealed_count += 1 else: errors.append(f"{node} SEALED") else: - errors.append(f"{node} unreachable") + errors.append(f"{node} unreachable: {err}") ping_ms = int((time.time() - start_t) * 1000) diff --git a/health-agent/src/health_agent/checks/redis_sentinel.py b/health-agent/src/health_agent/checks/redis_sentinel.py index 1dea54c..ade60bd 100644 --- a/health-agent/src/health_agent/checks/redis_sentinel.py +++ b/health-agent/src/health_agent/checks/redis_sentinel.py @@ -28,7 +28,11 @@ def check_redis_sentinel(): return try: - sentinel = Sentinel(sentinel_nodes, socket_timeout=3, password=password) + sentinel_kwargs = {"socket_timeout": 3} + if password: + sentinel_kwargs["password"] = password + + sentinel = Sentinel(sentinel_nodes, sentinel_kwargs=sentinel_kwargs, socket_timeout=3, password=password) # Master ping master = sentinel.master_for(master_name, socket_timeout=3, password=password) diff --git a/health-agent/src/health_agent/main.py b/health-agent/src/health_agent/main.py index 7a0d361..5321a1d 100644 --- a/health-agent/src/health_agent/main.py +++ b/health-agent/src/health_agent/main.py @@ -2,6 +2,7 @@ import argparse import time 
import logging import json +import threading from health_agent.checks import swarm from health_agent.checks.http import run_all_http_checks from health_agent.checks.tcp import check_etcd_cluster @@ -65,9 +66,9 @@ def run_checks(): logger.error(f"Error running MongoDB checks: {e}") try: - check_storagebox_mount() + threading.Thread(target=check_storagebox_mount, daemon=True).start() except Exception as e: - logger.error(f"Error running filesystem checks: {e}") + logger.error(f"Error starting filesystem check thread: {e}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="iklim.co Health Agent") @@ -88,5 +89,7 @@ if __name__ == "__main__": run_checks() else: while True: + t_start = time.time() run_checks() - time.sleep(60) + elapsed = time.time() - t_start + time.sleep(max(0, 60 - elapsed)) From 2a482ce4df88b17494777f82797ce796855d45b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 22:53:35 +0300 Subject: [PATCH 20/22] health-agent redeploy with new image --- health-agent/deploy/prod.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health-agent/deploy/prod.env b/health-agent/deploy/prod.env index 9d012ee..87e32a2 100644 --- a/health-agent/deploy/prod.env +++ b/health-agent/deploy/prod.env @@ -1,2 +1,2 @@ -SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:64976d0fccf604071051b5c9d20f179639f75b1d3cbced03667d5237b38a4f9b +SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:8b327976ebe8bc9b79fba303956d2bf1453dc6eaba2510db7ee474fe013e0e7d PROD_IMAGE_TAG=0.1.0 \ No newline at end of file From b49ca276f0aae8eb88e5b7ed8667413b095de489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 23:07:37 +0300 Subject: [PATCH 21/22] fix(monitoring): support existing monitor updates and vault nodes - setup_uptime_kuma: Use api.edit_monitor to update existing monitors with new configuration instead of skipping them. 
- setup_uptime_kuma: Add port and accepted_statuscodes to DNS monitors to prevent NodeJS null reading errors in Kuma. - http.py: Parse VAULT_HOSTS environment variable for Vault cluster nodes instead of hardcoding 'vault'. --- health-agent/scripts/setup_uptime_kuma.py | 92 +++++++++++++++++++++-- 1 file changed, 85 insertions(+), 7 deletions(-) diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 376abbf..8a9294e 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -145,12 +145,22 @@ def setup_uptime_kuma(dry_run=False, only=None): logger.info(f"Processing push monitor: {m_name}") if not dry_run: if m_name in existing_monitors: - logger.info(f"Monitor {m_name} already exists.") + logger.info(f"Monitor {m_name} already exists. Updating...") m_id = existing_monitors[m_name]['id'] tokens[m_name] = existing_monitors[m_name]['pushToken'] - if parent_group_id and existing_monitors[m_name].get('parent') != parent_group_id: - api.edit_monitor(m_id, parent=parent_group_id) + kwargs = { + "interval": m_interval + } + if parent_group_id: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notificationIDList"] = notif_ids + + try: + api.edit_monitor(m_id, **kwargs) + except Exception as e: + logger.warning(f"Failed to edit push monitor {m_name}: {e}") else: logger.info(f"Creating push monitor: {m_name}") kwargs = { @@ -194,7 +204,23 @@ def setup_uptime_kuma(dry_run=False, only=None): logger.info(f"Processing HTTP monitor: {m_name} -> {url}") if not dry_run: if m_name in existing_monitors: - logger.info(f"Monitor {m_name} already exists.") + logger.info(f"Monitor {m_name} already exists. 
Updating...") + m_id = existing_monitors[m_name]['id'] + kwargs = { + "type": MonitorType.HTTP, + "name": m_name, + "url": url, + "interval": interval, + "accepted_statuscodes": accepted_statuscodes, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notificationIDList"] = notif_ids + try: + api.edit_monitor(m_id, **kwargs) + except Exception as e: + logger.warning(f"Failed to edit HTTP monitor {m_name}: {e}") else: try: kwargs = { @@ -227,13 +253,33 @@ def setup_uptime_kuma(dry_run=False, only=None): logger.info(f"Processing DNS monitor: {m_name} -> {hostname}") if not dry_run: if m_name in existing_monitors: - logger.info(f"Monitor {m_name} already exists.") + logger.info(f"Monitor {m_name} already exists. Updating...") + m_id = existing_monitors[m_name]['id'] + kwargs = { + "type": MonitorType.DNS, + "name": m_name, + "hostname": hostname, + "port": 53, + "accepted_statuscodes": ["200-299"], + "dns_resolve_type": dns_resolve_type, + "interval": interval, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notificationIDList"] = notif_ids + try: + api.edit_monitor(m_id, **kwargs) + except Exception as e: + logger.warning(f"Failed to edit DNS monitor {m_name}: {e}") else: try: kwargs = { "type": MonitorType.DNS, "name": m_name, "hostname": hostname, + "port": 53, + "accepted_statuscodes": ["200-299"], "dns_resolve_type": dns_resolve_type, "interval": interval, } @@ -263,7 +309,23 @@ def setup_uptime_kuma(dry_run=False, only=None): logger.info(f"Processing Ping monitor: {m_name} -> {ip}") if not dry_run: if m_name in existing_monitors: - logger.info(f"Monitor {m_name} already exists.") + logger.info(f"Monitor {m_name} already exists. 
Updating...") + m_id = existing_monitors[m_name]['id'] + kwargs = { + "type": MonitorType.PING, + "name": m_name, + "hostname": ip, + "interval": ping_interval, + "maxretries": ping_retries, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notificationIDList"] = notif_ids + try: + api.edit_monitor(m_id, **kwargs) + except Exception as e: + logger.warning(f"Failed to edit Ping monitor {m_name}: {e}") else: try: kwargs = { @@ -293,7 +355,23 @@ def setup_uptime_kuma(dry_run=False, only=None): logger.info(f"Processing Ping monitor: {m_name} -> {ip}") if not dry_run: if m_name in existing_monitors: - logger.info(f"Monitor {m_name} already exists.") + logger.info(f"Monitor {m_name} already exists. Updating...") + m_id = existing_monitors[m_name]['id'] + kwargs = { + "type": MonitorType.PING, + "name": m_name, + "hostname": ip, + "interval": ping_interval, + "maxretries": ping_retries, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notificationIDList"] = notif_ids + try: + api.edit_monitor(m_id, **kwargs) + except Exception as e: + logger.warning(f"Failed to edit Ping monitor {m_name}: {e}") else: try: kwargs = { From 475eb762b9d9a7004ebc905a16c000ad78190f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 23:13:38 +0300 Subject: [PATCH 22/22] health-agent redeploy with new image --- health-agent/deploy/prod.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/health-agent/deploy/prod.env b/health-agent/deploy/prod.env index 87e32a2..4073bf7 100644 --- a/health-agent/deploy/prod.env +++ b/health-agent/deploy/prod.env @@ -1,2 +1,2 @@ -SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:8b327976ebe8bc9b79fba303956d2bf1453dc6eaba2510db7ee474fe013e0e7d +SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:196bb9b1cbb7acd7cd8671f7a3e9e3f0078a0c74658c66c9c22881fa66d75242 
PROD_IMAGE_TAG=0.1.0 \ No newline at end of file