From 0ef4f0b6f80f3aca307a1a2a88db428c30a3bb84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Murat=20=C3=96ZDEM=C4=B0R?= Date: Fri, 26 Jun 2026 19:24:01 +0300 Subject: [PATCH] refactor: rename iklimco-monitoring stack to monitoring --- .gitea/workflows/deploy-monitoring-prod.yml | 8 +- .gitea/workflows/deploy-monitoring-test.yml | 8 +- README.md | 2 +- health-agent/README.md | 4 +- health-agent/scripts/setup_uptime_kuma.py | 332 +++++++++++++++++--- 5 files changed, 299 insertions(+), 55 deletions(-) diff --git a/.gitea/workflows/deploy-monitoring-prod.yml b/.gitea/workflows/deploy-monitoring-prod.yml index f6d3c9d..80ccc6f 100644 --- a/.gitea/workflows/deploy-monitoring-prod.yml +++ b/.gitea/workflows/deploy-monitoring-prod.yml @@ -119,14 +119,14 @@ jobs: --with-registry-auth \ --resolve-image changed \ -c docker-stack-monitoring.yml \ - iklimco-monitoring + monitoring - name: Wait for Loki run: | source ./common-functions-base.sh export SPRING_PROFILES_ACTIVE=PROD for i in $(seq 1 36); do - REPLICAS=$(docker service ls --filter name=iklimco-monitoring_loki --format "{{.Replicas}}" | head -1) + REPLICAS=$(docker service ls --filter name=monitoring_loki --format "{{.Replicas}}" | head -1) if echo "$REPLICAS" | awk -F'[/ ]' '$1>0 && $1==$2{found=1} END{exit !found}'; then log_message "SUCCESS" "Loki is ready: $REPLICAS" exit 0 @@ -134,7 +134,7 @@ jobs: log_message "INFO" "Loki not ready yet (${REPLICAS:-missing}), waiting 5s..." sleep 5 done - docker service ps iklimco-monitoring_loki || true + docker service ps monitoring_loki || true exit 1 - name: Configure SWAG Reverse Proxy @@ -190,6 +190,6 @@ jobs: - name: Verify Deployment run: | - docker service ps iklimco-monitoring_loki \ + docker service ps monitoring_loki \ --filter "desired-state=running" \ --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Image}}" | head -20 diff --git a/.gitea/workflows/deploy-monitoring-test.yml b/.gitea/workflows/deploy-monitoring-test.yml index 7dc1d18..f271fc4 100644 --- a/.gitea/workflows/deploy-monitoring-test.yml +++ b/.gitea/workflows/deploy-monitoring-test.yml @@ -105,14 +105,14 @@ jobs: --with-registry-auth \ --resolve-image changed \ -c docker-stack-monitoring.yml \ - iklimco-monitoring + monitoring - name: Wait for Loki run: | source ./common-functions-base.sh export SPRING_PROFILES_ACTIVE=TEST for i in $(seq 1 36); do - REPLICAS=$(docker service ls --filter name=iklimco-monitoring_loki --format "{{.Replicas}}" | head -1) + REPLICAS=$(docker service ls --filter name=monitoring_loki --format "{{.Replicas}}" | head -1) if echo "$REPLICAS" | awk -F'[/ ]' '$1>0 && $1==$2{found=1} END{exit !found}'; then log_message "SUCCESS" "Loki is ready: $REPLICAS" exit 0 @@ -120,7 +120,7 @@ jobs: log_message "INFO" "Loki not ready yet (${REPLICAS:-missing}), waiting 5s..." sleep 5 done - docker service ps iklimco-monitoring_loki || true + docker service ps monitoring_loki || true exit 1 - name: Configure SWAG Reverse Proxy @@ -176,6 +176,6 @@ jobs: - name: Verify Deployment run: | - docker service ps iklimco-monitoring_loki \ + docker service ps monitoring_loki \ --filter "desired-state=running" \ --format "table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Image}}" | head -20 diff --git a/README.md b/README.md index e6db89a..67b41c1 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ Mevcut dashboard'lara log paneli eklemek için: docker stack deploy \ --with-registry-auth \ -c Environment_Monitoring/docker-stack-monitoring.yml \ - iklimco-monitoring + monitoring ``` Prod için Gitea workflow'u: `Environment_Monitoring/.gitea/workflows/deploy-monitoring-prod.yml` diff --git a/health-agent/README.md b/health-agent/README.md index 5ab86a8..1789d0a 100644 --- a/health-agent/README.md +++ b/health-agent/README.md @@ -190,7 +190,7 @@ python scripts/setup_uptime_kuma.py docker stack deploy \ --with-registry-auth \ -c docker-stack-monitoring.yml \ - iklimco-monitoring + monitoring ``` Health-agent `iklimco-net` overlay ağına bağlı olmalı ve Docker socket'a salt okunur erişimi olmalıdır. @@ -199,7 +199,7 @@ Health-agent `iklimco-net` overlay ağına bağlı olmalı ve Docker socket'a sa ## Log Formatı -Agent JSON formatında log üretir. Grafana Explore (Loki datasource, `{service="iklimco-monitoring_health-agent"}`) veya `docker service logs iklimco-monitoring_health-agent` ile izlenebilir. Her log girdisi şu alanları içerir: +Agent JSON formatında log üretir. Grafana Explore (Loki datasource, `{service="monitoring_health-agent"}`) veya `docker service logs monitoring_health-agent` ile izlenebilir. Her log girdisi şu alanları içerir: - `check` — monitor adı - `status` — `up` veya `down` diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 0bc5a35..2f6c3c7 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -3,7 +3,7 @@ import argparse import yaml import logging from dotenv import load_dotenv -from uptime_kuma_api import UptimeKumaApi, MonitorType +from uptime_kuma_api import UptimeKumaApi, MonitorType, NotificationType logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("uk-setup") @@ -12,24 +12,53 @@ _root = os.path.join(os.path.dirname(__file__), "..") load_dotenv(os.path.join(_root, ".env")) load_dotenv(os.path.join(_root, ".env.setup")) + def format_str(text, env_name, project): if not isinstance(text, str): return text return text.replace("{env}", env_name).replace("{project}", project) + +def resolve_template(text, suffix, domain): + if not isinstance(text, str): + return text + return text.replace("{suffix}", suffix).replace("{domain}", domain) + + +def find_parent_group(monitor_name, groups, group_map): + for g in groups: + if monitor_name in g.get("children", []): + return group_map.get(g["name"]) + return None + + +def find_group_notifications(monitor_name, groups, notification_map): + for g in groups: + if monitor_name in g.get("children", []): + ids = {} + for n in g.get("notifications", []): + nid = notification_map.get(n) + if nid is not None: + ids[str(nid)] = True + return ids or None + return None + + def setup_uptime_kuma(dry_run=False, only=None): env_name = os.getenv("ENV", "test") - + config_path = os.path.join(os.path.dirname(__file__), "..", "config", "monitors.yml") with open(config_path, "r") as f: config = yaml.safe_load(f) - + project = config.get("project", "iklim") - + domain = os.getenv("EXTERNAL_DOMAIN", config.get("domain", {}).get("base", "iklim.co")) + suffix = os.getenv("EXTERNAL_SUBDOMAIN_SUFFIX", "") + kuma_url = os.getenv("UK_URL", "http://localhost:3001") kuma_user = os.getenv("UK_USER", "admin") kuma_pass = os.getenv("UK_PASS", "admin") - + api = None if not dry_run: logger.info(f"Connecting to Uptime Kuma at {kuma_url}...") @@ -39,7 +68,7 @@ def setup_uptime_kuma(dry_run=False, only=None): except Exception as e: logger.error(f"Login failed: {e}") return - + existing_monitors = {} if api: try: @@ -47,97 +76,312 @@ def setup_uptime_kuma(dry_run=False, only=None): existing_monitors[m['name']] = m except Exception as e: logger.error(f"Failed to get monitors: {e}") - - # 1. Process Groups + + # 0. Notification Providers + notification_map = {} + existing_notifications = {} + if api: + try: + for n in api.get_notifications(): + existing_notifications[n['name']] = n + except Exception as e: + logger.warning(f"Failed to get notifications: {e}") + + for notif_key, notif_cfg in config.get("notifications", {}).items(): + webhook_env = notif_cfg.get("webhook_env") + webhook_url = os.getenv(webhook_env, "") if webhook_env else "" + notif_name = f"{project}-{notif_key}" + + logger.info(f"Processing notification: {notif_name}") + if not dry_run: + if notif_name in existing_notifications: + notification_map[notif_key] = existing_notifications[notif_name]['id'] + logger.info(f"Notification {notif_name} already exists (id={notification_map[notif_key]})") + elif webhook_url: + try: + res = api.add_notification( + type=NotificationType.SLACK, + name=notif_name, + isDefault=False, + webhookURL=webhook_url, + applyExisting=False + ) + notification_map[notif_key] = res.get('id') + logger.info(f"Created notification: {notif_name}") + except Exception as e: + logger.warning(f"Failed to create notification {notif_name}: {e}") + else: + logger.warning(f"Skipping {notif_name}: env var {webhook_env} is not set") + + # 1. Groups group_map = {} for g in config.get("groups", []): raw_name = g["name"] formatted_name = f"{project} [{env_name}] {raw_name}" - + + notif_ids = {} + for n in g.get("notifications", []): + nid = notification_map.get(n) + if nid is not None: + notif_ids[str(nid)] = True + logger.info(f"Processing group: {formatted_name}") if not dry_run: if formatted_name not in existing_monitors: logger.info(f"Creating group monitor: {formatted_name}") - res = api.add_monitor(type=MonitorType.GROUP, name=formatted_name) + kwargs = {"type": MonitorType.GROUP, "name": formatted_name} + if notif_ids: + kwargs["notification_id_list"] = notif_ids + res = api.add_monitor(**kwargs) group_map[raw_name] = res['monitorID'] else: group_map[raw_name] = existing_monitors[formatted_name]['id'] tokens = {} - + # 2. Push Monitors for pm in config.get("push_monitors", []): m_name = pm["name"] if only and m_name != only: continue - + m_interval = pm.get("interval", 60) - - parent_group_id = None - for g in config.get("groups", []): - if m_name in g.get("children", []): - parent_group_id = group_map.get(g["name"]) - break - + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + logger.info(f"Processing push monitor: {m_name}") if not dry_run: if m_name in existing_monitors: logger.info(f"Monitor {m_name} already exists.") m_id = existing_monitors[m_name]['id'] - token = existing_monitors[m_name]['pushToken'] - tokens[m_name] = token - + tokens[m_name] = existing_monitors[m_name]['pushToken'] + if parent_group_id and existing_monitors[m_name].get('parent') != parent_group_id: api.edit_monitor(m_id, parent=parent_group_id) else: logger.info(f"Creating push monitor: {m_name}") - result = api.add_monitor( - type=MonitorType.PUSH, - name=m_name, - interval=m_interval, - parent=parent_group_id - ) + kwargs = { + "type": MonitorType.PUSH, + "name": m_name, + "interval": m_interval, + "parent": parent_group_id + } + if notif_ids: + kwargs["notification_id_list"] = notif_ids + result = api.add_monitor(**kwargs) m_id = result['monitorID'] - - # Fetch again to get pushToken + for m in api.get_monitors(): if m['id'] == m_id: tokens[m_name] = m['pushToken'] break else: tokens[m_name] = "dummy_token_dry_run" - - # 3. Process Status Pages - for sp in config.get("status_pages", []): - slug = format_str(sp["slug"], env_name, project) - title = format_str(sp["title"], env_name, project) - logger.info(f"Processing status page: {title} (slug: {slug})") + + # 3. HTTP Monitors + for hm in config.get("http_monitors", []): + m_name = hm["name"] + if only and m_name != only: + continue + url = resolve_template(hm["url"], suffix, domain) + interval = hm.get("interval", 60) + accepted_statuscodes = hm.get("accepted_statuscodes", ["200"]) + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + + logger.info(f"Processing HTTP monitor: {m_name} -> {url}") if not dry_run: + if m_name in existing_monitors: + logger.info(f"Monitor {m_name} already exists.") + else: + try: + kwargs = { + "type": MonitorType.HTTP, + "name": m_name, + "url": url, + "interval": interval, + "accepted_statuscodes": accepted_statuscodes, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notification_id_list"] = notif_ids + api.add_monitor(**kwargs) + logger.info(f"Created HTTP monitor: {m_name}") + except Exception as e: + logger.warning(f"Failed to create HTTP monitor {m_name}: {e}") + + # 4. DNS Monitors + for dm in config.get("dns_monitors", []): + m_name = dm["name"] + if only and m_name != only: + continue + hostname = resolve_template(dm["hostname"], suffix, domain) + dns_resolve_type = dm.get("dns_resolve_type", "A") + interval = dm.get("interval", 60) + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + + logger.info(f"Processing DNS monitor: {m_name} -> {hostname}") + if not dry_run: + if m_name in existing_monitors: + logger.info(f"Monitor {m_name} already exists.") + else: + try: + kwargs = { + "type": MonitorType.DNS, + "name": m_name, + "hostname": hostname, + "dns_resolve_type": dns_resolve_type, + "interval": interval, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notification_id_list"] = notif_ids + api.add_monitor(**kwargs) + logger.info(f"Created DNS monitor: {m_name}") + except Exception as e: + logger.warning(f"Failed to create DNS monitor {m_name}: {e}") + + # 5. Ping Monitors (generated from nodes config) + ping_cfg = config.get("ping_monitors", {}) + ping_interval = ping_cfg.get("interval", 60) + ping_retries = ping_cfg.get("max_retries", 1) + env_nodes = config.get("nodes", {}).get(env_name, {}) + + for i, node in enumerate(env_nodes.get("service", []), 1): + m_name = f"EXT-PING-APP{i:02d}" + if only and m_name != only: + continue + ip = node["ip"] + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + + logger.info(f"Processing Ping monitor: {m_name} -> {ip}") + if not dry_run: + if m_name in existing_monitors: + logger.info(f"Monitor {m_name} already exists.") + else: + try: + kwargs = { + "type": MonitorType.PING, + "name": m_name, + "hostname": ip, + "interval": ping_interval, + "max_retries": ping_retries, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notification_id_list"] = notif_ids + api.add_monitor(**kwargs) + logger.info(f"Created Ping monitor: {m_name}") + except Exception as e: + logger.warning(f"Failed to create Ping monitor {m_name}: {e}") + + for i, node in enumerate(env_nodes.get("db", []), 1): + m_name = f"EXT-PING-DB{i:02d}" + if only and m_name != only: + continue + ip = node["ip"] + parent_group_id = find_parent_group(m_name, config.get("groups", []), group_map) + notif_ids = find_group_notifications(m_name, config.get("groups", []), notification_map) + + logger.info(f"Processing Ping monitor: {m_name} -> {ip}") + if not dry_run: + if m_name in existing_monitors: + logger.info(f"Monitor {m_name} already exists.") + else: + try: + kwargs = { + "type": MonitorType.PING, + "name": m_name, + "hostname": ip, + "interval": ping_interval, + "max_retries": ping_retries, + } + if parent_group_id is not None: + kwargs["parent"] = parent_group_id + if notif_ids: + kwargs["notification_id_list"] = notif_ids + api.add_monitor(**kwargs) + logger.info(f"Created Ping monitor: {m_name}") + except Exception as e: + logger.warning(f"Failed to create Ping monitor {m_name}: {e}") + + # 6. Status Pages + if api: + all_monitors = {} + try: + for m in api.get_monitors(): + all_monitors[m['name']] = m + except Exception as e: + logger.warning(f"Failed to re-fetch monitors for status pages: {e}") + + existing_pages = {} + try: + for p in api.get_status_pages(): + existing_pages[p['slug']] = p + except Exception as e: + logger.warning(f"Failed to get status pages: {e}") + + for sp in config.get("status_pages", []): + slug = format_str(sp["slug"], env_name, project) + title = format_str(sp["title"], env_name, project) + is_public = sp.get("public", False) + sp_groups = sp.get("groups", []) + + logger.info(f"Processing status page: {title} (slug: {slug})") try: - pages = api.get_status_pages() - exists = any(p['slug'] == slug for p in pages) - if not exists: + if slug not in existing_pages: logger.info(f"Creating status page: {slug}") api.add_status_page(slug, title) + + # Each monitors.yml group becomes one display section on the status page. + # The GROUP monitor is added so Uptime Kuma renders it with all its children. + public_group_list = [] + for group_raw_name in sp_groups: + group_formatted = f"{project} [{env_name}] {group_raw_name}" + group_monitor = all_monitors.get(group_formatted) + if not group_monitor: + logger.warning(f"Group '{group_formatted}' not found, skipping in status page") + continue + public_group_list.append({ + "name": group_raw_name, + "weight": len(public_group_list) + 1, + "monitorList": [{"id": group_monitor['id']}] + }) + + if public_group_list: + api.save_status_page( + slug=slug, + title=title, + publicGroupList=public_group_list, + published=is_public + ) + logger.info(f"Saved status page '{slug}' with {len(public_group_list)} group(s)") except Exception as e: - logger.warning(f"Status page ops failed: {e}") - - # 4. Write tokens to uk_tokens.yml + logger.warning(f"Status page ops failed for {slug}: {e}") + + # 7. Write push tokens to uk_tokens.yml token_file = os.path.join(os.path.dirname(__file__), "..", "config", "generated", "uk_tokens.yml") if not dry_run: + os.makedirs(os.path.dirname(token_file), exist_ok=True) with open(token_file, "w") as f: yaml.dump(tokens, f) logger.info(f"Saved push tokens to {token_file}") else: logger.info(f"[DRY-RUN] Would save {len(tokens)} tokens to {token_file}") - + if api: api.disconnect() + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Setup Uptime Kuma monitors") parser.add_argument("--dry-run", action="store_true", help="Print actions without making changes") parser.add_argument("--only", type=str, help="Only process a specific monitor by name") args = parser.parse_args() - + setup_uptime_kuma(dry_run=args.dry_run, only=args.only)