diff --git a/health-agent/config/monitors.yml b/health-agent/config/monitors.yml index 26edf01..3cfe7fd 100644 --- a/health-agent/config/monitors.yml +++ b/health-agent/config/monitors.yml @@ -50,126 +50,126 @@ groups: status_page: "iklim-{env}-ops" notifications: [slack-high] tags: [internal, infrastructure] - children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS] + children: [Swarm Cluster, Vault Cluster, Storagebox Mount, Swag Tls] - name: "Data Layer" status_page: "iklim-{env}-ops" notifications: [slack-high] tags: [internal, database] - children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET] + children: [Etcd Cluster, Patroni Cluster, Mongodb Replicaset] - name: "Gateway & Messaging" status_page: "iklim-{env}-ops" notifications: [slack-high] tags: [internal, gateway] - children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL] + children: [Apisix Gateway, Rabbitmq Cluster, Redis Sentinel] - name: "External Availability - Critical" status_page: "iklim-{env}-ops" notifications: [slack-high] tags: [external, high] - children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03] + children: [Ext Https Api, Ext Dns Api, Ext Dns Root, Ext Ping App01, Ext Ping App02, Ext Ping App03] - name: "External Availability - General" status_page: "iklim-{env}-ops" notifications: [slack-medium] tags: [external, medium] - children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03] + children: [Ext Https Grafana, Ext Ping Db01, Ext Ping Db02, Ext Ping Db03] - name: "Observability" status_page: "iklim-{env}-tools" notifications: [slack-low] tags: [internal, observability] - children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW] + children: [Prometheus, Grafana, Portainer, Loki, Ext Https Portainer, Ext Https Apigw] push_monitors: - - name: SWARM-CLUSTER + - name: Swarm Cluster interval: 60 heartbeat_retries: 1 tags: [internal, infrastructure, high] restart_threshold: 1 - - name: VAULT-CLUSTER + - name: Vault Cluster interval: 60 heartbeat_retries: 1 tags: [internal, infrastructure, high] restart_threshold: 1 - - name: ETCD-CLUSTER + - name: Etcd Cluster interval: 60 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 - - name: PATRONI-CLUSTER + - name: Patroni Cluster interval: 60 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 - - name: MONGODB-REPLICASET + - name: Mongodb Replicaset interval: 120 heartbeat_retries: 1 tags: [internal, database, high] restart_threshold: 1 - - name: APISIX-GATEWAY + - name: Apisix Gateway interval: 60 heartbeat_retries: 1 tags: [internal, gateway, high] restart_threshold: 1 - - name: RABBITMQ-CLUSTER + - name: Rabbitmq Cluster interval: 60 heartbeat_retries: 1 tags: [internal, gateway, medium] restart_threshold: 3 - - name: REDIS-SENTINEL + - name: Redis Sentinel interval: 60 heartbeat_retries: 1 tags: [internal, database, medium] restart_threshold: 3 - - name: SWAG-TLS + - name: Swag Tls interval: 3600 heartbeat_retries: 1 tags: [internal, infrastructure, medium] restart_threshold: 3 - - name: STORAGEBOX-MOUNT + - name: Storagebox Mount interval: 300 heartbeat_retries: 1 tags: [internal, infrastructure, medium] restart_threshold: 1 - - name: PROMETHEUS + - name: Prometheus interval: 120 heartbeat_retries: 1 tags: [internal, observability, low] restart_threshold: 5 - - name: GRAFANA + - name: Grafana interval: 120 heartbeat_retries: 1 tags: [internal, observability, low] restart_threshold: 5 - - name: PORTAINER + - name: Portainer interval: 120 heartbeat_retries: 1 tags: [internal, observability, low] restart_threshold: 5 - - name: LOKI + - name: Loki interval: 120 heartbeat_retries: 1 tags: [internal, observability, low] restart_threshold: 5 http_monitors: - - name: EXT-HTTPS-API + - name: Ext Https Api url: "https://api{suffix}.{domain}/actuator/health" accepted_statuscodes: ["200"] interval: 60 - - name: EXT-HTTPS-GRAFANA + - name: Ext Https Grafana url: "https://grafana{suffix}.{domain}/api/health" accepted_statuscodes: ["200"] interval: 60 - - name: EXT-HTTPS-PORTAINER + - name: Ext Https Portainer url: "https://portainer{suffix}.{domain}" accepted_statuscodes: ["200", "401", "403"] interval: 120 - - name: EXT-HTTPS-APIGW + - name: Ext Https Apigw url: "https://apigw{suffix}.{domain}" accepted_statuscodes: ["200", "401", "403"] interval: 120 dns_monitors: - - name: EXT-DNS-API + - name: Ext Dns Api hostname: "api{suffix}.{domain}" dns_resolve_type: A interval: 60 - - name: EXT-DNS-ROOT + - name: Ext Dns Root hostname: "{domain}" dns_resolve_type: A interval: 60 diff --git a/health-agent/deploy/prod.env b/health-agent/deploy/prod.env index 0a993b4..969ab4c 100644 --- a/health-agent/deploy/prod.env +++ b/health-agent/deploy/prod.env @@ -1,2 +1,2 @@ -SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:e262bf6e6712862ba24551dc326411ebb0987da59072834b2923bd73cb5c9d3b +SOURCE_IMAGE_DIGEST=registry.tarla.io/iklimco/health-agent@sha256:a2ed1cbaabf116e49d1685e37e0335798d1fe49a2d95457717c68b1576894062 PROD_IMAGE_TAG=0.1.0 \ No newline at end of file diff --git a/health-agent/scripts/setup_uptime_kuma.py b/health-agent/scripts/setup_uptime_kuma.py index 2f6c3c7..75841af 100644 --- a/health-agent/scripts/setup_uptime_kuma.py +++ b/health-agent/scripts/setup_uptime_kuma.py @@ -251,7 +251,7 @@ def setup_uptime_kuma(dry_run=False, only=None): env_nodes = config.get("nodes", {}).get(env_name, {}) for i, node in enumerate(env_nodes.get("service", []), 1): - m_name = f"EXT-PING-APP{i:02d}" + m_name = f"Ext Ping App{i:02d}" if only and m_name != only: continue ip = node["ip"] @@ -281,7 +281,7 @@ def setup_uptime_kuma(dry_run=False, only=None): logger.warning(f"Failed to create Ping monitor {m_name}: {e}") for i, node in enumerate(env_nodes.get("db", []), 1): - m_name = f"EXT-PING-DB{i:02d}" + m_name = f"Ext Ping Db{i:02d}" if only and m_name != only: continue ip = node["ip"]