---
# Monitoring / status-page definition for the "iklim" project.
#
# Placeholders such as {env}, {suffix} and {domain} appear inside string
# values below. They are presumably substituted by the tool that consumes
# this file -- TODO confirm the exact substitution rules with that tool.

version: "1"
project: "iklim"

domain:
  base: "iklim.co"

# Hosts per environment (prod / test), split into "service" and "db" roles.
# IPs are quoted so they are always read as strings.
nodes:
  prod:
    service:
      - name: iklim-app-01
        ip: "178.104.210.41"
      - name: iklim-app-02
        ip: "178.105.69.1"
      - name: iklim-app-03
        ip: "178.104.219.3"
    db:
      - name: iklim-db-01
        ip: "159.69.117.158"
      - name: iklim-db-02
        ip: "178.104.219.162"
      - name: iklim-db-03
        ip: "159.69.115.105"
  test:
    service:
      - name: iklim-app-01
        ip: "167.235.194.61"
    db:
      - name: iklim-db-01
        ip: "167.235.205.93"

# Full set of tag names referenced by the groups and monitors below.
tags:
  - external
  - internal
  - high
  - medium
  - low
  - database
  - gateway
  - infrastructure
  - observability

# Notification channels, keyed by the name that groups reference.
# webhook_env presumably names an environment variable that holds the Slack
# webhook URL (no URL is stored in this file) -- verify against the consumer.
notifications:
  slack-high:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_HIGH
  slack-medium:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_MEDIUM
  slack-low:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_LOW

# Monitor groups. Each entry in "children" is a monitor name.
# NOTE(review): the EXT-PING-* children are not defined explicitly anywhere in
# this file; presumably they are generated per node from "nodes" using the
# "ping_monitors" defaults. The test environment lists only app-01 and db-01,
# so confirm how missing APP02/APP03/DB02/DB03 children are handled there.
groups:
  - name: "Infrastructure"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, infrastructure]
    children:
      - SWARM-CLUSTER
      - VAULT-CLUSTER
      - STORAGEBOX-MOUNT
      - SWAG-TLS

  - name: "Data Layer"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, database]
    children:
      - ETCD-CLUSTER
      - PATRONI-CLUSTER
      - MONGODB-REPLICASET

  # NOTE(review): REDIS-SENTINEL is grouped here (gateway), but its own
  # push monitor entry is tagged "database" -- confirm which is intended.
  - name: "Gateway & Messaging"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, gateway]
    children:
      - APISIX-GATEWAY
      - RABBITMQ-CLUSTER
      - REDIS-SENTINEL

  # NOTE(review): this group is assigned to the "-ops" page here, yet it is
  # also listed under the "iklim-{env}-status" page in status_pages below.
  # Confirm which of the two declarations the consumer honours.
  - name: "External Availability - Critical"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [external, high]
    children:
      - EXT-HTTPS-API
      - EXT-DNS-API
      - EXT-DNS-ROOT
      - EXT-PING-APP01
      - EXT-PING-APP02
      - EXT-PING-APP03

  - name: "External Availability - General"
    status_page: "iklim-{env}-ops"
    notifications: [slack-medium]
    tags: [external, medium]
    children:
      - EXT-HTTPS-GRAFANA
      - EXT-PING-DB01
      - EXT-PING-DB02
      - EXT-PING-DB03

  - name: "Observability"
    status_page: "iklim-{env}-tools"
    notifications: [slack-low]
    tags: [internal, observability]
    children:
      - PROMETHEUS
      - GRAFANA
      - PORTAINER
      - LOKI
      - EXT-HTTPS-PORTAINER
      - EXT-HTTPS-APIGW

# Push (heartbeat) monitors.
# "interval" is presumably in seconds and "restart_threshold" a failure count
# before a restart action -- neither unit is stated in this file; TODO confirm.
push_monitors:
  - name: SWARM-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1

  - name: VAULT-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1

  - name: ETCD-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1

  - name: PATRONI-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1

  - name: MONGODB-REPLICASET
    interval: 120
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1

  - name: APISIX-GATEWAY
    interval: 60
    heartbeat_retries: 1
    tags: [internal, gateway, high]
    restart_threshold: 1

  - name: RABBITMQ-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, gateway, medium]
    restart_threshold: 3

  - name: REDIS-SENTINEL
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, medium]
    restart_threshold: 3

  - name: SWAG-TLS
    interval: 3600
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 3

  - name: STORAGEBOX-MOUNT
    interval: 300
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 1

  - name: PROMETHEUS
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5

  - name: GRAFANA
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5

  - name: PORTAINER
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5

  - name: LOKI
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5

# HTTP(S) checks. Status codes are quoted so they stay strings.
# The Portainer and API-gateway checks also accept 401/403 in addition to 200.
http_monitors:
  - name: EXT-HTTPS-API
    url: "https://api{suffix}.{domain}/actuator/health"
    accepted_statuscodes: ["200"]
    interval: 60

  - name: EXT-HTTPS-GRAFANA
    url: "https://grafana{suffix}.{domain}/api/health"
    accepted_statuscodes: ["200"]
    interval: 60

  - name: EXT-HTTPS-PORTAINER
    url: "https://portainer{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120

  - name: EXT-HTTPS-APIGW
    url: "https://apigw{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120

# DNS resolution checks (A records).
dns_monitors:
  - name: EXT-DNS-API
    hostname: "api{suffix}.{domain}"
    dns_resolve_type: A
    interval: 60

  - name: EXT-DNS-ROOT
    hostname: "{domain}"
    dns_resolve_type: A
    interval: 60

# Settings for ping monitors. This is a mapping, not a list: no individual
# ping monitors are enumerated in this file.
ping_monitors:
  interval: 60
  max_retries: 1

# Status pages; "groups" entries must match a group "name" above exactly.
status_pages:
  - slug: "iklim-{env}-status"
    title: "iklim.co API Status"
    public: true
    groups: ["External Availability - Critical"]

  - slug: "iklim-{env}-ops"
    title: "iklim.co [{env}] Infrastructure"
    public: false
    groups:
      - "Infrastructure"
      - "Data Layer"
      - "Gateway & Messaging"
      - "External Availability - Critical"
      - "External Availability - General"

  - slug: "iklim-{env}-tools"
    title: "iklim.co [{env}] Tools"
    public: false
    groups: ["Observability"]