# Monitoring definition for the "iklim" project: node inventory, tag
# vocabulary, notification channels, monitor groups, push monitors and
# HTTP monitors.
# NOTE(review): the tool that consumes this file and its schema are not
# visible here; the comments describe only what this file declares.
version: "1"
project: "iklim"

domain:
  base: "iklim.co"

# Host inventory, keyed by environment and then by role (service / db).
nodes:
  prod:
    service:
      - name: iklim-app-01
        ip: "178.104.210.41"
      - name: iklim-app-02
        ip: "178.105.69.1"
      - name: iklim-app-03
        ip: "178.104.219.3"
    db:
      - name: iklim-db-01
        ip: "159.69.117.158"
      - name: iklim-db-02
        ip: "178.104.219.162"
      - name: iklim-db-03
        ip: "159.69.115.105"
  test:
    service:
      - name: iklim-app-01
        ip: "167.235.194.61"
    db:
      - name: iklim-db-01
        ip: "167.235.205.93"

# Every tag referenced by a group or monitor below appears in this list.
tags:
  - external
  - internal
  - high
  - medium
  - low
  - database
  - gateway
  - infrastructure
  - observability

# Named notification channels, referenced by name from `groups`.
# presumably `webhook_env` names an environment variable that holds the
# Slack webhook URL — confirm against the consuming tool.
notifications:
  slack-high:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_HIGH
  slack-medium:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_MEDIUM
  slack-low:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_LOW

# Monitor groups. `children` lists monitors by name.
# `{env}` is a placeholder — presumably substituted per environment by the
# consuming tool; confirm.
# NOTE(review): the "Ext Ping ..." children are not defined explicitly in
# this file — presumably they are generated from `nodes` together with
# `ping_monitors`; confirm.
groups:
  - name: "Infrastructure"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, infrastructure]
    children:
      - Swarm Cluster
      - Vault Cluster
      - Storagebox Mount
      - Swag Tls
  - name: "Data Layer"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, database]
    children:
      - Etcd Cluster
      - Patroni Cluster
      - Mongodb Replicaset
  - name: "Gateway & Messaging"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, gateway]
    children:
      - Apisix Gateway
      - Rabbitmq Cluster
      - Redis Sentinel
  - name: "External Availability - Critical"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [external, high]
    children:
      - Ext Https Api
      - Ext Dns Api
      - Ext Dns Root
      - Ext Ping App01
      - Ext Ping App02
      - Ext Ping App03
  - name: "External Availability - General"
    status_page: "iklim-{env}-ops"
    notifications: [slack-medium]
    tags: [external, medium]
    children:
      - Ext Https Grafana
      - Ext Ping Db01
      - Ext Ping Db02
      - Ext Ping Db03
  - name: "Observability"
    status_page: "iklim-{env}-tools"
    notifications: [slack-low]
    tags: [internal, observability]
    children:
      - Prometheus
      - Grafana
      - Portainer
      - Loki
      - Ext Https Portainer
      - Ext Https Apigw

# Push (heartbeat) monitors, one entry per monitored component.
# NOTE(review): the unit of `interval` and the exact meaning of
# `restart_threshold` are defined by the consuming tool — confirm there.
push_monitors:
  - name: Swarm Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: Vault Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: Etcd Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: Patroni Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: Mongodb Replicaset
    interval: 120
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: Apisix Gateway
    interval: 75
    heartbeat_retries: 1
    tags: [internal, gateway, high]
    restart_threshold: 1
  - name: Rabbitmq Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, gateway, medium]
    restart_threshold: 3
  # NOTE(review): tagged `database`, yet listed as a child of the
  # "Gateway & Messaging" group (tagged `gateway`) — confirm the tag is
  # intentional.
  - name: Redis Sentinel
    interval: 75
    heartbeat_retries: 1
    tags: [internal, database, medium]
    restart_threshold: 3
  - name: Swag Tls
    interval: 3600
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 3
  - name: Storagebox Mount
    interval: 300
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 1
  - name: Prometheus
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: Grafana
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: Portainer
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: Loki
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5

# HTTP(S) checks against public endpoints.
# `{suffix}` and `{domain}` are placeholders — presumably filled in per
# environment by the consuming tool; confirm.
# Status codes are kept as quoted strings, exactly as in the original.
http_monitors:
  - name: Ext Https Api
    url: "https://api{suffix}.{domain}/health"
    accepted_statuscodes: ["200"]
    interval: 60
  - name: Ext Https Grafana
    url: "https://grafana{suffix}.{domain}/api/health"
    accepted_statuscodes: ["200"]
    interval: 60
  # 401/403 are accepted in addition to 200 for the two endpoints below.
  - name: Ext Https Portainer
    url: "https://portainer{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
  - name: Ext Https Apigw
    url: "https://apigw{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
# DNS resolution checks (A records).
# `{suffix}` and `{domain}` are placeholders — presumably filled in per
# environment by the consuming tool; confirm.
dns_monitors:
  - name: Ext Dns Api
    hostname: "api{suffix}.{domain}"
    dns_resolve_type: A
    interval: 60
  - name: Ext Dns Root
    hostname: "{domain}"
    dns_resolve_type: A
    interval: 60

# Ping settings are a single mapping rather than a list of monitors.
# NOTE(review): presumably these values apply to ping monitors generated
# for each entry under `nodes` (the groups reference "Ext Ping ..." names
# that are not defined explicitly) — confirm against the consuming tool.
ping_monitors:
  interval: 60
  max_retries: 1

# Status pages and the monitor groups each one displays.
# Only the first page is marked public.
# NOTE(review): "External Availability - Critical" is listed on both the
# public "status" page and the "ops" page, while the group's own
# `status_page` field names only the "ops" page — confirm that is intended.
status_pages:
  - slug: "iklim-{env}-status"
    title: "iklim.co API Status"
    public: true
    groups:
      - "External Availability - Critical"
  - slug: "iklim-{env}-ops"
    title: "iklim.co [{env}] Infrastructure"
    public: false
    groups:
      - "Infrastructure"
      - "Data Layer"
      - "Gateway & Messaging"
      - "External Availability - Critical"
      - "External Availability - General"
  - slug: "iklim-{env}-tools"
    title: "iklim.co [{env}] Tools"
    public: false
    groups:
      - "Observability"