Murat ÖZDEMİR 969c4a2301 fix(monitoring): resolve health-agent bugs and flapping monitors
- Vault flapping: Fix resp evaluation on HTTP 429
- Storagebox block: Move mount check to a daemon thread
- Push monitors: Increase interval to 75s and restore 60s sleep
- Redis Sentinel: Fix authentication in sentinel_kwargs
- Ext Https Api: Update URL to /health
2026-06-26 22:51:15 +03:00

197 lines
5.2 KiB
YAML

# Top-level identity of this monitoring definition.
# `version` is presumably the schema version of this file — TODO confirm.
version: "1"
project: "iklim"
# Base DNS name. Presumably the value behind the {domain} placeholder used in
# monitor URLs and hostnames further down — confirm against the consumer.
domain:
  base: "iklim.co"
# Host inventory: environment -> role (service | db) -> list of name/ip pairs.
# Node names repeat across environments (iklim-app-01 and iklim-db-01 exist in
# both prod and test) with different addresses, so a name is only unique
# within its environment.
# NOTE(review): groups reference "Ext Ping App01..03" / "Ext Ping Db01..03",
# which are not declared as monitors in this file; presumably they are
# generated from these nodes. Confirm how the single-node test environment is
# handled.
nodes:
  prod:
    service:
      - name: iklim-app-01
        ip: "178.104.210.41"
      - name: iklim-app-02
        ip: "178.105.69.1"
      - name: iklim-app-03
        ip: "178.104.219.3"
    db:
      - name: iklim-db-01
        ip: "159.69.117.158"
      - name: iklim-db-02
        ip: "178.104.219.162"
      - name: iklim-db-03
        ip: "159.69.115.105"
  test:
    service:
      - name: iklim-app-01
        ip: "167.235.194.61"
    db:
      - name: iklim-db-01
        ip: "167.235.205.93"
# Tag vocabulary. Every tag used by the groups and push monitors in this file
# appears in this list. The blank-comment separators below are only a reading
# aid; the list itself is flat.
tags:
  #
  - external
  - internal
  #
  - high
  - medium
  - low
  #
  - database
  - gateway
  - infrastructure
  - observability
# Notification channels, keyed by the name that groups use in their
# `notifications` lists (slack-high, slack-medium, slack-low).
# `webhook_env` presumably names an environment variable holding the webhook
# URL, so the URL itself never appears in this file — TODO confirm.
notifications:
  slack-high:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_HIGH
  slack-medium:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_MEDIUM
  slack-low:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_LOW
# Monitor groups. Each group names a status page, the notification channels
# to use, its tags, and the monitors it contains (`children`).
# Children refer to monitors by name: push, http and dns monitors are declared
# in this file; the "Ext Ping ..." names are not, and are presumably derived
# from `nodes` together with `ping_monitors` — TODO confirm.
# {env} is a placeholder, presumably filled in per environment by the
# consuming tool.
groups:
  - name: "Infrastructure"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, infrastructure]
    children:
      - Swarm Cluster
      - Vault Cluster
      - Storagebox Mount
      - Swag Tls
  - name: "Data Layer"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, database]
    children:
      - Etcd Cluster
      - Patroni Cluster
      - Mongodb Replicaset
  - name: "Gateway & Messaging"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, gateway]
    children:
      - Apisix Gateway
      - Rabbitmq Cluster
      - Redis Sentinel
  # NOTE(review): this group points at the ops page here, but the
  # `status_pages` section also lists it on "iklim-{env}-status". Confirm
  # which of the two declarations the consumer honours.
  - name: "External Availability - Critical"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [external, high]
    children:
      - Ext Https Api
      - Ext Dns Api
      - Ext Dns Root
      - Ext Ping App01
      - Ext Ping App02
      - Ext Ping App03
  - name: "External Availability - General"
    status_page: "iklim-{env}-ops"
    notifications: [slack-medium]
    tags: [external, medium]
    children:
      - Ext Https Grafana
      - Ext Ping Db01
      - Ext Ping Db02
      - Ext Ping Db03
  - name: "Observability"
    status_page: "iklim-{env}-tools"
    notifications: [slack-low]
    tags: [internal, observability]
    children:
      - Prometheus
      - Grafana
      - Portainer
      - Loki
      - Ext Https Portainer
      - Ext Https Apigw
# Push (heartbeat) monitors, one per internally checked component.
# `interval` is in seconds. `heartbeat_retries` and `restart_threshold` are
# interpreted by the consuming agent; their exact semantics are not visible
# in this file — TODO confirm before changing them.
push_monitors:
  # --- infrastructure -------------------------------------------------------
  - name: Swarm Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: Vault Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1

  # --- data layer -----------------------------------------------------------
  - name: Etcd Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: Patroni Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: Mongodb Replicaset
    interval: 120
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1

  # --- gateway & messaging --------------------------------------------------
  - name: Apisix Gateway
    interval: 75
    heartbeat_retries: 1
    tags: [internal, gateway, high]
    restart_threshold: 1
  - name: Rabbitmq Cluster
    interval: 75
    heartbeat_retries: 1
    tags: [internal, gateway, medium]
    restart_threshold: 3
  # NOTE(review): tagged `database`, yet it is a child of the
  # "Gateway & Messaging" group, whose own tags are [internal, gateway].
  # Every other push monitor carries its group's category tag. Confirm
  # whether this is intentional before relying on tag-based filtering.
  - name: Redis Sentinel
    interval: 75
    heartbeat_retries: 1
    tags: [internal, database, medium]
    restart_threshold: 3

  # --- slow-changing infrastructure checks ----------------------------------
  - name: Swag Tls
    interval: 3600
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 3
  - name: Storagebox Mount
    interval: 300
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 1

  # --- observability --------------------------------------------------------
  - name: Prometheus
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: Grafana
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: Portainer
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: Loki
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
# HTTPS checks against public endpoints.
# {suffix} and {domain} are placeholders, presumably expanded per environment
# by the consuming tool — TODO confirm.
# Status codes are kept as quoted strings; do not unquote them.
http_monitors:
  - name: Ext Https Api
    url: "https://api{suffix}.{domain}/health"
    accepted_statuscodes: ["200"]
    interval: 60
  - name: Ext Https Grafana
    url: "https://grafana{suffix}.{domain}/api/health"
    accepted_statuscodes: ["200"]
    interval: 60
  # The two checks below probe the site root and also accept 401/403, so an
  # authentication challenge is treated as "up".
  - name: Ext Https Portainer
    url: "https://portainer{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
  - name: Ext Https Apigw
    url: "https://apigw{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
# DNS resolution checks (A records) for the API host and the bare domain.
# {suffix} and {domain} are placeholders, presumably expanded by the
# consuming tool — TODO confirm.
dns_monitors:
  - name: Ext Dns Api
    hostname: "api{suffix}.{domain}"
    dns_resolve_type: A
    interval: 60
  - name: Ext Dns Root
    hostname: "{domain}"
    dns_resolve_type: A
    interval: 60
# Ping settings. Unlike the other *_monitors sections this is a mapping, not
# a list: it carries no monitor names of its own.
# NOTE(review): presumably these are defaults applied to one generated ping
# monitor per entry in `nodes` ("Ext Ping App01", ...) — confirm.
ping_monitors:
  interval: 60
  max_retries: 1
# Status pages and the monitor groups shown on each.
# Only the "-status" page is public; "-ops" and "-tools" are private.
# "External Availability - Critical" is listed on both the public status page
# and the private ops page.
# {env} is a placeholder, presumably expanded per environment by the
# consuming tool — TODO confirm.
status_pages:
  - slug: "iklim-{env}-status"
    title: "iklim.co API Status"
    public: true
    groups: ["External Availability - Critical"]
  - slug: "iklim-{env}-ops"
    title: "iklim.co [{env}] Infrastructure"
    public: false
    groups:
      - "Infrastructure"
      - "Data Layer"
      - "Gateway & Messaging"
      - "External Availability - Critical"
      - "External Availability - General"
  - slug: "iklim-{env}-tools"
    title: "iklim.co [{env}] Tools"
    public: false
    groups: ["Observability"]