# Change summary:
# - Add health-agent README with architecture, config, and deployment docs
# - Add deploy-monitoring-test.yml workflow (mirrors prod, test-runner, test storagebox paths)
# - Add health-agent service to docker-stack-monitoring.yml
# - Add .env.example with all runtime variables and .gitignore for generated files
# - Add config/generated/.gitkeep to track empty generated directory
# - Translate all Turkish group names and status page titles in monitors.yml to English
# - Remove users.yml.example (Dozzle was removed in previous commit)
197 lines
5.2 KiB
YAML
197 lines
5.2 KiB
YAML
# Top-level identification of this monitoring configuration.
# `version` is quoted so it is read as the string "1", not the integer 1.
version: "1"
project: "iklim"

# Base DNS name. Presumably the value substituted for the {domain}
# placeholder used in monitor URLs/hostnames -- TODO confirm against the
# tool that consumes this file.
domain:
  base: "iklim.co"
# Host inventory: environment (prod, test) -> role (service, db) -> list of
# nodes, each with a name and an IP address.
# IPs are quoted so they are always read as strings.
nodes:
  prod:
    service:
      - name: iklim-app-01
        ip: "178.104.210.41"
      - name: iklim-app-02
        ip: "178.105.69.1"
      - name: iklim-app-03
        ip: "178.104.219.3"
    db:
      - name: iklim-db-01
        ip: "159.69.117.158"
      - name: iklim-db-02
        ip: "178.104.219.162"
      - name: iklim-db-03
        ip: "159.69.115.105"
  test:
    service:
      - name: iklim-app-01
        ip: "167.235.194.61"
    db:
      - name: iklim-db-01
        ip: "167.235.205.93"
# Tag names used by the `tags:` lists on groups and push monitors in this file.
tags:
  - external
  - internal
  - high
  - medium
  - low
  - database
  - gateway
  - infrastructure
  - observability
# Notification channels, referenced by name from `groups[].notifications`.
# `webhook_env` presumably names the environment variable that holds the
# Slack webhook URL, keeping the secret out of this file -- TODO confirm.
notifications:
  slack-high:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_HIGH
  slack-medium:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_MEDIUM
  slack-low:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_LOW
# Monitor groups. Each group lists its child monitors by name, the
# notification channels it uses, its tags, and the status page slug it
# belongs to ({env} is a placeholder filled in by the consumer).
#
# NOTE(review): the EXT-PING-* children are not defined in any *_monitors
# list in this file; presumably they are generated per node from `nodes`
# using the `ping_monitors` defaults -- confirm, and confirm how the test
# environment (one app node, one db node) handles the APP02/03 and DB02/03
# references.
groups:
  - name: "Infrastructure"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, infrastructure]
    children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS]
  - name: "Data Layer"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, database]
    children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET]
  - name: "Gateway & Messaging"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, gateway]
    children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL]
  - name: "External Availability - Critical"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [external, high]
    children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03]
  - name: "External Availability - General"
    status_page: "iklim-{env}-ops"
    notifications: [slack-medium]
    tags: [external, medium]
    children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03]
  - name: "Observability"
    status_page: "iklim-{env}-tools"
    notifications: [slack-low]
    tags: [internal, observability]
    children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW]
# Push-type monitors, one per internal component.
# `interval` is presumably in seconds (values range from 60 to 3600) and
# `restart_threshold` presumably a failure count before a restart action --
# neither unit nor semantics is shown in this file; TODO confirm.
push_monitors:
  - name: SWARM-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: VAULT-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: ETCD-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: PATRONI-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: MONGODB-REPLICASET
    interval: 120
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: APISIX-GATEWAY
    interval: 60
    heartbeat_retries: 1
    tags: [internal, gateway, high]
    restart_threshold: 1
  - name: RABBITMQ-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, gateway, medium]
    restart_threshold: 3
  - name: REDIS-SENTINEL
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, medium]
    restart_threshold: 3
  - name: SWAG-TLS
    interval: 3600
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 3
  - name: STORAGEBOX-MOUNT
    interval: 300
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 1
  - name: PROMETHEUS
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: GRAFANA
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: PORTAINER
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: LOKI
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
# HTTP(S) monitors for externally reachable endpoints.
# {suffix} and {domain} are placeholders filled in by the consumer.
# Status codes are quoted strings. The Portainer and API gateway root URLs
# also accept 401/403, so an auth challenge is treated as a passing check.
http_monitors:
  - name: EXT-HTTPS-API
    url: "https://api{suffix}.{domain}/actuator/health"
    accepted_statuscodes: ["200"]
    interval: 60
  - name: EXT-HTTPS-GRAFANA
    url: "https://grafana{suffix}.{domain}/api/health"
    accepted_statuscodes: ["200"]
    interval: 60
  - name: EXT-HTTPS-PORTAINER
    url: "https://portainer{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
  - name: EXT-HTTPS-APIGW
    url: "https://apigw{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
# DNS monitors: resolve the A record of the API hostname and of the root
# domain. {suffix} and {domain} are placeholders filled in by the consumer.
dns_monitors:
  - name: EXT-DNS-API
    hostname: "api{suffix}.{domain}"
    dns_resolve_type: A
    interval: 60
  - name: EXT-DNS-ROOT
    hostname: "{domain}"
    dns_resolve_type: A
    interval: 60
# Settings for ping monitors. Unlike the other *_monitors sections this is
# a mapping, not a list: it carries no per-monitor entries, only shared
# values. Presumably applied to ping monitors generated from `nodes` --
# TODO confirm.
ping_monitors:
  interval: 60
  max_retries: 1
# Status pages. `slug` and `title` may contain the {env} placeholder;
# `groups` lists group names exactly as declared under `groups[].name`.
#
# NOTE(review): "External Availability - Critical" is listed on both the
# public "-status" page and the private "-ops" page, while the group's own
# `status_page` field names only "iklim-{env}-ops" -- confirm which of the
# two declarations the consumer treats as authoritative.
status_pages:
  - slug: "iklim-{env}-status"
    title: "iklim.co API Status"
    public: true
    groups: ["External Availability - Critical"]
  - slug: "iklim-{env}-ops"
    title: "iklim.co [{env}] Infrastructure"
    public: false
    groups:
      - "Infrastructure"
      - "Data Layer"
      - "Gateway & Messaging"
      - "External Availability - Critical"
      - "External Availability - General"
  - slug: "iklim-{env}-tools"
    title: "iklim.co [{env}] Tools"
    public: false
    groups: ["Observability"]