Murat ÖZDEMİR 72a91072fb feat(health-agent): add README, workflows, and translate monitors.yml to English
- Add health-agent README with architecture, config, and deployment docs
- Add deploy-monitoring-test.yml workflow (mirrors prod, test-runner, test storagebox paths)
- Add health-agent service to docker-stack-monitoring.yml
- Add .env.example with all runtime variables and .gitignore for generated files
- Add config/generated/.gitkeep to track empty generated directory
- Translate all Turkish group names and status page titles in monitors.yml to English
- Remove users.yml.example (Dozzle was removed in previous commit)
2026-06-25 19:20:25 +03:00

197 lines
5.2 KiB
YAML

# File format version; quoted so it is read as a string, not an integer.
version: "1"
# Project identifier.
project: "iklim"
domain:
  # Base domain. Presumably the value substituted for the {domain}
  # placeholder used in monitor URLs/hostnames — TODO confirm.
  base: "iklim.co"
# Hosts per environment (prod, test), split by role (service, db).
# Each entry is a host name plus its IP address; IPs are quoted strings.
# NOTE(review): the EXT-PING-* monitors referenced by groups are not
# defined explicitly in this file; presumably they are generated from
# these entries — confirm against the generator.
nodes:
  prod:
    service:
      - name: iklim-app-01
        ip: "178.104.210.41"
      - name: iklim-app-02
        ip: "178.105.69.1"
      - name: iklim-app-03
        ip: "178.104.219.3"
    db:
      - name: iklim-db-01
        ip: "159.69.117.158"
      - name: iklim-db-02
        ip: "178.104.219.162"
      - name: iklim-db-03
        ip: "159.69.115.105"
  test:
    service:
      - name: iklim-app-01
        ip: "167.235.194.61"
    db:
      - name: iklim-db-01
        ip: "167.235.205.93"
# Tag vocabulary. The groups and push monitors in this file only use
# tags from this list (scope, severity, and component area).
tags:
  - external
  - internal
  - high
  - medium
  - low
  - database
  - gateway
  - infrastructure
  - observability
# Notification channels, keyed by the name that groups reference in their
# `notifications` list. `webhook_env` holds the NAME of an environment
# variable (presumably containing the Slack webhook URL — confirm), so no
# secret is stored in this file.
notifications:
  slack-high:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_HIGH
  slack-medium:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_MEDIUM
  slack-low:
    type: slack
    webhook_env: UK_SLACK_WEBHOOK_LOW
# Monitor groups. Each group names a status page slug, the notification
# channels to use, its tags, and the monitors it contains (`children`).
# Children refer to monitor names: push/http/dns monitors are defined in
# this file; EXT-PING-* names are not defined here explicitly.
# {env} is a placeholder, presumably expanded per environment — confirm.
groups:
  - name: "Infrastructure"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, infrastructure]
    children: [SWARM-CLUSTER, VAULT-CLUSTER, STORAGEBOX-MOUNT, SWAG-TLS]
  - name: "Data Layer"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, database]
    children: [ETCD-CLUSTER, PATRONI-CLUSTER, MONGODB-REPLICASET]
  - name: "Gateway & Messaging"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [internal, gateway]
    children: [APISIX-GATEWAY, RABBITMQ-CLUSTER, REDIS-SENTINEL]
  - name: "External Availability - Critical"
    status_page: "iklim-{env}-ops"
    notifications: [slack-high]
    tags: [external, high]
    children: [EXT-HTTPS-API, EXT-DNS-API, EXT-DNS-ROOT, EXT-PING-APP01, EXT-PING-APP02, EXT-PING-APP03]
  - name: "External Availability - General"
    status_page: "iklim-{env}-ops"
    notifications: [slack-medium]
    tags: [external, medium]
    children: [EXT-HTTPS-GRAFANA, EXT-PING-DB01, EXT-PING-DB02, EXT-PING-DB03]
  - name: "Observability"
    status_page: "iklim-{env}-tools"
    notifications: [slack-low]
    tags: [internal, observability]
    children: [PROMETHEUS, GRAFANA, PORTAINER, LOKI, EXT-HTTPS-PORTAINER, EXT-HTTPS-APIGW]
# Push (heartbeat) monitors, one per internal component.
# Fields per entry:
#   interval           - expected heartbeat period (presumably seconds —
#                        confirm)
#   heartbeat_retries  - retry count for missed heartbeats
#   tags               - scope, component area, and severity
#   restart_threshold  - NOTE(review): semantics are not visible in this
#                        file; presumably consumed by the health agent —
#                        confirm
push_monitors:
  - name: SWARM-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: VAULT-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, infrastructure, high]
    restart_threshold: 1
  - name: ETCD-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: PATRONI-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: MONGODB-REPLICASET
    interval: 120
    heartbeat_retries: 1
    tags: [internal, database, high]
    restart_threshold: 1
  - name: APISIX-GATEWAY
    interval: 60
    heartbeat_retries: 1
    tags: [internal, gateway, high]
    restart_threshold: 1
  - name: RABBITMQ-CLUSTER
    interval: 60
    heartbeat_retries: 1
    tags: [internal, gateway, medium]
    restart_threshold: 3
  - name: REDIS-SENTINEL
    interval: 60
    heartbeat_retries: 1
    tags: [internal, database, medium]
    restart_threshold: 3
  - name: SWAG-TLS
    interval: 3600
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 3
  - name: STORAGEBOX-MOUNT
    interval: 300
    heartbeat_retries: 1
    tags: [internal, infrastructure, medium]
    restart_threshold: 1
  - name: PROMETHEUS
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: GRAFANA
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: PORTAINER
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
  - name: LOKI
    interval: 120
    heartbeat_retries: 1
    tags: [internal, observability, low]
    restart_threshold: 5
# External HTTPS checks. {suffix} and {domain} are placeholders in the
# URL (presumably an environment-specific host suffix and the base
# domain — confirm). Status codes are quoted strings; the Portainer and
# API gateway checks also accept 401/403 as a passing response.
http_monitors:
  - name: EXT-HTTPS-API
    url: "https://api{suffix}.{domain}/actuator/health"
    accepted_statuscodes: ["200"]
    interval: 60
  - name: EXT-HTTPS-GRAFANA
    url: "https://grafana{suffix}.{domain}/api/health"
    accepted_statuscodes: ["200"]
    interval: 60
  - name: EXT-HTTPS-PORTAINER
    url: "https://portainer{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
  - name: EXT-HTTPS-APIGW
    url: "https://apigw{suffix}.{domain}"
    accepted_statuscodes: ["200", "401", "403"]
    interval: 120
# DNS resolution checks for the API host and the root domain; both
# resolve A records. {suffix} and {domain} are placeholders (presumably
# expanded per environment — confirm).
dns_monitors:
  - name: EXT-DNS-API
    hostname: "api{suffix}.{domain}"
    dns_resolve_type: A
    interval: 60
  - name: EXT-DNS-ROOT
    hostname: "{domain}"
    dns_resolve_type: A
    interval: 60
# Settings for ping monitors. No individual ping monitors are listed in
# this file; NOTE(review): these look like shared defaults for the
# EXT-PING-* monitors referenced by groups — confirm against the consumer.
ping_monitors:
  interval: 60
  max_retries: 1
# Status pages. Each page has a slug, a title, a visibility flag, and the
# list of group names (matching `groups[].name`) that it displays.
# Only the API status page is public. {env} is a placeholder in slugs and
# titles (presumably expanded per environment — confirm).
status_pages:
  - slug: "iklim-{env}-status"
    title: "iklim.co API Status"
    public: true
    groups: ["External Availability - Critical"]
  - slug: "iklim-{env}-ops"
    title: "iklim.co [{env}] Infrastructure"
    public: false
    groups:
      - "Infrastructure"
      - "Data Layer"
      - "Gateway & Messaging"
      - "External Availability - Critical"
      - "External Availability - General"
  - slug: "iklim-{env}-tools"
    title: "iklim.co [{env}] Tools"
    public: false
    groups: ["Observability"]