/usr/lib/netdata/conf.d/health.d
# you can disable an alarm notification by setting the 'to' line to: silent

# Consul Enterprise license expiry.
# Remaining time is expressed in seconds: warn under 14 days, critical under 7 days.

 template: consul_license_expiration_time
       on: consul.license_expiration_time
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $license_expiration
    every: 60m
    units: seconds
     warn: $this < 14*24*60*60
     crit: $this < 7*24*60*60
  summary: Consul license expiration on ${label:node_name}
     info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Autopilot: datacenter-wide health as reported by a server.
# Warns while the 'unhealthy' dimension equals 1.

 template: consul_autopilot_health_status
       on: consul.autopilot_health_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $unhealthy
    every: 10s
    units: status
     warn: $this == 1
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul datacenter ${label:datacenter} health
     info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
       to: sysadmin

# Autopilot: health of an individual server.
# Warns while the 'unhealthy' dimension equals 1.

 template: consul_autopilot_server_health_status
       on: consul.autopilot_server_health_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $unhealthy
    every: 10s
    units: status
     warn: $this == 1
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} health
     info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
       to: sysadmin

# Raft: median (quantile_0.5) leader last-contact time, averaged over the last minute.
# Hysteresis: warning raises above 200 and clears at 150 or below;
# critical raises above 500 and clears at 200 or below.

 template: consul_raft_leader_last_contact_time
       on: consul.raft_leader_last_contact_time
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.5
    every: 10s
    units: milliseconds
     warn: $this > (($status >= $WARNING) ? (150) : (200))
     crit: $this > (($status == $CRITICAL) ? (200) : (500))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul leader server ${label:node_name} last contact time
     info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
       to: sysadmin

# Raft: any leadership transition summed over the last minute raises a warning.

 template: consul_raft_leadership_transitions
       on: consul.raft_leadership_transitions_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: transitions
     warn: $this > 0
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} leadership transitions
     info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
       to: sysadmin

# Raft: saturation of the main thread (quantile_0.9, one-minute average).
# Hysteresis: warning raises above 50 and clears at 40 or below.

 template: consul_raft_thread_main_saturation
       on: consul.raft_thread_main_saturation_perc
    class: Utilization
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.9
    every: 10s
    units: percentage
     warn: $this > (($status >= $WARNING) ? (40) : (50))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} main Raft saturation
     info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Raft: saturation of the FSM thread (quantile_0.9, one-minute average).
# Units are 'percentage' to match the *_saturation_perc chart and the
# main-thread alert above (previously mislabelled as milliseconds).
# Hysteresis: warning raises above 50 and clears at 40 or below.

 template: consul_raft_thread_fsm_saturation
       on: consul.raft_thread_fsm_saturation_perc
    class: Utilization
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.9
    every: 10s
    units: percentage
     warn: $this > (($status >= $WARNING) ? (40) : (50))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} FSM Raft saturation
     info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Client RPC: rate-limited requests summed over the last minute.
# Hysteresis: warning raises above 5 and clears only when the sum returns to 0.

 template: consul_client_rpc_requests_exceeded
       on: consul.client_rpc_requests_exceeded_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: requests
     warn: $this > (($status >= $WARNING) ? (0) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} RPC requests rate
     info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Client RPC: failed requests summed over the last minute.
# Hysteresis: warning raises above 5 and clears only when the sum returns to 0.

 template: consul_client_rpc_requests_failed
       on: consul.client_rpc_requests_failed_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: requests
     warn: $this > (($status >= $WARNING) ? (0) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} failed RPC requests
     info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Node health checks: warn when the sum of the 'warning' and 'critical'
# dimensions is a number other than 0.

 template: consul_node_health_check_status
       on: consul.node_health_check_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $warning + $critical
    every: 10s
    units: status
     warn: $this != nan AND $this != 0
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul node health check ${label:check_name} on ${label:node_name}
     info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Service health checks: warn when the sum of the 'warning' and 'critical'
# dimensions equals 1.

 template: consul_service_health_check_status
       on: consul.service_health_check_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $warning + $critical
    every: 10s
    units: status
     warn: $this == 1
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name}
     info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Garbage collection: pause time summed over the last minute.
# Hysteresis: warning raises above 2 and clears at 1 or below;
# critical raises above 5 and clears at 2 or below.
# The critical expression keys on $status == $CRITICAL; keying it on
# $status >= $WARNING would lower the critical threshold to 2 as soon as the
# alert entered WARNING, escalating it immediately.

 template: consul_gc_pause_time
       on: consul.gc_pause_time
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: seconds
     warn: $this > (($status >= $WARNING) ? (1) : (2))
     crit: $this > (($status == $CRITICAL) ? (2) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} garbage collection pauses
     info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin
.
Edit
..
Edit
adaptec_raid.conf
Edit
apcupsd.conf
Edit
as400.conf
Edit
bcache.conf
Edit
beanstalkd.conf
Edit
boinc.conf
Edit
btrfs.conf
Edit
ceph.conf
Edit
cgroups.conf
Edit
clickhouse.conf
Edit
cockroachdb.conf
Edit
consul.conf
Edit
cpu.conf
Edit
db2.conf
Edit
dbengine.conf
Edit
disks.conf
Edit
dns_query.conf
Edit
dnsmasq_dhcp.conf
Edit
docker.conf
Edit
elasticsearch.conf
Edit
entropy.conf
Edit
exporting.conf
Edit
file_descriptors.conf
Edit
gearman.conf
Edit
geth.conf
Edit
go.d.plugin.conf
Edit
haproxy.conf
Edit
hdfs.conf
Edit
httpcheck.conf
Edit
ioping.conf
Edit
ipc.conf
Edit
ipfs.conf
Edit
ipmi.conf
Edit
isc_dhcpd.conf
Edit
k8sstate.conf
Edit
kubelet.conf
Edit
load.conf
Edit
lvm.conf
Edit
mdstat.conf
Edit
megacli.conf
Edit
memcached.conf
Edit
memory.conf
Edit
ml.conf
Edit
mq.conf
Edit
mysql.conf
Edit
net.conf
Edit
netfilter.conf
Edit
nvme.conf
Edit
pihole.conf
Edit
ping.conf
Edit
plugin.conf
Edit
portcheck.conf
Edit
postgres.conf
Edit
power_supply_capacity.conf
Edit
processes.conf
Edit
proxysql.conf
Edit
python.d.plugin.conf
Edit
qos.conf
Edit
rabbitmq.conf
Edit
ram.conf
Edit
reboot.conf
Edit
redis.conf
Edit
retroshare.conf
Edit
riakkv.conf
Edit
scaleio.conf
Edit
softnet.conf
Edit
storcli.conf
Edit
streaming.conf
Edit
swap.conf
Edit
synchronization.conf
Edit
systemdunits.conf
Edit
tcp_conn.conf
Edit
tcp_listen.conf
Edit
tcp_mem.conf
Edit
tcp_orphans.conf
Edit
tcp_resets.conf
Edit
timex.conf
Edit
udp_errors.conf
Edit
unbound.conf
Edit
upsd.conf
Edit
vcsa.conf
Edit
vernemq.conf
Edit
vsphere.conf
Edit
web_log.conf
Edit
websphere_jmx.conf
Edit
websphere_mp.conf
Edit
websphere_pmi.conf
Edit
whoisquery.conf
Edit
x509check.conf
Edit
zfs.conf
Edit