/usr/lib/netdata/conf.d/health.d
# you can disable an alarm notification by setting the 'to' line to: silent

# -----------------------------------------------------------------------------
# Kubelet error alerts

# Node configuration error flag.
# The chart value is 1 when the node reports a configuration-related error
# and 0 otherwise; a warning is raised while it is 1.

 template: kubelet_node_config_error
       on: k8s_kubelet.kubelet_node_config_error
    class: Errors
     type: Kubernetes
component: Kubelet
     calc: $experiencing_error
    units: bool
    every: 10s
     warn: $this == 1
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet node config error
     info: The node is experiencing a configuration-related error (0: false, 1: true)
       to: sysadmin

# Failed Token() requests to the alternate token source.
# Sums the 'failed' dimension over the last 10 seconds; any non-zero sum warns.

 template: kubelet_token_requests
       on: k8s_kubelet.kubelet_token_requests
    class: Errors
     type: Kubernetes
component: Kubelet
   lookup: sum -10s of failed
    units: requests
    every: 10s
     warn: $this > 0
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet failed token requests
     info: Number of failed Token() requests to the alternate token source
       to: sysadmin

# Docker and runtime operation errors.
# Sums all dimensions over the last minute. The warning is raised above 20
# errors and, once raised, stays up until the sum drops back to 0.

 template: kubelet_operations_error
       on: k8s_kubelet.kubelet_operations_errors
    class: Errors
     type: Kubernetes
component: Kubelet
   lookup: sum -1m
    units: errors
    every: 10s
     warn: $this > (($status >= $WARNING) ? (0) : (20))
    delay: up 30s down 1m multiplier 1.5 max 2h
  summary: Kubelet runtime errors
     info: Number of Docker or runtime operation errors
       to: sysadmin

# -----------------------------------------------------------------------------
# Pod Lifecycle Event Generator (PLEG) relisting latency
#
# For each quantile (0.5, 0.9, 0.99):
#   1. a helper template averages the relisting latency over the last 1m
#   2. a second template averages it over the last 10s and expresses it as a
#      percentage of the 1m value
#   3. an alarm is raised if the latter is:
#        - 2x the former for quantile 0.5
#        - 4x the former for quantile 0.9
#        - 8x the former for quantile 0.99
#
# The 1m baseline is floored at 1000 microseconds in each calc, so very small
# baselines do not inflate the ratio.
#
# NOTE(review): every 'crit' line below tests '$status >= $WARNING', so the
# critical threshold is lowered as soon as the alert is in WARNING, not only
# once it is already CRITICAL. Confirm whether '$status == $CRITICAL' was
# intended for the crit hysteresis.

# quantile 0.5

 template: kubelet_1m_pleg_relist_latency_quantile_05
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -1m unaligned of 0.5
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)

 template: kubelet_10s_pleg_relist_latency_quantile_05
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of 0.5
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(100):(200))
     crit: $this > (($status >= $WARNING)?(200):(400))
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet relisting latency (quantile 0.5)
     info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.5)
       to: sysadmin

# quantile 0.9

 template: kubelet_1m_pleg_relist_latency_quantile_09
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -1m unaligned of 0.9
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)

 template: kubelet_10s_pleg_relist_latency_quantile_09
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of 0.9
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(200):(400))
     crit: $this > (($status >= $WARNING)?(400):(800))
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet relisting latency (quantile 0.9)
     info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.9)
       to: sysadmin

# quantile 0.99

 template: kubelet_1m_pleg_relist_latency_quantile_099
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -1m unaligned of 0.99
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)

 template: kubelet_10s_pleg_relist_latency_quantile_099
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of 0.99
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(400):(800))
     crit: $this > (($status >= $WARNING)?(800):(1200))
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet relisting latency (quantile 0.99)
     info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.99)
       to: sysadmin
.
Edit
..
Edit
adaptec_raid.conf
Edit
apcupsd.conf
Edit
as400.conf
Edit
bcache.conf
Edit
beanstalkd.conf
Edit
boinc.conf
Edit
btrfs.conf
Edit
ceph.conf
Edit
cgroups.conf
Edit
clickhouse.conf
Edit
cockroachdb.conf
Edit
consul.conf
Edit
cpu.conf
Edit
db2.conf
Edit
dbengine.conf
Edit
disks.conf
Edit
dns_query.conf
Edit
dnsmasq_dhcp.conf
Edit
docker.conf
Edit
elasticsearch.conf
Edit
entropy.conf
Edit
exporting.conf
Edit
file_descriptors.conf
Edit
gearman.conf
Edit
geth.conf
Edit
go.d.plugin.conf
Edit
haproxy.conf
Edit
hdfs.conf
Edit
httpcheck.conf
Edit
ioping.conf
Edit
ipc.conf
Edit
ipfs.conf
Edit
ipmi.conf
Edit
isc_dhcpd.conf
Edit
k8sstate.conf
Edit
kubelet.conf
Edit
load.conf
Edit
lvm.conf
Edit
mdstat.conf
Edit
megacli.conf
Edit
memcached.conf
Edit
memory.conf
Edit
ml.conf
Edit
mq.conf
Edit
mysql.conf
Edit
net.conf
Edit
netfilter.conf
Edit
nvme.conf
Edit
pihole.conf
Edit
ping.conf
Edit
plugin.conf
Edit
portcheck.conf
Edit
postgres.conf
Edit
power_supply_capacity.conf
Edit
processes.conf
Edit
proxysql.conf
Edit
python.d.plugin.conf
Edit
qos.conf
Edit
rabbitmq.conf
Edit
ram.conf
Edit
reboot.conf
Edit
redis.conf
Edit
retroshare.conf
Edit
riakkv.conf
Edit
scaleio.conf
Edit
softnet.conf
Edit
storcli.conf
Edit
streaming.conf
Edit
swap.conf
Edit
synchronization.conf
Edit
systemdunits.conf
Edit
tcp_conn.conf
Edit
tcp_listen.conf
Edit
tcp_mem.conf
Edit
tcp_orphans.conf
Edit
tcp_resets.conf
Edit
timex.conf
Edit
udp_errors.conf
Edit
unbound.conf
Edit
upsd.conf
Edit
vcsa.conf
Edit
vernemq.conf
Edit
vsphere.conf
Edit
web_log.conf
Edit
websphere_jmx.conf
Edit
websphere_mp.conf
Edit
websphere_pmi.conf
Edit
whoisquery.conf
Edit
x509check.conf
Edit
zfs.conf
Edit