openshift 容器云从入门到崩溃之九《容器监控-报警》

时间:2022-06-26 03:50:26

容器状态监控

主要是监控 POD 的状态，包括重启、不健康等等。这些状态 k8s API 本身会报出来，再配合 zabbix 报警。

导入zabbix模板关联上oc master主机

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Zabbix 3.2 template "OC Pods".
  A low-level discovery rule (key oc.pod.status[discover,discover]) creates, for every
  discovered pod, three agent items (get_status, restarts, running) and two triggers
  that match on the strings printed by oc_pod_monitor.sh.

  NOTE(review): the published copy of this export had all digits removed, which left
  empty numeric elements, an invalid character reference inside the first trigger
  expression and comparisons without a right-hand side. The numbers below are
  reconstructed and must be confirmed before production use:
    * type/status/SNMP/auth/delta/data_type/inventory_link/correlation fields: 0,
      formula: 1, history: 90, trends: 365, lifetime: 30 (Zabbix defaults).
    * delay 60 and value_type 4 (text) are assumptions; the items return short strings.
    * trigger priorities (4 and 2) and the "#3" window of the recovery expression are
      assumptions.
    * the year, month and minute of the export date are placeholders.
-->
<zabbix_export>
  <version>3.2</version>
  <date>2018-01-27T07:00:05Z</date>
  <groups>
    <group>
      <name>Templates</name>
    </group>
  </groups>
  <templates>
    <template>
      <template>OC Pods</template>
      <name>OC Pods</name>
      <description/>
      <groups>
        <group>
          <name>Templates</name>
        </group>
      </groups>
      <applications>
        <application>
          <name>restartCount</name>
        </application>
        <application>
          <name>RunningStatus</name>
        </application>
      </applications>
      <items/>
      <discovery_rules>
        <discovery_rule>
          <name>OC Pods Discover</name>
          <type>0</type>
          <snmp_community/>
          <snmp_oid/>
          <key>oc.pod.status[discover,discover]</key>
          <delay>60</delay>
          <status>0</status>
          <allowed_hosts/>
          <snmpv3_contextname/>
          <snmpv3_securityname/>
          <snmpv3_securitylevel>0</snmpv3_securitylevel>
          <snmpv3_authprotocol>0</snmpv3_authprotocol>
          <snmpv3_authpassphrase/>
          <snmpv3_privprotocol>0</snmpv3_privprotocol>
          <snmpv3_privpassphrase/>
          <delay_flex/>
          <params/>
          <ipmi_sensor/>
          <authtype>0</authtype>
          <username/>
          <password/>
          <publickey/>
          <privatekey/>
          <port/>
          <filter>
            <evaltype>0</evaltype>
            <formula/>
            <conditions/>
          </filter>
          <lifetime>30</lifetime>
          <description/>
          <item_prototypes>
            <!-- Fetches the pod status from the API and caches it for the two items below. -->
            <item_prototype>
              <name>Pod {#POD_NAME} Get Status</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.pod.status[{#POD_NAME},get_status]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>4</value_type>
              <allowed_hosts/>
              <units/>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>RunningStatus</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Returns "Warning ..." when the restart count grew since the previous check. -->
            <item_prototype>
              <name>Pod {#POD_NAME} Restarts</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.pod.status[{#POD_NAME},restarts]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>4</value_type>
              <allowed_hosts/>
              <units/>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>restartCount</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Returns the pod phase joined with the container readiness, e.g. "Running_true". -->
            <item_prototype>
              <name>Pod {#POD_NAME} Running</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.pod.status[{#POD_NAME},running]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>4</value_type>
              <allowed_hosts/>
              <units/>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>RunningStatus</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
          </item_prototypes>
          <trigger_prototypes>
            <!-- Fires when the value is neither "Running_true" nor "Pod deleted". -->
            <trigger_prototype>
              <expression>{OC Pods:oc.pod.status[{#POD_NAME},running].str(Running_true)}=0 and {OC Pods:oc.pod.status[{#POD_NAME},running].str(Pod deleted)}=0</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Pod {#POD_NAME} Not Running</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>4</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
            <!--
              Fires on a "Warning" value; recovers once none of the last 3 values contains
              "Warning". recovery_mode 1 is required for the recovery expression to be used.
            -->
            <trigger_prototype>
              <expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning)}=1</expression>
              <recovery_mode>1</recovery_mode>
              <recovery_expression>{OC Pods:oc.pod.status[{#POD_NAME},restarts].str(Warning,#3)}=0</recovery_expression>
              <name>Pod {#POD_NAME} restarted Warning</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>2</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
          </trigger_prototypes>
          <graph_prototypes/>
          <host_prototypes/>
        </discovery_rule>
      </discovery_rules>
      <httptests/>
      <macros/>
      <templates/>
      <screens/>
    </template>
  </templates>
</zabbix_export>

zabbix客户端配置

修改zabbix_agentd.conf

# The script calls the API server, so the agent timeout is raised to the maximum the
# agent accepts (30 s). NOTE(review): the original value was lost; confirm.
Timeout=30
# $1 = pod ("discover" or the {#POD_NAME} value), $2 = mode.
UserParameter=oc.pod.status[*],/data/app/zabbix/etc/oc_pod_monitor.sh $1 $2

oc_pod_monitor.sh内容

#!/bin/bash
# oc_pod_monitor.sh - backend for the Zabbix key oc.pod.status[<pod>,<mode>].
#
# Arguments:
#   $1  "discover" for low-level discovery, otherwise the {#POD_NAME} value produced
#       by discovery, i.e. "<namespace>=<pod name>" (a bare pod name also works).
#   $2  mode: discover | get_status | restarts | running
#
# Output (one line, matched by the str() triggers of the "OC Pods" template):
#   discover    LLD JSON {"data":[{"{#POD_NAME}":"<namespace>=<pod>"}, ...]}
#   get_status  "Success" or "Pod_Status=NotFound"; caches the pod status JSON
#   restarts    "Warning ..." when the summed restartCount grew, else "Normal ..."
#   running     "<phase>_true" or "<phase>_false" (false: a container is not ready)
#   any mode    "Pod deleted" when the pod is absent from the cached pod list
#
# Requires curl and jq >= 1.5.

TOKEN=""      # bearer token used for the API calls
ENDPOINT=""   # API server host[:port]
POD_ARG="$1"
Monitoring_type="$2"
WORKSPACE="/data/tmp/oc_monitor"
ALL_PODS="$WORKSPACE/all_pods.json"
mkdir -p "$WORKSPACE"

# GET an API path and print the response body on stdout.
api_get() {
    curl -k \
        -H "Authorization: Bearer $TOKEN" \
        -H 'Accept: application/json' \
        "https://${ENDPOINT}$1" 2>/dev/null
}

# Print the sum of the first column of a file (0 for an empty file).
sum_lines() {
    awk '{ total += $1 } END { print total + 0 }' "$1"
}

## Low-level discovery
if [ "$Monitoring_type" = "discover" ]; then
    # Download into a temporary file and replace the cache only when the answer is a
    # pod list, so item checks never read a truncated or failed download.
    tmp_file="$(mktemp "$WORKSPACE/all_pods.XXXXXX")" || exit 1
    api_get /api/v1/pods > "$tmp_file"
    if jq -e '.items' "$tmp_file" >/dev/null 2>&1; then
        mv -f "$tmp_file" "$ALL_PODS"
    else
        rm -f "$tmp_file"
    fi
    [ -s "$ALL_PODS" ] || exit 1
    # Build, helper and debug pods are skipped; jq does the JSON quoting.
    jq '{data: [.items[]
                | select(.metadata.name | test("build|deploy|debug") | not)
                | {"{#POD_NAME}": (.metadata.namespace + "=" + .metadata.name)}]}' "$ALL_PODS"
    exit
fi

# Split "<namespace>=<pod>"; without "=" the whole argument is the pod name.
POD_NAME="${POD_ARG##*=}"
NS_HINT=""
case "$POD_ARG" in
    *=*) NS_HINT="${POD_ARG%=*}" ;;
esac

# Look the pod up in the cached list by exact name (and namespace when given).
# An empty result means the pod no longer exists.
NAMESPACE="$(jq -r --arg name "$POD_NAME" --arg ns "$NS_HINT" \
    'first(.items[]
           | select(.metadata.name == $name and ($ns == "" or .metadata.namespace == $ns))
           | .metadata.namespace)' "$ALL_PODS" 2>/dev/null)"
if [ -z "$NAMESPACE" ]; then
    echo "Pod deleted"
    exit
fi

STATUS_FILE="$WORKSPACE/${NAMESPACE}-${POD_NAME}.status"
RESTART_FILE="$WORKSPACE/${NAMESPACE}-${POD_NAME}.restartCount"

## Fetch the pod status once; the other modes read the cached copy.
if [ "$Monitoring_type" = "get_status" ]; then
    api_get "/api/v1/namespaces/$NAMESPACE/pods/$POD_NAME/status" > "$STATUS_FILE"
    if grep -q '"code": 404' "$STATUS_FILE"; then
        echo "Pod_Status=NotFound"
    else
        echo "Success"
    fi
    exit
fi

## Load the cached status and handle the abnormal cases.
Pod_Status=""
if [ -f "$STATUS_FILE" ]; then
    Pod_Status="$(cat "$STATUS_FILE")"
fi
if [ -z "$Pod_Status" ]; then
    # Nothing cached yet: reported as running so that no alert is raised.
    echo "Running_true Pod_Status=Null"
    exit
elif grep -q '"code": 404' <<<"$Pod_Status"; then
    # The pod is gone but all_pods.json has not been refreshed yet.
    echo "Pod_Status=NotFound"
    exit
elif [ "$(jq -r '.status.phase' <<<"$Pod_Status")" = "Pending" ]; then
    echo "Pending"
    exit
fi

case "$Monitoring_type" in
restarts)
    # First check of a new pod: report it and start counting from zero.
    if [ ! -f "$RESTART_FILE" ]; then
        echo "Warning New Pod"
        echo "0" > "$RESTART_FILE"
        exit
    fi
    # The file holds one restartCount per container; compare the sums of the
    # previous and the current check (any number of containers).
    Last_state="$(sum_lines "$RESTART_FILE")"
    jq -r '.status.containerStatuses[]?.restartCount' <<<"$Pod_Status" > "$RESTART_FILE"
    Current_state="$(sum_lines "$RESTART_FILE")"
    if [ "$Current_state" -gt "$Last_state" ]; then
        echo "Warning restart_count=$Current_state"
    else
        echo "Normal restart_count=$Current_state"
    fi
    ;;
running)
    # Pod phase plus "_false" when at least one container is not ready.
    running_status="$(jq -r '.status.phase' <<<"$Pod_Status")"
    if jq -r '.status.containerStatuses[]?.ready' <<<"$Pod_Status" | grep -q false; then
        Container_status="_false"
    else
        Container_status="_true"
    fi
    echo "${running_status}${Container_status}"
    ;;
*)
    echo "Error parameters"
    ;;
esac
exit

这样POD重启或者新建都会报出来

集群NODE节点监控

主要监控node节点的不健康状态,还有lvm卷容量监控

导入zabbix模板关联上oc master主机

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Zabbix 3.2 template "OC Node Status".
  A low-level discovery rule (key oc.node.status[discover,discover]) creates, for every
  discovered node, condition items (OutOfDisk, MemoryPressure, DiskPressure, Ready),
  a get_status item and four resource allocation items in percent, with one trigger
  per monitored value. Values come from oc_node_monitor.sh.

  NOTE(review): the published copy of this export had all digits removed, which left
  empty numeric elements and comparisons without a right-hand side. The numbers below
  are reconstructed and must be confirmed before production use:
    * type/status/SNMP/auth/delta/data_type/inventory_link/correlation fields: 0,
      formula: 1, history: 90, trends: 365, lifetime: 30 (Zabbix defaults).
    * delay 60 is an assumption; value_type is 4 (text) for string items and
      3 (numeric unsigned) for the percent items.
    * the lost thresholds are now the user macros {$OC_NODE_REQUESTS_MAX} and
      {$OC_NODE_LIMITS_MAX}; their values (80 and 100) are assumptions and can be
      overridden per host.
    * trigger priorities (4 and 2) are assumptions.
    * the year, month and minute of the export date are placeholders.
-->
<zabbix_export>
  <version>3.2</version>
  <date>2018-01-27T07:00:32Z</date>
  <groups>
    <group>
      <name>Templates</name>
    </group>
  </groups>
  <templates>
    <template>
      <template>OC Node Status</template>
      <name>OC Node Status</name>
      <description/>
      <groups>
        <group>
          <name>Templates</name>
        </group>
      </groups>
      <applications>
        <application>
          <name>oc_node</name>
        </application>
      </applications>
      <items/>
      <discovery_rules>
        <discovery_rule>
          <name>OC Nodes Discover</name>
          <type>0</type>
          <snmp_community/>
          <snmp_oid/>
          <key>oc.node.status[discover,discover]</key>
          <delay>60</delay>
          <status>0</status>
          <allowed_hosts/>
          <snmpv3_contextname/>
          <snmpv3_securityname/>
          <snmpv3_securitylevel>0</snmpv3_securitylevel>
          <snmpv3_authprotocol>0</snmpv3_authprotocol>
          <snmpv3_authpassphrase/>
          <snmpv3_privprotocol>0</snmpv3_privprotocol>
          <snmpv3_privpassphrase/>
          <delay_flex/>
          <params/>
          <ipmi_sensor/>
          <authtype>0</authtype>
          <username/>
          <password/>
          <publickey/>
          <privatekey/>
          <port/>
          <filter>
            <evaltype>0</evaltype>
            <formula/>
            <conditions/>
          </filter>
          <lifetime>30</lifetime>
          <description/>
          <item_prototypes>
            <!-- Returns "DiskPressure_<status>" for the node condition. -->
            <item_prototype>
              <name>Node {#NODE_NAME} DiskPressure</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},DiskPressure]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>4</value_type>
              <allowed_hosts/>
              <units/>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>oc_node</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Fetches the node object from the API and caches it for the condition items. -->
            <item_prototype>
              <name>Node {#NODE_NAME} Get Status</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},get_status]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>4</value_type>
              <allowed_hosts/>
              <units/>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications/>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Returns "MemoryPressure_<status>" for the node condition. -->
            <item_prototype>
              <name>Node {#NODE_NAME} MemoryPressure</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},MemoryPressure]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>4</value_type>
              <allowed_hosts/>
              <units/>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>oc_node</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Returns "Ready_<status>" for the node condition. -->
            <item_prototype>
              <name>Node {#NODE_NAME} Ready</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},node_ready]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>4</value_type>
              <allowed_hosts/>
              <units/>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>oc_node</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Allocated CPU limits as a percentage of the node capacity. -->
            <item_prototype>
              <name>Node {#NODE_NAME} CPU Limits</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},node_resources,cpu_limits]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>3</value_type>
              <allowed_hosts/>
              <units>%</units>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>oc_node</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Allocated CPU requests as a percentage of the node capacity. -->
            <item_prototype>
              <name>Node {#NODE_NAME} CPU Requests</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},node_resources,cpu_requests]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>3</value_type>
              <allowed_hosts/>
              <units>%</units>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>oc_node</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Allocated memory limits as a percentage of the node capacity. -->
            <item_prototype>
              <name>Node {#NODE_NAME} Memory Limits</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},node_resources,memory_limits]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>3</value_type>
              <allowed_hosts/>
              <units>%</units>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>oc_node</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Allocated memory requests as a percentage of the node capacity. -->
            <item_prototype>
              <name>Node {#NODE_NAME} Memory Requests</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},node_resources,memory_requests]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>3</value_type>
              <allowed_hosts/>
              <units>%</units>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>oc_node</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
            <!-- Returns "OutOfDisk_<status>" for the node condition. -->
            <item_prototype>
              <name>Node {#NODE_NAME} OutOfDisk</name>
              <type>0</type>
              <snmp_community/>
              <multiplier>0</multiplier>
              <snmp_oid/>
              <key>oc.node.status[{#NODE_NAME},OutOfDisk]</key>
              <delay>60</delay>
              <history>90</history>
              <trends>365</trends>
              <status>0</status>
              <value_type>4</value_type>
              <allowed_hosts/>
              <units/>
              <delta>0</delta>
              <snmpv3_contextname/>
              <snmpv3_securityname/>
              <snmpv3_securitylevel>0</snmpv3_securitylevel>
              <snmpv3_authprotocol>0</snmpv3_authprotocol>
              <snmpv3_authpassphrase/>
              <snmpv3_privprotocol>0</snmpv3_privprotocol>
              <snmpv3_privpassphrase/>
              <formula>1</formula>
              <delay_flex/>
              <params/>
              <ipmi_sensor/>
              <data_type>0</data_type>
              <authtype>0</authtype>
              <username/>
              <password/>
              <publickey/>
              <privatekey/>
              <port/>
              <description/>
              <inventory_link>0</inventory_link>
              <applications>
                <application>
                  <name>oc_node</name>
                </application>
              </applications>
              <valuemap/>
              <logtimefmt/>
              <application_prototypes/>
            </item_prototype>
          </item_prototypes>
          <trigger_prototypes>
            <!-- Allocation triggers fire when the last percentage exceeds the macro threshold. -->
            <trigger_prototype>
              <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_limits].last()}&gt;{$OC_NODE_LIMITS_MAX}</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Node {#NODE_NAME} CPU Limits %</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>2</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
            <trigger_prototype>
              <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,cpu_requests].last()}&gt;{$OC_NODE_REQUESTS_MAX}</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Node {#NODE_NAME} CPU Requests %</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>2</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
            <!-- Condition triggers fire when the healthy string is NOT found in the value. -->
            <trigger_prototype>
              <expression>{OC Node Status:oc.node.status[{#NODE_NAME},DiskPressure].str(DiskPressure_False)}=0</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Node {#NODE_NAME} DiskPressure</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>4</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
            <trigger_prototype>
              <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_limits].last()}&gt;{$OC_NODE_LIMITS_MAX}</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Node {#NODE_NAME} Memory Limits %</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>2</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
            <trigger_prototype>
              <expression>{OC Node Status:oc.node.status[{#NODE_NAME},MemoryPressure].str(MemoryPressure_False)}=0</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Node {#NODE_NAME} MemoryPressure</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>4</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
            <trigger_prototype>
              <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_resources,memory_requests].last()}&gt;{$OC_NODE_REQUESTS_MAX}</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Node {#NODE_NAME} Memory Requests %</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>2</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
            <trigger_prototype>
              <expression>{OC Node Status:oc.node.status[{#NODE_NAME},node_ready].str(Ready_True)}=0</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Node {#NODE_NAME} Not Ready</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>4</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
            <trigger_prototype>
              <expression>{OC Node Status:oc.node.status[{#NODE_NAME},OutOfDisk].str(OutOfDisk_False)}=0</expression>
              <recovery_mode>0</recovery_mode>
              <recovery_expression/>
              <name>Node {#NODE_NAME} OutOfDisk</name>
              <correlation_mode>0</correlation_mode>
              <correlation_tag/>
              <url/>
              <status>0</status>
              <priority>4</priority>
              <description/>
              <type>0</type>
              <manual_close>0</manual_close>
              <dependencies/>
              <tags/>
            </trigger_prototype>
          </trigger_prototypes>
          <graph_prototypes/>
          <host_prototypes/>
        </discovery_rule>
      </discovery_rules>
      <httptests/>
      <macros>
        <macro>
          <macro>{$OC_NODE_LIMITS_MAX}</macro>
          <value>100</value>
        </macro>
        <macro>
          <macro>{$OC_NODE_REQUESTS_MAX}</macro>
          <value>80</value>
        </macro>
      </macros>
      <templates/>
      <screens/>
    </template>
  </templates>
</zabbix_export>

zabbix客户端配置

修改zabbix_agentd.conf

# The script calls the API server, so the agent timeout is raised to the maximum the
# agent accepts (30 s). NOTE(review): the original value was lost; confirm.
Timeout=30
# $1 = node ("discover" or the {#NODE_NAME} value), $2 = mode,
# $3 = resource kind (only used by the node_resources mode).
UserParameter=oc.node.status[*],/data/app/zabbix/etc/oc_node_monitor.sh $1 $2 $3

oc_node_monitor.sh的内容

#!/bin/bash
# oc_node_monitor.sh - backend for the Zabbix key oc.node.status[<node>,<mode>,<kind>].
#
# Arguments:
#   $1  "discover" for low-level discovery, otherwise the node name
#   $2  mode: discover | get_status | OutOfDisk | MemoryPressure | DiskPressure |
#             node_ready | node_resources
#   $3  node_resources only: cpu_requests | cpu_limits | memory_requests | memory_limits
#
# Output (one line):
#   discover         LLD JSON {"data":[{"{#NODE_NAME}":"<node>"}, ...]}
#   get_status       "Success", "Node_Status=NotFound" or "Node_Status=null";
#                    caches the node JSON for the condition modes
#   condition modes  "<Label>_<status>", e.g. "DiskPressure_False", "Ready_True"
#   node_resources   integer percentage read from <node>.resources, which is
#                    written by oc_master_crontab.sh
#
# Requires curl and jq >= 1.5.

TOKEN=""      # bearer token used for the API calls
ENDPOINT=""   # API server host[:port]
NODE_NAME="$1"
Monitoring_type="$2"
RESOURCE_KIND="$3"
WORKSPACE="/data/tmp/oc_monitor"
STATUS_FILE="$WORKSPACE/${NODE_NAME}.status"
RESOURCES_FILE="$WORKSPACE/${NODE_NAME}.resources"
mkdir -p "$WORKSPACE"

# GET an API path and print the response body on stdout.
api_get() {
    curl -k \
        -H "Authorization: Bearer $TOKEN" \
        -H 'Accept: application/json' \
        "https://${ENDPOINT}$1" 2>/dev/null
}

# report_condition TYPE LABEL HEALTHY
# Prints "<LABEL>_<status>" for the cached node condition whose .type is TYPE.
# The condition is selected by type, not by its position in the list, so a changed
# order or an added/removed condition cannot mix the values up. A condition that
# is absent (or a missing cache file) is reported with the HEALTHY status.
report_condition() {
    local value
    value="$(jq -r --arg type "$1" \
        '.status.conditions[]? | select(.type == $type) | .status' \
        "$STATUS_FILE" 2>/dev/null | head -n 1)"
    echo "$2_${value:-$3}"
}

case "$Monitoring_type" in
discover)
    # Node discovery; jq does the JSON quoting.
    api_get /api/v1/nodes | jq '{data: [.items[]? | {"{#NODE_NAME}": .metadata.name}]}'
    ;;
get_status)
    # Fetch the node object once; the condition modes read the cached copy.
    api_get "/api/v1/nodes/$NODE_NAME" > "$STATUS_FILE"
    if grep -q '"code": 404' "$STATUS_FILE"; then
        echo "Node_Status=NotFound"
    elif [ ! -s "$STATUS_FILE" ]; then
        echo "Node_Status=null"
    else
        echo "Success"
    fi
    ;;
OutOfDisk)
    # Node has run out of disk space.
    report_condition OutOfDisk OutOfDisk False
    ;;
MemoryPressure)
    # Node is under memory pressure.
    report_condition MemoryPressure MemoryPressure False
    ;;
DiskPressure)
    # Node is under disk pressure.
    report_condition DiskPressure DiskPressure False
    ;;
node_ready)
    # Node is ready to accept pods.
    report_condition Ready Ready True
    ;;
node_resources)
    # The resources file is expected to hold the "Allocated resources" value row of
    # `oc describe node`, where every second field is a "(NN%)" percentage.
    # NOTE(review): this layout is assumed from the field numbers; confirm it
    # against the oc version in use.
    case "$RESOURCE_KIND" in
        cpu_requests)    column=2 ;;
        cpu_limits)      column=4 ;;
        memory_requests) column=6 ;;
        memory_limits)   column=8 ;;
        *)               exit ;;
    esac
    # The cron job may be rewriting the file right now; give it a moment.
    if [ ! -s "$RESOURCES_FILE" ]; then
        sleep 1
    fi
    data="$(awk -v col="$column" 'NF >= col { print $col }' "$RESOURCES_FILE" 2>/dev/null \
        | grep -o '[0-9]*' | head -n 1)"
    # Zabbix expects a number: report 0 when nothing could be read.
    echo "${data:-0}"
    ;;
esac

crontab -e

# Refresh the per-node resource files read by oc_node_monitor.sh.
# NOTE(review): the step value was lost in the source; every minute is assumed.
*/1 * * * * /data/scripts/oc_master_crontab.sh >/dev/null 2>&1

oc_master_crontab.sh内容

#!/bin/bash
# oc_master_crontab.sh - run from cron on the master; stores, for every node, the line
# that precedes "Events" in `oc describe node` (the allocated-resources value row) in
# /data/tmp/oc_monitor/<node>.resources for oc_node_monitor.sh to read.
# NOTE(review): the grep context size (1) and the chmod mode were lost in the source
# and are reconstructed here; confirm both.
WORKSPACE="/data/tmp/oc_monitor"
mkdir -p "$WORKSPACE"

# Skip the header row of `oc get node`; the first column is the node name.
oc get node | awk 'NR > 1 { print $1 }' | while read -r node; do
    # Write to a temporary file and rename it, so the agent never reads a
    # half-written file.
    tmp_file="$WORKSPACE/${node}.resources.tmp"
    oc describe node "$node" | grep -B 1 "Events" | grep -v "Events" > "$tmp_file"
    mv -f "$tmp_file" "$WORKSPACE/${node}.resources"
done

# The Zabbix agent user must be able to read these files and write its own cache
# files in the same directory; only the monitor directory is opened up.
chmod -R a+rwX "$WORKSPACE"