shell实战之Linux主机系统监控

时间:2023-03-09 23:10:12
shell实战之Linux主机系统监控

1、系统监控概述

采集的监控信息主要有:内存占用率、CPU占用率、当前在线用户、磁盘挂载及磁盘空间使用率、网卡平均每秒流入流量和平均每秒流出流量,以及磁盘IO(平均每秒从磁盘读入内存的速率、平均每秒从内存写入磁盘的速率)。

2、监控原理

2.1、CPU占用率

监控原理:

CPU相关信息记录在文件 /proc/stat中。详情请查看博文:https://blog.csdn.net/ustclu/article/details/1721673

stephen@stephen-K55VD:~/shell$ cat  /proc/stat
cpu
cpu0
cpu1
cpu2
cpu3
intr
ctxt
btime
processes
procs_running
procs_blocked
softirq

代码实现:

 #获取CPU的总量与使用量
# Sample the aggregate "cpu" line of /proc/stat. Each sample yields two
# numbers taken from the same read of the file:
#   total = sum of every counter on the line (fields 2..NF)
#   used  = fields 2 + 3 + 4 + 7 + 8 (user, nice, system, irq, softirq
#           according to proc(5))
# printf "%.0f" keeps large counters from being printed in exponent form.
cpuSampleProg='/^cpu / {
  total = 0
  for (i = 2; i <= NF; i++) total += $i
  printf "%.0f %.0f\n", total, $2 + $3 + $4 + $7 + $8
  exit
}'
read -r cpuTotalStart cpuUsedStart < <(awk "${cpuSampleProg}" /proc/stat)
# Wait 30s, then sample again; the differences cover exactly that window.
sleep 30
read -r cpuTotalEnd cpuUsedEnd < <(awk "${cpuSampleProg}" /proc/stat)
# Variables are referenced by name inside $(( )) so an empty sample counts as 0.
usedCPU=$(( cpuUsedEnd - cpuUsedStart ))
totalCPU=$(( cpuTotalEnd - cpuTotalStart ))

2.2、内存占用率

监控原理:

内存相关的信息记录在/proc/meminfo文件中,MemTotal为内存总量,单位为kB,MemFree为空闲内存。内存占用率=(总内存-空闲内存)/总内存。注意:MemFree不包含可回收的缓存(Buffers/Cached),因此按该公式得到的占用率通常偏高;若要反映实际可用内存,可改用MemAvailable。

stephen@stephen-K55VD:~/shell$ cat /proc/meminfo
MemTotal: kB
MemFree: kB
MemAvailable: kB
Buffers: kB
Cached: kB
SwapCached: kB
Active: kB

代码实现:

 #获取内存使用率
#######################################
# Log total memory (GB) and memory usage (%) of the current host.
# Usage is computed as (MemTotal - MemFree) / MemTotal.
# Globals:   ip (read) - host label embedded in the log lines
# Arguments: $1 - optional path of the meminfo file (default /proc/meminfo)
# Outputs:   four lines written through logInfo
# Depends:   logInfo, kbToGb, usagePercent (defined in utils)
#######################################
function memUsage(){
  local meminfoFile=${1:-/proc/meminfo}
  local totalMem freeMem usedMem

  logInfo "Begin to get mem usage of Host [${ip}]"
  # Both values are reported by the kernel in kB.
  totalMem=$(awk '/^MemTotal:/ { print $2; exit }' "${meminfoFile}")
  freeMem=$(awk '/^MemFree:/ { print $2; exit }' "${meminfoFile}")
  # Names are used bare inside $(( )) so a missing value counts as 0
  # instead of producing a syntax error.
  # NOTE(review): MemFree does not include reclaimable cache, so this
  # figure runs high; MemAvailable may be the better basis.
  usedMem=$(( totalMem - freeMem ))
  logInfo "Host [${ip}] total mem is : $(kbToGb "${totalMem}") GB"
  # Compute the usage percentage and write it to the log.
  logInfo "Host [${ip}] mem usage is : $(usagePercent "${usedMem}" "${totalMem}") %"
  logInfo "End to get mem usage of Host [${ip}]"
}

2.3、流量监控

监控原理:

Linux机器流量信息记录在/proc/net/dev文件中。通过计算一段时间段内接收和发送的字节数来计算速率。第一列为网卡信息,第二列为接收的字节数,第10列为发送的字节数。

stephen@stephen-K55VD:~/shell/sysMonitor$ cat /proc/net/dev
Inter-| Receive | Transmit
face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed
wlp3s0:
enp4s0f2:
docker0:
lo:

代码实现:

 #ethName为网卡名称
# Read the received (field 2) and transmitted (field 10) byte counters of
# interface ${ethName} from /proc/net/dev in a single pass.
# The name is compared for equality rather than grepped, so "eth0" does not
# also pick up "veth0"; splitting on ':' and blanks copes with lines where
# the counter follows the colon without a space ("eth0:12345").
read -r receiveByteStart sendByteStart < <(
  awk -v dev="${ethName}" -F'[: ]+' '
    { sub(/^ +/, "") }
    $1 == dev { print $2, $10; exit }' /proc/net/dev
)

2.4、磁盘IO

监控原理:

磁盘IO相关的信息记录在/proc/vmstat文件中,pgpgin对应的为输入方向的数据量。pgpgout对应的为输出方向的数据量。采集一段时间的数据量,除以时间来计算速率。

代码实现:

 #disk IO in
#######################################
# Log the average disk read rate (MB/s) over a sampling window, based on
# the pgpgin counter of /proc/vmstat.
# Globals:   ip (read) - host label embedded in the log line
# Arguments: $1 - optional window length in seconds (default 10)
#            $2 - optional path of the vmstat file (default /proc/vmstat)
# Outputs:   one line written through logInfo
#######################################
function diskIOIn(){
  # NOTE(review): the published script lost its numeric literals; 10s matches
  # the window used by netData - confirm against the original.
  local interval=${1:-10}
  local vmstatFile=${2:-/proc/vmstat}
  local inIoStart inIoEnd inIo

  # Inbound disk IO counter before and after the window.
  inIoStart=$(awk '$1 == "pgpgin" { print $2; exit }' "${vmstatFile}")
  sleep "${interval}"
  inIoEnd=$(awk '$1 == "pgpgin" { print $2; exit }' "${vmstatFile}")
  # pgpgin is treated as kB (presumably, per proc(5) - verify), hence the
  # division by 1024 to get MB. awk is used so rates below 1 MB/s are not
  # truncated to 0.
  inIo=$(LC_ALL=C awk -v s="${inIoStart:-0}" -v e="${inIoEnd:-0}" -v t="${interval}" '
    BEGIN {
      rate = (t > 0) ? (e - s) / (1024 * t) : 0
      printf "%.2f", rate
    }')
  logInfo "Host [${ip}] in IO is : ${inIo} MB / s"
}

3、脚本代码

  • hostLists:监控主机的IP集合。
  • sysMonitor.sh*:获取各项监控信息的脚本。
 #!/bin/bash
# Monitors Linux host system information.
# Load the helper module (logInfo, noTimeLogInfo, kbToGb, usagePercent).
# The path is resolved relative to this script because a bare "source utils"
# searches PATH first and then the current directory.
source "$(dirname "${BASH_SOURCE[0]}")/utils" || {
  printf 'cannot load helper module "utils"\n' >&2
  exit 1
}
# Get the CPU usage
#######################################
# Log CPU topology (physical CPUs, logical CPUs, cores) and the CPU usage
# percentage measured over a sampling window.
# Globals:   ip (read) - host label embedded in the log lines
# Arguments: $1 - optional window length in seconds (default 30)
#            $2 - optional path of the stat file (default /proc/stat)
#            $3 - optional path of the cpuinfo file (default /proc/cpuinfo)
# Outputs:   four lines written through logInfo
# Depends:   logInfo, usagePercent (defined in utils)
#######################################
function cpuUsage()
{
  local interval=${1:-30}
  local statFile=${2:-/proc/stat}
  local cpuinfoFile=${3:-/proc/cpuinfo}
  local phyCPUNums lgCPUNums cores
  local cpuTotalStart cpuUsedStart cpuTotalEnd cpuUsedEnd usedCPU totalCPU

  # Number of physical CPUs = number of distinct "physical id" values.
  phyCPUNums=$(awk -F': *' '/^physical id/ { seen[$2] = 1 }
    END { n = 0; for (k in seen) n++; print n }' "${cpuinfoFile}")
  # Number of logical CPUs = number of "processor" entries. The pattern is
  # anchored so other lines containing that word are not counted.
  lgCPUNums=$(grep -c '^processor' "${cpuinfoFile}")
  # Cores, taken from the first "cpu cores" entry.
  cores=$(awk -F': *' '/^cpu cores/ { print $2; exit }' "${cpuinfoFile}")
  logInfo "Host [${ip}] physical CPU nums is : ${phyCPUNums}"
  logInfo "Host [${ip}] logic CPU nums is : ${lgCPUNums}"
  logInfo "Host [${ip}] core nums is : ${cores}"

  # CPU usage: sample the aggregate "cpu" line twice.
  #   total = sum of every counter on the line (fields 2..NF)
  #   used  = fields 2 + 3 + 4 + 7 + 8 (user, nice, system, irq, softirq
  #           according to proc(5))
  # Both numbers of one sample come from the same read of the file, and
  # printf "%.0f" keeps large counters out of exponent notation.
  local sampleProg='/^cpu / {
    total = 0
    for (i = 2; i <= NF; i++) total += $i
    printf "%.0f %.0f\n", total, $2 + $3 + $4 + $7 + $8
    exit
  }'
  read -r cpuTotalStart cpuUsedStart < <(awk "${sampleProg}" "${statFile}")
  # Wait for the window (30s by default), then sample again.
  sleep "${interval}"
  read -r cpuTotalEnd cpuUsedEnd < <(awk "${sampleProg}" "${statFile}")
  # Names are used bare inside $(( )) so an empty sample counts as 0.
  usedCPU=$(( cpuUsedEnd - cpuUsedStart ))
  totalCPU=$(( cpuTotalEnd - cpuTotalStart ))
  logInfo "Host [${ip}] CPU usage is : $(usagePercent "${usedCPU}" "${totalCPU}") %"
}
# Get the memory usage
#######################################
# Log total memory (GB) and memory usage (%) of the current host.
# Usage is computed as (MemTotal - MemFree) / MemTotal.
# Globals:   ip (read) - host label embedded in the log lines
# Arguments: $1 - optional path of the meminfo file (default /proc/meminfo)
# Outputs:   four lines written through logInfo
# Depends:   logInfo, kbToGb, usagePercent (defined in utils)
#######################################
function memUsage(){
  local meminfoFile=${1:-/proc/meminfo}
  local totalMem freeMem usedMem

  logInfo "Begin to get mem usage of Host [${ip}]"
  # Both values are reported by the kernel in kB.
  totalMem=$(awk '/^MemTotal:/ { print $2; exit }' "${meminfoFile}")
  freeMem=$(awk '/^MemFree:/ { print $2; exit }' "${meminfoFile}")
  # Names are used bare inside $(( )) so a missing value counts as 0
  # instead of producing a syntax error.
  # NOTE(review): MemFree does not include reclaimable cache, so this
  # figure runs high; MemAvailable may be the better basis.
  usedMem=$(( totalMem - freeMem ))
  logInfo "Host [${ip}] total mem is : $(kbToGb "${totalMem}") GB"
  # Compute the usage percentage and write it to the log.
  logInfo "Host [${ip}] mem usage is : $(usagePercent "${usedMem}" "${totalMem}") %"
  logInfo "End to get mem usage of Host [${ip}]"
}
# Average NIC traffic per second
#######################################
# Log the average inbound and outbound traffic of one network interface
# over a sampling window.
# Globals:   ip (read) - host label embedded in the log lines
# Arguments: $1 - interface name, e.g. wlp3s0
#            $2 - optional window length in seconds (default 10)
#            $3 - optional path of the net/dev file (default /proc/net/dev)
# Outputs:   lines written through logInfo
# Returns:   0 on success, 1 when the interface is not listed
#######################################
function netData(){
  local ethName=${1:-}
  local interval=${2:-10}
  local netDevFile=${3:-/proc/net/dev}
  local receiveByteStart sendByteStart receiveByteEnd sendByteEnd
  local inDataRate outDataRate

  # Prints "<rx bytes> <tx bytes>" (fields 2 and 10) for exactly one
  # interface. The name is compared for equality, so "eth0" does not also
  # match "veth0", and splitting on ':' and blanks copes with "eth0:12345".
  local counterProg='{ sub(/^ +/, "") } $1 == dev { print $2, $10; exit }'

  logInfo "Begin to get net data of Host [${ip}]"
  read -r receiveByteStart sendByteStart \
    < <(awk -v dev="${ethName}" -F'[: ]+' "${counterProg}" "${netDevFile}")
  if [[ -z "${receiveByteStart}" || -z "${sendByteStart}" ]]; then
    logInfo "ERROR net interface [${ethName}] not found on Host [${ip}]"
    logInfo "End to get net data of Host [${ip}]"
    return 1
  fi
  sleep "${interval}"
  read -r receiveByteEnd sendByteEnd \
    < <(awk -v dev="${ethName}" -F'[: ]+' "${counterProg}" "${netDevFile}")

  # The counters are in bytes; divide by 1024 and by the window length so
  # the value really is the kB/s that the log line announces (the previous
  # formula logged bytes/s under that label).
  inDataRate=$(LC_ALL=C awk -v s="${receiveByteStart}" -v e="${receiveByteEnd:-0}" \
    -v t="${interval}" 'BEGIN {
      rate = (t > 0) ? (e - s) / 1024 / t : 0
      printf "%.2f", rate
    }')
  outDataRate=$(LC_ALL=C awk -v s="${sendByteStart}" -v e="${sendByteEnd:-0}" \
    -v t="${interval}" 'BEGIN {
      rate = (t > 0) ? (e - s) / 1024 / t : 0
      printf "%.2f", rate
    }')
  logInfo "Host [${ip}] in data is : ${inDataRate} kb / s"
  logInfo "Host [${ip}] out data is : ${outDataRate} kb / s"
  logInfo "End to get net data of Host [${ip}]"
}
# Disk space usage
# Write the output of "df -h" to the log, framed by a begin and an end marker.
# Reads the global ip for the host label; relies on logInfo and
# noTimeLogInfo from utils.
function diskUsage(){
  local dfReport
  logInfo "Begin to get disk usage of Host [${ip}]"
  dfReport=$(df -h)
  noTimeLogInfo "${dfReport}"
  logInfo "End to get disk usage of Host [${ip}]"
}
# disk IO in
#######################################
# Log the average disk read rate (MB/s) over a sampling window, based on
# the pgpgin counter of /proc/vmstat.
# Globals:   ip (read) - host label embedded in the log line
# Arguments: $1 - optional window length in seconds (default 10)
#            $2 - optional path of the vmstat file (default /proc/vmstat)
# Outputs:   one line written through logInfo
#######################################
function diskIOIn(){
  # NOTE(review): the published script lost its numeric literals; 10s matches
  # the window used by netData - confirm against the original.
  local interval=${1:-10}
  local vmstatFile=${2:-/proc/vmstat}
  local inIoStart inIoEnd inIo

  # Inbound disk IO counter before and after the window.
  inIoStart=$(awk '$1 == "pgpgin" { print $2; exit }' "${vmstatFile}")
  sleep "${interval}"
  inIoEnd=$(awk '$1 == "pgpgin" { print $2; exit }' "${vmstatFile}")
  # pgpgin is treated as kB (presumably, per proc(5) - verify), hence the
  # division by 1024 to get MB. awk is used so rates below 1 MB/s are not
  # truncated to 0.
  inIo=$(LC_ALL=C awk -v s="${inIoStart:-0}" -v e="${inIoEnd:-0}" -v t="${interval}" '
    BEGIN {
      rate = (t > 0) ? (e - s) / (1024 * t) : 0
      printf "%.2f", rate
    }')
  logInfo "Host [${ip}] in IO is : ${inIo} MB / s"
}
# disk IO out
#######################################
# Log the average disk write rate (MB/s) over a sampling window, based on
# the pgpgout counter of /proc/vmstat.
# Globals:   ip (read) - host label embedded in the log line
# Arguments: $1 - optional window length in seconds (default 10)
#            $2 - optional path of the vmstat file (default /proc/vmstat)
# Outputs:   one line written through logInfo
#######################################
function diskIOout(){
  # NOTE(review): the published script lost its numeric literals; 10s matches
  # the window used by netData - confirm against the original.
  local interval=${1:-10}
  local vmstatFile=${2:-/proc/vmstat}
  local outIoStart outIoEnd outIo

  # Outbound disk IO counter before and after the window.
  outIoStart=$(awk '$1 == "pgpgout" { print $2; exit }' "${vmstatFile}")
  sleep "${interval}"
  outIoEnd=$(awk '$1 == "pgpgout" { print $2; exit }' "${vmstatFile}")
  # pgpgout is treated as kB (presumably, per proc(5) - verify), hence the
  # division by 1024 to get MB. awk is used so rates below 1 MB/s are not
  # truncated to 0.
  outIo=$(LC_ALL=C awk -v s="${outIoStart:-0}" -v e="${outIoEnd:-0}" -v t="${interval}" '
    BEGIN {
      rate = (t > 0) ? (e - s) / (1024 * t) : 0
      printf "%.2f", rate
    }')
  logInfo "Host [${ip}] out IO is : ${outIo} MB / s"
}
# Users currently logged in
#######################################
# Log how many sessions "w" reports and list each one as
# "<user><TAB><TAB><4th column of w>".
# Outputs: one line through logInfo, then a header line and the session
#          list through noTimeLogInfo
#######################################
function onlineUser(){
  local sessions user userCount

  # "w" prints two header lines (the uptime summary and the column titles).
  # Skipping only the first one made the title row show up as a user and
  # inflated the count by one, so both are dropped here. w runs once so the
  # list and the count describe the same moment.
  sessions=$(w | awk 'NR > 2')
  user=$(awk 'NF { print $1 "\t" "\t" $4 }' <<<"${sessions}")
  userCount=$(awk 'NF { n++ } END { print n + 0 }' <<<"${sessions}")
  logInfo "There are [${userCount}] users online now."
  noTimeLogInfo "UserName loginAt"
  noTimeLogInfo "${user}"
}
# Check host network connectivity
#######################################
# Ping every host listed in ./hostLists; for each host that answers, run
# all metric collectors, otherwise log an error line.
# Globals:   ip (written) - current host; deliberately NOT local because
#            the collector functions embed ${ip} in their log lines
# Returns:   1 when hostLists cannot be read, 0 otherwise
# NOTE(review): the collectors read the local /proc files and local
# commands, so the figures describe the machine running this script, not
# the pinged host - confirm this is intended.
#######################################
function isAlive(){
  local -a hosts=()

  if [[ ! -r hostLists ]]; then
    logInfo "ERROR hostLists is missing or unreadable"
    return 1
  fi
  # Split the whole file on whitespace into an array (same tokenisation as
  # the former `cat` word-splitting, but without glob expansion). read
  # returns non-zero when it hits end-of-file, which is expected here.
  read -r -d '' -a hosts < hostLists

  for ip in "${hosts[@]}"; do
    # One probe per host; the exit status of ping is tested directly.
    # NOTE(review): the probe count was lost from the published script;
    # 1 is assumed.
    if ping -c 1 "${ip}" >/dev/null; then
      logInfo "${ip} is reachable"
      # Users currently logged in
      onlineUser
      # CPU information
      cpuUsage
      # Memory information
      memUsage
      # Disk IO
      diskIOIn
      diskIOout
      # Disk space usage
      diskUsage
      # Average inbound/outbound traffic per second
      netData wlp3s0
    else
      logInfo "ERROR ${ip} is unreachable,try login in see more details.."
    fi
  done
}

# Main loop: run one monitoring round, pause, repeat forever.
# `while [ ]` is always false (an empty test fails), so `true` is used.
# NOTE(review): the pause length was lost from the published script;
# 60s is assumed.
while true; do
  isAlive
  sleep 60
done
  • utils:打印日志的函数等。
 #!/bin/bash
# Logging helpers.
# Remember the directory the script was started from; logInfo puts the
# log file there.
curr_path=$(pwd)
#######################################
# Append one timestamped "[Info]" line to ${curr_path}/system_status.log.
# Line format: "<YYYY-mm-dd HH:MM:SS> <user> [Info] <message>"
# Globals:   curr_path (read)  - directory holding the log file
#            log_file (written) - kept global because noTimeLogInfo
#                                 appends to the same file
# Arguments: $1 - message text
#######################################
function logInfo()
{
  local info=${1:-}
  local curr_time
  curr_time=$(date "+%Y-%m-%d %H:%M:%S")
  log_file=${curr_path}/system_status.log

  if [[ -e "${log_file}" ]]; then
    # Grant write permission only when it is missing (the test used to be
    # inverted and chmod had no mode argument).
    # NOTE(review): the original mode was lost; u+w is the minimum needed.
    if [[ ! -w "${log_file}" ]]; then
      chmod u+w "${log_file}"
    fi
  else
    # Create the log file on first use.
    touch "${log_file}"
  fi
  # Write the log line.
  printf '%s %s [Info] %s\n' "${curr_time}" "$(whoami)" "${info}" >> "${log_file}"
}
#######################################
# Append a message to the log file as-is, without timestamp or user.
# Globals:   log_file (read; set by logInfo). When it is still unset the
#            default location ${curr_path}/system_status.log is used so the
#            function also works before the first logInfo call.
# Arguments: $1 - text to append (may span several lines)
#######################################
function noTimeLogInfo(){
  local msg=${1:-}
  : "${log_file:=${curr_path:-$(pwd)}/system_status.log}"
  # printf instead of echo so text such as "-n" is written literally.
  printf '%s\n' "${msg}" >> "${log_file}"
}
# Convert kB to GB with a precision of 3 (expr only supports integers).
#######################################
# Convert a size in kB to GB with three decimals.
# Arguments: $1 - size in kB (defaults to 0)
# Outputs:   the GB value on stdout, e.g. "3.741"
# awk replaces bc so the result always has a leading zero ("0.500", where
# bc prints ".500") and no extra package is needed; the value is rounded
# to the third decimal.
#######################################
function kbToGb(){
  local kbVal=${1:-0}
  LC_ALL=C awk -v kb="${kbVal}" 'BEGIN { printf "%.3f\n", kb / 1024 / 1024 }'
}
#######################################
# Express a usage ratio as a percentage with two decimals.
# Arguments: $1 - amount used
#            $2 - total amount
# Outputs:   used * 100 / total on stdout, e.g. "25.00"; "0.00" when the
#            total is zero or missing, so callers always get a number
#######################################
function usagePercent(){
  local used=${1:-0}
  local total=${2:-0}
  LC_ALL=C awk -v u="${used}" -v t="${total}" 'BEGIN {
    if (t + 0 == 0) {
      print "0.00"
      exit
    }
    printf "%.2f\n", u * 100 / t
  }'
}

脚本结构:

 -rw-r--r--  stephen stephen    4月    : hostLists
-rwxrwxr-x stephen stephen 4月 : sysMonitor.sh*
-rw-r--r-- stephen stephen 4月 : utils

4、运行结果

监控信息记录在日志system_status.log中。运行结果如下:

-- ::  stephen [Info] 192.168.1.109 is reachable
-- :: stephen [Info] There are [] users online now.
UserName loginAt
USER LOGIN@
stephen :
-- :: stephen [Info] Host [192.168.1.109] physical CPU nums is :
-- :: stephen [Info] Host [192.168.1.109] logic CPU nums is :
-- :: stephen [Info] Host [192.168.1.109] core nums is :
-- :: stephen [Info] Host [192.168.1.109] CPU usage is : 10.12 %
-- :: stephen [Info] Begin to get mem usage of Host [192.168.1.109]
-- :: stephen [Info] Host [192.168.1.109] total mem is : 3.741 GB
-- :: stephen [Info] Host [192.168.1.109] mem usage is : 95.83 %
-- :: stephen [Info] End to get mem usage of Host [192.168.1.109]
-- :: stephen [Info] Host [192.168.1.109] in IO is : MB / s
-- :: stephen [Info] Host [192.168.1.109] out IO is : MB / s
-- :: stephen [Info] Begin to get disk usage of Host [192.168.1.109]
文件系统 容量 已用 可用 已用% 挂载点
udev .9G .9G % /dev
tmpfs 384M 2.0M 382M % /run
/dev/sda10 42G 20G 20G % /
tmpfs .9G 20M .9G % /dev/shm
tmpfs 5.0M .0K 5.0M % /run/lock
tmpfs .9G .9G % /sys/fs/cgroup
/dev/loop0 3.8M 3.8M % /snap/notepad-plus-plus/
/dev/loop2 54M 54M % /snap/core18/
/dev/loop4 441M 441M % /snap/wine-platform/
/dev/loop5 441M 441M % /snap/wine-platform/
/dev/loop7 3.8M 3.8M % /snap/notepad-plus-plus/
/dev/loop3 90M 90M % /snap/core/
/dev/loop1 274M 274M % /snap/wps-office-multilang/
/dev/loop6 91M 91M % /snap/core/
/dev/loop8 92M 92M % /snap/core/
/dev/loop9 36M 36M % /snap/gtk-common-themes/
/dev/loop10 3.8M 3.8M % /snap/notepad-plus-plus/
/dev/loop11 441M 441M % /snap/wine-platform/
tmpfs 384M 16K 384M % /run/user/
tmpfs 384M 52K 384M % /run/user/
-- :: stephen [Info] End to get disk usage of Host [192.168.1.109]
-- :: stephen [Info] Begin to get net data of Host [192.168.1.109]
-- :: stephen [Info] Host [192.168.1.109] in data is : 42.90 kb / s
-- :: stephen [Info] Host [192.168.1.109] out data is : 7.00 kb / s
-- :: stephen [Info] End to get net data of Host [192.168.1.109]
-- :: stephen [Info] ERROR 255.255.255.254 is unreachable,try login in see more details..

5、参考文档

5.1、ifstat网络流量监控之/proc/net/dev文件

https://blog.csdn.net/kongshuai19900505/article/details/80676607

5.2、awk命令

http://man.linuxde.net/awk

5.3、使用shell脚本采集系统cpu、内存、磁盘、网络等信息

https://www.jb51.net/article/50436.htm