#lsblk 命令可以查看一些机器目前的磁盘信息
[root@localhost ~]# lsblk
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
sda 8:0 0 30T 0 disk
└─sda1 8:1 0 27.3T 0 part
└─vgdata-lvdata 253:2 0 27.3T 0 lvm /data
sdb 8:16 0 278.9G 0 disk
├─sdb1 8:17 0 1G 0 part /boot
└─sdb2 8:18 0 277.9G 0 part
├─centos-root 253:0 0 50G 0 lvm /
├─centos-swap 253:1 0 7.8G 0 lvm [SWAP]
└─centos-home 253:3 0 220.1G 0 lvm /home
[root@localhost ~]# cat /proc/mdstat
Personalities :
unused devices: <none>
######在另外的机器上查看mdstat, 这个机器我用MegaCli无法操作raid(只返回Exit Code: 0x00),感觉是个软raid
[root@w Linux]# cat /proc/mdstat
Personalities : [raid10]
md126 : active raid10 sda[3] sdb[2] sdc[1] sdd[0]
7814032384 blocks super external:/md0/0 64K chunks 2 near-copies [4/4] [UUUU]
md0 : inactive sdc[3](S) sdd[2](S) sdb[1](S) sda[0](S)
9040 blocks super external:imsm
unused devices: <none>
[root@w Linux]# lsblk
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
sda 8:0 0 3.7T 0 disk
└─md126 9:126 0 7.3T 0 raid10
├─md126p1 259:0 0 200M 0 md /boot/efi
├─md126p2 259:1 0 500M 0 md /boot
└─md126p3 259:2 0 7.3T 0 md
├─VolGroup-lv_root (dm-0) 253:0 0 30G 0 lvm /
├─VolGroup-lv_swap (dm-1) 253:1 0 8G 0 lvm [SWAP]
└─VolGroup-share (dm-2) 253:2 0 7.2T 0 lvm /w2
sdc 8:32 0 3.7T 0 disk
└─md126 9:126 0 7.3T 0 raid10
├─md126p1 259:0 0 200M 0 md /boot/efi
├─md126p2 259:1 0 500M 0 md /boot
└─md126p3 259:2 0 7.3T 0 md
├─VolGroup-lv_root (dm-0) 253:0 0 30G 0 lvm /
├─VolGroup-lv_swap (dm-1) 253:1 0 8G 0 lvm [SWAP]
└─VolGroup-share (dm-2) 253:2 0 7.2T 0 lvm /w2
sdb 8:16 0 3.7T 0 disk
└─md126 9:126 0 7.3T 0 raid10
├─md126p1 259:0 0 200M 0 md /boot/efi
├─md126p2 259:1 0 500M 0 md /boot
└─md126p3 259:2 0 7.3T 0 md
├─VolGroup-lv_root (dm-0) 253:0 0 30G 0 lvm /
├─VolGroup-lv_swap (dm-1) 253:1 0 8G 0 lvm [SWAP]
└─VolGroup-share (dm-2) 253:2 0 7.2T 0 lvm /w2
sdd 8:48 0 3.7T 0 disk
└─md126 9:126 0 7.3T 0 raid10
├─md126p1 259:0 0 200M 0 md /boot/efi
├─md126p2 259:1 0 500M 0 md /boot
└─md126p3 259:2 0 7.3T 0 md
├─VolGroup-lv_root (dm-0) 253:0 0 30G 0 lvm /
├─VolGroup-lv_swap (dm-1) 253:1 0 8G 0 lvm [SWAP]
└─VolGroup-share (dm-2) 253:2 0 7.2T 0 lvm /w2
lspci 是显示系统中所有PCI总线以及连接到该总线上的所有设备的工具,可以用来确定机器的raid硬件型号信息;不同的raid卡需要使用不同的管理工具。
[root@w Linux]# lspci
00:00.0 Host bridge: Intel Corporation Skylake Host Bridge/DRAM Registers (rev 07)
00:01.0 PCI bridge: Intel Corporation Skylake PCIe Controller (x16) (rev 07)
00:14.0 USB controller: Intel Corporation Sunrise Point-H USB 3.0 xHCI Controller (rev 31)
00:14.2 Signal processing controller: Intel Corporation Sunrise Point-H Thermal subsystem (rev 31)
00:16.0 Communication controller: Intel Corporation Sunrise Point-H CSME HECI #1 (rev 31)
00:17.0 RAID bus controller: Intel Corporation SATA Controller [RAID mode] (rev 31)
00:1c.0 PCI bridge: Intel Corporation Sunrise Point-H PCI Express Root Port #1 (rev f1)
00:1f.0 ISA bridge: Intel Corporation Sunrise Point-H LPC Controller (rev 31)
00:1f.2 Memory controller: Intel Corporation Sunrise Point-H PMC (rev 31)
00:1f.3 Audio device: Intel Corporation Sunrise Point-H HD Audio (rev 31)
00:1f.4 SMBus: Intel Corporation Sunrise Point-H SMBus (rev 31)
00:1f.6 Ethernet controller: Intel Corporation Ethernet Connection (2) I219-LM (rev 31)
01:00.0 VGA compatible controller: NVIDIA Corporation GM107GL [Quadro K620] (rev a2)
01:00.1 Audio device: NVIDIA Corporation Device 0fbc (rev a1)
02:00.0 PCI bridge: Texas Instruments XIO2001 PCI Express-to-PCI Bridge
[root@localhost ~]# dmidecode | grep "Product Name"
Product Name: PowerVault NX3200
Product Name: 068CDY
[root@localhost ~]# dmidecode | more
# dmidecode 3.2
#......省略部分输出
BIOS Information
Vendor: Dell Inc.
Version: 2.4.3
选择raid管理工具,这里使用MegaCli管理,下载地址https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/8-07-14_MegaCLI.zip
##安装
[root@localhost ~]# unzip 8-07-14_MegaCLI.zip
[root@localhost ~]# cd Linux/
[root@localhost Linux]# rpm -ivh MegaCli-8.07.14-1.noarch.rpm
##查所有适配器信息
[root@localhost Linux]# /opt/MegaRAID/MegaCli/MegaCli64 -AdpAllInfo -aALL
#Adapter #0说明只有一块lsi卡
[root@localhost Linux]# /opt/MegaRAID/MegaCli/MegaCli64 -PDList -aALL|grep Adapter
Adapter #0
#查看磁盘有没有坏的:Firmware state: Online, Spun Up 表示磁盘状态良好;Firmware state: Unconfigured(good), Spun Up 表示这块盘已经被更换过,但是还没有配置;Firmware state: Failed 表示坏硬盘
[root@localhost Linux]# /opt/MegaRAID/MegaCli/MegaCli64 -PDList -a0|grep Firm
Firmware state: Online, Spun Up
Device Firmware Level: DG04
Firmware state: Online, Spun Up
Device Firmware Level: 0003
Firmware state: Online, Spun Up
Device Firmware Level: DG04
Firmware state: Online, Spun Up
Device Firmware Level: DG04
#.........省略部分输出
根据输出编写监控脚本,原理:逐行检查 Firmware state 的输出,只要有一行既不等于 Firmware state: Online, Spun Up,也不等于 Firmware state: Hotspare, Spun Up(热备盘),就认为存在故障盘。
[root@localhost ~]# vim /mnt/check_raid.sh
#!/bin/bash
# Disk health check for the physical drives behind MegaRAID adapter 0.
#
# Prints a single line to stdout:
#   0 - every "Firmware state" line is an accepted healthy state
#   1 - at least one disk is in any other state, OR no "Firmware state"
#       line could be read at all (MegaCli/sudo failed, no disks listed)
#
# Accepted states are matched exactly:
#   "Firmware state: Online, Spun Up"    - normal member disk
#   "Firmware state: Hotspare, Spun Up"  - hot spare disk

megacli=/opt/MegaRAID/MegaCli/MegaCli64

#######################################
# Evaluate MegaCli -PDList output.
# Arguments: none (reads the MegaCli output from stdin)
# Outputs:   writes 0 or 1 to stdout, as described in the file header
#######################################
raid_flag() {
  local line
  local seen=0
  local flag=0

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Only the per-disk state lines matter; skip everything else.
    [[ "$line" == *'Firmware state'* ]] || continue
    seen=1
    case "$line" in
      'Firmware state: Online, Spun Up') ;;
      'Firmware state: Hotspare, Spun Up') ;;
      *) flag=1 ;;
    esac
  done

  # Reading no state line at all means the check itself did not work;
  # report that as a fault instead of silently claiming "healthy".
  if (( seen == 0 )); then
    flag=1
  fi

  printf '%s\n' "$flag"
}

# -n: never wait for a password prompt; a missing sudo rule then shows up
# as "no state lines" (flag 1) instead of a hung check.
# stderr is discarded so the only thing printed is the 0/1 flag.
raid_flag < <(sudo -n "$megacli" -PDList -a0 2>/dev/null)
配置zabbix
[root@localhost ~]# vim /etc/zabbix/zabbix_agentd.conf
#.....省略其他内容
UnsafeUserParameters=1
UserParameter=check_raid.status[*],/mnt/check_raid.sh
[root@localhost ~]# systemctl restart zabbix-agent
配置sudo权限,zabbix用户无法直接使用/opt/MegaRAID/MegaCli/MegaCli64命令
[root@localhost ~]# vim /etc/sudoers
zabbix ALL=(ALL) NOPASSWD: /opt/MegaRAID/MegaCli/MegaCli64
在主机中定义监控项目与触发器。
触发器定义:(测试报警可以先把<>设置为=来测试,没问题了改为<>)
测试告警:
至此,磁盘故障告警添加配置完成。