知乎專欄 | 多維度架構 | 微信號 netkiller-ebook | QQ群:128659835 請註明“讀者” |
檢查命令配置檔案 /etc/nagios-plugins/config/
nagios check_ping命令使用方法
具體如下: -H 主機地址 -w WARNING 狀態: 響應時間(毫秒),丟包率 (%) 閾值 -c CRITICAL狀態: 響應時間(毫秒),丟包率 (%) 閾值 -p 發送的包數 預設5個包 -t 超時時間 預設10秒 -4|-6 使用ipv4|ipv6 地址 預設ipv4
實例:
/usr/lib64/nagios/plugins/check_ping -H 74.125.71.106 -w 100.0,20% -c 200.0,50%
# /usr/lib64/nagios/plugins/check_procs PROCS OK: 75 processes # /usr/lib64/nagios/plugins/check_procs -a mingetty PROCS OK: 6 processes with args 'mingetty' # /usr/lib64/nagios/plugins/check_procs -C crond PROCS OK: 1 process with command name 'crond'
監控如果有用戶登錄就發出警告
# /usr/lib64/nagios/plugins/check_users -w 0 -c 5 USERS WARNING - 1 users currently logged in |users=1;0;5;0
監控登錄用戶數上限,超過 5 個時發出警告
# /usr/lib64/nagios/plugins/check_users -w 5 -c 50 USERS OK - 1 users currently logged in |users=1;5;50;0
命令定義
define command{ command_name check_http_404 command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -e '404' } define command{ command_name check_http_status command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -e '$ARG1$' } define command{ command_name check_http_url command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -u '$ARG1$' }
預設HTTP健康檢查超時時間是10秒,如果你的網站需要更長的時間才能打開可以使用-t參數修改預設Timeout時間
# 'check_http' command definition define command{ command_name check_http command_line /usr/lib/nagios/plugins/check_http -t 30 -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' }
# /srv/nagios/libexec/check_http -H www.163.com HTTP OK: HTTP/1.0 200 OK - 657627 bytes in 1.772 second response time |time=1.771681s;;;0.000000 size=657627B;;;0 $ /usr/lib/nagios/plugins/check_http -H www.example.com -I 172.16.0.8 -s "HTTs" HTTP CRITICAL: HTTP/1.1 404 Not Found - string not found - 336 bytes in 0.001 second response time |time=0.000733s;;;0.000000 size=336B;;;0 $ /usr/lib/nagios/plugins/check_http -H www.example.com -I 172.16.0.8 -e '404' HTTP OK: Status line output matched "404" - 336 bytes in 0.001 second response time |time=0.000715s;;;0.000000 size=336B;;;0
命令參數
check_mysql [-d database] [-H host] [-P port] [-s socket] [-u user] [-p password] [-S] /usr/lib64/nagios/plugins/check_mysql -d dbname -H 202.176.120.10 -P 3306 -u test -p password Uptime: 254264 Threads: 16 Questions: 535110791 Slow queries: 21 Opens: 110 Flush tables: 1 Open tables: 81 Queries per second avg: 2104.547
$ /usr/lib64/nagios/plugins/check_mysql --hostname=172.16.1.5 --port=3306 --username=monitor --password=monitor Uptime: 27001 Threads: 8 Questions: 25280156 Slow queries: 14941 Opens: 1389932 Flush tables: 3 Open tables: 128 Queries per second avg: 936.267
cat >> /usr/lib64/nagios/plugins/check_mysql_replication <<EOF #!/bin/bash declare -a slave_is slave_is=($(mysql -h$1 -umonitor -pxmNhj -e "show slave status\G"|grep Running |awk '{print $2}')) if [ "${slave_is[0]}" = "Yes" -a "${slave_is[1]}" = "Yes" ] then echo "OK - Slave is running" exit 0 else echo "Critical - Slave is error" exit 2 fi EOF
sudo chmod +x /usr/lib64/nagios/plugins/check_mysql_replication /usr/lib64/nagios/plugins/check_mysql_replication 172.16.1.4 Critical - slave is error
vim /etc/nagios-plugins/config/mysql.cfg # 'check_mysql_replication' command definition define command{ command_name check_mysql_replication command_line /usr/lib/nagios/plugins/check_mysql_replication $HOSTADDRESS$ } define command{ command_name check_mysql_replication_host command_line /usr/lib/nagios/plugins/check_mysql_replication '$ARG1$' }
nrpe.cfg
cat >> /usr/lib64/nagios/plugins/check_mysql_replication <<EOF #!/bin/bash declare -a slave_is slave_is=($(mysql -umonitor -pxmNhj -e "show slave status\G"|grep Running |awk '{print $2}')) if [ "${slave_is[0]}" = "Yes" -a "${slave_is[1]}" = "Yes" ] then echo "OK - slave is running" exit 0 else echo "Critical - slave is error" exit 2 fi EOF command[check_mysql_slave]=/usr/lib64/nagios/plugins/check_mysql_replication /usr/local/nagios/libexec/check_nrpe -H 192.168.1.1 /usr/local/nagios/libexec/check_nrpe -H 192.168.1.1 -c check_mysql_replication define service { host_name 192.168.10.232 service_description check_mysql_replication check_period 24x7 max_check_attempts 5 normal_check_interval 3 retry_check_interval 2 contact_groups mygroup notification_interval 5 notification_period 24x7 notification_options w,u,c,r check_command check_nrpe!check_mysql_replication }
$ cat /etc/nagios-plugins/config/disk.cfg # 'check_disk' command definition define command{ command_name check_disk command_line /usr/lib/nagios/plugins/check_disk -w '$ARG1$' -c '$ARG2$' -e -p '$ARG3$' } # 'check_all_disks' command definition define command{ command_name check_all_disks command_line /usr/lib/nagios/plugins/check_disk -w '$ARG1$' -c '$ARG2$' -e } # 'ssh_disk' command definition define command{ command_name ssh_disk command_line /usr/lib/nagios/plugins/check_by_ssh -H '$HOSTADDRESS$' -C '/usr/lib/nagios/plugins/check_disk -w '\''$ARG1$' -c '\''$ARG2$'\'' -e -p '\''$ARG3$'\' } #### # use these checks, if you want to test IPv4 connectivity on IPv6 enabled systems #### # 'ssh_disk_4' command definition define command{ command_name ssh_disk_4 command_line /usr/lib/nagios/plugins/check_by_ssh -H '$HOSTADDRESS$' -C '/usr/lib/nagios/plugins/check_disk -w '\''$ARG1$'\'' -c '\''$ARG2$'\'' -e -p '\''$ARG3$'\' -4 }
WARNING/CRITICAL 報警閾值
-w 10% -c 5% -w 100M -c 50M
-p, --path=PATH, --partition=PARTITION 指定監控路徑,可以重複使用該參數一次指定多個路徑
$ /usr/lib/nagios/plugins/check_disk -w 10% -c 5% -p / -p /opt -p /boot DISK OK - free space: / 23872 MB (66% inode=92%); /opt 99242 MB (47% inode=93%); /boot 276 MB (63% inode=99%);| /=11767MB;33792;35669;0;37547 /opt=110882MB;199232;210300;0;221369 /boot=160MB;414;437;0;460 $ /usr/lib/nagios/plugins/check_disk -w 100M -c 50M -p / -p /opt -p /boot DISK OK - free space: / 23872 MB (66% inode=92%); /opt 99242 MB (47% inode=93%); /boot 276 MB (63% inode=99%);| /=11768MB;37447;37497;0;37547 /opt=110882MB;221269;221319;0;221369 /boot=160MB;360;410;0;460
-x, --exclude_device=PATH 排除監控路徑
/usr/lib64/nagios/plugins/check_disk -w 10% -c 5% -e -x /bak -x /u01
$ cat disk-smb.cfg # 'check_disk_smb' command definition define command{ command_name check_disk_smb command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' } # 'check_disk_smb_workgroup' command definition define command{ command_name check_disk_smb_workgroup command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' } # 'check_disk_smb_host' command definition define command{ command_name check_disk_smb_host command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' } # 'check_disk_smb_workgroup_host' command definition define command{ command_name check_disk_smb_workgroup_host command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' } # 'check_disk_smb_user' command definition define command{ command_name check_disk_smb_user command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -u '$ARG3$' -p '$ARG4$' -w '$ARG5$' -c '$ARG6$' } # 'check_disk_smb_workgroup_user' command definition define command{ command_name check_disk_smb_workgroup_user command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' -u '$ARG4$' -p '$ARG5$' } # 'check_disk_smb_host_user' command definition define command{ command_name check_disk_smb_host_user command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -u '$ARG3$' -p '$ARG4$' } # 'check_disk_smb_workgroup_host_user' command definition define command{ command_name check_disk_smb_workgroup_host_user command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' -u '$ARG4$' -p '$ARG5$' }
$ /usr/lib/nagios/plugins/check_tcp -H 172.16.1.2 -p 80 TCP OK - 0.000 second response time on port 80|time=0.000369s;;;0.000000;10.000000
$ /usr/lib64/nagios/plugins/check_tcp -H localhost -p 11211 -t 5 -E -s 'stats\r\nquit\r\n' -e 'uptime' -M crit TCP OK - 0.001 second response time on port 11211 [STAT pid 29253 STAT uptime 36088 STAT time 1311100189 STAT version 1.4.5 STAT pointer_size 64 STAT rusage_user 3.207512 STAT rusage_system 50.596308 STAT curr_connections 10 STAT total_connections 97372 STAT connection_structures 84 STAT cmd_get 84673 STAT cmd_set 273 STAT cmd_flush 0 STAT get_hits 84336 STAT get_misses 337 STAT delete_misses 0 STAT delete_hits 0 STAT incr_misses 0 STAT incr_hits 0 STAT decr_misses 0 STAT decr_hits 0 STAT cas_misses 0 STAT cas_hits 0 STAT cas_badval 0 STAT auth_cmds 0 STAT auth_errors 0 STAT bytes_read 49280152 STAT bytes_written 46326517326 STAT limit_maxbytes 4294967296 STAT accepting_conns 1 STAT listen_disabled_num 0 STAT threads 4 STAT conn_yields 0 STAT bytes 1345 STAT curr_items 14 STAT total_items 241 STAT evictions 0 STAT reclaimed 135 END]|time=0.000658s;;;0.000000;5.000000
# /usr/lib64/nagios/plugins/check_tcp -H 192.168.2.1 -p 6379 -t 5 -E -s 'info\r\n' -q 'quit\r\n' -e 'uptime_in_days' -M crit TCP OK - 0.001 second response time on port 6379 [$1043 redis_version:2.4.10 redis_git_sha1:00000000 redis_git_dirty:0 arch_bits:64 multiplexing_api:epoll gcc_version:4.4.6 process_id:21331 uptime_in_seconds:18152153 uptime_in_days:210 lru_clock:1801614 used_cpu_sys:1579.41 used_cpu_user:2279.26 used_cpu_sys_children:54.32 used_cpu_user_children:54.11 connected_clients:2 connected_slaves:1 client_longest_output_list:0 client_biggest_input_buf:0 blocked_clients:0 used_memory:1158016 used_memory_human:1.10M used_memory_rss:1560576 used_memory_peak:1289920 used_memory_peak_human:1.23M mem_fragmentation_ratio:1.35 mem_allocator:jemalloc-2.2.5 loading:0 aof_enabled:0 changes_since_last_save:2 bgsave_in_progress:0 last_save_time:1423107828 bgrewriteaof_in_progress:0 total_connections_received:594376 total_commands_processed:1350747 expired_keys:12199 evicted_keys:0 keyspace_hits:511525 keyspace_misses:124116 pubsub_channels:0 pubsub_patterns:0 latest_fork_usec:361 vm_enabled:0 role:master slave0:192.168.6.1,58091,online db0:keys=1913,expires=7]|time=0.000815s;;;0.000000;5.000000
http://exchange.nagios.org/directory/Plugins/Network-Connections,-Stats-and-Bandwidth/check_traffic-2Esh/details
https://github.com/cloved/check_traffic
網卡流量監測
nrpe 插件接收來自 nagios-nrpe-server 的數據報告
cat /etc/nagios3/hosts/host.example.org.cfg define host{ use generic-host ; Inherit default values from a template host_name host.example.org ; The name we're giving to this host alias Some Remote Host ; A longer name associated with the host address 172.16.1.3 ; IP address of the host hostgroups all ; Host groups this host is associated with } # NRPE disk check. define service { use generic-service host_name backup service_description nrpe-disk check_command check_nrpe_1arg!check_all_disks!172.16.1.3 } define service { use generic-service host_name backup service_description nrpe-users check_command check_nrpe_1arg!check_users!172.16.1.3 } define service { use generic-service host_name backup service_description nrpe-swap check_command check_nrpe_1arg!check_swap!172.16.1.3 } define service { use generic-service host_name backup service_description nrpe-procs check_command check_nrpe_1arg!check_procs!172.16.1.3 }
Define windows services that should be monitored.
# Define a host for the Windows machine we'll be monitoring # Change the host_name, alias, and address to fit your situation define host{ use windows-server ; Inherit default values from a template host_name remote-windows-host ; The name we're giving to this host alias Remote Windows Host ; A longer name associated with the host address 192.168.1.4 ; IP address of the remote windows host } define service{ use generic-service host_name remote-windows-host service_description NSClient++ Version check_command check_nt!CLIENTVERSION } define service{ use generic-service host_name remote-windows-host service_description Uptime check_command check_nt!UPTIME } define service{ use generic-service host_name remote-windows-host service_description CPU Load check_command check_nt!CPULOAD!-l 5,80,90 } define service{ use generic-service host_name remote-windows-host service_description Memory Usage check_command check_nt!MEMUSE!-w 80 -c 90 } define service{ use generic-service host_name remote-windows-host service_description C:\ Drive Space check_command check_nt!USEDDISKSPACE!-l c -w 80 -c 90 } define service{ use generic-service host_name remote-windows-host service_description W3SVC check_command check_nt!SERVICESTATE!-d SHOWALL -l W3SVC } define service{ use generic-service host_name remote-windows-host service_description Explorer check_command check_nt!PROCSTATE!-d SHOWALL -l Explorer.exe }
Enable Password Protection
define command{ command_name check_nt command_line $USER1$/check_nt -H $HOSTADDRESS$ -p 12489 -s My2Secure$Password -v $ARG1$ $ARG2$ }
https://code.google.com/p/jmxquery/
wget https://jmxquery.googlecode.com/files/jmxquery-1.3-bin.zip unzip jmxquery-1.3-bin.zip chmod +x check_jmx
# ./check_jmx -help Usage: check_jmx [-option...] -U url -O object -A attribute (to query an attribute) or check_jmx [-option...] -U url -O object -M method (to invoke a zero-argument method) or check_jmx -help (to display this help page) Mandatory parameters are: -U JMX URL, for example: "service:jmx:rmi:///jndi/rmi://localhost:1616/jmxrmi" -O Object name to be checked, for example, "java.lang:type=Memory" -A Attribute of the object to be checked, for example, "NonHeapMemoryUsage" (not compatible with -M switch) -M Zero-argument method to be invoked (not compatible with -A switch) Options are: -K <key> Key for compound data, for example, "used" -I <info attribute> Attribute of the object containing information for text output -J <info attribute key> Attribute key for -I attribute compound data, for example, "used" -v[v[v[v]]] Verbatim level controlled as a number of v -w <limit> Warning long value -c <limit> Critical long value -default <value> Use default value if requested object/attribute/method does not exist -username <user name> -password <password> Credentials for JMX Note that if warning level > critical, system checks object attribute value to be LESS THAN OR EQUAL warning, critical If warning level < critical, system checks object attribute value to be MORE THAN OR EQUAL warning, critical
例 96.1.
# ./check_jmx -U service:jmx:rmi:///jndi/rmi://localhost:9012/jmxrmi -O java.lang:type=Memory -A HeapMemoryUsage -K used -I HeapMemoryUsage -J used -vvvv -w 731847066 -c 1045495808 JMX OK - HeapMemoryUsage.used=98617544 | HeapMemoryUsage.used=98617544,committed=514850816;init=536870912;max=7635730432;used=98617544
# ./check_jmx -U service:jmx:rmi:///jndi/rmi://localhost:9012/jmxrmi -O org:type=Spring,name=BackgroundService -A QueueSize -w 10 -c 20 JMX CRITICAL - org:type=Spring,name=BackgroundService