Home | Mirror | Search |
homepage: http://www.nagios.org/
Nagios 是一種開放原始碼監視軟件,它可以掃瞄主機、服務、網絡方面存在的問題。Nagios 與其他類似的包之間的主要區別在於,Nagios 將所有的信息簡化為“工作(working)”、“可疑的(questionable)”和“故障(failure)”狀態,並且 Nagios 支持由插件組成的非常豐富的“生態系統”。這些特性使得用戶能夠進行有效安裝,在此過程中無需過多地關心細節內容,只提供他們所需的信息即可。
install
$ sudo apt-get install nagios3 nagios-nrpe-plugin
add user nagiosadmin for nagios
$ sudo htpasswd -c /etc/nagios3/htpasswd.users nagiosadmin New password: Re-type new password: Adding password for user nagiosadmin
Create a new nagcmd group for allowing external commands to be submitted through the web interface. Add both the nagios user and the apache user to the group.
$ sudo groupadd nagcmd $ sudo usermod -a -G nagcmd nagios $ sudo usermod -a -G nagcmd www-data $ cat /etc/group nagcmd:x:1003:nagios,www-data
reload apache
$ sudo /etc/init.d/apache2 reload * Reloading web server config apache2 [ OK ]
$ sudo vim /etc/nagios3/nagios.cfg cfg_dir=/etc/nagios3/hosts cfg_dir=/etc/nagios3/servers cfg_dir=/etc/nagios3/switches cfg_dir=/etc/nagios3/routers admin_email=nagios, neo.chen@example.com
add user neo for nagios
$ sudo htpasswd /etc/nagios3/htpasswd.users neo New password: Re-type new password: Adding password for user neo
$ sudo vim /etc/nagios3/cgi.cfg authorized_for_all_services=nagiosadmin,neo authorized_for_all_hosts=nagiosadmin,neo
$ sudo vim /etc/nagios3/conf.d/contacts_nagios2.cfg ############################################################################### # contacts.cfg ############################################################################### define contact{ contact_name neo alias Neo service_notification_period 24x7 host_notification_period 24x7 service_notification_options w,u,c,r host_notification_options d,r service_notification_commands notify-service-by-email host_notification_commands notify-host-by-email email neo.chen@example.com } ############################################################################### ############################################################################### # # CONTACT GROUPS # ############################################################################### ############################################################################### # We only have one contact in this simple configuration file, so there is # no need to create more than one contact group. define contactgroup{ contactgroup_name admins alias Nagios Administrators members root, neo }
當服務出現w—報警(warning),u—未知(unknown),c—嚴重(critical),r—從異常恢復到正常,在這四種情況下通知聯繫人
主機通知選項 host_notification_options 的可選值:d—當機(down),u—返回不可達(unreachable),r—從異常情況恢復正常;上面的例子設置為 d,r,即只在當機與恢復正常這2種情況下通知聯繫人
確認 contact_groups 已經設置
neo@monitor:/etc/nagios3$ grep admins conf.d/generic-host_nagios2.cfg contact_groups admins neo@monitor:/etc/nagios3$ grep admins conf.d/generic-service_nagios2.cfg contact_groups admins
$ sudo vim /etc/nagios3/conf.d/hostgroups_nagios2.cfg define hostgroup { hostgroup_name mysql-servers alias MySQL Servers members * }
$ cat /etc/nagios3/conf.d/generic-service_nagios2.cfg # generic service template definition define service{ name generic-service ; The 'name' of this service template active_checks_enabled 1 ; Active service checks are enabled passive_checks_enabled 1 ; Passive service checks are enabled/accepted parallelize_check 1 ; Active service checks should be parallelized (disabling this can lead to major performance problems) obsess_over_service 1 ; We should obsess over this service (if necessary) check_freshness 0 ; Default is to NOT check service 'freshness' notifications_enabled 1 ; Service notifications are enabled event_handler_enabled 1 ; Service event handler is enabled flap_detection_enabled 1 ; Flap detection is enabled failure_prediction_enabled 1 ; Failure prediction is enabled process_perf_data 1 ; Process performance data retain_status_information 1 ; Retain status information across program restarts retain_nonstatus_information 1 ; Retain non-status information across program restarts notification_interval 0 ; Only send notifications on status change by default. is_volatile 0 check_period 24x7 normal_check_interval 5 retry_check_interval 1 max_check_attempts 4 notification_period 24x7 notification_options w,u,c,r contact_groups admins register 0 ; DONT REGISTER THIS DEFINITION - ITS NOT A REAL SERVICE, JUST A TEMPLATE! }
notification_interval 報警發送間隔,單位分鐘
normal_check_interval 正常狀態下的檢查間隔時間,單位分鐘
retry_check_interval 檢查失敗後的重試間隔時間,單位分鐘
max_check_attempts 檢查次數,4次失敗後報警
發出警報聲
$ sudo vim /etc/nagios3/cgi.cfg # SOUND OPTIONS # These options allow you to specify an optional audio file # that should be played in your browser window when there are # problems on the network. The audio files are used only in # the status CGI. Only the sound for the most critical problem # will be played. Order of importance (higher to lower) is as # follows: unreachable hosts, down hosts, critical services, # warning services, and unknown services. If there are no # visible problems, the sound file optionally specified by # 'normal_sound' variable will be played. # # # <varname>=<sound_file> # # Note: All audio files must be placed in the /media subdirectory # under the HTML path (i.e. /usr/local/nagios/share/media/). host_unreachable_sound=hostdown.wav host_down_sound=hostdown.wav service_critical_sound=critical.wav service_warning_sound=warning.wav service_unknown_sound=warning.wav normal_sound=noproblem.wav
vim /etc/nagios3/commands.cfg # 'notify-host-by-sms' command definition define command{ command_name notify-host-by-sms command_line /srv/sms/sms "Host: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" } # 'notify-service-by-sms' command definition define command{ command_name notify-service-by-sms command_line /srv/sms/sms "Service: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$" }
sudo vim /etc/nagios3/conf.d/contacts_nagios2.cfg define contact{ contact_name neo alias Neo service_notification_period 24x7 host_notification_period 24x7 service_notification_options w,u,c,r host_notification_options d,r service_notification_commands notify-service-by-email, notify-service-by-sms host_notification_commands notify-host-by-email, notify-host-by-sms email neo.chen@example.com }
vim /etc/nagios3/routers/firewall.cfg define host{ use generic-host; Inherit default values from a template host_name firewall ; The name we're giving to this switch alias Cisco PIX 515E Firewall ; A longer name associated with the switch address 172.16.1.254 ; IP address of the switch hostgroups all,networks ; Host groups this switch is associated with } define service{ use generic-service ; Inherit values from a template host_name firewall ; The name of the host the service is associated with service_description PING ; The service description check_command check_ping!200.0,20%!600.0,60% ; The command used to monitor the service normal_check_interval 5 ; Check the service every 5 minutes under normal conditions retry_check_interval 1 ; Re-check the service every minute until its final/hard state is determined } define service{ use generic-service ; Inherit values from a template host_name firewall service_description Uptime check_command check_snmp!-C public -o sysUpTime.0 }
hosts
$ cat /etc/nagios3/hosts/www.example.com.cfg define host{ use generic-host ; Inherit default values from a template host_name www.example.com ; The name we're giving to this host alias Some Remote Host ; A longer name associated with the host address 120.132.14.6 ; IP address of the host hostgroups all,http-servers ; Host groups this host is associated with } define service{ use generic-service ; Inherit default values from a template host_name www.example.com service_description HTTP check_command check_http }
HTTP狀態
neo@monitor:~$ /usr/lib/nagios/plugins/check_http -H www.example.com -I 172.16.0.8 -s "HTTs" HTTP CRITICAL: HTTP/1.1 404 Not Found - string not found - 336 bytes in 0.001 second response time |time=0.000733s;;;0.000000 size=336B;;;0 neo@monitor:~$ /usr/lib/nagios/plugins/check_http -H www.example.com -I 172.16.0.8 -e '404' HTTP OK: Status line output matched "404" - 336 bytes in 0.001 second response time |time=0.000715s;;;0.000000 size=336B;;;0
$ sudo vim /etc/nagios3/hosts/mysql.cfg define host{ use generic-host ; Inherit default values from a template host_name mysql-master.example.com ; The name we're giving to this host alias Some Remote Host ; A longer name associated with the host address 172.16.1.6 ; IP address of the host hostgroups all,mysql-servers ; Host groups this host is associated with } define service{ use generic-service ; Inherit default values from a template host_name mysql-master.example.com service_description MySQL check_command check_mysql_database!user!passwd!database }
nrpe 插件接收來自nagios-nrpe-server數據報告
cat /etc/nagios3/hosts/host.example.org.cfg define host{ use generic-host ; Inherit default values from a template host_name host.example.org ; The name we're giving to this host alias Some Remote Host ; A longer name associated with the host address 172.16.1.3 ; IP address of the host hostgroups all ; Host groups this host is associated with } # NRPE disk check. define service { use generic-service host_name host.example.org service_description nrpe-disk check_command check_nrpe_1arg!check_all_disks!172.16.1.3 } define service { use generic-service host_name host.example.org service_description nrpe-users check_command check_nrpe_1arg!check_users!172.16.1.3 } define service { use generic-service host_name host.example.org service_description nrpe-swap check_command check_nrpe_1arg!check_swap!172.16.1.3 } define service { use generic-service host_name host.example.org service_description nrpe-procs check_command check_nrpe_1arg!check_procs!172.16.1.3 }
nagios-nrpe-server 的功能是向伺服器發送監控數據
sudo apt-get install nagios-nrpe-server nagios-plugins
/etc/nagios/nrpe.cfg
/etc/nagios/nrpe_local.cfg
$ sudo vim /etc/nagios/nrpe_local.cfg allowed_hosts=172.16.1.2 command[check_users]=/usr/lib/nagios/plugins/check_users -w 5 -c 10 command[check_load]=/usr/lib/nagios/plugins/check_load -w 15,10,5 -c 30,25,20 command[check_zombie_procs]=/usr/lib/nagios/plugins/check_procs -w 5 -c 10 -s Z command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 150 -c 200 command[check_procs]=/usr/lib/nagios/plugins/check_procs -w 150 -c 200 command[check_swap]=/usr/lib/nagios/plugins/check_swap -w 20% -c 10% command[check_all_disks]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -e command[check_disk_root]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p / command[check_disk_home]=/usr/lib/nagios/plugins/check_disk -w 20% -c 10% -p /home command[check_sda_iostat]=/usr/lib/nagios/plugins/check_iostat -d sda -w 100 -c 200 command[check_sdb_iostat]=/usr/lib/nagios/plugins/check_iostat -d sdb -w 100 -c 200 # command[check_uri_user]=/usr/lib/nagios/plugins/check_http -I 127.0.0.1 -p 80 -u http://example.com/test/ok.php # command[check_mysql]=/usr/lib/nagios/plugins/check_mysql -H localhost -u root -ppassword test -P 3306
重啟後生效
/etc/init.d/nagios-nrpe-server restart
Define windows services that should be monitored.
# Define a host for the Windows machine we'll be monitoring # Change the host_name, alias, and address to fit your situation define host{ use windows-server ; Inherit default values from a template host_name remote-windows-host ; The name we're giving to this host alias Remote Windows Host ; A longer name associated with the host address 192.168.1.4 ; IP address of the remote windows host } define service{ use generic-service host_name remote-windows-host service_description NSClient++ Version check_command check_nt!CLIENTVERSION } define service{ use generic-service host_name remote-windows-host service_description Uptime check_command check_nt!UPTIME } define service{ use generic-service host_name remote-windows-host service_description CPU Load check_command check_nt!CPULOAD!-l 5,80,90 } define service{ use generic-service host_name remote-windows-host service_description Memory Usage check_command check_nt!MEMUSE!-w 80 -c 90 } define service{ use generic-service host_name remote-windows-host service_description C:\ Drive Space check_command check_nt!USEDDISKSPACE!-l c -w 80 -c 90 } define service{ use generic-service host_name remote-windows-host service_description W3SVC check_command check_nt!SERVICESTATE!-d SHOWALL -l W3SVC } define service{ use generic-service host_name remote-windows-host service_description Explorer check_command check_nt!PROCSTATE!-d SHOWALL -l Explorer.exe }
檢查命令配置檔案 /etc/nagios-plugins/config/
define command{ command_name check_http_404 command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -e '404' } define command{ command_name check_http_status command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -e '$ARG1$' } define command{ command_name check_http_url command_line /usr/lib/nagios/plugins/check_http -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' -u '$ARG1$' }
預設HTTP健康檢查超時時間是10秒,如果你的網站需要更長的時間才能打開可以使用-t參數修改預設Timeout時間
# 'check_http' command definition define command{ command_name check_http command_line /usr/lib/nagios/plugins/check_http -t 30 -H '$HOSTADDRESS$' -I '$HOSTADDRESS$' }
neo@monitor:~$ /usr/lib/nagios/plugins/check_http -H www.example.com -I 172.16.0.8 -s "HTTs" HTTP CRITICAL: HTTP/1.1 404 Not Found - string not found - 336 bytes in 0.001 second response time |time=0.000733s;;;0.000000 size=336B;;;0 neo@monitor:~$ /usr/lib/nagios/plugins/check_http -H www.example.com -I 172.16.0.8 -e '404' HTTP OK: Status line output matched "404" - 336 bytes in 0.001 second response time |time=0.000715s;;;0.000000 size=336B;;;0
/etc/nagios-plugins/config/mysql.cfg
check_mysql [-d database] [-H host] [-P port] [-s socket] [-u user] [-p password] [-S] /usr/lib64/nagios/plugins/check_mysql -d hx9999_real -H 202.176.120.10 -P 3306 -u test -p password Uptime: 254264 Threads: 16 Questions: 535110791 Slow queries: 21 Opens: 110 Flush tables: 1 Open tables: 81 Queries per second avg: 2104.547
$ /usr/lib64/nagios/plugins/check_mysql --hostname=172.16.1.5 --port=3306 --username=monitor --password=monitor Uptime: 27001 Threads: 8 Questions: 25280156 Slow queries: 14941 Opens: 1389932 Flush tables: 3 Open tables: 128 Queries per second avg: 936.267
cat > /usr/lib64/nagios/plugins/check_mysql_replication <<'EOF'
#!/bin/bash
declare -a slave_is
slave_is=($(mysql -h$1 -umonitor -pxmNhj -e "show slave status\G"|grep Running |awk '{print $2}'))
if [ "${slave_is[0]}" = "Yes" -a "${slave_is[1]}" = "Yes" ]
then
    echo "OK - Slave is running"
    exit 0
else
    echo "Critical - Slave is error"
    exit 2
fi
EOF
sudo chmod +x /usr/lib64/nagios/plugins/check_mysql_replication /usr/lib64/nagios/plugins/check_mysql_replication 172.16.1.4 Critical - slave is error
vim /etc/nagios-plugins/config/mysql.cfg # 'check_mysql_replication' command definition define command{ command_name check_mysql_replication command_line /usr/lib/nagios/plugins/check_mysql_replication $HOSTADDRESS$ } define command{ command_name check_mysql_replication_host command_line /usr/lib/nagios/plugins/check_mysql_replication '$ARG1$' }
nrpe.cfg
cat > /usr/lib64/nagios/plugins/check_mysql_replication <<'EOF'
#!/bin/bash
declare -a slave_is
slave_is=($(mysql -umonitor -pxmNhj -e "show slave status\G"|grep Running |awk '{print $2}'))
if [ "${slave_is[0]}" = "Yes" -a "${slave_is[1]}" = "Yes" ]
then
    echo "OK - slave is running"
    exit 0
else
    echo "Critical - slave is error"
    exit 2
fi
EOF
command[check_mysql_replication]=/usr/lib64/nagios/plugins/check_mysql_replication
/usr/local/nagios/libexec/check_nrpe -H 192.168.1.1
/usr/local/nagios/libexec/check_nrpe -H 192.168.1.1 -c check_mysql_replication
define service {
    host_name 192.168.10.232
    service_description check_mysql_replication
    check_period 24x7
    max_check_attempts 5
    normal_check_interval 3
    retry_check_interval 2
    contact_groups mygroup
    notification_interval 5
    notification_period 24x7
    notification_options w,u,c,r
    check_command check_nrpe!check_mysql_replication
}
$ cat /etc/nagios-plugins/config/disk.cfg # 'check_disk' command definition define command{ command_name check_disk command_line /usr/lib/nagios/plugins/check_disk -w '$ARG1$' -c '$ARG2$' -e -p '$ARG3$' } # 'check_all_disks' command definition define command{ command_name check_all_disks command_line /usr/lib/nagios/plugins/check_disk -w '$ARG1$' -c '$ARG2$' -e } # 'ssh_disk' command definition define command{ command_name ssh_disk command_line /usr/lib/nagios/plugins/check_by_ssh -H '$HOSTADDRESS$' -C '/usr/lib/nagios/plugins/check_disk -w '\''$ARG1$'\'' -c '\''$ARG2$'\'' -e -p '\''$ARG3$'\' } #### # use these checks, if you want to test IPv4 connectivity on IPv6 enabled systems #### # 'ssh_disk_4' command definition define command{ command_name ssh_disk_4 command_line /usr/lib/nagios/plugins/check_by_ssh -H '$HOSTADDRESS$' -C '/usr/lib/nagios/plugins/check_disk -w '\''$ARG1$'\'' -c '\''$ARG2$'\'' -e -p '\''$ARG3$'\' -4 }
WARNING/CRITICAL 報警閾值
-w 10% -c 5% -w 100M -c 50M
-p, --path=PATH, --partition=PARTITION 參數監控路徑,可以一次寫多個參數
$ /usr/lib/nagios/plugins/check_disk -w 10% -c 5% -p / -p /opt -p /boot DISK OK - free space: / 23872 MB (66% inode=92%); /opt 99242 MB (47% inode=93%); /boot 276 MB (63% inode=99%);| /=11767MB;33792;35669;0;37547 /opt=110882MB;199232;210300;0;221369 /boot=160MB;414;437;0;460 $ /usr/lib/nagios/plugins/check_disk -w 100M -c 50M -p / -p /opt -p /boot DISK OK - free space: / 23872 MB (66% inode=92%); /opt 99242 MB (47% inode=93%); /boot 276 MB (63% inode=99%);| /=11768MB;37447;37497;0;37547 /opt=110882MB;221269;221319;0;221369 /boot=160MB;360;410;0;460
-x, --exclude_device=PATH 排除監控路徑
/usr/lib64/nagios/plugins/check_disk -w 10% -c 5% -e -x /bak -x /u01
$ cat disk-smb.cfg # 'check_disk_smb' command definition define command{ command_name check_disk_smb command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' } # 'check_disk_smb_workgroup' command definition define command{ command_name check_disk_smb_workgroup command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' } # 'check_disk_smb_host' command definition define command{ command_name check_disk_smb_host command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' } # 'check_disk_smb_workgroup_host' command definition define command{ command_name check_disk_smb_workgroup_host command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' } # 'check_disk_smb_user' command definition define command{ command_name check_disk_smb_user command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -u '$ARG3$' -p '$ARG4$' -w '$ARG5$' -c '$ARG6$' } # 'check_disk_smb_workgroup_user' command definition define command{ command_name check_disk_smb_workgroup_user command_line /usr/lib/nagios/plugins/check_disk_smb -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' -u '$ARG4$' -p '$ARG5$' } # 'check_disk_smb_host_user' command definition define command{ command_name check_disk_smb_host_user command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -u '$ARG3$' -p '$ARG4$' } # 'check_disk_smb_workgroup_host_user' command definition define command{ command_name check_disk_smb_workgroup_host_user command_line /usr/lib/nagios/plugins/check_disk_smb -a '$HOSTADDRESS$' -H '$ARG1$' -s '$ARG2$' -W '$ARG3$' -u '$ARG4$' -p '$ARG5$' }
$ /usr/lib/nagios/plugins/check_tcp -H 172.16.1.2 -p 80 TCP OK - 0.000 second response time on port 80|time=0.000369s;;;0.000000;10.000000
$ /usr/lib64/nagios/plugins/check_tcp -H localhost -p 11211 -t 5 -E -s 'stats\r\nquit\r\n' -e 'uptime' -M crit TCP OK - 0.001 second response time on port 11211 [STAT pid 29253 STAT uptime 36088 STAT time 1311100189 STAT version 1.4.5 STAT pointer_size 64 STAT rusage_user 3.207512 STAT rusage_system 50.596308 STAT curr_connections 10 STAT total_connections 97372 STAT connection_structures 84 STAT cmd_get 84673 STAT cmd_set 273 STAT cmd_flush 0 STAT get_hits 84336 STAT get_misses 337 STAT delete_misses 0 STAT delete_hits 0 STAT incr_misses 0 STAT incr_hits 0 STAT decr_misses 0 STAT decr_hits 0 STAT cas_misses 0 STAT cas_hits 0 STAT cas_badval 0 STAT auth_cmds 0 STAT auth_errors 0 STAT bytes_read 49280152 STAT bytes_written 46326517326 STAT limit_maxbytes 4294967296 STAT accepting_conns 1 STAT listen_disabled_num 0 STAT threads 4 STAT conn_yields 0 STAT bytes 1345 STAT curr_items 14 STAT total_items 241 STAT evictions 0 STAT reclaimed 135 END]|time=0.000658s;;;0.000000;5.000000