Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:ioguix:check_pgactivity
monitoring-plugins-pacemaker-node
check_pacemaker-node
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File check_pacemaker-node of Package monitoring-plugins-pacemaker-node
#!/usr/bin/perl -w
#
# About: A script for monitoring Linux HA status, to be used as a
#        NRPE Plugin for pacemaker.
# Author: Martin Caj mcaj@suse.cz
#         Ruediger Oertel ro@suse.de
# Last change: 2011-10-27
#              2017-03-22
# TODO: rewrite res-check to awk ?
#       use awk to check/print more about cluster status
#       add check for sudo
#       integrate stonith devices check
#
#############################################################################
use strict;
use Getopt::Std;
use Data::Dumper;
use POSIX ();

#############
# Variables #
#############
# Lower-cased node name of this machine (the "nodename" field of uname),
# obtained without spawning a shell pipeline.
my $NODE_NAME = lc((POSIX::uname())[1]);
# Path of the crm_mon binary; the script runs it through sudo.
my $CRM_MON = "/usr/sbin/crm_mon";
# Contact address printed in the help and "-p" output.
my $ADMIN_EMAIL = "mcaj\@suse.cz";

###################################
# Resources and DRBD are options :#
###################################
# you should define them via /etc/nagios/nrpe.cfg
# eg:
#command[check_pacemaker-nod]=/usr/lib/nagios/plugins/check_pacemaker-nod -r drbd_example fs_example extIP intIP vpn_exemple route_EXT

######################
# Nagios Error Codes #
######################
# Kept numeric on purpose: the script combines them with "|" and "&",
# which are string-bitwise operators when both operands are strings.
my $OK       = 0;    # = OK
my $WARNING  = 1;    # = Warning
my $CRITICAL = 2;    # = Fatal Error
my $UNKNOWN  = 3;    # UNKNOWN Error

# Overall exit code of the check.
my $retval;
# Collected messages: hash ref keyed by severity code (0..3) or "short",
# each value an array ref of message strings.
my $result;
# Maps a severity code to its Nagios label.
my $smap = ["OK","WARNING","CRITICAL","UNKNOWN"];

#####################
# function def part.#
#####################
#---------------------------------------------------------------------------------------------
# usage:
# Printing help if the script is run without any options
# the node is:
# online - script can continue
# offline - HA service isn't running - exit CRITICAL
# standby = the server is in standby mode no more check are need.
# exit OK
#---------------------------------------------------------------------------------------------
# usage($retval)
#   Prints the help text to STDOUT and terminates the script.
#   $retval - exit code to terminate with. When it is omitted the script
#             exits with $UNKNOWN instead of an undefined (i.e. 0 / OK) status.
sub usage {
    my ($retval) = @_;
    $retval = $UNKNOWN unless defined $retval;

    print <<"EOT";

This is a Nagios Check for Pacemaker Status.
It has been designed for checking the cluster via NRPE directly on the machine,
where the cluster (e.g.: openais) is running
For this check, the nagios user has to be allow run /usr/sbin/crm_mon as sudo.
Check setting sudo rigths via visudo command.

 usage:
 -h : help
 -p : print list of resources for check
 -r : run check
 -d : check deciding if the node is master or slave (comma separated list)
 all in this group should run on the same node
 -m : resources that should be running on the master (comma separated list)
 -s : resources that should be running on the slave (comma separated list)
 -b : resources that should be running on the both (comma separated list)

this is a example for an entry in the nagios nrpe file /etc/nagios/nrpe.cfg
command[check_pacemaker-node]=/usr/lib/nagios/plugins/check_pacemaker-node -r -d drbd_example -m fs_example,extIP,intIP,vpn_exemple,route_EXT

If you have any questins please contact local admins on email "$ADMIN_EMAIL"

EOT
    exit $retval;
}

# print_to_log($severity, $message)
#   Appends $message to the list stored in $result under $severity
#   (a numeric severity code or the key "short").
sub print_to_log {
    my ($severity, $message) = @_;
    push @{ $result->{$severity} }, $message;
}

# print_from_short_log()
#   Prints all "short" entries on one line, joined by ", ".
#   Prints just a newline when nothing was logged.
sub print_from_short_log {
    # Guard the dereference: with no "short" entries there is no array yet.
    my $short = ($result && $result->{'short'}) || [];
    print join(", ", @{$short}) . "\n";
}

# print_from_log($severity)
#   Prints every message logged under the numeric $severity, one per line,
#   prefixed with the matching label from $smap.
sub print_from_log {
    my ($severity) = @_;
    my $messages = ($result && $result->{$severity}) || [];
    my $label = $smap->[$severity];
    print "$label: $_\n" for @{$messages};
}

#---------------------------------------------------------------------------------------------
# usage:
# Testing node itself
# there are three possible cases
# the node is:
# online - script can continue
# offline - HA service isn't running - exit CRITICAL
# standby = the server is in standby mode no more
# check are need. it can continue
#-----------------------------------------------------------------------------------------------
# check_node_online($crmstat, $nodename)
#   $crmstat  - hash ref holding the parsed crm_mon data; the keys 'Online',
#               'Offline' and 'Standby' are read as array refs of node names
#   $nodename - name of the node to look up
#   Returns the list (severity code, status string), where the status is
#   "Online", "Offline", "Standby" or "Unknown". The outcome is also logged
#   via print_to_log, including a "short" entry "<node>:<status>".
sub check_node_online {
    my ($crmstat, $nodename) = @_;

    # Probed in this order; the first list that names the node decides.
    my @probes = (
        [ "Online",  $OK ],
        [ "Offline", $CRITICAL ],
        [ "Standby", $OK ],
    );

    # Fallback when the node appears in none of the lists.
    my ($res_status, $retval) = ("Unknown", $UNKNOWN);
    for my $probe (@probes) {
        my ($state, $code) = @{$probe};
        next unless grep { $_ eq $nodename } @{$crmstat->{$state}};
        ($res_status, $retval) = ($state, $code);
        last;
    }

    print_to_log($retval, "The node \"$nodename\" has status \"$res_status\".");
    # Only non-OK results get the extra call for attention.
    if ($retval) {
        print_to_log($retval, " Please check the status ASAP!");
    }
    print_to_log("short", "$nodename:$res_status");

    return ($retval, $res_status);
}

#---------------------------------------------------------------------------------------------
# usage:
# Testing if there aren't any failed resources.
# During a migration some resources might fail, but when the migration is done
# there should be none. Failed actions are reported as WARNING; crm_mon output
# without any node information is reported as UNKNOWN.
#---------------------------------------------------------------------------------------------
# check_failed($crmstat)
#   $crmstat - hash ref holding the parsed crm_mon data (may be undef when
#              nothing could be parsed)
#   Returns $WARNING when failed actions were recorded under 'Failed',
#   $UNKNOWN when no node is listed under 'Online', 'Offline' or 'Standby',
#   and $OK otherwise. Findings are logged via print_to_log.
sub check_failed {
    my ($crmstat) = @_;
    my $stat = $crmstat || {};
    my $retval = $OK;

    if ($stat->{'Failed'}) {
        $retval = $WARNING;
        my $flist = join(",", @{$stat->{'Failed'}});
        print_to_log($retval, "here are some failed actions, please check crm ASAP!");
        print_to_log($retval, "failed actions are:");
        print_to_log($retval, $flist);
        print_to_log("short", "Failed:$flist");
    }

    # Count the node lists that actually contain names. Testing the array
    # references themselves is not enough: a reference is true even when the
    # array is empty (e.g. one autovivified by an earlier lookup).
    my $lists_with_nodes = grep { scalar @{ $stat->{$_} || [] } } ("Online", "Offline", "Standby");
    unless ($lists_with_nodes) {
        $retval = $UNKNOWN;
        print_to_log($retval, "There is something wrong");
        print_to_log($retval, "Could not parse crm_mon output");
        print_to_log($retval, "Please check if nagios user is allowed to run");
        print_to_log($retval, "\"/usr/sbin/crm_mon\" via sudo");
    }

    return $retval;
}

#---------------------------------------------------------------------------------------------
# usage:
# Testing Resources
#
# check_res($crmstat, $nodename, $resname)
#   $crmstat  - hash ref holding the parsed crm_mon data
#   $nodename - node the resource is expected on
#   $resname  - resource name to look up
#   Returns $OK when $nodename is listed as 'Started' for $resname, either as
#   a plain resource or as a 'Clone Set'; otherwise $CRITICAL, with an
#   additional "short" log entry "missing:<resname>".
#---------------------------------------------------------------------------------------------
sub check_res {
    my ($crmstat, $nodename, $resname) = @_;
    my $stat = $crmstat || {};

    for my $kind ("resource", "Clone Set") {
        my $entry = ($stat->{$kind} || {})->{$resname} || {};
        next unless grep { $_ eq $nodename } @{ $entry->{'Started'} || [] };
        print_to_log($OK, "The resource \"$resname\" is fine");
        return $OK;
    }

    print_to_log($CRITICAL, "The resource \"$resname\" is not running on this node");
    print_to_log("short", "missing:$resname");
    return $CRITICAL;
}

#---------------------------------------------------------------------------------------------
# usage:
# Testing drbd status
# drbd might be in the mode :
# Masters:
# Slaves:
# Stopped: if the server is in the standby mode - exit OK else CRITICAL !
# the master node usually has all services running on itself.
#---------------------------------------------------------------------------------------------
# check_ms($crmstat, $nodename, $res_ms)
#   $crmstat  - hash ref holding the parsed crm_mon data; read under the key
#               'Master/Slave Set'
#   $nodename - node whose role is determined
#   $res_ms   - array ref of master/slave resource names
#   Returns "Masters", "Slaves" or "Stopped" when every listed resource has
#   that state on $nodename, "MISMATCH" when the states differ, $CRITICAL when
#   at least one resource was not found for this node, and undef for an empty
#   resource list. Per-resource findings are logged via print_to_log.
sub check_ms {
    my ($crmstat, $nodename, $res_ms) = @_;
    my $sets = ($crmstat && $crmstat->{'Master/Slave Set'}) || {};
    my $common;         # state shared by all resources so far, or "MISMATCH"
    my $missing = 0;    # set once a resource is not found for this node

    for my $resource (@{$res_ms}) {
        my $set = $sets->{$resource} || {};
        my @states;
        for my $state ("Masters", "Slaves", "Stopped") {
            push @states, $state
                if grep { $_ eq $nodename } @{ $set->{$state} || [] };
        }

        unless (@states) {
            print_to_log($CRITICAL, "M/S resource $resource not found");
            $missing = 1;
            next;
        }

        for my $state (@states) {
            if (!defined $common) {
                $common = $state;
            }
            elsif ($common ne $state) {
                $common = "MISMATCH";
            }
        }

        # As before, the last matching state is the one that gets reported.
        my $rstate = $states[-1];
        my $severity = ($common eq "MISMATCH") ? $WARNING : $OK;
        print_to_log($severity, "M/S resource $resource status $rstate");
    }

    return $CRITICAL if $missing;
    return $common;
}

# _parse_crm_mon(@lines)
#   Turns the chomped output lines of "crm_mon -1" into the $crmstat hash ref:
#     {Online}/{Offline}/{Standby}/{Failed}  - array refs of words
#     {'<... Set>'}{<name in brackets>}{<state>} - array refs of node names
#     {resource}{<name>}{<St... state>}       - array refs of node names
#   Returns undef when no line could be interpreted at all.
sub _parse_crm_mon {
    my @lines = grep { !/^$/ } @_;
    my $crmstat;

    while (defined(my $line = shift @lines)) {
        if ($line =~ /^ /) {
            # Indented lines describe resources.
            if ($line =~ /\[.*\]/) {
                if ($line =~ /^\s+(.* Set): (.*) \[(.*)\]/) {
                    my ($type, $name) = ($1, $3);
                    # The deeper indented lines that follow list the nodes
                    # per state; peek first so the next entry stays queued.
                    while (@lines && $lines[0] =~ /^\s+\s+/) {
                        my $detail = shift @lines;
                        if ($detail =~ /^\s+(.*):\s+\[\s+(.*)\s+\]/) {
                            push @{$crmstat->{$type}->{$name}->{$1}}, split('\s+', $2);
                        }
                    }
                }
            }
            else {
                if ($line =~ /^\s+(.*)\s+(.*)\s+(St.*)\s+(.*)$/) {
                    push @{$crmstat->{'resource'}->{$1}->{$3}}, split('\s+', $4);
                }
                if ($line =~ /^\s+(.*)\s+\((.*)\):\s+(St.*)\s+(.*)$/) {
                    push @{$crmstat->{'resource'}->{$2}->{$3}}, split('\s+', $4);
                }
            }
        }
        elsif ($line =~ /^(Online|Offline):\s+\[ (.*) \]/i) {
            # NOTE(review): matched case-insensitively because crm_mon appears
            # to print "OFFLINE:" in upper case - confirm against the deployed
            # crm_mon version.
            my $state = ucfirst(lc($1));
            push @{$crmstat->{$state}}, split('\s+', $2);
        }
        elsif ($line =~ /^Node/) {
            if ($line =~ /^(.*):\s+\[ (.*) \].*/) {
                push @{$crmstat->{Standby}}, split('\s+', $2);
            }
            elsif ($line =~ /^Node\s+(\S+).*:\s+standby\b/i) {
                # NOTE(review): accepts the "Node <name>: standby" form as
                # well - confirm against the deployed crm_mon version.
                push @{$crmstat->{Standby}}, $1;
            }
        }
        elsif ($line =~ /^Failed/) {
            # NOTE(review): only text following the colon on the same line is
            # recorded; a bare "Failed actions:" header does not match.
            if ($line =~ /^(.*):\s+(.*)/) {
                push @{$crmstat->{Failed}}, split('\s+', $2);
            }
        }
    }

    return $crmstat;
}

################
# running part:#
################
usage(1) unless @ARGV;

our ($opt_h, $opt_r, $opt_p, $opt_d, $opt_m, $opt_s, $opt_b);
usage(1) unless getopts('hrpd:m:s:b:');
usage(0) if $opt_h;
usage(1) if @ARGV;
# Without -r or -p there is nothing to do; do not fall through to an "OK".
usage(1) unless $opt_r || $opt_p;

my $res_ms        = [ split(",", $opt_d || "") ];
my $res_on_master = [ split(",", $opt_m || "") ];
my $res_on_slave  = [ split(",", $opt_s || "") ];
my $res_on_both   = [ split(",", $opt_b || "") ];

if ($opt_p) {
    print "The resources for checking are:\n";
    print "Master/Slave resource $_,\n" for @{$res_ms};
    print "Master should have running $_,\n" for @{$res_on_master};
    print "Slave should have running $_,\n" for @{$res_on_slave};
    print "Both should have running $_,\n" for @{$res_on_both};
    print "\n";
    print "The name of this node is \"$NODE_NAME\".\n";
    print "Provides a summary of cluster's current state via \"sudo $CRM_MON\".\n";
    print "local admin contact is \"$ADMIN_EMAIL\".\n";
    print "\n";
    print "runng one short cluster status....\n";
    # List form: no shell is involved.
    system("sudo", $CRM_MON, "-1");
    print "done\n";
    exit $UNKNOWN;
}

if ($opt_r) {
    my $crm_fh;
    unless (open($crm_fh, "-|", "sudo", $CRM_MON, "-1")) {
        print "$smap->[$UNKNOWN]: cannot run \"sudo $CRM_MON -1\": $!\n";
        exit $UNKNOWN;
    }
    my @crm_out = <$crm_fh>;
    chomp(@crm_out);
    # A failing crm_mon shows up as missing data in the checks below.
    close($crm_fh);

    my $crmstat = _parse_crm_mon(@crm_out);
    # print Dumper($crmstat);

    # the first test if node is online/offline/standby
    my $res_status;
    ($retval, $res_status) = check_node_online($crmstat, $NODE_NAME);

    # test if all services are running well.
    $retval |= check_failed($crmstat);

    #-------------------------------------
    # Master/Slave (e.g. DRBD) check.
    # The role of this node decides which resource list has to run here:
    # -m resources on the master, -s resources on the slave,
    # -b resources on both.
    #-------------------------------------
    if (@{$res_ms}) {
        my $ms_state = check_ms($crmstat, $NODE_NAME, $res_ms);
        my %role_for = (
            "Masters" => { label => "Master", tag => "master", wanted => $res_on_master },
            "Slaves"  => { label => "Slave",  tag => "slave",  wanted => $res_on_slave },
        );

        if (my $role = $role_for{$ms_state}) {
            print_to_log($OK, "Master/Slave resources running as $role->{label}");
            print_to_log("short", "master/slave:" . join(",", @{$res_ms}) . ":" . $role->{tag});

            for my $resname (@{ $role->{wanted} }, @{$res_on_both}) {
                next unless $resname;
                $retval |= check_res($crmstat, $NODE_NAME, $resname);
            }

            # Collapse the OR-ed bits into one Nagios code, worst first.
            if ($retval & $CRITICAL) {
                $retval = $CRITICAL;
            }
            elsif ($retval & $WARNING) {
                $retval = $WARNING;
            }
            else {
                $retval = $OK;
                print_to_log($retval, "all resources on the $role->{label} node \"$NODE_NAME\" are fine ;-)");
            }
        }
        elsif ($ms_state eq "Stopped") {
            print_to_log($OK, "Master/Slave resources stopped on this node");
            print_to_log("short", "master/slave:" . join(",", @{$res_ms}) . ":stopped");
            print_to_log($OK, "No resources are running on this node \"$NODE_NAME\"");
            if ($res_status eq "Standby") {
                $retval = $OK;
                print_to_log($retval, "Status: Current server \"$NODE_NAME\" is in \"$res_status\" mode.");
                print_to_log($retval, "To switch it to online mode, you have to do this via crm.");
            }
            else {
                $retval = $WARNING;
                print_to_log($retval, "Master/Slave resources stopped but server \"$NODE_NAME\" is not in standby mode:");
                print_to_log($retval, "The server status is: \"$res_status\"");
                print_to_log($retval, "There is something wrong. Check the server \"$NODE_NAME\" ASAP!");
            }
        }
        else {
            $retval = $CRITICAL;
            print_to_log($retval, "Master/Slave resources not running correctly, it has status \"$ms_state\".");
            print_to_log($retval, "There is something wrong. Check the server \"$NODE_NAME\" ASAP!");
            print_to_log("short", "master/slave:mismatched");
        }
    }
    else {
        # No -d list was given, so the role of this node cannot be decided.
        $retval = $UNKNOWN;
        print_to_log($retval, "Master/Slave resources not defined. check your nrpe.conf file.");
    }
}

# Status line first, then the details ordered from worst to best.
print $smap->[$retval] . ": ";
print_from_short_log();
for my $severity (3, 2, 1, 0) {
    print_from_log($severity);
}
exit $retval;
# end.
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor