#!/usr/bin/perl -w
#
# About:        A script for monitoring Linux HA status, to be used as an
#               NRPE plugin for pacemaker.
# Author:       Martin  Caj mcaj@suse.cz
#               Ruediger Oertel ro@suse.de
# Last change:  2011-10-27
#               2017-03-22
# TODO: 	rewrite res-check to awk ?
#		use awk to check/print more about cluster status
#               add check for sudo 
#		integrate stonith devices check 
# 		
#############################################################################

use strict;
use Getopt::Std;
use Data::Dumper;
#############
# Variables #
#############

my $NODE_NAME=`uname -n|tr "[:upper:]" "[:lower:]"`;
chomp($NODE_NAME);
my $CRM_MON="/usr/sbin/crm_mon";
my $ADMIN_EMAIL="mcaj\@suse.cz";


###################################
# Resources and DRBD are optional #
###################################
# you should define them via /etc/nagios/nrpe.cfg
# e.g.:

#command[check_pacemaker-node]=/usr/lib/nagios/plugins/check_pacemaker-node -r -d drbd_example -m fs_example,extIP,intIP,vpn_example,route_EXT
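
# The nagios user also needs passwordless sudo for crm_mon. A minimal sudoers
# entry could look like this (assuming the NRPE daemon runs as the "nagios"
# user; adjust the user name to your setup, e.g. via visudo):
#
#   nagios ALL=(root) NOPASSWD: /usr/sbin/crm_mon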

######################
# Nagios Error Codes #
######################

my $OK="0"; # = OK
my $WARNING="1"; # = Warning
my $CRITICAL="2"; # = Fatal Error
my $UNKNOWN="3"; # UNKNOWN Error

my $retval;
my $result;
my $smap = ["OK","WARNING","CRITICAL","UNKNOWN"];

#####################
# function def part.#
#####################

#---------------------------------------------------------------------------------------------
# usage:
# Prints the help text and exits with the given return value.
# Used when the script is called without options, with -h, or with invalid arguments.
#---------------------------------------------------------------------------------------------

sub usage {
	my ($retval) = @_;
	print "\n";
	print "This is a Nagios Check for Pacemaker Status.\n";
        print "It has been designed for checking the cluster via NRPE directly on the machine,\n";
	print "where the cluster (e.g.: openais) is running\n";
	print "For this check, the nagios user has to be allow run /usr/sbin/crm_mon as sudo.\n";
	print "Check setting sudo rigths via visudo command.\n";
	print "\n";
        print " usage:\n";
        print "       -h : help\n";
        print "       -p : print list of resources for check\n";
	print "       -r : run check\n";
	print "       -d : check deciding if the node is master or slave (comma separated list)\n";
	print "            all in this group should run on the same node\n";
	print "       -m : resources that should be running on the master (comma separated list)\n";
	print "       -s : resources that should be running on the slave (comma separated list)\n";
	print "       -b : resources that should be running on the both (comma separated list)\n";
	print "\n";
	print "this is a example for an entry in the nagios nrpe file /etc/nagios/nrpe.cfg\n";
	print "command[check_pacemaker-node]=/usr/lib/nagios/plugins/check_pacemaker-node -r -d drbd_example -m fs_example,extIP,intIP,vpn_exemple,route_EXT\n";
	print "\n";
        print "If you have any questins please contact local admins on email \"$ADMIN_EMAIL\"\n";
        print "\n";
        exit $retval;
}

sub print_to_log {
	my ($severity,$message) = @_;
	push @{$result->{$severity}}, $message;
}

sub print_from_short_log {
	print join(", ", @{$result->{'short'}})."\n";
}

sub print_from_log {
	my ($severity) = @_;
	for (@{$result->{$severity}}) {
	    print "$smap->[$severity]: $_\n";
	}
}

#---------------------------------------------------------------------------------------------
# check_node_online:
# Testing the node itself.
# There are three possible cases; the node is:
# 		online  - the script can continue
# 		offline - the HA service isn't running - exit CRITICAL
#		standby - the server is in standby mode, no more checks are needed; the script can continue
#-----------------------------------------------------------------------------------------------
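# For reference, the Online/Offline lists checked here are filled by the -r
# parser from "crm_mon -1" status lines such as (illustrative, node names are
# placeholders):
#   Online: [ node1 node2 ]
#   Offline: [ node3 ]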

sub check_node_online {
	my ($crmstat,$nodename) = @_;
	my $res_status;
	my $retval;
	if (grep { $_ eq $nodename } @{$crmstat->{'Online'}}) {
		$res_status = "Online";
		$retval = $OK;
	} elsif (grep { $_ eq $nodename } @{$crmstat->{'Offline'}}) {
		$res_status = "Offline";
		$retval = $CRITICAL;
	} elsif (grep { $_ eq $nodename } @{$crmstat->{'Standby'}}) {
		$res_status = "Standby";
		$retval = $OK;
	} else {
		$res_status = "Unknown";
		$retval = $UNKNOWN;
	}
	
	print_to_log($retval,"The node \"$nodename\" has status \"$res_status\".");
	print_to_log($retval," Please check the status ASAP!") if $retval;
	print_to_log("short", "$nodename:$res_status");
		
	return ($retval, $res_status);
}

#---------------------------------------------------------------------------------------------
# check_failed:
# Testing that there aren't any failed resources.
# During a migration some resources might fail temporarily, but once the migration
# is done there should be none. If the script finds failed actions it reports
# a WARNING and lists them.
#---------------------------------------------------------------------------------------------

sub check_failed {
	my ($crmstat) = @_;
	my $retval = $OK;
	if ($crmstat->{'Failed'}) {
		$retval = $WARNING;
		print_to_log($retval, "here are some failed actions, please check crm ASAP!");
		print_to_log($retval, "failed actions are:");
		my $flist = join(",",@{$crmstat->{'Failed'}});
		print_to_log($retval,$flist);
		print_to_log("short", "Failed:$flist");
	}
	unless ($crmstat->{'Online'} || $crmstat->{'Offline'} || $crmstat->{'Standby'}) {
		$retval = $UNKNOWN;
		print_to_log($retval, "There is something wrong");
		print_to_log($retval, "Could not parse crm_mon output");
		print_to_log($retval, "Please check if nagios user is allowed to run");
		print_to_log($retval, "\"/usr/sbin/crm_mon\" via sudo");
	}
	return $retval;
}


#---------------------------------------------------------------------------------------------
# check_res:
# Testing a single resource.
# The resource is OK if it is reported as Started on this node, either as a
# plain resource or as a clone instance; otherwise the check is CRITICAL.
#---------------------------------------------------------------------------------------------
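# For reference, check_res() only looks the resource up in
# $crmstat->{'resource'}->{$resname}->{'Started'} (and in 'Clone Set' for
# clones); the -r parser fills these lists from "crm_mon -1" resource lines
# such as (illustrative, exact spacing depends on the crm_mon version):
#    fs_example (ocf::heartbeat:Filesystem): Started node1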

sub check_res {
	my ($crmstat,$nodename,$resname) = @_;
	my $retval;
	if (grep {$_ eq $nodename} @{$crmstat->{'resource'}->{$resname}->{'Started'}}) {
		$retval = $OK;
		print_to_log($retval, "The resource \"$resname\" is fine");
	} elsif (grep {$_ eq $nodename} @{$crmstat->{'Clone Set'}->{$resname}->{'Started'}}) {
		$retval = $OK;
		print_to_log($retval, "The resource \"$resname\" is fine");
	} else {
		$retval = $CRITICAL;
		print_to_log($retval, "The resource \"$resname\" is not running on this node");
		print_to_log("short", "missing:$resname");
	}
	return $retval;
}

#---------------------------------------------------------------------------------------------
# check_ms:
# Testing the DRBD (Master/Slave) status.
# DRBD might be in one of these modes:
# Masters:
# Slaves:
# Stopped: OK if the server is in standby mode, otherwise CRITICAL!
# The master node usually has all services running on itself.
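#
# For reference, a Master/Slave set typically shows up in "crm_mon -1" output
# like this (illustrative, resource and node names are placeholders):
#    Master/Slave Set: ms_drbd_example [drbd_example]
#        Masters: [ node1 ]
#        Slaves: [ node2 ]
# The -r parser stores it under $crmstat->{'Master/Slave Set'}->{'drbd_example'}.
#---------------------------------------------------------------------------------------------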
sub check_ms {
	my ($crmstat,$nodename,$res_ms) = @_;
	my $retval;
	for my $resource (@{$res_ms}) {
	    my $rstate;
	    for my $state ("Masters","Slaves","Stopped") {
		$rstate = $state if grep { $_ eq $nodename } @{$crmstat->{'Master/Slave Set'}->{$resource}->{$state}};
	    }
	    # resource not found in any state: flag it and continue with the next one
	    unless ($rstate) {
		print_to_log($CRITICAL,"M/S resource $resource not found");
		$retval = $CRITICAL;
		next;
	    }
	    # all -d resources are expected to be in the same state on this node
	    if ($retval) {
		$retval = "MISMATCH" unless $retval eq $rstate;
	    } else {
		$retval = $rstate;
	    }
	    if ($retval eq "MISMATCH") {
		print_to_log($WARNING,"M/S resource $resource status $rstate");
	    } else {
		print_to_log($OK,"M/S resource $resource status $rstate");
	    }
	}
	return $retval;
}

################
# running part:#
################

usage (1) unless $#ARGV >= 0;

our($opt_h,$opt_r,$opt_p,$opt_d,$opt_m,$opt_s,$opt_b);
usage(1) unless getopts('hrpd:m:s:b:');

usage(0) if $opt_h;
usage(1) if @ARGV;
usage(1) unless $opt_r || $opt_p;

my ($res_ms,$res_on_master,$res_on_slave,$res_on_both);
@{$res_ms} = split(",",$opt_d || "");
@{$res_on_master} = split(",",$opt_m || "");
@{$res_on_slave} = split(",",$opt_s || "");
@{$res_on_both} = split(",",$opt_b || "");

if ($opt_p) {
	print "The resources for checking are:\n";
	for my $i (0..scalar(@{$res_ms})-1) {
	    print "Master/Slave resource $res_ms->[$i],\n";
	}
	for my $i (0..scalar(@{$res_on_master})-1) {
	    print "Master should have running $res_on_master->[$i],\n";
	}
	for my $i (0..scalar(@{$res_on_slave})-1) {
	    print "Slave should have running $res_on_slave->[$i],\n";
	}
	for my $i (0..scalar(@{$res_on_both})-1) {
	    print "Both should have running $res_on_both->[$i],\n";
	}
	print "\n";
	print "The name of this node is \"$NODE_NAME\".\n";
	print "Provides a summary of cluster's current state via \"sudo $CRM_MON\".\n";
	print "local admin contact is \"$ADMIN_EMAIL\".\n";
	print "\n";
	print "runng one short cluster status....\n";
	system("sudo $CRM_MON -1");
	print "done\n";
	exit $UNKNOWN;
}

if ($opt_r) {
	open (CRM, "sudo $CRM_MON -1 |");
	my @CRMOUT = <CRM>;
	chomp(@CRMOUT);
	close (CRM);
	@CRMOUT = grep { !/^$/ } @CRMOUT;
	my $crmstat;
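	# The loop below turns the "crm_mon -1" output into a nested hash,
	# roughly shaped like this (illustrative only, names are placeholders):
	#   $crmstat = {
	#     'Online'           => [ 'node1', 'node2' ],
	#     'resource'         => { 'fs_example'   => { 'Started' => [ 'node1' ] } },
	#     'Master/Slave Set' => { 'drbd_example' => { 'Masters' => [ 'node1' ],
	#                                                 'Slaves'  => [ 'node2' ] } },
	#     'Failed'           => [ 'failed action strings' ],
	#   };
	# Uncomment the Dumper() call below to inspect the real structure.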
	while ($_ = shift @CRMOUT) {
	    if (/^ /) {
		if (/\[.*\]/) {
		    if (/^\s+(.* Set): (.*) \[(.*)\]/) {
			my $type = $1;
			my $name = $3;
			while ($_ = shift(@CRMOUT), /^\s+\s+/) {
			    push @{$crmstat->{$type}->{$name}->{$1}}, split('\s+',$2) if /^\s+(.*):\s+\[\s+(.*)\s+\]/;
			}
			unshift @CRMOUT, $_;
		    }
		} else {
		    push @{$crmstat->{'resource'}->{$1}->{$3}}, split('\s+',$4) if /^\s+(.*)\s+(.*)\s+(St.*)\s+(.*)$/;
		    push @{$crmstat->{'resource'}->{$2}->{$3}}, split('\s+',$4) if /^\s+(.*)\s+\((.*)\):\s+(St.*)\s+(.*)$/;
		}
	    } else {
		if (/^Online:/ || /^Offline:/) {
		    push @{$crmstat->{$1}}, split('\s+',$2) if /^(.*):\s+\[ (.*) \].*/;
		} elsif (/^Node/) {
		    push @{$crmstat->{Standby}}, split('\s+',$2) if /^(.*):\s+\[ (.*) \].*/;
		} elsif (/^Failed/) {
		    push @{$crmstat->{Failed}}, split('\s+',$2) if /^(.*):\s+(.*)/;
		}
	    }
	}

	# print Dumper($crmstat);

	# the first test: is the node online/offline/standby
	my $res_status;
	($retval, $res_status) = check_node_online($crmstat,$NODE_NAME);

	# test if all resources are running well
	$retval |= check_failed($crmstat);
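	# Exit codes from the sub-checks are OR-ed together bit-wise and later
	# reduced with ($retval & $CRITICAL) / ($retval & $WARNING), so a
	# CRITICAL (2) from any check wins over a WARNING (1).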

	#-------------------------------------
	# DRBD check: MASTER/SLAVE.
	# -m resources should run on the master,
	# -s resources on the slave,
	# -b resources may run on both nodes.
	# -----------------------------------

	if ($res_ms) {
		my $ms_state = check_ms($crmstat,$NODE_NAME,$res_ms);
		if ($ms_state eq "Masters") {
			print_to_log($OK, "Master/Slave resources running as Master");
			print_to_log("short", "master/slave:".join(",",@{$res_ms}).":master");
			for my $i (0..scalar(@{$res_on_master})-1) {
				if ($res_on_master->[$i]) {
					$retval |= check_res($crmstat,$NODE_NAME,$res_on_master->[$i]);
				}
			}
			for my $i (0..scalar(@{$res_on_both})-1) {
				if ($res_on_both->[$i]) {
					$retval |= check_res($crmstat,$NODE_NAME,$res_on_both->[$i]);
				}
			}
			if ($retval & $CRITICAL) {
			    $retval = $CRITICAL;
			} elsif ($retval & $WARNING) {
			    $retval = $WARNING;
			} else {
			    $retval = $OK;
			    print_to_log($retval, "all resources on the Master node \"$NODE_NAME\" are fine ;-)");
			}
		} elsif ($ms_state eq "Slaves") {
			print_to_log($OK, "Master/Slave resources  running as Slave");
			print_to_log("short", "master/slave:".join(",",@{$res_ms}).":slave");
			for my $i (0..scalar(@{$res_on_slave})-1) {
				if ($res_on_slave->[$i]) {
					$retval |= check_res($crmstat,$NODE_NAME,$res_on_slave->[$i]);
				}
			}
			for my $i (0..scalar(@{$res_on_both})-1) {
				if ($res_on_both->[$i]) {
					$retval |= check_res($crmstat,$NODE_NAME,$res_on_both->[$i]);
				}
			}
			if ($retval & $CRITICAL) {
			    $retval = $CRITICAL;
			} elsif ($retval & $WARNING) {
			    $retval = $WARNING;
			} else {
			    $retval = $OK;
			    print_to_log($retval, "all resources on the Slave node \"$NODE_NAME\" are fine ;-)");
			}
		} elsif ($ms_state eq "Stopped") {
			print_to_log($OK, "Master/Slave resources stopped on this node");
			print_to_log("short", "master/slave:".join(",",@{$res_ms}).":stopped");
			print_to_log($OK, "No resources are running on this node \"$NODE_NAME\"");
			if ($res_status eq "Standby") {
				$retval = $OK;
				print_to_log($retval, "Status: Current server \"$NODE_NAME\" is in \"$res_status\" mode.");
				print_to_log($retval, "To switch it to online mode, you have to do this via crm.");
			} else {
				$retval = $WARNING;
				print_to_log($retval, "Master/Slave resources stopped but server \"$NODE_NAME\" is not in standby mode:");
				print_to_log($retval, "The server status is: \"$res_status\"");
				print_to_log($retval, "There is something wrong. Check the server \"$NODE_NAME\" ASAP!");
			}
		} else {
			$retval = $CRITICAL;
			print_to_log($retval, "Master/Slave resources not running correctly, it has status \"$ms_state\".");
			print_to_log($retval, "There is something wrong. Check the server \"$NODE_NAME\" ASAP!");
			print_to_log("short", "master/slave:mismatched");
		}
	} else {
		$retval = $UNKNOWN;
		print_to_log($retval, "Master/Slave resources not defined. check your nrpe.conf file.");
	}

}

print $smap->[$retval].": ";
print_from_short_log();

for my $severity (3,2,1,0) {
    print_from_log($severity);
}

exit $retval;
# end.