File check_pacemaker-node of Package monitoring-plugins-pacemaker-node
#!/usr/bin/perl -w
#
# About: A script for monitoring Linux HA status, to be used as a
# NRPE Plugin for pacemaker.
# Author: Martin Caj mcaj@suse.cz
# Ruediger Oertel ro@suse.de
# Last change: 2011-10-27
# 2017-03-22
# TODO: rewrite res-check to awk ?
# use awk to check/print more about cluster status
# add check for sudo
# integrate stonith devices check
#
#############################################################################
use strict;
use Getopt::Std;
use Data::Dumper;
#############
# Variables #
#############
# Lower-cased name of the local node, as printed by "uname -n".
chomp(my $NODE_NAME = `uname -n|tr "[:upper:]" "[:lower:]"`);
# Cluster monitor binary; the script always invokes it through sudo.
my $CRM_MON = '/usr/sbin/crm_mon';
# Contact address printed in the help text and in the -p resource listing.
my $ADMIN_EMAIL = 'mcaj@suse.cz';
###################################
# Resources and DRBD are options :#
###################################
# you should define them via /etc/nagios/nrpe.cfg
# eg:
#command[check_pacemaker-node]=/usr/lib/nagios/plugins/check_pacemaker-node -r -d drbd_example -m fs_example,extIP,intIP,vpn_exemple,route_EXT
######################
# Nagios Error Codes #
######################
# The exit codes are kept as numbers, not strings: the main section combines
# them with the bitwise operators "|" and "&", and on two string operands
# Perl performs a character-wise string operation instead of integer
# arithmetic.
my $OK       = 0;    # = OK
my $WARNING  = 1;    # = Warning
my $CRITICAL = 2;    # = Fatal Error
my $UNKNOWN  = 3;    # UNKNOWN Error
# Overall exit code of the script; set by the check section.
my $retval;
# Collected messages: hash ref mapping a severity code (or the key "short")
# to an array ref of message strings, filled by print_to_log().
my $result;
# Maps an exit code to the label printed in front of each message.
my $smap = ["OK","WARNING","CRITICAL","UNKNOWN"];
#####################
# function def part.#
#####################
#---------------------------------------------------------------------------------------------
# usage:
# Print the help text and exit with the given exit code.
# The help is shown when the script is run without any options, with -h,
# or with options it does not understand.
#---------------------------------------------------------------------------------------------
sub usage {
    # Print the help text to STDOUT and terminate the script.
    #
    # Parameter:
    #   $retval - exit code to terminate with. When the caller passes none,
    #             UNKNOWN is used; otherwise "exit" would receive undef and
    #             the script would end with exit code 0, which is the OK
    #             state.
    # Does not return.
    my ($retval) = @_;
    $retval = $UNKNOWN unless defined $retval;
    print "\n";
    print "This is a Nagios Check for Pacemaker Status.\n";
    print "It has been designed for checking the cluster via NRPE directly on the machine,\n";
    print "where the cluster (e.g.: openais) is running\n";
    print "For this check, the nagios user has to be allow run /usr/sbin/crm_mon as sudo.\n";
    print "Check setting sudo rigths via visudo command.\n";
    print "\n";
    print " usage:\n";
    print " -h : help\n";
    print " -p : print list of resources for check\n";
    print " -r : run check\n";
    print " -d : check deciding if the node is master or slave (comma separated list)\n";
    print " all in this group should run on the same node\n";
    print " -m : resources that should be running on the master (comma separated list)\n";
    print " -s : resources that should be running on the slave (comma separated list)\n";
    print " -b : resources that should be running on the both (comma separated list)\n";
    print "\n";
    print "this is a example for an entry in the nagios nrpe file /etc/nagios/nrpe.cfg\n";
    print "command[check_pacemaker-node]=/usr/lib/nagios/plugins/check_pacemaker-node -r -d drbd_example -m fs_example,extIP,intIP,vpn_exemple,route_EXT\n";
    print "\n";
    print "If you have any questins please contact local admins on email \"$ADMIN_EMAIL\"\n";
    print "\n";
    exit $retval;
}
sub print_to_log {
    # Append $message to the message list kept in the global $result under
    # the key $severity (an exit code, or the string "short" for the
    # one-line summary).
    my ($severity, $message) = @_;
    my $bucket = $result->{$severity} ||= [];
    push @{$bucket}, $message;
}
sub print_from_short_log {
    # Print all "short" summary entries on one line, separated by ", ".
    # When nothing was logged under "short" an empty line is printed; the
    # fallback to an empty list avoids dereferencing an undefined value,
    # which is fatal under "use strict".
    my $short = $result->{'short'} || [];
    print join(", ", @{$short})."\n";
}
sub print_from_log {
    # Print every message logged under the exit code $severity, one per
    # line, each prefixed with the matching label from $smap.
    my ($severity) = @_;
    foreach my $message (@{$result->{$severity}}) {
        print $smap->[$severity], ": ", $message, "\n";
    }
}
#---------------------------------------------------------------------------------------------
# usage:
# Testing node itself
# there are three possible cases
# the node is:
# online - script can continue
# offline - HA service isn't running - exit CRITICAL
# standby - the server is in standby mode, no more checks are needed - script can continue
#-----------------------------------------------------------------------------------------------
sub check_node_online {
    # Look up $nodename in the node lists parsed from crm_mon and log the
    # outcome.
    #
    # Parameters:
    #   $crmstat  - hash ref with the parsed crm_mon output
    #   $nodename - name of the node to look for
    # Returns the list (exit code, status name); the status name is one of
    # "Online", "Offline", "Standby" or "Unknown".
    my ($crmstat, $nodename) = @_;

    # Checked in this order; the first list containing the node wins.
    my @known_states = (
        [ "Online",  $OK ],
        [ "Offline", $CRITICAL ],
        [ "Standby", $OK ],
    );

    my ($res_status, $retval) = ("Unknown", $UNKNOWN);
    for my $entry (@known_states) {
        my ($state, $code) = @{$entry};
        next unless grep { $_ eq $nodename } @{$crmstat->{$state}};
        ($res_status, $retval) = ($state, $code);
        last;
    }

    print_to_log($retval, "The node \"$nodename\" has status \"$res_status\".");
    if ($retval) {
        print_to_log($retval, " Please check the status ASAP!");
    }
    print_to_log("short", "$nodename:$res_status");
    return ($retval, $res_status);
}
#---------------------------------------------------------------------------------------------
# usage:
# Testing whether there are any failed resources.
# During a migration some resources might fail temporarily, but once the
# migration is done there should be none. If failed actions are listed the
# check returns WARNING; if no node status could be parsed it returns UNKNOWN.
#---------------------------------------------------------------------------------------------
sub check_failed {
    # Report failed actions and detect unparsable crm_mon output.
    #
    # Parameter:
    #   $crmstat - hash ref with the parsed crm_mon output
    # Returns WARNING when failed actions were parsed, UNKNOWN when no node
    # was found in any of the Online/Offline/Standby lists (this takes
    # precedence over WARNING), and OK otherwise.
    my ($crmstat) = @_;
    my $retval = $OK;
    if ($crmstat->{'Failed'}) {
        $retval = $WARNING;
        print_to_log($retval, "here are some failed actions, please check crm ASAP!");
        print_to_log($retval, "failed actions are:");
        my $flist = join(",",@{$crmstat->{'Failed'}});
        print_to_log($retval,$flist);
        print_to_log("short", "Failed:$flist");
    }
    # Count the parsed nodes instead of testing whether the hash slots are
    # true: check_node_online() runs first and its grep over these lists
    # autovivifies them as empty arrays, which are true although nothing
    # was parsed.
    my $nodes_seen = 0;
    for my $state ('Online', 'Offline', 'Standby') {
        my $nodes = $crmstat->{$state};
        $nodes_seen += scalar(@{$nodes}) if $nodes;
    }
    unless ($nodes_seen) {
        $retval = $UNKNOWN;
        print_to_log($retval, "There is something wrong");
        print_to_log($retval, "Could not parse crm_mon output");
        print_to_log($retval, "Please check if nagios user is allowed to run");
        print_to_log($retval, "\"/usr/sbin/crm_mon\" via sudo");
    }
    return $retval;
}
#---------------------------------------------------------------------------------------------
# usage:
# Testing Resources
# if you have similar resource names, you can exclude some via the command grep -v
# an example is used in check_res
#---------------------------------------------------------------------------------------------
sub check_res {
    # Check that resource $resname is started on node $nodename.
    #
    # Parameters:
    #   $crmstat  - hash ref with the parsed crm_mon output
    #   $nodename - node the resource has to run on
    #   $resname  - name of the resource
    # Returns OK when the node is listed as "Started" for the resource,
    # either as a plain resource or as a clone set; CRITICAL otherwise.
    my ($crmstat, $nodename, $resname) = @_;

    my $runs_here = 0;
    for my $section ('resource', 'Clone Set') {
        my @hits = grep { $_ eq $nodename }
            @{$crmstat->{$section}->{$resname}->{'Started'}};
        next unless @hits;
        $runs_here = 1;
        last;
    }

    if ($runs_here) {
        print_to_log($OK, "The resource \"$resname\" is fine");
        return $OK;
    }

    print_to_log($CRITICAL, "The resource \"$resname\" is not running on this node");
    print_to_log("short", "missing:$resname");
    return $CRITICAL;
}
#---------------------------------------------------------------------------------------------
# usage:
# Testing drbd status
# drbd Might be in the mode :
# Masters:
# Slaves:
# Stopped: if the server is in standby mode - exit OK, else CRITICAL!
# the master node usually has all services running on itself.
sub check_ms {
    # Determine the common Master/Slave state of this node over all
    # resources in $res_ms.
    #
    # Parameters:
    #   $crmstat  - hash ref with the parsed crm_mon output
    #   $nodename - node to look for
    #   $res_ms   - array ref with the names of the Master/Slave resources
    # Returns "Masters", "Slaves" or "Stopped" when the node has that state
    # for every resource, "MISMATCH" when the states differ, the CRITICAL
    # code when the last resource checked was not found, and undef when
    # $res_ms is empty.
    my ($crmstat, $nodename, $res_ms) = @_;
    my $retval;
    for my $resource (@{$res_ms}) {
        my $rstate;
        for my $state ("Masters", "Slaves", "Stopped") {
            next unless grep { $_ eq $nodename }
                @{$crmstat->{'Master/Slave Set'}->{$resource}->{$state}};
            $rstate = $state;
            # Fold the state into the overall result; any difference from
            # what was seen so far (including an earlier "not found")
            # turns it into a mismatch.
            if (!defined $retval) {
                $retval = $state;
            } elsif ($retval ne $state) {
                $retval = "MISMATCH";
            }
        }
        # A resource the node is not listed for has no state to report.
        # Handle it first so that no undefined value is compared or
        # printed and no OK-level status line is logged for it.
        unless (defined $rstate) {
            print_to_log($CRITICAL, "M/S resource $resource not found");
            $retval = $CRITICAL;
            next;
        }
        if ($retval eq "MISMATCH") {
            print_to_log($WARNING, "M/S resource $resource status $rstate");
        } else {
            print_to_log($OK, "M/S resource $resource status $rstate");
        }
    }
    return $retval;
}
################
# running part:#
################
# Without any argument no action can be selected: show the help.
usage(1) unless @ARGV;
our($opt_h,$opt_r,$opt_p,$opt_d,$opt_m,$opt_s,$opt_b);
# getopts() returns false when it meets an option it does not know. Pass an
# explicit non-zero code: a bare "usage" call would hand undef to "exit"
# and the script would end with exit code 0, the OK state.
usage(1) unless getopts('hrpd:m:s:b:');
usage(0) if $opt_h;
# Resource names are passed as comma separated option values, never as
# additional arguments.
usage(1) if @ARGV;
# Neither -r (run check) nor -p (print resources) was given: there is
# nothing to do, and falling through to the output section would report a
# status although no check was run.
usage(1) unless $opt_r || $opt_p;
# Comma separated option values become array refs; a missing option gives
# an empty list.
my $res_ms        = [ split(",", $opt_d || "") ];
my $res_on_master = [ split(",", $opt_m || "") ];
my $res_on_slave  = [ split(",", $opt_s || "") ];
my $res_on_both   = [ split(",", $opt_b || "") ];
# -p: list the configured resources, show one crm_mon status run and leave
# with the UNKNOWN exit code, because no check is performed in this mode.
if ($opt_p) {
    print "The resources for checking are:\n";
    foreach my $name (@{$res_ms}) {
        print "Master/Slave resource $name,\n";
    }
    foreach my $name (@{$res_on_master}) {
        print "Master should have running $name,\n";
    }
    foreach my $name (@{$res_on_slave}) {
        print "Slave should have running $name,\n";
    }
    foreach my $name (@{$res_on_both}) {
        print "Both should have running $name,\n";
    }
    print "\n",
          "The name of this node is \"$NODE_NAME\".\n",
          "Provides a summary of cluster's current state via \"sudo $CRM_MON\".\n",
          "local admin contact is \"$ADMIN_EMAIL\".\n",
          "\n",
          "runng one short cluster status....\n";
    system("sudo $CRM_MON -1");
    print "done\n";
    exit $UNKNOWN;
}
# -r: run the check. Parse one "crm_mon -1" status run into $crmstat and
# evaluate the node state, failed actions and the configured resources.
if ($opt_r) {
    # Read the one-shot cluster status through sudo. The list form of the
    # pipe open runs the command without a shell; a failure to start it is
    # logged instead of being silently treated as empty output.
    my @CRMOUT;
    if (open(my $crm_fh, '-|', 'sudo', $CRM_MON, '-1')) {
        @CRMOUT = <$crm_fh>;
        close($crm_fh);
    } else {
        print_to_log($UNKNOWN, "Could not run \"sudo $CRM_MON -1\": $!");
    }
    chomp(@CRMOUT);
    @CRMOUT = grep { !/^$/ } @CRMOUT;

    # Parsed status:
    #   {Online|Offline|Standby|Failed}       => [ words ... ]
    #   {'<type> Set'}{<name>}{<state>}       => [ node names ... ]
    #   {'resource'}{<name>}{<state>}         => [ node names ... ]
    my $crmstat;
    # "defined" keeps the loop going on a line that is just "0".
    while (defined(my $line = shift @CRMOUT)) {
        if ($line =~ /^ /) {
            # Indented lines describe resources.
            if ($line =~ /\[.*\]/) {
                if ($line =~ /^\s+(.* Set): (.*) \[(.*)\]/) {
                    my $type = $1;
                    my $name = $3;
                    # The deeper indented lines that follow list the nodes
                    # per state. Peek at the next line instead of shifting
                    # it and pushing it back, so that no undef ends up in
                    # the queue at the end of the input.
                    while (@CRMOUT && $CRMOUT[0] =~ /^\s+\s+/) {
                        my $member = shift @CRMOUT;
                        push @{$crmstat->{$type}->{$name}->{$1}}, split('\s+',$2)
                            if $member =~ /^\s+(.*):\s+\[\s+(.*)\s+\]/;
                    }
                }
            } else {
                # Plain resource line. Both patterns may match the same
                # line; each match is recorded under the key it captures.
                push @{$crmstat->{'resource'}->{$1}->{$3}}, split('\s+',$4)
                    if $line =~ /^\s+(.*)\s+(.*)\s+(St.*)\s+(.*)$/;
                push @{$crmstat->{'resource'}->{$2}->{$3}}, split('\s+',$4)
                    if $line =~ /^\s+(.*)\s+\((.*)\):\s+(St.*)\s+(.*)$/;
            }
        } else {
            # Unindented lines carry the node lists and failed actions.
            if ($line =~ /^Online:/ || $line =~ /^Offline:/) {
                push @{$crmstat->{$1}}, split('\s+',$2)
                    if $line =~ /^(.*):\s+\[ (.*) \].*/;
            } elsif ($line =~ /^Node/) {
                push @{$crmstat->{Standby}}, split('\s+',$2)
                    if $line =~ /^(.*):\s+\[ (.*) \].*/;
            } elsif ($line =~ /^Failed/) {
                push @{$crmstat->{Failed}}, split('\s+',$2)
                    if $line =~ /^(.*):\s+(.*)/;
            }
        }
    }
    # print Dumper($crmstat);

    # First test: is the node online/offline/standby?
    my $res_status;
    ($retval, $res_status) = check_node_online($crmstat,$NODE_NAME);
    # Test for failed actions and unparsable output.
    $retval |= check_failed($crmstat);

    #-------------------------------------
    # Master/Slave (DRBD) check.
    # The state of the -d resources decides which resource lists must be
    # running on this node: -m on a master, -s on a slave, -b on both.
    #-------------------------------------
    # $res_ms is always an array reference, so test for elements; testing
    # the reference itself is always true.
    if (@{$res_ms}) {
        my $ms_state = check_ms($crmstat,$NODE_NAME,$res_ms);
        if ($ms_state eq "Masters" || $ms_state eq "Slaves") {
            my ($role, $role_resources) = $ms_state eq "Masters"
                ? ("Master", $res_on_master)
                : ("Slave",  $res_on_slave);
            print_to_log($OK, "Master/Slave resources running as $role");
            print_to_log("short", "master/slave:".join(",",@{$res_ms}).":".lc($role));
            for my $resname (@{$role_resources}, @{$res_on_both}) {
                next unless $resname;
                $retval |= check_res($crmstat,$NODE_NAME,$resname);
            }
            # Reduce the OR-ed codes to one exit code. UNKNOWN (3) has both
            # the WARNING and the CRITICAL bit set and is therefore
            # reported as CRITICAL here.
            if ($retval & $CRITICAL) {
                $retval = $CRITICAL;
            } elsif ($retval & $WARNING) {
                $retval = $WARNING;
            } else {
                $retval = $OK;
                print_to_log($retval, "all resources on the $role node \"$NODE_NAME\" are fine ;-)");
            }
        } elsif ($ms_state eq "Stopped") {
            print_to_log($OK, "Master/Slave resources stopped on this node");
            print_to_log("short", "master/slave:".join(",",@{$res_ms}).":stopped");
            print_to_log($OK, "No resources are running on this node \"$NODE_NAME\"");
            if ($res_status eq "Standby") {
                $retval = $OK;
                print_to_log($retval, "Status: Current server \"$NODE_NAME\" is in \"$res_status\" mode.");
                print_to_log($retval, "To switch it to online mode, you have to do this via crm.");
            } else {
                $retval = $WARNING;
                print_to_log($retval, "Master/Slave resources stopped but server \"$NODE_NAME\" is not in standby mode:");
                print_to_log($retval, "The server status is: \"$res_status\"");
                print_to_log($retval, "There is something wrong. Check the server \"$NODE_NAME\" ASAP!");
            }
        } else {
            $retval = $CRITICAL;
            print_to_log($retval, "Master/Slave resources not running correctly, it has status \"$ms_state\".");
            print_to_log($retval, "There is something wrong. Check the server \"$NODE_NAME\" ASAP!");
            print_to_log("short", "master/slave:mismatched");
        }
    } else {
        $retval = $UNKNOWN;
        print_to_log($retval, "Master/Slave resources not defined. check your nrpe.conf file.");
    }
}
# Final report: status label with the one-line summary first, then all
# collected messages from the most to the least severe, then exit with the
# overall code.
# No check ran if $retval is still undefined. Report UNKNOWN in that case:
# an undefined value would otherwise select the "OK" label and make the
# script exit with code 0.
$retval = $UNKNOWN unless defined $retval;
print $smap->[$retval].": ";
print_from_short_log();
for my $severity (3,2,1,0) {
    print_from_log($severity);
}
exit $retval;
# end.