File check_puppet_agent of Package monitoring-plugins-puppet_agent
#!/bin/bash
# Nagios plugin to monitor Puppet agent state
#
# Copyright (c) 2011 Alexander Swen <a@swen.nu>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#
# Example configuration
#
# Typically this check is placed on a client and runs via nrpe.
# So add this to nrpe.cfg:
# command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w 3600 -c 7200 -s /var/lib/puppet/state/last_run_summary.yaml -d 0
# This should warn when the agent hasn't run for an hour and go critical after two hours.
# If you have dont_blame_nrpe=1 set you can choose to pass the arguments from the Nagios server instead:
# command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w $ARG1$ -c $ARG2$ -s $ARG3$ -d $ARG4$
#
# define service {
# use generic-service
# service_description Puppet agent
# check_command check_nrpe!check_puppet_agent
# or
# check_command check_nrpe!check_puppet_agent!3600!7200
#}
#
# NOTE:
# This script needs a line like so in the sudoers:
# nagios ALL=NOPASSWD:/usr/bin/puppet,/usr/lib/nagios/plugins/check_puppet_agent,/bin/kill
#
# CHANGELOG:
# 20120126 A.Swen created.
# 20120214 trey85stang Modified, added getopts, usage, defaults
# 20120220 A.Swen lastrunfile can be overridden
# 20130717 A.Swen Moved finding lastrunfile to after getopts and made it conditional to param -s
# Added option to tell script if puppet agent is started from cron or as a daemon (-d)
# Switched to use awk to filter values from lastrunfile and set them as params
# Updated some comments
# Removed bug in search for process (that would previously always find something because grep finds its own process line)
# "puppet config print lastrunfile" has to be run as root. As normal user it yields ~/.puppet/var/state.
# Based on feedback Михайло Масик updated:
# - Puppet --configprint => puppet config print (version 3 has new way of printing config)
# - Added new pattern to search for process
# - Added test kill -0 to see if process is still there
# 20130725 A.Swen Based on feedback Михайло Масик updated a test (removed ! from test)
# 20130725 A.Swen Added sudo to puppet config print pidfile.
# 20131209 Mark Ruys Issue warning when last_run_report.yaml contains errors.
# SETTINGS
# Default thresholds, in seconds since the last Puppet run. The -w and -c
# command line options handled further down replace these values.
WARN=3600
CRIT=7200
# FUNCTIONS
#######################################
# Print the Nagios status line that belongs to an internal result code and
# terminate the plugin with the matching Nagios exit status.
# Globals:   version, config, last_run_human, time_since_last, WARN, CRIT,
#            last_run_log (read; only interpolated into the message text)
# Arguments: $1 - internal result code (0-7)
# Outputs:   one status line on stdout
# Returns:   does not return; exits 0 (OK), 1 (WARNING), 2 (CRITICAL)
#            or 3 (UNKNOWN)
#######################################
result () {
  local rc
  case "${1:-}" in
    0) echo "OK: Puppet agent ${version} running catalogversion ${config}, and executed at ${last_run_human} for last time"; rc=0 ;;
    1) echo "UNKNOWN: last_run_summary.yaml not found, not readable or incomplete"; rc=3 ;;
    2) echo "WARNING: Last run was ${time_since_last} seconds ago. warn is ${WARN}"; rc=1 ;;
    3) echo "CRITICAL: Last run was ${time_since_last} seconds ago. crit is ${CRIT}"; rc=2 ;;
    4) echo "CRITICAL: Puppet daemon not running or something wrong with process"; rc=2 ;;
    5) echo "UNKNOWN: no WARN or CRIT parameters were sent to this check"; rc=3 ;;
    6) echo "CRITICAL: Last run had 1 or more errors. Check the logs"; rc=2 ;;
    7) echo "WARNING: ${last_run_log}"; rc=1 ;;
    # Without this arm an unmapped code would leave rc empty, and a bare
    # "exit" would hand the case statement's status 0 (OK) to Nagios.
    *) echo "UNKNOWN: unexpected internal result code '${1:-}'"; rc=3 ;;
  esac
  exit "${rc}"
}
#######################################
# Print the command line help and terminate the plugin.
# Arguments: none
# Outputs:   help text on stdout (the channel Nagios/NRPE shows to the operator)
# Returns:   does not return; exits 3 so that Nagios reports UNKNOWN for a
#            bad invocation instead of WARNING (which exit status 1 means)
#######################################
usage () {
  printf '%s\n' \
    "" \
    "USAGE: " \
    " $0 [-w 3600] [-c 7200] [-s lastrunfile] [-d0]" \
    " -w warning threshold (default 3600 seconds)" \
    " -c critical threshold (default 7200 seconds)" \
    " -s lastrunfile (default: /var/lib/puppet/state/last_run_summary.yaml)" \
    " -d 0|1: puppet agent should be a daemon(1) or not (0).(default 1)" \
    ""
  exit 3
}
# Succeed only when $1 is a non-empty string made up solely of decimal digits.
# Used to validate option arguments before they reach numeric tests.
is_unsigned_int () {
  case "${1:-}" in
    ''|*[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Parse the command line:
#   -c N     critical threshold in seconds -> CRIT
#   -w N     warning threshold in seconds  -> WARN
#   -d 0|1   whether the agent is expected to run as a daemon -> daemonized
#   -s FILE  path of the last run summary  -> lastrunfile
# Any invalid value or unknown option ends the plugin through usage().
while getopts "c:d:s:w:" opt; do
  case "${opt}" in
    c)
      # Only plain non-negative integers are accepted: values such as "1.5"
      # or "12 34" would make the later -ge comparison fail and let the
      # check fall through to OK.
      if is_unsigned_int "${OPTARG}"; then
        CRIT=${OPTARG}
      else
        usage
      fi
      ;;
    d)
      # argument should be 0 or 1; the digit check runs first so that the
      # numeric tests never see a non-number
      if is_unsigned_int "${OPTARG}" \
        && { [ "${OPTARG}" -eq 0 ] || [ "${OPTARG}" -eq 1 ]; }; then
        daemonized=${OPTARG}
      else
        usage
      fi
      ;;
    s)
      lastrunfile=${OPTARG}
      ;;
    w)
      if is_unsigned_int "${OPTARG}"; then
        WARN=${OPTARG}
      else
        usage
      fi
      ;;
    *)
      usage
      ;;
  esac
done
# Main check sequence. Each step ends the plugin through result() as soon as
# it finds a problem; reaching the last line means everything looked fine.

# if the lastrunfile is not given as a param try to find it ourselves
[[ -n "${lastrunfile}" ]] || lastrunfile=$(sudo /usr/bin/puppet config print lastrunfile)

# check if state file exists, is non-empty and readable. The path is quoted so
# that an empty value (e.g. the lookup above failed) fails the test instead of
# silently passing it.
[[ -s "${lastrunfile}" && -r "${lastrunfile}" ]] || result 1

# check if daemonized was sent, else set default
daemonized=${daemonized:-1}

# if Puppet agent runs as a daemon there should be a process. We can't check so much when it is triggered by cron.
if [ "${daemonized}" -eq 1 ]; then
  # check puppet daemon:
  # I only know the cmd lines for Debian and CentOS/RedHat:
  # (grep -E replaces the obsolescent egrep; "grep -v grep" drops our own pipeline from the listing)
  [ -n "$(ps axf | grep -E "/usr/bin/ruby /usr/sbin/puppetd|/usr/bin/ruby1.8 /usr/bin/puppet agent|/usr/bin/ruby /usr/bin/puppet agent" | grep -v grep)" ] || result 4

  default_pidfile=/var/run/puppet/agent.pid
  if [[ -e "${default_pidfile}" ]]; then
    pidfile=${default_pidfile}
  else
    pidfile=$(sudo /usr/bin/puppet config print pidfile)
  fi

  # if there is a pidfile tell me the pid, else fail
  [[ -f "${pidfile}" ]] || result 4
  pid=$(cat "${pidfile}") || result 4
  # drop stray whitespace, then insist on a plain number before handing it to sudo kill
  pid=${pid//[[:space:]]/}
  case "${pid}" in
    ''|*[!0-9]*) result 4 ;;
  esac

  # see if the process is running
  sudo kill -0 "${pid}" || result 4
  # test if the pid we found in the pidfile is puppet:
  grep -q puppet "/proc/${pid}/cmdline" || result 4
fi

# check when last run happened
last_run=$(awk '/last_run:/ {print $2}' "${lastrunfile}")
# a missing or non-numeric timestamp means the summary is incomplete; stop
# here, before date and the arithmetic below would choke on it
case "${last_run}" in
  ''|*[!0-9]*) result 1 ;;
esac
last_run_human=$(date -d "@${last_run}" +%c)
now=$(date +%s)
time_since_last=$((now-last_run))
[ "${time_since_last}" -ge "${CRIT}" ] && result 3
[ "${time_since_last}" -ge "${WARN}" ] && result 2

# check last_run_report.yaml for errors
# NOTE(review): this WARNING is evaluated before the failure counters below,
# so a run whose report contains err-level lines never reaches the CRITICAL
# result 6 - confirm that this precedence is intended.
lastrunreport=$(sudo /usr/bin/puppet config print lastrunreport)
if [[ -f "${lastrunreport}" ]]; then
  # Within the "logs:" section remember each message and print it when the
  # level line that follows it matches err, alert, emerg or crit.
  last_run_log=$(awk '
/^ [^ ]/ { inside = 0 }
/^ logs:/ { inside = 1 }
inside == 1 {
if ( $1 == "message:" ) { $1 = ""; gsub(/^ "/, ""); gsub(/"$/, ""); message = $0 }
if ( $1 == "level:" && $3 ~ /err|alert|emerg|crit/ ) print message
}
' "${lastrunreport}")
  [[ -n "${last_run_log}" ]] && result 7
fi

# get some more info from the yaml file
config=$(awk '/config:/ {print $2}' "${lastrunfile}")
version=$(awk '/puppet:/ {print $2}' "${lastrunfile}")
failed=$(awk '/failed:/ {print $2}' "${lastrunfile}")
failure=$(awk '/failure:/ {print $2}' "${lastrunfile}")
failed_to_restart=$(awk '/failed_to_restart:/ {print $2}' "${lastrunfile}")

# if any of the values above doesn't return raise an error
# (last_run has already been validated right after it was read)
[[ -z "${config}" || -z "${version}" ]] && result 1
# the counters must be plain numbers, otherwise the comparison below would
# error out and the check would wrongly end up reporting OK
for counter in "${failed}" "${failure}" "${failed_to_restart}"; do
  case "${counter}" in
    ''|*[!0-9]*) result 1 ;;
  esac
done

# if anything went wrong last run => crit
if [ "${failed}" -gt 0 ] || [ "${failure}" -gt 0 ] || [ "${failed_to_restart}" -gt 0 ]; then
  result 6
fi

# if we come here it works!
result 0
# END