File s390-tools-sles15sp3-03-dbginfo.sh-cleanup-2x-kernel-add-timeout.patch of Package s390-tools.28664
Subject: [PATCH] [BZ 195579] dbginfo.sh: cleanup 2.x kernel & adding timeout
From: Joern Siglen <siglen@de.ibm.com>
Description: dbginfo.sh: stabilization of data collection
Symptom: o script hangup possible on single commands
o missing data collection on error
o overwrite of buffers by diag commands
Problem: hangup and loss of data collection
Solution: rework code regarding
o add timeout function
o remove problematic commands
o include handling improvements
Reproduction: -
Upstream-ID: 68ff79e4c3bec429be7a54c6f9d3426be74fd1d1
Problem-ID: 195579
Upstream-Description:
dbginfo.sh: cleanup 2.x kernel & adding timeout
- cleanup outdated kernel - we no longer have 2.x kernel in support
- add timeout on command execution - ensure not to miss all data because of a
single command hanging
Signed-off-by: Joern Siglen <siglen@de.ibm.com>
Signed-off-by: Jan Hoeppner <hoeppner@linux.ibm.com>
Signed-off-by: Joern Siglen <siglen@de.ibm.com>
--- s390-tools-service.orig/scripts/dbginfo.sh
+++ s390-tools-service/scripts/dbginfo.sh
@@ -12,12 +12,47 @@
LC_ALL=C
export LC_ALL
-# The general name of this script
-readonly SCRIPTNAME="${0##*/}"
+########################################
+# Global used variables
+readonly SCRIPTNAME="${0##*/}" # general name of this script
+#
+readonly DOCKER=$(if which docker >/dev/null 2>&1; then echo "YES"; else echo "NO"; fi)
+readonly HW="$(uname -i 2>/dev/null)"
+# retrieve and split kernel version
+readonly KERNEL_BASE="$(uname -r 2>/dev/null)"
+readonly KERNEL_VERSION=$(echo ${KERNEL_BASE} | cut -d'.' -f1 )
+readonly KERNEL_MAJOR_REVISION=$(echo ${KERNEL_BASE} | cut -d'.' -f2 )
+readonly KERNEL_MINOR_REVISION=$(echo ${KERNEL_BASE} | cut -d'.' -f3 | sed 's/[^0-9].*//g')
+readonly KERNEL_INFO=${KERNEL_VERSION}.${KERNEL_MAJOR_REVISION}.${KERNEL_MINOR_REVISION}
+readonly KVM=$(if which virsh >/dev/null 2>&1; then echo "YES"; else echo "NO"; fi)
+# The file to indicate that another instance of the script is already running
+readonly LOCKFILE="/tmp/${SCRIPTNAME}.lock"
+# check limits for logfiles like /var/log/messages
+readonly LOG_FILE_SIZE_CHECK=50 # max logfile size in MB
+readonly LOG_FILE_AGE_CHECK=7 # age in days to include for size checking
+# distro info
+readonly OSPRETTY="$(cat /etc/os* 2>/dev/null | grep -m1 PRETTY_NAME | sed 's/\"//g')"
+readonly OS_NAME="${OSPRETTY##*=}"
+# The processor ID for the first processor
+readonly PROCESSORID="$(grep -E ".*processor 0:.*" /proc/cpuinfo | \
+ sed 's/.*identification[[:space:]]*\=[[:space:]]*\([[:alnum:]]*\).*/\1/g')"
+readonly PROCESSORVERSION="$(grep -E ".*processor 0:.*" /proc/cpuinfo | \
+ sed 's/.*version[[:space:]]*\=[[:space:]]*\([[:alnum:]]*\).*/\1/g')"
+if test "x${PROCESSORVERSION}" = "xFF" || test "x${PROCESSORVERSION}" = "xff"; then
+ RUNTIME_ENVIRONMENT=$(grep -E "VM00.*Control Program.*" /proc/sysinfo | \
+ sed 's/.*:[[:space:]]*\([[:graph:]]*\).*/\1/g')
+else
+ RUNTIME_ENVIRONMENT="LPAR"
+fi
+readonly TOS=15 # timeout seconds for command execution
+readonly ZDEV_CONF=$(lszdev --configured 2>/dev/null | wc -l)
+readonly ZDEV_OFF=$(lszdev --offline 2>/dev/null | wc -l)
+readonly ZDEV_ONL=$(lszdev --online 2>/dev/null | wc -l)
+paramWORKDIR_BASE="/tmp/" # initial default path
########################################
-# print version info
+# print dbginfo.sh version info
print_version() {
cat <<EOF
${SCRIPTNAME}: Debug information script version %S390_TOOLS_VERSION%
@@ -25,11 +60,9 @@ Copyright IBM Corp. 2002, 2021
EOF
}
-
########################################
# print how to use this script
-print_usage()
-{
+print_usage() {
print_version
cat <<EOF
@@ -41,11 +74,12 @@ This script collects runtime, configurat
a Linux on IBM Z installation for debugging purposes.
It also traces information about z/VM if the Linux runs under z/VM.
+KVM or DOCKER data ist collected on a host serving this.
-
+Default location for data collection and final tar file is "/tmp/".
The collected information is written to a TAR archive named
- /tmp/DBGINFO-[date]-[time]-[hostname]-[processorid].tgz
+ DBGINFO-[date]-[time]-[hostname]-[processorid].tgz
where [date] and [time] are the date and time when debug data is collected.
[hostname] indicates the hostname of the system the data was collected from.
@@ -53,24 +87,82 @@ The [processorid] is taken from the proc
identification.
Options:
-
-d|--directory specify the directory where the data collection
stores the temporary data and the final archive.
-h|--help print this help
-v|--version print version information
-
+ -c|--check online quick check (no data collection)
Please report bugs to: linux390@de.ibm.com
EOF
}
+########################################
+# check for oversize logfiles and missing rotation
+logfile_checker() {
+ local counter
+ local logfile
+ local logfiles
+
+ # find files bigger than recommended
+ counter=$(find $1 -maxdepth 1 -type f -mtime -${LOG_FILE_AGE_CHECK} \
+ -size ${LOG_FILE_SIZE_CHECK}M | wc -l)
+
+ echo " ${counter} logfiles over ${LOG_FILE_SIZE_CHECK} MB"
+ # maybe check for rotation of base names
+ if [ ${counter} -ne 0 ]; then
+ for logfile in $(find $1 -maxdepth 1 -type f -mtime -${LOG_FILE_AGE_CHECK} \
+ -size ${LOG_FILE_SIZE_CHECK}M -print); do
+ # use a neutral separtor ':' as concat is different in some bash
+ # insert the 'blank' for later use in for loop
+ # add the base name before '.' or '-' only for checks
+ logfiles="${logfiles}: ${logfile%%[.-]*}"
+ done
+ # change separator to new line for sorting
+ logfiles=$(echo "${logfiles}" | sed s'/:/\n/g' | sort -u)
+ for logfile in ${logfiles}; do
+ counter=$(ls ${logfile}* 2>/dev/null | wc -l)
+ if [ ${counter} -eq 1 ]; then
+ echo " CHECK - ${logfile} may miss a rotation"
+ else
+ echo " OK - ${logfile}* may have a rotation in place: ${counter} files"
+ fi
+ done
+ fi
+}
+
+########################################
+# print basic info and online checks
+print_check() {
+ print_version
+ cat <<EOF
+
+Hardware platform = ${HW}
+Runtime environment = ${RUNTIME_ENVIRONMENT}
+Kernel version = ${KERNEL_INFO}
+OS version / distro = ${OS_NAME}
+KVM host = ${KVM}
+DOCKER host = ${DOCKER}
+
+Current user = $(whoami) (must be root for data collection)
+Date and time = $(date)
+Uptime =$(uptime)
+Number of coredumps = $(corecumpctl 2>/dev/null | wc -l)
+zdevice onl/conf/offl = ${ZDEV_ONL} / ${ZDEV_CONF} / ${ZDEV_OFF}
+Log file check =$(logfile_checker "/var/log*")
+
+Working directory = $(ls -d ${paramWORKDIR_BASE} 2>&1 && df -k ${paramWORKDIR_BASE})
+$(ls -ltr ${paramWORKDIR_BASE}/DBGINFO*tgz 2>/dev/null | tail -2)
+$(ls ${LOCKFILE} 2>/dev/null && echo " WARNING: dbginfo running since: $(cat ${LOCKFILE})")
+
+This is a console output only - no data was saved using option -c !
+
+EOF
+}
#######################################
# Parsing the command line and pre checks
-#
-paramWORKDIR_BASE="/tmp/"
-
while [ ${#} -gt 0 ]; do
case ${1} in
--help|-h)
@@ -87,11 +179,19 @@ while [ ${#} -gt 0 ]; do
echo "${SCRIPTNAME}: Error: No directory specified for data collection!"
echo
exit 1
+ elif test ! -d "${paramWORKDIR_BASE}"; then
+ echo "${SCRIPTNAME}: Error: The specified directory \"${paramWORKDIR_BASE}\" does not exist!"
+ echo
+ exit 1
else
- # jump to next param, if already last the final shift can do termination
+ # jump to next param
shift
fi
;;
+ --check|-c)
+ print_check
+ exit 0
+ ;;
-*|--*|*)
echo
echo "${SCRIPTNAME}: invalid option \"${1}\""
@@ -100,17 +200,10 @@ while [ ${#} -gt 0 ]; do
exit 1
;;
esac
- # next parameter
+ # next parameter, if already last the final shift will do termination
shift
done
-# check for a valid path
-if test ! -d "${paramWORKDIR_BASE}"; then
- echo "${SCRIPTNAME}: Error: The specified directory \"${paramWORKDIR_BASE}\" does not exist!"
- echo
- exit 1
-fi
-
# finally verification to run as root
if test "$(/usr/bin/id -u 2>/dev/null)" -ne 0; then
echo "${SCRIPTNAME}: Error: You must be user root to run \"${SCRIPTNAME}\"!"
@@ -133,12 +226,6 @@ readonly SYSTEMHOSTNAME="$(hostname -s 2
# The kernel release version as delivered from uname -r
readonly KERNEL_RELEASE_VERSION="$(uname -r 2>/dev/null)"
-# The processor ID for the first processor
-readonly PROCESSORID="$(grep -E ".*processor 0:.*" /proc/cpuinfo | \
- sed 's/.*identification[[:space:]]*\=[[:space:]]*\([[:alnum:]]*\).*/\1/g')"
-# The processor version for the first processor
-readonly PROCESSORVERSION="$(grep -E ".*processor 0:.*" /proc/cpuinfo | \
- sed 's/.*version[[:space:]]*\=[[:space:]]*\([[:alnum:]]*\).*/\1/g')"
# The current date
readonly DATETIME="$(date +%Y-%m-%d-%H-%M-%S 2>/dev/null)"
@@ -158,9 +245,6 @@ readonly WORKARCHIVE="${WORKDIR_BASE}${W
# The log file of activities from this script execution
readonly LOGFILE="${WORKPATH}dbginfo.log"
-# The file to indicate that another instance of the script is already running
-readonly LOCKFILE="/tmp/${SCRIPTNAME}.lock"
-
# File that includes output of Linux commands
readonly OUTPUT_FILE_CMD="${WORKPATH}runtime.out"
@@ -203,39 +287,6 @@ readonly OUTPUT_FILE_KVM="${WORKPATH}kvm
# Mount point of the debug file system
readonly MOUNT_POINT_DEBUGFS="/sys/kernel/debug"
-# The kernel version (e.g. '2' from 2.6.32 or '3' from 3.2.1)
-readonly KERNEL_VERSION=$(uname -r 2>/dev/null | cut -d'.' -f1)
-
-# The kernel major revision number (e.g. '6' from 2.6.32 or '2' from 3.2.1)
-readonly KERNEL_MAJOR_REVISION=$(uname -r 2>/dev/null | cut -d'.' -f2)
-
-# The kernel mainor revision number (e.g. '32' from 2.6.32 or '1' from 3.2.1)
-readonly KERNEL_MINOR_REVISION=$(uname -r 2>/dev/null | cut -d'.' -f3 | sed 's/[^0-9].*//g')
-
-# Is this kernel supporting sysfs - since 2.4 (0=yes, 1=no)
-if test "${KERNEL_VERSION}" -lt 2 ||
- ( test "${KERNEL_VERSION}" -eq 2 && test "${KERNEL_MAJOR_REVISION}" -le 4 ); then
- readonly LINUX_SUPPORT_SYSFS=1
-else
- readonly LINUX_SUPPORT_SYSFS=0
-fi
-
-# Is this kernel potentially using the /sys/kernel/debug feature - since 2.6.13 (0=yes, 1=no)
-if test "${KERNEL_VERSION}" -lt 2 ||
- ( test "${KERNEL_VERSION}" -eq 2 &&
- ( test "${KERNEL_MAJOR_REVISION}" -lt 6 ||
- ( test "${KERNEL_MAJOR_REVISION}" -eq 6 && test "${KERNEL_MINOR_REVISION}" -lt 13 ))); then
- readonly LINUX_SUPPORT_SYSFSDBF=1
-else
- readonly LINUX_SUPPORT_SYSFSDBF=0
-fi
-
-if test "x${PROCESSORVERSION}" = "xFF" || test "x${PROCESSORVERSION}" = "xff"; then
- readonly RUNTIME_ENVIRONMENT=$(grep -E "VM00.*Control Program.*" /proc/sysinfo| sed 's/.*:[[:space:]]*\([[:graph:]]*\).*/\1/g')
-else
- readonly RUNTIME_ENVIRONMENT="LPAR"
-fi
-
# define order of collection steps
ALL_STEPS="\
collect_cmdsout\
@@ -310,26 +361,6 @@ if test -e /proc/scsi; then
"
fi
-# Adding files to PROCFILES in case we run on Kernel 2.4 or older
-if test "${LINUX_SUPPORT_SYSFS}" -eq 1; then
- PROCFILES="${PROCFILES}\
- /proc/chpids\
- /proc/chandev\
- /proc/ksyms\
- /proc/lvm/global\
- /proc/subchannels\
- "
-fi
-
-# Adding s390dbf files to PROCFILE in case we run on Kernel lower than 2.6.13
-if test "${LINUX_SUPPORT_SYSFSDBF}" -eq 1; then
- if test -e /proc/s390dbf; then
- PROCFILES="${PROCFILES}\
- $(find /proc/s390dbf -type f -not -path "*/raw" -not -path "*/flush" 2>/dev/null)\
- "
- fi
-fi
-
########################################
LOGFILES="\
@@ -706,11 +737,7 @@ collect_sysfs() {
local file_name
debugfs_mounted=0
- # Requires kernel version newer then 2.4
- if test "${LINUX_SUPPORT_SYSFS}" -eq 0; then
pr_syslog_stdout "${step_num} Collecting sysfs"
- # Requires kernel version of 2.6.13 or newer
- if test "${LINUX_SUPPORT_SYSFSDBF}" -eq 0; then
if ! grep -qE "${MOUNT_POINT_DEBUGFS}.*debugfs" /proc/mounts 2>/dev/null; then
if mount -t debugfs debugfs "${MOUNT_POINT_DEBUGFS}" >/dev/null 2>&1; then
sleep 2
@@ -719,7 +746,6 @@ collect_sysfs() {
pr_log_stdout "${SCRIPTNAME}: Warning: Unable to mount debugfs at \"${MOUNT_POINT_DEBUGFS}\""
fi
fi
- fi
# Collect sysfs files using multiple threads (-J 1) while excluding
# files known to block on read (-x). Stop reading a file that takes
@@ -748,9 +774,6 @@ collect_sysfs() {
if test ${debugfs_mounted} -eq 1; then
umount "${MOUNT_POINT_DEBUGFS}"
fi
- else
- pr_syslog_stdout "${step_num} Collecting sysfs skipped. Kernel $(uname -r) must be newer than 2.4"
- fi
pr_log_stdout " "
}
@@ -766,7 +789,7 @@ collect_logfiles() {
call_collect_file "${file_name}"
done
- pr_log_stdout " "
+ pr_log_stdout "$(logfile_checker "/var/log*")"
}
@@ -1091,34 +1114,36 @@ post_processing() {
# Be aware that this output must be
# redirected into a separate logfile
call_run_command() {
- local cmd
- local logfile
- local raw_cmd
-
- cmd="${1}"
- logfile="${2}"
- raw_cmd=$(echo "${cmd}" | sed -ne 's/^\([^[:space:]]*\).*$/\1/p')
+ local rc
+ local cmd="${1}"
+ local logfile="${2}"
+ local raw_cmd=$(echo "${cmd}" | sed -ne 's/^\([^[:space:]]*\).*$/\1/p')
echo "#######################################################" >> "${logfile}"
echo "${USER}@${SYSTEMHOSTNAME:-localhost}> ${cmd}" >> "${logfile}"
- # check if command exists
- if ! which "${raw_cmd}" >/dev/null 2>&1; then
- # check if command is a builtin
- if ! command -v "${raw_cmd}" >/dev/null 2>&1; then
- echo "${SCRIPTNAME}: Warning: Command \"${raw_cmd}\" not available" >> "${logfile}"
- echo >> "${logfile}"
- return 1
- fi
+ # check if calling command and timeout exist
+ if which "${raw_cmd}" >/dev/null 2>&1 && which timeout >/dev/null 2>&1; then
+ eval timeout ${TOS} "${cmd}" >> ${logfile} 2>&1
+ rc=$?
+ # check if command is a builtin (no use of timeout possible)
+ elif command -v "${raw_cmd}" >/dev/null 2>&1; then
+ eval "${cmd}" >> ${logfile} 2>&1
+ rc=$?
+ else
+ echo "${SCRIPTNAME}: Warning: Command \"${raw_cmd}\" not available" >> "${logfile}"
+ echo >> "${logfile}"
+ return 1
fi
- if ! eval "${cmd}" >> "${logfile}" 2>&1; then
- echo "${SCRIPTNAME}: Warning: Command \"${cmd}\" failed" >> "${logfile}"
- echo >> "${logfile}"
- return 1
+ # log a warning on rc not 0 and define return
+ if [ ${rc} ]; then
+ echo >> "${logfile}"
+ return 0
else
- echo >> "${logfile}"
- return 0
+ echo "${SCRIPTNAME}: Warning: Command \"${cmd}\" failed" >> "${logfile}"
+ echo >> "${logfile}"
+ return 1
fi
}
@@ -1285,9 +1310,8 @@ pr_syslog_stdout()
logger -t "${SCRIPTNAME}" "$@"
}
-
###############################################################################
-# Running the script
+# Running the script (main)
environment_setup
print_version
@@ -1299,9 +1323,10 @@ exec 8>&1 9>&2 >"${LOGFILE}" 2>&1
trap emergency_exit SIGHUP SIGINT SIGTERM
pr_log_stdout ""
-pr_log_stdout "Hardware platform = $(uname -i)"
-pr_log_stdout "Kernel version = ${KERNEL_VERSION}.${KERNEL_MAJOR_REVISION}.${KERNEL_MINOR_REVISION} ($(uname -r 2>/dev/null))"
+pr_log_stdout "Hardware platform = ${HW}"
+pr_log_stdout "Kernel version = ${KERNEL_INFO} (${KERNEL_BASE})"
pr_log_stdout "Runtime environment = ${RUNTIME_ENVIRONMENT}"
+pr_log_stdout "OS version / distro = ${OS_NAME}"
pr_log_stdout ""
logger -t "${SCRIPTNAME}" "Starting data collection"
--- s390-tools-service.orig/scripts/dbginfo.sh.1
+++ s390-tools-service/scripts/dbginfo.sh.1
@@ -14,8 +14,10 @@ for debugging Linux on IBM Z
This script collects runtime, configuration and trace information that can
be used to debug a Linux on IBM Z instance.
For Linux on z/VM, the script also traces information about the z/VM system.
+KVM or DOCKER data is collected on a host serving this.
+
The debug information is written to a file
-/tmp/DBGINFO\-<date>\-<time>\-<hostname>\-<processorid>.tgz
+/<DIRECTORY>/DBGINFO\-<date>\-<time>\-<hostname>\-<processorid>.tgz
where [date] and [time] are the date and time when the debug data was
collected. [hostname] indicates the hostname of the system the data was
collected from. The [processorid] is taken from the processor 0 and indicates
@@ -32,7 +34,12 @@ Print version information, then exit.
.TP
\fB\-d <DIRECTORY>\fP, \fB\-\-directory <DIRECTORY>\fP
-Specify the DIRECTORY where the data collection stores the temporary data and the final archive. The specified directory must already exist. If this parameter is not specified, /tmp is used by default.
+Specify the DIRECTORY where the data collection stores the temporary data and the final archive.
+The specified directory must already exist. If this parameter is not specified, /tmp is used by default.
+
+.TP
+\fB\-c\fP, \fB\-\-check\fP
+Print online quick check (not saving any data) - can be combined with a preceding -d option.
.SH FILES
A .tgz file of the form
@@ -56,48 +63,57 @@ Copyright IBM Corp. 2002, 2021
.PP
Hardware platform = s390x
.br
-Kernel version = 5.4.0 (5.4.0-70-generic)
-.br
Runtime environment = z/VM
+.br
+Kernel version = 4.18.0 (4.18.0-305.el8.s390x)
+.br
+OS version / distro = Red Hat Enterprise Linux 8.4 (Ootpa)
+.br
+Date and time of info = 2021-08-26-15-27-58
+.PP
+1 of 18: Collecting command output
+.PP
+2 of 18: Collecting hyptop for z/VM - 5s output
+.PP
+3 of 18: Collecting z/VM output
.PP
-1 of 16: Collecting command output
+4 of 18: Collecting procfs
.PP
-2 of 16: Collecting z/VM command output
+5 of 18: Collecting sysfs
.PP
-3 of 16: Collecting procfs
+6 of 18: Collecting config files
.PP
-4 of 16: Collecting sysfs
+7 of 18: Collecting network output
.PP
-5 of 16: Collecting log files
+8 of 18: Collecting osa oat output
.PP
-6 of 16: Collecting config files
+9 of 18: Collecting ethtool output
.PP
-7 of 16: Collecting osa oat output
+10 of 18: Collecting Trafic Control output
.PP
-8 of 16: Collecting ethtool output
+11 of 18: Collecting bridge output
.PP
-9 of 16: Collecting tc output
+12 of 18: Skip OpenVSwitch: ovs-vsctl not available
.PP
-10 of 16: Collecting bridge output
+13 of 18: Skip KVM: no virsh command
.PP
-11 of 16: Collecting OpenVSwitch output
+14 of 18: Collecting docker output
.PP
-12a of 16: Collecting docker container
-12b of 16: Collecting docker network
+15 of 18: Collecting NVME storage output
.PP
-13 of 16: Collecting nvme output
+16 of 18: Collecting log files
.PP
-14 of 16: Collecting KVM data
+ 0 logfiles over 50 MB
.PP
-15 of 16: Postprocessing
+17 of 18: Postprocessing
.PP
-16 of 16: Finalizing: Creating archive with collected data
+18 of 18: Finalizing: Creating archive with collected data
.PP
Collected data was saved to:
.br
- >> /data\-collection/DBGINFO\-2021\-04\-20\-14\-00\-07\-host\-012345.tgz <<
+ >> /data\-collection/DBGINFO\-2021\-08\-26\-15\-27\-58\-host\-012345.tgz <<
.br
-Review the collected data before sending to your service organization.
+Please review all collected data before sending to your service organization.
.SH HINTS
Run the script with root authority.
.br