File pacemaker-libservices-prevent-killing-foreign-process.patch of Package pacemaker.14737
commit 9acc32260562df262a768bd2259372d16d90283b
Author: Jan Pokorný <jpokorny@redhat.com>
Date: Tue Jan 31 20:49:04 2017 +0100
Low: libservices(sync): partially prevent killing foreign process
Do not attempt sending SIGKILL to process for which previous
waitpid(pid, &st, WNOHANG) indicated ECHILD issue. This might
be a result of clashing with another thread or an orthogonal
signal handler getting to reap the child first.
For good measure, repeat non-hanging waitpid test right before
killing -- it won't prevent illustrated race, but will limit it
a bit more. That's unlikely, sad path, hence without regularly
imposed performance penalty. And when we can pay more attention
to prevent killing innocent processes (the code in question is
commonly run as root so nothing will prevent that), we should.
diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c
index ffb74ef42..16f25f351 100644
--- a/lib/services/services_linux.c
+++ b/lib/services/services_linux.c
@@ -521,11 +521,19 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
#endif
wait_rc = waitpid(op->pid, &status, WNOHANG);
- if (wait_rc < 0){
- crm_perror(LOG_ERR, "waitpid() for %d failed", op->pid);
-
- } else if (wait_rc > 0) {
+ if (wait_rc > 0) {
break;
+
+ } else if (wait_rc < 0){
+ if (errno == ECHILD) {
+ /* Here, don't dare to kill and bail out... */
+ break;
+
+ } else {
+ /* ...otherwise pretend process still runs. */
+ wait_rc = 0;
+ }
+ crm_perror(LOG_ERR, "waitpid() for %d failed", op->pid);
}
}
}
@@ -547,9 +555,8 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
crm_trace("Child done: %d", op->pid);
if (wait_rc <= 0) {
- int killrc = kill(op->pid, SIGKILL);
-
op->rc = PCMK_OCF_UNKNOWN_ERROR;
+
if (op->timeout > 0 && timeout <= 0) {
op->status = PCMK_LRM_OP_TIMEOUT;
crm_warn("%s:%d - timed out after %dms", op->id, op->pid, op->timeout);
@@ -558,16 +565,15 @@ action_synced_wait(svc_action_t * op, sigset_t *mask)
op->status = PCMK_LRM_OP_ERROR;
}
- if (killrc && errno != ESRCH) {
- crm_err("kill(%d, KILL) failed: %d", op->pid, errno);
+ /* If only child hasn't been successfully waited for, yet.
+ This is to limit killing wrong target a bit more. */
+ if (wait_rc == 0 && waitpid(op->pid, &status, WNOHANG) == 0) {
+ if (kill(op->pid, SIGKILL)) {
+ crm_err("kill(%d, KILL) failed: %d", op->pid, errno);
+ }
+ /* Safe to skip WNOHANG here as we sent non-ignorable signal. */
+ while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/;
}
- /*
- * From sigprocmask(2):
- * It is not possible to block SIGKILL or SIGSTOP. Attempts to do so are silently ignored.
- *
- * This makes it safe to skip WNOHANG here
- */
- while (waitpid(op->pid, &status, 0) == (pid_t) -1 && errno == EINTR) /*omit*/;
} else if (WIFEXITED(status)) {
op->status = PCMK_LRM_OP_DONE;