openSUSE Build Service > Projects
Sign Up | Log In

View File cman_make_qdisk_heuristics_time_out.patch of Package cluster (Project home:sschapiro:openstack:upstream)

commit da0d0e0e4fee1bac432304f9a792de8bd89c36d2
Author: Lon Hohberger <lhh@redhat.com>
Date:   Tue Sep 21 13:45:20 2010 -0400

    cman: Make qdiskd heuristics time out
    
    Qdiskd heuristics were previously expected to enforce
    their own timeouts.  This patch makes qdiskd count any
    heuristic which has taken longer than qdiskd's
    (interval*(tko-1)) seconds as failed, since such a
    heuristic is not reliable.
    
    A side effect is that now qdiskd will also automatically
    calculate interval and tko counts for all heuristics,
    obviating the need for administrators to do this manually.
    
    Resolves: rhbz#636243
    
    Signed-off-by: Lon Hohberger <lhh@redhat.com>
    Reviewed-by: Fabio M. Di Nitto <fdinitto@redhat.com>

diff --git a/cman/man/qdisk.5 b/cman/man/qdisk.5
index efa3638..4070f48 100644
--- a/cman/man/qdisk.5
+++ b/cman/man/qdisk.5
@@ -189,7 +189,7 @@ master will only grant a node membership if:
 
 .in 12
 (a) CMAN believes the node to be online, and
-.br
+.br
 (b) that node has made enough consecutive, timely writes
 .in 16
 to the quorum disk, and
@@ -448,15 +448,15 @@ for heuristics.  The default score for each heuristic is 1.
 \fIinterval\fP\fB="\fP2\fB"\fP
 .in 12
 This is the frequency (in seconds) at which we poll the heuristic.  The
-default interval for every heuristic is 2 seconds.
+default interval is determined by the qdiskd timeout.
 .in 0
 
 .in 9
 \fItko\fP\fB="\fP1\fB"\fP
 .in 12
 After this many failed attempts to run the heuristic, it is considered DOWN,
-and its score is removed.  The default tko for each heuristic is 1, which 
-may be inadequate for things such as 'ping'.
+and its score is removed.  The default tko for each heuristic is determined
+by the qdiskd timeout.
 .in 8
 \fB/>\fP
 .in 0
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 8ca99f7..617a705 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -1844,7 +1844,11 @@ get_config_data(qd_ctx *ctx, struct h_data *h, int maxh, int *cfh)
 		goto out;
 	}
 
-	*cfh = configure_heuristics(ccsfd, h, maxh);
+	/* Heuristics must report in one cycle before qdiskd itself has
+	 * to report in, so that their scores are available in time.
+	 */
+	*cfh = configure_heuristics(ccsfd, h, maxh,
+				    ctx->qc_interval * (ctx->qc_tko - 1));
 
 	if (*cfh) {
 		if (ctx->qc_flags & RF_MASTER_WINS) {
diff --git a/cman/qdisk/score.c b/cman/qdisk/score.c
index 81ff700..572464d 100644
--- a/cman/qdisk/score.c
+++ b/cman/qdisk/score.c
@@ -75,22 +75,25 @@ restore_signals(void)
   Spin off a user-defined heuristic
  */
 static int
-fork_heuristic(struct h_data *h)
+fork_heuristic(struct h_data *h, struct timespec *now)
 {
 	int pid;
 	char *argv[4];
-	time_t now;
 
 	if (h->childpid) {	
 		errno = EINPROGRESS;
 		return -1;
 	}
 
-	now = time(NULL);
-	if (now < h->nextrun)
+	if (now->tv_sec < h->nextrun.tv_sec ||
+	    now->tv_nsec < h->nextrun.tv_nsec)
 		return 0;
 
-	h->nextrun = now + h->interval;
+	h->nextrun.tv_sec = now->tv_sec + h->interval;
+	h->nextrun.tv_nsec = now->tv_nsec;
+
+	h->failtime.tv_sec = now->tv_sec + h->maxtime;
+	h->failtime.tv_nsec = now->tv_nsec;
 
 	pid = fork();
 	if (pid < 0)
@@ -162,7 +165,7 @@ total_score(struct h_data *h, int max, int *score, int *maxscore)
   Check for response from a user-defined heuristic / script
  */
 static int
-check_heuristic(struct h_data *h, int block)
+check_heuristic(struct h_data *h, int block, struct timespec *now)
 {
 	int ret;
 	int status;
@@ -172,14 +175,40 @@ check_heuristic(struct h_data *h, int block)
 		return 0;
 
 	ret = waitpid(h->childpid, &status, block?0:WNOHANG);
-	if (!block && ret == 0)
+	if (!block && ret == 0) {
 		/* No children exited */
+
+		/* no timeout */
+		if (!h->maxtime)
+			return 0;
+
+		/* If we overran our timeout, the heuristic is dead */
+		if (now->tv_sec > h->failtime.tv_sec ||
+		    (now->tv_sec == h->failtime.tv_sec &&
+		     now->tv_nsec > h->failtime.tv_nsec)) {
+			h->misses = h->tko;
+			h->failed = ETIMEDOUT;
+			if (h->available) {
+				logt_print(LOG_INFO, "Heuristic: '%s' DOWN - "
+					"Exceeded timeout of %d seconds\n",
+					h->program, h->maxtime);
+				h->available = 0;
+			}
+		}
+
 		return 0;
+	}
 
 	h->childpid = 0;
 	if (ret < 0 && errno == ECHILD)
 		/* wrong child? */
 		goto miss;
+
+	/* Timed out previously; this run must be ignored.  */
+	if (h->failed) {
+		h->failed = 0;
+		goto miss;
+	}
 	if (!WIFEXITED(status)) {
 		ret = 0;
 		goto miss;
@@ -188,7 +217,7 @@ check_heuristic(struct h_data *h, int block)
 		ret = 0;
 		goto miss;
 	}
-	
+
 	/* Returned 0 and was not killed */
 	if (!h->available) {
 		h->available = 1;
@@ -222,10 +251,12 @@ miss:
 static int
 fork_heuristics(struct h_data *h, int max)
 {
+	struct timespec now;
 	int x;
 
+	clock_gettime(CLOCK_MONOTONIC, &now);
 	for (x = 0; x < max; x++)
-		fork_heuristic(&h[x]);
+		fork_heuristic(&h[x], &now);
 	return 0;
 }
 
@@ -236,19 +267,49 @@ fork_heuristics(struct h_data *h, int max)
 static int
 check_heuristics(struct h_data *h, int max, int block)
 {
+	struct timespec now;
 	int x;
 
+	clock_gettime(CLOCK_MONOTONIC, &now);
 	for (x = 0; x < max; x++)
-		check_heuristic(&h[x], block);
+		check_heuristic(&h[x], block, &now);
 	return 0;
 }
 
 
+/*
+ * absmax should be qdiskd (interval * (tko-1))
+ */
+static void
+auto_heuristic_timing(int *interval, int *tko, int absmax)
+{
+	if (!interval || ! tko)
+		return;
+
+	if (absmax < 3)
+		return;
+
+	if (absmax <= 4) {
+		*interval = 1;
+	} else if (absmax <= 22) {
+		*interval = 2;
+	} else if (absmax <= 39) {
+		*interval = 3;
+	} else if (absmax <= 50) {
+		*interval = 4;
+	} else {
+		*interval = 5;
+	}
+
+	*tko = absmax / (*interval);
+}
+
+
 /**
   Read configuration data from CCS into the array provided
  */
 int
-configure_heuristics(int ccsfd, struct h_data *h, int max)
+configure_heuristics(int ccsfd, struct h_data *h, int max, int maxtime)
 {
 	int x = 0;
 	char *val;
@@ -261,11 +322,14 @@ configure_heuristics(int ccsfd, struct h_data *h, int max)
 		h[x].program = NULL;
 		h[x].available = 0;
 		h[x].misses = 0;
-		h[x].interval = 2;
-		h[x].tko = 1;
+		auto_heuristic_timing(&h[x].interval, &h[x].tko, maxtime);
+		h[x].maxtime = maxtime;
 		h[x].score = 1;
 		h[x].childpid = 0;
-		h[x].nextrun = 0;
+		h[x].nextrun.tv_sec = 0;
+		h[x].nextrun.tv_nsec = 0;
+		h[x].failtime.tv_sec = 0;
+		h[x].failtime.tv_nsec = 0;
 
 		/* Get program */
 		snprintf(query, sizeof(query),
diff --git a/cman/qdisk/score.h b/cman/qdisk/score.h
index 77e155b..beff31b 100644
--- a/cman/qdisk/score.h
+++ b/cman/qdisk/score.h
@@ -10,19 +10,22 @@
 
 struct h_data {
 	char *	program;
+	struct timespec nextrun;
+	struct timespec failtime;
 	int	score;
 	int	available;
 	int	tko;
 	int	interval;
+	int	maxtime;
 	int	misses;
+	int	failed;
 	pid_t	childpid;
-	time_t	nextrun;
 };
 
 /*
    Grab score data from CCSD
  */
-int configure_heuristics(int ccsfd, struct h_data *hp, int max);
+int configure_heuristics(int ccsfd, struct h_data *hp, int max, int maxtime);
 
 /*
    Start the thread which runs the scoring applets