File 0001-fence_gce-Add-timeouts-and-failure-options-458.patch of Package fence-agents.23355
From 67f8cee8f5351731a55534a5ae777436a2f67aea Mon Sep 17 00:00:00 2001
From: kj1724 <78624900+kj1724@users.noreply.github.com>
Date: Tue, 1 Feb 2022 03:43:58 -0500
Subject: [PATCH 1/1] fence_gce: Add timeouts and failure options (#458)
Gives users the opportunity to time out while waiting for pending resets and allows them to run a follow-up command if the reset fails. This gives users more flexibility in how failures are handled.
---
 agents/gce/fence_gce.py           | 140 +++++++++++++++++++++++-------
 tests/data/metadata/fence_gce.xml |  25 ++++++
 2 files changed, 134 insertions(+), 31 deletions(-)
diff --git a/agents/gce/fence_gce.py b/agents/gce/fence_gce.py
index c0bfefdb..3594fd8e 100644
--- a/agents/gce/fence_gce.py
+++ b/agents/gce/fence_gce.py
@@ -2,10 +2,10 @@
 
 #
 # Requires the googleapiclient and oauth2client
-# RHEL 7.x: google-api-python-client==1.6.7 python-gflags==2.0 pyasn1==0.4.8 rsa==3.4.2
-# RHEL 8.x: nothing additional needed
-# SLES 12.x: python-google-api-python-client python-oauth2client python-oauth2client-gce
-# SLES 15.x: python3-google-api-python-client python3-oauth2client python3-oauth2client-gce
+# RHEL 7.x: google-api-python-client==1.6.7 python-gflags==2.0 pyasn1==0.4.8 rsa==3.4.2 pysocks==1.7.1 httplib2==0.19.0
+# RHEL 8.x: pysocks==1.7.1 httplib2==0.19.0
+# SLES 12.x: python-google-api-python-client python-oauth2client python-oauth2client-gce pysocks==1.7.1 httplib2==0.19.0
+# SLES 15.x: python3-google-api-python-client python3-oauth2client pysocks==1.7.1 httplib2==0.19.0
 #
 
 import atexit
@@ -28,7 +28,7 @@ else:
   import urllib2 as urlrequest
 sys.path.append("@FENCEAGENTSLIBDIR@")
 
-from fencing import fail_usage, run_delay, all_opt, atexit_handler, check_input, process_input, show_docs, fence_action
+from fencing import fail_usage, run_delay, all_opt, atexit_handler, check_input, process_input, show_docs, fence_action, run_command
 try:
   import googleapiclient.discovery
   import socks
@@ -41,6 +41,19 @@ except:
 
 METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/'
 METADATA_HEADERS = {'Metadata-Flavor': 'Google'}
+INSTANCE_LINK = 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}/instances/{}'
+
+def run_on_fail(options):
+	if "--runonfail" in options:
+		run_command(options, options["--runonfail"])
+
+def fail_fence_agent(options, message):
+	run_on_fail(options)
+	fail_usage(message)
+
+def raise_fence_agent(options, message):
+	run_on_fail(options)
+	raise Exception(message)
 
 #
 # Will use baremetalsolution setting or the environment variable
@@ -65,7 +78,7 @@ def replace_api_uri(options, http_request):
 			{
 				"matchlength": 4,
 				"match": "https://compute.googleapis.com/compute/v1/projects/(.*)/zones/(.*)/instances/(.*)/reset(.*)",
-				"replace": "https://baremetalsolution.googleapis.com/v1alpha1/projects/\\1/locations/\\2/instances/\\3:resetInstance\\4"
+				"replace": "https://baremetalsolution.googleapis.com/v1/projects/\\1/locations/\\2/instances/\\3:resetInstance\\4"
 			})
 	for uri_replacement in uri_replacements:
 		# each uri_replacement should have matchlength, match, and replace
@@ -120,14 +133,17 @@ def translate_status(instance_status):
 
 def get_nodes_list(conn, options):
 	result = {}
+	if "--zone" not in options:
+		fail_fence_agent(options, "Failed: get_nodes_list: Please specify the --zone in the command")
 	try:
-		instanceList = retry_api_execute(options, conn.instances().list(
-			project=options["--project"],
-			zone=options["--zone"]))
-		for instance in instanceList["items"]:
-			result[instance["id"]] = (instance["name"], translate_status(instance["status"]))
+		for zone in options["--zone"].split(","):
+			instanceList = retry_api_execute(options, conn.instances().list(
+				project=options["--project"],
+				zone=zone))
+			for instance in instanceList["items"]:
+				result[instance["id"]] = (instance["name"], translate_status(instance["status"]))
 	except Exception as err:
-		fail_usage("Failed: get_nodes_list: {}".format(str(err)))
+		fail_fence_agent(options, "Failed: get_nodes_list: {}".format(str(err)))
 
 	return result
 
@@ -141,23 +157,54 @@ def get_power_status(conn, options):
 			return "off"
 		else:
 			return "on"
+	# If zone is not listed for an entry we attempt to get it automatically
+	instance = options["--plug"]
+	zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance]
+	instance_status = get_instance_power_status(conn, options, instance, zone)
+	# If any of the instances do not match the intended status we return
+	# the opposite status so that the fence agent can change it.
+	if instance_status != options.get("--action"):
+		return instance_status
+
+	return options.get("--action")
+
+
+def get_instance_power_status(conn, options, instance, zone):
 	try:
-		instance = retry_api_execute(options, conn.instances().get(
-				project=options["--project"],
-				zone=options["--zone"],
-				instance=options["--plug"]))
+		instance = retry_api_execute(
+				options,
+				conn.instances().get(project=options["--project"], zone=zone, instance=instance))
 		return translate_status(instance["status"])
 	except Exception as err:
-		fail_usage("Failed: get_power_status: {}".format(str(err)))
+		fail_fence_agent(options, "Failed: get_instance_power_status: {}".format(str(err)))
+
+
+def check_for_existing_operation(conn, options, instance, zone, operation_type):
+	logging.debug("check_for_existing_operation")
+	if "--baremetalsolution" in options:
+		# There is no API for checking in progress operations
+		return False
+
+	project = options["--project"]
+	target_link = INSTANCE_LINK.format(project, zone, instance)
+	query_filter = '(targetLink = "{}") AND (operationType = "{}") AND (status = "RUNNING")'.format(target_link, operation_type)
+	result = retry_api_execute(
+			options,
+			conn.zoneOperations().list(project=project, zone=zone, filter=query_filter, maxResults=1))
+
+	if "items" in result and result["items"]:
+		logging.info("Existing %s operation found", operation_type)
+		return result["items"][0]
 
 
-def wait_for_operation(conn, options, operation):
+def wait_for_operation(conn, options, zone, operation):
 	if 'name' not in operation:
 		logging.warning('Cannot wait for operation to complete, the'
 		' requested operation will continue asynchronously')
-		return
+		return False
+
+	wait_time = 0
 	project = options["--project"]
-	zone = options["--zone"]
 	while True:
 		result = retry_api_execute(options, conn.zoneOperations().get(
 			project=project,
@@ -165,56 +212,93 @@ def wait_for_operation(conn, options, operation):
 			operation=operation['name']))
 		if result['status'] == 'DONE':
 			if 'error' in result:
-				raise Exception(result['error'])
-			return
+				raise_fence_agent(options, result['error'])
+			return True
+
+		if "--errortimeout" in options and wait_time > int(options["--errortimeout"]):
+			raise_fence_agent(options, "Operation did not complete before the timeout.")
+
+		if "--warntimeout" in options and wait_time > int(options["--warntimeout"]):
+			logging.warning("Operation did not complete before the timeout.")
+			if "--runonwarn" in options:
+				run_command(options, options["--runonwarn"])
+			return False
+
+		wait_time = wait_time + 1
 		time.sleep(1)
 
 
 def set_power_status(conn, options):
-	logging.debug("set_power_status");
+	logging.debug("set_power_status")
+	instance = options["--plug"]
+	# If zone is not listed for an entry we attempt to get it automatically
+	zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance]
+	set_instance_power_status(conn, options, instance, zone, options["--action"])
+
+
+def set_instance_power_status(conn, options, instance, zone, action):
+	logging.info("Setting power status of %s in zone %s", instance, zone)
+	project = options["--project"]
+
 	try:
-		if options["--action"] == "off":
-			logging.info("Issuing poweroff of %s in zone %s" % (options["--plug"], options["--zone"]))
-			operation = retry_api_execute(options, conn.instances().stop(
-					project=options["--project"],
-					zone=options["--zone"],
-					instance=options["--plug"]))
+		if action == "off":
+			logging.info("Issuing poweroff of %s in zone %s", instance, zone)
+			operation = check_for_existing_operation(conn, options, instance, zone, "stop")
+			if operation and "--earlyexit" in options:
+				return
+			if not operation:
+				operation = retry_api_execute(
+						options,
+						conn.instances().stop(project=project, zone=zone, instance=instance))
 			logging.info("Poweroff command completed, waiting for the operation to complete")
-			wait_for_operation(conn, options, operation)
-			logging.info("Poweroff of %s in zone %s complete" % (options["--plug"], options["--zone"]))
-		elif options["--action"] == "on":
-			logging.info("Issuing poweron of %s in zone %s" % (options["--plug"], options["--zone"]))
-			operation = retry_api_execute(options, conn.instances().start(
-					project=options["--project"],
-					zone=options["--zone"],
-					instance=options["--plug"]))
-			wait_for_operation(conn, options, operation)
-			logging.info("Poweron of %s in zone %s complete" % (options["--plug"], options["--zone"]))
+			if wait_for_operation(conn, options, zone, operation):
+				logging.info("Poweroff of %s in zone %s complete", instance, zone)
+		elif action == "on":
+			logging.info("Issuing poweron of %s in zone %s", instance, zone)
+			operation = check_for_existing_operation(conn, options, instance, zone, "start")
+			if operation and "--earlyexit" in options:
+				return
+			if not operation:
+				operation = retry_api_execute(
+						options,
+						conn.instances().start(project=project, zone=zone, instance=instance))
+			if wait_for_operation(conn, options, zone, operation):
+				logging.info("Poweron of %s in zone %s complete", instance, zone)
 	except Exception as err:
-		fail_usage("Failed: set_power_status: {}".format(str(err)))
-
+		fail_fence_agent(options, "Failed: set_instance_power_status: {}".format(str(err)))
 
 def power_cycle(conn, options):
-	logging.debug("power_cycle");
+	logging.debug("power_cycle")
+	instance = options["--plug"]
+	# If zone is not listed for an entry we attempt to get it automatically
+	zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance]
+	return power_cycle_instance(conn, options, instance, zone)
+
+
+def power_cycle_instance(conn, options, instance, zone):
+	logging.info("Issuing reset of %s in zone %s", instance, zone)
+	project = options["--project"]
+
 	try:
-		logging.info('Issuing reset of %s in zone %s' % (options["--plug"], options["--zone"]))
-		operation = retry_api_execute(options, conn.instances().reset(
-				project=options["--project"],
-				zone=options["--zone"],
-				instance=options["--plug"]))
-		logging.info("Reset command completed, waiting for the operation to complete")
-		wait_for_operation(conn, options, operation)
-		logging.info('Reset of %s in zone %s complete' % (options["--plug"], options["--zone"]))
+		operation = check_for_existing_operation(conn, options, instance, zone, "reset")
+		if operation and "--earlyexit" in options:
+			return True
+		if not operation:
+			operation = retry_api_execute(
+					options,
+					conn.instances().reset(project=project, zone=zone, instance=instance))
+		logging.info("Reset command sent, waiting for the operation to complete")
+		if wait_for_operation(conn, options, zone, operation):
+			logging.info("Reset of %s in zone %s complete", instance, zone)
 		return True
 	except Exception as err:
-		logging.error("Failed: power_cycle: {}".format(str(err)))
-		return False
+		logging.exception("Failed: power_cycle")
+		raise err
 
 
-def get_zone(conn, options):
+def get_zone(conn, options, instance):
 	logging.debug("get_zone");
 	project = options['--project']
-	instance = options['--plug']
 	fl = 'name="%s"' % instance
 	request = replace_api_uri(options, conn.instances().aggregatedList(project=project, filter=fl))
 	while request is not None:
@@ -226,7 +310,7 @@ def get_zone(conn, options):
 					return inst['zone'].split("/")[-1]
 		request = replace_api_uri(options, conn.instances().aggregatedList_next(
 				previous_request=request, previous_response=response))
-	raise Exception("Unable to find instance %s" % (instance))
+	raise_fence_agent(options, "Unable to find instance %s" % (instance))
 
 
 def get_metadata(metadata_key, params=None, timeout=None):
@@ -325,13 +409,21 @@ def define_new_opts():
 		"required" : "0",
 		"order" : 9
 	}
+	all_opt["plugzonemap"] = {
+		"getopt" : ":",
+		"longopt" : "plugzonemap",
+		"help" : "--plugzonemap=[plugzonemap]    Comma separated zone map when fencing multiple plugs",
+		"shortdesc" : "Comma separated zone map when fencing multiple plugs.",
+		"required" : "0",
+		"order" : 10
+	}
 	all_opt["proxyhost"] = {
 		"getopt" : ":",
 		"longopt" : "proxyhost",
 		"help" : "--proxyhost=[proxy_host]       The proxy host to use, if one is needed to access the internet (Example: 10.122.0.33)",
 		"shortdesc" : "If a proxy is used for internet access, the proxy host should be specified.",
 		"required" : "0",
-		"order" : 10
+		"order" : 11
 	}
 	all_opt["proxyport"] = {
 		"getopt" : ":",
@@ -340,7 +432,49 @@ def define_new_opts():
 		"help" : "--proxyport=[proxy_port]       The proxy port to use, if one is needed to access the internet (Example: 3127)",
 		"shortdesc" : "If a proxy is used for internet access, the proxy port should be specified.",
 		"required" : "0",
-		"order" : 11
+		"order" : 12
+	}
+	all_opt["earlyexit"] = {
+		"getopt" : "",
+		"longopt" : "earlyexit",
+		"help" : "--earlyexit                    Return early if reset is already in progress",
+		"shortdesc" : "If an existing reset operation is detected, the fence agent will return before the operation completes with a 0 return code.",
+		"required" : "0",
+		"order" : 13
+	}
+	all_opt["warntimeout"] = {
+		"getopt" : ":",
+		"type" : "second",
+		"longopt" : "warntimeout",
+		"help" : "--warntimeout=[warn_timeout]   Timeout seconds before logging a warning and returning a 0 status code",
+		"shortdesc" : "If the operation is not completed within the timeout, the cluster operations are allowed to continue.",
+		"required" : "0",
+		"order" : 14
+	}
+	all_opt["errortimeout"] = {
+		"getopt" : ":",
+		"type" : "second",
+		"longopt" : "errortimeout",
+		"help" : "--errortimeout=[error_timeout] Timeout seconds before failing and returning a non-zero status code",
+		"shortdesc" : "If the operation is not completed within the timeout, cluster is notified of the operation failure.",
+		"required" : "0",
+		"order" : 15
+	}
+	all_opt["runonwarn"] = {
+		"getopt" : ":",
+		"longopt" : "runonwarn",
+		"help" : "--runonwarn=[run_on_warn]      If a timeout occurs and warning is generated, run the supplied command",
+		"shortdesc" : "If a timeout would occur while running the agent, then the supplied command is run.",
+		"required" : "0",
+		"order" : 16
+	}
+	all_opt["runonfail"] = {
+		"getopt" : ":",
+		"longopt" : "runonfail",
+		"help" : "--runonfail=[run_on_fail]      If a failure occurs, run the supplied command",
+		"shortdesc" : "If a failure would occur while running the agent, then the supplied command is run.",
+		"required" : "0",
+		"order" : 17
 	}
 
 
@@ -349,7 +483,8 @@ def main():
 
 	device_opt = ["port", "no_password", "zone", "project", "stackdriver-logging",
 		"method", "baremetalsolution", "apitimeout", "retries", "retrysleep",
-		"serviceaccount", "proxyhost", "proxyport"]
+		"serviceaccount", "plugzonemap", "proxyhost", "proxyport", "earlyexit",
+		"warntimeout", "errortimeout", "runonwarn", "runonfail"]
 
 	atexit.register(atexit_handler)
 
@@ -430,22 +565,39 @@ def main():
 			conn = googleapiclient.discovery.build(
 				'compute', 'v1', credentials=credentials, cache_discovery=False)
 	except Exception as err:
-		fail_usage("Failed: Create GCE compute v1 connection: {}".format(str(err)))
+		fail_fence_agent(options, "Failed: Create GCE compute v1 connection: {}".format(str(err)))
 
 	# Get project and zone
 	if not options.get("--project"):
 		try:
 			options["--project"] = get_metadata('project/project-id')
 		except Exception as err:
-			fail_usage("Failed retrieving GCE project. Please provide --project option: {}".format(str(err)))
+			fail_fence_agent(options, "Failed retrieving GCE project. Please provide --project option: {}".format(str(err)))
 
 	if "--baremetalsolution" in options:
 		options["--zone"] = "none"
-	if not options.get("--zone"):
-		try:
-			options["--zone"] = get_zone(conn, options)
-		except Exception as err:
-			fail_usage("Failed retrieving GCE zone. Please provide --zone option: {}".format(str(err)))
+
+	# Populates zone automatically if missing from the command
+	zones = [] if not "--zone" in options else options["--zone"].split(",")
+	options["--plugzonemap"] = {}
+	if "--plug" in options:
+		for i, instance in enumerate(options["--plug"].split(",")):
+			if len(zones) == 1:
+				# If only one zone is specified, use it across all plugs
+				options["--plugzonemap"][instance] = zones[0]
+				continue
+
+			if len(zones) - 1 >= i:
+				# If we have enough zones specified with the --zone flag use the zone at
+				# the same index as the plug
+				options["--plugzonemap"][instance] = zones[i]
+				continue
+
+			try:
+				# In this case we do not have a zone specified so we attempt to detect it
+				options["--plugzonemap"][instance] = get_zone(conn, options, instance)
+			except Exception as err:
+				fail_fence_agent(options, "Failed retrieving GCE zone. Please provide --zone option: {}".format(str(err)))
 
 	# Operate the fencing device
 	result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list, power_cycle)
diff --git a/tests/data/metadata/fence_gce.xml b/tests/data/metadata/fence_gce.xml
index d2c597b3..8d2a2dfc 100644
--- a/tests/data/metadata/fence_gce.xml
+++ b/tests/data/metadata/fence_gce.xml
@@ -73,6 +73,11 @@ For instructions see: https://cloud.google.com/compute/docs/tutorials/python-gui
 		<content type="string"  />
 		<shortdesc lang="en">Service Account to use for authentication to the google cloud APIs.</shortdesc>
 	</parameter>
+	<parameter name="plugzonemap" unique="0" required="0">
+		<getopt mixed="--plugzonemap=[plugzonemap]" />
+		<content type="string"  />
+		<shortdesc lang="en">Comma separated zone map when fencing multiple plugs.</shortdesc>
+	</parameter>
 	<parameter name="proxyhost" unique="0" required="0">
 		<getopt mixed="--proxyhost=[proxy_host]" />
 		<content type="string"  />
@@ -83,6 +88,31 @@ For instructions see: https://cloud.google.com/compute/docs/tutorials/python-gui
 		<content type="integer"  />
 		<shortdesc lang="en">If a proxy is used for internet access, the proxy port should be specified.</shortdesc>
 	</parameter>
+	<parameter name="earlyexit" unique="0" required="0">
+		<getopt mixed="--earlyexit" />
+		<content type="boolean"  />
+		<shortdesc lang="en">If an existing reset operation is detected, the fence agent will return before the operation completes with a 0 return code.</shortdesc>
+	</parameter>
+	<parameter name="warntimeout" unique="0" required="0">
+		<getopt mixed="--warntimeout=[warn_timeout]" />
+		<content type="second"  />
+		<shortdesc lang="en">If the operation is not completed within the timeout, the cluster operations are allowed to continue.</shortdesc>
+	</parameter>
+	<parameter name="errortimeout" unique="0" required="0">
+		<getopt mixed="--errortimeout=[error_timeout]" />
+		<content type="second"  />
+		<shortdesc lang="en">If the operation is not completed within the timeout, cluster is notified of the operation failure.</shortdesc>
+	</parameter>
+	<parameter name="runonwarn" unique="0" required="0">
+		<getopt mixed="--runonwarn=[run_on_warn]" />
+		<content type="string"  />
+		<shortdesc lang="en">If a timeout would occur while running the agent, then the supplied command is run.</shortdesc>
+	</parameter>
+	<parameter name="runonfail" unique="0" required="0">
+		<getopt mixed="--runonfail=[run_on_fail]" />
+		<content type="string"  />
+		<shortdesc lang="en">If a failure would occur while running the agent, then the supplied command is run.</shortdesc>
+	</parameter>
 	<parameter name="quiet" unique="0" required="0">
 		<getopt mixed="-q, --quiet" />
 		<content type="boolean"  />