File Adjust-GPU-data-gathering-to-work-with-all-Slurm-versions-since-18.08.patch of Package golang-github-vpenso-prometheus_slurm_exporter

From: Egbert Eich <eich@suse.com>
Date: Tue Jun 27 13:09:51 2023 +0200
Subject: Adjust GPU data gathering to work with all Slurm versions since 18.08
Patch-mainline: Not yet
Git-commit: dad0d76a32c784e658573a0025d423df610ab9e1
References: 

These changes have been ported from the development branch of
https://github.com/vpenso/prometheus-slurm-exporter.

Signed-off-by: Egbert Eich <eich@suse.com>
Signed-off-by: Egbert Eich <eich@suse.de>
---
 gpus.go | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++++------------
 node.go |  19 ++++----
 2 files changed, 147 insertions(+), 41 deletions(-)
diff --git a/gpus.go b/gpus.go
index ca3bcaf..7db5ab3 100644
--- a/gpus.go
+++ b/gpus.go
@@ -1,4 +1,4 @@
-/* Copyright 2020 Joeri Hermans, Victor Penso, Matteo Dessalvi
+/* Copyright 2022 Joeri Hermans, Victor Penso, Matteo Dessalvi, Iztok Lebar Bajec
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -16,17 +16,19 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>. */
 package main
 
 import (
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/prometheus/common/log"
-	"io/ioutil"
 	"os/exec"
-	"strings"
+	"regexp"
 	"strconv"
+	"strings"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/common/log"
 )
 
 type GPUsMetrics struct {
 	alloc       float64
 	idle        float64
+	other       float64
 	total       float64
 	utilization float64
 }
@@ -35,6 +37,11 @@ func GPUsGetMetrics() *GPUsMetrics {
 	return ParseGPUsMetrics()
 }
 
+/* TODO:
+  sinfo has gresUSED since slurm>=19.05.0rc01 https://github.com/SchedMD/slurm/blob/master/NEWS
+  revert to old process on slurm<19.05.0rc01
+  --format=AllocGRES will return gres/gpu=8
+  --format=AllocTRES will return billing=16,cpu=16,gres/gpu=8,mem=256G,node=1
 func ParseAllocatedGPUs() float64 {
 	var num_gpus = 0.0
 
@@ -53,21 +60,106 @@ func ParseAllocatedGPUs() float64 {
 
 	return num_gpus
 }
+*/
 
-func ParseTotalGPUs() float64 {
+func ParseAllocatedGPUs(data []byte) float64 {
 	var num_gpus = 0.0
+	// sinfo -a -h --Format="Nodes: ,GresUsed:" --state=allocated
+	// 3 gpu:2                                       # slurm>=20.11.8
+	// 1 gpu:(null):3(IDX:0-7)                       # slurm 21.08.5
+	// 13 gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3)      # slurm 21.08.5
+
+	sinfo_lines := string(data)
+	re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+	if len(sinfo_lines) > 0 {
+		for _, line := range strings.Split(sinfo_lines, "\n") {
+			// log.info(line)
+			if len(line) > 0 && strings.Contains(line, "gpu:") {
+				nodes := strings.Fields(line)[0]
+				num_nodes, _ := strconv.ParseFloat(nodes, 64)
+				node_active_gpus := strings.Fields(line)[1]
+				num_node_active_gpus := 0.0
+				for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
+					if strings.Contains(node_active_gpus_type, "gpu:") {
+						node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
+						num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
+						num_node_active_gpus += num_node_active_gpus_type
+					}
+				}
+				num_gpus += num_nodes * num_node_active_gpus
+			}
+		}
+	}
 
-	args := []string{"-h", "-o \"%n %G\""}
-	output := string(Execute("sinfo", args))
-	if len(output) > 0 {
-		for _, line := range strings.Split(output, "\n") {
-			if len(line) > 0 {
-				line = strings.Trim(line, "\"")
-				descriptor := strings.Fields(line)[1]
-				descriptor = strings.TrimPrefix(descriptor, "gpu:")
-				descriptor = strings.Split(descriptor, "(")[0]
-				node_gpus, _ :=  strconv.ParseFloat(descriptor, 64)
-				num_gpus += node_gpus
+	return num_gpus
+}
+
+func ParseIdleGPUs(data []byte) float64 {
+	var num_gpus = 0.0
+	// sinfo -a -h --Format="Nodes: ,Gres: ,GresUsed:" --state=idle,allocated
+	// 3 gpu:4 gpu:2                                       																# slurm 20.11.8
+	// 1 gpu:8(S:0-1) gpu:(null):3(IDX:0-7)                       												# slurm 21.08.5
+	// 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3)       	# slurm 21.08.5
+
+	sinfo_lines := string(data)
+	re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+	if len(sinfo_lines) > 0 {
+		for _, line := range strings.Split(sinfo_lines, "\n") {
+			// log.info(line)
+			if len(line) > 0 && strings.Contains(line, "gpu:") {
+				nodes := strings.Fields(line)[0]
+				num_nodes, _ := strconv.ParseFloat(nodes, 64)
+				node_gpus := strings.Fields(line)[1]
+				num_node_gpus := 0.0
+				for _, node_gpus_type := range strings.Split(node_gpus, ",") {
+					if strings.Contains(node_gpus_type, "gpu:") {
+						node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
+						num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
+						num_node_gpus += num_node_gpus_type
+					}
+				}
+				num_node_active_gpus := 0.0
+				node_active_gpus := strings.Fields(line)[2]
+				for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
+					if strings.Contains(node_active_gpus_type, "gpu:") {
+						node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
+						num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
+						num_node_active_gpus += num_node_active_gpus_type
+					}
+				}
+				num_gpus += num_nodes * (num_node_gpus - num_node_active_gpus)
+			}
+		}
+	}
+
+	return num_gpus
+}
+
+func ParseTotalGPUs(data []byte) float64 {
+	var num_gpus = 0.0
+	// sinfo -a -h --Format="Nodes: ,Gres:"
+	// 3 gpu:4                                       	# slurm 20.11.8
+	// 1 gpu:8(S:0-1)                                	# slurm 21.08.5
+	// 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1)        	# slurm 21.08.5
+
+	sinfo_lines := string(data)
+	re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+	if len(sinfo_lines) > 0 {
+		for _, line := range strings.Split(sinfo_lines, "\n") {
+			// log.Info(line)
+			if len(line) > 0 && strings.Contains(line, "gpu:") {
+				nodes := strings.Fields(line)[0]
+				num_nodes, _ := strconv.ParseFloat(nodes, 64)
+				node_gpus := strings.Fields(line)[1]
+				num_node_gpus := 0.0
+				for _, node_gpus_type := range strings.Split(node_gpus, ",") {
+					if strings.Contains(node_gpus_type, "gpu:") {
+						node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
+						num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
+						num_node_gpus += num_node_gpus_type
+					}
+				}
+				num_gpus += num_nodes * num_node_gpus
 			}
 		}
 	}
@@ -77,29 +169,40 @@ func ParseTotalGPUs() float64 {
 
 func ParseGPUsMetrics() *GPUsMetrics {
 	var gm GPUsMetrics
-	total_gpus := ParseTotalGPUs()
-	allocated_gpus := ParseAllocatedGPUs()
+	total_gpus := ParseTotalGPUs(TotalGPUsData())
+	allocated_gpus := ParseAllocatedGPUs(AllocatedGPUsData())
+	idle_gpus := ParseIdleGPUs(IdleGPUsData())
+	other_gpus := total_gpus - allocated_gpus - idle_gpus
 	gm.alloc = allocated_gpus
-	gm.idle = total_gpus - allocated_gpus
+	gm.idle = idle_gpus
+	gm.other = other_gpus
 	gm.total = total_gpus
 	gm.utilization = allocated_gpus / total_gpus
 	return &gm
 }
 
+func AllocatedGPUsData() []byte {
+	args := []string{"-a", "-h", "--Format=Nodes: ,GresUsed:", "--state=allocated"}
+	return Execute("sinfo", args)
+}
+
+func IdleGPUsData() []byte {
+	args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed:", "--state=idle,allocated"}
+	return Execute("sinfo", args)
+}
+
+func TotalGPUsData() []byte {
+	args := []string{"-a", "-h", "--Format=Nodes: ,Gres:"}
+	return Execute("sinfo", args)
+}
+
 // Execute the sinfo command and return its output
 func Execute(command string, arguments []string) []byte {
 	cmd := exec.Command(command, arguments...)
-	stdout, err := cmd.StdoutPipe()
+	out, err := cmd.CombinedOutput()
 	if err != nil {
 		log.Fatal(err)
 	}
-	if err := cmd.Start(); err != nil {
-		log.Fatal(err)
-	}
-	out, _ := ioutil.ReadAll(stdout)
-	if err := cmd.Wait(); err != nil {
-		log.Fatal(err)
-	}
 	return out
 }
 
@@ -111,9 +214,10 @@ func Execute(command string, arguments []string) []byte {
 
 func NewGPUsCollector() *GPUsCollector {
 	return &GPUsCollector{
-		alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
-		idle:  prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
-		total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
+		alloc:       prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
+		idle:        prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
+		other:       prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil),
+		total:       prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
 		utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
 	}
 }
@@ -121,6 +225,7 @@ func NewGPUsCollector() *GPUsCollector {
 type GPUsCollector struct {
 	alloc       *prometheus.Desc
 	idle        *prometheus.Desc
+	other       *prometheus.Desc
 	total       *prometheus.Desc
 	utilization *prometheus.Desc
 }
@@ -129,6 +234,7 @@ type GPUsCollector struct {
 func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) {
 	ch <- cc.alloc
 	ch <- cc.idle
+	ch <- cc.other
 	ch <- cc.total
 	ch <- cc.utilization
 }
@@ -136,6 +242,7 @@ func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
 	cm := GPUsGetMetrics()
 	ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc)
 	ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle)
+	ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, cm.other)
 	ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total)
 	ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization)
 }
diff --git a/node.go b/node.go
index bf2f759..ae7dc90 100644
--- a/node.go
+++ b/node.go
@@ -27,12 +27,12 @@ import (
 
 // NodeMetrics stores metrics for each node
 type NodeMetrics struct {
-	memAlloc uint64
-	memTotal uint64
-	cpuAlloc uint64
-	cpuIdle  uint64
-	cpuOther uint64
-	cpuTotal uint64
+	memAlloc   uint64
+	memTotal   uint64
+	cpuAlloc   uint64
+	cpuIdle    uint64
+	cpuOther   uint64
+	cpuTotal   uint64
 	nodeStatus string
 }
 
@@ -60,7 +60,6 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
 		memAlloc, _ := strconv.ParseUint(node[1], 10, 64)
 		memTotal, _ := strconv.ParseUint(node[2], 10, 64)
 
-
 		cpuInfo := strings.Split(node[3], "/")
 		cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64)
 		cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64)
@@ -82,7 +81,7 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
 // NodeData executes the sinfo command to get data for each node
 // It returns the output of the sinfo command
 func NodeData() []byte {
-	cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong")
+	cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:")
 	out, err := cmd.Output()
 	if err != nil {
 		log.Fatal(err)
@@ -102,7 +101,7 @@ type NodeCollector struct {
 // NewNodeCollector creates a Prometheus collector to keep all our stats in
 // It returns a set of collections for consumption
 func NewNodeCollector() *NodeCollector {
-	labels := []string{"node","status"}
+	labels := []string{"node", "status"}
 
 	return &NodeCollector{
 		cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil),
@@ -128,7 +127,7 @@ func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) {
 	nodes := NodeGetMetrics()
 	for node := range nodes {
 		ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nodes[node].cpuAlloc), node, nodes[node].nodeStatus)
-		ch <- prometheus.MustNewConstMetric(nc.cpuIdle,  prometheus.GaugeValue, float64(nodes[node].cpuIdle),  node, nodes[node].nodeStatus)
+		ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus)
 		ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nodes[node].cpuOther), node, nodes[node].nodeStatus)
 		ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus)
 		ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), node, nodes[node].nodeStatus)
openSUSE Build Service is sponsored by