File Adjust-GPU-data-gathering-to-work-with-all-Slurm-versions-since-18.08.patch of Package golang-github-vpenso-prometheus_slurm_exporter
From: Egbert Eich <eich@suse.com>
Date: Tue Jun 27 13:09:51 2023 +0200
Subject: Adjust GPU data gathering to work with all Slurm versions since 18.08
Patch-mainline: Not yet
Git-commit: dad0d76a32c784e658573a0025d423df610ab9e1
References:
These changes have been ported from the development branch of
https://github.com/vpenso/prometheus-slurm-exporter.
Signed-off-by: Egbert Eich <eich@suse.com>
Signed-off-by: Egbert Eich <eich@suse.de>
---
gpus.go | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++++------------
node.go | 19 ++++----
2 files changed, 147 insertions(+), 41 deletions(-)
diff --git a/gpus.go b/gpus.go
index ca3bcaf..7db5ab3 100644
--- a/gpus.go
+++ b/gpus.go
@@ -1,4 +1,4 @@
-/* Copyright 2020 Joeri Hermans, Victor Penso, Matteo Dessalvi
+/* Copyright 2022 Joeri Hermans, Victor Penso, Matteo Dessalvi, Iztok Lebar Bajec
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,17 +16,19 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main
import (
- "github.com/prometheus/client_golang/prometheus"
- "github.com/prometheus/common/log"
- "io/ioutil"
"os/exec"
- "strings"
+ "regexp"
"strconv"
+ "strings"
+
+ "github.com/prometheus/client_golang/prometheus"
+ "github.com/prometheus/common/log"
)
type GPUsMetrics struct {
alloc float64
idle float64
+ other float64
total float64
utilization float64
}
@@ -35,6 +37,11 @@ func GPUsGetMetrics() *GPUsMetrics {
return ParseGPUsMetrics()
}
+/* TODO:
+ sinfo has gresUSED since slurm>=19.05.0rc01 https://github.com/SchedMD/slurm/blob/master/NEWS
+ revert to old process on slurm<19.05.0rc01
+ --format=AllocGRES will return gres/gpu=8
+ --format=AllocTRES will return billing=16,cpu=16,gres/gpu=8,mem=256G,node=1
func ParseAllocatedGPUs() float64 {
var num_gpus = 0.0
@@ -53,21 +60,106 @@ func ParseAllocatedGPUs() float64 {
return num_gpus
}
+*/
-func ParseTotalGPUs() float64 {
+func ParseAllocatedGPUs(data []byte) float64 {
var num_gpus = 0.0
+ // sinfo -a -h --Format="Nodes: ,GresUsed:" --state=allocated
+ // 3 gpu:2 # slurm>=20.11.8
+ // 1 gpu:(null):3(IDX:0-7) # slurm 21.08.5
+ // 13 gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) # slurm 21.08.5
+
+ sinfo_lines := string(data)
+ re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+ if len(sinfo_lines) > 0 {
+ for _, line := range strings.Split(sinfo_lines, "\n") {
+ // log.info(line)
+ if len(line) > 0 && strings.Contains(line, "gpu:") {
+ nodes := strings.Fields(line)[0]
+ num_nodes, _ := strconv.ParseFloat(nodes, 64)
+ node_active_gpus := strings.Fields(line)[1]
+ num_node_active_gpus := 0.0
+ for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
+ if strings.Contains(node_active_gpus_type, "gpu:") {
+ node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
+ num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
+ num_node_active_gpus += num_node_active_gpus_type
+ }
+ }
+ num_gpus += num_nodes * num_node_active_gpus
+ }
+ }
+ }
- args := []string{"-h", "-o \"%n %G\""}
- output := string(Execute("sinfo", args))
- if len(output) > 0 {
- for _, line := range strings.Split(output, "\n") {
- if len(line) > 0 {
- line = strings.Trim(line, "\"")
- descriptor := strings.Fields(line)[1]
- descriptor = strings.TrimPrefix(descriptor, "gpu:")
- descriptor = strings.Split(descriptor, "(")[0]
- node_gpus, _ := strconv.ParseFloat(descriptor, 64)
- num_gpus += node_gpus
+ return num_gpus
+}
+
+func ParseIdleGPUs(data []byte) float64 {
+ var num_gpus = 0.0
+ // sinfo -a -h --Format="Nodes: ,Gres: ,GresUsed:" --state=idle,allocated
+ // 3 gpu:4 gpu:2 # slurm 20.11.8
+ // 1 gpu:8(S:0-1) gpu:(null):3(IDX:0-7) # slurm 21.08.5
+ // 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) # slurm 21.08.5
+
+ sinfo_lines := string(data)
+ re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+ if len(sinfo_lines) > 0 {
+ for _, line := range strings.Split(sinfo_lines, "\n") {
+ // log.info(line)
+ if len(line) > 0 && strings.Contains(line, "gpu:") {
+ nodes := strings.Fields(line)[0]
+ num_nodes, _ := strconv.ParseFloat(nodes, 64)
+ node_gpus := strings.Fields(line)[1]
+ num_node_gpus := 0.0
+ for _, node_gpus_type := range strings.Split(node_gpus, ",") {
+ if strings.Contains(node_gpus_type, "gpu:") {
+ node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
+ num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
+ num_node_gpus += num_node_gpus_type
+ }
+ }
+ num_node_active_gpus := 0.0
+ node_active_gpus := strings.Fields(line)[2]
+ for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
+ if strings.Contains(node_active_gpus_type, "gpu:") {
+ node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
+ num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
+ num_node_active_gpus += num_node_active_gpus_type
+ }
+ }
+ num_gpus += num_nodes * (num_node_gpus - num_node_active_gpus)
+ }
+ }
+ }
+
+ return num_gpus
+}
+
+func ParseTotalGPUs(data []byte) float64 {
+ var num_gpus = 0.0
+ // sinfo -a -h --Format="Nodes: ,Gres:"
+ // 3 gpu:4 # slurm 20.11.8
+ // 1 gpu:8(S:0-1) # slurm 21.08.5
+ // 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) # slurm 21.08.5
+
+ sinfo_lines := string(data)
+ re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
+ if len(sinfo_lines) > 0 {
+ for _, line := range strings.Split(sinfo_lines, "\n") {
+ // log.Info(line)
+ if len(line) > 0 && strings.Contains(line, "gpu:") {
+ nodes := strings.Fields(line)[0]
+ num_nodes, _ := strconv.ParseFloat(nodes, 64)
+ node_gpus := strings.Fields(line)[1]
+ num_node_gpus := 0.0
+ for _, node_gpus_type := range strings.Split(node_gpus, ",") {
+ if strings.Contains(node_gpus_type, "gpu:") {
+ node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
+ num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
+ num_node_gpus += num_node_gpus_type
+ }
+ }
+ num_gpus += num_nodes * num_node_gpus
}
}
}
@@ -77,29 +169,40 @@ func ParseTotalGPUs() float64 {
func ParseGPUsMetrics() *GPUsMetrics {
var gm GPUsMetrics
- total_gpus := ParseTotalGPUs()
- allocated_gpus := ParseAllocatedGPUs()
+ total_gpus := ParseTotalGPUs(TotalGPUsData())
+ allocated_gpus := ParseAllocatedGPUs(AllocatedGPUsData())
+ idle_gpus := ParseIdleGPUs(IdleGPUsData())
+ other_gpus := total_gpus - allocated_gpus - idle_gpus
gm.alloc = allocated_gpus
- gm.idle = total_gpus - allocated_gpus
+ gm.idle = idle_gpus
+ gm.other = other_gpus
gm.total = total_gpus
gm.utilization = allocated_gpus / total_gpus
return &gm
}
+func AllocatedGPUsData() []byte {
+ args := []string{"-a", "-h", "--Format=Nodes: ,GresUsed:", "--state=allocated"}
+ return Execute("sinfo", args)
+}
+
+func IdleGPUsData() []byte {
+ args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed:", "--state=idle,allocated"}
+ return Execute("sinfo", args)
+}
+
+func TotalGPUsData() []byte {
+ args := []string{"-a", "-h", "--Format=Nodes: ,Gres:"}
+ return Execute("sinfo", args)
+}
+
// Execute the sinfo command and return its output
func Execute(command string, arguments []string) []byte {
cmd := exec.Command(command, arguments...)
- stdout, err := cmd.StdoutPipe()
+ out, err := cmd.CombinedOutput()
if err != nil {
log.Fatal(err)
}
- if err := cmd.Start(); err != nil {
- log.Fatal(err)
- }
- out, _ := ioutil.ReadAll(stdout)
- if err := cmd.Wait(); err != nil {
- log.Fatal(err)
- }
return out
}
@@ -111,9 +214,10 @@ func Execute(command string, arguments []string) []byte {
func NewGPUsCollector() *GPUsCollector {
return &GPUsCollector{
- alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
- idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
- total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
+ alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
+ idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
+ other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil),
+ total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
}
}
@@ -121,6 +225,7 @@ func NewGPUsCollector() *GPUsCollector {
type GPUsCollector struct {
alloc *prometheus.Desc
idle *prometheus.Desc
+ other *prometheus.Desc
total *prometheus.Desc
utilization *prometheus.Desc
}
@@ -129,6 +234,7 @@ type GPUsCollector struct {
func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- cc.alloc
ch <- cc.idle
+ ch <- cc.other
ch <- cc.total
ch <- cc.utilization
}
@@ -136,6 +242,7 @@ func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
cm := GPUsGetMetrics()
ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc)
ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle)
+ ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, cm.other)
ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total)
ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization)
}
diff --git a/node.go b/node.go
index bf2f759..ae7dc90 100644
--- a/node.go
+++ b/node.go
@@ -27,12 +27,12 @@ import (
// NodeMetrics stores metrics for each node
type NodeMetrics struct {
- memAlloc uint64
- memTotal uint64
- cpuAlloc uint64
- cpuIdle uint64
- cpuOther uint64
- cpuTotal uint64
+ memAlloc uint64
+ memTotal uint64
+ cpuAlloc uint64
+ cpuIdle uint64
+ cpuOther uint64
+ cpuTotal uint64
nodeStatus string
}
@@ -60,7 +60,6 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
memAlloc, _ := strconv.ParseUint(node[1], 10, 64)
memTotal, _ := strconv.ParseUint(node[2], 10, 64)
-
cpuInfo := strings.Split(node[3], "/")
cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64)
cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64)
@@ -82,7 +81,7 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
// NodeData executes the sinfo command to get data for each node
// It returns the output of the sinfo command
func NodeData() []byte {
- cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong")
+ cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:")
out, err := cmd.Output()
if err != nil {
log.Fatal(err)
@@ -102,7 +101,7 @@ type NodeCollector struct {
// NewNodeCollector creates a Prometheus collector to keep all our stats in
// It returns a set of collections for consumption
func NewNodeCollector() *NodeCollector {
- labels := []string{"node","status"}
+ labels := []string{"node", "status"}
return &NodeCollector{
cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil),
@@ -128,7 +127,7 @@ func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) {
nodes := NodeGetMetrics()
for node := range nodes {
ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nodes[node].cpuAlloc), node, nodes[node].nodeStatus)
- ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus)
+ ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nodes[node].cpuOther), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), node, nodes[node].nodeStatus)