File llvm-Revert-ockl-Don-t-use-wave32-ballot-builtin.patch of Package rocm-llvm
From a509224cd43bfd13fe12c62dd261d58c8c3879d5 Mon Sep 17 00:00:00 2001
From: Jeremy Newton <Jeremy.Newton@amd.com>
Date: Thu, 6 Jun 2024 15:59:48 -0400
Subject: [PATCH] Revert "ockl: Don't use wave32 ballot builtin"

This reverts commit 96b2ba31ded4a892390dfba3767c413bd1a3a29d.
---
amd/device-libs/ockl/src/dm.cl | 50 +++++++++++++++++++++++++------
amd/device-libs/ockl/src/wfaas.cl | 43 +++++++++++++++++++++++---
2 files changed, 80 insertions(+), 13 deletions(-)
diff --git a/amd/device-libs/ockl/src/dm.cl b/amd/device-libs/ockl/src/dm.cl
index 4820df6bb4e1..829e016d3028 100644
--- a/amd/device-libs/ockl/src/dm.cl
+++ b/amd/device-libs/ockl/src/dm.cl
@@ -294,14 +294,9 @@ first(__global void * v)
return __builtin_astype(w2, __global void *);
}
-// Read val from one active lane whose predicate is one.
-// If no lanes have the predicate set, return none
-// This is like first, except that first may not have its predicate set
+REQUIRES_WAVE64
static uint
-elect_uint(int pred, uint val, uint none)
-{
- // Pretend wave32 doesn't exist. The wave64 ballot works, and the high half
- // will fold out as 0.
+elect_uint_wave64(int pred, uint val, uint none) {
uint ret = none;
ulong mask = __builtin_amdgcn_ballot_w64(pred != 0);
@@ -313,14 +308,51 @@ elect_uint(int pred, uint val, uint none)
return ret;
}
-// Count the number of nonzero arguments across the wave
+REQUIRES_WAVE32
static uint
-votes(bool b)
+elect_uint_wave32(int pred, uint val, uint none) {
+ uint ret = none;
+ uint mask = __builtin_amdgcn_ballot_w32(pred != 0);
+ if (mask != 0U) {
+ uint l = __ockl_ctz_u32(mask);
+ ret = __builtin_amdgcn_ds_bpermute(l << 2, val);
+ }
+
+ return ret;
+}
+
+// Read val from one active lane whose predicate is one.
+// If no lanes have the predicate set, return none
+// This is like first, except that first may not have its predicate set
+static uint
+elect_uint(int pred, uint val, uint none)
+{
+ return __oclc_wavefrontsize64 ? elect_uint_wave64(pred, val, none) : elect_uint_wave32(pred, val, none);
+}
+
+REQUIRES_WAVE64
+static uint
+votes_wave64(bool b)
{
ulong mask = __builtin_amdgcn_ballot_w64(b);
return __builtin_popcountl(mask);
}
+REQUIRES_WAVE32
+static uint
+votes_wave32(bool b)
+{
+ uint mask = __builtin_amdgcn_ballot_w32(b);
+ return __builtin_popcount(mask);
+}
+
+// Count the number of nonzero arguments across the wave
+static uint
+votes(bool b)
+{
+ return __oclc_wavefrontsize64 ? votes_wave64(b) : votes_wave32(b);
+}
+
// The kind of the smallest block that can hold sz bytes
static uint
size_to_kind(uint sz)
diff --git a/amd/device-libs/ockl/src/wfaas.cl b/amd/device-libs/ockl/src/wfaas.cl
index 562bafa8452f..60fe55ac97e4 100644
--- a/amd/device-libs/ockl/src/wfaas.cl
+++ b/amd/device-libs/ockl/src/wfaas.cl
@@ -22,25 +22,60 @@ static int optimizationBarrierHack(int in_val)
return out_val;
}
+REQUIRES_WAVE32
+static bool wfany_impl_w32(int e) {
+ return __builtin_amdgcn_ballot_w32(e) != 0;
+}
+
+REQUIRES_WAVE64
+static bool wfany_impl_w64(int e) {
+ return __builtin_amdgcn_ballot_w64(e) != 0;
+}
+
ATTR bool
OCKL_MANGLE_I32(wfany)(int e)
{
e = optimizationBarrierHack(e);
- return __builtin_amdgcn_ballot_w64(e) != 0;
+ return __oclc_wavefrontsize64 ?
+ wfany_impl_w64(e) : wfany_impl_w32(e);
+}
+
+REQUIRES_WAVE32
+static bool wfall_impl_w32(int e) {
+ return __builtin_amdgcn_ballot_w32(e) == __builtin_amdgcn_read_exec_lo();
+}
+
+REQUIRES_WAVE64
+static bool wfall_impl_w64(int e) {
+ return __builtin_amdgcn_ballot_w64(e) == __builtin_amdgcn_read_exec();
}
ATTR bool
OCKL_MANGLE_I32(wfall)(int e)
{
e = optimizationBarrierHack(e);
- return __builtin_amdgcn_ballot_w64(e) == __builtin_amdgcn_read_exec();
+ return __oclc_wavefrontsize64 ?
+ wfall_impl_w64(e) : wfall_impl_w32(e);
+}
+
+
+REQUIRES_WAVE32
+static bool wfsame_impl_w32(int e) {
+ uint u = __builtin_amdgcn_ballot_w32(e);
+ return (u == 0) | (u == __builtin_amdgcn_read_exec_lo());
+}
+
+REQUIRES_WAVE64
+static bool wfsame_impl_w64(int e) {
+ ulong u = __builtin_amdgcn_ballot_w64(e);
+ return (u == 0UL) | (u == __builtin_amdgcn_read_exec());
}
ATTR bool
OCKL_MANGLE_I32(wfsame)(int e)
{
e = optimizationBarrierHack(e);
- ulong u = __builtin_amdgcn_ballot_w64(e);
- return (u == 0UL) | (u == __builtin_amdgcn_read_exec());
+ return __oclc_wavefrontsize64 ?
+ wfsame_impl_w64(e) : wfsame_impl_w32(e);
}
--
2.45.1