File melonds_PR2065.patch of Package melonds

From 57d20751fcd1ff52fe9f8890d565c65c5b6abc13 Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Sun, 9 Jun 2024 18:47:43 +0200
Subject: [PATCH 1/9] Fix glMemoryBarrier flags

---
 src/GPU3D_Compute.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index da2559507..027d68066 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -932,23 +932,23 @@ void ComputeRenderer::RenderFrame(GPU& gpu)
         glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI);
         glUseProgram(ShaderInterpXSpans[wbuffer]);
         glDispatchCompute((numSetupIndices + 31) / 32, 1, 1);
-        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         // bin polygons
         glUseProgram(ShaderBinCombined);
         glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
-        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         // calculate list offsets
         glUseProgram(ShaderCalculateWorkListOffset);
         glDispatchCompute((numVariants + 31) / 32, 1, 1);
-        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         // sort shader work
         glUseProgram(ShaderSortWork);
         glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
         glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount));
-        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         glActiveTexture(GL_TEXTURE0);
 

From 528501149e21e380898dd77a4c527087f0d8651e Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Sun, 9 Jun 2024 20:07:26 +0200
Subject: [PATCH 2/9] Scale TileSize with internal resolution

---
 src/GPU3D_Compute.cpp       | 7 +++++++
 src/GPU3D_Compute.h         | 6 +++---
 src/GPU3D_Compute_shaders.h | 1 -
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index 027d68066..6d6115fe1 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -19,6 +19,7 @@
 #include "GPU3D_Compute.h"
 
 #include <assert.h>
+#include <algorithm>
 
 #include "OpenGLSupport.h"
 
@@ -50,6 +51,8 @@ bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, c
     shaderSource += std::to_string(ScreenHeight);
     shaderSource += "\n#define MaxWorkTiles ";
     shaderSource += std::to_string(MaxWorkTiles);
+    shaderSource += "\n#define TileSize ";
+    shaderSource += std::to_string(TileSize);
 
     shaderSource += ComputeRendererShaders::Common;
     shaderSource += source;
@@ -310,6 +313,10 @@ void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinate
     ScreenWidth = 256 * ScaleFactor;
     ScreenHeight = 192 * ScaleFactor;
 
+    TileSize = std::min(8 * (1 << (ScaleFactor / 5)), 32);
+    CoarseTileW = CoarseTileCountX * TileSize;
+    CoarseTileH = CoarseTileCountY * TileSize;
+
     TilesPerLine = ScreenWidth/TileSize;
     TileLines = ScreenHeight/TileSize;
 
diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h
index 7544c09e0..1e225948b 100644
--- a/src/GPU3D_Compute.h
+++ b/src/GPU3D_Compute.h
@@ -163,11 +163,11 @@ class ComputeRenderer : public Renderer3D
         float TextureLayer;
     };
 
-    static constexpr int TileSize = 8;
+    int TileSize;
     static constexpr int CoarseTileCountX = 8;
     static constexpr int CoarseTileCountY = 4;
-    static constexpr int CoarseTileW = CoarseTileCountX * TileSize;
-    static constexpr int CoarseTileH = CoarseTileCountY * TileSize;
+    int CoarseTileW;
+    int CoarseTileH;
 
     static constexpr int BinStride = 2048/32;
     static constexpr int CoarseBinStride = BinStride/32;
diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h
index 572f9ad66..0ad57dbd1 100644
--- a/src/GPU3D_Compute_shaders.h
+++ b/src/GPU3D_Compute_shaders.h
@@ -339,7 +339,6 @@ const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2;
 
 const char* Common = R"(
 
-#define TileSize 8
 const int CoarseTileCountX = 8;
 const int CoarseTileCountY = 4;
 const int CoarseTileW = (CoarseTileCountX * TileSize);

From 078b4c43f4af165279e0fd235abb3cb241b2b20b Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Sun, 9 Jun 2024 21:46:44 +0200
Subject: [PATCH 3/9] Clean up tile size calc

---
 src/GPU3D_Compute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index 6d6115fe1..8e0ea4f36 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -313,7 +313,7 @@ void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinate
     ScreenWidth = 256 * ScaleFactor;
     ScreenHeight = 192 * ScaleFactor;
 
-    TileSize = std::min(8 * (1 << (ScaleFactor / 5)), 32);
+    TileSize = std::min(8 << (ScaleFactor / 5), 32);
     CoarseTileW = CoarseTileCountX * TileSize;
     CoarseTileH = CoarseTileCountY * TileSize;
 

From 7433388beb6662673b021a1c4b0ceba9513bd156 Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Mon, 10 Jun 2024 20:15:05 +0200
Subject: [PATCH 4/9] Use a different CoarseTileCountY with size 32 tiles

---
 src/GPU3D_Compute.cpp       | 3 +++
 src/GPU3D_Compute.h         | 2 +-
 src/GPU3D_Compute_shaders.h | 1 -
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index 8e0ea4f36..294cd5180 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -53,6 +53,8 @@ bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, c
     shaderSource += std::to_string(MaxWorkTiles);
     shaderSource += "\n#define TileSize ";
     shaderSource += std::to_string(TileSize);
+    shaderSource += "\nconst int CoarseTileCountY = ";
+    shaderSource += std::to_string(CoarseTileCountY) + ";";
 
     shaderSource += ComputeRendererShaders::Common;
     shaderSource += source;
@@ -314,6 +316,7 @@ void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinate
     ScreenHeight = 192 * ScaleFactor;
 
     TileSize = std::min(8 << (ScaleFactor / 5), 32);
+    CoarseTileCountY = TileSize < 32 ? 4 : 6;
     CoarseTileW = CoarseTileCountX * TileSize;
     CoarseTileH = CoarseTileCountY * TileSize;
 
diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h
index 1e225948b..4f944e42f 100644
--- a/src/GPU3D_Compute.h
+++ b/src/GPU3D_Compute.h
@@ -165,7 +165,7 @@ class ComputeRenderer : public Renderer3D
 
     int TileSize;
     static constexpr int CoarseTileCountX = 8;
-    static constexpr int CoarseTileCountY = 4;
+    int CoarseTileCountY;
     int CoarseTileW;
     int CoarseTileH;
 
diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h
index 0ad57dbd1..9fb4aae76 100644
--- a/src/GPU3D_Compute_shaders.h
+++ b/src/GPU3D_Compute_shaders.h
@@ -340,7 +340,6 @@ const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2;
 const char* Common = R"(
 
 const int CoarseTileCountX = 8;
-const int CoarseTileCountY = 4;
 const int CoarseTileW = (CoarseTileCountX * TileSize);
 const int CoarseTileH = (CoarseTileCountY * TileSize);
 

From d24079e693befa45e0ea1d89d70f48adf25614a8 Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Tue, 11 Jun 2024 15:56:24 +0200
Subject: [PATCH 5/9] Better tile scaling calc

---
 src/GPU3D_Compute.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index 294cd5180..9d5a23785 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -302,6 +302,8 @@ void ComputeRenderer::Reset(GPU& gpu)
 
 void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinates)
 {
+    unsigned char TileScale;
+
     CurGLCompositor.SetScaleFactor(scale);
 
     if (ScaleFactor != -1)
@@ -315,7 +317,13 @@ void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinate
     ScreenWidth = 256 * ScaleFactor;
     ScreenHeight = 192 * ScaleFactor;
 
-    TileSize = std::min(8 << (ScaleFactor / 5), 32);
+    //Starting at 4.5x we want to double TileSize every time scale doubles
+    TileScale = 2 * ScaleFactor / 9;
+    TileScale &= ~(TileScale >> 1);
+    TileScale <<= 1;
+    TileScale += TileScale == 0;
+    
+    TileSize = std::min(8 * TileScale, 32);
     CoarseTileCountY = TileSize < 32 ? 4 : 6;
     CoarseTileW = CoarseTileCountX * TileSize;
     CoarseTileH = CoarseTileCountY * TileSize;

From 57b5e16e6dfbeb0e3d53e32f2a32e654e2fb3203 Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Tue, 11 Jun 2024 17:22:36 +0200
Subject: [PATCH 6/9] Actually good way to get the MS bit

---
 src/GPU3D_Compute.cpp |  6 ++++--
 src/Utils.h           | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index 9d5a23785..c22a66b96 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -21,6 +21,8 @@
 #include <assert.h>
 #include <algorithm>
 
+#include "Utils.h"
+
 #include "OpenGLSupport.h"
 
 #include "GPU3D_Compute_shaders.h"
@@ -302,7 +304,7 @@ void ComputeRenderer::Reset(GPU& gpu)
 
 void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinates)
 {
-    unsigned char TileScale;
+    u8 TileScale;
 
     CurGLCompositor.SetScaleFactor(scale);
 
@@ -319,9 +321,9 @@ void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinate
 
     //Starting at 4.5x we want to double TileSize every time scale doubles
     TileScale = 2 * ScaleFactor / 9;
-    TileScale &= ~(TileScale >> 1);
+    TileScale = GetMSBit(TileScale); // keep only the top set bit: rounds down to a power of two, 0 stays 0
     TileScale <<= 1;
     TileScale += TileScale == 0;
     
     TileSize = std::min(8 * TileScale, 32);
     CoarseTileCountY = TileSize < 32 ? 4 : 6;
diff --git a/src/Utils.h b/src/Utils.h
index 63be217b8..e6444c51f 100644
--- a/src/Utils.h
+++ b/src/Utils.h
@@ -38,6 +38,20 @@ std::pair<std::unique_ptr<u8[]>, u32> PadToPowerOf2(const u8* data, u32 len) noe
 
 std::unique_ptr<u8[]> CopyToUnique(const u8* data, u32 len) noexcept;
 
+template <typename T>
+T GetMSBit(T val) // Returns val reduced to its most significant set bit (0 stays 0); assumes val is non-negative.
+{
+    val |= (val >>  1);
+    val |= (val >>  2);
+    val |= (val >>  4);
+    // Wider types need further folds so the top set bit is copied into every lower bit; narrower T skips them.
+    if constexpr(sizeof(val) > 1) val |= (val >>  8);
+    if constexpr(sizeof(val) > 2) val |= (val >> 16);
+    if constexpr(sizeof(val) > 4) val |= (val >> 32);
+    // val is now all ones from its top set bit downwards; subtracting the lower half leaves only that top bit.
+    return val - (val >> 1);
+}
+
 }
 
 #endif // MELONDS_UTILS_H

From f097cc3b4e93bcd9b5db73d6be68d21dcfb85955 Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Thu, 13 Jun 2024 00:30:33 +0200
Subject: [PATCH 7/9] Tie BinCombined's local_size_x to the coarse tiles' area

---
 src/GPU3D_Compute.cpp       | 5 ++++-
 src/GPU3D_Compute.h         | 1 +
 src/GPU3D_Compute_shaders.h | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index c22a66b96..ecf485440 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -57,6 +57,8 @@ bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, c
     shaderSource += std::to_string(TileSize);
     shaderSource += "\nconst int CoarseTileCountY = ";
     shaderSource += std::to_string(CoarseTileCountY) + ";";
+    shaderSource += "\n#define CoarseTileArea ";
+    shaderSource += std::to_string(CoarseTileArea);
 
     shaderSource += ComputeRendererShaders::Common;
     shaderSource += source;
@@ -330,6 +332,7 @@ void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinate
     
     TileSize = std::min(8 * TileScale, 32);
     CoarseTileCountY = TileSize < 32 ? 4 : 6;
+    CoarseTileArea = CoarseTileCountX * CoarseTileCountY;
     CoarseTileW = CoarseTileCountX * TileSize;
     CoarseTileH = CoarseTileCountY * TileSize;
 
@@ -959,7 +962,7 @@ void ComputeRenderer::RenderFrame(GPU& gpu)
 
         // bin polygons
         glUseProgram(ShaderBinCombined);
-        glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
+        glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + CoarseTileArea - 1) / CoarseTileArea), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
         glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         // calculate list offsets
diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h
index 4f944e42f..6a5fd499c 100644
--- a/src/GPU3D_Compute.h
+++ b/src/GPU3D_Compute.h
@@ -166,6 +166,7 @@ class ComputeRenderer : public Renderer3D
     int TileSize;
     static constexpr int CoarseTileCountX = 8;
     int CoarseTileCountY;
+    int CoarseTileArea;
     int CoarseTileW;
     int CoarseTileH;
 
diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h
index 9fb4aae76..9b3190295 100644
--- a/src/GPU3D_Compute_shaders.h
+++ b/src/GPU3D_Compute_shaders.h
@@ -862,7 +862,7 @@ const std::string BinCombined =
     XSpanSetupBuffer +
     WorkDescBuffer + R"(
 
-layout (local_size_x = 32) in;
+layout (local_size_x = CoarseTileArea) in;
 
 bool BinPolygon(Polygon polygon, ivec2 topLeft, ivec2 botRight)
 {

From 9c87d9998f1ad793b7ea442aed8407851d65b750 Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Thu, 13 Jun 2024 00:46:05 +0200
Subject: [PATCH 8/9] Work count X is unrelated to the local size here

---
 src/GPU3D_Compute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index ecf485440..93aac5ce3 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -962,7 +962,7 @@ void ComputeRenderer::RenderFrame(GPU& gpu)
 
         // bin polygons
         glUseProgram(ShaderBinCombined);
-        glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + CoarseTileArea - 1) / CoarseTileArea), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
+        glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
         glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         // calculate list offsets

From 6f8ce9fe897fab0277754209122df4181fb4a177 Mon Sep 17 00:00:00 2001
From: FireNX70 <firenx70@gmail.com>
Date: Thu, 13 Jun 2024 07:57:49 +0200
Subject: [PATCH 9/9] Adjust ClearCoarseBinMask's local size according to
 TileSize

---
 src/GPU3D_Compute.cpp       | 5 ++++-
 src/GPU3D_Compute.h         | 1 +
 src/GPU3D_Compute_shaders.h | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index 93aac5ce3..16a3d80a8 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -59,6 +59,8 @@ bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, c
     shaderSource += std::to_string(CoarseTileCountY) + ";";
     shaderSource += "\n#define CoarseTileArea ";
     shaderSource += std::to_string(CoarseTileArea);
+    shaderSource += "\n#define ClearCoarseBinMaskLocalSize ";
+    shaderSource += std::to_string(ClearCoarseBinMaskLocalSize);
 
     shaderSource += ComputeRendererShaders::Common;
     shaderSource += source;
@@ -332,6 +334,7 @@ void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinate
     
     TileSize = std::min(8 * TileScale, 32);
     CoarseTileCountY = TileSize < 32 ? 4 : 6;
+    ClearCoarseBinMaskLocalSize = TileSize < 32 ? 64 : 48;
     CoarseTileArea = CoarseTileCountX * CoarseTileCountY;
     CoarseTileW = CoarseTileCountX * TileSize;
     CoarseTileH = CoarseTileCountY * TileSize;
@@ -944,7 +947,7 @@ void ComputeRenderer::RenderFrame(GPU& gpu)
     glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory);
 
     glUseProgram(ShaderClearCoarseBinMask);
-    glDispatchCompute(TilesPerLine*TileLines/32, 1, 1);
+    glDispatchCompute(TilesPerLine*TileLines/ClearCoarseBinMaskLocalSize, 1, 1);
 
     bool wbuffer = false;
     if (numYSpans > 0)
diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h
index 6a5fd499c..30766ec7a 100644
--- a/src/GPU3D_Compute.h
+++ b/src/GPU3D_Compute.h
@@ -169,6 +169,7 @@ class ComputeRenderer : public Renderer3D
     int CoarseTileArea;
     int CoarseTileW;
     int CoarseTileH;
+    int ClearCoarseBinMaskLocalSize;
 
     static constexpr int BinStride = 2048/32;
     static constexpr int CoarseBinStride = BinStride/32;
diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h
index 9b3190295..556346109 100644
--- a/src/GPU3D_Compute_shaders.h
+++ b/src/GPU3D_Compute_shaders.h
@@ -846,7 +846,7 @@ void main()
 
 const std::string ClearCoarseBinMask =
     BinningBuffer + R"(
-layout (local_size_x = 32) in;
+layout (local_size_x = ClearCoarseBinMaskLocalSize) in;
 
 void main()
 {
openSUSE Build Service is sponsored by