File melonds_compute-shader-renderer.patch of Package melonds

From 924415716341d244996f20baba8883ecc26dcc7e Mon Sep 17 00:00:00 2001
From: v-fox <virtuousfox@gmail.com>
Date: Tue, 31 Oct 2023 16:33:56 +0500
Subject: [PATCH] compute shader renderer (squashed)

---
 src/CMakeLists.txt                          |    5 +
 src/DMA.cpp                                 |    1 +
 src/DSi_NDMA.cpp                            |    1 +
 src/GPU.cpp                                 |    8 +-
 src/GPU.h                                   |   13 +-
 src/GPU2D.cpp                               |    1 +
 src/GPU2D_Soft.cpp                          |    3 +-
 src/GPU3D.cpp                               |    1 +
 src/GPU3D.h                                 |    5 +
 src/GPU3D_Compute.cpp                       | 1037 ++++++++++++
 src/GPU3D_Compute.h                         |  227 +++
 src/GPU3D_Compute_shaders.h                 | 1665 +++++++++++++++++++
 src/GPU3D_OpenGL.cpp                        |  210 +--
 src/GPU3D_OpenGL.h                          |   26 +-
 src/GPU3D_Texcache.cpp                      |  269 +++
 src/GPU3D_Texcache.h                        |  309 ++++
 src/GPU3D_TexcacheOpenGL.cpp                |   29 +
 src/GPU3D_TexcacheOpenGL.h                  |   25 +
 src/GPU_OpenGL.cpp                          |   45 +-
 src/GPU_OpenGL.h                            |    6 +-
 src/NDS.cpp                                 |    1 +
 src/NonStupidBitfield.h                     |   66 +
 src/OpenGLSupport.cpp                       |  330 +++-
 src/OpenGLSupport.h                         |   21 +-
 src/frontend/qt_sdl/Config.cpp              |    5 +-
 src/frontend/qt_sdl/Config.h                |    1 +
 src/frontend/qt_sdl/OSD.cpp                 |   27 +-
 src/frontend/qt_sdl/VideoSettingsDialog.cpp |   61 +-
 src/frontend/qt_sdl/VideoSettingsDialog.h   |    3 +
 src/frontend/qt_sdl/VideoSettingsDialog.ui  |   90 +-
 src/frontend/qt_sdl/main.cpp                |   40 +-
 src/frontend/qt_sdl/main.h                  |    2 +-
 32 files changed, 4189 insertions(+), 344 deletions(-)
 create mode 100644 src/GPU3D_Compute.cpp
 create mode 100644 src/GPU3D_Compute.h
 create mode 100644 src/GPU3D_Compute_shaders.h
 create mode 100644 src/GPU3D_Texcache.cpp
 create mode 100644 src/GPU3D_Texcache.h
 create mode 100644 src/GPU3D_TexcacheOpenGL.cpp
 create mode 100644 src/GPU3D_TexcacheOpenGL.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9fe93ae..a02fcaf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -35,6 +35,8 @@ add_library(core STATIC
     GPU2D_Soft.cpp
     GPU3D.cpp
     GPU3D_Soft.cpp
+    GPU3D_Texcache.cpp
+    GPU3D_Texcache.h
     melonDLDI.h
     NDS.cpp
     NDSCart.cpp
@@ -76,6 +78,9 @@ if (ENABLE_OGLRENDERER)
         GPU_OpenGL.cpp
         GPU_OpenGL_shaders.h
         GPU3D_OpenGL.cpp
+        GPU3D_Compute.cpp
+        GPU3D_TexcacheOpenGL.cpp
+        GPU3D_TexcacheOpenGL.h
         GPU3D_OpenGL_shaders.h
         OpenGLSupport.cpp)
 
diff --git a/src/DMA.cpp b/src/DMA.cpp
index a7558ff..51a5ea8 100644
--- a/src/DMA.cpp
+++ b/src/DMA.cpp
@@ -21,6 +21,7 @@
 #include "DSi.h"
 #include "DMA.h"
 #include "GPU.h"
+#include "GPU3D.h"
 #include "DMA_Timings.h"
 #include "Platform.h"
 
diff --git a/src/DSi_NDMA.cpp b/src/DSi_NDMA.cpp
index dba920c..4db18f3 100644
--- a/src/DSi_NDMA.cpp
+++ b/src/DSi_NDMA.cpp
@@ -22,6 +22,7 @@
 #include "DSi_NDMA.h"
 #include "GPU.h"
 #include "DSi_AES.h"
+#include "GPU3D.h"
 
 using Platform::Log;
 using Platform::LogLevel;
diff --git a/src/GPU.cpp b/src/GPU.cpp
index 630e88d..bc822c7 100644
--- a/src/GPU.cpp
+++ b/src/GPU.cpp
@@ -25,6 +25,7 @@
 #endif
 
 #include "GPU2D_Soft.h"
+#include "GPU3D.h"
 
 using Platform::Log;
 using Platform::LogLevel;
@@ -395,7 +396,7 @@ void AssignFramebuffers()
 void InitRenderer(int renderer)
 {
 #ifdef OGLRENDERER_ENABLED
-    if (renderer == 1)
+    if (renderer != renderer3D_Software)
     {
         CurGLCompositor = GLCompositor::New();
         // Create opengl renderer
@@ -405,7 +406,10 @@ void InitRenderer(int renderer)
             renderer = 0;
             GPU3D::CurrentRenderer = std::make_unique<GPU3D::SoftRenderer>();
         }
-        GPU3D::CurrentRenderer = GPU3D::GLRenderer::New();
+        if (renderer == renderer3D_OpenGL)
+            GPU3D::CurrentRenderer = GPU3D::GLRenderer::New();
+        else if (renderer == renderer3D_OpenGLCompute)
+            GPU3D::CurrentRenderer = GPU3D::ComputeRenderer::New();
         if (!GPU3D::CurrentRenderer)
         {
             // Fallback on software renderer
diff --git a/src/GPU.h b/src/GPU.h
index cec8a2d..b4692b6 100644
--- a/src/GPU.h
+++ b/src/GPU.h
@@ -160,6 +160,7 @@ struct RenderSettings
 
     int GL_ScaleFactor;
     bool GL_BetterPolygons;
+    bool GL_HiresCoordinates;
 };
 
 
@@ -170,6 +171,16 @@ void Stop();
 
 void DoSavestate(Savestate* file);
 
+enum // identifiers of the 3D renderer backends; InitRenderer() compares its `renderer` argument against these
+{
+    renderer3D_Software = 0,
+#ifdef OGLRENDERER_ENABLED
+    renderer3D_OpenGL, // selects GPU3D::GLRenderer in InitRenderer()
+    renderer3D_OpenGLCompute, // selects GPU3D::ComputeRenderer in InitRenderer()
+#endif
+    renderer3D_Max, // number of backends available in this build (1 without OGLRENDERER_ENABLED, 3 with it)
+};
+
 void InitRenderer(int renderer);
 void DeInitRenderer();
 void ResetRenderer();
@@ -618,6 +629,4 @@ void SetDispStat(u32 cpu, u16 val);
 void SetVCount(u16 val);
 }
 
-#include "GPU3D.h"
-
 #endif
diff --git a/src/GPU2D.cpp b/src/GPU2D.cpp
index d5df992..fe01ebb 100644
--- a/src/GPU2D.cpp
+++ b/src/GPU2D.cpp
@@ -20,6 +20,7 @@
 #include <string.h>
 #include "NDS.h"
 #include "GPU.h"
+#include "GPU3D.h"
 
 using Platform::Log;
 using Platform::LogLevel;
diff --git a/src/GPU2D_Soft.cpp b/src/GPU2D_Soft.cpp
index 070079a..3704de9 100644
--- a/src/GPU2D_Soft.cpp
+++ b/src/GPU2D_Soft.cpp
@@ -18,6 +18,7 @@
 
 #include "GPU2D_Soft.h"
 #include "GPU.h"
+#include "GPU3D.h"
 
 namespace GPU2D
 {
@@ -367,7 +368,7 @@ void SoftRenderer::VBlankEnd(Unit* unitA, Unit* unitB)
     {
         if ((unitA->CaptureCnt & (1<<31)) && (((unitA->CaptureCnt >> 29) & 0x3) != 1))
         {
-            reinterpret_cast<GPU3D::GLRenderer*>(GPU3D::CurrentRenderer.get())->PrepareCaptureFrame();
+            GPU3D::CurrentRenderer.get()->PrepareCaptureFrame();
         }
     }
 #endif
diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp
index 8cc380a..10b3dce 100644
--- a/src/GPU3D.cpp
+++ b/src/GPU3D.cpp
@@ -23,6 +23,7 @@
 #include "GPU.h"
 #include "FIFO.h"
 #include "Platform.h"
+#include "GPU3D.h"
 
 using Platform::Log;
 using Platform::LogLevel;
diff --git a/src/GPU3D.h b/src/GPU3D.h
index 44d422a..59c958d 100644
--- a/src/GPU3D.h
+++ b/src/GPU3D.h
@@ -156,6 +156,10 @@ public:
     virtual void RenderFrame() = 0;
     virtual void RestartFrame() {};
     virtual u32* GetLine(int line) = 0;
+
+    virtual void SetupAccelFrame() {}
+    virtual void PrepareCaptureFrame() {}
+
 protected:
     Renderer3D(bool Accelerated);
 };
@@ -169,6 +173,7 @@ extern std::unique_ptr<Renderer3D> CurrentRenderer;
 
 #ifdef OGLRENDERER_ENABLED
 #include "GPU3D_OpenGL.h"
+#include "GPU3D_Compute.h"
 #endif
 
 #endif
diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
new file mode 100644
index 0000000..8d500a4
--- /dev/null
+++ b/src/GPU3D_Compute.cpp
@@ -0,0 +1,1037 @@
+/*
+    Copyright 2016-2022 melonDS team
+
+    This file is part of melonDS.
+
+    melonDS is free software: you can redistribute it and/or modify it under
+    the terms of the GNU General Public License as published by the Free
+    Software Foundation, either version 3 of the License, or (at your option)
+    any later version.
+
+    melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
+    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with melonDS. If not, see http://www.gnu.org/licenses/.
+*/
+
+#include "GPU3D_Compute.h"
+
+#include <assert.h>
+
+#include "OpenGLSupport.h"
+
+#include "GPU3D_Compute_shaders.h"
+
+namespace GPU3D
+{
+
+ComputeRenderer::ComputeRenderer() // creates no GL objects itself; New() does that after construction
+    : Renderer3D(true), Texcache(TexcacheOpenGLLoader()) // `true` is the base class's `Accelerated` constructor argument
+{}
+
+bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines)
+{
+    // Comma-terminated list of the defines; handed to CompileComputeProgram as its last argument.
+    std::string name;
+    // Complete GLSL text: version line, one #define per entry, size constants, common code, program code.
+    std::string glsl = "#version 430 core\n";
+    for (const char* macro : defines)
+    {
+        glsl.append("#define ").append(macro).append(1, '\n');
+        name.append(macro).append(1, ',');
+    }
+
+    // The current render resolution and work-tile budget are baked in as preprocessor constants.
+    glsl.append("#define ScreenWidth ").append(std::to_string(ScreenWidth));
+    glsl.append("\n#define ScreenHeight ").append(std::to_string(ScreenHeight));
+    glsl.append("\n#define MaxWorkTiles ").append(std::to_string(MaxWorkTiles));
+
+    // The shared code goes first, followed by the source of the requested program.
+    glsl.append(ComputeRendererShaders::Common);
+    glsl.append(source);
+
+    const char* const glslText = glsl.c_str();
+    const char* const nameText = name.c_str();
+    return OpenGL::CompileComputeProgram(shader, glslText, nameText);
+}
+
+void blah(GLenum source,GLenum type,GLuint id,GLenum severity,GLsizei length,const GLchar *message,const void *userParam) // debug-message callback; its registration via glDebugMessageCallback in New() is currently commented out
+{
+    printf("%s\n", message); // only the message text is printed; all other arguments are ignored
+}
+
+std::unique_ptr<ComputeRenderer> ComputeRenderer::New() // builds the renderer and the GL objects whose size does not depend on the scale factor
+{
+    std::unique_ptr<ComputeRenderer> result(new ComputeRenderer());
+
+    //glDebugMessageCallback(blah, NULL);
+    //glEnable(GL_DEBUG_OUTPUT);
+    glGenBuffers(1, &result->YSpanSetupMemory);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->YSpanSetupMemory);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW);
+
+    glGenBuffers(1, &result->RenderPolygonMemory);
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->RenderPolygonMemory);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW);
+
+    glGenBuffers(1, &result->XSpanSetupMemory); // this and the following buffers are only named here; SetRenderSettings() allocates their storage
+    glGenBuffers(1, &result->BinResultMemory);
+    glGenBuffers(1, &result->FinalTileMemory);
+    glGenBuffers(1, &result->YSpanIndicesTextureMemory);
+    glGenBuffers(tilememoryLayer_Num, result->TileMemory);
+    glGenBuffers(1, &result->WorkDescMemory);
+
+    glGenTextures(1, &result->YSpanIndicesTexture);
+    glGenTextures(1, &result->LowResFramebuffer);
+    glBindTexture(GL_TEXTURE_2D, result->LowResFramebuffer);
+    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192); // fixed 256x192 storage, independent of the scale factor
+
+    glGenBuffers(1, &result->MetaUniformMemory);
+    glBindBuffer(GL_UNIFORM_BUFFER, result->MetaUniformMemory);
+    glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW);
+
+    glGenSamplers(9, result->Samplers); // one sampler per (S wrap, T wrap) pair, stored at index i + j*3
+    for (u32 j = 0; j < 3; j++) // j selects the wrap mode for T
+    {
+        for (u32 i = 0; i < 3; i++) // i selects the wrap mode for S
+        {
+            const GLenum translateWrapMode[3] = {GL_CLAMP_TO_EDGE, GL_REPEAT, GL_MIRRORED_REPEAT};
+            glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_WRAP_S, translateWrapMode[i]);
+            glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_WRAP_T, translateWrapMode[j]);
+            glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+            glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_MAG_FILTER, GL_NEAREST); // enum-valued parameter: use the integer setter like the lines above
+        }
+    }
+
+    glGenBuffers(1, &result->PixelBuffer);
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, result->PixelBuffer);
+    glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, nullptr, GL_DYNAMIC_READ); // 256x192 pixels, 4 bytes each
+
+    return result;
+}
+
+ComputeRenderer::~ComputeRenderer()
+{
+    Texcache.Reset();
+    // Release the GL buffers, textures and samplers owned by the renderer; shader programs are not deleted here.
+    glDeleteBuffers(1, &YSpanSetupMemory);
+    glDeleteBuffers(1, &RenderPolygonMemory);
+    glDeleteBuffers(1, &XSpanSetupMemory);
+    glDeleteBuffers(1, &BinResultMemory);
+    glDeleteBuffers(tilememoryLayer_Num, TileMemory);
+    glDeleteBuffers(1, &WorkDescMemory);
+    glDeleteBuffers(1, &FinalTileMemory);
+    glDeleteBuffers(1, &YSpanIndicesTextureMemory);
+    glDeleteTextures(1, &YSpanIndicesTexture);
+    glDeleteTextures(1, &Framebuffer);
+    glDeleteTextures(1, &LowResFramebuffer); // created in New(); was previously never deleted here
+    glDeleteBuffers(1, &MetaUniformMemory);
+
+    glDeleteSamplers(9, Samplers);
+    glDeleteBuffers(1, &PixelBuffer);
+}
+
+void ComputeRenderer::DeleteShaders()
+{
+    const GLuint programs[] = // every compute program that SetRenderSettings() compiles
+    {
+        ShaderInterpXSpans[0],
+        ShaderInterpXSpans[1],
+        ShaderBinCombined,
+        ShaderDepthBlend[0],
+        ShaderDepthBlend[1],
+        ShaderRasteriseNoTexture[0],
+        ShaderRasteriseNoTexture[1],
+        ShaderRasteriseNoTextureToon[0],
+        ShaderRasteriseNoTextureToon[1],
+        ShaderRasteriseNoTextureHighlight[0],
+        ShaderRasteriseNoTextureHighlight[1],
+        ShaderRasteriseUseTextureDecal[0],
+        ShaderRasteriseUseTextureDecal[1],
+        ShaderRasteriseUseTextureModulate[0],
+        ShaderRasteriseUseTextureModulate[1],
+        ShaderRasteriseUseTextureToon[0],
+        ShaderRasteriseUseTextureToon[1],
+        ShaderRasteriseUseTextureHighlight[0],
+        ShaderRasteriseUseTextureHighlight[1],
+        ShaderRasteriseShadowMask[0],
+        ShaderRasteriseShadowMask[1],
+        ShaderClearCoarseBinMask,
+        ShaderClearIndirectWorkCount,
+        ShaderCalculateWorkListOffset,
+        ShaderSortWork,
+        ShaderFinalPass[0],
+        ShaderFinalPass[1],
+        ShaderFinalPass[2],
+        ShaderFinalPass[3],
+        ShaderFinalPass[4],
+        ShaderFinalPass[5],
+        ShaderFinalPass[6],
+        ShaderFinalPass[7],
+    };
+    for (const GLuint id : programs)
+        glDeleteProgram(id);
+}
+
+void ComputeRenderer::Reset() // resets the texture cache; no other renderer state is touched here
+{
+    Texcache.Reset();
+}
+
+void ComputeRenderer::SetRenderSettings(GPU::RenderSettings& settings) // (re)allocates all resolution-dependent GL storage and recompiles every compute program
+{
+    if (ScaleFactor != -1) // -1 is treated as "not configured yet" (presumably the member's initial value; confirm in the header)
+    {
+        DeleteShaders();
+    }
+
+    ScaleFactor = settings.GL_ScaleFactor;
+    ScreenWidth = 256 * ScaleFactor;
+    ScreenHeight = 192 * ScaleFactor;
+
+    TilesPerLine = ScreenWidth/TileSize;
+    TileLines = ScreenHeight/TileSize;
+
+    HiresCoordinates = settings.GL_HiresCoordinates;
+
+    MaxWorkTiles = TilesPerLine*TileLines*8; // also baked into the shaders as a #define by CompileShader()
+
+    for (int i = 0; i < tilememoryLayer_Num; i++)
+    {
+        glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory[i]);
+        glBufferData(GL_SHADER_STORAGE_BUFFER, 4*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW); // 4 bytes for each pixel of each work tile
+    }
+
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW);
+
+    int binResultSize = sizeof(BinResultHeader)
+        + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse
+        + TilesPerLine*TileLines*BinStride*4 // BinnedMask
+        + TilesPerLine*TileLines*BinStride*4; // WorkOffsets
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW);
+
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, WorkDescMemory);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, MaxWorkTiles*2*4*2, nullptr, GL_DYNAMIC_DRAW);
+
+    if (Framebuffer != 0) // texture storage is immutable, so the texture is recreated for the new size
+        glDeleteTextures(1, &Framebuffer);
+    glGenTextures(1, &Framebuffer);
+    glBindTexture(GL_TEXTURE_2D, Framebuffer);
+    glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight);
+
+    // these capacities are rough guesses
+    // though real hw shouldn't be able to render all 2048 polygons on every line either
+    int maxYSpanIndices = 64*2048 * ScaleFactor;
+    YSpanIndices.resize(maxYSpanIndices);
+
+    glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
+    glBufferData(GL_TEXTURE_BUFFER, maxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW);
+
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*maxYSpanIndices, nullptr, GL_DYNAMIC_DRAW);
+
+    glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture);
+    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory); // exposes the index buffer as a buffer texture with RGBA16UI texels
+
+    CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"}); // in every two-element array: [0] = ZBuffer build, [1] = WBuffer build
+    CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"}); // NOTE(review): the bool result of CompileShader is ignored for all of these calls
+    CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"});
+    CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"});
+    CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"});
+    CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture"});
+    CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"});
+    CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"});
+    CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"});
+    CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"});
+    CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"});
+    CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"});
+    CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"});
+    CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"});
+    CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"});
+    CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"});
+    CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"});
+    CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"});
+    CompileShader(ShaderRasteriseUseTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"});
+    CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"});
+    CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"});
+    CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"});
+    CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"});
+    CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"});
+    CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"});
+    CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"}); // ShaderFinalPass index is a bit mask: 1 = EdgeMarking, 2 = Fog, 4 = AntiAliasing
+    CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"});
+    CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"});
+    CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"});
+    CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"});
+    CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"});
+    CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"});
+    CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"});
+}
+
+void ComputeRenderer::VCount144()
+{
+    // no work is performed by this renderer here
+}
+
+void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to)
+{
+    span->Z0 = poly->FinalZ[from]; // "0" fields: depth, colour and texcoords of vertex `from`
+    span->W0 = poly->FinalW[from];
+    span->ColorR0 = poly->Vertices[from]->FinalColor[0];
+    span->ColorG0 = poly->Vertices[from]->FinalColor[1];
+    span->ColorB0 = poly->Vertices[from]->FinalColor[2];
+    span->TexcoordU0 = poly->Vertices[from]->TexCoords[0];
+    span->TexcoordV0 = poly->Vertices[from]->TexCoords[1];
+    span->Z1 = poly->FinalZ[to]; // "1" fields: the same attributes taken from vertex `to`
+    span->W1 = poly->FinalW[to];
+    span->ColorR1 = poly->Vertices[to]->FinalColor[0];
+    span->ColorG1 = poly->Vertices[to]->FinalColor[1];
+    span->ColorB1 = poly->Vertices[to]->FinalColor[2];
+    span->TexcoordU1 = poly->Vertices[to]->TexCoords[0];
+    span->TexcoordV1 = poly->Vertices[to]->TexCoords[1];
+}
+
+void ComputeRenderer::SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2])
+{
+    // A dummy span starts and ends at the same vertex, so all of its slope terms are zero.
+    // When `side` is nonzero the X coordinate is moved one pixel to the left.
+    const s32 column = positions[vertex][0] - (side ? 1 : 0);
+    const s32 row = positions[vertex][1];
+
+    span->IsDummy = true;
+    span->Linear = true;
+    span->DxInitial = side ? -0x40000 : 0;
+    span->Increment = 0;
+    span->XCovIncr = 0;
+    span->IRecip = 0;
+    span->I1 = span->IRecip;
+    span->I0 = span->I1;
+
+    span->X1 = column;
+    span->X0 = span->X1;
+    span->XMin = column;
+    span->XMax = column;
+    span->Y1 = row;
+    span->Y0 = span->Y1;
+
+    // Widen the polygon's horizontal bounds and record the row at which each extreme occurs.
+    const bool newLeftmost = span->XMin < rp->XMin;
+    if (newLeftmost)
+    {
+        rp->XMin = span->XMin;
+        rp->XMinY = span->Y0;
+    }
+    const bool newRightmost = span->XMax > rp->XMax;
+    if (newRightmost)
+    {
+        rp->XMax = span->XMax;
+        rp->XMaxY = span->Y0;
+    }
+
+    // Both ends of the span take their attributes from the same vertex.
+    SetupAttrs(span, poly, vertex, vertex);
+}
+
+void ComputeRenderer::SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]) // fills *span for the edge from vertex `from` to vertex `to`; side: 0 = left edge, nonzero = right edge
+{
+    span->X0 = positions[from][0];
+    span->X1 = positions[to][0];
+    span->Y0 = positions[from][1];
+    span->Y1 = positions[to][1];
+
+    SetupAttrs(span, poly, from, to);
+
+    s32 minXY, maxXY; // Y coordinates recorded in rp->XMinY / rp->XMaxY when this edge extends the polygon's X bounds
+    bool negative = false; // true when X decreases from the start vertex to the end vertex
+    if (span->X1 > span->X0)
+    {
+        span->XMin = span->X0;
+        span->XMax = span->X1-1;
+
+        minXY = span->Y0;
+        maxXY = span->Y1;
+    }
+    else if (span->X1 < span->X0)
+    {
+        span->XMin = span->X1;
+        span->XMax = span->X0-1;
+        negative = true;
+
+        minXY = span->Y1;
+        maxXY = span->Y0;
+    }
+    else
+    {
+        span->XMin = span->X0;
+        if (side) span->XMin--;
+        span->XMax = span->XMin;
+
+        // doesn't matter for completely vertical slope
+        minXY = span->Y0;
+        maxXY = span->Y0;
+    }
+
+    if (span->XMin < rp->XMin) // widen the polygon's horizontal bounds
+    {
+        rp->XMin = span->XMin;
+        rp->XMinY = minXY;
+    }
+    if (span->XMax > rp->XMax)
+    {
+        rp->XMax = span->XMax;
+        rp->XMaxY = maxXY;
+    }
+
+    span->IsDummy = false;
+
+    s32 xlen = span->XMax+1 - span->XMin;
+    s32 ylen = span->Y1 - span->Y0;
+
+    // slope increment has a 18-bit fractional part
+    // note: for some reason, x/y isn't calculated directly,
+    // instead, 1/y is calculated and then multiplied by x
+    // TODO: this is still not perfect (see for example x=169 y=33)
+    if (ylen == 0)
+    {
+        span->Increment = 0;
+    }
+    else if (ylen == xlen)
+    {
+        span->Increment = 0x40000; // exactly 1.0 in the 18-bit fixed-point format
+    }
+    else
+    {
+        s32 yrecip = (1<<18) / ylen;
+        span->Increment = (span->X1-span->X0) * yrecip;
+        if (span->Increment < 0) span->Increment = -span->Increment; // the magnitude is stored; direction is tracked by `negative`
+    }
+
+    bool xMajor = (span->Increment > 0x40000); // X advances by more than one pixel per scanline
+
+    if (side)
+    {
+        // right
+
+        if (xMajor)
+            span->DxInitial = negative ? (0x20000 + 0x40000) : (span->Increment - 0x20000);
+        else if (span->Increment != 0)
+            span->DxInitial = negative ? 0x40000 : 0;
+        else
+            span->DxInitial = -0x40000;
+    }
+    else
+    {
+        // left
+
+        if (xMajor)
+            span->DxInitial = negative ? ((span->Increment - 0x20000) + 0x40000) : 0x20000;
+        else if (span->Increment != 0)
+            span->DxInitial = negative ? 0x40000 : 0;
+        else
+            span->DxInitial = 0;
+    }
+
+    if (xMajor) // I0/I1 are the end points of the axis used for interpolation: X for x-major edges, Y otherwise
+    {
+        if (side)
+        {
+            span->I0 = span->X0 - 1;
+            span->I1 = span->X1 - 1;
+        }
+        else
+        {
+            span->I0 = span->X0;
+            span->I1 = span->X1;
+        }
+
+        // used for calculating AA coverage
+        span->XCovIncr = (ylen << 10) / xlen;
+    }
+    else // NOTE(review): XCovIncr is not written on this path
+    {
+        span->I0 = span->Y0;
+        span->I1 = span->Y1;
+    }
+
+    if (span->I0 != span->I1)
+        span->IRecip = (1<<30) / (span->I1 - span->I0); // reciprocal of the interpolation range, scaled by 2^30
+    else
+        span->IRecip = 0;
+
+    span->Linear = (span->W0 == span->W1) && !(span->W0 & 0x7E) && !(span->W1 & 0x7E); // equal W at both ends with bits 1-6 clear
+
+    if ((span->W0 & 0x1) && !(span->W1 & 0x1)) // W0 odd and W1 even: the halved W0 is rounded down for W0n and up for W0d
+    {
+        span->W0n = (span->W0 - 1) >> 1;
+        span->W0d = (span->W0 + 1) >> 1;
+        span->W1d = span->W1 >> 1;
+    }
+    else
+    {
+        span->W0n = span->W0 >> 1;
+        span->W0d = span->W0 >> 1;
+        span->W1d = span->W1 >> 1;
+    }
+}
+
+struct Variant // batching key used by RenderFrame(): texture, sampler and blend mode, plus the texture's dimensions
+{
+    GLuint Texture, Sampler;
+    u16 Width, Height;
+    u8 BlendMode;
+
+    bool operator==(const Variant& other) const // const-qualified so const operands can be compared; Width/Height are not part of the comparison
+    {
+        return Texture == other.Texture && Sampler == other.Sampler && BlendMode == other.BlendMode;
+    }
+};
+
+/*
+    Antialiasing
+    W-Buffer
+    With Texture
+    0
+    1, 3
+    2
+    without Texture
+    2
+    0, 1, 3
+
+    => 20 Shader + 1x Shadow Mask
+*/
+
+void ComputeRenderer::RenderFrame()
+{
+    //printf("render frame\n");
+
+    if (!Texcache.Update() && RenderFrameIdentical)
+    {
+        return;
+    }
+
+    int numYSpans = 0;
+    int numSetupIndices = 0;
+
+    /*
+        Some games really like to spam small textures, often
+        to store the data like PPU tiles. E.g. Shantae
+        or some Mega Man game. Fortunately they are usually kind
+        enough to not vary the texture size all too often (usually
+        they just use 8x8 or 16x for everything).
+
+        This is the reason we have this whole mess where textures of
+        the same size are put into array textures. This allows
+        to increase the batch size.
+        Less variance between each Variant hah!
+    */
+    u32 numVariants = 0, prevVariant, prevTexLayer;
+    Variant variants[MaxVariants];
+
+    bool enableTextureMaps = RenderDispCnt & (1<<0);
+
+    for (int i = 0; i < RenderNumPolygons; i++)
+    {
+        Polygon* polygon = RenderPolygonRAM[i];
+
+        u32 nverts = polygon->NumVertices;
+        u32 vtop = polygon->VTop, vbot = polygon->VBottom;
+
+        u32 curVL = vtop, curVR = vtop;
+        u32 nextVL, nextVR;
+
+        RenderPolygons[i].FirstXSpan = numSetupIndices;
+        RenderPolygons[i].Attr = polygon->Attr;
+
+        bool foundVariant = false;
+        if (i > 0)
+        {
+            // if the whole texture attribute matches
+            // the texture layer will also match
+            Polygon* prevPolygon = RenderPolygonRAM[i - 1];
+            foundVariant = prevPolygon->TexParam == polygon->TexParam
+                && prevPolygon->TexPalette == polygon->TexPalette
+                && (prevPolygon->Attr & 0x30) == (polygon->Attr & 0x30)
+                && prevPolygon->IsShadowMask == polygon->IsShadowMask;
+        }
+
+        if (!foundVariant)
+        {
+            Variant variant;
+            variant.BlendMode = polygon->IsShadowMask ? 4 : ((polygon->Attr >> 4) & 0x3);
+            variant.Texture = 0;
+            variant.Sampler = 0;
+            u32* textureLastVariant = nullptr;
+            // we always need to look up the texture to get the layer of the array texture
+            if (enableTextureMaps && (polygon->TexParam >> 26) & 0x7)
+            {
+                Texcache.GetTexture(polygon->TexParam, polygon->TexPalette, variant.Texture, prevTexLayer, textureLastVariant);
+                bool wrapS = (polygon->TexParam >> 16) & 1;
+                bool wrapT = (polygon->TexParam >> 17) & 1;
+                bool mirrorS = (polygon->TexParam >> 18) & 1;
+                bool mirrorT = (polygon->TexParam >> 19) & 1;
+                variant.Sampler = Samplers[(wrapS ? (mirrorS ? 2 : 1) : 0) + (wrapT ? (mirrorT ? 2 : 1) : 0) * 3];
+
+                if (*textureLastVariant < numVariants && variants[*textureLastVariant] == variant)
+                {
+                    foundVariant = true;
+                    prevVariant = *textureLastVariant;
+                }
+            }
+
+            if (!foundVariant)
+            {
+                for (int j = numVariants - 1; j >= 0; j--)
+                {
+                    if (variants[j] == variant)
+                    {
+                        foundVariant = true;
+                        prevVariant = j;
+                        goto foundVariant;
+                    }
+                }
+
+                prevVariant = numVariants;
+                variants[numVariants] = variant;
+                variants[numVariants].Width = TextureWidth(polygon->TexParam);
+                variants[numVariants].Height = TextureHeight(polygon->TexParam);
+                numVariants++;
+                assert(numVariants <= MaxVariants);
+            foundVariant:;
+
+                if (textureLastVariant)
+                    *textureLastVariant = prevVariant;
+            }
+        }
+        RenderPolygons[i].Variant = prevVariant;
+        RenderPolygons[i].TextureLayer = (float)prevTexLayer;
+
+        if (polygon->FacingView)
+        {
+            nextVL = curVL + 1;
+            if (nextVL >= nverts) nextVL = 0;
+            nextVR = curVR - 1;
+            if ((s32)nextVR < 0) nextVR = nverts - 1;
+        }
+        else
+        {
+            nextVL = curVL - 1;
+            if ((s32)nextVL < 0) nextVL = nverts - 1;
+            nextVR = curVR + 1;
+            if (nextVR >= nverts) nextVR = 0;
+        }
+
+        s32 scaledPositions[10][2];
+        s32 ytop = ScreenHeight, ybot = 0;
+        for (int i = 0; i < polygon->NumVertices; i++)
+        {
+            if (HiresCoordinates)
+            {
+                scaledPositions[i][0] = (polygon->Vertices[i]->HiresPosition[0] * ScaleFactor) >> 4;
+                scaledPositions[i][1] = (polygon->Vertices[i]->HiresPosition[1] * ScaleFactor) >> 4;
+            }
+            else
+            {
+                scaledPositions[i][0] = polygon->Vertices[i]->FinalPosition[0] * ScaleFactor;
+                scaledPositions[i][1] = polygon->Vertices[i]->FinalPosition[1] * ScaleFactor;
+            }
+            ytop = std::min(scaledPositions[i][1], ytop);
+            ybot = std::max(scaledPositions[i][1], ybot);
+        }
+        RenderPolygons[i].YTop = ytop;
+        RenderPolygons[i].YBot = ybot;
+        RenderPolygons[i].XMin = ScreenWidth;
+        RenderPolygons[i].XMax = 0;
+
+        if (ybot == ytop)
+        {
+            vtop = 0; vbot = 0;
+
+            RenderPolygons[i].YBot++;
+
+            int j = 1;
+            if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
+            if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
+
+            j = nverts - 1;
+            if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j;
+            if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j;
+
+            assert(numYSpans < MaxYSpanSetups);
+            u32 curSpanL = numYSpans;
+            SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions);
+            assert(numYSpans < MaxYSpanSetups);
+            u32 curSpanR = numYSpans;
+            SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions);
+
+            YSpanIndices[numSetupIndices].PolyIdx = i;
+            YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
+            YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
+            YSpanIndices[numSetupIndices].Y = ytop;
+            numSetupIndices++;
+        }
+        else
+        {
+            u32 curSpanL = numYSpans;
+            assert(numYSpans < MaxYSpanSetups);
+            SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
+            u32 curSpanR = numYSpans;
+            assert(numYSpans < MaxYSpanSetups);
+            SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
+
+            for (u32 y = ytop; y < ybot; y++)
+            {
+                if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
+                {
+                    while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom)
+                    {
+                        curVL = nextVL;
+                        if (polygon->FacingView)
+                        {
+                            nextVL = curVL + 1;
+                            if (nextVL >= nverts)
+                                nextVL = 0;
+                        }
+                        else
+                        {
+                            nextVL = curVL - 1;
+                            if ((s32)nextVL < 0)
+                                nextVL = nverts - 1;
+                        }
+                    }
+
+
+                    assert(numYSpans < MaxYSpanSetups);
+                    curSpanL = numYSpans;
+                    SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions);
+                }
+                if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
+                {
+                    while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom)
+                    {
+                        curVR = nextVR;
+                        if (polygon->FacingView)
+                        {
+                            nextVR = curVR - 1;
+                            if ((s32)nextVR < 0)
+                                nextVR = nverts - 1;
+                        }
+                        else
+                        {
+                            nextVR = curVR + 1;
+                            if (nextVR >= nverts)
+                                nextVR = 0;
+                        }
+                    }
+
+                    assert(numYSpans < MaxYSpanSetups);
+                    curSpanR = numYSpans;
+                    SetupYSpan(&RenderPolygons[i] ,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions);
+                }
+
+                YSpanIndices[numSetupIndices].PolyIdx = i;
+                YSpanIndices[numSetupIndices].SpanIdxL = curSpanL;
+                YSpanIndices[numSetupIndices].SpanIdxR = curSpanR;
+                YSpanIndices[numSetupIndices].Y = y;
+                numSetupIndices++;
+            }
+        }
+
+        //printf("polygon min max %d %d | %d %d\n", RenderPolygons[i].XMin, RenderPolygons[i].XMinY, RenderPolygons[i].XMax, RenderPolygons[i].XMaxY);
+    }
+
+    /*for (u32 i = 0; i < RenderNumPolygons; i++)
+    {
+        if (RenderPolygons[i].Variant >= numVariants)
+        {
+            printf("blarb2 %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
+        }
+        //assert(RenderPolygons[i].Variant < numVariants);
+    }*/
+
+    if (numYSpans > 0)
+    {
+        glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory);
+        glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups);
+
+        glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory);
+        glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data());
+
+        glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory);
+        glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons);
+        // we haven't accessed image data yet, so we don't need to invalidate anything
+    }
+
+    //printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons);
+
+    // bind everything
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, RenderPolygonMemory);
+
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, YSpanSetupMemory);
+
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, BinResultMemory);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 7, WorkDescMemory);
+
+    MetaUniform meta;
+    meta.DispCnt = RenderDispCnt;
+    meta.NumPolygons = RenderNumPolygons;
+    meta.NumVariants = numVariants;
+    meta.AlphaRef = RenderAlphaRef;
+    {
+        u32 r = (RenderClearAttr1 << 1) & 0x3E; if (r) r++;
+        u32 g = (RenderClearAttr1 >> 4) & 0x3E; if (g) g++;
+        u32 b = (RenderClearAttr1 >> 9) & 0x3E; if (b) b++;
+        u32 a = (RenderClearAttr1 >> 16) & 0x1F;
+        meta.ClearColor = r | (g << 8) | (b << 16) | (a << 24);
+        meta.ClearDepth = ((RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF;
+        meta.ClearAttr = RenderClearAttr1 & 0x3F008000;
+    }
+    for (u32 i = 0; i < 32; i++)
+    {
+        u32 color = RenderToonTable[i];
+        u32 r = (color << 1) & 0x3E;
+        u32 g = (color >> 4) & 0x3E;
+        u32 b = (color >> 9) & 0x3E;
+        if (r) r++;
+        if (g) g++;
+        if (b) b++;
+
+        meta.ToonTable[i*4+0] = r | (g << 8) | (b << 16);
+    }
+    for (u32 i = 0; i < 34; i++)
+    {
+        meta.ToonTable[i*4+1] = RenderFogDensityTable[i];
+    }
+    for (u32 i = 0; i < 8; i++)
+    {
+        u32 color = RenderEdgeTable[i];
+        u32 r = (color << 1) & 0x3E;
+        u32 g = (color >> 4) & 0x3E;
+        u32 b = (color >> 9) & 0x3E;
+        if (r) r++;
+        if (g) g++;
+        if (b) b++;
+
+        meta.ToonTable[i*4+2] = r | (g << 8) | (b << 16);
+    }
+    meta.FogOffset = RenderFogOffset;
+    meta.FogShift = RenderFogShift;
+    {
+        u32 fogR = (RenderFogColor << 1) & 0x3E; if (fogR) fogR++;
+        u32 fogG = (RenderFogColor >> 4) & 0x3E; if (fogG) fogG++;
+        u32 fogB = (RenderFogColor >> 9) & 0x3E; if (fogB) fogB++;
+        u32 fogA = (RenderFogColor >> 16) & 0x1F;
+        meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24);
+    }
+
+    glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory);
+    glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta);
+    glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory);
+
+    glUseProgram(ShaderClearCoarseBinMask);
+    glDispatchCompute(TilesPerLine*TileLines/32, 1, 1);
+
+    bool wbuffer = false;
+    if (numYSpans > 0)
+    {
+        wbuffer = RenderPolygonRAM[0]->WBuffer;
+
+        glUseProgram(ShaderClearIndirectWorkCount);
+        glDispatchCompute((numVariants+31)/32, 1, 1);
+
+        // calculate x-spans
+        glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI);
+        glUseProgram(ShaderInterpXSpans[wbuffer]);
+        glDispatchCompute((numSetupIndices + 31) / 32, 1, 1);
+        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+
+        // bin polygons
+        glUseProgram(ShaderBinCombined);
+        glDispatchCompute(((RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
+        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+
+        // calculate list offsets
+        glUseProgram(ShaderCalculateWorkListOffset);
+        glDispatchCompute((numVariants + 31) / 32, 1, 1);
+        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+
+        // sort shader work
+        glUseProgram(ShaderSortWork);
+        glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
+        glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount));
+        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+
+        glActiveTexture(GL_TEXTURE0);
+
+        for (int i = 0; i < tilememoryLayer_Num; i++)
+            glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2+i, TileMemory[i]);
+
+        // rasterise
+        {
+            bool highLightMode = RenderDispCnt & (1<<1);
+
+            GLuint shadersNoTexture[] =
+            {
+                ShaderRasteriseNoTexture[wbuffer],
+                ShaderRasteriseNoTexture[wbuffer],
+                highLightMode
+                    ? ShaderRasteriseNoTextureHighlight[wbuffer]
+                    : ShaderRasteriseNoTextureToon[wbuffer],
+                ShaderRasteriseNoTexture[wbuffer],
+                ShaderRasteriseShadowMask[wbuffer]
+            };
+            GLuint shadersUseTexture[] =
+            {
+                ShaderRasteriseUseTextureModulate[wbuffer],
+                ShaderRasteriseUseTextureDecal[wbuffer],
+                highLightMode
+                    ? ShaderRasteriseUseTextureHighlight[wbuffer]
+                    : ShaderRasteriseUseTextureToon[wbuffer],
+                ShaderRasteriseUseTextureDecal[wbuffer],
+                ShaderRasteriseShadowMask[wbuffer]
+            };
+
+            GLuint prevShader = 0;
+            s32 prevTexture = 0, prevSampler = 0;
+            for (int i = 0; i < numVariants; i++)
+            {
+                GLuint shader = 0;
+                if (variants[i].Texture == 0)
+                {
+                    shader = shadersNoTexture[variants[i].BlendMode];
+                }
+                else
+                {
+                    shader = shadersUseTexture[variants[i].BlendMode];
+                    if (variants[i].Texture != prevTexture)
+                    {
+                        glBindTexture(GL_TEXTURE_2D_ARRAY, variants[i].Texture);
+                        prevTexture = variants[i].Texture;
+                    }
+                    if (variants[i].Sampler != prevSampler)
+                    {
+                        glBindSampler(0, variants[i].Sampler);
+                        prevSampler = variants[i].Sampler;
+                    }
+                }
+                assert(shader != 0);
+                if (shader != prevShader)
+                {
+                    glUseProgram(shader);
+                    prevShader = shader;
+                }
+
+                glUniform1ui(UniformIdxCurVariant, i);
+                glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height);
+                glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
+                glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4);
+            }
+        }
+    }
+    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+    // compose final image
+    glUseProgram(ShaderDepthBlend[wbuffer]);
+    glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1);
+    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+    glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8);
+    glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI);
+    u32 finalPassShader = 0;
+    if (RenderDispCnt & (1<<4))
+        finalPassShader |= 0x4;
+    if (RenderDispCnt & (1<<7))
+        finalPassShader |= 0x2;
+    if (RenderDispCnt & (1<<5))
+        finalPassShader |= 0x1;
+    
+    glUseProgram(ShaderFinalPass[finalPassShader]);
+    glDispatchCompute(ScreenWidth/32, ScreenHeight, 1);
+    glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
+
+    glBindSampler(0, 0);
+
+    /*u64 starttime = armGetSystemTick();
+    EmuQueue.waitIdle();
+    printf("total time %f\n", armTicksToNs(armGetSystemTick()-starttime)*0.000001f);*/
+
+    /*for (u32 i = 0; i < RenderNumPolygons; i++)
+    {
+        if (RenderPolygons[i].Variant >= numVariants)
+        {
+            printf("blarb %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons);
+        }
+        //assert(RenderPolygons[i].Variant < numVariants);
+    }*/
+
+    /*for (int i = 0; i < binresult->SortWorkWorkCount[0]*32; i++)
+    {
+        printf("sorted %x %x\n", binresult->SortedWork[i*2+0], binresult->SortedWork[i*2+1]);
+    }*/
+/*    if (polygonvisible != -1)
+    {
+        SpanSetupX* xspans = Gfx::DataHeap->CpuAddr<SpanSetupX>(XSpanSetupMemory);
+        printf("span result\n");
+        Polygon* poly = RenderPolygonRAM[polygonvisible];
+        u32 xspanoffset = RenderPolygons[polygonvisible].FirstXSpan;
+        for (u32 i = 0; i < (poly->YBottom - poly->YTop); i++)
+        {
+            printf("%d: %d - %d | %d %d | %d %d\n", i + poly->YTop, xspans[xspanoffset + i].X0, xspans[xspanoffset + i].X1, xspans[xspanoffset + i].__pad0, xspans[xspanoffset + i].__pad1, RenderPolygons[polygonvisible].YTop, RenderPolygons[polygonvisible].YBot);
+        }
+    }*/
+/*
+    printf("xspans: %d\n", numSetupIndices);
+    SpanSetupX* xspans = Gfx::DataHeap->CpuAddr<SpanSetupX>(XSpanSetupMemory[curSlice]);
+    for (int i = 0; i < numSetupIndices; i++)
+    {
+        printf("poly %d %d %d | line %d | %d to %d\n", YSpanIndices[i].PolyIdx, YSpanIndices[i].SpanIdxL, YSpanIndices[i].SpanIdxR, YSpanIndices[i].Y, xspans[i].X0, xspans[i].X1);
+    }
+    printf("bin result\n");
+    BinResult* binresult = Gfx::DataHeap->CpuAddr<BinResult>(BinResultMemory);
+    for (u32 y = 0; y < 192/8; y++)
+    {
+        for (u32 x = 0; x < 256/8; x++)
+        {
+            printf("%08x ", binresult->BinnedMaskCoarse[(x + y * (256/8)) * 2]);
+        }
+        printf("\n");
+    }*/
+}
+
+void ComputeRenderer::RestartFrame()
+{
+    // No-op: the compute renderer does nothing when a frame is restarted.
+}
+
+u32* ComputeRenderer::GetLine(int line)
+{
+    const int rowPixels = 256; // FramebufferCPU is 256 pixels wide (256*192 pixels in total)
+    if (0 == line)
+    {
+        // Whenever line 0 is requested, refresh the whole CPU copy from PixelBuffer.
+        glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer);
+        const void* mapped = glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
+        if (mapped != nullptr)
+            memcpy(FramebufferCPU, mapped, 4*rowPixels*192);
+        glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+    }
+    return FramebufferCPU + rowPixels * line;
+}
+
+void ComputeRenderer::SetupAccelFrame()
+{
+    glBindTexture(GL_TEXTURE_2D, Framebuffer); // make Framebuffer the current GL_TEXTURE_2D binding on the active texture unit
+}
+
+void ComputeRenderer::PrepareCaptureFrame()
+{
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer); // with a pack buffer bound, glGetTexImage writes into that buffer
+    glBindTexture(GL_TEXTURE_2D, LowResFramebuffer);
+    glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, nullptr); // nullptr = byte offset 0 inside PixelBuffer
+}
+
+}
\ No newline at end of file
diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h
new file mode 100644
index 0000000..b36bc18
--- /dev/null
+++ b/src/GPU3D_Compute.h
@@ -0,0 +1,227 @@
+/*
+    Copyright 2016-2022 melonDS team
+
+    This file is part of melonDS.
+
+    melonDS is free software: you can redistribute it and/or modify it under
+    the terms of the GNU General Public License as published by the Free
+    Software Foundation, either version 3 of the License, or (at your option)
+    any later version.
+
+    melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
+    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with melonDS. If not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef GPU3D_COMPUTE
+#define GPU3D_COMPUTE
+
+#include <memory>
+
+#include "GPU3D.h"
+
+#include "OpenGLSupport.h"
+
+#include "GPU3D_TexcacheOpenGL.h"
+
+#include "NonStupidBitfield.h"
+
+namespace GPU3D
+{
+
+class ComputeRenderer : public Renderer3D // 3D renderer whose RenderFrame() is built from OpenGL compute dispatches
+{
+public:
+    static std::unique_ptr<ComputeRenderer> New();
+    ~ComputeRenderer() override;
+
+    void Reset() override;
+
+    void SetRenderSettings(GPU::RenderSettings& settings) override;
+
+    void VCount144() override;
+
+    void RenderFrame() override;
+    void RestartFrame() override; // currently a no-op
+    u32* GetLine(int line) override; // returns a pointer into FramebufferCPU; line 0 refreshes it from PixelBuffer
+
+    void SetupAccelFrame() override; // binds Framebuffer to GL_TEXTURE_2D
+    void PrepareCaptureFrame() override; // reads LowResFramebuffer into PixelBuffer
+private:
+    ComputeRenderer(); // private; presumably instances are meant to come from New() - confirm in the .cpp
+    // Shader program handles. The arrays of two are indexed by the w-buffer flag in RenderFrame().
+    GLuint ShaderInterpXSpans[2];
+    GLuint ShaderBinCombined;
+    GLuint ShaderDepthBlend[2];
+    GLuint ShaderRasteriseNoTexture[2];
+    GLuint ShaderRasteriseNoTextureToon[2];
+    GLuint ShaderRasteriseNoTextureHighlight[2];
+    GLuint ShaderRasteriseUseTextureDecal[2];
+    GLuint ShaderRasteriseUseTextureModulate[2];
+    GLuint ShaderRasteriseUseTextureToon[2];
+    GLuint ShaderRasteriseUseTextureHighlight[2];
+    GLuint ShaderRasteriseShadowMask[2];
+    GLuint ShaderClearCoarseBinMask;
+    GLuint ShaderClearIndirectWorkCount;
+    GLuint ShaderCalculateWorkListOffset;
+    GLuint ShaderSortWork;
+    GLuint ShaderFinalPass[8]; // index is a 3-bit mask built in RenderFrame() from DispCnt bits 4, 7 and 5
+    // GL buffer/texture objects; the binding points noted below are the ones RenderFrame() uses.
+    GLuint YSpanIndicesTextureMemory; // updated through GL_TEXTURE_BUFFER with the SetupIndices array
+    GLuint YSpanIndicesTexture; // bound as image unit 0 (GL_RGBA16UI) before the x-span dispatch
+    GLuint YSpanSetupMemory; // SSBO binding 2, receives YSpanSetups
+    GLuint XSpanSetupMemory; // SSBO binding 1
+    GLuint BinResultMemory; // SSBO binding 6, also bound as GL_DISPATCH_INDIRECT_BUFFER
+    GLuint RenderPolygonMemory; // SSBO binding 0, receives RenderPolygons
+    GLuint WorkDescMemory; // SSBO binding 7
+
+    enum
+    {
+        tilememoryLayer_Color,
+        tilememoryLayer_Depth,
+        tilememoryLayer_Attr,
+        tilememoryLayer_Num,
+    };
+
+    GLuint TileMemory[tilememoryLayer_Num]; // bound to SSBO bindings 2+layer before rasterising (takes over binding 2 from YSpanSetupMemory)
+    GLuint FinalTileMemory; // SSBO binding 5
+
+    u32 DummyLine[256] = {};
+    // CPU mirror of the GLSL YSpanSetup struct (same field order); the array is uploaded to YSpanSetupMemory.
+    struct SpanSetupY
+    {
+        // Attributes
+        s32 Z0, Z1, W0, W1;
+        s32 ColorR0, ColorG0, ColorB0;
+        s32 ColorR1, ColorG1, ColorB1;
+        s32 TexcoordU0, TexcoordV0;
+        s32 TexcoordU1, TexcoordV1;
+
+        // Interpolator
+        s32 I0, I1;
+        s32 Linear;
+        s32 IRecip;
+        s32 W0n, W0d, W1d;
+
+        // Slope
+        s32 Increment;
+
+        s32 X0, X1, Y0, Y1;
+        s32 XMin, XMax;
+        s32 DxInitial;
+
+        s32 XCovIncr;
+        u32 IsDummy;
+    };
+    struct SpanSetupX // CPU mirror of the GLSL XSpanSetup struct; EdgeLenL/EdgeLenR sit in the slots GLSL names InsideStart/InsideEnd
+    {
+        s32 X0, X1;
+
+        s32 EdgeLenL, EdgeLenR, EdgeCovL, EdgeCovR;
+
+        s32 XRecip;
+
+        u32 Flags;
+
+        s32 Z0, Z1, W0, W1;
+        s32 ColorR0, ColorG0, ColorB0;
+        s32 ColorR1, ColorG1, ColorB1;
+        s32 TexcoordU0, TexcoordV0;
+        s32 TexcoordU1, TexcoordV1;
+
+        s32 CovLInitial, CovRInitial;
+    };
+    struct SetupIndices // four u16 = 8 bytes per entry, matching the GL_RGBA16UI image RenderFrame() binds
+    {
+        u16 PolyIdx, SpanIdxL, SpanIdxR, Y;
+    };
+    struct RenderPolygon // CPU mirror of the GLSL Polygon struct; the array is uploaded to RenderPolygonMemory
+    {
+        u32 FirstXSpan;
+        s32 YTop, YBot;
+
+        s32 XMin, XMax;
+        s32 XMinY, XMaxY;
+
+        u32 Variant;
+        u32 Attr;
+
+        float TextureLayer;
+    };
+    // Tiling/binning constants; the GLSL "Common" source declares the same values and must be kept in sync.
+    static constexpr int TileSize = 8;
+    static constexpr int CoarseTileCountX = 8;
+    static constexpr int CoarseTileCountY = 4;
+    static constexpr int CoarseTileW = CoarseTileCountX * TileSize;
+    static constexpr int CoarseTileH = CoarseTileCountY * TileSize;
+
+    static constexpr int BinStride = 2048/32;
+    static constexpr int CoarseBinStride = BinStride/32;
+
+    static constexpr int MaxVariants = 256;
+
+    static constexpr int UniformIdxCurVariant = 0; // uniform location used with glUniform1ui in the rasterise loop
+    static constexpr int UniformIdxTextureSize = 1; // uniform location used with glUniform2f in the rasterise loop
+
+    static constexpr int MaxFullscreenLayers = 16;
+    // CPU mirror of the fixed-size head of the GLSL BinResultBuffer; offsetof() on it feeds glDispatchComputeIndirect.
+    struct BinResultHeader
+    {
+        u32 VariantWorkCount[MaxVariants*4];
+        u32 SortedWorkOffset[MaxVariants];
+
+        u32 SortWorkWorkCount[4];
+    };
+
+    static const int MaxYSpanSetups = 6144*2; // capacity of YSpanSetups; RenderFrame() asserts against it
+    std::vector<SetupIndices> YSpanIndices;
+    SpanSetupY YSpanSetups[MaxYSpanSetups];
+    RenderPolygon RenderPolygons[2048];
+
+    TexcacheOpenGL Texcache;
+    // CPU mirror of the std140 MetaUniform block (uniform binding 0); filled and uploaded in RenderFrame().
+    struct MetaUniform
+    {
+        u32 NumPolygons;
+        u32 NumVariants;
+
+        u32 AlphaRef;
+        u32 DispCnt;
+
+        u32 ToonTable[4*34]; // 34 groups of 4: [0] toon colour, [1] fog density, [2] edge colour
+
+        u32 ClearColor, ClearDepth, ClearAttr;
+
+        u32 FogOffset, FogShift, FogColor;
+    };
+    GLuint MetaUniformMemory;
+
+    GLuint Samplers[9];
+
+    GLuint Framebuffer = 0; // bound as write-only GL_RGBA8 image unit 0 for the final pass
+    GLuint LowResFramebuffer; // bound as write-only GL_RGBA8UI image unit 1 for the final pass
+    GLuint PixelBuffer; // used as GL_PIXEL_PACK_BUFFER for readback
+
+    u32 FramebufferCPU[256*192]; // CPU copy of the read-back frame, filled by GetLine(0)
+
+    int ScreenWidth, ScreenHeight;
+    int TilesPerLine, TileLines;
+    int ScaleFactor = -1; // multiplier applied to vertex positions in RenderFrame()
+    int MaxWorkTiles;
+    bool HiresCoordinates; // true: scale Vertex::HiresPosition and shift right by 4 instead of using FinalPosition
+
+    void DeleteShaders();
+
+    void SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to);
+    void SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]);
+    void SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]);
+
+    bool CompileShader(GLuint& shader, const std::string& source, const std::initializer_list<const char*>& defines);
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h
new file mode 100644
index 0000000..39858a4
--- /dev/null
+++ b/src/GPU3D_Compute_shaders.h
@@ -0,0 +1,1665 @@
+/*
+    Copyright 2016-2022 melonDS team
+
+    This file is part of melonDS.
+
+    melonDS is free software: you can redistribute it and/or modify it under
+    the terms of the GNU General Public License as published by the Free
+    Software Foundation, either version 3 of the License, or (at your option)
+    any later version.
+
+    melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
+    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with melonDS. If not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef GPU3D_COMPUTE_SHADERS
+#define GPU3D_COMPUTE_SHADERS
+
+#include <string>
+
+namespace GPU3D
+{
+
+namespace ComputeRendererShaders
+{
+
+// defines:
+// InterpSpans
+// BinCombined
+// Rasterise
+// DepthBlend
+// ClearCoarseBinMask
+// ClearIndirectWorkCount
+// CalculateWorkOffsets
+// SortWork
+// FinalPass
+
+// AntiAliasing
+// EdgeMarking
+// Fog
+
+// ZBuffer
+// WBuffer
+
+// for Rasterise
+// NoTexture
+// UseTexture
+// Decal
+// Modulate
+// Toon
+// Highlight
+// ShadowMask
+
+
+/*
+    Some notes on signed division:
+
+    we want to avoid it, so we can avoid higher precision numbers
+    in a few places.
+    
+    Fortunately all divisions *should* (assuming I'm not mistaken)
+    have the same sign on the divisor and the dividend.
+
+    Thus we apply:
+
+    assuming n < 0 <=> d < 0
+    n/d = abs(n)/abs(d)
+
+*/
+
+const std::string XSpanSetupBuffer{R"(
+
+const uint XSpanSetup_Linear = 1U << 0;
+const uint XSpanSetup_FillInside = 1U << 1;
+const uint XSpanSetup_FillLeft = 1U << 2;
+const uint XSpanSetup_FillRight = 1U << 3;
+
+struct XSpanSetup
+{
+    int X0, X1;
+
+    int InsideStart, InsideEnd, EdgeCovL, EdgeCovR;
+
+    int XRecip;
+
+    uint Flags;
+
+    int Z0, Z1, W0, W1;
+    int ColorR0, ColorG0, ColorB0;
+    int ColorR1, ColorG1, ColorB1;
+    int TexcoordU0, TexcoordV0;
+    int TexcoordU1, TexcoordV1;
+
+    int CovLInitial, CovRInitial;
+};
+
+#if defined(Rasterise)
+int CalcYFactorX(XSpanSetup span, int x)
+{
+    x -= span.X0;
+
+    if (span.X0 != span.X1)
+    {
+        uint numLo = uint(x) * uint(span.W0);
+        uint numHi = 0U;
+        numHi |= numLo >> (32U-YFactorShift);
+        numLo <<= YFactorShift;
+
+        uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1);
+
+        if (den == 0)
+            return 0;
+        else
+            return int(Div64_32_32(numHi, numLo, den));
+    }
+    else
+    {
+        return 0;
+    }
+}
+#endif
+
+layout (std430, binding = 1) buffer XSpanSetupsBuffer
+{
+    XSpanSetup XSpanSetups[];
+};
+
+)"}; // XSpanSetupBuffer: GLSL XSpanSetup_* flags, XSpanSetup struct, CalcYFactorX() (Rasterise only) and the std430 SSBO at binding 1
+
+const std::string YSpanSetupBuffer{R"(
+
+struct YSpanSetup
+{
+    // Attributes
+    int Z0, Z1, W0, W1;
+    int ColorR0, ColorG0, ColorB0;
+    int ColorR1, ColorG1, ColorB1;
+    int TexcoordU0, TexcoordV0;
+    int TexcoordU1, TexcoordV1;
+
+    // Interpolator
+    int I0, I1;
+    bool Linear;
+    int IRecip;
+    int W0n, W0d, W1d;
+
+    // Slope
+    int Increment;
+
+    int X0, X1, Y0, Y1;
+    int XMin, XMax;
+    int DxInitial;
+
+    int XCovIncr;
+
+    bool IsDummy;
+};
+
+#if defined(InterpSpans)
+int CalcYFactorY(YSpanSetup span, int i)
+{
+    /*
+        maybe it would be better to do use a 32x32=64 multiplication?
+    */
+    uint numLo = uint(abs(i)) * uint(span.W0n);
+    uint numHi = 0U;
+    numHi |= numLo >> (32U-YFactorShift);
+    numLo <<= YFactorShift;
+
+    uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d;
+
+    if (den == 0)
+    {
+        return 0;
+    }
+    else
+    {
+        return int(Div64_32_32(numHi, numLo, den));
+    }
+}
+
+int CalculateDx(int y, YSpanSetup span)
+{
+    return span.DxInitial + (y - span.Y0) * span.Increment;
+}
+
+int CalculateX(int dx, YSpanSetup span)
+{
+    int x = span.X0;
+    if (span.X1 < span.X0)
+        x -= dx >> 18;
+    else
+        x += dx >> 18;
+    return clamp(x, span.XMin, span.XMax);
+}
+
+void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
+{
+    bool negative = span.X1 < span.X0;
+    int len;
+    if (side != negative)
+        len = (dx >> 18) - ((dx-span.Increment) >> 18);
+    else
+        len = ((dx+span.Increment) >> 18) - (dx >> 18);
+    edgelen = len;
+
+    int xlen = span.XMax + 1 - span.XMin;
+    int startx = dx >> 18;
+    if (negative) startx = xlen - startx;
+    if (side) startx = startx - len + 1;
+
+    uint r;
+    int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r));
+    edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF);
+}
+
+void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov)
+{
+    bool negative = span.X1 < span.X0;
+    edgelen = 1;
+    
+    if (span.Increment == 0)
+    {
+        edgecov = 31;
+    }
+    else
+    {
+        int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4;
+        if ((cov >> 5) != (dx >> 18)) cov = 31;
+        cov &= 0x1F;
+        if (side == negative) cov = 0x1F - cov;
+
+        edgecov = cov;
+    }
+}
+#endif
+
+layout (std430, binding = 2) buffer YSpanSetupsBuffer
+{
+    YSpanSetup YSpanSetups[];
+};
+
+)"}; // YSpanSetupBuffer: GLSL YSpanSetup struct, InterpSpans-only helpers (CalcYFactorY, CalculateDx/X, EdgeParams_*) and the std430 SSBO at binding 2
+
+const std::string PolygonBuffer{R"(
+struct Polygon
+{
+    int FirstXSpan;
+    int YTop, YBot;
+
+    int XMin, XMax;
+    int XMinY, XMaxY;
+
+    int Variant;
+
+    uint Attr;
+
+    float TextureLayer;
+};
+
+layout (std430, binding = 0) readonly buffer PolygonBuffer
+{
+    Polygon Polygons[];
+};
+)"}; // PolygonBuffer: GLSL Polygon struct and the read-only std430 Polygons SSBO at binding 0
+
+const std::string BinningBuffer{R"(
+
+layout (std430, binding = 6) buffer BinResultBuffer
+{
+    uvec4 VariantWorkCount[MaxVariants];
+    uint SortedWorkOffset[MaxVariants];
+
+    uvec4 SortWorkWorkCount;
+
+    uint BinningMaskAndOffset[];
+    //uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride];
+    //uint BinnedMask[TilesPerLine*TileLines*BinStride];
+    //uint WorkOffsets[TilesPerLine*TileLines*BinStride];
+};
+
+const int BinningCoarseMaskStart = 0;
+const int BinningMaskStart = BinningCoarseMaskStart+TilesPerLine*TileLines*CoarseBinStride;
+const int BinningWorkOffsetsStart = BinningMaskStart+TilesPerLine*TileLines*BinStride;
+
+)"}; // BinningBuffer: GLSL BinResultBuffer SSBO (binding 6) plus the start offsets of the regions packed into BinningMaskAndOffset[]
+
+/*
+    structure of each WorkDesc item:
+        x:
+            bits 0-10: polygon idx
+            bits 11-31: tile idx (before sorting: within variant; after sorting: within all tiles)
+        y:
+            bits 0-15: X position on screen
+            bits 16-31: Y position on screen
+*/
+const std::string WorkDescBuffer{R"(
+layout (std430, binding = 7) buffer WorkDescBuffer
+{
+    //uvec2 UnsortedWorkDescs[MaxWorkTiles];
+    //uvec2 SortedWorkDescs[MaxWorkTiles];
+    uvec2 WorkDescs[];
+};
+
+const uint WorkDescsUnsortedStart = 0;
+const uint WorkDescsSortedStart = WorkDescsUnsortedStart+MaxWorkTiles;
+
+)"}; // WorkDescBuffer: GLSL WorkDescs SSBO (binding 7); the sorted entries start MaxWorkTiles elements after the unsorted ones
+
+const std::string Tilebuffers{R"(
+layout (std430, binding = 2) buffer ColorTileBuffer
+{
+    uint ColorTiles[];
+};
+layout (std430, binding = 3) buffer DepthTileBuffer
+{
+    uint DepthTiles[];
+};
+layout (std430, binding = 4) buffer AttrTileBuffer
+{
+    uint AttrTiles[];
+};
+
+)"}; // Tilebuffers: GLSL colour/depth/attribute tile SSBOs at bindings 2, 3 and 4 (binding 2 is also declared by YSpanSetupBuffer)
+
+const std::string ResultBuffer{R"(
+layout (std430, binding = 5) buffer ResultBuffer
+{
+    uint ResultValue[];
+};
+
+const uint ResultColorStart = 0;
+const uint ResultDepthStart = ResultColorStart+ScreenWidth*ScreenHeight*2;
+const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2;
+)"}; // ResultBuffer: GLSL ResultValue SSBO (binding 5); depth and attribute regions each start ScreenWidth*ScreenHeight*2 entries after the previous one
+
+const char* Common = R"(
+
+#define TileSize 8
+const int CoarseTileCountX = 8;
+const int CoarseTileCountY = 4;
+const int CoarseTileW = (CoarseTileCountX * TileSize);
+const int CoarseTileH = (CoarseTileCountY * TileSize);
+
+const int FramebufferStride = ScreenWidth*ScreenHeight;
+const int TilesPerLine = ScreenWidth/TileSize;
+const int TileLines = ScreenHeight/TileSize;
+
+const int BinStride = 2048/32;
+const int CoarseBinStride = BinStride/32;
+
+const int MaxVariants = 256;
+
+layout (std140, binding = 0) uniform MetaUniform
+{
+    uint NumPolygons;
+    uint NumVariants;
+
+    int AlphaRef;
+
+    uint DispCnt;
+
+    // r = Toon
+    // g = Fog Density
+    // b = Edge Color
+    uvec4 ToonTable[34];
+
+    uint ClearColor, ClearDepth, ClearAttr;
+
+    uint FogOffset, FogShift, FogColor;
+};
+
+#ifdef InterpSpans
+const int YFactorShift = 9;
+#else
+const int YFactorShift = 8;
+#endif
+
+#if defined(InterpSpans) || defined(Rasterise)
+uint Umulh(uint a, uint b)
+{
+    uint lo, hi;
+    umulExtended(a, b, hi, lo);
+    return hi;
+}
+
+const uint startTable[256] = uint[256](
+    254, 252, 250, 248, 246, 244, 242, 240, 238, 236, 234, 233, 231, 229, 227, 225, 224, 222, 220, 218, 217, 215, 213, 212, 210, 208, 207, 205, 203, 202, 200, 199, 197, 195, 194, 192, 191, 189, 188, 186, 185, 183, 182, 180, 179, 178, 176, 175, 173, 172, 170, 169, 168, 166, 165, 164, 162, 161, 160, 158, 
+157, 156, 154, 153, 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134, 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86, 85, 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0
+);
+
+uint Div(uint x, uint y, out uint r)
+{
+    // https://www.microsoft.com/en-us/research/publication/software-integer-division/
+    uint k = 31 - findMSB(y);
+    uint ty = (y << k) >> (32 - 9);
+    uint t = startTable[ty - 256] + 256;
+    uint z = (t << (32 - 9)) >> (32 - k - 1);
+    uint my = 0 - y;
+
+    z += Umulh(z, my * z);
+    z += Umulh(z, my * z);
+
+    uint q = Umulh(x, z);
+    r = x - y * q;
+    if(r >= y)
+    {
+        r = r - y;
+        q = q + 1;
+        if(r >= y)
+        {
+            r = r - y;
+            q = q + 1;
+        }
+    }
+
+    return q;
+}
+
+uint Div64_32_32(uint numHi, uint numLo, uint den) // divides the 64-bit value numHi:numLo by den, returning a 32-bit quotient
+{
+    // based on https://github.com/ridiculousfish/libdivide/blob/3bd34388573681ce563348cdf04fe15d24770d04/libdivide.h#L469
+    // modified to work with half the size 64/32=32 instead of 128/64=64
+    // for further details see https://ridiculousfish.com/blog/posts/labor-of-division-episode-iv.html
+
+    // We work in base 2**16.
+    // A digit is 16 bits wide: a uint (32 bits) holds two digits, a single digit is kept in the lower 16 bits of a uint.
+    // Our numerator is conceptually [num3, num2, num1, num0], with numHi = [num3, num2] and numLo = [num1, num0].
+    // Our denominator is [den1, den0].
+    const uint b = (1U << 16);
+
+    // Determine the normalization factor. We multiply den by this, so that its leading digit is at
+    // least half b. In binary this means just shifting left by the number of leading zeros, so that
+    // there's a 1 in the MSB.
+    // We also shift numer by the same amount. This cannot overflow because numHi < den.
+    // The expression (-shift & 31) is the same as (32 - shift), except it avoids the UB of shifting
+    // by 32 (it's also UB in GLSL!!!!). The mask uint(-int(shift) >> 31) is all ones for shift > 0 and zero for shift == 0.
+    uint shift = 31 - findMSB(den);
+    den <<= shift;
+    numHi <<= shift;
+    numHi |= (numLo >> (-shift & 31U)) & uint(-int(shift) >> 31);
+    numLo <<= shift;
+
+    // Extract the low digits of the numerator and both digits of the denominator.
+    uint num1 = (numLo >> 16);
+    uint num0 = (numLo & 0xFFFFU);
+    uint den1 = (den >> 16);
+    uint den0 = (den & 0xFFFFU);
+
+    // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
+    // Estimate q1 as [n3 n2] / [d1], and then correct it.
+    // Note while qhat may be 2 digits, q1 is always 1 digit.
+
+    uint rhat;
+    uint qhat = Div(numHi, den1, rhat);
+    uint c1 = qhat * den0;
+    uint c2 = rhat * b + num1;
+    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; // the estimate is too large by one or two
+    uint q1 = qhat & 0xFFFFU;
+
+    // Compute the true (partial) remainder.
+    uint rem = numHi * b + num1 - q1 * den;
+
+    // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
+    // Estimate q0 as [rem1 rem0] / [d1] and correct it.
+    qhat = Div(rem, den1, rhat);
+    c1 = qhat * den0;
+    c2 = rhat * b + num0;
+    if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
+
+    return bitfieldInsert(qhat, q1, 16, 16); // q1 goes into bits 16..31, the low 16 bits of qhat (q0) stay in place
+}
+
+int InterpolateAttrPersp(int y0, int y1, int ifactor)
+{
+    // Blend from the smaller endpoint towards the larger one. ifactor is the weight
+    // of y1 with YFactorShift fractional bits; it is mirrored when y0 is the larger value.
+    bool ascending = y0 < y1;
+    int base = ascending ? y0 : y1;
+    int extent = ascending ? (y1 - y0) : (y0 - y1);
+    int weight = ascending ? ifactor : ((1<<YFactorShift) - ifactor);
+    return (extent == 0) ? y0 : base + ((extent * weight) >> YFactorShift);
+}
+
+int InterpolateAttrLinear(int y0, int y1, int i, int irecip, int idiff)
+{
+    if (y0 == y1)
+        return y0;
+    // Computes lo + (((hi-lo) * offset * irecip + (3<<24)) >> 30) using a 64-bit product held in mulHi:mulLo.
+#ifndef Rasterise
+    irecip = abs(irecip);
+#endif
+    // offset is measured from the endpoint with the smaller value: i when y0 < y1, idiff-i otherwise.
+    uint mulLo, mulHi, carry;
+    if (y0 < y1)
+    {
+#ifndef Rasterise
+        uint offset = uint(abs(i));
+#else
+        uint offset = uint(i);
+#endif
+        umulExtended(uint(y1-y0)*offset, uint(irecip), mulHi, mulLo);
+        mulLo = uaddCarry(mulLo, 3U<<24, carry); // add the rounding bias to the low word ...
+        mulHi += carry;                          // ... and propagate the carry into the high word
+        return y0 + int((mulLo >> 30) | (mulHi << (32 - 30))); // 64-bit right shift by 30
+        //return y0 + int(((int64_t(y1-y0) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30);
+    }
+    else
+    {
+#ifndef Rasterise
+        uint offset = uint(abs(idiff-i));
+#else
+        uint offset = uint(idiff-i);
+#endif
+        umulExtended(uint(y0-y1)*offset, uint(irecip), mulHi, mulLo);
+        mulLo = uaddCarry(mulLo, 3U<<24, carry); // unsigned literal, same as in the branch above
+        mulHi += carry;
+        return y1 + int((mulLo >> 30) | (mulHi << (32 - 30)));
+        //return y1 + int(((int64_t(y0-y1) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30);
+    }
+}
+
+uint InterpolateZZBuffer(int z0, int z1, int i, int irecip, int idiff)
+{
+    if (z0 == z1)
+        return uint(z0);
+    // Interpolate upwards from the smaller depth; factor is the distance from that endpoint (i or idiff-i).
+    uint base, disp, factor;
+    if (z0 < z1)
+    {
+        base = uint(z0);
+        disp = uint(z1 - z0);
+        factor = uint(abs(i));
+    }
+    else
+    {
+        base = uint(z1);
+        disp = uint(z0 - z1);
+        factor = uint(abs(idiff - i));
+    }
+    // Reduce the precision of disp before multiplying (see the reference implementation in the comment below).
+#ifdef InterpSpans
+    int shiftl = 0;
+    const int shiftr = 22;
+    if (disp > 0x3FF)
+    {
+        shiftl = findMSB(disp) - 9; // same result as shifting right one bit at a time until disp <= 0x3FF
+        disp >>= shiftl;
+    }
+#else
+    disp >>= 9;
+    const int shiftl = 0;
+    const int shiftr = 13;
+#endif
+    uint mulLo, mulHi;
+    // 64-bit product of (disp * factor) and (|irecip| >> 8), returned as mulHi:mulLo
+    umulExtended(disp * factor, uint(abs(irecip) >> 8), mulHi, mulLo);
+    // 64-bit right shift by shiftr, then undo the precision reduction with shiftl
+    return base + (((mulLo >> shiftr) | (mulHi << (32 - shiftr))) << shiftl);
+/*
+    int base, disp, factor;
+    if (z0 < z1)
+    {
+        base = z0;
+        disp = z1 - z0;
+        factor = i;
+    }
+    else
+    {
+        base = z1;
+        disp = z0 - z1;
+        factor = idiff - i;
+    }
+
+#ifdef InterpSpans
+    {
+        int shift = 0;
+        while (disp > 0x3FF)
+        {
+            disp >>= 1;
+            shift++;
+        }
+
+        return base + int(((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 22) << shift);
+    }
+#else
+    {
+        disp >>= 9;
+        return base + int((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 13);
+    }
+#endif*/
+}
+
+uint InterpolateZWBuffer(int z0, int z1, int ifactor)
+{
+    // Blends the depth values z0 and z1 using ifactor, a weight with
+    // YFactorShift fractional bits.
+    //   z0, z1  - depth at the two ends
+    //   ifactor - weight of z1
+    // Returns the interpolated depth as an unsigned value.
+    if (z0 == z1)
+        return z0;
+
+    // Always step upwards from the smaller depth. When z0 is the larger
+    // endpoint the weight is mirrored to (1<<YFactorShift)-ifactor.
+    bool ascending = z0 < z1;
+    uint lower = uint(ascending ? z0 : z1);
+    int delta = ascending ? (z1 - z0) : (z0 - z1);
+    int weight = ascending ? ifactor : ((1<<YFactorShift) - ifactor);
+
+#ifdef Rasterise
+    // since the precision along x spans is only 8 bit the result will always fit in 32-bit
+    return lower + ((delta * weight) >> YFactorShift);
+#else
+    // here the product is formed in 64 bits (mulHi:mulLo) and then
+    // shifted right by YFactorShift as a 64-bit value
+    uint mulLo, mulHi;
+    umulExtended(delta, weight, mulHi, mulLo);
+    return lower + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift)));
+#endif
+    // reference implementation with 64-bit integers:
+    /*if (z0 < z1)
+    {
+        return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> YFactorShift);
+    }
+    else
+    {
+        return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<<YFactorShift)-ifactor)) >> YFactorShift);
+    }*/
+
+}
+#endif
+
+)";
+
+const std::string InterpSpans = 
+    PolygonBuffer +
+    XSpanSetupBuffer +
+    YSpanSetupBuffer + R"(
+layout (local_size_x = 32) in;
+// One invocation per SetupIndices entry: builds the XSpanSetup of one scanline of one polygon.
+layout (binding = 0, rgba16ui) uniform readonly uimageBuffer SetupIndices;
+
+void main()
+{
+    uvec4 setup = imageLoad(SetupIndices, int(gl_GlobalInvocationID.x)); // x: polygon index, y/z: left/right YSpanSetup index, w: scanline
+
+    YSpanSetup spanL = YSpanSetups[setup.y];
+    YSpanSetup spanR = YSpanSetups[setup.z];
+    // the flag bits are OR-ed in further down
+    XSpanSetup xspan;
+    xspan.Flags = 0U;
+
+    int y = int(setup.w);
+
+    int dxl = CalculateDx(y, spanL);
+    int dxr = CalculateDx(y, spanR);
+
+    int xl = CalculateX(dxl, spanL);
+    int xr = CalculateX(dxr, spanR);
+
+    Polygon polygon = Polygons[setup.x];
+
+    int edgeLenL, edgeLenR;
+
+    if (xl > xr) // the edges cross on this scanline: exchange them so that xl <= xr
+    {
+        YSpanSetup tmpSpan = spanL;
+        spanL = spanR;
+        spanR = tmpSpan;
+
+        int tmp = xl;
+        xl = xr;
+        xr = tmp;
+        // swapped edges always use the Y-major parameters; dxl/dxr were not swapped, hence the crossed arguments
+        EdgeParams_YMajor(false, dxr, spanL, edgeLenL, xspan.EdgeCovL);
+        EdgeParams_YMajor(true, dxl, spanR, edgeLenR, xspan.EdgeCovR);
+    }
+    else
+    {
+        // edges are the right way
+        if (spanL.Increment > 0x40000) // Increment above 0x40000 selects the X-major variant
+            EdgeParams_XMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL);
+        else
+            EdgeParams_YMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL);
+        if (spanR.Increment > 0x40000)
+            EdgeParams_XMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR);
+        else
+            EdgeParams_YMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR);
+    }
+
+    xspan.CovLInitial = (xspan.EdgeCovL >> 12) & 0x3FF; // bits 12..21 of the edge coverage word; 0x3FF is replaced by 0
+    if (xspan.CovLInitial == 0x3FF)
+        xspan.CovLInitial = 0;
+    xspan.CovRInitial = (xspan.EdgeCovR >> 12) & 0x3FF;
+    if (xspan.CovRInitial == 0x3FF)
+        xspan.CovRInitial = 0;
+
+    xspan.X0 = xl;
+    xspan.X1 = xr + 1; // one past the right edge pixel
+
+    uint polyalpha = ((polygon.Attr >> 16) & 0x1FU);
+    bool isWireframe = polyalpha == 0U;
+
+    if (!isWireframe || (y == polygon.YTop || y == polygon.YBot - 1)) // wireframe polygons fill their inside only on the first and last scanline
+        xspan.Flags |= XSpanSetup_FillInside;
+
+    xspan.InsideStart = xspan.X0 + edgeLenL; // [InsideStart, InsideEnd) is the part between both edges, clamped to X1
+    if (xspan.InsideStart > xspan.X1)
+        xspan.InsideStart = xspan.X1;
+    xspan.InsideEnd = xspan.X1 - edgeLenR;
+    if (xspan.InsideEnd > xspan.X1)
+        xspan.InsideEnd = xspan.X1;
+
+    bool isShadowMask = ((polygon.Attr & 0x3F000030U) == 0x00000030U); // NOTE(review): not read anywhere in this shader
+    bool fillAllEdges = polyalpha < 31 || (DispCnt & (3U<<4)) != 0U; // polygon alpha below 31, or DispCnt bit 4 or 5 set
+
+    if (fillAllEdges || spanL.X1 < spanL.X0 || spanL.Increment <= 0x40000)
+        xspan.Flags |= XSpanSetup_FillLeft;
+    if (fillAllEdges || (spanR.X1 >= spanR.X0 && spanR.Increment > 0x40000) || spanR.Increment == 0)
+        xspan.Flags |= XSpanSetup_FillRight;
+
+    if (spanL.I0 == spanL.I1) // nothing to interpolate along the left edge: take its start values
+    {
+        xspan.TexcoordU0 = spanL.TexcoordU0;
+        xspan.TexcoordV0 = spanL.TexcoordV0;
+        xspan.ColorR0 = spanL.ColorR0;
+        xspan.ColorG0 = spanL.ColorG0;
+        xspan.ColorB0 = spanL.ColorB0;
+        xspan.Z0 = spanL.Z0;
+        xspan.W0 = spanL.W0;
+    }
+    else
+    {
+        int i = (spanL.Increment > 0x40000 ? xl : y) - spanL.I0; // position along the edge: x for X-major edges, y otherwise
+        int ifactor = CalcYFactorY(spanL, i);
+        int idiff = spanL.I1 - spanL.I0;
+
+#ifdef ZBuffer
+        xspan.Z0 = int(InterpolateZZBuffer(spanL.Z0, spanL.Z1, i, spanL.IRecip, idiff));
+#endif
+#ifdef WBuffer
+        xspan.Z0 = int(InterpolateZWBuffer(spanL.Z0, spanL.Z1, ifactor));
+#endif
+
+        if (!spanL.Linear)
+        {
+            xspan.TexcoordU0 = InterpolateAttrPersp(spanL.TexcoordU0, spanL.TexcoordU1, ifactor);
+            xspan.TexcoordV0 = InterpolateAttrPersp(spanL.TexcoordV0, spanL.TexcoordV1, ifactor);
+
+            xspan.ColorR0 = InterpolateAttrPersp(spanL.ColorR0, spanL.ColorR1, ifactor);
+            xspan.ColorG0 = InterpolateAttrPersp(spanL.ColorG0, spanL.ColorG1, ifactor);
+            xspan.ColorB0 = InterpolateAttrPersp(spanL.ColorB0, spanL.ColorB1, ifactor);
+
+            xspan.W0 = InterpolateAttrPersp(spanL.W0, spanL.W1, ifactor);
+        }
+        else
+        {
+            xspan.TexcoordU0 = InterpolateAttrLinear(spanL.TexcoordU0, spanL.TexcoordU1, i, spanL.IRecip, idiff);
+            xspan.TexcoordV0 = InterpolateAttrLinear(spanL.TexcoordV0, spanL.TexcoordV1, i, spanL.IRecip, idiff);
+
+            xspan.ColorR0 = InterpolateAttrLinear(spanL.ColorR0, spanL.ColorR1, i, spanL.IRecip, idiff);
+            xspan.ColorG0 = InterpolateAttrLinear(spanL.ColorG0, spanL.ColorG1, i, spanL.IRecip, idiff);
+            xspan.ColorB0 = InterpolateAttrLinear(spanL.ColorB0, spanL.ColorB1, i, spanL.IRecip, idiff);
+
+            xspan.W0 = spanL.W0; // linear mode is only taken if W0 == W1
+        }
+    }
+
+    if (spanR.I0 == spanR.I1) // same procedure for the right edge, filling the *1 members
+    {
+        xspan.TexcoordU1 = spanR.TexcoordU0;
+        xspan.TexcoordV1 = spanR.TexcoordV0;
+        xspan.ColorR1 = spanR.ColorR0;
+        xspan.ColorG1 = spanR.ColorG0;
+        xspan.ColorB1 = spanR.ColorB0;
+        xspan.Z1 = spanR.Z0;
+        xspan.W1 = spanR.W0;
+    }
+    else
+    {
+        int i = (spanR.Increment > 0x40000 ? xr : y) - spanR.I0;
+        int ifactor = CalcYFactorY(spanR, i);
+        int idiff = spanR.I1 - spanR.I0;
+
+    #ifdef ZBuffer
+            xspan.Z1 = int(InterpolateZZBuffer(spanR.Z0, spanR.Z1, i, spanR.IRecip, idiff));
+    #endif
+    #ifdef WBuffer
+            xspan.Z1 = int(InterpolateZWBuffer(spanR.Z0, spanR.Z1, ifactor));
+    #endif
+
+        if (!spanR.Linear)
+        {
+            xspan.TexcoordU1 = InterpolateAttrPersp(spanR.TexcoordU0, spanR.TexcoordU1, ifactor);
+            xspan.TexcoordV1 = InterpolateAttrPersp(spanR.TexcoordV0, spanR.TexcoordV1, ifactor);
+
+            xspan.ColorR1 = InterpolateAttrPersp(spanR.ColorR0, spanR.ColorR1, ifactor);
+            xspan.ColorG1 = InterpolateAttrPersp(spanR.ColorG0, spanR.ColorG1, ifactor);
+            xspan.ColorB1 = InterpolateAttrPersp(spanR.ColorB0, spanR.ColorB1, ifactor);
+
+            xspan.W1 = int(InterpolateAttrPersp(spanR.W0, spanR.W1, ifactor));
+        }
+        else
+        {
+            xspan.TexcoordU1 = InterpolateAttrLinear(spanR.TexcoordU0, spanR.TexcoordU1, i, spanR.IRecip, idiff);
+            xspan.TexcoordV1 = InterpolateAttrLinear(spanR.TexcoordV0, spanR.TexcoordV1, i, spanR.IRecip, idiff);
+
+            xspan.ColorR1 = InterpolateAttrLinear(spanR.ColorR0, spanR.ColorR1, i, spanR.IRecip, idiff);
+            xspan.ColorG1 = InterpolateAttrLinear(spanR.ColorG0, spanR.ColorG1, i, spanR.IRecip, idiff);
+            xspan.ColorB1 = InterpolateAttrLinear(spanR.ColorB0, spanR.ColorB1, i, spanR.IRecip, idiff);
+
+            xspan.W1 = spanR.W0;
+        }
+    }
+
+    if (xspan.W0 == xspan.W1 && ((xspan.W0 | xspan.W1) & 0x7F) == 0) // equal W at both ends with the low 7 bits clear: span is linear
+    {
+        xspan.Flags |= XSpanSetup_Linear;
+// a bit hacky, but when wbuffering we only need to calculate xrecip for linear spans
+#ifdef ZBuffer
+    } // with ZBuffer the if ends here ...
+    { // ... and XRecip is computed unconditionally in its own block
+#endif
+        uint r;
+        xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0), r)); // (1<<30) / span width; the remainder r is discarded
+    }
+
+    XSpanSetups[gl_GlobalInvocationID.x] = xspan;
+}
+
+)";
+
+const std::string ClearIndirectWorkCount =
+    BinningBuffer + R"(
+
+layout (local_size_x = 32) in;
+// Resets one VariantWorkCount entry per invocation to (1, 1, 0, 0).
+void main()
+{
+    uint entry = gl_GlobalInvocationID.x;
+    VariantWorkCount[entry] = uvec4(1U, 1U, 0U, 0U);
+}
+)";
+
+const std::string ClearCoarseBinMask =
+    BinningBuffer + R"(
+layout (local_size_x = 32) in;
+
+// Zeroes the two coarse mask words (index +0 and +1) of one entry per invocation.
+void main()
+{
+    for (int word = 0; word < 2; word++)
+        BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+word] = 0;
+}
+)";
+
+const std::string BinCombined =
+    PolygonBuffer +
+    BinningBuffer +
+    XSpanSetupBuffer +
+    WorkDescBuffer + R"(
+
+layout (local_size_x = 32) in;
+// Returns whether the polygon may touch the rectangle from topLeft to botRight (both corners inclusive).
+bool BinPolygon(Polygon polygon, ivec2 topLeft, ivec2 botRight)
+{
+    if (polygon.YTop > botRight.y || polygon.YBot <= topLeft.y)
+        return false;
+
+    int polygonHeight = polygon.YBot - polygon.YTop;
+
+    /*
+        All (good) polygons are convex. So the following holds true:
+
+        Starting from the top most point where both edges originate
+        the X coordinate of the left edge will stay the same or falls until
+        the minimum X-axis coordinate is reached. Then it stays the same or
+        rises until the point it meets with the right edge.
+
+        The same applies to the right edge, except that it first may rise or stay equal and
+        after the maximum point may only fall or stay the same.
+
+        This means that for every tile which doesn't contain the point where the direction changes
+        we can just get the maximum point by sampling the top most and bottom most coordinate
+        within the tile.
+
+        For a tile which is at the height of the direction change the polygon's XMin/XMax is used instead.
+
+        As a sidenote another consequence of this design decision is
+        that malformed polygons aren't binned properly.
+
+        As a note bottom Y is exclusive!
+    */
+    int polyInnerTopY = clamp(topLeft.y - polygon.YTop, 0, max(polygonHeight-1, 0)); // tile rows relative to the polygon, clamped to its scanlines
+    int polyInnerBotY = clamp(botRight.y - polygon.YTop, 0, max(polygonHeight-1, 0));
+
+    XSpanSetup xspanTop = XSpanSetups[polygon.FirstXSpan + polyInnerTopY];
+    XSpanSetup xspanBot = XSpanSetups[polygon.FirstXSpan + polyInnerBotY];
+
+    int minXL;
+    if (polygon.XMinY >= topLeft.y && polygon.XMinY <= botRight.y)
+        minXL = polygon.XMin;
+    else
+        minXL = min(xspanTop.X0, xspanBot.X0);
+
+    if (minXL > botRight.x)
+        return false;
+
+    int maxXR;
+    if (polygon.XMaxY >= topLeft.y && polygon.XMaxY <= botRight.y)
+        maxXR = polygon.XMax;
+    else
+        maxXR = max(xspanTop.X1, xspanBot.X1) - 1; // X1 is one past the last pixel of the span
+
+    if (maxXR < topLeft.x)
+        return false;
+
+    return true;
+}
+
+shared uint mergedMaskShared;
+// Each work group handles 32 polygons (gl_WorkGroupID.x) for one coarse tile (gl_WorkGroupID.yz).
+void main()
+{
+    int groupIdx = int(gl_WorkGroupID.x);
+    ivec2 coarseTile = ivec2(gl_WorkGroupID.yz);
+
+#if 0
+    int localIdx = int(gl_SubGroupInvocationARB);
+#else
+    int localIdx = int(gl_LocalInvocationIndex);
+
+    if (localIdx == 0)
+        mergedMaskShared = 0U;
+    barrier();
+#endif
+
+    int polygonIdx = groupIdx * 32 + localIdx;
+
+    ivec2 coarseTopLeft = coarseTile * ivec2(CoarseTileW, CoarseTileH);
+    ivec2 coarseBotRight = coarseTopLeft + ivec2(CoarseTileW-1, CoarseTileH-1);
+    // first pass: every invocation tests one polygon against the whole coarse tile
+    bool binned = false;
+    if (polygonIdx < NumPolygons)
+    {
+        binned = BinPolygon(Polygons[polygonIdx], coarseTopLeft, coarseBotRight);
+    }
+
+#if 0
+    uint mergedMask = unpackUint2x32(ballotARB(binned)).x;
+#else
+    if (binned)
+        atomicOr(mergedMaskShared, 1U << localIdx);
+    barrier();
+    uint mergedMask = mergedMaskShared;
+#endif
+    // second pass: every invocation owns one fine tile (8 columns x 4 rows per coarse tile) and re-tests the surviving polygons
+    ivec2 fineTile = ivec2(localIdx & 0x7, localIdx >> 3);
+
+    ivec2 fineTileTopLeft = coarseTopLeft + fineTile * ivec2(TileSize, TileSize);
+    ivec2 fineTileBotRight = fineTileTopLeft + ivec2(TileSize-1, TileSize-1);
+
+    uint binnedMask = 0U;
+    while (mergedMask != 0U)
+    {
+        int bit = findLSB(mergedMask);
+        mergedMask &= ~(1U << bit);
+
+        int polygonIdx = groupIdx * 32 + bit;
+
+        if (BinPolygon(Polygons[polygonIdx], fineTileTopLeft, fineTileBotRight))
+            binnedMask |= 1U << bit;
+    }
+
+    int linearTile = fineTile.x + fineTile.y * TilesPerLine + coarseTile.x * CoarseTileCountX + coarseTile.y * TilesPerLine * CoarseTileCountY;
+
+    BinningMaskAndOffset[BinningMaskStart + linearTile * BinStride + groupIdx] = binnedMask;
+    int coarseMaskIdx = linearTile * CoarseBinStride + (groupIdx >> 5); // one coarse mask bit per polygon group, 32 groups per word
+    if (binnedMask != 0U)
+        atomicOr(BinningMaskAndOffset[BinningCoarseMaskStart + coarseMaskIdx], 1U << (groupIdx & 0x1F));
+
+    if (binnedMask != 0U)
+    {
+        uint workOffset = atomicAdd(VariantWorkCount[0].w, uint(bitCount(binnedMask))); // reserve one work item per binned polygon
+        BinningMaskAndOffset[BinningWorkOffsetsStart + linearTile * BinStride + groupIdx] = workOffset;
+
+        uint tilePositionCombined = bitfieldInsert(fineTileTopLeft.x, fineTileTopLeft.y, 16, 16); // x in bits 0..15, y in bits 16..31
+
+        int idx = 0;
+        while (binnedMask != 0U)
+        {
+            int bit = findLSB(binnedMask);
+            binnedMask &= ~(1U << bit);
+
+            int polygonIdx = groupIdx * 32 + bit;
+            int variantIdx = Polygons[polygonIdx].Variant;
+
+            int inVariantOffset = int(atomicAdd(VariantWorkCount[variantIdx].z, 1)); // running count of work items of this variant
+            WorkDescs[WorkDescsUnsortedStart + workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(polygonIdx, inVariantOffset, 12, 20)); // polygon index in bits 0..11, in-variant offset in bits 12..31
+
+            idx++;
+        }
+    }
+}
+
+)";
+
+const std::string CalcOffsets =
+    BinningBuffer + R"(
+
+layout (local_size_x = 32) in;
+
+// One invocation per variant: adds the variant's work count (.z) to the running total in
+// VariantWorkCount[1].w and stores the previous total as that variant's SortedWorkOffset.
+void main()
+{
+    uint variant = gl_GlobalInvocationID.x;
+    if (variant >= NumVariants)
+        return;
+
+    // a bit of a cheat putting this here, but this shader won't run that often
+    if (variant == 0)
+        SortWorkWorkCount = uvec4((VariantWorkCount[0].w + 31) / 32, 1, 1, 0);
+
+    SortedWorkOffset[variant] = atomicAdd(VariantWorkCount[1].w, VariantWorkCount[variant].z);
+}
+)";
+
+const std::string SortWork =
+    PolygonBuffer +
+    BinningBuffer +
+    WorkDescBuffer + R"(
+
+layout (local_size_x = 32) in;
+
+// Copies one unsorted work descriptor into its slot in the sorted list (grouped by polygon variant).
+void main()
+{
+    uint unsortedIdx = gl_GlobalInvocationID.x;
+    if (unsortedIdx >= VariantWorkCount[0].w)
+        return;
+
+    uvec2 workDesc = WorkDescs[WorkDescsUnsortedStart + unsortedIdx];
+    int polygonIdx = int(bitfieldExtract(workDesc.y, 0, 12));
+    int inVariantOffset = int(bitfieldExtract(workDesc.y, 12, 20));
+    int variantIdx = Polygons[polygonIdx].Variant;
+    int sortedIndex = int(SortedWorkOffset[variantIdx]) + inVariantOffset;
+    WorkDescs[WorkDescsSortedStart + sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, unsortedIdx, 12, 20));
+}
+)";
+
+const std::string Rasterise =
+    PolygonBuffer +
+    WorkDescBuffer +
+    XSpanSetupBuffer +
+    BinningBuffer +
+    Tilebuffers + R"(
+
+layout (local_size_x = TileSize, local_size_y = TileSize) in;
+
+layout (binding = 0) uniform usampler2DArray CurrentTexture;
+
+layout (location = 0) uniform uint CurVariant;
+layout (location = 1) uniform vec2 InvTextureSize;
+// One work group (gl_WorkGroupID.z) per sorted work item of the current variant, one invocation per pixel of the tile.
+void main()
+{
+    uvec2 workDesc = WorkDescs[WorkDescsSortedStart + SortedWorkOffset[CurVariant] + gl_WorkGroupID.z];
+    Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 0, 12)]; // polygon index: bits 0..11 of workDesc.y
+    ivec2 position = ivec2(bitfieldExtract(workDesc.x, 0, 16), bitfieldExtract(workDesc.x, 16, 16)) + ivec2(gl_LocalInvocationID.xy); // tile origin (x: bits 0..15, y: bits 16..31) plus pixel offset
+    int tileOffset = int(bitfieldExtract(workDesc.y, 12, 20)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x); // tile slot from bits 12..31, then row-major pixel index
+
+    uint color = 0U; // stays 0 when this pixel is not drawn
+    if (position.y >= polygon.YTop && position.y < polygon.YBot)
+    {
+        XSpanSetup xspan = XSpanSetups[polygon.FirstXSpan + (position.y - polygon.YTop)];
+
+        bool insideLeftEdge = position.x < xspan.InsideStart;
+        bool insideRightEdge = position.x >= xspan.InsideEnd;
+        bool insidePolygonInside = !insideLeftEdge && !insideRightEdge;
+
+        if (position.x >= xspan.X0 && position.x < xspan.X1
+            && ((insideLeftEdge && (xspan.Flags & XSpanSetup_FillLeft) != 0U)
+                || (insideRightEdge && (xspan.Flags & XSpanSetup_FillRight) != 0U)
+                || (insidePolygonInside && (xspan.Flags & XSpanSetup_FillInside) != 0U)))
+        {
+            uint attr = 0; // 0x1: left edge, 0x2: right edge, 0x4: top scanline, 0x8: bottom scanline, coverage from bit 8 upwards
+            if (position.y == polygon.YTop)
+                attr |= 0x4U;
+            else if (position.y == polygon.YBot - 1)
+                attr |= 0x8U;
+
+            if (insideLeftEdge)
+            {
+                attr |= 0x1U;
+
+                int cov = xspan.EdgeCovL;
+                if (cov < 0) // a negative value means the coverage changes per pixel: initial value plus slope (low 10 bits) times distance
+                {
+                    int xcov = xspan.CovLInitial + (xspan.EdgeCovL & 0x3FF) * (position.x - xspan.X0);
+                    cov = min(xcov >> 5, 31);
+                }
+
+                attr |= uint(cov) << 8;
+            }
+            else if (insideRightEdge)
+            {
+                attr |= 0x2U;
+
+                int cov = xspan.EdgeCovR;
+                if (cov < 0)
+                {
+                    int xcov = xspan.CovRInitial + (xspan.EdgeCovR & 0x3FF) * (position.x - xspan.InsideEnd);
+                    cov = max(0x1F - (xcov >> 5), 0);
+                }
+
+                attr |= uint(cov) << 8;
+            }
+
+            uint z;
+            int u, v, vr, vg, vb;
+
+            if (xspan.X0 == xspan.X1) // empty span: take the left-hand values as they are
+            {
+                z = xspan.Z0;
+                u = xspan.TexcoordU0;
+                v = xspan.TexcoordV0;
+                vr = xspan.ColorR0;
+                vg = xspan.ColorG0;
+                vb = xspan.ColorB0;
+            }
+            else
+            {
+                int ifactor = CalcYFactorX(xspan, position.x);
+                int idiff = xspan.X1 - xspan.X0;
+                int i = position.x - xspan.X0;
+
+#ifdef ZBuffer
+                z = InterpolateZZBuffer(xspan.Z0, xspan.Z1, i, xspan.XRecip, idiff);
+#endif
+#ifdef WBuffer
+                z = InterpolateZWBuffer(xspan.Z0, xspan.Z1, ifactor);
+#endif
+                if ((xspan.Flags & XSpanSetup_Linear) == 0U)
+                {
+                    u = InterpolateAttrPersp(xspan.TexcoordU0, xspan.TexcoordU1, ifactor);
+                    v = InterpolateAttrPersp(xspan.TexcoordV0, xspan.TexcoordV1, ifactor);
+
+                    vr = InterpolateAttrPersp(xspan.ColorR0, xspan.ColorR1, ifactor);
+                    vg = InterpolateAttrPersp(xspan.ColorG0, xspan.ColorG1, ifactor);
+                    vb = InterpolateAttrPersp(xspan.ColorB0, xspan.ColorB1, ifactor);
+                }
+                else
+                {
+                    u = InterpolateAttrLinear(xspan.TexcoordU0, xspan.TexcoordU1, i, xspan.XRecip, idiff);
+                    v = InterpolateAttrLinear(xspan.TexcoordV0, xspan.TexcoordV1, i, xspan.XRecip, idiff);
+
+                    vr = InterpolateAttrLinear(xspan.ColorR0, xspan.ColorR1, i, xspan.XRecip, idiff);
+                    vg = InterpolateAttrLinear(xspan.ColorG0, xspan.ColorG1, i, xspan.XRecip, idiff);
+                    vb = InterpolateAttrLinear(xspan.ColorB0, xspan.ColorB1, i, xspan.XRecip, idiff);
+                }
+            }
+
+#ifndef ShadowMask
+            vr >>= 3;
+            vg >>= 3;
+            vb >>= 3;
+
+            uint r, g, b, a;
+            uint polyalpha = bitfieldExtract(polygon.Attr, 16, 5);
+
+#ifdef Toon
+            uint tooncolor = ToonTable[vr >> 1].r; // the red vertex channel selects the toon table entry (R, G, B in bits 0/8/16)
+            vr = int(bitfieldExtract(tooncolor, 0, 8));
+            vg = int(bitfieldExtract(tooncolor, 8, 8));
+            vb = int(bitfieldExtract(tooncolor, 16, 8));
+#endif
+#ifdef Highlight
+            vg = vr;
+            vb = vr;
+#endif
+
+#ifdef NoTexture
+            a = int(polyalpha);
+#endif
+            r = vr;
+            g = vg;
+            b = vb;
+
+#ifdef UseTexture
+            vec2 uvf = vec2(ivec2(u, v)) * vec2(1.0 / 16.0) * InvTextureSize; // texcoords are scaled by 1/16 and by the inverse texture size
+
+            uvec4 texcolor = texture(CurrentTexture, vec3(uvf, polygon.TextureLayer));
+#ifdef Decal
+            if (texcolor.a == 31)
+            {
+                r = int(texcolor.r);
+                g = int(texcolor.g);
+                b = int(texcolor.b);
+            }
+            else if (texcolor.a > 0)
+            {
+                r = int((texcolor.r * texcolor.a) + (vr * (31-texcolor.a))) >> 5;
+                g = int((texcolor.g * texcolor.a) + (vg * (31-texcolor.a))) >> 5;
+                b = int((texcolor.b * texcolor.a) + (vb * (31-texcolor.a))) >> 5;
+            }
+            a = int(polyalpha);
+#endif
+#if defined(Modulate) || defined(Toon) || defined(Highlight)
+            r = int((texcolor.r+1) * (vr+1) - 1) >> 6;
+            g = int((texcolor.g+1) * (vg+1) - 1) >> 6;
+            b = int((texcolor.b+1) * (vb+1) - 1) >> 6;
+            a = int((texcolor.a+1) * (polyalpha+1) - 1) >> 5;
+#endif
+#endif
+
+#ifdef Highlight
+            uint tooncolor = ToonTable[vr >> 1].r;
+
+            r = min(r + int(bitfieldExtract(tooncolor, 0, 8)), 63);
+            g = min(g + int(bitfieldExtract(tooncolor, 8, 8)), 63);
+            b = min(b + int(bitfieldExtract(tooncolor, 16, 8)), 63);
+#endif
+
+            if (polyalpha == 0) // polygon alpha 0 is drawn with alpha 31
+                a = 31;
+
+            if (a > AlphaRef) // alpha test: only pixels above AlphaRef are written
+            {
+                color = r | (g << 8) | (b << 16) | (a << 24);
+
+                DepthTiles[tileOffset] = z;
+                AttrTiles[tileOffset] = attr;
+            }
+#else
+            color = 0xFFFFFFFF; // doesn't really matter as long as it's not 0
+            DepthTiles[tileOffset] = z;
+#endif
+        }
+    }
+
+    ColorTiles[tileOffset] = color;
+}
+
+)";
+
+const std::string DepthBlend =
+    PolygonBuffer +
+    Tilebuffers +
+    ResultBuffer +
+    BinningBuffer + R"(
+
+layout (local_size_x = TileSize, local_size_y = TileSize) in;
+
+void PlotTranslucent(inout uint color, inout uint depth, inout uint attr, bool isShadow, uint tileColor, uint srcA, uint tileDepth, uint srcAttr, bool writeDepth)
+{
+    uint blendAttr = (srcAttr & 0xE0F0U) | ((srcAttr >> 8) & 0xFF0000U) | (1U<<22) | (attr & 0xFF001F0FU);
+
+    // Decide which attribute bits are compared; the pixel is left untouched when they match.
+    bool differs;
+    if (!isShadow || (attr & (1U<<22)) != 0U)
+        differs = (attr & 0x007F0000U) != (blendAttr & 0x007F0000U);
+    else
+        differs = (attr & 0x3F000000U) != (srcAttr & 0x3F000000U);
+    if (!differs)
+        return;
+
+    // le blend
+    if (writeDepth)
+        depth = tileDepth;
+
+    // bit 15 is only kept when the destination attribute already had it set
+    attr = ((attr & (1U<<15)) == 0) ? (blendAttr & ~(1U<<15)) : blendAttr;
+
+    uint alpha = (srcA >> 24) + 1;
+    uint dstA = color & 0x1F000000U;
+    uint outRB = tileColor & 0x3F003FU;
+    uint outG = tileColor & 0x003F00U;
+    if (dstA != 0)
+    {
+        outRB = ((outRB * alpha) + ((color & 0x3F003FU) * (32-alpha))) >> 5;
+        outG = ((outG * alpha) + ((color & 0x003F00U) * (32-alpha))) >> 5;
+    }
+
+    color = (outRB & 0x3F003FU) | (outG & 0x003F00U) | max(dstA, srcA);
+}
+
+void ProcessCoarseMask(int linearTile, uint coarseMask, uint coarseOffset,
+    inout uvec2 color, inout uvec2 depth, inout uvec2 attr, inout uint stencil,
+    inout bool prevIsShadowMask)
+{ // Merges every rasterised tile pixel selected by coarseMask into the two-layer result (x: first layer, y: second layer).
+    int tileInnerOffset = int(gl_LocalInvocationID.x) + int(gl_LocalInvocationID.y) * TileSize;
+
+    while (coarseMask != 0U) // each set bit selects one group of 32 polygons
+    {
+        uint coarseBit = findLSB(coarseMask);
+        coarseMask &= ~(1U << coarseBit);
+
+        uint tileOffset = linearTile * BinStride + coarseBit + coarseOffset;
+
+        uint fineMask = BinningMaskAndOffset[BinningMaskStart + tileOffset];
+        uint workIdx = BinningMaskAndOffset[BinningWorkOffsetsStart + tileOffset];
+
+        while (fineMask != 0U) // each set bit is one polygon of the group; its tile data sits at consecutive work indices
+        {
+            uint fineIdx = findLSB(fineMask);
+            fineMask &= ~(1U << fineIdx);
+
+            uint pixelindex = tileInnerOffset + workIdx * TileSize * TileSize;
+            uint tileColor = ColorTiles[pixelindex];
+            workIdx++; // advanced before any of the continue statements below
+
+            uint polygonIdx = fineIdx + (coarseBit + coarseOffset) * 32;
+
+            if (tileColor != 0U) // a colour of 0 means nothing was written for this pixel
+            {
+                uint polygonAttr = Polygons[polygonIdx].Attr;
+
+                bool isShadowMask = ((polygonAttr & 0x3F000030U) == 0x00000030U);
+                bool prevIsShadowMaskOld = prevIsShadowMask;
+                prevIsShadowMask = isShadowMask;
+
+                bool equalDepthTest = (polygonAttr & (1U << 14)) != 0U;
+
+                uint tileDepth = DepthTiles[pixelindex];
+                uint tileAttr = AttrTiles[pixelindex];
+
+                uint dstattr = attr.x;
+
+                if (!isShadowMask)
+                {
+                    bool isShadow = (polygonAttr & 0x30U) == 0x30U;
+
+                    bool writeSecondLayer = false;
+
+                    if (isShadow) // shadow polygons are gated by the stencil bits (bit 0: first layer, bit 1: second layer)
+                    {
+                        if (stencil == 0U)
+                            continue;
+                        if ((stencil & 1U) == 0U)
+                            writeSecondLayer = true;
+                        if ((stencil & 2U) == 0U)
+                            dstattr &= ~0x3U;
+                    }
+
+                    uint dstDepth = writeSecondLayer ? depth.y : depth.x;
+                    if (!(equalDepthTest
+#ifdef WBuffer
+                        ? dstDepth - tileDepth + 0xFFU <= 0x1FE
+#endif
+#ifdef ZBuffer
+                        ? dstDepth - tileDepth + 0x200 <= 0x400
+#endif
+                        : tileDepth < dstDepth))
+                    { // depth test failed: retry against the second layer if the first layer carries edge flags (bits 0-1)
+                        if ((dstattr & 0x3U) == 0U || writeSecondLayer)
+                            continue;
+
+                        writeSecondLayer = true;
+                        dstattr = attr.y;
+                        if (!(equalDepthTest
+#ifdef WBuffer
+                            ? depth.y - tileDepth + 0xFFU <= 0x1FE
+#endif
+#ifdef ZBuffer
+                            ? depth.y - tileDepth + 0x200 <= 0x400
+#endif
+                            : tileDepth < depth.y))
+                            continue;
+                    }
+
+                    uint srcAttr = (polygonAttr & 0x3F008000U);
+
+                    uint srcA = tileColor & 0x1F000000U;
+                    if (srcA == 0x1F000000U) // alpha 31: the pixel replaces the destination instead of blending
+                    {
+                        srcAttr |= tileAttr;
+
+                        if (!writeSecondLayer)
+                        {
+                            if ((srcAttr & 0x3U) != 0U) // edge pixel: the old first layer moves down to the second layer
+                            {
+                                color.y = color.x;
+                                depth.y = depth.x;
+                                attr.y = attr.x;
+                            }
+
+                            color.x = tileColor;
+                            depth.x = tileDepth;
+                            attr.x = srcAttr;
+                        }
+                        else
+                        {
+                            color.y = tileColor;
+                            depth.y = tileDepth;
+                            attr.y = srcAttr;
+                        }
+                    }
+                    else
+                    {
+                        bool writeDepth = (polygonAttr & (1U<<11)) != 0;
+
+                        if (!writeSecondLayer)
+                        {
+                            // blend into both layers
+                            PlotTranslucent(color.x, depth.x, attr.x, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth);
+                        }
+                        if (writeSecondLayer || (dstattr & 0x3U) != 0U)
+                        {
+                            PlotTranslucent(color.y, depth.y, attr.y, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth);
+                        }
+                    }
+                }
+                else
+                { // shadow mask polygon: only updates the stencil bits, nothing is drawn
+                    if (!prevIsShadowMaskOld)
+                        stencil = 0;
+
+                    if (!(equalDepthTest
+#ifdef WBuffer
+                        ? depth.x - tileDepth + 0xFFU <= 0x1FE
+#endif
+#ifdef ZBuffer
+                        ? depth.x - tileDepth + 0x200 <= 0x400
+#endif
+                        : tileDepth < depth.x))
+                        stencil = 0x1U;
+
+                    if ((dstattr & 0x3U) != 0U)
+                    {
+                        if (!(equalDepthTest
+#ifdef WBuffer
+                            ? depth.y - tileDepth + 0xFFU <= 0x1FE
+#endif
+#ifdef ZBuffer
+                            ? depth.y - tileDepth + 0x200 <= 0x400
+#endif
+                            : tileDepth < depth.y))
+                            stencil |= 0x2U;
+                    }
+                }
+            }
+        }
+    }
+}
+
+void main()
+{
+    int linearTile = int(gl_WorkGroupID.x + (gl_WorkGroupID.y * TilesPerLine));
+
+    uint coarseMaskLo = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 0];
+    uint coarseMaskHi = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 1];
+
+    uvec2 color = uvec2(ClearColor, 0U);
+    uvec2 depth = uvec2(ClearDepth, 0U);
+    uvec2 attr = uvec2(ClearAttr, 0U);
+    uint stencil = 0U;
+    bool prevIsShadowMask = false;
+
+    ProcessCoarseMask(linearTile, coarseMaskLo, 0, color, depth, attr, stencil, prevIsShadowMask);
+    ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask);
+
+    int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth;
+    ResultValue[ResultColorStart+resultOffset] = color.x;
+    ResultValue[ResultColorStart+resultOffset+FramebufferStride] = color.y;
+    ResultValue[ResultDepthStart+resultOffset] = depth.x;
+    ResultValue[ResultDepthStart+resultOffset+FramebufferStride] = depth.y;
+    ResultValue[ResultAttrStart+resultOffset] = attr.x;
+    ResultValue[ResultAttrStart+resultOffset+FramebufferStride] = attr.y;
+}
+
+)";
+
+const std::string FinalPass =
+    ResultBuffer + R"(
+
+layout (local_size_x = 32) in;
+
+layout (binding = 0, rgba8) writeonly uniform image2D FinalFB; 
+layout (binding = 1, rgba8ui) writeonly uniform uimage2D LowResFB; 
+
+uint BlendFog(uint color, uint depth)
+{
+    uint densityid = 0, densityfrac = 0;
+
+    if (depth >= FogOffset)
+    {
+        depth -= FogOffset;
+        depth = (depth >> 2) << FogShift;
+
+        densityid = depth >> 17;
+        if (densityid >= 32)
+        {
+            densityid = 32;
+            densityfrac = 0;
+        }
+        else
+        {
+            densityfrac = depth & 0x1FFFFU;
+        }
+    }
+
+    uint density =
+        ((ToonTable[densityid].g * (0x20000U-densityfrac)) +
+         (ToonTable[densityid+1].g * densityfrac)) >> 17;
+    density = min(density, 128U);
+
+    uint colorRB = color & 0x3F003FU;
+    uint colorGA = (color >> 8) & 0x3F003FU;
+
+    uint fogRB = FogColor & 0x3F003FU;
+    uint fogGA = (FogColor >> 8) & 0x1F003FU;
+
+    uint finalColorRB = ((fogRB * density) + (colorRB * (128-density))) >> 7;
+    uint finalColorGA = ((fogGA * density) + (colorGA * (128-density))) >> 7;
+
+    finalColorRB &= 0x3F003FU;
+    finalColorGA &= 0x1F003FU;
+
+    return (DispCnt & (1U<<6)) != 0
+        ? (bitfieldInsert(color, finalColorGA >> 16, 24, 8))
+        : (finalColorRB | (finalColorGA << 8));
+}
+
+void main()
+{
+    int srcX = int(gl_GlobalInvocationID.x);
+    int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth;
+
+    uvec2 color = uvec2(ResultValue[resultOffset+ResultColorStart], ResultValue[resultOffset+FramebufferStride+ResultColorStart]);
+    uvec2 depth = uvec2(ResultValue[resultOffset+ResultDepthStart], ResultValue[resultOffset+FramebufferStride+ResultDepthStart]);
+    uvec2 attr = uvec2(ResultValue[resultOffset+ResultAttrStart], ResultValue[resultOffset+FramebufferStride+ResultAttrStart]);
+
+#ifdef EdgeMarking
+    if ((attr.x & 0xFU) != 0U)
+    {
+        uvec4 otherAttr = uvec4(ClearAttr);
+        uvec4 otherDepth = uvec4(ClearDepth);
+
+        if (srcX > 0U)
+        {
+            otherAttr.x = ResultValue[resultOffset-1+ResultAttrStart];
+            otherDepth.x = ResultValue[resultOffset-1+ResultDepthStart];
+        }
+        if (srcX < ScreenWidth-1)
+        {
+            otherAttr.y = ResultValue[resultOffset+1+ResultAttrStart];
+            otherDepth.y = ResultValue[resultOffset+1+ResultDepthStart];
+        }
+        if (gl_GlobalInvocationID.y > 0U)
+        {
+            otherAttr.z = ResultValue[resultOffset-ScreenWidth+ResultAttrStart];
+            otherDepth.z = ResultValue[resultOffset-ScreenWidth+ResultDepthStart];
+        }
+        if (gl_GlobalInvocationID.y < ScreenHeight-1)
+        {
+            otherAttr.w = ResultValue[resultOffset+ScreenWidth+ResultAttrStart];
+            otherDepth.w = ResultValue[resultOffset+ScreenWidth+ResultDepthStart];
+        }
+
+        uint polyId = bitfieldExtract(attr.x, 24, 6);
+        uvec4 otherPolyId = bitfieldExtract(otherAttr, 24, 6);
+
+        bvec4 polyIdMismatch = notEqual(uvec4(polyId), otherPolyId);
+        bvec4 nearer = lessThan(uvec4(depth.x), otherDepth);
+
+        if ((polyIdMismatch.x && nearer.x)
+            || (polyIdMismatch.y && nearer.y)
+            || (polyIdMismatch.z && nearer.z)
+            || (polyIdMismatch.w && nearer.w))
+        {
+            color.x = ToonTable[polyId >> 3].b | (color.x & 0xFF000000U);
+            attr.x = (attr.x & 0xFFFFE0FFU) | 0x00001000U;
+        }
+    }
+#endif
+
+#ifdef Fog
+    if ((attr.x & (1U<<15)) != 0U)
+    {
+        color.x = BlendFog(color.x, depth.x);
+    }
+
+    if ((attr.x & 0xFU) != 0 && (attr.y & (1U<<15)) != 0U)
+    {
+        color.y = BlendFog(color.y, depth.y);
+    }
+#endif
+
+#ifdef AntiAliasing
+    // resolve anti-aliasing
+    if ((attr.x & 0x3U) != 0)
+    {
+        uint coverage = (attr.x >> 8) & 0x1FU;
+
+        if (coverage != 0)
+        {
+            uint topRB = color.x & 0x3F003FU;
+            uint topG = color.x & 0x003F00U;
+            uint topA = bitfieldExtract(color.x, 24, 5);
+
+            uint botRB = color.y & 0x3F003FU;
+            uint botG = color.y & 0x003F00U;
+            uint botA = bitfieldExtract(color.y, 24, 5);
+
+            coverage++;
+
+            if (botA > 0)
+            {
+                topRB = ((topRB * coverage) + (botRB * (32-coverage))) >> 5;
+                topG = ((topG * coverage) + (botG * (32-coverage))) >> 5;
+
+                topRB &= 0x3F003FU;
+                topG &= 0x003F00U;
+            }
+
+            topA = ((topA * coverage) + (botA * (32-coverage))) >> 5;
+
+            color.x = topRB | topG | (topA << 24);
+        }
+        else
+        {
+            color.x = color.y;
+        }
+    }
+#endif
+
+//    if (bitfieldExtract(color.x, 24, 8) != 0U)
+//        color.x |= 0x40000000U;
+//    else
+//        color.x = 0U;
+
+    //if ((gl_GlobalInvocationID.y % 8) == 7 || (gl_GlobalInvocationID.y % 8) == 7)
+    //    color.x = 0x1F00001FU | 0x40000000U;
+
+    vec4 result = vec4(bitfieldExtract(color.x, 16, 8), bitfieldExtract(color.x, 8, 8), color.x & 0x3FU, bitfieldExtract(color.x, 24, 8));
+    result /= vec4(63.0, 63.0, 63.0, 31.0);
+    imageStore(FinalFB, ivec2(gl_GlobalInvocationID.xy), result);
+
+    // It's a division by constant, so using the builtin division is fine
+    const int scale = ScreenWidth/256;
+    ivec2 lowresCoordinate = ivec2(gl_GlobalInvocationID.xy) / scale;
+    ivec2 lowresCoordinateRest = ivec2(gl_GlobalInvocationID.xy) % scale;
+    if (lowresCoordinateRest == ivec2(0, 0))
+    {
+        uvec4 color8;
+        color8.x = bitfieldExtract(color.x, 0, 8);
+        color8.y = bitfieldExtract(color.x, 8, 8);
+        color8.z = bitfieldExtract(color.x, 16, 8);
+        color8.w = bitfieldExtract(color.x, 24, 8);
+        imageStore(LowResFB, lowresCoordinate, color8);
+    }
+}
+
+)";
+
+}
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/GPU3D_OpenGL.cpp b/src/GPU3D_OpenGL.cpp
index 9648be3..825a471 100644
--- a/src/GPU3D_OpenGL.cpp
+++ b/src/GPU3D_OpenGL.cpp
@@ -28,46 +28,32 @@
 namespace GPU3D
 {
 
-bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs)
+bool GLRenderer::BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs)
 {
     char shadername[32];
     sprintf(shadername, "RenderShader%02X", flags);
 
     int headerlen = strlen(kShaderHeader);
 
-    int vslen = strlen(vs);
-    int vsclen = strlen(kRenderVSCommon);
-    char* vsbuf = new char[headerlen + vsclen + vslen + 1];
-    strcpy(&vsbuf[0], kShaderHeader);
-    strcpy(&vsbuf[headerlen], kRenderVSCommon);
-    strcpy(&vsbuf[headerlen + vsclen], vs);
+    std::string vsbuf;
+    vsbuf += kShaderHeader;
+    vsbuf += kRenderVSCommon;
+    vsbuf += vs;
 
-    int fslen = strlen(fs);
-    int fsclen = strlen(kRenderFSCommon);
-    char* fsbuf = new char[headerlen + fsclen + fslen + 1];
-    strcpy(&fsbuf[0], kShaderHeader);
-    strcpy(&fsbuf[headerlen], kRenderFSCommon);
-    strcpy(&fsbuf[headerlen + fsclen], fs);
+    std::string fsbuf;
+    fsbuf += kShaderHeader;
+    fsbuf += kRenderFSCommon;
+    fsbuf += fs;
 
-    bool ret = OpenGL::BuildShaderProgram(vsbuf, fsbuf, RenderShader[flags], shadername);
-
-    delete[] vsbuf;
-    delete[] fsbuf;
+    GLuint prog;
+    bool ret = OpenGL::CompileVertexFragmentProgram(prog,
+        vsbuf, fsbuf,
+        shadername,
+        {{"vPosition", 0}, {"vColor", 1}, {"vTexcoord", 2}, {"vPolygonAttr", 3}},
+        {{"oColor", 0}, {"oAttr", 1}});
 
     if (!ret) return false;
 
-    GLuint prog = RenderShader[flags][2];
-
-    glBindAttribLocation(prog, 0, "vPosition");
-    glBindAttribLocation(prog, 1, "vColor");
-    glBindAttribLocation(prog, 2, "vTexcoord");
-    glBindAttribLocation(prog, 3, "vPolygonAttr");
-    glBindFragDataLocation(prog, 0, "oColor");
-    glBindFragDataLocation(prog, 1, "oAttr");
-
-    if (!OpenGL::LinkShaderProgram(RenderShader[flags]))
-        return false;
-
     GLint uni_id = glGetUniformBlockIndex(prog, "uConfig");
     glUniformBlockBinding(prog, uni_id, 0);
 
@@ -78,13 +64,15 @@ bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs)
     uni_id = glGetUniformLocation(prog, "TexPalMem");
     glUniform1i(uni_id, 1);
 
+    RenderShader[flags] = prog;
+
     return true;
 }
 
 void GLRenderer::UseRenderShader(u32 flags)
 {
     if (CurShaderID == flags) return;
-    glUseProgram(RenderShader[flags][2]);
+    glUseProgram(RenderShader[flags]);
     CurShaderID = flags;
 }
 
@@ -118,21 +106,17 @@ std::unique_ptr<GLRenderer> GLRenderer::New() noexcept
     glDepthRange(0, 1);
     glClearDepth(1.0);
 
-
-    if (!OpenGL::BuildShaderProgram(kClearVS, kClearFS, result->ClearShaderPlain, "ClearShader"))
+    if (!OpenGL::CompileVertexFragmentProgram(result->ClearShaderPlain,
+            kClearVS, kClearFS,
+            "ClearShader",
+            {{"vPosition", 0}},
+            {{"oColor", 0}, {"oAttr", 1}}))
         return nullptr;
 
-    glBindAttribLocation(result->ClearShaderPlain[2], 0, "vPosition");
-    glBindFragDataLocation(result->ClearShaderPlain[2], 0, "oColor");
-    glBindFragDataLocation(result->ClearShaderPlain[2], 1, "oAttr");
-
-    if (!OpenGL::LinkShaderProgram(result->ClearShaderPlain))
-        return nullptr;
-
-    result->ClearUniformLoc[0] = glGetUniformLocation(result->ClearShaderPlain[2], "uColor");
-    result->ClearUniformLoc[1] = glGetUniformLocation(result->ClearShaderPlain[2], "uDepth");
-    result->ClearUniformLoc[2] = glGetUniformLocation(result->ClearShaderPlain[2], "uOpaquePolyID");
-    result->ClearUniformLoc[3] = glGetUniformLocation(result->ClearShaderPlain[2], "uFogFlag");
+    result->ClearUniformLoc[0] = glGetUniformLocation(result->ClearShaderPlain, "uColor");
+    result->ClearUniformLoc[1] = glGetUniformLocation(result->ClearShaderPlain, "uDepth");
+    result->ClearUniformLoc[2] = glGetUniformLocation(result->ClearShaderPlain, "uOpaquePolyID");
+    result->ClearUniformLoc[3] = glGetUniformLocation(result->ClearShaderPlain, "uFogFlag");
 
     memset(result->RenderShader, 0, sizeof(RenderShader));
 
@@ -160,42 +144,35 @@ std::unique_ptr<GLRenderer> GLRenderer::New() noexcept
     if (!result->BuildRenderShader(RenderFlag_ShadowMask | RenderFlag_WBuffer, kRenderVS_W, kRenderFS_WSM))
         return nullptr;
 
-    if (!OpenGL::BuildShaderProgram(kFinalPassVS, kFinalPassEdgeFS, result->FinalPassEdgeShader, "FinalPassEdgeShader"))
+    if (!OpenGL::CompileVertexFragmentProgram(result->FinalPassEdgeShader,
+            kFinalPassVS, kFinalPassEdgeFS,
+            "FinalPassEdgeShader",
+            {{"vPosition", 0}},
+            {{"oColor", 0}}))
         return nullptr;
-
-    if (!OpenGL::BuildShaderProgram(kFinalPassVS, kFinalPassFogFS, result->FinalPassFogShader, "FinalPassFogShader"))
+    if (!OpenGL::CompileVertexFragmentProgram(result->FinalPassFogShader,
+            kFinalPassVS, kFinalPassFogFS,
+            "FinalPassFogShader",
+            {{"vPosition", 0}},
+            {{"oColor", 0}}))
         return nullptr;
 
-    glBindAttribLocation(result->FinalPassEdgeShader[2], 0, "vPosition");
-    glBindFragDataLocation(result->FinalPassEdgeShader[2], 0, "oColor");
-
-    if (!OpenGL::LinkShaderProgram(result->FinalPassEdgeShader))
-        return nullptr;
+    GLuint uni_id = glGetUniformBlockIndex(result->FinalPassEdgeShader, "uConfig");
+    glUniformBlockBinding(result->FinalPassEdgeShader, uni_id, 0);
 
-    GLint uni_id = glGetUniformBlockIndex(result->FinalPassEdgeShader[2], "uConfig");
-    glUniformBlockBinding(result->FinalPassEdgeShader[2], uni_id, 0);
-
-    glUseProgram(result->FinalPassEdgeShader[2]);
-
-    uni_id = glGetUniformLocation(result->FinalPassEdgeShader[2], "DepthBuffer");
+    glUseProgram(result->FinalPassEdgeShader);
+    uni_id = glGetUniformLocation(result->FinalPassEdgeShader, "DepthBuffer");
     glUniform1i(uni_id, 0);
-    uni_id = glGetUniformLocation(result->FinalPassEdgeShader[2], "AttrBuffer");
+    uni_id = glGetUniformLocation(result->FinalPassEdgeShader, "AttrBuffer");
     glUniform1i(uni_id, 1);
 
-    glBindAttribLocation(result->FinalPassFogShader[2], 0, "vPosition");
-    glBindFragDataLocation(result->FinalPassFogShader[2], 0, "oColor");
+    uni_id = glGetUniformBlockIndex(result->FinalPassFogShader, "uConfig");
+    glUniformBlockBinding(result->FinalPassFogShader, uni_id, 0);
 
-    if (!OpenGL::LinkShaderProgram(result->FinalPassFogShader))
-        return nullptr;
-
-    uni_id = glGetUniformBlockIndex(result->FinalPassFogShader[2], "uConfig");
-    glUniformBlockBinding(result->FinalPassFogShader[2], uni_id, 0);
-
-    glUseProgram(result->FinalPassFogShader[2]);
-
-    uni_id = glGetUniformLocation(result->FinalPassFogShader[2], "DepthBuffer");
+    glUseProgram(result->FinalPassFogShader);
+    uni_id = glGetUniformLocation(result->FinalPassFogShader, "DepthBuffer");
     glUniform1i(uni_id, 0);
-    uni_id = glGetUniformLocation(result->FinalPassFogShader[2], "AttrBuffer");
+    uni_id = glGetUniformLocation(result->FinalPassFogShader, "AttrBuffer");
     glUniform1i(uni_id, 1);
 
 
@@ -248,29 +225,29 @@ std::unique_ptr<GLRenderer> GLRenderer::New() noexcept
     glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, result->IndexBufferID);
     glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(IndexBuffer), nullptr, GL_DYNAMIC_DRAW);
 
-    glGenFramebuffers(4, &result->FramebufferID[0]);
-    glBindFramebuffer(GL_FRAMEBUFFER, result->FramebufferID[0]);
-
-    glGenTextures(8, &result->FramebufferTex[0]);
-    result->FrontBuffer = 0;
+    glGenFramebuffers(1, &result->MainFramebuffer);
+    // the capture target needs a framebuffer object of its own: left at its
+    // zero initial value, DownscaleFramebuffer would name the default framebuffer
+    glGenFramebuffers(1, &result->DownscaleFramebuffer);
 
     // color buffers
-    SetupDefaultTexParams(result->FramebufferTex[0]);
-    SetupDefaultTexParams(result->FramebufferTex[1]);
+    glGenTextures(1, &result->ColorBufferTex);
+    SetupDefaultTexParams(result->ColorBufferTex);
 
     // depth/stencil buffer
-    SetupDefaultTexParams(result->FramebufferTex[4]);
-    SetupDefaultTexParams(result->FramebufferTex[6]);
+    glGenTextures(1, &result->DepthBufferTex);
+    SetupDefaultTexParams(result->DepthBufferTex);
 
     // attribute buffer
     // R: opaque polyID (for edgemarking)
     // G: edge flag
     // B: fog flag
-    SetupDefaultTexParams(result->FramebufferTex[5]);
-    SetupDefaultTexParams(result->FramebufferTex[7]);
+    glGenTextures(1, &result->AttrBufferTex);
+    SetupDefaultTexParams(result->AttrBufferTex);
 
     // downscale framebuffer for display capture (always 256x192)
-    SetupDefaultTexParams(result->FramebufferTex[3]);
+    glGenTextures(1, &result->DownScaleBufferTex);
+    SetupDefaultTexParams(result->DownScaleBufferTex);
     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 256, 192, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
 
     glEnable(GL_BLEND);
@@ -308,8 +282,12 @@ GLRenderer::~GLRenderer()
     glDeleteTextures(1, &TexMemID);
     glDeleteTextures(1, &TexPalMemID);
 
-    glDeleteFramebuffers(4, &FramebufferID[0]);
-    glDeleteTextures(8, &FramebufferTex[0]);
+    glDeleteFramebuffers(1, &MainFramebuffer);
+    glDeleteFramebuffers(1, &DownscaleFramebuffer);
+    glDeleteTextures(1, &ColorBufferTex);
+    glDeleteTextures(1, &DepthBufferTex);
+    glDeleteTextures(1, &AttrBufferTex);
+    glDeleteTextures(1, &DownScaleBufferTex);
 
     glDeleteVertexArrays(1, &VertexArrayID);
     glDeleteBuffers(1, &VertexBufferID);
@@ -320,8 +298,8 @@ GLRenderer::~GLRenderer()
 
     for (int i = 0; i < 16; i++)
     {
-        if (!RenderShader[i][2]) continue;
-        OpenGL::DeleteShaderProgram(RenderShader[i]);
+        if (!RenderShader[i]) continue;
+        glDeleteProgram(RenderShader[i]);
     }
 }
 
@@ -339,40 +317,25 @@ void GLRenderer::SetRenderSettings(GPU::RenderSettings& settings)
     ScreenW = 256 * scale;
     ScreenH = 192 * scale;
 
-    glBindTexture(GL_TEXTURE_2D, FramebufferTex[0]);
+    glBindTexture(GL_TEXTURE_2D, ColorBufferTex);
     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-    glBindTexture(GL_TEXTURE_2D, FramebufferTex[1]);
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-
-    glBindTexture(GL_TEXTURE_2D, FramebufferTex[4]);
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
-    glBindTexture(GL_TEXTURE_2D, FramebufferTex[5]);
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
 
-    glBindTexture(GL_TEXTURE_2D, FramebufferTex[6]);
+    glBindTexture(GL_TEXTURE_2D, DepthBufferTex);
     glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
-    glBindTexture(GL_TEXTURE_2D, FramebufferTex[7]);
+    glBindTexture(GL_TEXTURE_2D, AttrBufferTex);
     glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
 
-    glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[3]);
-    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[3], 0);
+    glBindFramebuffer(GL_FRAMEBUFFER, DownscaleFramebuffer);
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, DownScaleBufferTex, 0);
 
     GLenum fbassign[2] = {GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1};
 
-    glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]);
-    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[0], 0);
-    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, FramebufferTex[4], 0);
-    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, FramebufferTex[5], 0);
-    glDrawBuffers(2, fbassign);
-
-    glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[1]);
-    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[1], 0);
-    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, FramebufferTex[6], 0);
-    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, FramebufferTex[7], 0);
+    glBindFramebuffer(GL_FRAMEBUFFER, MainFramebuffer);
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, ColorBufferTex, 0);
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, DepthBufferTex, 0);
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, AttrBufferTex, 0);
     glDrawBuffers(2, fbassign);
 
-    glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]);
-
     glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
     glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ);
 
@@ -1081,9 +1044,9 @@ void GLRenderer::RenderSceneChunk(int y, int h)
         glStencilMask(0);
 
         glActiveTexture(GL_TEXTURE0);
-        glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer ? 6 : 4]);
+        glBindTexture(GL_TEXTURE_2D, DepthBufferTex);
         glActiveTexture(GL_TEXTURE1);
-        glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer ? 7 : 5]);
+        glBindTexture(GL_TEXTURE_2D, AttrBufferTex);
 
         glBindBuffer(GL_ARRAY_BUFFER, ClearVertexBufferID);
         glBindVertexArray(ClearVertexArrayID);
@@ -1093,7 +1056,7 @@ void GLRenderer::RenderSceneChunk(int y, int h)
             // edge marking
             // TODO: depth/polyid values at screen edges
 
-            glUseProgram(FinalPassEdgeShader[2]);
+            glUseProgram(FinalPassEdgeShader);
 
             glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE);
 
@@ -1104,7 +1067,7 @@ void GLRenderer::RenderSceneChunk(int y, int h)
         {
             // fog
 
-            glUseProgram(FinalPassFogShader[2]);
+            glUseProgram(FinalPassFogShader);
 
             if (RenderDispCnt & (1<<6))
                 glBlendFuncSeparate(GL_ZERO, GL_ONE, GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_ALPHA);
@@ -1132,7 +1095,7 @@ void GLRenderer::RenderFrame()
     CurShaderID = -1;
 
     glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
-    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, FramebufferID[FrontBuffer]);
+    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, MainFramebuffer);
 
     ShaderConfig.uScreenSize[0] = ScreenW;
     ShaderConfig.uScreenSize[1] = ScreenH;
@@ -1238,7 +1201,7 @@ void GLRenderer::RenderFrame()
     // TODO: check whether 'clear polygon ID' affects translucent polyID
     // (for example when alpha is 1..30)
     {
-        glUseProgram(ClearShaderPlain[2]);
+        glUseProgram(ClearShaderPlain);
         glDepthFunc(GL_ALWAYS);
 
         u32 r = RenderClearAttr1 & 0x1F;
@@ -1298,22 +1261,18 @@ void GLRenderer::RenderFrame()
 
         RenderSceneChunk(0, 192);
     }
-
-    FrontBuffer = FrontBuffer ? 0 : 1;
 }
 
 void GLRenderer::PrepareCaptureFrame()
 {
-    // TODO: make sure this picks the right buffer when doing antialiasing
-    int original_fb = FrontBuffer^1;
-
-    glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferID[original_fb]);
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, MainFramebuffer);
     glReadBuffer(GL_COLOR_ATTACHMENT0);
-    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, FramebufferID[3]);
+    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, DownscaleFramebuffer);
     glDrawBuffer(GL_COLOR_ATTACHMENT0);
     glBlitFramebuffer(0, 0, ScreenW, ScreenH, 0, 0, 256, 192, GL_COLOR_BUFFER_BIT, GL_NEAREST);
 
-    glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferID[3]);
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, DownscaleFramebuffer);
     glReadPixels(0, 0, 256, 192, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
 }
 
@@ -1323,6 +1282,7 @@ u32* GLRenderer::GetLine(int line)
 
     if (line == 0)
     {
+        glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID);
         u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
         if (data) memcpy(&Framebuffer[stride*0], data, 4*stride*192);
         glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
@@ -1342,7 +1302,7 @@ u32* GLRenderer::GetLine(int line)
 
 void GLRenderer::SetupAccelFrame()
 {
-    glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer]);
+    glBindTexture(GL_TEXTURE_2D, ColorBufferTex);
 }
 
 }
diff --git a/src/GPU3D_OpenGL.h b/src/GPU3D_OpenGL.h
index 597f13e..04fb27a 100644
--- a/src/GPU3D_OpenGL.h
+++ b/src/GPU3D_OpenGL.h
@@ -37,8 +37,8 @@ public:
     virtual void RenderFrame() override;
     virtual u32* GetLine(int line) override;
 
-    void SetupAccelFrame();
-    void PrepareCaptureFrame();
+    void SetupAccelFrame() override;
+    void PrepareCaptureFrame() override;
 
     static std::unique_ptr<GLRenderer> New() noexcept;
 private:
@@ -65,7 +65,7 @@ private:
 
     RendererPolygon PolygonList[2048] {};
 
-    bool BuildRenderShader(u32 flags, const char* vs, const char* fs);
+    bool BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs);
     void UseRenderShader(u32 flags);
     void SetupPolygon(RendererPolygon* rp, Polygon* polygon);
     u32* SetupVertex(Polygon* poly, int vid, Vertex* vtx, u32 vtxattr, u32* vptr);
@@ -84,13 +84,13 @@ private:
     };
 
 
-    GLuint ClearShaderPlain[3] {};
+    GLuint ClearShaderPlain {};
 
-    GLuint RenderShader[16][3] {};
+    GLuint RenderShader[16] {};
     GLuint CurShaderID = -1;
 
-    GLuint FinalPassEdgeShader[3] {};
-    GLuint FinalPassFogShader[3] {};
+    GLuint FinalPassEdgeShader {};
+    GLuint FinalPassFogShader {};
 
     // std140 compliant structure
     struct
@@ -143,11 +143,11 @@ private:
     bool BetterPolygons {};
     int ScreenW {}, ScreenH {};
 
-    GLuint FramebufferTex[8] {};
-    int FrontBuffer {};
-    GLuint FramebufferID[4] {}, PixelbufferID {};
-    u32 Framebuffer[256*192] {};
-
+    GLuint ColorBufferTex {}, DepthBufferTex {}, AttrBufferTex {};
+    GLuint DownScaleBufferTex {};
+    GLuint PixelbufferID {};
 
+    GLuint MainFramebuffer {}, DownscaleFramebuffer {};
+    u32 Framebuffer[256*192] {};
 };
-}
\ No newline at end of file
+}
diff --git a/src/GPU3D_Texcache.cpp b/src/GPU3D_Texcache.cpp
new file mode 100644
index 0000000..1d409c2
--- /dev/null
+++ b/src/GPU3D_Texcache.cpp
@@ -0,0 +1,269 @@
+#include "GPU3D_Texcache.h"
+
+namespace GPU3D
+{
+
+// Averages two 15-bit colours channel by channel.
+// Bit 15 of either input is masked off and never reaches the result.
+inline u16 ColorAvg(u16 color0, u16 color1)
+{
+    // the three 5-bit channels sit at bits 0-4, 5-9 and 10-14
+    const u32 maskR = 0x001F, maskG = 0x03E0, maskB = 0x7C00;
+
+    // channels are summed where they sit; after halving, the re-mask throws
+    // away the low bit that slid down into the neighbouring channel
+    const u32 sumR = (color0 & maskR) + (color1 & maskR);
+    const u32 sumG = (color0 & maskG) + (color1 & maskG);
+    const u32 sumB = (color0 & maskB) + (color1 & maskB);
+
+    return (sumR >> 1) | ((sumG >> 1) & maskG) | ((sumB >> 1) & maskB);
+}
+
+// Blends two 15-bit colours channel by channel, weighting color0 by 5/8
+// and color1 by 3/8. Bit 15 of either input is masked off.
+inline u16 Color5of3(u16 color0, u16 color1)
+{
+    // the three 5-bit channels sit at bits 0-4, 5-9 and 10-14
+    const u32 maskR = 0x001F, maskG = 0x03E0, maskB = 0x7C00;
+
+    // weighted sums are built where each channel sits; dividing by 8 and
+    // re-masking drops the bits that slid down into the neighbouring channel
+    const u32 sumR = (color0 & maskR)*5 + (color1 & maskR)*3;
+    const u32 sumG = (color0 & maskG)*5 + (color1 & maskG)*3;
+    const u32 sumB = (color0 & maskB)*5 + (color1 & maskB)*3;
+
+    return (sumR >> 3) | ((sumG >> 3) & maskG) | ((sumB >> 3) & maskB);
+}
+
+// Blends two 15-bit colours channel by channel, weighting color0 by 3/8
+// and color1 by 5/8. Bit 15 of either input is masked off.
+inline u16 Color3of5(u16 color0, u16 color1)
+{
+    // the three 5-bit channels sit at bits 0-4, 5-9 and 10-14
+    const u32 maskR = 0x001F, maskG = 0x03E0, maskB = 0x7C00;
+
+    // weighted sums are built where each channel sits; dividing by 8 and
+    // re-masking drops the bits that slid down into the neighbouring channel
+    const u32 sumR = (color0 & maskR)*3 + (color1 & maskR)*5;
+    const u32 sumG = (color0 & maskG)*3 + (color1 & maskG)*5;
+    const u32 sumB = (color0 & maskB)*3 + (color1 & maskB)*5;
+
+    return (sumR >> 3) | ((sumG >> 3) & maskG) | ((sumB >> 3) & maskB);
+}
+
+// Widens a 15-bit colour to one byte per channel; the channels land on bits 3-7, 11-15 and 19-23.
+inline u32 ConvertRGB5ToRGB8(u16 val)
+{
+    const u32 wide = val; // widen once to 32 bits before shifting
+    return ((wide & 0x1F) << 3) | ((wide & 0x3E0) << 6) | ((wide & 0x7C00) << 9);
+}
+inline u32 ConvertRGB5ToBGR8(u16 val)
+{
+    // Same byte layout as ConvertRGB5ToRGB8 with the outer two bytes swapped: input bits 0-4
+    // go to bits 19-23, bits 5-9 to bits 11-15 and bits 10-14 to bits 3-7 of the result.
+    return (((u32)val & 0x1F) << 19) | (((u32)val & 0x3E0) << 6) | (((u32)val & 0x7C00) >> 7);
+}
+// Expands a 15-bit colour to 6 bits per channel, one channel per byte (bits 0-4 of val end up in the low byte).
+inline u32 ConvertRGB5ToRGB6(u16 val)
+{
+    u32 red = (val & 0x1F) << 1;
+    u32 green = (val & 0x3E0) >> 4;
+    u32 blue = (val & 0x7C00) >> 9;
+    // non-zero channels become x*2+1, so 31 maps to 63 while 0 stays 0
+    red += (red != 0); green += (green != 0); blue += (blue != 0);
+    return red | (green << 8) | (blue << 16);
+}
+
+// Converts a direct-colour texture: every texel is one 16-bit word whose bit 15 selects full alpha.
+template <int outputFmt>
+void ConvertBitmapTexture(u32 width, u32 height, u32* output, u8* texData)
+{
+    const u16* texels = (u16*)texData;
+    const u32 texelCount = width*height;
+
+    for (u32 idx = 0; idx < texelCount; idx++)
+    {
+        const u16 texel = texels[idx];
+        const bool opaque = (texel & 0x8000) != 0;
+
+        // outputFmt is a compile-time constant; a value outside the enum leaves output untouched
+        if (outputFmt == outputFmt_RGB6A5)
+            output[idx] = ConvertRGB5ToRGB6(texel) | (opaque ? 0x1F000000 : 0);
+        else if (outputFmt == outputFmt_RGBA8)
+            output[idx] = ConvertRGB5ToRGB8(texel) | (opaque ? 0xFF000000 : 0);
+        else if (outputFmt == outputFmt_BGRA8)
+            output[idx] = ConvertRGB5ToBGR8(texel) | (opaque ? 0xFF000000 : 0);
+    }
+}
+
+template void ConvertBitmapTexture<outputFmt_RGB6A5>(u32 width, u32 height, u32* output, u8* texData);
+
+// Decodes a texture stored as 4x4-texel blocks, each choosing from four colours.
+//
+// Parameters:
+//   width, height  texture size in texels; width/4 by height/4 whole blocks are decoded
+//   output         receives the texels in row-major order, packed according to outputFmt
+//   texData        one u32 per block: sixteen 2-bit colour selectors, texel (i, j) of
+//                  the block at bits 2*(i + j*4)
+//   texAuxData     one u16 per block: bits 0-13 palette offset, bits 14-15 mode
+//   palData        15-bit palette; the block's colours start at entry paletteOffset*2
+//
+// Colours 0 and 1 always come from the palette. The mode selects the other two:
+//   0: colour 2 from the palette, colour 3 transparent
+//   1: colour 2 is the average of colours 0 and 1, colour 3 transparent
+//   2: colours 2 and 3 both from the palette
+//   3: colour 2 is (5*c0 + 3*c1)/8 and colour 3 is (3*c0 + 5*c1)/8, per channel
+//
+// The blends reuse ColorAvg, Color5of3 and Color3of5 instead of repeating their
+// arithmetic inline. outputFmt is one of the outputFmt_* constants; any other
+// value leaves res unset.
+template <int outputFmt>
+void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData)
+{
+    const u32 blocksPerRow = width / 4;
+
+    // we process a whole block at a time
+    for (int y = 0; y < height / 4; y++)
+    {
+        for (int x = 0; x < blocksPerRow; x++)
+        {
+            u32 data = ((u32*)texData)[x + y * blocksPerRow];
+            u16 auxData = ((u16*)texAuxData)[x + y * blocksPerRow];
+
+            u32 paletteOffset = auxData & 0x3FFF;
+            // bit 15 is forced on for every colour except a transparent colour 3
+            u16 color0 = palData[paletteOffset*2] | 0x8000;
+            u16 color1 = palData[paletteOffset*2+1] | 0x8000;
+            u16 color2, color3;
+
+            switch ((auxData >> 14) & 0x3)
+            {
+            case 0:
+                // bit 15 stays clear on colour 3, so it decodes with zero alpha
+                color2 = palData[paletteOffset*2+2] | 0x8000;
+                color3 = 0;
+                break;
+            case 1:
+                // as mode 0, but colour 2 is interpolated instead of stored
+                color2 = ColorAvg(color0, color1) | 0x8000;
+                color3 = 0;
+                break;
+            case 2:
+                // both extra colours are stored in the palette
+                color2 = palData[paletteOffset*2+2] | 0x8000;
+                color3 = palData[paletteOffset*2+3] | 0x8000;
+                break;
+            case 3:
+                // both extra colours are weighted blends of colours 0 and 1
+                color2 = Color5of3(color0, color1) | 0x8000;
+                color3 = Color3of5(color0, color1) | 0x8000;
+                break;
+            }
+
+            // the four colours are packed into one 64-bit word so that a texel's
+            // 2-bit selector picks its colour with a single shift
+            u64 packed = color0 | ((u64)color1 << 16) | ((u64)color2 << 32) | ((u64)color3 << 48);
+
+            for (int j = 0; j < 4; j++)
+            {
+                for (int i = 0; i < 4; i++)
+                {
+                    // The selector must be masked to two bits before it is scaled:
+                    // unmasked, the higher selectors still present in data push the
+                    // shift count to 64 or more, which is undefined behaviour.
+                    u32 selector = (data >> (2 * (i + j * 4))) & 0x3;
+                    u16 color = (packed >> (16 * selector)) & 0xFFFF;
+
+                    // bit 15 becomes full alpha for the chosen format, otherwise alpha is 0
+                    u32 res;
+                    switch (outputFmt)
+                    {
+                    case outputFmt_RGB6A5:
+                        res = ConvertRGB5ToRGB6(color) | ((color & 0x8000) ? 0x1F000000 : 0);
+                        break;
+                    case outputFmt_RGBA8:
+                        res = ConvertRGB5ToRGB8(color) | ((color & 0x8000) ? 0xFF000000 : 0);
+                        break;
+                    case outputFmt_BGRA8:
+                        res = ConvertRGB5ToBGR8(color) | ((color & 0x8000) ? 0xFF000000 : 0);
+                        break;
+                    }
+
+                    // (x*4 + i, y*4 + j) is the texel's position in the whole texture
+                    output[x * 4 + i + (y * 4 + j) * width] = res;
+                }
+            }
+        }
+    }
+}
+
+template void ConvertCompressedTexture<outputFmt_RGB6A5>(u32, u32, u32*, u8*, u8*, u16*);
+
+// Converts a paletted texture whose bytes pack an X-bit alpha above a Y-bit palette index.
+template <int outputFmt, int X, int Y>
+void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData)
+{
+    const u32 indexMask = (1 << Y) - 1;
+    const u32 alphaMask = (1 << X) - 1;
+
+    for (int row = 0; row < height; row++)
+    {
+        for (int col = 0; col < width; col++)
+        {
+            const u32 offset = col + row * width;
+            const u8 texel = texData[offset];
+            const u16 color = palData[texel & indexMask];
+
+            // alpha fields that are not 5 bits wide are rescaled (a 3-bit 7 becomes 31)
+            u32 alpha = (texel >> Y) & alphaMask;
+            if (X != 5) alpha = alpha * 4 + alpha / 2;
+
+            u32 res;
+            if (outputFmt == outputFmt_RGB6A5) res = ConvertRGB5ToRGB6(color) | alpha << 24;
+            // 8-bit alpha repeats the top three bits below the five, so that 31 becomes 255
+            else if (outputFmt == outputFmt_RGBA8) res = ConvertRGB5ToRGB8(color) | (alpha << 27 | (alpha & 0x1C) << 22);
+            else if (outputFmt == outputFmt_BGRA8) res = ConvertRGB5ToBGR8(color) | (alpha << 27 | (alpha & 0x1C) << 22);
+            output[offset] = res;
+        }
+    }
+}
+
+template void ConvertAXIYTexture<outputFmt_RGB6A5, 5, 3>(u32, u32, u32*, u8*, u16*);
+template void ConvertAXIYTexture<outputFmt_RGB6A5, 3, 5>(u32, u32, u32*, u8*, u16*);
+
+// Converts a paletted texture that packs 8/colorBits texels into each byte, lowest bits first.
+template <int outputFmt, int colorBits>
+void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent)
+{
+    const int texelsPerByte = 8 / colorBits;
+    const u32 texelMask = (1 << colorBits) - 1;
+    const u32 bytesPerRow = width / texelsPerByte;
+
+    for (int row = 0; row < height; row++)
+    {
+        for (int byteX = 0; byteX < bytesPerRow; byteX++)
+        {
+            const u8 packed = texData[byteX + row * bytesPerRow];
+
+            for (int sub = 0; sub < texelsPerByte; sub++)
+            {
+                const u32 palIndex = (packed >> (sub * colorBits)) & texelMask;
+                const u16 color = palData[palIndex];
+                // palette entry 0 is see-through only when the caller asks for it
+                const bool opaque = !(color0Transparent && palIndex == 0);
+
+                u32 res;
+                if (outputFmt == outputFmt_RGB6A5) res = ConvertRGB5ToRGB6(color) | (opaque ? 0x1F000000 : 0);
+                else if (outputFmt == outputFmt_RGBA8) res = ConvertRGB5ToRGB8(color) | (opaque ? 0xFF000000 : 0);
+                else if (outputFmt == outputFmt_BGRA8) res = ConvertRGB5ToBGR8(color) | (opaque ? 0xFF000000 : 0);
+                output[byteX * texelsPerByte + row * width + sub] = res;
+            }
+        }
+    }
+}
+
+template void ConvertNColorsTexture<outputFmt_RGB6A5, 2>(u32, u32, u32*, u8*, u16*, bool);
+template void ConvertNColorsTexture<outputFmt_RGB6A5, 4>(u32, u32, u32*, u8*, u16*, bool);
+template void ConvertNColorsTexture<outputFmt_RGB6A5, 8>(u32, u32, u32*, u8*, u16*, bool);
+
+}
\ No newline at end of file
diff --git a/src/GPU3D_Texcache.h b/src/GPU3D_Texcache.h
new file mode 100644
index 0000000..73d70cb
--- /dev/null
+++ b/src/GPU3D_Texcache.h
@@ -0,0 +1,309 @@
+#ifndef GPU3D_TEXCACHE
+#define GPU3D_TEXCACHE
+
+#include "types.h"
+#include "GPU.h"
+
+#include <assert.h>
+#include <unordered_map>
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash/xxhash.h"
+
+namespace GPU3D
+{
+
+inline u32 TextureWidth(u32 texparam)
+{
+    return 8 << ((texparam >> 20) & 0x7); // bits 20-22 hold log2(width / 8), giving widths of 8..1024
+}
+
+inline u32 TextureHeight(u32 texparam)
+{
+    return 8 << ((texparam >> 23) & 0x7); // bits 23-25 hold log2(height / 8), giving heights of 8..1024
+}
+
+enum
+{
+    outputFmt_RGB6A5,
+    outputFmt_RGBA8,
+    outputFmt_BGRA8
+};
+
+template <int outputFmt>
+void ConvertBitmapTexture(u32 width, u32 height, u32* output, u8* texData);
+template <int outputFmt>
+void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData);
+template <int outputFmt, int X, int Y>
+void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData);
+template <int outputFmt, int colorBits>
+void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent);
+
+template <typename TexLoaderT, typename TexHandleT>
+class Texcache
+{
+public:
+    Texcache(const TexLoaderT& texloader)
+        : TexLoader(texloader) // probably better if this would be a move constructor???
+    {}
+
+    bool Update() // evicts entries whose source VRAM content changed; returns true when either MakeVRAMFlat_*Coherent call reported a change
+    {
+        auto textureDirty = GPU::VRAMDirty_Texture.DeriveState(GPU::VRAMMap_Texture);
+        auto texPalDirty = GPU::VRAMDirty_TexPal.DeriveState(GPU::VRAMMap_TexPal);
+
+        bool textureChanged = GPU::MakeVRAMFlat_TextureCoherent(textureDirty);
+        bool texPalChanged = GPU::MakeVRAMFlat_TexPalCoherent(texPalDirty);
+
+        if (textureChanged || texPalChanged)
+        {
+            // an entry is evicted only if a dirty bit overlaps one of its ranges AND that range's hash differs
+            for (auto it = Cache.begin(); it != Cache.end();)
+            {
+                TexCacheEntry& entry = it->second;
+                if (textureChanged)
+                {
+                    for (u32 i = 0; i < 2; i++)
+                    {
+                        u32 startBit = entry.TextureRAMStart[i] / GPU::VRAMDirtyGranularity;
+                        u32 bitsCount = ((entry.TextureRAMStart[i] + entry.TextureRAMSize[i] + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit;
+
+                        u32 startEntry = startBit >> 6;
+                        u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
+                        for (u32 j = startEntry; j < startEntry + entriesCount; j++)
+                        {
+                            if (GetRangedBitMask(j, startBit, bitsCount) & textureDirty.Data[j])
+                            {
+                                u64 newTexHash = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
+                                if (newTexHash != entry.TextureHash[i])
+                                    goto invalidate;
+                                break; // the hash covers the whole range, so rehashing for later dirty words is redundant
+                            }
+                        }
+                    }
+                }
+
+                if (texPalChanged && entry.TexPalSize > 0)
+                {
+                    u32 startBit = entry.TexPalStart / GPU::VRAMDirtyGranularity;
+                    u32 bitsCount = ((entry.TexPalStart + entry.TexPalSize + GPU::VRAMDirtyGranularity - 1) / GPU::VRAMDirtyGranularity) - startBit;
+
+                    u32 startEntry = startBit >> 6;
+                    u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry;
+                    for (u32 j = startEntry; j < startEntry + entriesCount; j++)
+                    {
+                        if (GetRangedBitMask(j, startBit, bitsCount) & texPalDirty.Data[j])
+                        {
+                            if (XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize) != entry.TexPalHash)
+                                goto invalidate;
+                            break; // same data hashes identically for every remaining dirty word
+                        }
+                    }
+                }
+
+                it++;
+                continue;
+            invalidate:
+                FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture);
+
+                // the array layer went back to the free list above; erase() yields the next iterator
+
+                it = Cache.erase(it);
+            }
+
+            return true;
+        }
+
+        return false;
+    }
+
+    void GetTexture(u32 texParam, u32 palBase, TexHandleT& textureHandle, u32& layer, u32*& helper) // returns the cached texture for texParam/palBase, decoding and uploading it on a miss
+    {
+        // remove sampling and texcoord gen params
+        texParam &= ~0xC00F0000;
+
+        u32 fmt = (texParam >> 26) & 0x7;
+        u64 key = texParam;
+        if (fmt != 7)
+        {
+            key |= (u64)palBase << 32;
+            if (fmt == 5)
+                key &= ~((u64)1 << 29);
+        }
+        // fmt 7 is decoded without a palette, so palBase stays out of its key; fmt 5 drops bit 29 from the key
+
+        assert(fmt != 0 && "no texture is not a texture format!");
+
+        auto it = Cache.find(key);
+
+        if (it != Cache.end())
+        {
+            textureHandle = it->second.Texture.TextureID;
+            layer = it->second.Texture.Layer;
+            helper = &it->second.LastVariant;
+            return;
+        }
+
+        u32 widthLog2 = (texParam >> 20) & 0x7;
+        u32 heightLog2 = (texParam >> 23) & 0x7;
+        u32 width = 8 << widthLog2;
+        u32 height = 8 << heightLog2;
+
+        u32 addr = (texParam & 0xFFFF) * 8;
+
+        TexCacheEntry entry = {0};
+
+        entry.TextureRAMStart[0] = addr;
+        entry.WidthLog2 = widthLog2;
+        entry.HeightLog2 = heightLog2;
+
+        // apparently a new texture
+        if (fmt == 7)
+        {
+            entry.TextureRAMSize[0] = width*height*2;
+
+            ConvertBitmapTexture<outputFmt_RGB6A5>(width, height, DecodingBuffer, &GPU::VRAMFlat_Texture[addr]);
+        }
+        else if (fmt == 5)
+        {
+            u8* texData = &GPU::VRAMFlat_Texture[addr];
+            u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1);
+            if (addr >= 0x40000)
+                slot1addr += 0x10000;
+            u8* texAuxData = &GPU::VRAMFlat_Texture[slot1addr];
+
+            u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palBase*16);
+
+            entry.TextureRAMSize[0] = width*height/16*4;
+            entry.TextureRAMStart[1] = slot1addr;
+            entry.TextureRAMSize[1] = width*height/16*2;
+            entry.TexPalStart = palBase*16;
+            entry.TexPalSize = 0x10000;
+
+            ConvertCompressedTexture<outputFmt_RGB6A5>(width, height, DecodingBuffer, texData, texAuxData, palData);
+        }
+        else
+        {
+            u32 texSize = 0, palAddr = palBase*16, numPalEntries = 0; // zeroed so an unexpected fmt cannot read indeterminate values
+            switch (fmt)
+            {
+            case 1: texSize = width*height; numPalEntries = 32; break;
+            case 6: texSize = width*height; numPalEntries = 8; break;
+            case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break;
+            case 3: texSize = width*height/2; numPalEntries = 16; break;
+            case 4: texSize = width*height; numPalEntries = 256; break;
+            }
+
+            palAddr &= 0x1FFFF;
+
+            /*printf("creating texture | fmt: %d | %dx%d | %08x | %08x\n", fmt, width, height, addr, palAddr);
+            svcSleepThread(1000*1000);*/
+
+            entry.TextureRAMSize[0] = texSize;
+            entry.TexPalStart = palAddr;
+            entry.TexPalSize = numPalEntries*2;
+
+            u8* texData = &GPU::VRAMFlat_Texture[addr];
+            u16* palData = (u16*)(GPU::VRAMFlat_TexPal + palAddr);
+
+            //assert(entry.TexPalStart+entry.TexPalSize <= 128*1024*1024);
+
+            bool color0Transparent = texParam & (1 << 29);
+
+            switch (fmt)
+            {
+            case 1: ConvertAXIYTexture<outputFmt_RGB6A5, 3, 5>(width, height, DecodingBuffer, texData, palData); break;
+            case 6: ConvertAXIYTexture<outputFmt_RGB6A5, 5, 3>(width, height, DecodingBuffer, texData, palData); break;
+            case 2: ConvertNColorsTexture<outputFmt_RGB6A5, 2>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
+            case 3: ConvertNColorsTexture<outputFmt_RGB6A5, 4>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
+            case 4: ConvertNColorsTexture<outputFmt_RGB6A5, 8>(width, height, DecodingBuffer, texData, palData, color0Transparent); break;
+            }
+        }
+
+        for (int i = 0; i < 2; i++)
+        {
+            if (entry.TextureRAMSize[i])
+                entry.TextureHash[i] = XXH3_64bits(&GPU::VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]);
+        }
+        if (entry.TexPalSize)
+            entry.TexPalHash = XXH3_64bits(&GPU::VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize);
+
+        auto& texArrays = TexArrays[widthLog2][heightLog2];
+        auto& freeTextures = FreeTextures[widthLog2][heightLog2];
+
+        if (freeTextures.size() == 0)
+        {
+            texArrays.resize(texArrays.size()+1);
+            TexHandleT& array = texArrays.back(); // handle type comes from the template, not from OpenGL
+
+            u32 layers = std::min<u32>((8*1024*1024) / (width*height*4), 64);
+
+            // allocate new array texture
+            // (layer count is capped at 64, or fewer so that width*height*4 bytes per layer stay within 8 MiB)
+            array = TexLoader.GenerateTexture(width, height, layers);
+
+            for (u32 i = 0; i < layers; i++)
+            {
+                freeTextures.push_back(TexArrayEntry{array, i});
+            }
+        }
+
+        TexArrayEntry storagePlace = freeTextures.back();
+        freeTextures.pop_back();
+
+        entry.Texture = storagePlace;
+
+        TexLoader.UploadTexture(storagePlace.TextureID, width, height, storagePlace.Layer, DecodingBuffer);
+        //printf("using storage place %d %d | %d %d (%d)\n", width, height, storagePlace.TexArrayIdx, storagePlace.LayerIdx, array.ImageDescriptor);
+
+        textureHandle = storagePlace.TextureID;
+        layer = storagePlace.Layer;
+        helper = &Cache.emplace(std::make_pair(key, entry)).first->second.LastVariant;
+    }
+
+    void Reset() // deletes every array texture through the loader and empties all bookkeeping
+    {
+        for (u32 wLog2 = 0; wLog2 < 8; wLog2++)
+        {
+            for (u32 hLog2 = 0; hLog2 < 8; hLog2++)
+            {
+                auto& arrays = TexArrays[wLog2][hLog2];
+                for (TexHandleT& handle : arrays) TexLoader.DeleteTexture(handle);
+                arrays.clear();
+                FreeTextures[wLog2][hLog2].clear();
+            }
+        }
+        Cache.clear();
+    }
+private:
+    struct TexArrayEntry
+    {
+        TexHandleT TextureID;
+        u32 Layer;
+    };
+
+    struct TexCacheEntry
+    {
+        u32 LastVariant; // very cheap way to make variant lookup faster
+
+        u32 TextureRAMStart[2], TextureRAMSize[2];
+        u32 TexPalStart, TexPalSize;
+        u8 WidthLog2, HeightLog2;
+        TexArrayEntry Texture;
+
+        u64 TextureHash[2];
+        u64 TexPalHash;
+    };
+    std::unordered_map<u64, TexCacheEntry> Cache;
+
+    TexLoaderT TexLoader;
+
+    std::vector<TexArrayEntry> FreeTextures[8][8];
+    std::vector<TexHandleT> TexArrays[8][8];
+
+    u32 DecodingBuffer[1024*1024];
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/GPU3D_TexcacheOpenGL.cpp b/src/GPU3D_TexcacheOpenGL.cpp
new file mode 100644
index 0000000..0473237
--- /dev/null
+++ b/src/GPU3D_TexcacheOpenGL.cpp
@@ -0,0 +1,29 @@
+#include "GPU3D_TexcacheOpenGL.h"
+
+namespace GPU3D
+{
+
+GLuint TexcacheOpenGLLoader::GenerateTexture(u32 width, u32 height, u32 layers) // creates a width x height 2D array texture with `layers` layers and returns its name
+{
+    GLuint texarray;
+    glGenTextures(1, &texarray);
+    glBindTexture(GL_TEXTURE_2D_ARRAY, texarray); // NOTE: the new texture is still bound to GL_TEXTURE_2D_ARRAY on return
+    glTexStorage3D(GL_TEXTURE_2D_ARRAY, 1, GL_RGBA8UI, width, height, layers); // a single mip level of RGBA8UI texels
+    return texarray;
+}
+
+void TexcacheOpenGLLoader::UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data) // copies one width x height image from data into layer `layer` of the array
+{
+    glBindTexture(GL_TEXTURE_2D_ARRAY, handle); // NOTE: handle is still bound to GL_TEXTURE_2D_ARRAY on return
+    glTexSubImage3D(GL_TEXTURE_2D_ARRAY,
+        0, 0, 0, layer,
+        width, height, 1,
+        GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, data);
+}
+
+void TexcacheOpenGLLoader::DeleteTexture(GLuint handle)
+{
+    glDeleteTextures(1, &handle);
+}
+
+}
\ No newline at end of file
diff --git a/src/GPU3D_TexcacheOpenGL.h b/src/GPU3D_TexcacheOpenGL.h
new file mode 100644
index 0000000..d61ae24
--- /dev/null
+++ b/src/GPU3D_TexcacheOpenGL.h
@@ -0,0 +1,25 @@
+#ifndef GPU3D_TEXCACHEOPENGL
+#define GPU3D_TEXCACHEOPENGL
+
+#include "GPU3D_Texcache.h"
+#include "OpenGLSupport.h"
+
+namespace GPU3D
+{
+
+template <typename, typename>
+class Texcache;
+
+class TexcacheOpenGLLoader
+{
+public:
+    GLuint GenerateTexture(u32 width, u32 height, u32 layers);
+    void UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data);
+    void DeleteTexture(GLuint handle);
+};
+
+using TexcacheOpenGL = Texcache<TexcacheOpenGLLoader, GLuint>;
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/GPU_OpenGL.cpp b/src/GPU_OpenGL.cpp
index 47e04d2..9d81717 100644
--- a/src/GPU_OpenGL.cpp
+++ b/src/GPU_OpenGL.cpp
@@ -36,32 +36,27 @@ using namespace OpenGL;
 std::unique_ptr<GLCompositor> GLCompositor::New() noexcept
 {
     assert(glBindAttribLocation != nullptr);
+    GLuint CompShader {};
 
-    std::array<GLuint, 3> CompShader {};
-    if (!OpenGL::BuildShaderProgram(kCompositorVS, kCompositorFS_Nearest, &CompShader[0], "CompositorShader"))
-        return nullptr;
-
-    glBindAttribLocation(CompShader[2], 0, "vPosition");
-    glBindAttribLocation(CompShader[2], 1, "vTexcoord");
-    glBindFragDataLocation(CompShader[2], 0, "oColor");
-
-    if (!OpenGL::LinkShaderProgram(CompShader.data()))
-        // OpenGL::LinkShaderProgram already deletes the shader program object
-        // if linking the shaders together failed.
+    if (!OpenGL::CompileVertexFragmentProgram(CompShader,
+            kCompositorVS, kCompositorFS_Nearest, 
+            "CompositorShader",
+            {{"vPosition", 0}, {"vTexcoord", 1}},
+            {{"oColor", 0}}))
         return nullptr;
 
     return std::unique_ptr<GLCompositor>(new GLCompositor(CompShader));
 }
 
-GLCompositor::GLCompositor(std::array<GLuint, 3> compShader) noexcept : CompShader(compShader)
+GLCompositor::GLCompositor(GLuint compShader) noexcept : CompShader(compShader)
 {
-    CompScaleLoc = glGetUniformLocation(CompShader[2], "u3DScale");
-    Comp3DXPosLoc = glGetUniformLocation(CompShader[2], "u3DXPos");
+    CompScaleLoc = glGetUniformLocation(CompShader, "u3DScale");
+    Comp3DXPosLoc = glGetUniformLocation(CompShader, "u3DXPos");
 
-    glUseProgram(CompShader[2]);
-    GLuint screenTextureUniform = glGetUniformLocation(CompShader[2], "ScreenTex");
+    glUseProgram(CompShader);
+    GLuint screenTextureUniform = glGetUniformLocation(CompShader, "ScreenTex");
     glUniform1i(screenTextureUniform, 0);
-    GLuint _3dTextureUniform = glGetUniformLocation(CompShader[2], "_3DTex");
+    GLuint _3dTextureUniform = glGetUniformLocation(CompShader, "_3DTex");
     glUniform1i(_3dTextureUniform, 1);
 
     // all this mess is to prevent bleeding
@@ -136,7 +131,7 @@ GLCompositor::~GLCompositor()
     glDeleteVertexArrays(1, &CompVertexArrayID);
     glDeleteBuffers(1, &CompVertexBufferID);
 
-    OpenGL::DeleteShaderProgram(CompShader.data());
+    glDeleteProgram(CompShader);
 }
 
 void GLCompositor::Reset()
@@ -186,9 +181,9 @@ void GLCompositor::Stop()
 
 void GLCompositor::RenderFrame()
 {
-    int frontbuf = GPU::FrontBuffer;
+    int backbuf = GPU::FrontBuffer ^ 1;
     glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
-    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, CompScreenOutputFB[frontbuf]);
+    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, CompScreenOutputFB[backbuf]);
 
     glDisable(GL_DEPTH_TEST);
     glDisable(GL_STENCIL_TEST);
@@ -200,7 +195,7 @@ void GLCompositor::RenderFrame()
     glClear(GL_COLOR_BUFFER_BIT);
 
     // TODO: select more shaders (filtering, etc)
-    OpenGL::UseShaderProgram(CompShader.data());
+    glUseProgram(CompShader);
     glUniform1ui(CompScaleLoc, Scale);
 
     // TODO: support setting this midframe, if ever needed
@@ -209,16 +204,16 @@ void GLCompositor::RenderFrame()
     glActiveTexture(GL_TEXTURE0);
     glBindTexture(GL_TEXTURE_2D, CompScreenInputTex);
 
-    if (GPU::Framebuffer[frontbuf][0] && GPU::Framebuffer[frontbuf][1])
+    if (GPU::Framebuffer[backbuf][0] && GPU::Framebuffer[backbuf][1])
     {
         glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER,
-                        GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][0]);
+                        GL_UNSIGNED_BYTE, GPU::Framebuffer[backbuf][0]);
         glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER,
-                        GL_UNSIGNED_BYTE, GPU::Framebuffer[frontbuf][1]);
+                        GL_UNSIGNED_BYTE, GPU::Framebuffer[backbuf][1]);
     }
 
     glActiveTexture(GL_TEXTURE1);
-    reinterpret_cast<GPU3D::GLRenderer*>(GPU3D::CurrentRenderer.get())->SetupAccelFrame();
+    GPU3D::CurrentRenderer->SetupAccelFrame();
 
     glBindBuffer(GL_ARRAY_BUFFER, CompVertexBufferID);
     glBindVertexArray(CompVertexArrayID);
diff --git a/src/GPU_OpenGL.h b/src/GPU_OpenGL.h
index 90c17ae..a507842 100644
--- a/src/GPU_OpenGL.h
+++ b/src/GPU_OpenGL.h
@@ -44,12 +44,12 @@ public:
     void RenderFrame();
     void BindOutputTexture(int buf);
 private:
-    GLCompositor(std::array<GLuint, 3> CompShader) noexcept;
+    GLCompositor(GLuint CompShader) noexcept;
 
     int Scale;
     int ScreenH, ScreenW;
 
-    std::array<GLuint, 3> CompShader;
+    GLuint CompShader;
     GLuint CompScaleLoc;
     GLuint Comp3DXPosLoc;
 
@@ -68,4 +68,4 @@ private:
     GLuint CompScreenOutputFB[2];
 };
 
-}
\ No newline at end of file
+}
diff --git a/src/NDS.cpp b/src/NDS.cpp
index 832661a..0e00bab 100644
--- a/src/NDS.cpp
+++ b/src/NDS.cpp
@@ -33,6 +33,7 @@
 #include "AREngine.h"
 #include "Platform.h"
 #include "FreeBIOS.h"
+#include "GPU3D.h"
 
 #ifdef JIT_ENABLED
 #include "ARMJIT.h"
diff --git a/src/NonStupidBitfield.h b/src/NonStupidBitfield.h
index a4fe7ec..6d9cd2c 100644
--- a/src/NonStupidBitfield.h
+++ b/src/NonStupidBitfield.h
@@ -26,6 +26,32 @@
 #include <initializer_list>
 #include <algorithm>
 
+inline u64 GetRangedBitMask(u32 idx, u32 startBit, u32 bitsCount) // mask of the bits of 64-bit word idx that lie in [startBit, startBit + bitsCount)
+{
+    u32 startEntry = startBit >> 6;
+    u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry; // number of 64-bit words the range touches
+
+    if (entriesCount > 1)
+    {
+        if (idx == startEntry)
+            return 0xFFFFFFFFFFFFFFFF << (startBit & 0x3F); // first word: everything from startBit upwards
+        if (((startBit + bitsCount) & 0x3F) && idx == startEntry + entriesCount - 1)
+            return ~(0xFFFFFFFFFFFFFFFF << ((startBit + bitsCount) & 0x3F)); // last word, only partially covered
+
+        return 0xFFFFFFFFFFFFFFFF; // NOTE: idx is not range-checked on this path; any other idx yields all ones
+    }
+    else if (idx == startEntry)
+    {
+        return bitsCount == 64 // 1ULL << 64 would be undefined, hence the special case
+            ? 0xFFFFFFFFFFFFFFFF
+            : ((1ULL << bitsCount) - 1) << (startBit & 0x3F);
+    }
+    else
+    {
+        return 0;
+    }
+}
+
 // like std::bitset but less stupid and optimised for 
 // our use case (keeping track of memory invalidations)
 
@@ -164,6 +190,11 @@ struct NonStupidBitField
         return Ref{*this, idx};
     }
 
+    bool operator[](u32 idx) const
+    {
+        return Data[idx >> 6] & (1ULL << (idx & 0x3F));
+    }
+
     void SetRange(u32 startBit, u32 bitsCount)
     {
         u32 startEntry = startBit >> 6;
@@ -185,6 +216,26 @@ struct NonStupidBitField
         }
     }
 
+    int Min() const // index of the lowest set bit, or -1 when no word of Data is non-zero
+    {
+        for (int i = 0; i < DataLength; i++)
+        {
+            if (Data[i])
+                return i * 64 + __builtin_ctzll(Data[i]); // trailing zero count = position of the lowest set bit in this word
+        }
+        return -1;
+    }
+
+    int Max() const // index of the highest set bit, or -1 when no word of Data is non-zero
+    {
+        for (int i = DataLength - 1; i >= 0; i--)
+        {
+            if (Data[i])
+                return i * 64 + (63 - __builtin_clzll(Data[i])); // 63 - leading zero count = position of the highest set bit
+        }
+        return -1;
+    }
+
     NonStupidBitField& operator|=(const NonStupidBitField<Size>& other)
     {
         for (u32 i = 0; i < DataLength; i++)
@@ -193,6 +244,7 @@ struct NonStupidBitField
         }
         return *this;
     }
+
     NonStupidBitField& operator&=(const NonStupidBitField<Size>& other)
     {
         for (u32 i = 0; i < DataLength; i++)
@@ -201,6 +253,20 @@ struct NonStupidBitField
         }
         return *this;
     }
+
+    operator bool() const // true if any bit below Size is set
+    {
+        for (int i = 0; i < DataLength - 1; i++)
+        {
+            if (Data[i])
+                return true;
+        }
+        if (Data[DataLength-1] & ((Size&0x3F) ? ~(0xFFFFFFFFFFFFFFFF << (Size&0x3F)) : 0xFFFFFFFFFFFFFFFF)) // bits at or past Size in the last word are masked off
+        {
+            return true;
+        }
+        return false;
+    }
 };
 
 
diff --git a/src/OpenGLSupport.cpp b/src/OpenGLSupport.cpp
index 5a8da11..0c793fa 100644
--- a/src/OpenGLSupport.cpp
+++ b/src/OpenGLSupport.cpp
@@ -18,78 +18,204 @@
 
 #include "OpenGLSupport.h"
 
+#include <unordered_map>
+#include <vector>
+
+#include <assert.h>
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash/xxhash.h"
+
 using Platform::Log;
 using Platform::LogLevel;
 
 namespace OpenGL
 {
 
-bool BuildShaderProgram(const char* vs, const char* fs, GLuint* ids, const char* name)
+struct ShaderCacheEntry
 {
-    int len;
-    int res;
+    u32 Length;
+    u8* Data;
+    u32 BinaryFormat;
 
-    if (!glCreateShader)
+    ShaderCacheEntry(u8* data, u32 length, u32 binaryFmt)
+        : Length(length), Data(data), BinaryFormat(binaryFmt)
     {
-        Log(LogLevel::Error, "OpenGL: Cannot build shader program, OpenGL hasn't been loaded\n");
-        return false;
+        assert(data != nullptr);
     }
 
-    ids[0] = glCreateShader(GL_VERTEX_SHADER);
-    len = strlen(vs);
-    glShaderSource(ids[0], 1, &vs, &len);
-    glCompileShader(ids[0]);
+    ShaderCacheEntry(const ShaderCacheEntry&) = delete;
+    ShaderCacheEntry(ShaderCacheEntry&& other)
+    {
+        Data = other.Data;
+        Length = other.Length;
+        BinaryFormat = other.BinaryFormat;
 
-    glGetShaderiv(ids[0], GL_COMPILE_STATUS, &res);
-    if (res != GL_TRUE)
+        other.Data = nullptr;
+        other.Length = 0;
+        other.BinaryFormat = 0;
+    }
+
+    ~ShaderCacheEntry()
     {
-        glGetShaderiv(ids[0], GL_INFO_LOG_LENGTH, &res);
-        if (res < 1) res = 1024;
-        char* log = new char[res+1];
-        glGetShaderInfoLog(ids[0], res+1, NULL, log);
-        Log(LogLevel::Error, "OpenGL: failed to compile vertex shader %s: %s\n", name, log);
-        Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", vs);
-        delete[] log;
+        if (Data) // check whether it was moved
+            delete[] Data;
+    }
+};
+
+std::unordered_map<u64, ShaderCacheEntry> ShaderCache;
+std::vector<u64> NewShaders;
+
+constexpr u32 ShaderCacheMagic = 0x11CAC4E1;
+constexpr u32 ShaderCacheVersion = 1;
+
+void LoadShaderCache() // reads the local "shadercache" file and fills ShaderCache with the program binaries stored in it
+{
+    // for now the shader cache contains only compute shaders
+    // because they take the longest to compile
+    Platform::FileHandle* file = Platform::OpenLocalFile("shadercache", Platform::FileMode::Read);
+    if (file == nullptr)
+    {
+        Log(LogLevel::Error, "Could not find shader cache\n");
+        return;
+    }
+
+    u32 magic, version, numPrograms;
+    if (Platform::FileRead(&magic, 4, 1, file) != 1 || magic != ShaderCacheMagic)
+    {
+        Log(LogLevel::Error, "Shader cache file has invalid magic\n");
+        goto fileInvalid;
+    }
+
+    if (Platform::FileRead(&version, 4, 1, file) != 1 || version != ShaderCacheVersion)
+    {
+        Log(LogLevel::Error, "Shader cache file has bad version\n");
+        goto fileInvalid;
+    }
+
+    if (Platform::FileRead(&numPrograms, 4, 1, file) != 1)
+    {
+        Log(LogLevel::Error, "Shader cache file invalid program count\n");
+        goto fileInvalid;
+    }
+
+    // not the best approach, because once changes pile up
+    // we read and overwrite the old files
+    for (u32 i = 0; i < numPrograms; i++)
+    {
+        int error = 3; // decremented by each FileRead result below; zero means all three fields were read
+
+        u32 length, binaryFormat;
+        u64 sourceHash;
+        error -= Platform::FileRead(&sourceHash, 8, 1, file);
+        error -= Platform::FileRead(&length, 4, 1, file);
+        error -= Platform::FileRead(&binaryFormat, 4, 1, file);
+
+        if (error != 0)
+        {
+            Log(LogLevel::Error, "Invalid shader cache entry\n");
+            goto fileInvalid;
+        }
+
+        u8* data = new u8[length]; // ownership passes to the ShaderCacheEntry constructed below
+        if (Platform::FileRead(data, length, 1, file) != 1)
+        {
+            Log(LogLevel::Error, "Could not read shader cache entry data\n");
+            delete[] data;
+            goto fileInvalid;
+        }
+
+        ShaderCache.erase(sourceHash); // a later entry with the same hash replaces an earlier one
+        ShaderCache.emplace(sourceHash, ShaderCacheEntry(data, length, binaryFormat));
+    }
+
+fileInvalid: // also reached after a successful load: the file is closed on every path past the open
+    Platform::CloseFile(file);
+}
+
+void SaveShaderCache() // rewrites the cache file header, then appends every entry listed in NewShaders
+{
+    Platform::FileHandle* file = Platform::OpenLocalFile("shadercache", Platform::FileMode::ReadWrite);
+
+    if (file == nullptr)
+    {
+        Log(LogLevel::Error, "Could not open or create shader cache file\n");
+        return;
+    }
+
+    int written = 3;
+    u32 magic = ShaderCacheMagic, version = ShaderCacheVersion, numPrograms = ShaderCache.size();
+    written -= Platform::FileWrite(&magic, 4, 1, file);
+    written -= Platform::FileWrite(&version, 4, 1, file);
+    written -= Platform::FileWrite(&numPrograms, 4, 1, file);
+
+    if (written != 0)
+    {
+        Log(LogLevel::Error, "Could not write shader cache header\n");
+        goto writeError;
+    }
+
+    Platform::FileSeek(file, 0, Platform::FileSeekOrigin::End);
+
+    printf("new shaders %u\n", static_cast<unsigned>(NewShaders.size())); // size() is size_t; %d was a format mismatch
 
-        glDeleteShader(ids[0]);
+    for (u64 newShader : NewShaders)
+    {
+        int error = 4;
+        auto it = ShaderCache.find(newShader);
+        if (it == ShaderCache.end()) continue; // hash is no longer cached, nothing to append
+        error -= Platform::FileWrite(&it->first, 8, 1, file);
+        error -= Platform::FileWrite(&it->second.Length, 4, 1, file);
+        error -= Platform::FileWrite(&it->second.BinaryFormat, 4, 1, file);
+        error -= Platform::FileWrite(it->second.Data, it->second.Length, 1, file);
+
+        if (error != 0)
+        {
+            Log(LogLevel::Error, "Could not insert new shader cache entry\n");
+            goto writeError;
+        }
+    }
+
+writeError:
+    Platform::CloseFile(file);
+
+    NewShaders.clear();
+}
+
+bool CompilerShader(GLuint& id, const std::string& source, const std::string& name, const std::string& type)
+{
+    int res;
 
+    if (!glCreateShader)
+    {
+        Log(LogLevel::Error, "OpenGL: Cannot build shader program, OpenGL hasn't been loaded\n");
         return false;
     }
 
-    ids[1] = glCreateShader(GL_FRAGMENT_SHADER);
-    len = strlen(fs);
-    glShaderSource(ids[1], 1, &fs, &len);
-    glCompileShader(ids[1]);
+    const char* sourceC = source.c_str();
+    int len = source.length();
+    glShaderSource(id, 1, &sourceC, &len);
+
+    glCompileShader(id);
 
-    glGetShaderiv(ids[1], GL_COMPILE_STATUS, &res);
+    glGetShaderiv(id, GL_COMPILE_STATUS, &res);
     if (res != GL_TRUE)
     {
-        glGetShaderiv(ids[1], GL_INFO_LOG_LENGTH, &res);
+        glGetShaderiv(id, GL_INFO_LOG_LENGTH, &res);
         if (res < 1) res = 1024;
         char* log = new char[res+1];
-        glGetShaderInfoLog(ids[1], res+1, NULL, log);
-        Log(LogLevel::Error, "OpenGL: failed to compile fragment shader %s: %s\n", name, log);
-        //printf("shader source:\n--\n%s\n--\n", fs);
+        glGetShaderInfoLog(id, res+1, NULL, log);
+        Log(LogLevel::Error, "OpenGL: failed to compile %s shader %s: %s\n", type.c_str(), name.c_str(), log);
+        Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", source.c_str());
         delete[] log;
 
-        Platform::FileHandle* logf = Platform::OpenFile("shaderfail.log", Platform::FileMode::WriteText);
-        Platform::FileWrite(fs, len+1, 1, logf);
-        Platform::CloseFile(logf);
-
-        glDeleteShader(ids[0]);
-        glDeleteShader(ids[1]);
-
         return false;
     }
 
-    ids[2] = glCreateProgram();
-    glAttachShader(ids[2], ids[0]);
-    glAttachShader(ids[2], ids[1]);
-
     return true;
 }
 
-bool LinkShaderProgram(GLuint* ids)
+bool LinkProgram(GLuint& result, GLuint* ids, int numIds)
 {
     int res;
 
@@ -99,46 +225,132 @@ bool LinkShaderProgram(GLuint* ids)
         return false;
     }
 
-    glLinkProgram(ids[2]);
+    for (int i = 0; i < numIds; i++)
+    {
+        glAttachShader(result, ids[i]);
+    }
 
-    glDetachShader(ids[2], ids[0]);
-    glDetachShader(ids[2], ids[1]);
+    glLinkProgram(result);
 
-    glDeleteShader(ids[0]);
-    glDeleteShader(ids[1]);
+    for (int i = 0; i < numIds; i++)
+        glDetachShader(result, ids[i]);
 
-    glGetProgramiv(ids[2], GL_LINK_STATUS, &res);
+    glGetProgramiv(result, GL_LINK_STATUS, &res);
     if (res != GL_TRUE)
     {
-        glGetProgramiv(ids[2], GL_INFO_LOG_LENGTH, &res);
+        glGetProgramiv(result, GL_INFO_LOG_LENGTH, &res);
         if (res < 1) res = 1024;
         char* log = new char[res+1];
-        glGetProgramInfoLog(ids[2], res+1, NULL, log);
+        glGetProgramInfoLog(result, res+1, NULL, log);
         Log(LogLevel::Error, "OpenGL: failed to link shader program: %s\n", log);
         delete[] log;
 
-        glDeleteProgram(ids[2]);
-
         return false;
     }
 
     return true;
 }
 
-void DeleteShaderProgram(GLuint* ids)
+bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name) // compiles and links a compute program into result; on failure the program is deleted and false is returned
 {
-    if (glDeleteProgram)
-    { // If OpenGL isn't loaded, then there's no shader program to delete
-        glDeleteProgram(ids[2]);
+    result = glCreateProgram();
+
+    /*u64 sourceHash = XXH64(source.data(), source.size(), 0);
+    auto it = ShaderCache.find(sourceHash);
+    if (it != ShaderCache.end())
+    {
+        glProgramBinary(result, it->second.BinaryFormat, it->second.Data, it->second.Length);
+
+        GLint linkStatus;
+        glGetProgramiv(result, GL_LINK_STATUS, &linkStatus);
+        if (linkStatus == GL_TRUE)
+        {
+            Log(LogLevel::Info, "Restored shader %s from cache\n", name.c_str());
+            return true;
+        }
+        else
+        {
+        }
+    }*/
+    // The program-binary cache lookup above is disabled, so nothing is "rejected": always compile from source.
+
+    GLuint shader = 0; // glDeleteShader silently ignores 0, which matters when we bail out before creating the shader
+    bool linkingSucess = false;
+
+    if (!glCreateShader || !glDeleteShader)
+        goto error;
+
+    shader = glCreateShader(GL_COMPUTE_SHADER);
+
+    if (!CompilerShader(shader, source, name, "compute"))
+        goto error;
+
+    linkingSucess = LinkProgram(result, &shader, 1);
+
+error:
+    if (glDeleteShader) glDeleteShader(shader); // the guard above may have found this entry point missing
+
+    if (!linkingSucess)
+    {
+        glDeleteProgram(result);
     }
+    /*else
+    {
+        GLint length;
+        GLenum format;
+        glGetProgramiv(result, GL_PROGRAM_BINARY_LENGTH, &length);
+
+        u8* buffer = new u8[length];
+        glGetProgramBinary(result, length, nullptr, &format, buffer);
+
+        ShaderCache.emplace(sourceHash, ShaderCacheEntry(buffer, length, format));
+        NewShaders.push_back(sourceHash);
+    }*/
+
+    return linkingSucess;
 }
 
-void UseShaderProgram(GLuint* ids)
+bool CompileVertexFragmentProgram(GLuint& result,
+    const std::string& vs, const std::string& fs,
+    const std::string& name,
+    const std::initializer_list<AttributeTarget>& vertexInAttrs,
+    const std::initializer_list<AttributeTarget>& fragmentOutAttrs)
 {
-    if (glUseProgram)
-    { // If OpenGL isn't loaded, then there's no shader program to use
-        glUseProgram(ids[2]);
+    GLuint shaders[2] =
+    {
+        glCreateShader(GL_VERTEX_SHADER),
+        glCreateShader(GL_FRAGMENT_SHADER)
+    };
+    result = glCreateProgram();
+
+    bool linkingSucess = false;
+
+    if (!CompilerShader(shaders[0], vs, name, "vertex"))
+        goto error;
+
+    if (!CompilerShader(shaders[1], fs, name, "fragment"))
+        goto error;
+
+
+    for (const AttributeTarget& target : vertexInAttrs)
+    {
+        glBindAttribLocation(result, target.Location, target.Name);
     }
+    for (const AttributeTarget& target : fragmentOutAttrs)
+    {
+        glBindFragDataLocation(result, target.Location, target.Name);
+    }
+
+    linkingSucess = LinkProgram(result, shaders, 2);
+
+error:
+    glDeleteShader(shaders[1]);
+    glDeleteShader(shaders[0]);
+
+    if (!linkingSucess)
+        glDeleteProgram(result);
+
+    return linkingSucess;
 }
 
 }
diff --git a/src/OpenGLSupport.h b/src/OpenGLSupport.h
index 14be01a..ec2cb1f 100644
--- a/src/OpenGLSupport.h
+++ b/src/OpenGLSupport.h
@@ -29,10 +29,23 @@
 namespace OpenGL
 {
 
-bool BuildShaderProgram(const char* vs, const char* fs, GLuint* ids, const char* name);
-bool LinkShaderProgram(GLuint* ids);
-void DeleteShaderProgram(GLuint* ids);
-void UseShaderProgram(GLuint* ids);
+void LoadShaderCache();
+void SaveShaderCache();
+
+struct AttributeTarget // Pairs a shader variable name with the location CompileVertexFragmentProgram binds it to before linking.
+{
+    const char* Name; // passed as the name argument of glBindAttribLocation / glBindFragDataLocation
+    u32 Location; // passed as the index argument of those same calls
+};
+
+
+bool CompileVertexFragmentProgram(GLuint& result,
+    const std::string& vs, const std::string& fs,
+    const std::string& name,
+    const std::initializer_list<AttributeTarget>& vertexInAttrs,
+    const std::initializer_list<AttributeTarget>& fragmentOutAttrs);
+
+bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name);
 
 }
 
diff --git a/src/frontend/qt_sdl/Config.cpp b/src/frontend/qt_sdl/Config.cpp
index c28f63a..cc489cc 100644
--- a/src/frontend/qt_sdl/Config.cpp
+++ b/src/frontend/qt_sdl/Config.cpp
@@ -22,6 +22,7 @@
 #include <inttypes.h>
 #include "Platform.h"
 #include "Config.h"
+#include "GPU.h"
 
 
 namespace Config
@@ -58,6 +59,7 @@ bool Threaded3D;
 
 int GL_ScaleFactor;
 bool GL_BetterPolygons;
+bool GL_HiresCoordinates;
 
 bool LimitFPS;
 bool AudioSync;
@@ -243,11 +245,12 @@ ConfigEntry ConfigFile[] =
     {"ScreenVSync",         1, &ScreenVSync,         false, false},
     {"ScreenVSyncInterval", 0, &ScreenVSyncInterval, 1, false},
 
-    {"3DRenderer", 0, &_3DRenderer, 0, false},
+    {"3DRenderer", 0, &_3DRenderer, GPU::renderer3D_Software, false},
     {"Threaded3D", 1, &Threaded3D, true, false},
 
     {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, false},
     {"GL_BetterPolygons", 1, &GL_BetterPolygons, false, false},
+    {"GL_HiresCoordinates", 1, &GL_HiresCoordinates, true, false},
 
     {"LimitFPS", 1, &LimitFPS, true, false},
     {"AudioSync", 1, &AudioSync, false},
diff --git a/src/frontend/qt_sdl/Config.h b/src/frontend/qt_sdl/Config.h
index fba9bfb..19bf0c4 100644
--- a/src/frontend/qt_sdl/Config.h
+++ b/src/frontend/qt_sdl/Config.h
@@ -103,6 +103,7 @@ extern bool Threaded3D;
 
 extern int GL_ScaleFactor;
 extern bool GL_BetterPolygons;
+extern bool GL_HiresCoordinates;
 
 extern bool LimitFPS;
 extern bool AudioSync;
diff --git a/src/frontend/qt_sdl/OSD.cpp b/src/frontend/qt_sdl/OSD.cpp
index d3becc1..6842d5f 100644
--- a/src/frontend/qt_sdl/OSD.cpp
+++ b/src/frontend/qt_sdl/OSD.cpp
@@ -57,7 +57,7 @@ struct Item
 
 std::deque<Item> ItemQueue;
 
-GLuint Shader[3];
+GLuint Shader;
 GLint uScreenSize, uOSDPos, uOSDSize;
 GLfloat uScaleFactor;
 GLuint OSDVertexArray;
@@ -70,20 +70,19 @@ bool Init(bool openGL)
 {
     if (openGL)
     {
-        OpenGL::BuildShaderProgram(kScreenVS_OSD, kScreenFS_OSD, Shader, "OSDShader");
+        OpenGL::CompileVertexFragmentProgram(Shader,
+            kScreenVS_OSD, kScreenFS_OSD,
+            "OSDShader",
+            {{"vPosition", 0}},
+            {{"oColor", 0}});
 
-        GLuint pid = Shader[2];
-        glBindAttribLocation(pid, 0, "vPosition");
-        glBindFragDataLocation(pid, 0, "oColor");
+        glUseProgram(Shader);
+        glUniform1i(glGetUniformLocation(Shader, "OSDTex"), 0);
 
-        OpenGL::LinkShaderProgram(Shader);
-        glUseProgram(pid);
-        glUniform1i(glGetUniformLocation(pid, "OSDTex"), 0);
-
-        uScreenSize = glGetUniformLocation(pid, "uScreenSize");
-        uOSDPos = glGetUniformLocation(pid, "uOSDPos");
-        uOSDSize = glGetUniformLocation(pid, "uOSDSize");
-        uScaleFactor = glGetUniformLocation(pid, "uScaleFactor");
+        uScreenSize = glGetUniformLocation(Shader, "uScreenSize");
+        uOSDPos = glGetUniformLocation(Shader, "uOSDPos");
+        uOSDSize = glGetUniformLocation(Shader, "uOSDSize");
+        uScaleFactor = glGetUniformLocation(Shader, "uScaleFactor");
 
         float vertices[6*2] =
         {
@@ -425,7 +424,7 @@ void DrawGL(float w, float h)
 
     u32 y = kOSDMargin;
 
-    glUseProgram(Shader[2]);
+    glUseProgram(Shader);
 
     glUniform2f(uScreenSize, w, h);
     glUniform1f(uScaleFactor, mainWindow->devicePixelRatioF());
diff --git a/src/frontend/qt_sdl/VideoSettingsDialog.cpp b/src/frontend/qt_sdl/VideoSettingsDialog.cpp
index 95ec7d3..5ef10c5 100644
--- a/src/frontend/qt_sdl/VideoSettingsDialog.cpp
+++ b/src/frontend/qt_sdl/VideoSettingsDialog.cpp
@@ -23,6 +23,7 @@
 #include "types.h"
 #include "Platform.h"
 #include "Config.h"
+#include "GPU.h"
 
 #include "VideoSettingsDialog.h"
 #include "ui_VideoSettingsDialog.h"
@@ -30,11 +31,20 @@
 
 inline bool UsesGL()
 {
-    return (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0);
+    return (Config::ScreenUseGL != 0) || (Config::_3DRenderer != GPU::renderer3D_Software);
 }
 
 VideoSettingsDialog* VideoSettingsDialog::currentDlg = nullptr;
 
+void VideoSettingsDialog::setEnabled() // Syncs the enabled state of renderer-specific widgets with Config::_3DRenderer. NOTE(review): this zero-argument overload hides the inherited setEnabled(bool) inside this class - confirm no caller needs it.
+{
+    bool softwareRenderer = Config::_3DRenderer == GPU::renderer3D_Software;
+    ui->cbGLDisplay->setEnabled(softwareRenderer); // only selectable while the software renderer is active
+    ui->cbSoftwareThreaded->setEnabled(softwareRenderer); // only selectable while the software renderer is active
+    ui->cbxGLResolution->setEnabled(!softwareRenderer); // available to every non-software renderer
+    ui->cbBetterPolygons->setEnabled(Config::_3DRenderer == GPU::renderer3D_OpenGL); // classic OpenGL renderer only
+    ui->cbxComputeHiResCoords->setEnabled(Config::_3DRenderer == GPU::renderer3D_OpenGLCompute); // compute-shader renderer only
+}
 
 VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::VideoSettingsDialog)
 {
@@ -48,10 +58,12 @@ VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui(
     oldSoftThreaded = Config::Threaded3D;
     oldGLScale = Config::GL_ScaleFactor;
     oldGLBetterPolygons = Config::GL_BetterPolygons;
+    oldHiresCoordinates = Config::GL_HiresCoordinates;
 
     grp3DRenderer = new QButtonGroup(this);
-    grp3DRenderer->addButton(ui->rb3DSoftware, 0);
-    grp3DRenderer->addButton(ui->rb3DOpenGL,   1);
+    grp3DRenderer->addButton(ui->rb3DSoftware, GPU::renderer3D_Software);
+    grp3DRenderer->addButton(ui->rb3DOpenGL,   GPU::renderer3D_OpenGL);
+    grp3DRenderer->addButton(ui->rb3DCompute,  GPU::renderer3D_OpenGLCompute);
 #if QT_VERSION < QT_VERSION_CHECK(5, 15, 0)
     connect(grp3DRenderer, SIGNAL(buttonClicked(int)), this, SLOT(onChange3DRenderer(int)));
 #else
@@ -75,25 +87,13 @@ VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui(
     ui->cbxGLResolution->setCurrentIndex(Config::GL_ScaleFactor-1);
 
     ui->cbBetterPolygons->setChecked(Config::GL_BetterPolygons != 0);
+    ui->cbxComputeHiResCoords->setChecked(Config::GL_HiresCoordinates != 0);
 
     if (!Config::ScreenVSync)
         ui->sbVSyncInterval->setEnabled(false);
     setVsyncControlEnable(UsesGL());
 
-    if (Config::_3DRenderer == 0)
-    {
-        ui->cbGLDisplay->setEnabled(true);
-        ui->cbSoftwareThreaded->setEnabled(true);
-        ui->cbxGLResolution->setEnabled(false);
-        ui->cbBetterPolygons->setEnabled(false);
-    }
-    else
-    {
-        ui->cbGLDisplay->setEnabled(false);
-        ui->cbSoftwareThreaded->setEnabled(false);
-        ui->cbxGLResolution->setEnabled(true);
-        ui->cbBetterPolygons->setEnabled(true);
-    }
+    setEnabled();
 }
 
 VideoSettingsDialog::~VideoSettingsDialog()
@@ -119,6 +119,7 @@ void VideoSettingsDialog::on_VideoSettingsDialog_rejected()
     Config::Threaded3D = oldSoftThreaded;
     Config::GL_ScaleFactor = oldGLScale;
     Config::GL_BetterPolygons = oldGLBetterPolygons;
+    Config::GL_HiresCoordinates = oldHiresCoordinates;
 
     emit updateVideoSettings(old_gl != UsesGL());
 
@@ -133,31 +134,18 @@ void VideoSettingsDialog::setVsyncControlEnable(bool hasOGL)
 
 void VideoSettingsDialog::onChange3DRenderer(int renderer)
 {
-    bool old_gl = (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0);
+    bool old_gl = UsesGL();
 
     Config::_3DRenderer = renderer;
 
-    if (renderer == 0)
-    {
-        ui->cbGLDisplay->setEnabled(true);
-        ui->cbSoftwareThreaded->setEnabled(true);
-        ui->cbxGLResolution->setEnabled(false);
-        ui->cbBetterPolygons->setEnabled(false);
-    }
-    else
-    {
-        ui->cbGLDisplay->setEnabled(false);
-        ui->cbSoftwareThreaded->setEnabled(false);
-        ui->cbxGLResolution->setEnabled(true);
-        ui->cbBetterPolygons->setEnabled(true);
-    }
+    setEnabled();
 
     emit updateVideoSettings(old_gl != UsesGL());
 }
 
 void VideoSettingsDialog::on_cbGLDisplay_stateChanged(int state)
 {
-    bool old_gl = (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0);
+    bool old_gl = UsesGL();
 
     Config::ScreenUseGL = (state != 0);
 
@@ -205,3 +193,10 @@ void VideoSettingsDialog::on_cbBetterPolygons_stateChanged(int state)
 
     emit updateVideoSettings(false);
 }
+
+void VideoSettingsDialog::on_cbxComputeHiResCoords_stateChanged(int state) // Handler for state changes of the cbxComputeHiResCoords checkbox.
+{
+    Config::GL_HiresCoordinates = (state != 0); // any non-zero check state counts as enabled
+
+    emit updateVideoSettings(false); // other call sites pass `old_gl != UsesGL()`, so false presumably means GL usage is unchanged - confirm
+}
diff --git a/src/frontend/qt_sdl/VideoSettingsDialog.h b/src/frontend/qt_sdl/VideoSettingsDialog.h
index 7fee5bb..166a826 100644
--- a/src/frontend/qt_sdl/VideoSettingsDialog.h
+++ b/src/frontend/qt_sdl/VideoSettingsDialog.h
@@ -65,10 +65,12 @@ private slots:
 
     void on_cbxGLResolution_currentIndexChanged(int idx);
     void on_cbBetterPolygons_stateChanged(int state);
+    void on_cbxComputeHiResCoords_stateChanged(int state);
 
     void on_cbSoftwareThreaded_stateChanged(int state);
 private:
     void setVsyncControlEnable(bool hasOGL);
+    void setEnabled();
 
     Ui::VideoSettingsDialog* ui;
 
@@ -81,6 +83,7 @@ private:
     int oldSoftThreaded;
     int oldGLScale;
     int oldGLBetterPolygons;
+    int oldHiresCoordinates;
 };
 
 #endif // VIDEOSETTINGSDIALOG_H
diff --git a/src/frontend/qt_sdl/VideoSettingsDialog.ui b/src/frontend/qt_sdl/VideoSettingsDialog.ui
index 11cfe3d..ff9baf8 100644
--- a/src/frontend/qt_sdl/VideoSettingsDialog.ui
+++ b/src/frontend/qt_sdl/VideoSettingsDialog.ui
@@ -6,7 +6,7 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>408</width>
+    <width>427</width>
     <height>262</height>
    </rect>
   </property>
@@ -24,7 +24,7 @@
     <enum>QLayout::SetFixedSize</enum>
    </property>
    <property name="horizontalSpacing">
-    <number>-1</number>
+    <number>6</number>
    </property>
    <item row="1" column="1">
     <widget class="QGroupBox" name="groupBox_3">
@@ -39,6 +39,16 @@
         </property>
        </widget>
       </item>
+      <item row="2" column="0">
+       <widget class="QCheckBox" name="cbBetterPolygons">
+        <property name="whatsThis">
+         <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Enabling this may help reduce distortion on quads and more complex polygons, but may also reduce performance.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
+        </property>
+        <property name="text">
+         <string>Improved polygon splitting</string>
+        </property>
+       </widget>
+      </item>
       <item row="1" column="0">
        <widget class="QComboBox" name="cbxGLResolution">
         <property name="whatsThis">
@@ -46,13 +56,10 @@
         </property>
        </widget>
       </item>
-      <item row="2" column="0">
-       <widget class="QCheckBox" name="cbBetterPolygons">
-        <property name="whatsThis">
-         <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Enabling this may help reduce distortion on quads and more complex polygons, but may also reduce performance.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
-        </property>
+      <item row="3" column="0">
+       <widget class="QCheckBox" name="cbxComputeHiResCoords">
         <property name="text">
-         <string>Improved polygon splitting</string>
+         <string>Use high resolution coordinates</string>
         </property>
        </widget>
       </item>
@@ -94,23 +101,7 @@
       <string>Display settings</string>
      </property>
      <layout class="QGridLayout" name="gridLayout_2">
-      <item row="6" column="0">
-       <widget class="QLabel" name="label_2">
-        <property name="sizePolicy">
-         <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
-          <horstretch>0</horstretch>
-          <verstretch>0</verstretch>
-         </sizepolicy>
-        </property>
-        <property name="whatsThis">
-         <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
-        </property>
-        <property name="text">
-         <string>VSync interval:</string>
-        </property>
-       </widget>
-      </item>
-      <item row="6" column="1">
+      <item row="7" column="1">
        <widget class="QSpinBox" name="sbVSyncInterval">
         <property name="whatsThis">
          <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
@@ -123,7 +114,7 @@
         </property>
        </widget>
       </item>
-      <item row="4" column="0" colspan="2">
+      <item row="5" column="0" colspan="2">
        <widget class="QCheckBox" name="cbGLDisplay">
         <property name="whatsThis">
          <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;Use OpenGL to draw the DS screens to the main window. May result in better frame pacing. Mandatory when using the OpenGL 3D renderer.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
@@ -133,17 +124,7 @@
         </property>
        </widget>
       </item>
-      <item row="5" column="0" colspan="2">
-       <widget class="QCheckBox" name="cbVSync">
-        <property name="whatsThis">
-         <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;When using OpenGL, synchronize the video output to your monitor's refresh rate.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
-        </property>
-        <property name="text">
-         <string>VSync</string>
-        </property>
-       </widget>
-      </item>
-      <item row="3" column="0" colspan="2">
+      <item row="4" column="0" colspan="2">
        <spacer name="verticalSpacer">
         <property name="orientation">
          <enum>Qt::Vertical</enum>
@@ -159,13 +140,39 @@
         </property>
        </spacer>
       </item>
+      <item row="6" column="0" colspan="2">
+       <widget class="QCheckBox" name="cbVSync">
+        <property name="whatsThis">
+         <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;When using OpenGL, synchronize the video output to your monitor's refresh rate.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
+        </property>
+        <property name="text">
+         <string>VSync</string>
+        </property>
+       </widget>
+      </item>
+      <item row="7" column="0">
+       <widget class="QLabel" name="label_2">
+        <property name="sizePolicy">
+         <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+          <horstretch>0</horstretch>
+          <verstretch>0</verstretch>
+         </sizepolicy>
+        </property>
+        <property name="whatsThis">
+         <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
+        </property>
+        <property name="text">
+         <string>VSync interval:</string>
+        </property>
+       </widget>
+      </item>
       <item row="2" column="0" colspan="2">
        <widget class="QRadioButton" name="rb3DOpenGL">
         <property name="whatsThis">
          <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;The OpenGL renderer may be faster than software and supports graphical enhancements, but is more prone to glitches.&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
         </property>
         <property name="text">
-         <string>OpenGL</string>
+         <string>OpenGL (Classic)</string>
         </property>
        </widget>
       </item>
@@ -186,6 +193,13 @@
         </property>
        </widget>
       </item>
+      <item row="3" column="0">
+       <widget class="QRadioButton" name="rb3DCompute">
+        <property name="text">
+         <string>OpenGL (Compute shader)</string>
+        </property>
+       </widget>
+      </item>
      </layout>
     </widget>
    </item>
diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp
index f670477..ccd1775 100644
--- a/src/frontend/qt_sdl/main.cpp
+++ b/src/frontend/qt_sdl/main.cpp
@@ -230,19 +230,17 @@ void EmuThread::initOpenGL()
     oglContext = windowctx;
     oglContext->MakeCurrent();
 
-    OpenGL::BuildShaderProgram(kScreenVS, kScreenFS, screenShaderProgram, "ScreenShader");
-    GLuint pid = screenShaderProgram[2];
-    glBindAttribLocation(pid, 0, "vPosition");
-    glBindAttribLocation(pid, 1, "vTexcoord");
-    glBindFragDataLocation(pid, 0, "oColor");
+    OpenGL::CompileVertexFragmentProgram(screenShaderProgram,
+        kScreenVS, kScreenFS,
+        "ScreenShader",
+        {{"vPosition", 0}, {"vTexcoord", 1}},
+        {{"oColor", 0}});
 
-    OpenGL::LinkShaderProgram(screenShaderProgram);
+    glUseProgram(screenShaderProgram);
+    glUniform1i(glGetUniformLocation(screenShaderProgram, "ScreenTex"), 0);
 
-    glUseProgram(pid);
-    glUniform1i(glGetUniformLocation(pid, "ScreenTex"), 0);
-
-    screenShaderScreenSizeULoc = glGetUniformLocation(pid, "uScreenSize");
-    screenShaderTransformULoc = glGetUniformLocation(pid, "uTransform");
+    screenShaderScreenSizeULoc = glGetUniformLocation(screenShaderProgram, "uScreenSize");
+    screenShaderTransformULoc = glGetUniformLocation(screenShaderProgram, "uTransform");
 
     // to prevent bleeding between both parts of the screen
     // with bilinear filtering enabled
@@ -302,7 +300,7 @@ void EmuThread::deinitOpenGL()
     glDeleteVertexArrays(1, &screenVertexArray);
     glDeleteBuffers(1, &screenVertexBuffer);
 
-    OpenGL::DeleteShaderProgram(screenShaderProgram);
+    glDeleteProgram(screenShaderProgram);
 
     OSD::DeInit();
 
@@ -328,6 +326,7 @@ void EmuThread::run()
     videoSettings.Soft_Threaded = Config::Threaded3D != 0;
     videoSettings.GL_ScaleFactor = Config::GL_ScaleFactor;
     videoSettings.GL_BetterPolygons = Config::GL_BetterPolygons;
+    videoSettings.GL_HiresCoordinates = Config::GL_HiresCoordinates;
 
     if (mainWindow->hasOGL)
     {
@@ -458,16 +457,17 @@ void EmuThread::run()
                 else
 #endif
                 {
-                    videoRenderer = 0;
+                    videoRenderer = GPU::renderer3D_Software;
                 }
 
-                videoRenderer = oglContext ? Config::_3DRenderer : 0;
+                videoRenderer = oglContext ? Config::_3DRenderer : GPU::renderer3D_Software;
 
                 videoSettingsDirty = false;
 
                 videoSettings.Soft_Threaded = Config::Threaded3D != 0;
                 videoSettings.GL_ScaleFactor = Config::GL_ScaleFactor;
                 videoSettings.GL_BetterPolygons = Config::GL_BetterPolygons;
+                videoSettings.GL_HiresCoordinates = Config::GL_HiresCoordinates;
 
                 GPU::SetRenderSettings(videoRenderer, videoSettings);
             }
@@ -770,7 +770,7 @@ void EmuThread::drawScreenGL()
 
     glViewport(0, 0, w, h);
 
-    glUseProgram(screenShaderProgram[2]);
+    glUseProgram(screenShaderProgram);
     glUniform2f(screenShaderScreenSizeULoc, w / factor, h / factor);
 
     int frontbuf = FrontBuffer;
@@ -1887,7 +1887,7 @@ void MainWindow::closeEvent(QCloseEvent* event)
 
 void MainWindow::createScreenPanel()
 {
-    hasOGL = (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0);
+    hasOGL = (Config::ScreenUseGL != 0) || (Config::_3DRenderer != GPU::renderer3D_Software);
 
     if (hasOGL)
     {
@@ -3307,13 +3307,7 @@ int main(int argc, char** argv)
 
 #define SANITIZE(var, min, max)  { var = std::clamp(var, min, max); }
     SANITIZE(Config::ConsoleType, 0, 1);
-    SANITIZE(Config::_3DRenderer,
-    0,
-    0 // Minimum, Software renderer
-    #ifdef OGLRENDERER_ENABLED
-    + 1 // OpenGL Renderer
-    #endif
-    );
+    SANITIZE(Config::_3DRenderer, (int)GPU::renderer3D_Software, (int)GPU::renderer3D_Max);
     SANITIZE(Config::ScreenVSyncInterval, 1, 20);
     SANITIZE(Config::GL_ScaleFactor, 1, 16);
     SANITIZE(Config::AudioInterp, 0, 3);
diff --git a/src/frontend/qt_sdl/main.h b/src/frontend/qt_sdl/main.h
index 5832ed3..dd927f1 100644
--- a/src/frontend/qt_sdl/main.h
+++ b/src/frontend/qt_sdl/main.h
@@ -121,7 +121,7 @@ private:
     GL::Context* oglContext = nullptr;
     GLuint screenVertexBuffer, screenVertexArray;
     GLuint screenTexture;
-    GLuint screenShaderProgram[3];
+    GLuint screenShaderProgram;
     GLuint screenShaderTransformULoc, screenShaderScreenSizeULoc;
 
     QMutex screenSettingsLock;
-- 
2.42.0

openSUSE Build Service is sponsored by