File melonds_PR2127.patch of Package melonds

diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp
index 580c66fc..a0078041 100644
--- a/src/ARMInterpreter_LoadStore.cpp
+++ b/src/ARMInterpreter_LoadStore.cpp
@@ -454,11 +454,7 @@ void A_LDM(ARM* cpu)
     if (!(cpu->CurInstr & (1<<23))) // decrement
     {
         // decrement is actually an increment starting from the end address
-        for (int i = 0; i < 16; i++)
-        {
-            if (cpu->CurInstr & (1<<i))
-                base -= 4;
-        }
+        base -= 4 * __builtin_popcount(cpu->CurInstr & 0xFFFF);
 
         if (cpu->CurInstr & (1<<21))
         {
@@ -473,20 +469,17 @@ void A_LDM(ARM* cpu)
     if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15)))
         cpu->UpdateMode(cpu->CPSR, (cpu->CPSR&~0x1F)|0x10, true);
 
-    for (int i = 0; i < 15; i++)
+    u16 reglist = cpu->CurInstr & 0x7FFF;
+    while (reglist)
     {
-        if (cpu->CurInstr & (1<<i))
-        {
-            if (preinc) base += 4;
-            if (!(first ? cpu->DataRead32 (base, &cpu->R[i])
-                        : cpu->DataRead32S(base, &cpu->R[i])))
-            {
-                goto dataabort;
-            }
+        int i = __builtin_ctz(reglist);
+        reglist ^= 1<<i;
 
-            first = false;
-            if (!preinc) base += 4;
-        }
+        if (preinc) base += 4;
+        if (first) cpu->DataRead32 (base, &cpu->R[i]);
+        else       cpu->DataRead32S(base, &cpu->R[i]);
+        first = false;
+        if (!preinc) base += 4;
     }
 
     u32 pc;
@@ -559,11 +552,7 @@ void A_STM(ARM* cpu)
 
     if (!(cpu->CurInstr & (1<<23)))
     {
-        for (u32 i = 0; i < 16; i++)
-        {
-            if (cpu->CurInstr & (1<<i))
-                base -= 4;
-        }
+        base -= 4 * __builtin_popcount(cpu->CurInstr & 0xFFFF);
 
         if (cpu->CurInstr & (1<<21))
             cpu->R[baseid] = base;
@@ -583,32 +572,27 @@ void A_STM(ARM* cpu)
         cpu->UpdateMode(cpu->CPSR, (cpu->CPSR&~0x1F)|0x10, true);
     }
 
-    for (u32 i = 0; i < 16; i++)
+    u16 reglist = cpu->CurInstr & 0xFFFF;
+    while (reglist)
     {
-        if (cpu->CurInstr & (1<<i))
-        {
-            if (preinc) base += 4;
-
-            u32 val;
-            if (i == baseid && !isbanked)
-            {
-                if ((cpu->Num == 0) || (!(cpu->CurInstr & ((1<<i)-1))))
-                    val = oldbase;
-                else val = base;
-            }
-            else val = cpu->R[i];
+        int i = __builtin_ctz(reglist);
+        reglist ^= 1<<i;
 
-            if (i == 15) val+=4;
+        if (preinc) base += 4;
 
-            if (!(first ? cpu->DataWrite32 (base, val)
-                        : cpu->DataWrite32S(base, val)))
-            {
-                goto dataabort;
-            }
+        if (i == baseid && !isbanked)
+        {
+            if ((cpu->Num == 0) || (!(cpu->CurInstr & ((1<<i)-1))))
+                first ? cpu->DataWrite32(base, oldbase) : cpu->DataWrite32S(base, oldbase);
+            else
+                first ? cpu->DataWrite32(base, base) : cpu->DataWrite32S(base, base); // checkme
+        }
+        else
+            first ? cpu->DataWrite32(base, cpu->R[i]) : cpu->DataWrite32S(base, cpu->R[i]);
 
-            first = false;
+        first = false;
 
-            if (!preinc) base += 4;
+        if (!preinc) base += 4;
         }
     }
 
@@ -799,34 +783,23 @@ void T_LDR_SPREL(ARM* cpu)
 
 void T_PUSH(ARM* cpu)
 {
-    int nregs = 0;
     bool first = true;
 
-    for (int i = 0; i < 8; i++)
-    {
-        if (cpu->CurInstr & (1<<i))
-            nregs++;
-    }
-
-    if (cpu->CurInstr & (1<<8))
-        nregs++;
-
     u32 base = cpu->R[13];
-    base -= (nregs<<2);
+    base -= 4 * __builtin_popcount(cpu->CurInstr & 0x1FF);
+
     u32 wbbase = base;
 
-    for (int i = 0; i < 8; i++)
+    u8 reglist = cpu->CurInstr & 0xFF;
+    while (reglist)
     {
-        if (cpu->CurInstr & (1<<i))
-        {
-            if (!(first ? cpu->DataWrite32 (base, cpu->R[i])
-                        : cpu->DataWrite32S(base, cpu->R[i])))
-            {
-                goto dataabort;
-            }
-            first = false;
-            base += 4;
-        }
+        int i = __builtin_ctz(reglist);
+        reglist ^= 1<<i;
+
+        if (first) cpu->DataWrite32 (base, cpu->R[i]);
+        else       cpu->DataWrite32S(base, cpu->R[i]);
+        first = false;
+        base += 4;
     }
 
     if (cpu->CurInstr & (1<<8))
@@ -849,18 +822,16 @@ void T_POP(ARM* cpu)
     u32 base = cpu->R[13];
     bool first = true;
 
-    for (int i = 0; i < 8; i++)
+    u8 reglist = cpu->CurInstr & 0xFF;
+    while (reglist)
     {
-        if (cpu->CurInstr & (1<<i))
-        {
-            if (!(first ? cpu->DataRead32 (base, &cpu->R[i])
-                        : cpu->DataRead32S(base, &cpu->R[i])))
-            {
-                goto dataabort;
-            }
-            first = false;
-            base += 4;
-        }
+        int i = __builtin_ctz(reglist);
+        reglist ^= 1<<i;
+
+        if (first) cpu->DataRead32 (base, &cpu->R[i]);
+        else       cpu->DataRead32S(base, &cpu->R[i]);
+        first = false;
+        base += 4;
     }
 
     if (cpu->CurInstr & (1<<8))
@@ -887,18 +858,16 @@ void T_STMIA(ARM* cpu)
     u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7];
     bool first = true;
 
-    for (int i = 0; i < 8; i++)
+    u8 reglist = cpu->CurInstr & 0xFF;
+    while (reglist)
     {
-        if (cpu->CurInstr & (1<<i))
-        {
-            if (!(first ? cpu->DataWrite32 (base, cpu->R[i])
-                        : cpu->DataWrite32S(base, cpu->R[i])))
-            {
-                goto dataabort;
-            }
-            first = false;
-            base += 4;
-        }
+        int i = __builtin_ctz(reglist);
+        reglist ^= 1<<i;
+
+        if (first) cpu->DataWrite32 (base, cpu->R[i]);
+        else       cpu->DataWrite32S(base, cpu->R[i]);
+        first = false;
+        base += 4;
     }
 
     // TODO: check "Rb included in Rlist" case
@@ -912,18 +881,16 @@ void T_LDMIA(ARM* cpu)
     u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7];
     bool first = true;
 
-    for (int i = 0; i < 8; i++)
+    u8 reglist = cpu->CurInstr & 0xFF;
+    while (reglist)
     {
-        if (cpu->CurInstr & (1<<i))
-        {
-            if (!(first ? cpu->DataRead32 (base, &cpu->R[i])
-                        : cpu->DataRead32S(base, &cpu->R[i])))
-            {
-                goto dataabort;
-            }
-            first = false;
-            base += 4;
-        }
+        int i = __builtin_ctz(reglist);
+        reglist ^= 1<<i;
+
+        if (first) cpu->DataRead32 (base, &cpu->R[i]);
+        else       cpu->DataRead32S(base, &cpu->R[i]);
+        first = false;
+        base += 4;
     }
 
     if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7))))
diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp
index 1221ed59..03c48f99 100644
--- a/src/GPU3D_Soft.cpp
+++ b/src/GPU3D_Soft.cpp
@@ -143,6 +143,8 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
     u32 vramaddr = (texparam & 0xFFFF) << 3;
 
     s32 width = 8 << ((texparam >> 20) & 0x7);
+    // since width is always a power of two (8 << n), every multiplication by width can be replaced with a left shift by widthshift
+    s32 widthshift = 3 + ((texparam >> 20) & 0x7);
     s32 height = 8 << ((texparam >> 23) & 0x7);
 
     s >>= 4;
@@ -192,7 +194,7 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
     {
     case 1: // A3I5
         {
-            vramaddr += ((t * width) + s);
+            vramaddr += ((t << widthshift) + s);
             u8 pixel = ReadVRAM_Texture<u8>(vramaddr, gpu);
 
             texpal <<= 4;
@@ -203,7 +205,7 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
 
     case 2: // 4-color
         {
-            vramaddr += (((t * width) + s) >> 2);
+            vramaddr += (((t << widthshift) + s) >> 2);
             u8 pixel = ReadVRAM_Texture<u8>(vramaddr, gpu);
             pixel >>= ((s & 0x3) << 1);
             pixel &= 0x3;
@@ -216,7 +218,7 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
 
     case 3: // 16-color
         {
-            vramaddr += (((t * width) + s) >> 1);
+            vramaddr += (((t << widthshift) + s) >> 1);
             u8 pixel = ReadVRAM_Texture<u8>(vramaddr, gpu);
             if (s & 0x1) pixel >>= 4;
             else         pixel &= 0xF;
@@ -229,7 +231,7 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
 
     case 4: // 256-color
         {
-            vramaddr += ((t * width) + s);
+            vramaddr += ((t << widthshift) + s);
             u8 pixel = ReadVRAM_Texture<u8>(vramaddr, gpu);
 
             texpal <<= 4;
@@ -240,7 +242,7 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
 
     case 5: // compressed
         {
-            vramaddr += ((t & 0x3FC) * (width>>2)) + (s & 0x3FC);
+            vramaddr += ((t & 0x3FC) << (widthshift-2)) + (s & 0x3FC);
             vramaddr += (t & 0x3);
             vramaddr &= 0x7FFFF; // address used for all calcs wraps around after slot 3
 
@@ -352,7 +354,7 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
 
     case 6: // A5I3
         {
-            vramaddr += ((t * width) + s);
+            vramaddr += ((t << widthshift) + s);
             u8 pixel = ReadVRAM_Texture<u8>(vramaddr, gpu);
 
             texpal <<= 4;
@@ -363,7 +365,7 @@ void SoftRenderer::TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s
 
     case 7: // direct color
         {
-            vramaddr += (((t * width) + s) << 1);
+            vramaddr += (((t << widthshift) + s) << 1);
             *color = ReadVRAM_Texture<u16>(vramaddr, gpu);
             *alpha = (*color & 0x8000) ? 31 : 0;
         }
diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h
index 55a698b0..e2c2fa1b 100644
--- a/src/GPU3D_Soft.h
+++ b/src/GPU3D_Soft.h
@@ -101,15 +101,15 @@ private:
 
                 if ((w0 & 0x1) && !(w1 & 0x1))
                 {
-                    this->w0n = w0 - 1;
-                    this->w0d = w0 + 1;
-                    this->w1d = w1;
+                    this->w0n = w0 - 1 >> 1;
+                    this->w0d = w0 + 1 >> 1;
+                    this->w1d = w1 >> 1;
                 }
                 else
                 {
-                    this->w0n = w0 & 0xFFFE;
-                    this->w0d = w0 & 0xFFFE;
-                    this->w1d = w1 & 0xFFFE;
+                    this->w0n = w0 >> 1;
+                    this->w0d = w0 >> 1;
+                    this->w1d = w1 >> 1;
                 }
 
                 this->shift = 9;
@@ -138,7 +138,7 @@ private:
                 // this seems to be a proper division on hardware :/
                 // I haven't been able to find cases that produce imperfect output
                 if (den == 0) yfactor = 0;
-                else          yfactor = (s32)(num / den);
+                else          yfactor = ((u32)num / den);
             }
         }
 
openSUSE Build Service is sponsored by