File xscreensaver-speedup-conv.diff of Package xscreensaver

# From: garloff@suse.de
# Subject: [2/4] Speed up BGR(A) to RGBA conversion, using SSE2 insns on x86/-64
# We get twice the speed for the conversion into the preferred texture format
# using SSE2 instructions working on 4 pixels in parallel.
# As this step is still only 1/3 of the overall time to create the texture
# (the other 2/3 is sending it to the server), we reduce the flicker time
# by merely 17%. It's a start ...
# We're using intrinsics to access the SSE2 functions; they should work on
# both x86 and x86-64. (x86 needs -msse2 or -march=some modern processor
# to support SSE2.)
Index: xscreensaver-5.22/hacks/glx/grab-ximage.c
===================================================================
--- xscreensaver-5.22.orig/hacks/glx/grab-ximage.c
+++ xscreensaver-5.22/hacks/glx/grab-ximage.c
@@ -38,8 +38,13 @@
 #include "grab-ximage.h"
 #include "grabscreen.h"
 #include "visual.h"
 
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+
 /* If REFORMAT_IMAGE_DATA is defined, then we convert Pixmaps to textures
    like this:
 
      - get Pixmap as an XImage in whatever form the server hands us;
@@ -117,9 +122,9 @@ decode_mask (unsigned int mask, unsigned
 
 
 /* Given a value and a field-width, expands the field to fill out 8 bits.
  */
-static unsigned char
+static inline unsigned char
 spread_bits (unsigned char value, unsigned char width)
 {
   switch (width)
     {
@@ -133,8 +138,62 @@ spread_bits (unsigned char value, unsign
     default: abort(); break;
     }
 }
 
+/* Exchange the bytes at bits 0-7 and 16-23 of *val; set bits 24-31 to 0xff. */
+static inline void swap_bits_rgba32(unsigned int* val)
+{
+  const unsigned int px = *val & 0x00ffffff;  /* discard the old top byte */
+  *val = 0xff000000 | (px << 16) | (px & 0x0000ff00) | (px >> 16);
+}
+
+/* Converts every pixel of `from` into `to`: the bytes at bits 0-7 and 16-23
+   trade places, bits 8-15 are kept and bits 24-31 become 0xff.  The SSE2
+   path touches the pixel data directly: 4 bytes/pixel, 4 pixels per step. */
+void fast_xpix_to_rgba32(XImage *from, XImage *to);
+void fast_xpix_to_rgba32(XImage *from, XImage *to)
+{
+  /* Signed like XImage.width: with unsigned x, "width-3" wrapped around for
+     width < 3 and the 4-pixel loops ran far past the end of the row. */
+  int y, x;
+  for (y = 0; y < from->height; ++y) {
+#ifdef __SSE2__
+    /* NOTE(review): raw access ignores XImage.byte_order -- confirm OK. */
+    const char* fptr = from->data + y*from->bytes_per_line;
+    char* tptr = to->data + y*to->bytes_per_line;
+    const __m128i m1 = _mm_set1_epi32(0x00ff0000);
+    const __m128i m2 = _mm_set1_epi32(0x0000ff00);
+    const __m128i m3 = _mm_set1_epi32(0x000000ff);
+    const __m128i m4 = _mm_set1_epi32((int)0xff000000);
+    for (x = 0; x < from->width-3; x+=4) {
+      /* loadu/storeu: no 16-byte alignment is assumed for the row data. */
+      __m128i MM1 = _mm_loadu_si128((const __m128i*)(fptr + 4*x));
+      __m128i MM2 = _mm_and_si128(_mm_srli_epi32(MM1, 16), m3);
+      __m128i MM3 = _mm_and_si128(_mm_slli_epi32(MM1, 16), m1);
+      MM1 = _mm_and_si128(MM1, m2);
+      MM1 = _mm_or_si128(_mm_or_si128(MM1, MM2), _mm_or_si128(MM3, m4));
+      _mm_storeu_si128((__m128i*)(tptr + 4*x), MM1);
+    }
+#else
+    for (x = 0; x < from->width-3; x+=4) {
+      unsigned int p[4];
+      p[0] = XGetPixel(from, x  , y); p[1] = XGetPixel(from, x+1, y);
+      p[2] = XGetPixel(from, x+2, y); p[3] = XGetPixel(from, x+3, y);
+      swap_bits_rgba32(p  ); swap_bits_rgba32(p+1);
+      swap_bits_rgba32(p+2); swap_bits_rgba32(p+3);
+      XPutPixel(to, x  , y, p[0]); XPutPixel(to, x+1, y, p[1]);
+      XPutPixel(to, x+2, y, p[2]); XPutPixel(to, x+3, y, p[3]);
+    }
+#endif
+    /* Remaining pixels */
+    for (; x < from->width; ++x) {
+      unsigned int p = XGetPixel(from, x, y);
+      swap_bits_rgba32(&p);
+      XPutPixel(to, x, y, p);
+    }
+  }
+}
+
 
 static XImage *
 convert_ximage_to_rgba32 (Screen *screen, XImage *image)
 {
@@ -194,8 +253,20 @@ convert_ximage_to_rgba32 (Screen *screen
     crpos = 24, cgpos = 16, cbpos =  8, capos =  0;
   else
     crpos =  0, cgpos =  8, cbpos = 16, capos = 24;
 
+  /* trying to track down an intermittent crash in ximage_putpixel_32 */
+  if (to->width  < from->width)  abort();
+  if (to->height < from->height) abort();
+
+  /* Fast routine */
+  if (!colors && srpos == 16 && sgpos == 8 && sbpos == 0 && crpos == 0 && cgpos == 8 && cbpos == 16 && capos == 24
+	&& from->bits_per_pixel == 32 && to->bits_per_pixel == 32)
+    {
+	fast_xpix_to_rgba32(from, to);
+	return to;
+    }
+
   if (colors == 0)  /* truecolor */
     {
       int i;
       for (i = 0; i < 256; i++)
@@ -205,12 +276,8 @@ convert_ximage_to_rgba32 (Screen *screen
           spread_map[2][i] = spread_bits (i, sbsiz);
         }
     }
 
-  /* trying to track down an intermittent crash in ximage_putpixel_32 */
-  if (to->width  < from->width)  abort();
-  if (to->height < from->height) abort();
-
   for (y = 0; y < from->height; y++)
     for (x = 0; x < from->width; x++)
       {
         unsigned long sp = XGetPixel (from, x, y);
@@ -815,9 +882,9 @@ load_texture_async_cb (Screen *screen, W
                 D = total elapsed time from "want image" to "see image"
 
                 B+C is responsible for any frame-rate glitches.
               */
-             "%s: loading elapsed: %.2f + %.2f + %.2f = %.2f sec\n",
+             "%s: loading elapsed: %.3f + %.3f + %.3f = %.3f sec\n",
              progname,
              cvt_time  - dd.load_time,
              tex_time  - cvt_time,
              done_time - tex_time,
openSUSE Build Service is sponsored by