File xscreensaver-speedup-conv.diff of Package xscreensaver
# From: garloff@suse.de
# Subject: [2/4] Speed up BGR(A) to RGBA conversion, using SSE2 insns on x86/-64
# We get twice the speed for the conversion into the preferred texture format
# using SSE2 instructions working on 4 pixels in parallel.
# As this step is still only 1/3 of the overall time to create the texture
# (the other 2/3 is sending it to the server), we reduce the flicker time
# by merely 17%. It's a start ...
# We're using intrinsics to access the SSE2 functions; they should work on
# both x86 and x86-64. (x86 needs -msse2 or -march=some modern processor
# to support SSE2.)
Index: xscreensaver-5.22/hacks/glx/grab-ximage.c
===================================================================
--- xscreensaver-5.22.orig/hacks/glx/grab-ximage.c
+++ xscreensaver-5.22/hacks/glx/grab-ximage.c
@@ -38,8 +38,13 @@
#include "grab-ximage.h"
#include "grabscreen.h"
#include "visual.h"
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+
/* If REFORMAT_IMAGE_DATA is defined, then we convert Pixmaps to textures
like this:
- get Pixmap as an XImage in whatever form the server hands us;
@@ -117,9 +122,9 @@ decode_mask (unsigned int mask, unsigned
/* Given a value and a field-width, expands the field to fill out 8 bits.
*/
-static unsigned char
+static inline unsigned char
spread_bits (unsigned char value, unsigned char width)
{
switch (width)
{
@@ -133,8 +138,62 @@ spread_bits (unsigned char value, unsign
default: abort(); break;
}
}
+static inline void swap_bits_rgba32(unsigned int* val); /* In place: exchange bits 0-7 with bits 16-23 of *val and set bits 24-31 to 0xff. */
+static inline void swap_bits_rgba32(unsigned int* val)
+{
+ const register unsigned int v = *val & 0x00ffffff; /* discard the incoming top byte */
+ *val = (v >> 16) | (v << 16) | (v & 0x0000ff00) | 0xff000000; /* (v << 16) also lifts bits 8-15 into 24-31; harmless, the final OR sets those bits anyway */
+}
+
+void fast_xpix_to_rgba32(XImage *from, XImage *to);  /* 32-bit copy from -> to: swap bits 0-7 with bits 16-23 of each pixel, force bits 24-31 to 0xff */
+void fast_xpix_to_rgba32(XImage *from, XImage *to)
+{
+  unsigned int y, x;
+  for (y = 0; y < (unsigned int) from->height; ++y) {
+#ifdef __SSE2__
+    const __m128i m1 = _mm_set1_epi32(0x00ff0000);        /* bits 16-23 of each lane */
+    const __m128i m2 = _mm_set1_epi32(0x0000ff00);        /* bits 8-15, which stay in place */
+    const __m128i m3 = _mm_set1_epi32(0x000000ff);        /* bits 0-7 of each lane */
+    const __m128i m4 = _mm_set1_epi32((int) 0xff000000u); /* bits 24-31, forced on */
+    char *fptr = from->data + y*from->bytes_per_line;  /* NOTE(review): raw row access, no xoffset/byte_order handling as XGetPixel would do -- confirm callers only pass plain 32bpp images */
+    char *tptr = to->data + y*to->bytes_per_line;
+    for (x = 0; x + 3 < (unsigned int) from->width; x += 4) {  /* not "x < width-3": that wraps to a huge unsigned bound when width < 3 */
+      __m128i MM1, MM2, MM3;
+      MM1 = _mm_loadu_si128((__m128i*)fptr);  /* 4 pixels, unaligned load */
+      fptr += 16;
+      MM2 = _mm_srli_epi32(MM1, 16);
+      MM3 = _mm_slli_epi32(MM1, 16);
+      MM1 = _mm_and_si128(MM1, m2);
+      MM2 = _mm_and_si128(MM2, m3);
+      MM3 = _mm_and_si128(MM3, m1);
+      MM1 = _mm_or_si128(MM1, MM2);
+      MM1 = _mm_or_si128(MM1, MM3);
+      MM1 = _mm_or_si128(MM1, m4);
+      _mm_storeu_si128((__m128i*)tptr, MM1);
+      tptr += 16;
+    }
+#else
+    for (x = 0; x + 3 < (unsigned int) from->width; x += 4) {  /* same wrap-safe bound as the SSE2 path */
+      unsigned int p[4];
+      p[0] = XGetPixel(from, x  , y); p[1] = XGetPixel(from, x+1, y);
+      p[2] = XGetPixel(from, x+2, y); p[3] = XGetPixel(from, x+3, y);
+      swap_bits_rgba32(p  ); swap_bits_rgba32(p+1);
+      swap_bits_rgba32(p+2); swap_bits_rgba32(p+3);
+      XPutPixel(to, x  , y, p[0]); XPutPixel(to, x+1, y, p[1]);
+      XPutPixel(to, x+2, y, p[2]); XPutPixel(to, x+3, y, p[3]);
+    }
+#endif
+    /* Remaining pixels: the 0-3 left over after the 4-wide loop (all of them when width < 4) */
+    for (; x < (unsigned int) from->width; ++x) {
+      unsigned int p = XGetPixel(from, x, y);
+      swap_bits_rgba32(&p);
+      XPutPixel(to, x, y, p);
+    }
+  }
+}
+
static XImage *
convert_ximage_to_rgba32 (Screen *screen, XImage *image)
{
@@ -194,8 +253,20 @@ convert_ximage_to_rgba32 (Screen *screen
crpos = 24, cgpos = 16, cbpos = 8, capos = 0;
else
crpos = 0, cgpos = 8, cbpos = 16, capos = 24;
+ /* trying to track down an intermittent crash in ximage_putpixel_32 */
+ if (to->width < from->width) abort();
+ if (to->height < from->height) abort();
+
+ /* Fast routine */
+ if (!colors && srpos == 16 && sgpos == 8 && sbpos == 0 && crpos == 0 && cgpos == 8 && cbpos == 16 && capos == 24
+ && from->bits_per_pixel == 32 && to->bits_per_pixel == 32)
+ {
+ fast_xpix_to_rgba32(from, to);
+ return to;
+ }
+
if (colors == 0) /* truecolor */
{
int i;
for (i = 0; i < 256; i++)
@@ -205,12 +276,8 @@ convert_ximage_to_rgba32 (Screen *screen
spread_map[2][i] = spread_bits (i, sbsiz);
}
}
- /* trying to track down an intermittent crash in ximage_putpixel_32 */
- if (to->width < from->width) abort();
- if (to->height < from->height) abort();
-
for (y = 0; y < from->height; y++)
for (x = 0; x < from->width; x++)
{
unsigned long sp = XGetPixel (from, x, y);
@@ -815,9 +882,9 @@ load_texture_async_cb (Screen *screen, W
D = total elapsed time from "want image" to "see image"
B+C is responsible for any frame-rate glitches.
*/
- "%s: loading elapsed: %.2f + %.2f + %.2f = %.2f sec\n",
+ "%s: loading elapsed: %.3f + %.3f + %.3f = %.3f sec\n",
progname,
cvt_time - dd.load_time,
tex_time - cvt_time,
done_time - tex_time,