File glibc-memset-nontemporal.diff of Package glibc.8004

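Fix for bnc #868622: memset is slow for large block sizes.

When the requested length exceeds __x86_shared_cache_size, fill the
64-byte-aligned middle of the buffer with non-temporal stores (movntdq)
followed by an sfence, instead of the regular movdqa loop.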

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index db4fb84..9c42018 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -84,6 +84,9 @@ L(loop_start):
 	movdqu	%xmm8, -48(%rdi,%rdx)
 	movdqu	%xmm8, 48(%rdi)
 	movdqu	%xmm8, -64(%rdi,%rdx)
+	mov	__x86_shared_cache_size(%rip),%r9d  # The largest cache size
+	cmp	%r9,%rdx
+	ja	L(nt_move)
 	addq	%rdi, %rdx
 	andq	$-64, %rdx
 	cmpq	%rdx, %rcx
@@ -99,6 +102,23 @@ L(loop):
 	jne	L(loop)
 	rep
 	ret
+L(nt_move):	# Large-block path: entered when %rdx (length) is above __x86_shared_cache_size
+	addq	%rdi, %rdx	# rdx = %rdi + %rdx, the end address of the region
+	andq	$-64, %rdx	# round the end address down to a 64-byte boundary
+	cmpq	%rdx, %rcx	# rcx = store cursor set up before this hunk (presumably 64-byte aligned; confirm)
+	je	L(return)	# cursor already at the rounded end: nothing to store (return label is outside this hunk)
+	.p2align 4	# align the loop head to a 16-byte boundary
+L(nt_loop):	# each iteration writes 64 bytes of %xmm8 using non-temporal stores
+	movntdq	%xmm8, (%rcx)	# movntdq requires a 16-byte-aligned destination
+	movntdq	%xmm8, 16(%rcx)
+	movntdq	%xmm8, 32(%rcx)
+	movntdq	%xmm8, 48(%rcx)
+	addq	$64, %rcx	# advance the cursor past the 64 bytes just written
+	cmpq	%rcx, %rdx	# repeat until the cursor reaches the rounded-down end
+	jne	L(nt_loop)
+	sfence	# order the weakly-ordered non-temporal stores before returning to the caller
+	rep	# rep + ret: same two-instruction return form used after the regular loop
+	ret
 L(less_16_bytes):
 	movq %xmm8, %rcx
 	testb	$24, %dl
openSUSE Build Service is sponsored by