File glibc-fix-avx512-mempcpy.patch of Package glibc.8228
References: bnc#1092877 and possibly bnc#1093291
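mempcpy writes 128 bytes past the end of the destination buffer when the
copy size is large enough to enter the non-temporal loop, i.e. when it is
larger than half the shared cache size per core.
After that loop, two 64-byte stores are issued through %rax.  For mempcpy,
%rax already holds the return value (destination + length) rather than the
start of the destination, so the stores land just past the end of the
buffer.  Save the original destination pointer in %r11 at entry and use it
for those two stores instead.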
Index: glibc-2.26/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
===================================================================
--- glibc-2.26.orig/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S 2017-08-02 14:57:16.000000000 +0200
+++ glibc-2.26/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S 2018-05-18 14:40:32.000000000 +0200
@@ -31,6 +31,7 @@ END (__mempcpy_chk_avx512_no_vzeroupper)
ENTRY (__mempcpy_avx512_no_vzeroupper)
movq %rdi, %rax
+ movq %rdi, %r11
addq %rdx, %rax
jmp L(start)
END (__mempcpy_avx512_no_vzeroupper)
@@ -45,6 +46,7 @@ END (__memmove_chk_avx512_no_vzeroupper)
ENTRY (__memmove_avx512_no_vzeroupper)
mov %rdi, %rax
+ mov %rdi, %r11
# ifdef USE_AS_MEMPCPY
add %rdx, %rax
# endif
@@ -370,8 +372,8 @@ L(gobble_256bytes_nt_loop):
cmp $256, %rdx
ja L(gobble_256bytes_nt_loop)
sfence
- vmovups %zmm4, (%rax)
- vmovups %zmm5, 0x40(%rax)
+ vmovups %zmm4, (%r11)
+ vmovups %zmm5, 0x40(%r11)
jmp L(check)
L(preloop_large_bkw):