File openssl-Improve-performance-for-6x-unrolling-with-vpermxor-i.patch of Package openssl-1_1

From 3d3a7ecd1ae5ab08d22041f7b3b035c34f12fa02 Mon Sep 17 00:00:00 2001
From: Danny Tsen <dtsen@linux.ibm.com>
Date: Tue, 22 Aug 2023 15:58:53 -0400
Subject: [PATCH] Improve performance for 6x unrolling with vpermxor
 instruction

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/21812)
---
 crypto/aes/asm/aesp8-ppc.pl | 145 +++++++++++++++++++++++-------------
 1 file changed, 95 insertions(+), 50 deletions(-)
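
The hunks below change how the XTS tweak is advanced in the 6x-unrolled encrypt and
decrypt paths: the vsldoi + vxor pair that rotated the per-byte carries and folded
them into the doubled tweak is replaced by a single vpermxor, driven by a new 16-byte
permute constant (0x0f102132...) appended to the constant pool, which is also why the
addi offset at Lconsts moves from -0x48 to -0x58. For orientation only, a scalar C
sketch of the value each tweak update computes (multiplication by x in GF(2^128) with
the 0x87 reduction polynomial, byte order as in IEEE P1619) might look like the
following; it is an illustration for this note, not code from the patch, and the
helper name is made up here:

    /* One XTS tweak update: tweak <- tweak * x in GF(2^128). */
    void xts_double(unsigned char t[16])
    {
        unsigned int carry = 0;

        for (int i = 0; i < 16; i++) {
            unsigned int b = t[i];

            t[i] = (unsigned char)((b << 1) | carry); /* double each byte, pull in the carry from below */
            carry = b >> 7;                           /* the top bit carries into the next byte */
        }
        if (carry)
            t[0] ^= 0x87;                             /* carry out of the top byte folds back as 0x87 */
    }

The vector code computes the same result 16 bytes at a time: vaddubm doubles every
byte, vsrab plus vand turn each byte's top bit into the carry (or the 0x87 reduction)
that has to be xored into the neighbouring byte, and vpermxor now rotates those
carries into place and xors them in with one instruction, where the old sequence
needed vsldoi followed by vxor.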

diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
index 60cf86f52aed2..38b9405a283b7 100755
--- a/crypto/aes/asm/aesp8-ppc.pl
+++ b/crypto/aes/asm/aesp8-ppc.pl
@@ -99,11 +99,12 @@
 .long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
 .long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
 .long	0,0,0,0						?asis
+.long	0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
 Lconsts:
 	mflr	r0
 	bcl	20,31,\$+4
 	mflr	$ptr	 #vvvvv "distance between . and rcon
-	addi	$ptr,$ptr,-0x48
+	addi	$ptr,$ptr,-0x58
 	mtlr	r0
 	blr
 	.long	0
@@ -2405,7 +2406,7 @@ ()
 my $key_=$key2;
 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
     $x00=0 if ($flavour =~ /osx/);
-my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
+my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5)=map("v$_",(0..5));
 my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
 my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
 my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
@@ -2460,6 +2461,18 @@ ()
 	li		$x70,0x70
 	mtspr		256,r0
 
+	# Reverse eighty7 to 0x010101..87
+	xxlor		2, 32+$eighty7, 32+$eighty7
+	vsldoi		$eighty7,$tmp,$eighty7,1	# 0x010101..87
+	xxlor		1, 32+$eighty7, 32+$eighty7
+
+	# Load XOR contents. 0x0f102132435465768798a9bacbdcedfe
+	mr		$x70, r6
+	bl		Lconsts
+	lxvw4x		0, $x40, r6		# load XOR contents
+	mr		r6, $x70
+	li		$x70,0x70
+
 	subi		$rounds,$rounds,3	# -4 in total
 
 	lvx		$rndkey0,$x00,$key1	# load key schedule
@@ -2502,69 +2515,77 @@ ()
 	?vperm		v31,v31,$twk5,$keyperm
 	lvx		v25,$x10,$key_		# pre-load round[2]
 
+	# Switch to use the following codes with 0x010101..87 to generate tweak.
+	#     eighty7 = 0x010101..87
+	# vsrab		tmp, tweak, seven	# next tweak value, right shift 7 bits
+	# vand		tmp, tmp, eighty7	# last byte with carry
+	# vaddubm	tweak, tweak, tweak	# left shift 1 bit (x2)
+	# xxlor		vsx, 0, 0
+	# vpermxor	tweak, tweak, tmp, vsx
+
 	 vperm		$in0,$inout,$inptail,$inpperm
 	 subi		$inp,$inp,31		# undo "caller"
 	vxor		$twk0,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out0,$in0,$twk0
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in1, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in1
 
 	 lvx_u		$in1,$x10,$inp
 	vxor		$twk1,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in1,$in1,$in1,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out1,$in1,$twk1
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in2, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in2
 
 	 lvx_u		$in2,$x20,$inp
 	 andi.		$taillen,$len,15
 	vxor		$twk2,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in2,$in2,$in2,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out2,$in2,$twk2
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in3, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in3
 
 	 lvx_u		$in3,$x30,$inp
 	 sub		$len,$len,$taillen
 	vxor		$twk3,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in3,$in3,$in3,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out3,$in3,$twk3
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in4, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in4
 
 	 lvx_u		$in4,$x40,$inp
 	 subi		$len,$len,0x60
 	vxor		$twk4,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in4,$in4,$in4,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out4,$in4,$twk4
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in5, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in5
 
 	 lvx_u		$in5,$x50,$inp
 	 addi		$inp,$inp,0x60
 	vxor		$twk5,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in5,$in5,$in5,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out5,$in5,$twk5
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in0, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in0
 
 	vxor		v31,v31,$rndkey0
 	mtctr		$rounds
@@ -2590,6 +2611,8 @@ ()
 	lvx		v25,$x10,$key_		# round[4]
 	bdnz		Loop_xts_enc6x
 
+	xxlor		32+$eighty7, 1, 1		# 0x010101..87
+
 	subic		$len,$len,96		# $len-=96
 	 vxor		$in0,$twk0,v31		# xor with last round key
 	vcipher		$out0,$out0,v24
@@ -2599,7 +2622,6 @@ ()
 	 vaddubm	$tweak,$tweak,$tweak
 	vcipher		$out2,$out2,v24
 	vcipher		$out3,$out3,v24
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vcipher		$out4,$out4,v24
 	vcipher		$out5,$out5,v24
 
@@ -2607,7 +2629,8 @@ ()
 	 vand		$tmp,$tmp,$eighty7
 	vcipher		$out0,$out0,v25
 	vcipher		$out1,$out1,v25
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in1, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in1
 	vcipher		$out2,$out2,v25
 	vcipher		$out3,$out3,v25
 	 vxor		$in1,$twk1,v31
@@ -2618,13 +2641,13 @@ ()
 
 	and		r0,r0,$len
 	 vaddubm	$tweak,$tweak,$tweak
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vcipher		$out0,$out0,v26
 	vcipher		$out1,$out1,v26
 	 vand		$tmp,$tmp,$eighty7
 	vcipher		$out2,$out2,v26
 	vcipher		$out3,$out3,v26
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in2, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in2
 	vcipher		$out4,$out4,v26
 	vcipher		$out5,$out5,v26
 
@@ -2638,7 +2661,6 @@ ()
 	 vaddubm	$tweak,$tweak,$tweak
 	vcipher		$out0,$out0,v27
 	vcipher		$out1,$out1,v27
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vcipher		$out2,$out2,v27
 	vcipher		$out3,$out3,v27
 	 vand		$tmp,$tmp,$eighty7
@@ -2646,7 +2668,8 @@ ()
 	vcipher		$out5,$out5,v27
 
 	addi		$key_,$sp,$FRAME+15	# rewind $key_
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in3, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in3
 	vcipher		$out0,$out0,v28
 	vcipher		$out1,$out1,v28
 	 vxor		$in3,$twk3,v31
@@ -2655,7 +2678,6 @@ ()
 	vcipher		$out2,$out2,v28
 	vcipher		$out3,$out3,v28
 	 vaddubm	$tweak,$tweak,$tweak
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vcipher		$out4,$out4,v28
 	vcipher		$out5,$out5,v28
 	lvx		v24,$x00,$key_		# re-pre-load round[1]
@@ -2663,7 +2685,8 @@ ()
 
 	vcipher		$out0,$out0,v29
 	vcipher		$out1,$out1,v29
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in4, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in4
 	vcipher		$out2,$out2,v29
 	vcipher		$out3,$out3,v29
 	 vxor		$in4,$twk4,v31
@@ -2673,14 +2696,14 @@ ()
 	vcipher		$out5,$out5,v29
 	lvx		v25,$x10,$key_		# re-pre-load round[2]
 	 vaddubm	$tweak,$tweak,$tweak
-	 vsldoi		$tmp,$tmp,$tmp,15
 
 	vcipher		$out0,$out0,v30
 	vcipher		$out1,$out1,v30
 	 vand		$tmp,$tmp,$eighty7
 	vcipher		$out2,$out2,v30
 	vcipher		$out3,$out3,v30
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in5, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in5
 	vcipher		$out4,$out4,v30
 	vcipher		$out5,$out5,v30
 	 vxor		$in5,$twk5,v31
@@ -2690,7 +2713,6 @@ ()
 	vcipherlast	$out0,$out0,$in0
 	 lvx_u		$in0,$x00,$inp		# load next input block
 	 vaddubm	$tweak,$tweak,$tweak
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vcipherlast	$out1,$out1,$in1
 	 lvx_u		$in1,$x10,$inp
 	vcipherlast	$out2,$out2,$in2
@@ -2703,7 +2725,10 @@ ()
 	vcipherlast	$out4,$out4,$in4
 	 le?vperm	$in2,$in2,$in2,$leperm
 	 lvx_u		$in4,$x40,$inp
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		10, 32+$in0, 32+$in0
+	 xxlor		32+$in0, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in0
+	 xxlor		32+$in0, 10, 10
 	vcipherlast	$tmp,$out5,$in5		# last block might be needed
 						# in stealing mode
 	 le?vperm	$in3,$in3,$in3,$leperm
@@ -2736,6 +2761,8 @@ ()
 	mtctr		$rounds
 	beq		Loop_xts_enc6x		# did $len-=96 borrow?
 
+	xxlor		32+$eighty7, 2, 2		# 0x870101..01
+
 	addic.		$len,$len,0x60
 	beq		Lxts_enc6x_zero
 	cmpwi		$len,0x20
@@ -3112,6 +3139,18 @@ ()
 	li		$x70,0x70
 	mtspr		256,r0
 
+	# Reverse eighty7 to 0x010101..87
+	xxlor		2, 32+$eighty7, 32+$eighty7
+	vsldoi		$eighty7,$tmp,$eighty7,1	# 0x010101..87
+	xxlor		1, 32+$eighty7, 32+$eighty7
+
+	# Load XOR contents. 0x0f102132435465768798a9bacbdcedfe
+	mr		$x70, r6
+	bl		Lconsts
+	lxvw4x		0, $x40, r6		# load XOR contents
+	mr		r6, $x70
+	li		$x70,0x70
+
 	subi		$rounds,$rounds,3	# -4 in total
 
 	lvx		$rndkey0,$x00,$key1	# load key schedule
@@ -3159,64 +3198,64 @@ ()
 	vxor		$twk0,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out0,$in0,$twk0
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in1, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in1
 
 	 lvx_u		$in1,$x10,$inp
 	vxor		$twk1,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in1,$in1,$in1,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out1,$in1,$twk1
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in2, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in2
 
 	 lvx_u		$in2,$x20,$inp
 	 andi.		$taillen,$len,15
 	vxor		$twk2,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in2,$in2,$in2,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out2,$in2,$twk2
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in3, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in3
 
 	 lvx_u		$in3,$x30,$inp
 	 sub		$len,$len,$taillen
 	vxor		$twk3,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in3,$in3,$in3,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out3,$in3,$twk3
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in4, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in4
 
 	 lvx_u		$in4,$x40,$inp
 	 subi		$len,$len,0x60
 	vxor		$twk4,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in4,$in4,$in4,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out4,$in4,$twk4
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in5, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in5
 
 	 lvx_u		$in5,$x50,$inp
 	 addi		$inp,$inp,0x60
 	vxor		$twk5,$tweak,$rndkey0
 	vsrab		$tmp,$tweak,$seven	# next tweak value
 	vaddubm		$tweak,$tweak,$tweak
-	vsldoi		$tmp,$tmp,$tmp,15
 	 le?vperm	$in5,$in5,$in5,$leperm
 	vand		$tmp,$tmp,$eighty7
 	 vxor		$out5,$in5,$twk5
-	vxor		$tweak,$tweak,$tmp
+	xxlor		32+$in0, 0, 0
+	vpermxor	$tweak, $tweak, $tmp, $in0
 
 	vxor		v31,v31,$rndkey0
 	mtctr		$rounds
@@ -3242,6 +3281,8 @@ ()
 	lvx		v25,$x10,$key_		# round[4]
 	bdnz		Loop_xts_dec6x
 
+	xxlor		32+$eighty7, 1, 1
+
 	subic		$len,$len,96		# $len-=96
 	 vxor		$in0,$twk0,v31		# xor with last round key
 	vncipher	$out0,$out0,v24
@@ -3251,7 +3292,6 @@ ()
 	 vaddubm	$tweak,$tweak,$tweak
 	vncipher	$out2,$out2,v24
 	vncipher	$out3,$out3,v24
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vncipher	$out4,$out4,v24
 	vncipher	$out5,$out5,v24
 
@@ -3259,7 +3299,8 @@ ()
 	 vand		$tmp,$tmp,$eighty7
 	vncipher	$out0,$out0,v25
 	vncipher	$out1,$out1,v25
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in1, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in1
 	vncipher	$out2,$out2,v25
 	vncipher	$out3,$out3,v25
 	 vxor		$in1,$twk1,v31
@@ -3270,13 +3311,13 @@ ()
 
 	and		r0,r0,$len
 	 vaddubm	$tweak,$tweak,$tweak
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vncipher	$out0,$out0,v26
 	vncipher	$out1,$out1,v26
 	 vand		$tmp,$tmp,$eighty7
 	vncipher	$out2,$out2,v26
 	vncipher	$out3,$out3,v26
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in2, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in2
 	vncipher	$out4,$out4,v26
 	vncipher	$out5,$out5,v26
 
@@ -3290,7 +3331,6 @@ ()
 	 vaddubm	$tweak,$tweak,$tweak
 	vncipher	$out0,$out0,v27
 	vncipher	$out1,$out1,v27
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vncipher	$out2,$out2,v27
 	vncipher	$out3,$out3,v27
 	 vand		$tmp,$tmp,$eighty7
@@ -3298,7 +3338,8 @@ ()
 	vncipher	$out5,$out5,v27
 
 	addi		$key_,$sp,$FRAME+15	# rewind $key_
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in3, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in3
 	vncipher	$out0,$out0,v28
 	vncipher	$out1,$out1,v28
 	 vxor		$in3,$twk3,v31
@@ -3307,7 +3348,6 @@ ()
 	vncipher	$out2,$out2,v28
 	vncipher	$out3,$out3,v28
 	 vaddubm	$tweak,$tweak,$tweak
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vncipher	$out4,$out4,v28
 	vncipher	$out5,$out5,v28
 	lvx		v24,$x00,$key_		# re-pre-load round[1]
@@ -3315,7 +3355,8 @@ ()
 
 	vncipher	$out0,$out0,v29
 	vncipher	$out1,$out1,v29
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in4, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in4
 	vncipher	$out2,$out2,v29
 	vncipher	$out3,$out3,v29
 	 vxor		$in4,$twk4,v31
@@ -3325,14 +3366,14 @@ ()
 	vncipher	$out5,$out5,v29
 	lvx		v25,$x10,$key_		# re-pre-load round[2]
 	 vaddubm	$tweak,$tweak,$tweak
-	 vsldoi		$tmp,$tmp,$tmp,15
 
 	vncipher	$out0,$out0,v30
 	vncipher	$out1,$out1,v30
 	 vand		$tmp,$tmp,$eighty7
 	vncipher	$out2,$out2,v30
 	vncipher	$out3,$out3,v30
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		32+$in5, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in5
 	vncipher	$out4,$out4,v30
 	vncipher	$out5,$out5,v30
 	 vxor		$in5,$twk5,v31
@@ -3342,7 +3383,6 @@ ()
 	vncipherlast	$out0,$out0,$in0
 	 lvx_u		$in0,$x00,$inp		# load next input block
 	 vaddubm	$tweak,$tweak,$tweak
-	 vsldoi		$tmp,$tmp,$tmp,15
 	vncipherlast	$out1,$out1,$in1
 	 lvx_u		$in1,$x10,$inp
 	vncipherlast	$out2,$out2,$in2
@@ -3355,7 +3395,10 @@ ()
 	vncipherlast	$out4,$out4,$in4
 	 le?vperm	$in2,$in2,$in2,$leperm
 	 lvx_u		$in4,$x40,$inp
-	 vxor		$tweak,$tweak,$tmp
+	 xxlor		10, 32+$in0, 32+$in0
+	 xxlor		32+$in0, 0, 0
+	 vpermxor	$tweak, $tweak, $tmp, $in0
+	 xxlor		32+$in0, 10, 10
 	vncipherlast	$out5,$out5,$in5
 	 le?vperm	$in3,$in3,$in3,$leperm
 	 lvx_u		$in5,$x50,$inp
@@ -3386,6 +3429,8 @@ ()
 	mtctr		$rounds
 	beq		Loop_xts_dec6x		# did $len-=96 borrow?
 
+	xxlor		32+$eighty7, 2, 2
+
 	addic.		$len,$len,0x60
 	beq		Lxts_dec6x_zero
 	cmpwi		$len,0x20