File openssl-1_1-Optimize-ppc64.patch of Package openssl-1_1
From 4dba53694bf633c272075e62acdc5a5ca3003ce6 Mon Sep 17 00:00:00 2001
From: Amitay Isaacs <amitay@ozlabs.org>
Date: Mon, 29 Mar 2021 18:06:13 +1100
Subject: [PATCH 01/29] numbers: Define 128-bit integers if compiler supports
Signed-off-by: Amitay Isaacs <amitay@ozlabs.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/14784)
(cherry picked from commit bbed0d1cbd436af6797d7837e270bff4ca4d5a10)
---
include/internal/numbers.h | 10 ++++++++++
1 file changed, 10 insertions(+)
Index: openssl-1.1.1l/include/internal/numbers.h
===================================================================
--- openssl-1.1.1l.orig/include/internal/numbers.h
+++ openssl-1.1.1l/include/internal/numbers.h
@@ -60,6 +60,16 @@
# define UINT64_MAX __MAXUINT__(uint64_t)
# endif
+# ifndef INT128_MAX
+# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16
+typedef __int128_t int128_t;
+typedef __uint128_t uint128_t;
+# define INT128_MIN __MININT__(int128_t)
+# define INT128_MAX __MAXINT__(int128_t)
+# define UINT128_MAX __MAXUINT__(uint128_t)
+# endif
+# endif
+
# ifndef SIZE_MAX
# define SIZE_MAX __MAXUINT__(size_t)
# endif
Index: openssl-1.1.1l/crypto/bn/bn_div.c
===================================================================
--- openssl-1.1.1l.orig/crypto/bn/bn_div.c
+++ openssl-1.1.1l/crypto/bn/bn_div.c
@@ -97,7 +97,7 @@ BN_ULONG bn_div_3_words(const BN_ULONG *
*/
# if BN_BITS2 == 64 && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
# undef BN_ULLONG
-# define BN_ULLONG __uint128_t
+# define BN_ULLONG uint128_t
# define BN_LLONG
# endif
Index: openssl-1.1.1l/crypto/bn/bn_local.h
===================================================================
--- openssl-1.1.1l.orig/crypto/bn/bn_local.h
+++ openssl-1.1.1l/crypto/bn/bn_local.h
@@ -22,6 +22,7 @@
# endif
# include "crypto/bn.h"
+# include "internal/numbers.h"
/*
* These preprocessor symbols control various aspects of the bignum headers
@@ -374,9 +375,9 @@ struct bn_gencb_st {
*/
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 && \
(defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
-# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
+# define BN_UMULT_HIGH(a,b) (((uint128_t)(a)*(b))>>64)
# define BN_UMULT_LOHI(low,high,a,b) ({ \
- __uint128_t ret=(__uint128_t)(a)*(b); \
+ uint128_t ret=(uint128_t)(a)*(b); \
(high)=ret>>64; (low)=ret; })
# elif defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
# if defined(__DECC)
Index: openssl-1.1.1l/crypto/ec/curve25519.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve25519.c
+++ openssl-1.1.1l/crypto/ec/curve25519.c
@@ -11,6 +11,8 @@
#include "ec_local.h"
#include <openssl/sha.h>
+#include "internal/numbers.h"
+
#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
defined(_M_AMD64) || defined(_M_X64))
@@ -252,7 +254,7 @@ static void x25519_scalar_mulx(uint8_t o
#endif
#if defined(X25519_ASM) \
- || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
+ || ( defined(INT128_MAX) \
&& !defined(__sparc__) \
&& (!defined(__SIZEOF_LONG__) || (__SIZEOF_LONG__ == 8)) \
&& !(defined(__ANDROID__) && !defined(__clang__)) )
@@ -385,7 +387,7 @@ void x25519_fe51_mul121666(fe51 h, fe51
# define fe51_mul121666 x25519_fe51_mul121666
# else
-typedef __uint128_t u128;
+typedef uint128_t u128;
static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
{
Index: openssl-1.1.1l/crypto/ec/curve448/curve448utils.h
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve448/curve448utils.h
+++ openssl-1.1.1l/crypto/ec/curve448/curve448utils.h
@@ -15,6 +15,8 @@
# include <openssl/e_os2.h>
+# include "internal/numbers.h"
+
/*
* Internal word types. Somewhat tricky. This could be decided separately per
* platform. However, the structs do need to be all the same size and
@@ -41,9 +43,9 @@ typedef int64_t c448_sword_t;
/* "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
typedef uint64_t c448_bool_t;
/* Double-word size for internal computations */
-typedef __uint128_t c448_dword_t;
+typedef uint128_t c448_dword_t;
/* Signed double-word size for internal computations */
-typedef __int128_t c448_dsword_t;
+typedef int128_t c448_dsword_t;
# elif C448_WORD_BITS == 32
/* Word size for internal computations */
typedef uint32_t c448_word_t;
Index: openssl-1.1.1l/crypto/ec/curve448/word.h
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve448/word.h
+++ openssl-1.1.1l/crypto/ec/curve448/word.h
@@ -17,15 +17,20 @@
# include <assert.h>
# include <stdlib.h>
# include <openssl/e_os2.h>
-# include "arch_intrinsics.h"
# include "curve448utils.h"
+# ifdef INT128_MAX
+# include "arch_64/arch_intrinsics.h"
+# else
+# include "arch_32/arch_intrinsics.h"
+# endif
+
# if (ARCH_WORD_BITS == 64)
typedef uint64_t word_t, mask_t;
-typedef __uint128_t dword_t;
+typedef uint128_t dword_t;
typedef int32_t hsword_t;
typedef int64_t sword_t;
-typedef __int128_t dsword_t;
+typedef int128_t dsword_t;
# elif (ARCH_WORD_BITS == 32)
typedef uint32_t word_t, mask_t;
typedef uint64_t dword_t;
Index: openssl-1.1.1l/crypto/ec/ecp_nistp224.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp224.c
+++ openssl-1.1.1l/crypto/ec/ecp_nistp224.c
@@ -40,11 +40,9 @@ NON_EMPTY_TRANSLATION_UNIT
# include <openssl/err.h>
# include "ec_local.h"
-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
- /* even with gcc, the typedef won't work for 32-bit platforms */
-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
- * platforms */
-# else
+#include "internal/numbers.h"
+
+#ifndef INT128_MAX
# error "Your compiler doesn't appear to support 128-bit integer types"
# endif
Index: openssl-1.1.1l/crypto/ec/ecp_nistp256.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp256.c
+++ openssl-1.1.1l/crypto/ec/ecp_nistp256.c
@@ -41,14 +41,11 @@ NON_EMPTY_TRANSLATION_UNIT
# include <openssl/err.h>
# include "ec_local.h"
-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
- /* even with gcc, the typedef won't work for 32-bit platforms */
-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
- * platforms */
-typedef __int128_t int128_t;
-# else
-# error "Your compiler doesn't appear to support 128-bit integer types"
-# endif
+#include "internal/numbers.h"
+
+#ifndef INT128_MAX
+# error "Your compiler doesn't appear to support 128-bit integer types"
+#endif
typedef uint8_t u8;
typedef uint32_t u32;
Index: openssl-1.1.1l/crypto/ec/ecp_nistp521.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/ecp_nistp521.c
+++ openssl-1.1.1l/crypto/ec/ecp_nistp521.c
@@ -40,13 +40,11 @@ NON_EMPTY_TRANSLATION_UNIT
# include <openssl/err.h>
# include "ec_local.h"
-# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
- /* even with gcc, the typedef won't work for 32-bit platforms */
-typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
- * platforms */
-# else
-# error "Your compiler doesn't appear to support 128-bit integer types"
-# endif
+#include "internal/numbers.h"
+
+#ifndef INT128_MAX
+# error "Your compiler doesn't appear to support 128-bit integer types"
+#endif
typedef uint8_t u8;
typedef uint64_t u64;
@@ -400,7 +398,7 @@ static void felem_diff128(largefelem out
* On exit:
* out[i] < 17 * max(in[i]) * max(in[i])
*/
-static void felem_square(largefelem out, const felem in)
+static void felem_square_ref(largefelem out, const felem in)
{
felem inx2, inx4;
felem_scalar(inx2, in, 2);
@@ -484,7 +482,7 @@ static void felem_square(largefelem out,
* On exit:
* out[i] < 17 * max(in1[i]) * max(in2[i])
*/
-static void felem_mul(largefelem out, const felem in1, const felem in2)
+static void felem_mul_ref(largefelem out, const felem in1, const felem in2)
{
felem in2x2;
felem_scalar(in2x2, in2, 2);
@@ -674,6 +672,57 @@ static void felem_reduce(felem out, cons
*/
}
+#if defined(ECP_NISTP521_ASM)
+static void felem_square_wrapper(largefelem out, const felem in);
+static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2);
+/* Active implementations; both start at the wrappers so first use selects. */
+static void (*felem_square_p)(largefelem out, const felem in) =
+    felem_square_wrapper;
+static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) =
+    felem_mul_wrapper;
+/* Provided outside this file (presumably by ecp_nistp521-ppc64.s). */
+void p521_felem_square(largefelem out, const felem in);
+void p521_felem_mul(largefelem out, const felem in1, const felem in2);
+
+# if defined(_ARCH_PPC64)
+#  include "../ppc_arch.h"
+# endif
+/* Point felem_square_p/felem_mul_p at the p521_* or the *_ref functions. */
+static void felem_select(void)
+{
+# if defined(_ARCH_PPC64)
+    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
+        felem_square_p = p521_felem_square;
+        felem_mul_p = p521_felem_mul;
+
+        return;
+    }
+# endif
+
+    /* Default */
+    felem_square_p = felem_square_ref;
+    felem_mul_p = felem_mul_ref;
+}
+
+static void felem_square_wrapper(largefelem out, const felem in)
+{
+    felem_select();
+    felem_square_p(out, in);
+}
+
+static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2)
+{
+    felem_select();
+    felem_mul_p(out, in1, in2);
+}
+
+# define felem_square felem_square_p
+# define felem_mul felem_mul_p
+#else
+# define felem_square felem_square_ref
+# define felem_mul felem_mul_ref
+#endif
+
static void felem_square_reduce(felem out, const felem in)
{
largefelem tmp;
Index: openssl-1.1.1l/crypto/poly1305/poly1305.c
===================================================================
--- openssl-1.1.1l.orig/crypto/poly1305/poly1305.c
+++ openssl-1.1.1l/crypto/poly1305/poly1305.c
@@ -95,11 +95,10 @@ poly1305_blocks(void *ctx, const unsigne
(a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1) \
)
-# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \
- (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8)
+# if defined(INT64_MAX) && defined(INT128_MAX)
typedef unsigned long u64;
-typedef __uint128_t u128;
+typedef uint128_t u128;
typedef struct {
u64 h[3];
Index: openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c
===================================================================
--- openssl-1.1.1l.orig/crypto/poly1305/poly1305_base2_44.c
+++ openssl-1.1.1l/crypto/poly1305/poly1305_base2_44.c
@@ -18,7 +18,7 @@
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long u64;
-typedef unsigned __int128 u128;
+typedef uint128_t u128;
typedef struct {
u64 h[3];
Index: openssl-1.1.1l/crypto/ec/build.info
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/build.info
+++ openssl-1.1.1l/crypto/ec/build.info
@@ -6,8 +13,9 @@ SOURCE[../../libcrypto]=\
ecp_nistp224.c ecp_nistp256.c ecp_nistp521.c ecp_nistputil.c \
ecp_oct.c ec2_oct.c ec_oct.c ec_kmeth.c ecdh_ossl.c ecdh_kdf.c \
ecdsa_ossl.c ecdsa_sign.c ecdsa_vrf.c curve25519.c ecx_meth.c \
- curve448/arch_32/f_impl.c curve448/f_generic.c curve448/scalar.c \
+ curve448/f_generic.c curve448/scalar.c \
curve448/curve448_tables.c curve448/eddsa.c curve448/curve448.c \
+ curve448/arch_64/f_impl64.c curve448/arch_32/f_impl32.c \
{- $target{ec_asm_src} -}
GENERATE[ecp_nistz256-x86.s]=asm/ecp_nistz256-x86.pl \
@@ -29,6 +38,8 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_n
INCLUDE[ecp_nistz256-armv8.o]=..
GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl $(PERLASM_SCHEME)
+GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl $(PERLASM_SCHEME)
+
GENERATE[x25519-x86_64.s]=asm/x25519-x86_64.pl $(PERLASM_SCHEME)
GENERATE[x25519-ppc64.s]=asm/x25519-ppc64.pl $(PERLASM_SCHEME)
@@ -36,10 +47,3 @@ BEGINRAW[Makefile]
{- $builddir -}/ecp_nistz256-%.S: {- $sourcedir -}/asm/ecp_nistz256-%.pl
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
ENDRAW[Makefile]
-
-INCLUDE[curve448/arch_32/f_impl.o]=curve448/arch_32 curve448
-INCLUDE[curve448/f_generic.o]=curve448/arch_32 curve448
-INCLUDE[curve448/scalar.o]=curve448/arch_32 curve448
-INCLUDE[curve448/curve448_tables.o]=curve448/arch_32 curve448
-INCLUDE[curve448/eddsa.o]=curve448/arch_32 curve448
-INCLUDE[curve448/curve448.o]=curve448/arch_32 curve448
Index: openssl-1.1.1l/crypto/ec/curve448/field.h
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve448/field.h
+++ openssl-1.1.1l/crypto/ec/curve448/field.h
@@ -66,10 +66,15 @@ void gf_serialize(uint8_t *serial, const
mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
uint8_t hi_nmask);
-# include "f_impl.h" /* Bring in the inline implementations */
# define LIMBPERM(i) (i)
-# define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
+# if (ARCH_WORD_BITS == 32)
+# include "arch_32/f_impl.h" /* Bring in the inline implementations */
+# define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
+# elif (ARCH_WORD_BITS == 64)
+# include "arch_64/f_impl.h" /* Bring in the inline implementations */
+# define LIMB_MASK(i) (((1ULL)<<LIMB_PLACE_VALUE(i))-1)
+# endif
static const gf ZERO = {{{0}}}, ONE = {{{1}}};
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/arch_intrinsics.h
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/arch_intrinsics.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2016 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+# include "internal/constant_time.h"
+
+#ifndef OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H
+# define OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H
+
+# define ARCH_WORD_BITS 64
+
+# define word_is_zero(a) constant_time_is_zero_64(a)
+/* Multiply two 64-bit words and return the full 128-bit product. */
+static ossl_inline uint128_t widemul(uint64_t a, uint64_t b)
+{
+    return (uint128_t)a * (uint128_t)b;
+}
+
+#endif /* OSSL_CRYPTO_EC_CURVE448_ARCH_64_INTRINSICS_H */
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.h
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014-2016 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#ifndef OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H
+# define OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H
+
+# define GF_HEADROOM 9999 /* Everything is reduced anyway */
+# define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
+
+# define LIMB_PLACE_VALUE(i) 56
+/* out = a + b limb by limb, then out is passed through gf_weak_reduce(). */
+void gf_add_RAW(gf out, const gf a, const gf b)
+{
+    unsigned int k;
+
+    for (k = 0; k != NLIMBS; k++)
+        out->limb[k] = a->limb[k] + b->limb[k];
+
+    gf_weak_reduce(out);
+}
+/* out = a - b per limb plus a bias (co2 for limb NLIMBS/2, co1 otherwise), then weakly reduced. */
+void gf_sub_RAW(gf out, const gf a, const gf b)
+{
+    const uint64_t co1 = ((1ULL << 56) - 1) * 2;
+    const uint64_t co2 = co1 - 2;
+    unsigned int k;
+    for (k = 0; k < NLIMBS; k++)
+        out->limb[k] = a->limb[k] - b->limb[k] + ((k != NLIMBS / 2) ? co1 : co2);
+
+    gf_weak_reduce(out);
+}
+/* No-op in this implementation: the body is empty and both arguments are ignored. */
+void gf_bias(gf a, int amt)
+{
+}
+/* Move bits 56 and up of every limb into the next limb; the top limb's excess goes to limbs 0 and NLIMBS/2. */
+void gf_weak_reduce(gf a)
+{
+ uint64_t mask = (1ULL << 56) - 1;
+ uint64_t tmp = a->limb[NLIMBS - 1] >> 56;
+ unsigned int i;
+ /* Descending order: limb[i - 1] is read here before its own turn masks it. */
+ a->limb[NLIMBS / 2] += tmp;
+ for (i = NLIMBS - 1; i > 0; i--)
+ a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
+ a->limb[0] = (a->limb[0] & mask) + tmp;
+}
+
+#endif /* OSSL_CRYPTO_EC_CURVE448_ARCH_64_F_IMPL_H */
Index: openssl-1.1.1l/include/internal/constant_time.h
===================================================================
--- openssl-1.1.1l.orig/include/internal/constant_time.h
+++ openssl-1.1.1l/include/internal/constant_time.h
@@ -181,6 +181,11 @@ static ossl_inline uint32_t constant_tim
return constant_time_msb_32(~a & (a - 1));
}
+static ossl_inline uint64_t constant_time_is_zero_64(uint64_t a)
+{
+    return constant_time_msb_64((a - 1) & ~a); /* top bit of this value is set only when a == 0 */
+}
+
static ossl_inline unsigned int constant_time_eq(unsigned int a,
unsigned int b)
{
Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl32.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#include <openssl/opensslconf.h>
+#include "internal/numbers.h"
+
+#ifdef UINT128_MAX
+/* We have support for 128 bit ints, so do nothing here */
+NON_EMPTY_TRANSLATION_UNIT
+#else
+
+# include "../field.h"
+/* cs = as * bs on 16 limbs masked to 28 bits each; aa/bb hold sums of the operand halves. */
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
+{
+ const uint32_t *a = as->limb, *b = bs->limb;
+ uint32_t *c = cs->limb;
+ uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
+ uint32_t mask = (1 << 28) - 1;
+ uint32_t aa[8], bb[8];
+ int i, j;
+ /* aa[i]/bb[i]: limb i plus limb i + 8 of the respective operand */
+ for (i = 0; i < 8; i++) {
+ aa[i] = a[i] + a[i + 8];
+ bb[i] = b[i] + b[i + 8];
+ }
+ /* Each pass j produces c[j] and c[j + 8]; carries stay in accum0/accum1 */
+ for (j = 0; j < 8; j++) {
+ accum2 = 0;
+ for (i = 0; i < j + 1; i++) {
+ accum2 += widemul(a[j - i], b[i]);
+ accum1 += widemul(aa[j - i], bb[i]);
+ accum0 += widemul(a[8 + j - i], b[8 + i]);
+ }
+ accum1 -= accum2;
+ accum0 += accum2;
+ accum2 = 0;
+ for (i = j + 1; i < 8; i++) {
+ accum0 -= widemul(a[8 + j - i], b[i]);
+ accum2 += widemul(aa[8 + j - i], bb[i]);
+ accum1 += widemul(a[16 + j - i], b[8 + i]);
+ }
+ accum1 += accum2;
+ accum0 += accum2;
+ c[j] = ((uint32_t)(accum0)) & mask;
+ c[j + 8] = ((uint32_t)(accum1)) & mask;
+ accum0 >>= 28;
+ accum1 >>= 28;
+ }
+ /* Add the leftover carries into limbs 8 and 0 and re-mask them ... */
+ accum0 += accum1;
+ accum0 += c[8];
+ accum1 += c[0];
+ c[8] = ((uint32_t)(accum0)) & mask;
+ c[0] = ((uint32_t)(accum1)) & mask;
+ /* ... then add what remains to limbs 9 and 1 without masking */
+ accum0 >>= 28;
+ accum1 >>= 28;
+ c[9] += ((uint32_t)(accum0));
+ c[1] += ((uint32_t)(accum1));
+}
+/* cs = as * b for a single word b (asserted to fit the 28-bit mask), using two carry chains. */
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
+{
+ const uint32_t *a = as->limb;
+ uint32_t *c = cs->limb;
+ uint64_t accum0 = 0, accum8 = 0;
+ uint32_t mask = (1 << 28) - 1;
+ int i;
+ /* b is expected to be at most one limb wide */
+ assert(b <= mask);
+ /* accum0 runs over limbs 0..7, accum8 over limbs 8..15 */
+ for (i = 0; i < 8; i++) {
+ accum0 += widemul(b, a[i]);
+ accum8 += widemul(b, a[i + 8]);
+ c[i] = accum0 & mask;
+ accum0 >>= 28;
+ c[i + 8] = accum8 & mask;
+ accum8 >>= 28;
+ }
+ /* Both leftover carries are added at limb 8, overflow goes to limb 9 */
+ accum0 += accum8 + c[8];
+ c[8] = ((uint32_t)accum0) & mask;
+ c[9] += (uint32_t)(accum0 >> 28);
+ /* The upper chain's carry is also added at limb 0, overflow goes to limb 1 */
+ accum8 += c[0];
+ c[0] = ((uint32_t)accum8) & mask;
+ c[1] += (uint32_t)(accum8 >> 28);
+}
+/* cs = as squared, computed by calling gf_mul() with as as both operands. */
+void gf_sqr(gf_s * RESTRICT cs, const gf as)
+{
+ gf_mul(cs, as, as); /* no dedicated squaring here; presumably one would perform better */
+}
+#endif
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl64.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#include <openssl/opensslconf.h>
+#include "internal/numbers.h"
+
+#ifndef UINT128_MAX
+/* No support for 128 bit ints, so do nothing here */
+NON_EMPTY_TRANSLATION_UNIT
+#else
+
+# include "../field.h"
+/* cs = as * bs on 8 limbs masked to 56 bits each; aa, bb and bbb are sums of operand halves. */
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
+{
+ const uint64_t *a = as->limb, *b = bs->limb;
+ uint64_t *c = cs->limb;
+ uint128_t accum0 = 0, accum1 = 0, accum2;
+ uint64_t mask = (1ULL << 56) - 1;
+ uint64_t aa[4], bb[4], bbb[4];
+ unsigned int i, j;
+ /* aa = a[i] + a[i+4], bb = b[i] + b[i+4], bbb = b[i] + 2 * b[i+4] */
+ for (i = 0; i < 4; i++) {
+ aa[i] = a[i] + a[i + 4];
+ bb[i] = b[i] + b[i + 4];
+ bbb[i] = bb[i] + b[i + 4];
+ }
+ /* Each pass i produces c[i] and c[i + 4]; carries stay in accum0/accum1 */
+ for (i = 0; i < 4; i++) {
+ accum2 = 0;
+
+ for (j = 0; j <= i; j++) {
+ accum2 += widemul(a[j], b[i - j]);
+ accum1 += widemul(aa[j], bb[i - j]);
+ accum0 += widemul(a[j + 4], b[i - j + 4]);
+ }
+ for (; j < 4; j++) { /* j > i here: unsigned i - j wraps, adding 8 (or 4) restores a valid index */
+ accum2 += widemul(a[j], b[i - j + 8]);
+ accum1 += widemul(aa[j], bbb[i - j + 4]);
+ accum0 += widemul(a[j + 4], bb[i - j + 4]);
+ }
+
+ accum1 -= accum2;
+ accum0 += accum2;
+
+ c[i] = ((uint64_t)(accum0)) & mask;
+ c[i + 4] = ((uint64_t)(accum1)) & mask;
+
+ accum0 >>= 56;
+ accum1 >>= 56;
+ }
+ /* Add the leftover carries into limbs 4 and 0, then what remains into limbs 5 and 1 */
+ accum0 += accum1;
+ accum0 += c[4];
+ accum1 += c[0];
+ c[4] = ((uint64_t)(accum0)) & mask;
+ c[0] = ((uint64_t)(accum1)) & mask;
+
+ accum0 >>= 56;
+ accum1 >>= 56;
+
+ c[5] += ((uint64_t)(accum0));
+ c[1] += ((uint64_t)(accum1));
+}
+/* cs = as * b for a single 32-bit word b, on 8 limbs masked to 56 bits each. */
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
+{
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+    uint128_t accum0 = 0, accum4 = 0;
+    uint64_t mask = (1ULL << 56) - 1;
+    int i;
+    /* accum0 carries through limbs 0..3, accum4 through limbs 4..7 */
+    for (i = 0; i < 4; i++) {
+        accum0 += widemul(b, a[i]);
+        accum4 += widemul(b, a[i + 4]);
+        c[i] = accum0 & mask;
+        accum0 >>= 56;
+        c[i + 4] = accum4 & mask;
+        accum4 >>= 56;
+    }
+    /* Both leftover carries are added at limb 4, overflow goes to limb 5 */
+    accum0 += accum4 + c[4];
+    c[4] = accum0 & mask;
+    c[5] += accum0 >> 56;
+    /* The upper chain's carry is also added at limb 0, overflow goes to limb 1 */
+    accum4 += c[0];
+    c[0] = accum4 & mask;
+    c[1] += accum4 >> 56;
+}
+/* cs = as squared on 8 limbs masked to 56 bits each, with the products written out by hand. */
+void gf_sqr(gf_s * RESTRICT cs, const gf as)
+{
+    const uint64_t *a = as->limb;
+    uint64_t *c = cs->limb;
+    uint128_t accum0 = 0, accum1 = 0, accum2;
+    uint64_t mask = (1ULL << 56) - 1;
+    uint64_t aa[4];
+
+    /* For some reason clang doesn't vectorize this without prompting? */
+    unsigned int i;
+    for (i = 0; i < 4; i++) {
+        aa[i] = a[i] + a[i + 4];
+    }
+
+    accum2 = widemul(a[0], a[3]);
+    accum0 = widemul(aa[0], aa[3]);
+    accum1 = widemul(a[4], a[7]);
+
+    accum2 += widemul(a[1], a[2]);
+    accum0 += widemul(aa[1], aa[2]);
+    accum1 += widemul(a[5], a[6]);
+
+    accum0 -= accum2;
+    accum1 += accum2;
+    /* These sums are stored doubled (<< 1), so the carry shift is 55 rather than 56 */
+    c[3] = ((uint64_t)(accum1)) << 1 & mask;
+    c[7] = ((uint64_t)(accum0)) << 1 & mask;
+
+    accum0 >>= 55;
+    accum1 >>= 55;
+
+    accum0 += widemul(2 * aa[1], aa[3]);
+    accum1 += widemul(2 * a[5], a[7]);
+    accum0 += widemul(aa[2], aa[2]);
+    accum1 += accum0;
+
+    accum0 -= widemul(2 * a[1], a[3]);
+    accum1 += widemul(a[6], a[6]);
+
+    accum2 = widemul(a[0], a[0]);
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    accum0 -= widemul(a[2], a[2]);
+    accum1 += widemul(aa[0], aa[0]);
+    accum0 += widemul(a[4], a[4]);
+
+    c[0] = ((uint64_t)(accum0)) & mask;
+    c[4] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum2 = widemul(2 * aa[2], aa[3]);
+    accum0 -= widemul(2 * a[2], a[3]);
+    accum1 += widemul(2 * a[6], a[7]);
+
+    accum1 += accum2;
+    accum0 += accum2;
+
+    accum2 = widemul(2 * a[0], a[1]);
+    accum1 += widemul(2 * aa[0], aa[1]);
+    accum0 += widemul(2 * a[4], a[5]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[1] = ((uint64_t)(accum0)) & mask;
+    c[5] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum2 = widemul(aa[3], aa[3]);
+    accum0 -= widemul(a[3], a[3]);
+    accum1 += widemul(a[7], a[7]);
+
+    accum1 += accum2;
+    accum0 += accum2;
+
+    accum2 = widemul(2 * a[0], a[2]);
+    accum1 += widemul(2 * aa[0], aa[2]);
+    accum0 += widemul(2 * a[4], a[6]);
+
+    accum2 += widemul(a[1], a[1]);
+    accum1 += widemul(aa[1], aa[1]);
+    accum0 += widemul(a[5], a[5]);
+
+    accum1 -= accum2;
+    accum0 += accum2;
+
+    c[2] = ((uint64_t)(accum0)) & mask;
+    c[6] = ((uint64_t)(accum1)) & mask;
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+
+    accum0 += c[3];
+    accum1 += c[7];
+    c[3] = ((uint64_t)(accum0)) & mask;
+    c[7] = ((uint64_t)(accum1)) & mask;
+
+    /* we could almost stop here, but it wouldn't be stable, so... */
+
+    accum0 >>= 56;
+    accum1 >>= 56;
+    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
+    c[0] += ((uint64_t)(accum1));
+}
+#endif
Index: openssl-1.1.1l/Configurations/00-base-templates.conf
===================================================================
--- openssl-1.1.1l.orig/Configurations/00-base-templates.conf
+++ openssl-1.1.1l/Configurations/00-base-templates.conf
@@ -351,7 +351,8 @@ my %targets=(
ppc64_asm => {
inherit_from => [ "ppc32_asm" ],
template => 1,
- ec_asm_src => "ecp_nistz256.c ecp_nistz256-ppc64.s x25519-ppc64.s",
+ bn_asm_src => add("ppc64-mont-fixed.s"),
+ ec_asm_src => "ecp_nistz256.c ecp_nistz256-ppc64.s ecp_nistp521-ppc64.s x25519-ppc64.s",
keccak1600_asm_src => "keccak1600-ppc64.s",
},
);
Index: openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/bn/asm/ppc64-mont-fixed.pl
@@ -0,0 +1,581 @@
+#! /usr/bin/env perl
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
+# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
+# the OpenSSL project.
+# ====================================================================
+
+#
+# Fixed length (n=6), unrolled PPC Montgomery Multiplication
+#
+
+# 2021
+#
+# Although this is a generic implementation for unrolling Montgomery
+# Multiplication for arbitrary values of n, this is currently only
+# used for n = 6 to improve the performance of ECC p384.
+#
+# Unrolling allows intermediate results to be stored in registers,
+# rather than on the stack, improving performance by ~7% compared to
+# the existing PPC assembly code.
+#
+# The ISA 3.0 implementation uses combination multiply/add
+# instructions (maddld, maddhdu) to improve performance by an
+# additional ~10% on Power 9.
+#
+# Finally, saving non-volatile registers into volatile vector
+# registers instead of onto the stack saves a little more.
+#
+# On a Power 9 machine we see an overall improvement of ~18%.
+#
+
+use strict;
+use warnings;
+
+my ($flavour, $output, $dir, $xlate);
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
+
+if ($flavour !~ /64/) {
+ die "bad flavour ($flavour) - only ppc64 permitted";
+}
+
+my $SIZE_T= 8;
+
+# Registers are global so the code is remotely readable
+
+# Parameters for Montgomery multiplication
+my $sp = "r1";
+my $toc = "r2";
+my $rp = "r3";
+my $ap = "r4";
+my $bp = "r5";
+my $np = "r6";
+my $n0 = "r7";
+my $num = "r8";
+
+my $i = "r9";
+my $c0 = "r10";
+my $bp0 = "r11";
+my $bpi = "r11";
+my $bpj = "r11";
+my $tj = "r12";
+my $apj = "r12";
+my $npj = "r12";
+my $lo = "r14";
+my $c1 = "r14";
+
+# Non-volatile registers used for tp[i]
+#
+# 12 registers are available but the limit on unrolling is 10,
+# since registers from $tp[0] to $tp[$n+1] are used.
+my @tp = ("r20" .. "r31");
+
+# volatile VSRs for saving non-volatile GPRs - faster than stack
+my @vsrs = ("v32" .. "v46");
+
+package Mont;
+# Constructor: records the limb count $n (at most 10) and starts with empty code.
+sub new($$)
+{
+    my ($class, $n) = @_;
+
+    # @tp holds 12 registers and $tp[0] .. $tp[$n+1] are needed
+    die "Can't unroll for BN length ${n} (maximum 10)"
+        if ($n > 10);
+
+    my %fields = (
+        code => "",
+        n => $n,
+    );
+    my $self = bless \%fields, $class;
+
+    return $self;
+}
+
+sub add_code($$)
+{
+ my ($self, $c) = @_;
+
+ $self->{code} .= $c;
+}
+
+sub get_code($)
+{
+ my ($self) = @_;
+
+ return $self->{code};
+}
+
+sub get_function_name($)
+{
+ my ($self) = @_;
+
+ return "bn_mul_mont_fixed_n" . $self->{n};
+}
+
+sub get_label($$)
+{
+ my ($self, $l) = @_;
+
+ return "L" . $l . "_" . $self->{n};
+}
+
+sub get_labels($@)
+{
+ my ($self, @labels) = @_;
+
+ my %out = ();
+
+ foreach my $l (@labels) {
+ $out{"$l"} = $self->get_label("$l");
+ }
+
+ return \%out;
+}
+
+sub nl($)
+{
+ my ($self) = @_;
+
+ $self->add_code("\n");
+}
+# Emit stores of $tp[0] .. $tp[n-1] to consecutive $SIZE_T-sized slots at $rp.
+sub copy_result($)
+{
+    my ($self) = @_;
+
+    my $count = $self->{n};
+
+    foreach my $j (0 .. $count - 1) {
+        $self->add_code(<<___);
+ std $tp[$j],`$j*$SIZE_T`($rp)
+___
+    }
+
+}
+# Emit the complete unrolled routine: tp = ap*bp[0], then n reduce/accumulate passes, then the final subtract/copy.
+sub mul_mont_fixed($)
+{
+    my ($self) = @_;
+
+    my ($n) = $self->{n};
+    my $fname = $self->get_function_name();
+    my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
+
+    $self->add_code(<<___);
+
+.globl .${fname}
+.align 5
+.${fname}:
+
+___
+
+    $self->save_registers();
+
+    $self->add_code(<<___);
+ ld $n0,0($n0)
+
+ ld $bp0,0($bp)
+
+ ld $apj,0($ap)
+___
+
+    $self->mul_c_0($tp[0], $apj, $bp0, $c0);
+
+    for (my $j = 1; $j < $n - 1; $j++) {
+        $self->add_code(<<___);
+ ld $apj,`$j*$SIZE_T`($ap)
+___
+        $self->mul($tp[$j], $apj, $bp0, $c0);
+    }
+
+    $self->add_code(<<___);
+ ld $apj,`($n-1)*$SIZE_T`($ap)
+___
+
+    $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
+
+    $self->add_code(<<___);
+ li $tp[$n+1],0
+
+___
+
+    $self->add_code(<<___);
+ li $i,0
+ mtctr $num
+ b $label->{"enter"}
+
+.align 4
+$label->{"outer"}:
+ ldx $bpi,$bp,$i
+
+ ld $apj,0($ap)
+___
+
+    $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
+
+    for (my $j = 1; $j < $n; $j++) {
+        $self->add_code(<<___);
+ ld $apj,`$j*$SIZE_T`($ap)
+___
+        $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
+    }
+
+    $self->add_code(<<___);
+ addc $tp[$n],$tp[$n],$c0
+ addze $tp[$n+1],$tp[$n+1]
+___
+
+    $self->add_code(<<___);
+.align 4
+$label->{"enter"}:
+ mulld $bpi,$tp[0],$n0
+
+ ld $npj,0($np)
+___
+
+    $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
+
+    for (my $j = 1; $j < $n; $j++) {
+        $self->add_code(<<___);
+ ld $npj,`$j*$SIZE_T`($np)
+___
+        $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
+    }
+    # Once $tp[$n+1] is folded into $tp[$n] it is cleared, so an old carry is not re-added by the next pass.
+    $self->add_code(<<___);
+ addc $tp[$n-1],$tp[$n],$c0
+ addze $tp[$n],$tp[$n+1]
+ li $tp[$n+1],0
+ addi $i,$i,$SIZE_T
+ bdnz $label->{"outer"}
+
+ and. $tp[$n],$tp[$n],$tp[$n]
+ bne $label->{"sub"}
+
+ cmpld $tp[$n-1],$npj
+ blt $label->{"copy"}
+
+$label->{"sub"}:
+___
+
+    #
+    # Final conditional subtraction of the modulus (np) from tp
+    #
+
+    $self->add_code(<<___);
+ ld $bpj,`0*$SIZE_T`($np)
+ subfc $c1,$bpj,$tp[0]
+ std $c1,`0*$SIZE_T`($rp)
+
+___
+    for (my $j = 1; $j < $n - 1; $j++) {
+        $self->add_code(<<___);
+ ld $bpj,`$j*$SIZE_T`($np)
+ subfe $c1,$bpj,$tp[$j]
+ std $c1,`$j*$SIZE_T`($rp)
+
+___
+    }
+
+    $self->add_code(<<___);
+ subfe $c1,$npj,$tp[$n-1]
+ std $c1,`($n-1)*$SIZE_T`($rp)
+
+___
+
+    $self->add_code(<<___);
+ addme. $tp[$n],$tp[$n]
+ beq $label->{"end"}
+
+$label->{"copy"}:
+___
+
+    $self->copy_result();
+
+    $self->add_code(<<___);
+
+$label->{"end"}:
+___
+
+    $self->restore_registers();
+
+    $self->add_code(<<___);
+ li r3,1
+ blr
+.size .${fname},.-.${fname}
+___
+
+}
+
+package Mont::GPR;
+
+our @ISA = ('Mont');
+
+sub new($$)
+{
+ my ($class, $n) = @_;
+
+ return $class->SUPER::new($n);
+}
+# Store $lo and $tp[0..n+1] at negative offsets from $sp; $sp itself is not changed.
+sub save_registers($)
+{
+    my ($self) = @_;
+
+    my $last = $self->{n} + 1;
+
+    $self->add_code(<<___);
+ std $lo,-8($sp)
+___
+
+    foreach my $j (0 .. $last) {
+        $self->add_code(<<___);
+ std $tp[$j],-`($j+2)*8`($sp)
+___
+    }
+
+    $self->add_code(<<___);
+
+___
+}
+# Reload $lo and $tp[0..n+1] from the same negative offsets from $sp that save_registers() uses.
+sub restore_registers($)
+{
+    my ($self) = @_;
+
+    my $last = $self->{n} + 1;
+
+    $self->add_code(<<___);
+ ld $lo,-8($sp)
+___
+
+    foreach my $j (0 .. $last) {
+        $self->add_code(<<___);
+ ld $tp[$j],-`($j+2)*8`($sp)
+___
+    }
+
+    $self->add_code(<<___);
+
+___
+}
+
+# Direct translation of C mul()
+sub mul($$$$$)
+{
+ my ($self, $r, $a, $w, $c) = @_;
+
+ $self->add_code(<<___);
+ mulld $lo,$a,$w
+ addc $r,$lo,$c
+ mulhdu $c,$a,$w
+ addze $c,$c
+
+___
+}
+
+# Like mul() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_c_0($$$$$)
+{
+ my ($self, $r, $a, $w, $c) = @_;
+
+ $self->add_code(<<___);
+ mulld $r,$a,$w
+ mulhdu $c,$a,$w
+
+___
+}
+
+# Like mul() but the final carry (CA) is added into a separate register $r2
+# instead of back into $c - an optimisation to save an instruction
+sub mul_last($$$$$$)
+{
+ my ($self, $r1, $r2, $a, $w, $c) = @_;
+
+ $self->add_code(<<___);
+ mulld $lo,$a,$w
+ addc $r1,$lo,$c
+ mulhdu $c,$a,$w
+
+ addze $r2,$c
+___
+}
+
+# Like C mul_add() but allow $r_out and $r_in to be different
+sub mul_add($$$$$$)
+{
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+ $self->add_code(<<___);
+ mulld $lo,$a,$w
+ addc $lo,$lo,$c
+ mulhdu $c,$a,$w
+ addze $c,$c
+ addc $r_out,$r_in,$lo
+ addze $c,$c
+
+___
+}
+
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_add_c_0($$$$$$)
+{
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+ $self->add_code(<<___);
+ mulld $lo,$a,$w
+ addc $r_out,$r_in,$lo
+ mulhdu $c,$a,$w
+ addze $c,$c
+
+___
+}
+
+package Mont::GPR_300;
+
+our @ISA = ('Mont::GPR');
+
+sub new($$)
+{
+ my ($class, $n) = @_;
+
+ my $mont = $class->SUPER::new($n);
+
+ return $mont;
+}
+
+sub get_function_name($)
+{
+ my ($self) = @_;
+
+ return "bn_mul_mont_300_fixed_n" . $self->{n};
+}
+
+sub get_label($$)
+{
+ my ($self, $l) = @_;
+
+ return "L" . $l . "_300_" . $self->{n};
+}
+
+# mul() as a maddld/maddhdu pair: $r gets the low and $c the high word of $a*$w + $c
+sub mul($$$$$)
+{
+    my ($self, $r, $a, $w, $c) = @_;
+
+    $self->add_code(<<___);
+ maddld $r,$a,$w,$c
+ maddhdu $c,$a,$w,$c
+
+___
+}
+
+# Like mul() but the high word is written to $r2 (the final entry) instead of $c
+sub mul_last($$$$$$)
+{
+    my ($self, $r1, $r2, $a, $w, $c) = @_;
+
+    $self->add_code(<<___);
+ maddld $r1,$a,$w,$c
+ maddhdu $r2,$a,$w,$c
+
+___
+}
+
+# Like mul() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_c_0($$$$$)
+{
+ my ($self, $r, $a, $w, $c) = @_;
+
+ $self->add_code(<<___);
+ mulld $r,$a,$w
+ mulhdu $c,$a,$w
+
+___
+}
+
+# Like C mul_add() but allow $r_out and $r_in to be different
+sub mul_add($$$$$$)
+{
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+ $self->add_code(<<___);
+ maddld $lo,$a,$w,$c
+ maddhdu $c,$a,$w,$c
+ addc $r_out,$r_in,$lo
+ addze $c,$c
+
+___
+}
+
+# Like mul_add() but $c is ignored as an input - an optimisation to save a
+# preliminary instruction that would set input $c to 0
+sub mul_add_c_0($$$$$$)
+{
+ my ($self, $r_out, $r_in, $a, $w, $c) = @_;
+
+ $self->add_code(<<___);
+ maddld $lo,$a,$w,$r_in
+ maddhdu $c,$a,$w,$r_in
+___
+
+ if ($r_out ne $lo) {
+ $self->add_code(<<___);
+ mr $r_out,$lo
+___
+ }
+
+ $self->nl();
+}
+
+
+package main;
+
+my $code;
+
+$code.=<<___;
+.machine "any"
+.text
+___
+
+my $mont;
+
+$mont = new Mont::GPR(6);
+$mont->mul_mont_fixed();
+$code .= $mont->get_code();
+
+$mont = new Mont::GPR_300(6);
+$mont->mul_mont_fixed();
+$code .= $mont->get_code();
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+$code.=<<___;
+.asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
+___
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
Index: openssl-1.1.1l/crypto/bn/build.info
===================================================================
--- openssl-1.1.1l.orig/crypto/bn/build.info
+++ openssl-1.1.1l/crypto/bn/build.info
@@ -56,6 +56,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.
GENERATE[bn-ppc.s]=asm/ppc.pl $(PERLASM_SCHEME)
GENERATE[ppc-mont.s]=asm/ppc-mont.pl $(PERLASM_SCHEME)
GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl $(PERLASM_SCHEME)
+GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl $(PERLASM_SCHEME)
GENERATE[alpha-mont.S]=asm/alpha-mont.pl $(PERLASM_SCHEME)
Index: openssl-1.1.1l/crypto/ppccap.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ppccap.c
+++ openssl-1.1.1l/crypto/ppccap.c
@@ -46,6 +46,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U
const BN_ULONG *np, const BN_ULONG *n0, int num);
int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
+ int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *bp, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
+ int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
+ const BN_ULONG *bp, const BN_ULONG *np,
+ const BN_ULONG *n0, int num);
if (num < 4)
return 0;
@@ -61,6 +67,15 @@ int bn_mul_mont(BN_ULONG *rp, const BN_U
* no opportunity to figure it out...
*/
+#if defined(_ARCH_PPC64)
+ if (num == 6) {
+ if (OPENSSL_ppccap_P & PPC_MADD300)
+ return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
+ else
+ return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
+ }
+#endif
+
return bn_mul_mont_int(rp, ap, bp, np, n0, num);
}
#endif
Index: openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl
===================================================================
--- openssl-1.1.1l.orig/crypto/perlasm/ppc-xlate.pl
+++ openssl-1.1.1l/crypto/perlasm/ppc-xlate.pl
@@ -136,6 +136,71 @@ my $quad = sub {
};
################################################################
+# vector register number hacking
+################################################################
+
+# It is convenient to be able to set a variable like:
+# my $foo = "v33";
+# and use this in different contexts where:
+# * a VSR (Vector-Scalar Register) number (e.g. "v33") is required
+# * a VR (Vector Register) number (e.g. "v1") is required
+# Map VSR numbering to VR number for certain vector instructions.
+
+# vs<N> -> v<N-32> if N >= 32; smaller numbers are returned unchanged
+sub vsr2vr1 {
+ my $in = shift;
+
+ my $n = int($in); # NOTE(review): int() needs a bare number; presumably register prefixes are stripped before this is called - confirm
+ if ($n >= 32) {
+ $n -= 32;
+ }
+
+ return "$n";
+}
+# As above for the first $num register args; the remaining args pass through untouched. Returns the combined list
+sub _vsr2vr {
+ my $num = shift;
+ my @rest = @_;
+ my @subst = splice(@rest, 0, $num); # removes the first $num entries from @rest
+
+ @subst = map { vsr2vr1($_); } @subst;
+
+ return (@subst, @rest);
+}
+# As above, but the 1st arg after $num ($f) is pulled off before remapping and
+# put back at the front of the returned list, so that it can be ignored by a
+# code generation function that consumes the result
+sub vsr2vr_args {
+ my $num = shift;
+ my $f = shift;
+
+ my @out = _vsr2vr($num, @_);
+
+ return ($f, @out);
+}
+# As above but 1st arg is mnemonic (then $num, $f, operands); returns the instruction text: mnemonic directly followed by $f, then the comma-joined operands
+sub vsr2vr {
+ my $mnemonic = shift;
+ my $num = shift;
+ my $f = shift;
+
+ my @out = _vsr2vr($num, @_);
+
+ " ${mnemonic}${f} " . join(",", @out);
+}
+
+# ISA 2.03
+my $vsel = sub { vsr2vr("vsel", 4, @_); };
+my $vsl = sub { vsr2vr("vsl", 3, @_); };
+my $vspltisb = sub { vsr2vr("vspltisb", 1, @_); };
+my $vspltisw = sub { vsr2vr("vspltisw", 1, @_); };
+my $vsr = sub { vsr2vr("vsr", 3, @_); };
+my $vsro = sub { vsr2vr("vsro", 3, @_); };
+
+# ISA 3.0
+my $lxsd = sub { vsr2vr("lxsd", 1, @_); };
+
+################################################################
# simplified mnemonics not handled by at least one assembler
################################################################
my $cmplw = sub {
@@ -226,13 +291,18 @@ my $vpermdi = sub { # xxpermdi
# PowerISA 2.07 stuff
sub vcrypto_op {
- my ($f, $vrt, $vra, $vrb, $op) = @_;
+ my ($f, $vrt, $vra, $vrb, $op) = vsr2vr_args(3, @_);
" .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
}
sub vfour {
my ($f, $vrt, $vra, $vrb, $vrc, $op) = @_;
" .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op;
};
+sub vfour_vsr {
+ my ($f, $vrt, $vra, $vrb, $vrc, $op) = vsr2vr_args(4, @_);
+ " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($vrc<<6)|$op;
+};
+
my $vcipher = sub { vcrypto_op(@_, 1288); };
my $vcipherlast = sub { vcrypto_op(@_, 1289); };
my $vncipher = sub { vcrypto_op(@_, 1352); };
@@ -254,10 +324,10 @@ my $vsld = sub { vcrypto_op(@_, 1476); }
my $vsrd = sub { vcrypto_op(@_, 1732); };
my $vsubudm = sub { vcrypto_op(@_, 1216); };
my $vaddcuq = sub { vcrypto_op(@_, 320); };
-my $vaddeuqm = sub { vfour(@_,60); };
-my $vaddecuq = sub { vfour(@_,61); };
-my $vmrgew = sub { vfour(@_,0,1932); };
-my $vmrgow = sub { vfour(@_,0,1676); };
+my $vaddeuqm = sub { vfour_vsr(@_,60); };
+my $vaddecuq = sub { vfour_vsr(@_,61); };
+my $vmrgew = sub { vfour_vsr(@_,0,1932); };
+my $vmrgow = sub { vfour_vsr(@_,0,1676); };
my $mtsle = sub {
my ($f, $arg) = @_;
@@ -298,7 +368,7 @@ my $addex = sub {
my ($f, $rt, $ra, $rb, $cy) = @_; # only cy==0 is specified in 3.0B
" .long ".sprintf "0x%X",(31<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($cy<<9)|(170<<1);
};
-my $vmsumudm = sub { vfour(@_,35); };
+my $vmsumudm = sub { vfour_vsr(@_, 35); };
while($line=<>) {
Index: openssl-1.1.1l/Configurations/10-main.conf
===================================================================
--- openssl-1.1.1l.orig/Configurations/10-main.conf
+++ openssl-1.1.1l/Configurations/10-main.conf
@@ -669,7 +669,7 @@ my %targets = (
inherit_from => [ "linux-generic64", asm("ppc64_asm") ],
cflags => add("-m64"),
cxxflags => add("-m64"),
- lib_cppflags => add("-DB_ENDIAN"),
+ lib_cppflags => add("-DB_ENDIAN -DECP_NISTP521_ASM"),
perlasm_scheme => "linux64",
multilib => "64",
},
@@ -677,7 +677,7 @@ my %targets = (
inherit_from => [ "linux-generic64", asm("ppc64_asm") ],
cflags => add("-m64"),
cxxflags => add("-m64"),
- lib_cppflags => add("-DL_ENDIAN"),
+ lib_cppflags => add("-DL_ENDIAN -DECP_NISTP521_ASM"),
perlasm_scheme => "linux64le",
},
Index: openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/asm/ecp_nistp521-ppc64.pl
@@ -0,0 +1,435 @@
+#! /usr/bin/env perl
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# ====================================================================
+# Written by Amitay Isaacs <amitay@ozlabs.org> and Martin Schwenke
+# <martin@meltin.net> for the OpenSSL project.
+# ====================================================================
+#
+# p521 lower-level primitives for PPC64 using vector instructions.
+#
+
+use strict;
+use warnings;
+
+my $flavour = shift;
+my $output = "";
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+if (!$output) {
+ $output = "-";
+}
+
+my ($xlate, $dir);
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+my $code = "";
+
+my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12");
+
+my $vzero = "v32";
+
+sub startproc($) # Append the opening of a procedure to $code: .globl, .align 5 and the label
+{
+ my ($name) = @_; # symbol name used for both the .globl directive and the label
+
+ $code.=<<___;
+ .globl ${name}
+ .align 5
+${name}:
+
+___
+}
+
+sub endproc($) # Append the close of a procedure to $code: the blr return and the .size directive
+{
+ my ($name) = @_; # same symbol name that was given to startproc()
+
+ $code.=<<___;
+ blr
+ .size ${name},.-${name}
+
+___
+}
+
+
+sub push_vrs($$) # Append code that allocates a stack frame and stores registers $min..$max into it
+{
+ my ($min, $max) = @_; # inclusive range of register numbers handed to stxv
+
+ my $count = $max - $min + 1; # frame is 16*($count+1) bytes; the backtick expression below is expanded by the eval substitution at the end of the script
+
+ $code.=<<___;
+ mr $savesp,$sp
+ stdu $sp,-16*`$count+1`($sp)
+
+___
+ for (my $i = $min; $i <= $max; $i++) {
+ my $mult = $max - $i + 1; # slot number below the old stack pointer: $max is stored at -16, $min deepest
+ $code.=<<___;
+ stxv $i,-16*$mult($savesp)
+___
+
+ }
+
+ $code.=<<___;
+
+___
+}
+
+sub pop_vrs($$) # Append code that reloads registers $min..$max and restores the stack pointer
+{
+ my ($min, $max) = @_; # slot offsets are computed exactly as in push_vrs(), so the same range must be passed
+
+ $code.=<<___;
+ ld $savesp,0($sp)
+___
+ for (my $i = $min; $i <= $max; $i++) {
+ my $mult = $max - $i + 1; # same slot numbering as push_vrs(): $max at -16 from the reloaded pointer
+ $code.=<<___;
+ lxv $i,-16*$mult($savesp)
+___
+ }
+
+ $code.=<<___;
+ mr $sp,$savesp
+
+___
+}
+
+sub load_vrs($$) # Append nine lxsd loads: element i of the register list is loaded from byte offset 8*i off $pointer
+{
+ my ($pointer, $reg_list) = @_; # $reg_list is an array reference; entries 0..8 are used
+
+ for (my $i = 0; $i <= 8; $i++) {
+ my $offset = $i * 8; # 8-byte stride between consecutive source values
+ $code.=<<___;
+ lxsd $reg_list->[$i],$offset($pointer)
+___
+ }
+
+ $code.=<<___;
+
+___
+}
+
+sub store_vrs($$) # Append nine stxv stores: element i of the register list is stored at byte offset 16*i off $pointer
+{
+ my ($pointer, $reg_list) = @_; # $reg_list is an array reference; entries 0..8 are used
+
+ for (my $i = 0; $i <= 8; $i++) {
+ my $offset = $i * 16; # 16-byte stride between consecutive destination slots
+ $code.=<<___;
+ stxv $reg_list->[$i],$offset($pointer)
+___
+ }
+
+ $code.=<<___;
+
+___
+}
+
+$code.=<<___;
+.text
+
+___
+
+{
+ # mul/square common
+ my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54");
+ my ($zero, $one) = ("r8", "r9");
+ my @out = map("v$_",(55..63));
+
+ {
+ #
+ # p521_felem_mul
+ #
+
+ my ($in1p, $in2p) = ("r4", "r5");
+ my @in1 = map("v$_",(45..53));
+ my @in2 = map("v$_",(35..43));
+
+ startproc("p521_felem_mul");
+
+ push_vrs(52, 63);
+
+ $code.=<<___;
+ vspltisw $vzero,0
+
+___
+
+ load_vrs($in1p, \@in1);
+ load_vrs($in2p, \@in2);
+
+ $code.=<<___;
+ vmsumudm $out[0],$in1[0],$in2[0],$vzero
+
+ xxpermdi $t1,$in1[0],$in1[1],0b00
+ xxpermdi $t2,$in2[1],$in2[0],0b00
+ vmsumudm $out[1],$t1,$t2,$vzero
+
+ xxpermdi $t2,$in2[2],$in2[1],0b00
+ vmsumudm $out[2],$t1,$t2,$vzero
+ vmsumudm $out[2],$in1[2],$in2[0],$out[2]
+
+ xxpermdi $t2,$in2[3],$in2[2],0b00
+ vmsumudm $out[3],$t1,$t2,$vzero
+ xxpermdi $t3,$in1[2],$in1[3],0b00
+ xxpermdi $t4,$in2[1],$in2[0],0b00
+ vmsumudm $out[3],$t3,$t4,$out[3]
+
+ xxpermdi $t2,$in2[4],$in2[3],0b00
+ vmsumudm $out[4],$t1,$t2,$vzero
+ xxpermdi $t4,$in2[2],$in2[1],0b00
+ vmsumudm $out[4],$t3,$t4,$out[4]
+ vmsumudm $out[4],$in1[4],$in2[0],$out[4]
+
+ xxpermdi $t2,$in2[5],$in2[4],0b00
+ vmsumudm $out[5],$t1,$t2,$vzero
+ xxpermdi $t4,$in2[3],$in2[2],0b00
+ vmsumudm $out[5],$t3,$t4,$out[5]
+
+ xxpermdi $t2,$in2[6],$in2[5],0b00
+ vmsumudm $out[6],$t1,$t2,$vzero
+ xxpermdi $t4,$in2[4],$in2[3],0b00
+ vmsumudm $out[6],$t3,$t4,$out[6]
+
+ xxpermdi $t2,$in2[7],$in2[6],0b00
+ vmsumudm $out[7],$t1,$t2,$vzero
+ xxpermdi $t4,$in2[5],$in2[4],0b00
+ vmsumudm $out[7],$t3,$t4,$out[7]
+
+ xxpermdi $t2,$in2[8],$in2[7],0b00
+ vmsumudm $out[8],$t1,$t2,$vzero
+ xxpermdi $t4,$in2[6],$in2[5],0b00
+ vmsumudm $out[8],$t3,$t4,$out[8]
+
+ xxpermdi $t1,$in1[4],$in1[5],0b00
+ xxpermdi $t2,$in2[1],$in2[0],0b00
+ vmsumudm $out[5],$t1,$t2,$out[5]
+
+ xxpermdi $t2,$in2[2],$in2[1],0b00
+ vmsumudm $out[6],$t1,$t2,$out[6]
+ vmsumudm $out[6],$in1[6],$in2[0],$out[6]
+
+ xxpermdi $t2,$in2[3],$in2[2],0b00
+ vmsumudm $out[7],$t1,$t2,$out[7]
+ xxpermdi $t3,$in1[6],$in1[7],0b00
+ xxpermdi $t4,$in2[1],$in2[0],0b00
+ vmsumudm $out[7],$t3,$t4,$out[7]
+
+ xxpermdi $t2,$in2[4],$in2[3],0b00
+ vmsumudm $out[8],$t1,$t2,$out[8]
+ xxpermdi $t4,$in2[2],$in2[1],0b00
+ vmsumudm $out[8],$t3,$t4,$out[8]
+ vmsumudm $out[8],$in1[8],$in2[0],$out[8]
+
+ li $zero,0
+ li $one,1
+ mtvsrdd $t1,$one,$zero
+___
+
+ for (my $i = 0; $i <= 8; $i++) {
+ $code.=<<___;
+ vsld $in2[$i],$in2[$i],$t1
+___
+ }
+
+ $code.=<<___;
+
+ vmsumudm $out[7],$in1[8],$in2[8],$out[7]
+
+ xxpermdi $t2,$in2[8],$in2[7],0b00
+ xxpermdi $t1,$in1[7],$in1[8],0b00
+ vmsumudm $out[6],$t1,$t2,$out[6]
+
+ xxpermdi $t1,$in1[6],$in1[7],0b00
+ vmsumudm $out[5],$t1,$t2,$out[5]
+ vmsumudm $out[5],$in1[8],$in2[6],$out[5]
+
+ xxpermdi $t1,$in1[5],$in1[6],0b00
+ vmsumudm $out[4],$t1,$t2,$out[4]
+ xxpermdi $t4,$in2[6],$in2[5],0b00
+ xxpermdi $t3,$in1[7],$in1[8],0b00
+ vmsumudm $out[4],$t3,$t4,$out[4]
+
+ xxpermdi $t1,$in1[4],$in1[5],0b00
+ vmsumudm $out[3],$t1,$t2,$out[3]
+ xxpermdi $t3,$in1[6],$in1[7],0b00
+ vmsumudm $out[3],$t3,$t4,$out[3]
+ vmsumudm $out[3],$in1[8],$in2[4],$out[3]
+
+ xxpermdi $t1,$in1[3],$in1[4],0b00
+ vmsumudm $out[2],$t1,$t2,$out[2]
+ xxpermdi $t3,$in1[5],$in1[6],0b00
+ vmsumudm $out[2],$t3,$t4,$out[2]
+
+ xxpermdi $t1,$in1[2],$in1[3],0b00
+ vmsumudm $out[1],$t1,$t2,$out[1]
+ xxpermdi $t3,$in1[4],$in1[5],0b00
+ vmsumudm $out[1],$t3,$t4,$out[1]
+
+ xxpermdi $t1,$in1[1],$in1[2],0b00
+ vmsumudm $out[0],$t1,$t2,$out[0]
+ xxpermdi $t3,$in1[3],$in1[4],0b00
+ vmsumudm $out[0],$t3,$t4,$out[0]
+
+ xxpermdi $t2,$in2[4],$in2[3],0b00
+ xxpermdi $t1,$in1[7],$in1[8],0b00
+ vmsumudm $out[2],$t1,$t2,$out[2]
+
+ xxpermdi $t1,$in1[6],$in1[7],0b00
+ vmsumudm $out[1],$t1,$t2,$out[1]
+ vmsumudm $out[1],$in1[8],$in2[2],$out[1]
+
+ xxpermdi $t1,$in1[5],$in1[6],0b00
+ vmsumudm $out[0],$t1,$t2,$out[0]
+ xxpermdi $t4,$in2[2],$in2[1],0b00
+ xxpermdi $t3,$in1[7],$in1[8],0b00
+ vmsumudm $out[0],$t3,$t4,$out[0]
+
+___
+
+ store_vrs($outp, \@out);
+
+ pop_vrs(52, 63);
+
+ endproc("p521_felem_mul");
+ }
+
+ {
+ #
+ # p521_felem_square
+ #
+
+ my ($inp) = ("r4");
+ my @in = map("v$_",(45..53));
+ my @inx2 = map("v$_",(35..43));
+
+ startproc("p521_felem_square");
+
+ push_vrs(52, 63);
+
+ $code.=<<___;
+ vspltisw $vzero,0
+
+___
+
+ load_vrs($inp, \@in);
+
+ $code.=<<___;
+ li $zero,0
+ li $one,1
+ mtvsrdd $t1,$one,$zero
+___
+
+ for (my $i = 0; $i <= 8; $i++) {
+ $code.=<<___;
+ vsld $inx2[$i],$in[$i],$t1
+___
+ }
+
+ $code.=<<___;
+ vmsumudm $out[0],$in[0],$in[0],$vzero
+
+ vmsumudm $out[1],$in[0],$inx2[1],$vzero
+
+ xxpermdi $t1,$in[0],$in[1],0b00
+ xxpermdi $t2,$inx2[2],$in[1],0b00
+ vmsumudm $out[2],$t1,$t2,$vzero
+
+ xxpermdi $t2,$inx2[3],$inx2[2],0b00
+ vmsumudm $out[3],$t1,$t2,$vzero
+
+ xxpermdi $t2,$inx2[4],$inx2[3],0b00
+ vmsumudm $out[4],$t1,$t2,$vzero
+ vmsumudm $out[4],$in[2],$in[2],$out[4]
+
+ xxpermdi $t2,$inx2[5],$inx2[4],0b00
+ vmsumudm $out[5],$t1,$t2,$vzero
+ vmsumudm $out[5],$in[2],$inx2[3],$out[5]
+
+ xxpermdi $t2,$inx2[6],$inx2[5],0b00
+ vmsumudm $out[6],$t1,$t2,$vzero
+ xxpermdi $t3,$in[2],$in[3],0b00
+ xxpermdi $t4,$inx2[4],$in[3],0b00
+ vmsumudm $out[6],$t3,$t4,$out[6]
+
+ xxpermdi $t2,$inx2[7],$inx2[6],0b00
+ vmsumudm $out[7],$t1,$t2,$vzero
+ xxpermdi $t4,$inx2[5],$inx2[4],0b00
+ vmsumudm $out[7],$t3,$t4,$out[7]
+
+ xxpermdi $t2,$inx2[8],$inx2[7],0b00
+ vmsumudm $out[8],$t1,$t2,$vzero
+ xxpermdi $t4,$inx2[6],$inx2[5],0b00
+ vmsumudm $out[8],$t3,$t4,$out[8]
+ vmsumudm $out[8],$in[4],$in[4],$out[8]
+
+ vmsumudm $out[1],$in[5],$inx2[5],$out[1]
+
+ vmsumudm $out[3],$in[6],$inx2[6],$out[3]
+
+ vmsumudm $out[5],$in[7],$inx2[7],$out[5]
+
+ vmsumudm $out[7],$in[8],$inx2[8],$out[7]
+
+ mtvsrdd $t1,$one,$zero
+___
+
+ for (my $i = 5; $i <= 8; $i++) {
+ $code.=<<___;
+ vsld $inx2[$i],$inx2[$i],$t1
+___
+ }
+
+ $code.=<<___;
+
+ vmsumudm $out[6],$in[7],$inx2[8],$out[6]
+
+ vmsumudm $out[5],$in[6],$inx2[8],$out[5]
+
+ xxpermdi $t2,$inx2[8],$inx2[7],0b00
+ xxpermdi $t1,$in[5],$in[6],0b00
+ vmsumudm $out[4],$t1,$t2,$out[4]
+
+ xxpermdi $t1,$in[4],$in[5],0b00
+ vmsumudm $out[3],$t1,$t2,$out[3]
+
+ xxpermdi $t1,$in[3],$in[4],0b00
+ vmsumudm $out[2],$t1,$t2,$out[2]
+ vmsumudm $out[2],$in[5],$inx2[6],$out[2]
+
+ xxpermdi $t1,$in[2],$in[3],0b00
+ vmsumudm $out[1],$t1,$t2,$out[1]
+ vmsumudm $out[1],$in[4],$inx2[6],$out[1]
+
+ xxpermdi $t1,$in[1],$in[2],0b00
+ vmsumudm $out[0],$t1,$t2,$out[0]
+ xxpermdi $t2,$inx2[6],$inx2[5],0b00
+ xxpermdi $t1,$in[3],$in[4],0b00
+ vmsumudm $out[0],$t1,$t2,$out[0]
+
+___
+
+ store_vrs($outp, \@out);
+
+ pop_vrs(52, 63);
+
+ endproc("p521_felem_square");
+ }
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT or die "error closing STDOUT: $!";
Index: openssl-1.1.1l/crypto/ec/ec_local.h
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/ec_local.h
+++ openssl-1.1.1l/crypto/ec/ec_local.h
@@ -499,6 +499,10 @@ int ec_GF2m_simple_field_div(const EC_GR
const BIGNUM *b, BN_CTX *);
#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+# ifdef B_ENDIAN
+# error "Can not enable ec_nistp_64_gcc_128 on big-endian systems"
+# endif
+
/* method functions in ecp_nistp224.c */
int ec_GFp_nistp224_group_init(EC_GROUP *group);
int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
Index: openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c
===================================================================
--- openssl-1.1.1l.orig/crypto/ec/curve448/arch_32/f_impl.c
+++ openssl-1.1.1l/crypto/ec/curve448/arch_32/f_impl.c
@@ -10,7 +10,7 @@
* Originally written by Mike Hamburg
*/
-#include "field.h"
+#include "../field.h"
void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
{
Index: openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c
===================================================================
--- /dev/null
+++ openssl-1.1.1l/crypto/ec/curve448/arch_64/f_impl.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2014 Cryptography Research, Inc.
+ *
+ * Licensed under the OpenSSL license (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ *
+ * Originally written by Mike Hamburg
+ */
+
+#include "../field.h"
+
+void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs)
+{ /* Multiplies the limb arrays of as and bs into cs; every stored limb is first masked to 56 bits. */
+ const uint64_t *a = as->limb, *b = bs->limb;
+ uint64_t *c = cs->limb;
+ uint128_t accum0 = 0, accum1 = 0, accum2;
+ uint64_t mask = (1ULL << 56) - 1; /* selects the low 56 bits of an accumulator */
+ uint64_t aa[4], bb[4], bbb[4];
+ unsigned int i, j;
+
+ for (i = 0; i < 4; i++) { /* precompute low-half + high-half limb sums used by the products below */
+ aa[i] = a[i] + a[i + 4];
+ bb[i] = b[i] + b[i + 4];
+ bbb[i] = bb[i] + b[i + 4];
+ }
+
+ for (i = 0; i < 4; i++) { /* each pass produces output limbs c[i] and c[i + 4] */
+ accum2 = 0;
+
+ for (j = 0; j <= i; j++) {
+ accum2 += widemul(a[j], b[i - j]);
+ accum1 += widemul(aa[j], bb[i - j]);
+ accum0 += widemul(a[j + 4], b[i - j + 4]);
+ }
+ for (; j < 4; j++) { /* j > i here: the unsigned i - j wraps, and adding 8 (or 4) brings the index back into range */
+ accum2 += widemul(a[j], b[i - j + 8]);
+ accum1 += widemul(aa[j], bbb[i - j + 4]);
+ accum0 += widemul(a[j + 4], bb[i - j + 4]);
+ }
+
+ accum1 -= accum2;
+ accum0 += accum2;
+
+ c[i] = ((uint64_t)(accum0)) & mask;
+ c[i + 4] = ((uint64_t)(accum1)) & mask;
+
+ accum0 >>= 56; /* keep only the carry for the next pass */
+ accum1 >>= 56;
+ }
+
+ accum0 += accum1; /* carries left after the last pass are folded back into limbs 4 and 0 ... */
+ accum0 += c[4];
+ accum1 += c[0];
+ c[4] = ((uint64_t)(accum0)) & mask;
+ c[0] = ((uint64_t)(accum1)) & mask;
+
+ accum0 >>= 56;
+ accum1 >>= 56;
+
+ c[5] += ((uint64_t)(accum0)); /* ... and what overflows from those is added, unmasked, to limbs 5 and 1 */
+ c[1] += ((uint64_t)(accum1));
+}
+
+void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b)
+{ /* Multiplies each of the eight limbs of as by the 32-bit word b and stores 56-bit limbs in cs. */
+ const uint64_t *a = as->limb;
+ uint64_t *c = cs->limb;
+ uint128_t accum0 = 0, accum4 = 0; /* running sums for limbs 0..3 and limbs 4..7 respectively */
+ uint64_t mask = (1ULL << 56) - 1; /* selects the low 56 bits of an accumulator */
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ accum0 += widemul(b, a[i]);
+ accum4 += widemul(b, a[i + 4]);
+ c[i] = accum0 & mask;
+ accum0 >>= 56; /* carry into the next limb of the low half */
+ c[i + 4] = accum4 & mask;
+ accum4 >>= 56; /* carry into the next limb of the high half */
+ }
+
+ accum0 += accum4 + c[4]; /* carry out of limb 3 and carry out of limb 7 both go into limb 4 */
+ c[4] = accum0 & mask;
+ c[5] += accum0 >> 56;
+
+ accum4 += c[0]; /* the carry out of limb 7 is also added into limb 0 */
+ c[0] = accum4 & mask;
+ c[1] += accum4 >> 56;
+}
+
+void gf_sqr(gf_s * RESTRICT cs, const gf as)
+{ /* Squares the limb array of as into cs; limbs are written in the order 3/7, 0/4, 1/5, 2/6, then carries are folded. */
+ const uint64_t *a = as->limb;
+ uint64_t *c = cs->limb;
+ uint128_t accum0 = 0, accum1 = 0, accum2;
+ uint64_t mask = (1ULL << 56) - 1; /* selects the low 56 bits of an accumulator */
+ uint64_t aa[4];
+ unsigned int i;
+
+ /* For some reason clang doesn't vectorize this without prompting? */
+ for (i = 0; i < 4; i++)
+ aa[i] = a[i] + a[i + 4];
+
+ accum2 = widemul(a[0], a[3]);
+ accum0 = widemul(aa[0], aa[3]);
+ accum1 = widemul(a[4], a[7]);
+
+ accum2 += widemul(a[1], a[2]);
+ accum0 += widemul(aa[1], aa[2]);
+ accum1 += widemul(a[5], a[6]);
+
+ accum0 -= accum2;
+ accum1 += accum2;
+
+ c[3] = ((uint64_t)(accum1)) << 1 & mask; /* parsed as (value << 1) & mask: each cross product was summed once, so it is doubled here */
+ c[7] = ((uint64_t)(accum0)) << 1 & mask;
+
+ accum0 >>= 55; /* shifting by 55 instead of 56 applies the same doubling to the carry */
+ accum1 >>= 55;
+
+ accum0 += widemul(2 * aa[1], aa[3]);
+ accum1 += widemul(2 * a[5], a[7]);
+ accum0 += widemul(aa[2], aa[2]);
+ accum1 += accum0;
+
+ accum0 -= widemul(2 * a[1], a[3]);
+ accum1 += widemul(a[6], a[6]);
+
+ accum2 = widemul(a[0], a[0]);
+ accum1 -= accum2;
+ accum0 += accum2;
+
+ accum0 -= widemul(a[2], a[2]);
+ accum1 += widemul(aa[0], aa[0]);
+ accum0 += widemul(a[4], a[4]);
+
+ c[0] = ((uint64_t)(accum0)) & mask;
+ c[4] = ((uint64_t)(accum1)) & mask;
+
+ accum0 >>= 56; /* carry into the limb 1 / limb 5 step */
+ accum1 >>= 56;
+
+ accum2 = widemul(2 * aa[2], aa[3]);
+ accum0 -= widemul(2 * a[2], a[3]);
+ accum1 += widemul(2 * a[6], a[7]);
+
+ accum1 += accum2;
+ accum0 += accum2;
+
+ accum2 = widemul(2 * a[0], a[1]);
+ accum1 += widemul(2 * aa[0], aa[1]);
+ accum0 += widemul(2 * a[4], a[5]);
+
+ accum1 -= accum2;
+ accum0 += accum2;
+
+ c[1] = ((uint64_t)(accum0)) & mask;
+ c[5] = ((uint64_t)(accum1)) & mask;
+
+ accum0 >>= 56; /* carry into the limb 2 / limb 6 step */
+ accum1 >>= 56;
+
+ accum2 = widemul(aa[3], aa[3]);
+ accum0 -= widemul(a[3], a[3]);
+ accum1 += widemul(a[7], a[7]);
+
+ accum1 += accum2;
+ accum0 += accum2;
+
+ accum2 = widemul(2 * a[0], a[2]);
+ accum1 += widemul(2 * aa[0], aa[2]);
+ accum0 += widemul(2 * a[4], a[6]);
+
+ accum2 += widemul(a[1], a[1]);
+ accum1 += widemul(aa[1], aa[1]);
+ accum0 += widemul(a[5], a[5]);
+
+ accum1 -= accum2;
+ accum0 += accum2;
+
+ c[2] = ((uint64_t)(accum0)) & mask;
+ c[6] = ((uint64_t)(accum1)) & mask;
+
+ accum0 >>= 56;
+ accum1 >>= 56;
+
+ accum0 += c[3]; /* add the carries to the limb 3 / limb 7 values stored at the start and re-mask them */
+ accum1 += c[7];
+ c[3] = ((uint64_t)(accum0)) & mask;
+ c[7] = ((uint64_t)(accum1)) & mask;
+
+ /* we could almost stop here, but it wouldn't be stable, so... */
+
+ accum0 >>= 56;
+ accum1 >>= 56;
+ c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1)); /* limb 4 receives both remaining carries, unmasked */
+ c[0] += ((uint64_t)(accum1)); /* limb 0 receives only the carry out of limb 7 */
+}
Index: openssl-1.1.1l/Configure
===================================================================
--- openssl-1.1.1l.orig/Configure
+++ openssl-1.1.1l/Configure
@@ -1476,6 +1476,20 @@ if (!$disabled{asm} && !$predefined_C{__
}
}
+# Check if __SIZEOF_INT128__ is defined by compiler: scan the output of "$cc -E -dM" on empty input for the macro name
+$config{use_int128} = 0;
+{
+ my $cc = $config{CROSS_COMPILE}.$config{CC};
+ open(PIPE, "$cc -E -dM - </dev/null 2>&1 |"); # NOTE(review): open() result is unchecked; when nothing matching is read use_int128 simply stays 0
+ while(<PIPE>) {
+ if (m/__SIZEOF_INT128__/) { # matches the name anywhere on a line, stderr included because of 2>&1
+ $config{use_int128} = 1;
+ last;
+ }
+ }
+ close(PIPE);
+}
+
# Deal with bn_ops ###################################################
$config{bn_ll} =0;