File grub2-x86-fast-sha256.patch of Package grub2

--- a/grub-core/kern/i386/pc/startup.S
+++ b/grub-core/kern/i386/pc/startup.S
@@ -110,6 +110,11 @@ LOCAL(cont):
 #endif
 	subl	%edi, %ecx
 
+	/* enable SSE (set CR4.OSFXSR and CR4.OSXSAVE) */
+	movl	%cr4, %eax
+	orl	$0x40200, %eax
+	movl	%eax,%cr4
+
 	/* clean out */
 	xorl	%eax, %eax
 	cld
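
(Not part of the patch.)  A minimal sketch of how the CR4 value written above decodes, assuming the usual Intel SDM bit assignments (bit 9 = OSFXSR, bit 18 = OSXSAVE); SSE state has to be enabled this early because the SHA-256 fast path added to pbkdf2.c further down uses the XMM registers:

#define CR4_OSFXSR   (1u << 9)    /* FXSAVE/FXRSTOR and SSE instructions */
#define CR4_OSXSAVE  (1u << 18)   /* XSAVE/XSETBV/XGETBV                 */

_Static_assert((CR4_OSFXSR | CR4_OSXSAVE) == 0x40200,
               "matches the 'orl $0x40200, %eax' in startup.S above");
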
--- a/grub-core/kern/i386/realmode.S
+++ b/grub-core/kern/i386/realmode.S
@@ -47,7 +47,21 @@
  */
 
 protstack:
-	.long	GRUB_MEMORY_MACHINE_PROT_STACK
+	/*
+	 * Note: Subtract 4 because real_to_prot/prot_to_real assume they are *called*.
+	 *       They overwrite the address on the top of the stack and then return.
+	 *
+	 *       Consequently, for the first real_to_prot call, 'protstack' must look as if
+	 *       real_to_prot had already been called - so lower the stack value by 4.
+	 *
+	 *       Otherwise, after the first real_to_prot call, the stack would end up at the
+	 *       invalid address GRUB_MEMORY_MACHINE_PROT_STACK + 4.
+	 *
+	 *       In addition, this would break gcc's stack frame alignment assumptions
+	 *       (it expects stack frames to be 16-byte aligned), and all local
+	 *       variables larger than 4 bytes would be misaligned.
+	 */
+	.long	GRUB_MEMORY_MACHINE_PROT_STACK - 4
 
 	.macro PROT_TO_REAL
 	call	prot_to_real
@@ -172,18 +186,13 @@ protcseg:
 	movw	%ax, %gs
 	movw	%ax, %ss
 
-	/* put the return address in a known safe location */
+	/* get the return address */
 	movl	(%esp), %eax
-	movl	%eax, GRUB_MEMORY_MACHINE_REAL_STACK
-
-	/* get protected mode stack */
-	movl	protstack, %eax
-	movl	%eax, %esp
-	movl	%eax, %ebp
 
-	/* get return address onto the right stack */
-	movl	GRUB_MEMORY_MACHINE_REAL_STACK, %eax
+	/* set up protected mode stack */
+	movl	protstack, %esp
 	movl	%eax, (%esp)
+	movl	%esp, %ebp
 
 	/* zero %eax */
 	xorl	%eax, %eax
@@ -222,17 +231,15 @@ prot_to_real:
 	lidt    LOCAL(realidt)
 
 	/* save the protected mode stack */
-	movl	%esp, %eax
-	movl	%eax, protstack
+	movl	%esp, protstack
 
 	/* get the return address */
 	movl	(%esp), %eax
-	movl	%eax, GRUB_MEMORY_MACHINE_REAL_STACK
 
-	/* set up new stack */
-	movl	$GRUB_MEMORY_MACHINE_REAL_STACK, %eax
-	movl	%eax, %esp
-	movl	%eax, %ebp
+	/* set up real mode stack */
+	movl	$GRUB_MEMORY_MACHINE_REAL_STACK, %esp
+	pushl	%eax
+	movl	%esp, %ebp
 
 	/* set up segment limits */
 	movw	$GRUB_MEMORY_MACHINE_PSEUDO_REAL_DSEG, %ax
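
(Not part of the patch.)  A minimal host-side sketch of the stack bookkeeping described in the comment above.  It assumes GRUB_MEMORY_MACHINE_PROT_STACK (called 'base' here, with an arbitrary 16-byte-aligned stand-in value) and that the code running between the two mode switches is stack-balanced:

#include <stdio.h>
#include <assert.h>

int main(void)
{
  const unsigned base = 0x9000;     /* stands in for GRUB_MEMORY_MACHINE_PROT_STACK */
  unsigned protstack = base - 4;    /* initial value after the patch */
  unsigned esp;

  /* real_to_prot: esp := protstack, the return address is stored at (%esp),
     then 'ret' pops it */
  esp = protstack;
  esp += 4;
  assert(esp == base);              /* stack top stays inside the reserved area */
  assert(esp % 16 == 0);            /* a following 'call' gives C code the
                                       16-byte-aligned frame gcc expects */

  /* prot_to_real: entered via 'call' (esp drops by 4); that esp value is
     what gets stored back into protstack */
  esp -= 4;
  protstack = esp;
  assert(protstack == base - 4);    /* invariant holds for the next switch */

  /* with the unpatched initial value of 'base', the same trace ends with the
     stack top at base + 4 and the alignment off by 4 */
  printf("protstack = %#x\n", protstack);
  return 0;
}
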
--- a/grub-core/kern/main.c
+++ b/grub-core/kern/main.c
@@ -318,6 +318,8 @@ grub_main (void)
 
   grub_boot_time ("After machine init.");
 
+  STACK_ALIGN_CHECK(16);
+
   /* This breaks flicker-free boot on EFI systems, so disable it there. */
 #ifndef GRUB_MACHINE_EFI
   /* Hello.  */
--- a/grub-core/lib/pbkdf2.c
+++ b/grub-core/lib/pbkdf2.c
@@ -18,9 +18,12 @@
 /* Written by Simon Josefsson.  */
 /* Imported from gnulib.  */
 
+#pragma GCC diagnostic ignored "-Wvla"
+
 #include <grub/crypto.h>
 #include <grub/mm.h>
 #include <grub/misc.h>
+#include <grub/time.h>
 #include <grub/dl.h>
 
 GRUB_MOD_LICENSE ("GPLv2+");
@@ -32,6 +35,13 @@ GRUB_MOD_LICENSE ("GPLv2+");
    must have room for at least DKLEN octets.  The output buffer will
    be filled with the derived data.  */
 
+void
+grub_crypto_pbkdf2_sha (
+		    const grub_uint8_t *P, grub_size_t Plen,
+		    const grub_uint8_t *S, grub_size_t Slen,
+		    unsigned int c,
+		    grub_uint8_t *DK, grub_size_t dkLen);
+
 gcry_err_code_t
 grub_crypto_pbkdf2 (const struct gcry_md_spec *md,
 		    const grub_uint8_t *P, grub_size_t Plen,
@@ -63,6 +73,13 @@ grub_crypto_pbkdf2 (const struct gcry_md_spec *md,
   if (dkLen > 4294967295U)
     return GPG_ERR_INV_ARG;
 
+  if (!grub_strcasecmp (md->name, "SHA256"))
+    {
+      grub_crypto_pbkdf2_sha (P, Plen, S, Slen, c, DK, dkLen);
+
+      return GPG_ERR_NO_ERROR;
+    }
+
   l = ((dkLen - 1) / hLen) + 1;
   r = dkLen - (l - 1) * hLen;
 
@@ -107,3 +124,636 @@ grub_crypto_pbkdf2 (const struct gcry_md_spec *md,
 
   return GPG_ERR_NO_ERROR;
 }
+
+
+typedef __UINT8_TYPE__ uint8_t;
+typedef __UINT32_TYPE__ uint32_t;
+typedef __UINT64_TYPE__ uint64_t;
+
+typedef enum { f_default = 0, f_normal, f_slow, f_fast } func_t;
+
+typedef struct {
+  uint8_t *buf;
+  unsigned len;
+} data_t;
+
+#define SHA_STATE_SIZE	8
+#define SHA_INITIAL_STATE { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }
+#define SHA_HASH_SIZE	(SHA_STATE_SIZE * 4)
+#define SHA_BLOCK_SIZE	64
+#define SHA_LEN_SIZE	8
+
+#define MEM_COPY(dst, src, n) do { for(unsigned u = 0; u < (n); u++) { (dst)[u] = (src)[u]; } } while(0)
+#define MEM_XOR(dst, src, n) do { for(unsigned u = 0; u < (n); u++) { (dst)[u] ^= (src)[u]; } } while(0)
+#define MEM_XOR_CONST(dst, val, n) do { for(unsigned u = 0; u < (n); u++) { (dst)[u] ^= val; } } while(0)
+
+typedef struct {
+  uint32_t data[SHA_STATE_SIZE];
+} sha_state_t;
+
+typedef struct {
+  sha_state_t state;
+  uint64_t len;
+  struct {
+    uint8_t buf[SHA_BLOCK_SIZE];
+    unsigned len;
+  } last;
+} sha_t;
+
+typedef struct {
+  uint8_t blk[SHA_BLOCK_SIZE];
+  sha_state_t state1, state2;
+} sha_hmac_state_t;
+
+static unsigned cpu_has_sha(void)
+{
+#if defined(__i386__) || defined (__x86_64__)
+  uint32_t a, b, c, d;
+
+  asm volatile (
+    "cpuid"
+    : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
+    : "a" (7), "c" (0)
+    :
+  );
+
+  return (b >> 29) & 1;	/* CPUID.(EAX=7,ECX=0):EBX bit 29 = SHA extensions */
+#else
+  return 0;
+#endif
+}
+
+uint32_t get_uint32_be(uint8_t *buf);
+uint64_t get_uint64_be(uint8_t *buf);
+void put_uint32_be(uint8_t *buf, uint32_t val);
+void put_uint64_be(uint8_t *buf, uint64_t val);
+
+void sha_pbkdf2(data_t *key, data_t *password, data_t *salt, unsigned iterations);
+void sha_hmac_pbkdf2_prep(sha_hmac_state_t *hmac_state, data_t *password);
+void sha_hmac_pbkdf2_step(sha_hmac_state_t *hmac_state);
+void sha_hmac(uint8_t sum[SHA_HASH_SIZE], data_t *data, data_t *password);
+void sha_sum(uint8_t sum[SHA_HASH_SIZE], data_t *data);
+void sha_sum2(uint8_t sum[SHA_HASH_SIZE], data_t *data1, data_t *data2);
+
+void sha_init(sha_t *sha);
+void sha_finish(sha_t *sha);
+void sha_set_sum(sha_t *sha, uint8_t sum[SHA_HASH_SIZE]);
+void sha_process_data(sha_t *sha, data_t *data);
+
+void sha_process_block(sha_t *sha, uint8_t *buf);
+
+#if defined(__i386__) || defined (__x86_64__)
+void sha_process_block_fast(sha_t *sha, uint8_t *buffer);
+#endif
+void sha_process_block_slow(sha_t *sha, uint8_t *buffer);
+
+void
+grub_crypto_pbkdf2_sha (
+		    const grub_uint8_t *P, grub_size_t Plen,
+		    const grub_uint8_t *S, grub_size_t Slen,
+		    unsigned int c,
+		    grub_uint8_t *DK, grub_size_t dkLen)
+{
+  sha_pbkdf2(
+    &(data_t) { .buf = DK, .len = dkLen },
+    &(data_t) { .buf = (uint8_t *) P, .len = Plen },
+    &(data_t) { .buf = (uint8_t *) S, .len = Slen },
+    c
+  );
+}
+
+static func_t sha_func = f_default;
+
+uint32_t get_uint32_be(uint8_t *buf)
+{
+  return ((uint32_t) buf[0] << 24) + ((uint32_t) buf[1] << 16) + ((uint32_t) buf[2] << 8) + buf[3];
+}
+
+uint64_t get_uint64_be(uint8_t *buf)
+{
+  return ((uint64_t) get_uint32_be(buf) << 32) + get_uint32_be(buf + 4);
+}
+
+void put_uint32_be(uint8_t *buf, uint32_t val)
+{
+  buf[3] = val;
+  buf[2] = val >> 8;
+  buf[1] = val >> 16;
+  buf[0] = val >> 24;
+}
+
+void put_uint64_be(uint8_t *buf, uint64_t val)
+{
+  put_uint32_be(buf + 4, val);
+  put_uint32_be(buf, val >> 32);
+}
+
+void sha_pbkdf2(data_t *key, data_t *password, data_t *salt, unsigned iterations)
+{
+  if(!iterations || !key->len) return;
+
+  unsigned hash_len = SHA_HASH_SIZE;
+  unsigned blocks = (key->len - 1) / hash_len + 1;
+  unsigned r = key->len - (blocks - 1) * hash_len;
+
+  uint8_t __attribute__((aligned(8))) salt_buf[salt->len + 4];
+
+  MEM_COPY(salt_buf, salt->buf, salt->len);
+
+  sha_hmac_state_t __attribute__((aligned(16))) hmac_state;
+
+  sha_hmac_pbkdf2_prep(&hmac_state, password);
+
+  for(unsigned block = 0; block < blocks; block++) {
+    uint8_t __attribute__((aligned(8))) T[hash_len];
+
+    put_uint32_be(salt_buf + salt->len, block + 1);
+
+    sha_hmac(hmac_state.blk, &(data_t) { .buf = salt_buf, .len = salt->len + 4 }, password);
+
+    MEM_COPY(T, hmac_state.blk, hash_len);
+
+    for(unsigned cnt = 1; cnt < iterations; cnt++) {
+      sha_hmac_pbkdf2_step(&hmac_state);
+
+      MEM_XOR(T, hmac_state.blk, hash_len);
+    }
+
+    unsigned count = block == blocks - 1 ? r : hash_len;
+    unsigned ofs = block * hash_len;
+
+    MEM_COPY(key->buf + ofs, T, count);
+  }
+}
+
+void sha_hmac_pbkdf2_prep(sha_hmac_state_t *hmac_state, data_t *password)
+{
+  *hmac_state = (sha_hmac_state_t) { };
+  data_t blk_data = { .buf = hmac_state->blk, .len = sizeof hmac_state->blk };
+
+  if(password->len > sizeof hmac_state->blk) {
+    sha_sum(hmac_state->blk, password);
+  }
+  else {
+    MEM_COPY(hmac_state->blk, password->buf, password->len);
+  }
+
+  MEM_XOR_CONST(hmac_state->blk, 0x36, sizeof hmac_state->blk);
+
+  sha_t __attribute__ ((aligned(16))) sha;
+
+  sha_init(&sha);
+  sha_process_data(&sha, &blk_data);
+
+  hmac_state->state1 = sha.state;
+
+  MEM_XOR_CONST(hmac_state->blk, 0x36 ^ 0x5c, sizeof hmac_state->blk);
+
+  sha_init(&sha);
+  sha_process_data(&sha, &blk_data);
+
+  hmac_state->state2 = sha.state;
+
+  blk_data.len = SHA_HASH_SIZE;
+
+  sha_process_data(&sha, &blk_data);
+  sha_finish(&sha);
+
+  MEM_COPY(hmac_state->blk, sha.last.buf, sizeof hmac_state->blk);
+}
+
+void sha_hmac_pbkdf2_step(sha_hmac_state_t *hmac_state)
+{
+  sha_t __attribute__ ((aligned(16))) sha;
+
+  sha.state = hmac_state->state1;
+
+  sha_process_block(&sha, hmac_state->blk);
+
+  sha_set_sum(&sha, hmac_state->blk);
+
+  sha.state = hmac_state->state2;
+
+  sha_process_block(&sha, hmac_state->blk);
+
+  sha_set_sum(&sha, hmac_state->blk);
+}
+
+void sha_hmac(uint8_t sum[SHA_HASH_SIZE], data_t *data, data_t *password)
+{
+  uint8_t __attribute__((aligned(16))) blk[SHA_BLOCK_SIZE] = { };
+  data_t blk_data = { .buf = blk, .len = sizeof blk };
+
+  if(password->len > sizeof blk) {
+    sha_sum(blk, password);
+  }
+  else {
+    MEM_COPY(blk, password->buf, password->len);
+  }
+
+  MEM_XOR_CONST(blk, 0x36, sizeof blk);
+
+  uint8_t __attribute__((aligned(16))) sum1[SHA_HASH_SIZE];
+
+  sha_sum2(sum1, &blk_data, data);
+
+  MEM_XOR_CONST(blk, 0x36 ^ 0x5c, sizeof blk);
+
+  sha_sum2(sum, &blk_data, &(data_t) { .buf = sum1, .len = sizeof sum1 });
+}
+
+void sha_sum(uint8_t sum[SHA_HASH_SIZE], data_t *data)
+{
+  sha_t __attribute__((aligned(16))) sha;
+
+  sha_init(&sha);
+  sha_process_data(&sha, data);
+  sha_finish(&sha);
+  sha_set_sum(&sha, sum);
+}
+
+void sha_sum2(uint8_t sum[SHA_HASH_SIZE], data_t *data1, data_t *data2)
+{
+  sha_t __attribute__((aligned(16))) sha;
+
+  sha_init(&sha);
+  sha_process_data(&sha, data1);
+  sha_process_data(&sha, data2);
+  sha_finish(&sha);
+  sha_set_sum(&sha, sum);
+}
+
+void __attribute__((noinline)) sha_init(sha_t *sha)
+{
+  sha->state = (sha_state_t) { .data = SHA_INITIAL_STATE };
+
+  sha->len = 0;
+  sha->last.len = 0;
+
+  if(sha_func == f_default) {
+    sha_func = cpu_has_sha() ? f_fast : f_normal;
+  }
+}
+
+void __attribute__((noinline)) sha_finish(sha_t *sha)
+{
+  unsigned len = sha->last.len;
+
+  sha->len += len;
+
+  // add '1' bit
+  sha->last.buf[len++] = 0x80;
+
+  // len is <= SHA_BLOCK_SIZE
+
+  if(len > SHA_BLOCK_SIZE - SHA_LEN_SIZE) {
+    for(unsigned u = len; u < SHA_BLOCK_SIZE; u++) { sha->last.buf[u] = 0; }
+    sha_process_block(sha, sha->last.buf);
+    len = 0;
+  }
+
+  for(unsigned u = len; u < SHA_BLOCK_SIZE - SHA_LEN_SIZE; u++) { sha->last.buf[u] = 0; }
+
+  // add length in bits
+  put_uint64_be(&sha->last.buf[SHA_BLOCK_SIZE - SHA_LEN_SIZE], sha->len << 3);
+
+  sha_process_block(sha, sha->last.buf);
+}
+
+void __attribute__((noinline)) sha_set_sum(sha_t *sha, uint8_t sum[SHA_HASH_SIZE])
+{
+  for(unsigned u = 0; u < sizeof sha->state.data / sizeof *sha->state.data ; u++) {
+    put_uint32_be(&sum[sizeof (uint32_t) * u], sha->state.data[u]);
+  }
+}
+
+/*
+ * FIXME: only the final call may pass data that is not a multiple of SHA_BLOCK_SIZE; intermediate calls must pass whole blocks
+ */
+void __attribute__((noinline)) sha_process_data(sha_t *sha, data_t *data)
+{
+  unsigned len = data->len;
+  unsigned pos = 0;
+
+  while(len >= SHA_BLOCK_SIZE) {
+    sha_process_block(sha, data->buf + pos);
+    sha->len += SHA_BLOCK_SIZE;
+    pos += SHA_BLOCK_SIZE;
+    len -= SHA_BLOCK_SIZE;
+  }
+
+  if(len) {
+    uint8_t *src = &data->buf[pos];
+    MEM_COPY(sha->last.buf, src, len);
+    sha->last.len = len;
+  }
+}
+
+void sha_process_block(sha_t *sha, uint8_t *buffer)
+{
+  switch(sha_func) {
+    case f_default:
+    case f_normal:
+    case f_slow:
+      sha_process_block_slow(sha, buffer);
+      break;
+#if (defined(__i386__) || defined (__x86_64__))
+    case f_fast:
+      sha_process_block_fast(sha, buffer);
+      break;
+#else
+    default:
+      sha_process_block_slow(sha, buffer);
+#endif
+  }
+}
+
+#if (defined(__i386__) || defined (__x86_64__))
+void sha_process_block_fast(sha_t *sha, uint8_t *buffer)
+{
+  static const uint32_t __attribute__ ((aligned (16))) sha256_k[64] = {
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+  };
+
+  static const uint32_t __attribute__ ((aligned (16))) shuffle_be[4] = {
+    0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+  };
+
+  // output operand for the asm below; holds the address of sha256_k
+  void *tmp;
+
+  // code is generated; taken from sha256_inline.S
+  asm volatile (
+    "lea %4, %0\n"
+    "movdqu %3,%%xmm5\n"
+
+    "pshufd $0x1b,(%1),%%xmm6\n"
+    "pshufd $0xb1,0x10(%1),%%xmm7\n"
+    "movdqa %%xmm6,%%xmm0\n"
+    "pblendw $0xf,%%xmm7,%%xmm6\n"
+    "pblendw $0xf0,%%xmm7,%%xmm0\n"
+    "pshufd $0x4e,%%xmm0,%%xmm7\n"
+    "movdqu (%2),%%xmm0\n"
+    "pshufb %%xmm5,%%xmm0\n"
+    "movdqa %%xmm0,%%xmm1\n"
+    "paddd  (%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "movdqu 0x10(%2),%%xmm0\n"
+    "pshufb %%xmm5,%%xmm0\n"
+    "movdqa %%xmm0,%%xmm2\n"
+    "paddd  0x10(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm2,%%xmm1\n"
+    "movdqu 0x20(%2),%%xmm0\n"
+    "pshufb %%xmm5,%%xmm0\n"
+    "movdqa %%xmm0,%%xmm3\n"
+    "paddd  0x20(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm3,%%xmm2\n"
+    "movdqu 0x30(%2),%%xmm0\n"
+    "pshufb %%xmm5,%%xmm0\n"
+    "movdqa %%xmm0,%%xmm4\n"
+    "paddd  0x30(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm4,%%xmm5\n"
+    "palignr $0x4,%%xmm3,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm1\n"
+    "sha256msg2 %%xmm4,%%xmm1\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm4,%%xmm3\n"
+    "movdqa %%xmm1,%%xmm0\n"
+    "paddd  0x40(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm1,%%xmm5\n"
+    "palignr $0x4,%%xmm4,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm2\n"
+    "sha256msg2 %%xmm1,%%xmm2\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm1,%%xmm4\n"
+    "movdqa %%xmm2,%%xmm0\n"
+    "paddd  0x50(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm2,%%xmm5\n"
+    "palignr $0x4,%%xmm1,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm3\n"
+    "sha256msg2 %%xmm2,%%xmm3\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm2,%%xmm1\n"
+    "movdqa %%xmm3,%%xmm0\n"
+    "paddd  0x60(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm3,%%xmm5\n"
+    "palignr $0x4,%%xmm2,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm4\n"
+    "sha256msg2 %%xmm3,%%xmm4\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm3,%%xmm2\n"
+    "movdqa %%xmm4,%%xmm0\n"
+    "paddd  0x70(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm4,%%xmm5\n"
+    "palignr $0x4,%%xmm3,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm1\n"
+    "sha256msg2 %%xmm4,%%xmm1\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm4,%%xmm3\n"
+    "movdqa %%xmm1,%%xmm0\n"
+    "paddd  0x80(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm1,%%xmm5\n"
+    "palignr $0x4,%%xmm4,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm2\n"
+    "sha256msg2 %%xmm1,%%xmm2\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm1,%%xmm4\n"
+    "movdqa %%xmm2,%%xmm0\n"
+    "paddd  0x90(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm2,%%xmm5\n"
+    "palignr $0x4,%%xmm1,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm3\n"
+    "sha256msg2 %%xmm2,%%xmm3\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm2,%%xmm1\n"
+    "movdqa %%xmm3,%%xmm0\n"
+    "paddd  0xa0(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm3,%%xmm5\n"
+    "palignr $0x4,%%xmm2,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm4\n"
+    "sha256msg2 %%xmm3,%%xmm4\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm3,%%xmm2\n"
+    "movdqa %%xmm4,%%xmm0\n"
+    "paddd  0xb0(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm4,%%xmm5\n"
+    "palignr $0x4,%%xmm3,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm1\n"
+    "sha256msg2 %%xmm4,%%xmm1\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm4,%%xmm3\n"
+    "movdqa %%xmm1,%%xmm0\n"
+    "paddd  0xc0(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm1,%%xmm5\n"
+    "palignr $0x4,%%xmm4,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm2\n"
+    "sha256msg2 %%xmm1,%%xmm2\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "sha256msg1 %%xmm1,%%xmm4\n"
+    "movdqa %%xmm2,%%xmm0\n"
+    "paddd  0xd0(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm2,%%xmm5\n"
+    "palignr $0x4,%%xmm1,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm3\n"
+    "sha256msg2 %%xmm2,%%xmm3\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "movdqa %%xmm3,%%xmm0\n"
+    "paddd  0xe0(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "movdqa %%xmm3,%%xmm5\n"
+    "palignr $0x4,%%xmm2,%%xmm5\n"
+    "paddd  %%xmm5,%%xmm4\n"
+    "sha256msg2 %%xmm3,%%xmm4\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "movdqa %%xmm4,%%xmm0\n"
+    "paddd  0xf0(%0),%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm6,%%xmm7\n"
+    "pshufd $0xe,%%xmm0,%%xmm0\n"
+    "sha256rnds2 %%xmm0,%%xmm7,%%xmm6\n"
+    "pshufd $0xb1,%%xmm7,%%xmm0\n"
+    "pshufd $0x1b,%%xmm6,%%xmm6\n"
+    "movdqa %%xmm0,%%xmm7\n"
+    "pblendw $0xf0,%%xmm6,%%xmm7\n"
+    "pblendw $0xf0,%%xmm0,%%xmm6\n"
+    "pshufd $0x4e,%%xmm7,%%xmm7\n"
+    "movdqu (%1),%%xmm0\n"
+    "paddd  %%xmm6,%%xmm0\n"
+    "movdqu %%xmm0,(%1)\n"
+    "movdqu 0x10(%1),%%xmm0\n"
+    "paddd  %%xmm7,%%xmm0\n"
+    "movdqu %%xmm0,0x10(%1)\n"
+
+    : "=&r" (tmp)
+    : "r" (sha), "r" (buffer), "m" (shuffle_be), "m" (sha256_k)
+    : "memory"
+  );
+}
+#endif
+
+static const uint32_t sha256_round_constants[64] = {
+  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define K(I) sha256_round_constants[I]
+
+#define F2(A, B, C) ((A & B) | (C & (A | B)))
+#define F1(E, F, G) (G ^ (E & (F ^ G)))
+
+#define ROTL(x, n) (((x) << (n)) + ((x) >> (32 - (n))))
+
+#define SS0(x) (ROTL(x, 30) ^ ROTL(x, 19) ^ ROTL(x, 10))
+#define SS1(x) (ROTL(x, 26) ^ ROTL(x, 21) ^ ROTL(x,  7))
+#define  S0(x) (ROTL(x, 25) ^ ROTL(x, 14) ^ (x >>  3))
+#define  S1(x) (ROTL(x, 15) ^ ROTL(x, 13) ^ (x >> 10))
+
+void sha_process_block_slow(sha_t *sha, uint8_t *buffer)
+{
+  uint32_t x[16];
+  uint32_t a = sha->state.data[0];
+  uint32_t b = sha->state.data[1];
+  uint32_t c = sha->state.data[2];
+  uint32_t d = sha->state.data[3];
+  uint32_t e = sha->state.data[4];
+  uint32_t f = sha->state.data[5];
+  uint32_t g = sha->state.data[6];
+  uint32_t h = sha->state.data[7];
+
+  for(unsigned u = 0; u < 16; u++) {
+    x[u] = get_uint32_be(buffer);
+    buffer += 4;
+  }
+
+  for(unsigned t = 0; t < 64; t++) {
+    uint32_t w;
+
+    if(t < 16) {
+      w = x[t];
+    }
+    else {
+      w = S1(x[(t - 2) & 15]) + x[(t - 7) & 15] + S0(x[(t - 15) & 15]) + x[t & 15];
+      x[t & 15] = w;
+    }
+
+    uint32_t t1 = h + SS1(e) + F1(e, f, g) + K(t) + w;
+    uint32_t t2 = SS0(a) + F2(a, b, c);
+
+    h = g;
+    g = f;
+    f = e;
+    e = d + t1;
+    d = c;
+    c = b;
+    b = a;
+    a = t1 + t2;
+  }
+
+  sha->state.data[0] += a;
+  sha->state.data[1] += b;
+  sha->state.data[2] += c;
+  sha->state.data[3] += d;
+  sha->state.data[4] += e;
+  sha->state.data[5] += f;
+  sha->state.data[6] += g;
+  sha->state.data[7] += h;
+}
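
(Not part of the patch.)  A minimal sketch of how the SHA-256 core above could be sanity-checked on the build host, assuming the sha_* helpers are extracted into a standalone test file with the grub headers stubbed out; it uses the well-known FIPS 180 test digest of "abc":

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* matching the declarations in the patch above */
typedef struct { uint8_t *buf; unsigned len; } data_t;
void sha_sum(uint8_t sum[32], data_t *data);

int main(void)
{
  static const uint8_t abc_digest[32] = {
    0xba,0x78,0x16,0xbf,0x8f,0x01,0xcf,0xea,0x41,0x41,0x40,0xde,0x5d,0xae,0x22,0x23,
    0xb0,0x03,0x61,0xa3,0x96,0x17,0x7a,0x9c,0xb4,0x10,0xff,0x61,0xf2,0x00,0x15,0xad
  };
  uint8_t sum[32];
  data_t msg = { .buf = (uint8_t *) "abc", .len = 3 };

  sha_sum(sum, &msg);
  puts(memcmp(sum, abc_digest, sizeof sum) == 0 ? "SHA-256 OK" : "SHA-256 MISMATCH");
  return 0;
}
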
--- a/include/grub/misc.h
+++ b/include/grub/misc.h
@@ -35,6 +35,20 @@
 #define ARRAY_SIZE(array) (sizeof (array) / sizeof (array[0]))
 #define COMPILE_TIME_ASSERT(cond) switch (0) { case 1: case !(cond): ; }
 
+/*
+ * gcc would optimize away the 'if' branch because it 'knows' that
+ * 'x' is aligned, so the 'if' condition must be false.
+ * Passing the pointer through a volatile variable prevents this.
+ */
+#define STACK_ALIGN_CHECK(N) \
+do { \
+  int x[1] __attribute__((aligned((N)))); \
+  volatile long l = (long) x; \
+  if((l & ((N) - 1))) { \
+    grub_fatal("%s: misaligned stack: %p\n", __FUNCTION__, x); \
+  } \
+} while(0)
+
 #define grub_dprintf(condition, ...) grub_real_dprintf(GRUB_FILE, __LINE__, condition, __VA_ARGS__)
 
 void *EXPORT_FUNC(grub_memmove) (void *dest, const void *src, grub_size_t n);
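
(Not part of the patch.)  To illustrate the comment above, a hypothetical naive variant without the volatile indirection; gcc folds the test to a constant and drops the grub_fatal() call entirely, even when the run-time stack is in fact misaligned:

#define STACK_ALIGN_CHECK_NAIVE(N) \
do { \
  int x[1] __attribute__((aligned((N)))); \
  /* gcc assumes &x is N-aligned, so this condition is constant-folded to 0 */ \
  if (((long) x & ((N) - 1))) \
    grub_fatal("%s: misaligned stack: %p\n", __FUNCTION__, x); \
} while (0)
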