File VIA_padlock_support_on_64systems.patch of Package openssl
diff -rNU 30 ../openssl-1.0.1n-o/engines/e_padlock.c ./engines/e_padlock.c
--- ../openssl-1.0.1n-o/engines/e_padlock.c 2015-06-11 15:01:06.000000000 +0200
+++ ./engines/e_padlock.c 2015-06-12 04:30:50.000000000 +0200
@@ -74,61 +74,64 @@
# include <openssl/aes.h>
#endif
#include <openssl/rand.h>
#include <openssl/err.h>
#ifndef OPENSSL_NO_HW
# ifndef OPENSSL_NO_HW_PADLOCK
/* Attempt to have a single source for both 0.9.7 and 0.9.8 :-) */
# if (OPENSSL_VERSION_NUMBER >= 0x00908000L)
# ifndef OPENSSL_NO_DYNAMIC_ENGINE
# define DYNAMIC_ENGINE
# endif
# elif (OPENSSL_VERSION_NUMBER >= 0x00907000L)
# ifdef ENGINE_DYNAMIC_SUPPORT
# define DYNAMIC_ENGINE
# endif
# else
# error "Only OpenSSL >= 0.9.7 is supported"
# endif
/*
* VIA PadLock AES is available *ONLY* on some x86 CPUs. Not only does it
* not exist elsewhere, it cannot even be compiled on other platforms!
*
* In addition, because of the heavy use of inline assembler, compiler choice
* is limited to GCC and Microsoft C.
*/
# undef COMPILE_HW_PADLOCK
# if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
-# if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
+# if (defined(__GNUC__) && __GNUC__>=2 && \
+ (defined(__i386__) || defined(__i386) || \
+ defined(__x86_64__) || defined(__x86_64)) \
+ ) || \
(defined(_MSC_VER) && defined(_M_IX86))
# define COMPILE_HW_PADLOCK
# endif
# endif
# ifdef OPENSSL_NO_DYNAMIC_ENGINE
# ifdef COMPILE_HW_PADLOCK
static ENGINE *ENGINE_padlock(void);
# endif
void ENGINE_load_padlock(void)
{
/* On non-x86 CPUs it just returns. */
# ifdef COMPILE_HW_PADLOCK
ENGINE *toadd = ENGINE_padlock();
if (!toadd)
return;
ENGINE_add(toadd);
ENGINE_free(toadd);
ERR_clear_error();
# endif
}
# endif
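
For context, a minimal sketch (not part of the patch) of how an application might select this engine once it has been registered; it uses the standard ENGINE API and assumes a build with the padlock engine compiled in. The helper name is made up for illustration.

#include <openssl/engine.h>

/* Illustrative only: select the padlock engine for all supported methods. */
static int example_use_padlock(void)
{
    ENGINE *e;

    ENGINE_load_builtin_engines();            /* registers padlock among others */
    e = ENGINE_by_id("padlock");
    if (e == NULL)
        return 0;                             /* not compiled in or not found */
    if (!ENGINE_init(e)) {                    /* probes the hardware */
        ENGINE_free(e);
        return 0;
    }
    ENGINE_set_default(e, ENGINE_METHOD_ALL); /* use it for ciphers and RAND */
    ENGINE_finish(e);
    ENGINE_free(e);
    return 1;
}
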
# ifdef COMPILE_HW_PADLOCK
/*
* We do these includes here to avoid header problems on platforms that do
* not have the VIA padlock anyway...
*/
@@ -276,60 +279,61 @@
int ciphr:1; /* n/a in C3 */
unsigned int keygen:1;
int interm:1;
unsigned int encdec:1;
int ksize:2;
} b;
} cword; /* Control word */
AES_KEY ks; /* Encryption key */
};
/*
* Essentially this variable belongs in thread local storage.
* Having this variable global, on the other hand, can only cause
* a few bogus key reloads [if any at all on a single-CPU system],
* so we accept the penalty...
*/
static volatile struct padlock_cipher_data *padlock_saved_context;
# endif
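
For illustration, a sketch (not part of the patch) of how this control word might be filled for AES-128 encryption, roughly following what padlock_aes_init_key() does elsewhere in this file. The rounds field sits in the part of the structure above this hunk, the exact values are dictated by the hardware, and the helper name is hypothetical.

#include <string.h>

/* Illustrative only: control-word setup for AES-128 encryption. */
static void example_cword_aes128(struct padlock_cipher_data *cdata,
                                 const unsigned char key[16])
{
    memset(cdata, 0, sizeof(*cdata));
    cdata->cword.b.encdec = 0;    /* 0 = encrypt, 1 = decrypt */
    cdata->cword.b.rounds = 10;   /* AES-128 -> 10 rounds */
    cdata->cword.b.ksize  = 0;    /* key-size code: 0=128, 1=192, 2=256 bits */
    cdata->cword.b.keygen = 0;    /* hardware expands the 128-bit key itself */
    memcpy(cdata->ks.rd_key, key, 16);
}
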
/*-
* =======================================================
* Inline assembler section(s).
* =======================================================
* Order of arguments is chosen to facilitate Windows port
* using __fastcall calling convention. If you wish to add
* more routines, keep in mind that first __fastcall
* argument is passed in %ecx and second - in %edx.
* =======================================================
*/
# if defined(__GNUC__) && __GNUC__>=2
+#if defined(__i386__) || defined(__i386)
/*
* As for the excessive "push %ebx"/"pop %ebx" found all over:
* when generating position-independent code GCC won't let
* us use "b" in assembler templates nor even respect "ebx"
* in the "clobber description." Hence the trouble...
*/
/*
* Helper function - check if a CPUID instruction is available on this CPU
*/
static int padlock_insn_cpuid_available(void)
{
int result = -1;
/*
* We're checking if the bit #21 of EFLAGS can be toggled. If yes =
* CPUID is available.
*/
asm volatile ("pushf\n"
"popl %%eax\n"
"xorl $0x200000, %%eax\n"
"movl %%eax, %%ecx\n"
"andl $0x200000, %%ecx\n"
"pushl %%eax\n"
"popf\n"
"pushf\n"
"popl %%eax\n"
"andl $0x200000, %%eax\n"
"xorl %%eax, %%ecx\n"
"movl %%ecx, %0\n":"=r" (result)::"eax", "ecx");
@@ -422,98 +426,228 @@
" cmpl %2,%1\n"
" je 1f\n"
" popfl\n"
" subl $4,%%esp\n"
"1: addl $4,%%esp\n"
" movl %2,%0":"+m" (padlock_saved_context)
:"r"(padlock_saved_context), "r"(cdata):"cc");
}
/* Template for padlock_xcrypt_* modes */
/*
* BIG FAT WARNING: The offsets used with 'leal' instructions describe items
* of the 'padlock_cipher_data' structure.
*/
# define PADLOCK_XCRYPT_ASM(name,rep_xcrypt) \
static inline void *name(size_t cnt, \
struct padlock_cipher_data *cdata, \
void *out, const void *inp) \
{ void *iv; \
asm volatile ( "pushl %%ebx\n" \
" leal 16(%0),%%edx\n" \
" leal 32(%0),%%ebx\n" \
rep_xcrypt "\n" \
" popl %%ebx" \
: "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp) \
: "0"(cdata), "1"(cnt), "2"(out), "3"(inp) \
: "edx", "cc", "memory"); \
return iv; \
}
+
+
+#endif
+
+#elif defined(__x86_64__) || defined(__x86_64)
+
+/* Load supported features of the CPU to see if
+ the PadLock is available. */
+ static int
+padlock_available(void)
+{
+ char vendor_string[16];
+ unsigned int eax, edx;
+ size_t scratch;
+
+ /* Are we running on the Centaur (VIA) CPU? */
+ eax = 0x00000000;
+ vendor_string[12] = 0;
+ asm volatile (
+ "movq %%rbx,%1\n"
+ "cpuid\n"
+ "movl %%ebx,(%2)\n"
+ "movl %%edx,4(%2)\n"
+ "movl %%ecx,8(%2)\n"
+ "movq %1,%%rbx"
+ : "+a"(eax), "=&r"(scratch) : "r"(vendor_string) : "rcx", "rdx");
+ if (strcmp(vendor_string, "CentaurHauls") != 0)
+ return 0;
+
+ /* Check for Centaur Extended Feature Flags presence */
+ eax = 0xC0000000;
+ asm volatile ("movq %%rbx,%1; cpuid; movq %1,%%rbx"
+ : "+a"(eax), "=&r"(scratch) : : "rcx", "rdx");
+ if (eax < 0xC0000001)
+ return 0;
+
+ /* Read the Centaur Extended Feature Flags */
+ eax = 0xC0000001;
+ asm volatile ("movq %%rbx,%2; cpuid; movq %2,%%rbx"
+ : "+a"(eax), "=d"(edx), "=&r"(scratch) : : "rcx");
+
+ /* Fill up some flags */
+ padlock_use_ace = ((edx & (0x3<<6)) == (0x3<<6));
+ padlock_use_rng = ((edx & (0x3<<2)) == (0x3<<2));
+
+ return padlock_use_ace + padlock_use_rng;
+}
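
For comparison, a sketch of the same probe written against GCC's <cpuid.h> instead of hand-coded asm; the leaf numbers and bit positions are copied from the code above, the function name is made up, and it is not part of the patch.

#include <cpuid.h>
#include <string.h>

/* Illustrative only: the Centaur/PadLock probe via compiler helpers. */
static int example_padlock_probe(void)
{
    unsigned int eax, ebx, ecx, edx;
    char vendor[13];

    /* CPUID is always available in 64-bit mode, so __cpuid() can be used
     * directly; __get_cpuid() would reject the 0xC0000000 leaf range. */
    __cpuid(0x00000000, eax, ebx, ecx, edx);
    memcpy(vendor + 0, &ebx, 4);
    memcpy(vendor + 4, &edx, 4);
    memcpy(vendor + 8, &ecx, 4);
    vendor[12] = 0;
    if (strcmp(vendor, "CentaurHauls") != 0)
        return 0;

    __cpuid(0xC0000000, eax, ebx, ecx, edx);   /* Centaur extended leaves present? */
    if (eax < 0xC0000001)
        return 0;
    __cpuid(0xC0000001, eax, ebx, ecx, edx);   /* Centaur extended feature flags */

    return ((edx & (0x3 << 6)) == (0x3 << 6))  /* ACE present and enabled */
         + ((edx & (0x3 << 2)) == (0x3 << 2)); /* RNG present and enabled */
}
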
+
+/* Force key reload from memory to the CPU microcode.
+ Loading EFLAGS from the stack clears EFLAGS[30]
+ which does the trick. */
+ static inline void
+padlock_reload_key(void)
+{
+ asm volatile ("pushfq; popfq");
+}
+
+#ifndef OPENSSL_NO_AES
+/*
+ * This is heuristic key context tracing. At first one
+ * believes that one should use atomic swap instructions,
+ * but it's not actually necessary. Point is that if
+ * padlock_saved_context was changed by another thread
+ * after we've read it and before we compare it with cdata,
+ * our key *shall* be reloaded upon thread context switch
+ * and we are therefore set in either case...
+ */
+ static inline void
+padlock_verify_context(struct padlock_cipher_data *cdata)
+{
+ asm volatile (
+ "pushfq\n"
+ " btl $30,(%%rsp)\n"
+ " jnc 1f\n"
+ " cmpq %2,%1\n"
+ " je 1f\n"
+ " popfq\n"
+ " subq $8,%%rsp\n"
+ "1: addq $8,%%rsp\n"
+ " movq %2,%0"
+ :"+m"(padlock_saved_context)
+ : "r"(padlock_saved_context), "r"(cdata) : "cc");
+}
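
Restated in plain C for readability (a simplified sketch, not part of the patch): the EFLAGS[30] test cannot be expressed portably, so this version simply forces a key reload whenever the last context seen on this CPU differs from cdata. The helper name is hypothetical.

/* Illustrative only: simplified restatement of the heuristic above. */
static inline void example_verify_context(struct padlock_cipher_data *cdata)
{
    if (padlock_saved_context != cdata)
        padlock_reload_key();          /* pushfq/popfq clears EFLAGS[30] */
    padlock_saved_context = cdata;
}
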
+
+/* Template for padlock_xcrypt_* modes */
+/* BIG FAT WARNING:
+ * The offsets used with 'leaq' instructions
+ * describe items of the 'padlock_cipher_data'
+ * structure.
+ */
+#define PADLOCK_XCRYPT_ASM(name,rep_xcrypt) \
+ static inline void *name(size_t cnt, \
+ struct padlock_cipher_data *cdata, \
+ void *out, const void *inp) \
+{ void *iv; \
+ size_t scratch; \
+ asm volatile ( "movq %%rbx,%4\n" \
+ " leaq 16(%0),%%rdx\n" \
+ " leaq 32(%0),%%rbx\n" \
+ rep_xcrypt "\n" \
+ " movq %4,%%rbx" \
+ : "=a"(iv), "=c"(cnt), "=D"(out), "=S"(inp), "=&r"(scratch) \
+ : "0"(cdata), "1"(cnt), "2"(out), "3"(inp) \
+ : "rdx", "cc", "memory"); \
+ return iv; \
+}
+#endif
+
+#endif /* cpu */
+
+#ifndef OPENSSL_NO_AES
+
+
+
/* Generate all functions with appropriate opcodes */
/* rep xcryptecb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8")
/* rep xcryptcbc */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0")
/* rep xcryptcfb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0")
/* rep xcryptofb */
PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8")
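
For illustration, a sketch (not part of the patch) of how one of the generated helpers is typically driven for whole-block CBC. It assumes cdata has already been initialized, that cdata and the data pointers are 16-byte aligned and nbytes is a multiple of AES_BLOCK_SIZE, as the real caller padlock_aes_cipher() arranges, and that the iv[] member sits at the start of padlock_cipher_data (above this hunk). The helper name is made up.

#include <string.h>

/* Illustrative only: whole-block CBC through the generated helper. */
static void example_cbc_encrypt(struct padlock_cipher_data *cdata,
                                unsigned char *out, const unsigned char *in,
                                size_t nbytes, unsigned char iv[AES_BLOCK_SIZE])
{
    void *next_iv;

    memcpy(cdata->iv, iv, AES_BLOCK_SIZE);               /* IV lives at offset 0 */
    next_iv = padlock_xcrypt_cbc(nbytes / AES_BLOCK_SIZE, cdata, out, in);
    memcpy(iv, next_iv, AES_BLOCK_SIZE);                  /* chain for the next call */
}
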
+
+
+/* Our own htonl()/ntohl() */
+static inline void
+padlock_bswapl(AES_KEY *ks)
+{
+ size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]);
+ unsigned int *key = ks->rd_key;
+
+ while (i--) {
+ asm volatile ("bswapl %0" : "+r"(*key));
+ key++;
+ }
+}
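
The swap matters when the round keys were expanded by OpenSSL's C AES code (not the assembler one), whose byte-order-neutral layout differs from what the PadLock unit expects. A sketch of the key-setup step, loosely mirroring the AES-256 path of padlock_aes_init_key(); the helper name is hypothetical.

/* Illustrative only: software key expansion for AES-256 followed by the swap. */
static void example_set_ks_256(struct padlock_cipher_data *cdata,
                               const unsigned char key[32])
{
    AES_set_encrypt_key(key, 256, &cdata->ks);  /* OpenSSL C key schedule */
    padlock_bswapl(&cdata->ks);                 /* convert to PadLock layout */
    cdata->cword.b.keygen = 1;                  /* key is pre-expanded in software */
}
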
+
+
# endif
/* The RNG call itself */
static inline unsigned int padlock_xstore(void *addr, unsigned int edx_in)
{
unsigned int eax_out;
asm volatile (".byte 0x0f,0xa7,0xc0" /* xstore */
:"=a" (eax_out), "=m"(*(unsigned *)addr)
:"D"(addr), "d"(edx_in)
);
return eax_out;
}
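
For illustration, a sketch (not part of the patch) of pulling up to 8 random bytes through this wrapper; the status-bit interpretation below follows the engine's padlock_rand_bytes() callback, and the function name is made up.

/* Illustrative only: fetch up to 8 random bytes with quality factor 0. */
static int example_rand8(unsigned char out[8])
{
    unsigned int status;

    do {
        status = padlock_xstore(out, 0);
        if (!(status & (1 << 6)))
            return 0;                    /* RNG not enabled */
    } while ((status & 0x1F) == 0);      /* retry until bytes are available */

    return (int)(status & 0x1F);         /* number of bytes actually stored */
}
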
/*
* Why not inline 'rep movsd'? I failed to find information on what value
* of the Direction Flag one can expect and consequently have to apply the
* "better-safe-than-sorry" approach and assume it "undefined." I could
* explicitly clear it and restore the original value upon return from
* padlock_aes_cipher, but it's presumably too much trouble for too little
* gain... In case you wonder, the 'rep xcrypt*' instructions above are *not*
* affected by the Direction Flag; their pointers advance toward larger
* addresses unconditionally.
*/
static inline unsigned char *padlock_memcpy(void *dst, const void *src,
size_t n)
{
- long *d = dst;
- const long *s = src;
+ size_t *d = dst;
+ const size_t *s = src;
n /= sizeof(*d);
do {
*d++ = *s++;
} while (--n);
return dst;
}
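
Note that the loop above copies word-sized units and runs at least once, so n must be a non-zero multiple of sizeof(size_t). A usage sketch (hypothetical helper, not part of the patch) copying one AES block into an aligned bounce buffer:

/* Illustrative only: n must be a non-zero multiple of sizeof(size_t). */
static void example_copy_block(void *aligned_buf, const void *unaligned_in)
{
    padlock_memcpy(aligned_buf, unaligned_in, AES_BLOCK_SIZE);
}
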
# elif defined(_MSC_VER)
/*
* Unlike GCC these are real functions. In order to minimize impact
* on performance we adhere to __fastcall calling convention in
* order to get two first arguments passed through %ecx and %edx.
* Which kind of suits very well, as instructions in question use
* both %ecx and %edx as input:-)
*/
# define REP_XCRYPT(code) \
_asm _emit 0xf3 \
_asm _emit 0x0f _asm _emit 0xa7 \
_asm _emit code
/*
* BIG FAT WARNING: The offsets used with 'lea' instructions describe items
* of the 'padlock_cipher_data' structure.
*/
# define PADLOCK_XCRYPT_ASM(name,code) \
static void * __fastcall \
name (size_t cnt, void *cdata, \
void *outp, const void *inp) \