#include <stdio.h>
#include <string.h>
+#include <openssl/opensslconf.h>
#include <openssl/crypto.h>
#include <openssl/dso.h>
#include <openssl/engine.h>
#include <openssl/evp.h>
+#ifndef OPENSSL_NO_AES
#include <openssl/aes.h>
+#endif
+#include <openssl/rand.h>
+#include <openssl/err.h>
#ifndef OPENSSL_NO_HW
#ifndef OPENSSL_NO_HW_PADLOCK
compiler choice is limited to GCC and Microsoft C. */
#undef COMPILE_HW_PADLOCK
#if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM)
-# if defined(__i386__) || defined(__i386) || defined(_M_IX86)
+# if (defined(__GNUC__) && (defined(__i386__) || defined(__i386))) || \
+ (defined(_MSC_VER) && defined(_M_IX86))
# define COMPILE_HW_PADLOCK
+static ENGINE *ENGINE_padlock (void);
# endif
#endif
-static ENGINE *ENGINE_padlock (void);
-
void ENGINE_load_padlock (void)
{
/* On non-x86 CPUs it just returns. */
}
#ifdef COMPILE_HW_PADLOCK
+/* We do these includes here to avoid header problems on platforms that
+ do not have the VIA padlock anyway... */
+#ifdef _MSC_VER
+# include <malloc.h>
+# define alloca _alloca
+#else
+# include <stdlib.h>
+#endif
+
/* Function for ENGINE detection and control */
static int padlock_available(void);
static int padlock_init(ENGINE *e);
static RAND_METHOD padlock_rand;
/* Cipher Stuff */
+#ifndef OPENSSL_NO_AES
static int padlock_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid);
+#endif
/* Engine names */
static const char *padlock_id = "padlock";
/* Available features */
static int padlock_use_ace = 0; /* Advanced Cryptography Engine */
static int padlock_use_rng = 0; /* Random Number Generator */
+#ifndef OPENSSL_NO_AES
static int padlock_aes_align_required = 1;
+#endif
/* ===== Engine "management" functions ===== */
#endif
/* Generate a nice engine name with available features */
- snprintf(padlock_name, sizeof(padlock_name), "VIA PadLock (%s, %s)",
+ BIO_snprintf(padlock_name, sizeof(padlock_name),
+ "VIA PadLock (%s, %s)",
padlock_use_rng ? "RNG" : "no-RNG",
padlock_use_ace ? "ACE" : "no-ACE");
!ENGINE_set_name(e, padlock_name) ||
!ENGINE_set_init_function(e, padlock_init) ||
-
+#ifndef OPENSSL_NO_AES
(padlock_use_ace && !ENGINE_set_ciphers (e, padlock_ciphers)) ||
+#endif
(padlock_use_rng && !ENGINE_set_RAND (e, &padlock_rand))) {
return 0;
}
/* ===== Here comes the "real" engine ===== */
+#ifndef OPENSSL_NO_AES
/* Some AES-related constants */
#define AES_BLOCK_SIZE 16
#define AES_KEY_SIZE_128 16
AES_KEY ks; /* Encryption key */
};
+/*
+ * Essentially this variable belongs in thread local storage.
+ * Having this variable global on the other hand can only cause
+ * few bogus key reloads [if any at all on single-CPU system],
+ * so we accept the penalty...
+ */
+static volatile struct padlock_cipher_data *padlock_saved_context;
+#endif
+
/*
* =======================================================
* Inline assembler section(s).
* =======================================================
* Order of arguments is chosen to facilitate Windows port
* using __fastcall calling convention. If you wish to add
- * more routines, keep in mind that in __fastcall first
+ * more routines, keep in mind that first __fastcall
* argument is passed in %ecx and second - in %edx.
* =======================================================
*/
return padlock_use_ace + padlock_use_rng;
}
+#ifndef OPENSSL_NO_AES
/* Our own htonl()/ntohl(): walks every element of the expanded key
   schedule ks->rd_key and reverses its byte order in place with a
   single bswapl instruction per element. Returns nothing; the key
   schedule itself is modified. */
static inline void
padlock_bswapl(AES_KEY *ks)
{
size_t i = sizeof(ks->rd_key)/sizeof(ks->rd_key[0]); /* element count of rd_key[] */
- unsigned long *key = ks->rd_key;
+ unsigned int *key = ks->rd_key;
while (i--) {
asm volatile ("bswapl %0" : "+r"(*key)); /* "+r": operand is read and written back */
key++;
}
}
+#endif
/* Force key reload from memory to the CPU microcode.
Loading EFLAGS from the stack clears EFLAGS[30]
asm volatile ("pushfl; popfl");
}
+#ifndef OPENSSL_NO_AES
/*
* This is heuristic key context tracing. At first one
* believes that one should use atomic swap instructions,
* but it's not actually necessary. Point is that if
- * saved_cdata was changed by another thread after we've
- * read it and before we compare it with cdata, our key
- * *shall* be reloaded upon thread context switch and
- * we are therefore set in either case...
+ * padlock_saved_context was changed by another thread
+ * after we've read it and before we compare it with cdata,
+ * our key *shall* be reloaded upon thread context switch
+ * and we are therefore set in either case...
*/
static inline void
padlock_verify_context(struct padlock_cipher_data *cdata)
{
- static struct padlock_cipher_data *saved_cdata;
-
asm volatile (
"pushfl\n" /* push EFLAGS so it can be examined on the stack */
-" bt $30,(%%esp)\n"
+" btl $30,(%%esp)\n" /* CF = bit 30 of the pushed EFLAGS */
" jnc 1f\n" /* bit 30 clear: skip the EFLAGS reload */
-" cmp %2,%1\n"
+" cmpl %2,%1\n" /* is cdata the context saved last time? */
" je 1f\n" /* same context: skip the EFLAGS reload */
-" mov %2,%0\n"
" popfl\n" /* reload EFLAGS from the stack */
-" sub $4,%%esp\n"
-"1: add $4,%%esp"
- :"+m"(saved_cdata) : "r"(saved_cdata), "r"(cdata) : "cc");
+" subl $4,%%esp\n" /* undo popfl's pop so the addl below balances */
+"1: addl $4,%%esp\n" /* discard the pushed EFLAGS slot on both paths */
+" movl %2,%0" /* on every path, record cdata as the saved context */
+ :"+m"(padlock_saved_context)
+ : "r"(padlock_saved_context), "r"(cdata) : "cc");
}
/* Template for padlock_xcrypt_* modes */
}
/* Generate all functions with appropriate opcodes */
-PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8"); /* rep xcryptecb */
-PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0"); /* rep xcryptcbc */
-PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0"); /* rep xcryptcfb */
-PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8"); /* rep xcryptofb */
+PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8") /* rep xcryptecb */
+PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0") /* rep xcryptcbc */
+PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0") /* rep xcryptcfb */
+PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8") /* rep xcryptofb */
+#endif
/* The RNG call itself */
static inline unsigned int
return eax_out;
}
+/* Why not inline 'rep movsd'? I failed to find information on what
+ * value in Direction Flag one can expect and consequently have to
+ * apply "better-safe-than-sorry" approach and assume "undefined."
+ * I could explicitly clear it and restore the original value upon
+ * return from padlock_aes_cipher, but it's presumably too much
+ * trouble for too little gain...
+ *
+ * In case you wonder 'rep xcrypt*' instructions above are *not*
+ * affected by the Direction Flag and pointers advance toward
+ * larger addresses unconditionally.
+ *
+ * padlock_memcpy copies n bytes from src to dst one 'long' at a
+ * time and returns dst. Any remainder of n that is not a multiple
+ * of sizeof(long) is not copied, and both buffers are accessed
+ * through 'long' pointers regardless of their alignment.
+ *
+ * NOTE(review): the do/while loop executes at least once, so n must
+ * be at least sizeof(long); for a smaller n the word count becomes
+ * zero, one word is still copied and --n wraps around. The callers
+ * visible in this file appear to pass only whole AES blocks --
+ * confirm before reusing this helper elsewhere.
+ */
+static inline unsigned char *
+padlock_memcpy(void *dst,const void *src,size_t n)
+{
+ long *d=dst;
+ const long *s=src;
+
+ n /= sizeof(*d); /* byte count -> count of whole longs */
+ do { *d++ = *s++; } while (--n); /* body runs before the first test of n */
+
+ return dst;
+}
+
#elif defined(_MSC_VER)
/*
* Unlike GCC these are real functions. In order to minimize impact
name (size_t cnt, void *cdata, \
void *outp, const void *inp) \
{ _asm mov eax,edx \
- _asm lea ebx,[eax+16] \
- _asm lea edx,[eax+32] \
+ _asm lea edx,[eax+16] \
+ _asm lea ebx,[eax+32] \
_asm mov edi,outp \
_asm mov esi,inp \
REP_XCRYPT(code) \
/* MSC variant of the key context check: cdata is taken from ecx.
   When bit 30 of EFLAGS is set and ecx differs from
   padlock_saved_context, EFLAGS is popped back from the stack;
   on every path ecx is then stored to padlock_saved_context. */
static void __fastcall
padlock_verify_context(void *cdata)
-{ static void *saved_cdata;
-
- _asm {
+{ _asm {
pushfd
bt DWORD PTR[esp],30
jnc skip
- cmp ecx,saved_cdata
+ cmp ecx,padlock_saved_context
je skip
- mov saved_cdata,ecx
popfd
sub esp,4
skip: add esp,4
+ mov padlock_saved_context,ecx
}
}
-sttic int __fastcall
+static int
padlock_available(void)
{ _asm {
pushfd
mov esi,ecx
mov edi,ecx
mov ecx,60
- up:
- lodsd
+ up: lodsd
bswap eax
stosd
loop up
popfd
}
}
+
+/* MS actually specifies status of Direction Flag and compiler even
+ * manages to compile following as 'rep movsd' all by itself...
+ */
+#define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U))
#endif
/* ===== AES encryption/decryption ===== */
+#ifndef OPENSSL_NO_AES
#if defined(NID_aes_128_cfb128) && ! defined (NID_aes_128_cfb)
#define NID_aes_128_cfb NID_aes_128_cfb128
NID_aes_192_ecb,
NID_aes_192_cbc,
-// NID_aes_192_cfb, /* FIXME: AES192/256 CFB/OFB don't work. */
-// NID_aes_192_ofb,
+#if 0
+ NID_aes_192_cfb, /* FIXME: AES192/256 CFB/OFB don't work. */
+ NID_aes_192_ofb,
+#endif
NID_aes_256_ecb,
NID_aes_256_cbc,
-// NID_aes_256_cfb,
-// NID_aes_256_ofb,
+#if 0
+ NID_aes_256_cfb,
+ NID_aes_256_ofb,
+#endif
};
static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/
sizeof(padlock_cipher_nids[0]));
static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
const unsigned char *iv, int enc);
static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
- const unsigned char *in, unsigned int nbytes);
+ const unsigned char *in, size_t nbytes);
-#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)(ctx->cipher_data + ((0x10 - ((size_t)(ctx->cipher_data) & 0x0F)) & 0x0F)))
+#define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) + \
+ ( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F ) ) /* ptr rounded up to a 16-byte boundary; unchanged if already aligned */
+#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\
+ NEAREST_ALIGNED(ctx->cipher_data)) /* first 16-byte-aligned address at or after ctx->cipher_data */
/* Declaring so many ciphers by hand would be a pain.
Instead introduce a bit of preprocessor magic :-) */
case 256:
/* Generate an extended AES key in software.
Needed for AES192/AES256 */
+ /* Well, the above applies to Stepping 8 CPUs
+ and is listed as hardware errata. They most
+ likely will fix it at some point and then
+ a check for stepping would be due here. */
if (enc)
AES_set_encrypt_key(key, key_len, &cdata->ks);
else
AES_set_decrypt_key(key, key_len, &cdata->ks);
-
- /* OpenSSL internal functions use byte-swapped extended key. */
+#ifndef AES_ASM
+ /* OpenSSL C functions use byte-swapped extended key. */
padlock_bswapl(&cdata->ks);
-
+#endif
cdata->cword.b.keygen = 1;
break;
}
#ifndef PADLOCK_CHUNK
-# define PADLOCK_CHUNK 4096 /* Must be a power of 2 larger than 16 */
+# define PADLOCK_CHUNK 512 /* Must be a power of 2 larger than 16 */
#endif
#if PADLOCK_CHUNK<16 || PADLOCK_CHUNK&(PADLOCK_CHUNK-1)
# error "insane PADLOCK_CHUNK..."
{
struct padlock_cipher_data *cdata;
const void *inp;
- void *out, *iv;
+ unsigned char *out;
+ void *iv;
int inp_misaligned, out_misaligned, realign_in_loop;
- size_t chunk, allocated;
+ size_t chunk, allocated=0;
if (nbytes == 0)
return 1;
if (nbytes % AES_BLOCK_SIZE)
return 0; /* are we expected to do tail processing? */
-#if 0
- /* There is more work to support CPUs that don't require alignment.
- Therefore disabled completely for now... */
+ /* VIA promises CPUs that won't require alignment in the future.
+ For now padlock_aes_align_required is initialized to 1 and
+ the condition is never met... */
+ /* The C7 core is capable of handling unaligned input in non-ECB[!]
+ mode, but the performance penalty appears to be approximately
+ the same as for the software alignment below, i.e. ~3x. They
+ promise to improve it in the future, but for now we can just as
+ well pretend that it can only handle aligned input... */
if (!padlock_aes_align_required)
return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes);
-#endif
inp_misaligned = (((size_t)in_arg) & 0x0F);
out_misaligned = (((size_t)out_arg) & 0x0F);
if (out_misaligned) {
/* optimize for small input */
allocated = (chunk<nbytes?PADLOCK_CHUNK:nbytes);
-#ifdef _MSC_VER
- out = _alloca(0x10 + allocated);
-#else
- out = alloca(0x10 + allocated);
-#endif
- out += (0x10 - ((size_t)out & 0x0F)) & 0x0F;
+ out = alloca(0x10 + allocated);
+ out = NEAREST_ALIGNED(out);
}
else
out = out_arg;
case EVP_CIPH_ECB_MODE:
do {
if (inp_misaligned)
- inp = memcpy(out, in_arg, chunk);
+ inp = padlock_memcpy(out, in_arg, chunk);
else
inp = in_arg;
in_arg += chunk;
padlock_xcrypt_ecb(chunk/AES_BLOCK_SIZE, cdata, out, inp);
if (out_misaligned)
- out_arg = memcpy(out_arg, out, chunk) + chunk;
+ out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
else
out = out_arg+=chunk;
chunk = PADLOCK_CHUNK;
cbc_shortcut: /* optimize for small input */
if (inp_misaligned)
- inp = memcpy(out, in_arg, chunk);
+ inp = padlock_memcpy(out, in_arg, chunk);
else
inp = in_arg;
in_arg += chunk;
iv = padlock_xcrypt_cbc(chunk/AES_BLOCK_SIZE, cdata, out, inp);
if (out_misaligned)
- out_arg = memcpy(out_arg, out, chunk) + chunk;
+ out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
else
out = out_arg+=chunk;
chunk = PADLOCK_CHUNK;
cfb_shortcut: /* optimize for small input */
if (inp_misaligned)
- inp = memcpy(out, in_arg, chunk);
+ inp = padlock_memcpy(out, in_arg, chunk);
else
inp = in_arg;
in_arg += chunk;
iv = padlock_xcrypt_cfb(chunk/AES_BLOCK_SIZE, cdata, out, inp);
if (out_misaligned)
- out_arg = memcpy(out_arg, out, chunk) + chunk;
+ out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
else
out = out_arg+=chunk;
memcpy(cdata->iv, ctx->iv, AES_BLOCK_SIZE);
do {
if (inp_misaligned)
- inp = memcpy(out, in_arg, chunk);
+ inp = padlock_memcpy(out, in_arg, chunk);
else
inp = in_arg;
in_arg += chunk;
padlock_xcrypt_ofb(chunk/AES_BLOCK_SIZE, cdata, out, inp);
if (out_misaligned)
- out_arg = memcpy(out_arg, out, chunk) + chunk;
+ out_arg = padlock_memcpy(out_arg, out, chunk) + chunk;
else
out = out_arg+=chunk;
/* Clean the realign buffer if it was used */
if (out_misaligned) {
- volatile unsigned long *p=out;
+ volatile unsigned long *p=(void *)out;
size_t n = allocated/sizeof(*p);
while (n--) *p++=0;
}
return 1;
}
+#endif /* OPENSSL_NO_AES */
+
/* ===== Random Number Generator ===== */
/*
* This code is not engaged. The reason is that it does not comply