X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fengine%2Feng_padlock.c;h=5067bc02dd5f13c212f9ee19753043fa49ff1cac;hp=09f10c695ec062735979cbd16aeaf6cd3710f5b3;hb=36734b2bab61e47b252ec3be85f8133a0d1c28f7;hpb=5b17246324500c1854b7355b6607a40b4f96c8cf diff --git a/crypto/engine/eng_padlock.c b/crypto/engine/eng_padlock.c index 09f10c695e..5067bc02dd 100644 --- a/crypto/engine/eng_padlock.c +++ b/crypto/engine/eng_padlock.c @@ -65,6 +65,11 @@ #include #include +#include +#ifdef _MSC_VER +# define alloca _alloca +# define snprintf _snprintf +#endif #include #include @@ -98,11 +103,10 @@ #if !defined(I386_ONLY) && !defined(OPENSSL_NO_INLINE_ASM) # if defined(__i386__) || defined(__i386) || defined(_M_IX86) # define COMPILE_HW_PADLOCK +static ENGINE *ENGINE_padlock (void); # endif #endif -static ENGINE *ENGINE_padlock (void); - void ENGINE_load_padlock (void) { /* On non-x86 CPUs it just returns. */ @@ -149,7 +153,8 @@ padlock_bind_helper(ENGINE *e) #endif /* Generate a nice engine name with available features */ - snprintf(padlock_name, sizeof(padlock_name), "VIA PadLock (%s, %s)", + BIO_snprintf(padlock_name, sizeof(padlock_name), + "VIA PadLock (%s, %s)", padlock_use_rng ? "RNG" : "no-RNG", padlock_use_ace ? "ACE" : "no-ACE"); @@ -246,13 +251,21 @@ struct padlock_cipher_data AES_KEY ks; /* Encryption key */ }; +/* + * Essentially this variable belongs in thread local storage. + * Having this variable global on the other hand can only cause + * few bogus key reloads [if any at all on single-CPU system], + * so we accept the penatly... + */ +static volatile struct padlock_cipher_data *padlock_saved_context; + /* * ======================================================= * Inline assembler section(s). * ======================================================= * Order of arguments is chosen to facilitate Windows port * using __fastcall calling convention. If you wish to add - * more routines, keep in mind that in __fastcall first + * more routines, keep in mind that first __fastcall * argument is passed in %ecx and second - in %edx. * ======================================================= */ @@ -362,16 +375,14 @@ padlock_reload_key(void) * This is heuristic key context tracing. At first one * believes that one should use atomic swap instructions, * but it's not actually necessary. Point is that if - * saved_cdata was changed by another thread after we've - * read it and before we compare it with cdata, our key - * *shall* be reloaded upon thread context switch and - * we are therefore set in either case... + * padlock_saved_context was changed by another thread + * after we've read it and before we compare it with cdata, + * our key *shall* be reloaded upon thread context switch + * and we are therefore set in either case... 
*/ static inline void padlock_verify_context(struct padlock_cipher_data *cdata) { - static struct padlock_cipher_data *saved_cdata; - asm volatile ( "pushfl\n" " bt $30,(%%esp)\n" @@ -382,7 +393,8 @@ padlock_verify_context(struct padlock_cipher_data *cdata) " popfl\n" " sub $4,%%esp\n" "1: add $4,%%esp" - :"+m"(saved_cdata) : "r"(saved_cdata), "r"(cdata) : "cc"); + :"+m"(padlock_saved_context) + : "r"(padlock_saved_context), "r"(cdata) : "cc"); } /* Template for padlock_xcrypt_* modes */ @@ -408,10 +420,10 @@ static inline void *name(size_t cnt, \ } /* Generate all functions with appropriate opcodes */ -PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8"); /* rep xcryptecb */ -PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0"); /* rep xcryptcbc */ -PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0"); /* rep xcryptcfb */ -PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8"); /* rep xcryptofb */ +PADLOCK_XCRYPT_ASM(padlock_xcrypt_ecb, ".byte 0xf3,0x0f,0xa7,0xc8") /* rep xcryptecb */ +PADLOCK_XCRYPT_ASM(padlock_xcrypt_cbc, ".byte 0xf3,0x0f,0xa7,0xd0") /* rep xcryptcbc */ +PADLOCK_XCRYPT_ASM(padlock_xcrypt_cfb, ".byte 0xf3,0x0f,0xa7,0xe0") /* rep xcryptcfb */ +PADLOCK_XCRYPT_ASM(padlock_xcrypt_ofb, ".byte 0xf3,0x0f,0xa7,0xe8") /* rep xcryptofb */ /* The RNG call itself */ static inline unsigned int @@ -427,6 +439,29 @@ padlock_xstore(void *addr, unsigned int edx_in) return eax_out; } +/* Why not inline 'rep movsd'? I failed to find information on what + * value in Direction Flag one can expect and consequently have to + * apply "better-safe-than-sorry" approach and assume "undefined." + * I could explicitly clear it and restore the original value upon + * return from padlock_aes_cipher, but it's presumably too much + * trouble for too little gain... + * + * In case you wonder 'rep xcrypt*' instructions above are *not* + * affected by the Direction Flag and pointers advance toward + * larger addresses unconditionally. + */ +static inline unsigned char * +padlock_memcpy(void *dst,const void *src,size_t n) +{ + long *d=dst; + const long *s=src; + + n /= sizeof(*d); + do { *d++ = *s++; } while (--n); + + return dst; +} + #elif defined(_MSC_VER) /* * Unlike GCC these are real functions. In order to minimize impact @@ -450,8 +485,8 @@ static void * __fastcall \ name (size_t cnt, void *cdata, \ void *outp, const void *inp) \ { _asm mov eax,edx \ - _asm lea ebx,[eax+16] \ - _asm lea edx,[eax+32] \ + _asm lea edx,[eax+16] \ + _asm lea ebx,[eax+32] \ _asm mov edi,outp \ _asm mov esi,inp \ REP_XCRYPT(code) \ @@ -474,22 +509,20 @@ padlock_reload_key(void) static void __fastcall padlock_verify_context(void *cdata) -{ static void *saved_cdata; - - _asm { +{ _asm { pushfd bt DWORD PTR[esp],30 jnc skip - cmp ecx,saved_cdata + cmp ecx,padlock_saved_context je skip - mov saved_cdata,ecx + mov padlock_saved_context,ecx popfd sub esp,4 skip: add esp,4 } } -sttic int __fastcall +static int padlock_available(void) { _asm { pushfd @@ -546,14 +579,18 @@ padlock_bswapl(void *key) mov esi,ecx mov edi,ecx mov ecx,60 - up: - lodsd + up: lodsd bswap eax stosd loop up popfd } } + +/* MS actually specifies status of Direction Flag and compiler even + * manages to compile following as 'rep movsd' all by itself... 
+ */ +#define padlock_memcpy(o,i,n) ((unsigned char *)memcpy((o),(i),(n)&~3U)) #endif /* ===== AES encryption/decryption ===== */ @@ -591,13 +628,17 @@ static int padlock_cipher_nids[] = { NID_aes_192_ecb, NID_aes_192_cbc, -// NID_aes_192_cfb, /* FIXME: AES192/256 CFB/OFB don't work. */ -// NID_aes_192_ofb, +#if 0 + NID_aes_192_cfb, /* FIXME: AES192/256 CFB/OFB don't work. */ + NID_aes_192_ofb, +#endif NID_aes_256_ecb, NID_aes_256_cbc, -// NID_aes_256_cfb, -// NID_aes_256_ofb, +#if 0 + NID_aes_256_cfb, + NID_aes_256_ofb, +#endif }; static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/ sizeof(padlock_cipher_nids[0])); @@ -606,9 +647,12 @@ static int padlock_cipher_nids_num = (sizeof(padlock_cipher_nids)/ static int padlock_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv, int enc); static int padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, - const unsigned char *in, unsigned int nbytes); + const unsigned char *in, size_t nbytes); -#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)(ctx->cipher_data + ((0x10 - ((size_t)(ctx->cipher_data) & 0x0F)) & 0x0F))) +#define NEAREST_ALIGNED(ptr) ( (unsigned char *)(ptr) + \ + ( (0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F ) ) +#define ALIGNED_CIPHER_DATA(ctx) ((struct padlock_cipher_data *)\ + NEAREST_ALIGNED(ctx->cipher_data)) /* Declaring so many ciphers by hand would be a pain. Instead introduce a bit of preprocessor magic :-) */ @@ -733,6 +777,10 @@ padlock_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key, case 256: /* Generate an extended AES key in software. Needed for AES192/AES256 */ + /* Well, the above applies to Stepping 8 CPUs + and is listed as hardware errata. They most + likely will fix it at some point and then + a check for stepping would be due here. */ if (enc) AES_set_encrypt_key(key, key_len, &cdata->ks); else @@ -822,21 +870,21 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, { struct padlock_cipher_data *cdata; const void *inp; - void *out, *iv; + unsigned char *out; + void *iv; int inp_misaligned, out_misaligned, realign_in_loop; - size_t chunk, allocated; + size_t chunk, allocated=0; if (nbytes == 0) return 1; if (nbytes % AES_BLOCK_SIZE) return 0; /* are we expected to do tail processing? */ -#if 0 - /* There is more work to support CPUs that don't require alignment. - Therefore disabled completely for now... */ + /* VIA promises CPUs that won't require alignment in the future. + For now padlock_aes_align_required is initialized to 1 and + the condition is never met... 
*/ if (!padlock_aes_align_required) return padlock_aes_cipher_omnivorous(ctx, out_arg, in_arg, nbytes); -#endif inp_misaligned = (((size_t)in_arg) & 0x0F); out_misaligned = (((size_t)out_arg) & 0x0F); @@ -858,12 +906,8 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, if (out_misaligned) { /* optmize for small input */ allocated = (chunkiv, ctx->iv, AES_BLOCK_SIZE); do { if (inp_misaligned) - inp = memcpy(out, in_arg, chunk); + inp = padlock_memcpy(out, in_arg, chunk); else inp = in_arg; in_arg += chunk; @@ -954,7 +998,7 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, padlock_xcrypt_ofb(chunk/AES_BLOCK_SIZE, cdata, out, inp); if (out_misaligned) - out_arg = memcpy(out_arg, out, chunk) + chunk; + out_arg = padlock_memcpy(out_arg, out, chunk) + chunk; else out = out_arg+=chunk; @@ -970,7 +1014,7 @@ padlock_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg, /* Clean the realign buffer if it was used */ if (out_misaligned) { - volatile unsigned long *p=out; + volatile unsigned long *p=(void *)out; size_t n = allocated/sizeof(*p); while (n--) *p++=0; }
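

For reference, a hedged sketch of how an application would pick up this engine through the standard ENGINE API once ENGINE_load_padlock() has been pulled in (ENGINE_load_builtin_engines() does that on builds where the engine is compiled); the function name use_padlock_if_present() is illustrative, not part of the patch.

#include <openssl/engine.h>

/* Illustrative only: make the padlock engine the default provider if present. */
static int use_padlock_if_present(void)
{
    ENGINE *e;

    ENGINE_load_builtin_engines();          /* pulls in ENGINE_load_padlock() on x86 builds */
    e = ENGINE_by_id("padlock");            /* "padlock" is the id this file registers */
    if (e == NULL)
        return 0;                           /* no padlock engine available in this build */
    if (!ENGINE_set_default(e, ENGINE_METHOD_ALL)) {
        ENGINE_free(e);
        return 0;
    }
    ENGINE_free(e);                         /* ENGINE_set_default() keeps its own reference */
    return 1;
}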
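
Once selected as in the sketch above, the PadLock AES implementations are reached through the ordinary EVP interface; nothing in application code changes. A minimal, hedged example with dummy key material (0.9.8-era EVP_CIPHER_CTX on the stack):

#include <openssl/evp.h>

/* Illustrative only: encrypt one 16-byte block with whatever AES-128-CBC
 * implementation is currently the default -- the PadLock engine if it was
 * selected above, the built-in software AES otherwise. */
static int encrypt_one_block(unsigned char *out, const unsigned char *in)
{
    unsigned char key[16] = {0}, iv[16] = {0};  /* dummy key material */
    EVP_CIPHER_CTX ctx;
    int outl = 0, ok;

    EVP_CIPHER_CTX_init(&ctx);
    ok = EVP_EncryptInit_ex(&ctx, EVP_aes_128_cbc(), NULL, key, iv)
         && EVP_EncryptUpdate(&ctx, out, &outl, in, 16);
    EVP_CIPHER_CTX_cleanup(&ctx);
    return ok;
}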
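
The padlock_saved_context comment is easier to follow with the assembler stripped away. Below is a plain-C sketch of the heuristic only; the EFLAGS bit-30 test that the real padlock_verify_context() performs is omitted, and last_ctx / request_key_reload() are illustrative stand-ins for padlock_saved_context and padlock_reload_key().

struct padlock_cipher_data;                     /* as declared in this file */

/* Stand-in for padlock_saved_context. */
static volatile struct padlock_cipher_data *last_ctx;

/* Stand-in for padlock_reload_key(): forces the CPU to re-read the key
 * material before the next xcrypt instruction. */
static void request_key_reload(void) { }

static void verify_context_sketch(struct padlock_cipher_data *cdata)
{
    /*
     * No atomic swap is needed.  If another thread changes last_ctx
     * between our read and our compare, that thread switch itself
     * already causes a key reload, so a stale comparison can only
     * produce a harmless extra reload, never a missed one.
     */
    if (last_ctx != cdata) {
        last_ctx = cdata;
        request_key_reload();
    }
}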
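
The NEAREST_ALIGNED() arithmetic behind ALIGNED_CIPHER_DATA() rounds a pointer up to the next 16-byte boundary and leaves already-aligned pointers untouched, matching the 16-byte alignment the PadLock unit expects for its control word and key. A small self-contained check of that property (the macro is repeated here; the buffer and assertions are illustrative):

#include <assert.h>
#include <stddef.h>

#define NEAREST_ALIGNED(ptr) ((unsigned char *)(ptr) + \
        ((0x10 - ((size_t)(ptr) & 0x0F)) & 0x0F))

int main(void)
{
    unsigned char buf[32];
    unsigned char *p = NEAREST_ALIGNED(buf + 1);

    assert(((size_t)p & 0x0F) == 0);    /* result is 16-byte aligned      */
    assert(p - (buf + 1) < 16);         /* advanced by at most 15 bytes   */
    assert(NEAREST_ALIGNED(p) == p);    /* aligned input is left in place */
    return 0;
}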
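
Finally, the misalignment handling in padlock_aes_cipher() boils down to bouncing data through a 16-byte-aligned scratch buffer in PADLOCK_CHUNK-sized pieces and wiping that buffer afterwards. A rough, GCC-flavoured sketch of that pattern follows; CHUNK, scratch and process_blocks() are placeholders (the engine sizes the buffer with alloca(), only bounces whichever side is actually misaligned, and also carries the IV for the chaining modes -- all of which is left out here).

#include <string.h>
#include <stddef.h>

#define CHUNK 512                              /* stand-in for PADLOCK_CHUNK */

/* Stand-in for the padlock_xcrypt_* helpers; here it just copies the data. */
static void process_blocks(void *out, const void *in, size_t nbytes)
{
    if (out != in)
        memcpy(out, in, nbytes);
}

static void cipher_via_aligned_scratch(unsigned char *out_arg,
                                       const unsigned char *in_arg,
                                       size_t nbytes)
{
    /* 16-byte aligned scratch buffer (GCC syntax). */
    unsigned char scratch[CHUNK] __attribute__((aligned(16)));
    size_t chunk, n;
    volatile unsigned char *p;

    while (nbytes) {
        chunk = nbytes < CHUNK ? nbytes : CHUNK;
        memcpy(scratch, in_arg, chunk);        /* realign the input  */
        process_blocks(scratch, scratch, chunk);
        memcpy(out_arg, scratch, chunk);       /* realign the output */
        in_arg  += chunk;
        out_arg += chunk;
        nbytes  -= chunk;
    }

    /* Wipe key and data remnants, as the engine does for its realign buffer. */
    for (p = scratch, n = sizeof(scratch); n--; )
        *p++ = 0;
}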