x86[_64] assembly pack: add optimized AES-NI OCB subroutines.
authorAndy Polyakov <appro@openssl.org>
Wed, 2 Dec 2015 13:27:23 +0000 (14:27 +0100)
committerAndy Polyakov <appro@openssl.org>
Thu, 10 Dec 2015 12:11:26 +0000 (13:11 +0100)
Reviewed-by: Richard Levitte <levitte@openssl.org>
crypto/aes/asm/aesni-x86.pl
crypto/aes/asm/aesni-x86_64.pl
crypto/evp/e_aes.c
crypto/modes/modes_lcl.h
crypto/modes/ocb128.c
include/openssl/modes.h
test/evptests.txt

index 9b2e37aafb1a05977a8675bccb3bf508ab9f49e4..536f0359f1a4f6a30414c0af036bbc0b49f86ec1 100644 (file)
 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
 
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt.
+
 ######################################################################
 # Current large-block performance in cycles per byte processed with
 # 128-bit key (less is better).
 #
-#              CBC en-/decrypt CTR     XTS     ECB
+#              CBC en-/decrypt CTR     XTS     ECB     OCB
 # Westmere     3.77/1.37       1.37    1.52    1.27
-# * Bridge     5.07/0.98       0.99    1.09    0.91
-# Haswell      4.44/0.80       0.97    1.03    0.72
-# Silvermont   5.77/3.56       3.67    4.03    3.46
-# Bulldozer    5.80/0.98       1.05    1.24    0.93
+# * Bridge     5.07/0.98       0.99    1.09    0.91    1.10
+# Haswell      4.44/0.80       0.97    1.03    0.72    0.76
+# Silvermont   5.77/3.56       3.67    4.03    3.46    4.03
+# Bulldozer    5.80/0.98       1.05    1.24    0.93    1.23
 
 $PREFIX="aesni";       # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
@@ -1831,6 +1835,877 @@ if ($PREFIX eq "aesni") {
        &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
 &function_end("aesni_xts_decrypt");
 }
+\f
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+#      const AES_KEY *key, unsigned int start_block_num,
+#      unsigned char offset_i[16], const unsigned char L_[][16],
+#      unsigned char checksum[16]);
+#
+{
+# offsets within stack frame
+my $checksum = 16*6;
+my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
+
+# reassigned registers
+my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
+# $l_, $block, $inp, $key are permanently allocated in registers;
+# the remaining non-volatile values are offloaded to the stack, where
+# they stay invariant once written.
+
+&function_begin("aesni_ocb_encrypt");
+       &mov    ($rounds,&wparam(5));           # &offset_i
+       &mov    ($rounds_,&wparam(7));          # &checksum
+
+       &mov    ($inp,&wparam(0));              # inp
+       &mov    ($out,&wparam(1));              # out
+       &mov    ($len,&wparam(2));              # blocks
+       &mov    ($key,&wparam(3));              # key
+       &movdqu ($rndkey0,&QWP(0,$rounds));     # load offset_i
+       &mov    ($block,&wparam(4));            # start_block_num
+       &movdqu ($rndkey1,&QWP(0,$rounds_));    # load checksum
+       &mov    ($l_,&wparam(6));               # L_ (takes over $rounds_, checksum is already loaded)
+
+       &mov    ($rounds,"esp");                # remember incoming %esp
+       &sub    ("esp",$esp_off+4);             # alloca
+       &and    ("esp",-16);                    # align stack
+
+       &sub    ($out,$inp);                    # keep out-inp, stores address ($out,$inp)
+       &shl    ($len,4);                       # blocks -> bytes
+       &lea    ($len,&DWP(-16*6,$inp,$len));   # end of input - 16*6
+       &mov    (&DWP($out_off,"esp"),$out);    # save out-inp
+       &mov    (&DWP($end_off,"esp"),$len);    # save loop bound
+       &mov    (&DWP($esp_off,"esp"),$rounds); # save incoming %esp
+
+       &mov    ($rounds,&DWP(240,$key));       # dword at key+240, used as rounds below
+
+       &test   ($block,1);                     # start_block_num odd?
+       &jnz    (&label("odd"));                # yes, skip the single-block step
+
+       &bsf            ($i3,$block);                   # lowest set bit of the even block number
+       &add            ($block,1);                     # block number is odd from here on
+       &shl            ($i3,4);                        # scale to 16-byte L_ entries
+       &movdqu         ($inout5,&QWP(0,$l_,$i3));      # L_ entry selected by bsf
+       &mov            ($i3,$key);                     # put aside key
+
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &lea            ($inp,&DWP(16,$inp));           # advance input by one block
+
+       &pxor           ($inout5,$rndkey0);             # ^ last offset_i
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &pxor           ($inout0,$inout5);              # ^ offset_i
+
+       &movdqa         ($inout4,$rndkey1);             # park checksum across the encryption
+       if ($inline)
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+
+       &xorps          ($inout0,$inout5);              # ^ offset_i
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &movdqa         ($rndkey1,$inout4);             # pass the checksum
+
+       &movups         (&QWP(-16,$out,$inp),$inout0);  # store output
+
+       &mov            ($rounds,&DWP(240,$i3));        # reload rounds via the put-aside key
+       &mov            ($key,$i3);                     # restore key
+       &mov            ($len,&DWP($end_off,"esp"));    # reload loop bound ($i3 shares $len's register)
+
+&set_label("odd");
+       &shl            ($rounds,4);                    # rounds*16
+       &mov            ($out,16);
+       &sub            ($out,$rounds);                 # twisted rounds
+       &mov            (&DWP($key_off,"esp"),$key);    # save original key pointer
+       &lea            ($key,&DWP(32,$key,$rounds));   # end of key schedule
+       &mov            (&DWP($rounds_off,"esp"),$out); # save twisted rounds (16-rounds*16)
+
+       &cmp            ($inp,$len);                    # $len is end of input - 16*6
+       &ja             (&label("short"));              # fewer than six blocks to process
+       &jmp            (&label("grandloop"));
+
+&set_label("grandloop",32);
+       # Main loop: six blocks per iteration, entered and repeated only
+       # while at least 16*6 bytes of input remain ($inp <= $len, where
+       # $len is "end of input - 16*6").
+       # $i1/$i3/$i5 share registers with $rounds/$len/$out (see the
+       # "reassigned registers" list), hence the reloads from the stack.
+       &lea            ($i1,&DWP(1,$block));
+       &lea            ($i3,&DWP(3,$block));
+       &lea            ($i5,&DWP(5,$block));
+       &add            ($block,6);
+       &bsf            ($i1,$i1);
+       &bsf            ($i3,$i3);
+       &bsf            ($i5,$i5);
+       &shl            ($i1,4);                        # scale to 16-byte L_ entries
+       &shl            ($i3,4);
+       &shl            ($i5,4);
+       &movdqu         ($inout0,&QWP(0,$l_));          # L_[0], reused for 1st, 3rd and 5th block
+       &movdqu         ($inout1,&QWP(0,$l_,$i1));
+       &mov            ($rounds,&DWP($rounds_off,"esp"));
+       &movdqa         ($inout2,$inout0);
+       &movdqu         ($inout3,&QWP(0,$l_,$i3));
+       &movdqa         ($inout4,$inout0);
+       &movdqu         ($inout5,&QWP(0,$l_,$i5));
+
+       &pxor           ($inout0,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout1,$inout0);              # each offset chains off the previous one
+       &movdqa         (&QWP(16*0,"esp"),$inout0);
+       &pxor           ($inout2,$inout1);
+       &movdqa         (&QWP(16*1,"esp"),$inout1);
+       &pxor           ($inout3,$inout2);
+       &movdqa         (&QWP(16*2,"esp"),$inout2);
+       &pxor           ($inout4,$inout3);
+       &movdqa         (&QWP(16*3,"esp"),$inout3);
+       &pxor           ($inout5,$inout4);
+       &movdqa         (&QWP(16*4,"esp"),$inout4);
+       &movdqa         (&QWP(16*5,"esp"),$inout5);
+
+       &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &movdqu         ($inout2,&QWP(16*2,$inp));
+       &movdqu         ($inout3,&QWP(16*3,$inp));
+       &movdqu         ($inout4,&QWP(16*4,$inp));
+       &movdqu         ($inout5,&QWP(16*5,$inp));
+       &lea            ($inp,&DWP(16*6,$inp));
+
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
+       &pxor           ($rndkey1,$inout1);
+       &pxor           ($inout1,$rndkey0);
+       &pxor           ($rndkey1,$inout2);
+       &pxor           ($inout2,$rndkey0);
+       &pxor           ($rndkey1,$inout3);
+       &pxor           ($inout3,$rndkey0);
+       &pxor           ($rndkey1,$inout4);
+       &pxor           ($inout4,$rndkey0);
+       &pxor           ($rndkey1,$inout5);
+       &pxor           ($inout5,$rndkey0);
+       &movdqa         (&QWP($checksum,"esp"),$rndkey1);       # $rndkey1 is reused for round keys
+
+       &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,&QWP(16*2,"esp"));
+       &pxor           ($inout3,&QWP(16*3,"esp"));
+       &pxor           ($inout4,&QWP(16*4,"esp"));
+       &pxor           ($inout5,&QWP(16*5,"esp"));
+
+       &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
+       &aesenc         ($inout0,$rndkey1);
+       &aesenc         ($inout1,$rndkey1);
+       &aesenc         ($inout2,$rndkey1);
+       &aesenc         ($inout3,$rndkey1);
+       &aesenc         ($inout4,$rndkey1);
+       &aesenc         ($inout5,$rndkey1);
+
+       &mov            ($out,&DWP($out_off,"esp"));
+       &mov            ($len,&DWP($end_off,"esp"));
+       &call           ("_aesni_encrypt6_enter");
+
+       &movdqa         ($rndkey0,&QWP(16*5,"esp"));    # pass last offset_i
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,&QWP(16*2,"esp"));
+       &pxor           ($inout3,&QWP(16*3,"esp"));
+       &pxor           ($inout4,&QWP(16*4,"esp"));
+       &pxor           ($inout5,$rndkey0);
+       &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+
+       &movdqu         (&QWP(-16*6,$out,$inp),$inout0);# store output
+       &movdqu         (&QWP(-16*5,$out,$inp),$inout1);
+       &movdqu         (&QWP(-16*4,$out,$inp),$inout2);
+       &movdqu         (&QWP(-16*3,$out,$inp),$inout3);
+       &movdqu         (&QWP(-16*2,$out,$inp),$inout4);
+       &movdqu         (&QWP(-16*1,$out,$inp),$inout5);
+       &cmp            ($inp,$len);                    # done yet?
+       # $inp == $len means exactly six blocks are left.  They must go
+       # through this loop again, because the tail code that follows
+       # processes at most five blocks and would leave the last one
+       # unencrypted.  Hence "below or equal", matching the entry test.
+       &jbe            (&label("grandloop"));
+
+&set_label("short");
+       &add            ($len,16*6);                    # back to end of input
+       &sub            ($len,$inp);                    # bytes still to process
+       &jz             (&label("done"));
+
+       &cmp            ($len,16*2);
+       &jb             (&label("one"));
+       &je             (&label("two"));
+
+       &cmp            ($len,16*4);
+       &jb             (&label("three"));
+       &je             (&label("four"));
+
+       &lea            ($i1,&DWP(1,$block));           # fall-through: process five blocks
+       &lea            ($i3,&DWP(3,$block));
+       &bsf            ($i1,$i1);
+       &bsf            ($i3,$i3);
+       &shl            ($i1,4);                        # scale to 16-byte L_ entries
+       &shl            ($i3,4);
+       &movdqu         ($inout0,&QWP(0,$l_));          # L_[0], reused for 1st, 3rd and 5th block
+       &movdqu         ($inout1,&QWP(0,$l_,$i1));
+       &mov            ($rounds,&DWP($rounds_off,"esp"));      # $i1 shared $rounds' register
+       &movdqa         ($inout2,$inout0);
+       &movdqu         ($inout3,&QWP(0,$l_,$i3));
+       &movdqa         ($inout4,$inout0);
+
+       &pxor           ($inout0,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout1,$inout0);
+       &movdqa         (&QWP(16*0,"esp"),$inout0);
+       &pxor           ($inout2,$inout1);
+       &movdqa         (&QWP(16*1,"esp"),$inout1);
+       &pxor           ($inout3,$inout2);
+       &movdqa         (&QWP(16*2,"esp"),$inout2);
+       &pxor           ($inout4,$inout3);
+       &movdqa         (&QWP(16*3,"esp"),$inout3);
+       &pxor           ($inout5,$inout4);              # NOTE(review): $inout5 is not loaded on this path and is zeroed below, looks dead
+       &movdqa         (&QWP(16*4,"esp"),$inout4);
+
+       &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &movdqu         ($inout2,&QWP(16*2,$inp));
+       &movdqu         ($inout3,&QWP(16*3,$inp));
+       &movdqu         ($inout4,&QWP(16*4,$inp));
+       &pxor           ($inout5,$inout5);              # no sixth block, zero the lane
+
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
+       &pxor           ($rndkey1,$inout1);
+       &pxor           ($inout1,$rndkey0);
+       &pxor           ($rndkey1,$inout2);
+       &pxor           ($inout2,$rndkey0);
+       &pxor           ($rndkey1,$inout3);
+       &pxor           ($inout3,$rndkey0);
+       &pxor           ($rndkey1,$inout4);
+       &pxor           ($inout4,$rndkey0);
+       &movdqa         (&QWP($checksum,"esp"),$rndkey1);       # $rndkey1 is reused for round keys
+
+       &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,&QWP(16*2,"esp"));
+       &pxor           ($inout3,&QWP(16*3,"esp"));
+       &pxor           ($inout4,&QWP(16*4,"esp"));
+
+       &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
+       &aesenc         ($inout0,$rndkey1);
+       &aesenc         ($inout1,$rndkey1);
+       &aesenc         ($inout2,$rndkey1);
+       &aesenc         ($inout3,$rndkey1);
+       &aesenc         ($inout4,$rndkey1);
+       &aesenc         ($inout5,$rndkey1);             # zeroed lane, result is never stored
+
+       &mov            ($out,&DWP($out_off,"esp"));    # out-inp
+       &call           ("_aesni_encrypt6_enter");
+
+       &movdqa         ($rndkey0,&QWP(16*4,"esp"));    # pass last offset_i
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,&QWP(16*2,"esp"));
+       &pxor           ($inout3,&QWP(16*3,"esp"));
+       &pxor           ($inout4,$rndkey0);
+       &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+
+       &movdqu         (&QWP(16*0,$out,$inp),$inout0); # store output
+       &movdqu         (&QWP(16*1,$out,$inp),$inout1);
+       &movdqu         (&QWP(16*2,$out,$inp),$inout2);
+       &movdqu         (&QWP(16*3,$out,$inp),$inout3);
+       &movdqu         (&QWP(16*4,$out,$inp),$inout4);
+
+       &jmp            (&label("done"));
+
+&set_label("one",16);
+       &movdqu         ($inout5,&QWP(0,$l_));          # L_[0]
+       &mov            ($key,&DWP($key_off,"esp"));    # restore key
+
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &mov            ($rounds,&DWP(240,$key));       # untwisted rounds for the one-block routine
+
+       &pxor           ($inout5,$rndkey0);             # ^ last offset_i
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &pxor           ($inout0,$inout5);              # ^ offset_i
+
+       &movdqa         ($inout4,$rndkey1);             # park checksum across the encryption
+       &mov            ($out,&DWP($out_off,"esp"));    # out-inp
+       if ($inline)
+       {   &aesni_inline_generate1("enc");     }
+       else
+       {   &call       ("_aesni_encrypt1");    }
+
+       &xorps          ($inout0,$inout5);              # ^ offset_i
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &movdqa         ($rndkey1,$inout4);             # pass the checksum
+       &movups         (&QWP(0,$out,$inp),$inout0);    # store output
+
+       &jmp            (&label("done"));
+
+&set_label("two",16);
+       # Tail case: exactly two blocks remain.  Offsets live in
+       # $inout4/$inout5 and the checksum is parked in $inout3 while
+       # _aesni_encrypt2 runs.
+       &lea            ($i1,&DWP(1,$block));
+       &mov            ($key,&DWP($key_off,"esp"));    # restore key
+       &bsf            ($i1,$i1);
+       &shl            ($i1,4);                        # scale to 16-byte L_ entries
+       &movdqu         ($inout4,&QWP(0,$l_));          # L_[0]
+       &movdqu         ($inout5,&QWP(0,$l_,$i1));
+
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &mov            ($rounds,&DWP(240,$key));       # $i1 shared $rounds' register
+
+       &pxor           ($inout4,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout5,$inout4);
+
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &pxor           ($inout0,$inout4);              # ^ offset_i
+       &pxor           ($rndkey1,$inout1);
+       &pxor           ($inout1,$inout5);
+
+       # The statement terminator was missing here, which made Perl
+       # parse this call and the next one as a single bitwise-AND
+       # expression.
+       &movdqa         ($inout3,$rndkey1);
+       &mov            ($out,&DWP($out_off,"esp"));
+       &call           ("_aesni_encrypt2");
+
+       &xorps          ($inout0,$inout4);              # ^ offset_i
+       &xorps          ($inout1,$inout5);
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &movdqa         ($rndkey1,$inout3);             # pass the checksum
+       &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
+       &movups         (&QWP(16*1,$out,$inp),$inout1);
+
+       &jmp            (&label("done"));
+
+&set_label("three",16);
+       &lea            ($i1,&DWP(1,$block));
+       &mov            ($key,&DWP($key_off,"esp"));    # restore key
+       &bsf            ($i1,$i1);
+       &shl            ($i1,4);                        # scale to 16-byte L_ entries
+       &movdqu         ($inout3,&QWP(0,$l_));          # L_[0], reused for 1st and 3rd block
+       &movdqu         ($inout4,&QWP(0,$l_,$i1));
+       &movdqa         ($inout5,$inout3);
+
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &movdqu         ($inout2,&QWP(16*2,$inp));
+       &mov            ($rounds,&DWP(240,$key));       # $i1 shared $rounds' register
+
+       &pxor           ($inout3,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout4,$inout3);
+       &pxor           ($inout5,$inout4);
+
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &pxor           ($inout0,$inout3);              # ^ offset_i
+       &pxor           ($rndkey1,$inout1);
+       &pxor           ($inout1,$inout4);
+       &pxor           ($rndkey1,$inout2);
+       &pxor           ($inout2,$inout5);
+
+       &movdqa         (&QWP($checksum,"esp"),$rndkey1);       # park checksum on stack across the call
+       &mov            ($out,&DWP($out_off,"esp"));    # out-inp
+       &call           ("_aesni_encrypt3");
+
+       &xorps          ($inout0,$inout3);              # ^ offset_i
+       &xorps          ($inout1,$inout4);
+       &xorps          ($inout2,$inout5);
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+       &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
+       &movups         (&QWP(16*1,$out,$inp),$inout1);
+       &movups         (&QWP(16*2,$out,$inp),$inout2);
+
+       &jmp            (&label("done"));
+
+&set_label("four",16);
+       # Tail case: exactly four blocks remain.  The first two offsets
+       # are spilled to the stack because $inout2/$inout3 are needed for
+       # input; the last two stay in $inout4/$inout5.  Falls through to
+       # "done".
+       &lea            ($i1,&DWP(1,$block));
+       &lea            ($i3,&DWP(3,$block));
+       &bsf            ($i1,$i1);
+       &bsf            ($i3,$i3);
+       &mov            ($key,&DWP($key_off,"esp"));    # restore key
+       &shl            ($i1,4);                        # scale to 16-byte L_ entries
+       &shl            ($i3,4);
+       &movdqu         ($inout2,&QWP(0,$l_));          # L_[0], reused for 1st and 3rd block
+       &movdqu         ($inout3,&QWP(0,$l_,$i1));
+       &movdqa         ($inout4,$inout2);
+       &movdqu         ($inout5,&QWP(0,$l_,$i3));
+
+       &pxor           ($inout2,$rndkey0);             # ^ last offset_i
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &pxor           ($inout3,$inout2);
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &pxor           ($inout4,$inout3);
+       &movdqa         (&QWP(16*0,"esp"),$inout2);
+       &pxor           ($inout5,$inout4);
+       &movdqa         (&QWP(16*1,"esp"),$inout3);
+       &movdqu         ($inout2,&QWP(16*2,$inp));
+       &movdqu         ($inout3,&QWP(16*3,$inp));
+       &mov            ($rounds,&DWP(240,$key));       # $i1 shared $rounds' register
+
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &pxor           ($rndkey1,$inout1);
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($rndkey1,$inout2);
+       &pxor           ($inout2,$inout4);
+       &pxor           ($rndkey1,$inout3);
+       &pxor           ($inout3,$inout5);
+
+       # The statement terminator was missing here, which made Perl
+       # parse this call and the next one as a single bitwise-AND
+       # expression.
+       &movdqa         (&QWP($checksum,"esp"),$rndkey1);
+       &mov            ($out,&DWP($out_off,"esp"));
+       &call           ("_aesni_encrypt4");
+
+       &xorps          ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &xorps          ($inout1,&QWP(16*1,"esp"));
+       &xorps          ($inout2,$inout4);
+       &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
+       &xorps          ($inout3,$inout5);
+       &movups         (&QWP(16*1,$out,$inp),$inout1);
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &movups         (&QWP(16*2,$out,$inp),$inout2);
+       &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+       &movups         (&QWP(16*3,$out,$inp),$inout3);
+
+
+&set_label("done");
+       &mov    ($key,&DWP($esp_off,"esp"));    # fetch incoming %esp saved in the prologue
+       &pxor   ($inout0,$inout0);              # clear register bank
+       &pxor   ($inout1,$inout1);
+       &movdqa (&QWP(16*0,"esp"),$inout0);     # clear stack
+       &pxor   ($inout2,$inout2);
+       &movdqa (&QWP(16*1,"esp"),$inout0);
+       &pxor   ($inout3,$inout3);
+       &movdqa (&QWP(16*2,"esp"),$inout0);
+       &pxor   ($inout4,$inout4);
+       &movdqa (&QWP(16*3,"esp"),$inout0);
+       &pxor   ($inout5,$inout5);
+       &movdqa (&QWP(16*4,"esp"),$inout0);
+       &movdqa (&QWP(16*5,"esp"),$inout0);
+       &movdqa (&QWP(16*6,"esp"),$inout0);     # this is the $checksum slot
+
+       &lea    ("esp",&DWP(0,$key));           # restore %esp
+       &mov    ($rounds,&wparam(5));           # &offset_i
+       &mov    ($rounds_,&wparam(7));          # &checksum
+       &movdqu (&QWP(0,$rounds),$rndkey0);     # write back last offset_i
+       &pxor   ($rndkey0,$rndkey0);            # and clear the register
+       &movdqu (&QWP(0,$rounds_),$rndkey1);    # write back checksum
+       &pxor   ($rndkey1,$rndkey1);            # and clear the register
+&function_end("aesni_ocb_encrypt");
+
+&function_begin("aesni_ocb_decrypt");
+       &mov    ($rounds,&wparam(5));           # &offset_i
+       &mov    ($rounds_,&wparam(7));          # &checksum
+
+       &mov    ($inp,&wparam(0));
+       &mov    ($out,&wparam(1));
+       &mov    ($len,&wparam(2));
+       &mov    ($key,&wparam(3));
+       &movdqu ($rndkey0,&QWP(0,$rounds));     # load offset_i
+       &mov    ($block,&wparam(4));            # start_block_num
+       &movdqu ($rndkey1,&QWP(0,$rounds_));    # load checksum
+       &mov    ($l_,&wparam(6));               # L_
+
+       &mov    ($rounds,"esp");
+       &sub    ("esp",$esp_off+4);             # alloca
+       &and    ("esp",-16);                    # align stack
+
+       &sub    ($out,$inp);
+       &shl    ($len,4);
+       &lea    ($len,&DWP(-16*6,$inp,$len));   # end of input - 16*6
+       &mov    (&DWP($out_off,"esp"),$out);
+       &mov    (&DWP($end_off,"esp"),$len);
+       &mov    (&DWP($esp_off,"esp"),$rounds);
+
+       &mov    ($rounds,&DWP(240,$key));
+
+       &test   ($block,1);
+       &jnz    (&label("odd"));
+
+       &bsf            ($i3,$block);
+       &add            ($block,1);
+       &shl            ($i3,4);
+       &movdqu         ($inout5,&QWP(0,$l_,$i3));
+       &mov            ($i3,$key);                     # put aside key
+
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &lea            ($inp,&DWP(16,$inp));
+
+       &pxor           ($inout5,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout0,$inout5);              # ^ offset_i
+
+       &movdqa         ($inout4,$rndkey1);
+       if ($inline)
+       {   &aesni_inline_generate1("dec");     }
+       else
+       {   &call       ("_aesni_decrypt1");    }
+
+       &xorps          ($inout0,$inout5);              # ^ offset_i
+       &movaps         ($rndkey1,$inout4);             # pass the checksum
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &xorps          ($rndkey1,$inout0);             # checksum
+       &movups         (&QWP(-16,$out,$inp),$inout0);  # store output
+
+       &mov            ($rounds,&DWP(240,$i3));
+       &mov            ($key,$i3);                     # restore key
+       &mov            ($len,&DWP($end_off,"esp"));
+
+&set_label("odd");
+       &shl            ($rounds,4);
+       &mov            ($out,16);
+       &sub            ($out,$rounds);                 # twisted rounds
+       &mov            (&DWP($key_off,"esp"),$key);
+       &lea            ($key,&DWP(32,$key,$rounds));   # end of key schedule
+       &mov            (&DWP($rounds_off,"esp"),$out);
+
+       &cmp            ($inp,$len);
+       &ja             (&label("short"));
+       &jmp            (&label("grandloop"));
+
+&set_label("grandloop",32);
+       # Main loop: six blocks per iteration, entered and repeated only
+       # while at least 16*6 bytes of input remain ($inp <= $len, where
+       # $len is "end of input - 16*6").  The checksum is accumulated
+       # over the decrypted output, after the call.
+       # $i1/$i3/$i5 share registers with $rounds/$len/$out (see the
+       # "reassigned registers" list), hence the reloads from the stack.
+       &lea            ($i1,&DWP(1,$block));
+       &lea            ($i3,&DWP(3,$block));
+       &lea            ($i5,&DWP(5,$block));
+       &add            ($block,6);
+       &bsf            ($i1,$i1);
+       &bsf            ($i3,$i3);
+       &bsf            ($i5,$i5);
+       &shl            ($i1,4);                        # scale to 16-byte L_ entries
+       &shl            ($i3,4);
+       &shl            ($i5,4);
+       &movdqu         ($inout0,&QWP(0,$l_));          # L_[0], reused for 1st, 3rd and 5th block
+       &movdqu         ($inout1,&QWP(0,$l_,$i1));
+       &mov            ($rounds,&DWP($rounds_off,"esp"));
+       &movdqa         ($inout2,$inout0);
+       &movdqu         ($inout3,&QWP(0,$l_,$i3));
+       &movdqa         ($inout4,$inout0);
+       &movdqu         ($inout5,&QWP(0,$l_,$i5));
+
+       &pxor           ($inout0,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout1,$inout0);              # each offset chains off the previous one
+       &movdqa         (&QWP(16*0,"esp"),$inout0);
+       &pxor           ($inout2,$inout1);
+       &movdqa         (&QWP(16*1,"esp"),$inout1);
+       &pxor           ($inout3,$inout2);
+       &movdqa         (&QWP(16*2,"esp"),$inout2);
+       &pxor           ($inout4,$inout3);
+       &movdqa         (&QWP(16*3,"esp"),$inout3);
+       &pxor           ($inout5,$inout4);
+       &movdqa         (&QWP(16*4,"esp"),$inout4);
+       &movdqa         (&QWP(16*5,"esp"),$inout5);
+
+       &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &movdqu         ($inout2,&QWP(16*2,$inp));
+       &movdqu         ($inout3,&QWP(16*3,$inp));
+       &movdqu         ($inout4,&QWP(16*4,$inp));
+       &movdqu         ($inout5,&QWP(16*5,$inp));
+       &lea            ($inp,&DWP(16*6,$inp));
+
+       &movdqa         (&QWP($checksum,"esp"),$rndkey1);       # $rndkey1 is reused for round keys
+       &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
+       &pxor           ($inout1,$rndkey0);
+       &pxor           ($inout2,$rndkey0);
+       &pxor           ($inout3,$rndkey0);
+       &pxor           ($inout4,$rndkey0);
+       &pxor           ($inout5,$rndkey0);
+
+       &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,&QWP(16*2,"esp"));
+       &pxor           ($inout3,&QWP(16*3,"esp"));
+       &pxor           ($inout4,&QWP(16*4,"esp"));
+       &pxor           ($inout5,&QWP(16*5,"esp"));
+
+       &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
+       &aesdec         ($inout0,$rndkey1);
+       &aesdec         ($inout1,$rndkey1);
+       &aesdec         ($inout2,$rndkey1);
+       &aesdec         ($inout3,$rndkey1);
+       &aesdec         ($inout4,$rndkey1);
+       &aesdec         ($inout5,$rndkey1);
+
+       &mov            ($out,&DWP($out_off,"esp"));
+       &mov            ($len,&DWP($end_off,"esp"));
+       &call           ("_aesni_decrypt6_enter");
+
+       &movdqa         ($rndkey0,&QWP(16*5,"esp"));    # pass last offset_i
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &movdqa         ($rndkey1,&QWP($checksum,"esp"));       # reload running checksum
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,&QWP(16*2,"esp"));
+       &pxor           ($inout3,&QWP(16*3,"esp"));
+       &pxor           ($inout4,&QWP(16*4,"esp"));
+       &pxor           ($inout5,$rndkey0);
+
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &movdqu         (&QWP(-16*6,$out,$inp),$inout0);# store output
+       &pxor           ($rndkey1,$inout1);
+       &movdqu         (&QWP(-16*5,$out,$inp),$inout1);
+       &pxor           ($rndkey1,$inout2);
+       &movdqu         (&QWP(-16*4,$out,$inp),$inout2);
+       &pxor           ($rndkey1,$inout3);
+       &movdqu         (&QWP(-16*3,$out,$inp),$inout3);
+       &pxor           ($rndkey1,$inout4);
+       &movdqu         (&QWP(-16*2,$out,$inp),$inout4);
+       &pxor           ($rndkey1,$inout5);
+       &movdqu         (&QWP(-16*1,$out,$inp),$inout5);
+       &cmp            ($inp,$len);                    # done yet?
+       # $inp == $len means exactly six blocks are left.  They must go
+       # through this loop again, because the tail code that follows
+       # processes at most five blocks and would leave the last one
+       # undecrypted.  Hence "below or equal", matching the entry test.
+       &jbe            (&label("grandloop"));
+
+&set_label("short");
+       &add            ($len,16*6);
+       &sub            ($len,$inp);
+       &jz             (&label("done"));
+
+       &cmp            ($len,16*2);
+       &jb             (&label("one"));
+       &je             (&label("two"));
+
+       &cmp            ($len,16*4);
+       &jb             (&label("three"));
+       &je             (&label("four"));
+
+       &lea            ($i1,&DWP(1,$block));
+       &lea            ($i3,&DWP(3,$block));
+       &bsf            ($i1,$i1);
+       &bsf            ($i3,$i3);
+       &shl            ($i1,4);
+       &shl            ($i3,4);
+       &movdqu         ($inout0,&QWP(0,$l_));
+       &movdqu         ($inout1,&QWP(0,$l_,$i1));
+       &mov            ($rounds,&DWP($rounds_off,"esp"));
+       &movdqa         ($inout2,$inout0);
+       &movdqu         ($inout3,&QWP(0,$l_,$i3));
+       &movdqa         ($inout4,$inout0);
+
+       &pxor           ($inout0,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout1,$inout0);
+       &movdqa         (&QWP(16*0,"esp"),$inout0);
+       &pxor           ($inout2,$inout1);
+       &movdqa         (&QWP(16*1,"esp"),$inout1);
+       &pxor           ($inout3,$inout2);
+       &movdqa         (&QWP(16*2,"esp"),$inout2);
+       &pxor           ($inout4,$inout3);
+       &movdqa         (&QWP(16*3,"esp"),$inout3);
+       &pxor           ($inout5,$inout4);
+       &movdqa         (&QWP(16*4,"esp"),$inout4);
+
+       &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &movdqu         ($inout2,&QWP(16*2,$inp));
+       &movdqu         ($inout3,&QWP(16*3,$inp));
+       &movdqu         ($inout4,&QWP(16*4,$inp));
+       &pxor           ($inout5,$inout5);
+
+       &movdqa         (&QWP($checksum,"esp"),$rndkey1);
+       &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
+       &pxor           ($inout1,$rndkey0);
+       &pxor           ($inout2,$rndkey0);
+       &pxor           ($inout3,$rndkey0);
+       &pxor           ($inout4,$rndkey0);
+
+       &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,&QWP(16*2,"esp"));
+       &pxor           ($inout3,&QWP(16*3,"esp"));
+       &pxor           ($inout4,&QWP(16*4,"esp"));
+
+       &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
+       &aesdec         ($inout0,$rndkey1);
+       &aesdec         ($inout1,$rndkey1);
+       &aesdec         ($inout2,$rndkey1);
+       &aesdec         ($inout3,$rndkey1);
+       &aesdec         ($inout4,$rndkey1);
+       &aesdec         ($inout5,$rndkey1);
+
+       &mov            ($out,&DWP($out_off,"esp"));
+       &call           ("_aesni_decrypt6_enter");
+
+       &movdqa         ($rndkey0,&QWP(16*4,"esp"));    # pass last offset_i
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &movdqa         ($rndkey1,&QWP($checksum,"esp"));
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,&QWP(16*2,"esp"));
+       &pxor           ($inout3,&QWP(16*3,"esp"));
+       &pxor           ($inout4,$rndkey0);
+
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &movdqu         (&QWP(16*0,$out,$inp),$inout0); # store output
+       &pxor           ($rndkey1,$inout1);
+       &movdqu         (&QWP(16*1,$out,$inp),$inout1);
+       &pxor           ($rndkey1,$inout2);
+       &movdqu         (&QWP(16*2,$out,$inp),$inout2);
+       &pxor           ($rndkey1,$inout3);
+       &movdqu         (&QWP(16*3,$out,$inp),$inout3);
+       &pxor           ($rndkey1,$inout4);
+       &movdqu         (&QWP(16*4,$out,$inp),$inout4);
+
+       &jmp            (&label("done"));
+
+&set_label("one",16);
+       &movdqu         ($inout5,&QWP(0,$l_));
+       &mov            ($key,&DWP($key_off,"esp"));    # restore key
+
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &mov            ($rounds,&DWP(240,$key));
+
+       &pxor           ($inout5,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout0,$inout5);              # ^ offset_i
+
+       &movdqa         ($inout4,$rndkey1);
+       &mov            ($out,&DWP($out_off,"esp"));
+       if ($inline)
+       {   &aesni_inline_generate1("dec");     }
+       else
+       {   &call       ("_aesni_decrypt1");    }
+
+       &xorps          ($inout0,$inout5);              # ^ offset_i
+       &movaps         ($rndkey1,$inout4);             # pass the checksum
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &xorps          ($rndkey1,$inout0);             # checksum
+       &movups         (&QWP(0,$out,$inp),$inout0);
+
+       &jmp            (&label("done"));
+
+&set_label("two",16);
+       &lea            ($i1,&DWP(1,$block));
+       &mov            ($key,&DWP($key_off,"esp"));    # restore key
+       &bsf            ($i1,$i1);
+       &shl            ($i1,4);
+       &movdqu         ($inout4,&QWP(0,$l_));
+       &movdqu         ($inout5,&QWP(0,$l_,$i1));
+
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &mov            ($rounds,&DWP(240,$key));
+
+       &movdqa         ($inout3,$rndkey1);
+       &pxor           ($inout4,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout5,$inout4);
+
+       &pxor           ($inout0,$inout4);              # ^ offset_i
+       &pxor           ($inout1,$inout5);
+
+       &mov            ($out,&DWP($out_off,"esp"));
+       &call           ("_aesni_decrypt2");
+
+       &xorps          ($inout0,$inout4);              # ^ offset_i
+       &xorps          ($inout1,$inout5);
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &xorps          ($inout3,$inout0);              # checksum
+       &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
+       &xorps          ($inout3,$inout1);
+       &movups         (&QWP(16*1,$out,$inp),$inout1);
+       &movaps         ($rndkey1,$inout3);             # pass the checksum
+
+       &jmp            (&label("done"));
+
+&set_label("three",16);
+       &lea            ($i1,&DWP(1,$block));
+       &mov            ($key,&DWP($key_off,"esp"));    # restore key
+       &bsf            ($i1,$i1);
+       &shl            ($i1,4);
+       &movdqu         ($inout3,&QWP(0,$l_));
+       &movdqu         ($inout4,&QWP(0,$l_,$i1));
+       &movdqa         ($inout5,$inout3);
+
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &movdqu         ($inout2,&QWP(16*2,$inp));
+       &mov            ($rounds,&DWP(240,$key));
+
+       &movdqa         (&QWP($checksum,"esp"),$rndkey1);
+       &pxor           ($inout3,$rndkey0);             # ^ last offset_i
+       &pxor           ($inout4,$inout3);
+       &pxor           ($inout5,$inout4);
+
+       &pxor           ($inout0,$inout3);              # ^ offset_i
+       &pxor           ($inout1,$inout4);
+       &pxor           ($inout2,$inout5);
+
+       &mov            ($out,&DWP($out_off,"esp"));
+       &call           ("_aesni_decrypt3");
+
+       &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+       &xorps          ($inout0,$inout3);              # ^ offset_i
+       &xorps          ($inout1,$inout4);
+       &xorps          ($inout2,$inout5);
+       &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &movups         (&QWP(16*1,$out,$inp),$inout1);
+       &pxor           ($rndkey1,$inout1);
+       &movups         (&QWP(16*2,$out,$inp),$inout2);
+       &pxor           ($rndkey1,$inout2);
+
+       &jmp            (&label("done"));
+
+&set_label("four",16);
+       &lea            ($i1,&DWP(1,$block));
+       &lea            ($i3,&DWP(3,$block));
+       &bsf            ($i1,$i1);
+       &bsf            ($i3,$i3);
+       &mov            ($key,&DWP($key_off,"esp"));    # restore key
+       &shl            ($i1,4);
+       &shl            ($i3,4);
+       &movdqu         ($inout2,&QWP(0,$l_));
+       &movdqu         ($inout3,&QWP(0,$l_,$i1));
+       &movdqa         ($inout4,$inout2);
+       &movdqu         ($inout5,&QWP(0,$l_,$i3));
+
+       &pxor           ($inout2,$rndkey0);             # ^ last offset_i
+       &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
+       &pxor           ($inout3,$inout2);
+       &movdqu         ($inout1,&QWP(16*1,$inp));
+       &pxor           ($inout4,$inout3);
+       &movdqa         (&QWP(16*0,"esp"),$inout2);
+       &pxor           ($inout5,$inout4);
+       &movdqa         (&QWP(16*1,"esp"),$inout3);
+       &movdqu         ($inout2,&QWP(16*2,$inp));
+       &movdqu         ($inout3,&QWP(16*3,$inp));
+       &mov            ($rounds,&DWP(240,$key));
+
+       &movdqa         (&QWP($checksum,"esp"),$rndkey1);
+       &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &pxor           ($inout1,&QWP(16*1,"esp"));
+       &pxor           ($inout2,$inout4);
+       &pxor           ($inout3,$inout5);
+
+       &mov            ($out,&DWP($out_off,"esp"));
+       &call           ("_aesni_decrypt4");
+
+       &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
+       &xorps          ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
+       &xorps          ($inout1,&QWP(16*1,"esp"));
+       &xorps          ($inout2,$inout4);
+       &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
+       &pxor           ($rndkey1,$inout0);             # checksum
+       &xorps          ($inout3,$inout5);
+       &movups         (&QWP(16*1,$out,$inp),$inout1);
+       &pxor           ($rndkey1,$inout1);
+       &movdqa         ($rndkey0,$inout5);             # pass last offset_i
+       &movups         (&QWP(16*2,$out,$inp),$inout2);
+       &pxor           ($rndkey1,$inout2);
+       &movups         (&QWP(16*3,$out,$inp),$inout3);
+       &pxor           ($rndkey1,$inout3);
+
+&set_label("done");
+       &mov    ($key,&DWP($esp_off,"esp"));
+       &pxor   ($inout0,$inout0);              # clear register bank
+       &pxor   ($inout1,$inout1);
+       &movdqa (&QWP(16*0,"esp"),$inout0);     # clear stack
+       &pxor   ($inout2,$inout2);
+       &movdqa (&QWP(16*1,"esp"),$inout0);
+       &pxor   ($inout3,$inout3);
+       &movdqa (&QWP(16*2,"esp"),$inout0);
+       &pxor   ($inout4,$inout4);
+       &movdqa (&QWP(16*3,"esp"),$inout0);
+       &pxor   ($inout5,$inout5);
+       &movdqa (&QWP(16*4,"esp"),$inout0);
+       &movdqa (&QWP(16*5,"esp"),$inout0);
+       &movdqa (&QWP(16*6,"esp"),$inout0);
+
+       &lea    ("esp",&DWP(0,$key));
+       &mov    ($rounds,&wparam(5));           # &offset_i
+       &mov    ($rounds_,&wparam(7));          # &checksum
+       &movdqu (&QWP(0,$rounds),$rndkey0);
+       &pxor   ($rndkey0,$rndkey0);
+       &movdqu (&QWP(0,$rounds_),$rndkey1);
+       &pxor   ($rndkey1,$rndkey1);
+&function_end("aesni_ocb_decrypt");
+}
 }
 \f
 ######################################################################
@@ -2419,7 +3294,7 @@ if ($PREFIX eq "aesni") {
        &pxor           ("xmm3","xmm3");
        &aesenclast     ("xmm2","xmm3");
 
-       &movdqa         ("xmm3","xmm1")
+       &movdqa         ("xmm3","xmm1");
        &pslldq         ("xmm1",4);
        &pxor           ("xmm3","xmm1");
        &pslldq         ("xmm1",4);
index 6037e9e76e3f9ae7afd8c9e33c3ffe02b792d89e..6e41a1ae86efab7cfcb78a35354ad5ac09dc35b9 100644 (file)
 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
 # in CTR mode AES instruction interleave factor was chosen to be 6x.
 
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
+# chosen to be 6x.
+
 ######################################################################
 # Current large-block performance in cycles per byte processed with
 # 128-bit key (less is better).
 #
-#              CBC en-/decrypt CTR     XTS     ECB
+#              CBC en-/decrypt CTR     XTS     ECB     OCB
 # Westmere     3.77/1.25       1.25    1.25    1.26
-# * Bridge     5.07/0.74       0.75    0.90    0.85
-# Haswell      4.44/0.63       0.63    0.73    0.63
+# * Bridge     5.07/0.74       0.75    0.90    0.85    0.98
+# Haswell      4.44/0.63       0.63    0.73    0.63    0.70
 # Skylake      2.62/0.63       0.63    0.63    0.63
-# Silvermont   5.75/3.54       3.56    4.12    3.87(*)
-# Bulldozer    5.77/0.70       0.72    0.90    0.70
+# Silvermont   5.75/3.54       3.56    4.12    3.87(*) 4.11
+# Bulldozer    5.77/0.70       0.72    0.90    0.70    0.95
 #
 # (*)  Atom Silvermont ECB result is suboptimal because of penalties
 #      incurred by operations on %xmm8-15. As ECB is not considered
@@ -2709,6 +2714,925 @@ $code.=<<___;
        ret
 .size  aesni_xts_decrypt,.-aesni_xts_decrypt
 ___
+}
+\f
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+#      const AES_KEY *key, size_t start_block_num,
+#      unsigned char offset_i[16], const unsigned char L_[][16],
+#      unsigned char checksum[16]);
+#
+{
+my @offset=map("%xmm$_",(10..15));                     # six per-block offset registers, %xmm10..%xmm15
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9");            # running checksum; round[0]^round[last]
+my ($block_num,$offset_p)=("%r8","%r9");               # 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp");               # 7th and 8th arguments, fetched from the stack
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14");             # ntz() of upcoming even block numbers, scaled to L_ byte offsets
+my $seventh_arg = $win64 ? 56 : 8;                     # displacement of the 7th argument from the entry stack pointer
+my $blocks = $len;                                     # the length register counts 16-byte blocks here
+
+$code.=<<___;
+.globl aesni_ocb_encrypt
+.type  aesni_ocb_encrypt,\@function,6
+.align 32
+aesni_ocb_encrypt:                             # OCB bulk encryption of whole blocks; offset_i and checksum are updated in place
+       lea     (%rsp),%rax                     # entry stack pointer, base for the 7th/8th arguments
+       push    %rbx                            # callee-saved registers that are used below
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+___
+$code.=<<___ if ($win64);
+       lea     -0xa0(%rsp),%rsp                # ten 16-byte save slots
+       movaps  %xmm6,0x00(%rsp)                # offload everything
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+       movaps  %xmm10,0x40(%rsp)
+       movaps  %xmm11,0x50(%rsp)
+       movaps  %xmm12,0x60(%rsp)
+       movaps  %xmm13,0x70(%rsp)
+       movaps  %xmm14,0x80(%rsp)
+       movaps  %xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+       mov     $seventh_arg(%rax),$L_p         # 7th argument
+       mov     $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+       mov     240($key),$rnds_                # round count kept at offset 240 of the key schedule
+       mov     $key,$key_
+       shl     \$4,$rnds_                      # scale to a byte offset into the schedule
+       $movkey ($key),$rndkey0l                # round[0]
+       $movkey 16($key,$rnds_),$rndkey1        # round[last]
+
+       movdqu  ($offset_p),@offset[5]          # load last offset_i
+       pxor    $rndkey1,$rndkey0l              # round[0] ^ round[last]
+       pxor    $rndkey1,@offset[5]             # offset_i ^ round[last]
+
+       mov     \$16+32,$rounds
+       lea     32($key_,$rnds_),$key           # end-relative pointer, indexed by the twisted counter
+       $movkey 16($key_),$rndkey1              # round[1]
+       sub     %r10,%rax                       # twisted $rounds
+       mov     %rax,%r10                       # backup twisted $rounds
+
+       movdqu  ($L_p),@offset[0]               # L_0 for all odd-numbered blocks
+       movdqu  ($checksum_p),$checksum         # load checksum
+
+       test    \$1,$block_num                  # is first block number odd?
+       jnz     .Locb_enc_odd
+
+       bsf     $block_num,$i1                  # ntz(block)
+       add     \$1,$block_num                  # following block number is odd
+       shl     \$4,$i1                         # ntz(block) -> table offset
+       movdqu  ($L_p,$i1),$inout5              # borrow
+       movdqu  ($inp),$inout0                  # load the single leading block
+       lea     16($inp),$inp
+
+       call    __ocb_encrypt1
+
+       movdqa  $inout5,@offset[5]              # becomes last offset_i
+       movups  $inout0,($out)                  # store output
+       lea     16($out),$out
+       sub     \$1,$blocks                     # NOTE(review): a zero block count is not handled on entry - confirm callers never pass 0
+       jz      .Locb_enc_done
+
+.Locb_enc_odd:
+       lea     1($block_num),$i1               # even-numbered blocks
+       lea     3($block_num),$i3
+       lea     5($block_num),$i5
+       lea     6($block_num),$block_num
+       bsf     $i1,$i1                         # ntz(block)
+       bsf     $i3,$i3
+       bsf     $i5,$i5
+       shl     \$4,$i1                         # ntz(block) -> table offset
+       shl     \$4,$i3
+       shl     \$4,$i5
+
+       sub     \$6,$blocks                     # bias the counter by one six-block batch
+       jc      .Locb_enc_short                 # fewer than six blocks left
+       jmp     .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+       movdqu  `16*0`($inp),$inout0            # load input
+       movdqu  `16*1`($inp),$inout1
+       movdqu  `16*2`($inp),$inout2
+       movdqu  `16*3`($inp),$inout3
+       movdqu  `16*4`($inp),$inout4
+       movdqu  `16*5`($inp),$inout5
+       lea     `16*6`($inp),$inp
+
+       call    __ocb_encrypt6
+
+       movups  $inout0,`16*0`($out)            # store output
+       movups  $inout1,`16*1`($out)
+       movups  $inout2,`16*2`($out)
+       movups  $inout3,`16*3`($out)
+       movups  $inout4,`16*4`($out)
+       movups  $inout5,`16*5`($out)
+       lea     `16*6`($out),$out
+       sub     \$6,$blocks
+       jnc     .Locb_enc_grandloop             # another full batch is available
+
+.Locb_enc_short:
+       add     \$6,$blocks                     # undo the bias, 0..5 blocks remain
+       jz      .Locb_enc_done
+
+       movdqu  `16*0`($inp),$inout0
+       cmp     \$2,$blocks
+       jb      .Locb_enc_one
+       movdqu  `16*1`($inp),$inout1
+       je      .Locb_enc_two                   # flags are still those of the cmp above
+
+       movdqu  `16*2`($inp),$inout2
+       cmp     \$4,$blocks
+       jb      .Locb_enc_three
+       movdqu  `16*3`($inp),$inout3
+       je      .Locb_enc_four                  # flags are still those of the cmp above
+
+       movdqu  `16*4`($inp),$inout4            # five blocks remain
+       pxor    $inout5,$inout5                 # zero sixth lane adds nothing to the checksum
+
+       call    __ocb_encrypt6
+
+       movdqa  @offset[4],@offset[5]           # offset of the fifth block is the last one
+       movups  $inout0,`16*0`($out)
+       movups  $inout1,`16*1`($out)
+       movups  $inout2,`16*2`($out)
+       movups  $inout3,`16*3`($out)
+       movups  $inout4,`16*4`($out)
+
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+       movdqa  @offset[0],$inout5              # borrow
+
+       call    __ocb_encrypt1
+
+       movdqa  $inout5,@offset[5]              # becomes last offset_i
+       movups  $inout0,`16*0`($out)
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+       pxor    $inout2,$inout2                 # zero lanes add nothing to the checksum
+       pxor    $inout3,$inout3
+
+       call    __ocb_encrypt4
+
+       movdqa  @offset[1],@offset[5]           # offset of the second block is the last one
+       movups  $inout0,`16*0`($out)
+       movups  $inout1,`16*1`($out)
+
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+       pxor    $inout3,$inout3                 # zero lane adds nothing to the checksum
+
+       call    __ocb_encrypt4
+
+       movdqa  @offset[2],@offset[5]           # offset of the third block is the last one
+       movups  $inout0,`16*0`($out)
+       movups  $inout1,`16*1`($out)
+       movups  $inout2,`16*2`($out)
+
+       jmp     .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+       call    __ocb_encrypt4
+
+       movdqa  @offset[3],@offset[5]           # offset of the fourth block is the last one
+       movups  $inout0,`16*0`($out)
+       movups  $inout1,`16*1`($out)
+       movups  $inout2,`16*2`($out)
+       movups  $inout3,`16*3`($out)
+
+.Locb_enc_done:
+       pxor    $rndkey0,@offset[5]             # "remove" round[last]
+       movdqu  $checksum,($checksum_p)         # store checksum
+       movdqu  @offset[5],($offset_p)          # store last offset_i
+
+       xorps   %xmm0,%xmm0                     # clear register bank
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+       movaps  0x00(%rsp),%xmm6
+       movaps  %xmm0,0x00(%rsp)                # clear stack
+       movaps  0x10(%rsp),%xmm7
+       movaps  %xmm0,0x10(%rsp)
+       movaps  0x20(%rsp),%xmm8
+       movaps  %xmm0,0x20(%rsp)
+       movaps  0x30(%rsp),%xmm9
+       movaps  %xmm0,0x30(%rsp)
+       movaps  0x40(%rsp),%xmm10
+       movaps  %xmm0,0x40(%rsp)
+       movaps  0x50(%rsp),%xmm11
+       movaps  %xmm0,0x50(%rsp)
+       movaps  0x60(%rsp),%xmm12
+       movaps  %xmm0,0x60(%rsp)
+       movaps  0x70(%rsp),%xmm13
+       movaps  %xmm0,0x70(%rsp)
+       movaps  0x80(%rsp),%xmm14
+       movaps  %xmm0,0x80(%rsp)
+       movaps  0x90(%rsp),%xmm15
+       movaps  %xmm0,0x90(%rsp)
+       lea     0xa0+0x28(%rsp),%rax            # save area plus five pushes, i.e. the entry stack pointer
+.Locb_enc_pop:
+       lea     0xa0(%rsp),%rsp                 # release the save area
+___
+$code.=<<___;
+       pop     %r14                            # restore callee-saved registers
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+.Locb_enc_epilogue:
+       ret
+.size  aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type  __ocb_encrypt6,\@abi-omnipotent
+.align 32
+__ocb_encrypt6:                                # six blocks in parallel; also prepares table offsets for the next batch
+        pxor           $rndkey0l,@offset[5]    # offset_i ^ round[0]
+        movdqu         ($L_p,$i1),@offset[1]   # L_ entry for the 2nd block
+        movdqa         @offset[0],@offset[2]   # L_0 for the 3rd block
+        movdqu         ($L_p,$i3),@offset[3]   # L_ entry for the 4th block
+        movdqa         @offset[0],@offset[4]   # L_0 for the 5th block
+        pxor           @offset[5],@offset[0]   # 1st offset = last offset ^ L_0
+        movdqu         ($L_p,$i5),@offset[5]   # L_ entry for the 6th block
+        pxor           @offset[0],@offset[1]   # every offset chains off the previous one
+       pxor            $inout0,$checksum       # accumulate checksum
+       pxor            @offset[0],$inout0      # input ^ round[0] ^ offset_i
+        pxor           @offset[1],@offset[2]
+       pxor            $inout1,$checksum
+       pxor            @offset[1],$inout1
+        pxor           @offset[2],@offset[3]
+       pxor            $inout2,$checksum
+       pxor            @offset[2],$inout2
+        pxor           @offset[3],@offset[4]
+       pxor            $inout3,$checksum
+       pxor            @offset[3],$inout3
+        pxor           @offset[4],@offset[5]
+       pxor            $inout4,$checksum
+       pxor            @offset[4],$inout4
+       pxor            $inout5,$checksum
+       pxor            @offset[5],$inout5
+       $movkey         32($key_),$rndkey0      # round[2]
+
+       lea             1($block_num),$i1       # even-numbered blocks
+       lea             3($block_num),$i3
+       lea             5($block_num),$i5
+       add             \$6,$block_num          # advance past this batch
+        pxor           $rndkey0l,@offset[0]    # offset_i ^ round[last]
+       bsf             $i1,$i1                 # ntz(block)
+       bsf             $i3,$i3
+       bsf             $i5,$i5
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+        pxor           $rndkey0l,@offset[1]
+        pxor           $rndkey0l,@offset[2]
+       aesenc          $rndkey1,$inout4
+        pxor           $rndkey0l,@offset[3]
+        pxor           $rndkey0l,@offset[4]
+       aesenc          $rndkey1,$inout5
+       $movkey         48($key_),$rndkey1      # round[3]
+        pxor           $rndkey0l,@offset[5]
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       $movkey         64($key_),$rndkey0      # round[4]
+       shl             \$4,$i1                 # ntz(block) -> table offset
+       shl             \$4,$i3
+       jmp             .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax               # two rounds per iteration, counter runs up to zero
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       aesenc          $rndkey0,$inout4
+       aesenc          $rndkey0,$inout5
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_enc_loop6         # flags are still those of the add above
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       aesenc          $rndkey1,$inout4
+       aesenc          $rndkey1,$inout5
+       $movkey         16($key_),$rndkey1      # round[1] again, ready for the next call
+       shl             \$4,$i5                 # ntz(block) -> table offset
+
+       aesenclast      @offset[0],$inout0
+       movdqu          ($L_p),@offset[0]       # L_0 for all odd-numbered blocks
+       mov             %r10,%rax               # restore twisted rounds
+       aesenclast      @offset[1],$inout1
+       aesenclast      @offset[2],$inout2
+       aesenclast      @offset[3],$inout3
+       aesenclast      @offset[4],$inout4
+       aesenclast      @offset[5],$inout5
+       ret
+.size  __ocb_encrypt6,.-__ocb_encrypt6
+
+.type  __ocb_encrypt4,\@abi-omnipotent
+.align 32
+__ocb_encrypt4:                                # four blocks in parallel; block number and table offsets are left untouched
+        pxor           $rndkey0l,@offset[5]    # offset_i ^ round[0]
+        movdqu         ($L_p,$i1),@offset[1]   # L_ entry for the 2nd block
+        movdqa         @offset[0],@offset[2]   # L_0 for the 3rd block
+        movdqu         ($L_p,$i3),@offset[3]   # L_ entry for the 4th block
+        pxor           @offset[5],@offset[0]   # 1st offset = last offset ^ L_0
+        pxor           @offset[0],@offset[1]   # every offset chains off the previous one
+       pxor            $inout0,$checksum       # accumulate checksum
+       pxor            @offset[0],$inout0      # input ^ round[0] ^ offset_i
+        pxor           @offset[1],@offset[2]
+       pxor            $inout1,$checksum
+       pxor            @offset[1],$inout1
+        pxor           @offset[2],@offset[3]
+       pxor            $inout2,$checksum
+       pxor            @offset[2],$inout2
+       pxor            $inout3,$checksum
+       pxor            @offset[3],$inout3
+       $movkey         32($key_),$rndkey0      # round[2]
+
+        pxor           $rndkey0l,@offset[0]    # offset_i ^ round[last]
+        pxor           $rndkey0l,@offset[1]
+        pxor           $rndkey0l,@offset[2]
+        pxor           $rndkey0l,@offset[3]
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       $movkey         48($key_),$rndkey1      # round[3]
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       $movkey         64($key_),$rndkey0      # round[4]
+       jmp             .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax               # two rounds per iteration, counter runs up to zero
+
+       aesenc          $rndkey0,$inout0
+       aesenc          $rndkey0,$inout1
+       aesenc          $rndkey0,$inout2
+       aesenc          $rndkey0,$inout3
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_enc_loop4         # flags are still those of the add above
+
+       aesenc          $rndkey1,$inout0
+       aesenc          $rndkey1,$inout1
+       aesenc          $rndkey1,$inout2
+       aesenc          $rndkey1,$inout3
+       $movkey         16($key_),$rndkey1      # round[1] again, same state as on entry
+       mov             %r10,%rax               # restore twisted rounds
+
+       aesenclast      @offset[0],$inout0
+       aesenclast      @offset[1],$inout1
+       aesenclast      @offset[2],$inout2
+       aesenclast      @offset[3],$inout3
+       ret
+.size  __ocb_encrypt4,.-__ocb_encrypt4
+
+.type  __ocb_encrypt1,\@abi-omnipotent
+.align 32
+__ocb_encrypt1:                                # one block; the caller passes the L_ entry in the borrowed sixth data register
+        pxor           @offset[5],$inout5      # offset_i
+        pxor           $rndkey0l,$inout5       # offset_i ^ round[0]
+       pxor            $inout0,$checksum       # accumulate checksum
+       pxor            $inout5,$inout0         # input ^ round[0] ^ offset_i
+       $movkey         32($key_),$rndkey0      # round[2]
+
+       aesenc          $rndkey1,$inout0
+       $movkey         48($key_),$rndkey1      # round[3]
+       pxor            $rndkey0l,$inout5       # offset_i ^ round[last]
+
+       aesenc          $rndkey0,$inout0
+       $movkey         64($key_),$rndkey0      # round[4]
+       jmp             .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+       aesenc          $rndkey1,$inout0
+       $movkey         ($key,%rax),$rndkey1
+       add             \$32,%rax               # two rounds per iteration, counter runs up to zero
+
+       aesenc          $rndkey0,$inout0
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_enc_loop1         # flags are still those of the add above
+
+       aesenc          $rndkey1,$inout0
+       $movkey         16($key_),$rndkey1      # redundant in tail
+       mov             %r10,%rax               # restore twisted rounds
+
+       aesenclast      $inout5,$inout0
+       ret
+.size  __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type  aesni_ocb_decrypt,\@function,6
+.align 32
+aesni_ocb_decrypt:                             # OCB bulk decryption of whole blocks; the checksum is taken over the decrypted output
+       lea     (%rsp),%rax                     # entry stack pointer, base for the 7th/8th arguments
+       push    %rbx                            # callee-saved registers that are used below
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+___
+$code.=<<___ if ($win64);
+       lea     -0xa0(%rsp),%rsp                # ten 16-byte save slots
+       movaps  %xmm6,0x00(%rsp)                # offload everything
+       movaps  %xmm7,0x10(%rsp)
+       movaps  %xmm8,0x20(%rsp)
+       movaps  %xmm9,0x30(%rsp)
+       movaps  %xmm10,0x40(%rsp)
+       movaps  %xmm11,0x50(%rsp)
+       movaps  %xmm12,0x60(%rsp)
+       movaps  %xmm13,0x70(%rsp)
+       movaps  %xmm14,0x80(%rsp)
+       movaps  %xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+       mov     $seventh_arg(%rax),$L_p         # 7th argument
+       mov     $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+       mov     240($key),$rnds_                # round count kept at offset 240 of the key schedule
+       mov     $key,$key_
+       shl     \$4,$rnds_                      # scale to a byte offset into the schedule
+       $movkey ($key),$rndkey0l                # round[0]
+       $movkey 16($key,$rnds_),$rndkey1        # round[last]
+
+       movdqu  ($offset_p),@offset[5]          # load last offset_i
+       pxor    $rndkey1,$rndkey0l              # round[0] ^ round[last]
+       pxor    $rndkey1,@offset[5]             # offset_i ^ round[last]
+
+       mov     \$16+32,$rounds
+       lea     32($key_,$rnds_),$key           # end-relative pointer, indexed by the twisted counter
+       $movkey 16($key_),$rndkey1              # round[1]
+       sub     %r10,%rax                       # twisted $rounds
+       mov     %rax,%r10                       # backup twisted $rounds
+
+       movdqu  ($L_p),@offset[0]               # L_0 for all odd-numbered blocks
+       movdqu  ($checksum_p),$checksum         # load checksum
+
+       test    \$1,$block_num                  # is first block number odd?
+       jnz     .Locb_dec_odd
+
+       bsf     $block_num,$i1                  # ntz(block)
+       add     \$1,$block_num                  # following block number is odd
+       shl     \$4,$i1                         # ntz(block) -> table offset
+       movdqu  ($L_p,$i1),$inout5              # borrow
+       movdqu  ($inp),$inout0                  # load the single leading block
+       lea     16($inp),$inp
+
+       call    __ocb_decrypt1
+
+       movdqa  $inout5,@offset[5]              # becomes last offset_i
+       movups  $inout0,($out)                  # store output
+       xorps   $inout0,$checksum               # accumulate checksum
+       lea     16($out),$out
+       sub     \$1,$blocks                     # NOTE(review): a zero block count is not handled on entry - confirm callers never pass 0
+       jz      .Locb_dec_done
+
+.Locb_dec_odd:
+       lea     1($block_num),$i1               # even-numbered blocks
+       lea     3($block_num),$i3
+       lea     5($block_num),$i5
+       lea     6($block_num),$block_num
+       bsf     $i1,$i1                         # ntz(block)
+       bsf     $i3,$i3
+       bsf     $i5,$i5
+       shl     \$4,$i1                         # ntz(block) -> table offset
+       shl     \$4,$i3
+       shl     \$4,$i5
+
+       sub     \$6,$blocks                     # bias the counter by one six-block batch
+       jc      .Locb_dec_short                 # fewer than six blocks left
+       jmp     .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+       movdqu  `16*0`($inp),$inout0            # load input
+       movdqu  `16*1`($inp),$inout1
+       movdqu  `16*2`($inp),$inout2
+       movdqu  `16*3`($inp),$inout3
+       movdqu  `16*4`($inp),$inout4
+       movdqu  `16*5`($inp),$inout5
+       lea     `16*6`($inp),$inp
+
+       call    __ocb_decrypt6
+
+       movups  $inout0,`16*0`($out)            # store output
+       pxor    $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       pxor    $inout1,$checksum
+       movups  $inout2,`16*2`($out)
+       pxor    $inout2,$checksum
+       movups  $inout3,`16*3`($out)
+       pxor    $inout3,$checksum
+       movups  $inout4,`16*4`($out)
+       pxor    $inout4,$checksum
+       movups  $inout5,`16*5`($out)
+       pxor    $inout5,$checksum
+       lea     `16*6`($out),$out
+       sub     \$6,$blocks
+       jnc     .Locb_dec_grandloop             # another full batch is available
+
+.Locb_dec_short:
+       add     \$6,$blocks                     # undo the bias, 0..5 blocks remain
+       jz      .Locb_dec_done
+
+       movdqu  `16*0`($inp),$inout0
+       cmp     \$2,$blocks
+       jb      .Locb_dec_one
+       movdqu  `16*1`($inp),$inout1
+       je      .Locb_dec_two                   # flags are still those of the cmp above
+
+       movdqu  `16*2`($inp),$inout2
+       cmp     \$4,$blocks
+       jb      .Locb_dec_three
+       movdqu  `16*3`($inp),$inout3
+       je      .Locb_dec_four                  # flags are still those of the cmp above
+
+       movdqu  `16*4`($inp),$inout4            # five blocks remain
+       pxor    $inout5,$inout5                 # unused sixth lane, its result is neither stored nor summed
+
+       call    __ocb_decrypt6
+
+       movdqa  @offset[4],@offset[5]           # offset of the fifth block is the last one
+       movups  $inout0,`16*0`($out)            # store output
+       pxor    $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       pxor    $inout1,$checksum
+       movups  $inout2,`16*2`($out)
+       pxor    $inout2,$checksum
+       movups  $inout3,`16*3`($out)
+       pxor    $inout3,$checksum
+       movups  $inout4,`16*4`($out)
+       pxor    $inout4,$checksum
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+       movdqa  @offset[0],$inout5              # borrow
+
+       call    __ocb_decrypt1
+
+       movdqa  $inout5,@offset[5]              # becomes last offset_i
+       movups  $inout0,`16*0`($out)            # store output
+       xorps   $inout0,$checksum               # accumulate checksum
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+       pxor    $inout2,$inout2                 # unused lanes, their results are neither stored nor summed
+       pxor    $inout3,$inout3
+
+       call    __ocb_decrypt4
+
+       movdqa  @offset[1],@offset[5]           # offset of the second block is the last one
+       movups  $inout0,`16*0`($out)            # store output
+       xorps   $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       xorps   $inout1,$checksum
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+       pxor    $inout3,$inout3                 # unused lane, its result is neither stored nor summed
+
+       call    __ocb_decrypt4
+
+       movdqa  @offset[2],@offset[5]           # offset of the third block is the last one
+       movups  $inout0,`16*0`($out)            # store output
+       xorps   $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       xorps   $inout1,$checksum
+       movups  $inout2,`16*2`($out)
+       xorps   $inout2,$checksum
+
+       jmp     .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+       call    __ocb_decrypt4
+
+       movdqa  @offset[3],@offset[5]           # offset of the fourth block is the last one
+       movups  $inout0,`16*0`($out)            # store output
+       pxor    $inout0,$checksum               # accumulate checksum
+       movups  $inout1,`16*1`($out)
+       pxor    $inout1,$checksum
+       movups  $inout2,`16*2`($out)
+       pxor    $inout2,$checksum
+       movups  $inout3,`16*3`($out)
+       pxor    $inout3,$checksum
+
+.Locb_dec_done:
+       pxor    $rndkey0,@offset[5]             # "remove" round[last]
+       movdqu  $checksum,($checksum_p)         # store checksum
+       movdqu  @offset[5],($offset_p)          # store last offset_i
+
+       xorps   %xmm0,%xmm0                     # clear register bank
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+       movaps  0x00(%rsp),%xmm6
+       movaps  %xmm0,0x00(%rsp)                # clear stack
+       movaps  0x10(%rsp),%xmm7
+       movaps  %xmm0,0x10(%rsp)
+       movaps  0x20(%rsp),%xmm8
+       movaps  %xmm0,0x20(%rsp)
+       movaps  0x30(%rsp),%xmm9
+       movaps  %xmm0,0x30(%rsp)
+       movaps  0x40(%rsp),%xmm10
+       movaps  %xmm0,0x40(%rsp)
+       movaps  0x50(%rsp),%xmm11
+       movaps  %xmm0,0x50(%rsp)
+       movaps  0x60(%rsp),%xmm12
+       movaps  %xmm0,0x60(%rsp)
+       movaps  0x70(%rsp),%xmm13
+       movaps  %xmm0,0x70(%rsp)
+       movaps  0x80(%rsp),%xmm14
+       movaps  %xmm0,0x80(%rsp)
+       movaps  0x90(%rsp),%xmm15
+       movaps  %xmm0,0x90(%rsp)
+       lea     0xa0+0x28(%rsp),%rax            # save area plus five pushes, i.e. the entry stack pointer
+.Locb_dec_pop:
+       lea     0xa0(%rsp),%rsp                 # release the save area
+___
+$code.=<<___;
+       pop     %r14                            # restore callee-saved registers
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+.Locb_dec_epilogue:
+       ret
+.size  aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type  __ocb_decrypt6,\@abi-omnipotent        # local helper: OCB-decrypts the six blocks held in inout0..inout5, in place
+.align 32
+__ocb_decrypt6:                                # also advances block_num by 6 and prepares i1/i3/i5 for the next call
+        pxor           $rndkey0l,@offset[5]    # offset_i ^ round[0]
+        movdqu         ($L_p,$i1),@offset[1]   # fetch L_ table entry at byte offset i1
+        movdqa         @offset[0],@offset[2]   # copy incoming offset[0] (presumably L_0, as reloaded before ret - confirm at call site)
+        movdqu         ($L_p,$i3),@offset[3]   # fetch L_ table entry at byte offset i3
+        movdqa         @offset[0],@offset[4]   # second copy of incoming offset[0]
+        pxor           @offset[5],@offset[0]   # offset for 1st block = incoming offset[0] ^ previous offset
+        movdqu         ($L_p,$i5),@offset[5]   # fetch L_ table entry at byte offset i5
+        pxor           @offset[0],@offset[1]   # offset for 2nd block chains off the 1st
+       pxor            @offset[0],$inout0      # input ^ round[0] ^ offset_i
+        pxor           @offset[1],@offset[2]   # 3rd offset chains off the 2nd
+       pxor            @offset[1],$inout1
+        pxor           @offset[2],@offset[3]   # 4th offset chains off the 3rd
+       pxor            @offset[2],$inout2
+        pxor           @offset[3],@offset[4]   # 5th offset chains off the 4th
+       pxor            @offset[3],$inout3
+        pxor           @offset[4],@offset[5]   # 6th offset chains off the 5th; caller keeps it as the running offset
+       pxor            @offset[4],$inout4
+       pxor            @offset[5],$inout5
+       $movkey         32($key_),$rndkey0      # next round key, 32 bytes into the schedule at key_
+
+       lea             1($block_num),$i1       # even-numbered blocks
+       lea             3($block_num),$i3
+       lea             5($block_num),$i5
+       add             \$6,$block_num          # six blocks consumed by this call
+        pxor           $rndkey0l,@offset[0]    # offset_i ^ round[last]
+       bsf             $i1,$i1                 # ntz(block)
+       bsf             $i3,$i3
+       bsf             $i5,$i5
+
+       aesdec          $rndkey1,$inout0        # first round; rndkey1 holds the key from 16(key_) on entry (see reload before ret)
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+        pxor           $rndkey0l,@offset[1]    # same offset_i ^ round[last] conversion for the remaining offsets,
+        pxor           $rndkey0l,@offset[2]    # interleaved with the AES rounds
+       aesdec          $rndkey1,$inout4
+        pxor           $rndkey0l,@offset[3]
+        pxor           $rndkey0l,@offset[4]
+       aesdec          $rndkey1,$inout5
+       $movkey         48($key_),$rndkey1
+        pxor           $rndkey0l,@offset[5]
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       aesdec          $rndkey0,$inout4
+       aesdec          $rndkey0,$inout5
+       $movkey         64($key_),$rndkey0
+       shl             \$4,$i1                 # ntz(block) -> table offset
+       shl             \$4,$i3
+       jmp             .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:                               # two AES rounds per iteration, rax steps by 32 until it reaches zero
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       aesdec          $rndkey1,$inout4
+       aesdec          $rndkey1,$inout5
+       $movkey         ($key,%rax),$rndkey1    # round keys are addressed relative to key with rax as running index
+       add             \$32,%rax               # sets the flags tested by jnz below
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       aesdec          $rndkey0,$inout4
+       aesdec          $rndkey0,$inout5
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_dec_loop6         # flags still come from the add; the SSE/AES instructions in between do not touch them
+
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       aesdec          $rndkey1,$inout4
+       aesdec          $rndkey1,$inout5
+       $movkey         16($key_),$rndkey1      # re-arm rndkey1 for the next call
+       shl             \$4,$i5                 # deferred ntz(block) -> table offset for i5
+
+       aesdeclast      @offset[0],$inout0      # last round uses offset_i ^ round[last] as its key, removing the offset
+       movdqu          ($L_p),@offset[0]       # L_0 for all odd-numbered blocks
+       mov             %r10,%rax               # restore twisted rounds
+       aesdeclast      @offset[1],$inout1
+       aesdeclast      @offset[2],$inout2
+       aesdeclast      @offset[3],$inout3
+       aesdeclast      @offset[4],$inout4
+       aesdeclast      @offset[5],$inout5
+       ret
+.size  __ocb_decrypt6,.-__ocb_decrypt6
+
+.type  __ocb_decrypt4,\@abi-omnipotent        # local helper: OCB-decrypts the four blocks held in inout0..inout3, in place
+.align 32
+__ocb_decrypt4:                                # tail variant: does not advance block_num or recompute i1/i3
+        pxor           $rndkey0l,@offset[5]    # offset_i ^ round[0]
+        movdqu         ($L_p,$i1),@offset[1]   # fetch L_ table entry at byte offset i1
+        movdqa         @offset[0],@offset[2]   # copy incoming offset[0] (presumably L_0 - confirm at call site)
+        movdqu         ($L_p,$i3),@offset[3]   # fetch L_ table entry at byte offset i3
+        pxor           @offset[5],@offset[0]   # offset for 1st block = incoming offset[0] ^ previous offset
+        pxor           @offset[0],@offset[1]   # offset for 2nd block chains off the 1st
+       pxor            @offset[0],$inout0      # input ^ round[0] ^ offset_i
+        pxor           @offset[1],@offset[2]   # 3rd offset chains off the 2nd
+       pxor            @offset[1],$inout1
+        pxor           @offset[2],@offset[3]   # 4th offset chains off the 3rd
+       pxor            @offset[2],$inout2
+       pxor            @offset[3],$inout3
+       $movkey         32($key_),$rndkey0      # next round key, 32 bytes into the schedule at key_
+
+        pxor           $rndkey0l,@offset[0]    # offset_i ^ round[last]
+        pxor           $rndkey0l,@offset[1]
+        pxor           $rndkey0l,@offset[2]
+        pxor           $rndkey0l,@offset[3]
+
+       aesdec          $rndkey1,$inout0        # first round; rndkey1 holds the key from 16(key_) on entry (see reload before ret)
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       $movkey         48($key_),$rndkey1
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       $movkey         64($key_),$rndkey0
+       jmp             .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:                               # two AES rounds per iteration, rax steps by 32 until it reaches zero
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       $movkey         ($key,%rax),$rndkey1    # round keys are addressed relative to key with rax as running index
+       add             \$32,%rax               # sets the flags tested by jnz below
+
+       aesdec          $rndkey0,$inout0
+       aesdec          $rndkey0,$inout1
+       aesdec          $rndkey0,$inout2
+       aesdec          $rndkey0,$inout3
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_dec_loop4         # flags still come from the add above
+
+       aesdec          $rndkey1,$inout0
+       aesdec          $rndkey1,$inout1
+       aesdec          $rndkey1,$inout2
+       aesdec          $rndkey1,$inout3
+       $movkey         16($key_),$rndkey1      # re-arm rndkey1 with the key at 16(key_)
+       mov             %r10,%rax               # restore twisted rounds
+
+       aesdeclast      @offset[0],$inout0      # last round uses offset_i ^ round[last] as its key, removing the offset
+       aesdeclast      @offset[1],$inout1
+       aesdeclast      @offset[2],$inout2
+       aesdeclast      @offset[3],$inout3
+       ret
+.size  __ocb_decrypt4,.-__ocb_decrypt4
+
+.type  __ocb_decrypt1,\@abi-omnipotent        # local helper: OCB-decrypts the single block held in inout0, in place
+.align 32
+__ocb_decrypt1:                                # inout5 serves as the offset register here (presumably preloaded with an L_ entry by the caller - confirm)
+        pxor           @offset[5],$inout5      # offset_i
+        pxor           $rndkey0l,$inout5       # offset_i ^ round[0]
+       pxor            $inout5,$inout0         # input ^ round[0] ^ offset_i
+       $movkey         32($key_),$rndkey0      # next round key, 32 bytes into the schedule at key_
+
+       aesdec          $rndkey1,$inout0        # first round; rndkey1 holds the key from 16(key_) on entry (see reload before ret)
+       $movkey         48($key_),$rndkey1
+       pxor            $rndkey0l,$inout5       # offset_i ^ round[last]
+
+       aesdec          $rndkey0,$inout0
+       $movkey         64($key_),$rndkey0
+       jmp             .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:                               # two AES rounds per iteration, rax steps by 32 until it reaches zero
+       aesdec          $rndkey1,$inout0
+       $movkey         ($key,%rax),$rndkey1    # round keys are addressed relative to key with rax as running index
+       add             \$32,%rax               # sets the flags tested by jnz below
+
+       aesdec          $rndkey0,$inout0
+       $movkey         -16($key,%rax),$rndkey0
+       jnz             .Locb_dec_loop1         # flags still come from the add above
+
+       aesdec          $rndkey1,$inout0
+       $movkey         16($key_),$rndkey1      # redundant in tail
+       mov             %r10,%rax               # restore twisted rounds
+
+       aesdeclast      $inout5,$inout0         # last round uses offset_i ^ round[last] as its key, removing the offset
+       ret
+.size  __ocb_decrypt1,.-__ocb_decrypt1
+___
 } }}
 \f
 ########################################################################
@@ -3820,6 +4744,65 @@ ctr_xts_se_handler:
 
        jmp     .Lcommon_rbp_tail
 .size  ctr_xts_se_handler,.-ctr_xts_se_handler
+
+.type  ocb_se_handler,\@abi-omnipotent        # Win64 SEH handler shared by aesni_ocb_encrypt and aesni_ocb_decrypt
+.align 16
+ocb_se_handler:                                # recovers xmm6-15 and rbx/rbp/r12-r14 into the context, then joins the common tail
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail       # fault before the body label: skip register recovery
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail       # fault at or past the epilogue label: skip register recovery
+
+       mov     8(%r11),%r10d           # HandlerData[2]
+       lea     (%rsi,%r10),%r10        # pop label
+       cmp     %r10,%rbx               # context->Rip>=pop label
+       jae     .Locb_no_xmm            # xmm registers already reloaded; rax was set by the lea just before the pop label
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       lea     (%rax),%rsi             # %xmm save area
+       lea     512($context),%rdi      # & context.Xmm6
+       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0xa0+0x28(%rax),%rax    # step over the 0xa0-byte xmm area and the five pushed registers (0x28 bytes)
+
+.Locb_no_xmm:                          # rax points just above the five pushed general-purpose registers
+       mov     -8(%rax),%rbx
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+
+       jmp     .Lcommon_seh_tail
+.size  ocb_se_handler,.-ocb_se_handler
 ___
 $code.=<<___;
 .type  cbc_se_handler,\@abi-omnipotent
@@ -3933,6 +4916,14 @@ $code.=<<___ if ($PREFIX eq "aesni");
        .rva    .LSEH_begin_aesni_xts_decrypt
        .rva    .LSEH_end_aesni_xts_decrypt
        .rva    .LSEH_info_xts_dec
+
+       .rva    .LSEH_begin_aesni_ocb_encrypt
+       .rva    .LSEH_end_aesni_ocb_encrypt
+       .rva    .LSEH_info_ocb_enc
+
+       .rva    .LSEH_begin_aesni_ocb_decrypt
+       .rva    .LSEH_end_aesni_ocb_decrypt
+       .rva    .LSEH_info_ocb_dec
 ___
 $code.=<<___;
        .rva    .LSEH_begin_${PREFIX}_cbc_encrypt
@@ -3974,6 +4965,18 @@ $code.=<<___ if ($PREFIX eq "aesni");
        .byte   9,0,0,0
        .rva    ctr_xts_se_handler
        .rva    .Lxts_dec_body,.Lxts_dec_epilogue       # HandlerData[]
+.LSEH_info_ocb_enc:                           # unwind info record for aesni_ocb_encrypt
+       .byte   9,0,0,0                 # UNWIND_INFO header: version 1 with the exception-handler flag, no prolog bytes or unwind codes
+       .rva    ocb_se_handler          # language-specific handler
+       .rva    .Locb_enc_body,.Locb_enc_epilogue       # HandlerData[]
+       .rva    .Locb_enc_pop           # HandlerData[2]: the pop label tested by ocb_se_handler
+       .long   0                       # NOTE(review): presumably padding - confirm
+.LSEH_info_ocb_dec:                           # unwind info record for aesni_ocb_decrypt
+       .byte   9,0,0,0                 # UNWIND_INFO header: version 1 with the exception-handler flag, no prolog bytes or unwind codes
+       .rva    ocb_se_handler          # language-specific handler
+       .rva    .Locb_dec_body,.Locb_dec_epilogue       # HandlerData[]
+       .rva    .Locb_dec_pop           # HandlerData[2]: the pop label tested by ocb_se_handler
+       .long   0                       # NOTE(review): presumably padding - confirm
 ___
 $code.=<<___;
 .LSEH_info_cbc:
index efa724a36339bf0df5c1e05425b76bb21c12bf54..c356c9a035ce27e1d853b50347174841446a38de 100644 (file)
@@ -461,6 +461,19 @@ static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
                             const unsigned char *in, size_t len);
 
 #  ifndef OPENSSL_NO_OCB
+void aesni_ocb_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t blocks, const void *key,
+                       size_t start_block_num,
+                       unsigned char offset_i[16],
+                       const unsigned char L_[][16],
+                       unsigned char checksum[16]);
+void aesni_ocb_decrypt(const unsigned char *in, unsigned char *out,
+                       size_t blocks, const void *key,
+                       size_t start_block_num,
+                       unsigned char offset_i[16],
+                       const unsigned char L_[][16],
+                       unsigned char checksum[16]);
+
 static int aesni_ocb_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                               const unsigned char *iv, int enc)
 {
@@ -479,7 +492,9 @@ static int aesni_ocb_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
             if (!CRYPTO_ocb128_init(&octx->ocb,
                                     &octx->ksenc.ks, &octx->ksdec.ks,
                                     (block128_f) aesni_encrypt,
-                                    (block128_f) aesni_decrypt))
+                                    (block128_f) aesni_decrypt,
+                                    enc ? aesni_ocb_encrypt
+                                        : aesni_ocb_decrypt))
                 return 0;
         }
         while (0);
@@ -2348,7 +2363,8 @@ static int aes_ocb_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                 if (!CRYPTO_ocb128_init(&octx->ocb,
                                         &octx->ksenc.ks, &octx->ksdec.ks,
                                         (block128_f) vpaes_encrypt,
-                                        (block128_f) vpaes_decrypt))
+                                        (block128_f) vpaes_decrypt,
+                                        NULL))
                     return 0;
                 break;
             }
@@ -2358,7 +2374,8 @@ static int aes_ocb_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
             if (!CRYPTO_ocb128_init(&octx->ocb,
                                     &octx->ksenc.ks, &octx->ksdec.ks,
                                     (block128_f) AES_encrypt,
-                                    (block128_f) AES_decrypt))
+                                    (block128_f) AES_decrypt,
+                                    NULL))
                 return 0;
         }
         while (0);
index 2f61afe5dcdc88781a04c1f9ce179fb0169f985c..071b01465e9358b0158f352f790cda8d50db91e6 100644 (file)
@@ -164,6 +164,7 @@ struct ocb128_context {
     block128_f decrypt;
     void *keyenc;
     void *keydec;
+    ocb128_f stream;    /* direction dependent */
     /* Key dependent variables. Can be reused if key remains the same */
     size_t l_index;
     size_t max_l_index;
index 3a3f7a8d939e3c9c911dabd0010f1eb18a13cb26..c3daf7cd6ebaacce82823ed26f9be7b9e082864c 100644 (file)
@@ -159,7 +159,7 @@ static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
         ctx->max_l_index += (idx - ctx->max_l_index + 4) & ~3;
         ctx->l =
             OPENSSL_realloc(ctx->l, ctx->max_l_index * sizeof(OCB_BLOCK));
-        if (!ctx->l)
+        if (ctx->l == NULL)
             return NULL;
     }
     while (l_index < idx) {
@@ -171,35 +171,19 @@ static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
     return ctx->l + idx;
 }
 
-/*
- * Encrypt a block from |in| and store the result in |out|
- */
-static void ocb_encrypt(OCB128_CONTEXT *ctx, OCB_BLOCK *in, OCB_BLOCK *out,
-                        void *keyenc)
-{
-    ctx->encrypt(in->c, out->c, keyenc);
-}
-
-/*
- * Decrypt a block from |in| and store the result in |out|
- */
-static void ocb_decrypt(OCB128_CONTEXT *ctx, OCB_BLOCK *in, OCB_BLOCK *out,
-                        void *keydec)
-{
-    ctx->decrypt(in->c, out->c, keydec);
-}
-
 /*
  * Create a new OCB128_CONTEXT
  */
 OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
-                                  block128_f encrypt, block128_f decrypt)
+                                  block128_f encrypt, block128_f decrypt,
+                                  ocb128_f stream)
 {
     OCB128_CONTEXT *octx;
     int ret;
 
     if ((octx = OPENSSL_malloc(sizeof(*octx))) != NULL) {
-        ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt);
+        ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt,
+                                 stream);
         if (ret)
             return octx;
         OPENSSL_free(octx);
@@ -212,7 +196,8 @@ OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
  * Initialise an existing OCB128_CONTEXT
  */
 int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
-                       block128_f encrypt, block128_f decrypt)
+                       block128_f encrypt, block128_f decrypt,
+                       ocb128_f stream)
 {
     memset(ctx, 0, sizeof(*ctx));
     ctx->l_index = 0;
@@ -228,11 +213,12 @@ int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
      */
     ctx->encrypt = encrypt;
     ctx->decrypt = decrypt;
+    ctx->stream = stream;
     ctx->keyenc = keyenc;
     ctx->keydec = keydec;
 
     /* L_* = ENCIPHER(K, zeros(128)) */
-    ocb_encrypt(ctx, &ctx->l_star, &ctx->l_star, ctx->keyenc);
+    ctx->encrypt(ctx->l_star.c, ctx->l_star.c, ctx->keyenc);
 
     /* L_$ = double(L_*) */
     ocb_double(&ctx->l_star, &ctx->l_dollar);
@@ -324,11 +310,10 @@ int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
 int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
                       size_t len)
 {
-    u64 all_num_blocks, num_blocks;
-    u64 i;
+    u64 i, all_num_blocks;
+    size_t num_blocks, last_len;
     OCB_BLOCK tmp1;
     OCB_BLOCK tmp2;
-    int last_len;
 
     /* Calculate the number of blocks of AAD provided now, and so far */
     num_blocks = len / 16;
@@ -341,14 +326,14 @@ int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
 
         /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
         lookup = ocb_lookup_l(ctx, ocb_ntz(i));
-        if (!lookup)
+        if (lookup == NULL)
             return 0;
         ocb_block16_xor(&ctx->offset_aad, lookup, &ctx->offset_aad);
 
         /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
         aad_block = (OCB_BLOCK *)(aad + ((i - ctx->blocks_hashed - 1) * 16));
         ocb_block16_xor(&ctx->offset_aad, aad_block, &tmp1);
-        ocb_encrypt(ctx, &tmp1, &tmp2, ctx->keyenc);
+        ctx->encrypt(tmp1.c, tmp2.c, ctx->keyenc);
         ocb_block16_xor(&ctx->sum, &tmp2, &ctx->sum);
     }
 
@@ -369,7 +354,7 @@ int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
         ocb_block16_xor(&ctx->offset_aad, &tmp1, &tmp2);
 
         /* Sum = Sum_m xor ENCIPHER(K, CipherInput) */
-        ocb_encrypt(ctx, &tmp2, &tmp1, ctx->keyenc);
+        ctx->encrypt(tmp2.c, tmp1.c, ctx->keyenc);
         ocb_block16_xor(&ctx->sum, &tmp1, &ctx->sum);
     }
 
@@ -386,12 +371,11 @@ int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
                           const unsigned char *in, unsigned char *out,
                           size_t len)
 {
-    u64 i;
-    u64 all_num_blocks, num_blocks;
+    u64 i, all_num_blocks;
+    size_t num_blocks, last_len;
     OCB_BLOCK tmp1;
     OCB_BLOCK tmp2;
     OCB_BLOCK pad;
-    int last_len;
 
     /*
      * Calculate the number of blocks of data to be encrypted provided now, and
@@ -400,28 +384,46 @@ int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
     num_blocks = len / 16;
     all_num_blocks = num_blocks + ctx->blocks_processed;
 
-    /* Loop through all full blocks to be encrypted */
-    for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
-        OCB_BLOCK *lookup;
-        OCB_BLOCK *inblock;
-        OCB_BLOCK *outblock;
+    if (num_blocks && all_num_blocks == (size_t)all_num_blocks
+        && ctx->stream != NULL) {
+        size_t max_idx = 0, top = (size_t)all_num_blocks;
 
-        /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-        lookup = ocb_lookup_l(ctx, ocb_ntz(i));
-        if (!lookup)
+        /*
+         * See how many L_{i} entries we need to process data at hand
+         * and pre-compute missing entries in the table [if any]...
+         */
+        while (top >>= 1)
+            max_idx++;
+        if (ocb_lookup_l(ctx, max_idx) == NULL)
             return 0;
-        ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
-
-        /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
-        inblock = (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
-        ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
-        /* Checksum_i = Checksum_{i-1} xor P_i */
-        ocb_block16_xor_misaligned(&ctx->checksum, inblock, &ctx->checksum);
-        ocb_encrypt(ctx, &tmp1, &tmp2, ctx->keyenc);
-        outblock =
-            (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
-        ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
 
+        ctx->stream(in, out, num_blocks, ctx->keyenc,
+                    (size_t)ctx->blocks_processed + 1, ctx->offset.c,
+                    (const unsigned char (*)[16])ctx->l, ctx->checksum.c);
+    } else {
+        /* Loop through all full blocks to be encrypted */
+        for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
+            OCB_BLOCK *lookup;
+            OCB_BLOCK *inblock;
+            OCB_BLOCK *outblock;
+
+            /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+            lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+            if (lookup == NULL)
+                return 0;
+            ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
+
+            /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+            inblock =
+               (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
+            ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
+            /* Checksum_i = Checksum_{i-1} xor P_i */
+            ocb_block16_xor_misaligned(&ctx->checksum, inblock, &ctx->checksum);
+            ctx->encrypt(tmp1.c, tmp2.c, ctx->keyenc);
+            outblock =
+                (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
+            ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
+        }
     }
 
     /*
@@ -435,7 +437,7 @@ int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
         ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset);
 
         /* Pad = ENCIPHER(K, Offset_*) */
-        ocb_encrypt(ctx, &ctx->offset, &pad, ctx->keyenc);
+        ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc);
 
         /* C_* = P_* xor Pad[1..bitlen(P_*)] */
         ocb_block_xor(in + (len / 16) * 16, (unsigned char *)&pad, last_len,
@@ -461,12 +463,12 @@ int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
                           const unsigned char *in, unsigned char *out,
                           size_t len)
 {
-    u64 i;
-    u64 all_num_blocks, num_blocks;
+    u64 i, all_num_blocks;
+    size_t num_blocks, last_len;
     OCB_BLOCK tmp1;
     OCB_BLOCK tmp2;
     OCB_BLOCK pad;
-    int last_len;
+
     /*
      * Calculate the number of blocks of data to be decrypted provided now, and
      * so far
@@ -474,27 +476,46 @@ int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
     num_blocks = len / 16;
     all_num_blocks = num_blocks + ctx->blocks_processed;
 
-    /* Loop through all full blocks to be decrypted */
-    for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
-        OCB_BLOCK *inblock;
-        OCB_BLOCK *outblock;
+    if (num_blocks && all_num_blocks == (size_t)all_num_blocks
+        && ctx->stream != NULL) {
+        size_t max_idx = 0, top = (size_t)all_num_blocks;
 
-        /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-        OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
-        if (!lookup)
+        /*
+         * See how many L_{i} entries we need to process data at hand
+         * and pre-compute missing entries in the table [if any]...
+         */
+        while (top >>= 1)
+            max_idx++;
+        if (ocb_lookup_l(ctx, max_idx) == NULL)
             return 0;
-        ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
-
-        /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
-        inblock = (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
-        ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
-        ocb_decrypt(ctx, &tmp1, &tmp2, ctx->keydec);
-        outblock =
-            (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
-        ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
-
-        /* Checksum_i = Checksum_{i-1} xor P_i */
-        ocb_block16_xor_misaligned(&ctx->checksum, outblock, &ctx->checksum);
+
+        ctx->stream(in, out, num_blocks, ctx->keydec,
+                    (size_t)ctx->blocks_processed + 1, ctx->offset.c,
+                    (const unsigned char (*)[16])ctx->l, ctx->checksum.c);
+    } else {
+        /* Loop through all full blocks to be decrypted */
+        for (i = ctx->blocks_processed + 1; i <= all_num_blocks; i++) {
+            OCB_BLOCK *inblock;
+            OCB_BLOCK *outblock;
+
+            /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+            OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
+            if (lookup == NULL)
+                return 0;
+            ocb_block16_xor(&ctx->offset, lookup, &ctx->offset);
+
+            /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+            inblock =
+                (OCB_BLOCK *)(in + ((i - ctx->blocks_processed - 1) * 16));
+            ocb_block16_xor_misaligned(&ctx->offset, inblock, &tmp1);
+            ctx->decrypt(tmp1.c, tmp2.c, ctx->keydec);
+            outblock =
+                (OCB_BLOCK *)(out + ((i - ctx->blocks_processed - 1) * 16));
+            ocb_block16_xor_misaligned(&ctx->offset, &tmp2, outblock);
+
+            /* Checksum_i = Checksum_{i-1} xor P_i */
+            ocb_block16_xor_misaligned(&ctx->checksum, outblock, &ctx->checksum);
+        }
     }
 
     /*
@@ -508,7 +529,7 @@ int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
         ocb_block16_xor(&ctx->offset, &ctx->l_star, &ctx->offset);
 
         /* Pad = ENCIPHER(K, Offset_*) */
-        ocb_encrypt(ctx, &ctx->offset, &pad, ctx->keyenc);
+        ctx->encrypt(ctx->offset.c, pad.c, ctx->keyenc);
 
         /* P_* = C_* xor Pad[1..bitlen(C_*)] */
         ocb_block_xor(in + (len / 16) * 16, (unsigned char *)&pad, last_len,
@@ -539,7 +560,7 @@ int CRYPTO_ocb128_finish(OCB128_CONTEXT *ctx, const unsigned char *tag,
      */
     ocb_block16_xor(&ctx->checksum, &ctx->offset, &tmp1);
     ocb_block16_xor(&tmp1, &ctx->l_dollar, &tmp2);
-    ocb_encrypt(ctx, &tmp2, &tmp1, ctx->keyenc);
+    ctx->encrypt(tmp2.c, tmp1.c, ctx->keyenc);
     ocb_block16_xor(&tmp1, &ctx->sum, &ctx->tag);
 
     if (len > 16 || len < 1) {
index f5767f56d0605a86b49d8f80a6a163ebb48c67ec..11bbb68356250b2d4a47455b045106149cf55a67 100644 (file)
@@ -167,10 +167,19 @@ size_t CRYPTO_128_unwrap_pad(void *key, const unsigned char *icv,
 #ifndef OPENSSL_NO_OCB
 typedef struct ocb128_context OCB128_CONTEXT;
 
+typedef void (*ocb128_f) (const unsigned char *in, unsigned char *out, /* bulk OCB routine: input and output buffers */
+                          size_t blocks, const void *key, /* count of full 16-byte blocks; cipher key schedule */
+                          size_t start_block_num, /* 1-based index of the first block (callers pass blocks_processed + 1) */
+                          unsigned char offset_i[16], /* running Offset; the AES-NI routine stores the last offset back here */
+                          const unsigned char L_[][16], /* table of L_{i} values, looked up by ntz(block index) */
+                          unsigned char checksum[16]); /* running Checksum; the AES-NI routine stores the updated value back here */
+
 OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
-                                  block128_f encrypt, block128_f decrypt);
+                                  block128_f encrypt, block128_f decrypt,
+                                  ocb128_f stream);
 int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
-                       block128_f encrypt, block128_f decrypt);
+                       block128_f encrypt, block128_f decrypt,
+                       ocb128_f stream);
 int CRYPTO_ocb128_copy_ctx(OCB128_CONTEXT *dest, OCB128_CONTEXT *src,
                            void *keyenc, void *keydec);
 int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
index 8bdca594ffebaf658e0febc15886b22340153847..99ffe60c73f53d53ef776c3215f0307508885b32 100644 (file)
@@ -1854,6 +1854,46 @@ Tag = 1ad62009901f40cba7cd7156f94a7324
 Plaintext = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F2021222324252627
 Ciphertext = 5e2fa7367ffbdb3938845cfd415fcc71ec79634eb31451609d27505f5e2978f43c44213d8fa441ee
 
+Cipher = aes-128-ocb
+Key = 000102030405060708090A0B0C0D0E0F
+IV = 000102030405060708090A0B
+AAD = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F2021222324252627
+Tag = C203F98CE28F7DAD3F31C021
+Plaintext = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F3031
+Ciphertext = 09A4FD29DE949D9A9AA9924248422097AD4883B4713E6C214FF6567ADA08A967B2176C12F110DD441B7CAA3A509B13C822D6
+
+Cipher = aes-128-ocb
+Key = 000102030405060708090A0B0C0D0E0F
+IV = 000102030405060708090A0B
+AAD = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F2021222324252627
+Tag = 8346D7D47C5D893ED472F5AB
+Plaintext = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F4041
+Ciphertext = 09A4FD29DE949D9A9AA9924248422097AD4883B4713E6C214FF6567ADA08A967B2176C12F110DD441B7CAA3A509B13C86A023AFCEE998BEE42028D44507B15F714FF
+
+Cipher = aes-128-ocb
+Key = 000102030405060708090A0B0C0D0E0F
+IV = 000102030405060708090A0B
+AAD = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F2021222324252627
+Tag = 5822A9A70FDF55D29D2984A6
+Plaintext = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F5051
+Ciphertext = 09A4FD29DE949D9A9AA9924248422097AD4883B4713E6C214FF6567ADA08A967B2176C12F110DD441B7CAA3A509B13C86A023AFCEE998BEE42028D44507B15F77C528A1DE6406B519BCEE8FCB8294170634D
+
+Cipher = aes-128-ocb
+Key = 000102030405060708090A0B0C0D0E0F
+IV = 000102030405060708090A0B
+AAD = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F2021222324252627
+Tag = 81772B6741ABB4ECA9D2DEB2
+Plaintext = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F6061
+Ciphertext = 09A4FD29DE949D9A9AA9924248422097AD4883B4713E6C214FF6567ADA08A967B2176C12F110DD441B7CAA3A509B13C86A023AFCEE998BEE42028D44507B15F77C528A1DE6406B519BCEE8FCB829417001E54E15A7576C4DF32366E0F439C7050FAA
+
+Cipher = aes-128-ocb
+Key = 000102030405060708090A0B0C0D0E0F
+IV = 000102030405060708090A0B
+AAD = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F2021222324252627
+Tag = 3E52A01D068DE85456DB03B7
+Plaintext = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F202122232425262728292A2B2C2D2E2F303132333435363738393A3B3C3D3E3F404142434445464748494A4B4C4D4E4F505152535455565758595A5B5C5D5E5F606162636465666768696A6B6C6D6E6F7071
+Ciphertext = 09A4FD29DE949D9A9AA9924248422097AD4883B4713E6C214FF6567ADA08A967B2176C12F110DD441B7CAA3A509B13C86A023AFCEE998BEE42028D44507B15F77C528A1DE6406B519BCEE8FCB829417001E54E15A7576C4DF32366E0F439C7051CB4824B8114E9A720CBC1CE0185B156B486
+
 # AES XTS test vectors from IEEE Std 1619-2007
 Cipher = aes-128-xts
 Key = 0000000000000000000000000000000000000000000000000000000000000000