+&function_begin("aesni_ccm64_encrypt_blocks"); # CCM with 64-bit counter: per 16-byte block, cmac ^= inp and out = inp ^ E(counter)
+ &mov ($inp,&wparam(0)); # arg 0: input pointer, 16 bytes read per iteration
+ &mov ($out,&wparam(1)); # arg 1: output pointer, 16 bytes written per iteration
+ &mov ($len,&wparam(2)); # arg 2: number of 16-byte blocks (loop counter)
+ &mov ($key,&wparam(3)); # arg 3: key structure handed to the cipher helper
+ &mov ($rounds_,&wparam(4)); # arg 4: pointer to the 16-byte ivec (counter block)
+ &mov ($rounds,&wparam(5)); # arg 5: pointer to the 16-byte cmac, rewritten on exit
+ &mov ($key_,"esp"); # remember the incoming esp
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_); # incoming esp parked in the frame for the epilogue
+ # frame layout: 0(esp) pshufb mask, 16(esp) counter increment, 48(esp) saved esp
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($inout1,&QWP(0,$rounds)); # load cmac
+
+ # compose byte-swap control mask for pshufb on stack (selector byte i = 15-i, i.e. a full 16-byte reversal)
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack (1 in the low dword, zero in the other three)
+ &mov ($rounds,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &movdqa ($inout3,&QWP(0,"esp")); # bswap mask; reused by every iteration without being reloaded
+ &pshufb ($ivec,$inout3); # keep iv in reverse order so paddq can bump it as an integer
+
+ &mov ($rounds,&DWP(240,$key)); # $rounds = dword at offset 240 of the key structure
+ &mov ($key_,$key); # spare copies: $key/$rounds are reloaded from these before every call
+ &mov ($rounds_,$rounds);
+ &movdqa ($inout0,$ivec); # first counter block, still byte-reversed here
+ # each pass handles one block; the counter and the cmac go through the helper side by side
+&set_label("ccm64_enc_outer");
+ &movdqu ($in0,&QWP(0,$inp)); # fetch next input block
+ &pshufb ($inout0,$inout3); # undo the reversal before the counter is handed to the helper
+ &mov ($key,$key_); # presumably the helper modifies $key/$rounds, hence the reload
+ &mov ($rounds,$rounds_);
+ &pxor ($inout1,$in0); # cmac^=inp
+ &pxor ($inout2,$inout2); # third lane carries no data, so it is zeroed
+
+ &call ("_aesni_encrypt3"); # defined outside this chunk; presumably enciphers $inout0..$inout2 in place
+
+ &paddq ($ivec,&QWP(16,"esp")); # add 1 to the low 64 bits of the reversed counter
+ &dec ($len); # sets ZF for the jnz below; no instruction in between writes flags
+ &lea ($inp,&DWP(16,$inp));
+ &pxor ($in0,$inout0); # inp^=E(ivec)
+ &movdqa ($inout0,$ivec); # stage the next counter block
+ &movdqu (&QWP(0,$out),$in0); # store output block
+ &lea ($out,&DWP(16,$out));
+ &jnz (&label("ccm64_enc_outer")); # bottom-tested loop: a zero block count is not special-cased
+
+ &mov ("esp",&DWP(48,"esp")); # drop the aligned frame before &wparam() is used again
+ &mov ($out,&wparam(5)); # arg 5 again: cmac pointer
+ &movdqu (&QWP(0,$out),$inout1); # write back the updated cmac
+&function_end("aesni_ccm64_encrypt_blocks");
+
+# CCM (64-bit counter) decrypt: CTR-decrypts 16-byte blocks and folds every
+# recovered block into cmac, mirroring aesni_ccm64_encrypt_blocks.
+# Arguments by &wparam index:
+#   0 input pointer, 1 output pointer, 2 number of 16-byte blocks,
+#   3 key structure, 4 pointer to 16-byte ivec, 5 pointer to 16-byte cmac
+#   (cmac is rewritten on exit).
+# The loop is software-pipelined: E(counter) for block i is produced one step
+# ahead (before the loop for block 0, by the 3-lane call for the others), so
+# the counter for block i+1 and the cmac for block i share one helper call.
+# The loop exit is tested after the first block, so a zero block count is not
+# special-cased.
+&function_begin("aesni_ccm64_decrypt_blocks");
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3));
+ &mov ($rounds_,&wparam(4));
+ &mov ($rounds,&wparam(5));
+ &mov ($key_,"esp"); # remember the incoming esp
+ &sub ("esp",60);
+ &and ("esp",-16); # align stack
+ &mov (&DWP(48,"esp"),$key_); # incoming esp parked in the frame for the epilogue
+
+ &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
+ &movdqu ($inout1,&QWP(0,$rounds)); # load cmac
+
+ # compose byte-swap control mask for pshufb on stack
+ # (selector byte i = 15-i, i.e. a full 16-byte reversal)
+ &mov (&DWP(0,"esp"),0x0c0d0e0f);
+ &mov (&DWP(4,"esp"),0x08090a0b);
+ &mov (&DWP(8,"esp"),0x04050607);
+ &mov (&DWP(12,"esp"),0x00010203);
+
+ # compose counter increment vector on stack
+ # (1 in the low dword, zero in the other three)
+ &mov ($rounds,1);
+ &xor ($key_,$key_);
+ &mov (&DWP(16,"esp"),$rounds);
+ &mov (&DWP(20,"esp"),$key_);
+ &mov (&DWP(24,"esp"),$key_);
+ &mov (&DWP(28,"esp"),$key_);
+
+ &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
+ &movdqa ($inout0,$ivec); # first counter block, taken before the reversal
+ &pshufb ($ivec,$inout3); # keep iv in reverse order
+
+ &mov ($rounds,&DWP(240,$key)); # $rounds = dword at offset 240 of the key structure
+ &mov ($key_,$key); # spare copies, reloaded inside the loop
+ &mov ($rounds_,$rounds);
+
+ # E(counter) for the first block
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+&set_label("ccm64_dec_outer");
+ &movdqu ($in0,&QWP(0,$inp)); # fetch next input block
+ &paddq ($ivec,&QWP(16,"esp")); # add 1 to the low 64 bits of the reversed counter
+ &dec ($len); # sets ZF for the jz below; nothing up to it writes flags
+ &lea ($inp,&DWP(16,$inp));
+ &pxor ($in0,$inout0); # out = inp ^ E(ivec)
+ &movdqa ($inout0,$ivec); # stage the next counter block
+ &mov ($key,$key_); # reload helper inputs (also needed on the break path)
+ &mov ($rounds,$rounds_);
+ &pshufb ($inout0,$inout3); # undo the reversal before the counter is enciphered
+ &movdqu (&QWP(0,$out),$in0); # store output block
+ &lea ($out,&DWP(16,$out));
+ &pxor ($inout1,$in0); # cmac^=out; pxor leaves ZF from dec intact
+
+ &jz (&label("ccm64_dec_break")); # last block: only the cmac still needs a pass
+
+ &pxor ($inout2,$inout2); # third lane carries no data, so it is zeroed
+ &call ("_aesni_encrypt3");
+
+ &jmp (&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+ # final pass over the cmac alone
+ # NOTE(review): &call is given a second argument below; confirm that the
+ # non-inline helper really operates on $inout1 rather than $inout0.
+ if ($inline)
+ { &aesni_inline_generate1("enc",$inout1); }
+ else
+ { &call ("_aesni_encrypt1",$inout1); }
+
+ &mov ("esp",&DWP(48,"esp")); # drop the aligned frame before &wparam() is used again
+ &mov ($out,&wparam(5)); # arg 5 again: cmac pointer
+ &movdqu (&QWP(0,$out),$inout1); # write back the updated cmac
+&function_end("aesni_ccm64_decrypt_blocks");
+\f
+######################################################################