+{ my ($hi,$lo,$i,$A_flat, $len,$bsz,$inp) = map("r$_",(5..8, 10..12));
+
+########################################################################
+# SHA3_absorb
+#
+# Emits the SHA3_absorb routine. The four incoming argument registers
+# are used as follows (names taken from the stack layout below):
+#
+#	r0	pointer to the caller's state, "uint64_t A[5][5]"
+#	r1	inp, pointer to the input bytes
+#	r2	len, number of input bytes available
+#	r3	bsz, block size in bytes
+#
+# The routine copies the caller's state to the bottom of its own stack
+# frame, then, for as long as len >= bsz, XORs bsz bytes of input into
+# that copy and calls KeccakF1600_int. On exit the stack copy is
+# written back through the saved r0 and the number of bytes that were
+# not consumed is returned in r0.
+#
+# Register aliases declared above:
+#	$hi,$lo	(r5,r6) accumulators for the two 32-bit halves of a lane
+#	$i	(r7)	 per-lane byte counter, counts 8 down to 0
+#	$A_flat	(r8)	 walks the stack copy of the state, 8 bytes a step
+#	$len	(r10)	 remaining input length
+#	$bsz	(r11)	 bytes of the current block still to be absorbed
+#	$inp	(r12)	 input pointer; r12 doubles as a copy pointer
+#			 in the prologue and epilogue
+#
+# Input is absorbed 8 bytes at a time. Bytes are read from inp[7] down
+# to inp[0]; each byte is moved to the top of r1 and its bits are
+# shifted out through the carry flag, most significant bit first,
+# alternately into $hi and $lo. A lane therefore contributes 8*4 = 32
+# bits to each accumulator, which is why $hi/$lo need no clearing
+# between lanes: every stale bit is shifted out. The word at the lower
+# address is XORed with $lo, the one at the higher address with $hi.
+#
+# In .Lane_loop the flags set by "subs $i,$i,#1" are consumed twice:
+# by "blo" (leave after eight bytes) and by the conditional ldrb (skip
+# the load once $i reaches 0, as inp[0] has already been fetched). The
+# "lsl" in between has no "s" suffix and so leaves the flags alone.
+#
+# @C and $A are defined outside this block. The copy loops assume that
+# {@C[0]-@C[9]} names ten registers, so that five ldmia/stmia pairs
+# move the whole 200-byte state -- TODO confirm against the definition
+# of @C. KeccakF1600_int is likewise defined elsewhere; it is entered
+# with sp pointing at the state copy. Presumably the part of the
+# 320+16 byte reservation above the 200-byte state is scratch space
+# for KeccakF1600_int -- verify against that routine.
+#
+########################################################################
+# Stack layout
+#
+# "stmdb sp!,{r0-r12,lr}" pushes 14 words (56 bytes) and the following
+# "sub sp,sp,#320+16" reserves 336 more, so the saved r0-r3 end up at
+# sp+336..sp+348 and the saved r4 at sp+352. The epilogue's
+# "add sp,sp,#320+32" steps over the reservation plus the four saved
+# argument words before popping r4-r12 and the return address.
+# The slot at +344 is overwritten with len - bsz on every iteration of
+# .Loop_absorb and reloaded into $len after KeccakF1600_int returns.
+#
+# ----->+-----------------------+
+# | uint64_t A[5][5] |
+# | ... |
+# | ... |
+# +336->+-----------------------+
+# | uint64_t *A |
+# +340->+-----------------------+
+# | const void *inp |
+# +344->+-----------------------+
+# | size_t len |
+# +348->+-----------------------+
+# | size_t bs |
+# +352->+-----------------------+
+# | ....
+
+$code.=<<___;
+.global SHA3_absorb
+.type SHA3_absorb,%function
+.align 5
+SHA3_absorb:
+ stmdb sp!,{r0-r12,lr}
+ sub sp,sp,#320+16
+
+ mov r12,r0
+ add r14,sp,#0
+ mov $len,r2
+ mov $bsz,r3
+
+ ldmia r12!,{@C[0]-@C[9]} @ copy A[5][5] to stack
+ stmia r14!,{@C[0]-@C[9]}
+ ldmia r12!,{@C[0]-@C[9]}
+ stmia r14!,{@C[0]-@C[9]}
+ ldmia r12!,{@C[0]-@C[9]}
+ stmia r14!,{@C[0]-@C[9]}
+ ldmia r12!,{@C[0]-@C[9]}
+ stmia r14!,{@C[0]-@C[9]}
+ ldmia r12, {@C[0]-@C[9]}
+ stmia r14, {@C[0]-@C[9]}
+
+ ldr $inp,[sp,#340]
+
+.Loop_absorb:
+ subs r0,$len,$bsz
+ blo .Labsorbed
+ add $A_flat,sp,#0
+ str r0,[sp,#344] @ save len - bsz
+
+.Loop_block:
+ ldmia $A_flat,{r2-r3} @ A_flat[i]
+ ldrb r0,[$inp,#7]! @ inp[7]
+ mov $i,#8
+
+.Lane_loop:
+ subs $i,$i,#1
+ lsl r1,r0,#24
+ blo .Lane_done
+#ifdef __thumb2__
+ it ne
+ ldrbne r0,[$inp,#-1]!
+#else
+ ldrneb r0,[$inp,#-1]!
+#endif
+ adds r1,r1,r1 @ sip through carry flag
+ adc $hi,$hi,$hi
+ adds r1,r1,r1
+ adc $lo,$lo,$lo
+ adds r1,r1,r1
+ adc $hi,$hi,$hi
+ adds r1,r1,r1
+ adc $lo,$lo,$lo
+ adds r1,r1,r1
+ adc $hi,$hi,$hi
+ adds r1,r1,r1
+ adc $lo,$lo,$lo
+ adds r1,r1,r1
+ adc $hi,$hi,$hi
+ adds r1,r1,r1
+ adc $lo,$lo,$lo
+ b .Lane_loop
+
+.Lane_done:
+ eor r2,r2,$lo
+ eor r3,r3,$hi
+ add $inp,$inp,#8
+ stmia $A_flat!,{r2-r3} @ A_flat[i++] ^= BitInterleave(inp[0..7])
+ subs $bsz,$bsz,#8
+ bhi .Loop_block
+
+ str $inp,[sp,#340]
+
+ bl KeccakF1600_int
+
+ ldr $inp,[sp,#340]
+ ldr $len,[sp,#344]
+ ldr $bsz,[sp,#348]
+ b .Loop_absorb
+
+.align 4
+.Labsorbed:
+ add r12,sp,#$A[1][0]
+ ldr r14, [sp,#336] @ pull pointer to A[5][5]
+ ldmia sp, {@C[0]-@C[9]}
+ stmia r14!,{@C[0]-@C[9]} @ return A[5][5]
+ ldmia r12!,{@C[0]-@C[9]}
+ stmia r14!,{@C[0]-@C[9]}
+ ldmia r12!,{@C[0]-@C[9]}
+ stmia r14!,{@C[0]-@C[9]}
+ ldmia r12!,{@C[0]-@C[9]}
+ stmia r14!,{@C[0]-@C[9]}
+ ldmia r12, {@C[0]-@C[9]}
+ stmia r14, {@C[0]-@C[9]}
+
+ add sp,sp,#320+32
+ mov r0,$len @ return value
+ ldmia sp!,{r4-r12,pc}
+.size SHA3_absorb,.-SHA3_absorb
+___
+}
+{ my ($A_flat,$out,$len,$bsz, $byte,$shl) = map("r$_", (4..9));
+
+########################################################################
+# SHA3_squeeze
+#
+# Emits the SHA3_squeeze routine. Incoming argument registers:
+#
+#	r0	pointer to the state, kept in $A_flat (r4)
+#	r1	out, destination for the output bytes, kept in $out (r5)
+#	r2	len, number of bytes to produce, kept in $len (r6)
+#	r3	bsz, block size in bytes, kept in $bsz (r7)
+#
+# r12 walks the state 8 bytes (one lane, two words) at a time and r14
+# counts the bytes left in the current block. For each lane the two
+# words are re-interleaved into eight bytes: for nibble k = 0..7
+# ($shl = 28 - 4*k) the four bits of each word are moved to the top of
+# r2/r3 and shifted out through the carry flag into $byte, the bit
+# from the second word (r1) first, so that it ends up in the more
+# significant position of each bit pair. This is the inverse of the
+# bit interleaving applied when input is absorbed.
+#
+# Once bsz bytes have been produced and more are wanted, KeccakF1600
+# is called on the state and extraction restarts from its beginning.
+# The code relies on KeccakF1600 preserving r4-r7, as those hold
+# $A_flat, $out, $len and $bsz across the call.
+#
+# Each output byte must be written with a byte store (strb). A word
+# store here would write three bytes past every output position, i.e.
+# overrun the caller's buffer by three bytes on the last store, use an
+# unaligned address for most stores, and put the byte at the wrong
+# address on big-endian targets.
+#
+# A request for zero bytes returns immediately; without the check the
+# "subs $len,$len,#1" below would wrap around and the loop would not
+# terminate at the right place.
+
+$code.=<<___;
+.global SHA3_squeeze
+.type SHA3_squeeze,%function
+.align 5
+SHA3_squeeze:
+ stmdb sp!,{r4-r10,lr}
+ mov r12,r0
+ mov $A_flat,r0
+ mov $out,r1
+ mov $len,r2
+ mov $bsz,r3
+ mov r14,r3
+ cmp $len,#0 @ nothing to squeeze?
+ beq .Lsqueeze_done
+ b .Loop_squeeze
+
+.align 4
+.Loop_squeeze:
+ ldmia r12!,{r0,r1} @ A_flat[i++]
+ mov $shl,#28
+
+.Lane_squeeze:
+ lsl r2,r0,$shl
+ lsl r3,r1,$shl
+ eor $byte,$byte,$byte
+ adds r3,r3,r3 @ sip through carry flag
+ adc $byte,$byte,$byte
+ adds r2,r2,r2
+ adc $byte,$byte,$byte
+ adds r3,r3,r3
+ adc $byte,$byte,$byte
+ adds r2,r2,r2
+ adc $byte,$byte,$byte
+ adds r3,r3,r3
+ adc $byte,$byte,$byte
+ adds r2,r2,r2
+ adc $byte,$byte,$byte
+ adds r3,r3,r3
+ adc $byte,$byte,$byte
+ adds r2,r2,r2
+ adc $byte,$byte,$byte
+ subs $len,$len,#1 @ len -= 1
+ strb $byte,[$out],#1 @ *out++ = byte, flags untouched
+ beq .Lsqueeze_done
+ subs $shl,$shl,#4
+ bhs .Lane_squeeze
+
+ subs r14,r14,#8 @ bsz -= 8
+ bhi .Loop_squeeze
+
+ mov r0,$A_flat
+
+ bl KeccakF1600
+
+ mov r12,$A_flat
+ mov r14,$bsz
+ b .Loop_squeeze
+
+.Lsqueeze_done:
+ ldmia sp!,{r4-r10,pc}
+.size SHA3_squeeze,.-SHA3_squeeze
+.asciz "Keccak-1600 absorb and squeeze for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+}
+