# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was
# done about it).

# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

@V=($A,$B,$C,$D,$E,$F,$G,$H);
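# @V holds the eight SHA-256 working variables. Instead of shuffling data
# between registers at the end of every round, the round generators below
# "rotate" this Perl array (unshift(@V,pop(@V))), so a..h simply map onto
# different registers in the next round. A minimal illustration of the idea
# (standalone sketch, not part of this module):
#
#   my @v = qw(a b c d e f g h);
#   for my $round (0..2) {
#       printf "round %d: h lives in %s\n", $round, $v[7];
#       unshift(@v, pop(@v));     # same trick as the real round code
#   }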
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
	@ ldr	$t1,[$inp],#4			@ $i
	str	$inp,[sp,#17*4]			@ make room for $t4
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	orr	$t1,$t1,$t0,lsl#16
	str	$inp,[sp,#17*4]			@ make room for $t4
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
	cmp	$t2,#0xf2			@ done?
	ldr	$t1,[$inp],#4			@ prefetch
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
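	@ Notes on the round above: Sigma1(e) is formed with relative
	@ rotations - e is XORed with itself rotated by (11-6) and (25-6)
	@ bits, and the result is rotated right by 6 as it is added into h,
	@ yielding ROR(e,6)^ROR(e,11)^ROR(e,25) with a single final rotate.
	@ Sigma0(a) uses the same trick with rotations 2, 13 and 22.
	@ Maj(a,b,c) is computed as ((b^c)&(a^b))^b and its addition into h
	@ is deferred to the next round ("from the past"). The cmp against
	@ #0xf2 works because only the last round constant, 0xc67178f2,
	@ has 0xf2 as its low byte.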
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
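	@ The lines above implement the SHA-256 message schedule in a
	@ 16-word circular buffer on the stack:
	@ X[i] = X[i+0] + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])
	@ (indices mod 16), i.e. the standard
	@ W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]),
	@ with sigma0 = ROR 7 ^ ROR 18 ^ SHR 3 and sigma1 = ROR 17 ^ ROR 19 ^ SHR 10.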
# include "arm_arch.h"
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
	.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	.word	OPENSSL_armcap_P-sha256_block_data_order

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	adr	r3,sha256_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
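	@ Stack frame after the alloca (offsets from sp, as used by the
	@ loads/stores in this function): bytes 0..63 hold the 16-word
	@ circular X[] buffer, 64 holds ctx, 68 holds inp and 72 holds the
	@ end-of-input pointer; the callee-saved registers and lr sit above.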
	eor	$t3,$B,$C		@ magic
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
	ite	eq			@ Thumb2 thing, sanity check in ARM
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	add	sp,sp,#`16+3`*4		@ destroy frame
	ldmia	sp!,{r4-r11,pc}
	ldmia	sp!,{r4-r11,lr}
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	sha256_block_data_order,.-sha256_block_data_order
######################################################################
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");

sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
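# NEON register aliasing note: each 128-bit register qN overlaps the two
# 64-bit registers d(2N) (low half) and d(2N+1) (high half), which is what
# Dlo() and Dhi() compute, e.g. Dlo("q8") is "d16" and Dhi("q8") is "d17".
# The sigma1 steps in Xupdate() work on such d-register halves of X[].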
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop();
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
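# Example of the thunk in action (using the register names assigned above):
# &vshr_u32($T2,$T0,$sigma0[0]) appends "\tvshr.u32\tq10,q8,#7\n" to $code;
# underscores become dots and a trailing numeric argument gets a '#' prefix
# so it reads as an immediate.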
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	&vshr_u32	($T2,$T0,$sigma0[0]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	&vshr_u32	($T1,$T0,$sigma0[2]);
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	&vshr_u32	($T3,$T0,$sigma0[1]);
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	&vadd_i32	($T0,$T0,@X[0]);
	while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
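# Xupdate() notes: a 32-bit rotate right by n is synthesised on NEON as
# vshr.u32 by n followed by vsli.32 by 32-n (shift left and insert), which
# leaves ROR(x,n) in the destination. The four scalar rounds captured in
# @insns (built from $body) are drained one instruction at a time with
# eval(shift(@insns)) in between the vector ops, so the integer pipeline
# and the NEON unit run in parallel.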
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	&vrev32_8	(@X[0],@X[0]);
	&vadd_i32	($T0,$T0,@X[0]);
	foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V));	($t2,$t3)=($t3,$t2);'
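	# Each round of the NEON path is expressed as the list of Perl code
	# strings above rather than as plain text: Xupdate()/Xpreload() pull
	# them off @insns one at a time and eval() them, which is what lets
	# the scalar SHA-256 rounds be interleaved instruction-by-instruction
	# with the NEON message-schedule work. $j counts rounds so the X[] and
	# K256 loads can be scheduled, and swapping ($t2,$t3) at the end
	# carries the deferred Maj() term into the following round.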
#if __ARM_MAX_ARCH__>=7

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
sha256_block_data_order_neon:
	stmdb	sp!,{r4-r12,lr}
	bic	$H,$H,#15		@ align for 128-bit stores
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	vrev32.8	@X[1],@X[1]		@ big-endian
	str	$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);

	teq	$t1,#0				@ check for K256 terminator
	sub	$Ktbl,$Ktbl,#256		@ rewind $Ktbl
	subeq	$inp,$inp,#64			@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!

	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	add	$A,$A,$t0		@ accumulate
	ldreq	sp,[sp,#76]		@ restore original sp
	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon

######################################################################
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# define INST(a,b,c,d)	.byte	a,b,c,d

.type	sha256_block_data_order_armv8,%function
sha256_block_data_order_armv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
for($i=0;$i<12;$i++) {
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
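	# The loop above ping-pongs $W0/$W1 so that loading the next K256
	# constants overlaps with the current sha256h/sha256h2 rounds, while
	# rotating @MSG keeps sha256su0/sha256su1 extending the oldest of the
	# four message quads. sha256h2 takes the scratch copy in $abcd because
	# sha256h has already overwritten $ABCD by then.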
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	vst1.32	{$ABCD,$EFGH},[$ctx]
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8

.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
	last if (!s/^#/@/ and !/^$/);

	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40 );

    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					|(($2&7)<<17)|(($2&8)<<4)
					|(($3&7)<<1) |(($3&8)<<2);
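	# Register fields: q registers map onto even-numbered d registers, so
	# the low three bits of each q index fill the upper bits of the Vd, Vn
	# and Vm fields (bits 15:13, 19:17 and 3:1), while bit 3 of the index
	# goes to the D, N and M bits (22, 7 and 5) of the 32-bit encoding.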
	# ARMv7 instructions are always encoded little-endian, hence the
	# explicit byte order below. The correct solution would be to use
	# the .inst directive, but older assemblers don't implement it:-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

close STDOUT;			# enforce flush