2 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
4 # This code is taken from the OpenSSL project but the author, Andy Polyakov,
5 # has relicensed it under the licenses specified in the SPDX header above.
6 # The original headers, including the original license headers, are
7 # included below for completeness.
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
22 # Improve scalar performance per Eric Biggers' suggestion to eliminate
23 # separate rotates. This requires b[0..3] and d[0..3] to be maintained
24 # pre-rotated, hence odd twists prior to the inner loop and when accumulating
25 # key material. Since the number of instructions is reduced as a result, even
26 # NEON performance is improved somewhat, most notably by ~9% on low-end
27 # Cortex-A5/A7. Full unroll was shown to provide even better scalar
28 # performance on Cortex-A5/A7, naturally at the cost of manyfold size
29 # increase. We let it be. Oversized code works in benchmarks, but is not
30 # necessarily optimal in real life, when it's likely to be out-of-cache
31 # upon entry and evict significant part of cache upon completion.
33 # Performance in cycles per byte out of large buffer.
35 # IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
37 # Cortex-A5 14.2(*)/+160% 21.8 12.9(**)
38 # Cortex-A8 10.2(*)/+190% 13.9 6.10
39 # Cortex-A9 10.8(*)/+150% 14.3 6.50
40 # Cortex-A15 11.0/+40% 16.0 4.90
41 # Snapdragon S4 13.9(***)/+90% 13.6 4.90
43 # (*) most "favourable" result for aligned data on little-endian
44 # processor, result for misaligned data is 10-15% lower;
45 # (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8%
46 # better performance on Cortex-A5/A7, but not on others;
47 # (***) it's 17% slower than original, trade-off is considered
48 # acceptable, because of improvement on others, specifically
49 # +36% on Cortex-A5/A7 and +20% on Cortex-A9;
# Sort out command-line arguments: if the first argument looks like a
# filename (word characters ending in ".ext") it is the output file and
# there is no flavour; otherwise keep shifting until the output filename
# is found, leaving $flavour (target ABI hint for arm-xlate.pl) intact.
# NOTE(review): $flavour is presumably initialized from @ARGV on a line
# not visible in this excerpt — confirm against the full file.
52 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
53 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
# When a flavour is given, pipe our output through the arm-xlate.pl
# translator (looked up next to this script, then in ../../perlasm);
# otherwise write the generated assembly straight to the output file.
55 if ($flavour && $flavour ne "void") {
# Derive the directory this script lives in from $0.
56 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
57 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
58 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
59 die "can't locate arm-xlate.pl";
# $^X is the perl interpreter running this script; re-use it for xlate.
# NOTE(review): 2-arg piped open with interpolated paths is the
# long-standing perlasm convention; left as-is in this excerpt.
61 open STDOUT,"| \"$^X\" $xlate $flavour $output";
# No flavour: raw output. (The else branch lines fall outside this view.)
63 open STDOUT,">$output";
# Catch-all for &mnemonic(...) calls used when emitting code from Perl
# strings (x86-perlasm style): the called name becomes the instruction
# mnemonic (package prefix stripped, "_" turned into "." so e.g.
# vadd_i32 emits "vadd.i32"), and the arguments become the operand list
# appended to $code.
66 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
67 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
# A purely numeric last argument is an immediate and gets a '#' prefix.
# NOTE(review): $arg is presumably popped from @_ on a line not visible
# in this excerpt — confirm against the full file.
69 $arg = "#$arg" if ($arg*1 eq $arg);
70 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
# Map the 16 ChaCha state words onto ARM registers: words 0-7 live in
# r0-r7, words 12 and 14 in r12 and r14; the "rx" placeholders mark
# words that are kept off-loaded in the stack frame rather than in a
# register. @t are the scratch registers r8-r11.
73 my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
74 my @t=map("r$_",(8..11));
# Given the column indices of the first quarter-round, derive the other
# three columns: ($_&~3)+(($_+1)&3) keeps the 4-aligned group and steps
# the position within it by one, so (0,4,8,12) yields (1,5,9,13), then
# (2,6,10,14), then (3,7,11,15) — and the diagonal variant analogously.
77 my ($a0,$b0,$c0,$d0)=@_;
78 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
79 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
80 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Scratch assignments: both 'c' words of the first pair use @t[0..1];
# which 'd' lives in a register vs. @t[2] depends on round parity.
# NOTE(review): $odd is presumably set on a line not visible in this
# excerpt — confirm against the full file.
82 my ($xc,$xc_) = (@t[0..1]);
83 my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
86 # Consider order in which variables are addressed by their
91 # 0 4 8 12 < even round
95 # 0 5 10 15 < odd round
100 # 'a', 'b' are permanently allocated in registers, @x[0..7],
101 # while 'c's and pair of 'd's are maintained in memory. If
102 # you observe 'c' column, you'll notice that pair of 'c's is
103 # invariant between rounds. This means that we have to reload
104 # them once per round, in the middle. This is why you'll see
105 # bunch of 'c' stores and loads in the middle, but none in
106 # the beginning or end. If you observe 'd' column, you'll
107 # notice that 15 and 13 are reused in next pair of rounds.
108 # This is why these two are chosen for offloading to memory,
109 # to make loads count more.
# First half: quarter-rounds on the first two columns, interleaved in
# pairs for dual-issue. The 'ror#N' operands exist because b[0..3] and
# d[0..3] are maintained pre-rotated (see the header commentary): the
# inline rotate on the second source operand realizes the ChaCha
# rotates without separate rotate instructions.
111	"&add	(@x[$a0],@x[$a0],@x[$b0],'ror#13')",
112	"&add	(@x[$a1],@x[$a1],@x[$b1],'ror#13')",
113	"&eor	($xd,@x[$a0],$xd,'ror#24')",
114	"&eor	($xd_,@x[$a1],$xd_,'ror#24')",
116	"&add	($xc,$xc,$xd,'ror#16')",
117	"&add	($xc_,$xc_,$xd_,'ror#16')",
118	"&eor	(@x[$b0],$xc, @x[$b0],'ror#13')",
119	"&eor	(@x[$b1],$xc_,@x[$b1],'ror#13')",
121	"&add	(@x[$a0],@x[$a0],@x[$b0],'ror#20')",
122	"&add	(@x[$a1],@x[$a1],@x[$b1],'ror#20')",
123	"&eor	($xd,@x[$a0],$xd,'ror#16')",
124	"&eor	($xd_,@x[$a1],$xd_,'ror#16')" );
# Modulo-scheduled off-load/reload of the memory-resident 'd' words;
# which of the pair is stored/loaded depends on round parity.
126	"&str	($xd,'[sp,#4*(16+$d0)]')" ) if ($odd);
128	"&add	($xc,$xc,$xd,'ror#24')" );
130	"&ldr	($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
132	"&str	($xd_,'[sp,#4*(16+$d1)]')" ) if (!$odd);
134	"&add	($xc_,$xc_,$xd_,'ror#24')" );
136	"&ldr	($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
# Store the finished 'c' words mid-round; as noted above, this is the
# only point where 'c' traffic to the stack is needed.
138	"&str	($xc,'[sp,#4*(16+$c0)]')",
139	"&eor	(@x[$b0],@x[$b0],$xc,'ror#12')",
140	"&str	($xc_,'[sp,#4*(16+$c1)]')",
141	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#12')" );
# Second half: columns 2 and 3. Re-point $xd/$xd_ at the registers that
# hold d2/d3 for this parity before emitting the mirrored sequence.
143	$xd=@x[$d2]			if (!$odd);
144	$xd_=@x[$d3]			if ($odd);
146	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
147	"&add	(@x[$a2],@x[$a2],@x[$b2],'ror#13')",
148	"&ldr	($xc_,'[sp,#4*(16+$c3)]')",
149	"&add	(@x[$a3],@x[$a3],@x[$b3],'ror#13')",
150	"&eor	($xd,@x[$a2],$xd,'ror#24')",
151	"&eor	($xd_,@x[$a3],$xd_,'ror#24')",
153	"&add	($xc,$xc,$xd,'ror#16')",
154	"&add	($xc_,$xc_,$xd_,'ror#16')",
155	"&eor	(@x[$b2],$xc, @x[$b2],'ror#13')",
156	"&eor	(@x[$b3],$xc_,@x[$b3],'ror#13')",
158	"&add	(@x[$a2],@x[$a2],@x[$b2],'ror#20')",
159	"&add	(@x[$a3],@x[$a3],@x[$b3],'ror#20')",
160	"&eor	($xd,@x[$a2],$xd,'ror#16')",
161	"&eor	($xd_,@x[$a3],$xd_,'ror#16')",
163	"&add	($xc,$xc,$xd,'ror#24')",
164	"&add	($xc_,$xc_,$xd_,'ror#24')",
165	"&eor	(@x[$b2],@x[$b2],$xc,'ror#12')",
166	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#12')" );
173 # include "arm_arch.h"
175 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
176 # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
177 # define ChaCha20_ctr32 chacha20_arm_cryptogams
178 # define ChaCha20_neon chacha20_neon
182 #if defined(__thumb2__) || defined(__clang__)
184 # define ldrhsb ldrbhs
186 #if defined(__thumb2__)
194 .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
198 .long 0x02010003,0x06050407
199 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
201 .word OPENSSL_armcap_P-.LChaCha20_ctr32
206 .globl ChaCha20_ctr32
207 .type ChaCha20_ctr32,%function
211 ldr r12,[sp,#0] @ pull pointer to counter and nonce
212 stmdb sp!,{r0-r2,r4-r11,lr}
213 #if __ARM_ARCH__<7 && !defined(__thumb2__)
214 sub r14,pc,#16 @ ChaCha20_ctr32
216 adr r14,.LChaCha20_ctr32
224 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
225 cmp r2,#192 @ test len
236 ldmia r12,{r4-r7} @ load counter and nonce
237 sub sp,sp,#4*(16) @ off-load area
238 sub r14,r14,#64 @ .Lsigma
239 stmdb sp!,{r4-r7} @ copy counter and nonce
240 ldmia r3,{r4-r11} @ load key
241 ldmia r14,{r0-r3} @ load sigma
242 stmdb sp!,{r4-r11} @ copy key
243 stmdb sp!,{r0-r3} @ copy sigma
244 str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
245 str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
250 ldmia sp,{r0-r9} @ load key material
251 str @t[3],[sp,#4*(32+2)] @ save len
252 str r12, [sp,#4*(32+1)] @ save inp
253 str r14, [sp,#4*(32+0)] @ save out
255 ldr @t[3], [sp,#4*(15)]
256 mov @x[4],@x[4],ror#19 @ twist b[0..3]
257 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
258 mov @x[5],@x[5],ror#19
259 ldr @t[2], [sp,#4*(13)]
260 mov @x[6],@x[6],ror#19
261 ldr @x[14],[sp,#4*(14)]
262 mov @x[7],@x[7],ror#19
263 mov @t[3],@t[3],ror#8 @ twist d[0..3]
264 mov @x[12],@x[12],ror#8
265 mov @t[2],@t[2],ror#8
266 mov @x[14],@x[14],ror#8
267 str @t[3], [sp,#4*(16+15)]
275 foreach (&ROUND(0, 4, 8,12)) { eval; }
276 foreach (&ROUND(0, 5,10,15)) { eval; }
280 ldr @t[3],[sp,#4*(32+2)] @ load len
282 str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
283 str @t[1], [sp,#4*(16+9)]
284 str @x[12],[sp,#4*(16+12)]
285 str @t[2], [sp,#4*(16+13)]
286 str @x[14],[sp,#4*(16+14)]
288 @ at this point we have first half of 512-bit result in
289 @ @x[0-7] and second half at sp+4*(16+8)
291 cmp @t[3],#64 @ done yet?
295 addlo r12,sp,#4*(0) @ shortcut or ...
296 ldrhs r12,[sp,#4*(32+1)] @ ... load inp
297 addlo r14,sp,#4*(0) @ shortcut or ...
298 ldrhs r14,[sp,#4*(32+0)] @ ... load out
300 ldr @t[0],[sp,#4*(0)] @ load key material
301 ldr @t[1],[sp,#4*(1)]
303 #if __ARM_ARCH__>=6 || !defined(__ARMEB__)
306 tst @t[2],#3 @ are input and output aligned?
307 ldr @t[2],[sp,#4*(2)]
309 cmp @t[3],#64 @ restore flags
311 ldr @t[2],[sp,#4*(2)]
313 ldr @t[3],[sp,#4*(3)]
315 add @x[0],@x[0],@t[0] @ accumulate key material
316 add @x[1],@x[1],@t[1]
320 ldrhs @t[0],[r12],#16 @ load input
321 ldrhs @t[1],[r12,#-12]
323 add @x[2],@x[2],@t[2]
324 add @x[3],@x[3],@t[3]
328 ldrhs @t[2],[r12,#-8]
329 ldrhs @t[3],[r12,#-4]
330 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
339 eorhs @x[0],@x[0],@t[0] @ xor with input
340 eorhs @x[1],@x[1],@t[1]
342 str @x[0],[r14],#16 @ store output
346 eorhs @x[2],@x[2],@t[2]
347 eorhs @x[3],@x[3],@t[3]
348 ldmia @t[0],{@t[0]-@t[3]} @ load key material
353 add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
354 add @x[5],@t[1],@x[5],ror#13
358 ldrhs @t[0],[r12],#16 @ load input
359 ldrhs @t[1],[r12,#-12]
360 add @x[6],@t[2],@x[6],ror#13
361 add @x[7],@t[3],@x[7],ror#13
365 ldrhs @t[2],[r12,#-8]
366 ldrhs @t[3],[r12,#-4]
367 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
376 eorhs @x[4],@x[4],@t[0]
377 eorhs @x[5],@x[5],@t[1]
379 str @x[4],[r14],#16 @ store output
383 eorhs @x[6],@x[6],@t[2]
384 eorhs @x[7],@x[7],@t[3]
386 ldmia @t[0],{@t[0]-@t[3]} @ load key material
388 add @x[0],sp,#4*(16+8)
391 ldmia @x[0],{@x[0]-@x[7]} @ load second half
393 add @x[0],@x[0],@t[0] @ accumulate key material
394 add @x[1],@x[1],@t[1]
398 ldrhs @t[0],[r12],#16 @ load input
399 ldrhs @t[1],[r12,#-12]
403 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
404 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
405 add @x[2],@x[2],@t[2]
406 add @x[3],@x[3],@t[3]
410 ldrhs @t[2],[r12,#-8]
411 ldrhs @t[3],[r12,#-4]
412 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
421 eorhs @x[0],@x[0],@t[0]
422 eorhs @x[1],@x[1],@t[1]
424 str @x[0],[r14],#16 @ store output
428 eorhs @x[2],@x[2],@t[2]
429 eorhs @x[3],@x[3],@t[3]
431 ldmia @t[0],{@t[0]-@t[3]} @ load key material
435 add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
436 add @x[5],@t[1],@x[5],ror#24
440 addhi @t[0],@t[0],#1 @ next counter value
441 strhi @t[0],[sp,#4*(12)] @ save next counter value
445 ldrhs @t[0],[r12],#16 @ load input
446 ldrhs @t[1],[r12,#-12]
447 add @x[6],@t[2],@x[6],ror#24
448 add @x[7],@t[3],@x[7],ror#24
452 ldrhs @t[2],[r12,#-8]
453 ldrhs @t[3],[r12,#-4]
454 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
463 eorhs @x[4],@x[4],@t[0]
464 eorhs @x[5],@x[5],@t[1]
468 ldrne @t[0],[sp,#4*(32+2)] @ re-load len
472 eorhs @x[6],@x[6],@t[2]
473 eorhs @x[7],@x[7],@t[3]
474 str @x[4],[r14],#16 @ store output
479 subhs @t[3],@t[0],#64 @ len-=64
489 .Lunaligned: @ unaligned endian-neutral path
490 cmp @t[3],#64 @ restore flags
494 ldr @t[3],[sp,#4*(3)]
496 for ($i=0;$i<16;$i+=4) {
499 if ($i==4) { $twist = ",ror#13"; }
500 elsif ($i==12) { $twist = ",ror#24"; }
502 $code.=<<___ if ($i==4);
503 add @x[0],sp,#4*(16+8)
505 $code.=<<___ if ($i==8);
506 ldmia @x[0],{@x[0]-@x[7]} @ load second half
510 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
511 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
514 add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material
516 $code.=<<___ if ($i==12);
520 addhi @t[0],@t[0],#1 @ next counter value
521 strhi @t[0],[sp,#4*(12)] @ save next counter value
524 add @x[$j+1],@t[1],@x[$j+1]$twist
525 add @x[$j+2],@t[2],@x[$j+2]$twist
529 eorlo @t[0],@t[0],@t[0] @ zero or ...
530 ldrhsb @t[0],[r12],#16 @ ... load input
531 eorlo @t[1],@t[1],@t[1]
532 ldrhsb @t[1],[r12,#-12]
534 add @x[$j+3],@t[3],@x[$j+3]$twist
538 eorlo @t[2],@t[2],@t[2]
539 ldrhsb @t[2],[r12,#-8]
540 eorlo @t[3],@t[3],@t[3]
541 ldrhsb @t[3],[r12,#-4]
543 eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
544 eor @x[$j+1],@t[1],@x[$j+1]
548 ldrhsb @t[0],[r12,#-15] @ load more input
549 ldrhsb @t[1],[r12,#-11]
550 eor @x[$j+2],@t[2],@x[$j+2]
551 strb @x[$j+0],[r14],#16 @ store output
552 eor @x[$j+3],@t[3],@x[$j+3]
556 ldrhsb @t[2],[r12,#-7]
557 ldrhsb @t[3],[r12,#-3]
558 strb @x[$j+1],[r14,#-12]
559 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
560 strb @x[$j+2],[r14,#-8]
561 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
565 ldrhsb @t[0],[r12,#-14] @ load more input
566 ldrhsb @t[1],[r12,#-10]
567 strb @x[$j+3],[r14,#-4]
568 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
569 strb @x[$j+0],[r14,#-15]
570 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
574 ldrhsb @t[2],[r12,#-6]
575 ldrhsb @t[3],[r12,#-2]
576 strb @x[$j+1],[r14,#-11]
577 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
578 strb @x[$j+2],[r14,#-7]
579 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
583 ldrhsb @t[0],[r12,#-13] @ load more input
584 ldrhsb @t[1],[r12,#-9]
585 strb @x[$j+3],[r14,#-3]
586 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
587 strb @x[$j+0],[r14,#-14]
588 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
592 ldrhsb @t[2],[r12,#-5]
593 ldrhsb @t[3],[r12,#-1]
594 strb @x[$j+1],[r14,#-10]
595 strb @x[$j+2],[r14,#-6]
596 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
597 strb @x[$j+3],[r14,#-2]
598 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
599 strb @x[$j+0],[r14,#-13]
600 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
601 strb @x[$j+1],[r14,#-9]
602 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
603 strb @x[$j+2],[r14,#-5]
604 strb @x[$j+3],[r14,#-1]
606 $code.=<<___ if ($i<12);
607 add @t[0],sp,#4*(4+$i)
608 ldmia @t[0],{@t[0]-@t[3]} @ load key material
615 ldrne @t[0],[sp,#4*(32+2)] @ re-load len
619 subhs @t[3],@t[0],#64 @ len-=64
626 ldr r12,[sp,#4*(32+1)] @ load inp
628 ldr r14,[sp,#4*(32+0)] @ load out
631 ldrb @t[2],[@t[1]],#1 @ read buffer on stack
632 ldrb @t[3],[r12],#1 @ read input
634 eor @t[3],@t[3],@t[2]
635 strb @t[3],[r14],#1 @ store output
642 ldmia sp!,{r4-r11,pc}
644 ldmia sp!,{r4-r12,lr}
646 moveq pc,lr @ be binary compatible with V4, yet
647 .long 0xe12fff1e @ interoperable with Thumb ISA:-)
649 .size ChaCha20_ctr32,.-ChaCha20_ctr32
# NEON register assignment: four (a,b,c,d) row quadruples for the four
# interleaved 64-byte blocks, plus four temporaries.
# NOTE(review): the right-hand side of this assignment continues on a
# line not visible in this excerpt.
653 my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
656 # This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on
657 # Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%!
# Byte-permute helper: rotate-by-8 of each 32-bit lane via vtbl.8 on the
# low and high d-halves of a q register ($x#lo / $x#hi are rewritten to
# d registers by the post-processing pass at the end of the file).
659 { my ($dst,$src,$tbl) = @_;
660 $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n";
661 $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n";
# One NEON ChaCha quarter-round applied to a whole 128-bit row set
# (four lanes at once). Rotates are realized as vshr+vsli pairs except
# rotate-by-16, which is a free vrev32.16. The vext.8 lines at the end
# re-align rows between the column and diagonal phases.
# NOTE(review): interleaving veor lines (e.g. d^=a, b^=c into $t) fall
# on lines not visible in this excerpt — confirm against the full file.
666 my ($a,$b,$c,$d,$t)=@_;
669 "&vadd_i32 ($a,$a,$b)",
671 "&vrev32_16 ($d,$d)", # vrot ($d,16)
673 "&vadd_i32 ($c,$c,$d)",
675 "&vshr_u32 ($b,$t,20)",
676 "&vsli_32 ($b,$t,12)",
678 "&vadd_i32 ($a,$a,$b)",
680 "&vshr_u32 ($d,$t,24)",
# Disabled alternative: table-lookup rotate-by-8 (see vperm note above).
681 "&vsli_32 ($d,$t,8)",
682 #"&vperm ($d,$t,$t3)",
684 "&vadd_i32 ($c,$c,$d)",
686 "&vshr_u32 ($b,$t,25)",
687 "&vsli_32 ($b,$t,7)",
# Lane rotation: direction of the a/c shuffles flips with round parity.
689 "&vext_8 ($a,$a,$a,$odd?4:12)",
690 "&vext_8 ($d,$d,$d,8)",
691 "&vext_8 ($c,$c,$c,$odd?12:4)"
696 #if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
702 @ For optimal performance it's appropriate for caller to enforce
703 @ minimum input length, 193 bytes is suggested.
705 .type ChaCha20_neon,%function
708 ldr r12,[sp,#0] @ pull pointer to counter and nonce
709 stmdb sp!,{r0-r2,r4-r11,lr}
712 vstmdb sp!,{d8-d15} @ ABI spec says so
715 vld1.32 {$b0-$c0},[r3] @ load key
716 ldmia r3,{r4-r11} @ load key
719 vld1.32 {$d0},[r12] @ load counter and nonce
721 ldmia r14,{r0-r3} @ load sigma
722 vld1.32 {$a0},[r14]! @ load sigma
723 vld1.32 {$t0},[r14]! @ one
724 @ vld1.32 {$t3#lo},[r14] @ rot8
725 vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
726 vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
728 str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
729 str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
730 vshl.i32 $t1#lo,$t0#lo,#1 @ two
731 vstr $t0#lo,[sp,#4*(16+0)]
732 vshl.i32 $t2#lo,$t0#lo,#2 @ four
733 vstr $t1#lo,[sp,#4*(16+2)]
735 vstr $t2#lo,[sp,#4*(16+4)]
737 @ vstr $t3#lo,[sp,#4*(16+6)]
744 ldmia sp,{r0-r9} @ load key material
745 cmp @t[3],#64*2 @ if len<=64*2
746 bls .Lbreak_neon @ switch to integer-only
747 @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8
749 str @t[3],[sp,#4*(32+2)] @ save len
751 str r12, [sp,#4*(32+1)] @ save inp
753 str r14, [sp,#4*(32+0)] @ save out
756 ldr @t[3], [sp,#4*(15)]
757 mov @x[4],@x[4],ror#19 @ twist b[0..3]
758 vadd.i32 $d1,$d0,$t0 @ counter+1
759 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
760 mov @x[5],@x[5],ror#19
762 ldr @t[2], [sp,#4*(13)]
763 mov @x[6],@x[6],ror#19
765 ldr @x[14],[sp,#4*(14)]
766 mov @x[7],@x[7],ror#19
767 vadd.i32 $d2,$d1,$t0 @ counter+2
768 add @x[12],@x[12],#3 @ counter+3
769 mov @t[3],@t[3],ror#8 @ twist d[0..3]
770 mov @x[12],@x[12],ror#8
771 mov @t[2],@t[2],ror#8
772 mov @x[14],@x[14],ror#8
773 str @t[3], [sp,#4*(16+15)]
781 my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
782 my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
783 my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
784 my @thread3=&ROUND(0,4,8,12);
787 eval; eval(shift(@thread3));
788 eval(shift(@thread1)); eval(shift(@thread3));
789 eval(shift(@thread2)); eval(shift(@thread3));
792 @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
793 @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
794 @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
795 @thread3=&ROUND(0,5,10,15);
798 eval; eval(shift(@thread3));
799 eval(shift(@thread1)); eval(shift(@thread3));
800 eval(shift(@thread2)); eval(shift(@thread3));
806 vld1.32 {$t0-$t1},[sp] @ load key material
807 vld1.32 {$t2-$t3},[@t[3]]
809 ldr @t[3],[sp,#4*(32+2)] @ load len
811 str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
812 str @t[1], [sp,#4*(16+9)]
813 str @x[12],[sp,#4*(16+12)]
814 str @t[2], [sp,#4*(16+13)]
815 str @x[14],[sp,#4*(16+14)]
817 @ at this point we have first half of 512-bit result in
818 @ @x[0-7] and second half at sp+4*(16+8)
820 ldr r12,[sp,#4*(32+1)] @ load inp
821 ldr r14,[sp,#4*(32+0)] @ load out
823 vadd.i32 $a0,$a0,$t0 @ accumulate key material
826 vldr $t0#lo,[sp,#4*(16+0)] @ one
831 vldr $t1#lo,[sp,#4*(16+2)] @ two
836 vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
837 vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
846 vld1.8 {$t0-$t1},[r12]! @ load input
848 vld1.8 {$t2-$t3},[r12]!
849 veor $a0,$a0,$t0 @ xor with input
851 vld1.8 {$t0-$t1},[r12]!
854 vld1.8 {$t2-$t3},[r12]!
857 vst1.8 {$a0-$b0},[r14]! @ store output
859 vld1.8 {$t0-$t1},[r12]!
861 vst1.8 {$c0-$d0},[r14]!
863 vld1.8 {$t2-$t3},[r12]!
866 vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
867 veor $t0#hi,$t0#hi,$t0#hi
868 vldr $t0#lo,[sp,#4*(16+4)] @ four
870 vld1.32 {$c0-$d0},[@t[3]]
872 vst1.8 {$a1-$b1},[r14]!
874 vst1.8 {$c1-$d1},[r14]!
876 vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
877 vldr $t0#lo,[sp,#4*(16+0)] @ one
879 ldmia sp,{@t[0]-@t[3]} @ load key material
880 add @x[0],@x[0],@t[0] @ accumulate key material
881 ldr @t[0],[r12],#16 @ load input
882 vst1.8 {$a2-$b2},[r14]!
883 add @x[1],@x[1],@t[1]
885 vst1.8 {$c2-$d2},[r14]!
886 add @x[2],@x[2],@t[2]
888 add @x[3],@x[3],@t[3]
896 eor @x[0],@x[0],@t[0] @ xor with input
898 eor @x[1],@x[1],@t[1]
899 str @x[0],[r14],#16 @ store output
900 eor @x[2],@x[2],@t[2]
902 eor @x[3],@x[3],@t[3]
903 ldmia @t[0],{@t[0]-@t[3]} @ load key material
907 add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
908 ldr @t[0],[r12],#16 @ load input
909 add @x[5],@t[1],@x[5],ror#13
911 add @x[6],@t[2],@x[6],ror#13
913 add @x[7],@t[3],@x[7],ror#13
921 eor @x[4],@x[4],@t[0]
923 eor @x[5],@x[5],@t[1]
924 str @x[4],[r14],#16 @ store output
925 eor @x[6],@x[6],@t[2]
927 eor @x[7],@x[7],@t[3]
928 ldmia @t[0],{@t[0]-@t[3]} @ load key material
930 add @x[0],sp,#4*(16+8)
933 ldmia @x[0],{@x[0]-@x[7]} @ load second half
935 add @x[0],@x[0],@t[0] @ accumulate key material
936 ldr @t[0],[r12],#16 @ load input
937 add @x[1],@x[1],@t[1]
942 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
943 add @x[2],@x[2],@t[2]
948 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
949 add @x[3],@x[3],@t[3]
957 eor @x[0],@x[0],@t[0]
959 eor @x[1],@x[1],@t[1]
960 str @x[0],[r14],#16 @ store output
961 eor @x[2],@x[2],@t[2]
963 eor @x[3],@x[3],@t[3]
964 ldmia @t[0],{@t[0]-@t[3]} @ load key material
968 add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
969 add @t[0],@t[0],#4 @ next counter value
970 add @x[5],@t[1],@x[5],ror#24
971 str @t[0],[sp,#4*(12)] @ save next counter value
972 ldr @t[0],[r12],#16 @ load input
973 add @x[6],@t[2],@x[6],ror#24
974 add @x[4],@x[4],#3 @ counter+3
976 add @x[7],@t[3],@x[7],ror#24
985 eor @x[4],@x[4],@t[0]
989 ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
990 eor @x[5],@x[5],@t[1]
991 eor @x[6],@x[6],@t[2]
992 str @x[4],[r14],#16 @ store output
993 eor @x[7],@x[7],@t[3]
995 sub @t[3],@t[0],#64*4 @ len-=64*4
1004 @ harmonize NEON and integer-only stack frames: load data
1005 @ from NEON frame, but save to integer-only one; distance
1006 @ between the two is 4*(32+4+16-32)=4*(20).
1008 str @t[3], [sp,#4*(20+32+2)] @ save len
1009 add @t[3],sp,#4*(32+4)
1010 str r12, [sp,#4*(20+32+1)] @ save inp
1011 str r14, [sp,#4*(20+32+0)] @ save out
1013 ldr @x[12],[sp,#4*(16+10)]
1014 ldr @x[14],[sp,#4*(16+11)]
1015 vldmia @t[3],{d8-d15} @ fulfill ABI requirement
1016 str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
1017 str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
1019 ldr @t[3], [sp,#4*(15)]
1020 mov @x[4],@x[4],ror#19 @ twist b[0..3]
1021 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
1022 mov @x[5],@x[5],ror#19
1023 ldr @t[2], [sp,#4*(13)]
1024 mov @x[6],@x[6],ror#19
1025 ldr @x[14],[sp,#4*(14)]
1026 mov @x[7],@x[7],ror#19
1027 mov @t[3],@t[3],ror#8 @ twist d[0..3]
1028 mov @x[12],@x[12],ror#8
1029 mov @t[2],@t[2],ror#8
1030 mov @x[14],@x[14],ror#8
1031 str @t[3], [sp,#4*(20+16+15)]
1032 add @t[3],sp,#4*(20)
1033 vst1.32 {$a0-$b0},[@t[3]]! @ copy key
1034 add sp,sp,#4*(20) @ switch frame
1035 vst1.32 {$c0-$d0},[@t[3]]
1037 b .Loop @ go integer-only
1042 bhs .L192_or_more_neon
1044 bhs .L128_or_more_neon
1046 bhs .L64_or_more_neon
1049 vst1.8 {$a0-$b0},[sp]
1051 vst1.8 {$c0-$d0},[@t[0]]
1056 vld1.8 {$t0-$t1},[r12]!
1057 vld1.8 {$t2-$t3},[r12]!
1062 vst1.8 {$a0-$b0},[r14]!
1063 vst1.8 {$c0-$d0},[r14]!
1068 vst1.8 {$a1-$b1},[sp]
1070 vst1.8 {$c1-$d1},[@t[0]]
1071 sub @t[3],@t[3],#64*1 @ len-=64*1
1076 vld1.8 {$t0-$t1},[r12]!
1077 vld1.8 {$t2-$t3},[r12]!
1080 vld1.8 {$t0-$t1},[r12]!
1083 vld1.8 {$t2-$t3},[r12]!
1087 vst1.8 {$a0-$b0},[r14]!
1089 vst1.8 {$c0-$d0},[r14]!
1091 vst1.8 {$a1-$b1},[r14]!
1092 vst1.8 {$c1-$d1},[r14]!
1097 vst1.8 {$a2-$b2},[sp]
1099 vst1.8 {$c2-$d2},[@t[0]]
1100 sub @t[3],@t[3],#64*2 @ len-=64*2
1105 vld1.8 {$t0-$t1},[r12]!
1106 vld1.8 {$t2-$t3},[r12]!
1109 vld1.8 {$t0-$t1},[r12]!
1112 vld1.8 {$t2-$t3},[r12]!
1116 vld1.8 {$t0-$t1},[r12]!
1118 vst1.8 {$a0-$b0},[r14]!
1120 vld1.8 {$t2-$t3},[r12]!
1123 vst1.8 {$c0-$d0},[r14]!
1125 vst1.8 {$a1-$b1},[r14]!
1127 vst1.8 {$c1-$d1},[r14]!
1129 vst1.8 {$a2-$b2},[r14]!
1130 vst1.8 {$c2-$d2},[r14]!
1134 ldmia sp,{@t[0]-@t[3]} @ load key material
1135 add @x[0],@x[0],@t[0] @ accumulate key material
1137 add @x[1],@x[1],@t[1]
1138 add @x[2],@x[2],@t[2]
1139 add @x[3],@x[3],@t[3]
1140 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1142 add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
1144 add @x[5],@t[1],@x[5],ror#13
1145 add @x[6],@t[2],@x[6],ror#13
1146 add @x[7],@t[3],@x[7],ror#13
1147 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1158 stmia sp,{@x[0]-@x[7]}
1159 add @x[0],sp,#4*(16+8)
1161 ldmia @x[0],{@x[0]-@x[7]} @ load second half
1163 add @x[0],@x[0],@t[0] @ accumulate key material
1164 add @t[0],sp,#4*(12)
1165 add @x[1],@x[1],@t[1]
1166 add @x[2],@x[2],@t[2]
1167 add @x[3],@x[3],@t[3]
1168 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1170 add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
1172 add @x[5],@t[1],@x[5],ror#24
1173 add @x[4],@x[4],#3 @ counter+3
1174 add @x[6],@t[2],@x[6],ror#24
1175 add @x[7],@t[3],@x[7],ror#24
1176 ldr @t[3],[sp,#4*(32+2)] @ re-load len
1187 stmia @t[0],{@x[0]-@x[7]}
1189 sub @t[3],@t[3],#64*3 @ len-=64*3
1192 ldrb @t[0],[@t[2]],#1 @ read buffer on stack
1193 ldrb @t[1],[r12],#1 @ read input
1195 eor @t[0],@t[0],@t[1]
1196 strb @t[0],[r14],#1 @ store output
1203 ldmia sp!,{r4-r11,pc}
1204 .size ChaCha20_neon,.-ChaCha20_neon
1206 .comm OPENSSL_armcap_P,4,4
# Post-processing of the generated text. First pass (fragment): turn
# leading '#' into '@' (the ARM assembler comment character), stopping
# at the first line that is neither a comment nor blank.
1215 last if (!s/^#/@/ and !/^$/);
# Final pass over $code: evaluate `...` escapes as Perl, and rewrite the
# qN#lo / qN#hi notation into the corresponding d register (d(2N) for
# lo, d(2N+1) for hi) before the text is printed.
1220 foreach (split("\n",$code)) {
1221 s/\`([^\`]*)\`/eval $1/geo;
1223 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;