#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# September 2018
#
# Improve scalar performance per Eric Biggers' suggestion to eliminate
# separate rotates. This requires b[0..3] and d[0..3] to be maintained
# pre-rotated, hence the odd "twists" prior to the inner loop and when
# accumulating key material (see the C sketch following this header).
# Since the instruction count is reduced as a result, even NEON
# performance improves somewhat, most notably by ~9% on low-end
# Cortex-A5/A7. A full unroll was shown to provide even better scalar
# performance on Cortex-A5/A7, naturally at the cost of a manyfold size
# increase. We let it be. Oversized code works in benchmarks, but is not
# necessarily optimal in real life, where it's likely to be out-of-cache
# upon entry and to evict a significant part of the cache upon completion.
#
# Performance in cycles per byte out of large buffer.
#
#                       IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5             14.2(*)/+160%   21.8        12.9(**)
# Cortex-A8             10.2(*)/+190%   13.9        6.10
# Cortex-A9             10.8(*)/+150%   14.3        6.50
# Cortex-A15            11.0/+40%       16.0        4.90
# Snapdragon S4         13.9(***)/+90%  13.6        4.90
#
# (*)   most "favourable" result for aligned data on a little-endian
#       processor; the result for misaligned data is 10-15% lower;
# (**)  pure 4xNEON [with "vertical" layout] was shown to provide ~8%
#       better performance on Cortex-A5/A7, but not on others;
# (***) it's 17% slower than the original; the trade-off is considered
#       acceptable because of the improvement on others, specifically
#       +36% on Cortex-A5/A7 and +20% on Cortex-A9;

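# For reference, the quarter-round this file computes, in plain C
# (RFC 7539/8439 notation; an illustrative sketch only, not part of the
# generated code):
#
#	static inline uint32_t rol32(uint32_t v, int n)
#	{	return (v << n) | (v >> (32 - n));	}
#
#	static void qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
#	{
#		*a += *b; *d ^= *a; *d = rol32(*d, 16);
#		*c += *d; *b ^= *c; *b = rol32(*b, 12);
#		*a += *b; *d ^= *a; *d = rol32(*d, 8);
#		*c += *d; *b ^= *c; *b = rol32(*b, 7);
#	}
#
# On ARM none of these rotates has to be a standalone instruction: the
# shifter operand, "op rd,rn,rm,ror#n", rotates rm for free, so b[0..3]
# and d[0..3] are kept in rotated ("twisted") form between operations
# and are only untwisted when key material is accumulated.
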
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
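
# The usual perlasm driving convention applies: the first argument picks
# an arm-xlate.pl flavour and the last names the output file, e.g.
# (an illustrative invocation, not prescribed by this file):
#
#	perl chacha20-arm.pl linux32 chacha20-arm.S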

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
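
# Under this thunk a call such as
#
#	&add(@x[0],@x[0],@x[4],'ror#13');
#
# appends "\tadd\tr0,r0,r4,ror#13\n" to $code, while a trailing numeric
# argument is prefixed with '#' to become an immediate.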

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

        # Consider the order in which the variables are addressed by
        # their index:
        #
        #       a   b   c   d
        #
        #       0   4   8  12 < even round
        #       1   5   9  13
        #       2   6  10  14
        #       3   7  11  15
        #       0   5  10  15 < odd round
        #       1   6  11  12
        #       2   7   8  13
        #       3   4   9  14
        #
        # 'a' and 'b' are permanently allocated in registers, @x[0..7],
        # while the 'c's and a pair of 'd's are maintained in memory. If
        # you observe the 'c' column, you'll notice that a pair of 'c's
        # is invariant between rounds, so they only have to be reloaded
        # once per round, in the middle. This is why you'll see a bunch
        # of 'c' stores and loads in the middle, but none at the
        # beginning or end. If you observe the 'd' column, you'll notice
        # that 15 and 13 are reused in the next pair of rounds, which is
        # why those two are chosen for offloading to memory, so that
        # each load counts for more.
                                                        push @ret,(
        "&add   (@x[$a0],@x[$a0],@x[$b0],'ror#13')",
         "&add  (@x[$a1],@x[$a1],@x[$b1],'ror#13')",
        "&eor   ($xd,@x[$a0],$xd,'ror#24')",
         "&eor  ($xd_,@x[$a1],$xd_,'ror#24')",

        "&add   ($xc,$xc,$xd,'ror#16')",
         "&add  ($xc_,$xc_,$xd_,'ror#16')",
        "&eor   (@x[$b0],$xc, @x[$b0],'ror#13')",
         "&eor  (@x[$b1],$xc_,@x[$b1],'ror#13')",

        "&add   (@x[$a0],@x[$a0],@x[$b0],'ror#20')",
         "&add  (@x[$a1],@x[$a1],@x[$b1],'ror#20')",
        "&eor   ($xd,@x[$a0],$xd,'ror#16')",
         "&eor  ($xd_,@x[$a1],$xd_,'ror#16')"           );
                                                        push @ret,(
        "&str   ($xd,'[sp,#4*(16+$d0)]')"               ) if ($odd);
                                                        push @ret,(
        "&add   ($xc,$xc,$xd,'ror#24')"                 );
                                                        push @ret,(
        "&ldr   ($xd,'[sp,#4*(16+$d2)]')"               ) if ($odd);
                                                        push @ret,(
         "&str  ($xd_,'[sp,#4*(16+$d1)]')"              ) if (!$odd);
                                                        push @ret,(
         "&add  ($xc_,$xc_,$xd_,'ror#24')"              );
                                                        push @ret,(
         "&ldr  ($xd_,'[sp,#4*(16+$d3)]')"              ) if (!$odd);
                                                        push @ret,(
        "&str   ($xc,'[sp,#4*(16+$c0)]')",
        "&eor   (@x[$b0],@x[$b0],$xc,'ror#12')",
         "&str  ($xc_,'[sp,#4*(16+$c1)]')",
         "&eor  (@x[$b1],@x[$b1],$xc_,'ror#12')"        );

        $xd=@x[$d2]                                     if (!$odd);
        $xd_=@x[$d3]                                    if ($odd);
                                                        push @ret,(
        "&ldr   ($xc,'[sp,#4*(16+$c2)]')",
        "&add   (@x[$a2],@x[$a2],@x[$b2],'ror#13')",
         "&ldr  ($xc_,'[sp,#4*(16+$c3)]')",
         "&add  (@x[$a3],@x[$a3],@x[$b3],'ror#13')",
        "&eor   ($xd,@x[$a2],$xd,'ror#24')",
         "&eor  ($xd_,@x[$a3],$xd_,'ror#24')",

        "&add   ($xc,$xc,$xd,'ror#16')",
         "&add  ($xc_,$xc_,$xd_,'ror#16')",
        "&eor   (@x[$b2],$xc, @x[$b2],'ror#13')",
         "&eor  (@x[$b3],$xc_,@x[$b3],'ror#13')",

        "&add   (@x[$a2],@x[$a2],@x[$b2],'ror#20')",
         "&add  (@x[$a3],@x[$a3],@x[$b3],'ror#20')",
        "&eor   ($xd,@x[$a2],$xd,'ror#16')",
         "&eor  ($xd_,@x[$a3],$xd_,'ror#16')",

        "&add   ($xc,$xc,$xd,'ror#24')",
         "&add  ($xc_,$xc_,$xd_,'ror#24')",
        "&eor   (@x[$b2],@x[$b2],$xc,'ror#12')",
         "&eor  (@x[$b3],@x[$b3],$xc_,'ror#12')"        );

        @ret;
}
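
# In C terms, one pass through .Loop below is the standard ChaCha
# double round over the 4x4 state (a sketch, assuming the qr() helper
# from the header comment):
#
#	/* even round: columns */
#	qr(&x[0], &x[4], &x[ 8], &x[12]);
#	qr(&x[1], &x[5], &x[ 9], &x[13]);
#	qr(&x[2], &x[6], &x[10], &x[14]);
#	qr(&x[3], &x[7], &x[11], &x[15]);
#	/* odd round: diagonals */
#	qr(&x[0], &x[5], &x[10], &x[15]);
#	qr(&x[1], &x[6], &x[11], &x[12]);
#	qr(&x[2], &x[7], &x[ 8], &x[13]);
#	qr(&x[3], &x[4], &x[ 9], &x[14]);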

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define ChaCha20_ctr32 chacha20_arm_cryptogams
# define ChaCha20_neon  chacha20_neon
#endif

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
# define ldrhsb ldrbhs
#endif
#if defined(__thumb2__)
.thumb
#else
.code   32
#endif

.align  5
.Lsigma:
.long   0x61707865,0x3320646e,0x79622d32,0x6b206574     @ endian-neutral
.Lone:
.long   1,0,0,0
.Lrot8:
.long   0x02010003,0x06050407
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word   -1
#endif

.globl  ChaCha20_ctr32
.type   ChaCha20_ctr32,%function
.align  5
ChaCha20_ctr32:
.LChaCha20_ctr32:
        ldr     r12,[sp,#0]             @ pull pointer to counter and nonce
        stmdb   sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
        sub     r14,pc,#16              @ ChaCha20_ctr32
#else
        adr     r14,.LChaCha20_ctr32
#endif
        cmp     r2,#0                   @ len==0?
#ifdef  __thumb2__
        itt     eq
#endif
        addeq   sp,sp,#4*3
        beq     .Lno_data
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        cmp     r2,#192                 @ test len
        bls     .Lshort
        ldr     r4,[r14,#-24]
        ldr     r4,[r14,r4]
# ifdef __APPLE__
        ldr     r4,[r4]
# endif
        tst     r4,#ARMV7_NEON
        bne     .LChaCha20_neon
.Lshort:
#endif
        ldmia   r12,{r4-r7}             @ load counter and nonce
        sub     sp,sp,#4*(16)           @ off-load area
        sub     r14,r14,#64             @ .Lsigma
        stmdb   sp!,{r4-r7}             @ copy counter and nonce
        ldmia   r3,{r4-r11}             @ load key
        ldmia   r14,{r0-r3}             @ load sigma
        stmdb   sp!,{r4-r11}            @ copy key
        stmdb   sp!,{r0-r3}             @ copy sigma
        str     r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
        str     r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
        b       .Loop_outer_enter

.align  4
.Loop_outer:
        ldmia   sp,{r0-r9}              @ load key material
        str     @t[3],[sp,#4*(32+2)]    @ save len
        str     r12,  [sp,#4*(32+1)]    @ save inp
        str     r14,  [sp,#4*(32+0)]    @ save out
.Loop_outer_enter:
        ldr     @t[3], [sp,#4*(15)]
         mov    @x[4],@x[4],ror#19      @ twist b[0..3]
        ldr     @x[12],[sp,#4*(12)]     @ modulo-scheduled load
         mov    @x[5],@x[5],ror#19
        ldr     @t[2], [sp,#4*(13)]
         mov    @x[6],@x[6],ror#19
        ldr     @x[14],[sp,#4*(14)]
         mov    @x[7],@x[7],ror#19
        mov     @t[3],@t[3],ror#8       @ twist d[0..3]
        mov     @x[12],@x[12],ror#8
        mov     @t[2],@t[2],ror#8
        mov     @x[14],@x[14],ror#8
        str     @t[3], [sp,#4*(16+15)]
        mov     @t[3],#10
        b       .Loop

.align  4
.Loop:
        subs    @t[3],@t[3],#1
___
        foreach (&ROUND(0, 4, 8,12)) { eval; }
        foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
        bne     .Loop

        ldr     @t[3],[sp,#4*(32+2)]    @ load len

        str     @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
        str     @t[1], [sp,#4*(16+9)]
        str     @x[12],[sp,#4*(16+12)]
        str     @t[2], [sp,#4*(16+13)]
        str     @x[14],[sp,#4*(16+14)]

        @ at this point we have first half of 512-bit result in
        @ @x[0-7] and second half at sp+4*(16+8)

        cmp     @t[3],#64               @ done yet?
#ifdef  __thumb2__
        itete   lo
#endif
        addlo   r12,sp,#4*(0)           @ shortcut or ...
        ldrhs   r12,[sp,#4*(32+1)]      @ ... load inp
        addlo   r14,sp,#4*(0)           @ shortcut or ...
        ldrhs   r14,[sp,#4*(32+0)]      @ ... load out

        ldr     @t[0],[sp,#4*(0)]       @ load key material
        ldr     @t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
        orr     @t[2],r12,r14
        tst     @t[2],#3                @ are input and output aligned?
        ldr     @t[2],[sp,#4*(2)]
        bne     .Lunaligned
        cmp     @t[3],#64               @ restore flags
# else
        ldr     @t[2],[sp,#4*(2)]
# endif
        ldr     @t[3],[sp,#4*(3)]

        add     @x[0],@x[0],@t[0]       @ accumulate key material
        add     @x[1],@x[1],@t[1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]

        add     @x[2],@x[2],@t[2]
        add     @x[3],@x[3],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[0],@x[0]
        rev     @x[1],@x[1]
        rev     @x[2],@x[2]
        rev     @x[3],@x[3]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[0],@x[0],@t[0]       @ xor with input
        eorhs   @x[1],@x[1],@t[1]
         add    @t[0],sp,#4*(4)
        str     @x[0],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[2],@x[2],@t[2]
        eorhs   @x[3],@x[3],@t[3]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[1],[r14,#-12]
        str     @x[2],[r14,#-8]
        str     @x[3],[r14,#-4]

        add     @x[4],@t[0],@x[4],ror#13 @ accumulate key material
        add     @x[5],@t[1],@x[5],ror#13
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
        add     @x[6],@t[2],@x[6],ror#13
        add     @x[7],@t[3],@x[7],ror#13
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[4],@x[4]
        rev     @x[5],@x[5]
        rev     @x[6],@x[6]
        rev     @x[7],@x[7]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[4],@x[4],@t[0]
        eorhs   @x[5],@x[5],@t[1]
         add    @t[0],sp,#4*(8)
        str     @x[4],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[6],@x[6],@t[2]
        eorhs   @x[7],@x[7],@t[3]
        str     @x[5],[r14,#-12]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[6],[r14,#-8]
         add    @x[0],sp,#4*(16+8)
        str     @x[7],[r14,#-4]

        ldmia   @x[0],{@x[0]-@x[7]}     @ load second half

        add     @x[0],@x[0],@t[0]       @ accumulate key material
        add     @x[1],@x[1],@t[1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
# ifdef __thumb2__
        itt     hi
# endif
         strhi  @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
         strhi  @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
        add     @x[2],@x[2],@t[2]
        add     @x[3],@x[3],@t[3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[0],@x[0]
        rev     @x[1],@x[1]
        rev     @x[2],@x[2]
        rev     @x[3],@x[3]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[0],@x[0],@t[0]
        eorhs   @x[1],@x[1],@t[1]
         add    @t[0],sp,#4*(12)
        str     @x[0],[r14],#16         @ store output
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[2],@x[2],@t[2]
        eorhs   @x[3],@x[3],@t[3]
        str     @x[1],[r14,#-12]
         ldmia  @t[0],{@t[0]-@t[3]}     @ load key material
        str     @x[2],[r14,#-8]
        str     @x[3],[r14,#-4]

        add     @x[4],@t[0],@x[4],ror#24 @ accumulate key material
        add     @x[5],@t[1],@x[5],ror#24
# ifdef __thumb2__
        itt     hi
# endif
         addhi  @t[0],@t[0],#1          @ next counter value
         strhi  @t[0],[sp,#4*(12)]      @ save next counter value
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[0],[r12],#16         @ load input
        ldrhs   @t[1],[r12,#-12]
        add     @x[6],@t[2],@x[6],ror#24
        add     @x[7],@t[3],@x[7],ror#24
# ifdef __thumb2__
        itt     hs
# endif
        ldrhs   @t[2],[r12,#-8]
        ldrhs   @t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
        rev     @x[4],@x[4]
        rev     @x[5],@x[5]
        rev     @x[6],@x[6]
        rev     @x[7],@x[7]
# endif
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[4],@x[4],@t[0]
        eorhs   @x[5],@x[5],@t[1]
# ifdef __thumb2__
         it     ne
# endif
         ldrne  @t[0],[sp,#4*(32+2)]    @ re-load len
# ifdef __thumb2__
        itt     hs
# endif
        eorhs   @x[6],@x[6],@t[2]
        eorhs   @x[7],@x[7],@t[3]
        str     @x[4],[r14],#16         @ store output
        str     @x[5],[r14,#-12]
# ifdef __thumb2__
        it      hs
# endif
         subhs  @t[3],@t[0],#64         @ len-=64
        str     @x[6],[r14,#-8]
        str     @x[7],[r14,#-4]
        bhi     .Loop_outer

        beq     .Ldone
# if __ARM_ARCH__<7
        b       .Ltail

.align  4
.Lunaligned:                            @ unaligned endian-neutral path
        cmp     @t[3],#64               @ restore flags
# endif
#endif
#if __ARM_ARCH__<7
        ldr     @t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;
my $twist="";
if ($i==4)     { $twist = ",ror#13"; }
elsif ($i==12) { $twist = ",ror#24"; }

$code.=<<___    if ($i==4);
        add     @x[0],sp,#4*(16+8)
___
$code.=<<___    if ($i==8);
        ldmia   @x[0],{@x[0]-@x[7]}             @ load second half
# ifdef __thumb2__
        itt     hi
# endif
        strhi   @t[2],[sp,#4*(16+10)]           @ copy "@x[10]"
        strhi   @t[3],[sp,#4*(16+11)]           @ copy "@x[11]"
___
$code.=<<___;
        add     @x[$j+0],@t[0],@x[$j+0]$twist   @ accumulate key material
___
$code.=<<___    if ($i==12);
# ifdef __thumb2__
        itt     hi
# endif
        addhi   @t[0],@t[0],#1                  @ next counter value
        strhi   @t[0],[sp,#4*(12)]              @ save next counter value
___
$code.=<<___;
        add     @x[$j+1],@t[1],@x[$j+1]$twist
        add     @x[$j+2],@t[2],@x[$j+2]$twist
# ifdef __thumb2__
        itete   lo
# endif
        eorlo   @t[0],@t[0],@t[0]               @ zero or ...
        ldrhsb  @t[0],[r12],#16                 @ ... load input
        eorlo   @t[1],@t[1],@t[1]
        ldrhsb  @t[1],[r12,#-12]

        add     @x[$j+3],@t[3],@x[$j+3]$twist
# ifdef __thumb2__
        itete   lo
# endif
        eorlo   @t[2],@t[2],@t[2]
        ldrhsb  @t[2],[r12,#-8]
        eorlo   @t[3],@t[3],@t[3]
        ldrhsb  @t[3],[r12,#-4]

        eor     @x[$j+0],@t[0],@x[$j+0]         @ xor with input (or zero)
        eor     @x[$j+1],@t[1],@x[$j+1]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-15]                @ load more input
        ldrhsb  @t[1],[r12,#-11]
        eor     @x[$j+2],@t[2],@x[$j+2]
         strb   @x[$j+0],[r14],#16              @ store output
        eor     @x[$j+3],@t[3],@x[$j+3]
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-7]
        ldrhsb  @t[3],[r12,#-3]
         strb   @x[$j+1],[r14,#-12]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+2],[r14,#-8]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-14]                @ load more input
        ldrhsb  @t[1],[r12,#-10]
         strb   @x[$j+3],[r14,#-4]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+0],[r14,#-15]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-6]
        ldrhsb  @t[3],[r12,#-2]
         strb   @x[$j+1],[r14,#-11]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+2],[r14,#-7]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[0],[r12,#-13]                @ load more input
        ldrhsb  @t[1],[r12,#-9]
         strb   @x[$j+3],[r14,#-3]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+0],[r14,#-14]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef __thumb2__
        itt     hs
# endif
        ldrhsb  @t[2],[r12,#-5]
        ldrhsb  @t[3],[r12,#-1]
         strb   @x[$j+1],[r14,#-10]
         strb   @x[$j+2],[r14,#-6]
        eor     @x[$j+0],@t[0],@x[$j+0],lsr#8
         strb   @x[$j+3],[r14,#-2]
        eor     @x[$j+1],@t[1],@x[$j+1],lsr#8
         strb   @x[$j+0],[r14,#-13]
        eor     @x[$j+2],@t[2],@x[$j+2],lsr#8
         strb   @x[$j+1],[r14,#-9]
        eor     @x[$j+3],@t[3],@x[$j+3],lsr#8
         strb   @x[$j+2],[r14,#-5]
         strb   @x[$j+3],[r14,#-1]
___
$code.=<<___    if ($i<12);
        add     @t[0],sp,#4*(4+$i)
        ldmia   @t[0],{@t[0]-@t[3]}             @ load key material
___
}
$code.=<<___;
# ifdef __thumb2__
        it      ne
# endif
        ldrne   @t[0],[sp,#4*(32+2)]            @ re-load len
# ifdef __thumb2__
        it      hs
# endif
        subhs   @t[3],@t[0],#64                 @ len-=64
        bhi     .Loop_outer

        beq     .Ldone
#endif

.Ltail:
        ldr     r12,[sp,#4*(32+1)]      @ load inp
        add     @t[1],sp,#4*(0)
        ldr     r14,[sp,#4*(32+0)]      @ load out

.Loop_tail:
        ldrb    @t[2],[@t[1]],#1        @ read buffer on stack
        ldrb    @t[3],[r12],#1          @ read input
        subs    @t[0],@t[0],#1
        eor     @t[3],@t[3],@t[2]
        strb    @t[3],[r14],#1          @ store output
        bne     .Loop_tail

.Ldone:
        add     sp,sp,#4*(32+3)
.Lno_data:
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r11,pc}
#else
        ldmia   sp!,{r4-r12,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .long   0xe12fff1e              @ interoperable with Thumb ISA:-)
#endif
.size   ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

# This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on
# Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%!
sub vperm()
{ my ($dst,$src,$tbl) = @_;
    $code .= "  vtbl.8  $dst#lo,{$src#lo},$tbl#lo\n";
    $code .= "  vtbl.8  $dst#hi,{$src#hi},$tbl#lo\n";
}
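
# .Lrot8 above (0x02010003,0x06050407) supplies the vtbl byte indices
# for this: within each 64-bit half the destination bytes select source
# bytes (3,0,1,2) and (7,4,5,6), i.e. each 32-bit lane is rotated left
# by 8 bits, so two table lookups stand in for the vshr#24/vsli#8 pair.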

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

        (
        "&vadd_i32      ($a,$a,$b)",
        "&veor          ($d,$d,$a)",
        "&vrev32_16     ($d,$d)",       # vrot ($d,16)

        "&vadd_i32      ($c,$c,$d)",
        "&veor          ($t,$b,$c)",
        "&vshr_u32      ($b,$t,20)",
        "&vsli_32       ($b,$t,12)",

        "&vadd_i32      ($a,$a,$b)",
        "&veor          ($t,$d,$a)",
        "&vshr_u32      ($d,$t,24)",
        "&vsli_32       ($d,$t,8)",
        #"&vperm        ($d,$t,$t3)",

        "&vadd_i32      ($c,$c,$d)",
        "&veor          ($t,$b,$c)",
        "&vshr_u32      ($b,$t,25)",
        "&vsli_32       ($b,$t,7)",

        "&vext_8        ($a,$a,$a,$odd?4:12)",
        "&vext_8        ($d,$d,$d,8)",
        "&vext_8        ($c,$c,$c,$odd?12:4)"
        );
}
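
# In this NEON path each q register holds one row of the 4x4 state, so
# a single NEONROUND call advances a whole 64-byte block; the trailing
# vext.8 triplet rotates the rows so that successive calls alternate
# between column and diagonal quarter-rounds, mirroring the scalar
# index table in ROUND above.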

$code.=<<___;
#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
.arch   armv7-a
.fpu    neon

# ifdef __KERNEL__
.globl  ChaCha20_neon
@ For optimal performance it's appropriate for the caller to enforce
@ a minimum input length; 193 bytes is suggested.
# endif
.type   ChaCha20_neon,%function
.align  5
ChaCha20_neon:
        ldr             r12,[sp,#0]             @ pull pointer to counter and nonce
        stmdb           sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
        adr             r14,.Lsigma
        vstmdb          sp!,{d8-d15}            @ ABI spec says so
        stmdb           sp!,{r0-r3}

        vld1.32         {$b0-$c0},[r3]          @ load key
        ldmia           r3,{r4-r11}             @ load key

        sub             sp,sp,#4*(16+16)
        vld1.32         {$d0},[r12]             @ load counter and nonce
        add             r12,sp,#4*8
        ldmia           r14,{r0-r3}             @ load sigma
        vld1.32         {$a0},[r14]!            @ load sigma
        vld1.32         {$t0},[r14]!            @ one
        @ vld1.32       {$t3#lo},[r14]          @ rot8
        vst1.32         {$c0-$d0},[r12]         @ copy 1/2key|counter|nonce
        vst1.32         {$a0-$b0},[sp]          @ copy sigma|1/2key

        str             r10,[sp,#4*(16+10)]     @ off-load "@x[10]"
        str             r11,[sp,#4*(16+11)]     @ off-load "@x[11]"
        vshl.i32        $t1#lo,$t0#lo,#1        @ two
        vstr            $t0#lo,[sp,#4*(16+0)]
        vshl.i32        $t2#lo,$t0#lo,#2        @ four
        vstr            $t1#lo,[sp,#4*(16+2)]
        vmov            $a1,$a0
        vstr            $t2#lo,[sp,#4*(16+4)]
        vmov            $a2,$a0
        @ vstr          $t3#lo,[sp,#4*(16+6)]
        vmov            $b1,$b0
        vmov            $b2,$b0
        b               .Loop_neon_enter

.align  4
.Loop_neon_outer:
        ldmia           sp,{r0-r9}              @ load key material
        cmp             @t[3],#64*2             @ if len<=64*2
        bls             .Lbreak_neon            @ switch to integer-only
        @ vldr          $t3#lo,[sp,#4*(16+6)]   @ rot8
        vmov            $a1,$a0
        str             @t[3],[sp,#4*(32+2)]    @ save len
        vmov            $a2,$a0
        str             r12,  [sp,#4*(32+1)]    @ save inp
        vmov            $b1,$b0
        str             r14,  [sp,#4*(32+0)]    @ save out
        vmov            $b2,$b0
.Loop_neon_enter:
        ldr             @t[3], [sp,#4*(15)]
         mov            @x[4],@x[4],ror#19      @ twist b[0..3]
        vadd.i32        $d1,$d0,$t0             @ counter+1
        ldr             @x[12],[sp,#4*(12)]     @ modulo-scheduled load
         mov            @x[5],@x[5],ror#19
        vmov            $c1,$c0
        ldr             @t[2], [sp,#4*(13)]
         mov            @x[6],@x[6],ror#19
        vmov            $c2,$c0
        ldr             @x[14],[sp,#4*(14)]
         mov            @x[7],@x[7],ror#19
        vadd.i32        $d2,$d1,$t0             @ counter+2
        add             @x[12],@x[12],#3        @ counter+3
        mov             @t[3],@t[3],ror#8       @ twist d[0..3]
        mov             @x[12],@x[12],ror#8
        mov             @t[2],@t[2],ror#8
        mov             @x[14],@x[14],ror#8
        str             @t[3], [sp,#4*(16+15)]
        mov             @t[3],#10
        b               .Loop_neon

.align  4
.Loop_neon:
        subs            @t[3],@t[3],#1
___
        my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
        my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
        my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
        my @thread3=&ROUND(0,4,8,12);

        foreach (@thread0) {
                eval;                   eval(shift(@thread3));
                eval(shift(@thread1));  eval(shift(@thread3));
                eval(shift(@thread2));  eval(shift(@thread3));
        }

        @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
        @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
        @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
        @thread3=&ROUND(0,5,10,15);

        foreach (@thread0) {
                eval;                   eval(shift(@thread3));
                eval(shift(@thread1));  eval(shift(@thread3));
                eval(shift(@thread2));  eval(shift(@thread3));
        }
$code.=<<___;
        bne             .Loop_neon

        add             @t[3],sp,#32
        vld1.32         {$t0-$t1},[sp]          @ load key material
        vld1.32         {$t2-$t3},[@t[3]]

        ldr             @t[3],[sp,#4*(32+2)]    @ load len

        str             @t[0], [sp,#4*(16+8)]   @ modulo-scheduled store
        str             @t[1], [sp,#4*(16+9)]
        str             @x[12],[sp,#4*(16+12)]
        str             @t[2], [sp,#4*(16+13)]
        str             @x[14],[sp,#4*(16+14)]

        @ at this point we have first half of 512-bit result in
        @ @x[0-7] and second half at sp+4*(16+8)

        ldr             r12,[sp,#4*(32+1)]      @ load inp
        ldr             r14,[sp,#4*(32+0)]      @ load out

        vadd.i32        $a0,$a0,$t0             @ accumulate key material
        vadd.i32        $a1,$a1,$t0
        vadd.i32        $a2,$a2,$t0
        vldr            $t0#lo,[sp,#4*(16+0)]   @ one

        vadd.i32        $b0,$b0,$t1
        vadd.i32        $b1,$b1,$t1
        vadd.i32        $b2,$b2,$t1
        vldr            $t1#lo,[sp,#4*(16+2)]   @ two

        vadd.i32        $c0,$c0,$t2
        vadd.i32        $c1,$c1,$t2
        vadd.i32        $c2,$c2,$t2
        vadd.i32        $d1#lo,$d1#lo,$t0#lo    @ counter+1
        vadd.i32        $d2#lo,$d2#lo,$t1#lo    @ counter+2

        vadd.i32        $d0,$d0,$t3
        vadd.i32        $d1,$d1,$t3
        vadd.i32        $d2,$d2,$t3

        cmp             @t[3],#64*4
        blo             .Ltail_neon

        vld1.8          {$t0-$t1},[r12]!        @ load input
         mov            @t[3],sp
        vld1.8          {$t2-$t3},[r12]!
        veor            $a0,$a0,$t0             @ xor with input
        veor            $b0,$b0,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c0,$c0,$t2
        veor            $d0,$d0,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a1,$a1,$t0
         vst1.8         {$a0-$b0},[r14]!        @ store output
        veor            $b1,$b1,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c1,$c1,$t2
         vst1.8         {$c0-$d0},[r14]!
        veor            $d1,$d1,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a2,$a2,$t0
         vld1.32        {$a0-$b0},[@t[3]]!      @ load for next iteration
         veor           $t0#hi,$t0#hi,$t0#hi
         vldr           $t0#lo,[sp,#4*(16+4)]   @ four
        veor            $b2,$b2,$t1
         vld1.32        {$c0-$d0},[@t[3]]
        veor            $c2,$c2,$t2
         vst1.8         {$a1-$b1},[r14]!
        veor            $d2,$d2,$t3
         vst1.8         {$c1-$d1},[r14]!

        vadd.i32        $d0#lo,$d0#lo,$t0#lo    @ next counter value
        vldr            $t0#lo,[sp,#4*(16+0)]   @ one

        ldmia           sp,{@t[0]-@t[3]}        @ load key material
        add             @x[0],@x[0],@t[0]       @ accumulate key material
        ldr             @t[0],[r12],#16         @ load input
         vst1.8         {$a2-$b2},[r14]!
        add             @x[1],@x[1],@t[1]
        ldr             @t[1],[r12,#-12]
         vst1.8         {$c2-$d2},[r14]!
        add             @x[2],@x[2],@t[2]
        ldr             @t[2],[r12,#-8]
        add             @x[3],@x[3],@t[3]
        ldr             @t[3],[r12,#-4]
# ifdef __ARMEB__
        rev             @x[0],@x[0]
        rev             @x[1],@x[1]
        rev             @x[2],@x[2]
        rev             @x[3],@x[3]
# endif
        eor             @x[0],@x[0],@t[0]       @ xor with input
         add            @t[0],sp,#4*(4)
        eor             @x[1],@x[1],@t[1]
        str             @x[0],[r14],#16         @ store output
        eor             @x[2],@x[2],@t[2]
        str             @x[1],[r14,#-12]
        eor             @x[3],@x[3],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
        str             @x[2],[r14,#-8]
        str             @x[3],[r14,#-4]

        add             @x[4],@t[0],@x[4],ror#13 @ accumulate key material
        ldr             @t[0],[r12],#16         @ load input
        add             @x[5],@t[1],@x[5],ror#13
        ldr             @t[1],[r12,#-12]
        add             @x[6],@t[2],@x[6],ror#13
        ldr             @t[2],[r12,#-8]
        add             @x[7],@t[3],@x[7],ror#13
        ldr             @t[3],[r12,#-4]
# ifdef __ARMEB__
        rev             @x[4],@x[4]
        rev             @x[5],@x[5]
        rev             @x[6],@x[6]
        rev             @x[7],@x[7]
# endif
        eor             @x[4],@x[4],@t[0]
         add            @t[0],sp,#4*(8)
        eor             @x[5],@x[5],@t[1]
        str             @x[4],[r14],#16         @ store output
        eor             @x[6],@x[6],@t[2]
        str             @x[5],[r14,#-12]
        eor             @x[7],@x[7],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
        str             @x[6],[r14,#-8]
         add            @x[0],sp,#4*(16+8)
        str             @x[7],[r14,#-4]

        ldmia           @x[0],{@x[0]-@x[7]}     @ load second half

        add             @x[0],@x[0],@t[0]       @ accumulate key material
        ldr             @t[0],[r12],#16         @ load input
        add             @x[1],@x[1],@t[1]
        ldr             @t[1],[r12,#-12]
# ifdef __thumb2__
        it      hi
# endif
         strhi          @t[2],[sp,#4*(16+10)]   @ copy "@x[10]" while at it
        add             @x[2],@x[2],@t[2]
        ldr             @t[2],[r12,#-8]
# ifdef __thumb2__
        it      hi
# endif
         strhi          @t[3],[sp,#4*(16+11)]   @ copy "@x[11]" while at it
        add             @x[3],@x[3],@t[3]
        ldr             @t[3],[r12,#-4]
# ifdef __ARMEB__
        rev             @x[0],@x[0]
        rev             @x[1],@x[1]
        rev             @x[2],@x[2]
        rev             @x[3],@x[3]
# endif
        eor             @x[0],@x[0],@t[0]
         add            @t[0],sp,#4*(12)
        eor             @x[1],@x[1],@t[1]
        str             @x[0],[r14],#16         @ store output
        eor             @x[2],@x[2],@t[2]
        str             @x[1],[r14,#-12]
        eor             @x[3],@x[3],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
        str             @x[2],[r14,#-8]
        str             @x[3],[r14,#-4]

        add             @x[4],@t[0],@x[4],ror#24 @ accumulate key material
         add            @t[0],@t[0],#4          @ next counter value
        add             @x[5],@t[1],@x[5],ror#24
         str            @t[0],[sp,#4*(12)]      @ save next counter value
        ldr             @t[0],[r12],#16         @ load input
        add             @x[6],@t[2],@x[6],ror#24
         add            @x[4],@x[4],#3          @ counter+3
        ldr             @t[1],[r12,#-12]
        add             @x[7],@t[3],@x[7],ror#24
        ldr             @t[2],[r12,#-8]
        ldr             @t[3],[r12,#-4]
# ifdef __ARMEB__
        rev             @x[4],@x[4]
        rev             @x[5],@x[5]
        rev             @x[6],@x[6]
        rev             @x[7],@x[7]
# endif
        eor             @x[4],@x[4],@t[0]
# ifdef __thumb2__
        it      hi
# endif
         ldrhi          @t[0],[sp,#4*(32+2)]    @ re-load len
        eor             @x[5],@x[5],@t[1]
        eor             @x[6],@x[6],@t[2]
        str             @x[4],[r14],#16         @ store output
        eor             @x[7],@x[7],@t[3]
        str             @x[5],[r14,#-12]
         sub            @t[3],@t[0],#64*4       @ len-=64*4
        str             @x[6],[r14,#-8]
        str             @x[7],[r14,#-4]
        bhi             .Loop_neon_outer

        b               .Ldone_neon

.align  4
.Lbreak_neon:
        @ harmonize NEON and integer-only stack frames: load data
        @ from NEON frame, but save to integer-only one; distance
        @ between the two is 4*(32+4+16-32)=4*(20).

        str             @t[3], [sp,#4*(20+32+2)]        @ save len
         add            @t[3],sp,#4*(32+4)
        str             r12,   [sp,#4*(20+32+1)]        @ save inp
        str             r14,   [sp,#4*(20+32+0)]        @ save out

        ldr             @x[12],[sp,#4*(16+10)]
        ldr             @x[14],[sp,#4*(16+11)]
         vldmia         @t[3],{d8-d15}                  @ fulfill ABI requirement
        str             @x[12],[sp,#4*(20+16+10)]       @ copy "@x[10]"
        str             @x[14],[sp,#4*(20+16+11)]       @ copy "@x[11]"

        ldr             @t[3], [sp,#4*(15)]
         mov            @x[4],@x[4],ror#19              @ twist b[0..3]
        ldr             @x[12],[sp,#4*(12)]             @ modulo-scheduled load
         mov            @x[5],@x[5],ror#19
        ldr             @t[2], [sp,#4*(13)]
         mov            @x[6],@x[6],ror#19
        ldr             @x[14],[sp,#4*(14)]
         mov            @x[7],@x[7],ror#19
        mov             @t[3],@t[3],ror#8               @ twist d[0..3]
        mov             @x[12],@x[12],ror#8
        mov             @t[2],@t[2],ror#8
        mov             @x[14],@x[14],ror#8
        str             @t[3], [sp,#4*(20+16+15)]
        add             @t[3],sp,#4*(20)
        vst1.32         {$a0-$b0},[@t[3]]!              @ copy key
        add             sp,sp,#4*(20)                   @ switch frame
        vst1.32         {$c0-$d0},[@t[3]]
        mov             @t[3],#10
        b               .Loop                           @ go integer-only

.align  4
.Ltail_neon:
        cmp             @t[3],#64*3
        bhs             .L192_or_more_neon
        cmp             @t[3],#64*2
        bhs             .L128_or_more_neon
        cmp             @t[3],#64*1
        bhs             .L64_or_more_neon

        add             @t[0],sp,#4*(8)
        vst1.8          {$a0-$b0},[sp]
        add             @t[2],sp,#4*(0)
        vst1.8          {$c0-$d0},[@t[0]]
        b               .Loop_tail_neon

.align  4
.L64_or_more_neon:
        vld1.8          {$t0-$t1},[r12]!
        vld1.8          {$t2-$t3},[r12]!
        veor            $a0,$a0,$t0
        veor            $b0,$b0,$t1
        veor            $c0,$c0,$t2
        veor            $d0,$d0,$t3
        vst1.8          {$a0-$b0},[r14]!
        vst1.8          {$c0-$d0},[r14]!

        beq             .Ldone_neon

        add             @t[0],sp,#4*(8)
        vst1.8          {$a1-$b1},[sp]
        add             @t[2],sp,#4*(0)
        vst1.8          {$c1-$d1},[@t[0]]
        sub             @t[3],@t[3],#64*1       @ len-=64*1
        b               .Loop_tail_neon

.align  4
.L128_or_more_neon:
        vld1.8          {$t0-$t1},[r12]!
        vld1.8          {$t2-$t3},[r12]!
        veor            $a0,$a0,$t0
        veor            $b0,$b0,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c0,$c0,$t2
        veor            $d0,$d0,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a1,$a1,$t0
        veor            $b1,$b1,$t1
         vst1.8         {$a0-$b0},[r14]!
        veor            $c1,$c1,$t2
         vst1.8         {$c0-$d0},[r14]!
        veor            $d1,$d1,$t3
        vst1.8          {$a1-$b1},[r14]!
        vst1.8          {$c1-$d1},[r14]!

        beq             .Ldone_neon

        add             @t[0],sp,#4*(8)
        vst1.8          {$a2-$b2},[sp]
        add             @t[2],sp,#4*(0)
        vst1.8          {$c2-$d2},[@t[0]]
        sub             @t[3],@t[3],#64*2       @ len-=64*2
        b               .Loop_tail_neon

.align  4
.L192_or_more_neon:
        vld1.8          {$t0-$t1},[r12]!
        vld1.8          {$t2-$t3},[r12]!
        veor            $a0,$a0,$t0
        veor            $b0,$b0,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c0,$c0,$t2
        veor            $d0,$d0,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a1,$a1,$t0
        veor            $b1,$b1,$t1
        vld1.8          {$t0-$t1},[r12]!
        veor            $c1,$c1,$t2
         vst1.8         {$a0-$b0},[r14]!
        veor            $d1,$d1,$t3
        vld1.8          {$t2-$t3},[r12]!

        veor            $a2,$a2,$t0
         vst1.8         {$c0-$d0},[r14]!
        veor            $b2,$b2,$t1
         vst1.8         {$a1-$b1},[r14]!
        veor            $c2,$c2,$t2
         vst1.8         {$c1-$d1},[r14]!
        veor            $d2,$d2,$t3
        vst1.8          {$a2-$b2},[r14]!
        vst1.8          {$c2-$d2},[r14]!

        beq             .Ldone_neon

        ldmia           sp,{@t[0]-@t[3]}        @ load key material
        add             @x[0],@x[0],@t[0]       @ accumulate key material
         add            @t[0],sp,#4*(4)
        add             @x[1],@x[1],@t[1]
        add             @x[2],@x[2],@t[2]
        add             @x[3],@x[3],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material

        add             @x[4],@t[0],@x[4],ror#13 @ accumulate key material
         add            @t[0],sp,#4*(8)
        add             @x[5],@t[1],@x[5],ror#13
        add             @x[6],@t[2],@x[6],ror#13
        add             @x[7],@t[3],@x[7],ror#13
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material
# ifdef __ARMEB__
        rev             @x[0],@x[0]
        rev             @x[1],@x[1]
        rev             @x[2],@x[2]
        rev             @x[3],@x[3]
        rev             @x[4],@x[4]
        rev             @x[5],@x[5]
        rev             @x[6],@x[6]
        rev             @x[7],@x[7]
# endif
        stmia           sp,{@x[0]-@x[7]}
         add            @x[0],sp,#4*(16+8)

        ldmia           @x[0],{@x[0]-@x[7]}     @ load second half

        add             @x[0],@x[0],@t[0]       @ accumulate key material
         add            @t[0],sp,#4*(12)
        add             @x[1],@x[1],@t[1]
        add             @x[2],@x[2],@t[2]
        add             @x[3],@x[3],@t[3]
         ldmia          @t[0],{@t[0]-@t[3]}     @ load key material

        add             @x[4],@t[0],@x[4],ror#24 @ accumulate key material
         add            @t[0],sp,#4*(8)
        add             @x[5],@t[1],@x[5],ror#24
         add            @x[4],@x[4],#3          @ counter+3
        add             @x[6],@t[2],@x[6],ror#24
        add             @x[7],@t[3],@x[7],ror#24
         ldr            @t[3],[sp,#4*(32+2)]    @ re-load len
# ifdef __ARMEB__
        rev             @x[0],@x[0]
        rev             @x[1],@x[1]
        rev             @x[2],@x[2]
        rev             @x[3],@x[3]
        rev             @x[4],@x[4]
        rev             @x[5],@x[5]
        rev             @x[6],@x[6]
        rev             @x[7],@x[7]
# endif
        stmia           @t[0],{@x[0]-@x[7]}
         add            @t[2],sp,#4*(0)
         sub            @t[3],@t[3],#64*3       @ len-=64*3

.Loop_tail_neon:
        ldrb            @t[0],[@t[2]],#1        @ read buffer on stack
        ldrb            @t[1],[r12],#1          @ read input
        subs            @t[3],@t[3],#1
        eor             @t[0],@t[0],@t[1]
        strb            @t[0],[r14],#1          @ store output
        bne             .Loop_tail_neon

.Ldone_neon:
        add             sp,sp,#4*(32+4)
        vldmia          sp,{d8-d15}
        add             sp,sp,#4*(16+3)
        ldmia           sp!,{r4-r11,pc}
.size   ChaCha20_neon,.-ChaCha20_neon
# ifndef __KERNEL__
.comm   OPENSSL_armcap_P,4,4
# endif
#endif
___
}}}

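# The loop below replays this script's own leading comment block (the
# licence and performance notes) into the generated output, turning
# Perl '#' comments into assembler '@' comments; it stops at the first
# line that is neither a comment nor blank.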
open SELF,$0;
while(<SELF>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
}
close SELF;

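# Final pass over the accumulated $code: expand any `...` constructs
# with eval and rewrite the synthetic "qN#lo"/"qN#hi" notation to the
# overlapping d registers (e.g. q5#lo -> d10, q5#hi -> d11).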
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

        print $_,"\n";
}
close STDOUT;