]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - secure/lib/libcrypto/aarch64/keccak1600-armv8.S
Regen assembly files for aarch64.
[FreeBSD/FreeBSD.git] / secure / lib / libcrypto / aarch64 / keccak1600-armv8.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from keccak1600-armv8.pl. */
3 .text
// iotas: the 24 64-bit round constants XORed into lane A[0][0] by the Iota
// step, one per round.  The table lives in .text so it can be reached with a
// plain `adr`.
//
// Layout invariant (relied upon by KeccakF1600_int): the table is preceded by
// 64 bytes of zero padding that starts on a 256-byte boundary, so `iotas`
// sits at 256*k+64 and its end (24*8 = 192 bytes later) falls exactly on the
// next 256-byte boundary.  The scalar round loop post-increments a pointer
// through the table and stops when the pointer's low 8 bits become zero,
// which avoids a separate round counter.  Do not add, remove or reorder
// entries without preserving this.
4
5 .align  8       // 2^8 = 256-byte alignment; this alignment plus the padding below lets
6                 // the table pointer's address value serve as the loop termination condition
7 .quad   0,0,0,0,0,0,0,0
8 .type   iotas,%object
9 iotas:
10 .quad   0x0000000000000001
11 .quad   0x0000000000008082
12 .quad   0x800000000000808a
13 .quad   0x8000000080008000
14 .quad   0x000000000000808b
15 .quad   0x0000000080000001
16 .quad   0x8000000080008081
17 .quad   0x8000000000008009
18 .quad   0x000000000000008a
19 .quad   0x0000000000000088
20 .quad   0x0000000080008009
21 .quad   0x000000008000000a
22 .quad   0x000000008000808b
23 .quad   0x800000000000008b
24 .quad   0x8000000000008089
25 .quad   0x8000000000008003
26 .quad   0x8000000000008002
27 .quad   0x8000000000000080
28 .quad   0x000000000000800a
29 .quad   0x800000008000000a
30 .quad   0x8000000080008081
31 .quad   0x8000000000008080
32 .quad   0x0000000080000001
33 .quad   0x8000000080008008
34 .size   iotas,.-iotas
// KeccakF1600_int: scalar Keccak-f[1600] permutation core (file-local, not a
// standard-ABI function).
//
// In/Out: the 25 state lanes are held in registers, in flat index order
//         A[0..17]  -> x0..x17
//         A[18]     -> x25        (x18 is not used)
//         A[19..24] -> x19..x24
//         and are returned permuted in the same registers.
// Scratch: x26, x27, x28, x30 and the condition flags.
// Stack:  uses 32 bytes of the CALLER's frame at [sp,#0..#31]; the caller
//         must have reserved them before `bl`:
//           [sp,#0],[sp,#8]  temporary home for x4/x9 during Theta
//           [sp,#16]         running pointer into the iotas table
//           [sp,#24]         saved return address (x30 is used as scratch)
// Loop:   one iteration per round; terminates when the post-incremented
//         iotas pointer has all-zero low 8 bits (see the layout of iotas).
35 .type   KeccakF1600_int,%function
36 .align  5
37 KeccakF1600_int:
38         adr     x28,iotas
39         stp     x28,x30,[sp,#16]                // 32 bytes on top are mine
40         b       .Loop
41 .align  4
42 .Loop:
43         ////////////////////////////////////////// Theta
           // Column parities: x26=C[0], x27=C[1], x28=C[2], x30=C[3], x4=C[4].
           // x4 (lane 4) is reused as the C[4] accumulator and x9 (lane 9) as a
           // temporary below, so both lanes are parked on the stack first.
44         eor     x26,x0,x5
45         stp     x4,x9,[sp,#0]   // offload pair...
46         eor     x27,x1,x6
47         eor     x28,x2,x7
48         eor     x30,x3,x8
49         eor     x4,x4,x9
50         eor     x26,x26,x10
51         eor     x27,x27,x11
52         eor     x28,x28,x12
53         eor     x30,x30,x13
54         eor     x4,x4,x14
55         eor     x26,x26,x15
56         eor     x27,x27,x16
57         eor     x28,x28,x17
58         eor     x30,x30,x25
59         eor     x4,x4,x19
60         eor     x26,x26,x20
61         eor     x28,x28,x22
62         eor     x27,x27,x21
63         eor     x30,x30,x23
64         eor     x4,x4,x24
65
           // ror#63 is a rotate-left by 1: x9 = C[0] ^ ROL(C[2],1), applied to column 1
66         eor     x9,x26,x28,ror#63
67
68         eor     x1,x1,x9
69         eor     x6,x6,x9
70         eor     x11,x11,x9
71         eor     x16,x16,x9
72         eor     x21,x21,x9
73
           // Remaining column offsets, computed in an order that lets each C[]
           // register be overwritten only after its last use as a source.
74         eor     x9,x27,x30,ror#63
75         eor     x28,x28,x4,ror#63
76         eor     x30,x30,x26,ror#63
77         eor     x4,x4,x27,ror#63
78
79         eor     x27,   x2,x9            // mov  x27,x2
80         eor     x7,x7,x9
81         eor     x12,x12,x9
82         eor     x17,x17,x9
83         eor     x22,x22,x9
84
85         eor     x0,x0,x4
86         eor     x5,x5,x4
87         eor     x10,x10,x4
88         eor     x15,x15,x4
89         eor     x20,x20,x4
90         ldp     x4,x9,[sp,#0]   // re-load offloaded data
91         eor     x26,   x3,x28           // mov  x26,x3
92         eor     x8,x8,x28
93         eor     x13,x13,x28
94         eor     x25,x25,x28
95         eor     x23,x23,x28
96
97         eor     x28,   x4,x30           // mov  x28,x4
98         eor     x9,x9,x30
99         eor     x14,x14,x30
100         eor     x19,x19,x30
101         eor     x24,x24,x30
102
103         ////////////////////////////////////////// Rho+Pi
            // Each `ror xD,xS,#64-n` rotates lane S left by n and deposits it in
            // its new position D.  Lanes 1..4 were copied aside (x30, x27, x26,
            // x28) because their registers are overwritten before they are read.
104         mov     x30,x1
105         ror     x1,x6,#64-44
106         //mov   x27,x2
107         ror     x2,x12,#64-43
108         //mov   x26,x3
109         ror     x3,x25,#64-21
110         //mov   x28,x4
111         ror     x4,x24,#64-14
112
113         ror     x6,x9,#64-20
114         ror     x12,x13,#64-25
115         ror     x25,x17,#64-15
116         ror     x24,x21,#64-2
117
118         ror     x9,x22,#64-61
119         ror     x13,x19,#64-8
120         ror     x17,x11,#64-10
121         ror     x21,x8,#64-55
122
123         ror     x22,x14,#64-39
124         ror     x19,x23,#64-56
125         ror     x11,x7,#64-6
126         ror     x8,x16,#64-45
127
128         ror     x14,x20,#64-18
129         ror     x23,x15,#64-41
130         ror     x7,x10,#64-3
131         ror     x16,x5,#64-36
132
133         ror     x5,x26,#64-28
134         ror     x10,x30,#64-1
135         ror     x15,x28,#64-27
136         ror     x20,x27,#64-62
137
138         ////////////////////////////////////////// Chi+Iota
            // Per row: A[i] ^= ~A[i+1] & A[i+2] (bic xd,xn,xm = xn & ~xm), with all
            // and-not terms of a row computed before any lane of it is updated.
139         bic     x26,x2,x1
140         bic     x27,x3,x2
141         bic     x28,x0,x4
142         bic     x30,x1,x0
143         eor     x0,x0,x26
144         bic     x26,x4,x3
145         eor     x1,x1,x27
146         ldr     x27,[sp,#16]                    // current iotas pointer
147         eor     x3,x3,x28
148         eor     x4,x4,x30
149         eor     x2,x2,x26
150         ldr     x30,[x27],#8            // Iota[i++]
151
152         bic     x26,x7,x6
            // The flags set here are consumed by the `bne .Loop` at the bottom;
            // nothing in between (bic/eor/str) modifies them.
153         tst     x27,#255                        // are we done?
154         str     x27,[sp,#16]
155         bic     x27,x8,x7
156         bic     x28,x5,x9
157         eor     x0,x0,x30               // A[0][0] ^= Iota
158         bic     x30,x6,x5
159         eor     x5,x5,x26
160         bic     x26,x9,x8
161         eor     x6,x6,x27
162         eor     x8,x8,x28
163         eor     x9,x9,x30
164         eor     x7,x7,x26
165
166         bic     x26,x12,x11
167         bic     x27,x13,x12
168         bic     x28,x10,x14
169         bic     x30,x11,x10
170         eor     x10,x10,x26
171         bic     x26,x14,x13
172         eor     x11,x11,x27
173         eor     x13,x13,x28
174         eor     x14,x14,x30
175         eor     x12,x12,x26
176
177         bic     x26,x17,x16
178         bic     x27,x25,x17
179         bic     x28,x15,x19
180         bic     x30,x16,x15
181         eor     x15,x15,x26
182         bic     x26,x19,x25
183         eor     x16,x16,x27
184         eor     x25,x25,x28
185         eor     x19,x19,x30
186         eor     x17,x17,x26
187
188         bic     x26,x22,x21
189         bic     x27,x23,x22
190         bic     x28,x20,x24
191         bic     x30,x21,x20
192         eor     x20,x20,x26
193         bic     x26,x24,x23
194         eor     x21,x21,x27
195         eor     x23,x23,x28
196         eor     x24,x24,x30
197         eor     x22,x22,x26
198
199         bne     .Loop                           // loop until the iotas pointer hits the 256-byte boundary
200
201         ldr     x30,[sp,#24]                    // recover the return address saved on entry
202         ret
203 .size   KeccakF1600_int,.-KeccakF1600_int
204
// KeccakF1600: apply the Keccak-f[1600] permutation in place to a state in
// memory (file-local symbol; no .globl here).
//
// In:    x0 = pointer to 25 consecutive 64-bit lanes (200 bytes are read and
//             written back).
// Out:   state at [x0] replaced by the permuted state.  x0..x17 hold state
//        lanes on return, so the caller must not expect x0 to be preserved.
// Saves: x19-x28, x29, x30 in a 128-byte frame addressed through x29.
// Stack: 48 further bytes below the frame:
//          [sp,#0..#31]  scratch area required by KeccakF1600_int
//          [sp,#32]      saved state pointer
205 .type   KeccakF1600,%function
206 .align  5
207 KeccakF1600:
208         stp     x29,x30,[sp,#-128]!
209         add     x29,sp,#0
210         stp     x19,x20,[sp,#16]
211         stp     x21,x22,[sp,#32]
212         stp     x23,x24,[sp,#48]
213         stp     x25,x26,[sp,#64]
214         stp     x27,x28,[sp,#80]
215         sub     sp,sp,#48
216
217         str     x0,[sp,#32]                     // offload argument
218         mov     x26,x0                          // x0 is about to become lane 0; keep the base in x26
219         ldp     x0,x1,[x0,#16*0]
220         ldp     x2,x3,[x26,#16*1]
221         ldp     x4,x5,[x26,#16*2]
222         ldp     x6,x7,[x26,#16*3]
223         ldp     x8,x9,[x26,#16*4]
224         ldp     x10,x11,[x26,#16*5]
225         ldp     x12,x13,[x26,#16*6]
226         ldp     x14,x15,[x26,#16*7]
227         ldp     x16,x17,[x26,#16*8]
228         ldp     x25,x19,[x26,#16*9]             // lane 18 lives in x25, not x18
229         ldp     x20,x21,[x26,#16*10]
230         ldp     x22,x23,[x26,#16*11]
231         ldr     x24,[x26,#16*12]
232
233         bl      KeccakF1600_int
234
235         ldr     x26,[sp,#32]                    // x26 was scratch in the core; reload the state pointer
236         stp     x0,x1,[x26,#16*0]
237         stp     x2,x3,[x26,#16*1]
238         stp     x4,x5,[x26,#16*2]
239         stp     x6,x7,[x26,#16*3]
240         stp     x8,x9,[x26,#16*4]
241         stp     x10,x11,[x26,#16*5]
242         stp     x12,x13,[x26,#16*6]
243         stp     x14,x15,[x26,#16*7]
244         stp     x16,x17,[x26,#16*8]
245         stp     x25,x19,[x26,#16*9]
246         stp     x20,x21,[x26,#16*10]
247         stp     x22,x23,[x26,#16*11]
248         str     x24,[x26,#16*12]
249
250         ldp     x19,x20,[x29,#16]
251         add     sp,sp,#48
252         ldp     x21,x22,[x29,#32]
253         ldp     x23,x24,[x29,#48]
254         ldp     x25,x26,[x29,#64]
255         ldp     x27,x28,[x29,#80]
256         ldp     x29,x30,[sp],#128
257         ret
258 .size   KeccakF1600,.-KeccakF1600
259
// SHA3_absorb: XOR whole input blocks into the Keccak state, permuting after
// each block.
//
// In:    x0 = uint64_t A[5][5]   state
//        x1 = const void *inp    input bytes
//        x2 = size_t len         input length in bytes
//        x3 = size_t bsz         block size in bytes
// Out:   x0 = number of input bytes left unprocessed (less than bsz);
//        state at A updated in place.
// Saves: x19-x28, x29, x30 in a 128-byte frame addressed through x29.
// Stack: 64 further bytes below the frame:
//          [sp,#0..#31]  scratch area required by KeccakF1600_int
//          [sp,#32]      A
//          [sp,#40]      inp (updated after each block)
//          [sp,#48]      len (updated to len - bsz for each block)
//          [sp,#56]      bsz
// Note:  input is consumed in 8-byte words, at least one and at most 25 per
//        block; presumably callers pass bsz as a multiple of 8 no larger
//        than 200 -- the code does not check this.
260 .globl  SHA3_absorb
261 .type   SHA3_absorb,%function
262 .align  5
263 SHA3_absorb:
264         stp     x29,x30,[sp,#-128]!
265         add     x29,sp,#0
266         stp     x19,x20,[sp,#16]
267         stp     x21,x22,[sp,#32]
268         stp     x23,x24,[sp,#48]
269         stp     x25,x26,[sp,#64]
270         stp     x27,x28,[sp,#80]
271         sub     sp,sp,#64
272
273         stp     x0,x1,[sp,#32]                  // offload arguments
274         stp     x2,x3,[sp,#48]
275
276         mov     x26,x0                  // uint64_t A[5][5]
277         mov     x27,x1                  // const void *inp
278         mov     x28,x2                  // size_t len
279         mov     x30,x3                  // size_t bsz
280         ldp     x0,x1,[x26,#16*0]
281         ldp     x2,x3,[x26,#16*1]
282         ldp     x4,x5,[x26,#16*2]
283         ldp     x6,x7,[x26,#16*3]
284         ldp     x8,x9,[x26,#16*4]
285         ldp     x10,x11,[x26,#16*5]
286         ldp     x12,x13,[x26,#16*6]
287         ldp     x14,x15,[x26,#16*7]
288         ldp     x16,x17,[x26,#16*8]
289         ldp     x25,x19,[x26,#16*9]
290         ldp     x20,x21,[x26,#16*10]
291         ldp     x22,x23,[x26,#16*11]
292         ldr     x24,[x26,#16*12]
293         b       .Loop_absorb
294
295 .align  4
296 .Loop_absorb:
297         subs    x26,x28,x30             // len - bsz
298         blo     .Labsorbed              // fewer than bsz bytes left: stop
299
300         str     x26,[sp,#48]                    // save len - bsz
            // Unrolled absorb ladder.  Lanes are handled in pairs: after the even
            // lane i, `cmp bsz,#8*(i+2)` + `blo` leaves when the block held only
            // i+1 words; after the odd lane i+1, `beq` reuses the same flags
            // (ldr/rev/eor do not alter them) and leaves when it held exactly
            // i+2 words.  On big-endian builds each word is byte-swapped first.
301         ldr     x26,[x27],#8            // *inp++
302 #ifdef  __AARCH64EB__
303         rev     x26,x26
304 #endif
305         eor     x0,x0,x26
306         cmp     x30,#8*(0+2)
307         blo     .Lprocess_block
308         ldr     x26,[x27],#8            // *inp++
309 #ifdef  __AARCH64EB__
310         rev     x26,x26
311 #endif
312         eor     x1,x1,x26
313         beq     .Lprocess_block
314         ldr     x26,[x27],#8            // *inp++
315 #ifdef  __AARCH64EB__
316         rev     x26,x26
317 #endif
318         eor     x2,x2,x26
319         cmp     x30,#8*(2+2)
320         blo     .Lprocess_block
321         ldr     x26,[x27],#8            // *inp++
322 #ifdef  __AARCH64EB__
323         rev     x26,x26
324 #endif
325         eor     x3,x3,x26
326         beq     .Lprocess_block
327         ldr     x26,[x27],#8            // *inp++
328 #ifdef  __AARCH64EB__
329         rev     x26,x26
330 #endif
331         eor     x4,x4,x26
332         cmp     x30,#8*(4+2)
333         blo     .Lprocess_block
334         ldr     x26,[x27],#8            // *inp++
335 #ifdef  __AARCH64EB__
336         rev     x26,x26
337 #endif
338         eor     x5,x5,x26
339         beq     .Lprocess_block
340         ldr     x26,[x27],#8            // *inp++
341 #ifdef  __AARCH64EB__
342         rev     x26,x26
343 #endif
344         eor     x6,x6,x26
345         cmp     x30,#8*(6+2)
346         blo     .Lprocess_block
347         ldr     x26,[x27],#8            // *inp++
348 #ifdef  __AARCH64EB__
349         rev     x26,x26
350 #endif
351         eor     x7,x7,x26
352         beq     .Lprocess_block
353         ldr     x26,[x27],#8            // *inp++
354 #ifdef  __AARCH64EB__
355         rev     x26,x26
356 #endif
357         eor     x8,x8,x26
358         cmp     x30,#8*(8+2)
359         blo     .Lprocess_block
360         ldr     x26,[x27],#8            // *inp++
361 #ifdef  __AARCH64EB__
362         rev     x26,x26
363 #endif
364         eor     x9,x9,x26
365         beq     .Lprocess_block
366         ldr     x26,[x27],#8            // *inp++
367 #ifdef  __AARCH64EB__
368         rev     x26,x26
369 #endif
370         eor     x10,x10,x26
371         cmp     x30,#8*(10+2)
372         blo     .Lprocess_block
373         ldr     x26,[x27],#8            // *inp++
374 #ifdef  __AARCH64EB__
375         rev     x26,x26
376 #endif
377         eor     x11,x11,x26
378         beq     .Lprocess_block
379         ldr     x26,[x27],#8            // *inp++
380 #ifdef  __AARCH64EB__
381         rev     x26,x26
382 #endif
383         eor     x12,x12,x26
384         cmp     x30,#8*(12+2)
385         blo     .Lprocess_block
386         ldr     x26,[x27],#8            // *inp++
387 #ifdef  __AARCH64EB__
388         rev     x26,x26
389 #endif
390         eor     x13,x13,x26
391         beq     .Lprocess_block
392         ldr     x26,[x27],#8            // *inp++
393 #ifdef  __AARCH64EB__
394         rev     x26,x26
395 #endif
396         eor     x14,x14,x26
397         cmp     x30,#8*(14+2)
398         blo     .Lprocess_block
399         ldr     x26,[x27],#8            // *inp++
400 #ifdef  __AARCH64EB__
401         rev     x26,x26
402 #endif
403         eor     x15,x15,x26
404         beq     .Lprocess_block
405         ldr     x26,[x27],#8            // *inp++
406 #ifdef  __AARCH64EB__
407         rev     x26,x26
408 #endif
409         eor     x16,x16,x26
410         cmp     x30,#8*(16+2)
411         blo     .Lprocess_block
412         ldr     x26,[x27],#8            // *inp++
413 #ifdef  __AARCH64EB__
414         rev     x26,x26
415 #endif
416         eor     x17,x17,x26
417         beq     .Lprocess_block
418         ldr     x26,[x27],#8            // *inp++
419 #ifdef  __AARCH64EB__
420         rev     x26,x26
421 #endif
422         eor     x25,x25,x26
423         cmp     x30,#8*(18+2)
424         blo     .Lprocess_block
425         ldr     x26,[x27],#8            // *inp++
426 #ifdef  __AARCH64EB__
427         rev     x26,x26
428 #endif
429         eor     x19,x19,x26
430         beq     .Lprocess_block
431         ldr     x26,[x27],#8            // *inp++
432 #ifdef  __AARCH64EB__
433         rev     x26,x26
434 #endif
435         eor     x20,x20,x26
436         cmp     x30,#8*(20+2)
437         blo     .Lprocess_block
438         ldr     x26,[x27],#8            // *inp++
439 #ifdef  __AARCH64EB__
440         rev     x26,x26
441 #endif
442         eor     x21,x21,x26
443         beq     .Lprocess_block
444         ldr     x26,[x27],#8            // *inp++
445 #ifdef  __AARCH64EB__
446         rev     x26,x26
447 #endif
448         eor     x22,x22,x26
449         cmp     x30,#8*(22+2)
450         blo     .Lprocess_block
451         ldr     x26,[x27],#8            // *inp++
452 #ifdef  __AARCH64EB__
453         rev     x26,x26
454 #endif
455         eor     x23,x23,x26
456         beq     .Lprocess_block
457         ldr     x26,[x27],#8            // *inp++
458 #ifdef  __AARCH64EB__
459         rev     x26,x26
460 #endif
461         eor     x24,x24,x26
462
463 .Lprocess_block:
464         str     x27,[sp,#40]                    // save inp
465
466         bl      KeccakF1600_int                 // clobbers x26-x28 and x30, hence the reloads below
467
468         ldr     x27,[sp,#40]                    // restore arguments
469         ldp     x28,x30,[sp,#48]                // x28 = remaining len, x30 = bsz
470         b       .Loop_absorb
471
472 .align  4
473 .Labsorbed:
474         ldr     x27,[sp,#32]                    // state pointer A
475         stp     x0,x1,[x27,#16*0]
476         stp     x2,x3,[x27,#16*1]
477         stp     x4,x5,[x27,#16*2]
478         stp     x6,x7,[x27,#16*3]
479         stp     x8,x9,[x27,#16*4]
480         stp     x10,x11,[x27,#16*5]
481         stp     x12,x13,[x27,#16*6]
482         stp     x14,x15,[x27,#16*7]
483         stp     x16,x17,[x27,#16*8]
484         stp     x25,x19,[x27,#16*9]
485         stp     x20,x21,[x27,#16*10]
486         stp     x22,x23,[x27,#16*11]
487         str     x24,[x27,#16*12]
488
489         mov     x0,x28                  // return value
490         ldp     x19,x20,[x29,#16]
491         add     sp,sp,#64
492         ldp     x21,x22,[x29,#32]
493         ldp     x23,x24,[x29,#48]
494         ldp     x25,x26,[x29,#64]
495         ldp     x27,x28,[x29,#80]
496         ldp     x29,x30,[sp],#128
497         ret
498 .size   SHA3_absorb,.-SHA3_absorb
// SHA3_squeeze: copy output bytes out of the Keccak state, re-permuting the
// state each time a full block has been extracted.
//
// In:    x0 = pointer to the 25-lane state (passed on to KeccakF1600)
//        x1 = output buffer
//        x2 = number of bytes to produce
//        x3 = block size in bytes (bytes available per permutation)
// Out:   x2 bytes written at x1; state possibly permuted in place.
// Saves: x19-x22, x29, x30 in a 48-byte frame.
// Live across the call to KeccakF1600 (callee-saved there):
//        x19 = state base, x20 = output cursor, x21 = bytes still to write,
//        x22 = block size.
// Scratch: x0 (state cursor), x3 (bytes left in current block), x4.
//
// A request for zero bytes returns immediately without touching the output
// buffer.  Without that guard the loop would fall into .Lsqueeze_tail with a
// zero counter, which decrements to -1, never matches `beq`, and stores
// seven bytes past the caller's buffer.
499 .globl  SHA3_squeeze
500 .type   SHA3_squeeze,%function
501 .align  5
502 SHA3_squeeze:
503         stp     x29,x30,[sp,#-48]!
504         add     x29,sp,#0
505         stp     x19,x20,[sp,#16]
506         stp     x21,x22,[sp,#32]
507
508         mov     x19,x0                  // put aside arguments
509         mov     x20,x1
510         mov     x21,x2
511         mov     x22,x3
            cbz     x21,.Lsqueeze_done      // len == 0: nothing to write
512
513 .Loop_squeeze:
514         ldr     x4,[x0],#8              // next state lane
515         cmp     x21,#8
516         blo     .Lsqueeze_tail          // 1..7 bytes left: emit them one by one
517 #ifdef  __AARCH64EB__
518         rev     x4,x4
519 #endif
520         str     x4,[x20],#8
521         subs    x21,x21,#8
522         beq     .Lsqueeze_done
523
524         subs    x3,x3,#8                // one more lane of this block consumed
525         bhi     .Loop_squeeze
526
            // Block exhausted: permute and restart from the first lane.
527         mov     x0,x19
528         bl      KeccakF1600
529         mov     x0,x19                  // KeccakF1600 leaves a state lane in x0
530         mov     x3,x22
531         b       .Loop_squeeze
532
533 .align  4
534 .Lsqueeze_tail:
            // Here 1 <= x21 <= 7.  Bytes go out least-significant first; the
            // seventh store needs no trailing check.
535         strb    w4,[x20],#1
536         lsr     x4,x4,#8
537         subs    x21,x21,#1
538         beq     .Lsqueeze_done
539         strb    w4,[x20],#1
540         lsr     x4,x4,#8
541         subs    x21,x21,#1
542         beq     .Lsqueeze_done
543         strb    w4,[x20],#1
544         lsr     x4,x4,#8
545         subs    x21,x21,#1
546         beq     .Lsqueeze_done
547         strb    w4,[x20],#1
548         lsr     x4,x4,#8
549         subs    x21,x21,#1
550         beq     .Lsqueeze_done
551         strb    w4,[x20],#1
552         lsr     x4,x4,#8
553         subs    x21,x21,#1
554         beq     .Lsqueeze_done
555         strb    w4,[x20],#1
556         lsr     x4,x4,#8
557         subs    x21,x21,#1
558         beq     .Lsqueeze_done
559         strb    w4,[x20],#1
560
561 .Lsqueeze_done:
562         ldp     x19,x20,[sp,#16]
563         ldp     x21,x22,[sp,#32]
564         ldp     x29,x30,[sp],#48
565         ret
566 .size   SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce: Keccak-f[1600] permutation core using the eor3 / rax1 /
// xar / bcax vector instructions (file-local, leaf, no stack use).
//
// In/Out: state lanes A[0..24] in v0..v24, returned permuted in the same
//         registers.
// Scratch: v25-v31, x9 (loop counter), x10 (iotas pointer), x11 (current
//          round constant), condition flags.
// The four special instructions are emitted as raw `.inst` words; the
// intended mnemonic and operands are given in the comment on each line
// (presumably so the file assembles with toolchains that lack them).
//
// The loop body is unrolled two rounds deep: x9 counts 12 iterations and
// each iteration fetches two round constants, for 24 rounds in total.  The
// first round leaves several result lanes in v25-v31 instead of their home
// registers; the second round reads them from there and writes its results
// back to v0-v24, so register roles differ between the two halves.
567 .type   KeccakF1600_ce,%function
568 .align  5
569 KeccakF1600_ce:
570         mov     x9,#12
571         adr     x10,iotas
572         b       .Loop_ce
573 .align  4
574 .Loop_ce:
            // ---- first round of the pair ----
575         ////////////////////////////////////////////////// Theta
576 .inst   0xce052819      //eor3 v25.16b,v0.16b,v5.16b,v10.16b
577 .inst   0xce062c3a      //eor3 v26.16b,v1.16b,v6.16b,v11.16b
578 .inst   0xce07305b      //eor3 v27.16b,v2.16b,v7.16b,v12.16b
579 .inst   0xce08347c      //eor3 v28.16b,v3.16b,v8.16b,v13.16b
580 .inst   0xce09389d      //eor3 v29.16b,v4.16b,v9.16b,v14.16b
581 .inst   0xce0f5339      //eor3 v25.16b,v25.16b,   v15.16b,v20.16b
582 .inst   0xce10575a      //eor3 v26.16b,v26.16b,   v16.16b,v21.16b
583 .inst   0xce115b7b      //eor3 v27.16b,v27.16b,   v17.16b,v22.16b
584 .inst   0xce125f9c      //eor3 v28.16b,v28.16b,   v18.16b,v23.16b
585 .inst   0xce1363bd      //eor3 v29.16b,v29.16b,   v19.16b,v24.16b
586
587 .inst   0xce7b8f3e      //rax1 v30.16b,v25.16b,v27.16b                  // D[1]
588 .inst   0xce7c8f5f      //rax1 v31.16b,v26.16b,v28.16b                  // D[2]
589 .inst   0xce7d8f7b      //rax1 v27.16b,v27.16b,v29.16b                  // D[3]
590 .inst   0xce798f9c      //rax1 v28.16b,v28.16b,v25.16b                  // D[4]
591 .inst   0xce7a8fbd      //rax1 v29.16b,v29.16b,v26.16b                  // D[0]
592
593         ////////////////////////////////////////////////// Theta+Rho+Pi
594 .inst   0xce9e50d9      //xar v25.16b,   v6.16b,v30.16b,#64-44  // C[0]=A[0][1]
595 .inst   0xce9cb126      //xar v6.16b,v9.16b,v28.16b,#64-20
596 .inst   0xce9f0ec9      //xar v9.16b,v22.16b,v31.16b,#64-61
597 .inst   0xce9c65d6      //xar v22.16b,v14.16b,v28.16b,#64-39
598 .inst   0xce9dba8e      //xar v14.16b,v20.16b,v29.16b,#64-18
599
600 .inst   0xce9f0854      //xar v20.16b,v2.16b,v31.16b,#64-62
601
602 .inst   0xce9f5582      //xar v2.16b,v12.16b,v31.16b,#64-43
603 .inst   0xce9b9dac      //xar v12.16b,v13.16b,v27.16b,#64-25
604 .inst   0xce9ce26d      //xar v13.16b,v19.16b,v28.16b,#64-8
605 .inst   0xce9b22f3      //xar v19.16b,v23.16b,v27.16b,#64-56
606 .inst   0xce9d5df7      //xar v23.16b,v15.16b,v29.16b,#64-41
607
608 .inst   0xce9c948f      //xar v15.16b,v4.16b,v28.16b,#64-27
609
610         eor     v0.16b,v0.16b,v29.16b
611         ldr     x11,[x10],#8                            // fetch this round's constant, advance
612
613 .inst   0xce9bae5a      //xar v26.16b,   v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
614 .inst   0xce9fc632      //xar v18.16b,v17.16b,v31.16b,#64-15
615 .inst   0xce9ed971      //xar v17.16b,v11.16b,v30.16b,#64-10
616 .inst   0xce9fe8eb      //xar v11.16b,v7.16b,v31.16b,#64-6
617 .inst   0xce9df547      //xar v7.16b,v10.16b,v29.16b,#64-3
618
619 .inst   0xce9efc2a      //xar v10.16b,v1.16b,v30.16b,#64-1      // *
620
621 .inst   0xce9ccb04      //xar v4.16b,v24.16b,v28.16b,#64-14
622 .inst   0xce9efab8      //xar v24.16b,v21.16b,v30.16b,#64-2
623 .inst   0xce9b2515      //xar v21.16b,v8.16b,v27.16b,#64-55
624 .inst   0xce9e4e08      //xar v8.16b,v16.16b,v30.16b,#64-45
625 .inst   0xce9d70b0      //xar v16.16b,v5.16b,v29.16b,#64-36
626
627 .inst   0xce9b907b      //xar v27.16b,   v3.16b,v27.16b,#64-28  // C[2]=A[1][0]
628
629         ////////////////////////////////////////////////// Chi+Iota
630         dup     v31.2d,x11                              // borrow C[6]
631 .inst   0xce22641c      //bcax v28.16b,   v0.16b,v2.16b,v25.16b // *
632 .inst   0xce3a0b21      //bcax v1.16b,v25.16b,   v26.16b,   v2.16b      // *
633 .inst   0xce246842      //bcax v2.16b,v2.16b,v4.16b,v26.16b
634 .inst   0xce201343      //bcax v3.16b,v26.16b,   v0.16b,v4.16b
635 .inst   0xce390084      //bcax v4.16b,v4.16b,v25.16b,   v0.16b
636
637 .inst   0xce271b65      //bcax v5.16b,v27.16b,   v7.16b,v6.16b  // *
638 .inst   0xce281cd9      //bcax v25.16b,   v6.16b,v8.16b,v7.16b  // *
639 .inst   0xce2920e7      //bcax v7.16b,v7.16b,v9.16b,v8.16b
640 .inst   0xce3b2508      //bcax v8.16b,v8.16b,v27.16b,   v9.16b
641 .inst   0xce266d29      //bcax v9.16b,v9.16b,v6.16b,v27.16b
642
643         eor     v0.16b,v28.16b,v31.16b                  // Iota
644
645 .inst   0xce2c2d5a      //bcax v26.16b,   v10.16b,v12.16b,v11.16b       // *
646 .inst   0xce2d317b      //bcax v27.16b,   v11.16b,v13.16b,v12.16b       // *
647 .inst   0xce2e358c      //bcax v12.16b,v12.16b,v14.16b,v13.16b
648 .inst   0xce2a39ad      //bcax v13.16b,v13.16b,v10.16b,v14.16b
649 .inst   0xce2b29ce      //bcax v14.16b,v14.16b,v11.16b,v10.16b
650
651 .inst   0xce3141fc      //bcax v28.16b,   v15.16b,v17.16b,v16.16b       // *
652 .inst   0xce32461d      //bcax v29.16b,   v16.16b,v18.16b,v17.16b       // *
653 .inst   0xce334a31      //bcax v17.16b,v17.16b,v19.16b,v18.16b
654 .inst   0xce2f4e52      //bcax v18.16b,v18.16b,v15.16b,v19.16b
655 .inst   0xce303e73      //bcax v19.16b,v19.16b,v16.16b,v15.16b
656
657 .inst   0xce36569e      //bcax v30.16b,   v20.16b,v22.16b,v21.16b       // *
658 .inst   0xce375abf      //bcax v31.16b,   v21.16b,v23.16b,v22.16b       // *
659 .inst   0xce385ed6      //bcax v22.16b,v22.16b,v24.16b,v23.16b
660 .inst   0xce3462f7      //bcax v23.16b,v23.16b,v20.16b,v24.16b
661 .inst   0xce355318      //bcax v24.16b,v24.16b,v21.16b,v20.16b
            // ---- second round of the pair (inputs partly in v25-v31) ----
662         ////////////////////////////////////////////////// Theta
663 .inst   0xce056806      //eor3 v6.16b,v0.16b,v5.16b,v26.16b
664 .inst   0xce196c2a      //eor3 v10.16b,v1.16b,v25.16b,v27.16b
665 .inst   0xce07304b      //eor3 v11.16b,v2.16b,v7.16b,v12.16b
666 .inst   0xce08346f      //eor3 v15.16b,v3.16b,v8.16b,v13.16b
667 .inst   0xce093890      //eor3 v16.16b,v4.16b,v9.16b,v14.16b
668 .inst   0xce1c78c6      //eor3 v6.16b,v6.16b,   v28.16b,v30.16b
669 .inst   0xce1d7d4a      //eor3 v10.16b,v10.16b,   v29.16b,v31.16b
670 .inst   0xce11596b      //eor3 v11.16b,v11.16b,   v17.16b,v22.16b
671 .inst   0xce125def      //eor3 v15.16b,v15.16b,   v18.16b,v23.16b
672 .inst   0xce136210      //eor3 v16.16b,v16.16b,   v19.16b,v24.16b
673
674 .inst   0xce6b8cd4      //rax1 v20.16b,v6.16b,v11.16b                   // D[1]
675 .inst   0xce6f8d55      //rax1 v21.16b,v10.16b,v15.16b                  // D[2]
676 .inst   0xce708d6b      //rax1 v11.16b,v11.16b,v16.16b                  // D[3]
677 .inst   0xce668def      //rax1 v15.16b,v15.16b,v6.16b                   // D[4]
678 .inst   0xce6a8e10      //rax1 v16.16b,v16.16b,v10.16b                  // D[0]
679
680         ////////////////////////////////////////////////// Theta+Rho+Pi
681 .inst   0xce945326      //xar v6.16b,   v25.16b,v20.16b,#64-44  // C[0]=A[0][1]
682 .inst   0xce8fb139      //xar v25.16b,v9.16b,v15.16b,#64-20
683 .inst   0xce950ec9      //xar v9.16b,v22.16b,v21.16b,#64-61
684 .inst   0xce8f65d6      //xar v22.16b,v14.16b,v15.16b,#64-39
685 .inst   0xce90bbce      //xar v14.16b,v30.16b,v16.16b,#64-18
686
687 .inst   0xce95085e      //xar v30.16b,v2.16b,v21.16b,#64-62
688
689 .inst   0xce955582      //xar v2.16b,v12.16b,v21.16b,#64-43
690 .inst   0xce8b9dac      //xar v12.16b,v13.16b,v11.16b,#64-25
691 .inst   0xce8fe26d      //xar v13.16b,v19.16b,v15.16b,#64-8
692 .inst   0xce8b22f3      //xar v19.16b,v23.16b,v11.16b,#64-56
693 .inst   0xce905f97      //xar v23.16b,v28.16b,v16.16b,#64-41
694
695 .inst   0xce8f949c      //xar v28.16b,v4.16b,v15.16b,#64-27
696
697         eor     v0.16b,v0.16b,v16.16b
698         ldr     x11,[x10],#8                            // fetch this round's constant, advance
699
700 .inst   0xce8bae4a      //xar v10.16b,   v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
701 .inst   0xce95c632      //xar v18.16b,v17.16b,v21.16b,#64-15
702 .inst   0xce94db71      //xar v17.16b,v27.16b,v20.16b,#64-10
703 .inst   0xce95e8fb      //xar v27.16b,v7.16b,v21.16b,#64-6
704 .inst   0xce90f747      //xar v7.16b,v26.16b,v16.16b,#64-3
705
706 .inst   0xce94fc3a      //xar v26.16b,v1.16b,v20.16b,#64-1      // *
707
708 .inst   0xce8fcb04      //xar v4.16b,v24.16b,v15.16b,#64-14
709 .inst   0xce94fbf8      //xar v24.16b,v31.16b,v20.16b,#64-2
710 .inst   0xce8b251f      //xar v31.16b,v8.16b,v11.16b,#64-55
711 .inst   0xce944fa8      //xar v8.16b,v29.16b,v20.16b,#64-45
712 .inst   0xce9070bd      //xar v29.16b,v5.16b,v16.16b,#64-36
713
714 .inst   0xce8b906b      //xar v11.16b,   v3.16b,v11.16b,#64-28  // C[2]=A[1][0]
715
716         ////////////////////////////////////////////////// Chi+Iota
717         dup     v21.2d,x11                              // borrow C[6]
718 .inst   0xce22180f      //bcax v15.16b,   v0.16b,v2.16b,v6.16b  // *
719 .inst   0xce2a08c1      //bcax v1.16b,v6.16b,   v10.16b,   v2.16b       // *
720 .inst   0xce242842      //bcax v2.16b,v2.16b,v4.16b,v10.16b
721 .inst   0xce201143      //bcax v3.16b,v10.16b,   v0.16b,v4.16b
722 .inst   0xce260084      //bcax v4.16b,v4.16b,v6.16b,   v0.16b
723
724 .inst   0xce276565      //bcax v5.16b,v11.16b,   v7.16b,v25.16b // *
725 .inst   0xce281f26      //bcax v6.16b,   v25.16b,v8.16b,v7.16b  // *
726 .inst   0xce2920e7      //bcax v7.16b,v7.16b,v9.16b,v8.16b
727 .inst   0xce2b2508      //bcax v8.16b,v8.16b,v11.16b,   v9.16b
728 .inst   0xce392d29      //bcax v9.16b,v9.16b,v25.16b,v11.16b
729
730         eor     v0.16b,v15.16b,v21.16b                  // Iota
731
732 .inst   0xce2c6f4a      //bcax v10.16b,   v26.16b,v12.16b,v27.16b       // *
733 .inst   0xce2d336b      //bcax v11.16b,   v27.16b,v13.16b,v12.16b       // *
734 .inst   0xce2e358c      //bcax v12.16b,v12.16b,v14.16b,v13.16b
735 .inst   0xce3a39ad      //bcax v13.16b,v13.16b,v26.16b,v14.16b
736 .inst   0xce3b69ce      //bcax v14.16b,v14.16b,v27.16b,v26.16b
737
738 .inst   0xce31778f      //bcax v15.16b,   v28.16b,v17.16b,v29.16b       // *
739 .inst   0xce3247b0      //bcax v16.16b,   v29.16b,v18.16b,v17.16b       // *
740 .inst   0xce334a31      //bcax v17.16b,v17.16b,v19.16b,v18.16b
741 .inst   0xce3c4e52      //bcax v18.16b,v18.16b,v28.16b,v19.16b
742 .inst   0xce3d7273      //bcax v19.16b,v19.16b,v29.16b,v28.16b
743
744 .inst   0xce367fd4      //bcax v20.16b,   v30.16b,v22.16b,v31.16b       // *
745 .inst   0xce375bf5      //bcax v21.16b,   v31.16b,v23.16b,v22.16b       // *
746 .inst   0xce385ed6      //bcax v22.16b,v22.16b,v24.16b,v23.16b
747 .inst   0xce3e62f7      //bcax v23.16b,v23.16b,v30.16b,v24.16b
748 .inst   0xce3f7b18      //bcax v24.16b,v24.16b,v31.16b,v30.16b
749         subs    x9,x9,#1                                // one round pair done
750         bne     .Loop_ce
751
752         ret
753 .size   KeccakF1600_ce,.-KeccakF1600_ce
754
// KeccakF1600_cext: apply the Keccak-f[1600] permutation in place to a state
// in memory, using the vector-instruction core KeccakF1600_ce (file-local
// symbol; no .globl here).
//
// In:    x0 = pointer to 25 consecutive 64-bit lanes.
// Out:   state at [x0] replaced by the permuted state.
// Saves: x29, x30 and d8-d15 in an 80-byte frame.
// Relies on KeccakF1600_ce leaving x0 untouched (it only uses x9-x11), so
// the same pointer is used for the loads and the stores.
755 .type   KeccakF1600_cext,%function
756 .align  5
757 KeccakF1600_cext:
758         stp     x29,x30,[sp,#-80]!
759         add     x29,sp,#0
760         stp     d8,d9,[sp,#16]          // per ABI requirement
761         stp     d10,d11,[sp,#32]
762         stp     d12,d13,[sp,#48]
763         stp     d14,d15,[sp,#64]
764         ldp     d0,d1,[x0,#8*0]
765         ldp     d2,d3,[x0,#8*2]
766         ldp     d4,d5,[x0,#8*4]
767         ldp     d6,d7,[x0,#8*6]
768         ldp     d8,d9,[x0,#8*8]
769         ldp     d10,d11,[x0,#8*10]
770         ldp     d12,d13,[x0,#8*12]
771         ldp     d14,d15,[x0,#8*14]
772         ldp     d16,d17,[x0,#8*16]
773         ldp     d18,d19,[x0,#8*18]
774         ldp     d20,d21,[x0,#8*20]
775         ldp     d22,d23,[x0,#8*22]
776         ldr     d24,[x0,#8*24]
777         bl      KeccakF1600_ce
778         ldr     x30,[sp,#8]             // `bl` overwrote x30; reload the return address from the frame
779         stp     d0,d1,[x0,#8*0]
780         stp     d2,d3,[x0,#8*2]
781         stp     d4,d5,[x0,#8*4]
782         stp     d6,d7,[x0,#8*6]
783         stp     d8,d9,[x0,#8*8]
784         stp     d10,d11,[x0,#8*10]
785         stp     d12,d13,[x0,#8*12]
786         stp     d14,d15,[x0,#8*14]
787         stp     d16,d17,[x0,#8*16]
788         stp     d18,d19,[x0,#8*18]
789         stp     d20,d21,[x0,#8*20]
790         stp     d22,d23,[x0,#8*22]
791         str     d24,[x0,#8*24]
792
793         ldp     d8,d9,[sp,#16]
794         ldp     d10,d11,[sp,#32]
795         ldp     d12,d13,[sp,#48]
796         ldp     d14,d15,[sp,#64]
797         ldr     x29,[sp],#80            // x30 was already reloaded above; restore x29 and pop the frame
798         ret
799 .size   KeccakF1600_cext,.-KeccakF1600_cext
800 .globl  SHA3_absorb_cext
801 .type   SHA3_absorb_cext,%function
802 .align  5
803 SHA3_absorb_cext:
804         stp     x29,x30,[sp,#-80]!
805         add     x29,sp,#0
806         stp     d8,d9,[sp,#16]          // per ABI requirement
807         stp     d10,d11,[sp,#32]
808         stp     d12,d13,[sp,#48]
809         stp     d14,d15,[sp,#64]
810         ldp     d0,d1,[x0,#8*0]
811         ldp     d2,d3,[x0,#8*2]
812         ldp     d4,d5,[x0,#8*4]
813         ldp     d6,d7,[x0,#8*6]
814         ldp     d8,d9,[x0,#8*8]
815         ldp     d10,d11,[x0,#8*10]
816         ldp     d12,d13,[x0,#8*12]
817         ldp     d14,d15,[x0,#8*14]
818         ldp     d16,d17,[x0,#8*16]
819         ldp     d18,d19,[x0,#8*18]
820         ldp     d20,d21,[x0,#8*20]
821         ldp     d22,d23,[x0,#8*22]
822         ldr     d24,[x0,#8*24]
823         b       .Loop_absorb_ce
824
825 .align  4
826 .Loop_absorb_ce:
827         subs    x2,x2,x3                // len - bsz
828         blo     .Labsorbed_ce
829         ldr     d31,[x1],#8             // *inp++
830 #ifdef  __AARCH64EB__
831         rev64   v31.16b,v31.16b
832 #endif
833         eor     v0.16b,v0.16b,v31.16b
834         cmp     x3,#8*(0+2)
835         blo     .Lprocess_block_ce
836         ldr     d31,[x1],#8             // *inp++
837 #ifdef  __AARCH64EB__
838         rev     v31.16b,v31.16b
839 #endif
840         eor     v1.16b,v1.16b,v31.16b
841         beq     .Lprocess_block_ce
842         ldr     d31,[x1],#8             // *inp++
843 #ifdef  __AARCH64EB__
844         rev64   v31.16b,v31.16b
845 #endif
846         eor     v2.16b,v2.16b,v31.16b
847         cmp     x3,#8*(2+2)
848         blo     .Lprocess_block_ce
849         ldr     d31,[x1],#8             // *inp++
850 #ifdef  __AARCH64EB__
851         rev     v31.16b,v31.16b
852 #endif
853         eor     v3.16b,v3.16b,v31.16b
854         beq     .Lprocess_block_ce
855         ldr     d31,[x1],#8             // *inp++
856 #ifdef  __AARCH64EB__
857         rev64   v31.16b,v31.16b
858 #endif
859         eor     v4.16b,v4.16b,v31.16b
860         cmp     x3,#8*(4+2)
861         blo     .Lprocess_block_ce
862         ldr     d31,[x1],#8             // *inp++
863 #ifdef  __AARCH64EB__
864         rev     v31.16b,v31.16b
865 #endif
866         eor     v5.16b,v5.16b,v31.16b
867         beq     .Lprocess_block_ce
868         ldr     d31,[x1],#8             // *inp++
869 #ifdef  __AARCH64EB__
870         rev64   v31.16b,v31.16b
871 #endif
872         eor     v6.16b,v6.16b,v31.16b
873         cmp     x3,#8*(6+2)
874         blo     .Lprocess_block_ce
875         ldr     d31,[x1],#8             // *inp++
876 #ifdef  __AARCH64EB__
877         rev     v31.16b,v31.16b
878 #endif
879         eor     v7.16b,v7.16b,v31.16b
880         beq     .Lprocess_block_ce
881         ldr     d31,[x1],#8             // *inp++
882 #ifdef  __AARCH64EB__
883         rev64   v31.16b,v31.16b
884 #endif
885         eor     v8.16b,v8.16b,v31.16b
886         cmp     x3,#8*(8+2)
887         blo     .Lprocess_block_ce
888         ldr     d31,[x1],#8             // *inp++
889 #ifdef  __AARCH64EB__
890         rev     v31.16b,v31.16b
891 #endif
892         eor     v9.16b,v9.16b,v31.16b
893         beq     .Lprocess_block_ce
894         ldr     d31,[x1],#8             // *inp++
895 #ifdef  __AARCH64EB__
896         rev64   v31.16b,v31.16b
897 #endif
898         eor     v10.16b,v10.16b,v31.16b
899         cmp     x3,#8*(10+2)
900         blo     .Lprocess_block_ce
901         ldr     d31,[x1],#8             // *inp++
902 #ifdef  __AARCH64EB__
903         rev     v31.16b,v31.16b
904 #endif
905         eor     v11.16b,v11.16b,v31.16b
906         beq     .Lprocess_block_ce
907         ldr     d31,[x1],#8             // *inp++
908 #ifdef  __AARCH64EB__
909         rev64   v31.16b,v31.16b
910 #endif
911         eor     v12.16b,v12.16b,v31.16b
912         cmp     x3,#8*(12+2)
913         blo     .Lprocess_block_ce
914         ldr     d31,[x1],#8             // *inp++
915 #ifdef  __AARCH64EB__
916         rev     v31.16b,v31.16b
917 #endif
918         eor     v13.16b,v13.16b,v31.16b
919         beq     .Lprocess_block_ce
920         ldr     d31,[x1],#8             // *inp++
921 #ifdef  __AARCH64EB__
922         rev64   v31.16b,v31.16b
923 #endif
924         eor     v14.16b,v14.16b,v31.16b
925         cmp     x3,#8*(14+2)
926         blo     .Lprocess_block_ce
927         ldr     d31,[x1],#8             // *inp++
928 #ifdef  __AARCH64EB__
929         rev     v31.16b,v31.16b
930 #endif
931         eor     v15.16b,v15.16b,v31.16b
932         beq     .Lprocess_block_ce
933         ldr     d31,[x1],#8             // *inp++
934 #ifdef  __AARCH64EB__
935         rev64   v31.16b,v31.16b
936 #endif
937         eor     v16.16b,v16.16b,v31.16b
938         cmp     x3,#8*(16+2)
939         blo     .Lprocess_block_ce
940         ldr     d31,[x1],#8             // *inp++
941 #ifdef  __AARCH64EB__
942         rev     v31.16b,v31.16b
943 #endif
944         eor     v17.16b,v17.16b,v31.16b
945         beq     .Lprocess_block_ce
946         ldr     d31,[x1],#8             // *inp++
947 #ifdef  __AARCH64EB__
948         rev64   v31.16b,v31.16b
949 #endif
950         eor     v18.16b,v18.16b,v31.16b
951         cmp     x3,#8*(18+2)
952         blo     .Lprocess_block_ce
953         ldr     d31,[x1],#8             // *inp++
954 #ifdef  __AARCH64EB__
955         rev     v31.16b,v31.16b
956 #endif
957         eor     v19.16b,v19.16b,v31.16b
958         beq     .Lprocess_block_ce
959         ldr     d31,[x1],#8             // *inp++
960 #ifdef  __AARCH64EB__
961         rev64   v31.16b,v31.16b
962 #endif
963         eor     v20.16b,v20.16b,v31.16b
964         cmp     x3,#8*(20+2)
965         blo     .Lprocess_block_ce
966         ldr     d31,[x1],#8             // *inp++
967 #ifdef  __AARCH64EB__
968         rev     v31.16b,v31.16b
969 #endif
970         eor     v21.16b,v21.16b,v31.16b
971         beq     .Lprocess_block_ce
972         ldr     d31,[x1],#8             // *inp++
973 #ifdef  __AARCH64EB__
974         rev64   v31.16b,v31.16b
975 #endif
976         eor     v22.16b,v22.16b,v31.16b
977         cmp     x3,#8*(22+2)
978         blo     .Lprocess_block_ce
979         ldr     d31,[x1],#8             // *inp++
980 #ifdef  __AARCH64EB__
981         rev     v31.16b,v31.16b
982 #endif
983         eor     v23.16b,v23.16b,v31.16b
984         beq     .Lprocess_block_ce
985         ldr     d31,[x1],#8             // *inp++
986 #ifdef  __AARCH64EB__
987         rev     v31.16b,v31.16b
988 #endif
989         eor     v24.16b,v24.16b,v31.16b
990
991 .Lprocess_block_ce:
992
993         bl      KeccakF1600_ce
994
995         b       .Loop_absorb_ce
996
997 .align  4
998 .Labsorbed_ce:
999         stp     d0,d1,[x0,#8*0]
1000         stp     d2,d3,[x0,#8*2]
1001         stp     d4,d5,[x0,#8*4]
1002         stp     d6,d7,[x0,#8*6]
1003         stp     d8,d9,[x0,#8*8]
1004         stp     d10,d11,[x0,#8*10]
1005         stp     d12,d13,[x0,#8*12]
1006         stp     d14,d15,[x0,#8*14]
1007         stp     d16,d17,[x0,#8*16]
1008         stp     d18,d19,[x0,#8*18]
1009         stp     d20,d21,[x0,#8*20]
1010         stp     d22,d23,[x0,#8*22]
1011         str     d24,[x0,#8*24]
1012         add     x0,x2,x3                // return value
1013
1014         ldp     d8,d9,[sp,#16]
1015         ldp     d10,d11,[sp,#32]
1016         ldp     d12,d13,[sp,#48]
1017         ldp     d14,d15,[sp,#64]
1018         ldp     x29,x30,[sp],#80
1019         ret
1020 .size   SHA3_absorb_cext,.-SHA3_absorb_cext
// SHA3_squeeze_cext: copy output bytes out of the Keccak-1600 state,
// running the permutation again each time a whole block has been emitted.
//
// In:      x0 = state, read as consecutive 64-bit lanes
//          x1 = out, advanced as bytes are stored
//          x2 = len, number of bytes still to be emitted
//          x3 = bsz, bytes taken from the state between permutations
// Scratch: x4 (current lane), x9 (read cursor into the state),
//          x10 (bytes left in the current block), flags
// Stack:   16 bytes holding x29/x30
//
// Relies on KeccakF1600_cext leaving x0-x3 intact: they are used again
// right after the call without being reloaded.
//
// NOTE(review): presumably the C-level contract matches the portable
// SHA3_squeeze(A, out, len, r) -- confirm against the caller's prototype.
.globl  SHA3_squeeze_cext
.type   SHA3_squeeze_cext,%function
.align  5
SHA3_squeeze_cext:
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
        // len == 0 must store nothing.  Without this guard the first
        // "cmp x2,#8 / blo" below sends a zero length to the byte tail,
        // which stores before it tests the count and ends up writing
        // 7 bytes past what the caller asked for.
        cbz     x2,.Lsqueeze_done_ce
        mov     x9,x0                   // read cursor = start of state
        mov     x10,x3                  // bytes left in this block = bsz

.Loop_squeeze_ce:
        ldr     x4,[x9],#8              // next lane, cursor += 8
        cmp     x2,#8
        blo     .Lsqueeze_tail_ce       // fewer than 8 bytes wanted
#ifdef  __AARCH64EB__
        rev     x4,x4                   // byte-swap so the store below is little-endian
#endif
        str     x4,[x1],#8
        beq     .Lsqueeze_done_ce       // flags still from cmp: len was exactly 8

        sub     x2,x2,#8                // len -= 8
        subs    x10,x10,#8              // block bytes left -= 8
        bhi     .Loop_squeeze_ce        // more lanes in this block

        bl      KeccakF1600_cext        // block exhausted: permute the state
        ldr     x30,[sp,#8]             // bl overwrote x30; reload saved return address
        mov     x9,x0                   // restart at the first lane
        mov     x10,x3
        b       .Loop_squeeze_ce

.align  4
        // Byte tail, entered with 1..7 bytes left.  The lane in x4 has not
        // been byte-swapped; its least significant byte is stored first and
        // x4 is shifted right by 8 between stores.  The seventh store needs
        // no test after it because at most 7 bytes can remain.
.Lsqueeze_tail_ce:
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1

.Lsqueeze_done_ce:
        ldr     x29,[sp],#16            // restore x29 and drop the frame; x30 is already live
        ret
.size   SHA3_squeeze_cext,.-SHA3_squeeze_cext
// NUL-terminated ASCII identification string embedded in the object:
// "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1082 .byte   75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align after the odd-length string (on AArch64 gas, .align takes a power of two).
1083 .align  2