]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/crypto/openssl/aarch64/sha512-armv8.S
Merge OpenSSL 1.1.1i.
[FreeBSD/FreeBSD.git] / sys / crypto / openssl / aarch64 / sha512-armv8.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from sha512-armv8.pl. */
3 // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
4 //
5 // Licensed under the OpenSSL license (the "License").  You may not use
6 // this file except in compliance with the License.  You can obtain a copy
7 // in the file LICENSE in the source distribution or at
8 // https://www.openssl.org/source/license.html
9
10 // ====================================================================
11 // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 // project. The module is, however, dual licensed under OpenSSL and
13 // CRYPTOGAMS licenses depending on where you obtain it. For further
14 // details see http://www.openssl.org/~appro/cryptogams/.
15 //
16 // Permission to use under GPLv2 terms is granted.
17 // ====================================================================
18 //
19 // SHA256/512 for ARMv8.
20 //
21 // Performance in cycles per processed byte and improvement coefficient
22 // over code generated with "default" compiler:
23 //
24 //              SHA256-hw       SHA256(*)       SHA512
25 // Apple A7     1.97            10.5 (+33%)     6.73 (-1%(**))
26 // Cortex-A53   2.38            15.5 (+115%)    10.0 (+150%(***))
27 // Cortex-A57   2.31            11.6 (+86%)     7.51 (+260%(***))
28 // Denver       2.01            10.5 (+26%)     6.70 (+8%)
29 // X-Gene                       20.0 (+100%)    12.8 (+300%(***))
30 // Mongoose     2.36            13.0 (+50%)     8.36 (+33%)
31 // Kryo         1.92            17.4 (+30%)     11.2 (+8%)
32 //
33 // (*)  Software SHA256 results are of lesser relevance, presented
34 //      mostly for informational purposes.
35 // (**) The result is a trade-off: it's possible to improve it by
36 //      10% (or by 1 cycle per round), but at the cost of 20% loss
37 //      on Cortex-A53 (or by 4 cycles per round).
38 // (***)        Super-impressive coefficients over gcc-generated code are
39 //      an indication of some compiler "pathology"; most notably, code
40 //      generated with -mgeneral-regs-only is significantly faster
41 //      and the gap is only 40-90%.
42 //
43 // October 2016.
44 //
45 // Originally it was reckoned that it makes no sense to implement a NEON
46 // version of SHA256 for 64-bit processors. This is because the performance
47 // improvement on the most widespread Cortex-A5x processors was observed
48 // to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49 // observed that 32-bit NEON SHA256 performs significantly better than
50 // the 64-bit scalar version on *some* of the more recent processors. As
51 // a result, a 64-bit NEON version of SHA256 was added to provide the best
52 // all-round performance. For example, it executes ~30% faster on X-Gene
53 // and Mongoose. [For reference, a NEON version of SHA512 is bound to
54 // deliver much less improvement, likely *negative* on Cortex-A5x,
55 // which is why NEON support is limited to SHA256.]
56
57 #ifndef __KERNEL__
58 # include "arm_arch.h"
59 #endif
60
61 .text
62
63
64 .hidden OPENSSL_armcap_P
65 .globl  sha512_block_data_order
66 .type   sha512_block_data_order,%function
67 .align  6
68 sha512_block_data_order:
69 #ifndef __KERNEL__
70 # ifdef __ILP32__
71         ldrsw   x16,.LOPENSSL_armcap_P
72 # else
73         ldr     x16,.LOPENSSL_armcap_P
74 # endif
75         adr     x17,.LOPENSSL_armcap_P
76         add     x16,x16,x17
77         ldr     w16,[x16]
78         tst     w16,#ARMV8_SHA512
79         b.ne    .Lv8_entry
80 #endif
81 .inst   0xd503233f                              // paciasp
82         stp     x29,x30,[sp,#-128]!
83         add     x29,sp,#0
84
85         stp     x19,x20,[sp,#16]
86         stp     x21,x22,[sp,#32]
87         stp     x23,x24,[sp,#48]
88         stp     x25,x26,[sp,#64]
89         stp     x27,x28,[sp,#80]
90         sub     sp,sp,#4*8
91
92         ldp     x20,x21,[x0]                            // load context
93         ldp     x22,x23,[x0,#2*8]
94         ldp     x24,x25,[x0,#4*8]
95         add     x2,x1,x2,lsl#7  // end of input
96         ldp     x26,x27,[x0,#6*8]
97         adr     x30,.LK512
98         stp     x0,x2,[x29,#96]
99
100 .Loop:
101         ldp     x3,x4,[x1],#2*8
102         ldr     x19,[x30],#8                    // *K++
103         eor     x28,x21,x22                             // magic seed
104         str     x1,[x29,#112]
105 #ifndef __AARCH64EB__
106         rev     x3,x3                   // 0
107 #endif
108         ror     x16,x24,#14
109         add     x27,x27,x19                     // h+=K[i]
110         eor     x6,x24,x24,ror#23
111         and     x17,x25,x24
112         bic     x19,x26,x24
113         add     x27,x27,x3                      // h+=X[i]
114         orr     x17,x17,x19                     // Ch(e,f,g)
115         eor     x19,x20,x21                     // a^b, b^c in next round
116         eor     x16,x16,x6,ror#18       // Sigma1(e)
117         ror     x6,x20,#28
118         add     x27,x27,x17                     // h+=Ch(e,f,g)
119         eor     x17,x20,x20,ror#5
120         add     x27,x27,x16                     // h+=Sigma1(e)
121         and     x28,x28,x19                     // (b^c)&=(a^b)
122         add     x23,x23,x27                     // d+=h
123         eor     x28,x28,x21                     // Maj(a,b,c)
124         eor     x17,x6,x17,ror#34       // Sigma0(a)
125         add     x27,x27,x28                     // h+=Maj(a,b,c)
126         ldr     x28,[x30],#8            // *K++, x19 in next round
127         //add   x27,x27,x17                     // h+=Sigma0(a)
128 #ifndef __AARCH64EB__
129         rev     x4,x4                   // 1
130 #endif
131         ldp     x5,x6,[x1],#2*8
132         add     x27,x27,x17                     // h+=Sigma0(a)
133         ror     x16,x23,#14
134         add     x26,x26,x28                     // h+=K[i]
135         eor     x7,x23,x23,ror#23
136         and     x17,x24,x23
137         bic     x28,x25,x23
138         add     x26,x26,x4                      // h+=X[i]
139         orr     x17,x17,x28                     // Ch(e,f,g)
140         eor     x28,x27,x20                     // a^b, b^c in next round
141         eor     x16,x16,x7,ror#18       // Sigma1(e)
142         ror     x7,x27,#28
143         add     x26,x26,x17                     // h+=Ch(e,f,g)
144         eor     x17,x27,x27,ror#5
145         add     x26,x26,x16                     // h+=Sigma1(e)
146         and     x19,x19,x28                     // (b^c)&=(a^b)
147         add     x22,x22,x26                     // d+=h
148         eor     x19,x19,x20                     // Maj(a,b,c)
149         eor     x17,x7,x17,ror#34       // Sigma0(a)
150         add     x26,x26,x19                     // h+=Maj(a,b,c)
151         ldr     x19,[x30],#8            // *K++, x28 in next round
152         //add   x26,x26,x17                     // h+=Sigma0(a)
153 #ifndef __AARCH64EB__
154         rev     x5,x5                   // 2
155 #endif
156         add     x26,x26,x17                     // h+=Sigma0(a)
157         ror     x16,x22,#14
158         add     x25,x25,x19                     // h+=K[i]
159         eor     x8,x22,x22,ror#23
160         and     x17,x23,x22
161         bic     x19,x24,x22
162         add     x25,x25,x5                      // h+=X[i]
163         orr     x17,x17,x19                     // Ch(e,f,g)
164         eor     x19,x26,x27                     // a^b, b^c in next round
165         eor     x16,x16,x8,ror#18       // Sigma1(e)
166         ror     x8,x26,#28
167         add     x25,x25,x17                     // h+=Ch(e,f,g)
168         eor     x17,x26,x26,ror#5
169         add     x25,x25,x16                     // h+=Sigma1(e)
170         and     x28,x28,x19                     // (b^c)&=(a^b)
171         add     x21,x21,x25                     // d+=h
172         eor     x28,x28,x27                     // Maj(a,b,c)
173         eor     x17,x8,x17,ror#34       // Sigma0(a)
174         add     x25,x25,x28                     // h+=Maj(a,b,c)
175         ldr     x28,[x30],#8            // *K++, x19 in next round
176         //add   x25,x25,x17                     // h+=Sigma0(a)
177 #ifndef __AARCH64EB__
178         rev     x6,x6                   // 3
179 #endif
180         ldp     x7,x8,[x1],#2*8
181         add     x25,x25,x17                     // h+=Sigma0(a)
182         ror     x16,x21,#14
183         add     x24,x24,x28                     // h+=K[i]
184         eor     x9,x21,x21,ror#23
185         and     x17,x22,x21
186         bic     x28,x23,x21
187         add     x24,x24,x6                      // h+=X[i]
188         orr     x17,x17,x28                     // Ch(e,f,g)
189         eor     x28,x25,x26                     // a^b, b^c in next round
190         eor     x16,x16,x9,ror#18       // Sigma1(e)
191         ror     x9,x25,#28
192         add     x24,x24,x17                     // h+=Ch(e,f,g)
193         eor     x17,x25,x25,ror#5
194         add     x24,x24,x16                     // h+=Sigma1(e)
195         and     x19,x19,x28                     // (b^c)&=(a^b)
196         add     x20,x20,x24                     // d+=h
197         eor     x19,x19,x26                     // Maj(a,b,c)
198         eor     x17,x9,x17,ror#34       // Sigma0(a)
199         add     x24,x24,x19                     // h+=Maj(a,b,c)
200         ldr     x19,[x30],#8            // *K++, x28 in next round
201         //add   x24,x24,x17                     // h+=Sigma0(a)
202 #ifndef __AARCH64EB__
203         rev     x7,x7                   // 4
204 #endif
205         add     x24,x24,x17                     // h+=Sigma0(a)
206         ror     x16,x20,#14
207         add     x23,x23,x19                     // h+=K[i]
208         eor     x10,x20,x20,ror#23
209         and     x17,x21,x20
210         bic     x19,x22,x20
211         add     x23,x23,x7                      // h+=X[i]
212         orr     x17,x17,x19                     // Ch(e,f,g)
213         eor     x19,x24,x25                     // a^b, b^c in next round
214         eor     x16,x16,x10,ror#18      // Sigma1(e)
215         ror     x10,x24,#28
216         add     x23,x23,x17                     // h+=Ch(e,f,g)
217         eor     x17,x24,x24,ror#5
218         add     x23,x23,x16                     // h+=Sigma1(e)
219         and     x28,x28,x19                     // (b^c)&=(a^b)
220         add     x27,x27,x23                     // d+=h
221         eor     x28,x28,x25                     // Maj(a,b,c)
222         eor     x17,x10,x17,ror#34      // Sigma0(a)
223         add     x23,x23,x28                     // h+=Maj(a,b,c)
224         ldr     x28,[x30],#8            // *K++, x19 in next round
225         //add   x23,x23,x17                     // h+=Sigma0(a)
226 #ifndef __AARCH64EB__
227         rev     x8,x8                   // 5
228 #endif
229         ldp     x9,x10,[x1],#2*8
230         add     x23,x23,x17                     // h+=Sigma0(a)
231         ror     x16,x27,#14
232         add     x22,x22,x28                     // h+=K[i]
233         eor     x11,x27,x27,ror#23
234         and     x17,x20,x27
235         bic     x28,x21,x27
236         add     x22,x22,x8                      // h+=X[i]
237         orr     x17,x17,x28                     // Ch(e,f,g)
238         eor     x28,x23,x24                     // a^b, b^c in next round
239         eor     x16,x16,x11,ror#18      // Sigma1(e)
240         ror     x11,x23,#28
241         add     x22,x22,x17                     // h+=Ch(e,f,g)
242         eor     x17,x23,x23,ror#5
243         add     x22,x22,x16                     // h+=Sigma1(e)
244         and     x19,x19,x28                     // (b^c)&=(a^b)
245         add     x26,x26,x22                     // d+=h
246         eor     x19,x19,x24                     // Maj(a,b,c)
247         eor     x17,x11,x17,ror#34      // Sigma0(a)
248         add     x22,x22,x19                     // h+=Maj(a,b,c)
249         ldr     x19,[x30],#8            // *K++, x28 in next round
250         //add   x22,x22,x17                     // h+=Sigma0(a)
251 #ifndef __AARCH64EB__
252         rev     x9,x9                   // 6
253 #endif
254         add     x22,x22,x17                     // h+=Sigma0(a)
255         ror     x16,x26,#14
256         add     x21,x21,x19                     // h+=K[i]
257         eor     x12,x26,x26,ror#23
258         and     x17,x27,x26
259         bic     x19,x20,x26
260         add     x21,x21,x9                      // h+=X[i]
261         orr     x17,x17,x19                     // Ch(e,f,g)
262         eor     x19,x22,x23                     // a^b, b^c in next round
263         eor     x16,x16,x12,ror#18      // Sigma1(e)
264         ror     x12,x22,#28
265         add     x21,x21,x17                     // h+=Ch(e,f,g)
266         eor     x17,x22,x22,ror#5
267         add     x21,x21,x16                     // h+=Sigma1(e)
268         and     x28,x28,x19                     // (b^c)&=(a^b)
269         add     x25,x25,x21                     // d+=h
270         eor     x28,x28,x23                     // Maj(a,b,c)
271         eor     x17,x12,x17,ror#34      // Sigma0(a)
272         add     x21,x21,x28                     // h+=Maj(a,b,c)
273         ldr     x28,[x30],#8            // *K++, x19 in next round
274         //add   x21,x21,x17                     // h+=Sigma0(a)
275 #ifndef __AARCH64EB__
276         rev     x10,x10                 // 7
277 #endif
278         ldp     x11,x12,[x1],#2*8
279         add     x21,x21,x17                     // h+=Sigma0(a)
280         ror     x16,x25,#14
281         add     x20,x20,x28                     // h+=K[i]
282         eor     x13,x25,x25,ror#23
283         and     x17,x26,x25
284         bic     x28,x27,x25
285         add     x20,x20,x10                     // h+=X[i]
286         orr     x17,x17,x28                     // Ch(e,f,g)
287         eor     x28,x21,x22                     // a^b, b^c in next round
288         eor     x16,x16,x13,ror#18      // Sigma1(e)
289         ror     x13,x21,#28
290         add     x20,x20,x17                     // h+=Ch(e,f,g)
291         eor     x17,x21,x21,ror#5
292         add     x20,x20,x16                     // h+=Sigma1(e)
293         and     x19,x19,x28                     // (b^c)&=(a^b)
294         add     x24,x24,x20                     // d+=h
295         eor     x19,x19,x22                     // Maj(a,b,c)
296         eor     x17,x13,x17,ror#34      // Sigma0(a)
297         add     x20,x20,x19                     // h+=Maj(a,b,c)
298         ldr     x19,[x30],#8            // *K++, x28 in next round
299         //add   x20,x20,x17                     // h+=Sigma0(a)
300 #ifndef __AARCH64EB__
301         rev     x11,x11                 // 8
302 #endif
303         add     x20,x20,x17                     // h+=Sigma0(a)
304         ror     x16,x24,#14
305         add     x27,x27,x19                     // h+=K[i]
306         eor     x14,x24,x24,ror#23
307         and     x17,x25,x24
308         bic     x19,x26,x24
309         add     x27,x27,x11                     // h+=X[i]
310         orr     x17,x17,x19                     // Ch(e,f,g)
311         eor     x19,x20,x21                     // a^b, b^c in next round
312         eor     x16,x16,x14,ror#18      // Sigma1(e)
313         ror     x14,x20,#28
314         add     x27,x27,x17                     // h+=Ch(e,f,g)
315         eor     x17,x20,x20,ror#5
316         add     x27,x27,x16                     // h+=Sigma1(e)
317         and     x28,x28,x19                     // (b^c)&=(a^b)
318         add     x23,x23,x27                     // d+=h
319         eor     x28,x28,x21                     // Maj(a,b,c)
320         eor     x17,x14,x17,ror#34      // Sigma0(a)
321         add     x27,x27,x28                     // h+=Maj(a,b,c)
322         ldr     x28,[x30],#8            // *K++, x19 in next round
323         //add   x27,x27,x17                     // h+=Sigma0(a)
324 #ifndef __AARCH64EB__
325         rev     x12,x12                 // 9
326 #endif
327         ldp     x13,x14,[x1],#2*8
328         add     x27,x27,x17                     // h+=Sigma0(a)
329         ror     x16,x23,#14
330         add     x26,x26,x28                     // h+=K[i]
331         eor     x15,x23,x23,ror#23
332         and     x17,x24,x23
333         bic     x28,x25,x23
334         add     x26,x26,x12                     // h+=X[i]
335         orr     x17,x17,x28                     // Ch(e,f,g)
336         eor     x28,x27,x20                     // a^b, b^c in next round
337         eor     x16,x16,x15,ror#18      // Sigma1(e)
338         ror     x15,x27,#28
339         add     x26,x26,x17                     // h+=Ch(e,f,g)
340         eor     x17,x27,x27,ror#5
341         add     x26,x26,x16                     // h+=Sigma1(e)
342         and     x19,x19,x28                     // (b^c)&=(a^b)
343         add     x22,x22,x26                     // d+=h
344         eor     x19,x19,x20                     // Maj(a,b,c)
345         eor     x17,x15,x17,ror#34      // Sigma0(a)
346         add     x26,x26,x19                     // h+=Maj(a,b,c)
347         ldr     x19,[x30],#8            // *K++, x28 in next round
348         //add   x26,x26,x17                     // h+=Sigma0(a)
349 #ifndef __AARCH64EB__
350         rev     x13,x13                 // 10
351 #endif
352         add     x26,x26,x17                     // h+=Sigma0(a)
353         ror     x16,x22,#14
354         add     x25,x25,x19                     // h+=K[i]
355         eor     x0,x22,x22,ror#23
356         and     x17,x23,x22
357         bic     x19,x24,x22
358         add     x25,x25,x13                     // h+=X[i]
359         orr     x17,x17,x19                     // Ch(e,f,g)
360         eor     x19,x26,x27                     // a^b, b^c in next round
361         eor     x16,x16,x0,ror#18       // Sigma1(e)
362         ror     x0,x26,#28
363         add     x25,x25,x17                     // h+=Ch(e,f,g)
364         eor     x17,x26,x26,ror#5
365         add     x25,x25,x16                     // h+=Sigma1(e)
366         and     x28,x28,x19                     // (b^c)&=(a^b)
367         add     x21,x21,x25                     // d+=h
368         eor     x28,x28,x27                     // Maj(a,b,c)
369         eor     x17,x0,x17,ror#34       // Sigma0(a)
370         add     x25,x25,x28                     // h+=Maj(a,b,c)
371         ldr     x28,[x30],#8            // *K++, x19 in next round
372         //add   x25,x25,x17                     // h+=Sigma0(a)
373 #ifndef __AARCH64EB__
374         rev     x14,x14                 // 11
375 #endif
376         ldp     x15,x0,[x1],#2*8
377         add     x25,x25,x17                     // h+=Sigma0(a)
378         str     x6,[sp,#24]
379         ror     x16,x21,#14
380         add     x24,x24,x28                     // h+=K[i]
381         eor     x6,x21,x21,ror#23
382         and     x17,x22,x21
383         bic     x28,x23,x21
384         add     x24,x24,x14                     // h+=X[i]
385         orr     x17,x17,x28                     // Ch(e,f,g)
386         eor     x28,x25,x26                     // a^b, b^c in next round
387         eor     x16,x16,x6,ror#18       // Sigma1(e)
388         ror     x6,x25,#28
389         add     x24,x24,x17                     // h+=Ch(e,f,g)
390         eor     x17,x25,x25,ror#5
391         add     x24,x24,x16                     // h+=Sigma1(e)
392         and     x19,x19,x28                     // (b^c)&=(a^b)
393         add     x20,x20,x24                     // d+=h
394         eor     x19,x19,x26                     // Maj(a,b,c)
395         eor     x17,x6,x17,ror#34       // Sigma0(a)
396         add     x24,x24,x19                     // h+=Maj(a,b,c)
397         ldr     x19,[x30],#8            // *K++, x28 in next round
398         //add   x24,x24,x17                     // h+=Sigma0(a)
399 #ifndef __AARCH64EB__
400         rev     x15,x15                 // 12
401 #endif
402         add     x24,x24,x17                     // h+=Sigma0(a)
403         str     x7,[sp,#0]
404         ror     x16,x20,#14
405         add     x23,x23,x19                     // h+=K[i]
406         eor     x7,x20,x20,ror#23
407         and     x17,x21,x20
408         bic     x19,x22,x20
409         add     x23,x23,x15                     // h+=X[i]
410         orr     x17,x17,x19                     // Ch(e,f,g)
411         eor     x19,x24,x25                     // a^b, b^c in next round
412         eor     x16,x16,x7,ror#18       // Sigma1(e)
413         ror     x7,x24,#28
414         add     x23,x23,x17                     // h+=Ch(e,f,g)
415         eor     x17,x24,x24,ror#5
416         add     x23,x23,x16                     // h+=Sigma1(e)
417         and     x28,x28,x19                     // (b^c)&=(a^b)
418         add     x27,x27,x23                     // d+=h
419         eor     x28,x28,x25                     // Maj(a,b,c)
420         eor     x17,x7,x17,ror#34       // Sigma0(a)
421         add     x23,x23,x28                     // h+=Maj(a,b,c)
422         ldr     x28,[x30],#8            // *K++, x19 in next round
423         //add   x23,x23,x17                     // h+=Sigma0(a)
424 #ifndef __AARCH64EB__
425         rev     x0,x0                   // 13
426 #endif
427         ldp     x1,x2,[x1]
428         add     x23,x23,x17                     // h+=Sigma0(a)
429         str     x8,[sp,#8]
430         ror     x16,x27,#14
431         add     x22,x22,x28                     // h+=K[i]
432         eor     x8,x27,x27,ror#23
433         and     x17,x20,x27
434         bic     x28,x21,x27
435         add     x22,x22,x0                      // h+=X[i]
436         orr     x17,x17,x28                     // Ch(e,f,g)
437         eor     x28,x23,x24                     // a^b, b^c in next round
438         eor     x16,x16,x8,ror#18       // Sigma1(e)
439         ror     x8,x23,#28
440         add     x22,x22,x17                     // h+=Ch(e,f,g)
441         eor     x17,x23,x23,ror#5
442         add     x22,x22,x16                     // h+=Sigma1(e)
443         and     x19,x19,x28                     // (b^c)&=(a^b)
444         add     x26,x26,x22                     // d+=h
445         eor     x19,x19,x24                     // Maj(a,b,c)
446         eor     x17,x8,x17,ror#34       // Sigma0(a)
447         add     x22,x22,x19                     // h+=Maj(a,b,c)
448         ldr     x19,[x30],#8            // *K++, x28 in next round
449         //add   x22,x22,x17                     // h+=Sigma0(a)
450 #ifndef __AARCH64EB__
451         rev     x1,x1                   // 14
452 #endif
453         ldr     x6,[sp,#24]
454         add     x22,x22,x17                     // h+=Sigma0(a)
455         str     x9,[sp,#16]
456         ror     x16,x26,#14
457         add     x21,x21,x19                     // h+=K[i]
458         eor     x9,x26,x26,ror#23
459         and     x17,x27,x26
460         bic     x19,x20,x26
461         add     x21,x21,x1                      // h+=X[i]
462         orr     x17,x17,x19                     // Ch(e,f,g)
463         eor     x19,x22,x23                     // a^b, b^c in next round
464         eor     x16,x16,x9,ror#18       // Sigma1(e)
465         ror     x9,x22,#28
466         add     x21,x21,x17                     // h+=Ch(e,f,g)
467         eor     x17,x22,x22,ror#5
468         add     x21,x21,x16                     // h+=Sigma1(e)
469         and     x28,x28,x19                     // (b^c)&=(a^b)
470         add     x25,x25,x21                     // d+=h
471         eor     x28,x28,x23                     // Maj(a,b,c)
472         eor     x17,x9,x17,ror#34       // Sigma0(a)
473         add     x21,x21,x28                     // h+=Maj(a,b,c)
474         ldr     x28,[x30],#8            // *K++, x19 in next round
475         //add   x21,x21,x17                     // h+=Sigma0(a)
476 #ifndef __AARCH64EB__
477         rev     x2,x2                   // 15
478 #endif
479         ldr     x7,[sp,#0]
480         add     x21,x21,x17                     // h+=Sigma0(a)
481         str     x10,[sp,#24]
482         ror     x16,x25,#14
483         add     x20,x20,x28                     // h+=K[i]
484         ror     x9,x4,#1
485         and     x17,x26,x25
486         ror     x8,x1,#19
487         bic     x28,x27,x25
488         ror     x10,x21,#28
489         add     x20,x20,x2                      // h+=X[i]
490         eor     x16,x16,x25,ror#18
491         eor     x9,x9,x4,ror#8
492         orr     x17,x17,x28                     // Ch(e,f,g)
493         eor     x28,x21,x22                     // a^b, b^c in next round
494         eor     x16,x16,x25,ror#41      // Sigma1(e)
495         eor     x10,x10,x21,ror#34
496         add     x20,x20,x17                     // h+=Ch(e,f,g)
497         and     x19,x19,x28                     // (b^c)&=(a^b)
498         eor     x8,x8,x1,ror#61
499         eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
500         add     x20,x20,x16                     // h+=Sigma1(e)
501         eor     x19,x19,x22                     // Maj(a,b,c)
502         eor     x17,x10,x21,ror#39      // Sigma0(a)
503         eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
504         add     x3,x3,x12
505         add     x24,x24,x20                     // d+=h
506         add     x20,x20,x19                     // h+=Maj(a,b,c)
507         ldr     x19,[x30],#8            // *K++, x28 in next round
508         add     x3,x3,x9
509         add     x20,x20,x17                     // h+=Sigma0(a)
510         add     x3,x3,x8
511 .Loop_16_xx:
512         ldr     x8,[sp,#8]
513         str     x11,[sp,#0]
514         ror     x16,x24,#14
515         add     x27,x27,x19                     // h+=K[i]
516         ror     x10,x5,#1
517         and     x17,x25,x24
518         ror     x9,x2,#19
519         bic     x19,x26,x24
520         ror     x11,x20,#28
521         add     x27,x27,x3                      // h+=X[i]
522         eor     x16,x16,x24,ror#18
523         eor     x10,x10,x5,ror#8
524         orr     x17,x17,x19                     // Ch(e,f,g)
525         eor     x19,x20,x21                     // a^b, b^c in next round
526         eor     x16,x16,x24,ror#41      // Sigma1(e)
527         eor     x11,x11,x20,ror#34
528         add     x27,x27,x17                     // h+=Ch(e,f,g)
529         and     x28,x28,x19                     // (b^c)&=(a^b)
530         eor     x9,x9,x2,ror#61
531         eor     x10,x10,x5,lsr#7        // sigma0(X[i+1])
532         add     x27,x27,x16                     // h+=Sigma1(e)
533         eor     x28,x28,x21                     // Maj(a,b,c)
534         eor     x17,x11,x20,ror#39      // Sigma0(a)
535         eor     x9,x9,x2,lsr#6  // sigma1(X[i+14])
536         add     x4,x4,x13
537         add     x23,x23,x27                     // d+=h
538         add     x27,x27,x28                     // h+=Maj(a,b,c)
539         ldr     x28,[x30],#8            // *K++, x19 in next round
540         add     x4,x4,x10
541         add     x27,x27,x17                     // h+=Sigma0(a)
542         add     x4,x4,x9
543         ldr     x9,[sp,#16]
544         str     x12,[sp,#8]
545         ror     x16,x23,#14
546         add     x26,x26,x28                     // h+=K[i]
547         ror     x11,x6,#1
548         and     x17,x24,x23
549         ror     x10,x3,#19
550         bic     x28,x25,x23
551         ror     x12,x27,#28
552         add     x26,x26,x4                      // h+=X[i]
553         eor     x16,x16,x23,ror#18
554         eor     x11,x11,x6,ror#8
555         orr     x17,x17,x28                     // Ch(e,f,g)
556         eor     x28,x27,x20                     // a^b, b^c in next round
557         eor     x16,x16,x23,ror#41      // Sigma1(e)
558         eor     x12,x12,x27,ror#34
559         add     x26,x26,x17                     // h+=Ch(e,f,g)
560         and     x19,x19,x28                     // (b^c)&=(a^b)
561         eor     x10,x10,x3,ror#61
562         eor     x11,x11,x6,lsr#7        // sigma0(X[i+1])
563         add     x26,x26,x16                     // h+=Sigma1(e)
564         eor     x19,x19,x20                     // Maj(a,b,c)
565         eor     x17,x12,x27,ror#39      // Sigma0(a)
566         eor     x10,x10,x3,lsr#6        // sigma1(X[i+14])
567         add     x5,x5,x14
568         add     x22,x22,x26                     // d+=h
569         add     x26,x26,x19                     // h+=Maj(a,b,c)
570         ldr     x19,[x30],#8            // *K++, x28 in next round
571         add     x5,x5,x11
572         add     x26,x26,x17                     // h+=Sigma0(a)
573         add     x5,x5,x10
574         ldr     x10,[sp,#24]
575         str     x13,[sp,#16]
576         ror     x16,x22,#14
577         add     x25,x25,x19                     // h+=K[i]
578         ror     x12,x7,#1
579         and     x17,x23,x22
580         ror     x11,x4,#19
581         bic     x19,x24,x22
582         ror     x13,x26,#28
583         add     x25,x25,x5                      // h+=X[i]
584         eor     x16,x16,x22,ror#18
585         eor     x12,x12,x7,ror#8
586         orr     x17,x17,x19                     // Ch(e,f,g)
587         eor     x19,x26,x27                     // a^b, b^c in next round
588         eor     x16,x16,x22,ror#41      // Sigma1(e)
589         eor     x13,x13,x26,ror#34
590         add     x25,x25,x17                     // h+=Ch(e,f,g)
591         and     x28,x28,x19                     // (b^c)&=(a^b)
592         eor     x11,x11,x4,ror#61
593         eor     x12,x12,x7,lsr#7        // sigma0(X[i+1])
594         add     x25,x25,x16                     // h+=Sigma1(e)
595         eor     x28,x28,x27                     // Maj(a,b,c)
596         eor     x17,x13,x26,ror#39      // Sigma0(a)
597         eor     x11,x11,x4,lsr#6        // sigma1(X[i+14])
598         add     x6,x6,x15
599         add     x21,x21,x25                     // d+=h
600         add     x25,x25,x28                     // h+=Maj(a,b,c)
601         ldr     x28,[x30],#8            // *K++, x19 in next round
602         add     x6,x6,x12
603         add     x25,x25,x17                     // h+=Sigma0(a)
604         add     x6,x6,x11
605         ldr     x11,[sp,#0]
606         str     x14,[sp,#24]
607         ror     x16,x21,#14
608         add     x24,x24,x28                     // h+=K[i]
609         ror     x13,x8,#1
610         and     x17,x22,x21
611         ror     x12,x5,#19
612         bic     x28,x23,x21
613         ror     x14,x25,#28
614         add     x24,x24,x6                      // h+=X[i]
615         eor     x16,x16,x21,ror#18
616         eor     x13,x13,x8,ror#8
617         orr     x17,x17,x28                     // Ch(e,f,g)
618         eor     x28,x25,x26                     // a^b, b^c in next round
619         eor     x16,x16,x21,ror#41      // Sigma1(e)
620         eor     x14,x14,x25,ror#34
621         add     x24,x24,x17                     // h+=Ch(e,f,g)
622         and     x19,x19,x28                     // (b^c)&=(a^b)
623         eor     x12,x12,x5,ror#61
624         eor     x13,x13,x8,lsr#7        // sigma0(X[i+1])
625         add     x24,x24,x16                     // h+=Sigma1(e)
626         eor     x19,x19,x26                     // Maj(a,b,c)
627         eor     x17,x14,x25,ror#39      // Sigma0(a)
628         eor     x12,x12,x5,lsr#6        // sigma1(X[i+14])
629         add     x7,x7,x0
630         add     x20,x20,x24                     // d+=h
631         add     x24,x24,x19                     // h+=Maj(a,b,c)
632         ldr     x19,[x30],#8            // *K++, x28 in next round
633         add     x7,x7,x13
634         add     x24,x24,x17                     // h+=Sigma0(a)
635         add     x7,x7,x12
636         ldr     x12,[sp,#8]
637         str     x15,[sp,#0]
638         ror     x16,x20,#14
639         add     x23,x23,x19                     // h+=K[i]
640         ror     x14,x9,#1
641         and     x17,x21,x20
642         ror     x13,x6,#19
643         bic     x19,x22,x20
644         ror     x15,x24,#28
645         add     x23,x23,x7                      // h+=X[i]
646         eor     x16,x16,x20,ror#18
647         eor     x14,x14,x9,ror#8
648         orr     x17,x17,x19                     // Ch(e,f,g)
649         eor     x19,x24,x25                     // a^b, b^c in next round
650         eor     x16,x16,x20,ror#41      // Sigma1(e)
651         eor     x15,x15,x24,ror#34
652         add     x23,x23,x17                     // h+=Ch(e,f,g)
653         and     x28,x28,x19                     // (b^c)&=(a^b)
654         eor     x13,x13,x6,ror#61
655         eor     x14,x14,x9,lsr#7        // sigma0(X[i+1])
656         add     x23,x23,x16                     // h+=Sigma1(e)
657         eor     x28,x28,x25                     // Maj(a,b,c)
658         eor     x17,x15,x24,ror#39      // Sigma0(a)
659         eor     x13,x13,x6,lsr#6        // sigma1(X[i+14])
660         add     x8,x8,x1
661         add     x27,x27,x23                     // d+=h
662         add     x23,x23,x28                     // h+=Maj(a,b,c)
663         ldr     x28,[x30],#8            // *K++, x19 in next round
664         add     x8,x8,x14
665         add     x23,x23,x17                     // h+=Sigma0(a)
666         add     x8,x8,x13
667         ldr     x13,[sp,#16]
668         str     x0,[sp,#8]
669         ror     x16,x27,#14
670         add     x22,x22,x28                     // h+=K[i]
671         ror     x15,x10,#1
672         and     x17,x20,x27
673         ror     x14,x7,#19
674         bic     x28,x21,x27
675         ror     x0,x23,#28
676         add     x22,x22,x8                      // h+=X[i]
677         eor     x16,x16,x27,ror#18
678         eor     x15,x15,x10,ror#8
679         orr     x17,x17,x28                     // Ch(e,f,g)
680         eor     x28,x23,x24                     // a^b, b^c in next round
681         eor     x16,x16,x27,ror#41      // Sigma1(e)
682         eor     x0,x0,x23,ror#34
683         add     x22,x22,x17                     // h+=Ch(e,f,g)
684         and     x19,x19,x28                     // (b^c)&=(a^b)
685         eor     x14,x14,x7,ror#61
686         eor     x15,x15,x10,lsr#7       // sigma0(X[i+1])
687         add     x22,x22,x16                     // h+=Sigma1(e)
688         eor     x19,x19,x24                     // Maj(a,b,c)
689         eor     x17,x0,x23,ror#39       // Sigma0(a)
690         eor     x14,x14,x7,lsr#6        // sigma1(X[i+14])
691         add     x9,x9,x2
692         add     x26,x26,x22                     // d+=h
693         add     x22,x22,x19                     // h+=Maj(a,b,c)
694         ldr     x19,[x30],#8            // *K++, x28 in next round
695         add     x9,x9,x15
696         add     x22,x22,x17                     // h+=Sigma0(a)
697         add     x9,x9,x14
698         ldr     x14,[sp,#24]
699         str     x1,[sp,#16]
700         ror     x16,x26,#14
701         add     x21,x21,x19                     // h+=K[i]
702         ror     x0,x11,#1
703         and     x17,x27,x26
704         ror     x15,x8,#19
705         bic     x19,x20,x26
706         ror     x1,x22,#28
707         add     x21,x21,x9                      // h+=X[i]
708         eor     x16,x16,x26,ror#18
709         eor     x0,x0,x11,ror#8
710         orr     x17,x17,x19                     // Ch(e,f,g)
711         eor     x19,x22,x23                     // a^b, b^c in next round
712         eor     x16,x16,x26,ror#41      // Sigma1(e)
713         eor     x1,x1,x22,ror#34
714         add     x21,x21,x17                     // h+=Ch(e,f,g)
715         and     x28,x28,x19                     // (b^c)&=(a^b)
716         eor     x15,x15,x8,ror#61
717         eor     x0,x0,x11,lsr#7 // sigma0(X[i+1])
718         add     x21,x21,x16                     // h+=Sigma1(e)
719         eor     x28,x28,x23                     // Maj(a,b,c)
720         eor     x17,x1,x22,ror#39       // Sigma0(a)
721         eor     x15,x15,x8,lsr#6        // sigma1(X[i+14])
722         add     x10,x10,x3
723         add     x25,x25,x21                     // d+=h
724         add     x21,x21,x28                     // h+=Maj(a,b,c)
725         ldr     x28,[x30],#8            // *K++, x19 in next round
726         add     x10,x10,x0
727         add     x21,x21,x17                     // h+=Sigma0(a)
728         add     x10,x10,x15
729         ldr     x15,[sp,#0]
730         str     x2,[sp,#24]
731         ror     x16,x25,#14
732         add     x20,x20,x28                     // h+=K[i]
733         ror     x1,x12,#1
734         and     x17,x26,x25
735         ror     x0,x9,#19
736         bic     x28,x27,x25
737         ror     x2,x21,#28
738         add     x20,x20,x10                     // h+=X[i]
739         eor     x16,x16,x25,ror#18
740         eor     x1,x1,x12,ror#8
741         orr     x17,x17,x28                     // Ch(e,f,g)
742         eor     x28,x21,x22                     // a^b, b^c in next round
743         eor     x16,x16,x25,ror#41      // Sigma1(e)
744         eor     x2,x2,x21,ror#34
745         add     x20,x20,x17                     // h+=Ch(e,f,g)
746         and     x19,x19,x28                     // (b^c)&=(a^b)
747         eor     x0,x0,x9,ror#61
748         eor     x1,x1,x12,lsr#7 // sigma0(X[i+1])
749         add     x20,x20,x16                     // h+=Sigma1(e)
750         eor     x19,x19,x22                     // Maj(a,b,c)
751         eor     x17,x2,x21,ror#39       // Sigma0(a)
752         eor     x0,x0,x9,lsr#6  // sigma1(X[i+14])
753         add     x11,x11,x4
754         add     x24,x24,x20                     // d+=h
755         add     x20,x20,x19                     // h+=Maj(a,b,c)
756         ldr     x19,[x30],#8            // *K++, x28 in next round
757         add     x11,x11,x1
758         add     x20,x20,x17                     // h+=Sigma0(a)
759         add     x11,x11,x0
760         ldr     x0,[sp,#8]
761         str     x3,[sp,#0]
762         ror     x16,x24,#14
763         add     x27,x27,x19                     // h+=K[i]
764         ror     x2,x13,#1
765         and     x17,x25,x24
766         ror     x1,x10,#19
767         bic     x19,x26,x24
768         ror     x3,x20,#28
769         add     x27,x27,x11                     // h+=X[i]
770         eor     x16,x16,x24,ror#18
771         eor     x2,x2,x13,ror#8
772         orr     x17,x17,x19                     // Ch(e,f,g)
773         eor     x19,x20,x21                     // a^b, b^c in next round
774         eor     x16,x16,x24,ror#41      // Sigma1(e)
775         eor     x3,x3,x20,ror#34
776         add     x27,x27,x17                     // h+=Ch(e,f,g)
777         and     x28,x28,x19                     // (b^c)&=(a^b)
778         eor     x1,x1,x10,ror#61
779         eor     x2,x2,x13,lsr#7 // sigma0(X[i+1])
780         add     x27,x27,x16                     // h+=Sigma1(e)
781         eor     x28,x28,x21                     // Maj(a,b,c)
782         eor     x17,x3,x20,ror#39       // Sigma0(a)
783         eor     x1,x1,x10,lsr#6 // sigma1(X[i+14])
784         add     x12,x12,x5
785         add     x23,x23,x27                     // d+=h
786         add     x27,x27,x28                     // h+=Maj(a,b,c)
787         ldr     x28,[x30],#8            // *K++, x19 in next round
788         add     x12,x12,x2
789         add     x27,x27,x17                     // h+=Sigma0(a)
790         add     x12,x12,x1
791         ldr     x1,[sp,#16]
792         str     x4,[sp,#8]
793         ror     x16,x23,#14
794         add     x26,x26,x28                     // h+=K[i]
795         ror     x3,x14,#1
796         and     x17,x24,x23
797         ror     x2,x11,#19
798         bic     x28,x25,x23
799         ror     x4,x27,#28
800         add     x26,x26,x12                     // h+=X[i]
801         eor     x16,x16,x23,ror#18
802         eor     x3,x3,x14,ror#8
803         orr     x17,x17,x28                     // Ch(e,f,g)
804         eor     x28,x27,x20                     // a^b, b^c in next round
805         eor     x16,x16,x23,ror#41      // Sigma1(e)
806         eor     x4,x4,x27,ror#34
807         add     x26,x26,x17                     // h+=Ch(e,f,g)
808         and     x19,x19,x28                     // (b^c)&=(a^b)
809         eor     x2,x2,x11,ror#61
810         eor     x3,x3,x14,lsr#7 // sigma0(X[i+1])
811         add     x26,x26,x16                     // h+=Sigma1(e)
812         eor     x19,x19,x20                     // Maj(a,b,c)
813         eor     x17,x4,x27,ror#39       // Sigma0(a)
814         eor     x2,x2,x11,lsr#6 // sigma1(X[i+14])
815         add     x13,x13,x6
816         add     x22,x22,x26                     // d+=h
817         add     x26,x26,x19                     // h+=Maj(a,b,c)
818         ldr     x19,[x30],#8            // *K++, x28 in next round
819         add     x13,x13,x3
820         add     x26,x26,x17                     // h+=Sigma0(a)
821         add     x13,x13,x2
822         ldr     x2,[sp,#24]
823         str     x5,[sp,#16]
824         ror     x16,x22,#14
825         add     x25,x25,x19                     // h+=K[i]
826         ror     x4,x15,#1
827         and     x17,x23,x22
828         ror     x3,x12,#19
829         bic     x19,x24,x22
830         ror     x5,x26,#28
831         add     x25,x25,x13                     // h+=X[i]
832         eor     x16,x16,x22,ror#18
833         eor     x4,x4,x15,ror#8
834         orr     x17,x17,x19                     // Ch(e,f,g)
835         eor     x19,x26,x27                     // a^b, b^c in next round
836         eor     x16,x16,x22,ror#41      // Sigma1(e)
837         eor     x5,x5,x26,ror#34
838         add     x25,x25,x17                     // h+=Ch(e,f,g)
839         and     x28,x28,x19                     // (b^c)&=(a^b)
840         eor     x3,x3,x12,ror#61
841         eor     x4,x4,x15,lsr#7 // sigma0(X[i+1])
842         add     x25,x25,x16                     // h+=Sigma1(e)
843         eor     x28,x28,x27                     // Maj(a,b,c)
844         eor     x17,x5,x26,ror#39       // Sigma0(a)
845         eor     x3,x3,x12,lsr#6 // sigma1(X[i+14])
846         add     x14,x14,x7
847         add     x21,x21,x25                     // d+=h
848         add     x25,x25,x28                     // h+=Maj(a,b,c)
849         ldr     x28,[x30],#8            // *K++, x19 in next round
850         add     x14,x14,x4
851         add     x25,x25,x17                     // h+=Sigma0(a)
852         add     x14,x14,x3
853         ldr     x3,[sp,#0]
854         str     x6,[sp,#24]
855         ror     x16,x21,#14
856         add     x24,x24,x28                     // h+=K[i]
857         ror     x5,x0,#1
858         and     x17,x22,x21
859         ror     x4,x13,#19
860         bic     x28,x23,x21
861         ror     x6,x25,#28
862         add     x24,x24,x14                     // h+=X[i]
863         eor     x16,x16,x21,ror#18
864         eor     x5,x5,x0,ror#8
865         orr     x17,x17,x28                     // Ch(e,f,g)
866         eor     x28,x25,x26                     // a^b, b^c in next round
867         eor     x16,x16,x21,ror#41      // Sigma1(e)
868         eor     x6,x6,x25,ror#34
869         add     x24,x24,x17                     // h+=Ch(e,f,g)
870         and     x19,x19,x28                     // (b^c)&=(a^b)
871         eor     x4,x4,x13,ror#61
872         eor     x5,x5,x0,lsr#7  // sigma0(X[i+1])
873         add     x24,x24,x16                     // h+=Sigma1(e)
874         eor     x19,x19,x26                     // Maj(a,b,c)
875         eor     x17,x6,x25,ror#39       // Sigma0(a)
876         eor     x4,x4,x13,lsr#6 // sigma1(X[i+14])
877         add     x15,x15,x8
878         add     x20,x20,x24                     // d+=h
879         add     x24,x24,x19                     // h+=Maj(a,b,c)
880         ldr     x19,[x30],#8            // *K++, x28 in next round
881         add     x15,x15,x5
882         add     x24,x24,x17                     // h+=Sigma0(a)
883         add     x15,x15,x4
884         ldr     x4,[sp,#8]
885         str     x7,[sp,#0]
886         ror     x16,x20,#14
887         add     x23,x23,x19                     // h+=K[i]
888         ror     x6,x1,#1
889         and     x17,x21,x20
890         ror     x5,x14,#19
891         bic     x19,x22,x20
892         ror     x7,x24,#28
893         add     x23,x23,x15                     // h+=X[i]
894         eor     x16,x16,x20,ror#18
895         eor     x6,x6,x1,ror#8
896         orr     x17,x17,x19                     // Ch(e,f,g)
897         eor     x19,x24,x25                     // a^b, b^c in next round
898         eor     x16,x16,x20,ror#41      // Sigma1(e)
899         eor     x7,x7,x24,ror#34
900         add     x23,x23,x17                     // h+=Ch(e,f,g)
901         and     x28,x28,x19                     // (b^c)&=(a^b)
902         eor     x5,x5,x14,ror#61
903         eor     x6,x6,x1,lsr#7  // sigma0(X[i+1])
904         add     x23,x23,x16                     // h+=Sigma1(e)
905         eor     x28,x28,x25                     // Maj(a,b,c)
906         eor     x17,x7,x24,ror#39       // Sigma0(a)
907         eor     x5,x5,x14,lsr#6 // sigma1(X[i+14])
908         add     x0,x0,x9
909         add     x27,x27,x23                     // d+=h
910         add     x23,x23,x28                     // h+=Maj(a,b,c)
911         ldr     x28,[x30],#8            // *K++, x19 in next round
912         add     x0,x0,x6
913         add     x23,x23,x17                     // h+=Sigma0(a)
914         add     x0,x0,x5
915         ldr     x5,[sp,#16]
916         str     x8,[sp,#8]
917         ror     x16,x27,#14
918         add     x22,x22,x28                     // h+=K[i]
919         ror     x7,x2,#1
920         and     x17,x20,x27
921         ror     x6,x15,#19
922         bic     x28,x21,x27
923         ror     x8,x23,#28
924         add     x22,x22,x0                      // h+=X[i]
925         eor     x16,x16,x27,ror#18
926         eor     x7,x7,x2,ror#8
927         orr     x17,x17,x28                     // Ch(e,f,g)
928         eor     x28,x23,x24                     // a^b, b^c in next round
929         eor     x16,x16,x27,ror#41      // Sigma1(e)
930         eor     x8,x8,x23,ror#34
931         add     x22,x22,x17                     // h+=Ch(e,f,g)
932         and     x19,x19,x28                     // (b^c)&=(a^b)
933         eor     x6,x6,x15,ror#61
934         eor     x7,x7,x2,lsr#7  // sigma0(X[i+1])
935         add     x22,x22,x16                     // h+=Sigma1(e)
936         eor     x19,x19,x24                     // Maj(a,b,c)
937         eor     x17,x8,x23,ror#39       // Sigma0(a)
938         eor     x6,x6,x15,lsr#6 // sigma1(X[i+14])
939         add     x1,x1,x10
940         add     x26,x26,x22                     // d+=h
941         add     x22,x22,x19                     // h+=Maj(a,b,c)
942         ldr     x19,[x30],#8            // *K++, x28 in next round
943         add     x1,x1,x7
944         add     x22,x22,x17                     // h+=Sigma0(a)
945         add     x1,x1,x6
946         ldr     x6,[sp,#24]
947         str     x9,[sp,#16]
948         ror     x16,x26,#14
949         add     x21,x21,x19                     // h+=K[i]
950         ror     x8,x3,#1
951         and     x17,x27,x26
952         ror     x7,x0,#19
953         bic     x19,x20,x26
954         ror     x9,x22,#28
955         add     x21,x21,x1                      // h+=X[i]
956         eor     x16,x16,x26,ror#18
957         eor     x8,x8,x3,ror#8
958         orr     x17,x17,x19                     // Ch(e,f,g)
959         eor     x19,x22,x23                     // a^b, b^c in next round
960         eor     x16,x16,x26,ror#41      // Sigma1(e)
961         eor     x9,x9,x22,ror#34
962         add     x21,x21,x17                     // h+=Ch(e,f,g)
963         and     x28,x28,x19                     // (b^c)&=(a^b)
964         eor     x7,x7,x0,ror#61
965         eor     x8,x8,x3,lsr#7  // sigma0(X[i+1])
966         add     x21,x21,x16                     // h+=Sigma1(e)
967         eor     x28,x28,x23                     // Maj(a,b,c)
968         eor     x17,x9,x22,ror#39       // Sigma0(a)
969         eor     x7,x7,x0,lsr#6  // sigma1(X[i+14])
970         add     x2,x2,x11
971         add     x25,x25,x21                     // d+=h
972         add     x21,x21,x28                     // h+=Maj(a,b,c)
973         ldr     x28,[x30],#8            // *K++, x19 in next round
974         add     x2,x2,x8
975         add     x21,x21,x17                     // h+=Sigma0(a)
976         add     x2,x2,x7
977         ldr     x7,[sp,#0]
978         str     x10,[sp,#24]
979         ror     x16,x25,#14
980         add     x20,x20,x28                     // h+=K[i]
981         ror     x9,x4,#1
982         and     x17,x26,x25
983         ror     x8,x1,#19
984         bic     x28,x27,x25
985         ror     x10,x21,#28
986         add     x20,x20,x2                      // h+=X[i]
987         eor     x16,x16,x25,ror#18
988         eor     x9,x9,x4,ror#8
989         orr     x17,x17,x28                     // Ch(e,f,g)
990         eor     x28,x21,x22                     // a^b, b^c in next round
991         eor     x16,x16,x25,ror#41      // Sigma1(e)
992         eor     x10,x10,x21,ror#34
993         add     x20,x20,x17                     // h+=Ch(e,f,g)
994         and     x19,x19,x28                     // (b^c)&=(a^b)
995         eor     x8,x8,x1,ror#61
996         eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
997         add     x20,x20,x16                     // h+=Sigma1(e)
998         eor     x19,x19,x22                     // Maj(a,b,c)
999         eor     x17,x10,x21,ror#39      // Sigma0(a)
1000         eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
1001         add     x3,x3,x12
1002         add     x24,x24,x20                     // d+=h
1003         add     x20,x20,x19                     // h+=Maj(a,b,c)
1004         ldr     x19,[x30],#8            // *K++, x28 in next round
1005         add     x3,x3,x9
1006         add     x20,x20,x17                     // h+=Sigma0(a)
1007         add     x3,x3,x8
1008         cbnz    x19,.Loop_16_xx
1009
1010         ldp     x0,x2,[x29,#96]
1011         ldr     x1,[x29,#112]
1012         sub     x30,x30,#648            // rewind
1013
1014         ldp     x3,x4,[x0]
1015         ldp     x5,x6,[x0,#2*8]
1016         add     x1,x1,#14*8                     // advance input pointer
1017         ldp     x7,x8,[x0,#4*8]
1018         add     x20,x20,x3
1019         ldp     x9,x10,[x0,#6*8]
1020         add     x21,x21,x4
1021         add     x22,x22,x5
1022         add     x23,x23,x6
1023         stp     x20,x21,[x0]
1024         add     x24,x24,x7
1025         add     x25,x25,x8
1026         stp     x22,x23,[x0,#2*8]
1027         add     x26,x26,x9
1028         add     x27,x27,x10
1029         cmp     x1,x2
1030         stp     x24,x25,[x0,#4*8]
1031         stp     x26,x27,[x0,#6*8]
1032         b.ne    .Loop
1033
1034         ldp     x19,x20,[x29,#16]
1035         add     sp,sp,#4*8
1036         ldp     x21,x22,[x29,#32]
1037         ldp     x23,x24,[x29,#48]
1038         ldp     x25,x26,[x29,#64]
1039         ldp     x27,x28,[x29,#80]
1040         ldp     x29,x30,[sp],#128
1041 .inst   0xd50323bf                              // autiasp
1042         ret
1043 .size   sha512_block_data_order,.-sha512_block_data_order
1044
// .LK512: table of the 80 SHA-512 round constants K[0..79], stored as
// 64-bit words in round order and followed by a single zero word.
//
// Layout facts the surrounding code relies on:
//   - 80 constants plus the zero word make 81 quads, 648 bytes in total.
//     The scalar block function subtracts exactly 648 from x30 to rewind
//     its constant pointer after the last round.
//   - The scalar round loop loads one constant per round with a
//     post-incremented load through x30 and leaves the loop (cbnz) once
//     the word it just loaded is zero, so the zero word must stay last
//     and no real constant may be zero.
//   - The hardware path takes the table address with adr x3,.LK512 and
//     reads it 16 bytes (two constants) at a time.
// NOTE(review): the instruction that points x30 at this table sits at the
// top of the scalar function, before the rounds shown here. Confirm it
// targets .LK512.
1045 .align  6      // 2^6 = 64-byte alignment for the table start
1046 .type   .LK512,%object
1047 .LK512:
1048 .quad   0x428a2f98d728ae22,0x7137449123ef65cd      // K[0], K[1]
1049 .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1050 .quad   0x3956c25bf348b538,0x59f111f1b605d019
1051 .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
1052 .quad   0xd807aa98a3030242,0x12835b0145706fbe
1053 .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1054 .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
1055 .quad   0x9bdc06a725c71235,0xc19bf174cf692694
1056 .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
1057 .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1058 .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
1059 .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1060 .quad   0x983e5152ee66dfab,0xa831c66d2db43210
1061 .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
1062 .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
1063 .quad   0x06ca6351e003826f,0x142929670a0e6e70
1064 .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
1065 .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1066 .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
1067 .quad   0x81c2c92e47edaee6,0x92722c851482353b
1068 .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
1069 .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
1070 .quad   0xd192e819d6ef5218,0xd69906245565a910
1071 .quad   0xf40e35855771202a,0x106aa07032bbd1b8
1072 .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
1073 .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1074 .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1075 .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1076 .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
1077 .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
1078 .quad   0x90befffa23631e28,0xa4506cebde82bde9
1079 .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
1080 .quad   0xca273eceea26619c,0xd186b8c721c0c207
1081 .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1082 .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
1083 .quad   0x113f9804bef90dae,0x1b710b35131c471b
1084 .quad   0x28db77f523047d84,0x32caab7b40c72493
1085 .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1086 .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1087 .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817      // K[78], K[79]
1088 .quad   0       // terminator: the scalar round loop stops when it loads this zero
1089 .size   .LK512,.-.LK512
// .LOPENSSL_armcap_P: one data word holding the distance from this word
// to the external symbol OPENSSL_armcap_P (symbol minus current location),
// so no absolute address is embedded in the object. It is assembled only
// when __KERNEL__ is not defined.
// NOTE(review): the code that reads this word is not part of the rounds
// and tables around it. Presumably the function entry adds the address of
// this word to the stored offset to locate the CPU capability flags and
// pick the hardware path. Confirm against the entry sequence.
1090 #ifndef __KERNEL__
1091 .align  3      // 2^3 = 8-byte alignment, enough for the .quad variant
1092 .LOPENSSL_armcap_P:
1093 # ifdef __ILP32__
1094 .long   OPENSSL_armcap_P-.     // ILP32 build: 32-bit offset
1095 # else
1096 .quad   OPENSSL_armcap_P-.     // otherwise: 64-bit offset
1097 # endif
1098 #endif
// Identification string embedded in the object, 68 bytes including the
// terminating NUL. The bytes are ASCII for:
//   "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// It carries no label, so code cannot refer to it by name.
1099 .byte   83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1100 .align  2      // 2^2 = 4-byte alignment for whatever follows the string
1101 .align  2      // repeated directive: already aligned, so it adds no padding
1102 #ifndef __KERNEL__
1103 .type   sha512_block_armv8,%function
1104 .align  6
1105 sha512_block_armv8:
1106 .Lv8_entry:
1107         stp     x29,x30,[sp,#-16]!
1108         add     x29,sp,#0
1109
1110         ld1     {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64      // load input
1111         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1112
1113         ld1     {v0.2d,v1.2d,v2.2d,v3.2d},[x0]          // load context
1114         adr     x3,.LK512
1115
1116         rev64   v16.16b,v16.16b
1117         rev64   v17.16b,v17.16b
1118         rev64   v18.16b,v18.16b
1119         rev64   v19.16b,v19.16b
1120         rev64   v20.16b,v20.16b
1121         rev64   v21.16b,v21.16b
1122         rev64   v22.16b,v22.16b
1123         rev64   v23.16b,v23.16b
1124         b       .Loop_hw
1125
1126 .align  4
1127 .Loop_hw:
1128         ld1     {v24.2d},[x3],#16
1129         subs    x2,x2,#1
1130         sub     x4,x1,#128
1131         orr     v26.16b,v0.16b,v0.16b                   // offload
1132         orr     v27.16b,v1.16b,v1.16b
1133         orr     v28.16b,v2.16b,v2.16b
1134         orr     v29.16b,v3.16b,v3.16b
1135         csel    x1,x1,x4,ne                     // conditional rewind
1136         add     v24.2d,v24.2d,v16.2d
1137         ld1     {v25.2d},[x3],#16
1138         ext     v24.16b,v24.16b,v24.16b,#8
1139         ext     v5.16b,v2.16b,v3.16b,#8
1140         ext     v6.16b,v1.16b,v2.16b,#8
1141         add     v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1142 .inst   0xcec08230      //sha512su0 v16.16b,v17.16b
1143         ext     v7.16b,v20.16b,v21.16b,#8
1144 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1145 .inst   0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1146         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1147 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1148         add     v25.2d,v25.2d,v17.2d
1149         ld1     {v24.2d},[x3],#16
1150         ext     v25.16b,v25.16b,v25.16b,#8
1151         ext     v5.16b,v4.16b,v2.16b,#8
1152         ext     v6.16b,v0.16b,v4.16b,#8
1153         add     v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1154 .inst   0xcec08251      //sha512su0 v17.16b,v18.16b
1155         ext     v7.16b,v21.16b,v22.16b,#8
1156 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1157 .inst   0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1158         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1159 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1160         add     v24.2d,v24.2d,v18.2d
1161         ld1     {v25.2d},[x3],#16
1162         ext     v24.16b,v24.16b,v24.16b,#8
1163         ext     v5.16b,v1.16b,v4.16b,#8
1164         ext     v6.16b,v3.16b,v1.16b,#8
1165         add     v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1166 .inst   0xcec08272      //sha512su0 v18.16b,v19.16b
1167         ext     v7.16b,v22.16b,v23.16b,#8
1168 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1169 .inst   0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1170         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1171 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1172         add     v25.2d,v25.2d,v19.2d
1173         ld1     {v24.2d},[x3],#16
1174         ext     v25.16b,v25.16b,v25.16b,#8
1175         ext     v5.16b,v0.16b,v1.16b,#8
1176         ext     v6.16b,v2.16b,v0.16b,#8
1177         add     v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1178 .inst   0xcec08293      //sha512su0 v19.16b,v20.16b
1179         ext     v7.16b,v23.16b,v16.16b,#8
1180 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1181 .inst   0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1182         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1183 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1184         add     v24.2d,v24.2d,v20.2d
1185         ld1     {v25.2d},[x3],#16
1186         ext     v24.16b,v24.16b,v24.16b,#8
1187         ext     v5.16b,v3.16b,v0.16b,#8
1188         ext     v6.16b,v4.16b,v3.16b,#8
1189         add     v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1190 .inst   0xcec082b4      //sha512su0 v20.16b,v21.16b
1191         ext     v7.16b,v16.16b,v17.16b,#8
1192 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1193 .inst   0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1194         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1195 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1196         add     v25.2d,v25.2d,v21.2d
1197         ld1     {v24.2d},[x3],#16
1198         ext     v25.16b,v25.16b,v25.16b,#8
1199         ext     v5.16b,v2.16b,v3.16b,#8
1200         ext     v6.16b,v1.16b,v2.16b,#8
1201         add     v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1202 .inst   0xcec082d5      //sha512su0 v21.16b,v22.16b
1203         ext     v7.16b,v17.16b,v18.16b,#8
1204 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1205 .inst   0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1206         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1207 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1208         add     v24.2d,v24.2d,v22.2d
1209         ld1     {v25.2d},[x3],#16
1210         ext     v24.16b,v24.16b,v24.16b,#8
1211         ext     v5.16b,v4.16b,v2.16b,#8
1212         ext     v6.16b,v0.16b,v4.16b,#8
1213         add     v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1214 .inst   0xcec082f6      //sha512su0 v22.16b,v23.16b
1215         ext     v7.16b,v18.16b,v19.16b,#8
1216 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1217 .inst   0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1218         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1219 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1220         add     v25.2d,v25.2d,v23.2d
1221         ld1     {v24.2d},[x3],#16
1222         ext     v25.16b,v25.16b,v25.16b,#8
1223         ext     v5.16b,v1.16b,v4.16b,#8
1224         ext     v6.16b,v3.16b,v1.16b,#8
1225         add     v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1226 .inst   0xcec08217      //sha512su0 v23.16b,v16.16b
1227         ext     v7.16b,v19.16b,v20.16b,#8
1228 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1229 .inst   0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1230         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1231 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1232         add     v24.2d,v24.2d,v16.2d
1233         ld1     {v25.2d},[x3],#16
1234         ext     v24.16b,v24.16b,v24.16b,#8
1235         ext     v5.16b,v0.16b,v1.16b,#8
1236         ext     v6.16b,v2.16b,v0.16b,#8
1237         add     v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1238 .inst   0xcec08230      //sha512su0 v16.16b,v17.16b
1239         ext     v7.16b,v20.16b,v21.16b,#8
1240 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1241 .inst   0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1242         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1243 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1244         add     v25.2d,v25.2d,v17.2d
1245         ld1     {v24.2d},[x3],#16
1246         ext     v25.16b,v25.16b,v25.16b,#8
1247         ext     v5.16b,v3.16b,v0.16b,#8
1248         ext     v6.16b,v4.16b,v3.16b,#8
1249         add     v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1250 .inst   0xcec08251      //sha512su0 v17.16b,v18.16b
1251         ext     v7.16b,v21.16b,v22.16b,#8
1252 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1253 .inst   0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1254         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1255 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1256         add     v24.2d,v24.2d,v18.2d
1257         ld1     {v25.2d},[x3],#16
1258         ext     v24.16b,v24.16b,v24.16b,#8
1259         ext     v5.16b,v2.16b,v3.16b,#8
1260         ext     v6.16b,v1.16b,v2.16b,#8
1261         add     v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1262 .inst   0xcec08272      //sha512su0 v18.16b,v19.16b
1263         ext     v7.16b,v22.16b,v23.16b,#8
1264 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1265 .inst   0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1266         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1267 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1268         add     v25.2d,v25.2d,v19.2d
1269         ld1     {v24.2d},[x3],#16
1270         ext     v25.16b,v25.16b,v25.16b,#8
1271         ext     v5.16b,v4.16b,v2.16b,#8
1272         ext     v6.16b,v0.16b,v4.16b,#8
1273         add     v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1274 .inst   0xcec08293      //sha512su0 v19.16b,v20.16b
1275         ext     v7.16b,v23.16b,v16.16b,#8
1276 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1277 .inst   0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1278         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1279 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1280         add     v24.2d,v24.2d,v20.2d
1281         ld1     {v25.2d},[x3],#16
1282         ext     v24.16b,v24.16b,v24.16b,#8
1283         ext     v5.16b,v1.16b,v4.16b,#8
1284         ext     v6.16b,v3.16b,v1.16b,#8
1285         add     v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1286 .inst   0xcec082b4      //sha512su0 v20.16b,v21.16b
1287         ext     v7.16b,v16.16b,v17.16b,#8
1288 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1289 .inst   0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1290         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1291 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1292         add     v25.2d,v25.2d,v21.2d
1293         ld1     {v24.2d},[x3],#16
1294         ext     v25.16b,v25.16b,v25.16b,#8
1295         ext     v5.16b,v0.16b,v1.16b,#8
1296         ext     v6.16b,v2.16b,v0.16b,#8
1297         add     v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1298 .inst   0xcec082d5      //sha512su0 v21.16b,v22.16b
1299         ext     v7.16b,v17.16b,v18.16b,#8
1300 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1301 .inst   0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1302         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1303 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1304         add     v24.2d,v24.2d,v22.2d
1305         ld1     {v25.2d},[x3],#16
1306         ext     v24.16b,v24.16b,v24.16b,#8
1307         ext     v5.16b,v3.16b,v0.16b,#8
1308         ext     v6.16b,v4.16b,v3.16b,#8
1309         add     v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1310 .inst   0xcec082f6      //sha512su0 v22.16b,v23.16b
1311         ext     v7.16b,v18.16b,v19.16b,#8
1312 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1313 .inst   0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1314         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1315 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1316         add     v25.2d,v25.2d,v23.2d
1317         ld1     {v24.2d},[x3],#16
1318         ext     v25.16b,v25.16b,v25.16b,#8
1319         ext     v5.16b,v2.16b,v3.16b,#8
1320         ext     v6.16b,v1.16b,v2.16b,#8
1321         add     v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1322 .inst   0xcec08217      //sha512su0 v23.16b,v16.16b
1323         ext     v7.16b,v19.16b,v20.16b,#8
1324 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1325 .inst   0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1326         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1327 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1328         add     v24.2d,v24.2d,v16.2d
1329         ld1     {v25.2d},[x3],#16
1330         ext     v24.16b,v24.16b,v24.16b,#8
1331         ext     v5.16b,v4.16b,v2.16b,#8
1332         ext     v6.16b,v0.16b,v4.16b,#8
1333         add     v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1334 .inst   0xcec08230      //sha512su0 v16.16b,v17.16b
1335         ext     v7.16b,v20.16b,v21.16b,#8
1336 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1337 .inst   0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1338         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1339 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1340         add     v25.2d,v25.2d,v17.2d
1341         ld1     {v24.2d},[x3],#16
1342         ext     v25.16b,v25.16b,v25.16b,#8
1343         ext     v5.16b,v1.16b,v4.16b,#8
1344         ext     v6.16b,v3.16b,v1.16b,#8
1345         add     v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1346 .inst   0xcec08251      //sha512su0 v17.16b,v18.16b
1347         ext     v7.16b,v21.16b,v22.16b,#8
1348 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1349 .inst   0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1350         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1351 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1352         add     v24.2d,v24.2d,v18.2d
1353         ld1     {v25.2d},[x3],#16
1354         ext     v24.16b,v24.16b,v24.16b,#8
1355         ext     v5.16b,v0.16b,v1.16b,#8
1356         ext     v6.16b,v2.16b,v0.16b,#8
1357         add     v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1358 .inst   0xcec08272      //sha512su0 v18.16b,v19.16b
1359         ext     v7.16b,v22.16b,v23.16b,#8
1360 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1361 .inst   0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1362         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1363 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1364         add     v25.2d,v25.2d,v19.2d
1365         ld1     {v24.2d},[x3],#16
1366         ext     v25.16b,v25.16b,v25.16b,#8
1367         ext     v5.16b,v3.16b,v0.16b,#8
1368         ext     v6.16b,v4.16b,v3.16b,#8
1369         add     v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1370 .inst   0xcec08293      //sha512su0 v19.16b,v20.16b
1371         ext     v7.16b,v23.16b,v16.16b,#8
1372 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1373 .inst   0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1374         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1375 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1376         add     v24.2d,v24.2d,v20.2d
1377         ld1     {v25.2d},[x3],#16
1378         ext     v24.16b,v24.16b,v24.16b,#8
1379         ext     v5.16b,v2.16b,v3.16b,#8
1380         ext     v6.16b,v1.16b,v2.16b,#8
1381         add     v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1382 .inst   0xcec082b4      //sha512su0 v20.16b,v21.16b
1383         ext     v7.16b,v16.16b,v17.16b,#8
1384 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1385 .inst   0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1386         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1387 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1388         add     v25.2d,v25.2d,v21.2d
1389         ld1     {v24.2d},[x3],#16
1390         ext     v25.16b,v25.16b,v25.16b,#8
1391         ext     v5.16b,v4.16b,v2.16b,#8
1392         ext     v6.16b,v0.16b,v4.16b,#8
1393         add     v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1394 .inst   0xcec082d5      //sha512su0 v21.16b,v22.16b
1395         ext     v7.16b,v17.16b,v18.16b,#8
1396 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1397 .inst   0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1398         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1399 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1400         add     v24.2d,v24.2d,v22.2d
1401         ld1     {v25.2d},[x3],#16
1402         ext     v24.16b,v24.16b,v24.16b,#8
1403         ext     v5.16b,v1.16b,v4.16b,#8
1404         ext     v6.16b,v3.16b,v1.16b,#8
1405         add     v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1406 .inst   0xcec082f6      //sha512su0 v22.16b,v23.16b
1407         ext     v7.16b,v18.16b,v19.16b,#8
1408 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1409 .inst   0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1410         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1411 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1412         add     v25.2d,v25.2d,v23.2d
1413         ld1     {v24.2d},[x3],#16
1414         ext     v25.16b,v25.16b,v25.16b,#8
1415         ext     v5.16b,v0.16b,v1.16b,#8
1416         ext     v6.16b,v2.16b,v0.16b,#8
1417         add     v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1418 .inst   0xcec08217      //sha512su0 v23.16b,v16.16b
1419         ext     v7.16b,v19.16b,v20.16b,#8
1420 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1421 .inst   0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1422         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1423 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1424         add     v24.2d,v24.2d,v16.2d
1425         ld1     {v25.2d},[x3],#16
1426         ext     v24.16b,v24.16b,v24.16b,#8
1427         ext     v5.16b,v3.16b,v0.16b,#8
1428         ext     v6.16b,v4.16b,v3.16b,#8
1429         add     v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1430 .inst   0xcec08230      //sha512su0 v16.16b,v17.16b
1431         ext     v7.16b,v20.16b,v21.16b,#8
1432 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1433 .inst   0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1434         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1435 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1436         add     v25.2d,v25.2d,v17.2d
1437         ld1     {v24.2d},[x3],#16
1438         ext     v25.16b,v25.16b,v25.16b,#8
1439         ext     v5.16b,v2.16b,v3.16b,#8
1440         ext     v6.16b,v1.16b,v2.16b,#8
1441         add     v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1442 .inst   0xcec08251      //sha512su0 v17.16b,v18.16b
1443         ext     v7.16b,v21.16b,v22.16b,#8
1444 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1445 .inst   0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1446         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1447 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1448         add     v24.2d,v24.2d,v18.2d
1449         ld1     {v25.2d},[x3],#16
1450         ext     v24.16b,v24.16b,v24.16b,#8
1451         ext     v5.16b,v4.16b,v2.16b,#8
1452         ext     v6.16b,v0.16b,v4.16b,#8
1453         add     v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1454 .inst   0xcec08272      //sha512su0 v18.16b,v19.16b
1455         ext     v7.16b,v22.16b,v23.16b,#8
1456 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1457 .inst   0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1458         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1459 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1460         add     v25.2d,v25.2d,v19.2d
1461         ld1     {v24.2d},[x3],#16
1462         ext     v25.16b,v25.16b,v25.16b,#8
1463         ext     v5.16b,v1.16b,v4.16b,#8
1464         ext     v6.16b,v3.16b,v1.16b,#8
1465         add     v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1466 .inst   0xcec08293      //sha512su0 v19.16b,v20.16b
1467         ext     v7.16b,v23.16b,v16.16b,#8
1468 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1469 .inst   0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1470         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1471 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1472         add     v24.2d,v24.2d,v20.2d
1473         ld1     {v25.2d},[x3],#16
1474         ext     v24.16b,v24.16b,v24.16b,#8
1475         ext     v5.16b,v0.16b,v1.16b,#8
1476         ext     v6.16b,v2.16b,v0.16b,#8
1477         add     v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1478 .inst   0xcec082b4      //sha512su0 v20.16b,v21.16b
1479         ext     v7.16b,v16.16b,v17.16b,#8
1480 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1481 .inst   0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1482         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1483 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1484         add     v25.2d,v25.2d,v21.2d
1485         ld1     {v24.2d},[x3],#16
1486         ext     v25.16b,v25.16b,v25.16b,#8
1487         ext     v5.16b,v3.16b,v0.16b,#8
1488         ext     v6.16b,v4.16b,v3.16b,#8
1489         add     v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1490 .inst   0xcec082d5      //sha512su0 v21.16b,v22.16b
1491         ext     v7.16b,v17.16b,v18.16b,#8
1492 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1493 .inst   0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1494         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1495 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1496         add     v24.2d,v24.2d,v22.2d
1497         ld1     {v25.2d},[x3],#16
1498         ext     v24.16b,v24.16b,v24.16b,#8
1499         ext     v5.16b,v2.16b,v3.16b,#8
1500         ext     v6.16b,v1.16b,v2.16b,#8
1501         add     v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1502 .inst   0xcec082f6      //sha512su0 v22.16b,v23.16b
1503         ext     v7.16b,v18.16b,v19.16b,#8
1504 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1505 .inst   0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1506         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1507 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1508         add     v25.2d,v25.2d,v23.2d
1509         ld1     {v24.2d},[x3],#16
1510         ext     v25.16b,v25.16b,v25.16b,#8
1511         ext     v5.16b,v4.16b,v2.16b,#8
1512         ext     v6.16b,v0.16b,v4.16b,#8
1513         add     v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1514 .inst   0xcec08217      //sha512su0 v23.16b,v16.16b
1515         ext     v7.16b,v19.16b,v20.16b,#8
1516 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1517 .inst   0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1518         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1519 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1520         ld1     {v25.2d},[x3],#16
1521         add     v24.2d,v24.2d,v16.2d
1522         ld1     {v16.16b},[x1],#16              // load next input
1523         ext     v24.16b,v24.16b,v24.16b,#8
1524         ext     v5.16b,v1.16b,v4.16b,#8
1525         ext     v6.16b,v3.16b,v1.16b,#8
1526         add     v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1527 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1528         rev64   v16.16b,v16.16b
1529         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1530 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1531         ld1     {v24.2d},[x3],#16
1532         add     v25.2d,v25.2d,v17.2d
1533         ld1     {v17.16b},[x1],#16              // load next input
1534         ext     v25.16b,v25.16b,v25.16b,#8
1535         ext     v5.16b,v0.16b,v1.16b,#8
1536         ext     v6.16b,v2.16b,v0.16b,#8
1537         add     v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1538 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1539         rev64   v17.16b,v17.16b
1540         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1541 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1542         ld1     {v25.2d},[x3],#16
1543         add     v24.2d,v24.2d,v18.2d
1544         ld1     {v18.16b},[x1],#16              // load next input
1545         ext     v24.16b,v24.16b,v24.16b,#8
1546         ext     v5.16b,v3.16b,v0.16b,#8
1547         ext     v6.16b,v4.16b,v3.16b,#8
1548         add     v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1549 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1550         rev64   v18.16b,v18.16b
1551         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1552 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1553         ld1     {v24.2d},[x3],#16
1554         add     v25.2d,v25.2d,v19.2d
1555         ld1     {v19.16b},[x1],#16              // load next input
1556         ext     v25.16b,v25.16b,v25.16b,#8
1557         ext     v5.16b,v2.16b,v3.16b,#8
1558         ext     v6.16b,v1.16b,v2.16b,#8
1559         add     v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1560 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1561         rev64   v19.16b,v19.16b
1562         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1563 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1564         ld1     {v25.2d},[x3],#16
1565         add     v24.2d,v24.2d,v20.2d
1566         ld1     {v20.16b},[x1],#16              // load next input
1567         ext     v24.16b,v24.16b,v24.16b,#8
1568         ext     v5.16b,v4.16b,v2.16b,#8
1569         ext     v6.16b,v0.16b,v4.16b,#8
1570         add     v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1571 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1572         rev64   v20.16b,v20.16b
1573         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1574 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1575         ld1     {v24.2d},[x3],#16
1576         add     v25.2d,v25.2d,v21.2d
1577         ld1     {v21.16b},[x1],#16              // load next input
1578         ext     v25.16b,v25.16b,v25.16b,#8
1579         ext     v5.16b,v1.16b,v4.16b,#8
1580         ext     v6.16b,v3.16b,v1.16b,#8
1581         add     v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1582 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1583         rev64   v21.16b,v21.16b
1584         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1585 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1586         ld1     {v25.2d},[x3],#16
1587         add     v24.2d,v24.2d,v22.2d
1588         ld1     {v22.16b},[x1],#16              // load next input
1589         ext     v24.16b,v24.16b,v24.16b,#8
1590         ext     v5.16b,v0.16b,v1.16b,#8
1591         ext     v6.16b,v2.16b,v0.16b,#8
1592         add     v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1593 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1594         rev64   v22.16b,v22.16b
1595         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1596 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1597         sub     x3,x3,#80*8     // rewind
1598         add     v25.2d,v25.2d,v23.2d
1599         ld1     {v23.16b},[x1],#16              // load next input
1600         ext     v25.16b,v25.16b,v25.16b,#8
1601         ext     v5.16b,v3.16b,v0.16b,#8
1602         ext     v6.16b,v4.16b,v3.16b,#8
1603         add     v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1604 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1605         rev64   v23.16b,v23.16b
1606         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1607 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1608         add     v0.2d,v0.2d,v26.2d                      // accumulate
1609         add     v1.2d,v1.2d,v27.2d
1610         add     v2.2d,v2.2d,v28.2d
1611         add     v3.2d,v3.2d,v29.2d
1612
1613         cbnz    x2,.Loop_hw
1614
1615         st1     {v0.2d,v1.2d,v2.2d,v3.2d},[x0]          // store context
1616
1617         ldr     x29,[sp],#16
1618         ret
1619 .size   sha512_block_armv8,.-sha512_block_armv8
1620 #endif