]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - secure/lib/libcrypto/aarch64/sha512-armv8.S
Regen assembly files for aarch64.
[FreeBSD/FreeBSD.git] / secure / lib / libcrypto / aarch64 / sha512-armv8.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from sha512-armv8.pl. */
3 // Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
4 //
5 // Licensed under the OpenSSL license (the "License").  You may not use
6 // this file except in compliance with the License.  You can obtain a copy
7 // in the file LICENSE in the source distribution or at
8 // https://www.openssl.org/source/license.html
9
10 // ====================================================================
11 // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 // project. The module is, however, dual licensed under OpenSSL and
13 // CRYPTOGAMS licenses depending on where you obtain it. For further
14 // details see http://www.openssl.org/~appro/cryptogams/.
15 //
16 // Permission to use under GPLv2 terms is granted.
17 // ====================================================================
18 //
19 // SHA256/512 for ARMv8.
20 //
21 // Performance in cycles per processed byte and improvement coefficient
22 // over code generated with "default" compiler:
23 //
24 //              SHA256-hw       SHA256(*)       SHA512
25 // Apple A7     1.97            10.5 (+33%)     6.73 (-1%(**))
26 // Cortex-A53   2.38            15.5 (+115%)    10.0 (+150%(***))
27 // Cortex-A57   2.31            11.6 (+86%)     7.51 (+260%(***))
28 // Denver       2.01            10.5 (+26%)     6.70 (+8%)
29 // X-Gene                       20.0 (+100%)    12.8 (+300%(***))
30 // Mongoose     2.36            13.0 (+50%)     8.36 (+33%)
31 // Kryo         1.92            17.4 (+30%)     11.2 (+8%)
32 //
33 // (*)  Software SHA256 results are of lesser relevance, presented
34 //      mostly for informational purposes.
35 // (**) The result is a trade-off: it's possible to improve it by
36 //      10% (or by 1 cycle per round), but at the cost of 20% loss
37 //      on Cortex-A53 (or by 4 cycles per round).
38 // (***)        Super-impressive coefficients over gcc-generated code are
39 //      indication of some compiler "pathology", most notably code
40 //      generated with -mgeneral-regs-only is significantly faster
41 //      and the gap is only 40-90%.
42 //
43 // October 2016.
44 //
45 // Originally it was reckoned that it makes no sense to implement NEON
46 // version of SHA256 for 64-bit processors. This is because performance
47 // improvement on most wide-spread Cortex-A5x processors was observed
48 // to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49 // observed that 32-bit NEON SHA256 performs significantly better than
50 // 64-bit scalar version on *some* of the more recent processors. As a
51 // result 64-bit NEON version of SHA256 was added to provide best
52 // all-round performance. For example it executes ~30% faster on X-Gene
53 // and Mongoose. [For reference, NEON version of SHA512 is bound to
54 // deliver much less improvement, likely *negative* on Cortex-A5x.
55 // Which is why NEON support is limited to SHA256.]
56
57 #ifndef __KERNEL__
58 # include "arm_arch.h"
59 #endif
60
61 .text
62
63
64 .globl  sha512_block_data_order
65 .type   sha512_block_data_order,%function
66 .align  6
67 sha512_block_data_order:
68 #ifndef __KERNEL__
69 # ifdef __ILP32__
70         ldrsw   x16,.LOPENSSL_armcap_P
71 # else
72         ldr     x16,.LOPENSSL_armcap_P
73 # endif
74         adr     x17,.LOPENSSL_armcap_P
75         add     x16,x16,x17
76         ldr     w16,[x16]
77         tst     w16,#ARMV8_SHA512
78         b.ne    .Lv8_entry
79 #endif
80         stp     x29,x30,[sp,#-128]!
81         add     x29,sp,#0
82
83         stp     x19,x20,[sp,#16]
84         stp     x21,x22,[sp,#32]
85         stp     x23,x24,[sp,#48]
86         stp     x25,x26,[sp,#64]
87         stp     x27,x28,[sp,#80]
88         sub     sp,sp,#4*8
89
90         ldp     x20,x21,[x0]                            // load context
91         ldp     x22,x23,[x0,#2*8]
92         ldp     x24,x25,[x0,#4*8]
93         add     x2,x1,x2,lsl#7  // end of input
94         ldp     x26,x27,[x0,#6*8]
95         adr     x30,.LK512
96         stp     x0,x2,[x29,#96]
97
98 .Loop:
99         ldp     x3,x4,[x1],#2*8
100         ldr     x19,[x30],#8                    // *K++
101         eor     x28,x21,x22                             // magic seed
102         str     x1,[x29,#112]
103 #ifndef __AARCH64EB__
104         rev     x3,x3                   // 0
105 #endif
106         ror     x16,x24,#14
107         add     x27,x27,x19                     // h+=K[i]
108         eor     x6,x24,x24,ror#23
109         and     x17,x25,x24
110         bic     x19,x26,x24
111         add     x27,x27,x3                      // h+=X[i]
112         orr     x17,x17,x19                     // Ch(e,f,g)
113         eor     x19,x20,x21                     // a^b, b^c in next round
114         eor     x16,x16,x6,ror#18       // Sigma1(e)
115         ror     x6,x20,#28
116         add     x27,x27,x17                     // h+=Ch(e,f,g)
117         eor     x17,x20,x20,ror#5
118         add     x27,x27,x16                     // h+=Sigma1(e)
119         and     x28,x28,x19                     // (b^c)&=(a^b)
120         add     x23,x23,x27                     // d+=h
121         eor     x28,x28,x21                     // Maj(a,b,c)
122         eor     x17,x6,x17,ror#34       // Sigma0(a)
123         add     x27,x27,x28                     // h+=Maj(a,b,c)
124         ldr     x28,[x30],#8            // *K++, x19 in next round
125         //add   x27,x27,x17                     // h+=Sigma0(a)
126 #ifndef __AARCH64EB__
127         rev     x4,x4                   // 1
128 #endif
129         ldp     x5,x6,[x1],#2*8
130         add     x27,x27,x17                     // h+=Sigma0(a)
131         ror     x16,x23,#14
132         add     x26,x26,x28                     // h+=K[i]
133         eor     x7,x23,x23,ror#23
134         and     x17,x24,x23
135         bic     x28,x25,x23
136         add     x26,x26,x4                      // h+=X[i]
137         orr     x17,x17,x28                     // Ch(e,f,g)
138         eor     x28,x27,x20                     // a^b, b^c in next round
139         eor     x16,x16,x7,ror#18       // Sigma1(e)
140         ror     x7,x27,#28
141         add     x26,x26,x17                     // h+=Ch(e,f,g)
142         eor     x17,x27,x27,ror#5
143         add     x26,x26,x16                     // h+=Sigma1(e)
144         and     x19,x19,x28                     // (b^c)&=(a^b)
145         add     x22,x22,x26                     // d+=h
146         eor     x19,x19,x20                     // Maj(a,b,c)
147         eor     x17,x7,x17,ror#34       // Sigma0(a)
148         add     x26,x26,x19                     // h+=Maj(a,b,c)
149         ldr     x19,[x30],#8            // *K++, x28 in next round
150         //add   x26,x26,x17                     // h+=Sigma0(a)
151 #ifndef __AARCH64EB__
152         rev     x5,x5                   // 2
153 #endif
154         add     x26,x26,x17                     // h+=Sigma0(a)
155         ror     x16,x22,#14
156         add     x25,x25,x19                     // h+=K[i]
157         eor     x8,x22,x22,ror#23
158         and     x17,x23,x22
159         bic     x19,x24,x22
160         add     x25,x25,x5                      // h+=X[i]
161         orr     x17,x17,x19                     // Ch(e,f,g)
162         eor     x19,x26,x27                     // a^b, b^c in next round
163         eor     x16,x16,x8,ror#18       // Sigma1(e)
164         ror     x8,x26,#28
165         add     x25,x25,x17                     // h+=Ch(e,f,g)
166         eor     x17,x26,x26,ror#5
167         add     x25,x25,x16                     // h+=Sigma1(e)
168         and     x28,x28,x19                     // (b^c)&=(a^b)
169         add     x21,x21,x25                     // d+=h
170         eor     x28,x28,x27                     // Maj(a,b,c)
171         eor     x17,x8,x17,ror#34       // Sigma0(a)
172         add     x25,x25,x28                     // h+=Maj(a,b,c)
173         ldr     x28,[x30],#8            // *K++, x19 in next round
174         //add   x25,x25,x17                     // h+=Sigma0(a)
175 #ifndef __AARCH64EB__
176         rev     x6,x6                   // 3
177 #endif
178         ldp     x7,x8,[x1],#2*8
179         add     x25,x25,x17                     // h+=Sigma0(a)
180         ror     x16,x21,#14
181         add     x24,x24,x28                     // h+=K[i]
182         eor     x9,x21,x21,ror#23
183         and     x17,x22,x21
184         bic     x28,x23,x21
185         add     x24,x24,x6                      // h+=X[i]
186         orr     x17,x17,x28                     // Ch(e,f,g)
187         eor     x28,x25,x26                     // a^b, b^c in next round
188         eor     x16,x16,x9,ror#18       // Sigma1(e)
189         ror     x9,x25,#28
190         add     x24,x24,x17                     // h+=Ch(e,f,g)
191         eor     x17,x25,x25,ror#5
192         add     x24,x24,x16                     // h+=Sigma1(e)
193         and     x19,x19,x28                     // (b^c)&=(a^b)
194         add     x20,x20,x24                     // d+=h
195         eor     x19,x19,x26                     // Maj(a,b,c)
196         eor     x17,x9,x17,ror#34       // Sigma0(a)
197         add     x24,x24,x19                     // h+=Maj(a,b,c)
198         ldr     x19,[x30],#8            // *K++, x28 in next round
199         //add   x24,x24,x17                     // h+=Sigma0(a)
200 #ifndef __AARCH64EB__
201         rev     x7,x7                   // 4
202 #endif
203         add     x24,x24,x17                     // h+=Sigma0(a)
204         ror     x16,x20,#14
205         add     x23,x23,x19                     // h+=K[i]
206         eor     x10,x20,x20,ror#23
207         and     x17,x21,x20
208         bic     x19,x22,x20
209         add     x23,x23,x7                      // h+=X[i]
210         orr     x17,x17,x19                     // Ch(e,f,g)
211         eor     x19,x24,x25                     // a^b, b^c in next round
212         eor     x16,x16,x10,ror#18      // Sigma1(e)
213         ror     x10,x24,#28
214         add     x23,x23,x17                     // h+=Ch(e,f,g)
215         eor     x17,x24,x24,ror#5
216         add     x23,x23,x16                     // h+=Sigma1(e)
217         and     x28,x28,x19                     // (b^c)&=(a^b)
218         add     x27,x27,x23                     // d+=h
219         eor     x28,x28,x25                     // Maj(a,b,c)
220         eor     x17,x10,x17,ror#34      // Sigma0(a)
221         add     x23,x23,x28                     // h+=Maj(a,b,c)
222         ldr     x28,[x30],#8            // *K++, x19 in next round
223         //add   x23,x23,x17                     // h+=Sigma0(a)
224 #ifndef __AARCH64EB__
225         rev     x8,x8                   // 5
226 #endif
227         ldp     x9,x10,[x1],#2*8
228         add     x23,x23,x17                     // h+=Sigma0(a)
229         ror     x16,x27,#14
230         add     x22,x22,x28                     // h+=K[i]
231         eor     x11,x27,x27,ror#23
232         and     x17,x20,x27
233         bic     x28,x21,x27
234         add     x22,x22,x8                      // h+=X[i]
235         orr     x17,x17,x28                     // Ch(e,f,g)
236         eor     x28,x23,x24                     // a^b, b^c in next round
237         eor     x16,x16,x11,ror#18      // Sigma1(e)
238         ror     x11,x23,#28
239         add     x22,x22,x17                     // h+=Ch(e,f,g)
240         eor     x17,x23,x23,ror#5
241         add     x22,x22,x16                     // h+=Sigma1(e)
242         and     x19,x19,x28                     // (b^c)&=(a^b)
243         add     x26,x26,x22                     // d+=h
244         eor     x19,x19,x24                     // Maj(a,b,c)
245         eor     x17,x11,x17,ror#34      // Sigma0(a)
246         add     x22,x22,x19                     // h+=Maj(a,b,c)
247         ldr     x19,[x30],#8            // *K++, x28 in next round
248         //add   x22,x22,x17                     // h+=Sigma0(a)
249 #ifndef __AARCH64EB__
250         rev     x9,x9                   // 6
251 #endif
252         add     x22,x22,x17                     // h+=Sigma0(a)
253         ror     x16,x26,#14
254         add     x21,x21,x19                     // h+=K[i]
255         eor     x12,x26,x26,ror#23
256         and     x17,x27,x26
257         bic     x19,x20,x26
258         add     x21,x21,x9                      // h+=X[i]
259         orr     x17,x17,x19                     // Ch(e,f,g)
260         eor     x19,x22,x23                     // a^b, b^c in next round
261         eor     x16,x16,x12,ror#18      // Sigma1(e)
262         ror     x12,x22,#28
263         add     x21,x21,x17                     // h+=Ch(e,f,g)
264         eor     x17,x22,x22,ror#5
265         add     x21,x21,x16                     // h+=Sigma1(e)
266         and     x28,x28,x19                     // (b^c)&=(a^b)
267         add     x25,x25,x21                     // d+=h
268         eor     x28,x28,x23                     // Maj(a,b,c)
269         eor     x17,x12,x17,ror#34      // Sigma0(a)
270         add     x21,x21,x28                     // h+=Maj(a,b,c)
271         ldr     x28,[x30],#8            // *K++, x19 in next round
272         //add   x21,x21,x17                     // h+=Sigma0(a)
273 #ifndef __AARCH64EB__
274         rev     x10,x10                 // 7
275 #endif
276         ldp     x11,x12,[x1],#2*8
277         add     x21,x21,x17                     // h+=Sigma0(a)
278         ror     x16,x25,#14
279         add     x20,x20,x28                     // h+=K[i]
280         eor     x13,x25,x25,ror#23
281         and     x17,x26,x25
282         bic     x28,x27,x25
283         add     x20,x20,x10                     // h+=X[i]
284         orr     x17,x17,x28                     // Ch(e,f,g)
285         eor     x28,x21,x22                     // a^b, b^c in next round
286         eor     x16,x16,x13,ror#18      // Sigma1(e)
287         ror     x13,x21,#28
288         add     x20,x20,x17                     // h+=Ch(e,f,g)
289         eor     x17,x21,x21,ror#5
290         add     x20,x20,x16                     // h+=Sigma1(e)
291         and     x19,x19,x28                     // (b^c)&=(a^b)
292         add     x24,x24,x20                     // d+=h
293         eor     x19,x19,x22                     // Maj(a,b,c)
294         eor     x17,x13,x17,ror#34      // Sigma0(a)
295         add     x20,x20,x19                     // h+=Maj(a,b,c)
296         ldr     x19,[x30],#8            // *K++, x28 in next round
297         //add   x20,x20,x17                     // h+=Sigma0(a)
298 #ifndef __AARCH64EB__
299         rev     x11,x11                 // 8
300 #endif
301         add     x20,x20,x17                     // h+=Sigma0(a)
302         ror     x16,x24,#14
303         add     x27,x27,x19                     // h+=K[i]
304         eor     x14,x24,x24,ror#23
305         and     x17,x25,x24
306         bic     x19,x26,x24
307         add     x27,x27,x11                     // h+=X[i]
308         orr     x17,x17,x19                     // Ch(e,f,g)
309         eor     x19,x20,x21                     // a^b, b^c in next round
310         eor     x16,x16,x14,ror#18      // Sigma1(e)
311         ror     x14,x20,#28
312         add     x27,x27,x17                     // h+=Ch(e,f,g)
313         eor     x17,x20,x20,ror#5
314         add     x27,x27,x16                     // h+=Sigma1(e)
315         and     x28,x28,x19                     // (b^c)&=(a^b)
316         add     x23,x23,x27                     // d+=h
317         eor     x28,x28,x21                     // Maj(a,b,c)
318         eor     x17,x14,x17,ror#34      // Sigma0(a)
319         add     x27,x27,x28                     // h+=Maj(a,b,c)
320         ldr     x28,[x30],#8            // *K++, x19 in next round
321         //add   x27,x27,x17                     // h+=Sigma0(a)
322 #ifndef __AARCH64EB__
323         rev     x12,x12                 // 9
324 #endif
325         ldp     x13,x14,[x1],#2*8
326         add     x27,x27,x17                     // h+=Sigma0(a)
327         ror     x16,x23,#14
328         add     x26,x26,x28                     // h+=K[i]
329         eor     x15,x23,x23,ror#23
330         and     x17,x24,x23
331         bic     x28,x25,x23
332         add     x26,x26,x12                     // h+=X[i]
333         orr     x17,x17,x28                     // Ch(e,f,g)
334         eor     x28,x27,x20                     // a^b, b^c in next round
335         eor     x16,x16,x15,ror#18      // Sigma1(e)
336         ror     x15,x27,#28
337         add     x26,x26,x17                     // h+=Ch(e,f,g)
338         eor     x17,x27,x27,ror#5
339         add     x26,x26,x16                     // h+=Sigma1(e)
340         and     x19,x19,x28                     // (b^c)&=(a^b)
341         add     x22,x22,x26                     // d+=h
342         eor     x19,x19,x20                     // Maj(a,b,c)
343         eor     x17,x15,x17,ror#34      // Sigma0(a)
344         add     x26,x26,x19                     // h+=Maj(a,b,c)
345         ldr     x19,[x30],#8            // *K++, x28 in next round
346         //add   x26,x26,x17                     // h+=Sigma0(a)
347 #ifndef __AARCH64EB__
348         rev     x13,x13                 // 10
349 #endif
350         add     x26,x26,x17                     // h+=Sigma0(a)
351         ror     x16,x22,#14
352         add     x25,x25,x19                     // h+=K[i]
353         eor     x0,x22,x22,ror#23
354         and     x17,x23,x22
355         bic     x19,x24,x22
356         add     x25,x25,x13                     // h+=X[i]
357         orr     x17,x17,x19                     // Ch(e,f,g)
358         eor     x19,x26,x27                     // a^b, b^c in next round
359         eor     x16,x16,x0,ror#18       // Sigma1(e)
360         ror     x0,x26,#28
361         add     x25,x25,x17                     // h+=Ch(e,f,g)
362         eor     x17,x26,x26,ror#5
363         add     x25,x25,x16                     // h+=Sigma1(e)
364         and     x28,x28,x19                     // (b^c)&=(a^b)
365         add     x21,x21,x25                     // d+=h
366         eor     x28,x28,x27                     // Maj(a,b,c)
367         eor     x17,x0,x17,ror#34       // Sigma0(a)
368         add     x25,x25,x28                     // h+=Maj(a,b,c)
369         ldr     x28,[x30],#8            // *K++, x19 in next round
370         //add   x25,x25,x17                     // h+=Sigma0(a)
371 #ifndef __AARCH64EB__
372         rev     x14,x14                 // 11
373 #endif
374         ldp     x15,x0,[x1],#2*8
375         add     x25,x25,x17                     // h+=Sigma0(a)
376         str     x6,[sp,#24]
377         ror     x16,x21,#14
378         add     x24,x24,x28                     // h+=K[i]
379         eor     x6,x21,x21,ror#23
380         and     x17,x22,x21
381         bic     x28,x23,x21
382         add     x24,x24,x14                     // h+=X[i]
383         orr     x17,x17,x28                     // Ch(e,f,g)
384         eor     x28,x25,x26                     // a^b, b^c in next round
385         eor     x16,x16,x6,ror#18       // Sigma1(e)
386         ror     x6,x25,#28
387         add     x24,x24,x17                     // h+=Ch(e,f,g)
388         eor     x17,x25,x25,ror#5
389         add     x24,x24,x16                     // h+=Sigma1(e)
390         and     x19,x19,x28                     // (b^c)&=(a^b)
391         add     x20,x20,x24                     // d+=h
392         eor     x19,x19,x26                     // Maj(a,b,c)
393         eor     x17,x6,x17,ror#34       // Sigma0(a)
394         add     x24,x24,x19                     // h+=Maj(a,b,c)
395         ldr     x19,[x30],#8            // *K++, x28 in next round
396         //add   x24,x24,x17                     // h+=Sigma0(a)
397 #ifndef __AARCH64EB__
398         rev     x15,x15                 // 12
399 #endif
400         add     x24,x24,x17                     // h+=Sigma0(a)
401         str     x7,[sp,#0]
402         ror     x16,x20,#14
403         add     x23,x23,x19                     // h+=K[i]
404         eor     x7,x20,x20,ror#23
405         and     x17,x21,x20
406         bic     x19,x22,x20
407         add     x23,x23,x15                     // h+=X[i]
408         orr     x17,x17,x19                     // Ch(e,f,g)
409         eor     x19,x24,x25                     // a^b, b^c in next round
410         eor     x16,x16,x7,ror#18       // Sigma1(e)
411         ror     x7,x24,#28
412         add     x23,x23,x17                     // h+=Ch(e,f,g)
413         eor     x17,x24,x24,ror#5
414         add     x23,x23,x16                     // h+=Sigma1(e)
415         and     x28,x28,x19                     // (b^c)&=(a^b)
416         add     x27,x27,x23                     // d+=h
417         eor     x28,x28,x25                     // Maj(a,b,c)
418         eor     x17,x7,x17,ror#34       // Sigma0(a)
419         add     x23,x23,x28                     // h+=Maj(a,b,c)
420         ldr     x28,[x30],#8            // *K++, x19 in next round
421         //add   x23,x23,x17                     // h+=Sigma0(a)
422 #ifndef __AARCH64EB__
423         rev     x0,x0                   // 13
424 #endif
425         ldp     x1,x2,[x1]
426         add     x23,x23,x17                     // h+=Sigma0(a)
427         str     x8,[sp,#8]
428         ror     x16,x27,#14
429         add     x22,x22,x28                     // h+=K[i]
430         eor     x8,x27,x27,ror#23
431         and     x17,x20,x27
432         bic     x28,x21,x27
433         add     x22,x22,x0                      // h+=X[i]
434         orr     x17,x17,x28                     // Ch(e,f,g)
435         eor     x28,x23,x24                     // a^b, b^c in next round
436         eor     x16,x16,x8,ror#18       // Sigma1(e)
437         ror     x8,x23,#28
438         add     x22,x22,x17                     // h+=Ch(e,f,g)
439         eor     x17,x23,x23,ror#5
440         add     x22,x22,x16                     // h+=Sigma1(e)
441         and     x19,x19,x28                     // (b^c)&=(a^b)
442         add     x26,x26,x22                     // d+=h
443         eor     x19,x19,x24                     // Maj(a,b,c)
444         eor     x17,x8,x17,ror#34       // Sigma0(a)
445         add     x22,x22,x19                     // h+=Maj(a,b,c)
446         ldr     x19,[x30],#8            // *K++, x28 in next round
447         //add   x22,x22,x17                     // h+=Sigma0(a)
448 #ifndef __AARCH64EB__
449         rev     x1,x1                   // 14
450 #endif
451         ldr     x6,[sp,#24]
452         add     x22,x22,x17                     // h+=Sigma0(a)
453         str     x9,[sp,#16]
454         ror     x16,x26,#14
455         add     x21,x21,x19                     // h+=K[i]
456         eor     x9,x26,x26,ror#23
457         and     x17,x27,x26
458         bic     x19,x20,x26
459         add     x21,x21,x1                      // h+=X[i]
460         orr     x17,x17,x19                     // Ch(e,f,g)
461         eor     x19,x22,x23                     // a^b, b^c in next round
462         eor     x16,x16,x9,ror#18       // Sigma1(e)
463         ror     x9,x22,#28
464         add     x21,x21,x17                     // h+=Ch(e,f,g)
465         eor     x17,x22,x22,ror#5
466         add     x21,x21,x16                     // h+=Sigma1(e)
467         and     x28,x28,x19                     // (b^c)&=(a^b)
468         add     x25,x25,x21                     // d+=h
469         eor     x28,x28,x23                     // Maj(a,b,c)
470         eor     x17,x9,x17,ror#34       // Sigma0(a)
471         add     x21,x21,x28                     // h+=Maj(a,b,c)
472         ldr     x28,[x30],#8            // *K++, x19 in next round
473         //add   x21,x21,x17                     // h+=Sigma0(a)
474 #ifndef __AARCH64EB__
475         rev     x2,x2                   // 15
476 #endif
477         ldr     x7,[sp,#0]
478         add     x21,x21,x17                     // h+=Sigma0(a)
479         str     x10,[sp,#24]
480         ror     x16,x25,#14
481         add     x20,x20,x28                     // h+=K[i]
482         ror     x9,x4,#1
483         and     x17,x26,x25
484         ror     x8,x1,#19
485         bic     x28,x27,x25
486         ror     x10,x21,#28
487         add     x20,x20,x2                      // h+=X[i]
488         eor     x16,x16,x25,ror#18
489         eor     x9,x9,x4,ror#8
490         orr     x17,x17,x28                     // Ch(e,f,g)
491         eor     x28,x21,x22                     // a^b, b^c in next round
492         eor     x16,x16,x25,ror#41      // Sigma1(e)
493         eor     x10,x10,x21,ror#34
494         add     x20,x20,x17                     // h+=Ch(e,f,g)
495         and     x19,x19,x28                     // (b^c)&=(a^b)
496         eor     x8,x8,x1,ror#61
497         eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
498         add     x20,x20,x16                     // h+=Sigma1(e)
499         eor     x19,x19,x22                     // Maj(a,b,c)
500         eor     x17,x10,x21,ror#39      // Sigma0(a)
501         eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
502         add     x3,x3,x12
503         add     x24,x24,x20                     // d+=h
504         add     x20,x20,x19                     // h+=Maj(a,b,c)
505         ldr     x19,[x30],#8            // *K++, x28 in next round
506         add     x3,x3,x9
507         add     x20,x20,x17                     // h+=Sigma0(a)
508         add     x3,x3,x8
509 .Loop_16_xx:
510         ldr     x8,[sp,#8]
511         str     x11,[sp,#0]
512         ror     x16,x24,#14
513         add     x27,x27,x19                     // h+=K[i]
514         ror     x10,x5,#1
515         and     x17,x25,x24
516         ror     x9,x2,#19
517         bic     x19,x26,x24
518         ror     x11,x20,#28
519         add     x27,x27,x3                      // h+=X[i]
520         eor     x16,x16,x24,ror#18
521         eor     x10,x10,x5,ror#8
522         orr     x17,x17,x19                     // Ch(e,f,g)
523         eor     x19,x20,x21                     // a^b, b^c in next round
524         eor     x16,x16,x24,ror#41      // Sigma1(e)
525         eor     x11,x11,x20,ror#34
526         add     x27,x27,x17                     // h+=Ch(e,f,g)
527         and     x28,x28,x19                     // (b^c)&=(a^b)
528         eor     x9,x9,x2,ror#61
529         eor     x10,x10,x5,lsr#7        // sigma0(X[i+1])
530         add     x27,x27,x16                     // h+=Sigma1(e)
531         eor     x28,x28,x21                     // Maj(a,b,c)
532         eor     x17,x11,x20,ror#39      // Sigma0(a)
533         eor     x9,x9,x2,lsr#6  // sigma1(X[i+14])
534         add     x4,x4,x13
535         add     x23,x23,x27                     // d+=h
536         add     x27,x27,x28                     // h+=Maj(a,b,c)
537         ldr     x28,[x30],#8            // *K++, x19 in next round
538         add     x4,x4,x10
539         add     x27,x27,x17                     // h+=Sigma0(a)
540         add     x4,x4,x9
541         ldr     x9,[sp,#16]
542         str     x12,[sp,#8]
543         ror     x16,x23,#14
544         add     x26,x26,x28                     // h+=K[i]
545         ror     x11,x6,#1
546         and     x17,x24,x23
547         ror     x10,x3,#19
548         bic     x28,x25,x23
549         ror     x12,x27,#28
550         add     x26,x26,x4                      // h+=X[i]
551         eor     x16,x16,x23,ror#18
552         eor     x11,x11,x6,ror#8
553         orr     x17,x17,x28                     // Ch(e,f,g)
554         eor     x28,x27,x20                     // a^b, b^c in next round
555         eor     x16,x16,x23,ror#41      // Sigma1(e)
556         eor     x12,x12,x27,ror#34
557         add     x26,x26,x17                     // h+=Ch(e,f,g)
558         and     x19,x19,x28                     // (b^c)&=(a^b)
559         eor     x10,x10,x3,ror#61
560         eor     x11,x11,x6,lsr#7        // sigma0(X[i+1])
561         add     x26,x26,x16                     // h+=Sigma1(e)
562         eor     x19,x19,x20                     // Maj(a,b,c)
563         eor     x17,x12,x27,ror#39      // Sigma0(a)
564         eor     x10,x10,x3,lsr#6        // sigma1(X[i+14])
565         add     x5,x5,x14
566         add     x22,x22,x26                     // d+=h
567         add     x26,x26,x19                     // h+=Maj(a,b,c)
568         ldr     x19,[x30],#8            // *K++, x28 in next round
569         add     x5,x5,x11
570         add     x26,x26,x17                     // h+=Sigma0(a)
571         add     x5,x5,x10
572         ldr     x10,[sp,#24]
573         str     x13,[sp,#16]
574         ror     x16,x22,#14
575         add     x25,x25,x19                     // h+=K[i]
576         ror     x12,x7,#1
577         and     x17,x23,x22
578         ror     x11,x4,#19
579         bic     x19,x24,x22
580         ror     x13,x26,#28
581         add     x25,x25,x5                      // h+=X[i]
582         eor     x16,x16,x22,ror#18
583         eor     x12,x12,x7,ror#8
584         orr     x17,x17,x19                     // Ch(e,f,g)
585         eor     x19,x26,x27                     // a^b, b^c in next round
586         eor     x16,x16,x22,ror#41      // Sigma1(e)
587         eor     x13,x13,x26,ror#34
588         add     x25,x25,x17                     // h+=Ch(e,f,g)
589         and     x28,x28,x19                     // (b^c)&=(a^b)
590         eor     x11,x11,x4,ror#61
591         eor     x12,x12,x7,lsr#7        // sigma0(X[i+1])
592         add     x25,x25,x16                     // h+=Sigma1(e)
593         eor     x28,x28,x27                     // Maj(a,b,c)
594         eor     x17,x13,x26,ror#39      // Sigma0(a)
595         eor     x11,x11,x4,lsr#6        // sigma1(X[i+14])
596         add     x6,x6,x15
597         add     x21,x21,x25                     // d+=h
598         add     x25,x25,x28                     // h+=Maj(a,b,c)
599         ldr     x28,[x30],#8            // *K++, x19 in next round
600         add     x6,x6,x12
601         add     x25,x25,x17                     // h+=Sigma0(a)
602         add     x6,x6,x11
603         ldr     x11,[sp,#0]
604         str     x14,[sp,#24]
605         ror     x16,x21,#14
606         add     x24,x24,x28                     // h+=K[i]
607         ror     x13,x8,#1
608         and     x17,x22,x21
609         ror     x12,x5,#19
610         bic     x28,x23,x21
611         ror     x14,x25,#28
612         add     x24,x24,x6                      // h+=X[i]
613         eor     x16,x16,x21,ror#18
614         eor     x13,x13,x8,ror#8
615         orr     x17,x17,x28                     // Ch(e,f,g)
616         eor     x28,x25,x26                     // a^b, b^c in next round
617         eor     x16,x16,x21,ror#41      // Sigma1(e)
618         eor     x14,x14,x25,ror#34
619         add     x24,x24,x17                     // h+=Ch(e,f,g)
620         and     x19,x19,x28                     // (b^c)&=(a^b)
621         eor     x12,x12,x5,ror#61
622         eor     x13,x13,x8,lsr#7        // sigma0(X[i+1])
623         add     x24,x24,x16                     // h+=Sigma1(e)
624         eor     x19,x19,x26                     // Maj(a,b,c)
625         eor     x17,x14,x25,ror#39      // Sigma0(a)
626         eor     x12,x12,x5,lsr#6        // sigma1(X[i+14])
627         add     x7,x7,x0
628         add     x20,x20,x24                     // d+=h
629         add     x24,x24,x19                     // h+=Maj(a,b,c)
630         ldr     x19,[x30],#8            // *K++, x28 in next round
631         add     x7,x7,x13
632         add     x24,x24,x17                     // h+=Sigma0(a)
633         add     x7,x7,x12
634         ldr     x12,[sp,#8]
635         str     x15,[sp,#0]
636         ror     x16,x20,#14
637         add     x23,x23,x19                     // h+=K[i]
638         ror     x14,x9,#1
639         and     x17,x21,x20
640         ror     x13,x6,#19
641         bic     x19,x22,x20
642         ror     x15,x24,#28
643         add     x23,x23,x7                      // h+=X[i]
644         eor     x16,x16,x20,ror#18
645         eor     x14,x14,x9,ror#8
646         orr     x17,x17,x19                     // Ch(e,f,g)
647         eor     x19,x24,x25                     // a^b, b^c in next round
648         eor     x16,x16,x20,ror#41      // Sigma1(e)
649         eor     x15,x15,x24,ror#34
650         add     x23,x23,x17                     // h+=Ch(e,f,g)
651         and     x28,x28,x19                     // (b^c)&=(a^b)
652         eor     x13,x13,x6,ror#61
653         eor     x14,x14,x9,lsr#7        // sigma0(X[i+1])
654         add     x23,x23,x16                     // h+=Sigma1(e)
655         eor     x28,x28,x25                     // Maj(a,b,c)
656         eor     x17,x15,x24,ror#39      // Sigma0(a)
657         eor     x13,x13,x6,lsr#6        // sigma1(X[i+14])
658         add     x8,x8,x1
659         add     x27,x27,x23                     // d+=h
660         add     x23,x23,x28                     // h+=Maj(a,b,c)
661         ldr     x28,[x30],#8            // *K++, x19 in next round
662         add     x8,x8,x14
663         add     x23,x23,x17                     // h+=Sigma0(a)
664         add     x8,x8,x13
665         ldr     x13,[sp,#16]
666         str     x0,[sp,#8]
667         ror     x16,x27,#14
668         add     x22,x22,x28                     // h+=K[i]
669         ror     x15,x10,#1
670         and     x17,x20,x27
671         ror     x14,x7,#19
672         bic     x28,x21,x27
673         ror     x0,x23,#28
674         add     x22,x22,x8                      // h+=X[i]
675         eor     x16,x16,x27,ror#18
676         eor     x15,x15,x10,ror#8
677         orr     x17,x17,x28                     // Ch(e,f,g)
678         eor     x28,x23,x24                     // a^b, b^c in next round
679         eor     x16,x16,x27,ror#41      // Sigma1(e)
680         eor     x0,x0,x23,ror#34
681         add     x22,x22,x17                     // h+=Ch(e,f,g)
682         and     x19,x19,x28                     // (b^c)&=(a^b)
683         eor     x14,x14,x7,ror#61
684         eor     x15,x15,x10,lsr#7       // sigma0(X[i+1])
685         add     x22,x22,x16                     // h+=Sigma1(e)
686         eor     x19,x19,x24                     // Maj(a,b,c)
687         eor     x17,x0,x23,ror#39       // Sigma0(a)
688         eor     x14,x14,x7,lsr#6        // sigma1(X[i+14])
689         add     x9,x9,x2
690         add     x26,x26,x22                     // d+=h
691         add     x22,x22,x19                     // h+=Maj(a,b,c)
692         ldr     x19,[x30],#8            // *K++, x28 in next round
693         add     x9,x9,x15
694         add     x22,x22,x17                     // h+=Sigma0(a)
695         add     x9,x9,x14
696         ldr     x14,[sp,#24]
697         str     x1,[sp,#16]
698         ror     x16,x26,#14
699         add     x21,x21,x19                     // h+=K[i]
700         ror     x0,x11,#1
701         and     x17,x27,x26
702         ror     x15,x8,#19
703         bic     x19,x20,x26
704         ror     x1,x22,#28
705         add     x21,x21,x9                      // h+=X[i]
706         eor     x16,x16,x26,ror#18
707         eor     x0,x0,x11,ror#8
708         orr     x17,x17,x19                     // Ch(e,f,g)
709         eor     x19,x22,x23                     // a^b, b^c in next round
710         eor     x16,x16,x26,ror#41      // Sigma1(e)
711         eor     x1,x1,x22,ror#34
712         add     x21,x21,x17                     // h+=Ch(e,f,g)
713         and     x28,x28,x19                     // (b^c)&=(a^b)
714         eor     x15,x15,x8,ror#61
715         eor     x0,x0,x11,lsr#7 // sigma0(X[i+1])
716         add     x21,x21,x16                     // h+=Sigma1(e)
717         eor     x28,x28,x23                     // Maj(a,b,c)
718         eor     x17,x1,x22,ror#39       // Sigma0(a)
719         eor     x15,x15,x8,lsr#6        // sigma1(X[i+14])
720         add     x10,x10,x3
721         add     x25,x25,x21                     // d+=h
722         add     x21,x21,x28                     // h+=Maj(a,b,c)
723         ldr     x28,[x30],#8            // *K++, x19 in next round
724         add     x10,x10,x0
725         add     x21,x21,x17                     // h+=Sigma0(a)
726         add     x10,x10,x15
727         ldr     x15,[sp,#0]
728         str     x2,[sp,#24]
729         ror     x16,x25,#14
730         add     x20,x20,x28                     // h+=K[i]
731         ror     x1,x12,#1
732         and     x17,x26,x25
733         ror     x0,x9,#19
734         bic     x28,x27,x25
735         ror     x2,x21,#28
736         add     x20,x20,x10                     // h+=X[i]
737         eor     x16,x16,x25,ror#18
738         eor     x1,x1,x12,ror#8
739         orr     x17,x17,x28                     // Ch(e,f,g)
740         eor     x28,x21,x22                     // a^b, b^c in next round
741         eor     x16,x16,x25,ror#41      // Sigma1(e)
742         eor     x2,x2,x21,ror#34
743         add     x20,x20,x17                     // h+=Ch(e,f,g)
744         and     x19,x19,x28                     // (b^c)&=(a^b)
745         eor     x0,x0,x9,ror#61
746         eor     x1,x1,x12,lsr#7 // sigma0(X[i+1])
747         add     x20,x20,x16                     // h+=Sigma1(e)
748         eor     x19,x19,x22                     // Maj(a,b,c)
749         eor     x17,x2,x21,ror#39       // Sigma0(a)
750         eor     x0,x0,x9,lsr#6  // sigma1(X[i+14])
751         add     x11,x11,x4
752         add     x24,x24,x20                     // d+=h
753         add     x20,x20,x19                     // h+=Maj(a,b,c)
754         ldr     x19,[x30],#8            // *K++, x28 in next round
755         add     x11,x11,x1
756         add     x20,x20,x17                     // h+=Sigma0(a)
757         add     x11,x11,x0
758         ldr     x0,[sp,#8]
759         str     x3,[sp,#0]
760         ror     x16,x24,#14
761         add     x27,x27,x19                     // h+=K[i]
762         ror     x2,x13,#1
763         and     x17,x25,x24
764         ror     x1,x10,#19
765         bic     x19,x26,x24
766         ror     x3,x20,#28
767         add     x27,x27,x11                     // h+=X[i]
768         eor     x16,x16,x24,ror#18
769         eor     x2,x2,x13,ror#8
770         orr     x17,x17,x19                     // Ch(e,f,g)
771         eor     x19,x20,x21                     // a^b, b^c in next round
772         eor     x16,x16,x24,ror#41      // Sigma1(e)
773         eor     x3,x3,x20,ror#34
774         add     x27,x27,x17                     // h+=Ch(e,f,g)
775         and     x28,x28,x19                     // (b^c)&=(a^b)
776         eor     x1,x1,x10,ror#61
777         eor     x2,x2,x13,lsr#7 // sigma0(X[i+1])
778         add     x27,x27,x16                     // h+=Sigma1(e)
779         eor     x28,x28,x21                     // Maj(a,b,c)
780         eor     x17,x3,x20,ror#39       // Sigma0(a)
781         eor     x1,x1,x10,lsr#6 // sigma1(X[i+14])
782         add     x12,x12,x5
783         add     x23,x23,x27                     // d+=h
784         add     x27,x27,x28                     // h+=Maj(a,b,c)
785         ldr     x28,[x30],#8            // *K++, x19 in next round
786         add     x12,x12,x2
787         add     x27,x27,x17                     // h+=Sigma0(a)
788         add     x12,x12,x1
789         ldr     x1,[sp,#16]
790         str     x4,[sp,#8]
791         ror     x16,x23,#14
792         add     x26,x26,x28                     // h+=K[i]
793         ror     x3,x14,#1
794         and     x17,x24,x23
795         ror     x2,x11,#19
796         bic     x28,x25,x23
797         ror     x4,x27,#28
798         add     x26,x26,x12                     // h+=X[i]
799         eor     x16,x16,x23,ror#18
800         eor     x3,x3,x14,ror#8
801         orr     x17,x17,x28                     // Ch(e,f,g)
802         eor     x28,x27,x20                     // a^b, b^c in next round
803         eor     x16,x16,x23,ror#41      // Sigma1(e)
804         eor     x4,x4,x27,ror#34
805         add     x26,x26,x17                     // h+=Ch(e,f,g)
806         and     x19,x19,x28                     // (b^c)&=(a^b)
807         eor     x2,x2,x11,ror#61
808         eor     x3,x3,x14,lsr#7 // sigma0(X[i+1])
809         add     x26,x26,x16                     // h+=Sigma1(e)
810         eor     x19,x19,x20                     // Maj(a,b,c)
811         eor     x17,x4,x27,ror#39       // Sigma0(a)
812         eor     x2,x2,x11,lsr#6 // sigma1(X[i+14])
813         add     x13,x13,x6
814         add     x22,x22,x26                     // d+=h
815         add     x26,x26,x19                     // h+=Maj(a,b,c)
816         ldr     x19,[x30],#8            // *K++, x28 in next round
817         add     x13,x13,x3
818         add     x26,x26,x17                     // h+=Sigma0(a)
819         add     x13,x13,x2
820         ldr     x2,[sp,#24]
821         str     x5,[sp,#16]
822         ror     x16,x22,#14
823         add     x25,x25,x19                     // h+=K[i]
824         ror     x4,x15,#1
825         and     x17,x23,x22
826         ror     x3,x12,#19
827         bic     x19,x24,x22
828         ror     x5,x26,#28
829         add     x25,x25,x13                     // h+=X[i]
830         eor     x16,x16,x22,ror#18
831         eor     x4,x4,x15,ror#8
832         orr     x17,x17,x19                     // Ch(e,f,g)
833         eor     x19,x26,x27                     // a^b, b^c in next round
834         eor     x16,x16,x22,ror#41      // Sigma1(e)
835         eor     x5,x5,x26,ror#34
836         add     x25,x25,x17                     // h+=Ch(e,f,g)
837         and     x28,x28,x19                     // (b^c)&=(a^b)
838         eor     x3,x3,x12,ror#61
839         eor     x4,x4,x15,lsr#7 // sigma0(X[i+1])
840         add     x25,x25,x16                     // h+=Sigma1(e)
841         eor     x28,x28,x27                     // Maj(a,b,c)
842         eor     x17,x5,x26,ror#39       // Sigma0(a)
843         eor     x3,x3,x12,lsr#6 // sigma1(X[i+14])
844         add     x14,x14,x7
845         add     x21,x21,x25                     // d+=h
846         add     x25,x25,x28                     // h+=Maj(a,b,c)
847         ldr     x28,[x30],#8            // *K++, x19 in next round
848         add     x14,x14,x4
849         add     x25,x25,x17                     // h+=Sigma0(a)
850         add     x14,x14,x3
851         ldr     x3,[sp,#0]
852         str     x6,[sp,#24]
853         ror     x16,x21,#14
854         add     x24,x24,x28                     // h+=K[i]
855         ror     x5,x0,#1
856         and     x17,x22,x21
857         ror     x4,x13,#19
858         bic     x28,x23,x21
859         ror     x6,x25,#28
860         add     x24,x24,x14                     // h+=X[i]
861         eor     x16,x16,x21,ror#18
862         eor     x5,x5,x0,ror#8
863         orr     x17,x17,x28                     // Ch(e,f,g)
864         eor     x28,x25,x26                     // a^b, b^c in next round
865         eor     x16,x16,x21,ror#41      // Sigma1(e)
866         eor     x6,x6,x25,ror#34
867         add     x24,x24,x17                     // h+=Ch(e,f,g)
868         and     x19,x19,x28                     // (b^c)&=(a^b)
869         eor     x4,x4,x13,ror#61
870         eor     x5,x5,x0,lsr#7  // sigma0(X[i+1])
871         add     x24,x24,x16                     // h+=Sigma1(e)
872         eor     x19,x19,x26                     // Maj(a,b,c)
873         eor     x17,x6,x25,ror#39       // Sigma0(a)
874         eor     x4,x4,x13,lsr#6 // sigma1(X[i+14])
875         add     x15,x15,x8
876         add     x20,x20,x24                     // d+=h
877         add     x24,x24,x19                     // h+=Maj(a,b,c)
878         ldr     x19,[x30],#8            // *K++, x28 in next round
879         add     x15,x15,x5
880         add     x24,x24,x17                     // h+=Sigma0(a)
881         add     x15,x15,x4
882         ldr     x4,[sp,#8]
883         str     x7,[sp,#0]
884         ror     x16,x20,#14
885         add     x23,x23,x19                     // h+=K[i]
886         ror     x6,x1,#1
887         and     x17,x21,x20
888         ror     x5,x14,#19
889         bic     x19,x22,x20
890         ror     x7,x24,#28
891         add     x23,x23,x15                     // h+=X[i]
892         eor     x16,x16,x20,ror#18
893         eor     x6,x6,x1,ror#8
894         orr     x17,x17,x19                     // Ch(e,f,g)
895         eor     x19,x24,x25                     // a^b, b^c in next round
896         eor     x16,x16,x20,ror#41      // Sigma1(e)
897         eor     x7,x7,x24,ror#34
898         add     x23,x23,x17                     // h+=Ch(e,f,g)
899         and     x28,x28,x19                     // (b^c)&=(a^b)
900         eor     x5,x5,x14,ror#61
901         eor     x6,x6,x1,lsr#7  // sigma0(X[i+1])
902         add     x23,x23,x16                     // h+=Sigma1(e)
903         eor     x28,x28,x25                     // Maj(a,b,c)
904         eor     x17,x7,x24,ror#39       // Sigma0(a)
905         eor     x5,x5,x14,lsr#6 // sigma1(X[i+14])
906         add     x0,x0,x9
907         add     x27,x27,x23                     // d+=h
908         add     x23,x23,x28                     // h+=Maj(a,b,c)
909         ldr     x28,[x30],#8            // *K++, x19 in next round
910         add     x0,x0,x6
911         add     x23,x23,x17                     // h+=Sigma0(a)
912         add     x0,x0,x5
913         ldr     x5,[sp,#16]
914         str     x8,[sp,#8]
915         ror     x16,x27,#14
916         add     x22,x22,x28                     // h+=K[i]
917         ror     x7,x2,#1
918         and     x17,x20,x27
919         ror     x6,x15,#19
920         bic     x28,x21,x27
921         ror     x8,x23,#28
922         add     x22,x22,x0                      // h+=X[i]
923         eor     x16,x16,x27,ror#18
924         eor     x7,x7,x2,ror#8
925         orr     x17,x17,x28                     // Ch(e,f,g)
926         eor     x28,x23,x24                     // a^b, b^c in next round
927         eor     x16,x16,x27,ror#41      // Sigma1(e)
928         eor     x8,x8,x23,ror#34
929         add     x22,x22,x17                     // h+=Ch(e,f,g)
930         and     x19,x19,x28                     // (b^c)&=(a^b)
931         eor     x6,x6,x15,ror#61
932         eor     x7,x7,x2,lsr#7  // sigma0(X[i+1])
933         add     x22,x22,x16                     // h+=Sigma1(e)
934         eor     x19,x19,x24                     // Maj(a,b,c)
935         eor     x17,x8,x23,ror#39       // Sigma0(a)
936         eor     x6,x6,x15,lsr#6 // sigma1(X[i+14])
937         add     x1,x1,x10
938         add     x26,x26,x22                     // d+=h
939         add     x22,x22,x19                     // h+=Maj(a,b,c)
940         ldr     x19,[x30],#8            // *K++, x28 in next round
941         add     x1,x1,x7
942         add     x22,x22,x17                     // h+=Sigma0(a)
943         add     x1,x1,x6
944         ldr     x6,[sp,#24]
945         str     x9,[sp,#16]
946         ror     x16,x26,#14
947         add     x21,x21,x19                     // h+=K[i]
948         ror     x8,x3,#1
949         and     x17,x27,x26
950         ror     x7,x0,#19
951         bic     x19,x20,x26
952         ror     x9,x22,#28
953         add     x21,x21,x1                      // h+=X[i]
954         eor     x16,x16,x26,ror#18
955         eor     x8,x8,x3,ror#8
956         orr     x17,x17,x19                     // Ch(e,f,g)
957         eor     x19,x22,x23                     // a^b, b^c in next round
958         eor     x16,x16,x26,ror#41      // Sigma1(e)
959         eor     x9,x9,x22,ror#34
960         add     x21,x21,x17                     // h+=Ch(e,f,g)
961         and     x28,x28,x19                     // (b^c)&=(a^b)
962         eor     x7,x7,x0,ror#61
963         eor     x8,x8,x3,lsr#7  // sigma0(X[i+1])
964         add     x21,x21,x16                     // h+=Sigma1(e)
965         eor     x28,x28,x23                     // Maj(a,b,c)
966         eor     x17,x9,x22,ror#39       // Sigma0(a)
967         eor     x7,x7,x0,lsr#6  // sigma1(X[i+14])
968         add     x2,x2,x11
969         add     x25,x25,x21                     // d+=h
970         add     x21,x21,x28                     // h+=Maj(a,b,c)
971         ldr     x28,[x30],#8            // *K++, x19 in next round
972         add     x2,x2,x8
973         add     x21,x21,x17                     // h+=Sigma0(a)
974         add     x2,x2,x7
975         ldr     x7,[sp,#0]
976         str     x10,[sp,#24]
977         ror     x16,x25,#14
978         add     x20,x20,x28                     // h+=K[i]
979         ror     x9,x4,#1
980         and     x17,x26,x25
981         ror     x8,x1,#19
982         bic     x28,x27,x25
983         ror     x10,x21,#28
984         add     x20,x20,x2                      // h+=X[i]
985         eor     x16,x16,x25,ror#18
986         eor     x9,x9,x4,ror#8
987         orr     x17,x17,x28                     // Ch(e,f,g)
988         eor     x28,x21,x22                     // a^b, b^c in next round
989         eor     x16,x16,x25,ror#41      // Sigma1(e)
990         eor     x10,x10,x21,ror#34
991         add     x20,x20,x17                     // h+=Ch(e,f,g)
992         and     x19,x19,x28                     // (b^c)&=(a^b)
993         eor     x8,x8,x1,ror#61
994         eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
995         add     x20,x20,x16                     // h+=Sigma1(e)
996         eor     x19,x19,x22                     // Maj(a,b,c)
997         eor     x17,x10,x21,ror#39      // Sigma0(a)
998         eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
999         add     x3,x3,x12
1000         add     x24,x24,x20                     // d+=h
1001         add     x20,x20,x19                     // h+=Maj(a,b,c)
1002         ldr     x19,[x30],#8            // *K++, x28 in next round
1003         add     x3,x3,x9
1004         add     x20,x20,x17                     // h+=Sigma0(a)
1005         add     x3,x3,x8
1006         cbnz    x19,.Loop_16_xx
1007
1008         ldp     x0,x2,[x29,#96]
1009         ldr     x1,[x29,#112]
1010         sub     x30,x30,#648            // rewind
1011
1012         ldp     x3,x4,[x0]
1013         ldp     x5,x6,[x0,#2*8]
1014         add     x1,x1,#14*8                     // advance input pointer
1015         ldp     x7,x8,[x0,#4*8]
1016         add     x20,x20,x3
1017         ldp     x9,x10,[x0,#6*8]
1018         add     x21,x21,x4
1019         add     x22,x22,x5
1020         add     x23,x23,x6
1021         stp     x20,x21,[x0]
1022         add     x24,x24,x7
1023         add     x25,x25,x8
1024         stp     x22,x23,[x0,#2*8]
1025         add     x26,x26,x9
1026         add     x27,x27,x10
1027         cmp     x1,x2
1028         stp     x24,x25,[x0,#4*8]
1029         stp     x26,x27,[x0,#6*8]
1030         b.ne    .Loop
1031
1032         ldp     x19,x20,[x29,#16]
1033         add     sp,sp,#4*8
1034         ldp     x21,x22,[x29,#32]
1035         ldp     x23,x24,[x29,#48]
1036         ldp     x25,x26,[x29,#64]
1037         ldp     x27,x28,[x29,#80]
1038         ldp     x29,x30,[sp],#128
1039         ret
1040 .size   sha512_block_data_order,.-sha512_block_data_order
1041
// .LK512: table of the 80 SHA-512 round constants K[0..79], stored as
// 64-bit quads, followed by a single zero quad.
//
// Consumers visible in this file:
//  - the scalar rounds fetch one constant per round through a
//    post-incremented pointer (`ldr ...,[x30],#8`, commented `*K++`), leave
//    the round loop when the fetched quad is zero (`cbnz ...,.Loop_16_xx`)
//    and then step the pointer back by 648 bytes, i.e. 81 quads = the 80
//    constants plus the terminator below;
//  - the hardware path takes the table address with `adr x3,.LK512` and
//    reads two constants at a time with `ld1 {vN.2d},[x3],#16`.
1042 .align  6       // 2^6 = 64-byte alignment for the start of the table
1043 .type   .LK512,%object
1044 .LK512:
1045 .quad   0x428a2f98d728ae22,0x7137449123ef65cd   // K[0],  K[1]
1046 .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc   // K[2],  K[3]
1047 .quad   0x3956c25bf348b538,0x59f111f1b605d019   // K[4],  K[5]
1048 .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118   // K[6],  K[7]
1049 .quad   0xd807aa98a3030242,0x12835b0145706fbe   // K[8],  K[9]
1050 .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2   // K[10], K[11]
1051 .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1   // K[12], K[13]
1052 .quad   0x9bdc06a725c71235,0xc19bf174cf692694   // K[14], K[15]
1053 .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3   // K[16], K[17]
1054 .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65   // K[18], K[19]
1055 .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483   // K[20], K[21]
1056 .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5   // K[22], K[23]
1057 .quad   0x983e5152ee66dfab,0xa831c66d2db43210   // K[24], K[25]
1058 .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4   // K[26], K[27]
1059 .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725   // K[28], K[29]
1060 .quad   0x06ca6351e003826f,0x142929670a0e6e70   // K[30], K[31]
1061 .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926   // K[32], K[33]
1062 .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df   // K[34], K[35]
1063 .quad   0x650a73548baf63de,0x766a0abb3c77b2a8   // K[36], K[37]
1064 .quad   0x81c2c92e47edaee6,0x92722c851482353b   // K[38], K[39]
1065 .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001   // K[40], K[41]
1066 .quad   0xc24b8b70d0f89791,0xc76c51a30654be30   // K[42], K[43]
1067 .quad   0xd192e819d6ef5218,0xd69906245565a910   // K[44], K[45]
1068 .quad   0xf40e35855771202a,0x106aa07032bbd1b8   // K[46], K[47]
1069 .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53   // K[48], K[49]
1070 .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8   // K[50], K[51]
1071 .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb   // K[52], K[53]
1072 .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3   // K[54], K[55]
1073 .quad   0x748f82ee5defb2fc,0x78a5636f43172f60   // K[56], K[57]
1074 .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec   // K[58], K[59]
1075 .quad   0x90befffa23631e28,0xa4506cebde82bde9   // K[60], K[61]
1076 .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b   // K[62], K[63]
1077 .quad   0xca273eceea26619c,0xd186b8c721c0c207   // K[64], K[65]
1078 .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178   // K[66], K[67]
1079 .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6   // K[68], K[69]
1080 .quad   0x113f9804bef90dae,0x1b710b35131c471b   // K[70], K[71]
1081 .quad   0x28db77f523047d84,0x32caab7b40c72493   // K[72], K[73]
1082 .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c   // K[74], K[75]
1083 .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a   // K[76], K[77]
1084 .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817   // K[78], K[79]
1085 .quad   0       // terminator: the scalar round loop's cbnz exits on this zero
1086 .size   .LK512,.-.LK512
// .LOPENSSL_armcap_P: one data word holding the distance from this location
// ('.') to the external symbol OPENSSL_armcap_P, i.e. a position-independent
// reference to that symbol. It is emitted as a 32-bit .long when __ILP32__
// is defined and as a 64-bit .quad otherwise, and is left out entirely when
// __KERNEL__ is defined.
// NOTE(review): the code that reads this word is not in this part of the
// file; presumably the function entry adds the stored offset to this label's
// address to reach the capability flags and pick the hardware path - confirm.
1087 #ifndef __KERNEL__
1088 .align  3       // 2^3 = 8-byte alignment, enough for the .quad variant
1089 .LOPENSSL_armcap_P:
1090 # ifdef __ILP32__
1091 .long   OPENSSL_armcap_P-.      // 32-bit offset: symbol minus current location
1092 # else
1093 .quad   OPENSSL_armcap_P-.      // 64-bit offset: symbol minus current location
1094 # endif
1095 #endif
// Identification string embedded in the object file. The bytes are the
// NUL-terminated ASCII text
//   "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// No label is attached to it and nothing visible in this file refers to it.
1096 .byte   83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1097 .align  2       // back to a 2^2 = 4-byte boundary after the byte data
1098 .align  2       // duplicate of the directive above; already aligned, so it has no further effect
1099 #ifndef __KERNEL__
1100 .type   sha512_block_armv8,%function
1101 .align  6
1102 sha512_block_armv8:
1103 .Lv8_entry:
1104         stp     x29,x30,[sp,#-16]!
1105         add     x29,sp,#0
1106
1107         ld1     {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64      // load input
1108         ld1     {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1109
1110         ld1     {v0.2d,v1.2d,v2.2d,v3.2d},[x0]          // load context
1111         adr     x3,.LK512
1112
1113         rev64   v16.16b,v16.16b
1114         rev64   v17.16b,v17.16b
1115         rev64   v18.16b,v18.16b
1116         rev64   v19.16b,v19.16b
1117         rev64   v20.16b,v20.16b
1118         rev64   v21.16b,v21.16b
1119         rev64   v22.16b,v22.16b
1120         rev64   v23.16b,v23.16b
1121         b       .Loop_hw
1122
1123 .align  4
1124 .Loop_hw:
1125         ld1     {v24.2d},[x3],#16
1126         subs    x2,x2,#1
1127         sub     x4,x1,#128
1128         orr     v26.16b,v0.16b,v0.16b                   // offload
1129         orr     v27.16b,v1.16b,v1.16b
1130         orr     v28.16b,v2.16b,v2.16b
1131         orr     v29.16b,v3.16b,v3.16b
1132         csel    x1,x1,x4,ne                     // conditional rewind
1133         add     v24.2d,v24.2d,v16.2d
1134         ld1     {v25.2d},[x3],#16
1135         ext     v24.16b,v24.16b,v24.16b,#8
1136         ext     v5.16b,v2.16b,v3.16b,#8
1137         ext     v6.16b,v1.16b,v2.16b,#8
1138         add     v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1139 .inst   0xcec08230      //sha512su0 v16.16b,v17.16b
1140         ext     v7.16b,v20.16b,v21.16b,#8
1141 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1142 .inst   0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1143         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1144 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1145         add     v25.2d,v25.2d,v17.2d
1146         ld1     {v24.2d},[x3],#16
1147         ext     v25.16b,v25.16b,v25.16b,#8
1148         ext     v5.16b,v4.16b,v2.16b,#8
1149         ext     v6.16b,v0.16b,v4.16b,#8
1150         add     v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1151 .inst   0xcec08251      //sha512su0 v17.16b,v18.16b
1152         ext     v7.16b,v21.16b,v22.16b,#8
1153 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1154 .inst   0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1155         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1156 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1157         add     v24.2d,v24.2d,v18.2d
1158         ld1     {v25.2d},[x3],#16
1159         ext     v24.16b,v24.16b,v24.16b,#8
1160         ext     v5.16b,v1.16b,v4.16b,#8
1161         ext     v6.16b,v3.16b,v1.16b,#8
1162         add     v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1163 .inst   0xcec08272      //sha512su0 v18.16b,v19.16b
1164         ext     v7.16b,v22.16b,v23.16b,#8
1165 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1166 .inst   0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1167         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1168 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1169         add     v25.2d,v25.2d,v19.2d
1170         ld1     {v24.2d},[x3],#16
1171         ext     v25.16b,v25.16b,v25.16b,#8
1172         ext     v5.16b,v0.16b,v1.16b,#8
1173         ext     v6.16b,v2.16b,v0.16b,#8
1174         add     v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1175 .inst   0xcec08293      //sha512su0 v19.16b,v20.16b
1176         ext     v7.16b,v23.16b,v16.16b,#8
1177 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1178 .inst   0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1179         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1180 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1181         add     v24.2d,v24.2d,v20.2d
1182         ld1     {v25.2d},[x3],#16
1183         ext     v24.16b,v24.16b,v24.16b,#8
1184         ext     v5.16b,v3.16b,v0.16b,#8
1185         ext     v6.16b,v4.16b,v3.16b,#8
1186         add     v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1187 .inst   0xcec082b4      //sha512su0 v20.16b,v21.16b
1188         ext     v7.16b,v16.16b,v17.16b,#8
1189 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1190 .inst   0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1191         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1192 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1193         add     v25.2d,v25.2d,v21.2d
1194         ld1     {v24.2d},[x3],#16
1195         ext     v25.16b,v25.16b,v25.16b,#8
1196         ext     v5.16b,v2.16b,v3.16b,#8
1197         ext     v6.16b,v1.16b,v2.16b,#8
1198         add     v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1199 .inst   0xcec082d5      //sha512su0 v21.16b,v22.16b
1200         ext     v7.16b,v17.16b,v18.16b,#8
1201 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1202 .inst   0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1203         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1204 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1205         add     v24.2d,v24.2d,v22.2d
1206         ld1     {v25.2d},[x3],#16
1207         ext     v24.16b,v24.16b,v24.16b,#8
1208         ext     v5.16b,v4.16b,v2.16b,#8
1209         ext     v6.16b,v0.16b,v4.16b,#8
1210         add     v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1211 .inst   0xcec082f6      //sha512su0 v22.16b,v23.16b
1212         ext     v7.16b,v18.16b,v19.16b,#8
1213 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1214 .inst   0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1215         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1216 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1217         add     v25.2d,v25.2d,v23.2d
1218         ld1     {v24.2d},[x3],#16
1219         ext     v25.16b,v25.16b,v25.16b,#8
1220         ext     v5.16b,v1.16b,v4.16b,#8
1221         ext     v6.16b,v3.16b,v1.16b,#8
1222         add     v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1223 .inst   0xcec08217      //sha512su0 v23.16b,v16.16b
1224         ext     v7.16b,v19.16b,v20.16b,#8
1225 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1226 .inst   0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1227         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1228 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1229         add     v24.2d,v24.2d,v16.2d
1230         ld1     {v25.2d},[x3],#16
1231         ext     v24.16b,v24.16b,v24.16b,#8
1232         ext     v5.16b,v0.16b,v1.16b,#8
1233         ext     v6.16b,v2.16b,v0.16b,#8
1234         add     v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1235 .inst   0xcec08230      //sha512su0 v16.16b,v17.16b
1236         ext     v7.16b,v20.16b,v21.16b,#8
1237 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1238 .inst   0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1239         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1240 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1241         add     v25.2d,v25.2d,v17.2d
1242         ld1     {v24.2d},[x3],#16
1243         ext     v25.16b,v25.16b,v25.16b,#8
1244         ext     v5.16b,v3.16b,v0.16b,#8
1245         ext     v6.16b,v4.16b,v3.16b,#8
1246         add     v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1247 .inst   0xcec08251      //sha512su0 v17.16b,v18.16b
1248         ext     v7.16b,v21.16b,v22.16b,#8
1249 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1250 .inst   0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1251         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1252 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1253         add     v24.2d,v24.2d,v18.2d
1254         ld1     {v25.2d},[x3],#16
1255         ext     v24.16b,v24.16b,v24.16b,#8
1256         ext     v5.16b,v2.16b,v3.16b,#8
1257         ext     v6.16b,v1.16b,v2.16b,#8
1258         add     v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1259 .inst   0xcec08272      //sha512su0 v18.16b,v19.16b
1260         ext     v7.16b,v22.16b,v23.16b,#8
1261 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1262 .inst   0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1263         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1264 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1265         add     v25.2d,v25.2d,v19.2d
1266         ld1     {v24.2d},[x3],#16
1267         ext     v25.16b,v25.16b,v25.16b,#8
1268         ext     v5.16b,v4.16b,v2.16b,#8
1269         ext     v6.16b,v0.16b,v4.16b,#8
1270         add     v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1271 .inst   0xcec08293      //sha512su0 v19.16b,v20.16b
1272         ext     v7.16b,v23.16b,v16.16b,#8
1273 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1274 .inst   0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1275         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1276 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1277         add     v24.2d,v24.2d,v20.2d
1278         ld1     {v25.2d},[x3],#16
1279         ext     v24.16b,v24.16b,v24.16b,#8
1280         ext     v5.16b,v1.16b,v4.16b,#8
1281         ext     v6.16b,v3.16b,v1.16b,#8
1282         add     v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1283 .inst   0xcec082b4      //sha512su0 v20.16b,v21.16b
1284         ext     v7.16b,v16.16b,v17.16b,#8
1285 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1286 .inst   0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1287         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1288 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1289         add     v25.2d,v25.2d,v21.2d
1290         ld1     {v24.2d},[x3],#16
1291         ext     v25.16b,v25.16b,v25.16b,#8
1292         ext     v5.16b,v0.16b,v1.16b,#8
1293         ext     v6.16b,v2.16b,v0.16b,#8
1294         add     v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1295 .inst   0xcec082d5      //sha512su0 v21.16b,v22.16b
1296         ext     v7.16b,v17.16b,v18.16b,#8
1297 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1298 .inst   0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1299         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1300 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1301         add     v24.2d,v24.2d,v22.2d
1302         ld1     {v25.2d},[x3],#16
1303         ext     v24.16b,v24.16b,v24.16b,#8
1304         ext     v5.16b,v3.16b,v0.16b,#8
1305         ext     v6.16b,v4.16b,v3.16b,#8
1306         add     v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1307 .inst   0xcec082f6      //sha512su0 v22.16b,v23.16b
1308         ext     v7.16b,v18.16b,v19.16b,#8
1309 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1310 .inst   0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1311         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1312 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1313         add     v25.2d,v25.2d,v23.2d
1314         ld1     {v24.2d},[x3],#16
1315         ext     v25.16b,v25.16b,v25.16b,#8
1316         ext     v5.16b,v2.16b,v3.16b,#8
1317         ext     v6.16b,v1.16b,v2.16b,#8
1318         add     v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1319 .inst   0xcec08217      //sha512su0 v23.16b,v16.16b
1320         ext     v7.16b,v19.16b,v20.16b,#8
1321 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1322 .inst   0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1323         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1324 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1325         add     v24.2d,v24.2d,v16.2d
1326         ld1     {v25.2d},[x3],#16
1327         ext     v24.16b,v24.16b,v24.16b,#8
1328         ext     v5.16b,v4.16b,v2.16b,#8
1329         ext     v6.16b,v0.16b,v4.16b,#8
1330         add     v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1331 .inst   0xcec08230      //sha512su0 v16.16b,v17.16b
1332         ext     v7.16b,v20.16b,v21.16b,#8
1333 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1334 .inst   0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1335         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1336 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1337         add     v25.2d,v25.2d,v17.2d
1338         ld1     {v24.2d},[x3],#16
1339         ext     v25.16b,v25.16b,v25.16b,#8
1340         ext     v5.16b,v1.16b,v4.16b,#8
1341         ext     v6.16b,v3.16b,v1.16b,#8
1342         add     v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1343 .inst   0xcec08251      //sha512su0 v17.16b,v18.16b
1344         ext     v7.16b,v21.16b,v22.16b,#8
1345 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1346 .inst   0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1347         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1348 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1349         add     v24.2d,v24.2d,v18.2d
1350         ld1     {v25.2d},[x3],#16
1351         ext     v24.16b,v24.16b,v24.16b,#8
1352         ext     v5.16b,v0.16b,v1.16b,#8
1353         ext     v6.16b,v2.16b,v0.16b,#8
1354         add     v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1355 .inst   0xcec08272      //sha512su0 v18.16b,v19.16b
1356         ext     v7.16b,v22.16b,v23.16b,#8
1357 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1358 .inst   0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1359         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1360 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1361         add     v25.2d,v25.2d,v19.2d
1362         ld1     {v24.2d},[x3],#16
1363         ext     v25.16b,v25.16b,v25.16b,#8
1364         ext     v5.16b,v3.16b,v0.16b,#8
1365         ext     v6.16b,v4.16b,v3.16b,#8
1366         add     v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1367 .inst   0xcec08293      //sha512su0 v19.16b,v20.16b
1368         ext     v7.16b,v23.16b,v16.16b,#8
1369 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1370 .inst   0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1371         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1372 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1373         add     v24.2d,v24.2d,v20.2d
1374         ld1     {v25.2d},[x3],#16
1375         ext     v24.16b,v24.16b,v24.16b,#8
1376         ext     v5.16b,v2.16b,v3.16b,#8
1377         ext     v6.16b,v1.16b,v2.16b,#8
1378         add     v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1379 .inst   0xcec082b4      //sha512su0 v20.16b,v21.16b
1380         ext     v7.16b,v16.16b,v17.16b,#8
1381 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1382 .inst   0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1383         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1384 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1385         add     v25.2d,v25.2d,v21.2d
1386         ld1     {v24.2d},[x3],#16
1387         ext     v25.16b,v25.16b,v25.16b,#8
1388         ext     v5.16b,v4.16b,v2.16b,#8
1389         ext     v6.16b,v0.16b,v4.16b,#8
1390         add     v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1391 .inst   0xcec082d5      //sha512su0 v21.16b,v22.16b
1392         ext     v7.16b,v17.16b,v18.16b,#8
1393 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1394 .inst   0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1395         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1396 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1397         add     v24.2d,v24.2d,v22.2d
1398         ld1     {v25.2d},[x3],#16
1399         ext     v24.16b,v24.16b,v24.16b,#8
1400         ext     v5.16b,v1.16b,v4.16b,#8
1401         ext     v6.16b,v3.16b,v1.16b,#8
1402         add     v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1403 .inst   0xcec082f6      //sha512su0 v22.16b,v23.16b
1404         ext     v7.16b,v18.16b,v19.16b,#8
1405 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1406 .inst   0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1407         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1408 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1409         add     v25.2d,v25.2d,v23.2d
1410         ld1     {v24.2d},[x3],#16
1411         ext     v25.16b,v25.16b,v25.16b,#8
1412         ext     v5.16b,v0.16b,v1.16b,#8
1413         ext     v6.16b,v2.16b,v0.16b,#8
1414         add     v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1415 .inst   0xcec08217      //sha512su0 v23.16b,v16.16b
1416         ext     v7.16b,v19.16b,v20.16b,#8
1417 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1418 .inst   0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1419         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1420 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1421         add     v24.2d,v24.2d,v16.2d
1422         ld1     {v25.2d},[x3],#16
1423         ext     v24.16b,v24.16b,v24.16b,#8
1424         ext     v5.16b,v3.16b,v0.16b,#8
1425         ext     v6.16b,v4.16b,v3.16b,#8
1426         add     v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1427 .inst   0xcec08230      //sha512su0 v16.16b,v17.16b
1428         ext     v7.16b,v20.16b,v21.16b,#8
1429 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1430 .inst   0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
1431         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1432 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1433         add     v25.2d,v25.2d,v17.2d
1434         ld1     {v24.2d},[x3],#16
1435         ext     v25.16b,v25.16b,v25.16b,#8
1436         ext     v5.16b,v2.16b,v3.16b,#8
1437         ext     v6.16b,v1.16b,v2.16b,#8
1438         add     v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1439 .inst   0xcec08251      //sha512su0 v17.16b,v18.16b
1440         ext     v7.16b,v21.16b,v22.16b,#8
1441 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1442 .inst   0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
1443         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1444 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1445         add     v24.2d,v24.2d,v18.2d
1446         ld1     {v25.2d},[x3],#16
1447         ext     v24.16b,v24.16b,v24.16b,#8
1448         ext     v5.16b,v4.16b,v2.16b,#8
1449         ext     v6.16b,v0.16b,v4.16b,#8
1450         add     v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1451 .inst   0xcec08272      //sha512su0 v18.16b,v19.16b
1452         ext     v7.16b,v22.16b,v23.16b,#8
1453 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1454 .inst   0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
1455         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1456 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1457         add     v25.2d,v25.2d,v19.2d
1458         ld1     {v24.2d},[x3],#16
1459         ext     v25.16b,v25.16b,v25.16b,#8
1460         ext     v5.16b,v1.16b,v4.16b,#8
1461         ext     v6.16b,v3.16b,v1.16b,#8
1462         add     v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1463 .inst   0xcec08293      //sha512su0 v19.16b,v20.16b
1464         ext     v7.16b,v23.16b,v16.16b,#8
1465 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1466 .inst   0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
1467         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1468 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1469         add     v24.2d,v24.2d,v20.2d
1470         ld1     {v25.2d},[x3],#16
1471         ext     v24.16b,v24.16b,v24.16b,#8
1472         ext     v5.16b,v0.16b,v1.16b,#8
1473         ext     v6.16b,v2.16b,v0.16b,#8
1474         add     v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1475 .inst   0xcec082b4      //sha512su0 v20.16b,v21.16b
1476         ext     v7.16b,v16.16b,v17.16b,#8
1477 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1478 .inst   0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
1479         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1480 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1481         add     v25.2d,v25.2d,v21.2d
1482         ld1     {v24.2d},[x3],#16
1483         ext     v25.16b,v25.16b,v25.16b,#8
1484         ext     v5.16b,v3.16b,v0.16b,#8
1485         ext     v6.16b,v4.16b,v3.16b,#8
1486         add     v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1487 .inst   0xcec082d5      //sha512su0 v21.16b,v22.16b
1488         ext     v7.16b,v17.16b,v18.16b,#8
1489 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1490 .inst   0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
1491         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1492 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1493         add     v24.2d,v24.2d,v22.2d
1494         ld1     {v25.2d},[x3],#16
1495         ext     v24.16b,v24.16b,v24.16b,#8
1496         ext     v5.16b,v2.16b,v3.16b,#8
1497         ext     v6.16b,v1.16b,v2.16b,#8
1498         add     v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
1499 .inst   0xcec082f6      //sha512su0 v22.16b,v23.16b
1500         ext     v7.16b,v18.16b,v19.16b,#8
1501 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1502 .inst   0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
1503         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1504 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1505         add     v25.2d,v25.2d,v23.2d
1506         ld1     {v24.2d},[x3],#16
1507         ext     v25.16b,v25.16b,v25.16b,#8
1508         ext     v5.16b,v4.16b,v2.16b,#8
1509         ext     v6.16b,v0.16b,v4.16b,#8
1510         add     v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
1511 .inst   0xcec08217      //sha512su0 v23.16b,v16.16b
1512         ext     v7.16b,v19.16b,v20.16b,#8
1513 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1514 .inst   0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
1515         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1516 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1517         ld1     {v25.2d},[x3],#16
1518         add     v24.2d,v24.2d,v16.2d
1519         ld1     {v16.16b},[x1],#16              // load next input
1520         ext     v24.16b,v24.16b,v24.16b,#8
1521         ext     v5.16b,v1.16b,v4.16b,#8
1522         ext     v6.16b,v3.16b,v1.16b,#8
1523         add     v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
1524 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1525         rev64   v16.16b,v16.16b
1526         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1527 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1528         ld1     {v24.2d},[x3],#16
1529         add     v25.2d,v25.2d,v17.2d
1530         ld1     {v17.16b},[x1],#16              // load next input
1531         ext     v25.16b,v25.16b,v25.16b,#8
1532         ext     v5.16b,v0.16b,v1.16b,#8
1533         ext     v6.16b,v2.16b,v0.16b,#8
1534         add     v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
1535 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1536         rev64   v17.16b,v17.16b
1537         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1538 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1539         ld1     {v25.2d},[x3],#16
1540         add     v24.2d,v24.2d,v18.2d
1541         ld1     {v18.16b},[x1],#16              // load next input
1542         ext     v24.16b,v24.16b,v24.16b,#8
1543         ext     v5.16b,v3.16b,v0.16b,#8
1544         ext     v6.16b,v4.16b,v3.16b,#8
1545         add     v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
1546 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1547         rev64   v18.16b,v18.16b
1548         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1549 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1550         ld1     {v24.2d},[x3],#16
1551         add     v25.2d,v25.2d,v19.2d
1552         ld1     {v19.16b},[x1],#16              // load next input
1553         ext     v25.16b,v25.16b,v25.16b,#8
1554         ext     v5.16b,v2.16b,v3.16b,#8
1555         ext     v6.16b,v1.16b,v2.16b,#8
1556         add     v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
1557 .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
1558         rev64   v19.16b,v19.16b
1559         add     v4.2d,v1.2d,v3.2d               // "D + T1"
1560 .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
1561         ld1     {v25.2d},[x3],#16
1562         add     v24.2d,v24.2d,v20.2d
1563         ld1     {v20.16b},[x1],#16              // load next input
1564         ext     v24.16b,v24.16b,v24.16b,#8
1565         ext     v5.16b,v4.16b,v2.16b,#8
1566         ext     v6.16b,v0.16b,v4.16b,#8
1567         add     v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
1568 .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
1569         rev64   v20.16b,v20.16b
1570         add     v1.2d,v0.2d,v2.2d               // "D + T1"
1571 .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
1572         ld1     {v24.2d},[x3],#16
1573         add     v25.2d,v25.2d,v21.2d
1574         ld1     {v21.16b},[x1],#16              // load next input
1575         ext     v25.16b,v25.16b,v25.16b,#8
1576         ext     v5.16b,v1.16b,v4.16b,#8
1577         ext     v6.16b,v3.16b,v1.16b,#8
1578         add     v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
1579 .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
1580         rev64   v21.16b,v21.16b
1581         add     v0.2d,v3.2d,v4.2d               // "D + T1"
1582 .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
1583         ld1     {v25.2d},[x3],#16
1584         add     v24.2d,v24.2d,v22.2d
1585         ld1     {v22.16b},[x1],#16              // load next input
1586         ext     v24.16b,v24.16b,v24.16b,#8
1587         ext     v5.16b,v0.16b,v1.16b,#8
1588         ext     v6.16b,v2.16b,v0.16b,#8
1589         add     v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
1590 .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
1591         rev64   v22.16b,v22.16b
1592         add     v3.2d,v2.2d,v1.2d               // "D + T1"
1593 .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
1594         sub     x3,x3,#80*8     // rewind
1595         add     v25.2d,v25.2d,v23.2d
1596         ld1     {v23.16b},[x1],#16              // load next input
1597         ext     v25.16b,v25.16b,v25.16b,#8
1598         ext     v5.16b,v3.16b,v0.16b,#8
1599         ext     v6.16b,v4.16b,v3.16b,#8
1600         add     v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
1601 .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
1602         rev64   v23.16b,v23.16b
1603         add     v2.2d,v4.2d,v0.2d               // "D + T1"
1604 .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
1605         add     v0.2d,v0.2d,v26.2d                      // accumulate
1606         add     v1.2d,v1.2d,v27.2d
1607         add     v2.2d,v2.2d,v28.2d
1608         add     v3.2d,v3.2d,v29.2d
1609
1610         cbnz    x2,.Loop_hw
1611
1612         st1     {v0.2d,v1.2d,v2.2d,v3.2d},[x0]          // store context
1613
1614         ldr     x29,[sp],#16
1615         ret
1616 .size   sha512_block_armv8,.-sha512_block_armv8
1617 #endif
1618 #ifndef __KERNEL__        /* emitted only when __KERNEL__ is not defined */
1619 .comm   OPENSSL_armcap_P,4,4    // common symbol: size 4, alignment 4; presumably the CPU-capability word tested by the dispatch code earlier in this file -- confirm
1620 #endif                    /* !__KERNEL__ */