2 /* Do not modify. This file is auto-generated from sha512-armv8.pl. */
3 // Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
5 // Licensed under the OpenSSL license (the "License"). You may not use
6 // this file except in compliance with the License. You can obtain a copy
7 // in the file LICENSE in the source distribution or at
8 // https://www.openssl.org/source/license.html
10 // ====================================================================
11 // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 // project. The module is, however, dual licensed under OpenSSL and
13 // CRYPTOGAMS licenses depending on where you obtain it. For further
14 // details see http://www.openssl.org/~appro/cryptogams/.
16 // Permission to use under GPLv2 terms is granted.
17 // ====================================================================
19 // SHA256/512 for ARMv8.
21 // Performance in cycles per processed byte and improvement coefficient
22 // over code generated with "default" compiler:
24 // SHA256-hw SHA256(*) SHA512
25 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
26 // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
27 // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
28 // Denver 2.01 10.5 (+26%) 6.70 (+8%)
29 // X-Gene 20.0 (+100%) 12.8 (+300%(***))
30 // Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
31 // Kryo 1.92 17.4 (+30%) 11.2 (+8%)
33 // (*) Software SHA256 results are of lesser relevance, presented
34 // mostly for informational purposes.
35 // (**) The result is a trade-off: it's possible to improve it by
36 // 10% (or by 1 cycle per round), but at the cost of 20% loss
37 // on Cortex-A53 (or by 4 cycles per round).
38 // (***) Super-impressive coefficients over gcc-generated code are
39 // indication of some compiler "pathology", most notably code
40 // generated with -mgeneral-regs-only is significantly faster
41 // and the gap is only 40-90%.
45 // Originally it was reckoned that it makes no sense to implement NEON
46 // version of SHA256 for 64-bit processors. This is because performance
47 // improvement on most wide-spread Cortex-A5x processors was observed
48 // to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49 // observed that 32-bit NEON SHA256 performs significantly better than
50 // 64-bit scalar version on *some* of the more recent processors. As
51 // result 64-bit NEON version of SHA256 was added to provide best
52 // all-round performance. For example it executes ~30% faster on X-Gene
53 // and Mongoose. [For reference, NEON version of SHA512 is bound to
54 // deliver much less improvement, likely *negative* on Cortex-A5x.
55 // Which is why NEON support is limited to SHA256.]
58 # include "arm_arch.h"
// -----------------------------------------------------------------------
// void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num)
// Scalar AArch64 SHA-512 compression function (AAPCS64).
// In:  x0 = hash context (64-bit state words; ldp/stp on [x0,#k*8] below),
//      x1 = input pointer, x2 = block count (x2<<7 = bytes; 128 B/block).
// Saves x29/x30 plus callee-saved x19-x28 in a 128-byte stack frame and
// restores them in the epilogue.  x30 doubles as the K-table pointer.
// NOTE(review): this dump carries extraction line-number prefixes on every
// line and is missing interleaved lines (loop labels, ror/ldp/#endif).
// Code is left byte-identical; regenerate from sha512-armv8.pl instead of
// hand-editing ("Do not modify" per the file header).
// -----------------------------------------------------------------------
64 .globl sha512_block_data_order
65 .type sha512_block_data_order,%function
67 sha512_block_data_order:
70 ldrsw x16,.LOPENSSL_armcap_P
72 ldr x16,.LOPENSSL_armcap_P
74 adr x17,.LOPENSSL_armcap_P
80 stp x29,x30,[sp,#-128]!
90 ldp x20,x21,[x0] // load context
93 add x2,x1,x2,lsl#7 // end of input
100 ldr x19,[x30],#8 // *K++
101 eor x28,x21,x22 // magic seed
103 #ifndef __AARCH64EB__
// ---- fully unrolled SHA-512 rounds; the per-line comments below give the
// ---- round algebra (h += K[i]+X[i]+Sigma1(e)+Ch(e,f,g), then Maj/Sigma0).
107 add x27,x27,x19 // h+=K[i]
108 eor x6,x24,x24,ror#23
111 add x27,x27,x3 // h+=X[i]
112 orr x17,x17,x19 // Ch(e,f,g)
113 eor x19,x20,x21 // a^b, b^c in next round
114 eor x16,x16,x6,ror#18 // Sigma1(e)
116 add x27,x27,x17 // h+=Ch(e,f,g)
117 eor x17,x20,x20,ror#5
118 add x27,x27,x16 // h+=Sigma1(e)
119 and x28,x28,x19 // (b^c)&=(a^b)
120 add x23,x23,x27 // d+=h
121 eor x28,x28,x21 // Maj(a,b,c)
122 eor x17,x6,x17,ror#34 // Sigma0(a)
123 add x27,x27,x28 // h+=Maj(a,b,c)
124 ldr x28,[x30],#8 // *K++, x19 in next round
125 //add x27,x27,x17 // h+=Sigma0(a)
126 #ifndef __AARCH64EB__
130 add x27,x27,x17 // h+=Sigma0(a)
132 add x26,x26,x28 // h+=K[i]
133 eor x7,x23,x23,ror#23
136 add x26,x26,x4 // h+=X[i]
137 orr x17,x17,x28 // Ch(e,f,g)
138 eor x28,x27,x20 // a^b, b^c in next round
139 eor x16,x16,x7,ror#18 // Sigma1(e)
141 add x26,x26,x17 // h+=Ch(e,f,g)
142 eor x17,x27,x27,ror#5
143 add x26,x26,x16 // h+=Sigma1(e)
144 and x19,x19,x28 // (b^c)&=(a^b)
145 add x22,x22,x26 // d+=h
146 eor x19,x19,x20 // Maj(a,b,c)
147 eor x17,x7,x17,ror#34 // Sigma0(a)
148 add x26,x26,x19 // h+=Maj(a,b,c)
149 ldr x19,[x30],#8 // *K++, x28 in next round
150 //add x26,x26,x17 // h+=Sigma0(a)
151 #ifndef __AARCH64EB__
154 add x26,x26,x17 // h+=Sigma0(a)
156 add x25,x25,x19 // h+=K[i]
157 eor x8,x22,x22,ror#23
160 add x25,x25,x5 // h+=X[i]
161 orr x17,x17,x19 // Ch(e,f,g)
162 eor x19,x26,x27 // a^b, b^c in next round
163 eor x16,x16,x8,ror#18 // Sigma1(e)
165 add x25,x25,x17 // h+=Ch(e,f,g)
166 eor x17,x26,x26,ror#5
167 add x25,x25,x16 // h+=Sigma1(e)
168 and x28,x28,x19 // (b^c)&=(a^b)
169 add x21,x21,x25 // d+=h
170 eor x28,x28,x27 // Maj(a,b,c)
171 eor x17,x8,x17,ror#34 // Sigma0(a)
172 add x25,x25,x28 // h+=Maj(a,b,c)
173 ldr x28,[x30],#8 // *K++, x19 in next round
174 //add x25,x25,x17 // h+=Sigma0(a)
175 #ifndef __AARCH64EB__
179 add x25,x25,x17 // h+=Sigma0(a)
181 add x24,x24,x28 // h+=K[i]
182 eor x9,x21,x21,ror#23
185 add x24,x24,x6 // h+=X[i]
186 orr x17,x17,x28 // Ch(e,f,g)
187 eor x28,x25,x26 // a^b, b^c in next round
188 eor x16,x16,x9,ror#18 // Sigma1(e)
190 add x24,x24,x17 // h+=Ch(e,f,g)
191 eor x17,x25,x25,ror#5
192 add x24,x24,x16 // h+=Sigma1(e)
193 and x19,x19,x28 // (b^c)&=(a^b)
194 add x20,x20,x24 // d+=h
195 eor x19,x19,x26 // Maj(a,b,c)
196 eor x17,x9,x17,ror#34 // Sigma0(a)
197 add x24,x24,x19 // h+=Maj(a,b,c)
198 ldr x19,[x30],#8 // *K++, x28 in next round
199 //add x24,x24,x17 // h+=Sigma0(a)
200 #ifndef __AARCH64EB__
203 add x24,x24,x17 // h+=Sigma0(a)
205 add x23,x23,x19 // h+=K[i]
206 eor x10,x20,x20,ror#23
209 add x23,x23,x7 // h+=X[i]
210 orr x17,x17,x19 // Ch(e,f,g)
211 eor x19,x24,x25 // a^b, b^c in next round
212 eor x16,x16,x10,ror#18 // Sigma1(e)
214 add x23,x23,x17 // h+=Ch(e,f,g)
215 eor x17,x24,x24,ror#5
216 add x23,x23,x16 // h+=Sigma1(e)
217 and x28,x28,x19 // (b^c)&=(a^b)
218 add x27,x27,x23 // d+=h
219 eor x28,x28,x25 // Maj(a,b,c)
220 eor x17,x10,x17,ror#34 // Sigma0(a)
221 add x23,x23,x28 // h+=Maj(a,b,c)
222 ldr x28,[x30],#8 // *K++, x19 in next round
223 //add x23,x23,x17 // h+=Sigma0(a)
224 #ifndef __AARCH64EB__
228 add x23,x23,x17 // h+=Sigma0(a)
230 add x22,x22,x28 // h+=K[i]
231 eor x11,x27,x27,ror#23
234 add x22,x22,x8 // h+=X[i]
235 orr x17,x17,x28 // Ch(e,f,g)
236 eor x28,x23,x24 // a^b, b^c in next round
237 eor x16,x16,x11,ror#18 // Sigma1(e)
239 add x22,x22,x17 // h+=Ch(e,f,g)
240 eor x17,x23,x23,ror#5
241 add x22,x22,x16 // h+=Sigma1(e)
242 and x19,x19,x28 // (b^c)&=(a^b)
243 add x26,x26,x22 // d+=h
244 eor x19,x19,x24 // Maj(a,b,c)
245 eor x17,x11,x17,ror#34 // Sigma0(a)
246 add x22,x22,x19 // h+=Maj(a,b,c)
247 ldr x19,[x30],#8 // *K++, x28 in next round
248 //add x22,x22,x17 // h+=Sigma0(a)
249 #ifndef __AARCH64EB__
252 add x22,x22,x17 // h+=Sigma0(a)
254 add x21,x21,x19 // h+=K[i]
255 eor x12,x26,x26,ror#23
258 add x21,x21,x9 // h+=X[i]
259 orr x17,x17,x19 // Ch(e,f,g)
260 eor x19,x22,x23 // a^b, b^c in next round
261 eor x16,x16,x12,ror#18 // Sigma1(e)
263 add x21,x21,x17 // h+=Ch(e,f,g)
264 eor x17,x22,x22,ror#5
265 add x21,x21,x16 // h+=Sigma1(e)
266 and x28,x28,x19 // (b^c)&=(a^b)
267 add x25,x25,x21 // d+=h
268 eor x28,x28,x23 // Maj(a,b,c)
269 eor x17,x12,x17,ror#34 // Sigma0(a)
270 add x21,x21,x28 // h+=Maj(a,b,c)
271 ldr x28,[x30],#8 // *K++, x19 in next round
272 //add x21,x21,x17 // h+=Sigma0(a)
273 #ifndef __AARCH64EB__
276 ldp x11,x12,[x1],#2*8
277 add x21,x21,x17 // h+=Sigma0(a)
279 add x20,x20,x28 // h+=K[i]
280 eor x13,x25,x25,ror#23
283 add x20,x20,x10 // h+=X[i]
284 orr x17,x17,x28 // Ch(e,f,g)
285 eor x28,x21,x22 // a^b, b^c in next round
286 eor x16,x16,x13,ror#18 // Sigma1(e)
288 add x20,x20,x17 // h+=Ch(e,f,g)
289 eor x17,x21,x21,ror#5
290 add x20,x20,x16 // h+=Sigma1(e)
291 and x19,x19,x28 // (b^c)&=(a^b)
292 add x24,x24,x20 // d+=h
293 eor x19,x19,x22 // Maj(a,b,c)
294 eor x17,x13,x17,ror#34 // Sigma0(a)
295 add x20,x20,x19 // h+=Maj(a,b,c)
296 ldr x19,[x30],#8 // *K++, x28 in next round
297 //add x20,x20,x17 // h+=Sigma0(a)
298 #ifndef __AARCH64EB__
301 add x20,x20,x17 // h+=Sigma0(a)
303 add x27,x27,x19 // h+=K[i]
304 eor x14,x24,x24,ror#23
307 add x27,x27,x11 // h+=X[i]
308 orr x17,x17,x19 // Ch(e,f,g)
309 eor x19,x20,x21 // a^b, b^c in next round
310 eor x16,x16,x14,ror#18 // Sigma1(e)
312 add x27,x27,x17 // h+=Ch(e,f,g)
313 eor x17,x20,x20,ror#5
314 add x27,x27,x16 // h+=Sigma1(e)
315 and x28,x28,x19 // (b^c)&=(a^b)
316 add x23,x23,x27 // d+=h
317 eor x28,x28,x21 // Maj(a,b,c)
318 eor x17,x14,x17,ror#34 // Sigma0(a)
319 add x27,x27,x28 // h+=Maj(a,b,c)
320 ldr x28,[x30],#8 // *K++, x19 in next round
321 //add x27,x27,x17 // h+=Sigma0(a)
322 #ifndef __AARCH64EB__
325 ldp x13,x14,[x1],#2*8
326 add x27,x27,x17 // h+=Sigma0(a)
328 add x26,x26,x28 // h+=K[i]
329 eor x15,x23,x23,ror#23
332 add x26,x26,x12 // h+=X[i]
333 orr x17,x17,x28 // Ch(e,f,g)
334 eor x28,x27,x20 // a^b, b^c in next round
335 eor x16,x16,x15,ror#18 // Sigma1(e)
337 add x26,x26,x17 // h+=Ch(e,f,g)
338 eor x17,x27,x27,ror#5
339 add x26,x26,x16 // h+=Sigma1(e)
340 and x19,x19,x28 // (b^c)&=(a^b)
341 add x22,x22,x26 // d+=h
342 eor x19,x19,x20 // Maj(a,b,c)
343 eor x17,x15,x17,ror#34 // Sigma0(a)
344 add x26,x26,x19 // h+=Maj(a,b,c)
345 ldr x19,[x30],#8 // *K++, x28 in next round
346 //add x26,x26,x17 // h+=Sigma0(a)
347 #ifndef __AARCH64EB__
350 add x26,x26,x17 // h+=Sigma0(a)
352 add x25,x25,x19 // h+=K[i]
353 eor x0,x22,x22,ror#23
356 add x25,x25,x13 // h+=X[i]
357 orr x17,x17,x19 // Ch(e,f,g)
358 eor x19,x26,x27 // a^b, b^c in next round
359 eor x16,x16,x0,ror#18 // Sigma1(e)
361 add x25,x25,x17 // h+=Ch(e,f,g)
362 eor x17,x26,x26,ror#5
363 add x25,x25,x16 // h+=Sigma1(e)
364 and x28,x28,x19 // (b^c)&=(a^b)
365 add x21,x21,x25 // d+=h
366 eor x28,x28,x27 // Maj(a,b,c)
367 eor x17,x0,x17,ror#34 // Sigma0(a)
368 add x25,x25,x28 // h+=Maj(a,b,c)
369 ldr x28,[x30],#8 // *K++, x19 in next round
370 //add x25,x25,x17 // h+=Sigma0(a)
371 #ifndef __AARCH64EB__
375 add x25,x25,x17 // h+=Sigma0(a)
378 add x24,x24,x28 // h+=K[i]
379 eor x6,x21,x21,ror#23
382 add x24,x24,x14 // h+=X[i]
383 orr x17,x17,x28 // Ch(e,f,g)
384 eor x28,x25,x26 // a^b, b^c in next round
385 eor x16,x16,x6,ror#18 // Sigma1(e)
387 add x24,x24,x17 // h+=Ch(e,f,g)
388 eor x17,x25,x25,ror#5
389 add x24,x24,x16 // h+=Sigma1(e)
390 and x19,x19,x28 // (b^c)&=(a^b)
391 add x20,x20,x24 // d+=h
392 eor x19,x19,x26 // Maj(a,b,c)
393 eor x17,x6,x17,ror#34 // Sigma0(a)
394 add x24,x24,x19 // h+=Maj(a,b,c)
395 ldr x19,[x30],#8 // *K++, x28 in next round
396 //add x24,x24,x17 // h+=Sigma0(a)
397 #ifndef __AARCH64EB__
400 add x24,x24,x17 // h+=Sigma0(a)
403 add x23,x23,x19 // h+=K[i]
404 eor x7,x20,x20,ror#23
407 add x23,x23,x15 // h+=X[i]
408 orr x17,x17,x19 // Ch(e,f,g)
409 eor x19,x24,x25 // a^b, b^c in next round
410 eor x16,x16,x7,ror#18 // Sigma1(e)
412 add x23,x23,x17 // h+=Ch(e,f,g)
413 eor x17,x24,x24,ror#5
414 add x23,x23,x16 // h+=Sigma1(e)
415 and x28,x28,x19 // (b^c)&=(a^b)
416 add x27,x27,x23 // d+=h
417 eor x28,x28,x25 // Maj(a,b,c)
418 eor x17,x7,x17,ror#34 // Sigma0(a)
419 add x23,x23,x28 // h+=Maj(a,b,c)
420 ldr x28,[x30],#8 // *K++, x19 in next round
421 //add x23,x23,x17 // h+=Sigma0(a)
422 #ifndef __AARCH64EB__
426 add x23,x23,x17 // h+=Sigma0(a)
429 add x22,x22,x28 // h+=K[i]
430 eor x8,x27,x27,ror#23
433 add x22,x22,x0 // h+=X[i]
434 orr x17,x17,x28 // Ch(e,f,g)
435 eor x28,x23,x24 // a^b, b^c in next round
436 eor x16,x16,x8,ror#18 // Sigma1(e)
438 add x22,x22,x17 // h+=Ch(e,f,g)
439 eor x17,x23,x23,ror#5
440 add x22,x22,x16 // h+=Sigma1(e)
441 and x19,x19,x28 // (b^c)&=(a^b)
442 add x26,x26,x22 // d+=h
443 eor x19,x19,x24 // Maj(a,b,c)
444 eor x17,x8,x17,ror#34 // Sigma0(a)
445 add x22,x22,x19 // h+=Maj(a,b,c)
446 ldr x19,[x30],#8 // *K++, x28 in next round
447 //add x22,x22,x17 // h+=Sigma0(a)
448 #ifndef __AARCH64EB__
452 add x22,x22,x17 // h+=Sigma0(a)
455 add x21,x21,x19 // h+=K[i]
456 eor x9,x26,x26,ror#23
459 add x21,x21,x1 // h+=X[i]
460 orr x17,x17,x19 // Ch(e,f,g)
461 eor x19,x22,x23 // a^b, b^c in next round
462 eor x16,x16,x9,ror#18 // Sigma1(e)
464 add x21,x21,x17 // h+=Ch(e,f,g)
465 eor x17,x22,x22,ror#5
466 add x21,x21,x16 // h+=Sigma1(e)
467 and x28,x28,x19 // (b^c)&=(a^b)
468 add x25,x25,x21 // d+=h
469 eor x28,x28,x23 // Maj(a,b,c)
470 eor x17,x9,x17,ror#34 // Sigma0(a)
471 add x21,x21,x28 // h+=Maj(a,b,c)
472 ldr x28,[x30],#8 // *K++, x19 in next round
473 //add x21,x21,x17 // h+=Sigma0(a)
474 #ifndef __AARCH64EB__
478 add x21,x21,x17 // h+=Sigma0(a)
// ---- rounds 16..79 (.Loop_16_xx body): message-schedule expansion
// ---- (sigma0(X[i+1]) / sigma1(X[i+14]) per inline comments) fused with
// ---- the same round function as above.
481 add x20,x20,x28 // h+=K[i]
487 add x20,x20,x2 // h+=X[i]
488 eor x16,x16,x25,ror#18
490 orr x17,x17,x28 // Ch(e,f,g)
491 eor x28,x21,x22 // a^b, b^c in next round
492 eor x16,x16,x25,ror#41 // Sigma1(e)
493 eor x10,x10,x21,ror#34
494 add x20,x20,x17 // h+=Ch(e,f,g)
495 and x19,x19,x28 // (b^c)&=(a^b)
497 eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
498 add x20,x20,x16 // h+=Sigma1(e)
499 eor x19,x19,x22 // Maj(a,b,c)
500 eor x17,x10,x21,ror#39 // Sigma0(a)
501 eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
503 add x24,x24,x20 // d+=h
504 add x20,x20,x19 // h+=Maj(a,b,c)
505 ldr x19,[x30],#8 // *K++, x28 in next round
507 add x20,x20,x17 // h+=Sigma0(a)
513 add x27,x27,x19 // h+=K[i]
519 add x27,x27,x3 // h+=X[i]
520 eor x16,x16,x24,ror#18
522 orr x17,x17,x19 // Ch(e,f,g)
523 eor x19,x20,x21 // a^b, b^c in next round
524 eor x16,x16,x24,ror#41 // Sigma1(e)
525 eor x11,x11,x20,ror#34
526 add x27,x27,x17 // h+=Ch(e,f,g)
527 and x28,x28,x19 // (b^c)&=(a^b)
529 eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
530 add x27,x27,x16 // h+=Sigma1(e)
531 eor x28,x28,x21 // Maj(a,b,c)
532 eor x17,x11,x20,ror#39 // Sigma0(a)
533 eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
535 add x23,x23,x27 // d+=h
536 add x27,x27,x28 // h+=Maj(a,b,c)
537 ldr x28,[x30],#8 // *K++, x19 in next round
539 add x27,x27,x17 // h+=Sigma0(a)
544 add x26,x26,x28 // h+=K[i]
550 add x26,x26,x4 // h+=X[i]
551 eor x16,x16,x23,ror#18
553 orr x17,x17,x28 // Ch(e,f,g)
554 eor x28,x27,x20 // a^b, b^c in next round
555 eor x16,x16,x23,ror#41 // Sigma1(e)
556 eor x12,x12,x27,ror#34
557 add x26,x26,x17 // h+=Ch(e,f,g)
558 and x19,x19,x28 // (b^c)&=(a^b)
559 eor x10,x10,x3,ror#61
560 eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
561 add x26,x26,x16 // h+=Sigma1(e)
562 eor x19,x19,x20 // Maj(a,b,c)
563 eor x17,x12,x27,ror#39 // Sigma0(a)
564 eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
566 add x22,x22,x26 // d+=h
567 add x26,x26,x19 // h+=Maj(a,b,c)
568 ldr x19,[x30],#8 // *K++, x28 in next round
570 add x26,x26,x17 // h+=Sigma0(a)
575 add x25,x25,x19 // h+=K[i]
581 add x25,x25,x5 // h+=X[i]
582 eor x16,x16,x22,ror#18
584 orr x17,x17,x19 // Ch(e,f,g)
585 eor x19,x26,x27 // a^b, b^c in next round
586 eor x16,x16,x22,ror#41 // Sigma1(e)
587 eor x13,x13,x26,ror#34
588 add x25,x25,x17 // h+=Ch(e,f,g)
589 and x28,x28,x19 // (b^c)&=(a^b)
590 eor x11,x11,x4,ror#61
591 eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
592 add x25,x25,x16 // h+=Sigma1(e)
593 eor x28,x28,x27 // Maj(a,b,c)
594 eor x17,x13,x26,ror#39 // Sigma0(a)
595 eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
597 add x21,x21,x25 // d+=h
598 add x25,x25,x28 // h+=Maj(a,b,c)
599 ldr x28,[x30],#8 // *K++, x19 in next round
601 add x25,x25,x17 // h+=Sigma0(a)
606 add x24,x24,x28 // h+=K[i]
612 add x24,x24,x6 // h+=X[i]
613 eor x16,x16,x21,ror#18
615 orr x17,x17,x28 // Ch(e,f,g)
616 eor x28,x25,x26 // a^b, b^c in next round
617 eor x16,x16,x21,ror#41 // Sigma1(e)
618 eor x14,x14,x25,ror#34
619 add x24,x24,x17 // h+=Ch(e,f,g)
620 and x19,x19,x28 // (b^c)&=(a^b)
621 eor x12,x12,x5,ror#61
622 eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
623 add x24,x24,x16 // h+=Sigma1(e)
624 eor x19,x19,x26 // Maj(a,b,c)
625 eor x17,x14,x25,ror#39 // Sigma0(a)
626 eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
628 add x20,x20,x24 // d+=h
629 add x24,x24,x19 // h+=Maj(a,b,c)
630 ldr x19,[x30],#8 // *K++, x28 in next round
632 add x24,x24,x17 // h+=Sigma0(a)
637 add x23,x23,x19 // h+=K[i]
643 add x23,x23,x7 // h+=X[i]
644 eor x16,x16,x20,ror#18
646 orr x17,x17,x19 // Ch(e,f,g)
647 eor x19,x24,x25 // a^b, b^c in next round
648 eor x16,x16,x20,ror#41 // Sigma1(e)
649 eor x15,x15,x24,ror#34
650 add x23,x23,x17 // h+=Ch(e,f,g)
651 and x28,x28,x19 // (b^c)&=(a^b)
652 eor x13,x13,x6,ror#61
653 eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
654 add x23,x23,x16 // h+=Sigma1(e)
655 eor x28,x28,x25 // Maj(a,b,c)
656 eor x17,x15,x24,ror#39 // Sigma0(a)
657 eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
659 add x27,x27,x23 // d+=h
660 add x23,x23,x28 // h+=Maj(a,b,c)
661 ldr x28,[x30],#8 // *K++, x19 in next round
663 add x23,x23,x17 // h+=Sigma0(a)
668 add x22,x22,x28 // h+=K[i]
674 add x22,x22,x8 // h+=X[i]
675 eor x16,x16,x27,ror#18
676 eor x15,x15,x10,ror#8
677 orr x17,x17,x28 // Ch(e,f,g)
678 eor x28,x23,x24 // a^b, b^c in next round
679 eor x16,x16,x27,ror#41 // Sigma1(e)
681 add x22,x22,x17 // h+=Ch(e,f,g)
682 and x19,x19,x28 // (b^c)&=(a^b)
683 eor x14,x14,x7,ror#61
684 eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
685 add x22,x22,x16 // h+=Sigma1(e)
686 eor x19,x19,x24 // Maj(a,b,c)
687 eor x17,x0,x23,ror#39 // Sigma0(a)
688 eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
690 add x26,x26,x22 // d+=h
691 add x22,x22,x19 // h+=Maj(a,b,c)
692 ldr x19,[x30],#8 // *K++, x28 in next round
694 add x22,x22,x17 // h+=Sigma0(a)
699 add x21,x21,x19 // h+=K[i]
705 add x21,x21,x9 // h+=X[i]
706 eor x16,x16,x26,ror#18
708 orr x17,x17,x19 // Ch(e,f,g)
709 eor x19,x22,x23 // a^b, b^c in next round
710 eor x16,x16,x26,ror#41 // Sigma1(e)
712 add x21,x21,x17 // h+=Ch(e,f,g)
713 and x28,x28,x19 // (b^c)&=(a^b)
714 eor x15,x15,x8,ror#61
715 eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
716 add x21,x21,x16 // h+=Sigma1(e)
717 eor x28,x28,x23 // Maj(a,b,c)
718 eor x17,x1,x22,ror#39 // Sigma0(a)
719 eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
721 add x25,x25,x21 // d+=h
722 add x21,x21,x28 // h+=Maj(a,b,c)
723 ldr x28,[x30],#8 // *K++, x19 in next round
725 add x21,x21,x17 // h+=Sigma0(a)
730 add x20,x20,x28 // h+=K[i]
736 add x20,x20,x10 // h+=X[i]
737 eor x16,x16,x25,ror#18
739 orr x17,x17,x28 // Ch(e,f,g)
740 eor x28,x21,x22 // a^b, b^c in next round
741 eor x16,x16,x25,ror#41 // Sigma1(e)
743 add x20,x20,x17 // h+=Ch(e,f,g)
744 and x19,x19,x28 // (b^c)&=(a^b)
746 eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
747 add x20,x20,x16 // h+=Sigma1(e)
748 eor x19,x19,x22 // Maj(a,b,c)
749 eor x17,x2,x21,ror#39 // Sigma0(a)
750 eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
752 add x24,x24,x20 // d+=h
753 add x20,x20,x19 // h+=Maj(a,b,c)
754 ldr x19,[x30],#8 // *K++, x28 in next round
756 add x20,x20,x17 // h+=Sigma0(a)
761 add x27,x27,x19 // h+=K[i]
767 add x27,x27,x11 // h+=X[i]
768 eor x16,x16,x24,ror#18
770 orr x17,x17,x19 // Ch(e,f,g)
771 eor x19,x20,x21 // a^b, b^c in next round
772 eor x16,x16,x24,ror#41 // Sigma1(e)
774 add x27,x27,x17 // h+=Ch(e,f,g)
775 and x28,x28,x19 // (b^c)&=(a^b)
777 eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
778 add x27,x27,x16 // h+=Sigma1(e)
779 eor x28,x28,x21 // Maj(a,b,c)
780 eor x17,x3,x20,ror#39 // Sigma0(a)
781 eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
783 add x23,x23,x27 // d+=h
784 add x27,x27,x28 // h+=Maj(a,b,c)
785 ldr x28,[x30],#8 // *K++, x19 in next round
787 add x27,x27,x17 // h+=Sigma0(a)
792 add x26,x26,x28 // h+=K[i]
798 add x26,x26,x12 // h+=X[i]
799 eor x16,x16,x23,ror#18
801 orr x17,x17,x28 // Ch(e,f,g)
802 eor x28,x27,x20 // a^b, b^c in next round
803 eor x16,x16,x23,ror#41 // Sigma1(e)
805 add x26,x26,x17 // h+=Ch(e,f,g)
806 and x19,x19,x28 // (b^c)&=(a^b)
808 eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
809 add x26,x26,x16 // h+=Sigma1(e)
810 eor x19,x19,x20 // Maj(a,b,c)
811 eor x17,x4,x27,ror#39 // Sigma0(a)
812 eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
814 add x22,x22,x26 // d+=h
815 add x26,x26,x19 // h+=Maj(a,b,c)
816 ldr x19,[x30],#8 // *K++, x28 in next round
818 add x26,x26,x17 // h+=Sigma0(a)
823 add x25,x25,x19 // h+=K[i]
829 add x25,x25,x13 // h+=X[i]
830 eor x16,x16,x22,ror#18
832 orr x17,x17,x19 // Ch(e,f,g)
833 eor x19,x26,x27 // a^b, b^c in next round
834 eor x16,x16,x22,ror#41 // Sigma1(e)
836 add x25,x25,x17 // h+=Ch(e,f,g)
837 and x28,x28,x19 // (b^c)&=(a^b)
839 eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
840 add x25,x25,x16 // h+=Sigma1(e)
841 eor x28,x28,x27 // Maj(a,b,c)
842 eor x17,x5,x26,ror#39 // Sigma0(a)
843 eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
845 add x21,x21,x25 // d+=h
846 add x25,x25,x28 // h+=Maj(a,b,c)
847 ldr x28,[x30],#8 // *K++, x19 in next round
849 add x25,x25,x17 // h+=Sigma0(a)
854 add x24,x24,x28 // h+=K[i]
860 add x24,x24,x14 // h+=X[i]
861 eor x16,x16,x21,ror#18
863 orr x17,x17,x28 // Ch(e,f,g)
864 eor x28,x25,x26 // a^b, b^c in next round
865 eor x16,x16,x21,ror#41 // Sigma1(e)
867 add x24,x24,x17 // h+=Ch(e,f,g)
868 and x19,x19,x28 // (b^c)&=(a^b)
870 eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
871 add x24,x24,x16 // h+=Sigma1(e)
872 eor x19,x19,x26 // Maj(a,b,c)
873 eor x17,x6,x25,ror#39 // Sigma0(a)
874 eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
876 add x20,x20,x24 // d+=h
877 add x24,x24,x19 // h+=Maj(a,b,c)
878 ldr x19,[x30],#8 // *K++, x28 in next round
880 add x24,x24,x17 // h+=Sigma0(a)
885 add x23,x23,x19 // h+=K[i]
891 add x23,x23,x15 // h+=X[i]
892 eor x16,x16,x20,ror#18
894 orr x17,x17,x19 // Ch(e,f,g)
895 eor x19,x24,x25 // a^b, b^c in next round
896 eor x16,x16,x20,ror#41 // Sigma1(e)
898 add x23,x23,x17 // h+=Ch(e,f,g)
899 and x28,x28,x19 // (b^c)&=(a^b)
901 eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
902 add x23,x23,x16 // h+=Sigma1(e)
903 eor x28,x28,x25 // Maj(a,b,c)
904 eor x17,x7,x24,ror#39 // Sigma0(a)
905 eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
907 add x27,x27,x23 // d+=h
908 add x23,x23,x28 // h+=Maj(a,b,c)
909 ldr x28,[x30],#8 // *K++, x19 in next round
911 add x23,x23,x17 // h+=Sigma0(a)
916 add x22,x22,x28 // h+=K[i]
922 add x22,x22,x0 // h+=X[i]
923 eor x16,x16,x27,ror#18
925 orr x17,x17,x28 // Ch(e,f,g)
926 eor x28,x23,x24 // a^b, b^c in next round
927 eor x16,x16,x27,ror#41 // Sigma1(e)
929 add x22,x22,x17 // h+=Ch(e,f,g)
930 and x19,x19,x28 // (b^c)&=(a^b)
932 eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
933 add x22,x22,x16 // h+=Sigma1(e)
934 eor x19,x19,x24 // Maj(a,b,c)
935 eor x17,x8,x23,ror#39 // Sigma0(a)
936 eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
938 add x26,x26,x22 // d+=h
939 add x22,x22,x19 // h+=Maj(a,b,c)
940 ldr x19,[x30],#8 // *K++, x28 in next round
942 add x22,x22,x17 // h+=Sigma0(a)
947 add x21,x21,x19 // h+=K[i]
953 add x21,x21,x1 // h+=X[i]
954 eor x16,x16,x26,ror#18
956 orr x17,x17,x19 // Ch(e,f,g)
957 eor x19,x22,x23 // a^b, b^c in next round
958 eor x16,x16,x26,ror#41 // Sigma1(e)
960 add x21,x21,x17 // h+=Ch(e,f,g)
961 and x28,x28,x19 // (b^c)&=(a^b)
963 eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
964 add x21,x21,x16 // h+=Sigma1(e)
965 eor x28,x28,x23 // Maj(a,b,c)
966 eor x17,x9,x22,ror#39 // Sigma0(a)
967 eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
969 add x25,x25,x21 // d+=h
970 add x21,x21,x28 // h+=Maj(a,b,c)
971 ldr x28,[x30],#8 // *K++, x19 in next round
973 add x21,x21,x17 // h+=Sigma0(a)
978 add x20,x20,x28 // h+=K[i]
984 add x20,x20,x2 // h+=X[i]
985 eor x16,x16,x25,ror#18
987 orr x17,x17,x28 // Ch(e,f,g)
988 eor x28,x21,x22 // a^b, b^c in next round
989 eor x16,x16,x25,ror#41 // Sigma1(e)
990 eor x10,x10,x21,ror#34
991 add x20,x20,x17 // h+=Ch(e,f,g)
992 and x19,x19,x28 // (b^c)&=(a^b)
994 eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
995 add x20,x20,x16 // h+=Sigma1(e)
996 eor x19,x19,x22 // Maj(a,b,c)
997 eor x17,x10,x21,ror#39 // Sigma0(a)
998 eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
1000 add x24,x24,x20 // d+=h
1001 add x20,x20,x19 // h+=Maj(a,b,c)
1002 ldr x19,[x30],#8 // *K++, x28 in next round
1004 add x20,x20,x17 // h+=Sigma0(a)
// x19 now holds the next K word; the .LK512 table ends in a zero
// terminator, so cbnz falls through after the last round.
1006 cbnz x19,.Loop_16_xx
// ---- epilogue: rewind K pointer, advance input, fold the working
// ---- variables back into the context, restore callee-saved registers.
1010 sub x30,x30,#648 // rewind
1014 add x1,x1,#14*8 // advance input pointer
1017 ldp x9,x10,[x0,#6*8]
1024 stp x22,x23,[x0,#2*8]
1028 stp x24,x25,[x0,#4*8]
1029 stp x26,x27,[x0,#6*8]
1032 ldp x19,x20,[x29,#16]
1034 ldp x21,x22,[x29,#32]
1035 ldp x23,x24,[x29,#48]
1036 ldp x25,x26,[x29,#64]
1037 ldp x27,x28,[x29,#80]
1038 ldp x29,x30,[sp],#128
1040 .size sha512_block_data_order,.-sha512_block_data_order
// SHA-512 round-constant table K[0..79] (FIPS 180-4: the first 64 bits of
// the fractional parts of the cube roots of the first eighty primes),
// laid out two constants per .quad pair.  The trailing zero .quad is a
// sentinel: the round loop above exits when the loaded K word is zero
// (cbnz x19,.Loop_16_xx).
1043 .type .LK512,%object
1045 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
1046 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1047 .quad 0x3956c25bf348b538,0x59f111f1b605d019
1048 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
1049 .quad 0xd807aa98a3030242,0x12835b0145706fbe
1050 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1051 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
1052 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
1053 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
1054 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1055 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
1056 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1057 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
1058 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
1059 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
1060 .quad 0x06ca6351e003826f,0x142929670a0e6e70
1061 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
1062 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1063 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
1064 .quad 0x81c2c92e47edaee6,0x92722c851482353b
1065 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
1066 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
1067 .quad 0xd192e819d6ef5218,0xd69906245565a910
1068 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
1069 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
1070 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1071 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1072 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1073 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
1074 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
1075 .quad 0x90befffa23631e28,0xa4506cebde82bde9
1076 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
1077 .quad 0xca273eceea26619c,0xd186b8c721c0c207
1078 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1079 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
1080 .quad 0x113f9804bef90dae,0x1b710b35131c471b
1081 .quad 0x28db77f523047d84,0x32caab7b40c72493
1082 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1083 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1084 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
1085 .quad 0 // terminator
1086 .size .LK512,.-.LK512
1091 .long OPENSSL_armcap_P-.
1093 .quad OPENSSL_armcap_P-.
1096 .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1100 .type sha512_block_armv8,%function
1104 stp x29,x30,[sp,#-16]!
1107 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
1108 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1110 ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
1113 rev64 v16.16b,v16.16b
1114 rev64 v17.16b,v17.16b
1115 rev64 v18.16b,v18.16b
1116 rev64 v19.16b,v19.16b
1117 rev64 v20.16b,v20.16b
1118 rev64 v21.16b,v21.16b
1119 rev64 v22.16b,v22.16b
1120 rev64 v23.16b,v23.16b
1125 ld1 {v24.2d},[x3],#16
1128 orr v26.16b,v0.16b,v0.16b // offload
1129 orr v27.16b,v1.16b,v1.16b
1130 orr v28.16b,v2.16b,v2.16b
1131 orr v29.16b,v3.16b,v3.16b
1132 csel x1,x1,x4,ne // conditional rewind
1133 add v24.2d,v24.2d,v16.2d
1134 ld1 {v25.2d},[x3],#16
1135 ext v24.16b,v24.16b,v24.16b,#8
1136 ext v5.16b,v2.16b,v3.16b,#8
1137 ext v6.16b,v1.16b,v2.16b,#8
1138 add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
1139 .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
1140 ext v7.16b,v20.16b,v21.16b,#8
1141 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
1142 .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
1143 add v4.2d,v1.2d,v3.2d // "D + T1"
1144 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
1145 add v25.2d,v25.2d,v17.2d
1146 ld1 {v24.2d},[x3],#16
1147 ext v25.16b,v25.16b,v25.16b,#8
1148 ext v5.16b,v4.16b,v2.16b,#8
1149 ext v6.16b,v0.16b,v4.16b,#8
1150 add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
1151 .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
1152 ext v7.16b,v21.16b,v22.16b,#8
1153 .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
1154 .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
1155 add v1.2d,v0.2d,v2.2d // "D + T1"
1156 .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
1157 add v24.2d,v24.2d,v18.2d
1158 ld1 {v25.2d},[x3],#16
1159 ext v24.16b,v24.16b,v24.16b,#8
1160 ext v5.16b,v1.16b,v4.16b,#8
1161 ext v6.16b,v3.16b,v1.16b,#8
1162 add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
1163 .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
1164 ext v7.16b,v22.16b,v23.16b,#8
1165 .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
1166 .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
1167 add v0.2d,v3.2d,v4.2d // "D + T1"
1168 .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
1169 add v25.2d,v25.2d,v19.2d
1170 ld1 {v24.2d},[x3],#16
1171 ext v25.16b,v25.16b,v25.16b,#8
1172 ext v5.16b,v0.16b,v1.16b,#8
1173 ext v6.16b,v2.16b,v0.16b,#8
1174 add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
1175 .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
1176 ext v7.16b,v23.16b,v16.16b,#8
1177 .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
1178 .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
1179 add v3.2d,v2.2d,v1.2d // "D + T1"
1180 .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
1181 add v24.2d,v24.2d,v20.2d
1182 ld1 {v25.2d},[x3],#16
1183 ext v24.16b,v24.16b,v24.16b,#8
1184 ext v5.16b,v3.16b,v0.16b,#8
1185 ext v6.16b,v4.16b,v3.16b,#8
1186 add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
1187 .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
1188 ext v7.16b,v16.16b,v17.16b,#8
1189 .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
1190 .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
1191 add v2.2d,v4.2d,v0.2d // "D + T1"
1192 .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
1193 add v25.2d,v25.2d,v21.2d
1194 ld1 {v24.2d},[x3],#16
1195 ext v25.16b,v25.16b,v25.16b,#8
1196 ext v5.16b,v2.16b,v3.16b,#8
1197 ext v6.16b,v1.16b,v2.16b,#8
1198 add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
1199 .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
1200 ext v7.16b,v17.16b,v18.16b,#8
1201 .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
1202 .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
1203 add v4.2d,v1.2d,v3.2d // "D + T1"
1204 .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
1205 add v24.2d,v24.2d,v22.2d
1206 ld1 {v25.2d},[x3],#16
// -----------------------------------------------------------------------
// SHA512 rounds using the ARMv8.2 SHA512 crypto extensions (instructions
// emitted as raw .inst encodings for assemblers that lack the mnemonics;
// the intended mnemonic is shown in each trailing comment).
//
// Register roles (as established by the visible code and its comments):
//   x3        - pointer into the K512 round-constant table (advanced #16
//               per round; rewound by 80*8 bytes at end of block below)
//   v16-v23   - 16 message-schedule words W[i] (two 64-bit words per reg),
//               expanded in place by sha512su0/sha512su1
//   v0-v4     - working hash-state variables, rotated among registers
//               each round (see "T1 + H + K512[i]" / "D + T1" comments)
//   v24/v25   - alternate as (K512[i] + W[i]) round-input accumulators
//   v5,v6,v7  - ext-shuffled operand halves feeding sha512h/sha512su1
//
// Each round follows the same software-pipelined pattern:
//   add  v2x, v2x, W      ; K512[i] + W[i]
//   ld1  {v2y},[x3],#16   ; prefetch next K512 pair
//   ext  ...              ; rotate halves into position
//   sha512su0/sha512su1   ; message schedule expansion
//   sha512h / sha512h2    ; two SHA512 rounds
// Do not reorder: the interleaving is deliberate latency hiding.
// NOTE(review): the loop head and state load precede this view.
// -----------------------------------------------------------------------
1207	 ext	v24.16b,v24.16b,v24.16b,#8
1208	 ext	v5.16b,v4.16b,v2.16b,#8
1209	 ext	v6.16b,v0.16b,v4.16b,#8
1210	 add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1211	.inst	0xcec082f6		//sha512su0	v22.16b,v23.16b
1212	 ext	v7.16b,v18.16b,v19.16b,#8
1213	.inst	0xce6680a2		//sha512h	v2.16b,v5.16b,v6.16b
1214	.inst	0xce678ab6		//sha512su1	v22.16b,v21.16b,v7.16b
1215	 add	v1.2d,v0.2d,v2.2d		// "D + T1"
1216	.inst	0xce638402		//sha512h2	v2.16b,v0.16b,v3.16b
1217	 add	v25.2d,v25.2d,v23.2d
1218	 ld1	{v24.2d},[x3],#16
1219	 ext	v25.16b,v25.16b,v25.16b,#8
1220	 ext	v5.16b,v1.16b,v4.16b,#8
1221	 ext	v6.16b,v3.16b,v1.16b,#8
1222	 add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1223	.inst	0xcec08217		//sha512su0	v23.16b,v16.16b
1224	 ext	v7.16b,v19.16b,v20.16b,#8
1225	.inst	0xce6680a4		//sha512h	v4.16b,v5.16b,v6.16b
1226	.inst	0xce678ad7		//sha512su1	v23.16b,v22.16b,v7.16b
1227	 add	v0.2d,v3.2d,v4.2d		// "D + T1"
1228	.inst	0xce628464		//sha512h2	v4.16b,v3.16b,v2.16b
1229	 add	v24.2d,v24.2d,v16.2d
1230	 ld1	{v25.2d},[x3],#16
1231	 ext	v24.16b,v24.16b,v24.16b,#8
1232	 ext	v5.16b,v0.16b,v1.16b,#8
1233	 ext	v6.16b,v2.16b,v0.16b,#8
1234	 add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1235	.inst	0xcec08230		//sha512su0	v16.16b,v17.16b
1236	 ext	v7.16b,v20.16b,v21.16b,#8
1237	.inst	0xce6680a1		//sha512h	v1.16b,v5.16b,v6.16b
1238	.inst	0xce678af0		//sha512su1	v16.16b,v23.16b,v7.16b
1239	 add	v3.2d,v2.2d,v1.2d		// "D + T1"
1240	.inst	0xce648441		//sha512h2	v1.16b,v2.16b,v4.16b
1241	 add	v25.2d,v25.2d,v17.2d
1242	 ld1	{v24.2d},[x3],#16
1243	 ext	v25.16b,v25.16b,v25.16b,#8
1244	 ext	v5.16b,v3.16b,v0.16b,#8
1245	 ext	v6.16b,v4.16b,v3.16b,#8
1246	 add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1247	.inst	0xcec08251		//sha512su0	v17.16b,v18.16b
1248	 ext	v7.16b,v21.16b,v22.16b,#8
1249	.inst	0xce6680a0		//sha512h	v0.16b,v5.16b,v6.16b
1250	.inst	0xce678a11		//sha512su1	v17.16b,v16.16b,v7.16b
1251	 add	v2.2d,v4.2d,v0.2d		// "D + T1"
1252	.inst	0xce618480		//sha512h2	v0.16b,v4.16b,v1.16b
1253	 add	v24.2d,v24.2d,v18.2d
1254	 ld1	{v25.2d},[x3],#16
1255	 ext	v24.16b,v24.16b,v24.16b,#8
1256	 ext	v5.16b,v2.16b,v3.16b,#8
1257	 ext	v6.16b,v1.16b,v2.16b,#8
1258	 add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1259	.inst	0xcec08272		//sha512su0	v18.16b,v19.16b
1260	 ext	v7.16b,v22.16b,v23.16b,#8
1261	.inst	0xce6680a3		//sha512h	v3.16b,v5.16b,v6.16b
1262	.inst	0xce678a32		//sha512su1	v18.16b,v17.16b,v7.16b
1263	 add	v4.2d,v1.2d,v3.2d		// "D + T1"
1264	.inst	0xce608423		//sha512h2	v3.16b,v1.16b,v0.16b
1265	 add	v25.2d,v25.2d,v19.2d
1266	 ld1	{v24.2d},[x3],#16
1267	 ext	v25.16b,v25.16b,v25.16b,#8
1268	 ext	v5.16b,v4.16b,v2.16b,#8
1269	 ext	v6.16b,v0.16b,v4.16b,#8
1270	 add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1271	.inst	0xcec08293		//sha512su0	v19.16b,v20.16b
1272	 ext	v7.16b,v23.16b,v16.16b,#8
1273	.inst	0xce6680a2		//sha512h	v2.16b,v5.16b,v6.16b
1274	.inst	0xce678a53		//sha512su1	v19.16b,v18.16b,v7.16b
1275	 add	v1.2d,v0.2d,v2.2d		// "D + T1"
1276	.inst	0xce638402		//sha512h2	v2.16b,v0.16b,v3.16b
1277	 add	v24.2d,v24.2d,v20.2d
1278	 ld1	{v25.2d},[x3],#16
1279	 ext	v24.16b,v24.16b,v24.16b,#8
1280	 ext	v5.16b,v1.16b,v4.16b,#8
1281	 ext	v6.16b,v3.16b,v1.16b,#8
1282	 add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1283	.inst	0xcec082b4		//sha512su0	v20.16b,v21.16b
1284	 ext	v7.16b,v16.16b,v17.16b,#8
1285	.inst	0xce6680a4		//sha512h	v4.16b,v5.16b,v6.16b
1286	.inst	0xce678a74		//sha512su1	v20.16b,v19.16b,v7.16b
1287	 add	v0.2d,v3.2d,v4.2d		// "D + T1"
1288	.inst	0xce628464		//sha512h2	v4.16b,v3.16b,v2.16b
1289	 add	v25.2d,v25.2d,v21.2d
1290	 ld1	{v24.2d},[x3],#16
1291	 ext	v25.16b,v25.16b,v25.16b,#8
1292	 ext	v5.16b,v0.16b,v1.16b,#8
1293	 ext	v6.16b,v2.16b,v0.16b,#8
1294	 add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1295	.inst	0xcec082d5		//sha512su0	v21.16b,v22.16b
1296	 ext	v7.16b,v17.16b,v18.16b,#8
1297	.inst	0xce6680a1		//sha512h	v1.16b,v5.16b,v6.16b
1298	.inst	0xce678a95		//sha512su1	v21.16b,v20.16b,v7.16b
1299	 add	v3.2d,v2.2d,v1.2d		// "D + T1"
1300	.inst	0xce648441		//sha512h2	v1.16b,v2.16b,v4.16b
1301	 add	v24.2d,v24.2d,v22.2d
1302	 ld1	{v25.2d},[x3],#16
1303	 ext	v24.16b,v24.16b,v24.16b,#8
1304	 ext	v5.16b,v3.16b,v0.16b,#8
1305	 ext	v6.16b,v4.16b,v3.16b,#8
1306	 add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1307	.inst	0xcec082f6		//sha512su0	v22.16b,v23.16b
1308	 ext	v7.16b,v18.16b,v19.16b,#8
1309	.inst	0xce6680a0		//sha512h	v0.16b,v5.16b,v6.16b
1310	.inst	0xce678ab6		//sha512su1	v22.16b,v21.16b,v7.16b
1311	 add	v2.2d,v4.2d,v0.2d		// "D + T1"
1312	.inst	0xce618480		//sha512h2	v0.16b,v4.16b,v1.16b
1313	 add	v25.2d,v25.2d,v23.2d
1314	 ld1	{v24.2d},[x3],#16
1315	 ext	v25.16b,v25.16b,v25.16b,#8
1316	 ext	v5.16b,v2.16b,v3.16b,#8
1317	 ext	v6.16b,v1.16b,v2.16b,#8
1318	 add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1319	.inst	0xcec08217		//sha512su0	v23.16b,v16.16b
1320	 ext	v7.16b,v19.16b,v20.16b,#8
1321	.inst	0xce6680a3		//sha512h	v3.16b,v5.16b,v6.16b
1322	.inst	0xce678ad7		//sha512su1	v23.16b,v22.16b,v7.16b
1323	 add	v4.2d,v1.2d,v3.2d		// "D + T1"
1324	.inst	0xce608423		//sha512h2	v3.16b,v1.16b,v0.16b
1325	 add	v24.2d,v24.2d,v16.2d
1326	 ld1	{v25.2d},[x3],#16
1327	 ext	v24.16b,v24.16b,v24.16b,#8
1328	 ext	v5.16b,v4.16b,v2.16b,#8
1329	 ext	v6.16b,v0.16b,v4.16b,#8
1330	 add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1331	.inst	0xcec08230		//sha512su0	v16.16b,v17.16b
1332	 ext	v7.16b,v20.16b,v21.16b,#8
1333	.inst	0xce6680a2		//sha512h	v2.16b,v5.16b,v6.16b
1334	.inst	0xce678af0		//sha512su1	v16.16b,v23.16b,v7.16b
1335	 add	v1.2d,v0.2d,v2.2d		// "D + T1"
1336	.inst	0xce638402		//sha512h2	v2.16b,v0.16b,v3.16b
1337	 add	v25.2d,v25.2d,v17.2d
1338	 ld1	{v24.2d},[x3],#16
1339	 ext	v25.16b,v25.16b,v25.16b,#8
1340	 ext	v5.16b,v1.16b,v4.16b,#8
1341	 ext	v6.16b,v3.16b,v1.16b,#8
1342	 add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1343	.inst	0xcec08251		//sha512su0	v17.16b,v18.16b
1344	 ext	v7.16b,v21.16b,v22.16b,#8
1345	.inst	0xce6680a4		//sha512h	v4.16b,v5.16b,v6.16b
1346	.inst	0xce678a11		//sha512su1	v17.16b,v16.16b,v7.16b
1347	 add	v0.2d,v3.2d,v4.2d		// "D + T1"
1348	.inst	0xce628464		//sha512h2	v4.16b,v3.16b,v2.16b
1349	 add	v24.2d,v24.2d,v18.2d
1350	 ld1	{v25.2d},[x3],#16
1351	 ext	v24.16b,v24.16b,v24.16b,#8
1352	 ext	v5.16b,v0.16b,v1.16b,#8
1353	 ext	v6.16b,v2.16b,v0.16b,#8
1354	 add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1355	.inst	0xcec08272		//sha512su0	v18.16b,v19.16b
1356	 ext	v7.16b,v22.16b,v23.16b,#8
1357	.inst	0xce6680a1		//sha512h	v1.16b,v5.16b,v6.16b
1358	.inst	0xce678a32		//sha512su1	v18.16b,v17.16b,v7.16b
1359	 add	v3.2d,v2.2d,v1.2d		// "D + T1"
1360	.inst	0xce648441		//sha512h2	v1.16b,v2.16b,v4.16b
1361	 add	v25.2d,v25.2d,v19.2d
1362	 ld1	{v24.2d},[x3],#16
1363	 ext	v25.16b,v25.16b,v25.16b,#8
1364	 ext	v5.16b,v3.16b,v0.16b,#8
1365	 ext	v6.16b,v4.16b,v3.16b,#8
1366	 add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1367	.inst	0xcec08293		//sha512su0	v19.16b,v20.16b
1368	 ext	v7.16b,v23.16b,v16.16b,#8
1369	.inst	0xce6680a0		//sha512h	v0.16b,v5.16b,v6.16b
1370	.inst	0xce678a53		//sha512su1	v19.16b,v18.16b,v7.16b
1371	 add	v2.2d,v4.2d,v0.2d		// "D + T1"
1372	.inst	0xce618480		//sha512h2	v0.16b,v4.16b,v1.16b
1373	 add	v24.2d,v24.2d,v20.2d
1374	 ld1	{v25.2d},[x3],#16
1375	 ext	v24.16b,v24.16b,v24.16b,#8
1376	 ext	v5.16b,v2.16b,v3.16b,#8
1377	 ext	v6.16b,v1.16b,v2.16b,#8
1378	 add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1379	.inst	0xcec082b4		//sha512su0	v20.16b,v21.16b
1380	 ext	v7.16b,v16.16b,v17.16b,#8
1381	.inst	0xce6680a3		//sha512h	v3.16b,v5.16b,v6.16b
1382	.inst	0xce678a74		//sha512su1	v20.16b,v19.16b,v7.16b
1383	 add	v4.2d,v1.2d,v3.2d		// "D + T1"
1384	.inst	0xce608423		//sha512h2	v3.16b,v1.16b,v0.16b
1385	 add	v25.2d,v25.2d,v21.2d
1386	 ld1	{v24.2d},[x3],#16
1387	 ext	v25.16b,v25.16b,v25.16b,#8
1388	 ext	v5.16b,v4.16b,v2.16b,#8
1389	 ext	v6.16b,v0.16b,v4.16b,#8
1390	 add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1391	.inst	0xcec082d5		//sha512su0	v21.16b,v22.16b
1392	 ext	v7.16b,v17.16b,v18.16b,#8
1393	.inst	0xce6680a2		//sha512h	v2.16b,v5.16b,v6.16b
1394	.inst	0xce678a95		//sha512su1	v21.16b,v20.16b,v7.16b
1395	 add	v1.2d,v0.2d,v2.2d		// "D + T1"
1396	.inst	0xce638402		//sha512h2	v2.16b,v0.16b,v3.16b
1397	 add	v24.2d,v24.2d,v22.2d
1398	 ld1	{v25.2d},[x3],#16
1399	 ext	v24.16b,v24.16b,v24.16b,#8
1400	 ext	v5.16b,v1.16b,v4.16b,#8
1401	 ext	v6.16b,v3.16b,v1.16b,#8
1402	 add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1403	.inst	0xcec082f6		//sha512su0	v22.16b,v23.16b
1404	 ext	v7.16b,v18.16b,v19.16b,#8
1405	.inst	0xce6680a4		//sha512h	v4.16b,v5.16b,v6.16b
1406	.inst	0xce678ab6		//sha512su1	v22.16b,v21.16b,v7.16b
1407	 add	v0.2d,v3.2d,v4.2d		// "D + T1"
1408	.inst	0xce628464		//sha512h2	v4.16b,v3.16b,v2.16b
1409	 add	v25.2d,v25.2d,v23.2d
1410	 ld1	{v24.2d},[x3],#16
1411	 ext	v25.16b,v25.16b,v25.16b,#8
1412	 ext	v5.16b,v0.16b,v1.16b,#8
1413	 ext	v6.16b,v2.16b,v0.16b,#8
1414	 add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1415	.inst	0xcec08217		//sha512su0	v23.16b,v16.16b
1416	 ext	v7.16b,v19.16b,v20.16b,#8
1417	.inst	0xce6680a1		//sha512h	v1.16b,v5.16b,v6.16b
1418	.inst	0xce678ad7		//sha512su1	v23.16b,v22.16b,v7.16b
1419	 add	v3.2d,v2.2d,v1.2d		// "D + T1"
1420	.inst	0xce648441		//sha512h2	v1.16b,v2.16b,v4.16b
1421	 add	v24.2d,v24.2d,v16.2d
1422	 ld1	{v25.2d},[x3],#16
1423	 ext	v24.16b,v24.16b,v24.16b,#8
1424	 ext	v5.16b,v3.16b,v0.16b,#8
1425	 ext	v6.16b,v4.16b,v3.16b,#8
1426	 add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1427	.inst	0xcec08230		//sha512su0	v16.16b,v17.16b
1428	 ext	v7.16b,v20.16b,v21.16b,#8
1429	.inst	0xce6680a0		//sha512h	v0.16b,v5.16b,v6.16b
1430	.inst	0xce678af0		//sha512su1	v16.16b,v23.16b,v7.16b
1431	 add	v2.2d,v4.2d,v0.2d		// "D + T1"
1432	.inst	0xce618480		//sha512h2	v0.16b,v4.16b,v1.16b
1433	 add	v25.2d,v25.2d,v17.2d
1434	 ld1	{v24.2d},[x3],#16
1435	 ext	v25.16b,v25.16b,v25.16b,#8
1436	 ext	v5.16b,v2.16b,v3.16b,#8
1437	 ext	v6.16b,v1.16b,v2.16b,#8
1438	 add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1439	.inst	0xcec08251		//sha512su0	v17.16b,v18.16b
1440	 ext	v7.16b,v21.16b,v22.16b,#8
1441	.inst	0xce6680a3		//sha512h	v3.16b,v5.16b,v6.16b
1442	.inst	0xce678a11		//sha512su1	v17.16b,v16.16b,v7.16b
1443	 add	v4.2d,v1.2d,v3.2d		// "D + T1"
1444	.inst	0xce608423		//sha512h2	v3.16b,v1.16b,v0.16b
1445	 add	v24.2d,v24.2d,v18.2d
1446	 ld1	{v25.2d},[x3],#16
1447	 ext	v24.16b,v24.16b,v24.16b,#8
1448	 ext	v5.16b,v4.16b,v2.16b,#8
1449	 ext	v6.16b,v0.16b,v4.16b,#8
1450	 add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1451	.inst	0xcec08272		//sha512su0	v18.16b,v19.16b
1452	 ext	v7.16b,v22.16b,v23.16b,#8
1453	.inst	0xce6680a2		//sha512h	v2.16b,v5.16b,v6.16b
1454	.inst	0xce678a32		//sha512su1	v18.16b,v17.16b,v7.16b
1455	 add	v1.2d,v0.2d,v2.2d		// "D + T1"
1456	.inst	0xce638402		//sha512h2	v2.16b,v0.16b,v3.16b
1457	 add	v25.2d,v25.2d,v19.2d
1458	 ld1	{v24.2d},[x3],#16
1459	 ext	v25.16b,v25.16b,v25.16b,#8
1460	 ext	v5.16b,v1.16b,v4.16b,#8
1461	 ext	v6.16b,v3.16b,v1.16b,#8
1462	 add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1463	.inst	0xcec08293		//sha512su0	v19.16b,v20.16b
1464	 ext	v7.16b,v23.16b,v16.16b,#8
1465	.inst	0xce6680a4		//sha512h	v4.16b,v5.16b,v6.16b
1466	.inst	0xce678a53		//sha512su1	v19.16b,v18.16b,v7.16b
1467	 add	v0.2d,v3.2d,v4.2d		// "D + T1"
1468	.inst	0xce628464		//sha512h2	v4.16b,v3.16b,v2.16b
1469	 add	v24.2d,v24.2d,v20.2d
1470	 ld1	{v25.2d},[x3],#16
1471	 ext	v24.16b,v24.16b,v24.16b,#8
1472	 ext	v5.16b,v0.16b,v1.16b,#8
1473	 ext	v6.16b,v2.16b,v0.16b,#8
1474	 add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1475	.inst	0xcec082b4		//sha512su0	v20.16b,v21.16b
1476	 ext	v7.16b,v16.16b,v17.16b,#8
1477	.inst	0xce6680a1		//sha512h	v1.16b,v5.16b,v6.16b
1478	.inst	0xce678a74		//sha512su1	v20.16b,v19.16b,v7.16b
1479	 add	v3.2d,v2.2d,v1.2d		// "D + T1"
1480	.inst	0xce648441		//sha512h2	v1.16b,v2.16b,v4.16b
1481	 add	v25.2d,v25.2d,v21.2d
1482	 ld1	{v24.2d},[x3],#16
1483	 ext	v25.16b,v25.16b,v25.16b,#8
1484	 ext	v5.16b,v3.16b,v0.16b,#8
1485	 ext	v6.16b,v4.16b,v3.16b,#8
1486	 add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1487	.inst	0xcec082d5		//sha512su0	v21.16b,v22.16b
1488	 ext	v7.16b,v17.16b,v18.16b,#8
1489	.inst	0xce6680a0		//sha512h	v0.16b,v5.16b,v6.16b
1490	.inst	0xce678a95		//sha512su1	v21.16b,v20.16b,v7.16b
1491	 add	v2.2d,v4.2d,v0.2d		// "D + T1"
1492	.inst	0xce618480		//sha512h2	v0.16b,v4.16b,v1.16b
1493	 add	v24.2d,v24.2d,v22.2d
1494	 ld1	{v25.2d},[x3],#16
1495	 ext	v24.16b,v24.16b,v24.16b,#8
1496	 ext	v5.16b,v2.16b,v3.16b,#8
1497	 ext	v6.16b,v1.16b,v2.16b,#8
1498	 add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1499	.inst	0xcec082f6		//sha512su0	v22.16b,v23.16b
1500	 ext	v7.16b,v18.16b,v19.16b,#8
1501	.inst	0xce6680a3		//sha512h	v3.16b,v5.16b,v6.16b
1502	.inst	0xce678ab6		//sha512su1	v22.16b,v21.16b,v7.16b
1503	 add	v4.2d,v1.2d,v3.2d		// "D + T1"
1504	.inst	0xce608423		//sha512h2	v3.16b,v1.16b,v0.16b
1505	 add	v25.2d,v25.2d,v23.2d
1506	 ld1	{v24.2d},[x3],#16
1507	 ext	v25.16b,v25.16b,v25.16b,#8
1508	 ext	v5.16b,v4.16b,v2.16b,#8
1509	 ext	v6.16b,v0.16b,v4.16b,#8
1510	 add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1511	.inst	0xcec08217		//sha512su0	v23.16b,v16.16b
1512	 ext	v7.16b,v19.16b,v20.16b,#8
1513	.inst	0xce6680a2		//sha512h	v2.16b,v5.16b,v6.16b
1514	.inst	0xce678ad7		//sha512su1	v23.16b,v22.16b,v7.16b
1515	 add	v1.2d,v0.2d,v2.2d		// "D + T1"
1516	.inst	0xce638402		//sha512h2	v2.16b,v0.16b,v3.16b
// -----------------------------------------------------------------------
// Final 8 rounds of the block: the message schedule (v16-v23) is fully
// consumed, so no sha512su0/sha512su1 updates are needed.  Instead, the
// round computation is overlapped with loading the NEXT 128-byte input
// block from [x1] into v16-v23 and byte-reversing each 64-bit lane
// (rev64) from big-endian wire order to host lane order for the
// following loop iteration.  x3 is rewound to the start of the K512
// table once the last constants have been fetched.
// -----------------------------------------------------------------------
1517	 ld1	{v25.2d},[x3],#16
1518	 add	v24.2d,v24.2d,v16.2d
1519	 ld1	{v16.16b},[x1],#16		// load next input
1520	 ext	v24.16b,v24.16b,v24.16b,#8
1521	 ext	v5.16b,v1.16b,v4.16b,#8
1522	 ext	v6.16b,v3.16b,v1.16b,#8
1523	 add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1524	.inst	0xce6680a4		//sha512h	v4.16b,v5.16b,v6.16b
1525	 rev64	v16.16b,v16.16b
1526	 add	v0.2d,v3.2d,v4.2d		// "D + T1"
1527	.inst	0xce628464		//sha512h2	v4.16b,v3.16b,v2.16b
1528	 ld1	{v24.2d},[x3],#16
1529	 add	v25.2d,v25.2d,v17.2d
1530	 ld1	{v17.16b},[x1],#16		// load next input
1531	 ext	v25.16b,v25.16b,v25.16b,#8
1532	 ext	v5.16b,v0.16b,v1.16b,#8
1533	 ext	v6.16b,v2.16b,v0.16b,#8
1534	 add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1535	.inst	0xce6680a1		//sha512h	v1.16b,v5.16b,v6.16b
1536	 rev64	v17.16b,v17.16b
1537	 add	v3.2d,v2.2d,v1.2d		// "D + T1"
1538	.inst	0xce648441		//sha512h2	v1.16b,v2.16b,v4.16b
1539	 ld1	{v25.2d},[x3],#16
1540	 add	v24.2d,v24.2d,v18.2d
1541	 ld1	{v18.16b},[x1],#16		// load next input
1542	 ext	v24.16b,v24.16b,v24.16b,#8
1543	 ext	v5.16b,v3.16b,v0.16b,#8
1544	 ext	v6.16b,v4.16b,v3.16b,#8
1545	 add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1546	.inst	0xce6680a0		//sha512h	v0.16b,v5.16b,v6.16b
1547	 rev64	v18.16b,v18.16b
1548	 add	v2.2d,v4.2d,v0.2d		// "D + T1"
1549	.inst	0xce618480		//sha512h2	v0.16b,v4.16b,v1.16b
1550	 ld1	{v24.2d},[x3],#16
1551	 add	v25.2d,v25.2d,v19.2d
1552	 ld1	{v19.16b},[x1],#16		// load next input
1553	 ext	v25.16b,v25.16b,v25.16b,#8
1554	 ext	v5.16b,v2.16b,v3.16b,#8
1555	 ext	v6.16b,v1.16b,v2.16b,#8
1556	 add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1557	.inst	0xce6680a3		//sha512h	v3.16b,v5.16b,v6.16b
1558	 rev64	v19.16b,v19.16b
1559	 add	v4.2d,v1.2d,v3.2d		// "D + T1"
1560	.inst	0xce608423		//sha512h2	v3.16b,v1.16b,v0.16b
1561	 ld1	{v25.2d},[x3],#16
1562	 add	v24.2d,v24.2d,v20.2d
1563	 ld1	{v20.16b},[x1],#16		// load next input
1564	 ext	v24.16b,v24.16b,v24.16b,#8
1565	 ext	v5.16b,v4.16b,v2.16b,#8
1566	 ext	v6.16b,v0.16b,v4.16b,#8
1567	 add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1568	.inst	0xce6680a2		//sha512h	v2.16b,v5.16b,v6.16b
1569	 rev64	v20.16b,v20.16b
1570	 add	v1.2d,v0.2d,v2.2d		// "D + T1"
1571	.inst	0xce638402		//sha512h2	v2.16b,v0.16b,v3.16b
1572	 ld1	{v24.2d},[x3],#16
1573	 add	v25.2d,v25.2d,v21.2d
1574	 ld1	{v21.16b},[x1],#16		// load next input
1575	 ext	v25.16b,v25.16b,v25.16b,#8
1576	 ext	v5.16b,v1.16b,v4.16b,#8
1577	 ext	v6.16b,v3.16b,v1.16b,#8
1578	 add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1579	.inst	0xce6680a4		//sha512h	v4.16b,v5.16b,v6.16b
1580	 rev64	v21.16b,v21.16b
1581	 add	v0.2d,v3.2d,v4.2d		// "D + T1"
1582	.inst	0xce628464		//sha512h2	v4.16b,v3.16b,v2.16b
1583	 ld1	{v25.2d},[x3],#16
1584	 add	v24.2d,v24.2d,v22.2d
1585	 ld1	{v22.16b},[x1],#16		// load next input
1586	 ext	v24.16b,v24.16b,v24.16b,#8
1587	 ext	v5.16b,v0.16b,v1.16b,#8
1588	 ext	v6.16b,v2.16b,v0.16b,#8
1589	 add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1590	.inst	0xce6680a1		//sha512h	v1.16b,v5.16b,v6.16b
1591	 rev64	v22.16b,v22.16b
1592	 add	v3.2d,v2.2d,v1.2d		// "D + T1"
1593	.inst	0xce648441		//sha512h2	v1.16b,v2.16b,v4.16b
// K512 table fully traversed (80 constants * 8 bytes); reset x3 for the
// next block's iteration.
1594	 sub	x3,x3,#80*8	// rewind
1595	 add	v25.2d,v25.2d,v23.2d
1596	 ld1	{v23.16b},[x1],#16		// load next input
1597	 ext	v25.16b,v25.16b,v25.16b,#8
1598	 ext	v5.16b,v3.16b,v0.16b,#8
1599	 ext	v6.16b,v4.16b,v3.16b,#8
1600	 add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1601	.inst	0xce6680a0		//sha512h	v0.16b,v5.16b,v6.16b
1602	 rev64	v23.16b,v23.16b
1603	 add	v2.2d,v4.2d,v0.2d		// "D + T1"
1604	.inst	0xce618480		//sha512h2	v0.16b,v4.16b,v1.16b
// -----------------------------------------------------------------------
// Davies-Meyer feed-forward and state write-back: add the round output
// (v0-v3) into the hash value saved before this block (v26-v29, per the
// "accumulate" comment), then store the eight 64-bit state words to the
// context at x0.
// NOTE(review): the loop-control branch between the accumulate and the
// store lies outside this view.
// -----------------------------------------------------------------------
1605	 add	v0.2d,v0.2d,v26.2d	// accumulate
1606	 add	v1.2d,v1.2d,v27.2d
1607	 add	v2.2d,v2.2d,v28.2d
1608	 add	v3.2d,v3.2d,v29.2d
1612	 st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
// Record the symbol size for the ELF symbol table.
1616	.size	sha512_block_armv8,.-sha512_block_armv8
// Common symbol: runtime CPU-capability word shared with OPENSSL_armcap_P
// users elsewhere in the library (4 bytes, 4-byte aligned).
1619	.comm	OPENSSL_armcap_P,4,4