2 /* Do not modify. This file is auto-generated from chacha-armv8.pl. */
// Symbol visibility and constant data used by the ChaCha20 routines below.
// NOTE(review): the labels, section directives and any preprocessor
// conditionals that surround these lines are not visible in this listing.
8 .hidden OPENSSL_armcap_P
// Two 64-bit constants whose little-endian byte image is the ASCII text
// "expand 32-byte k"; the routines below load this pair as "sigma".
12 .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
// Distance from the current location to OPENSSL_armcap_P, emitted once as a
// 32-bit and once as a 64-bit value.
// NOTE(review): presumably only one of the two is assembled for a given
// pointer width - confirm against the generator script.
17 .long OPENSSL_armcap_P-.
19 .quad OPENSSL_armcap_P-.
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
21 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// ChaCha20_ctr32 - scalar (general-purpose register) code path.
//
// Register usage that can be read off the instructions below:
//   x0  = output pointer (stp ...,[x0,#0])
//   x1  = input pointer  (ldp ...,[x1,#0])
//   x3  = key pointer, x4 = counter block pointer
//   x22,x23 = sigma, x24.. = key words, x28,x30 = counter block; each 64-bit
//             register holds two 32-bit state words
//   w5..    = working copy of the state, one 32-bit word per register
// NOTE(review): this listing is an excerpt - the entry label, the round
// loop, the XOR with the input and the final `ret` are not visible here.
25 .type ChaCha20_ctr32,%function
// x5 = PC-relative address of the local .LOPENSSL_armcap_P slot.
// NOTE(review): presumably used for a CPU-capability dispatch - confirm.
29 adr x5,.LOPENSSL_armcap_P
// Pointer-authentication hint, emitted as a raw opcode via .inst.
42 .inst 0xd503233f // paciasp
// Push the frame record (x29,x30) and reserve a 96-byte frame.
43 stp x29,x30,[sp,#-96]!
// NOTE(review): x5 must point at the sigma constant by now; the instruction
// that re-targets it is not visible in this listing.
54 ldp x22,x23,[x5] // load sigma
55 ldp x24,x25,[x3] // load key
57 ldp x28,x30,[x4] // load counter
// Start splitting the packed 64-bit inputs into 32-bit working words.
68 mov w5,w22 // unpack key block
// Feed-forward: add the original input words back onto the working state.
// The ",lsr#32" operands select the upper 32-bit half of each packed input.
187 add w5,w5,w22 // accumulate key block
192 add x10,x10,x24,lsr#32
194 add x12,x12,x25,lsr#32
196 add x14,x14,x26,lsr#32
198 add x16,x16,x27,lsr#32
200 add x19,x19,x28,lsr#32
202 add x21,x21,x30,lsr#32
// Re-pack pairs of 32-bit words into 64-bit registers (odd word goes into
// the upper half via lsl#32) while the first 16 input bytes are fetched.
206 add x5,x5,x6,lsl#32 // pack
208 ldp x6,x8,[x1,#0] // load input
210 add x11,x11,x12,lsl#32
212 add x13,x13,x14,lsl#32
213 add x15,x15,x16,lsl#32
215 add x17,x17,x19,lsl#32
216 add x20,x20,x21,lsl#32
238 stp x5,x7,[x0,#0] // store output
// One block is produced per pass through this path, so the counter advances by 1.
239 add x28,x28,#1 // increment counter
// Epilogue: reload x19-x28 from offsets 16..95 of the frame addressed by x29.
247 ldp x19,x20,[x29,#16]
249 ldp x21,x22,[x29,#32]
250 ldp x23,x24,[x29,#48]
251 ldp x25,x26,[x29,#64]
252 ldp x27,x28,[x29,#80]
254 .inst 0xd50323bf // autiasp
// Second pack + epilogue sequence.
// NOTE(review): presumably the partial-block (tail) exit path - the label
// and the byte-wise tail handling are not visible in this listing.
268 add x5,x5,x6,lsl#32 // pack
271 add x11,x11,x12,lsl#32
272 add x13,x13,x14,lsl#32
273 add x15,x15,x16,lsl#32
274 add x17,x17,x19,lsl#32
275 add x20,x20,x21,lsl#32
304 ldp x19,x20,[x29,#16]
306 ldp x21,x22,[x29,#32]
307 ldp x23,x24,[x29,#48]
308 ldp x25,x26,[x29,#64]
309 ldp x27,x28,[x29,#80]
311 .inst 0xd50323bf // autiasp
313 .size ChaCha20_ctr32,.-ChaCha20_ctr32
// ChaCha20_neon - interleaves one scalar block with three NEON blocks.
//
// Vector register usage that can be read off the instructions below:
//   v24        = sigma row, v25,v26 = key rows
//   v27,v28,v29 = counter rows for the three NEON blocks
//   v31        = counter increment (1 at first, shifted to 4)
//   v0-v3, v4-v7, v16-v19 = working state (rows a,b,c,d) of NEON block 0/1/2
//   v20-v23    = temporaries for rotates, later input data
// The scalar registers are used as in the scalar path (x22.. packed inputs,
// w5.. working words, x0/x1 output/input pointers).
// NOTE(review): this listing is an excerpt - the entry label, loop labels,
// the rotate-by-16 step of each quarter round, several `sli` halves of the
// other rotates, all scalar round instructions and the `ret`s are not
// visible here.
315 .type ChaCha20_neon,%function
318 .inst 0xd503233f // paciasp
// Push the frame record (x29,x30) and reserve a 96-byte frame.
319 stp x29,x30,[sp,#-96]!
// NOTE(review): the compare that sets the flags for this branch is not
// visible; going by the label name it selects the 512-bytes-or-more path.
329 b.hs .L512_or_more_neon
// Load sigma, key and counter twice: packed into scalar registers and as
// NEON rows. The ld1 at original line 334 also advances x5 by 16.
333 ldp x22,x23,[x5] // load sigma
334 ld1 {v24.4s},[x5],#16
335 ldp x24,x25,[x3] // load key
337 ld1 {v25.4s,v26.4s},[x3]
338 ldp x28,x30,[x4] // load counter
// Derive three consecutive counter rows in v27,v28,v29 (each one v31 apart),
// then turn the increment in v31 from 1 into 4.
350 add v27.4s,v27.4s,v31.4s // += 1
351 add v28.4s,v27.4s,v31.4s
352 add v29.4s,v28.4s,v31.4s
353 shl v31.4s,v31.4s,#2 // 1 -> 4
356 mov w5,w22 // unpack key block
// ---- First half-round on rows as loaded, three NEON blocks in lock-step ----
// a += b; d ^= a
389 add v0.4s,v0.4s,v1.4s
391 add v4.4s,v4.4s,v5.4s
393 add v16.4s,v16.4s,v17.4s
395 eor v3.16b,v3.16b,v0.16b
397 eor v7.16b,v7.16b,v4.16b
399 eor v19.16b,v19.16b,v16.16b
// c += d; t = b ^ c; b = t rotated left by 12 (ushr #20 combined with sli #12)
407 add v2.4s,v2.4s,v3.4s
409 add v6.4s,v6.4s,v7.4s
411 add v18.4s,v18.4s,v19.4s
413 eor v20.16b,v1.16b,v2.16b
415 eor v21.16b,v5.16b,v6.16b
417 eor v22.16b,v17.16b,v18.16b
419 ushr v1.4s,v20.4s,#20
421 ushr v5.4s,v21.4s,#20
423 ushr v17.4s,v22.4s,#20
429 sli v17.4s,v22.4s,#12
// a += b; t = d ^ a; d = t shifted right by 24
// NOTE(review): the matching `sli #8` that completes a rotate-left-by-8 is
// not visible in this listing.
431 add v0.4s,v0.4s,v1.4s
433 add v4.4s,v4.4s,v5.4s
435 add v16.4s,v16.4s,v17.4s
437 eor v20.16b,v3.16b,v0.16b
439 eor v21.16b,v7.16b,v4.16b
441 eor v22.16b,v19.16b,v16.16b
443 ushr v3.4s,v20.4s,#24
445 ushr v7.4s,v21.4s,#24
447 ushr v19.4s,v22.4s,#24
// c += d; t = b ^ c; b = t shifted right by 25
// NOTE(review): the matching `sli #7` is not visible in this listing.
455 add v2.4s,v2.4s,v3.4s
457 add v6.4s,v6.4s,v7.4s
459 add v18.4s,v18.4s,v19.4s
461 eor v20.16b,v1.16b,v2.16b
463 eor v21.16b,v5.16b,v6.16b
465 eor v22.16b,v17.16b,v18.16b
467 ushr v1.4s,v20.4s,#25
469 ushr v5.4s,v21.4s,#25
471 ushr v17.4s,v22.4s,#25
// Rotate the lanes of rows c, d and b by 8, 12 and 4 bytes respectively so
// the next half-round operates on the diagonals of the state.
479 ext v2.16b,v2.16b,v2.16b,#8
481 ext v6.16b,v6.16b,v6.16b,#8
483 ext v18.16b,v18.16b,v18.16b,#8
485 ext v3.16b,v3.16b,v3.16b,#12
486 ext v7.16b,v7.16b,v7.16b,#12
487 ext v19.16b,v19.16b,v19.16b,#12
488 ext v1.16b,v1.16b,v1.16b,#4
489 ext v5.16b,v5.16b,v5.16b,#4
490 ext v17.16b,v17.16b,v17.16b,#4
// ---- Second half-round: same operation sequence on the rotated rows ----
491 add v0.4s,v0.4s,v1.4s
493 add v4.4s,v4.4s,v5.4s
495 add v16.4s,v16.4s,v17.4s
497 eor v3.16b,v3.16b,v0.16b
499 eor v7.16b,v7.16b,v4.16b
501 eor v19.16b,v19.16b,v16.16b
509 add v2.4s,v2.4s,v3.4s
511 add v6.4s,v6.4s,v7.4s
513 add v18.4s,v18.4s,v19.4s
515 eor v20.16b,v1.16b,v2.16b
517 eor v21.16b,v5.16b,v6.16b
519 eor v22.16b,v17.16b,v18.16b
521 ushr v1.4s,v20.4s,#20
523 ushr v5.4s,v21.4s,#20
525 ushr v17.4s,v22.4s,#20
531 sli v17.4s,v22.4s,#12
533 add v0.4s,v0.4s,v1.4s
535 add v4.4s,v4.4s,v5.4s
537 add v16.4s,v16.4s,v17.4s
539 eor v20.16b,v3.16b,v0.16b
541 eor v21.16b,v7.16b,v4.16b
543 eor v22.16b,v19.16b,v16.16b
545 ushr v3.4s,v20.4s,#24
547 ushr v7.4s,v21.4s,#24
549 ushr v19.4s,v22.4s,#24
557 add v2.4s,v2.4s,v3.4s
559 add v6.4s,v6.4s,v7.4s
561 add v18.4s,v18.4s,v19.4s
563 eor v20.16b,v1.16b,v2.16b
565 eor v21.16b,v5.16b,v6.16b
567 eor v22.16b,v17.16b,v18.16b
569 ushr v1.4s,v20.4s,#25
571 ushr v5.4s,v21.4s,#25
573 ushr v17.4s,v22.4s,#25
// Undo the lane rotation (8, 4 and 12 bytes for rows c, d and b) so the
// rows are back in their original lane order.
581 ext v2.16b,v2.16b,v2.16b,#8
583 ext v6.16b,v6.16b,v6.16b,#8
585 ext v18.16b,v18.16b,v18.16b,#8
587 ext v3.16b,v3.16b,v3.16b,#4
588 ext v7.16b,v7.16b,v7.16b,#4
589 ext v19.16b,v19.16b,v19.16b,#4
590 ext v1.16b,v1.16b,v1.16b,#12
591 ext v5.16b,v5.16b,v5.16b,#12
592 ext v17.16b,v17.16b,v17.16b,#12
// ---- Feed-forward: add the input state back, scalar and NEON interleaved ----
// NEON: row a += sigma (v24), row c += v26, row d += per-block counter row
// (v27/v28/v29), row b += v25. Scalar: add the upper halves of the packed
// inputs (",lsr#32") to the odd working words.
595 add w5,w5,w22 // accumulate key block
596 add v0.4s,v0.4s,v24.4s
598 add v4.4s,v4.4s,v24.4s
600 add v16.4s,v16.4s,v24.4s
602 add v2.4s,v2.4s,v26.4s
604 add v6.4s,v6.4s,v26.4s
605 add x10,x10,x24,lsr#32
606 add v18.4s,v18.4s,v26.4s
608 add v3.4s,v3.4s,v27.4s
609 add x12,x12,x25,lsr#32
611 add v7.4s,v7.4s,v28.4s
612 add x14,x14,x26,lsr#32
614 add v19.4s,v19.4s,v29.4s
615 add x16,x16,x27,lsr#32
617 add v1.4s,v1.4s,v25.4s
618 add x19,x19,x28,lsr#32
620 add v5.4s,v5.4s,v25.4s
621 add x21,x21,x30,lsr#32
622 add v17.4s,v17.4s,v25.4s
// Re-pack the scalar block into 64-bit registers and fetch its input.
626 add x5,x5,x6,lsl#32 // pack
628 ldp x6,x8,[x1,#0] // load input
630 add x11,x11,x12,lsl#32
632 add x13,x13,x14,lsl#32
633 add x15,x15,x16,lsl#32
635 add x17,x17,x19,lsl#32
636 add x20,x20,x21,lsl#32
// XOR the three NEON key-stream blocks with 64-byte chunks of input and
// store them; every ld1/st1 post-increments its pointer by 64.
649 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
655 eor v0.16b,v0.16b,v20.16b
657 eor v1.16b,v1.16b,v21.16b
659 eor v2.16b,v2.16b,v22.16b
661 eor v3.16b,v3.16b,v23.16b
662 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
664 stp x5,x7,[x0,#0] // store output
// Four blocks (1 scalar + 3 NEON) are consumed per iteration, so the scalar
// counter and the three vector counter rows all advance by 4.
665 add x28,x28,#4 // increment counter
667 add v27.4s,v27.4s,v31.4s // += 4
669 add v28.4s,v28.4s,v31.4s
671 add v29.4s,v29.4s,v31.4s
674 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
// v0-v3 are free after the store above and are reused as an input buffer.
675 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
677 eor v4.16b,v4.16b,v20.16b
678 eor v5.16b,v5.16b,v21.16b
679 eor v6.16b,v6.16b,v22.16b
680 eor v7.16b,v7.16b,v23.16b
681 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
683 eor v16.16b,v16.16b,v0.16b
684 eor v17.16b,v17.16b,v1.16b
685 eor v18.16b,v18.16b,v2.16b
686 eor v19.16b,v19.16b,v3.16b
687 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
// NOTE(review): the instruction that sets the flags for this branch is not
// visible in this listing.
689 b.hi .Loop_outer_neon
// Epilogue: reload x19-x28 from offsets 16..95 of the frame addressed by x29.
691 ldp x19,x20,[x29,#16]
693 ldp x21,x22,[x29,#32]
694 ldp x23,x24,[x29,#48]
695 ldp x25,x26,[x29,#64]
696 ldp x27,x28,[x29,#80]
698 .inst 0xd50323bf // autiasp
// NOTE(review): what follows is presumably the tail path for a final chunk
// shorter than a full iteration - its label and length checks are not
// visible in this listing.
706 add x5,x5,x6,lsl#32 // pack
708 ldp x6,x8,[x1,#0] // load input
710 add x11,x11,x12,lsl#32
712 add x13,x13,x14,lsl#32
713 add x15,x15,x16,lsl#32
715 add x17,x17,x19,lsl#32
716 add x20,x20,x21,lsl#32
738 stp x5,x7,[x0,#0] // store output
739 add x28,x28,#4 // increment counter
749 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
750 eor v0.16b,v0.16b,v20.16b
751 eor v1.16b,v1.16b,v21.16b
752 eor v2.16b,v2.16b,v22.16b
753 eor v3.16b,v3.16b,v23.16b
754 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
760 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
761 eor v4.16b,v4.16b,v20.16b
762 eor v5.16b,v5.16b,v21.16b
763 eor v6.16b,v6.16b,v22.16b
764 eor v7.16b,v7.16b,v23.16b
765 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
// Spill a 64-byte key-stream block to the stack at sp.
// NOTE(review): presumably three alternative exits that each stage the next
// unused key-stream block for the byte-wise tail loop - confirm.
769 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
773 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
776 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
// Repeat the tail loop while x2 is non-zero.
793 cbnz x2,.Loop_tail_neon
801 ldp x19,x20,[x29,#16]
803 ldp x21,x22,[x29,#32]
804 ldp x23,x24,[x29,#48]
805 ldp x25,x26,[x29,#64]
806 ldp x27,x28,[x29,#80]
808 .inst 0xd50323bf // autiasp
810 .size ChaCha20_neon,.-ChaCha20_neon
// ChaCha20_512_neon - bulk path: each outer iteration produces two scalar
// blocks and six NEON blocks (the scalar counter advances by 1 + 7 and the
// vector counter rows by 8 per iteration).
//
// Vector register usage that can be read off the instructions below:
//   v0-v3, v4-v7, v8-v11, v12-v15, v16-v19, v20-v23
//               = working state (rows a,b,c,d) of NEON blocks 0..5
//   v24-v29     = rotate temporaries inside the rounds; they hold sigma/key/
//                 counter rows outside the rounds, which is why those rows
//                 are kept on the stack and reloaded
//   v30         = counter row, v31 = counter increment
//   [sp+0..]    = off-loaded key block, [sp+128..] = saved d8-d15
// NOTE(review): this listing is an excerpt - the entry label, the
// .Loop_upper_neon/.Loop_lower_neon labels, the rotate-by-16 step of each
// quarter round, several `sli` instructions, all scalar round instructions,
// the compares that feed the conditional branches and the final `ret` are
// not visible here.
811 .type ChaCha20_512_neon,%function
814 .inst 0xd503233f // paciasp
// Push the frame record (x29,x30) and reserve a 96-byte frame.
815 stp x29,x30,[sp,#-96]!
// Load sigma, key and counter twice: packed into scalar registers and as
// NEON rows. The ld1 at original line 829 also advances x5 by 16.
828 ldp x22,x23,[x5] // load sigma
829 ld1 {v24.4s},[x5],#16
830 ldp x24,x25,[x3] // load key
832 ld1 {v25.4s,v26.4s},[x3]
833 ldp x28,x30,[x4] // load counter
// Build the vector counter rows. v27 is advanced by v31 twice on purpose
// (see the existing "not typo" remark); v28, v29 and v30 then follow at a
// spacing of v31 each, and finally the increment is scaled from 1 to 4.
845 add v27.4s,v27.4s,v31.4s // += 1
846 stp q24,q25,[sp,#0] // off-load key block, invariant part
847 add v27.4s,v27.4s,v31.4s // not typo
849 add v28.4s,v27.4s,v31.4s
850 add v29.4s,v28.4s,v31.4s
851 add v30.4s,v29.4s,v31.4s
852 shl v31.4s,v31.4s,#2 // 1 -> 4
// v8-v15 are used as working state below; their low 64 bits are saved here
// and restored at original lines 1943-1946.
854 stp d8,d9,[sp,#128+0] // meet ABI requirements
855 stp d10,d11,[sp,#128+16]
856 stp d12,d13,[sp,#128+32]
857 stp d14,d15,[sp,#128+48]
859 sub x2,x2,#512 // not typo
861 .Loop_outer_512_neon:
869 mov w5,w22 // unpack key block
// Counter rows for NEON blocks 4 and 5 are those of blocks 0 and 1 plus v31.
892 add v19.4s,v3.4s,v31.4s // +4
894 add v23.4s,v7.4s,v31.4s // +4
// q27 lands at sp+48 and q28 at sp+64, matching the reloads at original
// lines 1807 and 1809.
901 stp q27,q28,[sp,#48] // off-load key block, variable part
// ======== "upper" round loop body: six NEON blocks in lock-step ========
// ---- First half-round on rows as loaded ----
// a += b; d ^= a
909 add v0.4s,v0.4s,v1.4s
911 add v4.4s,v4.4s,v5.4s
913 add v8.4s,v8.4s,v9.4s
915 add v12.4s,v12.4s,v13.4s
917 add v16.4s,v16.4s,v17.4s
919 add v20.4s,v20.4s,v21.4s
921 eor v3.16b,v3.16b,v0.16b
923 eor v7.16b,v7.16b,v4.16b
925 eor v11.16b,v11.16b,v8.16b
927 eor v15.16b,v15.16b,v12.16b
929 eor v19.16b,v19.16b,v16.16b
931 eor v23.16b,v23.16b,v20.16b
// c += d; t = b ^ c; b = t rotated left by 12 (ushr #20 combined with sli #12)
945 add v2.4s,v2.4s,v3.4s
947 add v6.4s,v6.4s,v7.4s
949 add v10.4s,v10.4s,v11.4s
951 add v14.4s,v14.4s,v15.4s
953 add v18.4s,v18.4s,v19.4s
955 add v22.4s,v22.4s,v23.4s
957 eor v24.16b,v1.16b,v2.16b
959 eor v25.16b,v5.16b,v6.16b
961 eor v26.16b,v9.16b,v10.16b
963 eor v27.16b,v13.16b,v14.16b
965 eor v28.16b,v17.16b,v18.16b
967 eor v29.16b,v21.16b,v22.16b
969 ushr v1.4s,v24.4s,#20
971 ushr v5.4s,v25.4s,#20
973 ushr v9.4s,v26.4s,#20
975 ushr v13.4s,v27.4s,#20
977 ushr v17.4s,v28.4s,#20
979 ushr v21.4s,v29.4s,#20
987 sli v13.4s,v27.4s,#12
989 sli v17.4s,v28.4s,#12
991 sli v21.4s,v29.4s,#12
// a += b; t = d ^ a; d = t rotated left by 8 (ushr #24 combined with sli #8)
993 add v0.4s,v0.4s,v1.4s
995 add v4.4s,v4.4s,v5.4s
997 add v8.4s,v8.4s,v9.4s
999 add v12.4s,v12.4s,v13.4s
1001 add v16.4s,v16.4s,v17.4s
1003 add v20.4s,v20.4s,v21.4s
1005 eor v24.16b,v3.16b,v0.16b
1007 eor v25.16b,v7.16b,v4.16b
1009 eor v26.16b,v11.16b,v8.16b
1011 eor v27.16b,v15.16b,v12.16b
1013 eor v28.16b,v19.16b,v16.16b
1015 eor v29.16b,v23.16b,v20.16b
1017 ushr v3.4s,v24.4s,#24
1019 ushr v7.4s,v25.4s,#24
1021 ushr v11.4s,v26.4s,#24
1023 ushr v15.4s,v27.4s,#24
1025 ushr v19.4s,v28.4s,#24
1027 ushr v23.4s,v29.4s,#24
1033 sli v11.4s,v26.4s,#8
1035 sli v15.4s,v27.4s,#8
1037 sli v19.4s,v28.4s,#8
1039 sli v23.4s,v29.4s,#8
// c += d; t = b ^ c; b = t rotated left by 7 (ushr #25 combined with sli #7)
1041 add v2.4s,v2.4s,v3.4s
1043 add v6.4s,v6.4s,v7.4s
1045 add v10.4s,v10.4s,v11.4s
1047 add v14.4s,v14.4s,v15.4s
1049 add v18.4s,v18.4s,v19.4s
1051 add v22.4s,v22.4s,v23.4s
1053 eor v24.16b,v1.16b,v2.16b
1055 eor v25.16b,v5.16b,v6.16b
1057 eor v26.16b,v9.16b,v10.16b
1059 eor v27.16b,v13.16b,v14.16b
1061 eor v28.16b,v17.16b,v18.16b
1063 eor v29.16b,v21.16b,v22.16b
1065 ushr v1.4s,v24.4s,#25
1067 ushr v5.4s,v25.4s,#25
1069 ushr v9.4s,v26.4s,#25
1071 ushr v13.4s,v27.4s,#25
1073 ushr v17.4s,v28.4s,#25
1075 ushr v21.4s,v29.4s,#25
1083 sli v13.4s,v27.4s,#7
1085 sli v17.4s,v28.4s,#7
1087 sli v21.4s,v29.4s,#7
// Rotate the lanes of rows c, d and b by 8, 12 and 4 bytes respectively so
// the next half-round operates on the diagonals of the state.
1089 ext v2.16b,v2.16b,v2.16b,#8
1091 ext v6.16b,v6.16b,v6.16b,#8
1093 ext v10.16b,v10.16b,v10.16b,#8
1095 ext v14.16b,v14.16b,v14.16b,#8
1097 ext v18.16b,v18.16b,v18.16b,#8
1099 ext v22.16b,v22.16b,v22.16b,#8
1101 ext v3.16b,v3.16b,v3.16b,#12
1102 ext v7.16b,v7.16b,v7.16b,#12
1103 ext v11.16b,v11.16b,v11.16b,#12
1104 ext v15.16b,v15.16b,v15.16b,#12
1105 ext v19.16b,v19.16b,v19.16b,#12
1106 ext v23.16b,v23.16b,v23.16b,#12
1107 ext v1.16b,v1.16b,v1.16b,#4
1108 ext v5.16b,v5.16b,v5.16b,#4
1109 ext v9.16b,v9.16b,v9.16b,#4
1110 ext v13.16b,v13.16b,v13.16b,#4
1111 ext v17.16b,v17.16b,v17.16b,#4
1112 ext v21.16b,v21.16b,v21.16b,#4
// ---- Second half-round: same operation sequence on the rotated rows ----
1113 add v0.4s,v0.4s,v1.4s
1115 add v4.4s,v4.4s,v5.4s
1117 add v8.4s,v8.4s,v9.4s
1119 add v12.4s,v12.4s,v13.4s
1121 add v16.4s,v16.4s,v17.4s
1123 add v20.4s,v20.4s,v21.4s
1125 eor v3.16b,v3.16b,v0.16b
1127 eor v7.16b,v7.16b,v4.16b
1129 eor v11.16b,v11.16b,v8.16b
1131 eor v15.16b,v15.16b,v12.16b
1133 eor v19.16b,v19.16b,v16.16b
1135 eor v23.16b,v23.16b,v20.16b
1149 add v2.4s,v2.4s,v3.4s
1151 add v6.4s,v6.4s,v7.4s
1153 add v10.4s,v10.4s,v11.4s
1155 add v14.4s,v14.4s,v15.4s
1157 add v18.4s,v18.4s,v19.4s
1159 add v22.4s,v22.4s,v23.4s
1161 eor v24.16b,v1.16b,v2.16b
1163 eor v25.16b,v5.16b,v6.16b
1165 eor v26.16b,v9.16b,v10.16b
1167 eor v27.16b,v13.16b,v14.16b
1169 eor v28.16b,v17.16b,v18.16b
1171 eor v29.16b,v21.16b,v22.16b
1173 ushr v1.4s,v24.4s,#20
1175 ushr v5.4s,v25.4s,#20
1177 ushr v9.4s,v26.4s,#20
1179 ushr v13.4s,v27.4s,#20
1181 ushr v17.4s,v28.4s,#20
1183 ushr v21.4s,v29.4s,#20
1185 sli v1.4s,v24.4s,#12
1187 sli v5.4s,v25.4s,#12
1189 sli v9.4s,v26.4s,#12
1191 sli v13.4s,v27.4s,#12
1193 sli v17.4s,v28.4s,#12
1195 sli v21.4s,v29.4s,#12
1197 add v0.4s,v0.4s,v1.4s
1199 add v4.4s,v4.4s,v5.4s
1201 add v8.4s,v8.4s,v9.4s
1203 add v12.4s,v12.4s,v13.4s
1205 add v16.4s,v16.4s,v17.4s
1207 add v20.4s,v20.4s,v21.4s
1209 eor v24.16b,v3.16b,v0.16b
1211 eor v25.16b,v7.16b,v4.16b
1213 eor v26.16b,v11.16b,v8.16b
1215 eor v27.16b,v15.16b,v12.16b
1217 eor v28.16b,v19.16b,v16.16b
1219 eor v29.16b,v23.16b,v20.16b
1221 ushr v3.4s,v24.4s,#24
1223 ushr v7.4s,v25.4s,#24
1225 ushr v11.4s,v26.4s,#24
1227 ushr v15.4s,v27.4s,#24
1229 ushr v19.4s,v28.4s,#24
1231 ushr v23.4s,v29.4s,#24
1237 sli v11.4s,v26.4s,#8
1239 sli v15.4s,v27.4s,#8
1241 sli v19.4s,v28.4s,#8
1243 sli v23.4s,v29.4s,#8
1245 add v2.4s,v2.4s,v3.4s
1247 add v6.4s,v6.4s,v7.4s
1249 add v10.4s,v10.4s,v11.4s
1251 add v14.4s,v14.4s,v15.4s
1253 add v18.4s,v18.4s,v19.4s
1255 add v22.4s,v22.4s,v23.4s
1257 eor v24.16b,v1.16b,v2.16b
1259 eor v25.16b,v5.16b,v6.16b
1261 eor v26.16b,v9.16b,v10.16b
1263 eor v27.16b,v13.16b,v14.16b
1265 eor v28.16b,v17.16b,v18.16b
1267 eor v29.16b,v21.16b,v22.16b
1269 ushr v1.4s,v24.4s,#25
1271 ushr v5.4s,v25.4s,#25
1273 ushr v9.4s,v26.4s,#25
1275 ushr v13.4s,v27.4s,#25
1277 ushr v17.4s,v28.4s,#25
1279 ushr v21.4s,v29.4s,#25
1287 sli v13.4s,v27.4s,#7
1289 sli v17.4s,v28.4s,#7
1291 sli v21.4s,v29.4s,#7
// Undo the lane rotation (8, 4 and 12 bytes for rows c, d and b).
1293 ext v2.16b,v2.16b,v2.16b,#8
1295 ext v6.16b,v6.16b,v6.16b,#8
1297 ext v10.16b,v10.16b,v10.16b,#8
1299 ext v14.16b,v14.16b,v14.16b,#8
1301 ext v18.16b,v18.16b,v18.16b,#8
1303 ext v22.16b,v22.16b,v22.16b,#8
1305 ext v3.16b,v3.16b,v3.16b,#4
1306 ext v7.16b,v7.16b,v7.16b,#4
1307 ext v11.16b,v11.16b,v11.16b,#4
1308 ext v15.16b,v15.16b,v15.16b,#4
1309 ext v19.16b,v19.16b,v19.16b,#4
1310 ext v23.16b,v23.16b,v23.16b,#4
1311 ext v1.16b,v1.16b,v1.16b,#12
1312 ext v5.16b,v5.16b,v5.16b,#12
1313 ext v9.16b,v9.16b,v9.16b,#12
1314 ext v13.16b,v13.16b,v13.16b,#12
1315 ext v17.16b,v17.16b,v17.16b,#12
1316 ext v21.16b,v21.16b,v21.16b,#12
// Repeat the upper round loop while x4 is non-zero.
// NOTE(review): the initialisation and decrement of x4 are not visible in
// this listing.
1317 cbnz x4,.Loop_upper_neon
// ---- First scalar block: feed-forward, pack, load input, store output ----
1319 add w5,w5,w22 // accumulate key block
1320 add x6,x6,x22,lsr#32
1322 add x8,x8,x23,lsr#32
1324 add x10,x10,x24,lsr#32
1326 add x12,x12,x25,lsr#32
1328 add x14,x14,x26,lsr#32
1330 add x16,x16,x27,lsr#32
1332 add x19,x19,x28,lsr#32
1334 add x21,x21,x30,lsr#32
1336 add x5,x5,x6,lsl#32 // pack
1338 ldp x6,x8,[x1,#0] // load input
1339 add x9,x9,x10,lsl#32
1340 add x11,x11,x12,lsl#32
1341 ldp x10,x12,[x1,#16]
1342 add x13,x13,x14,lsl#32
1343 add x15,x15,x16,lsl#32
1344 ldp x14,x16,[x1,#32]
1345 add x17,x17,x19,lsl#32
1346 add x20,x20,x21,lsl#32
1347 ldp x19,x21,[x1,#48]
1368 stp x5,x7,[x0,#0] // store output
1369 add x28,x28,#1 // increment counter
// Start unpacking the inputs again for the second scalar block of this
// iteration.
1370 mov w5,w22 // unpack key block
1375 stp x13,x15,[x0,#32]
1378 stp x17,x20,[x0,#48]
// ======== "lower" round loop body: same NEON sequence as the upper loop ========
// ---- First half-round on rows as loaded ----
1394 add v0.4s,v0.4s,v1.4s
1396 add v4.4s,v4.4s,v5.4s
1398 add v8.4s,v8.4s,v9.4s
1400 add v12.4s,v12.4s,v13.4s
1402 add v16.4s,v16.4s,v17.4s
1404 add v20.4s,v20.4s,v21.4s
1406 eor v3.16b,v3.16b,v0.16b
1408 eor v7.16b,v7.16b,v4.16b
1410 eor v11.16b,v11.16b,v8.16b
1412 eor v15.16b,v15.16b,v12.16b
1414 eor v19.16b,v19.16b,v16.16b
1416 eor v23.16b,v23.16b,v20.16b
1430 add v2.4s,v2.4s,v3.4s
1432 add v6.4s,v6.4s,v7.4s
1434 add v10.4s,v10.4s,v11.4s
1436 add v14.4s,v14.4s,v15.4s
1438 add v18.4s,v18.4s,v19.4s
1440 add v22.4s,v22.4s,v23.4s
1442 eor v24.16b,v1.16b,v2.16b
1444 eor v25.16b,v5.16b,v6.16b
1446 eor v26.16b,v9.16b,v10.16b
1448 eor v27.16b,v13.16b,v14.16b
1450 eor v28.16b,v17.16b,v18.16b
1452 eor v29.16b,v21.16b,v22.16b
1454 ushr v1.4s,v24.4s,#20
1456 ushr v5.4s,v25.4s,#20
1458 ushr v9.4s,v26.4s,#20
1460 ushr v13.4s,v27.4s,#20
1462 ushr v17.4s,v28.4s,#20
1464 ushr v21.4s,v29.4s,#20
1466 sli v1.4s,v24.4s,#12
1468 sli v5.4s,v25.4s,#12
1470 sli v9.4s,v26.4s,#12
1472 sli v13.4s,v27.4s,#12
1474 sli v17.4s,v28.4s,#12
1476 sli v21.4s,v29.4s,#12
1478 add v0.4s,v0.4s,v1.4s
1480 add v4.4s,v4.4s,v5.4s
1482 add v8.4s,v8.4s,v9.4s
1484 add v12.4s,v12.4s,v13.4s
1486 add v16.4s,v16.4s,v17.4s
1488 add v20.4s,v20.4s,v21.4s
1490 eor v24.16b,v3.16b,v0.16b
1492 eor v25.16b,v7.16b,v4.16b
1494 eor v26.16b,v11.16b,v8.16b
1496 eor v27.16b,v15.16b,v12.16b
1498 eor v28.16b,v19.16b,v16.16b
1500 eor v29.16b,v23.16b,v20.16b
1502 ushr v3.4s,v24.4s,#24
1504 ushr v7.4s,v25.4s,#24
1506 ushr v11.4s,v26.4s,#24
1508 ushr v15.4s,v27.4s,#24
1510 ushr v19.4s,v28.4s,#24
1512 ushr v23.4s,v29.4s,#24
1518 sli v11.4s,v26.4s,#8
1520 sli v15.4s,v27.4s,#8
1522 sli v19.4s,v28.4s,#8
1524 sli v23.4s,v29.4s,#8
1526 add v2.4s,v2.4s,v3.4s
1528 add v6.4s,v6.4s,v7.4s
1530 add v10.4s,v10.4s,v11.4s
1532 add v14.4s,v14.4s,v15.4s
1534 add v18.4s,v18.4s,v19.4s
1536 add v22.4s,v22.4s,v23.4s
1538 eor v24.16b,v1.16b,v2.16b
1540 eor v25.16b,v5.16b,v6.16b
1542 eor v26.16b,v9.16b,v10.16b
1544 eor v27.16b,v13.16b,v14.16b
1546 eor v28.16b,v17.16b,v18.16b
1548 eor v29.16b,v21.16b,v22.16b
1550 ushr v1.4s,v24.4s,#25
1552 ushr v5.4s,v25.4s,#25
1554 ushr v9.4s,v26.4s,#25
1556 ushr v13.4s,v27.4s,#25
1558 ushr v17.4s,v28.4s,#25
1560 ushr v21.4s,v29.4s,#25
1568 sli v13.4s,v27.4s,#7
1570 sli v17.4s,v28.4s,#7
1572 sli v21.4s,v29.4s,#7
// Rotate rows c, d and b by 8, 12 and 4 bytes for the diagonal half-round.
1574 ext v2.16b,v2.16b,v2.16b,#8
1576 ext v6.16b,v6.16b,v6.16b,#8
1578 ext v10.16b,v10.16b,v10.16b,#8
1580 ext v14.16b,v14.16b,v14.16b,#8
1582 ext v18.16b,v18.16b,v18.16b,#8
1584 ext v22.16b,v22.16b,v22.16b,#8
1586 ext v3.16b,v3.16b,v3.16b,#12
1587 ext v7.16b,v7.16b,v7.16b,#12
1588 ext v11.16b,v11.16b,v11.16b,#12
1589 ext v15.16b,v15.16b,v15.16b,#12
1590 ext v19.16b,v19.16b,v19.16b,#12
1591 ext v23.16b,v23.16b,v23.16b,#12
1592 ext v1.16b,v1.16b,v1.16b,#4
1593 ext v5.16b,v5.16b,v5.16b,#4
1594 ext v9.16b,v9.16b,v9.16b,#4
1595 ext v13.16b,v13.16b,v13.16b,#4
1596 ext v17.16b,v17.16b,v17.16b,#4
1597 ext v21.16b,v21.16b,v21.16b,#4
// ---- Second half-round on the rotated rows ----
1598 add v0.4s,v0.4s,v1.4s
1600 add v4.4s,v4.4s,v5.4s
1602 add v8.4s,v8.4s,v9.4s
1604 add v12.4s,v12.4s,v13.4s
1606 add v16.4s,v16.4s,v17.4s
1608 add v20.4s,v20.4s,v21.4s
1610 eor v3.16b,v3.16b,v0.16b
1612 eor v7.16b,v7.16b,v4.16b
1614 eor v11.16b,v11.16b,v8.16b
1616 eor v15.16b,v15.16b,v12.16b
1618 eor v19.16b,v19.16b,v16.16b
1620 eor v23.16b,v23.16b,v20.16b
1634 add v2.4s,v2.4s,v3.4s
1636 add v6.4s,v6.4s,v7.4s
1638 add v10.4s,v10.4s,v11.4s
1640 add v14.4s,v14.4s,v15.4s
1642 add v18.4s,v18.4s,v19.4s
1644 add v22.4s,v22.4s,v23.4s
1646 eor v24.16b,v1.16b,v2.16b
1648 eor v25.16b,v5.16b,v6.16b
1650 eor v26.16b,v9.16b,v10.16b
1652 eor v27.16b,v13.16b,v14.16b
1654 eor v28.16b,v17.16b,v18.16b
1656 eor v29.16b,v21.16b,v22.16b
1658 ushr v1.4s,v24.4s,#20
1660 ushr v5.4s,v25.4s,#20
1662 ushr v9.4s,v26.4s,#20
1664 ushr v13.4s,v27.4s,#20
1666 ushr v17.4s,v28.4s,#20
1668 ushr v21.4s,v29.4s,#20
1670 sli v1.4s,v24.4s,#12
1672 sli v5.4s,v25.4s,#12
1674 sli v9.4s,v26.4s,#12
1676 sli v13.4s,v27.4s,#12
1678 sli v17.4s,v28.4s,#12
1680 sli v21.4s,v29.4s,#12
1682 add v0.4s,v0.4s,v1.4s
1684 add v4.4s,v4.4s,v5.4s
1686 add v8.4s,v8.4s,v9.4s
1688 add v12.4s,v12.4s,v13.4s
1690 add v16.4s,v16.4s,v17.4s
1692 add v20.4s,v20.4s,v21.4s
1694 eor v24.16b,v3.16b,v0.16b
1696 eor v25.16b,v7.16b,v4.16b
1698 eor v26.16b,v11.16b,v8.16b
1700 eor v27.16b,v15.16b,v12.16b
1702 eor v28.16b,v19.16b,v16.16b
1704 eor v29.16b,v23.16b,v20.16b
1706 ushr v3.4s,v24.4s,#24
1708 ushr v7.4s,v25.4s,#24
1710 ushr v11.4s,v26.4s,#24
1712 ushr v15.4s,v27.4s,#24
1714 ushr v19.4s,v28.4s,#24
1716 ushr v23.4s,v29.4s,#24
1722 sli v11.4s,v26.4s,#8
1724 sli v15.4s,v27.4s,#8
1726 sli v19.4s,v28.4s,#8
1728 sli v23.4s,v29.4s,#8
1730 add v2.4s,v2.4s,v3.4s
1732 add v6.4s,v6.4s,v7.4s
1734 add v10.4s,v10.4s,v11.4s
1736 add v14.4s,v14.4s,v15.4s
1738 add v18.4s,v18.4s,v19.4s
1740 add v22.4s,v22.4s,v23.4s
1742 eor v24.16b,v1.16b,v2.16b
1744 eor v25.16b,v5.16b,v6.16b
1746 eor v26.16b,v9.16b,v10.16b
1748 eor v27.16b,v13.16b,v14.16b
1750 eor v28.16b,v17.16b,v18.16b
1752 eor v29.16b,v21.16b,v22.16b
1754 ushr v1.4s,v24.4s,#25
1756 ushr v5.4s,v25.4s,#25
1758 ushr v9.4s,v26.4s,#25
1760 ushr v13.4s,v27.4s,#25
1762 ushr v17.4s,v28.4s,#25
1764 ushr v21.4s,v29.4s,#25
1772 sli v13.4s,v27.4s,#7
1774 sli v17.4s,v28.4s,#7
1776 sli v21.4s,v29.4s,#7
// Undo the lane rotation (8, 4 and 12 bytes for rows c, d and b).
1778 ext v2.16b,v2.16b,v2.16b,#8
1780 ext v6.16b,v6.16b,v6.16b,#8
1782 ext v10.16b,v10.16b,v10.16b,#8
1784 ext v14.16b,v14.16b,v14.16b,#8
1786 ext v18.16b,v18.16b,v18.16b,#8
1788 ext v22.16b,v22.16b,v22.16b,#8
1790 ext v3.16b,v3.16b,v3.16b,#4
1791 ext v7.16b,v7.16b,v7.16b,#4
1792 ext v11.16b,v11.16b,v11.16b,#4
1793 ext v15.16b,v15.16b,v15.16b,#4
1794 ext v19.16b,v19.16b,v19.16b,#4
1795 ext v23.16b,v23.16b,v23.16b,#4
1796 ext v1.16b,v1.16b,v1.16b,#12
1797 ext v5.16b,v5.16b,v5.16b,#12
1798 ext v9.16b,v9.16b,v9.16b,#12
1799 ext v13.16b,v13.16b,v13.16b,#12
1800 ext v17.16b,v17.16b,v17.16b,#12
1801 ext v21.16b,v21.16b,v21.16b,#12
// Repeat the lower round loop while x4 is non-zero.
1802 cbnz x4,.Loop_lower_neon
// ---- Feed-forward for the second scalar block and all six NEON blocks ----
// v24-v29 were used as temporaries during the rounds, so rows are reloaded
// from the off-load area before being added back.
// NOTE(review): the reload of q24,q25 is not visible in this listing.
1804 add w5,w5,w22 // accumulate key block
1806 add x6,x6,x22,lsr#32
1807 ldp q26,q27,[sp,#32]
1809 ldp q28,q29,[sp,#64]
1810 add x8,x8,x23,lsr#32
// Row a of every block += v24.
1811 add v0.4s,v0.4s,v24.4s
1813 add v4.4s,v4.4s,v24.4s
1814 add x10,x10,x24,lsr#32
1815 add v8.4s,v8.4s,v24.4s
1817 add v12.4s,v12.4s,v24.4s
1818 add x12,x12,x25,lsr#32
1819 add v16.4s,v16.4s,v24.4s
1821 add v20.4s,v20.4s,v24.4s
1822 add x14,x14,x26,lsr#32
// Row c of every block += v26.
1823 add v2.4s,v2.4s,v26.4s
1825 add v6.4s,v6.4s,v26.4s
1826 add x16,x16,x27,lsr#32
1827 add v10.4s,v10.4s,v26.4s
1829 add v14.4s,v14.4s,v26.4s
1830 add x19,x19,x28,lsr#32
1831 add v18.4s,v18.4s,v26.4s
1833 add v22.4s,v22.4s,v26.4s
1834 add x21,x21,x30,lsr#32
// Row d += per-block counter row. Blocks 4 and 5 reuse v27/v28 and get the
// extra v31 added first (their counters are blocks 0/1 plus v31).
1835 add v19.4s,v19.4s,v31.4s // +4
1836 add x5,x5,x6,lsl#32 // pack
1837 add v23.4s,v23.4s,v31.4s // +4
1839 add v3.4s,v3.4s,v27.4s
1840 ldp x6,x8,[x1,#0] // load input
1841 add v7.4s,v7.4s,v28.4s
1842 add x9,x9,x10,lsl#32
1843 add v11.4s,v11.4s,v29.4s
1844 add x11,x11,x12,lsl#32
1845 add v15.4s,v15.4s,v30.4s
1846 ldp x10,x12,[x1,#16]
1847 add v19.4s,v19.4s,v27.4s
1848 add x13,x13,x14,lsl#32
1849 add v23.4s,v23.4s,v28.4s
1850 add x15,x15,x16,lsl#32
// Row b of every block += v25.
1851 add v1.4s,v1.4s,v25.4s
1852 ldp x14,x16,[x1,#32]
1853 add v5.4s,v5.4s,v25.4s
1854 add x17,x17,x19,lsl#32
1855 add v9.4s,v9.4s,v25.4s
1856 add x20,x20,x21,lsl#32
1857 add v13.4s,v13.4s,v25.4s
1858 ldp x19,x21,[x1,#48]
1859 add v17.4s,v17.4s,v25.4s
1861 add v21.4s,v21.4s,v25.4s
// ---- XOR the six NEON key-stream blocks with the input and store them ----
// Registers are recycled as input buffers as soon as their block has been
// stored; every ld1/st1 post-increments its pointer by 64.
1873 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1879 eor v0.16b,v0.16b,v24.16b
1881 eor v1.16b,v1.16b,v25.16b
1883 eor v2.16b,v2.16b,v26.16b
1885 eor v3.16b,v3.16b,v27.16b
1886 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1888 stp x5,x7,[x0,#0] // store output
// Together with the +1 at original line 1369 the scalar counter advances by
// 8 per outer iteration.
1889 add x28,x28,#7 // increment counter
1891 stp x13,x15,[x0,#32]
1892 stp x17,x20,[x0,#48]
1894 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1896 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1897 eor v4.16b,v4.16b,v24.16b
1898 eor v5.16b,v5.16b,v25.16b
1899 eor v6.16b,v6.16b,v26.16b
1900 eor v7.16b,v7.16b,v27.16b
1901 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1903 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1904 eor v8.16b,v8.16b,v0.16b
1906 eor v9.16b,v9.16b,v1.16b
// v26,v27 were overwritten with input data above; reload them from the
// off-load area for the next iteration.
1907 ldp q26,q27,[sp,#32]
1908 eor v10.16b,v10.16b,v2.16b
1909 eor v11.16b,v11.16b,v3.16b
1910 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1912 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1913 eor v12.16b,v12.16b,v4.16b
1914 eor v13.16b,v13.16b,v5.16b
1915 eor v14.16b,v14.16b,v6.16b
1916 eor v15.16b,v15.16b,v7.16b
1917 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1919 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1920 eor v16.16b,v16.16b,v8.16b
1921 eor v17.16b,v17.16b,v9.16b
1922 eor v18.16b,v18.16b,v10.16b
1923 eor v19.16b,v19.16b,v11.16b
1924 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
// v0 is free here and is used to hold twice the v31 increment.
1926 shl v0.4s,v31.4s,#1 // 4 -> 8
1927 eor v20.16b,v20.16b,v12.16b
1928 eor v21.16b,v21.16b,v13.16b
1929 eor v22.16b,v22.16b,v14.16b
1930 eor v23.16b,v23.16b,v15.16b
1931 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
// Advance all four vector counter rows by 8 for the next outer iteration.
1933 add v27.4s,v27.4s,v0.4s // += 8
1934 add v28.4s,v28.4s,v0.4s
1935 add v29.4s,v29.4s,v0.4s
1936 add v30.4s,v30.4s,v0.4s
// NOTE(review): the instruction that sets the flags for this branch is not
// visible in this listing.
1938 b.hs .Loop_outer_512_neon
// ---- Leaving the 512-byte loop ----
1941 ushr v0.4s,v31.4s,#2 // 4 -> 1
1943 ldp d8,d9,[sp,#128+0] // meet ABI requirements
1944 ldp d10,d11,[sp,#128+16]
1945 ldp d12,d13,[sp,#128+32]
1946 ldp d14,d15,[sp,#128+48]
// Overwrite the 96 bytes at sp+0..sp+95 where key-block rows were kept.
1948 stp q24,q31,[sp,#0] // wipe off-load area
1949 stp q24,q31,[sp,#32]
1950 stp q24,q31,[sp,#64]
// NOTE(review): the flags tested by the next two branches come from
// instructions that are not visible in this listing.
1952 b.eq .Ldone_512_neon
// Step the three counter rows used by the 3x NEON loop back by one before
// handing the remaining data to .Loop_outer_neon in ChaCha20_neon.
1955 sub v27.4s,v27.4s,v0.4s // -= 1
1956 sub v28.4s,v28.4s,v0.4s
1957 sub v29.4s,v29.4s,v0.4s
1959 b.hs .Loop_outer_neon
// Zero v25-v30 (x ^ x = 0).
// NOTE(review): presumably scrubs key and counter material from the vector
// registers before returning - confirm.
1961 eor v25.16b,v25.16b,v25.16b
1962 eor v26.16b,v26.16b,v26.16b
1963 eor v27.16b,v27.16b,v27.16b
1964 eor v28.16b,v28.16b,v28.16b
1965 eor v29.16b,v29.16b,v29.16b
1966 eor v30.16b,v30.16b,v30.16b
// Epilogue: reload x19-x28 from the frame, pop the 96-byte frame record.
1970 ldp x19,x20,[x29,#16]
1972 ldp x21,x22,[x29,#32]
1973 ldp x23,x24,[x29,#48]
1974 ldp x25,x26,[x29,#64]
1975 ldp x27,x28,[x29,#80]
1976 ldp x29,x30,[sp],#96
1977 .inst 0xd50323bf // autiasp
1979 .size ChaCha20_512_neon,.-ChaCha20_512_neon