2 /* Do not modify. This file is auto-generated from chacha-armv8.pl. */
// Constant data read by the routines below.
// The two 64-bit values, viewed as little-endian bytes, spell the ASCII
// string "expand 32-byte k"; they are loaded as "sigma" by each routine.
11 .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
// Distance from this location to OPENSSL_armcap_P, emitted once as a 32-bit
// and once as a 64-bit value.
// NOTE(review): presumably only one of the two is assembled (selected by a
// preprocessor conditional whose lines are not part of this listing) - confirm.
16 .long OPENSSL_armcap_P-.
18 .quad OPENSSL_armcap_P-.
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
20 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// ChaCha20_ctr32 - integer-register (non-NEON) code path.
// Register roles visible in this listing:
//   x0 = output pointer (stp ...,[x0]), x1 = input pointer (ldp ...,[x1]),
//   x3 = key pointer, x4 = counter pointer,
//   x22..x28 and x30 = the key block kept as pairs of 32-bit words packed
//   into 64-bit registers (x26/x27 are loaded by a line not shown here).
// NOTE(review): this is a partial listing - the entry label, the round loop
// and the final return are not among the visible lines.
24 .type ChaCha20_ctr32,%function
// Address of the capability-word offset emitted in the data section above.
28 adr x5,.LOPENSSL_armcap_P
41 .inst 0xd503233f // paciasp
// Allocate a 96-byte frame and save the frame pointer / link register.
42 stp x29,x30,[sp,#-96]!
// NOTE(review): x5 must have been re-pointed at the sigma constants by a line
// not shown here before this load - confirm.
53 ldp x22,x23,[x5] // load sigma
54 ldp x24,x25,[x3] // load key
// x30 (the link register) is reused as a data register from here on.
56 ldp x28,x30,[x4] // load counter
// Low 32 bits of x22 become the first working word.
67 mov w5,w22 // unpack key block
// After the rounds: add the original key block back into the working state.
// "xN,lsr#32" selects the upper 32-bit word of a packed key-block register.
186 add w5,w5,w22 // accumulate key block
191 add x10,x10,x24,lsr#32
193 add x12,x12,x25,lsr#32
195 add x14,x14,x26,lsr#32
197 add x16,x16,x27,lsr#32
199 add x19,x19,x28,lsr#32
201 add x21,x21,x30,lsr#32
// Merge two 32-bit result words into one 64-bit register (odd word goes to
// the upper half) so the block can be handled as 64-bit quantities.
205 add x5,x5,x6,lsl#32 // pack
207 ldp x6,x8,[x1,#0] // load input
209 add x11,x11,x12,lsl#32
211 add x13,x13,x14,lsl#32
212 add x15,x15,x16,lsl#32
214 add x17,x17,x19,lsl#32
215 add x20,x20,x21,lsl#32
237 stp x5,x7,[x0,#0] // store output
// One block is produced per iteration on this path.
238 add x28,x28,#1 // increment counter
// Epilogue: restore callee-saved x19..x28 from the frame addressed by x29.
246 ldp x19,x20,[x29,#16]
248 ldp x21,x22,[x29,#32]
249 ldp x23,x24,[x29,#48]
250 ldp x25,x26,[x29,#64]
251 ldp x27,x28,[x29,#80]
253 .inst 0xd50323bf // autiasp
// Second copy of the pack sequence followed by a second epilogue.
// NOTE(review): presumably the path for a final partial block - the
// surrounding labels are not visible in this listing.
267 add x5,x5,x6,lsl#32 // pack
270 add x11,x11,x12,lsl#32
271 add x13,x13,x14,lsl#32
272 add x15,x15,x16,lsl#32
273 add x17,x17,x19,lsl#32
274 add x20,x20,x21,lsl#32
303 ldp x19,x20,[x29,#16]
305 ldp x21,x22,[x29,#32]
306 ldp x23,x24,[x29,#48]
307 ldp x25,x26,[x29,#64]
308 ldp x27,x28,[x29,#80]
310 .inst 0xd50323bf // autiasp
312 .size ChaCha20_ctr32,.-ChaCha20_ctr32
// ChaCha20_neon - processes one block in integer registers and three blocks
// in NEON registers per outer iteration (the counter advances by 4).
// Vector state layout visible below, one 4x32-bit row per register:
//   block A: v0..v3, block B: v4..v7, block C: v16..v19
//   v24 = sigma row, v25/v26 = key rows, v27/v28/v29 = per-block counter rows,
//   v31 = counter step, v20..v23 = temporaries / input data.
// NOTE(review): this is a partial listing - labels, the scalar round code and
// several vector instructions (e.g. most sli lines) are not shown.
314 .type ChaCha20_neon,%function
317 .inst 0xd503233f // paciasp
// Allocate a 96-byte frame and save the frame pointer / link register.
318 stp x29,x30,[sp,#-96]!
// Taken on unsigned >= of a comparison that is not part of this listing.
328 b.hs .L512_or_more_neon
// The key block is loaded twice: once into integer registers for the scalar
// block and once into vector registers for the NEON blocks.
332 ldp x22,x23,[x5] // load sigma
333 ld1 {v24.4s},[x5],#16
334 ldp x24,x25,[x3] // load key
336 ld1 {v25.4s,v26.4s},[x3]
337 ldp x28,x30,[x4] // load counter
// Derive three consecutive counter rows, each one v31 step after the previous.
349 add v27.4s,v27.4s,v31.4s // += 1
350 add v28.4s,v27.4s,v31.4s
351 add v29.4s,v28.4s,v31.4s
// From now on v31 is the per-iteration step.
352 shl v31.4s,v31.4s,#2 // 1 -> 4
355 mov w5,w22 // unpack key block
// ---- first half-round, three blocks interleaved ----
// a += b; d ^= a
388 add v0.4s,v0.4s,v1.4s
390 add v4.4s,v4.4s,v5.4s
392 add v16.4s,v16.4s,v17.4s
394 eor v3.16b,v3.16b,v0.16b
396 eor v7.16b,v7.16b,v4.16b
398 eor v19.16b,v19.16b,v16.16b
// c += d; t = b ^ c
406 add v2.4s,v2.4s,v3.4s
408 add v6.4s,v6.4s,v7.4s
410 add v18.4s,v18.4s,v19.4s
412 eor v20.16b,v1.16b,v2.16b
414 eor v21.16b,v5.16b,v6.16b
416 eor v22.16b,v17.16b,v18.16b
// b = t rotated left by 12: ushr #20 supplies the low bits and sli #12 inserts
// the rest (only the sli for v17 is present in this listing).
418 ushr v1.4s,v20.4s,#20
420 ushr v5.4s,v21.4s,#20
422 ushr v17.4s,v22.4s,#20
428 sli v17.4s,v22.4s,#12
// a += b; t = d ^ a
430 add v0.4s,v0.4s,v1.4s
432 add v4.4s,v4.4s,v5.4s
434 add v16.4s,v16.4s,v17.4s
436 eor v20.16b,v3.16b,v0.16b
438 eor v21.16b,v7.16b,v4.16b
440 eor v22.16b,v19.16b,v16.16b
// Right-shift part of the next rotation of d.
// NOTE(review): the matching sli #8 lines are not in this listing.
442 ushr v3.4s,v20.4s,#24
444 ushr v7.4s,v21.4s,#24
446 ushr v19.4s,v22.4s,#24
// c += d; t = b ^ c
454 add v2.4s,v2.4s,v3.4s
456 add v6.4s,v6.4s,v7.4s
458 add v18.4s,v18.4s,v19.4s
460 eor v20.16b,v1.16b,v2.16b
462 eor v21.16b,v5.16b,v6.16b
464 eor v22.16b,v17.16b,v18.16b
// Right-shift part of the next rotation of b.
// NOTE(review): the matching sli #7 lines are not in this listing.
466 ushr v1.4s,v20.4s,#25
468 ushr v5.4s,v21.4s,#25
470 ushr v17.4s,v22.4s,#25
// ext with the same register as both sources rotates the row by whole lanes:
// c by 8 bytes, d by 12 bytes, b by 4 bytes.
478 ext v2.16b,v2.16b,v2.16b,#8
480 ext v6.16b,v6.16b,v6.16b,#8
482 ext v18.16b,v18.16b,v18.16b,#8
484 ext v3.16b,v3.16b,v3.16b,#12
485 ext v7.16b,v7.16b,v7.16b,#12
486 ext v19.16b,v19.16b,v19.16b,#12
487 ext v1.16b,v1.16b,v1.16b,#4
488 ext v5.16b,v5.16b,v5.16b,#4
489 ext v17.16b,v17.16b,v17.16b,#4
// ---- second half-round: same add/xor/rotate sequence on the rotated rows ----
490 add v0.4s,v0.4s,v1.4s
492 add v4.4s,v4.4s,v5.4s
494 add v16.4s,v16.4s,v17.4s
496 eor v3.16b,v3.16b,v0.16b
498 eor v7.16b,v7.16b,v4.16b
500 eor v19.16b,v19.16b,v16.16b
508 add v2.4s,v2.4s,v3.4s
510 add v6.4s,v6.4s,v7.4s
512 add v18.4s,v18.4s,v19.4s
514 eor v20.16b,v1.16b,v2.16b
516 eor v21.16b,v5.16b,v6.16b
518 eor v22.16b,v17.16b,v18.16b
520 ushr v1.4s,v20.4s,#20
522 ushr v5.4s,v21.4s,#20
524 ushr v17.4s,v22.4s,#20
530 sli v17.4s,v22.4s,#12
532 add v0.4s,v0.4s,v1.4s
534 add v4.4s,v4.4s,v5.4s
536 add v16.4s,v16.4s,v17.4s
538 eor v20.16b,v3.16b,v0.16b
540 eor v21.16b,v7.16b,v4.16b
542 eor v22.16b,v19.16b,v16.16b
544 ushr v3.4s,v20.4s,#24
546 ushr v7.4s,v21.4s,#24
548 ushr v19.4s,v22.4s,#24
556 add v2.4s,v2.4s,v3.4s
558 add v6.4s,v6.4s,v7.4s
560 add v18.4s,v18.4s,v19.4s
562 eor v20.16b,v1.16b,v2.16b
564 eor v21.16b,v5.16b,v6.16b
566 eor v22.16b,v17.16b,v18.16b
568 ushr v1.4s,v20.4s,#25
570 ushr v5.4s,v21.4s,#25
572 ushr v17.4s,v22.4s,#25
// Undo the lane rotation: d by 4 bytes and b by 12 bytes (the inverse of the
// 12 / 4 used above); c by 8 bytes again.
580 ext v2.16b,v2.16b,v2.16b,#8
582 ext v6.16b,v6.16b,v6.16b,#8
584 ext v18.16b,v18.16b,v18.16b,#8
586 ext v3.16b,v3.16b,v3.16b,#4
587 ext v7.16b,v7.16b,v7.16b,#4
588 ext v19.16b,v19.16b,v19.16b,#4
589 ext v1.16b,v1.16b,v1.16b,#12
590 ext v5.16b,v5.16b,v5.16b,#12
591 ext v17.16b,v17.16b,v17.16b,#12
// ---- after the rounds: add the original key block back in ----
// Scalar adds (x/w registers) and vector adds are interleaved.
// Vector rows: a += sigma (v24), c += v26, d += that block's own counter row
// (v27/v28/v29), b += v25.
594 add w5,w5,w22 // accumulate key block
595 add v0.4s,v0.4s,v24.4s
597 add v4.4s,v4.4s,v24.4s
599 add v16.4s,v16.4s,v24.4s
601 add v2.4s,v2.4s,v26.4s
603 add v6.4s,v6.4s,v26.4s
604 add x10,x10,x24,lsr#32
605 add v18.4s,v18.4s,v26.4s
607 add v3.4s,v3.4s,v27.4s
608 add x12,x12,x25,lsr#32
610 add v7.4s,v7.4s,v28.4s
611 add x14,x14,x26,lsr#32
613 add v19.4s,v19.4s,v29.4s
614 add x16,x16,x27,lsr#32
616 add v1.4s,v1.4s,v25.4s
617 add x19,x19,x28,lsr#32
619 add v5.4s,v5.4s,v25.4s
620 add x21,x21,x30,lsr#32
621 add v17.4s,v17.4s,v25.4s
// Merge pairs of 32-bit scalar result words into 64-bit registers.
625 add x5,x5,x6,lsl#32 // pack
627 ldp x6,x8,[x1,#0] // load input
629 add x11,x11,x12,lsl#32
631 add x13,x13,x14,lsl#32
632 add x15,x15,x16,lsl#32
634 add x17,x17,x19,lsl#32
635 add x20,x20,x21,lsl#32
// Vector blocks: load 64 input bytes (x1 post-incremented), xor them with the
// computed block, store 64 bytes (x0 post-incremented).
648 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
654 eor v0.16b,v0.16b,v20.16b
656 eor v1.16b,v1.16b,v21.16b
658 eor v2.16b,v2.16b,v22.16b
660 eor v3.16b,v3.16b,v23.16b
661 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
663 stp x5,x7,[x0,#0] // store output
// Four blocks were consumed: advance the scalar counter and all three vector
// counter rows by the same step.
664 add x28,x28,#4 // increment counter
666 add v27.4s,v27.4s,v31.4s // += 4
668 add v28.4s,v28.4s,v31.4s
670 add v29.4s,v29.4s,v31.4s
673 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
// v0..v3 were stored above and are reused as the input buffer for block C.
674 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
676 eor v4.16b,v4.16b,v20.16b
677 eor v5.16b,v5.16b,v21.16b
678 eor v6.16b,v6.16b,v22.16b
679 eor v7.16b,v7.16b,v23.16b
680 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
682 eor v16.16b,v16.16b,v0.16b
683 eor v17.16b,v17.16b,v1.16b
684 eor v18.16b,v18.16b,v2.16b
685 eor v19.16b,v19.16b,v3.16b
686 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
// Loop on unsigned > of a flag-setting instruction not shown in this listing.
688 b.hi .Loop_outer_neon
// Epilogue: restore callee-saved x19..x28 from the frame addressed by x29.
690 ldp x19,x20,[x29,#16]
692 ldp x21,x22,[x29,#32]
693 ldp x23,x24,[x29,#48]
694 ldp x25,x26,[x29,#64]
695 ldp x27,x28,[x29,#80]
697 .inst 0xd50323bf // autiasp
// Second copy of the pack / load / store sequence.
// NOTE(review): presumably the path for a final group of fewer than four
// full blocks - the labels that delimit it are not in this listing.
705 add x5,x5,x6,lsl#32 // pack
707 ldp x6,x8,[x1,#0] // load input
709 add x11,x11,x12,lsl#32
711 add x13,x13,x14,lsl#32
712 add x15,x15,x16,lsl#32
714 add x17,x17,x19,lsl#32
715 add x20,x20,x21,lsl#32
737 stp x5,x7,[x0,#0] // store output
738 add x28,x28,#4 // increment counter
// Full 64-byte vector blocks are xor-ed and stored one at a time here.
748 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
749 eor v0.16b,v0.16b,v20.16b
750 eor v1.16b,v1.16b,v21.16b
751 eor v2.16b,v2.16b,v22.16b
752 eor v3.16b,v3.16b,v23.16b
753 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
759 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
760 eor v4.16b,v4.16b,v20.16b
761 eor v5.16b,v5.16b,v21.16b
762 eor v6.16b,v6.16b,v22.16b
763 eor v7.16b,v7.16b,v23.16b
764 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
// Three alternative spills of a computed block to the stack.
// NOTE(review): presumably the block that covers a partial trailing chunk is
// written to [sp] so the tail loop can consume it piecewise - confirm.
768 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
772 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
775 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
// Repeat the tail loop until x2 reaches zero.
792 cbnz x2,.Loop_tail_neon
// Epilogue: restore callee-saved x19..x28 from the frame addressed by x29.
800 ldp x19,x20,[x29,#16]
802 ldp x21,x22,[x29,#32]
803 ldp x23,x24,[x29,#48]
804 ldp x25,x26,[x29,#64]
805 ldp x27,x28,[x29,#80]
807 .inst 0xd50323bf // autiasp
809 .size ChaCha20_neon,.-ChaCha20_neon
// ChaCha20_512_neon - bulk path: per outer iteration two blocks are computed
// in integer registers (counter += 1, then += 7) and six in NEON registers
// (vector counters += 8).
// Vector state layout visible below, one 4x32-bit row per register:
//   blocks: v0..v3, v4..v7, v8..v11, v12..v15, v16..v19, v20..v23
//   v24..v29 double as round temporaries, so the key block is parked on the
//   stack ("off-load" area at sp+0..) and reloaded after the rounds.
//   v30 = counter row for the fourth block, v31 = counter step.
// d8..d15 are saved/restored at sp+128.. because v8..v15 are used.
// NOTE(review): this is a partial listing - labels, the scalar round code and
// several vector instructions (e.g. some sli lines) are not shown.
810 .type ChaCha20_512_neon,%function
813 .inst 0xd503233f // paciasp
// Allocate a 96-byte frame and save the frame pointer / link register.
814 stp x29,x30,[sp,#-96]!
// The key block is loaded both into integer and into vector registers.
827 ldp x22,x23,[x5] // load sigma
828 ld1 {v24.4s},[x5],#16
829 ldp x24,x25,[x3] // load key
831 ld1 {v25.4s,v26.4s},[x3]
832 ldp x28,x30,[x4] // load counter
// v27 is stepped twice on purpose (see "not typo"); v28..v30 follow at one
// v31 step each.
844 add v27.4s,v27.4s,v31.4s // += 1
845 stp q24,q25,[sp,#0] // off-load key block, invariant part
846 add v27.4s,v27.4s,v31.4s // not typo
848 add v28.4s,v27.4s,v31.4s
849 add v29.4s,v28.4s,v31.4s
850 add v30.4s,v29.4s,v31.4s
851 shl v31.4s,v31.4s,#2 // 1 -> 4
// Save the callee-saved low halves of v8..v15.
853 stp d8,d9,[sp,#128+0] // meet ABI requirements
854 stp d10,d11,[sp,#128+16]
855 stp d12,d13,[sp,#128+32]
856 stp d14,d15,[sp,#128+48]
858 sub x2,x2,#512 // not typo
860 .Loop_outer_512_neon:
868 mov w5,w22 // unpack key block
// d rows of the fifth and sixth block are those of the first and second
// block plus the step in v31.
891 add v19.4s,v3.4s,v31.4s // +4
893 add v23.4s,v7.4s,v31.4s // +4
900 stp q27,q28,[sp,#48] // off-load key block, variable part
// ==== upper loop body: first scalar block + six vector blocks ====
// ---- first half-round ----
// a += b; d ^= a
908 add v0.4s,v0.4s,v1.4s
910 add v4.4s,v4.4s,v5.4s
912 add v8.4s,v8.4s,v9.4s
914 add v12.4s,v12.4s,v13.4s
916 add v16.4s,v16.4s,v17.4s
918 add v20.4s,v20.4s,v21.4s
920 eor v3.16b,v3.16b,v0.16b
922 eor v7.16b,v7.16b,v4.16b
924 eor v11.16b,v11.16b,v8.16b
926 eor v15.16b,v15.16b,v12.16b
928 eor v19.16b,v19.16b,v16.16b
930 eor v23.16b,v23.16b,v20.16b
// c += d; t = b ^ c (t in v24..v29)
944 add v2.4s,v2.4s,v3.4s
946 add v6.4s,v6.4s,v7.4s
948 add v10.4s,v10.4s,v11.4s
950 add v14.4s,v14.4s,v15.4s
952 add v18.4s,v18.4s,v19.4s
954 add v22.4s,v22.4s,v23.4s
956 eor v24.16b,v1.16b,v2.16b
958 eor v25.16b,v5.16b,v6.16b
960 eor v26.16b,v9.16b,v10.16b
962 eor v27.16b,v13.16b,v14.16b
964 eor v28.16b,v17.16b,v18.16b
966 eor v29.16b,v21.16b,v22.16b
// b = t rotated left by 12: ushr #20 gives the low bits, sli #12 inserts the rest.
968 ushr v1.4s,v24.4s,#20
970 ushr v5.4s,v25.4s,#20
972 ushr v9.4s,v26.4s,#20
974 ushr v13.4s,v27.4s,#20
976 ushr v17.4s,v28.4s,#20
978 ushr v21.4s,v29.4s,#20
986 sli v13.4s,v27.4s,#12
988 sli v17.4s,v28.4s,#12
990 sli v21.4s,v29.4s,#12
// a += b; t = d ^ a
992 add v0.4s,v0.4s,v1.4s
994 add v4.4s,v4.4s,v5.4s
996 add v8.4s,v8.4s,v9.4s
998 add v12.4s,v12.4s,v13.4s
1000 add v16.4s,v16.4s,v17.4s
1002 add v20.4s,v20.4s,v21.4s
1004 eor v24.16b,v3.16b,v0.16b
1006 eor v25.16b,v7.16b,v4.16b
1008 eor v26.16b,v11.16b,v8.16b
1010 eor v27.16b,v15.16b,v12.16b
1012 eor v28.16b,v19.16b,v16.16b
1014 eor v29.16b,v23.16b,v20.16b
// d = t rotated left by 8 (ushr #24 + sli #8).
1016 ushr v3.4s,v24.4s,#24
1018 ushr v7.4s,v25.4s,#24
1020 ushr v11.4s,v26.4s,#24
1022 ushr v15.4s,v27.4s,#24
1024 ushr v19.4s,v28.4s,#24
1026 ushr v23.4s,v29.4s,#24
1032 sli v11.4s,v26.4s,#8
1034 sli v15.4s,v27.4s,#8
1036 sli v19.4s,v28.4s,#8
1038 sli v23.4s,v29.4s,#8
// c += d; t = b ^ c
1040 add v2.4s,v2.4s,v3.4s
1042 add v6.4s,v6.4s,v7.4s
1044 add v10.4s,v10.4s,v11.4s
1046 add v14.4s,v14.4s,v15.4s
1048 add v18.4s,v18.4s,v19.4s
1050 add v22.4s,v22.4s,v23.4s
1052 eor v24.16b,v1.16b,v2.16b
1054 eor v25.16b,v5.16b,v6.16b
1056 eor v26.16b,v9.16b,v10.16b
1058 eor v27.16b,v13.16b,v14.16b
1060 eor v28.16b,v17.16b,v18.16b
1062 eor v29.16b,v21.16b,v22.16b
// b = t rotated left by 7 (ushr #25 + sli #7).
1064 ushr v1.4s,v24.4s,#25
1066 ushr v5.4s,v25.4s,#25
1068 ushr v9.4s,v26.4s,#25
1070 ushr v13.4s,v27.4s,#25
1072 ushr v17.4s,v28.4s,#25
1074 ushr v21.4s,v29.4s,#25
1082 sli v13.4s,v27.4s,#7
1084 sli v17.4s,v28.4s,#7
1086 sli v21.4s,v29.4s,#7
// ext with the same register as both sources rotates the row by whole lanes:
// c by 8 bytes, d by 12 bytes, b by 4 bytes.
1088 ext v2.16b,v2.16b,v2.16b,#8
1090 ext v6.16b,v6.16b,v6.16b,#8
1092 ext v10.16b,v10.16b,v10.16b,#8
1094 ext v14.16b,v14.16b,v14.16b,#8
1096 ext v18.16b,v18.16b,v18.16b,#8
1098 ext v22.16b,v22.16b,v22.16b,#8
1100 ext v3.16b,v3.16b,v3.16b,#12
1101 ext v7.16b,v7.16b,v7.16b,#12
1102 ext v11.16b,v11.16b,v11.16b,#12
1103 ext v15.16b,v15.16b,v15.16b,#12
1104 ext v19.16b,v19.16b,v19.16b,#12
1105 ext v23.16b,v23.16b,v23.16b,#12
1106 ext v1.16b,v1.16b,v1.16b,#4
1107 ext v5.16b,v5.16b,v5.16b,#4
1108 ext v9.16b,v9.16b,v9.16b,#4
1109 ext v13.16b,v13.16b,v13.16b,#4
1110 ext v17.16b,v17.16b,v17.16b,#4
1111 ext v21.16b,v21.16b,v21.16b,#4
// ---- second half-round: same sequence on the rotated rows ----
1112 add v0.4s,v0.4s,v1.4s
1114 add v4.4s,v4.4s,v5.4s
1116 add v8.4s,v8.4s,v9.4s
1118 add v12.4s,v12.4s,v13.4s
1120 add v16.4s,v16.4s,v17.4s
1122 add v20.4s,v20.4s,v21.4s
1124 eor v3.16b,v3.16b,v0.16b
1126 eor v7.16b,v7.16b,v4.16b
1128 eor v11.16b,v11.16b,v8.16b
1130 eor v15.16b,v15.16b,v12.16b
1132 eor v19.16b,v19.16b,v16.16b
1134 eor v23.16b,v23.16b,v20.16b
1148 add v2.4s,v2.4s,v3.4s
1150 add v6.4s,v6.4s,v7.4s
1152 add v10.4s,v10.4s,v11.4s
1154 add v14.4s,v14.4s,v15.4s
1156 add v18.4s,v18.4s,v19.4s
1158 add v22.4s,v22.4s,v23.4s
1160 eor v24.16b,v1.16b,v2.16b
1162 eor v25.16b,v5.16b,v6.16b
1164 eor v26.16b,v9.16b,v10.16b
1166 eor v27.16b,v13.16b,v14.16b
1168 eor v28.16b,v17.16b,v18.16b
1170 eor v29.16b,v21.16b,v22.16b
1172 ushr v1.4s,v24.4s,#20
1174 ushr v5.4s,v25.4s,#20
1176 ushr v9.4s,v26.4s,#20
1178 ushr v13.4s,v27.4s,#20
1180 ushr v17.4s,v28.4s,#20
1182 ushr v21.4s,v29.4s,#20
1184 sli v1.4s,v24.4s,#12
1186 sli v5.4s,v25.4s,#12
1188 sli v9.4s,v26.4s,#12
1190 sli v13.4s,v27.4s,#12
1192 sli v17.4s,v28.4s,#12
1194 sli v21.4s,v29.4s,#12
1196 add v0.4s,v0.4s,v1.4s
1198 add v4.4s,v4.4s,v5.4s
1200 add v8.4s,v8.4s,v9.4s
1202 add v12.4s,v12.4s,v13.4s
1204 add v16.4s,v16.4s,v17.4s
1206 add v20.4s,v20.4s,v21.4s
1208 eor v24.16b,v3.16b,v0.16b
1210 eor v25.16b,v7.16b,v4.16b
1212 eor v26.16b,v11.16b,v8.16b
1214 eor v27.16b,v15.16b,v12.16b
1216 eor v28.16b,v19.16b,v16.16b
1218 eor v29.16b,v23.16b,v20.16b
1220 ushr v3.4s,v24.4s,#24
1222 ushr v7.4s,v25.4s,#24
1224 ushr v11.4s,v26.4s,#24
1226 ushr v15.4s,v27.4s,#24
1228 ushr v19.4s,v28.4s,#24
1230 ushr v23.4s,v29.4s,#24
1236 sli v11.4s,v26.4s,#8
1238 sli v15.4s,v27.4s,#8
1240 sli v19.4s,v28.4s,#8
1242 sli v23.4s,v29.4s,#8
1244 add v2.4s,v2.4s,v3.4s
1246 add v6.4s,v6.4s,v7.4s
1248 add v10.4s,v10.4s,v11.4s
1250 add v14.4s,v14.4s,v15.4s
1252 add v18.4s,v18.4s,v19.4s
1254 add v22.4s,v22.4s,v23.4s
1256 eor v24.16b,v1.16b,v2.16b
1258 eor v25.16b,v5.16b,v6.16b
1260 eor v26.16b,v9.16b,v10.16b
1262 eor v27.16b,v13.16b,v14.16b
1264 eor v28.16b,v17.16b,v18.16b
1266 eor v29.16b,v21.16b,v22.16b
1268 ushr v1.4s,v24.4s,#25
1270 ushr v5.4s,v25.4s,#25
1272 ushr v9.4s,v26.4s,#25
1274 ushr v13.4s,v27.4s,#25
1276 ushr v17.4s,v28.4s,#25
1278 ushr v21.4s,v29.4s,#25
1286 sli v13.4s,v27.4s,#7
1288 sli v17.4s,v28.4s,#7
1290 sli v21.4s,v29.4s,#7
// Undo the lane rotation: d by 4 bytes, b by 12 bytes; c by 8 bytes again.
1292 ext v2.16b,v2.16b,v2.16b,#8
1294 ext v6.16b,v6.16b,v6.16b,#8
1296 ext v10.16b,v10.16b,v10.16b,#8
1298 ext v14.16b,v14.16b,v14.16b,#8
1300 ext v18.16b,v18.16b,v18.16b,#8
1302 ext v22.16b,v22.16b,v22.16b,#8
1304 ext v3.16b,v3.16b,v3.16b,#4
1305 ext v7.16b,v7.16b,v7.16b,#4
1306 ext v11.16b,v11.16b,v11.16b,#4
1307 ext v15.16b,v15.16b,v15.16b,#4
1308 ext v19.16b,v19.16b,v19.16b,#4
1309 ext v23.16b,v23.16b,v23.16b,#4
1310 ext v1.16b,v1.16b,v1.16b,#12
1311 ext v5.16b,v5.16b,v5.16b,#12
1312 ext v9.16b,v9.16b,v9.16b,#12
1313 ext v13.16b,v13.16b,v13.16b,#12
1314 ext v17.16b,v17.16b,v17.16b,#12
1315 ext v21.16b,v21.16b,v21.16b,#12
// x4 serves as the round-loop counter here.
// NOTE(review): it held the counter pointer at entry; the lines that
// re-initialise and decrement it are not in this listing.
1316 cbnz x4,.Loop_upper_neon
// ---- first scalar block is complete: add key block, pack, output ----
1318 add w5,w5,w22 // accumulate key block
1319 add x6,x6,x22,lsr#32
1321 add x8,x8,x23,lsr#32
1323 add x10,x10,x24,lsr#32
1325 add x12,x12,x25,lsr#32
1327 add x14,x14,x26,lsr#32
1329 add x16,x16,x27,lsr#32
1331 add x19,x19,x28,lsr#32
1333 add x21,x21,x30,lsr#32
// Merge pairs of 32-bit result words; input loads are interleaved and reuse
// the registers that have just been folded into their partners.
1335 add x5,x5,x6,lsl#32 // pack
1337 ldp x6,x8,[x1,#0] // load input
1338 add x9,x9,x10,lsl#32
1339 add x11,x11,x12,lsl#32
1340 ldp x10,x12,[x1,#16]
1341 add x13,x13,x14,lsl#32
1342 add x15,x15,x16,lsl#32
1343 ldp x14,x16,[x1,#32]
1344 add x17,x17,x19,lsl#32
1345 add x20,x20,x21,lsl#32
1346 ldp x19,x21,[x1,#48]
1367 stp x5,x7,[x0,#0] // store output
// Step to the second scalar block and restart from the key block.
1368 add x28,x28,#1 // increment counter
1369 mov w5,w22 // unpack key block
1374 stp x13,x15,[x0,#32]
1377 stp x17,x20,[x0,#48]
// ==== lower loop body: second scalar block + the same six vector blocks ====
// Same add / xor / rotate / lane-rotate sequence as the upper loop body.
// ---- first half-round ----
1393 add v0.4s,v0.4s,v1.4s
1395 add v4.4s,v4.4s,v5.4s
1397 add v8.4s,v8.4s,v9.4s
1399 add v12.4s,v12.4s,v13.4s
1401 add v16.4s,v16.4s,v17.4s
1403 add v20.4s,v20.4s,v21.4s
1405 eor v3.16b,v3.16b,v0.16b
1407 eor v7.16b,v7.16b,v4.16b
1409 eor v11.16b,v11.16b,v8.16b
1411 eor v15.16b,v15.16b,v12.16b
1413 eor v19.16b,v19.16b,v16.16b
1415 eor v23.16b,v23.16b,v20.16b
1429 add v2.4s,v2.4s,v3.4s
1431 add v6.4s,v6.4s,v7.4s
1433 add v10.4s,v10.4s,v11.4s
1435 add v14.4s,v14.4s,v15.4s
1437 add v18.4s,v18.4s,v19.4s
1439 add v22.4s,v22.4s,v23.4s
1441 eor v24.16b,v1.16b,v2.16b
1443 eor v25.16b,v5.16b,v6.16b
1445 eor v26.16b,v9.16b,v10.16b
1447 eor v27.16b,v13.16b,v14.16b
1449 eor v28.16b,v17.16b,v18.16b
1451 eor v29.16b,v21.16b,v22.16b
// Rotate left by 12.
1453 ushr v1.4s,v24.4s,#20
1455 ushr v5.4s,v25.4s,#20
1457 ushr v9.4s,v26.4s,#20
1459 ushr v13.4s,v27.4s,#20
1461 ushr v17.4s,v28.4s,#20
1463 ushr v21.4s,v29.4s,#20
1465 sli v1.4s,v24.4s,#12
1467 sli v5.4s,v25.4s,#12
1469 sli v9.4s,v26.4s,#12
1471 sli v13.4s,v27.4s,#12
1473 sli v17.4s,v28.4s,#12
1475 sli v21.4s,v29.4s,#12
1477 add v0.4s,v0.4s,v1.4s
1479 add v4.4s,v4.4s,v5.4s
1481 add v8.4s,v8.4s,v9.4s
1483 add v12.4s,v12.4s,v13.4s
1485 add v16.4s,v16.4s,v17.4s
1487 add v20.4s,v20.4s,v21.4s
1489 eor v24.16b,v3.16b,v0.16b
1491 eor v25.16b,v7.16b,v4.16b
1493 eor v26.16b,v11.16b,v8.16b
1495 eor v27.16b,v15.16b,v12.16b
1497 eor v28.16b,v19.16b,v16.16b
1499 eor v29.16b,v23.16b,v20.16b
// Rotate left by 8.
1501 ushr v3.4s,v24.4s,#24
1503 ushr v7.4s,v25.4s,#24
1505 ushr v11.4s,v26.4s,#24
1507 ushr v15.4s,v27.4s,#24
1509 ushr v19.4s,v28.4s,#24
1511 ushr v23.4s,v29.4s,#24
1517 sli v11.4s,v26.4s,#8
1519 sli v15.4s,v27.4s,#8
1521 sli v19.4s,v28.4s,#8
1523 sli v23.4s,v29.4s,#8
1525 add v2.4s,v2.4s,v3.4s
1527 add v6.4s,v6.4s,v7.4s
1529 add v10.4s,v10.4s,v11.4s
1531 add v14.4s,v14.4s,v15.4s
1533 add v18.4s,v18.4s,v19.4s
1535 add v22.4s,v22.4s,v23.4s
1537 eor v24.16b,v1.16b,v2.16b
1539 eor v25.16b,v5.16b,v6.16b
1541 eor v26.16b,v9.16b,v10.16b
1543 eor v27.16b,v13.16b,v14.16b
1545 eor v28.16b,v17.16b,v18.16b
1547 eor v29.16b,v21.16b,v22.16b
// Rotate left by 7.
1549 ushr v1.4s,v24.4s,#25
1551 ushr v5.4s,v25.4s,#25
1553 ushr v9.4s,v26.4s,#25
1555 ushr v13.4s,v27.4s,#25
1557 ushr v17.4s,v28.4s,#25
1559 ushr v21.4s,v29.4s,#25
1567 sli v13.4s,v27.4s,#7
1569 sli v17.4s,v28.4s,#7
1571 sli v21.4s,v29.4s,#7
// Lane rotation: c by 8 bytes, d by 12 bytes, b by 4 bytes.
1573 ext v2.16b,v2.16b,v2.16b,#8
1575 ext v6.16b,v6.16b,v6.16b,#8
1577 ext v10.16b,v10.16b,v10.16b,#8
1579 ext v14.16b,v14.16b,v14.16b,#8
1581 ext v18.16b,v18.16b,v18.16b,#8
1583 ext v22.16b,v22.16b,v22.16b,#8
1585 ext v3.16b,v3.16b,v3.16b,#12
1586 ext v7.16b,v7.16b,v7.16b,#12
1587 ext v11.16b,v11.16b,v11.16b,#12
1588 ext v15.16b,v15.16b,v15.16b,#12
1589 ext v19.16b,v19.16b,v19.16b,#12
1590 ext v23.16b,v23.16b,v23.16b,#12
1591 ext v1.16b,v1.16b,v1.16b,#4
1592 ext v5.16b,v5.16b,v5.16b,#4
1593 ext v9.16b,v9.16b,v9.16b,#4
1594 ext v13.16b,v13.16b,v13.16b,#4
1595 ext v17.16b,v17.16b,v17.16b,#4
1596 ext v21.16b,v21.16b,v21.16b,#4
// ---- second half-round ----
1597 add v0.4s,v0.4s,v1.4s
1599 add v4.4s,v4.4s,v5.4s
1601 add v8.4s,v8.4s,v9.4s
1603 add v12.4s,v12.4s,v13.4s
1605 add v16.4s,v16.4s,v17.4s
1607 add v20.4s,v20.4s,v21.4s
1609 eor v3.16b,v3.16b,v0.16b
1611 eor v7.16b,v7.16b,v4.16b
1613 eor v11.16b,v11.16b,v8.16b
1615 eor v15.16b,v15.16b,v12.16b
1617 eor v19.16b,v19.16b,v16.16b
1619 eor v23.16b,v23.16b,v20.16b
1633 add v2.4s,v2.4s,v3.4s
1635 add v6.4s,v6.4s,v7.4s
1637 add v10.4s,v10.4s,v11.4s
1639 add v14.4s,v14.4s,v15.4s
1641 add v18.4s,v18.4s,v19.4s
1643 add v22.4s,v22.4s,v23.4s
1645 eor v24.16b,v1.16b,v2.16b
1647 eor v25.16b,v5.16b,v6.16b
1649 eor v26.16b,v9.16b,v10.16b
1651 eor v27.16b,v13.16b,v14.16b
1653 eor v28.16b,v17.16b,v18.16b
1655 eor v29.16b,v21.16b,v22.16b
1657 ushr v1.4s,v24.4s,#20
1659 ushr v5.4s,v25.4s,#20
1661 ushr v9.4s,v26.4s,#20
1663 ushr v13.4s,v27.4s,#20
1665 ushr v17.4s,v28.4s,#20
1667 ushr v21.4s,v29.4s,#20
1669 sli v1.4s,v24.4s,#12
1671 sli v5.4s,v25.4s,#12
1673 sli v9.4s,v26.4s,#12
1675 sli v13.4s,v27.4s,#12
1677 sli v17.4s,v28.4s,#12
1679 sli v21.4s,v29.4s,#12
1681 add v0.4s,v0.4s,v1.4s
1683 add v4.4s,v4.4s,v5.4s
1685 add v8.4s,v8.4s,v9.4s
1687 add v12.4s,v12.4s,v13.4s
1689 add v16.4s,v16.4s,v17.4s
1691 add v20.4s,v20.4s,v21.4s
1693 eor v24.16b,v3.16b,v0.16b
1695 eor v25.16b,v7.16b,v4.16b
1697 eor v26.16b,v11.16b,v8.16b
1699 eor v27.16b,v15.16b,v12.16b
1701 eor v28.16b,v19.16b,v16.16b
1703 eor v29.16b,v23.16b,v20.16b
1705 ushr v3.4s,v24.4s,#24
1707 ushr v7.4s,v25.4s,#24
1709 ushr v11.4s,v26.4s,#24
1711 ushr v15.4s,v27.4s,#24
1713 ushr v19.4s,v28.4s,#24
1715 ushr v23.4s,v29.4s,#24
1721 sli v11.4s,v26.4s,#8
1723 sli v15.4s,v27.4s,#8
1725 sli v19.4s,v28.4s,#8
1727 sli v23.4s,v29.4s,#8
1729 add v2.4s,v2.4s,v3.4s
1731 add v6.4s,v6.4s,v7.4s
1733 add v10.4s,v10.4s,v11.4s
1735 add v14.4s,v14.4s,v15.4s
1737 add v18.4s,v18.4s,v19.4s
1739 add v22.4s,v22.4s,v23.4s
1741 eor v24.16b,v1.16b,v2.16b
1743 eor v25.16b,v5.16b,v6.16b
1745 eor v26.16b,v9.16b,v10.16b
1747 eor v27.16b,v13.16b,v14.16b
1749 eor v28.16b,v17.16b,v18.16b
1751 eor v29.16b,v21.16b,v22.16b
1753 ushr v1.4s,v24.4s,#25
1755 ushr v5.4s,v25.4s,#25
1757 ushr v9.4s,v26.4s,#25
1759 ushr v13.4s,v27.4s,#25
1761 ushr v17.4s,v28.4s,#25
1763 ushr v21.4s,v29.4s,#25
1771 sli v13.4s,v27.4s,#7
1773 sli v17.4s,v28.4s,#7
1775 sli v21.4s,v29.4s,#7
// Undo the lane rotation: d by 4 bytes, b by 12 bytes; c by 8 bytes again.
1777 ext v2.16b,v2.16b,v2.16b,#8
1779 ext v6.16b,v6.16b,v6.16b,#8
1781 ext v10.16b,v10.16b,v10.16b,#8
1783 ext v14.16b,v14.16b,v14.16b,#8
1785 ext v18.16b,v18.16b,v18.16b,#8
1787 ext v22.16b,v22.16b,v22.16b,#8
1789 ext v3.16b,v3.16b,v3.16b,#4
1790 ext v7.16b,v7.16b,v7.16b,#4
1791 ext v11.16b,v11.16b,v11.16b,#4
1792 ext v15.16b,v15.16b,v15.16b,#4
1793 ext v19.16b,v19.16b,v19.16b,#4
1794 ext v23.16b,v23.16b,v23.16b,#4
1795 ext v1.16b,v1.16b,v1.16b,#12
1796 ext v5.16b,v5.16b,v5.16b,#12
1797 ext v9.16b,v9.16b,v9.16b,#12
1798 ext v13.16b,v13.16b,v13.16b,#12
1799 ext v17.16b,v17.16b,v17.16b,#12
1800 ext v21.16b,v21.16b,v21.16b,#12
1801 cbnz x4,.Loop_lower_neon
// ---- all rounds done: add the key block to the second scalar block and to
// the six vector blocks, interleaving scalar and vector work ----
// v26..v29 were clobbered as temporaries, so they are reloaded from the
// off-load area first.
// NOTE(review): the reload of v24/v25 is presumably on a line not shown.
1803 add w5,w5,w22 // accumulate key block
1805 add x6,x6,x22,lsr#32
1806 ldp q26,q27,[sp,#32]
1808 ldp q28,q29,[sp,#64]
1809 add x8,x8,x23,lsr#32
// a rows += v24
1810 add v0.4s,v0.4s,v24.4s
1812 add v4.4s,v4.4s,v24.4s
1813 add x10,x10,x24,lsr#32
1814 add v8.4s,v8.4s,v24.4s
1816 add v12.4s,v12.4s,v24.4s
1817 add x12,x12,x25,lsr#32
1818 add v16.4s,v16.4s,v24.4s
1820 add v20.4s,v20.4s,v24.4s
1821 add x14,x14,x26,lsr#32
// c rows += v26
1822 add v2.4s,v2.4s,v26.4s
1824 add v6.4s,v6.4s,v26.4s
1825 add x16,x16,x27,lsr#32
1826 add v10.4s,v10.4s,v26.4s
1828 add v14.4s,v14.4s,v26.4s
1829 add x19,x19,x28,lsr#32
1830 add v18.4s,v18.4s,v26.4s
1832 add v22.4s,v22.4s,v26.4s
1833 add x21,x21,x30,lsr#32
// d rows += per-block counter row; the fifth and sixth block reuse v27/v28
// and therefore get the extra v31 step first.
1834 add v19.4s,v19.4s,v31.4s // +4
1835 add x5,x5,x6,lsl#32 // pack
1836 add v23.4s,v23.4s,v31.4s // +4
1838 add v3.4s,v3.4s,v27.4s
1839 ldp x6,x8,[x1,#0] // load input
1840 add v7.4s,v7.4s,v28.4s
1841 add x9,x9,x10,lsl#32
1842 add v11.4s,v11.4s,v29.4s
1843 add x11,x11,x12,lsl#32
1844 add v15.4s,v15.4s,v30.4s
1845 ldp x10,x12,[x1,#16]
1846 add v19.4s,v19.4s,v27.4s
1847 add x13,x13,x14,lsl#32
1848 add v23.4s,v23.4s,v28.4s
1849 add x15,x15,x16,lsl#32
// b rows += v25
1850 add v1.4s,v1.4s,v25.4s
1851 ldp x14,x16,[x1,#32]
1852 add v5.4s,v5.4s,v25.4s
1853 add x17,x17,x19,lsl#32
1854 add v9.4s,v9.4s,v25.4s
1855 add x20,x20,x21,lsl#32
1856 add v13.4s,v13.4s,v25.4s
1857 ldp x19,x21,[x1,#48]
1858 add v17.4s,v17.4s,v25.4s
1860 add v21.4s,v21.4s,v25.4s
// ---- output: xor each vector block with 64 input bytes and store it ----
// v24..v27 (then the already-stored v0..v15) serve as input buffers; x1 and
// x0 are post-incremented by 64 per block.
1872 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1878 eor v0.16b,v0.16b,v24.16b
1880 eor v1.16b,v1.16b,v25.16b
1882 eor v2.16b,v2.16b,v26.16b
1884 eor v3.16b,v3.16b,v27.16b
1885 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1887 stp x5,x7,[x0,#0] // store output
// Together with the earlier "+1" this advances the scalar counter by 8 per
// outer iteration, matching the "+= 8" applied to v27..v30 below.
1888 add x28,x28,#7 // increment counter
1890 stp x13,x15,[x0,#32]
1891 stp x17,x20,[x0,#48]
1893 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1895 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1896 eor v4.16b,v4.16b,v24.16b
1897 eor v5.16b,v5.16b,v25.16b
1898 eor v6.16b,v6.16b,v26.16b
1899 eor v7.16b,v7.16b,v27.16b
1900 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1902 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1903 eor v8.16b,v8.16b,v0.16b
1905 eor v9.16b,v9.16b,v1.16b
// v26/v27 were used as input buffers above; restore them from the stack.
1906 ldp q26,q27,[sp,#32]
1907 eor v10.16b,v10.16b,v2.16b
1908 eor v11.16b,v11.16b,v3.16b
1909 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1911 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1912 eor v12.16b,v12.16b,v4.16b
1913 eor v13.16b,v13.16b,v5.16b
1914 eor v14.16b,v14.16b,v6.16b
1915 eor v15.16b,v15.16b,v7.16b
1916 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1918 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1919 eor v16.16b,v16.16b,v8.16b
1920 eor v17.16b,v17.16b,v9.16b
1921 eor v18.16b,v18.16b,v10.16b
1922 eor v19.16b,v19.16b,v11.16b
1923 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
// v0 is free again; use it for the doubled counter step.
1925 shl v0.4s,v31.4s,#1 // 4 -> 8
1926 eor v20.16b,v20.16b,v12.16b
1927 eor v21.16b,v21.16b,v13.16b
1928 eor v22.16b,v22.16b,v14.16b
1929 eor v23.16b,v23.16b,v15.16b
1930 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1932 add v27.4s,v27.4s,v0.4s // += 8
1933 add v28.4s,v28.4s,v0.4s
1934 add v29.4s,v29.4s,v0.4s
1935 add v30.4s,v30.4s,v0.4s
// Loop on unsigned >= of a flag-setting instruction not shown in this listing.
1937 b.hs .Loop_outer_512_neon
// ---- leaving the bulk loop ----
1940 ushr v0.4s,v31.4s,#2 // 4 -> 1
1942 ldp d8,d9,[sp,#128+0] // meet ABI requirements
1943 ldp d10,d11,[sp,#128+16]
1944 ldp d12,d13,[sp,#128+32]
1945 ldp d14,d15,[sp,#128+48]
// Overwrite sp+0..sp+95, where key-block rows were parked.
1947 stp q24,q31,[sp,#0] // wipe off-load area
1948 stp q24,q31,[sp,#32]
1949 stp q24,q31,[sp,#64]
1951 b.eq .Ldone_512_neon
// Step v27..v29 back by one and continue in ChaCha20_neon's outer loop.
// NOTE(review): presumably this undoes the extra "+= 1" applied to v27 at
// setup ("not typo") so the counters match that loop's layout - confirm.
1954 sub v27.4s,v27.4s,v0.4s // -= 1
1955 sub v28.4s,v28.4s,v0.4s
1956 sub v29.4s,v29.4s,v0.4s
1958 b.hs .Loop_outer_neon
// Zero v25..v30 (x ^ x == 0) so the key and counter rows do not linger in
// registers.
1960 eor v25.16b,v25.16b,v25.16b
1961 eor v26.16b,v26.16b,v26.16b
1962 eor v27.16b,v27.16b,v27.16b
1963 eor v28.16b,v28.16b,v28.16b
1964 eor v29.16b,v29.16b,v29.16b
1965 eor v30.16b,v30.16b,v30.16b
// Epilogue: restore callee-saved x19..x28, then x29/x30, releasing the
// 96-byte frame.
1969 ldp x19,x20,[x29,#16]
1971 ldp x21,x22,[x29,#32]
1972 ldp x23,x24,[x29,#48]
1973 ldp x25,x26,[x29,#64]
1974 ldp x27,x28,[x29,#80]
1975 ldp x29,x30,[sp],#96
1976 .inst 0xd50323bf // autiasp
1978 .size ChaCha20_512_neon,.-ChaCha20_512_neon