2 /* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
7 // forward "declarations" are required for Apple
13 .type poly1305_init,%function
17 stp xzr,xzr,[x0] // zero hash value
18 stp xzr,xzr,[x0,#16] // [along with is_base2_26]
24 ldrsw x11,.LOPENSSL_armcap_P
26 ldr x11,.LOPENSSL_armcap_P
28 adr x10,.LOPENSSL_armcap_P
30 ldp x7,x8,[x1] // load key
31 mov x9,#0xfffffffc0fffffff
32 movk x9,#0x0fff,lsl#48
35 rev x7,x7 // flip bytes
38 and x7,x7,x9 // &=0ffffffc0fffffff
40 and x8,x8,x9 // &=0ffffffc0ffffffc
41 stp x7,x8,[x0,#32] // save key value
45 adr x12,poly1305_blocks
46 adr x7,poly1305_blocks_neon
48 adr x8,poly1305_emit_neon
62 .size poly1305_init,.-poly1305_init
64 .type poly1305_blocks,%function
70 ldp x4,x5,[x0] // load hash value
71 ldp x7,x8,[x0,#32] // load key value
73 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
78 ldp x10,x11,[x1],#16 // load input
84 adds x4,x4,x10 // accumulate input
87 mul x12,x4,x7 // h0*r0
91 mul x10,x5,x9 // h1*5*r1
95 mul x10,x4,x8 // h0*r1
100 mul x10,x5,x7 // h1*r0
105 mul x10,x6,x9 // h2*5*r1
107 mul x11,x6,x7 // h2*r0
112 and x10,x14,#-4 // final reduction
114 add x10,x10,x14,lsr#2
121 stp x4,x5,[x0] // store hash value
126 .size poly1305_blocks,.-poly1305_blocks
128 .type poly1305_emit,%function
131 ldp x4,x5,[x0] // load hash base 2^64
133 ldp x10,x11,[x2] // load nonce
135 adds x12,x4,#5 // compare to modulus
139 tst x14,#-4 // see if it's carried/borrowed
145 ror x10,x10,#32 // flip nonce words
148 adds x4,x4,x10 // accumulate nonce
151 rev x4,x4 // flip output bytes
154 stp x4,x5,[x1] // write result
157 .size poly1305_emit,.-poly1305_emit
158 .type poly1305_mult,%function
161 mul x12,x4,x7 // h0*r0
164 mul x10,x5,x9 // h1*5*r1
168 mul x10,x4,x8 // h0*r1
173 mul x10,x5,x7 // h1*r0
178 mul x10,x6,x9 // h2*5*r1
180 mul x11,x6,x7 // h2*r0
185 and x10,x14,#-4 // final reduction
187 add x10,x10,x14,lsr#2
193 .size poly1305_mult,.-poly1305_mult
195 .type poly1305_splat,%function
198 and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
201 and x14,x14,#0x03ffffff
205 str w12,[x0,#16*0] // r0
206 add w12,w13,w13,lsl#2 // r1*5
207 str w13,[x0,#16*1] // r1
208 add w13,w14,w14,lsl#2 // r2*5
209 str w12,[x0,#16*2] // s1
210 str w14,[x0,#16*3] // r2
211 add w14,w15,w15,lsl#2 // r3*5
212 str w13,[x0,#16*4] // s2
213 str w15,[x0,#16*5] // r3
214 add w15,w16,w16,lsl#2 // r4*5
215 str w14,[x0,#16*6] // s3
216 str w16,[x0,#16*7] // r4
217 str w15,[x0,#16*8] // s4
220 .size poly1305_splat,.-poly1305_splat
222 .type poly1305_blocks_neon,%function
224 poly1305_blocks_neon:
228 cbz x17,poly1305_blocks
231 .inst 0xd503233f // paciasp
232 stp x29,x30,[sp,#-80]!
238 cbz x17,.Lbase2_64_neon
240 ldp w10,w11,[x0] // load hash value base 2^26
247 ldp x7,x8,[x0,#32] // load key value
249 add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
251 adds x4,x4,x12,lsl#52
255 adds x5,x5,x14,lsl#40
256 adc x14,x6,xzr // can be partially reduced...
258 ldp x12,x13,[x1],#16 // load input
260 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
262 and x10,x14,#-4 // ... so reduce
264 add x10,x10,x14,lsr#2
273 adds x4,x4,x12 // accumulate input
280 cbz x3,.Lstore_base2_64_neon
282 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
285 and x12,x12,#0x03ffffff
291 stp w10,w11,[x0] // store hash value base 2^26
297 .Lstore_base2_64_neon:
298 stp x4,x5,[x0] // store hash value base 2^64
299 stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
304 ldp x7,x8,[x0,#32] // load key value
306 ldp x4,x5,[x0] // load hash value base 2^64
312 ldp x12,x13,[x1],#16 // load input
314 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
319 adds x4,x4,x12 // accumulate input
326 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
329 and x12,x12,#0x03ffffff
333 stp d8,d9,[sp,#16] // meet ABI requirements
344 ////////////////////////////////// initialize r^n table
346 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
352 bl poly1305_mult // r^2
356 bl poly1305_mult // r^3
360 bl poly1305_mult // r^4
371 str x4,[x0,#-24] // set is_base2_26
372 sub x0,x0,#48 // restore original x0
382 stp d8,d9,[sp,#16] // meet ABI requirements
394 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
406 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
407 and x5,x9,#0x03ffffff
410 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
413 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
415 and x8,x8,#0x03ffffff
416 and x9,x9,#0x03ffffff
419 add x12,x3,x12,lsr#40
420 add x13,x3,x13,lsr#40
421 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
423 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
424 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
429 ldp x8,x12,[x1],#16 // inp[0:1]
432 ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
433 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
442 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
443 and x5,x9,#0x03ffffff
446 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
449 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
451 and x8,x8,#0x03ffffff
452 and x9,x9,#0x03ffffff
455 add x12,x3,x12,lsr#40
456 add x13,x3,x13,lsr#40
457 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
459 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
460 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
465 ushr v31.2d,v31.2d,#38
471 ////////////////////////////////////////////////////////////////
472 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
473 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
474 // ___________________/
475 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
476 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
477 // ___________________/ ____________________/
479 // Note that we start with inp[2:3]*r^2. This is because it
480 // doesn't depend on reduction in previous iteration.
481 ////////////////////////////////////////////////////////////////
482 // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
483 // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
484 // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
485 // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
486 // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
489 umull v23.2d,v14.2s,v7.s[2]
491 umull v22.2d,v14.2s,v5.s[2]
492 umull v21.2d,v14.2s,v3.s[2]
493 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
494 umull v20.2d,v14.2s,v1.s[2]
496 umull v19.2d,v14.2s,v0.s[2]
504 umlal v23.2d,v15.2s,v5.s[2]
505 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
506 umlal v22.2d,v15.2s,v3.s[2]
507 and x5,x9,#0x03ffffff
508 umlal v21.2d,v15.2s,v1.s[2]
510 umlal v20.2d,v15.2s,v0.s[2]
512 umlal v19.2d,v15.2s,v8.s[2]
513 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
515 umlal v23.2d,v16.2s,v3.s[2]
517 umlal v22.2d,v16.2s,v1.s[2]
519 umlal v21.2d,v16.2s,v0.s[2]
520 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
521 umlal v20.2d,v16.2s,v8.s[2]
523 umlal v19.2d,v16.2s,v6.s[2]
524 and x8,x8,#0x03ffffff
526 umlal v23.2d,v17.2s,v1.s[2]
527 and x9,x9,#0x03ffffff
528 umlal v22.2d,v17.2s,v0.s[2]
530 umlal v21.2d,v17.2s,v8.s[2]
532 umlal v20.2d,v17.2s,v6.s[2]
533 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
534 umlal v19.2d,v17.2s,v4.s[2]
537 add v11.2s,v11.2s,v26.2s
538 add x12,x3,x12,lsr#40
539 umlal v23.2d,v18.2s,v0.s[2]
540 add x13,x3,x13,lsr#40
541 umlal v22.2d,v18.2s,v8.s[2]
542 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
543 umlal v21.2d,v18.2s,v6.s[2]
544 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
545 umlal v20.2d,v18.2s,v4.s[2]
547 umlal v19.2d,v18.2s,v2.s[2]
550 ////////////////////////////////////////////////////////////////
551 // (hash+inp[0:1])*r^4 and accumulate
553 add v9.2s,v9.2s,v24.2s
555 umlal v22.2d,v11.2s,v1.s[0]
556 ldp x8,x12,[x1],#16 // inp[0:1]
557 umlal v19.2d,v11.2s,v6.s[0]
559 umlal v23.2d,v11.2s,v3.s[0]
560 umlal v20.2d,v11.2s,v8.s[0]
561 umlal v21.2d,v11.2s,v0.s[0]
569 add v10.2s,v10.2s,v25.2s
570 umlal v22.2d,v9.2s,v5.s[0]
571 umlal v23.2d,v9.2s,v7.s[0]
572 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
573 umlal v21.2d,v9.2s,v3.s[0]
574 and x5,x9,#0x03ffffff
575 umlal v19.2d,v9.2s,v0.s[0]
577 umlal v20.2d,v9.2s,v1.s[0]
580 add v12.2s,v12.2s,v27.2s
581 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
582 umlal v22.2d,v10.2s,v3.s[0]
584 umlal v23.2d,v10.2s,v5.s[0]
586 umlal v19.2d,v10.2s,v8.s[0]
587 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
588 umlal v21.2d,v10.2s,v1.s[0]
590 umlal v20.2d,v10.2s,v0.s[0]
591 and x8,x8,#0x03ffffff
593 add v13.2s,v13.2s,v28.2s
594 and x9,x9,#0x03ffffff
595 umlal v22.2d,v12.2s,v0.s[0]
597 umlal v19.2d,v12.2s,v4.s[0]
599 umlal v23.2d,v12.2s,v1.s[0]
600 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
601 umlal v20.2d,v12.2s,v6.s[0]
603 umlal v21.2d,v12.2s,v8.s[0]
604 add x12,x3,x12,lsr#40
606 umlal v22.2d,v13.2s,v8.s[0]
607 add x13,x3,x13,lsr#40
608 umlal v19.2d,v13.2s,v2.s[0]
609 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
610 umlal v23.2d,v13.2s,v0.s[0]
611 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
612 umlal v20.2d,v13.2s,v4.s[0]
614 umlal v21.2d,v13.2s,v6.s[0]
618 /////////////////////////////////////////////////////////////////
619 // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
622 // [see discussion in poly1305-armv4 module]
624 ushr v29.2d,v22.2d,#26
626 ushr v30.2d,v19.2d,#26
627 and v19.16b,v19.16b,v31.16b
628 add v23.2d,v23.2d,v29.2d // h3 -> h4
629 bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
630 add v20.2d,v20.2d,v30.2d // h0 -> h1
632 ushr v29.2d,v23.2d,#26
634 ushr v30.2d,v20.2d,#26
636 bic v28.2s,#0xfc,lsl#24
637 add v21.2d,v21.2d,v30.2d // h1 -> h2
639 add v19.2d,v19.2d,v29.2d
641 shrn v30.2s,v21.2d,#26
643 add v19.2d,v19.2d,v29.2d // h4 -> h0
644 bic v25.2s,#0xfc,lsl#24
645 add v27.2s,v27.2s,v30.2s // h2 -> h3
646 bic v26.2s,#0xfc,lsl#24
648 shrn v29.2s,v19.2d,#26
650 ushr v30.2s,v27.2s,#26
651 bic v27.2s,#0xfc,lsl#24
652 bic v24.2s,#0xfc,lsl#24
653 add v25.2s,v25.2s,v29.2s // h0 -> h1
654 add v28.2s,v28.2s,v30.2s // h3 -> h4
660 add v11.2s,v11.2s,v26.2s
662 ////////////////////////////////////////////////////////////////
663 // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
669 add v14.2s,v9.2s,v24.2s
670 add v17.2s,v12.2s,v27.2s
671 add v15.2s,v10.2s,v25.2s
672 add v18.2s,v13.2s,v28.2s
676 umull2 v19.2d,v16.4s,v6.4s
677 umull2 v22.2d,v16.4s,v1.4s
678 umull2 v23.2d,v16.4s,v3.4s
679 umull2 v21.2d,v16.4s,v0.4s
680 umull2 v20.2d,v16.4s,v8.4s
683 umlal2 v19.2d,v14.4s,v0.4s
684 umlal2 v21.2d,v14.4s,v3.4s
685 umlal2 v22.2d,v14.4s,v5.4s
686 umlal2 v23.2d,v14.4s,v7.4s
687 umlal2 v20.2d,v14.4s,v1.4s
690 umlal2 v19.2d,v15.4s,v8.4s
691 umlal2 v22.2d,v15.4s,v3.4s
692 umlal2 v21.2d,v15.4s,v1.4s
693 umlal2 v23.2d,v15.4s,v5.4s
694 umlal2 v20.2d,v15.4s,v0.4s
697 umlal2 v22.2d,v17.4s,v0.4s
698 umlal2 v23.2d,v17.4s,v1.4s
699 umlal2 v19.2d,v17.4s,v4.4s
700 umlal2 v20.2d,v17.4s,v6.4s
701 umlal2 v21.2d,v17.4s,v8.4s
703 umlal2 v22.2d,v18.4s,v8.4s
704 umlal2 v19.2d,v18.4s,v2.4s
705 umlal2 v23.2d,v18.4s,v0.4s
706 umlal2 v20.2d,v18.4s,v4.4s
707 umlal2 v21.2d,v18.4s,v6.4s
711 ////////////////////////////////////////////////////////////////
712 // (hash+inp[0:1])*r^4:r^3 and accumulate
714 add v9.2s,v9.2s,v24.2s
715 umlal v22.2d,v11.2s,v1.2s
716 umlal v19.2d,v11.2s,v6.2s
717 umlal v23.2d,v11.2s,v3.2s
718 umlal v20.2d,v11.2s,v8.2s
719 umlal v21.2d,v11.2s,v0.2s
721 add v10.2s,v10.2s,v25.2s
722 umlal v22.2d,v9.2s,v5.2s
723 umlal v19.2d,v9.2s,v0.2s
724 umlal v23.2d,v9.2s,v7.2s
725 umlal v20.2d,v9.2s,v1.2s
726 umlal v21.2d,v9.2s,v3.2s
728 add v12.2s,v12.2s,v27.2s
729 umlal v22.2d,v10.2s,v3.2s
730 umlal v19.2d,v10.2s,v8.2s
731 umlal v23.2d,v10.2s,v5.2s
732 umlal v20.2d,v10.2s,v0.2s
733 umlal v21.2d,v10.2s,v1.2s
735 add v13.2s,v13.2s,v28.2s
736 umlal v22.2d,v12.2s,v0.2s
737 umlal v19.2d,v12.2s,v4.2s
738 umlal v23.2d,v12.2s,v1.2s
739 umlal v20.2d,v12.2s,v6.2s
740 umlal v21.2d,v12.2s,v8.2s
742 umlal v22.2d,v13.2s,v8.2s
743 umlal v19.2d,v13.2s,v2.2s
744 umlal v23.2d,v13.2s,v0.2s
745 umlal v20.2d,v13.2s,v4.2s
746 umlal v21.2d,v13.2s,v6.2s
749 ////////////////////////////////////////////////////////////////
752 addp v22.2d,v22.2d,v22.2d
753 ldp d8,d9,[sp,#16] // meet ABI requirements
754 addp v19.2d,v19.2d,v19.2d
756 addp v23.2d,v23.2d,v23.2d
758 addp v20.2d,v20.2d,v20.2d
760 addp v21.2d,v21.2d,v21.2d
762 ////////////////////////////////////////////////////////////////
763 // lazy reduction, but without narrowing
765 ushr v29.2d,v22.2d,#26
766 and v22.16b,v22.16b,v31.16b
767 ushr v30.2d,v19.2d,#26
768 and v19.16b,v19.16b,v31.16b
770 add v23.2d,v23.2d,v29.2d // h3 -> h4
771 add v20.2d,v20.2d,v30.2d // h0 -> h1
773 ushr v29.2d,v23.2d,#26
774 and v23.16b,v23.16b,v31.16b
775 ushr v30.2d,v20.2d,#26
776 and v20.16b,v20.16b,v31.16b
777 add v21.2d,v21.2d,v30.2d // h1 -> h2
779 add v19.2d,v19.2d,v29.2d
781 ushr v30.2d,v21.2d,#26
782 and v21.16b,v21.16b,v31.16b
783 add v19.2d,v19.2d,v29.2d // h4 -> h0
784 add v22.2d,v22.2d,v30.2d // h2 -> h3
786 ushr v29.2d,v19.2d,#26
787 and v19.16b,v19.16b,v31.16b
788 ushr v30.2d,v22.2d,#26
789 and v22.16b,v22.16b,v31.16b
790 add v20.2d,v20.2d,v29.2d // h0 -> h1
791 add v23.2d,v23.2d,v30.2d // h3 -> h4
793 ////////////////////////////////////////////////////////////////
794 // write the result, can be partially reduced
796 st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
800 .inst 0xd50323bf // autiasp
803 .size poly1305_blocks_neon,.-poly1305_blocks_neon
805 .type poly1305_emit_neon,%function
809 cbz x17,poly1305_emit
811 ldp w10,w11,[x0] // load hash value base 2^26
815 add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
817 adds x4,x4,x12,lsl#52
821 adds x5,x5,x14,lsl#40
822 adc x6,x6,xzr // can be partially reduced...
824 ldp x10,x11,[x2] // load nonce
826 and x12,x6,#-4 // ... so reduce
833 adds x12,x4,#5 // compare to modulus
837 tst x14,#-4 // see if it's carried/borrowed
843 ror x10,x10,#32 // flip nonce words
846 adds x4,x4,x10 // accumulate nonce
849 rev x4,x4 // flip output bytes
852 stp x4,x5,[x1] // write result
855 .size poly1305_emit_neon,.-poly1305_emit_neon
859 .long 0,0,0,0,0,0,0,0
862 .long OPENSSL_armcap_P-.
864 .quad OPENSSL_armcap_P-.
866 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0