/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
// forward "declarations" are required for Apple
.hidden OPENSSL_armcap_P
.globl poly1305_blocks
.hidden poly1305_blocks
.type poly1305_init,%function
	stp xzr,xzr,[x0] // zero hash value
	stp xzr,xzr,[x0,#16] // [along with is_base2_26]
	ldrsw x11,.LOPENSSL_armcap_P
	ldr x11,.LOPENSSL_armcap_P
	adr x10,.LOPENSSL_armcap_P
	ldp x7,x8,[x1] // load key
	mov x9,#0xfffffffc0fffffff
	movk x9,#0x0fff,lsl#48
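	// together these build x9 = 0x0ffffffc0fffffff, the clamp mask for r0;
	// the r1 clamp below additionally clears the low two bits (the
	// &=0ffffffc0ffffffc noted further down)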
	rev x7,x7 // flip bytes
	and x7,x7,x9 // &=0ffffffc0fffffff
	and x8,x8,x9 // &=0ffffffc0ffffffc
	stp x7,x8,[x0,#32] // save key value
	adr x12,poly1305_blocks
	adr x7,poly1305_blocks_neon
	adr x8,poly1305_emit_neon
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
	ldp x4,x5,[x0] // load hash value
	ldp x7,x8,[x0,#32] // load key value
	add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
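	// clamping cleared r1's low two bits, so r1>>2 is exact and
	// s1 = r1 + (r1>>2) = 5*(r1>>2); the factor of 5 accounts for
	// 2^130 mod (2^130-5) = 5 during the reduction below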
	ldp x10,x11,[x1],#16 // load input
	adds x4,x4,x10 // accumulate input
	mul x12,x4,x7 // h0*r0
	mul x10,x5,x9 // h1*5*r1
	mul x10,x4,x8 // h0*r1
	mul x10,x5,x7 // h1*r0
	mul x10,x6,x9 // h2*5*r1
	mul x11,x6,x7 // h2*r0
	and x10,x14,#-4 // final reduction
	add x10,x10,x14,lsr#2
	stp x4,x5,[x0] // store hash value
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
	ldp x4,x5,[x0] // load hash base 2^64
	ldp x10,x11,[x2] // load nonce
	adds x12,x4,#5 // compare to modulus
	tst x14,#-4 // see if it carried/borrowed
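	// the comparison with p = 2^130-5 is done by adding 5: if h+5 reaches
	// 2^130 (any bit above bit 1 set in the top limb), the reduced value
	// is kept, otherwise the original h is already canonical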
	ror x10,x10,#32 // flip nonce words
	adds x4,x4,x10 // accumulate nonce
	rev x4,x4 // flip output bytes
	stp x4,x5,[x1] // write result
.size poly1305_emit,.-poly1305_emit
.type poly1305_mult,%function
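// helper: the same 130-bit multiply-and-reduce step as in poly1305_blocks,
// used by the NEON setup code below to compute the powers r^2, r^3 and r^4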
	mul x12,x4,x7 // h0*r0
	mul x10,x5,x9 // h1*5*r1
	mul x10,x4,x8 // h0*r1
	mul x10,x5,x7 // h1*r0
	mul x10,x6,x9 // h2*5*r1
	mul x11,x6,x7 // h2*r0
	and x10,x14,#-4 // final reduction
	add x10,x10,x14,lsr#2
.size poly1305_mult,.-poly1305_mult
.type poly1305_splat,%function
	and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
	and x14,x14,#0x03ffffff
	str w12,[x0,#16*0] // r0
	add w12,w13,w13,lsl#2 // r1*5
	str w13,[x0,#16*1] // r1
	add w13,w14,w14,lsl#2 // r2*5
	str w12,[x0,#16*2] // s1
	str w14,[x0,#16*3] // r2
	add w14,w15,w15,lsl#2 // r3*5
	str w13,[x0,#16*4] // s2
	str w15,[x0,#16*5] // r3
	add w15,w16,w16,lsl#2 // r4*5
	str w14,[x0,#16*6] // s3
	str w16,[x0,#16*7] // r4
	str w15,[x0,#16*8] // s4
.size poly1305_splat,.-poly1305_splat
.type poly1305_blocks_neon,%function
poly1305_blocks_neon:
	cbz x17,poly1305_blocks
	.inst 0xd503233f // paciasp
	stp x29,x30,[sp,#-80]!
	cbz x17,.Lbase2_64_neon
	ldp w10,w11,[x0] // load hash value base 2^26
	ldp x7,x8,[x0,#32] // load key value
	add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
	adds x4,x4,x12,lsl#52
	adds x5,x5,x14,lsl#40
	adc x14,x6,xzr // can be partially reduced...
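	// the five 26-bit limbs live at bit offsets 0,26,52,78,104, i.e.
	// h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104, and are repacked
	// here into the 64-bit words x4,x5 plus a small top word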
	ldp x12,x13,[x1],#16 // load input
	add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
	and x10,x14,#-4 // ... so reduce
	add x10,x10,x14,lsr#2
	adds x4,x4,x12 // accumulate input
	cbz x3,.Lstore_base2_64_neon
	and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
	and x12,x12,#0x03ffffff
	stp w10,w11,[x0] // store hash value base 2^26
.Lstore_base2_64_neon:
	stp x4,x5,[x0] // store hash value base 2^64
	stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
	ldp x7,x8,[x0,#32] // load key value
	ldp x4,x5,[x0] // load hash value base 2^64
	ldp x12,x13,[x1],#16 // load input
	add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
	adds x4,x4,x12 // accumulate input
	and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
	and x12,x12,#0x03ffffff
	stp d8,d9,[sp,#16] // meet ABI requirements
	////////////////////////////////// initialize r^n table
	add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
	bl poly1305_mult // r^2
	bl poly1305_mult // r^3
	bl poly1305_mult // r^4
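	// poly1305_splat (see above) writes the nine 32-bit limbs
	// r0,r1,5*r1,r2,5*r2,r3,5*r3,r4,5*r4 at a 16-byte stride, so the
	// limbs of successive powers of r end up in adjacent lanes of the
	// vectors loaded with ld1 below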
	str x4,[x0,#-24] // set is_base2_26
	sub x0,x0,#48 // restore original x0
	stp d8,d9,[sp,#16] // meet ABI requirements
	ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
	and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
	and x5,x9,#0x03ffffff
	add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
	add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
	and x8,x8,#0x03ffffff
	and x9,x9,#0x03ffffff
	add x12,x3,x12,lsr#40
	add x13,x3,x13,lsr#40
	add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
	add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
	add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
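	// matching 26-bit limbs of the two blocks are packed into the low and
	// high 32-bit halves of each x register (the "bfi" comments) before
	// being transferred into vector lanes; x3 folds the padbit into the
	// top limb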
	ldp x8,x12,[x1],#16 // inp[0:1]
	ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
	and x5,x9,#0x03ffffff
	add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
	add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
	and x8,x8,#0x03ffffff
	and x9,x9,#0x03ffffff
	add x12,x3,x12,lsr#40
	add x13,x3,x13,lsr#40
	add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
	add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
	add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
	ushr v31.2d,v31.2d,#38
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on the reduction in the previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
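	// r0..r4 are the 26-bit limbs of the power of r in use and s_i = 5*r_i
	// (precomputed by poly1305_splat), so each d_i above is a sum of five
	// products that fits in a 64-bit accumulator lane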
	umull v23.2d,v14.2s,v7.s[2]
	umull v22.2d,v14.2s,v5.s[2]
	umull v21.2d,v14.2s,v3.s[2]
	ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
	umull v20.2d,v14.2s,v1.s[2]
	umull v19.2d,v14.2s,v0.s[2]
	umlal v23.2d,v15.2s,v5.s[2]
	and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
	umlal v22.2d,v15.2s,v3.s[2]
	and x5,x9,#0x03ffffff
	umlal v21.2d,v15.2s,v1.s[2]
	umlal v20.2d,v15.2s,v0.s[2]
	umlal v19.2d,v15.2s,v8.s[2]
	add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
	umlal v23.2d,v16.2s,v3.s[2]
	umlal v22.2d,v16.2s,v1.s[2]
	umlal v21.2d,v16.2s,v0.s[2]
	add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
	umlal v20.2d,v16.2s,v8.s[2]
	umlal v19.2d,v16.2s,v6.s[2]
	and x8,x8,#0x03ffffff
	umlal v23.2d,v17.2s,v1.s[2]
	and x9,x9,#0x03ffffff
	umlal v22.2d,v17.2s,v0.s[2]
	umlal v21.2d,v17.2s,v8.s[2]
	umlal v20.2d,v17.2s,v6.s[2]
	add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
	umlal v19.2d,v17.2s,v4.s[2]
	add v11.2s,v11.2s,v26.2s
	add x12,x3,x12,lsr#40
	umlal v23.2d,v18.2s,v0.s[2]
	add x13,x3,x13,lsr#40
	umlal v22.2d,v18.2s,v8.s[2]
	add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
	umlal v21.2d,v18.2s,v6.s[2]
	add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
	umlal v20.2d,v18.2s,v4.s[2]
	umlal v19.2d,v18.2s,v2.s[2]
	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate
	add v9.2s,v9.2s,v24.2s
	umlal v22.2d,v11.2s,v1.s[0]
	ldp x8,x12,[x1],#16 // inp[0:1]
	umlal v19.2d,v11.2s,v6.s[0]
	umlal v23.2d,v11.2s,v3.s[0]
	umlal v20.2d,v11.2s,v8.s[0]
	umlal v21.2d,v11.2s,v0.s[0]
	add v10.2s,v10.2s,v25.2s
	umlal v22.2d,v9.2s,v5.s[0]
	umlal v23.2d,v9.2s,v7.s[0]
	and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
	umlal v21.2d,v9.2s,v3.s[0]
	and x5,x9,#0x03ffffff
	umlal v19.2d,v9.2s,v0.s[0]
	umlal v20.2d,v9.2s,v1.s[0]
	add v12.2s,v12.2s,v27.2s
	add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
	umlal v22.2d,v10.2s,v3.s[0]
	umlal v23.2d,v10.2s,v5.s[0]
	umlal v19.2d,v10.2s,v8.s[0]
	add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
	umlal v21.2d,v10.2s,v1.s[0]
	umlal v20.2d,v10.2s,v0.s[0]
	and x8,x8,#0x03ffffff
	add v13.2s,v13.2s,v28.2s
	and x9,x9,#0x03ffffff
	umlal v22.2d,v12.2s,v0.s[0]
	umlal v19.2d,v12.2s,v4.s[0]
	umlal v23.2d,v12.2s,v1.s[0]
	add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
	umlal v20.2d,v12.2s,v6.s[0]
	umlal v21.2d,v12.2s,v8.s[0]
	add x12,x3,x12,lsr#40
	umlal v22.2d,v13.2s,v8.s[0]
	add x13,x3,x13,lsr#40
	umlal v19.2d,v13.2s,v2.s[0]
	add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
	umlal v23.2d,v13.2s,v0.s[0]
	add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
	umlal v20.2d,v13.2s,v4.s[0]
	umlal v21.2d,v13.2s,v6.s[0]
	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// [see discussion in poly1305-armv4 module]
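	// each limb's overflow above 26 bits is carried into the next limb,
	// and the h4 overflow re-enters h0 multiplied by 5 (2^130 = 5 mod p);
	// full canonicalisation is deferred to poly1305_emit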
	ushr v29.2d,v22.2d,#26
	ushr v30.2d,v19.2d,#26
	and v19.16b,v19.16b,v31.16b
	add v23.2d,v23.2d,v29.2d // h3 -> h4
	bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
	add v20.2d,v20.2d,v30.2d // h0 -> h1
	ushr v29.2d,v23.2d,#26
	ushr v30.2d,v20.2d,#26
	bic v28.2s,#0xfc,lsl#24
	add v21.2d,v21.2d,v30.2d // h1 -> h2
	add v19.2d,v19.2d,v29.2d
	shrn v30.2s,v21.2d,#26
	add v19.2d,v19.2d,v29.2d // h4 -> h0
	bic v25.2s,#0xfc,lsl#24
	add v27.2s,v27.2s,v30.2s // h2 -> h3
	bic v26.2s,#0xfc,lsl#24
	shrn v29.2s,v19.2d,#26
	ushr v30.2s,v27.2s,#26
	bic v27.2s,#0xfc,lsl#24
	bic v24.2s,#0xfc,lsl#24
	add v25.2s,v25.2s,v29.2s // h0 -> h1
	add v28.2s,v28.2s,v30.2s // h3 -> h4
	add v11.2s,v11.2s,v26.2s
	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
	add v14.2s,v9.2s,v24.2s
	add v17.2s,v12.2s,v27.2s
	add v15.2s,v10.2s,v25.2s
	add v18.2s,v13.2s,v28.2s
	umull2 v19.2d,v16.4s,v6.4s
	umull2 v22.2d,v16.4s,v1.4s
	umull2 v23.2d,v16.4s,v3.4s
	umull2 v21.2d,v16.4s,v0.4s
	umull2 v20.2d,v16.4s,v8.4s
	umlal2 v19.2d,v14.4s,v0.4s
	umlal2 v21.2d,v14.4s,v3.4s
	umlal2 v22.2d,v14.4s,v5.4s
	umlal2 v23.2d,v14.4s,v7.4s
	umlal2 v20.2d,v14.4s,v1.4s
	umlal2 v19.2d,v15.4s,v8.4s
	umlal2 v22.2d,v15.4s,v3.4s
	umlal2 v21.2d,v15.4s,v1.4s
	umlal2 v23.2d,v15.4s,v5.4s
	umlal2 v20.2d,v15.4s,v0.4s
	umlal2 v22.2d,v17.4s,v0.4s
	umlal2 v23.2d,v17.4s,v1.4s
	umlal2 v19.2d,v17.4s,v4.4s
	umlal2 v20.2d,v17.4s,v6.4s
	umlal2 v21.2d,v17.4s,v8.4s
	umlal2 v22.2d,v18.4s,v8.4s
	umlal2 v19.2d,v18.4s,v2.4s
	umlal2 v23.2d,v18.4s,v0.4s
	umlal2 v20.2d,v18.4s,v4.4s
	umlal2 v21.2d,v18.4s,v6.4s
	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate
	add v9.2s,v9.2s,v24.2s
	umlal v22.2d,v11.2s,v1.2s
	umlal v19.2d,v11.2s,v6.2s
	umlal v23.2d,v11.2s,v3.2s
	umlal v20.2d,v11.2s,v8.2s
	umlal v21.2d,v11.2s,v0.2s
	add v10.2s,v10.2s,v25.2s
	umlal v22.2d,v9.2s,v5.2s
	umlal v19.2d,v9.2s,v0.2s
	umlal v23.2d,v9.2s,v7.2s
	umlal v20.2d,v9.2s,v1.2s
	umlal v21.2d,v9.2s,v3.2s
	add v12.2s,v12.2s,v27.2s
	umlal v22.2d,v10.2s,v3.2s
	umlal v19.2d,v10.2s,v8.2s
	umlal v23.2d,v10.2s,v5.2s
	umlal v20.2d,v10.2s,v0.2s
	umlal v21.2d,v10.2s,v1.2s
	add v13.2s,v13.2s,v28.2s
	umlal v22.2d,v12.2s,v0.2s
	umlal v19.2d,v12.2s,v4.2s
	umlal v23.2d,v12.2s,v1.2s
	umlal v20.2d,v12.2s,v6.2s
	umlal v21.2d,v12.2s,v8.2s
	umlal v22.2d,v13.2s,v8.2s
	umlal v19.2d,v13.2s,v2.2s
	umlal v23.2d,v13.2s,v0.2s
	umlal v20.2d,v13.2s,v4.2s
	umlal v21.2d,v13.2s,v6.2s
	////////////////////////////////////////////////////////////////
	addp v22.2d,v22.2d,v22.2d
	ldp d8,d9,[sp,#16] // meet ABI requirements
	addp v19.2d,v19.2d,v19.2d
	addp v23.2d,v23.2d,v23.2d
	addp v20.2d,v20.2d,v20.2d
	addp v21.2d,v21.2d,v21.2d
	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing
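	// same carry chain as in the loop above, but the limbs stay in 64-bit
	// lanes: no further iteration needs the narrowed 32-bit form, the
	// result is stored straight to memory below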
	ushr v29.2d,v22.2d,#26
	and v22.16b,v22.16b,v31.16b
	ushr v30.2d,v19.2d,#26
	and v19.16b,v19.16b,v31.16b
	add v23.2d,v23.2d,v29.2d // h3 -> h4
	add v20.2d,v20.2d,v30.2d // h0 -> h1
	ushr v29.2d,v23.2d,#26
	and v23.16b,v23.16b,v31.16b
	ushr v30.2d,v20.2d,#26
	and v20.16b,v20.16b,v31.16b
	add v21.2d,v21.2d,v30.2d // h1 -> h2
	add v19.2d,v19.2d,v29.2d
	ushr v30.2d,v21.2d,#26
	and v21.16b,v21.16b,v31.16b
	add v19.2d,v19.2d,v29.2d // h4 -> h0
	add v22.2d,v22.2d,v30.2d // h2 -> h3
	ushr v29.2d,v19.2d,#26
	and v19.16b,v19.16b,v31.16b
	ushr v30.2d,v22.2d,#26
	and v22.16b,v22.16b,v31.16b
	add v20.2d,v20.2d,v29.2d // h0 -> h1
	add v23.2d,v23.2d,v30.2d // h3 -> h4
	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced
	st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	.inst 0xd50323bf // autiasp
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.type poly1305_emit_neon,%function
	cbz x17,poly1305_emit
	ldp w10,w11,[x0] // load hash value base 2^26
	add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
	adds x4,x4,x12,lsl#52
	adds x5,x5,x14,lsl#40
	adc x6,x6,xzr // can be partially reduced...
	ldp x10,x11,[x2] // load nonce
	and x12,x6,#-4 // ... so reduce
	adds x12,x4,#5 // compare to modulus
	tst x14,#-4 // see if it carried/borrowed
	ror x10,x10,#32 // flip nonce words
	adds x4,x4,x10 // accumulate nonce
	rev x4,x4 // flip output bytes
	stp x4,x5,[x1] // write result
.size poly1305_emit_neon,.-poly1305_emit_neon
.long 0,0,0,0,0,0,0,0
.long OPENSSL_armcap_P-.
.quad OPENSSL_armcap_P-.
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
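// the .byte string above decodes to "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"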