2 /* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
7 // forward "declarations" are required for Apple
13 .type poly1305_init,%function
// poly1305_init: zero the hash state at [x0] and store the clamped key.
// NOTE(review): this listing is sampled — instructions between the embedded
// line numbers are missing; comments cover only the visible lines.
17 stp xzr,xzr,[x0] // zero hash value
18 stp xzr,xzr,[x0,#16] // [along with is_base2_26]
// Per-platform ways of reaching OPENSSL_armcap_P (the generated file
// assembles only one of these variants, selected by preprocessor).
24 ldrsw x11,.LOPENSSL_armcap_P
26 ldr x11,.LOPENSSL_armcap_P
28 adr x10,.LOPENSSL_armcap_P
30 ldp x7,x8,[x1] // load key
// Build the Poly1305 "r" clamp mask 0x0ffffffc0fffffff in x9
// (movk overwrites the top 16 bits of the first immediate).
31 mov x9,#0xfffffffc0fffffff
32 movk x9,#0x0fff,lsl#48
35 rev x7,x7 // flip bytes
38 and x7,x7,x9 // &=0ffffffc0fffffff
40 and x8,x8,x9 // &=0ffffffc0ffffffc
41 stp x7,x8,[x0,#32] // save key value
// Capability-based dispatch: take addresses of the scalar/NEON entry
// points (presumably returned to the caller as function pointers —
// TODO confirm against the full generated file).
45 adr x12,poly1305_blocks
46 adr x7,poly1305_blocks_neon
48 adr x8,poly1305_emit_neon
62 .size poly1305_init,.-poly1305_init
64 .type poly1305_blocks,%function
// poly1305_blocks: scalar base-2^64 block loop. Hash h is kept in
// x4,x5(,x6); key r in x7,x8; x9 holds s1 = 5*r1/4 which folds the
// 2^130 overflow back into the low limbs (mod 2^130-5 trick).
// NOTE(review): sampled listing — many instructions are missing here.
70 ldp x4,x5,[x0] // load hash value
71 ldp x7,x8,[x0,#32] // load key value
73 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
78 ldp x10,x11,[x1],#16 // load input
84 adds x4,x4,x10 // accumulate input
// h *= r — schoolbook multiply of (h0,h1,h2) by (r0,r1) using s1.
87 mul x12,x4,x7 // h0*r0
91 mul x10,x5,x9 // h1*5*r1
95 mul x10,x4,x8 // h0*r1
100 mul x10,x5,x7 // h1*r0
105 mul x10,x6,x9 // h2*5*r1
107 mul x11,x6,x7 // h2*r0
// Fold bits >= 2^130 back in: (h>>130)*5 is added to the low part.
112 and x10,x14,#-4 // final reduction
114 add x10,x10,x14,lsr#2
121 stp x4,x5,[x0] // store hash value
126 .size poly1305_blocks,.-poly1305_blocks
128 .type poly1305_emit,%function
// poly1305_emit: finalize the tag. Conditionally subtracts the modulus
// (by testing h+5 for carry out of 2^130), adds the nonce from [x2],
// and stores the 16-byte result at [x1].
// NOTE(review): sampled listing — the csel/adc glue between these
// visible lines is missing.
131 ldp x4,x5,[x0] // load hash base 2^64
133 ldp x10,x11,[x2] // load nonce
135 adds x12,x4,#5 // compare to modulus
139 tst x14,#-4 // see if it's carried/borrowed
141 // (visible tail: byte/word swaps put the nonce and output in
// little-endian order regardless of host endianness)
145 ror x10,x10,#32 // flip nonce words
148 adds x4,x4,x10 // accumulate nonce
151 rev x4,x4 // flip output bytes
154 stp x4,x5,[x1] // write result
157 .size poly1305_emit,.-poly1305_emit
158 .type poly1305_mult,%function
// poly1305_mult: internal helper — same h *= r multiply-and-reduce as
// the body of poly1305_blocks (registers: h in x4,x5,x6; r in x7,x8;
// s1 in x9). Called repeatedly by the NEON setup to compute r^2..r^4.
161 mul x12,x4,x7 // h0*r0
164 mul x10,x5,x9 // h1*5*r1
168 mul x10,x4,x8 // h0*r1
173 mul x10,x5,x7 // h1*r0
178 mul x10,x6,x9 // h2*5*r1
180 mul x11,x6,x7 // h2*r0
// Reduce the >=2^130 part: add (h>>130)*5 back into the low limbs.
185 and x10,x14,#-4 // final reduction
187 add x10,x10,x14,lsr#2
193 .size poly1305_mult,.-poly1305_mult
195 .type poly1305_splat,%function
// poly1305_splat: convert a base-2^64 power of r (x4,x5,...) into five
// 26-bit limbs and store them — interleaved at 16-byte stride so four
// powers of r share one table — together with the 5*r_i ("s_i")
// precomputed values used by the NEON lazy-reduction multiply.
198 and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
201 and x14,x14,#0x03ffffff
205 str w12,[x0,#16*0] // r0
206 add w12,w13,w13,lsl#2 // r1*5
207 str w13,[x0,#16*1] // r1
208 add w13,w14,w14,lsl#2 // r2*5
209 str w12,[x0,#16*2] // s1
210 str w14,[x0,#16*3] // r2
211 add w14,w15,w15,lsl#2 // r3*5
212 str w13,[x0,#16*4] // s2
213 str w15,[x0,#16*5] // r3
214 add w15,w16,w16,lsl#2 // r4*5
215 str w14,[x0,#16*6] // s3
216 str w16,[x0,#16*7] // r4
217 str w15,[x0,#16*8] // s4
220 .size poly1305_splat,.-poly1305_splat
222 .type poly1305_blocks_neon,%function
224 poly1305_blocks_neon:
// NEON implementation processing (up to) four blocks per iteration in
// two 2-lane vector hash accumulators. Falls back to the scalar code
// for short input; maintains the hash in base 2^26 while vectorized.
// NOTE(review): this is a sampled listing — a large fraction of the
// instructions are missing; comments below describe only visible lines.
228 cbz x17,poly1305_blocks
231 stp x29,x30,[sp,#-80]!
237 cbz x17,.Lbase2_64_neon
// Entry with hash already in base 2^26: repack five 26-bit limbs into
// base 2^64 (x4,x5 + top bits) for a scalar catch-up multiply.
239 ldp w10,w11,[x0] // load hash value base 2^26
246 ldp x7,x8,[x0,#32] // load key value
248 add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
250 adds x4,x4,x12,lsl#52
254 adds x5,x5,x14,lsl#40
255 adc x14,x6,xzr // can be partially reduced...
257 ldp x12,x13,[x1],#16 // load input
259 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
261 and x10,x14,#-4 // ... so reduce
263 add x10,x10,x14,lsr#2
272 adds x4,x4,x12 // accumulate input
279 cbz x3,.Lstore_base2_64_neon
// Store back in base 2^26 (x3 = padbit path) ...
281 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
284 and x12,x12,#0x03ffffff
290 stp w10,w11,[x0] // store hash value base 2^26
296 .Lstore_base2_64_neon:
// ... or in base 2^64, flagging the representation in the context.
297 stp x4,x5,[x0] // store hash value base 2^64
298 stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
303 ldp x7,x8,[x0,#32] // load key value
305 ldp x4,x5,[x0] // load hash value base 2^64
311 ldp x12,x13,[x1],#16 // load input
313 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
318 adds x4,x4,x12 // accumulate input
325 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
328 and x12,x12,#0x03ffffff
// Save callee-saved NEON regs d8-d15 (AAPCS64: low 64 bits of v8-v15).
332 stp d8,d9,[sp,#16] // meet ABI requirements
343 ////////////////////////////////// initialize r^n table
345 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
// Compute r^2, r^3, r^4 with the scalar helper; poly1305_splat
// (not visible between calls) stores each power into the table.
351 bl poly1305_mult // r^2
355 bl poly1305_mult // r^3
359 bl poly1305_mult // r^4
370 str x4,[x0,#-24] // set is_base2_26
371 sub x0,x0,#48 // restore original x0
381 stp d8,d9,[sp,#16] // meet ABI requirements
// Split incoming blocks into 26-bit limbs; x3 carries the padbit
// which is merged in via the lsr#40 adds below.
393 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
405 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
406 and x5,x9,#0x03ffffff
409 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
412 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
414 and x8,x8,#0x03ffffff
415 and x9,x9,#0x03ffffff
418 add x12,x3,x12,lsr#40
419 add x13,x3,x13,lsr#40
420 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
422 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
423 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
428 ldp x8,x12,[x1],#16 // inp[0:1]
// Load the r^1..r^4 limb table (v0-v8 hold r_i and 5*r_i lanes).
431 ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
432 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
441 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
442 and x5,x9,#0x03ffffff
445 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
448 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
450 and x8,x8,#0x03ffffff
451 and x9,x9,#0x03ffffff
454 add x12,x3,x12,lsr#40
455 add x13,x3,x13,lsr#40
456 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
458 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
459 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
// v31 becomes the 26-bit lane mask used by the lazy reductions below.
464 ushr v31.2d,v31.2d,#38
470 ////////////////////////////////////////////////////////////////
471 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
472 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
473 // ___________________/
474 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
475 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
476 // ___________________/ ____________________/
478 // Note that we start with inp[2:3]*r^2. This is because it
479 // doesn't depend on reduction in previous iteration.
480 ////////////////////////////////////////////////////////////////
481 // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
482 // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
483 // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
484 // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
485 // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
// inp[2:3] * r^2 — vector multiply interleaved with scalar limb
// splitting of the next blocks (hides NEON latency behind integer ops).
488 umull v23.2d,v14.2s,v7.s[2]
490 umull v22.2d,v14.2s,v5.s[2]
491 umull v21.2d,v14.2s,v3.s[2]
492 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
493 umull v20.2d,v14.2s,v1.s[2]
495 umull v19.2d,v14.2s,v0.s[2]
503 umlal v23.2d,v15.2s,v5.s[2]
504 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
505 umlal v22.2d,v15.2s,v3.s[2]
506 and x5,x9,#0x03ffffff
507 umlal v21.2d,v15.2s,v1.s[2]
509 umlal v20.2d,v15.2s,v0.s[2]
511 umlal v19.2d,v15.2s,v8.s[2]
512 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
514 umlal v23.2d,v16.2s,v3.s[2]
516 umlal v22.2d,v16.2s,v1.s[2]
518 umlal v21.2d,v16.2s,v0.s[2]
519 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
520 umlal v20.2d,v16.2s,v8.s[2]
522 umlal v19.2d,v16.2s,v6.s[2]
523 and x8,x8,#0x03ffffff
525 umlal v23.2d,v17.2s,v1.s[2]
526 and x9,x9,#0x03ffffff
527 umlal v22.2d,v17.2s,v0.s[2]
529 umlal v21.2d,v17.2s,v8.s[2]
531 umlal v20.2d,v17.2s,v6.s[2]
532 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
533 umlal v19.2d,v17.2s,v4.s[2]
536 add v11.2s,v11.2s,v26.2s
537 add x12,x3,x12,lsr#40
538 umlal v23.2d,v18.2s,v0.s[2]
539 add x13,x3,x13,lsr#40
540 umlal v22.2d,v18.2s,v8.s[2]
541 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
542 umlal v21.2d,v18.2s,v6.s[2]
543 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
544 umlal v20.2d,v18.2s,v4.s[2]
546 umlal v19.2d,v18.2s,v2.s[2]
549 ////////////////////////////////////////////////////////////////
550 // (hash+inp[0:1])*r^4 and accumulate
552 add v9.2s,v9.2s,v24.2s
554 umlal v22.2d,v11.2s,v1.s[0]
555 ldp x8,x12,[x1],#16 // inp[0:1]
556 umlal v19.2d,v11.2s,v6.s[0]
558 umlal v23.2d,v11.2s,v3.s[0]
559 umlal v20.2d,v11.2s,v8.s[0]
560 umlal v21.2d,v11.2s,v0.s[0]
568 add v10.2s,v10.2s,v25.2s
569 umlal v22.2d,v9.2s,v5.s[0]
570 umlal v23.2d,v9.2s,v7.s[0]
571 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
572 umlal v21.2d,v9.2s,v3.s[0]
573 and x5,x9,#0x03ffffff
574 umlal v19.2d,v9.2s,v0.s[0]
576 umlal v20.2d,v9.2s,v1.s[0]
579 add v12.2s,v12.2s,v27.2s
580 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
581 umlal v22.2d,v10.2s,v3.s[0]
583 umlal v23.2d,v10.2s,v5.s[0]
585 umlal v19.2d,v10.2s,v8.s[0]
586 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
587 umlal v21.2d,v10.2s,v1.s[0]
589 umlal v20.2d,v10.2s,v0.s[0]
590 and x8,x8,#0x03ffffff
592 add v13.2s,v13.2s,v28.2s
593 and x9,x9,#0x03ffffff
594 umlal v22.2d,v12.2s,v0.s[0]
596 umlal v19.2d,v12.2s,v4.s[0]
598 umlal v23.2d,v12.2s,v1.s[0]
599 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
600 umlal v20.2d,v12.2s,v6.s[0]
602 umlal v21.2d,v12.2s,v8.s[0]
603 add x12,x3,x12,lsr#40
605 umlal v22.2d,v13.2s,v8.s[0]
606 add x13,x3,x13,lsr#40
607 umlal v19.2d,v13.2s,v2.s[0]
608 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
609 umlal v23.2d,v13.2s,v0.s[0]
610 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
611 umlal v20.2d,v13.2s,v4.s[0]
613 umlal v21.2d,v13.2s,v6.s[0]
617 /////////////////////////////////////////////////////////////////
618 // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
621 // [see discussion in poly1305-armv4 module]
// Carries are propagated h3->h4, h0->h1, ..., h4->h0 without fully
// normalizing each limb ("lazy"); v31 masks to 26 bits, the bic
// immediates clear bits 26-31 of narrowed 32-bit lanes.
623 ushr v29.2d,v22.2d,#26
625 ushr v30.2d,v19.2d,#26
626 and v19.16b,v19.16b,v31.16b
627 add v23.2d,v23.2d,v29.2d // h3 -> h4
628 bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
629 add v20.2d,v20.2d,v30.2d // h0 -> h1
631 ushr v29.2d,v23.2d,#26
633 ushr v30.2d,v20.2d,#26
635 bic v28.2s,#0xfc,lsl#24
636 add v21.2d,v21.2d,v30.2d // h1 -> h2
638 add v19.2d,v19.2d,v29.2d
640 shrn v30.2s,v21.2d,#26
642 add v19.2d,v19.2d,v29.2d // h4 -> h0
643 bic v25.2s,#0xfc,lsl#24
644 add v27.2s,v27.2s,v30.2s // h2 -> h3
645 bic v26.2s,#0xfc,lsl#24
647 shrn v29.2s,v19.2d,#26
649 ushr v30.2s,v27.2s,#26
650 bic v27.2s,#0xfc,lsl#24
651 bic v24.2s,#0xfc,lsl#24
652 add v25.2s,v25.2s,v29.2s // h0 -> h1
653 add v28.2s,v28.2s,v30.2s // h3 -> h4
659 add v11.2s,v11.2s,v26.2s
661 ////////////////////////////////////////////////////////////////
662 // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
// Tail path: v14-v18 = hash + accumulated input limbs; the umull2/
// umlal2 forms use the high halves of the r-table vectors (r^2:r^1).
668 add v14.2s,v9.2s,v24.2s
669 add v17.2s,v12.2s,v27.2s
670 add v15.2s,v10.2s,v25.2s
671 add v18.2s,v13.2s,v28.2s
675 umull2 v19.2d,v16.4s,v6.4s
676 umull2 v22.2d,v16.4s,v1.4s
677 umull2 v23.2d,v16.4s,v3.4s
678 umull2 v21.2d,v16.4s,v0.4s
679 umull2 v20.2d,v16.4s,v8.4s
682 umlal2 v19.2d,v14.4s,v0.4s
683 umlal2 v21.2d,v14.4s,v3.4s
684 umlal2 v22.2d,v14.4s,v5.4s
685 umlal2 v23.2d,v14.4s,v7.4s
686 umlal2 v20.2d,v14.4s,v1.4s
689 umlal2 v19.2d,v15.4s,v8.4s
690 umlal2 v22.2d,v15.4s,v3.4s
691 umlal2 v21.2d,v15.4s,v1.4s
692 umlal2 v23.2d,v15.4s,v5.4s
693 umlal2 v20.2d,v15.4s,v0.4s
696 umlal2 v22.2d,v17.4s,v0.4s
697 umlal2 v23.2d,v17.4s,v1.4s
698 umlal2 v19.2d,v17.4s,v4.4s
699 umlal2 v20.2d,v17.4s,v6.4s
700 umlal2 v21.2d,v17.4s,v8.4s
702 umlal2 v22.2d,v18.4s,v8.4s
703 umlal2 v19.2d,v18.4s,v2.4s
704 umlal2 v23.2d,v18.4s,v0.4s
705 umlal2 v20.2d,v18.4s,v4.4s
706 umlal2 v21.2d,v18.4s,v6.4s
710 ////////////////////////////////////////////////////////////////
711 // (hash+inp[0:1])*r^4:r^3 and accumulate
713 add v9.2s,v9.2s,v24.2s
714 umlal v22.2d,v11.2s,v1.2s
715 umlal v19.2d,v11.2s,v6.2s
716 umlal v23.2d,v11.2s,v3.2s
717 umlal v20.2d,v11.2s,v8.2s
718 umlal v21.2d,v11.2s,v0.2s
720 add v10.2s,v10.2s,v25.2s
721 umlal v22.2d,v9.2s,v5.2s
722 umlal v19.2d,v9.2s,v0.2s
723 umlal v23.2d,v9.2s,v7.2s
724 umlal v20.2d,v9.2s,v1.2s
725 umlal v21.2d,v9.2s,v3.2s
727 add v12.2s,v12.2s,v27.2s
728 umlal v22.2d,v10.2s,v3.2s
729 umlal v19.2d,v10.2s,v8.2s
730 umlal v23.2d,v10.2s,v5.2s
731 umlal v20.2d,v10.2s,v0.2s
732 umlal v21.2d,v10.2s,v1.2s
734 add v13.2s,v13.2s,v28.2s
735 umlal v22.2d,v12.2s,v0.2s
736 umlal v19.2d,v12.2s,v4.2s
737 umlal v23.2d,v12.2s,v1.2s
738 umlal v20.2d,v12.2s,v6.2s
739 umlal v21.2d,v12.2s,v8.2s
741 umlal v22.2d,v13.2s,v8.2s
742 umlal v19.2d,v13.2s,v2.2s
743 umlal v23.2d,v13.2s,v0.2s
744 umlal v20.2d,v13.2s,v4.2s
745 umlal v21.2d,v13.2s,v6.2s
748 ////////////////////////////////////////////////////////////////
// Horizontal add: fold the two vector lanes into one scalar result
// per limb; restore callee-saved d8/d9 on the way out.
751 addp v22.2d,v22.2d,v22.2d
752 ldp d8,d9,[sp,#16] // meet ABI requirements
753 addp v19.2d,v19.2d,v19.2d
755 addp v23.2d,v23.2d,v23.2d
757 addp v20.2d,v20.2d,v20.2d
759 addp v21.2d,v21.2d,v21.2d
761 ////////////////////////////////////////////////////////////////
762 // lazy reduction, but without narrowing
764 ushr v29.2d,v22.2d,#26
765 and v22.16b,v22.16b,v31.16b
766 ushr v30.2d,v19.2d,#26
767 and v19.16b,v19.16b,v31.16b
769 add v23.2d,v23.2d,v29.2d // h3 -> h4
770 add v20.2d,v20.2d,v30.2d // h0 -> h1
772 ushr v29.2d,v23.2d,#26
773 and v23.16b,v23.16b,v31.16b
774 ushr v30.2d,v20.2d,#26
775 and v20.16b,v20.16b,v31.16b
776 add v21.2d,v21.2d,v30.2d // h1 -> h2
778 add v19.2d,v19.2d,v29.2d
780 ushr v30.2d,v21.2d,#26
781 and v21.16b,v21.16b,v31.16b
782 add v19.2d,v19.2d,v29.2d // h4 -> h0
783 add v22.2d,v22.2d,v30.2d // h2 -> h3
785 ushr v29.2d,v19.2d,#26
786 and v19.16b,v19.16b,v31.16b
787 ushr v30.2d,v22.2d,#26
788 and v22.16b,v22.16b,v31.16b
789 add v20.2d,v20.2d,v29.2d // h0 -> h1
790 add v23.2d,v23.2d,v30.2d // h3 -> h4
792 ////////////////////////////////////////////////////////////////
793 // write the result, can be partially reduced
795 st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
801 .size poly1305_blocks_neon,.-poly1305_blocks_neon
803 .type poly1305_emit_neon,%function
// poly1305_emit_neon: finalize after the NEON path. If the hash is
// still base 2^64 (x17==0) defer to the scalar emit; otherwise repack
// the 26-bit limbs to base 2^64, reduce, add the nonce and store.
// NOTE(review): sampled listing — glue instructions are missing.
807 cbz x17,poly1305_emit
809 ldp w10,w11,[x0] // load hash value base 2^26
813 add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
815 adds x4,x4,x12,lsl#52
819 adds x5,x5,x14,lsl#40
820 adc x6,x6,xzr // can be partially reduced...
822 ldp x10,x11,[x2] // load nonce
824 and x12,x6,#-4 // ... so reduce
// Final conditional subtraction of 2^130-5, then tag = h + nonce.
831 adds x12,x4,#5 // compare to modulus
835 tst x14,#-4 // see if it's carried/borrowed
841 ror x10,x10,#32 // flip nonce words
844 adds x4,x4,x10 // accumulate nonce
847 rev x4,x4 // flip output bytes
850 stp x4,x5,[x1] // write result
853 .size poly1305_emit_neon,.-poly1305_emit_neon
// Zero block — presumably the .Lzeros padding read when fewer than four
// blocks remain in the NEON path (TODO confirm label in full file).
857 .long 0,0,0,0,0,0,0,0
// PC-relative offsets to OPENSSL_armcap_P (32-bit and 64-bit variants;
// the generated file assembles one, matching the load in poly1305_init).
860 .long OPENSSL_armcap_P-.
862 .quad OPENSSL_armcap_P-.
// ASCII: "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>\0"
864 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0