2 /* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
6 #if defined(__thumb2__)
14 .globl poly1305_blocks
@ ----------------------------------------------------------------------
@ poly1305_init — initialize Poly1305 context at [r0]: zero the hash
@ accumulator and the is_base2_26 flag, load and clamp the 16-byte key
@ (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff per the Poly1305 spec), and,
@ when __ARM_MAX_ARCH__>=7, consult OPENSSL_armcap_P to publish either
@ the NEON or the scalar blocks/emit entry points into the caller's
@ function table at [r2].
@
@ NOTE(review): this listing is a non-contiguous excerpt — the integer
@ at the start of each line is its line number in the full generated
@ file; many intervening instructions are not shown here.
@ ----------------------------------------------------------------------
16 .type poly1305_init,%function
@ Save callee-saved registers (AAPCS: r4-r11 must be preserved).
20 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
24 str r3,[r0,#0] @ zero hash value
29 str r3,[r0,#36] @ is_base2_26
38 #if __ARM_MAX_ARCH__>=7
@ PC-relative addressing so the code stays position-independent.
39 adr r11,.Lpoly1305_init
40 ldr r12,.LOPENSSL_armcap
@ Clamp a key word: clear the low 2 bits required by Poly1305.
45 and r3,r10,#-4 @ 0x0ffffffc
56 #if __ARM_MAX_ARCH__>=7
57 ldr r12,[r11,r12] @ OPENSSL_armcap_P
71 #if __ARM_MAX_ARCH__>=7
72 tst r12,#ARMV7_NEON @ check for NEON
74 adr r9,poly1305_blocks_neon
75 adr r11,poly1305_blocks
81 adr r10,poly1305_emit_neon
@ Select scalar (eq: no NEON) or NEON (ne) entry points, computed as
@ offsets from .Lpoly1305_init for position independence.
90 addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
91 addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
92 addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
93 addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
@ Set bit 0 so an indirect branch enters in Thumb state when built Thumb-2.
96 orr r12,r12,#1 @ thumb-ify address
118 #if __ARM_MAX_ARCH__>=7
119 stmia r2,{r11,r12} @ fill functions table
125 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
@ ARMv4 has no bx; 0xe12fff1e is the bx lr encoding, executed only on
@ cores that support it — keeps one binary usable from ARMv4 to Thumb.
130 moveq pc,lr @ be binary compatible with V4, yet
131 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
133 .size poly1305_init,.-poly1305_init
@ ----------------------------------------------------------------------
@ poly1305_blocks — scalar (base 2^32) block loop.
@ r0 = context, r1 = input, r2 = length, r3 = padbit.
@ Per 16-byte block: accumulate input into the 130-bit h (limbs in
@ r4-r8), multiply by clamped r using the 2^130 = 5 (mod p) folding
@ trick (the add rX,rX,rX,lsr#2 lines compute x + x/4, i.e. the *5/4
@ pre-scaled top bits), carry-propagate, loop until the end pointer.
@ NOTE(review): non-contiguous excerpt — see leading line numbers.
@ ----------------------------------------------------------------------
134 .type poly1305_blocks,%function
138 stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
144 add r2,r2,r1 @ end pointer
147 ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12} @ load context
149 str r0,[sp,#12] @ offload stuff
@ Byte loads here: this path tolerates unaligned input.
159 ldrb r0,[lr],#16 @ load input
@ Append the high pad bit (2^128) unless this is the final partial block.
163 addhi r8,r8,#1 @ 1<<128
173 adds r4,r4,r3 @ accumulate input
195 str lr,[sp,#8] @ offload input pointer
@ Pre-scale a key limb: x + (x>>2) = 5x/4, used for the mod-p reduction.
197 add r10,r10,r10,lsr#2
200 ldr r0,[lr],#16 @ load input
204 addhi r8,r8,#1 @ padbit
214 adds r4,r4,r0 @ accumulate input
215 str lr,[sp,#8] @ offload input pointer
217 add r10,r10,r10,lsr#2
220 add r11,r11,r11,lsr#2
222 add r12,r12,r12,lsr#2
229 ldr r10,[sp,#20] @ reload r10
235 str r0,[sp,#0] @ future r4
237 ldr r11,[sp,#24] @ reload r11
@ Carry chain: fold each 64-bit partial product's high word into the
@ next limb (d1..d3), with h4 absorbing the final carry.
238 adds r2,r2,r1 @ d1+=d0>>32
240 adc lr,r3,#0 @ future r6
241 str r2,[sp,#4] @ future r5
246 ldr r12,[sp,#28] @ reload r12
258 adds r6,lr,r0 @ d2+=d1>>32
259 ldr lr,[sp,#8] @ reload input pointer
261 adds r7,r2,r1 @ d3+=d2>>32
262 ldr r0,[sp,#16] @ reload end pointer
264 add r8,r8,r3 @ h4+=d3>>32
268 add r1,r1,r1,lsr#2 @ *=5
275 cmp r0,lr @ done yet?
280 stmia r0,{r4,r5,r6,r7,r8} @ store the result
@ Fast return path: pop straight into pc where the core allows it.
284 ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
286 ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
288 moveq pc,lr @ be binary compatible with V4, yet
289 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
291 .size poly1305_blocks,.-poly1305_blocks
@ ----------------------------------------------------------------------
@ poly1305_emit — scalar finalization: load h (5 limbs from [r0]),
@ conditionally subtract the modulus 2^130-5 (implemented as "add 5 and
@ test whether bit 2 of the top limb carried"), then add the nonce and
@ store the 16-byte tag. .Lpoly1305_emit_enter is also the fall-through
@ target used by poly1305_emit_neon when the state is base 2^32.
@ NOTE(review): non-contiguous excerpt — see leading line numbers.
@ ----------------------------------------------------------------------
292 .type poly1305_emit,%function
295 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
296 .Lpoly1305_emit_enter:
298 ldmia r0,{r3,r4,r5,r6,r7}
@ h + 5 overflows 2^130 exactly when h >= 2^130 - 5, i.e. when the
@ reduced result should be h - (2^130 - 5).
299 adds r8,r3,#5 @ compare to modulus
304 tst r7,#4 @ did it carry/borrow?
376 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
381 moveq pc,lr @ be binary compatible with V4, yet
382 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
384 .size poly1305_emit,.-poly1305_emit
385 #if __ARM_MAX_ARCH__>=7
@ ----------------------------------------------------------------------
@ poly1305_init_neon — precompute the NEON power table.
@ Converts the clamped key r from base 2^32 (4x32-bit limbs) to base
@ 2^26 (5x26-bit limbs), then repeatedly squares it with the lazy-
@ reduction multiply below to obtain r^2, r^3 and r^4, storing the
@ interleaved table (including the *5 pre-scaled limbs) at context
@ offset 48 for poly1305_blocks_neon to load.
@ NOTE(review): non-contiguous excerpt — see leading line numbers.
@ ----------------------------------------------------------------------
388 .type poly1305_init_neon,%function
391 ldr r4,[r0,#20] @ load key base 2^32
396 and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
404 and r3,r3,#0x03ffffff
405 and r4,r4,#0x03ffffff
406 and r5,r5,#0x03ffffff
408 vdup.32 d0,r2 @ r^1 in both lanes
@ lsl#2 + add = *5: pre-scale limbs so the mod-(2^130-5) wrap is free.
409 add r2,r3,r3,lsl#2 @ *5
425 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
426 @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
427 @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
428 @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
429 @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
430 @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
@ Widening 32x32->64 multiplies; q5..q9 accumulate d0..d4.
432 vmull.u32 q5,d0,d0[1]
433 vmull.u32 q6,d1,d0[1]
434 vmull.u32 q7,d3,d0[1]
435 vmull.u32 q8,d5,d0[1]
436 vmull.u32 q9,d7,d0[1]
438 vmlal.u32 q5,d7,d2[1]
439 vmlal.u32 q6,d0,d1[1]
440 vmlal.u32 q7,d1,d1[1]
441 vmlal.u32 q8,d3,d1[1]
442 vmlal.u32 q9,d5,d1[1]
444 vmlal.u32 q5,d5,d4[1]
445 vmlal.u32 q6,d7,d4[1]
446 vmlal.u32 q8,d1,d3[1]
447 vmlal.u32 q7,d0,d3[1]
448 vmlal.u32 q9,d3,d3[1]
450 vmlal.u32 q5,d3,d6[1]
451 vmlal.u32 q8,d0,d5[1]
452 vmlal.u32 q6,d5,d6[1]
453 vmlal.u32 q7,d7,d6[1]
454 vmlal.u32 q9,d1,d5[1]
456 vmlal.u32 q8,d7,d8[1]
457 vmlal.u32 q5,d1,d8[1]
458 vmlal.u32 q6,d3,d8[1]
459 vmlal.u32 q7,d5,d8[1]
460 vmlal.u32 q9,d0,d7[1]
462 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
463 @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
466 @ H0>>+H1>>+H2>>+H3>>+H4
467 @ H3>>+H4>>*5+H0>>+H1
471 @ Result of multiplication of n-bit number by m-bit number is
472 @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
473 @ m-bit number multiplied by 2^n is still n+m bits wide.
475 @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
476 @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
477 @ one is n+1 bits wide.
479 @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
480 @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
481 @ can be 27. However! In cases when their width exceeds 26 bits
482 @ they are limited by 2^26+2^6. This in turn means that *sum*
483 @ of the products with these values can still be viewed as sum
484 @ of 52-bit numbers as long as the amount of addends is not a
485 @ power of 2. For example,
487 @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
489 @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
490 @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
491 @ 8 * (2^52) or 2^55. However, the value is then multiplied
492 @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
493 @ which is less than 32 * (2^52) or 2^57. And when processing
494 @ data we are looking at triple as many addends...
496 @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
497 @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
498 @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
499 @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
500 @ instruction accepts 2x32-bit input and writes 2x64-bit result.
501 @ This means that result of reduction have to be compressed upon
502 @ loop wrap-around. This can be done in the process of reduction
503 @ to minimize amount of instructions [as well as amount of
504 @ 128-bit instructions, which benefits low-end processors], but
505 @ one has to watch for H2 (which is narrower than H0) and 5*H4
506 @ not being wider than 58 bits, so that result of right shift
507 @ by 26 bits fits in 32 bits. This is also useful on x86,
508 @ because it allows to use paddd in place for paddq, which
509 @ benefits Atom, where paddq is ridiculously slow.
@ Lazy reduction: propagate carries, masking each limb back to 26 bits
@ (vbic #0xfc000000 clears bits 26-31).
515 vadd.i64 q9,q9,q15 @ h3 -> h4
516 vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
517 vadd.i64 q6,q6,q4 @ h0 -> h1
518 vbic.i32 d10,#0xfc000000
524 vadd.i64 q7,q7,q4 @ h1 -> h2
525 vbic.i32 d18,#0xfc000000
526 vbic.i32 d12,#0xfc000000
532 vadd.i32 d10,d10,d30 @ h4 -> h0
533 vadd.i32 d16,d16,d8 @ h2 -> h3
534 vbic.i32 d14,#0xfc000000
537 vbic.i32 d10,#0xfc000000
539 vbic.i32 d16,#0xfc000000
540 vadd.i32 d12,d12,d30 @ h0 -> h1
541 vadd.i32 d18,d18,d8 @ h3 -> h4
544 beq .Lsquare_break_neon
@ Power-table slots: 9 words per power, starting at context offset 48.
546 add r6,r0,#(48+0*9*4)
547 add r7,r0,#(48+1*9*4)
549 vtrn.32 d0,d10 @ r^2:r^1
555 vshl.u32 d4,d3,#2 @ *5
@ Store interleaved powers: lane 0 -> one slot, lane 1 -> the other.
564 vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
565 vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
566 vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
567 vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
568 vst1.32 {d8[0]},[r6,:32]
569 vst1.32 {d8[1]},[r7,:32]
575 add r6,r0,#(48+2*4*9)
576 add r7,r0,#(48+3*4*9)
578 vmov d0,d10 @ r^4:r^3
579 vshl.u32 d2,d12,#2 @ *5
592 vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
593 vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
594 vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
595 vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
600 .size poly1305_init_neon,.-poly1305_init_neon
@ ----------------------------------------------------------------------
@ poly1305_blocks_neon — vectorized block loop, two blocks per 128-bit
@ lane, up to four blocks per iteration using the r^1..r^4 power table.
@ Falls back to the scalar .Lpoly1305_blocks for short input / when the
@ state is not in base 2^26; otherwise converts the hash from base 2^32
@ to base 2^26 on first entry (and records that in is_base2_26).
@ NOTE(review): non-contiguous excerpt — the leading number on each
@ line is its position in the full generated file.
@ ----------------------------------------------------------------------
602 .type poly1305_blocks_neon,%function
604 poly1305_blocks_neon:
605 ldr ip,[r0,#36] @ is_base2_26
611 tst ip,ip @ is_base2_26?
612 beq .Lpoly1305_blocks
614 stmdb sp!,{r4,r5,r6,r7}
@ d8-d15 are callee-saved under AAPCS; everything below uses them.
616 vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
618 tst ip,ip @ is_base2_26?
621 stmdb sp!,{r1,r2,r3,lr}
622 bl poly1305_init_neon
624 ldr r4,[r0,#0] @ load hash value base 2^32
630 and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
642 and r3,r3,#0x03ffffff
645 and r4,r4,#0x03ffffff
647 and r5,r5,#0x03ffffff
648 str r1,[r0,#36] @ is_base2_26
657 ldmia sp!,{r1,r2,r3,lr}
662 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ Load hash limbs h0..h3 (lane 0 of d10/d12/d14/d16) and h4 (d18).
670 vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
672 vld1.32 {d18[0]},[r0]
673 sub r0,r0,#16 @ rewind
681 vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
@ vsri shifts input words right and inserts into the padbit register,
@ splitting 4x32-bit words into 5x26-bit limbs.
692 vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
697 vadd.i32 d29,d28,d18 @ add hash value and move to #hi
699 vbic.i32 d26,#0xfc000000
703 vbic.i32 d24,#0xfc000000
707 vbic.i32 d20,#0xfc000000
708 vbic.i32 d22,#0xfc000000
@ 1<<24 becomes the 2^128 pad bit after the <<8 insert below.
726 vmov.i32 q14,#1<<24 @ padbit, yes, always
727 vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
729 vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
@ Conditionally (hi) pick the 2-blocks-at-a-time power-table slots.
732 addhi r7,r0,#(48+1*9*4)
733 addhi r6,r0,#(48+3*9*4)
741 vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
747 vbic.i32 q13,#0xfc000000
751 vbic.i32 q12,#0xfc000000
754 vbic.i32 q10,#0xfc000000
755 vbic.i32 q11,#0xfc000000
759 vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
760 vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
761 vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
762 vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
767 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
768 @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
769 @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
770 @ ___________________/
771 @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
772 @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
773 @ ___________________/ ____________________/
775 @ Note that we start with inp[2:3]*r^2. This is because it
776 @ doesn't depend on reduction in previous iteration.
777 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
778 @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
779 @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
780 @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
781 @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
782 @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
784 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ Lane [1] of the r vectors is r^2:r^1; this pass multiplies inp[2:3].
787 vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
788 vmull.u32 q7,d25,d0[1]
790 vmull.u32 q5,d21,d0[1]
792 vmull.u32 q8,d27,d0[1]
793 vmlal.u32 q7,d23,d1[1]
795 vmull.u32 q6,d23,d0[1]
798 vmull.u32 q9,d29,d0[1]
800 vmlal.u32 q5,d29,d2[1]
803 vmlal.u32 q8,d25,d1[1]
804 vld1.32 d8[1],[r7,:32]
805 vmlal.u32 q6,d21,d1[1]
806 vmlal.u32 q9,d27,d1[1]
808 vmlal.u32 q5,d27,d4[1]
809 vmlal.u32 q8,d23,d3[1]
810 vmlal.u32 q9,d25,d3[1]
811 vmlal.u32 q6,d29,d4[1]
812 vmlal.u32 q7,d21,d3[1]
814 vmlal.u32 q8,d21,d5[1]
815 vmlal.u32 q5,d25,d6[1]
816 vmlal.u32 q9,d23,d5[1]
817 vmlal.u32 q6,d27,d6[1]
818 vmlal.u32 q7,d29,d6[1]
820 vmlal.u32 q8,d29,d8[1]
821 vmlal.u32 q5,d23,d8[1]
822 vmlal.u32 q9,d21,d7[1]
823 vmlal.u32 q6,d25,d8[1]
824 vmlal.u32 q7,d27,d8[1]
826 vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
829 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
830 @ (hash+inp[0:1])*r^4 and accumulate
@ Lane [0] of the r vectors holds r^4 (and r^3 later in the tail).
832 vmlal.u32 q8,d26,d0[0]
833 vmlal.u32 q5,d20,d0[0]
834 vmlal.u32 q9,d28,d0[0]
835 vmlal.u32 q6,d22,d0[0]
836 vmlal.u32 q7,d24,d0[0]
837 vld1.32 d8[0],[r6,:32]
839 vmlal.u32 q8,d24,d1[0]
840 vmlal.u32 q5,d28,d2[0]
841 vmlal.u32 q9,d26,d1[0]
842 vmlal.u32 q6,d20,d1[0]
843 vmlal.u32 q7,d22,d1[0]
845 vmlal.u32 q8,d22,d3[0]
846 vmlal.u32 q5,d26,d4[0]
847 vmlal.u32 q9,d24,d3[0]
848 vmlal.u32 q6,d28,d4[0]
849 vmlal.u32 q7,d20,d3[0]
851 vmlal.u32 q8,d20,d5[0]
852 vmlal.u32 q5,d24,d6[0]
853 vmlal.u32 q9,d22,d5[0]
854 vmlal.u32 q6,d26,d6[0]
855 vmlal.u32 q8,d28,d8[0]
857 vmlal.u32 q7,d28,d6[0]
858 vmlal.u32 q5,d22,d8[0]
859 vmlal.u32 q9,d20,d7[0]
860 vmov.i32 q14,#1<<24 @ padbit, yes, always
861 vmlal.u32 q6,d24,d8[0]
862 vmlal.u32 q7,d26,d8[0]
864 vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
873 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
874 @ lazy reduction interleaved with base 2^32 -> base 2^26 of
875 @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
881 vadd.i64 q9,q9,q15 @ h3 -> h4
882 vbic.i32 d16,#0xfc000000
883 vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
884 vadd.i64 q6,q6,q4 @ h0 -> h1
886 vbic.i32 d10,#0xfc000000
892 vadd.i64 q7,q7,q4 @ h1 -> h2
894 vbic.i32 d18,#0xfc000000
896 vbic.i32 d12,#0xfc000000
900 vbic.i32 q13,#0xfc000000
@ Widen for the h4->h0 carry because 5*H4 can exceed 32 bits here
@ (see the width commentary in poly1305_init_neon).
903 vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
905 vadd.i32 d16,d16,d8 @ h2 -> h3
907 vbic.i32 d14,#0xfc000000
908 vbic.i32 q12,#0xfc000000
910 vshrn.u64 d30,q5,#26 @ re-narrow
913 vbic.i32 q10,#0xfc000000
915 vbic.i32 d16,#0xfc000000
916 vbic.i32 d10,#0xfc000000
917 vadd.i32 d12,d12,d30 @ h0 -> h1
918 vadd.i32 d18,d18,d8 @ h3 -> h4
919 vbic.i32 q11,#0xfc000000
924 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
925 @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
@ Tail: fewer than 4 blocks remain, switch to the r^2:r^1 slots.
927 add r7,r0,#(48+0*9*4)
928 add r6,r0,#(48+1*9*4)
934 vadd.i32 d25,d24,d14 @ add hash value and move to #hi
941 vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
942 vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
944 vadd.i32 d24,d24,d14 @ can be redundant
956 vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
958 vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
964 vld1.32 d8[1],[r7,:32]
966 vld1.32 d8[0],[r6,:32]
973 addne r7,r0,#(48+2*9*4)
976 addne r6,r0,#(48+3*9*4)
982 vorn q0,q0,q0 @ all-ones, can be redundant
991 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
992 @ (hash+inp[0:1])*r^4:r^3 and accumulate
994 vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
995 vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
1004 vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
1006 vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
1012 vld1.32 d8[1],[r7,:32]
1014 vld1.32 d8[0],[r6,:32]
1026 vorn q0,q0,q0 @ all-ones
1034 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1035 @ horizontal addition
@ Fold the two lanes of each accumulator into one scalar sum.
1037 vadd.i64 d16,d16,d17
1038 vadd.i64 d10,d10,d11
1039 vadd.i64 d18,d18,d19
1040 vadd.i64 d12,d12,d13
1041 vadd.i64 d14,d14,d15
1043 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1044 @ lazy reduction, but without narrowing
1050 vadd.i64 q9,q9,q15 @ h3 -> h4
1051 vadd.i64 q6,q6,q4 @ h0 -> h1
1057 vadd.i64 q7,q7,q4 @ h1 -> h2
1063 vadd.i64 q5,q5,q15 @ h4 -> h0
1064 vadd.i64 q8,q8,q4 @ h2 -> h3
1070 vadd.i64 q6,q6,q15 @ h0 -> h1
1071 vadd.i64 q9,q9,q4 @ h3 -> h4
1076 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ Store the base 2^26 hash back into the context.
1079 vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
1080 vst1.32 {d18[0]},[r0]
1082 vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ epilogue
1083 ldmia sp!,{r4,r5,r6,r7}
1086 .size poly1305_blocks_neon,.-poly1305_blocks_neon
@ ----------------------------------------------------------------------
@ poly1305_emit_neon — finalization for the NEON path. If the state is
@ still base 2^32 (is_base2_26 == 0) it branches to the scalar emit;
@ otherwise it repacks the 5x26-bit limbs into 4x32-bit words via the
@ carry-propagating shift/add chain, performs a partial reduction of
@ the overflow (*5 fold), then the final conditional subtraction of
@ 2^130-5, adds the nonce and stores the 16-byte tag at [r1].
@ NOTE(review): non-contiguous excerpt — see leading line numbers.
@ ----------------------------------------------------------------------
1088 .type poly1305_emit_neon,%function
1091 ldr ip,[r0,#36] @ is_base2_26
1093 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1096 beq .Lpoly1305_emit_enter
1098 ldmia r0,{r3,r4,r5,r6,r7}
@ Each 26-bit limb is shifted into place; adcs pulls the carry along.
1101 adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
1103 adcs r4,r4,r5,lsl#20
1105 adcs r5,r5,r6,lsl#14
1108 adc r7,r8,r7,lsr#24 @ can be partially reduced ...
1110 and r8,r7,#-4 @ ... so reduce
1112 add r8,r8,r8,lsr#2 @ *= 5
@ Final modulus check, same trick as the scalar poly1305_emit.
1119 adds r8,r3,#5 @ compare to modulus
1124 tst r7,#4 @ did it carry/borrow?
1139 adds r3,r3,r8 @ accumulate nonce
1150 str r3,[r1,#0] @ store the result
1155 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1157 .size poly1305_emit_neon,.-poly1305_emit_neon
@ Zero block (64 bytes) — serves as the null "inp[2:3] (or 0)" source
@ loaded by the NEON block loop when fewer blocks remain.
1161 .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
@ PC-relative offset to the capability word, resolved at link time.
1163 .word OPENSSL_armcap_P-.Lpoly1305_init
@ ASCII: "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>\0"
1165 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1168 #if __ARM_MAX_ARCH__>=7
@ Common symbol for the runtime CPU-capability flags (4 bytes, 4-aligned).
1169 .comm OPENSSL_armcap_P,4,4