2 /* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
6 #if defined(__thumb2__)
@ poly1305_init — one-time context setup: zeroes the hash accumulator,
@ clears the is_base2_26 flag, and clamps/stores the key r (the visible
@ "and r3,r10,#-4 @ 0x0ffffffc" line is part of the Poly1305 key clamp).
@ On __ARM_MAX_ARCH__>=7 builds it additionally probes OPENSSL_armcap_P
@ for NEON and fills a two-entry function table (blocks, emit) with
@ either the scalar or the NEON entry points, computed PC-relatively.
@ NOTE(review): this listing is non-contiguous (sampled lines of the
@ generated file); instructions between the visible lines are not shown,
@ and the leading decimal on each line is an artifact of extraction.
14 .globl poly1305_blocks
16 .type poly1305_init,%function
20 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
24 str r3,[r0,#0] @ zero hash value
29 str r3,[r0,#36] @ is_base2_26
38 #if __ARM_MAX_ARCH__>=7
@ position-independent lookup of OPENSSL_armcap_P via a stored delta
39 adr r11,.Lpoly1305_init
40 ldr r12,.LOPENSSL_armcap
45 and r3,r10,#-4 @ 0x0ffffffc
56 #if __ARM_MAX_ARCH__>=7
57 ldr r12,[r11,r12] @ OPENSSL_armcap_P
71 #if __ARM_MAX_ARCH__>=7
72 tst r12,#ARMV7_NEON @ check for NEON
74 adr r9,poly1305_blocks_neon
75 adr r11,poly1305_blocks
81 adr r10,poly1305_emit_neon
@ eq/ne pairs below select scalar vs NEON entry points from the result
@ of the ARMV7_NEON test, as offsets from .Lpoly1305_init
90 addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
91 addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
92 addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
93 addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
96 orr r12,r12,#1 @ thumb-ify address
118 #if __ARM_MAX_ARCH__>=7
119 stmia r2,{r11,r12} @ fill functions table
125 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
@ ARMv4-safe return: "mov pc,lr" where bx is unavailable, else the
@ literal 0xe12fff1e, which encodes "bx lr" for Thumb interworking
130 moveq pc,lr @ be binary compatible with V4, yet
131 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
133 .size poly1305_init,.-poly1305_init
@ poly1305_blocks — scalar (base 2^32) bulk hashing loop.
@ Presumably r0 = context, r1 = input, r2 = length, r3 = padbit (standard
@ OpenSSL poly1305 ABI — TODO confirm against header; not fully visible
@ here). Each iteration accumulates one 16-byte block into h0..h4 and
@ multiplies by r modulo 2^130-5, using the *5 folding trick below.
@ NOTE(review): non-contiguous listing; the multiply/carry chain between
@ the visible lines is omitted by the extraction, not by the code.
134 .type poly1305_blocks,%function
138 stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
144 add r2,r2,r1 @ end pointer
147 ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12} @ load context
149 str r0,[sp,#12] @ offload stuff
159 ldrb r0,[lr],#16 @ load input
163 addhi r8,r8,#1 @ 1<<128
173 adds r4,r4,r3 @ accumulate input
195 str lr,[sp,#8] @ offload input pointer
@ rN += rN>>2 computes rN*5/4 scaled — the 2^130 = 5 (mod p) reduction
@ folds the top limbs back in as multiples of 5
197 add r10,r10,r10,lsr#2
200 ldr r0,[lr],#16 @ load input
204 addhi r8,r8,#1 @ padbit
214 adds r4,r4,r0 @ accumulate input
215 str lr,[sp,#8] @ offload input pointer
217 add r10,r10,r10,lsr#2
220 add r11,r11,r11,lsr#2
222 add r12,r12,r12,lsr#2
@ 64-bit partial products d0..d3 are carried limb to limb via adds/adc;
@ stack slots hold future h-limbs while registers are recycled
229 ldr r10,[sp,#20] @ reload r10
235 str r0,[sp,#0] @ future r4
237 ldr r11,[sp,#24] @ reload r11
238 adds r2,r2,r1 @ d1+=d0>>32
240 adc lr,r3,#0 @ future r6
241 str r2,[sp,#4] @ future r5
246 ldr r12,[sp,#28] @ reload r12
258 adds r6,lr,r0 @ d2+=d1>>32
259 ldr lr,[sp,#8] @ reload input pointer
261 adds r7,r2,r1 @ d3+=d2>>32
262 ldr r0,[sp,#16] @ reload end pointer
264 add r8,r8,r3 @ h4+=d3>>32
268 add r1,r1,r1,lsr#2 @ *=5
275 cmp r0,lr @ done yet?
280 stmia r0,{r4,r5,r6,r7,r8} @ store the result
@ two epilogues: pop straight into pc where supported, else pop lr and
@ return ARMv4/Thumb-compatibly as in poly1305_init
284 ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
286 ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
288 moveq pc,lr @ be binary compatible with V4, yet
289 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
291 .size poly1305_blocks,.-poly1305_blocks
@ poly1305_emit — final reduction and tag output (scalar path).
@ Loads h0..h4 from the context, computes h+5 to compare against the
@ modulus 2^130-5, and selects the reduced value by testing the carry
@ into bit 2 of the top limb.  The nonce addition and tag store occur
@ between the visible lines (listing is non-contiguous).
292 .type poly1305_emit,%function
296 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
@ .Lpoly1305_emit_enter is also branched to from poly1305_emit_neon
@ when the hash is still in base 2^32
297 .Lpoly1305_emit_enter:
299 ldmia r0,{r3,r4,r5,r6,r7}
300 adds r8,r3,#5 @ compare to modulus
305 tst r7,#4 @ did it carry/borrow?
377 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
382 moveq pc,lr @ be binary compatible with V4, yet
383 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
385 .size poly1305_emit,.-poly1305_emit
386 #if __ARM_MAX_ARCH__>=7
@ poly1305_init_neon — precompute r^1..r^4 in base 2^26 for the NEON
@ path.  Converts the clamped key from base 2^32 to five 26-bit limbs,
@ then squares it twice (the vmull/vmlal ladder below) to obtain r^2 and
@ r^4, storing the power table at context offset 48 in vld4/vst4-friendly
@ interleaved form together with the precomputed 5*r limbs.
@ NOTE(review): non-contiguous listing; shifts/carries between visible
@ lines are omitted by the extraction.
389 .type poly1305_init_neon,%function
392 ldr r4,[r0,#20] @ load key base 2^32
397 and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
405 and r3,r3,#0x03ffffff
406 and r4,r4,#0x03ffffff
407 and r5,r5,#0x03ffffff
409 vdup.32 d0,r2 @ r^1 in both lanes
410 add r2,r3,r3,lsl#2 @ *5
426 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
427 @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
428 @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
429 @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
430 @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
431 @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
@ q5..q9 accumulate d0..d4 as 2x64-bit lanes; lane [1] of d0..d8 holds
@ the current power of r (and 5*r) being squared
433 vmull.u32 q5,d0,d0[1]
434 vmull.u32 q6,d1,d0[1]
435 vmull.u32 q7,d3,d0[1]
436 vmull.u32 q8,d5,d0[1]
437 vmull.u32 q9,d7,d0[1]
439 vmlal.u32 q5,d7,d2[1]
440 vmlal.u32 q6,d0,d1[1]
441 vmlal.u32 q7,d1,d1[1]
442 vmlal.u32 q8,d3,d1[1]
443 vmlal.u32 q9,d5,d1[1]
445 vmlal.u32 q5,d5,d4[1]
446 vmlal.u32 q6,d7,d4[1]
447 vmlal.u32 q8,d1,d3[1]
448 vmlal.u32 q7,d0,d3[1]
449 vmlal.u32 q9,d3,d3[1]
451 vmlal.u32 q5,d3,d6[1]
452 vmlal.u32 q8,d0,d5[1]
453 vmlal.u32 q6,d5,d6[1]
454 vmlal.u32 q7,d7,d6[1]
455 vmlal.u32 q9,d1,d5[1]
457 vmlal.u32 q8,d7,d8[1]
458 vmlal.u32 q5,d1,d8[1]
459 vmlal.u32 q6,d3,d8[1]
460 vmlal.u32 q7,d5,d8[1]
461 vmlal.u32 q9,d0,d7[1]
463 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
464 @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
467 @ H0>>+H1>>+H2>>+H3>>+H4
468 @ H3>>+H4>>*5+H0>>+H1
472 @ Result of multiplication of n-bit number by m-bit number is
473 @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
474 @ m-bit number multiplied by 2^n is still n+m bits wide.
476 @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
477 @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
478 @ one is n+1 bits wide.
480 @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
481 @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
482 @ can be 27. However! In cases when their width exceeds 26 bits
483 @ they are limited by 2^26+2^6. This in turn means that *sum*
484 @ of the products with these values can still be viewed as sum
485 @ of 52-bit numbers as long as the amount of addends is not a
486 @ power of 2. For example,
488 @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
490 @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
491 @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
492 @ 8 * (2^52) or 2^55. However, the value is then multiplied by
493 @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
494 @ which is less than 32 * (2^52) or 2^57. And when processing
495 @ data we are looking at triple as many addends...
497 @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
498 @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
499 @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
500 @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
501 @ instruction accepts 2x32-bit input and writes 2x64-bit result.
502 @ This means that result of reduction have to be compressed upon
503 @ loop wrap-around. This can be done in the process of reduction
504 @ to minimize amount of instructions [as well as amount of
505 @ 128-bit instructions, which benefits low-end processors], but
506 @ one has to watch for H2 (which is narrower than H0) and 5*H4
507 @ not being wider than 58 bits, so that result of right shift
508 @ by 26 bits fits in 32 bits. This is also useful on x86,
509 @ because it allows to use paddd in place for paddq, which
510 @ benefits Atom, where paddq is ridiculously slow.
@ masks 0xfc000000 clear everything above bit 25: &=0x03ffffff
516 vadd.i64 q9,q9,q15 @ h3 -> h4
517 vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
518 vadd.i64 q6,q6,q4 @ h0 -> h1
519 vbic.i32 d10,#0xfc000000
525 vadd.i64 q7,q7,q4 @ h1 -> h2
526 vbic.i32 d18,#0xfc000000
527 vbic.i32 d12,#0xfc000000
533 vadd.i32 d10,d10,d30 @ h4 -> h0
534 vadd.i32 d16,d16,d8 @ h2 -> h3
535 vbic.i32 d14,#0xfc000000
538 vbic.i32 d10,#0xfc000000
540 vbic.i32 d16,#0xfc000000
541 vadd.i32 d12,d12,d30 @ h0 -> h1
542 vadd.i32 d18,d18,d8 @ h3 -> h4
@ first pass squared r^1 -> r^2; loop back to square once more for r^4
545 beq .Lsquare_break_neon
@ store r^2:r^1 interleaved at ctx+48 (9 words per power: 4+4 limbs +
@ the 5*r4 word stored with vst1 below)
547 add r6,r0,#(48+0*9*4)
548 add r7,r0,#(48+1*9*4)
550 vtrn.32 d0,d10 @ r^2:r^1
556 vshl.u32 d4,d3,#2 @ *5
565 vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
566 vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
567 vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
568 vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
569 vst1.32 {d8[0]},[r6,:32]
570 vst1.32 {d8[1]},[r7,:32]
@ after the break: store r^4:r^3 into power-table slots 2 and 3
576 add r6,r0,#(48+2*4*9)
577 add r7,r0,#(48+3*4*9)
579 vmov d0,d10 @ r^4:r^3
580 vshl.u32 d2,d12,#2 @ *5
593 vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
594 vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
595 vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
596 vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
601 .size poly1305_init_neon,.-poly1305_init_neon
@ poly1305_blocks_neon — NEON bulk hashing, 4 blocks per iteration.
@ Falls back to the scalar poly1305_blocks when the hash is not (yet)
@ in base 2^26, otherwise converts hash/input to five 26-bit limbs and
@ evaluates the polynomial with r^4..r^1 powers (hence the two-lane
@ r^2:r^1 / r^4:r^3 register layout loaded from the ctx+48 table).
@ NOTE(review): non-contiguous listing; length checks, odd-block
@ handling and several shift/carry steps fall between visible lines.
603 .type poly1305_blocks_neon,%function
605 poly1305_blocks_neon:
606 .Lpoly1305_blocks_neon:
607 ldr ip,[r0,#36] @ is_base2_26
613 tst ip,ip @ is_base2_26?
614 beq .Lpoly1305_blocks
@ d8-d15 are callee-saved in AAPCS, hence the vstmdb
617 stmdb sp!,{r4,r5,r6,r7}
618 vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
620 tst ip,ip @ is_base2_26?
623 stmdb sp!,{r1,r2,r3,lr}
624 bl poly1305_init_neon
@ lazy conversion of the scalar hash into 26-bit limbs on first entry
626 ldr r4,[r0,#0] @ load hash value base 2^32
632 and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
644 and r3,r3,#0x03ffffff
647 and r4,r4,#0x03ffffff
649 and r5,r5,#0x03ffffff
650 str r1,[r0,#36] @ is_base2_26
659 ldmia sp!,{r1,r2,r3,lr}
664 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ load current hash limbs h0..h4 (even lanes of d10..d18)
672 vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
674 vld1.32 {d18[0]},[r0]
675 sub r0,r0,#16 @ rewind
683 vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
694 vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
699 vadd.i32 d29,d28,d18 @ add hash value and move to #hi
701 vbic.i32 d26,#0xfc000000
705 vbic.i32 d24,#0xfc000000
709 vbic.i32 d20,#0xfc000000
710 vbic.i32 d22,#0xfc000000
@ main 4-block loop setup: q10-q13 carry inp[0:3], q14 the 2^128 padbit
728 vmov.i32 q14,#1<<24 @ padbit, yes, always
729 vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
731 vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
734 addhi r7,r0,#(48+1*9*4)
735 addhi r6,r0,#(48+3*9*4)
743 vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
749 vbic.i32 q13,#0xfc000000
753 vbic.i32 q12,#0xfc000000
756 vbic.i32 q10,#0xfc000000
757 vbic.i32 q11,#0xfc000000
@ r^2 in lane [1], r^4 in lane [0] of d0..d7 (d8 = 5*r4)
761 vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
762 vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
763 vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
764 vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
769 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
770 @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
771 @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
772 @ ___________________/
773 @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
774 @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
775 @ ___________________/ ____________________/
777 @ Note that we start with inp[2:3]*r^2. This is because it
778 @ doesn't depend on reduction in previous iteration.
779 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
780 @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
781 @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
782 @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
783 @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
784 @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
786 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ inp[2:3]*r^2 first (lane [1] multiplies) — independent of the
@ previous iteration's reduction, so it hides reduction latency
789 vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
790 vmull.u32 q7,d25,d0[1]
792 vmull.u32 q5,d21,d0[1]
794 vmull.u32 q8,d27,d0[1]
795 vmlal.u32 q7,d23,d1[1]
797 vmull.u32 q6,d23,d0[1]
800 vmull.u32 q9,d29,d0[1]
802 vmlal.u32 q5,d29,d2[1]
805 vmlal.u32 q8,d25,d1[1]
806 vld1.32 d8[1],[r7,:32]
807 vmlal.u32 q6,d21,d1[1]
808 vmlal.u32 q9,d27,d1[1]
810 vmlal.u32 q5,d27,d4[1]
811 vmlal.u32 q8,d23,d3[1]
812 vmlal.u32 q9,d25,d3[1]
813 vmlal.u32 q6,d29,d4[1]
814 vmlal.u32 q7,d21,d3[1]
816 vmlal.u32 q8,d21,d5[1]
817 vmlal.u32 q5,d25,d6[1]
818 vmlal.u32 q9,d23,d5[1]
819 vmlal.u32 q6,d27,d6[1]
820 vmlal.u32 q7,d29,d6[1]
822 vmlal.u32 q8,d29,d8[1]
823 vmlal.u32 q5,d23,d8[1]
824 vmlal.u32 q9,d21,d7[1]
825 vmlal.u32 q6,d25,d8[1]
826 vmlal.u32 q7,d27,d8[1]
828 vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
831 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
832 @ (hash+inp[0:1])*r^4 and accumulate
@ lane [0] multiplies use r^4 on the hash+inp[0:1] limbs (d20..d28)
834 vmlal.u32 q8,d26,d0[0]
835 vmlal.u32 q5,d20,d0[0]
836 vmlal.u32 q9,d28,d0[0]
837 vmlal.u32 q6,d22,d0[0]
838 vmlal.u32 q7,d24,d0[0]
839 vld1.32 d8[0],[r6,:32]
841 vmlal.u32 q8,d24,d1[0]
842 vmlal.u32 q5,d28,d2[0]
843 vmlal.u32 q9,d26,d1[0]
844 vmlal.u32 q6,d20,d1[0]
845 vmlal.u32 q7,d22,d1[0]
847 vmlal.u32 q8,d22,d3[0]
848 vmlal.u32 q5,d26,d4[0]
849 vmlal.u32 q9,d24,d3[0]
850 vmlal.u32 q6,d28,d4[0]
851 vmlal.u32 q7,d20,d3[0]
853 vmlal.u32 q8,d20,d5[0]
854 vmlal.u32 q5,d24,d6[0]
855 vmlal.u32 q9,d22,d5[0]
856 vmlal.u32 q6,d26,d6[0]
857 vmlal.u32 q8,d28,d8[0]
859 vmlal.u32 q7,d28,d6[0]
860 vmlal.u32 q5,d22,d8[0]
861 vmlal.u32 q9,d20,d7[0]
862 vmov.i32 q14,#1<<24 @ padbit, yes, always
863 vmlal.u32 q6,d24,d8[0]
864 vmlal.u32 q7,d26,d8[0]
866 vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
875 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
876 @ lazy reduction interleaved with base 2^32 -> base 2^26 of
877 @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
883 vadd.i64 q9,q9,q15 @ h3 -> h4
884 vbic.i32 d16,#0xfc000000
885 vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
886 vadd.i64 q6,q6,q4 @ h0 -> h1
888 vbic.i32 d10,#0xfc000000
894 vadd.i64 q7,q7,q4 @ h1 -> h2
896 vbic.i32 d18,#0xfc000000
898 vbic.i32 d12,#0xfc000000
902 vbic.i32 q13,#0xfc000000
@ widen the h4->h0 carry to 64 bits so the shift-by-26 can't overflow
905 vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
907 vadd.i32 d16,d16,d8 @ h2 -> h3
909 vbic.i32 d14,#0xfc000000
910 vbic.i32 q12,#0xfc000000
912 vshrn.u64 d30,q5,#26 @ re-narrow
915 vbic.i32 q10,#0xfc000000
917 vbic.i32 d16,#0xfc000000
918 vbic.i32 d10,#0xfc000000
919 vadd.i32 d12,d12,d30 @ h0 -> h1
920 vadd.i32 d18,d18,d8 @ h3 -> h4
921 vbic.i32 q11,#0xfc000000
926 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
927 @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
@ tail: fewer than 4 blocks remain, switch to the r^2:r^1 table slots
929 add r7,r0,#(48+0*9*4)
930 add r6,r0,#(48+1*9*4)
936 vadd.i32 d25,d24,d14 @ add hash value and move to #hi
943 vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
944 vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
946 vadd.i32 d24,d24,d14 @ can be redundant
958 vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
960 vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
966 vld1.32 d8[1],[r7,:32]
968 vld1.32 d8[0],[r6,:32]
975 addne r7,r0,#(48+2*9*4)
978 addne r6,r0,#(48+3*9*4)
984 vorn q0,q0,q0 @ all-ones, can be redundant
993 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
994 @ (hash+inp[0:1])*r^4:r^3 and accumulate
996 vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
997 vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
1006 vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
1008 vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
1014 vld1.32 d8[1],[r7,:32]
1016 vld1.32 d8[0],[r6,:32]
1028 vorn q0,q0,q0 @ all-ones
1036 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1037 @ horizontal addition
@ fold the two 64-bit lanes of each accumulator into one value
1039 vadd.i64 d16,d16,d17
1040 vadd.i64 d10,d10,d11
1041 vadd.i64 d18,d18,d19
1042 vadd.i64 d12,d12,d13
1043 vadd.i64 d14,d14,d15
1045 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1046 @ lazy reduction, but without narrowing
1052 vadd.i64 q9,q9,q15 @ h3 -> h4
1053 vadd.i64 q6,q6,q4 @ h0 -> h1
1059 vadd.i64 q7,q7,q4 @ h1 -> h2
1065 vadd.i64 q5,q5,q15 @ h4 -> h0
1066 vadd.i64 q8,q8,q4 @ h2 -> h3
1072 vadd.i64 q6,q6,q15 @ h0 -> h1
1073 vadd.i64 q9,q9,q4 @ h3 -> h4
1078 @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ store the base-2^26 hash back into the context and restore d8-d15
1081 vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
1082 vst1.32 {d18[0]},[r0]
1084 vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ epilogue
1085 ldmia sp!,{r4,r5,r6,r7}
1088 .size poly1305_blocks_neon,.-poly1305_blocks_neon
@ poly1305_emit_neon — tag output for the NEON path.  If the hash was
@ never converted to base 2^26 (is_base2_26 == 0) it branches to the
@ scalar .Lpoly1305_emit_enter; otherwise it first converts the five
@ 26-bit limbs back to four base-2^32 words (the lsl #26/#20/#14 carry
@ chain), folds the partially-reduced top bits back in as *5, then does
@ the usual compare-to-modulus / conditional-subtract and nonce add.
1090 .type poly1305_emit_neon,%function
1093 .Lpoly1305_emit_neon:
1094 ldr ip,[r0,#36] @ is_base2_26
1096 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1099 beq .Lpoly1305_emit_enter
1101 ldmia r0,{r3,r4,r5,r6,r7}
1104 adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
1106 adcs r4,r4,r5,lsl#20
1108 adcs r5,r5,r6,lsl#14
1111 adc r7,r8,r7,lsr#24 @ can be partially reduced ...
1113 and r8,r7,#-4 @ ... so reduce
1115 add r8,r8,r8,lsr#2 @ *= 5
1122 adds r8,r3,#5 @ compare to modulus
1127 tst r7,#4 @ did it carry/borrow?
1142 adds r3,r3,r8 @ accumulate nonce
1153 str r3,[r1,#0] @ store the result
1158 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1160 .size poly1305_emit_neon,.-poly1305_emit_neon
@ Trailer data: a block of zeros (presumably the .Lzeros padding table
@ used by the NEON tail handling — TODO confirm, label not visible in
@ this sampled listing), the PC-relative delta to OPENSSL_armcap_P used
@ by poly1305_init, the ASCII ident string
@ "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>",
@ and the common symbol for the capability word.
1164 .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1166 .word OPENSSL_armcap_P-.Lpoly1305_init
1168 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1171 #if __ARM_MAX_ARCH__>=7
1172 .comm OPENSSL_armcap_P,4,4