/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from armv4-mont.pl. */
#include "arm_arch.h"

.text
.code   32

#if __ARM_MAX_ARCH__>=7
.align  5
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-bn_mul_mont
#endif

.global bn_mul_mont
.type   bn_mul_mont,%function

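@ Prototype and calling convention (inferred from the argument loads
@ below; this matches the bn_mul_mont() declaration in OpenSSL's bn.h):
@
@   int bn_mul_mont(BN_ULONG *rp,       /* r0      */
@                   const BN_ULONG *ap, /* r1      */
@                   const BN_ULONG *bp, /* r2      */
@                   const BN_ULONG *np, /* r3      */
@                   const BN_ULONG *n0, /* [sp]    */
@                   int num);           /* [sp,#4] */
@
@ Computes rp[] = ap[]*bp[]*R^-1 mod np[] (R = 2^(32*num)) and returns 1,
@ or returns 0 without doing any work when num<2.  When num is a multiple
@ of 8 and OPENSSL_armcap_P reports NEON, the work is handed off to
@ bn_mul8x_mont_neon below.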
.align  5
bn_mul_mont:
        ldr     ip,[sp,#4]              @ load num
        stmdb   sp!,{r0,r2}             @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
        tst     ip,#7
        bne     .Lialu
        adr     r0,bn_mul_mont
        ldr     r2,.LOPENSSL_armcap
        ldr     r0,[r0,r2]
        tst     r0,#1                   @ NEON available?
        ldmia   sp, {r0,r2}
        beq     .Lialu
        add     sp,sp,#8
        b       bn_mul8x_mont_neon
.align  4
.Lialu:
#endif
        cmp     ip,#2
        mov     r0,ip                   @ load num
        movlt   r0,#0
        addlt   sp,sp,#2*4
        blt     .Labrt

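@ Scratch frame, as set up by the alloca below (offsets are relative to
@ r0, which is left pointing at &tp[num-1]; layout inferred from the
@ fixed offsets used throughout):
@   sp .. r0+4        tp[0..num], num+1 temporary words
@   [r0,#2*4..#11*4]  saved r4-r12,lr
@   [r0,#12*4]        rp (the saved r0 argument)
@   [r0,#13*4]        bp (the saved r2 argument, later the current &bp[i])
@   [r0,#14*4]        caller's &n0, overwritten below with the n0 value
@   [r0,#15*4]        caller's num slot, reused to hold &bp[num]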
        stmdb   sp!,{r4-r12,lr}         @ save 10 registers

        mov     r0,r0,lsl#2             @ rescale r0 for byte count
        sub     sp,sp,r0                @ alloca(4*num)
        sub     sp,sp,#4                @ +extra dword
        sub     r0,r0,#4                @ "num=num-1"
        add     r4,r2,r0                @ &bp[num-1]

        add     r0,sp,r0                @ r0 to point at &tp[num-1]
        ldr     r8,[r0,#14*4]           @ &n0
        ldr     r2,[r2]                 @ bp[0]
        ldr     r5,[r1],#4              @ ap[0],ap++
        ldr     r6,[r3],#4              @ np[0],np++
        ldr     r8,[r8]                 @ *n0
        str     r4,[r0,#15*4]           @ save &bp[num]

        umull   r10,r11,r5,r2           @ ap[0]*bp[0]
        str     r8,[r0,#14*4]           @ save n0 value
        mul     r8,r10,r8               @ "tp[0]"*n0
        mov     r12,#0
        umlal   r10,r12,r6,r8           @ np[0]*n0+"t[0]"
        mov     r4,sp

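@ First pass (i=0): tp[] <- ap[]*bp[0] + np[]*m0, one word per iteration.
@ m0 = (ap[0]*bp[0])*n0 mod 2^32 was chosen above so that the low word of
@ the sum vanishes; r11:r10 carries the ap*bp product and r14:r12 the
@ np*m0 product plus the running carry.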
.L1st:
        ldr     r5,[r1],#4              @ ap[j],ap++
        mov     r10,r11
        ldr     r6,[r3],#4              @ np[j],np++
        mov     r11,#0
        umlal   r10,r11,r5,r2           @ ap[j]*bp[0]
        mov     r14,#0
        umlal   r12,r14,r6,r8           @ np[j]*n0
        adds    r12,r12,r10
        str     r12,[r4],#4             @ tp[j-1]=,tp++
        adc     r12,r14,#0
        cmp     r4,r0
        bne     .L1st

        adds    r12,r12,r11
        ldr     r4,[r0,#13*4]           @ restore bp
        mov     r14,#0
        ldr     r8,[r0,#14*4]           @ restore n0
        adc     r14,r14,#0
        str     r12,[r0]                @ tp[num-1]=
        str     r14,[r0,#4]             @ tp[num]=

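@ Outer loop (i=1..num-1): fold ap[]*bp[i] and np[]*m_i into the running
@ tp[], where m_i = (tp[0]+ap[0]*bp[i])*n0 mod 2^32; same structure as
@ .L1st, with the previous tp[] words added in via r7.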
.Louter:
        sub     r7,r0,sp                @ "original" r0-1 value
        sub     r1,r1,r7                @ "rewind" ap to &ap[1]
        ldr     r2,[r4,#4]!             @ *(++bp)
        sub     r3,r3,r7                @ "rewind" np to &np[1]
        ldr     r5,[r1,#-4]             @ ap[0]
        ldr     r10,[sp]                @ tp[0]
        ldr     r6,[r3,#-4]             @ np[0]
        ldr     r7,[sp,#4]              @ tp[1]

        mov     r11,#0
        umlal   r10,r11,r5,r2           @ ap[0]*bp[i]+tp[0]
        str     r4,[r0,#13*4]           @ save bp
        mul     r8,r10,r8
        mov     r12,#0
        umlal   r10,r12,r6,r8           @ np[0]*n0+"tp[0]"
        mov     r4,sp

.Linner:
        ldr     r5,[r1],#4              @ ap[j],ap++
        adds    r10,r11,r7              @ +=tp[j]
        ldr     r6,[r3],#4              @ np[j],np++
        mov     r11,#0
        umlal   r10,r11,r5,r2           @ ap[j]*bp[i]
        mov     r14,#0
        umlal   r12,r14,r6,r8           @ np[j]*n0
        adc     r11,r11,#0
        ldr     r7,[r4,#8]              @ tp[j+1]
        adds    r12,r12,r10
        str     r12,[r4],#4             @ tp[j-1]=,tp++
        adc     r12,r14,#0
        cmp     r4,r0
        bne     .Linner

        adds    r12,r12,r11
        mov     r14,#0
        ldr     r4,[r0,#13*4]           @ restore bp
        adc     r14,r14,#0
        ldr     r8,[r0,#14*4]           @ restore n0
        adds    r12,r12,r7
        ldr     r7,[r0,#15*4]           @ restore &bp[num]
        adc     r14,r14,#0
        str     r12,[r0]                @ tp[num-1]=
        str     r14,[r0,#4]             @ tp[num]=

        cmp     r4,r7
        bne     .Louter

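@ Final reduction: tp[] now holds the (num+1)-word Montgomery product.
@ Subtract np[] once into rp[]; if the subtraction borrows, the copy loop
@ below keeps tp[] instead, and every tp[] word is overwritten (with the
@ stack pointer value) as it is consumed.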
        ldr     r2,[r0,#12*4]           @ pull rp
        add     r0,r0,#4                @ r0 to point at &tp[num]
        sub     r5,r0,sp                @ "original" num value
        mov     r4,sp                   @ "rewind" r4
        mov     r1,r4                   @ "borrow" r1
        sub     r3,r3,r5                @ "rewind" r3 to &np[0]

        subs    r7,r7,r7                @ "clear" carry flag
.Lsub:  ldr     r7,[r4],#4
        ldr     r6,[r3],#4
        sbcs    r7,r7,r6                @ tp[j]-np[j]
        str     r7,[r2],#4              @ rp[j]=
        teq     r4,r0                   @ preserve carry
        bne     .Lsub
        sbcs    r14,r14,#0              @ upmost carry
        mov     r4,sp                   @ "rewind" r4
        sub     r2,r2,r5                @ "rewind" r2

.Lcopy: ldr     r7,[r4]                 @ conditional copy
        ldr     r5,[r2]
        str     sp,[r4],#4              @ zap tp
#ifdef  __thumb2__
        it      cc
#endif
        movcc   r5,r7
        str     r5,[r2],#4
        teq     r4,r0                   @ preserve carry
        bne     .Lcopy

        add     sp,r0,#4                @ skip over tp[num+1]
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
.Labrt:
#if __ARM_ARCH__>=5
        bx      lr                      @ .word 0xe12fff1e
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e              @ interoperable with Thumb ISA:-)
#endif
.size   bn_mul_mont,.-bn_mul_mont
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

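@ NEON code path, entered from bn_mul_mont when num is a multiple of 8
@ and NEON is available.  The words are kept in a redundant form: each
@ 32-bit word is split into 16-bit halves (vzip.16) that are multiplied
@ and accumulated in the 64-bit lanes of q6-q13, so per-word carries only
@ need to be resolved once, in the .LNEON_tail code.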
.type   bn_mul8x_mont_neon,%function
.align  5
bn_mul8x_mont_neon:
        mov     ip,sp
        stmdb   sp!,{r4-r11}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        ldmia   ip,{r4-r5}              @ load rest of parameter block

        sub             r7,sp,#16
        vld1.32         {d28[0]}, [r2,:32]!
        sub             r7,r7,r5,lsl#4
        vld1.32         {d0-d3},  [r1]!         @ can't specify :32 :-(
        and             r7,r7,#-64
        vld1.32         {d30[0]}, [r4,:32]
        mov             sp,r7                   @ alloca
        veor            d8,d8,d8
        subs            r8,r5,#8
        vzip.16         d28,d8

        vmull.u32       q6,d28,d0[0]
        vmull.u32       q7,d28,d0[1]
        vmull.u32       q8,d28,d1[0]
        vshl.i64        d10,d13,#16
        vmull.u32       q9,d28,d1[1]

        vadd.u64        d10,d10,d12
        veor            d8,d8,d8
        vmul.u32        d29,d10,d30

        vmull.u32       q10,d28,d2[0]
         vld1.32        {d4-d7}, [r3]!
        vmull.u32       q11,d28,d2[1]
        vmull.u32       q12,d28,d3[0]
        vzip.16         d29,d8
        vmull.u32       q13,d28,d3[1]

        bne     .LNEON_1st

        @ special case for num=8, everything is in register bank...

        vmlal.u32       q6,d29,d4[0]
        sub             r9,r5,#1
        vmlal.u32       q7,d29,d4[1]
        vmlal.u32       q8,d29,d5[0]
        vmlal.u32       q9,d29,d5[1]

        vmlal.u32       q10,d29,d6[0]
        vmov            q5,q6
        vmlal.u32       q11,d29,d6[1]
        vmov            q6,q7
        vmlal.u32       q12,d29,d7[0]
        vmov            q7,q8
        vmlal.u32       q13,d29,d7[1]
        vmov            q8,q9
        vmov            q9,q10
        vshr.u64        d10,d10,#16
        vmov            q10,q11
        vmov            q11,q12
        vadd.u64        d10,d10,d11
        vmov            q12,q13
        veor            q13,q13
        vshr.u64        d10,d10,#16

        b       .LNEON_outer8

.align  4
.LNEON_outer8:
        vld1.32         {d28[0]}, [r2,:32]!
        veor            d8,d8,d8
        vzip.16         d28,d8
        vadd.u64        d12,d12,d10

        vmlal.u32       q6,d28,d0[0]
        vmlal.u32       q7,d28,d0[1]
        vmlal.u32       q8,d28,d1[0]
        vshl.i64        d10,d13,#16
        vmlal.u32       q9,d28,d1[1]

        vadd.u64        d10,d10,d12
        veor            d8,d8,d8
        subs            r9,r9,#1
        vmul.u32        d29,d10,d30

        vmlal.u32       q10,d28,d2[0]
        vmlal.u32       q11,d28,d2[1]
        vmlal.u32       q12,d28,d3[0]
        vzip.16         d29,d8
        vmlal.u32       q13,d28,d3[1]

        vmlal.u32       q6,d29,d4[0]
        vmlal.u32       q7,d29,d4[1]
        vmlal.u32       q8,d29,d5[0]
        vmlal.u32       q9,d29,d5[1]

        vmlal.u32       q10,d29,d6[0]
        vmov            q5,q6
        vmlal.u32       q11,d29,d6[1]
        vmov            q6,q7
        vmlal.u32       q12,d29,d7[0]
        vmov            q7,q8
        vmlal.u32       q13,d29,d7[1]
        vmov            q8,q9
        vmov            q9,q10
        vshr.u64        d10,d10,#16
        vmov            q10,q11
        vmov            q11,q12
        vadd.u64        d10,d10,d11
        vmov            q12,q13
        veor            q13,q13
        vshr.u64        d10,d10,#16

        bne     .LNEON_outer8

        vadd.u64        d12,d12,d10
        mov             r7,sp
        vshr.u64        d10,d12,#16
        mov             r8,r5
        vadd.u64        d13,d13,d10
        add             r6,sp,#16
        vshr.u64        d10,d13,#16
        vzip.16         d12,d13

        b       .LNEON_tail2

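@ .LNEON_1st: first pass for num>8.  Eight ap[]/np[] words are processed
@ per iteration; completed 64-bit partial sums are spilled to the scratch
@ area through r7 while the next block of inputs is loaded.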
.align  4
.LNEON_1st:
        vmlal.u32       q6,d29,d4[0]
         vld1.32        {d0-d3}, [r1]!
        vmlal.u32       q7,d29,d4[1]
        subs            r8,r8,#8
        vmlal.u32       q8,d29,d5[0]
        vmlal.u32       q9,d29,d5[1]

        vmlal.u32       q10,d29,d6[0]
         vld1.32        {d4-d5}, [r3]!
        vmlal.u32       q11,d29,d6[1]
         vst1.64        {q6-q7}, [r7,:256]!
        vmlal.u32       q12,d29,d7[0]
        vmlal.u32       q13,d29,d7[1]
         vst1.64        {q8-q9}, [r7,:256]!

        vmull.u32       q6,d28,d0[0]
         vld1.32        {d6-d7}, [r3]!
        vmull.u32       q7,d28,d0[1]
         vst1.64        {q10-q11}, [r7,:256]!
        vmull.u32       q8,d28,d1[0]
        vmull.u32       q9,d28,d1[1]
         vst1.64        {q12-q13}, [r7,:256]!

        vmull.u32       q10,d28,d2[0]
        vmull.u32       q11,d28,d2[1]
        vmull.u32       q12,d28,d3[0]
        vmull.u32       q13,d28,d3[1]

        bne     .LNEON_1st

        vmlal.u32       q6,d29,d4[0]
        add             r6,sp,#16
        vmlal.u32       q7,d29,d4[1]
        sub             r1,r1,r5,lsl#2          @ rewind r1
        vmlal.u32       q8,d29,d5[0]
         vld1.64        {q5}, [sp,:128]
        vmlal.u32       q9,d29,d5[1]
        sub             r9,r5,#1

        vmlal.u32       q10,d29,d6[0]
        vst1.64         {q6-q7}, [r7,:256]!
        vmlal.u32       q11,d29,d6[1]
        vshr.u64        d10,d10,#16
         vld1.64        {q6},       [r6, :128]!
        vmlal.u32       q12,d29,d7[0]
        vst1.64         {q8-q9}, [r7,:256]!
        vmlal.u32       q13,d29,d7[1]

        vst1.64         {q10-q11}, [r7,:256]!
        vadd.u64        d10,d10,d11
        veor            q4,q4,q4
        vst1.64         {q12-q13}, [r7,:256]!
         vld1.64        {q7-q8}, [r6, :256]!
        vst1.64         {q4},          [r7,:128]
        vshr.u64        d10,d10,#16

        b               .LNEON_outer

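@ .LNEON_outer: one iteration per remaining bp[] word.  The partial sums
@ spilled on the previous pass are streamed back in through r6, combined
@ with ap[]*bp[i] and np[]*m_i, and written back out through r7.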
.align  4
.LNEON_outer:
        vld1.32         {d28[0]}, [r2,:32]!
        sub             r3,r3,r5,lsl#2          @ rewind r3
        vld1.32         {d0-d3},  [r1]!
        veor            d8,d8,d8
        mov             r7,sp
        vzip.16         d28,d8
        sub             r8,r5,#8
        vadd.u64        d12,d12,d10

        vmlal.u32       q6,d28,d0[0]
         vld1.64        {q9-q10},[r6,:256]!
        vmlal.u32       q7,d28,d0[1]
        vmlal.u32       q8,d28,d1[0]
         vld1.64        {q11-q12},[r6,:256]!
        vmlal.u32       q9,d28,d1[1]

        vshl.i64        d10,d13,#16
        veor            d8,d8,d8
        vadd.u64        d10,d10,d12
         vld1.64        {q13},[r6,:128]!
        vmul.u32        d29,d10,d30

        vmlal.u32       q10,d28,d2[0]
         vld1.32        {d4-d7}, [r3]!
        vmlal.u32       q11,d28,d2[1]
        vmlal.u32       q12,d28,d3[0]
        vzip.16         d29,d8
        vmlal.u32       q13,d28,d3[1]

.LNEON_inner:
        vmlal.u32       q6,d29,d4[0]
         vld1.32        {d0-d3}, [r1]!
        vmlal.u32       q7,d29,d4[1]
         subs           r8,r8,#8
        vmlal.u32       q8,d29,d5[0]
        vmlal.u32       q9,d29,d5[1]
        vst1.64         {q6-q7}, [r7,:256]!

        vmlal.u32       q10,d29,d6[0]
         vld1.64        {q6},       [r6, :128]!
        vmlal.u32       q11,d29,d6[1]
        vst1.64         {q8-q9}, [r7,:256]!
        vmlal.u32       q12,d29,d7[0]
         vld1.64        {q7-q8}, [r6, :256]!
        vmlal.u32       q13,d29,d7[1]
        vst1.64         {q10-q11}, [r7,:256]!

        vmlal.u32       q6,d28,d0[0]
         vld1.64        {q9-q10}, [r6, :256]!
        vmlal.u32       q7,d28,d0[1]
        vst1.64         {q12-q13}, [r7,:256]!
        vmlal.u32       q8,d28,d1[0]
         vld1.64        {q11-q12}, [r6, :256]!
        vmlal.u32       q9,d28,d1[1]
         vld1.32        {d4-d7}, [r3]!

        vmlal.u32       q10,d28,d2[0]
         vld1.64        {q13},       [r6, :128]!
        vmlal.u32       q11,d28,d2[1]
        vmlal.u32       q12,d28,d3[0]
        vmlal.u32       q13,d28,d3[1]

        bne     .LNEON_inner

        vmlal.u32       q6,d29,d4[0]
        add             r6,sp,#16
        vmlal.u32       q7,d29,d4[1]
        sub             r1,r1,r5,lsl#2          @ rewind r1
        vmlal.u32       q8,d29,d5[0]
         vld1.64        {q5}, [sp,:128]
        vmlal.u32       q9,d29,d5[1]
        subs            r9,r9,#1

        vmlal.u32       q10,d29,d6[0]
        vst1.64         {q6-q7}, [r7,:256]!
        vmlal.u32       q11,d29,d6[1]
         vld1.64        {q6},       [r6, :128]!
        vshr.u64        d10,d10,#16
        vst1.64         {q8-q9}, [r7,:256]!
        vmlal.u32       q12,d29,d7[0]
         vld1.64        {q7-q8}, [r6, :256]!
        vmlal.u32       q13,d29,d7[1]

        vst1.64         {q10-q11}, [r7,:256]!
        vadd.u64        d10,d10,d11
        vst1.64         {q12-q13}, [r7,:256]!
        vshr.u64        d10,d10,#16

        bne     .LNEON_outer

        mov             r7,sp
        mov             r8,r5

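@ .LNEON_tail/.LNEON_tail2: propagate the carries accumulated in the
@ 16-bit-limb representation and store the result as ordinary 32-bit
@ words through r7; d10 holds the running carry between words.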
.LNEON_tail:
        vadd.u64        d12,d12,d10
        vld1.64         {q9-q10}, [r6, :256]!
        vshr.u64        d10,d12,#16
        vadd.u64        d13,d13,d10
        vld1.64         {q11-q12}, [r6, :256]!
        vshr.u64        d10,d13,#16
        vld1.64         {q13},       [r6, :128]!
        vzip.16         d12,d13

.LNEON_tail2:
        vadd.u64        d14,d14,d10
        vst1.32         {d12[0]}, [r7, :32]!
        vshr.u64        d10,d14,#16
        vadd.u64        d15,d15,d10
        vshr.u64        d10,d15,#16
        vzip.16         d14,d15

        vadd.u64        d16,d16,d10
        vst1.32         {d14[0]}, [r7, :32]!
        vshr.u64        d10,d16,#16
        vadd.u64        d17,d17,d10
        vshr.u64        d10,d17,#16
        vzip.16         d16,d17

        vadd.u64        d18,d18,d10
        vst1.32         {d16[0]}, [r7, :32]!
        vshr.u64        d10,d18,#16
        vadd.u64        d19,d19,d10
        vshr.u64        d10,d19,#16
        vzip.16         d18,d19

        vadd.u64        d20,d20,d10
        vst1.32         {d18[0]}, [r7, :32]!
        vshr.u64        d10,d20,#16
        vadd.u64        d21,d21,d10
        vshr.u64        d10,d21,#16
        vzip.16         d20,d21

        vadd.u64        d22,d22,d10
        vst1.32         {d20[0]}, [r7, :32]!
        vshr.u64        d10,d22,#16
        vadd.u64        d23,d23,d10
        vshr.u64        d10,d23,#16
        vzip.16         d22,d23

        vadd.u64        d24,d24,d10
        vst1.32         {d22[0]}, [r7, :32]!
        vshr.u64        d10,d24,#16
        vadd.u64        d25,d25,d10
        vld1.64         {q6}, [r6, :128]!
        vshr.u64        d10,d25,#16
        vzip.16         d24,d25

        vadd.u64        d26,d26,d10
        vst1.32         {d24[0]}, [r7, :32]!
        vshr.u64        d10,d26,#16
        vadd.u64        d27,d27,d10
        vld1.64         {q7-q8},        [r6, :256]!
        vshr.u64        d10,d27,#16
        vzip.16         d26,d27
        subs            r8,r8,#8
        vst1.32         {d26[0]}, [r7, :32]!

        bne     .LNEON_tail

        vst1.32 {d10[0]}, [r7, :32]             @ top-most bit
        sub     r3,r3,r5,lsl#2                  @ rewind r3
        subs    r1,sp,#0                        @ clear carry flag
        add     r2,sp,r5,lsl#2

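@ Final reduction, as in the ALU path: subtract np[] from the result,
@ then conditionally copy either the difference or the original value to
@ rp[] while wiping the scratch frame on the stack.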
.LNEON_sub:
        ldmia   r1!, {r4-r7}
        ldmia   r3!, {r8-r11}
        sbcs    r8, r4,r8
        sbcs    r9, r5,r9
        sbcs    r10,r6,r10
        sbcs    r11,r7,r11
        teq     r1,r2                           @ preserves carry
        stmia   r0!, {r8-r11}
        bne     .LNEON_sub

        ldr     r10, [r1]                       @ load top-most bit
        veor    q0,q0,q0
        sub     r11,r2,sp                       @ this is num*4
        veor    q1,q1,q1
        mov     r1,sp
        sub     r0,r0,r11                       @ rewind r0
        mov     r3,r2                           @ second 3/4th of frame
        sbcs    r10,r10,#0                      @ result is carry flag

.LNEON_copy_n_zap:
        ldmia   r1!, {r4-r7}
        ldmia   r0,  {r8-r11}
        movcc   r8, r4
        vst1.64 {q0-q1}, [r3,:256]!             @ wipe
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [r3,:256]!             @ wipe
        movcc   r11,r7
        ldmia   r1, {r4-r7}
        stmia   r0!, {r8-r11}
        sub     r1,r1,#16
        ldmia   r0, {r8-r11}
        movcc   r8, r4
        vst1.64 {q0-q1}, [r1,:256]!             @ wipe
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [r3,:256]!             @ wipe
        movcc   r11,r7
        teq     r1,r2                           @ preserves carry
        stmia   r0!, {r8-r11}
        bne     .LNEON_copy_n_zap

        sub     sp,ip,#96
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r11}
        bx      lr                              @ .word 0xe12fff1e
.size   bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
.asciz  "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif