/*
 * Repository-viewer header (scrape residue), preserved as a comment:
 * FreeBSD/FreeBSD.git — secure/lib/libcrypto/aarch64/ghashv8-armx.S
 * MFV r323105 (partial): 8300 fix man page issues found by mandoc 1.14.1
 */
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
3 #include "arm_arch.h"
4
5 .text
//-----------------------------------------------------------------------
// gcm_init_v8 — precompute the GHASH key schedule for the ARMv8
// Crypto-Extension (PMULL) path.
//   In:  x1 = hash key H (one 128-bit block; "load input H" below)
//   Out: x0 = Htable; stores Htable[0] = twisted H, then Htable[1..2] =
//        packed Karatsuba pre-processed halves and H^2 (see stores below).
// Uses only v0-v3 and v16-v22; no flag-setting instructions, no stack.
// NOTE(review): per the upstream OpenSSL generator this corresponds to
// gcm_init_v8(Htable, H) — confirm exact C prototype against gcm128.c.
//-----------------------------------------------------------------------
6 .global gcm_init_v8
7 .type   gcm_init_v8,%function
8 .align  4
9 gcm_init_v8:
10         ld1             {v17.2d},[x1]           //load input H
11         movi            v19.16b,#0xe1
12         shl     v19.2d,v19.2d,#57               //0xc2.0 (reduction constant)
13         ext             v3.16b,v17.16b,v17.16b,#8
14         ushr    v18.2d,v19.2d,#63
15         dup             v17.4s,v17.s[1]
16         ext             v16.16b,v18.16b,v19.16b,#8              //t0=0xc2....01
17         ushr    v18.2d,v3.2d,#63
18         sshr    v17.4s,v17.4s,#31               //broadcast carry bit
19         and             v18.16b,v18.16b,v16.16b
20         shl     v3.2d,v3.2d,#1
21         ext             v18.16b,v18.16b,v18.16b,#8
22         and             v16.16b,v16.16b,v17.16b
23         orr             v3.16b,v3.16b,v18.16b           //H<<<=1
24         eor             v20.16b,v3.16b,v16.16b          //twisted H
25         st1             {v20.2d},[x0],#16               //store Htable[0]
26
27         //calculate H^2
           //square twisted H with Karatsuba: three PMULLs instead of four
28         ext             v16.16b,v20.16b,v20.16b,#8              //Karatsuba pre-processing
29         pmull   v0.1q,v20.1d,v20.1d
30         eor             v16.16b,v16.16b,v20.16b
31         pmull2  v2.1q,v20.2d,v20.2d
32         pmull   v1.1q,v16.1d,v16.1d
33
34         ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
35         eor             v18.16b,v0.16b,v2.16b
36         eor             v1.16b,v1.16b,v17.16b
37         eor             v1.16b,v1.16b,v18.16b
38         pmull   v18.1q,v0.1d,v19.1d             //1st phase
39
40         ins     v2.d[0],v1.d[1]
41         ins     v1.d[1],v0.d[0]
42         eor             v0.16b,v1.16b,v18.16b
43
44         ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase
45         pmull   v0.1q,v0.1d,v19.1d
46         eor             v18.16b,v18.16b,v2.16b
47         eor             v22.16b,v0.16b,v18.16b          //v22 = H^2
48
49         ext             v17.16b,v22.16b,v22.16b,#8              //Karatsuba pre-processing
50         eor             v17.16b,v17.16b,v22.16b
51         ext             v21.16b,v16.16b,v17.16b,#8              //pack Karatsuba pre-processed
52         st1             {v21.2d-v22.2d},[x0]            //store Htable[1..2]
53
54         ret
55 .size   gcm_init_v8,.-gcm_init_v8
//-----------------------------------------------------------------------
// gcm_gmult_v8 — one GHASH multiplication: Xi = (Xi * H) reduced modulo
// the GCM polynomial, using the twisted-H/Karatsuba representation
// produced by gcm_init_v8.
//   In/Out: x0 = Xi (one 128-bit block, read and written back)
//   In:     x1 = Htable ("load twisted H, ..." below: v20 = twisted H,
//           v21 = packed Karatsuba pre-processed halves)
// On little-endian (__ARMEB__ not defined) Xi is byte-reversed on load
// and again before the store. Uses v0-v3, v16-v21; no stack.
//-----------------------------------------------------------------------
56 .global gcm_gmult_v8
57 .type   gcm_gmult_v8,%function
58 .align  4
59 gcm_gmult_v8:
60         ld1             {v17.2d},[x0]           //load Xi
61         movi            v19.16b,#0xe1
62         ld1             {v20.2d-v21.2d},[x1]    //load twisted H, ...
63         shl     v19.2d,v19.2d,#57               //compose 0xc2.0 reduction constant
64 #ifndef __ARMEB__
65         rev64   v17.16b,v17.16b
66 #endif
67         ext             v3.16b,v17.16b,v17.16b,#8
68
           //Karatsuba: three 64x64 carryless multiplies form the 256-bit product
69         pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
70         eor             v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
71         pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
72         pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
73
74         ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
75         eor             v18.16b,v0.16b,v2.16b
76         eor             v1.16b,v1.16b,v17.16b
77         eor             v1.16b,v1.16b,v18.16b
78         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
79
80         ins     v2.d[0],v1.d[1]
81         ins     v1.d[1],v0.d[0]
82         eor             v0.16b,v1.16b,v18.16b
83
84         ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
85         pmull   v0.1q,v0.1d,v19.1d
86         eor             v18.16b,v18.16b,v2.16b
87         eor             v0.16b,v0.16b,v18.16b
88
89 #ifndef __ARMEB__
90         rev64   v0.16b,v0.16b
91 #endif
92         ext             v0.16b,v0.16b,v0.16b,#8
93         st1             {v0.2d},[x0]            //write out Xi
94
95         ret
96 .size   gcm_gmult_v8,.-gcm_gmult_v8
//-----------------------------------------------------------------------
// gcm_ghash_v8 — fold a buffer of input blocks into the GHASH state.
//   In/Out: x0 = Xi (128-bit hash state, read and written back)
//   In:     x1 = Htable (twisted H, Karatsuba halves, H^2 — loaded below)
//           x2 = inp (input blocks), x3 = len in bytes
// Main loop is modulo-scheduled and consumes TWO 16-byte blocks per
// iteration using H^2 (even block) and H (odd block); a single trailing
// block falls through to .Lodd_tail_v8. Interleaved instructions of the
// "next" iteration are indented one extra space — preserve that schedule.
// NOTE(review): comments below imply len is a multiple of 16; confirm
// against the caller's contract before relying on it.
// Uses v0-v7, v16-v22 and x12; sets flags (subs/adds); no stack.
//-----------------------------------------------------------------------
97 .global gcm_ghash_v8
98 .type   gcm_ghash_v8,%function
99 .align  4
100 gcm_ghash_v8:
101         ld1             {v0.2d},[x0]            //load [rotated] Xi
102                                                 //"[rotated]" means that
103                                                 //loaded value would have
104                                                 //to be rotated in order to
105                                                 //make it appear as in
106                                                 //algorithm specification
107         subs            x3,x3,#32               //see if x3 is 32 or larger
108         mov             x12,#16         //x12 is used as post-
109                                                 //increment for input pointer;
110                                                 //as loop is modulo-scheduled
111                                                 //x12 is zeroed just in time
112                                                 //to preclude overstepping
113                                                 //inp[len], which means that
114                                                 //last block[s] are actually
115                                                 //loaded twice, but last
116                                                 //copy is not processed
117         ld1             {v20.2d-v21.2d},[x1],#32        //load twisted H, ..., H^2
118         movi            v19.16b,#0xe1
119         ld1             {v22.2d},[x1]
120         csel    x12,xzr,x12,eq                  //is it time to zero x12?
121         ext             v0.16b,v0.16b,v0.16b,#8         //rotate Xi
122         ld1             {v16.2d},[x2],#16       //load [rotated] I[0]
123         shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
124 #ifndef __ARMEB__
125         rev64   v16.16b,v16.16b
126         rev64   v0.16b,v0.16b
127 #endif
128         ext             v3.16b,v16.16b,v16.16b,#8               //rotate I[0]
129         b.lo            .Lodd_tail_v8           //x3 was less than 32
            //prologue of the software pipeline: start H·I[1] before the loop
130         ld1             {v17.2d},[x2],x12       //load [rotated] I[1]
131 #ifndef __ARMEB__
132         rev64   v17.16b,v17.16b
133 #endif
134         ext             v7.16b,v17.16b,v17.16b,#8
135         eor             v3.16b,v3.16b,v0.16b            //I[i]^=Xi
136         pmull   v4.1q,v20.1d,v7.1d              //H·Ii+1
137         eor             v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
138         pmull2  v6.1q,v20.2d,v7.2d
139         b               .Loop_mod2x_v8
140
141 .align  4
           //steady state: (Xi^I[i])·H^2 + I[i+1]·H, reduced once per pair
142 .Loop_mod2x_v8:
143         ext             v18.16b,v3.16b,v3.16b,#8
144         subs            x3,x3,#32               //is there more data?
145         pmull   v0.1q,v22.1d,v3.1d              //H^2.lo·Xi.lo
146         csel    x12,xzr,x12,lo                  //is it time to zero x12?
147
148          pmull  v5.1q,v21.1d,v17.1d
149         eor             v18.16b,v18.16b,v3.16b          //Karatsuba pre-processing
150         pmull2  v2.1q,v22.2d,v3.2d              //H^2.hi·Xi.hi
151         eor             v0.16b,v0.16b,v4.16b            //accumulate
152         pmull2  v1.1q,v21.2d,v18.2d             //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
153          ld1    {v16.2d},[x2],x12       //load [rotated] I[i+2]
154
155         eor             v2.16b,v2.16b,v6.16b
156          csel   x12,xzr,x12,eq                  //is it time to zero x12?
157         eor             v1.16b,v1.16b,v5.16b
158
159         ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
160         eor             v18.16b,v0.16b,v2.16b
161         eor             v1.16b,v1.16b,v17.16b
162          ld1    {v17.2d},[x2],x12       //load [rotated] I[i+3]
163 #ifndef __ARMEB__
164          rev64  v16.16b,v16.16b
165 #endif
166         eor             v1.16b,v1.16b,v18.16b
167         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
168
169 #ifndef __ARMEB__
170          rev64  v17.16b,v17.16b
171 #endif
172         ins     v2.d[0],v1.d[1]
173         ins     v1.d[1],v0.d[0]
174          ext            v7.16b,v17.16b,v17.16b,#8
175          ext            v3.16b,v16.16b,v16.16b,#8
176         eor             v0.16b,v1.16b,v18.16b
177          pmull  v4.1q,v20.1d,v7.1d              //H·Ii+1
178         eor             v3.16b,v3.16b,v2.16b            //accumulate v3.16b early
179
180         ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
181         pmull   v0.1q,v0.1d,v19.1d
182         eor             v3.16b,v3.16b,v18.16b
183          eor            v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
184         eor             v3.16b,v3.16b,v0.16b
185          pmull2 v6.1q,v20.2d,v7.2d
186         b.hs            .Loop_mod2x_v8          //there was at least 32 more bytes
187
            //epilogue: undo the speculative "accumulate early" folding
188         eor             v2.16b,v2.16b,v18.16b
189         ext             v3.16b,v16.16b,v16.16b,#8               //re-construct v3.16b
190         adds            x3,x3,#32               //re-construct x3
191         eor             v0.16b,v0.16b,v2.16b            //re-construct v0.16b
192         b.eq            .Ldone_v8               //is x3 zero?
            //one leftover 16-byte block: plain (Xi^inp)·H multiply-reduce
193 .Lodd_tail_v8:
194         ext             v18.16b,v0.16b,v0.16b,#8
195         eor             v3.16b,v3.16b,v0.16b            //inp^=Xi
196         eor             v17.16b,v16.16b,v18.16b         //v17.16b is rotated inp^Xi
197
198         pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
199         eor             v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
200         pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
201         pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
202
203         ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
204         eor             v18.16b,v0.16b,v2.16b
205         eor             v1.16b,v1.16b,v17.16b
206         eor             v1.16b,v1.16b,v18.16b
207         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
208
209         ins     v2.d[0],v1.d[1]
210         ins     v1.d[1],v0.d[0]
211         eor             v0.16b,v1.16b,v18.16b
212
213         ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
214         pmull   v0.1q,v0.1d,v19.1d
215         eor             v18.16b,v18.16b,v2.16b
216         eor             v0.16b,v0.16b,v18.16b
217
218 .Ldone_v8:
219 #ifndef __ARMEB__
220         rev64   v0.16b,v0.16b
221 #endif
222         ext             v0.16b,v0.16b,v0.16b,#8
223         st1             {v0.2d},[x0]            //write out Xi
224
225         ret
226 .size   gcm_ghash_v8,.-gcm_ghash_v8
227 .asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
228 .align  2