]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - secure/lib/libcrypto/aarch64/ghashv8-armx.S
caroot: cumulative cert update
[FreeBSD/FreeBSD.git] / secure / lib / libcrypto / aarch64 / ghashv8-armx.S
1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
3 #include "arm_arch.h"
4
5 #if __ARM_MAX_ARCH__>=7
6 .text
// gcm_init_v8: precompute the table of hash-key powers that the GHASH
// routines in this file load from x1.
//
// In:   x0 = Htable, destination; 96 bytes (six 128-bit entries) are stored
//       x1 = H, 16-byte input key, loaded as two 64-bit lanes
// Out:  Htable[0] = twisted H
//       Htable[1] = Karatsuba terms packed as {H.lo^H.hi, H^2.lo^H^2.hi}
//       Htable[2] = H^2
//       Htable[3] = H^3
//       Htable[4] = Karatsuba terms packed as {H^3.lo^H^3.hi, H^4.lo^H^4.hi}
//       Htable[5] = H^4
// Clobbers: x0 (left advanced by 48), v0-v7, v16-v22.
// The stack and the condition flags are not touched.
// NOTE(review): the C prototype is not visible in this file; the argument
// roles above are read off the loads, stores and existing comments.
7 .globl  gcm_init_v8
8 .type   gcm_init_v8,%function
9 .align  4
10 gcm_init_v8:
// Twist H: swap the two 64-bit halves, shift the 128-bit value left by one
// bit, and xor in the 0xc2....01 constant when the bit shifted out was set.
// v19 keeps 0xc200000000000000 in both lanes for the reductions that follow.
11         ld1     {v17.2d},[x1]           //load input H
12         movi    v19.16b,#0xe1
13         shl     v19.2d,v19.2d,#57               //0xc2.0
14         ext     v3.16b,v17.16b,v17.16b,#8
15         ushr    v18.2d,v19.2d,#63
16         dup     v17.4s,v17.s[1]
17         ext     v16.16b,v18.16b,v19.16b,#8              //t0=0xc2....01
18         ushr    v18.2d,v3.2d,#63
19         sshr    v17.4s,v17.4s,#31               //broadcast carry bit
20         and     v18.16b,v18.16b,v16.16b
21         shl     v3.2d,v3.2d,#1
22         ext     v18.16b,v18.16b,v18.16b,#8
23         and     v16.16b,v16.16b,v17.16b
24         orr     v3.16b,v3.16b,v18.16b           //H<<<=1
25         eor     v20.16b,v3.16b,v16.16b          //twisted H
26         st1     {v20.2d},[x0],#16               //store Htable[0]
27
28         //calculate H^2
// Square the twisted H with three 64x64 carry-less multiplies (lo, hi and
// the xor-of-halves middle term), then reduce in two pmull phases by v19.
29         ext     v16.16b,v20.16b,v20.16b,#8              //Karatsuba pre-processing
30         pmull   v0.1q,v20.1d,v20.1d
31         eor     v16.16b,v16.16b,v20.16b
32         pmull2  v2.1q,v20.2d,v20.2d
33         pmull   v1.1q,v16.1d,v16.1d
34
35         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
36         eor     v18.16b,v0.16b,v2.16b
37         eor     v1.16b,v1.16b,v17.16b
38         eor     v1.16b,v1.16b,v18.16b
39         pmull   v18.1q,v0.1d,v19.1d             //1st phase
40
41         ins     v2.d[0],v1.d[1]
42         ins     v1.d[1],v0.d[0]
43         eor     v0.16b,v1.16b,v18.16b
44
45         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase
46         pmull   v0.1q,v0.1d,v19.1d
47         eor     v18.16b,v18.16b,v2.16b
48         eor     v22.16b,v0.16b,v18.16b
49
// v16 holds H.lo^H.hi and v17 holds H^2.lo^H^2.hi (each duplicated in both
// lanes); v21 packs one copy of each so a single table entry serves both.
50         ext     v17.16b,v22.16b,v22.16b,#8              //Karatsuba pre-processing
51         eor     v17.16b,v17.16b,v22.16b
52         ext     v21.16b,v16.16b,v17.16b,#8              //pack Karatsuba pre-processed
53         st1     {v21.2d,v22.2d},[x0],#32        //store Htable[1..2]
54         //calculate H^3 and H^4
// H^3 = H * H^2 (registers v0-v2) and H^4 = H^2 * H^2 (registers v5-v7)
// are computed side by side, with the two instruction streams interleaved.
55         pmull   v0.1q,v20.1d, v22.1d
56         pmull   v5.1q,v22.1d,v22.1d
57         pmull2  v2.1q,v20.2d, v22.2d
58         pmull2  v7.1q,v22.2d,v22.2d
59         pmull   v1.1q,v16.1d,v17.1d
60         pmull   v6.1q,v17.1d,v17.1d
61
62         ext     v16.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
63         ext     v17.16b,v5.16b,v7.16b,#8
64         eor     v18.16b,v0.16b,v2.16b
65         eor     v1.16b,v1.16b,v16.16b
66         eor     v4.16b,v5.16b,v7.16b
67         eor     v6.16b,v6.16b,v17.16b
68         eor     v1.16b,v1.16b,v18.16b
69         pmull   v18.1q,v0.1d,v19.1d             //1st phase
70         eor     v6.16b,v6.16b,v4.16b
71         pmull   v4.1q,v5.1d,v19.1d
72
73         ins     v2.d[0],v1.d[1]
74         ins     v7.d[0],v6.d[1]
75         ins     v1.d[1],v0.d[0]
76         ins     v6.d[1],v5.d[0]
77         eor     v0.16b,v1.16b,v18.16b
78         eor     v5.16b,v6.16b,v4.16b
79
80         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase
81         ext     v4.16b,v5.16b,v5.16b,#8
82         pmull   v0.1q,v0.1d,v19.1d
83         pmull   v5.1q,v5.1d,v19.1d
84         eor     v18.16b,v18.16b,v2.16b
85         eor     v4.16b,v4.16b,v7.16b
86         eor     v20.16b, v0.16b,v18.16b         //H^3
87         eor     v22.16b,v5.16b,v4.16b           //H^4
88
// Same packing as for Htable[1], now for H^3 (low lane) and H^4 (high lane).
// The final store has no writeback, so x0 is left pointing at Htable[3].
89         ext     v16.16b,v20.16b, v20.16b,#8             //Karatsuba pre-processing
90         ext     v17.16b,v22.16b,v22.16b,#8
91         eor     v16.16b,v16.16b,v20.16b
92         eor     v17.16b,v17.16b,v22.16b
93         ext     v21.16b,v16.16b,v17.16b,#8              //pack Karatsuba pre-processed
94         st1     {v20.2d,v21.2d,v22.2d},[x0]             //store Htable[3..5]
95         ret
96 .size   gcm_init_v8,.-gcm_init_v8
// gcm_gmult_v8: multiply the 16-byte value Xi by the hash key and write the
// reduced product back over Xi.
//
// In:   x0 = Xi, 16 bytes; read, then overwritten with the result
//       x1 = Htable; only the first 32 bytes are read (twisted H and the
//            packed Karatsuba term that follows it)
// Clobbers: v0-v3, v17-v21. No general-purpose register is modified and
// the condition flags are not touched.
// NOTE(review): the C prototype is not visible in this file; the argument
// roles above are read off the loads, stores and existing comments.
97 .globl  gcm_gmult_v8
98 .type   gcm_gmult_v8,%function
99 .align  4
100 gcm_gmult_v8:
101         ld1     {v17.2d},[x0]           //load Xi
102         movi    v19.16b,#0xe1
103         ld1     {v20.2d,v21.2d},[x1]    //load twisted H, ...
104         shl     v19.2d,v19.2d,#57
// Unless built for big-endian (__ARMEB__), reverse the bytes inside each
// 64-bit lane; the ext that follows swaps the two lanes.
105 #ifndef __ARMEB__
106         rev64   v17.16b,v17.16b
107 #endif
108         ext     v3.16b,v17.16b,v17.16b,#8
109
// Karatsuba multiply: three 64x64 carry-less products (lo, hi, middle).
110         pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
111         eor     v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
112         pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
113         pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
114
115         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
116         eor     v18.16b,v0.16b,v2.16b
117         eor     v1.16b,v1.16b,v17.16b
118         eor     v1.16b,v1.16b,v18.16b
// Two-phase reduction of the double-width product, using the 0xc2.0
// constant held in v19.
119         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
120
121         ins     v2.d[0],v1.d[1]
122         ins     v1.d[1],v0.d[0]
123         eor     v0.16b,v1.16b,v18.16b
124
125         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
126         pmull   v0.1q,v0.1d,v19.1d
127         eor     v18.16b,v18.16b,v2.16b
128         eor     v0.16b,v0.16b,v18.16b
129
// Undo the load-time byte reversal and lane swap before storing.
130 #ifndef __ARMEB__
131         rev64   v0.16b,v0.16b
132 #endif
133         ext     v0.16b,v0.16b,v0.16b,#8
134         st1     {v0.2d},[x0]            //write out Xi
135
136         ret
137 .size   gcm_gmult_v8,.-gcm_gmult_v8
// gcm_ghash_v8: fold a run of 16-byte input blocks into the hash value Xi.
//
// In:   x0 = Xi, 16 bytes; read, then overwritten with the updated value
//       x1 = Htable; twisted H, its packed Karatsuba term and H^2 are read
//            here (48 bytes); the 4x path reads 96 bytes
//       x2 = inp, input data
//       x3 = len, number of input bytes
// Flow: len >= 64 branches to .Lgcm_ghash_v8_4x. Otherwise the code runs a
//       two-blocks-per-pass loop, followed by a one-block tail when an odd
//       block remains.
// Clobbers (this path): x1 (advanced by 32), x2, x3, x12, condition flags,
// v0-v7, v16-v22.
// NOTE(review): len is never tested for zero or for being a multiple of 16.
// With x3 < 32 the code loads one block and processes it unconditionally,
// so the caller presumably guarantees a nonzero multiple of 16 - confirm.
138 .globl  gcm_ghash_v8
139 .type   gcm_ghash_v8,%function
140 .align  4
141 gcm_ghash_v8:
142         cmp     x3,#64
143         b.hs    .Lgcm_ghash_v8_4x
144         ld1     {v0.2d},[x0]            //load [rotated] Xi
145                                                 //"[rotated]" means that
146                                                 //loaded value would have
147                                                 //to be rotated in order to
148                                                 //make it appear as in
149                                                 //algorithm specification
// The flags set by this subs are consumed twice further down: by the csel
// (eq: exactly 32 bytes) and by the b.lo to .Lodd_tail_v8 (fewer than 32).
// None of the instructions in between writes the flags.
150         subs    x3,x3,#32               //see if x3 is 32 or larger
151         mov     x12,#16         //x12 is used as post-
152                                                 //increment for input pointer;
153                                                 //as loop is modulo-scheduled
154                                                 //x12 is zeroed just in time
155                                                 //to preclude overstepping
156                                                 //inp[len], which means that
157                                                 //last block[s] are actually
158                                                 //loaded twice, but last
159                                                 //copy is not processed
160         ld1     {v20.2d,v21.2d},[x1],#32        //load twisted H, ..., H^2
161         movi    v19.16b,#0xe1
162         ld1     {v22.2d},[x1]
163         csel    x12,xzr,x12,eq                  //is it time to zero x12?
164         ext     v0.16b,v0.16b,v0.16b,#8         //rotate Xi
165         ld1     {v16.2d},[x2],#16       //load [rotated] I[0]
166         shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
167 #ifndef __ARMEB__
168         rev64   v16.16b,v16.16b
169         rev64   v0.16b,v0.16b
170 #endif
171         ext     v3.16b,v16.16b,v16.16b,#8               //rotate I[0]
172         b.lo    .Lodd_tail_v8           //x3 was less than 32
// At least two blocks: load I[1] and start its multiplication by H so the
// loop can be entered with v4/v6 (lo/hi partial products) already in flight.
173         ld1     {v17.2d},[x2],x12       //load [rotated] I[1]
174 #ifndef __ARMEB__
175         rev64   v17.16b,v17.16b
176 #endif
177         ext     v7.16b,v17.16b,v17.16b,#8
178         eor     v3.16b,v3.16b,v0.16b            //I[i]^=Xi
179         pmull   v4.1q,v20.1d,v7.1d              //H·Ii+1
180         eor     v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
181         pmull2  v6.1q,v20.2d,v7.2d
182         b       .Loop_mod2x_v8
183
// Main loop, two blocks per pass. (Xi^I[i]) is multiplied by H^2 and I[i+1]
// by H; the partial products are summed and reduced once. The next pair is
// loaded and its H product started inside the same pass, with the freshly
// reduced Xi folded into v3 early.
184 .align  4
185 .Loop_mod2x_v8:
186         ext     v18.16b,v3.16b,v3.16b,#8
187         subs    x3,x3,#32               //is there more data?
188         pmull   v0.1q,v22.1d,v3.1d              //H^2.lo·Xi.lo
189         csel    x12,xzr,x12,lo                  //is it time to zero x12?
190
191         pmull   v5.1q,v21.1d,v17.1d
192         eor     v18.16b,v18.16b,v3.16b          //Karatsuba pre-processing
193         pmull2  v2.1q,v22.2d,v3.2d              //H^2.hi·Xi.hi
194         eor     v0.16b,v0.16b,v4.16b            //accumulate
195         pmull2  v1.1q,v21.2d,v18.2d             //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
196         ld1     {v16.2d},[x2],x12       //load [rotated] I[i+2]
197
198         eor     v2.16b,v2.16b,v6.16b
199         csel    x12,xzr,x12,eq                  //is it time to zero x12?
200         eor     v1.16b,v1.16b,v5.16b
201
202         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
203         eor     v18.16b,v0.16b,v2.16b
204         eor     v1.16b,v1.16b,v17.16b
205         ld1     {v17.2d},[x2],x12       //load [rotated] I[i+3]
206 #ifndef __ARMEB__
207         rev64   v16.16b,v16.16b
208 #endif
209         eor     v1.16b,v1.16b,v18.16b
210         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
211
212 #ifndef __ARMEB__
213         rev64   v17.16b,v17.16b
214 #endif
215         ins     v2.d[0],v1.d[1]
216         ins     v1.d[1],v0.d[0]
217         ext     v7.16b,v17.16b,v17.16b,#8
218         ext     v3.16b,v16.16b,v16.16b,#8
219         eor     v0.16b,v1.16b,v18.16b
220         pmull   v4.1q,v20.1d,v7.1d              //H·Ii+1
221         eor     v3.16b,v3.16b,v2.16b            //accumulate v3.16b early
222
223         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
224         pmull   v0.1q,v0.1d,v19.1d
225         eor     v3.16b,v3.16b,v18.16b
226         eor     v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
227         eor     v3.16b,v3.16b,v0.16b
228         pmull2  v6.1q,v20.2d,v7.2d
// The branch below still tests the flags from the subs at the top of the
// loop; only vector instructions and csel/ld1 were executed in between.
229         b.hs    .Loop_mod2x_v8          //there was at least 32 more bytes
230
// Loop exit: finish the reduction into v0 and rebuild v3 from the block
// last loaded into v16, because the early accumulation above assumed
// another pass. After the adds, x3 is zero or the size of one odd block.
231         eor     v2.16b,v2.16b,v18.16b
232         ext     v3.16b,v16.16b,v16.16b,#8               //re-construct v3.16b
233         adds    x3,x3,#32               //re-construct x3
234         eor     v0.16b,v0.16b,v2.16b            //re-construct v0.16b
235         b.eq    .Ldone_v8               //is x3 zero?
// One remaining block: Xi = (Xi ^ inp) * H, followed by the reduction.
236 .Lodd_tail_v8:
237         ext     v18.16b,v0.16b,v0.16b,#8
238         eor     v3.16b,v3.16b,v0.16b            //inp^=Xi
239         eor     v17.16b,v16.16b,v18.16b         //v17.16b is rotated inp^Xi
240
241         pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
242         eor     v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
243         pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
244         pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
245
246         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
247         eor     v18.16b,v0.16b,v2.16b
248         eor     v1.16b,v1.16b,v17.16b
249         eor     v1.16b,v1.16b,v18.16b
250         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
251
252         ins     v2.d[0],v1.d[1]
253         ins     v1.d[1],v0.d[0]
254         eor     v0.16b,v1.16b,v18.16b
255
256         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
257         pmull   v0.1q,v0.1d,v19.1d
258         eor     v18.16b,v18.16b,v2.16b
259         eor     v0.16b,v0.16b,v18.16b
260
// Common exit: undo the load-time byte reversal and lane swap, store Xi.
261 .Ldone_v8:
262 #ifndef __ARMEB__
263         rev64   v0.16b,v0.16b
264 #endif
265         ext     v0.16b,v0.16b,v0.16b,#8
266         st1     {v0.2d},[x0]            //write out Xi
267
268         ret
269 .size   gcm_ghash_v8,.-gcm_ghash_v8
// gcm_ghash_v8_4x: four-blocks-per-iteration variant of gcm_ghash_v8.
//
// The symbol has no .globl directive. Within this file it is reached only
// through the branch to .Lgcm_ghash_v8_4x at the top of gcm_ghash_v8, which
// is taken when x3 >= 64, and it returns straight to that caller.
//
// In:   x0 = Xi, 16 bytes; read, then overwritten with the updated value
//       x1 = Htable; all six entries (96 bytes) are read
//       x2 = inp, input data
//       x3 = len, number of input bytes (at least 64 on entry)
// Accumulators: v29 = sum of low products, v31 = sum of high products,
//       v30 = sum of Karatsuba middle products for the three blocks that do
//       not depend on Xi. The first block of each group stays in v4 until
//       the running Xi (v0) is available to be xored into it.
// Clobbers: x1 (advanced by 48), x2, x3, condition flags, v0-v7, v16-v31.
// v8-v15 are never referenced.
// NOTE(review): as in the 2x path, len is not checked for being a multiple
// of 16; the tail dispatch only distinguishes 0, <32, ==32 and >32 leftover
// bytes - presumably the caller guarantees whole blocks, confirm.
270 .type   gcm_ghash_v8_4x,%function
271 .align  4
272 gcm_ghash_v8_4x:
273 .Lgcm_ghash_v8_4x:
274         ld1     {v0.2d},[x0]            //load [rotated] Xi
275         ld1     {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
276         movi    v19.16b,#0xe1
277         ld1     {v26.2d,v27.2d,v28.2d},[x1]     //load twisted H^3, ..., H^4
278         shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
279
// Load the first group of four blocks and start the three products that do
// not need Xi: H * I[3], H^2 * I[2] and H^3 * I[1].
280         ld1     {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
281 #ifndef __ARMEB__
282         rev64   v0.16b,v0.16b
283         rev64   v5.16b,v5.16b
284         rev64   v6.16b,v6.16b
285         rev64   v7.16b,v7.16b
286         rev64   v4.16b,v4.16b
287 #endif
288         ext     v25.16b,v7.16b,v7.16b,#8
289         ext     v24.16b,v6.16b,v6.16b,#8
290         ext     v23.16b,v5.16b,v5.16b,#8
291
292         pmull   v29.1q,v20.1d,v25.1d            //H·Ii+3
293         eor     v7.16b,v7.16b,v25.16b
294         pmull2  v31.1q,v20.2d,v25.2d
295         pmull   v30.1q,v21.1d,v7.1d
296
297         pmull   v16.1q,v22.1d,v24.1d            //H^2·Ii+2
298         eor     v6.16b,v6.16b,v24.16b
299         pmull2  v24.1q,v22.2d,v24.2d
300         pmull2  v6.1q,v21.2d,v6.2d
301
302         eor     v29.16b,v29.16b,v16.16b
303         eor     v31.16b,v31.16b,v24.16b
304         eor     v30.16b,v30.16b,v6.16b
305
306         pmull   v7.1q,v26.1d,v23.1d             //H^3·Ii+1
307         eor     v5.16b,v5.16b,v23.16b
308         pmull2  v23.1q,v26.2d,v23.2d
309         pmull   v5.1q,v27.1d,v5.1d
310
311         eor     v29.16b,v29.16b,v7.16b
312         eor     v31.16b,v31.16b,v23.16b
313         eor     v30.16b,v30.16b,v5.16b
314
// 64 bytes are already loaded. The loop is entered only when at least 64
// more are available (len >= 128), since every pass loads a full new group.
315         subs    x3,x3,#128
316         b.lo    .Ltail4x
317
318         b       .Loop4x
319
// Each pass finishes the previous group (H^4 * (Xi ^ I[0]) plus the three
// accumulated products, then reduction into v0) while loading the next four
// blocks and starting their Xi-independent products.
320 .align  4
321 .Loop4x:
322         eor     v16.16b,v4.16b,v0.16b
323         ld1     {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
324         ext     v3.16b,v16.16b,v16.16b,#8
325 #ifndef __ARMEB__
326         rev64   v5.16b,v5.16b
327         rev64   v6.16b,v6.16b
328         rev64   v7.16b,v7.16b
329         rev64   v4.16b,v4.16b
330 #endif
331
332         pmull   v0.1q,v28.1d,v3.1d              //H^4·(Xi+Ii)
333         eor     v16.16b,v16.16b,v3.16b
334         pmull2  v2.1q,v28.2d,v3.2d
335         ext     v25.16b,v7.16b,v7.16b,#8
336         pmull2  v1.1q,v27.2d,v16.2d
337
338         eor     v0.16b,v0.16b,v29.16b
339         eor     v2.16b,v2.16b,v31.16b
340         ext     v24.16b,v6.16b,v6.16b,#8
341         eor     v1.16b,v1.16b,v30.16b
342         ext     v23.16b,v5.16b,v5.16b,#8
343
344         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
345         eor     v18.16b,v0.16b,v2.16b
346         pmull   v29.1q,v20.1d,v25.1d            //H·Ii+3
347         eor     v7.16b,v7.16b,v25.16b
348         eor     v1.16b,v1.16b,v17.16b
349         pmull2  v31.1q,v20.2d,v25.2d
350         eor     v1.16b,v1.16b,v18.16b
351         pmull   v30.1q,v21.1d,v7.1d
352
353         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
354         ins     v2.d[0],v1.d[1]
355         ins     v1.d[1],v0.d[0]
356         pmull   v16.1q,v22.1d,v24.1d            //H^2·Ii+2
357         eor     v6.16b,v6.16b,v24.16b
358         pmull2  v24.1q,v22.2d,v24.2d
359         eor     v0.16b,v1.16b,v18.16b
360         pmull2  v6.1q,v21.2d,v6.2d
361
362         eor     v29.16b,v29.16b,v16.16b
363         eor     v31.16b,v31.16b,v24.16b
364         eor     v30.16b,v30.16b,v6.16b
365
366         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
367         pmull   v0.1q,v0.1d,v19.1d
368         pmull   v7.1q,v26.1d,v23.1d             //H^3·Ii+1
369         eor     v5.16b,v5.16b,v23.16b
370         eor     v18.16b,v18.16b,v2.16b
371         pmull2  v23.1q,v26.2d,v23.2d
372         pmull   v5.1q,v27.1d,v5.1d
373
374         eor     v0.16b,v0.16b,v18.16b
375         eor     v29.16b,v29.16b,v7.16b
376         eor     v31.16b,v31.16b,v23.16b
377         ext     v0.16b,v0.16b,v0.16b,#8
378         eor     v30.16b,v30.16b,v5.16b
379
380         subs    x3,x3,#64
381         b.hs    .Loop4x
382
// Last full group: form the complete unreduced product in v0/v1/v2. After
// the adds, x3 holds the number of leftover bytes. Zero goes straight to
// the final reduction; otherwise .Lone/.Ltwo/.Lthree reduce first and then
// fold in the remaining blocks.
383 .Ltail4x:
384         eor     v16.16b,v4.16b,v0.16b
385         ext     v3.16b,v16.16b,v16.16b,#8
386
387         pmull   v0.1q,v28.1d,v3.1d              //H^4·(Xi+Ii)
388         eor     v16.16b,v16.16b,v3.16b
389         pmull2  v2.1q,v28.2d,v3.2d
390         pmull2  v1.1q,v27.2d,v16.2d
391
392         eor     v0.16b,v0.16b,v29.16b
393         eor     v2.16b,v2.16b,v31.16b
394         eor     v1.16b,v1.16b,v30.16b
395
396         adds    x3,x3,#64
397         b.eq    .Ldone4x
398
399         cmp     x3,#32
400         b.lo    .Lone
401         b.eq    .Ltwo
// More than 32 bytes left: three blocks are loaded and combined as
// H^3 * (Xi ^ I[0]) + H^2 * I[1] + H * I[2]; reduction is left to .Ldone4x.
402 .Lthree:
403         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
404         eor     v18.16b,v0.16b,v2.16b
405         eor     v1.16b,v1.16b,v17.16b
406         ld1     {v4.2d,v5.2d,v6.2d},[x2]
407         eor     v1.16b,v1.16b,v18.16b
408 #ifndef __ARMEB__
409         rev64   v5.16b,v5.16b
410         rev64   v6.16b,v6.16b
411         rev64   v4.16b,v4.16b
412 #endif
413
414         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
415         ins     v2.d[0],v1.d[1]
416         ins     v1.d[1],v0.d[0]
417         ext     v24.16b,v6.16b,v6.16b,#8
418         ext     v23.16b,v5.16b,v5.16b,#8
419         eor     v0.16b,v1.16b,v18.16b
420
421         pmull   v29.1q,v20.1d,v24.1d            //H·Ii+2
422         eor     v6.16b,v6.16b,v24.16b
423
424         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
425         pmull   v0.1q,v0.1d,v19.1d
426         eor     v18.16b,v18.16b,v2.16b
427         pmull2  v31.1q,v20.2d,v24.2d
428         pmull   v30.1q,v21.1d,v6.1d
429         eor     v0.16b,v0.16b,v18.16b
430         pmull   v7.1q,v22.1d,v23.1d             //H^2·Ii+1
431         eor     v5.16b,v5.16b,v23.16b
432         ext     v0.16b,v0.16b,v0.16b,#8
433
434         pmull2  v23.1q,v22.2d,v23.2d
435         eor     v16.16b,v4.16b,v0.16b
436         pmull2  v5.1q,v21.2d,v5.2d
437         ext     v3.16b,v16.16b,v16.16b,#8
438
439         eor     v29.16b,v29.16b,v7.16b
440         eor     v31.16b,v31.16b,v23.16b
441         eor     v30.16b,v30.16b,v5.16b
442
443         pmull   v0.1q,v26.1d,v3.1d              //H^3·(Xi+Ii)
444         eor     v16.16b,v16.16b,v3.16b
445         pmull2  v2.1q,v26.2d,v3.2d
446         pmull   v1.1q,v27.1d,v16.1d
447
448         eor     v0.16b,v0.16b,v29.16b
449         eor     v2.16b,v2.16b,v31.16b
450         eor     v1.16b,v1.16b,v30.16b
451         b       .Ldone4x
452
// Exactly 32 bytes left: two blocks are loaded and combined as
// H^2 * (Xi ^ I[0]) + H * I[1]; reduction is left to .Ldone4x.
453 .align  4
454 .Ltwo:
455         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
456         eor     v18.16b,v0.16b,v2.16b
457         eor     v1.16b,v1.16b,v17.16b
458         ld1     {v4.2d,v5.2d},[x2]
459         eor     v1.16b,v1.16b,v18.16b
460 #ifndef __ARMEB__
461         rev64   v5.16b,v5.16b
462         rev64   v4.16b,v4.16b
463 #endif
464
465         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
466         ins     v2.d[0],v1.d[1]
467         ins     v1.d[1],v0.d[0]
468         ext     v23.16b,v5.16b,v5.16b,#8
469         eor     v0.16b,v1.16b,v18.16b
470
471         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
472         pmull   v0.1q,v0.1d,v19.1d
473         eor     v18.16b,v18.16b,v2.16b
474         eor     v0.16b,v0.16b,v18.16b
475         ext     v0.16b,v0.16b,v0.16b,#8
476
477         pmull   v29.1q,v20.1d,v23.1d            //H·Ii+1
478         eor     v5.16b,v5.16b,v23.16b
479
480         eor     v16.16b,v4.16b,v0.16b
481         ext     v3.16b,v16.16b,v16.16b,#8
482
483         pmull2  v31.1q,v20.2d,v23.2d
484         pmull   v30.1q,v21.1d,v5.1d
485
486         pmull   v0.1q,v22.1d,v3.1d              //H^2·(Xi+Ii)
487         eor     v16.16b,v16.16b,v3.16b
488         pmull2  v2.1q,v22.2d,v3.2d
489         pmull2  v1.1q,v21.2d,v16.2d
490
491         eor     v0.16b,v0.16b,v29.16b
492         eor     v2.16b,v2.16b,v31.16b
493         eor     v1.16b,v1.16b,v30.16b
494         b       .Ldone4x
495
// Fewer than 32 bytes left: one block is loaded and multiplied as
// H * (Xi ^ I[0]); execution falls through into .Ldone4x for the reduction.
496 .align  4
497 .Lone:
498         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
499         eor     v18.16b,v0.16b,v2.16b
500         eor     v1.16b,v1.16b,v17.16b
501         ld1     {v4.2d},[x2]
502         eor     v1.16b,v1.16b,v18.16b
503 #ifndef __ARMEB__
504         rev64   v4.16b,v4.16b
505 #endif
506
507         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
508         ins     v2.d[0],v1.d[1]
509         ins     v1.d[1],v0.d[0]
510         eor     v0.16b,v1.16b,v18.16b
511
512         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
513         pmull   v0.1q,v0.1d,v19.1d
514         eor     v18.16b,v18.16b,v2.16b
515         eor     v0.16b,v0.16b,v18.16b
516         ext     v0.16b,v0.16b,v0.16b,#8
517
518         eor     v16.16b,v4.16b,v0.16b
519         ext     v3.16b,v16.16b,v16.16b,#8
520
521         pmull   v0.1q,v20.1d,v3.1d
522         eor     v16.16b,v16.16b,v3.16b
523         pmull2  v2.1q,v20.2d,v3.2d
524         pmull   v1.1q,v21.1d,v16.1d
525
// Final step for every path: Karatsuba recombination of v0/v1/v2, two-phase
// reduction, then undo the lane swap and byte reversal and store Xi.
526 .Ldone4x:
527         ext     v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
528         eor     v18.16b,v0.16b,v2.16b
529         eor     v1.16b,v1.16b,v17.16b
530         eor     v1.16b,v1.16b,v18.16b
531
532         pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
533         ins     v2.d[0],v1.d[1]
534         ins     v1.d[1],v0.d[0]
535         eor     v0.16b,v1.16b,v18.16b
536
537         ext     v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
538         pmull   v0.1q,v0.1d,v19.1d
539         eor     v18.16b,v18.16b,v2.16b
540         eor     v0.16b,v0.16b,v18.16b
541         ext     v0.16b,v0.16b,v0.16b,#8
542
543 #ifndef __ARMEB__
544         rev64   v0.16b,v0.16b
545 #endif
546         st1     {v0.2d},[x0]            //write out Xi
547
548         ret
549 .size   gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// NUL-terminated ASCII identification string embedded after the code:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
550 .byte   71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
551 .align  2
552 .align  2
553 #endif