2 /* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
5 #if __ARM_MAX_ARCH__>=7
// gcm_init_v8: build the GHASH key table from the hash key H.
//   x1 - address of the 128-bit hash key H (read by the first ld1)
//   x0 - address of the output table; six 16-byte entries are written
//        (Htable[0], Htable[1..2], Htable[3..5]) and x0 is advanced by 48
//        by the two post-indexed stores.
// Entries written, in order: twisted H, packed Karatsuba halves for H/H^2,
// H^2, H^3, packed Karatsuba halves for H^3/H^4, H^4.
// NOTE(review): this listing shows no entry label, no instruction that
// initialises v19 before the shl below, and no return instruction;
// confirm against the generator output before relying on this excerpt.
8 .type gcm_init_v8,%function
11 ld1 {v17.2d},[x1] //load input H
// Build the reduction constant in v19 and the "twisted" form of H in v20.
13 shl v19.2d,v19.2d,#57 //0xc2.0
// v3 = H with its two 64-bit halves swapped
14 ext v3.16b,v17.16b,v17.16b,#8
15 ushr v18.2d,v19.2d,#63
17 ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
// arithmetic shift by 31 turns each 32-bit lane into all-zeros or all-ones
19 sshr v17.4s,v17.4s,#31 //broadcast carry bit
20 and v18.16b,v18.16b,v16.16b
22 ext v18.16b,v18.16b,v18.16b,#8
// keep t0 only where the broadcast mask in v17 is set
23 and v16.16b,v16.16b,v17.16b
24 orr v3.16b,v3.16b,v18.16b //H<<<=1
25 eor v20.16b,v3.16b,v16.16b //twisted H
26 st1 {v20.2d},[x0],#16 //store Htable[0]
// Square twisted H: three 64x64 carry-less multiplies (lo*lo, hi*hi and
// (lo^hi)*(lo^hi)). v16 ends up holding lo^hi of H in both halves.
29 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
30 pmull v0.1q,v20.1d,v20.1d
31 eor v16.16b,v16.16b,v20.16b
32 pmull2 v2.1q,v20.2d,v20.2d
33 pmull v1.1q,v16.1d,v16.1d
// Combine the three partial products: v0 = low, v1 = middle, v2 = high.
35 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
36 eor v18.16b,v0.16b,v2.16b
37 eor v1.16b,v1.16b,v17.16b
38 eor v1.16b,v1.16b,v18.16b
// Two-phase reduction of the product, each phase multiplying by v19.
39 pmull v18.1q,v0.1d,v19.1d //1st phase
43 eor v0.16b,v1.16b,v18.16b
45 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
46 pmull v0.1q,v0.1d,v19.1d
47 eor v18.16b,v18.16b,v2.16b
// v22 = reduced square of twisted H (used below as H^2)
48 eor v22.16b,v0.16b,v18.16b
// v17 = lo^hi of H^2; v21 packs the lo^hi values of H (from v16) and H^2.
50 ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
51 eor v17.16b,v17.16b,v22.16b
52 ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
53 st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
54 //calculate H^3 and H^4
// Two multiplications are interleaved: H*H^2 in v0/v1/v2 and H^2*H^2 in
// v5/v6/v7. v16/v17 still hold the lo^hi values of H and H^2.
55 pmull v0.1q,v20.1d, v22.1d
56 pmull v5.1q,v22.1d,v22.1d
57 pmull2 v2.1q,v20.2d, v22.2d
58 pmull2 v7.1q,v22.2d,v22.2d
59 pmull v1.1q,v16.1d,v17.1d
60 pmull v6.1q,v17.1d,v17.1d
62 ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
63 ext v17.16b,v5.16b,v7.16b,#8
64 eor v18.16b,v0.16b,v2.16b
65 eor v1.16b,v1.16b,v16.16b
66 eor v4.16b,v5.16b,v7.16b
67 eor v6.16b,v6.16b,v17.16b
68 eor v1.16b,v1.16b,v18.16b
69 pmull v18.1q,v0.1d,v19.1d //1st phase
70 eor v6.16b,v6.16b,v4.16b
71 pmull v4.1q,v5.1d,v19.1d
77 eor v0.16b,v1.16b,v18.16b
78 eor v5.16b,v6.16b,v4.16b
80 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
81 ext v4.16b,v5.16b,v5.16b,#8
82 pmull v0.1q,v0.1d,v19.1d
83 pmull v5.1q,v5.1d,v19.1d
84 eor v18.16b,v18.16b,v2.16b
85 eor v4.16b,v4.16b,v7.16b
86 eor v20.16b, v0.16b,v18.16b //H^3
87 eor v22.16b,v5.16b,v4.16b //H^4
// Same packing as for Htable[1]: v21 = lo^hi of H^3 and of H^4.
89 ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
90 ext v17.16b,v22.16b,v22.16b,#8
91 eor v16.16b,v16.16b,v20.16b
92 eor v17.16b,v17.16b,v22.16b
93 ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
94 st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
96 .size gcm_init_v8,.-gcm_init_v8
// gcm_gmult_v8: multiply Xi by the (twisted) hash key and reduce, in place.
//   x0 - address of the 16-byte Xi; loaded at the top and overwritten by
//        the final st1
//   x1 - address of the key table; its first two 16-byte entries are
//        loaded into v20 (twisted H) and v21 (used as the Karatsuba
//        lo^hi operand)
// NOTE(review): this listing shows no entry label, no instruction that
// initialises v19 before the shl below, and no return instruction;
// confirm against the generator output.
98 .type gcm_gmult_v8,%function
101 ld1 {v17.2d},[x0] //load Xi
103 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
104 shl v19.2d,v19.2d,#57
// rev64 reverses the bytes inside each 64-bit half; the ext that follows
// swaps the two halves, giving a fully byte-reversed copy of Xi in v3.
106 rev64 v17.16b,v17.16b
108 ext v3.16b,v17.16b,v17.16b,#8
// Karatsuba multiply: v0 = low product, v2 = high product, v1 = middle.
110 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
111 eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
112 pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
113 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
115 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
116 eor v18.16b,v0.16b,v2.16b
117 eor v1.16b,v1.16b,v17.16b
118 eor v1.16b,v1.16b,v18.16b
// Two-phase reduction, each phase multiplying by the constant in v19.
119 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
123 eor v0.16b,v1.16b,v18.16b
125 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
126 pmull v0.1q,v0.1d,v19.1d
127 eor v18.16b,v18.16b,v2.16b
128 eor v0.16b,v0.16b,v18.16b
// Swap the 64-bit halves back before storing.
// NOTE(review): the input path applies rev64 before its ext, but no
// matching rev64 is visible on this output path - confirm.
133 ext v0.16b,v0.16b,v0.16b,#8
134 st1 {v0.2d},[x0] //write out Xi
137 .size gcm_gmult_v8,.-gcm_gmult_v8
// gcm_ghash_v8: fold input blocks into the running hash Xi.
//   x0  - address of Xi (loaded at the start, stored by the final st1)
//   x1  - address of the key table (twisted H and following entries)
//   x2  - input pointer, advanced by the post-indexed loads
//   x3  - remaining length in bytes, per the existing comments; adjusted
//         in steps of 32
//   x12 - post-increment applied to x2: 16, or 0 once selected by csel
// The main loop consumes two 16-byte blocks per iteration; a tail section
// handles one leftover block.
// NOTE(review): this listing shows no entry label, none of the branch
// target labels (.Loop_mod2x_v8, .Lodd_tail_v8, .Ldone_v8), no
// initialisation of v19, no load of v22 (used below as H^2) and no return
// instruction; confirm against the generator output.
139 .type gcm_ghash_v8,%function
// NOTE(review): the instruction that sets the flags for this branch is not
// visible here.
143 b.hs .Lgcm_ghash_v8_4x
144 ld1 {v0.2d},[x0] //load [rotated] Xi
145 //"[rotated]" means that
146 //loaded value would have
147 //to be rotated in order to
148 //make it appear as in
149 //algorithm specification
150 subs x3,x3,#32 //see if x3 is 32 or larger
151 mov x12,#16 //x12 is used as post-
152 //increment for input pointer;
153 //as loop is modulo-scheduled
154 //x12 is zeroed just in time
155 //to preclude overstepping
156 //inp[len], which means that
157 //last block[s] are actually
158 //loaded twice, but last
159 //copy is not processed
160 ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
// the flags tested here come from the subs above (loads do not alter them)
163 csel x12,xzr,x12,eq //is it time to zero x12?
164 ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
165 ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
166 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
168 rev64 v16.16b,v16.16b
171 ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
172 b.lo .Lodd_tail_v8 //x3 was less than 32
// Prologue of the two-block loop: load I[1] and start its multiply by H.
// v4 = low product, v6 = high product; v17 = lo^hi of I[1].
173 ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
175 rev64 v17.16b,v17.16b
177 ext v7.16b,v17.16b,v17.16b,#8
178 eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
179 pmull v4.1q,v20.1d,v7.1d //H·Ii+1
180 eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
181 pmull2 v6.1q,v20.2d,v7.2d
// NOTE(review): the loop label is not visible; the loop body presumably
// starts at the ext below. Each pass multiplies (Xi^I[i]) by H^2 (v22),
// adds the H*I[i+1] products, reduces, and pre-loads the next two blocks.
186 ext v18.16b,v3.16b,v3.16b,#8
187 subs x3,x3,#32 //is there more data?
188 pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
189 csel x12,xzr,x12,lo //is it time to zero x12?
// middle product of H*I[i+1]: low half of v21 times lo^hi of I[i+1]
191 pmull v5.1q,v21.1d,v17.1d
192 eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
193 pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
194 eor v0.16b,v0.16b,v4.16b //accumulate
195 pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
196 ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
198 eor v2.16b,v2.16b,v6.16b
199 csel x12,xzr,x12,eq //is it time to zero x12?
200 eor v1.16b,v1.16b,v5.16b
202 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
203 eor v18.16b,v0.16b,v2.16b
204 eor v1.16b,v1.16b,v17.16b
205 ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
207 rev64 v16.16b,v16.16b
209 eor v1.16b,v1.16b,v18.16b
210 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
213 rev64 v17.16b,v17.16b
217 ext v7.16b,v17.16b,v17.16b,#8
218 ext v3.16b,v16.16b,v16.16b,#8
219 eor v0.16b,v1.16b,v18.16b
220 pmull v4.1q,v20.1d,v7.1d //H·Ii+1
221 eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
223 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
224 pmull v0.1q,v0.1d,v19.1d
225 eor v3.16b,v3.16b,v18.16b
226 eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
227 eor v3.16b,v3.16b,v0.16b
228 pmull2 v6.1q,v20.2d,v7.2d
// the vector instructions since the second subs do not alter the flags, so
// this branch tests the result of that subs
229 b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
// Loop exit: undo the early accumulation into v3 and rebuild Xi in v0.
231 eor v2.16b,v2.16b,v18.16b
232 ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
233 adds x3,x3,#32 //re-construct x3
234 eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
235 b.eq .Ldone_v8 //is x3 zero?
// Single remaining block: (inp^Xi) multiplied by H, then reduced.
// NOTE(review): presumably the .Lodd_tail_v8 target; label not visible.
237 ext v18.16b,v0.16b,v0.16b,#8
238 eor v3.16b,v3.16b,v0.16b //inp^=Xi
239 eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
241 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
242 eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
243 pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
244 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
246 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
247 eor v18.16b,v0.16b,v2.16b
248 eor v1.16b,v1.16b,v17.16b
249 eor v1.16b,v1.16b,v18.16b
250 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
254 eor v0.16b,v1.16b,v18.16b
256 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
257 pmull v0.1q,v0.1d,v19.1d
258 eor v18.16b,v18.16b,v2.16b
259 eor v0.16b,v0.16b,v18.16b
// Swap the 64-bit halves of the result and store it back through x0.
// NOTE(review): presumably the .Ldone_v8 target; label not visible.
265 ext v0.16b,v0.16b,v0.16b,#8
266 st1 {v0.2d},[x0] //write out Xi
269 .size gcm_ghash_v8,.-gcm_ghash_v8
// gcm_ghash_v8_4x: GHASH variant that multiplies four input blocks at a
// time by H, H^2, H^3 and H^4 and reduces the summed product once.
//   x0 - address of Xi (loaded at the start, stored by the final st1)
//   x1 - address of the key table; six 16-byte entries are loaded into
//        v20-v22 and v26-v28
//   x2 - input pointer; 64-byte groups are fetched with post-increment
// Register roles in the code below:
//   v20/v22/v26/v28 - multiplicands commented as H, H^2, H^3, H^4
//   v21/v27         - multiplicands for the Karatsuba middle products
//   v29/v30/v31     - running low / middle / high partial-product sums
//   v19             - reduction constant
// NOTE(review): this listing shows no labels, no branches, no length
// bookkeeping, no initialisation of v19 and no return instruction. The
// section boundaries described below are inferred from the data flow and
// must be confirmed against the generator output.
270 .type gcm_ghash_v8_4x,%function
274 ld1 {v0.2d},[x0] //load [rotated] Xi
275 ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
277 ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
278 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
// Fetch the first four blocks into v4..v7.
280 ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
// v23/v24/v25 = blocks v5/v6/v7 with their 64-bit halves swapped.
288 ext v25.16b,v7.16b,v7.16b,#8
289 ext v24.16b,v6.16b,v6.16b,#8
290 ext v23.16b,v5.16b,v5.16b,#8
// Products for the last three blocks of the group, summed into v29/v30/v31.
292 pmull v29.1q,v20.1d,v25.1d //H·Ii+3
293 eor v7.16b,v7.16b,v25.16b
294 pmull2 v31.1q,v20.2d,v25.2d
295 pmull v30.1q,v21.1d,v7.1d
297 pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
298 eor v6.16b,v6.16b,v24.16b
299 pmull2 v24.1q,v22.2d,v24.2d
300 pmull2 v6.1q,v21.2d,v6.2d
302 eor v29.16b,v29.16b,v16.16b
303 eor v31.16b,v31.16b,v24.16b
304 eor v30.16b,v30.16b,v6.16b
306 pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
307 eor v5.16b,v5.16b,v23.16b
308 pmull2 v23.1q,v26.2d,v23.2d
309 pmull v5.1q,v27.1d,v5.1d
311 eor v29.16b,v29.16b,v7.16b
312 eor v31.16b,v31.16b,v23.16b
313 eor v30.16b,v30.16b,v5.16b
// NOTE(review): what follows looks like the steady-state loop body: the
// first block of the group is xored with Xi and multiplied by H^4, the
// sums are added and reduced, while the next four blocks are loaded and
// their H, H^2, H^3 products are computed in between.
322 eor v16.16b,v4.16b,v0.16b
323 ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
324 ext v3.16b,v16.16b,v16.16b,#8
332 pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
333 eor v16.16b,v16.16b,v3.16b
334 pmull2 v2.1q,v28.2d,v3.2d
335 ext v25.16b,v7.16b,v7.16b,#8
336 pmull2 v1.1q,v27.2d,v16.2d
338 eor v0.16b,v0.16b,v29.16b
339 eor v2.16b,v2.16b,v31.16b
340 ext v24.16b,v6.16b,v6.16b,#8
341 eor v1.16b,v1.16b,v30.16b
342 ext v23.16b,v5.16b,v5.16b,#8
344 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
345 eor v18.16b,v0.16b,v2.16b
346 pmull v29.1q,v20.1d,v25.1d //H·Ii+3
347 eor v7.16b,v7.16b,v25.16b
348 eor v1.16b,v1.16b,v17.16b
349 pmull2 v31.1q,v20.2d,v25.2d
350 eor v1.16b,v1.16b,v18.16b
351 pmull v30.1q,v21.1d,v7.1d
353 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
356 pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
357 eor v6.16b,v6.16b,v24.16b
358 pmull2 v24.1q,v22.2d,v24.2d
359 eor v0.16b,v1.16b,v18.16b
360 pmull2 v6.1q,v21.2d,v6.2d
362 eor v29.16b,v29.16b,v16.16b
363 eor v31.16b,v31.16b,v24.16b
364 eor v30.16b,v30.16b,v6.16b
366 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
367 pmull v0.1q,v0.1d,v19.1d
368 pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
369 eor v5.16b,v5.16b,v23.16b
370 eor v18.16b,v18.16b,v2.16b
371 pmull2 v23.1q,v26.2d,v23.2d
372 pmull v5.1q,v27.1d,v5.1d
374 eor v0.16b,v0.16b,v18.16b
375 eor v29.16b,v29.16b,v7.16b
376 eor v31.16b,v31.16b,v23.16b
377 ext v0.16b,v0.16b,v0.16b,#8
378 eor v30.16b,v30.16b,v5.16b
// NOTE(review): appears to be the tail for the final full group of four:
// the H^4 product is formed and combined with v29/v30/v31, and no further
// 64-byte load is issued.
384 eor v16.16b,v4.16b,v0.16b
385 ext v3.16b,v16.16b,v16.16b,#8
387 pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
388 eor v16.16b,v16.16b,v3.16b
389 pmull2 v2.1q,v28.2d,v3.2d
390 pmull2 v1.1q,v27.2d,v16.2d
392 eor v0.16b,v0.16b,v29.16b
393 eor v2.16b,v2.16b,v31.16b
394 eor v1.16b,v1.16b,v30.16b
// NOTE(review): appears to be the three-blocks-remaining path: reduce the
// pending product, load three blocks, and multiply them by H^3, H^2 and H.
403 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
404 eor v18.16b,v0.16b,v2.16b
405 eor v1.16b,v1.16b,v17.16b
406 ld1 {v4.2d,v5.2d,v6.2d},[x2]
407 eor v1.16b,v1.16b,v18.16b
414 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
417 ext v24.16b,v6.16b,v6.16b,#8
418 ext v23.16b,v5.16b,v5.16b,#8
419 eor v0.16b,v1.16b,v18.16b
421 pmull v29.1q,v20.1d,v24.1d //H·Ii+2
422 eor v6.16b,v6.16b,v24.16b
424 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
425 pmull v0.1q,v0.1d,v19.1d
426 eor v18.16b,v18.16b,v2.16b
427 pmull2 v31.1q,v20.2d,v24.2d
428 pmull v30.1q,v21.1d,v6.1d
429 eor v0.16b,v0.16b,v18.16b
430 pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
431 eor v5.16b,v5.16b,v23.16b
432 ext v0.16b,v0.16b,v0.16b,#8
434 pmull2 v23.1q,v22.2d,v23.2d
435 eor v16.16b,v4.16b,v0.16b
436 pmull2 v5.1q,v21.2d,v5.2d
437 ext v3.16b,v16.16b,v16.16b,#8
439 eor v29.16b,v29.16b,v7.16b
440 eor v31.16b,v31.16b,v23.16b
441 eor v30.16b,v30.16b,v5.16b
443 pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
444 eor v16.16b,v16.16b,v3.16b
445 pmull2 v2.1q,v26.2d,v3.2d
446 pmull v1.1q,v27.1d,v16.1d
448 eor v0.16b,v0.16b,v29.16b
449 eor v2.16b,v2.16b,v31.16b
450 eor v1.16b,v1.16b,v30.16b
// NOTE(review): appears to be the two-blocks-remaining path: reduce the
// pending product, load two blocks, and multiply them by H^2 and H.
455 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
456 eor v18.16b,v0.16b,v2.16b
457 eor v1.16b,v1.16b,v17.16b
458 ld1 {v4.2d,v5.2d},[x2]
459 eor v1.16b,v1.16b,v18.16b
465 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
468 ext v23.16b,v5.16b,v5.16b,#8
469 eor v0.16b,v1.16b,v18.16b
471 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
472 pmull v0.1q,v0.1d,v19.1d
473 eor v18.16b,v18.16b,v2.16b
474 eor v0.16b,v0.16b,v18.16b
475 ext v0.16b,v0.16b,v0.16b,#8
477 pmull v29.1q,v20.1d,v23.1d //H·Ii+1
478 eor v5.16b,v5.16b,v23.16b
480 eor v16.16b,v4.16b,v0.16b
481 ext v3.16b,v16.16b,v16.16b,#8
483 pmull2 v31.1q,v20.2d,v23.2d
484 pmull v30.1q,v21.1d,v5.1d
486 pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
487 eor v16.16b,v16.16b,v3.16b
488 pmull2 v2.1q,v22.2d,v3.2d
489 pmull2 v1.1q,v21.2d,v16.2d
491 eor v0.16b,v0.16b,v29.16b
492 eor v2.16b,v2.16b,v31.16b
493 eor v1.16b,v1.16b,v30.16b
// NOTE(review): appears to be the one-block-remaining path: reduce the
// pending product, then multiply (Xi ^ block) by H alone. No load of v4 is
// visible in this section - confirm that one exists in the real file.
498 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
499 eor v18.16b,v0.16b,v2.16b
500 eor v1.16b,v1.16b,v17.16b
502 eor v1.16b,v1.16b,v18.16b
507 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
510 eor v0.16b,v1.16b,v18.16b
512 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
513 pmull v0.1q,v0.1d,v19.1d
514 eor v18.16b,v18.16b,v2.16b
515 eor v0.16b,v0.16b,v18.16b
516 ext v0.16b,v0.16b,v0.16b,#8
518 eor v16.16b,v4.16b,v0.16b
519 ext v3.16b,v16.16b,v16.16b,#8
521 pmull v0.1q,v20.1d,v3.1d
522 eor v16.16b,v16.16b,v3.16b
523 pmull2 v2.1q,v20.2d,v3.2d
524 pmull v1.1q,v21.1d,v16.1d
// NOTE(review): appears to be the common exit: final Karatsuba combination
// and two-phase reduction, then the halves are swapped and Xi is stored.
527 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
528 eor v18.16b,v0.16b,v2.16b
529 eor v1.16b,v1.16b,v17.16b
530 eor v1.16b,v1.16b,v18.16b
532 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
535 eor v0.16b,v1.16b,v18.16b
537 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
538 pmull v0.1q,v0.1d,v19.1d
539 eor v18.16b,v18.16b,v2.16b
540 eor v0.16b,v0.16b,v18.16b
541 ext v0.16b,v0.16b,v0.16b,#8
546 st1 {v0.2d},[x0] //write out Xi
549 .size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
550 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0