2 /* Do not modify. This file is auto-generated from ghash-armv4.pl. */
@ rem_4bit: 16 x 16-bit constant table, indexed by a 4-bit nibble
@ ("rem") via ldrh in the 4-bit GHASH routines below.  The values are
@ the GF(2^128) reduction contributions for each possible 4-bit
@ remainder, pre-shifted so the code can fold them in with
@ `eor ...,lsl#16`.
@ NOTE(review): the rem_4bit label line itself falls outside this view.
13 .type rem_4bit,%object
16 .short 0x0000,0x1C20,0x3840,0x2460
17 .short 0x7080,0x6CA0,0x48C0,0x54E0
18 .short 0xE100,0xFD20,0xD940,0xC560
19 .short 0x9180,0x8DA0,0xA9C0,0xB5E0
20 .size rem_4bit,.-rem_4bit
@ rem_4bit_get: internal helper that materializes the address of the
@ rem_4bit table in r2.  The visible `sub r2,r2,#32` adjusts a
@ PC-relative anchor back to the table start.
@ NOTE(review): label and the instruction that seeds r2 are elided from
@ this view — confirm the anchor offset against the generator script.
22 .type rem_4bit_get,%function
25 sub r2,r2,#32 @ &rem_4bit
28 .size rem_4bit_get,.-rem_4bit_get
@ gcm_ghash_4bit: table-driven (4-bit) GHASH over a buffer.
@ NOTE(review): per AAPCS and the code below, r0..r3 carry the
@ arguments; r2 is the input pointer and r3 the byte count (r3 is
@ turned into an end pointer) — presumably (Xi, Htable, inp, len),
@ confirm against ghash-armv4.pl.
@ Clobbers r4-r12, lr; all callee-saved registers are stacked.
30 .global gcm_ghash_4bit
31 .type gcm_ghash_4bit,%function
34 add r3,r2,r3 @ r3 to point at the end
35 stmdb sp!,{r3-r11,lr} @ save r3/end too
@ Copy the 32-byte rem_4bit table onto the stack so the inner loop can
@ reach it with small [sp,reg] offsets instead of a far literal.
36 sub r12,r12,#48 @ &rem_4bit
38 ldmia r12,{r4-r11} @ copy rem_4bit ...
39 stmdb sp!,{r4-r11} @ ... to stack
@ --- inner per-byte loop (heavily elided in this view) ---
@ Each step looks up Htbl entries for the low/high nibble of the
@ current byte and folds the rem_4bit reduction term into the
@ accumulator via eor ...,lsl#16.
50 ldmia r7,{r4-r7} @ load Htbl[nlo]
55 ldmia r11,{r8-r11} @ load Htbl[nhi]
58 ldrh r8,[sp,r14] @ rem_4bit[rem]
76 ldmia r11,{r8-r11} @ load Htbl[nlo]
81 ldrh r8,[sp,r12] @ rem_4bit[rem]
89 eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
91 ldmia r11,{r8-r11} @ load Htbl[nhi]
104 eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
107 ldr r3,[sp,#32] @ re-load r3/end
@ The four conditional regions below perform the per-word byte-order
@ fixup of the result: a fast path on ARMv7 little-endian, a separate
@ big-endian path (swap instructions elided from this view).
110 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
113 #elif defined(__ARMEB__)
125 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
128 #elif defined(__ARMEB__)
140 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
143 #elif defined(__ARMEB__)
155 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
158 #elif defined(__ARMEB__)
@ Dual epilogue: on >=ARMv5 return by popping straight into pc; the
@ alternate path restores lr and returns with an encoded `bx lr`
@ (.word 0xe12fff1e) so ARMv4 still executes it as a valid return.
174 ldmia sp!,{r4-r11,pc}
176 ldmia sp!,{r4-r11,lr}
178 moveq pc,lr @ be binary compatible with V4, yet
179 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
181 .size gcm_ghash_4bit,.-gcm_ghash_4bit
@ gcm_gmult_4bit: single-block GHASH multiplication, Xi *= H, using the
@ same 4-bit Htbl/rem_4bit scheme as gcm_ghash_4bit but without an
@ input buffer.  Here the rem_4bit table is addressed directly through
@ r2 ([r2,r14] / [r2,r12]) rather than from a stack copy.
@ NOTE(review): presumably r0=Xi, r1=Htable per AAPCS — confirm
@ against ghash-armv4.pl; label and loop control lines are elided.
183 .global gcm_gmult_4bit
184 .type gcm_gmult_4bit,%function
186 stmdb sp!,{r4-r11,lr}
@ --- per-nibble loop (heavily elided in this view) ---
195 ldmia r7,{r4-r7} @ load Htbl[nlo]
199 and r14,r4,#0xf @ rem
200 ldmia r11,{r8-r11} @ load Htbl[nhi]
203 ldrh r8,[r2,r14] @ rem_4bit[rem]
216 and r12,r4,#0xf @ rem
219 ldmia r11,{r8-r11} @ load Htbl[nlo]
224 ldrh r8,[r2,r12] @ rem_4bit[rem]
231 and r14,r4,#0xf @ rem
232 eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
234 ldmia r11,{r8-r11} @ load Htbl[nhi]
238 ldrh r8,[r2,r14] @ rem_4bit[rem]
245 eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
@ Byte-order fixup of the four result words (ARMv7-LE fast path vs
@ big-endian path; swap instructions elided from this view).
247 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
250 #elif defined(__ARMEB__)
262 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
265 #elif defined(__ARMEB__)
277 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
280 #elif defined(__ARMEB__)
292 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
295 #elif defined(__ARMEB__)
@ Dual epilogue, same V4/Thumb-compatibility scheme as gcm_ghash_4bit.
308 ldmia sp!,{r4-r11,pc}
310 ldmia sp!,{r4-r11,lr}
312 moveq pc,lr @ be binary compatible with V4, yet
313 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
315 .size gcm_gmult_4bit,.-gcm_gmult_4bit
@ NEON implementations, assembled only when the build targets
@ ARMv7+ (the matching #endif lies outside this view).
316 #if __ARM_MAX_ARCH__>=7
@ gcm_init_neon: precompute the "twisted" hash key used by the NEON
@ gmult/ghash routines.  Loads H (from r1, post-incremented), derives
@ the reduction constant 0xc2...01 and a broadcast carry mask, then
@ XORs them in to form twisted H.
@ NOTE(review): the shift/store instructions between these lines are
@ elided; destination pointer presumably r0 — confirm upstream.
320 .global gcm_init_neon
321 .type gcm_init_neon,%function
324 vld1.64 d7,[r1,:64]! @ load H
328 vshr.u64 d16,#63 @ t0=0xc2....01
331 vshr.s8 q9,#7 @ broadcast carry bit
335 veor q3,q3,q8 @ twisted H
339 .size gcm_init_neon,.-gcm_init_neon
@ gcm_gmult_neon: single-block GHASH multiply using NEON polynomial
@ instructions.  Loads Xi from r0 and the twisted H (d26/d27, as
@ produced by gcm_init_neon) from r1, sets up byte-select masks
@ d29/d30/d31, and pre-XORs the two H halves (d28) for the Karatsuba
@ middle term.  The multiply/reduce body is elided from this view —
@ presumably it shares the vmull.p8 scheme visible in gcm_ghash_neon.
341 .global gcm_gmult_neon
342 .type gcm_gmult_neon,%function
345 vld1.64 d7,[r0,:64]! @ load Xi
347 vmov.i64 d29,#0x0000ffffffffffff
348 vldmia r1,{d26-d27} @ load twisted H
349 vmov.i64 d30,#0x00000000ffffffff
353 vmov.i64 d31,#0x000000000000ffff
354 veor d28,d26,d27 @ Karatsuba pre-processing
357 .size gcm_gmult_neon,.-gcm_gmult_neon
@ gcm_ghash_neon: NEON GHASH over a buffer (input pointer in r2,
@ post-incremented; Xi in r0; twisted H in r1).  Since ARMv7 NEON has
@ only an 8x8-bit polynomial multiply (vmull.p8), each 64x64-bit
@ carry-less multiply is built "schoolbook" style from rotated-operand
@ vmull.p8 partial products (A1*B, A*B1, ..., labelled E..K in the
@ original comments), combined Karatsuba-style, then reduced mod the
@ GHASH polynomial.  Loop control and several mask/merge instructions
@ are elided from this view.
359 .global gcm_ghash_neon
360 .type gcm_ghash_neon,%function
363 vld1.64 d1,[r0,:64]! @ load Xi
365 vmov.i64 d29,#0x0000ffffffffffff
366 vldmia r1,{d26-d27} @ load twisted H
367 vmov.i64 d30,#0x00000000ffffffff
371 vmov.i64 d31,#0x000000000000ffff
372 veor d28,d26,d27 @ Karatsuba pre-processing
375 vld1.64 d7,[r2]! @ load inp
@ --- multiply #1: low half, Xl = (Xi_lo ^ inp_lo) * H_lo (d26 x d6) ---
382 vext.8 d16, d26, d26, #1 @ A1
383 vmull.p8 q8, d16, d6 @ F = A1*B
384 vext.8 d0, d6, d6, #1 @ B1
385 vmull.p8 q0, d26, d0 @ E = A*B1
386 vext.8 d18, d26, d26, #2 @ A2
387 vmull.p8 q9, d18, d6 @ H = A2*B
388 vext.8 d22, d6, d6, #2 @ B2
389 vmull.p8 q11, d26, d22 @ G = A*B2
390 vext.8 d20, d26, d26, #3 @ A3
391 veor q8, q8, q0 @ L = E + F
392 vmull.p8 q10, d20, d6 @ J = A3*B
393 vext.8 d0, d6, d6, #3 @ B3
394 veor q9, q9, q11 @ M = G + H
395 vmull.p8 q0, d26, d0 @ I = A*B3
396 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
398 vext.8 d22, d6, d6, #4 @ B4
399 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
401 vmull.p8 q11, d26, d22 @ K = A*B4
402 veor q10, q10, q0 @ N = I + J
405 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
@ Shift each folded partial product into its byte position and sum.
407 vext.8 q8, q8, q8, #15
408 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
410 vext.8 q9, q9, q9, #14
412 vmull.p8 q0, d26, d6 @ D = A*B
413 vext.8 q11, q11, q11, #12
414 vext.8 q10, q10, q10, #13
@ --- multiply #2: Karatsuba middle term, (lo^hi of A) x (lo^hi of B)
@ (d28 x d6), same vmull.p8 schoolbook pattern ---
419 veor d6,d6,d7 @ Karatsuba pre-processing
420 vext.8 d16, d28, d28, #1 @ A1
421 vmull.p8 q8, d16, d6 @ F = A1*B
422 vext.8 d2, d6, d6, #1 @ B1
423 vmull.p8 q1, d28, d2 @ E = A*B1
424 vext.8 d18, d28, d28, #2 @ A2
425 vmull.p8 q9, d18, d6 @ H = A2*B
426 vext.8 d22, d6, d6, #2 @ B2
427 vmull.p8 q11, d28, d22 @ G = A*B2
428 vext.8 d20, d28, d28, #3 @ A3
429 veor q8, q8, q1 @ L = E + F
430 vmull.p8 q10, d20, d6 @ J = A3*B
431 vext.8 d2, d6, d6, #3 @ B3
432 veor q9, q9, q11 @ M = G + H
433 vmull.p8 q1, d28, d2 @ I = A*B3
434 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
436 vext.8 d22, d6, d6, #4 @ B4
437 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
439 vmull.p8 q11, d28, d22 @ K = A*B4
440 veor q10, q10, q1 @ N = I + J
443 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
445 vext.8 q8, q8, q8, #15
446 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
448 vext.8 q9, q9, q9, #14
450 vmull.p8 q1, d28, d6 @ D = A*B
451 vext.8 q11, q11, q11, #12
452 vext.8 q10, q10, q10, #13
@ --- multiply #3: high half, Xh = A_hi x B_hi (d27 x d7) ---
457 vext.8 d16, d27, d27, #1 @ A1
458 vmull.p8 q8, d16, d7 @ F = A1*B
459 vext.8 d4, d7, d7, #1 @ B1
460 vmull.p8 q2, d27, d4 @ E = A*B1
461 vext.8 d18, d27, d27, #2 @ A2
462 vmull.p8 q9, d18, d7 @ H = A2*B
463 vext.8 d22, d7, d7, #2 @ B2
464 vmull.p8 q11, d27, d22 @ G = A*B2
465 vext.8 d20, d27, d27, #3 @ A3
466 veor q8, q8, q2 @ L = E + F
467 vmull.p8 q10, d20, d7 @ J = A3*B
468 vext.8 d4, d7, d7, #3 @ B3
469 veor q9, q9, q11 @ M = G + H
470 vmull.p8 q2, d27, d4 @ I = A*B3
471 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
473 vext.8 d22, d7, d7, #4 @ B4
474 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
476 vmull.p8 q11, d27, d22 @ K = A*B4
477 veor q10, q10, q2 @ N = I + J
480 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
482 vext.8 q8, q8, q8, #15
483 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
485 vext.8 q9, q9, q9, #14
487 vmull.p8 q2, d27, d7 @ D = A*B
488 vext.8 q11, q11, q11, #12
489 vext.8 q10, q10, q10, #13
@ Combine the three products into the 256-bit result and reduce it
@ modulo the GHASH polynomial (two-phase shift/fold, as noted below).
494 veor q1,q1,q0 @ Karatsuba post-processing
497 veor d4,d4,d3 @ Xh|Xl - 256-bit result
499 @ equivalent of reduction_avx from ghash-x86_64.pl
500 vshl.i64 q9,q0,#57 @ 1st phase
508 vshr.u64 q10,q0,#1 @ 2nd phase
523 vst1.64 d1,[r0,:64]! @ write out Xi
527 .size gcm_ghash_neon,.-gcm_ghash_neon
529 .asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"