2 # Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
14 ## By Mike Hamburg (Stanford University), 2009
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
20 ######################################################################
23 # Interface to OpenSSL as an "almost" drop-in replacement for
24 # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
25 # doesn't handle partial vectors (doesn't have to if called from
26 # EVP only). "Drop-in" implies that this module doesn't share its key
27 # schedule structure with the original, nor does it make assumptions
28 # about its alignment...
30 # Performance summary. aes-x86_64.pl column lists large-block CBC
31 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
32 # byte processed with 128-bit key, and vpaes-x86_64.pl column -
33 # [also large-block CBC] encrypt/decrypt.
35 # aes-x86_64.pl vpaes-x86_64.pl
37 # Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
38 # Nehalem 29.6/40.3/14.6 10.0/11.8
39 # Atom 57.3/74.2/32.1 60.9/77.2(***)
40 # Silvermont 52.7/64.0/19.5 48.8/60.8(***)
41 # Goldmont 38.9/49.0/17.8 10.6/12.6
43 # (*) "Hyper-threading" in this context refers to cache shared
44 # among multiple cores rather than specifically to Intel HTT. As the
45 # vast majority of contemporary cores share cache, the slower code
46 # path is commonplace. In other words, "with-hyper-threading-off"
47 # results are presented mostly for reference purposes.
49 # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
51 # (***) Less impressive improvement on Core 2 and Atom is due to slow
52 # pshufb, yet it's respectable +36%/62% improvement on Core 2
53 # (as implied, over "hyper-threading-safe" code path).
# Command-line handling: a first argument containing a dot is taken to be
# the output file name rather than an assembler flavour.
59 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# The Win64 code paths are enabled for nasm/masm/mingw64 flavours, or
# whenever the output file name ends in .asm.
61 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator: first next to this script, then
# under ../../perlasm/ relative to it; give up if neither exists.
63 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
65 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
66 die "can't locate x86_64-xlate.pl";
# Pipe the generated code through the translator. Abort if the child
# process cannot be started, rather than silently producing no output.
68 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
## _vpaes_encrypt_core
##
## AES encryption core. Callers in this file store %xmm0 as the result
## after the call.
## NOTE(review): the entry label, parts of the input transform, the loop
## labels and branches, and the final ret are not visible in this view.
##
83 ## %xmm9-%xmm15 as in _vpaes_preheat
84 ## (%rdx) = scheduled keys
87 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
88 ## Preserves %xmm6 - %xmm8 so you get some local vectors
91 .type _vpaes_encrypt_core,\@abi-omnipotent
# Input transform through the .Lk_ipt lo/hi tables; round key 0 is
# loaded into %xmm5.
99 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
101 movdqu (%r9), %xmm5 # round0 key
105 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
# %r10 is the base for the table-relative loads below: in the constants
# table .Lk_mc_forward sits 0x40 bytes before .Lk_mc_backward and
# .Lk_sbo sits 0x60 bytes before it.
110 lea .Lk_mc_backward(%rip),%r10
115 # middle of middle round
116 movdqa %xmm13, %xmm4 # 4 : sb1u
117 movdqa %xmm12, %xmm0 # 0 : sb1t
118 pshufb %xmm2, %xmm4 # 4 = sb1u
119 pshufb %xmm3, %xmm0 # 0 = sb1t
120 pxor %xmm5, %xmm4 # 4 = sb1u + k
121 movdqa %xmm15, %xmm5 # 5 : sb2u
122 pxor %xmm4, %xmm0 # 0 = A
123 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
124 pshufb %xmm2, %xmm5 # 5 = sb2u
125 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
126 movdqa %xmm14, %xmm2 # 2 : sb2t
127 pshufb %xmm3, %xmm2 # 2 = sb2t
128 movdqa %xmm0, %xmm3 # 3 = A
129 pxor %xmm5, %xmm2 # 2 = 2A
130 pshufb %xmm1, %xmm0 # 0 = B
131 add \$16, %r9 # next key
132 pxor %xmm2, %xmm0 # 0 = 2A+B
133 pshufb %xmm4, %xmm3 # 3 = D
134 add \$16, %r11 # next mc
135 pxor %xmm0, %xmm3 # 3 = 2A+B+D
136 pshufb %xmm1, %xmm0 # 0 = 2B+C
# %r11 advances in 16-byte steps and wraps after four table rows.
137 and \$0x30, %r11 # ... mod 4
139 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
# Top of round: split each byte into nibbles with the %xmm9 mask and run
# the lookups through the tables held in %xmm10 and %xmm11.
143 movdqa %xmm9, %xmm1 # 1 : i
144 movdqa %xmm11, %xmm5 # 5 : a/k
145 pandn %xmm0, %xmm1 # 1 = i<<4
146 psrld \$4, %xmm1 # 1 = i
147 pand %xmm9, %xmm0 # 0 = k
148 pshufb %xmm0, %xmm5 # 5 = a/k
149 movdqa %xmm10, %xmm3 # 3 : 1/i
150 pxor %xmm1, %xmm0 # 0 = j
151 pshufb %xmm1, %xmm3 # 3 = 1/i
152 movdqa %xmm10, %xmm4 # 4 : 1/j
153 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
154 pshufb %xmm0, %xmm4 # 4 = 1/j
155 movdqa %xmm10, %xmm2 # 2 : 1/iak
156 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
157 pshufb %xmm3, %xmm2 # 2 = 1/iak
158 movdqa %xmm10, %xmm3 # 3 : 1/jak
159 pxor %xmm0, %xmm2 # 2 = io
160 pshufb %xmm4, %xmm3 # 3 = 1/jak
162 pxor %xmm1, %xmm3 # 3 = jo
165 # middle of last round
166 movdqa -0x60(%r10), %xmm4 # 4 : sbou .Lk_sbo
167 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
168 pshufb %xmm2, %xmm4 # 4 = sbou
169 pxor %xmm5, %xmm4 # 4 = sbou + k
170 pshufb %xmm3, %xmm0 # 0 = sbot
171 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
172 pxor %xmm4, %xmm0 # 0 = A
176 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
## _vpaes_decrypt_core
##
## AES decryption core. Callers in this file store %xmm0 as the result
## after the call.
## NOTE(review): the entry label, parts of the input transform, the loop
## labels and branches, and the final ret are not visible in this view.
##
181 ## Same API as encryption core.
183 .type _vpaes_decrypt_core,\@abi-omnipotent
187 mov %rdx, %r9 # load key
# Input transform through the .Lk_dipt lo/hi tables; round key 0 is
# loaded into %xmm5.
190 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
194 movdqu (%r9), %xmm5 # round0 key
198 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
# %r10 = .Lk_dsbd; the other decryption sbox tables are addressed
# relative to it (.Lk_dsb9 at -0x20, .Lk_dsbb at 0x20, .Lk_dsbe at 0x40,
# .Lk_dsbo at 0x60).
200 lea .Lk_dsbd(%rip),%r10
# %xmm5 now holds the fourth .Lk_mc_forward row; it is the "MC" shuffle
# mask in the loop below and is rotated by palignr once per round.
204 movdqa .Lk_mc_forward+48(%rip), %xmm5
213 ## Inverse mix columns
215 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
216 movdqa -0x10(%r10),%xmm1 # 1 : sb9t
217 pshufb %xmm2, %xmm4 # 4 = sb9u
218 pshufb %xmm3, %xmm1 # 1 = sb9t
220 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
221 pxor %xmm1, %xmm0 # 0 = ch
222 movdqa 0x10(%r10),%xmm1 # 1 : sbdt
224 pshufb %xmm2, %xmm4 # 4 = sbdu
225 pshufb %xmm5, %xmm0 # MC ch
226 pshufb %xmm3, %xmm1 # 1 = sbdt
227 pxor %xmm4, %xmm0 # 0 = ch
228 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
229 pxor %xmm1, %xmm0 # 0 = ch
230 movdqa 0x30(%r10),%xmm1 # 1 : sbbt
232 pshufb %xmm2, %xmm4 # 4 = sbbu
233 pshufb %xmm5, %xmm0 # MC ch
234 pshufb %xmm3, %xmm1 # 1 = sbbt
235 pxor %xmm4, %xmm0 # 0 = ch
236 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
237 pxor %xmm1, %xmm0 # 0 = ch
238 movdqa 0x50(%r10),%xmm1 # 1 : sbet
240 pshufb %xmm2, %xmm4 # 4 = sbeu
241 pshufb %xmm5, %xmm0 # MC ch
242 pshufb %xmm3, %xmm1 # 1 = sbet
243 pxor %xmm4, %xmm0 # 0 = ch
244 add \$16, %r9 # next round key
# Rotate the MC shuffle mask in %xmm5 by 12 bytes for the next round.
245 palignr \$12, %xmm5, %xmm5
246 pxor %xmm1, %xmm0 # 0 = ch
# Top of round: split each byte into nibbles with the %xmm9 mask and run
# the lookups through the tables held in %xmm10 and %xmm11.
251 movdqa %xmm9, %xmm1 # 1 : i
252 pandn %xmm0, %xmm1 # 1 = i<<4
253 movdqa %xmm11, %xmm2 # 2 : a/k
254 psrld \$4, %xmm1 # 1 = i
255 pand %xmm9, %xmm0 # 0 = k
256 pshufb %xmm0, %xmm2 # 2 = a/k
257 movdqa %xmm10, %xmm3 # 3 : 1/i
258 pxor %xmm1, %xmm0 # 0 = j
259 pshufb %xmm1, %xmm3 # 3 = 1/i
260 movdqa %xmm10, %xmm4 # 4 : 1/j
261 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
262 pshufb %xmm0, %xmm4 # 4 = 1/j
263 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
264 movdqa %xmm10, %xmm2 # 2 : 1/iak
265 pshufb %xmm3, %xmm2 # 2 = 1/iak
266 movdqa %xmm10, %xmm3 # 3 : 1/jak
267 pxor %xmm0, %xmm2 # 2 = io
268 pshufb %xmm4, %xmm3 # 3 = 1/jak
270 pxor %xmm1, %xmm3 # 3 = jo
273 # middle of last round
274 movdqa 0x60(%r10), %xmm4 # 4 : sbou
275 pshufb %xmm2, %xmm4 # 4 = sbou
276 pxor %xmm0, %xmm4 # 4 = sbou + k
277 movdqa 0x70(%r10), %xmm0 # 0 : sbot
278 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
279 pshufb %xmm3, %xmm0 # 0 = sbot
280 pxor %xmm4, %xmm0 # 0 = A
284 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
286 ########################################################
288 ## AES key schedule ##
290 ########################################################
## _vpaes_schedule_core
##
## Expands the user key read from (%rdi) into round keys written through
## %rdx. It loads the lookup tables with _vpaes_preheat, applies the input
## transform, and then runs a 128-, 192- or 256-bit specific loop built
## from the _vpaes_schedule_* helpers.
## NOTE(review): the tests that set the flags for the conditional jumps,
## the key-size dispatch, the loop labels and the cleanup/ret sequence are
## not visible in this view.
291 .type _vpaes_schedule_core,\@abi-omnipotent
293 _vpaes_schedule_core:
298 # rcx = direction. 0=encrypt, 1=decrypt
300 call _vpaes_preheat # load the tables
301 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
302 movdqu (%rdi), %xmm0 # load key (unaligned)
# Input transform of the first 16 key bytes through the .Lk_ipt tables.
306 lea .Lk_ipt(%rip), %r11
307 call _vpaes_schedule_transform
# %r10 = .Lk_sr, indexed by %r8 wherever a row is loaded below.
310 lea .Lk_sr(%rip),%r10
312 jnz .Lschedule_am_decrypting
314 # encrypting, output zeroth round key after transform
318 .Lschedule_am_decrypting:
319 # decrypting, output zeroth round key after shiftrows
320 movdqa (%r8,%r10),%xmm1
334 ## 128-bit specific part of key schedule.
336 ## This schedule is really simple, because all its parts
337 ## are accomplished by the subroutines.
343 call _vpaes_schedule_round
345 jz .Lschedule_mangle_last
346 call _vpaes_schedule_mangle # write output
347 jmp .Loop_schedule_128
352 ## 192-bit specific part of key schedule.
354 ## The main body of this schedule is the same as the 128-bit
355 ## schedule, but with more smearing. The long, high side is
356 ## stored in %xmm7 as before, and the short, low side is in
357 ## the high bits of %xmm6.
359 ## This schedule is somewhat nastier, however, because each
360 ## round produces 192 bits of key material, or 1.5 round keys.
361 ## Therefore, on each cycle we do 2 rounds and produce 3 round
366 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
367 call _vpaes_schedule_transform # input transform
368 movdqa %xmm0, %xmm6 # save short part
369 pxor %xmm4, %xmm4 # clear 4
370 movhlps %xmm4, %xmm6 # clobber low side with zeros
374 call _vpaes_schedule_round
375 palignr \$8,%xmm6,%xmm0
376 call _vpaes_schedule_mangle # save key n
377 call _vpaes_schedule_192_smear
378 call _vpaes_schedule_mangle # save key n+1
379 call _vpaes_schedule_round
381 jz .Lschedule_mangle_last
382 call _vpaes_schedule_mangle # save key n+2
383 call _vpaes_schedule_192_smear
384 jmp .Loop_schedule_192
389 ## 256-bit specific part of key schedule.
391 ## The structure here is very similar to the 128-bit
392 ## schedule, but with an additional "low side" in
393 ## %xmm6. The low side's rounds are the same as the
394 ## high side's, except no rcon and no rotation.
398 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
399 call _vpaes_schedule_transform # input transform
403 call _vpaes_schedule_mangle # output low result
404 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
407 call _vpaes_schedule_round
409 jz .Lschedule_mangle_last
410 call _vpaes_schedule_mangle
412 # low round. swap xmm7 and xmm6
# Broadcast the high dword of %xmm0 to all four dwords.
413 pshufd \$0xFF, %xmm0, %xmm0
416 call _vpaes_schedule_low_round
419 jmp .Loop_schedule_256
423 ## .aes_schedule_mangle_last
425 ## Mangler for last round of key schedule
427 ## when encrypting, outputs out(%xmm0) ^ 63
428 ## when decrypting, outputs unskew(%xmm0)
430 ## Always called right before return... jumps to cleanup and exits
433 .Lschedule_mangle_last:
434 # schedule last round key from xmm0
435 lea .Lk_deskew(%rip),%r11 # prepare to deskew
437 jnz .Lschedule_mangle_last_dec
# Encrypting path: permute with the .Lk_sr row selected by %r8 and switch
# the transform tables to .Lk_opt.
440 movdqa (%r8,%r10),%xmm1
441 pshufb %xmm1, %xmm0 # output permute
442 lea .Lk_opt(%rip), %r11 # prepare to output transform
445 .Lschedule_mangle_last_dec:
447 pxor .Lk_s63(%rip), %xmm0
448 call _vpaes_schedule_transform # output transform
449 movdqu %xmm0, (%rdx) # save last key
462 .size _vpaes_schedule_core,.-_vpaes_schedule_core
465 ## .aes_schedule_192_smear
467 ## Smear the short, low side in the 192-bit key schedule.
470 ## %xmm7: high side, b a x y
471 ## %xmm6: low side, d c 0 0
475 ## %xmm6: b+c+d b+c 0 0
476 ## %xmm0: b+c+d b+c b a
## Uses %xmm1 as scratch.
## NOTE(review): some instructions and the final ret are not visible in
## this view, so the step comments below cannot all be checked here.
478 .type _vpaes_schedule_192_smear,\@abi-omnipotent
480 _vpaes_schedule_192_smear:
482 pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
483 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
484 pxor %xmm1, %xmm6 # -> c+d c 0 0
486 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
488 movhlps %xmm1, %xmm6 # clobber low side with zeros
491 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
494 ## .aes_schedule_round
496 ## Runs one main round of the key schedule on %xmm0, %xmm7
498 ## Specifically, runs subbytes on the high dword of %xmm0
499 ## then rotates it by one byte and xors into the low dword of
502 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
505 ## Smears the dwords of %xmm7 by xoring the low into the
506 ## second low, result into third, result into highest.
508 ## Returns results in %xmm7 = %xmm0.
509 ## Clobbers %xmm1-%xmm4, %r11.
## _vpaes_schedule_low_round is a second entry point into the same body
## that skips the rcon and rotate steps.
## NOTE(review): several instructions (including the smear sequence and
## the final ret) are not visible in this view.
511 .type _vpaes_schedule_round,\@abi-omnipotent
513 _vpaes_schedule_round:
515 # extract rcon from xmm8
517 palignr \$15, %xmm8, %xmm1
# Rotate %xmm8 onto itself so the next round sees the next rcon byte.
518 palignr \$15, %xmm8, %xmm8
# Broadcast the high dword of %xmm0, then rotate it by one byte.
522 pshufd \$0xFF, %xmm0, %xmm0
523 palignr \$1, %xmm0, %xmm0
527 # low round: same as high round, but no rotation and no rcon.
528 _vpaes_schedule_low_round:
536 pxor .Lk_s63(%rip), %xmm7
# Nibble-wise lookups through the tables held in %xmm10 and %xmm11, same
# instruction pattern as the "top of round" code in the cipher cores.
541 psrld \$4, %xmm1 # 1 = i
542 pand %xmm9, %xmm0 # 0 = k
543 movdqa %xmm11, %xmm2 # 2 : a/k
544 pshufb %xmm0, %xmm2 # 2 = a/k
545 pxor %xmm1, %xmm0 # 0 = j
546 movdqa %xmm10, %xmm3 # 3 : 1/i
547 pshufb %xmm1, %xmm3 # 3 = 1/i
548 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
549 movdqa %xmm10, %xmm4 # 4 : 1/j
550 pshufb %xmm0, %xmm4 # 4 = 1/j
551 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
552 movdqa %xmm10, %xmm2 # 2 : 1/iak
553 pshufb %xmm3, %xmm2 # 2 = 1/iak
554 pxor %xmm0, %xmm2 # 2 = io
555 movdqa %xmm10, %xmm3 # 3 : 1/jak
556 pshufb %xmm4, %xmm3 # 3 = 1/jak
557 pxor %xmm1, %xmm3 # 3 = jo
# %xmm13 and %xmm12 are the .Lk_sb1 pair loaded by _vpaes_preheat.
558 movdqa %xmm13, %xmm4 # 4 : sb1u
559 pshufb %xmm2, %xmm4 # 4 = sb1u
560 movdqa %xmm12, %xmm0 # 0 : sb1t
561 pshufb %xmm3, %xmm0 # 0 = sb1t
562 pxor %xmm4, %xmm0 # 0 = sbox output
564 # add in smeared stuff
569 .size _vpaes_schedule_round,.-_vpaes_schedule_round
572 ## .aes_schedule_transform
574 ## Linear-transform %xmm0 according to tables at (%r11)
## The table is a lo/hi pair: 16 bytes at (%r11) and 16 bytes at 16(%r11).
576 ## Requires that %xmm9 = 0x0F0F... as in preheat
578 ## Clobbers %xmm1, %xmm2
## NOTE(review): the nibble split, the shuffles and the final ret are not
## visible in this view.
580 .type _vpaes_schedule_transform,\@abi-omnipotent
582 _vpaes_schedule_transform:
588 movdqa (%r11), %xmm2 # lo
590 movdqa 16(%r11), %xmm0 # hi
595 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
598 ## .aes_schedule_mangle
600 ## Mangle xmm0 from (basis-transformed) standard version
605 ## multiply by circulant 0,1,1,1
606 ## apply shiftrows transform
610 ## multiply by "inverse mixcolumns" circulant E,B,D,9
612 ## apply shiftrows transform
615 ## Writes out to (%rdx), and increments or decrements it
616 ## Keeps track of round number mod 4 in %r8
618 ## Clobbers xmm1-xmm5
## NOTE(review): the direction test, most of the shuffle/xor steps, the
## store to (%rdx) and the final ret are not visible in this view.
620 .type _vpaes_schedule_mangle,\@abi-omnipotent
622 _vpaes_schedule_mangle:
624 movdqa %xmm0, %xmm4 # save xmm0 for later
625 movdqa .Lk_mc_forward(%rip),%xmm5
627 jnz .Lschedule_mangle_dec
# Encrypting path.
631 pxor .Lk_s63(%rip),%xmm4
639 jmp .Lschedule_mangle_both
641 .Lschedule_mangle_dec:
642 # inverse mix columns
# %r11 = .Lk_dksd. The four lo/hi pairs loaded below are, in table order,
# .Lk_dksd, .Lk_dksb, .Lk_dkse and .Lk_dks9 (32 bytes each).
643 lea .Lk_dksd(%rip),%r11
646 psrld \$4, %xmm1 # 1 = hi
647 pand %xmm9, %xmm4 # 4 = lo
649 movdqa 0x00(%r11), %xmm2
651 movdqa 0x10(%r11), %xmm3
656 movdqa 0x20(%r11), %xmm2
659 movdqa 0x30(%r11), %xmm3
664 movdqa 0x40(%r11), %xmm2
667 movdqa 0x50(%r11), %xmm3
672 movdqa 0x60(%r11), %xmm2
675 movdqa 0x70(%r11), %xmm3
681 .Lschedule_mangle_both:
# Load the table row at (%r8,%r10); _vpaes_schedule_core points %r10 at
# .Lk_sr before calling here.
682 movdqa (%r8,%r10),%xmm1
689 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
692 # Interface to OpenSSL
# set_encrypt_key entry point (three arguments, per the .type line).
# Stores the round count at offset 240 of the key structure addressed by
# %rdx and expands the key with _vpaes_schedule_core.
# NOTE(review): stack frame setup/teardown, argument preparation and the
# return sequence are not visible in this view.
694 .globl ${PREFIX}_set_encrypt_key
695 .type ${PREFIX}_set_encrypt_key,\@function,3
697 ${PREFIX}_set_encrypt_key:
700 $code.=<<___ if ($win64);
# Win64 only: spill %xmm6-%xmm15, which are callee-saved in the Microsoft
# x64 calling convention, to the stack.
702 movaps %xmm6,0x10(%rsp)
703 movaps %xmm7,0x20(%rsp)
704 movaps %xmm8,0x30(%rsp)
705 movaps %xmm9,0x40(%rsp)
706 movaps %xmm10,0x50(%rsp)
707 movaps %xmm11,0x60(%rsp)
708 movaps %xmm12,0x70(%rsp)
709 movaps %xmm13,0x80(%rsp)
710 movaps %xmm14,0x90(%rsp)
711 movaps %xmm15,0xa0(%rsp)
718 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
722 call _vpaes_schedule_core
724 $code.=<<___ if ($win64);
# Win64 only: reload %xmm6-%xmm15 from the same stack slots.
725 movaps 0x10(%rsp),%xmm6
726 movaps 0x20(%rsp),%xmm7
727 movaps 0x30(%rsp),%xmm8
728 movaps 0x40(%rsp),%xmm9
729 movaps 0x50(%rsp),%xmm10
730 movaps 0x60(%rsp),%xmm11
731 movaps 0x70(%rsp),%xmm12
732 movaps 0x80(%rsp),%xmm13
733 movaps 0x90(%rsp),%xmm14
734 movaps 0xa0(%rsp),%xmm15
742 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
# set_decrypt_key entry point (three arguments, per the .type line).
# Stores the round count at offset 240 of the key structure, moves %rdx
# forward inside the schedule and expands the key with
# _vpaes_schedule_core.
# NOTE(review): stack frame setup/teardown, the computation of %rax and
# %r8d, and the return sequence are not visible in this view.
744 .globl ${PREFIX}_set_decrypt_key
745 .type ${PREFIX}_set_decrypt_key,\@function,3
747 ${PREFIX}_set_decrypt_key:
750 $code.=<<___ if ($win64);
# Win64 only: spill %xmm6-%xmm15, which are callee-saved in the Microsoft
# x64 calling convention, to the stack.
752 movaps %xmm6,0x10(%rsp)
753 movaps %xmm7,0x20(%rsp)
754 movaps %xmm8,0x30(%rsp)
755 movaps %xmm9,0x40(%rsp)
756 movaps %xmm10,0x50(%rsp)
757 movaps %xmm11,0x60(%rsp)
758 movaps %xmm12,0x70(%rsp)
759 movaps %xmm13,0x80(%rsp)
760 movaps %xmm14,0x90(%rsp)
761 movaps %xmm15,0xa0(%rsp)
768 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
# %rdx = %rdx + %rax + 16; presumably the far end of the schedule, since
# the mangle routine is documented as incrementing or decrementing %rdx.
770 lea 16(%rdx,%rax),%rdx
776 xor \$32,%r8d # nbits==192?0:32
777 call _vpaes_schedule_core
779 $code.=<<___ if ($win64);
# Win64 only: reload %xmm6-%xmm15 from the same stack slots.
780 movaps 0x10(%rsp),%xmm6
781 movaps 0x20(%rsp),%xmm7
782 movaps 0x30(%rsp),%xmm8
783 movaps 0x40(%rsp),%xmm9
784 movaps 0x50(%rsp),%xmm10
785 movaps 0x60(%rsp),%xmm11
786 movaps 0x70(%rsp),%xmm12
787 movaps 0x80(%rsp),%xmm13
788 movaps 0x90(%rsp),%xmm14
789 movaps 0xa0(%rsp),%xmm15
797 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
# Single-block encrypt entry point (three arguments, per the .type line);
# a wrapper around _vpaes_encrypt_core.
# NOTE(review): the entry label, stack frame handling, the block
# load/store, the preheat call and the return are not visible in this view.
799 .globl ${PREFIX}_encrypt
800 .type ${PREFIX}_encrypt,\@function,3
805 $code.=<<___ if ($win64);
# Win64 only: spill %xmm6-%xmm15, which are callee-saved in the Microsoft
# x64 calling convention, to the stack.
807 movaps %xmm6,0x10(%rsp)
808 movaps %xmm7,0x20(%rsp)
809 movaps %xmm8,0x30(%rsp)
810 movaps %xmm9,0x40(%rsp)
811 movaps %xmm10,0x50(%rsp)
812 movaps %xmm11,0x60(%rsp)
813 movaps %xmm12,0x70(%rsp)
814 movaps %xmm13,0x80(%rsp)
815 movaps %xmm14,0x90(%rsp)
816 movaps %xmm15,0xa0(%rsp)
822 call _vpaes_encrypt_core
825 $code.=<<___ if ($win64);
# Win64 only: reload %xmm6-%xmm15 from the same stack slots.
826 movaps 0x10(%rsp),%xmm6
827 movaps 0x20(%rsp),%xmm7
828 movaps 0x30(%rsp),%xmm8
829 movaps 0x40(%rsp),%xmm9
830 movaps 0x50(%rsp),%xmm10
831 movaps 0x60(%rsp),%xmm11
832 movaps 0x70(%rsp),%xmm12
833 movaps 0x80(%rsp),%xmm13
834 movaps 0x90(%rsp),%xmm14
835 movaps 0xa0(%rsp),%xmm15
842 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
# Single-block decrypt entry point (three arguments, per the .type line);
# a wrapper around _vpaes_decrypt_core.
# NOTE(review): the entry label, stack frame handling, the block
# load/store, the preheat call and the return are not visible in this view.
844 .globl ${PREFIX}_decrypt
845 .type ${PREFIX}_decrypt,\@function,3
850 $code.=<<___ if ($win64);
# Win64 only: spill %xmm6-%xmm15, which are callee-saved in the Microsoft
# x64 calling convention, to the stack.
852 movaps %xmm6,0x10(%rsp)
853 movaps %xmm7,0x20(%rsp)
854 movaps %xmm8,0x30(%rsp)
855 movaps %xmm9,0x40(%rsp)
856 movaps %xmm10,0x50(%rsp)
857 movaps %xmm11,0x60(%rsp)
858 movaps %xmm12,0x70(%rsp)
859 movaps %xmm13,0x80(%rsp)
860 movaps %xmm14,0x90(%rsp)
861 movaps %xmm15,0xa0(%rsp)
867 call _vpaes_decrypt_core
870 $code.=<<___ if ($win64);
# Win64 only: reload %xmm6-%xmm15 from the same stack slots.
871 movaps 0x10(%rsp),%xmm6
872 movaps 0x20(%rsp),%xmm7
873 movaps 0x30(%rsp),%xmm8
874 movaps 0x40(%rsp),%xmm9
875 movaps 0x50(%rsp),%xmm10
876 movaps 0x60(%rsp),%xmm11
877 movaps 0x70(%rsp),%xmm12
878 movaps 0x80(%rsp),%xmm13
879 movaps 0x90(%rsp),%xmm14
880 movaps 0xa0(%rsp),%xmm15
887 .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
# Perl-side names for the six argument registers of the CBC entry point.
890 my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
891 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
892 # size_t length, const AES_KEY *key,
893 # unsigned char *ivp,const int enc);
# The IV is kept in %xmm6 for the whole call: loaded from the ivp argument
# on entry and written back before returning. Each block is processed by
# _vpaes_encrypt_core or _vpaes_decrypt_core and stored from %xmm0.
# NOTE(review): stack frame handling, the direction test, the block loop,
# the chaining xors and the return are not visible in this view.
895 .globl ${PREFIX}_cbc_encrypt
896 .type ${PREFIX}_cbc_encrypt,\@function,6
898 ${PREFIX}_cbc_encrypt:
# Swap the Perl-side names so that len now names the register previously
# called key and vice versa.
# NOTE(review): presumably paired with a register exchange in the emitted
# code that is not visible here - confirm.
902 ($len,$key)=($key,$len);
907 $code.=<<___ if ($win64);
# Win64 only: spill %xmm6-%xmm15, which are callee-saved in the Microsoft
# x64 calling convention, to the stack.
909 movaps %xmm6,0x10(%rsp)
910 movaps %xmm7,0x20(%rsp)
911 movaps %xmm8,0x30(%rsp)
912 movaps %xmm9,0x40(%rsp)
913 movaps %xmm10,0x50(%rsp)
914 movaps %xmm11,0x60(%rsp)
915 movaps %xmm12,0x70(%rsp)
916 movaps %xmm13,0x80(%rsp)
917 movaps %xmm14,0x90(%rsp)
918 movaps %xmm15,0xa0(%rsp)
922 movdqu ($ivp),%xmm6 # load IV
# Encrypt path: the output address is formed as out plus inp.
932 call _vpaes_encrypt_core
934 movdqu %xmm0,($out,$inp)
# Decrypt path: same output addressing.
943 call _vpaes_decrypt_core
946 movdqu %xmm0,($out,$inp)
951 movdqu %xmm6,($ivp) # save IV
953 $code.=<<___ if ($win64);
# Win64 only: reload %xmm6-%xmm15 from the same stack slots.
954 movaps 0x10(%rsp),%xmm6
955 movaps 0x20(%rsp),%xmm7
956 movaps 0x30(%rsp),%xmm8
957 movaps 0x40(%rsp),%xmm9
958 movaps 0x50(%rsp),%xmm10
959 movaps 0x60(%rsp),%xmm11
960 movaps 0x70(%rsp),%xmm12
961 movaps 0x80(%rsp),%xmm13
962 movaps 0x90(%rsp),%xmm14
963 movaps 0xa0(%rsp),%xmm15
971 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
## _vpaes_preheat
##
## Loads the shared lookup tables into %xmm9-%xmm15 using offsets from
## .Lk_s0F.
## NOTE(review): the entry label and the final ret are not visible in
## this view.
978 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
979 ## and %xmm9-%xmm15 as specified below.
981 .type _vpaes_preheat,\@abi-omnipotent
# %r10 = .Lk_s0F. In the constants table .Lk_inv occupies the 0x20 bytes
# before it, and .Lk_sb1 and .Lk_sb2 follow after the .Lk_ipt pair.
985 lea .Lk_s0F(%rip), %r10
986 movdqa -0x20(%r10), %xmm10 # .Lk_inv
987 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
988 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
989 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
990 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
991 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
992 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
995 .size _vpaes_preheat,.-_vpaes_preheat
996 ########################################################
1000 ########################################################
## _vpaes_consts: read-only lookup tables, 16 bytes per row.
## The order and size of the tables matter: the code addresses several of
## them by fixed offsets from a neighbouring label (see _vpaes_preheat and
## the two cipher cores), so rows must not be inserted, removed or moved.
1001 .type _vpaes_consts,\@object
1004 .Lk_inv: # inv, inva
1005 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
1006 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
# NOTE(review): the label line for this row is not visible here;
# _vpaes_preheat loads it as .Lk_s0F (low-nibble mask, 0x0F in every byte).
1009 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
1011 .Lk_ipt: # input transform (lo, hi)
1012 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
1013 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
1015 .Lk_sb1: # sb1u, sb1t
1016 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
1017 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
1018 .Lk_sb2: # sb2u, sb2t
1019 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
1020 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
1021 .Lk_sbo: # sbou, sbot
1022 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
1023 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
1025 .Lk_mc_forward: # mc_forward
1026 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
1027 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
1028 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
1029 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
1031 .Lk_mc_backward:# mc_backward
1032 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
1033 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
1034 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1035 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
# NOTE(review): the label line for the next four rows is not visible
# here; going by the 0x40 offset from .Lk_mc_backward used in
# _vpaes_encrypt_core this is presumably .Lk_sr.
1038 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1039 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1040 .quad 0x0F060D040B020900, 0x070E050C030A0108
1041 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
# NOTE(review): the label line for this row is not visible here;
# presumably .Lk_rcon, which _vpaes_schedule_core loads into %xmm8.
1044 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1046 .Lk_s63: # s63: all equal to 0x63 transformed
1047 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1049 .Lk_opt: # output transform
1050 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1051 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1053 .Lk_deskew: # deskew tables: inverts the sbox's "skew"
1054 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1055 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1059 ## Key schedule constants
1061 .Lk_dksd: # decryption key schedule: invskew x*D
1062 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1063 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1064 .Lk_dksb: # decryption key schedule: invskew x*B
1065 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1066 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1067 .Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1068 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1069 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1070 .Lk_dks9: # decryption key schedule: invskew x*9
1071 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1072 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1076 ## Round function constants
1078 .Lk_dipt: # decryption input transform
1079 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1080 .quad 0x86E383E660056500, 0x12771772F491F194
1082 .Lk_dsb9: # decryption sbox output *9*u, *9*t
1083 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1084 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1085 .Lk_dsbd: # decryption sbox output *D*u, *D*t
1086 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1087 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1088 .Lk_dsbb: # decryption sbox output *B*u, *B*t
1089 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1090 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1091 .Lk_dsbe: # decryption sbox output *E*u, *E*t
1092 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1093 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1094 .Lk_dsbo: # decryption sbox final output
1095 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1096 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1097 .asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
1099 .size _vpaes_consts,.-_vpaes_consts
1103 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1104 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handler shared by the public entry points.
# If the faulting Rip lies between the body and epilogue labels recorded
# in HandlerData, it copies the saved %xmm6-%xmm15 from the stack frame
# back into the CONTEXT and adjusts the recorded Rsp. It then copies the
# CONTEXT into disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
# NOTE(review): the entry label, register pushes/pops, the conditional
# jumps after each cmp and the final ret are not visible in this view.
1111 .extern __imp_RtlVirtualUnwind
1112 .type se_handler,\@abi-omnipotent
1126 mov 120($context),%rax # pull context->Rax
1127 mov 248($context),%rbx # pull context->Rip
1129 mov 8($disp),%rsi # disp->ImageBase
1130 mov 56($disp),%r11 # disp->HandlerData
# HandlerData entries are image-relative, so add ImageBase before
# comparing them with Rip.
1132 mov 0(%r11),%r10d # HandlerData[0]
1133 lea (%rsi,%r10),%r10 # prologue label
1134 cmp %r10,%rbx # context->Rip<prologue label
1137 mov 152($context),%rax # pull context->Rsp
1139 mov 4(%r11),%r10d # HandlerData[1]
1140 lea (%rsi,%r10),%r10 # epilogue label
1141 cmp %r10,%rbx # context->Rip>=epilogue label
# Copy 20 quadwords (ten 16-byte registers) from the save area that
# starts 16 bytes above Rsp into the CONTEXT, starting at Xmm6.
1144 lea 16(%rax),%rsi # %xmm save area
1145 lea 512($context),%rdi # &context.Xmm6
1146 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1147 .long 0xa548f3fc # cld; rep movsq
1148 lea 0xb8(%rax),%rax # adjust stack pointer
1153 mov %rax,152($context) # restore context->Rsp
1154 mov %rsi,168($context) # restore context->Rsi
1155 mov %rdi,176($context) # restore context->Rdi
# Copy the whole CONTEXT structure into disp->ContextRecord.
1157 mov 40($disp),%rdi # disp->ContextRecord
1158 mov $context,%rsi # context
1159 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1160 .long 0xa548f3fc # cld; rep movsq
# Set up the eight RtlVirtualUnwind arguments: four in registers, four in
# stack slots starting at 32(%rsp).
1163 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1164 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1165 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1166 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1167 mov 40(%rsi),%r10 # disp->ContextRecord
1168 lea 56(%rsi),%r11 # &disp->HandlerData
1169 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1170 mov %r10,32(%rsp) # arg5
1171 mov %r11,40(%rsp) # arg6
1172 mov %r12,48(%rsp) # arg7
1173 mov %rcx,56(%rsp) # arg8, (NULL)
1174 call *__imp_RtlVirtualUnwind(%rip)
1176 mov \$1,%eax # ExceptionContinueSearch
1188 .size se_handler,.-se_handler
# Win64 unwind tables. Each group of three .rva entries gives the begin
# address, end address and unwind-info record for one public entry point.
# NOTE(review): the section directives and alignment lines are not
# visible in this view.
1192 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1193 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1194 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1196 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1197 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1198 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1200 .rva .LSEH_begin_${PREFIX}_encrypt
1201 .rva .LSEH_end_${PREFIX}_encrypt
1202 .rva .LSEH_info_${PREFIX}_encrypt
1204 .rva .LSEH_begin_${PREFIX}_decrypt
1205 .rva .LSEH_end_${PREFIX}_decrypt
1206 .rva .LSEH_info_${PREFIX}_decrypt
1208 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1209 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1210 .rva .LSEH_info_${PREFIX}_cbc_encrypt
# Unwind-info records. The two labels in each HandlerData pair are the
# body and epilogue bounds that se_handler reads as HandlerData[0] and
# HandlerData[1].
1214 .LSEH_info_${PREFIX}_set_encrypt_key:
1217 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1218 .LSEH_info_${PREFIX}_set_decrypt_key:
1221 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1222 .LSEH_info_${PREFIX}_encrypt:
1225 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1226 .LSEH_info_${PREFIX}_decrypt:
1229 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1230 .LSEH_info_${PREFIX}_cbc_encrypt:
1233 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
# Post-process the generated text: every backtick-quoted expression in it
# is evaluated as Perl and replaced by its value (for example the
# 1232/8 constant used in se_handler).
1237 $code =~ s/\`([^\`]*)\`/eval($1)/gem;