]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - crypto/openssl/crypto/camellia/asm/cmll-x86_64.pl
MFC: r359060, r359061, r359066
[FreeBSD/FreeBSD.git] / crypto / openssl / crypto / camellia / asm / cmll-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12 #
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
20
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
23 #
24 #                       AMD64   Core2   EM64T
25 # -evp camellia-128-ecb 16.7    21.0    22.7
26 # + over gcc 3.4.6      +25%    +5%     0%
27 #
28 # camellia-128-cbc      15.7    20.4    21.1
29 #
30 # 128-bit key setup     128     216     205     cycles/key
31 # + over gcc 3.4.6      +54%    +39%    +15%
32 #
33 # Numbers in "+" rows represent performance improvement over compiler
34 # generated code. Key setup timings are impressive on AMD and Core2
35 # thanks to 64-bit operations being covertly deployed. Improvement on
36 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37 # apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command line: [flavour] output.  A lone argument that contains a dot is
# taken to be the output file name, in which case no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets: nasm/masm/mingw64 flavours, or any output ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the perlasm translator next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything written to STDOUT is piped through the translator.
# Fail loudly if the pipe cannot be started instead of silently emitting
# nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
52
# hi(REG): map %eax/%rax, %ebx/%rbx, %ecx/%rcx, %edx/%rdx to the matching
# high-byte register (%ah, %bh, %ch, %dh).  Any other operand is returned
# unchanged.
sub hi {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}h/;
    return $r;
}

# lo(REG): map a 32/64-bit register name to its low-byte alias.  Operands
# matching none of the patterns are returned unchanged.
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;      # %eax  -> %al
    $r =~ s/%[er]([sd]i)/%${1}l/;       # %esi  -> %sil
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;     # %r8d  -> %r8b
    return $r;
}
57
# Scratch registers used by the round function.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");
# The four 32-bit words of the cipher state.
@S=qw(%r8d %r9d %r10d %r11d);
# Index registers for the table look-ups.
($i0,$i1)=("%esi","%edi");
$Tbl="%rbp";    # size optimization
($inp,$out)=("%r12","%r13");
($key,$keyend)=("%r14","%r15");
# 32-bit view of the first integer argument register for the target ABI.
$arg0d="%edi";
$arg0d="%ecx" if $win64;

# const unsigned int Camellia_SBOX[4][256];
# Not literally, though: Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] with [3][], which keeps the code small.  The values below are
# the byte offsets of each logical table from the start of the data.
$SBOX1_1110=0;                  # Camellia_SBOX[0]
$SBOX4_4404=$SBOX1_1110+4;      # Camellia_SBOX[1]
$SBOX2_0222=256*8;              # Camellia_SBOX[2]
$SBOX3_3033=$SBOX2_0222+4;      # Camellia_SBOX[3]
76
# Camellia_Feistel(ROUND [, SEED])
#
# Appends the assembly for one Feistel round to $code.
#
#   ROUND  round index.  Its parity selects which pair of state words from
#          @S is mixed into $t0/$t1 (which must already hold the round key)
#          and which pair receives the result.  It also selects the key
#          slot that is prefetched for the next round.
#   SEED   base byte offset used for that key prefetch; defaults to 0.  A
#          negative SEED makes the prefetch step through the key schedule
#          in 8-byte decrements instead of increments.
#
# The back-quoted expressions are left in the emitted text as they are.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=map { $S[($j+$_)%4] } (0..3);

$code.=<<___;
        xor     $s0,$t0                         # t0^=key[0]
        xor     $s1,$t1                         # t1^=key[1]
        movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
        mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
        mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
        movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
        shr     \$16,$t0
        movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
        xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
        shr     \$16,$t1
        xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
        movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
        xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
        xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
        movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
        movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
        xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
        xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
        mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
        mov     `$seed+($i+1)*$scale+4`($key),$t0
        xor     $t3,$t2                         # t2^=t3
        ror     \$8,$t3                         # t3=RightRotate(t3,8)
        xor     $t2,$s2
        xor     $t2,$s3
        xor     $t3,$s3
___
}
114
115 # void Camellia_EncryptBlock_Rounds(
116 #               int grandRounds,
117 #               const Byte plaintext[],
118 #               const KEY_TABLE_TYPE keyTable,
119 #               Byte ciphertext[])
120 $code=<<___;
121 .text
122
123 # V1.x API
124 .globl  Camellia_EncryptBlock
125 .type   Camellia_EncryptBlock,\@abi-omnipotent
126 .align  16
127 Camellia_EncryptBlock:
128 .cfi_startproc
129         movl    \$128,%eax
130         subl    $arg0d,%eax
131         movl    \$3,$arg0d
132         adcl    \$0,$arg0d      # keyBitLength==128?3:4
133         jmp     .Lenc_rounds
134 .cfi_endproc
135 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
136 # V2
137 .globl  Camellia_EncryptBlock_Rounds
138 .type   Camellia_EncryptBlock_Rounds,\@function,4
139 .align  16
140 .Lenc_rounds:
141 Camellia_EncryptBlock_Rounds:
142 .cfi_startproc
143         push    %rbx
144 .cfi_push       %rbx
145         push    %rbp
146 .cfi_push       %rbp
147         push    %r13
148 .cfi_push       %r13
149         push    %r14
150 .cfi_push       %r14
151         push    %r15
152 .cfi_push       %r15
153 .Lenc_prologue:
154
155         #mov    %rsi,$inp               # put away arguments
156         mov     %rcx,$out
157         mov     %rdx,$key
158
159         shl     \$6,%edi                # process grandRounds
160         lea     .LCamellia_SBOX(%rip),$Tbl
161         lea     ($key,%rdi),$keyend
162
163         mov     0(%rsi),@S[0]           # load plaintext
164         mov     4(%rsi),@S[1]
165         mov     8(%rsi),@S[2]
166         bswap   @S[0]
167         mov     12(%rsi),@S[3]
168         bswap   @S[1]
169         bswap   @S[2]
170         bswap   @S[3]
171
172         call    _x86_64_Camellia_encrypt
173
174         bswap   @S[0]
175         bswap   @S[1]
176         bswap   @S[2]
177         mov     @S[0],0($out)
178         bswap   @S[3]
179         mov     @S[1],4($out)
180         mov     @S[2],8($out)
181         mov     @S[3],12($out)
182
183         mov     0(%rsp),%r15
184 .cfi_restore    %r15
185         mov     8(%rsp),%r14
186 .cfi_restore    %r14
187         mov     16(%rsp),%r13
188 .cfi_restore    %r13
189         mov     24(%rsp),%rbp
190 .cfi_restore    %rbp
191         mov     32(%rsp),%rbx
192 .cfi_restore    %rbx
193         lea     40(%rsp),%rsp
194 .cfi_adjust_cfa_offset  -40
195 .Lenc_epilogue:
196         ret
197 .cfi_endproc
198 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
199
200 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
201 .align  16
202 _x86_64_Camellia_encrypt:
203 .cfi_startproc
204         xor     0($key),@S[1]
205         xor     4($key),@S[0]           # ^=key[0-3]
206         xor     8($key),@S[3]
207         xor     12($key),@S[2]
208 .align  16
209 .Leloop:
210         mov     16($key),$t1            # prefetch key[4-5]
211         mov     20($key),$t0
212
213 ___
214         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
215 $code.=<<___;
216         lea     16*4($key),$key
217         cmp     $keyend,$key
218         mov     8($key),$t3             # prefetch key[2-3]
219         mov     12($key),$t2
220         je      .Ledone
221
222         and     @S[0],$t0
223         or      @S[3],$t3
224         rol     \$1,$t0
225         xor     $t3,@S[2]               # s2^=s3|key[3];
226         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
227         and     @S[2],$t2
228         or      @S[1],$t1
229         rol     \$1,$t2
230         xor     $t1,@S[0]               # s0^=s1|key[1];
231         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
232         jmp     .Leloop
233
234 .align  16
235 .Ledone:
236         xor     @S[2],$t0               # SwapHalf
237         xor     @S[3],$t1
238         xor     @S[0],$t2
239         xor     @S[1],$t3
240
241         mov     $t0,@S[0]
242         mov     $t1,@S[1]
243         mov     $t2,@S[2]
244         mov     $t3,@S[3]
245
246         .byte   0xf3,0xc3               # rep ret
247 .cfi_endproc
248 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
249
250 # V1.x API
251 .globl  Camellia_DecryptBlock
252 .type   Camellia_DecryptBlock,\@abi-omnipotent
253 .align  16
254 Camellia_DecryptBlock:
255 .cfi_startproc
256         movl    \$128,%eax
257         subl    $arg0d,%eax
258         movl    \$3,$arg0d
259         adcl    \$0,$arg0d      # keyBitLength==128?3:4
260         jmp     .Ldec_rounds
261 .cfi_endproc
262 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
263 # V2
264 .globl  Camellia_DecryptBlock_Rounds
265 .type   Camellia_DecryptBlock_Rounds,\@function,4
266 .align  16
267 .Ldec_rounds:
268 Camellia_DecryptBlock_Rounds:
269 .cfi_startproc
270         push    %rbx
271 .cfi_push       %rbx
272         push    %rbp
273 .cfi_push       %rbp
274         push    %r13
275 .cfi_push       %r13
276         push    %r14
277 .cfi_push       %r14
278         push    %r15
279 .cfi_push       %r15
280 .Ldec_prologue:
281
282         #mov    %rsi,$inp               # put away arguments
283         mov     %rcx,$out
284         mov     %rdx,$keyend
285
286         shl     \$6,%edi                # process grandRounds
287         lea     .LCamellia_SBOX(%rip),$Tbl
288         lea     ($keyend,%rdi),$key
289
290         mov     0(%rsi),@S[0]           # load plaintext
291         mov     4(%rsi),@S[1]
292         mov     8(%rsi),@S[2]
293         bswap   @S[0]
294         mov     12(%rsi),@S[3]
295         bswap   @S[1]
296         bswap   @S[2]
297         bswap   @S[3]
298
299         call    _x86_64_Camellia_decrypt
300
301         bswap   @S[0]
302         bswap   @S[1]
303         bswap   @S[2]
304         mov     @S[0],0($out)
305         bswap   @S[3]
306         mov     @S[1],4($out)
307         mov     @S[2],8($out)
308         mov     @S[3],12($out)
309
310         mov     0(%rsp),%r15
311 .cfi_restore    %r15
312         mov     8(%rsp),%r14
313 .cfi_restore    %r14
314         mov     16(%rsp),%r13
315 .cfi_restore    %r13
316         mov     24(%rsp),%rbp
317 .cfi_restore    %rbp
318         mov     32(%rsp),%rbx
319 .cfi_restore    %rbx
320         lea     40(%rsp),%rsp
321 .cfi_adjust_cfa_offset  -40
322 .Ldec_epilogue:
323         ret
324 .cfi_endproc
325 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
326
327 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
328 .align  16
329 _x86_64_Camellia_decrypt:
330 .cfi_startproc
331         xor     0($key),@S[1]
332         xor     4($key),@S[0]           # ^=key[0-3]
333         xor     8($key),@S[3]
334         xor     12($key),@S[2]
335 .align  16
336 .Ldloop:
337         mov     -8($key),$t1            # prefetch key[4-5]
338         mov     -4($key),$t0
339
340 ___
341         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
342 $code.=<<___;
343         lea     -16*4($key),$key
344         cmp     $keyend,$key
345         mov     0($key),$t3             # prefetch key[2-3]
346         mov     4($key),$t2
347         je      .Lddone
348
349         and     @S[0],$t0
350         or      @S[3],$t3
351         rol     \$1,$t0
352         xor     $t3,@S[2]               # s2^=s3|key[3];
353         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
354         and     @S[2],$t2
355         or      @S[1],$t1
356         rol     \$1,$t2
357         xor     $t1,@S[0]               # s0^=s1|key[1];
358         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
359
360         jmp     .Ldloop
361
362 .align  16
363 .Lddone:
364         xor     @S[2],$t2
365         xor     @S[3],$t3
366         xor     @S[0],$t0
367         xor     @S[1],$t1
368
369         mov     $t2,@S[0]               # SwapHalf
370         mov     $t3,@S[1]
371         mov     $t0,@S[2]
372         mov     $t1,@S[3]
373
374         .byte   0xf3,0xc3               # rep ret
375 .cfi_endproc
376 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
377 ___
378
# _saveround(RND, KEY, [BIAS,] REGS...)
#
# Appends to $code the stores that write REGS to the key schedule at
# BIAS+RND*8 relative to the address register KEY.
#
#   - four REGS (32-bit words): stored pairwise swapped at +0,+4,+8,+12;
#   - one or two REGS: stored at +0 and +8.
#
# BIAS is optional and is recognised as a leading integer literal.  An
# explicit 0 is accepted as a bias too, rather than being mistaken for a
# register operand.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[+-]?\d+$/)?shift(@T):0;

    if ($#T==3) {
        $code.=<<___;
        mov     $T[1],`$bias+$rnd*8+0`($key)
        mov     $T[0],`$bias+$rnd*8+4`($key)
        mov     $T[3],`$bias+$rnd*8+8`($key)
        mov     $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
        $code.="        mov     $T[0],`$bias+$rnd*8+0`($key)\n";
        $code.="        mov     $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
    }
}
395
# _loadround(RND, KEY, [BIAS,] REG0 [, REG1])
#
# Appends to $code the loads that read the key schedule at BIAS+RND*8
# (and +8 when a second register is given) relative to the address
# register KEY.
#
# BIAS is optional and is recognised as a leading integer literal.  An
# explicit 0 is accepted as a bias too, rather than being mistaken for a
# register operand.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[+-]?\d+$/)?shift(@T):0;

$code.="        mov     `$bias+$rnd*8+0`($key),$T[0]\n";
$code.="        mov     `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
403
404 # shld is very slow on Intel EM64T family. Even on AMD it limits
405 # instruction decode rate [because it's VectorPath] and consequently
406 # performance...
# __rotl128(UPPER, LOWER, BITS)
#
# shld-based variant: appends to $code a left rotation by BITS of the
# 128-bit value held in the UPPER:LOWER register pair.  %r11 is used as
# scratch.  Nothing is emitted when BITS is zero.
sub __rotl128 {
my ($upper,$lower,$bits)=@_;

    if ($bits) {
        $code.=join("\n",
            "        mov     $upper,%r11",
            "        shld    \$$bits,$lower,$upper",
            "        shld    \$$bits,%r11,$lower",
            "");
    }
}
418
419 # ... Implementing 128-bit rotate without shld gives 80% better
420 # performance on EM64T, +15% on AMD64 and only ~7% degradation on
421 # Core2. This is therefore preferred.
# _rotl128(UPPER, LOWER, BITS)
#
# Appends to $code a left rotation by BITS of the 128-bit value held in
# the UPPER:LOWER register pair, built from plain shifts and ORs.  %r9 and
# %r11 are used as scratch.  Nothing is emitted when BITS is zero.
sub _rotl128 {
my ($upper,$lower,$bits)=@_;

    if ($bits) {
        $code.=join("\n",
            "        mov     $upper,%r11",
            "        shl     \$$bits,$upper",
            "        mov     $lower,%r9",
            "        shr     \$`64-$bits`,%r9",
            "        shr     \$`64-$bits`,%r11",
            "        or      %r9,$upper",
            "        shl     \$$bits,$lower",
            "        or      %r11,$lower",
            "");
    }
}
438
439 { my $step=0;
440
441 $code.=<<___;
442 .globl  Camellia_Ekeygen
443 .type   Camellia_Ekeygen,\@function,3
444 .align  16
445 Camellia_Ekeygen:
446 .cfi_startproc
447         push    %rbx
448 .cfi_push       %rbx
449         push    %rbp
450 .cfi_push       %rbp
451         push    %r13
452 .cfi_push       %r13
453         push    %r14
454 .cfi_push       %r14
455         push    %r15
456 .cfi_push       %r15
457 .Lkey_prologue:
458
459         mov     %edi,${keyend}d         # put away arguments, keyBitLength
460         mov     %rdx,$out               # keyTable
461
462         mov     0(%rsi),@S[0]           # load 0-127 bits
463         mov     4(%rsi),@S[1]
464         mov     8(%rsi),@S[2]
465         mov     12(%rsi),@S[3]
466
467         bswap   @S[0]
468         bswap   @S[1]
469         bswap   @S[2]
470         bswap   @S[3]
471 ___
472         &_saveround     (0,$out,@S);    # KL<<<0
473 $code.=<<___;
474         cmp     \$128,$keyend           # check keyBitLength
475         je      .L1st128
476
477         mov     16(%rsi),@S[0]          # load 128-191 bits
478         mov     20(%rsi),@S[1]
479         cmp     \$192,$keyend
480         je      .L1st192
481         mov     24(%rsi),@S[2]          # load 192-255 bits
482         mov     28(%rsi),@S[3]
483         jmp     .L1st256
484 .L1st192:
485         mov     @S[0],@S[2]
486         mov     @S[1],@S[3]
487         not     @S[2]
488         not     @S[3]
489 .L1st256:
490         bswap   @S[0]
491         bswap   @S[1]
492         bswap   @S[2]
493         bswap   @S[3]
494 ___
495         &_saveround     (4,$out,@S);    # temp storage for KR!
496 $code.=<<___;
497         xor     0($out),@S[1]           # KR^KL
498         xor     4($out),@S[0]
499         xor     8($out),@S[3]
500         xor     12($out),@S[2]
501
502 .L1st128:
503         lea     .LCamellia_SIGMA(%rip),$key
504         lea     .LCamellia_SBOX(%rip),$Tbl
505
506         mov     0($key),$t1
507         mov     4($key),$t0
508 ___
509         &Camellia_Feistel($step++);
510         &Camellia_Feistel($step++);
511 $code.=<<___;
512         xor     0($out),@S[1]           # ^KL
513         xor     4($out),@S[0]
514         xor     8($out),@S[3]
515         xor     12($out),@S[2]
516 ___
517         &Camellia_Feistel($step++);
518         &Camellia_Feistel($step++);
519 $code.=<<___;
520         cmp     \$128,$keyend
521         jne     .L2nd256
522
523         lea     128($out),$out          # size optimization
524         shl     \$32,%r8                # @S[0]||
525         shl     \$32,%r10               # @S[2]||
526         or      %r9,%r8                 # ||@S[1]
527         or      %r11,%r10               # ||@S[3]
528 ___
529         &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
530         &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
531         &_rotl128       ("%rax","%rbx",15);
532         &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
533         &_rotl128       ("%r8","%r10",15);
534         &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
535         &_rotl128       ("%r8","%r10",15);              # 15+15=30
536         &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
537         &_rotl128       ("%rax","%rbx",30);             # 15+30=45
538         &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
539         &_rotl128       ("%r8","%r10",15);              # 30+15=45
540         &_saveround     (12,$out,-128,"%r8");           # KA<<<45
541         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
542         &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
543         &_rotl128       ("%r8","%r10",15);              # 45+15=60
544         &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
545         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
546         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
547         &_rotl128       ("%rax","%rbx",17);             # 77+17=94
548         &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
549         &_rotl128       ("%r8","%r10",34);              # 60+34=94
550         &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
551         &_rotl128       ("%rax","%rbx",17);             # 94+17=111
552         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
553         &_rotl128       ("%r8","%r10",17);              # 94+17=111
554         &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
555 $code.=<<___;
556         mov     \$3,%eax
557         jmp     .Ldone
558 .align  16
559 .L2nd256:
560 ___
561         &_saveround     (6,$out,@S);    # temp storage for KA!
562 $code.=<<___;
563         xor     `4*8+0`($out),@S[1]     # KA^KR
564         xor     `4*8+4`($out),@S[0]
565         xor     `5*8+0`($out),@S[3]
566         xor     `5*8+4`($out),@S[2]
567 ___
568         &Camellia_Feistel($step++);
569         &Camellia_Feistel($step++);
570
571         &_loadround     (0,$out,"%rax","%rbx"); # KL
572         &_loadround     (4,$out,"%rcx","%rdx"); # KR
573         &_loadround     (6,$out,"%r14","%r15"); # KA
574 $code.=<<___;
575         lea     128($out),$out          # size optimization
576         shl     \$32,%r8                # @S[0]||
577         shl     \$32,%r10               # @S[2]||
578         or      %r9,%r8                 # ||@S[1]
579         or      %r11,%r10               # ||@S[3]
580 ___
# Subkey stores for the non-128-bit path (.L2nd256).  $out was advanced by
# 128 just above, hence the -128 bias on every store.  Each _rotl128 call
# rotates its register pair in place, so the trailing comments track the
# cumulative rotation of KL, KR, KA and KB.
581         &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
582         &_rotl128       ("%rcx","%rdx",15);
583         &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
584         &_rotl128       ("%r14","%r15",15);
585         &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
586         &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
587         &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
588         &_rotl128       ("%r8","%r10",30);
589         &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
590         &_rotl128       ("%rax","%rbx",45);
591         &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
592         &_rotl128       ("%r14","%r15",30);             # 15+30=45
593         &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
594         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
595         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
596         &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
597         &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
598         &_rotl128       ("%r8","%r10",30);              # 30+30=60
599         &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
600         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
601         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
602         &_rotl128       ("%r14","%r15",32);             # 45+32=77
603         &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
604         &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
605         &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
606         &_rotl128       ("%r14","%r15",17);             # 77+17=94
607         &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
608         &_rotl128       ("%rax","%rbx",34);             # 77+34=111
609         &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
610         &_rotl128       ("%r8","%r10",51);              # 60+51=111
611         &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
612 $code.=<<___;
613         mov     \$4,%eax
614 .Ldone:
615         mov     0(%rsp),%r15
616 .cfi_restore    %r15
617         mov     8(%rsp),%r14
618 .cfi_restore    %r14
619         mov     16(%rsp),%r13
620 .cfi_restore    %r13
621         mov     24(%rsp),%rbp
622 .cfi_restore    %rbp
623         mov     32(%rsp),%rbx
624 .cfi_restore    %rbx
625         lea     40(%rsp),%rsp
626 .cfi_adjust_cfa_offset  -40
627 .Lkey_epilogue:
628         ret
629 .cfi_endproc
630 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
631 ___
632 }
633
634 @SBOX=(
635 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
636  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
637 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
638 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
639 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
640 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
641  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
642 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
643 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
644  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
645 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
646  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
647 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
648 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
649 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
650  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
651
# Table-entry generators.  Each takes a byte index and returns one 32-bit
# table word formatted as "0x%08x".  The digits in the name say which
# byte lanes (most significant first) carry the substituted value; a 0
# lane is left zero.  All four are derived from @SBOX:
#   S1110  @SBOX[i]                      in lanes 3,2,1
#   S4404  @SBOX[i rotated left by 1]    in lanes 3,2,0
#   S0222  @SBOX[i] rotated left by 1    in lanes 2,1,0
#   S3033  @SBOX[i] rotated right by 1   in lanes 3,1,0
sub S1110 {
    my $i=shift;
    my $v=$SBOX[$i];
    return sprintf("0x%08x",$v<<24|$v<<16|$v<<8);
}
sub S4404 {
    my $i=shift;
    my $v=$SBOX[($i<<1|$i>>7)&0xff];
    return sprintf("0x%08x",$v<<24|$v<<16|$v);
}
sub S0222 {
    my $i=shift;
    my $v=$SBOX[$i];
    $v=($v<<1|$v>>7)&0xff;
    return sprintf("0x%08x",$v<<16|$v<<8|$v);
}
sub S3033 {
    my $i=shift;
    my $v=$SBOX[$i];
    $v=($v>>1|$v<<7)&0xff;
    return sprintf("0x%08x",$v<<24|$v<<8|$v);
}
656
657 $code.=<<___;
658 .align  64
659 .LCamellia_SIGMA:
660 .long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
661 .long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
662 .long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
663 .long   0,          0,          0,          0
664 .LCamellia_SBOX:
665 ___
# The tables are emitted interleaved: 256 rows pairing S1110 with S4404,
# followed by 256 rows pairing S0222 with S3033.

# Append one ".long" directive carrying the given words, comma-separated.
sub data_word {
    my @words=@_;
    $code.=".long\t" . join(',',@words) . "\n";
}

foreach my $pair ([\&S1110,\&S4404],[\&S0222,\&S3033]) {
    my ($even,$odd)=@$pair;
    for ($i=0;$i<256;$i++) { data_word($even->($i),$odd->($i)); }
}
670
671 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
672 #                       size_t length, const CAMELLIA_KEY *key,
673 #                       unsigned char *ivp,const int enc);
674 {
675 $_key="0(%rsp)";
676 $_end="8(%rsp)";        # inp+len&~15
677 $_res="16(%rsp)";       # len&15
678 $ivec="24(%rsp)";
679 $_ivp="40(%rsp)";
680 $_rsp="48(%rsp)";
681
682 $code.=<<___;
683 .globl  Camellia_cbc_encrypt
684 .type   Camellia_cbc_encrypt,\@function,6
685 .align  16
686 Camellia_cbc_encrypt:
687 .cfi_startproc
688         cmp     \$0,%rdx
689         je      .Lcbc_abort
690         push    %rbx
691 .cfi_push       %rbx
692         push    %rbp
693 .cfi_push       %rbp
694         push    %r12
695 .cfi_push       %r12
696         push    %r13
697 .cfi_push       %r13
698         push    %r14
699 .cfi_push       %r14
700         push    %r15
701 .cfi_push       %r15
702 .Lcbc_prologue:
703
704         mov     %rsp,%rbp
705 .cfi_def_cfa_register   %rbp
706         sub     \$64,%rsp
707         and     \$-64,%rsp
708
709         # place stack frame just "above mod 1024" the key schedule,
710         # this ensures that cache associativity suffices
711         lea     -64-63(%rcx),%r10
712         sub     %rsp,%r10
713         neg     %r10
714         and     \$0x3C0,%r10
715         sub     %r10,%rsp
716         #add    \$8,%rsp                # 8 is reserved for callee's ra
717
718         mov     %rdi,$inp               # inp argument
719         mov     %rsi,$out               # out argument
720         mov     %r8,%rbx                # ivp argument
721         mov     %rcx,$key               # key argument
722         mov     272(%rcx),${keyend}d    # grandRounds
723
724         mov     %r8,$_ivp
725         mov     %rbp,$_rsp
726 .cfi_cfa_expression     $_rsp,deref,+56
727
728 .Lcbc_body:
729         lea     .LCamellia_SBOX(%rip),$Tbl
730
731         mov     \$32,%ecx
732 .align  4
733 .Lcbc_prefetch_sbox:
734         mov     0($Tbl),%rax
735         mov     32($Tbl),%rsi
736         mov     64($Tbl),%rdi
737         mov     96($Tbl),%r11
738         lea     128($Tbl),$Tbl
739         loop    .Lcbc_prefetch_sbox
740         sub     \$4096,$Tbl
741         shl     \$6,$keyend
742         mov     %rdx,%rcx               # len argument
743         lea     ($key,$keyend),$keyend
744
745         cmp     \$0,%r9d                # enc argument
746         je      .LCBC_DECRYPT
747
748         and     \$-16,%rdx
749         and     \$15,%rcx               # length residue
750         lea     ($inp,%rdx),%rdx
751         mov     $key,$_key
752         mov     %rdx,$_end
753         mov     %rcx,$_res
754
755         cmp     $inp,%rdx
756         mov     0(%rbx),@S[0]           # load IV
757         mov     4(%rbx),@S[1]
758         mov     8(%rbx),@S[2]
759         mov     12(%rbx),@S[3]
760         je      .Lcbc_enc_tail
761         jmp     .Lcbc_eloop
762
763 .align  16
764 .Lcbc_eloop:
765         xor     0($inp),@S[0]
766         xor     4($inp),@S[1]
767         xor     8($inp),@S[2]
768         bswap   @S[0]
769         xor     12($inp),@S[3]
770         bswap   @S[1]
771         bswap   @S[2]
772         bswap   @S[3]
773
774         call    _x86_64_Camellia_encrypt
775
776         mov     $_key,$key              # "rewind" the key
777         bswap   @S[0]
778         mov     $_end,%rdx
779         bswap   @S[1]
780         mov     $_res,%rcx
781         bswap   @S[2]
782         mov     @S[0],0($out)
783         bswap   @S[3]
784         mov     @S[1],4($out)
785         mov     @S[2],8($out)
786         lea     16($inp),$inp
787         mov     @S[3],12($out)
788         cmp     %rdx,$inp
789         lea     16($out),$out
790         jne     .Lcbc_eloop
791
792         cmp     \$0,%rcx
793         jne     .Lcbc_enc_tail
794
795         mov     $_ivp,$out
796         mov     @S[0],0($out)           # write out IV residue
797         mov     @S[1],4($out)
798         mov     @S[2],8($out)
799         mov     @S[3],12($out)
800         jmp     .Lcbc_done
801
802 .align  16
803 .Lcbc_enc_tail:
804         xor     %rax,%rax
805         mov     %rax,0+$ivec
806         mov     %rax,8+$ivec
807         mov     %rax,$_res
808
809 .Lcbc_enc_pushf:
810         pushfq
811         cld
812         mov     $inp,%rsi
813         lea     8+$ivec,%rdi
814         .long   0x9066A4F3              # rep movsb
815         popfq
816 .Lcbc_enc_popf:
817
818         lea     $ivec,$inp
819         lea     16+$ivec,%rax
820         mov     %rax,$_end
821         jmp     .Lcbc_eloop             # one more time
822
823 .align  16
824 .LCBC_DECRYPT:
825         xchg    $key,$keyend
826         add     \$15,%rdx
827         and     \$15,%rcx               # length residue
828         and     \$-16,%rdx
829         mov     $key,$_key
830         lea     ($inp,%rdx),%rdx
831         mov     %rdx,$_end
832         mov     %rcx,$_res
833
834         mov     (%rbx),%rax             # load IV
835         mov     8(%rbx),%rbx
836         jmp     .Lcbc_dloop
837 .align  16
838 .Lcbc_dloop:
839         mov     0($inp),@S[0]
840         mov     4($inp),@S[1]
841         mov     8($inp),@S[2]
842         bswap   @S[0]
843         mov     12($inp),@S[3]
844         bswap   @S[1]
845         mov     %rax,0+$ivec            # save IV to temporary storage
846         bswap   @S[2]
847         mov     %rbx,8+$ivec
848         bswap   @S[3]
849
850         call    _x86_64_Camellia_decrypt
851
852         mov     $_key,$key              # "rewind" the key
853         mov     $_end,%rdx
854         mov     $_res,%rcx
855
856         bswap   @S[0]
857         mov     ($inp),%rax             # load IV for next iteration
858         bswap   @S[1]
859         mov     8($inp),%rbx
860         bswap   @S[2]
861         xor     0+$ivec,@S[0]
862         bswap   @S[3]
863         xor     4+$ivec,@S[1]
864         xor     8+$ivec,@S[2]
865         lea     16($inp),$inp
866         xor     12+$ivec,@S[3]
867         cmp     %rdx,$inp
868         je      .Lcbc_ddone
869
870         mov     @S[0],0($out)
871         mov     @S[1],4($out)
872         mov     @S[2],8($out)
873         mov     @S[3],12($out)
874
875         lea     16($out),$out
876         jmp     .Lcbc_dloop
877
878 .align  16
879 .Lcbc_ddone:
880         mov     $_ivp,%rdx
881         cmp     \$0,%rcx
882         jne     .Lcbc_dec_tail
883
884         mov     @S[0],0($out)
885         mov     @S[1],4($out)
886         mov     @S[2],8($out)
887         mov     @S[3],12($out)
888
889         mov     %rax,(%rdx)             # write out IV residue
890         mov     %rbx,8(%rdx)
891         jmp     .Lcbc_done
892 .align  16
893 .Lcbc_dec_tail:
894         mov     @S[0],0+$ivec
895         mov     @S[1],4+$ivec
896         mov     @S[2],8+$ivec
897         mov     @S[3],12+$ivec
898
899 .Lcbc_dec_pushf:
900         pushfq
901         cld
902         lea     8+$ivec,%rsi
903         lea     ($out),%rdi
904         .long   0x9066A4F3              # rep movsb
905         popfq
906 .Lcbc_dec_popf:
907
908         mov     %rax,(%rdx)             # write out IV residue
909         mov     %rbx,8(%rdx)
910         jmp     .Lcbc_done
911
912 .align  16
913 .Lcbc_done:
914         mov     $_rsp,%rcx
915 .cfi_def_cfa    %rcx,56
916         mov     0(%rcx),%r15
917 .cfi_restore    %r15
918         mov     8(%rcx),%r14
919 .cfi_restore    %r14
920         mov     16(%rcx),%r13
921 .cfi_restore    %r13
922         mov     24(%rcx),%r12
923 .cfi_restore    %r12
924         mov     32(%rcx),%rbp
925 .cfi_restore    %rbp
926         mov     40(%rcx),%rbx
927 .cfi_restore    %rbx
928         lea     48(%rcx),%rsp
929 .cfi_def_cfa    %rsp,8
930 .Lcbc_abort:
931         ret
932 .cfi_endproc
933 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
934
935 .asciz  "Camellia for x86_64 by <appro\@openssl.org>"
936 ___
937 }
938
939 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
940 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# When $win64 is set (it is assigned outside this chunk), append the Win64
# structured-exception-handling support to $code:
#
#   common_se_handler  - handler referenced by the .xdata entries of
#                        Camellia_EncryptBlock_Rounds,
#                        Camellia_DecryptBlock_Rounds and Camellia_Ekeygen.
#                        HandlerData[0] and HandlerData[1] are offsets that
#                        are added to disp->ImageBase to obtain the prologue
#                        and epilogue labels.  If context->Rip lies between
#                        the two, Rbx, Rbp, R13, R14 and R15 are reloaded
#                        from the 40 bytes above context->Rsp and written
#                        back into the CONTEXT.  In every case Rsp, Rsi and
#                        Rdi are updated before jumping to the shared exit.
#   cbc_se_handler     - handler for Camellia_cbc_encrypt.  It compares
#                        context->Rip against .Lcbc_prologue, .Lcbc_body and
#                        .Lcbc_abort, and adds 8 to the frame pointer while
#                        Rip is strictly inside one of the pushfq/popfq
#                        windows (.Lcbc_enc_pushf/.Lcbc_enc_popf and
#                        .Lcbc_dec_pushf/.Lcbc_dec_popf).  It then loads the
#                        saved stack pointer from offset 48 of the frame and
#                        restores Rbx, Rbp and R12-R15 from below it.
#   .Lcommon_seh_exit  - shared tail (placed inside cbc_se_handler): copies
#                        the CONTEXT to disp->ContextRecord, calls
#                        RtlVirtualUnwind with UNW_FLAG_NHANDLER and returns
#                        1 (ExceptionContinueSearch).
#   .pdata / .xdata    - one begin/end/info triple per exported function, and
#                        per-function info records naming the handler.
#                        NOTE(review): ".byte 9,0,0,0" is presumably the
#                        UNWIND_INFO header (version 1 with the exception
#                        handler flag) - confirm against the Windows x64
#                        exception-handling documentation.
#
# $rec, $frame, $context and $disp name the registers that carry the four
# handler arguments listed in the prototype above.  Only $context and $disp
# are interpolated into the text below; $rec and $frame are assigned but not
# referenced in this block.
941 if ($win64) {
942 $rec="%rcx";
943 $frame="%rdx";
944 $context="%r8";
945 $disp="%r9";
946
# The heredoc interpolates $context and $disp.  The back-quoted expression
# `1232/8` is left in the text here and is replaced by its value by the
# substitution that runs just before $code is printed.
947 $code.=<<___;
948 .extern __imp_RtlVirtualUnwind
949 .type   common_se_handler,\@abi-omnipotent
950 .align  16
951 common_se_handler:
952         push    %rsi
953         push    %rdi
954         push    %rbx
955         push    %rbp
956         push    %r12
957         push    %r13
958         push    %r14
959         push    %r15
960         pushfq
961         lea     -64(%rsp),%rsp
962
963         mov     120($context),%rax      # pull context->Rax
964         mov     248($context),%rbx      # pull context->Rip
965
966         mov     8($disp),%rsi           # disp->ImageBase
967         mov     56($disp),%r11          # disp->HandlerData
968
969         mov     0(%r11),%r10d           # HandlerData[0]
970         lea     (%rsi,%r10),%r10        # prologue label
971         cmp     %r10,%rbx               # context->Rip<prologue label
972         jb      .Lin_prologue
973
974         mov     152($context),%rax      # pull context->Rsp
975
976         mov     4(%r11),%r10d           # HandlerData[1]
977         lea     (%rsi,%r10),%r10        # epilogue label
978         cmp     %r10,%rbx               # context->Rip>=epilogue label
979         jae     .Lin_prologue
980
981         lea     40(%rax),%rax
982         mov     -8(%rax),%rbx
983         mov     -16(%rax),%rbp
984         mov     -24(%rax),%r13
985         mov     -32(%rax),%r14
986         mov     -40(%rax),%r15
987         mov     %rbx,144($context)      # restore context->Rbx
988         mov     %rbp,160($context)      # restore context->Rbp
989         mov     %r13,224($context)      # restore context->R13
990         mov     %r14,232($context)      # restore context->R14
991         mov     %r15,240($context)      # restore context->R15
992
993 .Lin_prologue:
994         mov     8(%rax),%rdi
995         mov     16(%rax),%rsi
996         mov     %rax,152($context)      # restore context->Rsp
997         mov     %rsi,168($context)      # restore context->Rsi
998         mov     %rdi,176($context)      # restore context->Rdi
999
1000         jmp     .Lcommon_seh_exit
1001 .size   common_se_handler,.-common_se_handler
1002
1003 .type   cbc_se_handler,\@abi-omnipotent
1004 .align  16
1005 cbc_se_handler:
1006         push    %rsi
1007         push    %rdi
1008         push    %rbx
1009         push    %rbp
1010         push    %r12
1011         push    %r13
1012         push    %r14
1013         push    %r15
1014         pushfq
1015         lea     -64(%rsp),%rsp
1016
1017         mov     120($context),%rax      # pull context->Rax
1018         mov     248($context),%rbx      # pull context->Rip
1019
1020         lea     .Lcbc_prologue(%rip),%r10
1021         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
1022         jb      .Lin_cbc_prologue
1023
1024         lea     .Lcbc_body(%rip),%r10
1025         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
1026         jb      .Lin_cbc_frame_setup
1027
1028         mov     152($context),%rax      # pull context->Rsp
1029
1030         lea     .Lcbc_abort(%rip),%r10
1031         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
1032         jae     .Lin_cbc_prologue
1033
1034         # handle pushf/popf in Camellia_cbc_encrypt
1035         lea     .Lcbc_enc_pushf(%rip),%r10
1036         cmp     %r10,%rbx               # context->Rip<=.Lcbc_enc_pushf
1037         jbe     .Lin_cbc_no_flag
1038         lea     8(%rax),%rax
1039         lea     .Lcbc_enc_popf(%rip),%r10
1040         cmp     %r10,%rbx               # context->Rip<.Lcbc_enc_popf
1041         jb      .Lin_cbc_no_flag
1042         lea     -8(%rax),%rax
1043         lea     .Lcbc_dec_pushf(%rip),%r10
1044         cmp     %r10,%rbx               # context->Rip<=.Lcbc_dec_pushf
1045         jbe     .Lin_cbc_no_flag
1046         lea     8(%rax),%rax
1047         lea     .Lcbc_dec_popf(%rip),%r10
1048         cmp     %r10,%rbx               # context->Rip<.Lcbc_dec_popf
1049         jb      .Lin_cbc_no_flag
1050         lea     -8(%rax),%rax
1051
1052 .Lin_cbc_no_flag:
1053         mov     48(%rax),%rax           # $_rsp
1054         lea     48(%rax),%rax
1055
1056 .Lin_cbc_frame_setup:
1057         mov     -8(%rax),%rbx
1058         mov     -16(%rax),%rbp
1059         mov     -24(%rax),%r12
1060         mov     -32(%rax),%r13
1061         mov     -40(%rax),%r14
1062         mov     -48(%rax),%r15
1063         mov     %rbx,144($context)      # restore context->Rbx
1064         mov     %rbp,160($context)      # restore context->Rbp
1065         mov     %r12,216($context)      # restore context->R12
1066         mov     %r13,224($context)      # restore context->R13
1067         mov     %r14,232($context)      # restore context->R14
1068         mov     %r15,240($context)      # restore context->R15
1069
1070 .Lin_cbc_prologue:
1071         mov     8(%rax),%rdi
1072         mov     16(%rax),%rsi
1073         mov     %rax,152($context)      # restore context->Rsp
1074         mov     %rsi,168($context)      # restore context->Rsi
1075         mov     %rdi,176($context)      # restore context->Rdi
1076
1077 .align  4
1078 .Lcommon_seh_exit:
1079
1080         mov     40($disp),%rdi          # disp->ContextRecord
1081         mov     $context,%rsi           # context
1082         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
1083         .long   0xa548f3fc              # cld; rep movsq
1084
1085         mov     $disp,%rsi
1086         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1087         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1088         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1089         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1090         mov     40(%rsi),%r10           # disp->ContextRecord
1091         lea     56(%rsi),%r11           # &disp->HandlerData
1092         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1093         mov     %r10,32(%rsp)           # arg5
1094         mov     %r11,40(%rsp)           # arg6
1095         mov     %r12,48(%rsp)           # arg7
1096         mov     %rcx,56(%rsp)           # arg8, (NULL)
1097         call    *__imp_RtlVirtualUnwind(%rip)
1098
1099         mov     \$1,%eax                # ExceptionContinueSearch
1100         lea     64(%rsp),%rsp
1101         popfq
1102         pop     %r15
1103         pop     %r14
1104         pop     %r13
1105         pop     %r12
1106         pop     %rbp
1107         pop     %rbx
1108         pop     %rdi
1109         pop     %rsi
1110         ret
1111 .size   cbc_se_handler,.-cbc_se_handler
1112
1113 .section        .pdata
1114 .align  4
1115         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1116         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1117         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1118
1119         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1120         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1121         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1122
1123         .rva    .LSEH_begin_Camellia_Ekeygen
1124         .rva    .LSEH_end_Camellia_Ekeygen
1125         .rva    .LSEH_info_Camellia_Ekeygen
1126
1127         .rva    .LSEH_begin_Camellia_cbc_encrypt
1128         .rva    .LSEH_end_Camellia_cbc_encrypt
1129         .rva    .LSEH_info_Camellia_cbc_encrypt
1130
1131 .section        .xdata
1132 .align  8
1133 .LSEH_info_Camellia_EncryptBlock_Rounds:
1134         .byte   9,0,0,0
1135         .rva    common_se_handler
1136         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1137 .LSEH_info_Camellia_DecryptBlock_Rounds:
1138         .byte   9,0,0,0
1139         .rva    common_se_handler
1140         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1141 .LSEH_info_Camellia_Ekeygen:
1142         .byte   9,0,0,0
1143         .rva    common_se_handler
1144         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1145 .LSEH_info_Camellia_cbc_encrypt:
1146         .byte   9,0,0,0
1147         .rva    cbc_se_handler
1148 ___
1149 }
1150
# Final emission step.  Every back-quoted expression left in $code (for
# example the 1232/8 quadword count in the Win64 handler text) is passed
# through string eval and replaced by its result; the finished text is then
# written to the currently selected output handle.
1151 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1152 print $code;
# close is checked so that a write error which only surfaces when the
# buffer is flushed aborts the run instead of leaving truncated output.
1153 close STDOUT or die "error closing STDOUT: $!";