2 # Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # Software performance improvement over gcc-generated code is ~70% and
22 # in absolute terms is ~73 cycles per byte processed with 128-bit key.
23 # You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
24 # *strictly* in-order execution and issued instruction [in this case
25 # load value from memory is critical] has to complete before execution
26 # flow proceeds. S-boxes are compressed to 2KB[+256B].
28 # As for hardware acceleration support. It's basically a "teaser," as
29 # it can and should be improved in several ways. Most notably support
30 # for CBC is not utilized, nor multiple blocks are ever processed.
31 # Then software key schedule can be postponed till hardware support
32 # detection... Performance improvement over assembler is reportedly
33 # ~2.5x, but can reach >8x [naturally on larger chunks] if proper
34 # support is implemented.
38 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
39 # for 128-bit keys, if hardware support is detected.
43 # Add support for hardware AES192/256 and reschedule instructions to
44 # minimize/avoid Address Generation Interlock hazard and to favour
45 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
46 # almost 50% on z9. The gain is smaller on z10, because being dual-
47 # issue z10 makes it impossible to eliminate the interlock condition:
48 # critical path is not long enough. Yet it spends ~24 cycles per byte
49 # processed with 128-bit key.
51 # Unlike previous version hardware support detection takes place only
52 # at the moment of key schedule setup, which is denoted in key->rounds.
53 # This is done, because deferred key setup can't be made MT-safe, not
54 # for keys longer than 128 bits.
56 # Add AES_cbc_encrypt, which gives incredible performance improvement,
57 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
58 # because software implementation was optimized.
62 # Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
63 # performance improvement over "generic" counter mode routine relying
64 # on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
65 # to the fact that exact throughput value depends on current stack
66 # frame alignment within 4KB page. In worst case you get ~75% of the
67 # maximum, but *on average* it would be as much as ~98%. Meaning that
68 # worst case is unlikely, it's like hitting a ravine on a plateau.
72 # Adapt for -m31 build. If kernel supports what's called "highgprs"
73 # feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
74 # instructions and achieve "64-bit" performance even in 31-bit legacy
75 # application context. The feature is not specific to any particular
76 # processor, as long as it's "z-CPU". Latter implies that the code
77 # remains z/Architecture specific. On z990 it was measured to perform
78 # 2x better than code generated by gcc 4.3.
82 # Add support for z196 "cipher message with counter" instruction.
83 # Note however that it's disengaged, because it was measured to
84 # perform ~12% worse than vanilla km-based code...
88 # Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
89 # instructions, which deliver ~70% improvement at 8KB block size over
90 # vanilla km-based code, 37% - at most like 512-bytes block size.
94 if ($flavour =~ /3[12]/) {
102 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
103 open STDOUT,">$output";
105 $softonly=0; # allow hardware support
107 $t0="%r0"; $mask="%r0";
109 $t2="%r2"; $inp="%r2";
110 $t3="%r3"; $out="%r3"; $bits="%r3";
124 $stdframe=16*$SIZE_T+4*8;
128 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
132 #include "s390x_arch.h"
136 .type AES_Te,\@object
141 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
142 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
143 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
144 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
145 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
146 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
147 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
148 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
149 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
150 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
151 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
152 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
153 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
154 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
155 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
156 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
157 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
158 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
159 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
160 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
161 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
162 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
163 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
164 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
165 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
166 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
167 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
168 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
169 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
170 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
171 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
172 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
173 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
174 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
175 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
176 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
177 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
178 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
179 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
180 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
181 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
182 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
183 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
184 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
185 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
186 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
187 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
188 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
189 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
190 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
191 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
192 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
193 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
194 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
195 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
196 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
197 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
198 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
199 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
200 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
201 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
202 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
203 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
204 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
207 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
208 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
209 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
210 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
211 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
212 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
213 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
214 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
215 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
216 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
217 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
218 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
219 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
220 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
221 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
222 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
223 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
224 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
225 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
226 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
227 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
228 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
229 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
230 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
231 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
232 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
233 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
234 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
235 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
236 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
237 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
238 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
240 .long 0x01000000, 0x02000000, 0x04000000, 0x08000000
241 .long 0x10000000, 0x20000000, 0x40000000, 0x80000000
242 .long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
244 .size AES_Te,.-AES_Te
246 # void AES_encrypt(const unsigned char *inp, unsigned char *out,
247 # const AES_KEY *key) {
249 .type AES_encrypt,\@function
252 $code.=<<___ if (!$softonly);
261 lghi %r3,16 # single block length
262 .long 0xb92e0042 # km %r4,%r2
263 brc 1,.-4 # can this happen?
269 stm${g} %r3,$ra,3*$SIZE_T($sp)
277 bras $ra,_s390x_AES_encrypt
279 l${g} $out,3*$SIZE_T($sp)
285 lm${g} %r6,$ra,6*$SIZE_T($sp)
287 .size AES_encrypt,.-AES_encrypt
289 .type _s390x_AES_encrypt,\@function
292 st${g} $ra,15*$SIZE_T($sp)
298 llill $mask,`0xff<<3`
312 srlg $i1,$s1,`16-3` # i0
321 l $s0,0($s0,$tbl) # Te0[s0>>24]
322 l $t1,1($t1,$tbl) # Te3[s0>>0]
323 l $t2,2($t2,$tbl) # Te2[s0>>8]
324 l $t3,3($t3,$tbl) # Te1[s0>>16]
326 x $s0,3($i1,$tbl) # Te1[s1>>16]
327 l $s1,0($s1,$tbl) # Te0[s1>>24]
328 x $t2,1($i2,$tbl) # Te3[s1>>0]
329 x $t3,2($i3,$tbl) # Te2[s1>>8]
331 srlg $i1,$s2,`8-3` # i0
332 srlg $i2,$s2,`16-3` # i1
341 srlg $ra,$s3,`8-3` # i1
342 sllg $t1,$s3,`0+3` # i0
347 x $s0,2($i1,$tbl) # Te2[s2>>8]
348 x $s1,3($i2,$tbl) # Te1[s2>>16]
349 l $s2,0($s2,$tbl) # Te0[s2>>24]
350 x $t3,1($i3,$tbl) # Te3[s2>>0]
352 srlg $i3,$s3,`16-3` # i2
363 x $s0,1($t1,$tbl) # Te3[s3>>0]
364 x $s1,2($ra,$tbl) # Te2[s3>>8]
365 x $s2,3($i3,$tbl) # Te1[s3>>16]
366 l $s3,0($s3,$tbl) # Te0[s3>>24]
369 brct $rounds,.Lenc_loop
381 srlg $i1,$s1,`16-3` # i0
390 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
391 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
393 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
394 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
398 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
399 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
400 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
401 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
410 srlg $i1,$s2,`8-3` # i0
411 srlg $i2,$s2,`16-3` # i1
419 sllg $t1,$s3,`0+3` # i0
420 srlg $ra,$s3,`8-3` # i1
423 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
424 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
426 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
427 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
436 srlg $i3,$s3,`16-3` # i2
444 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
445 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
446 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
447 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
456 l${g} $ra,15*$SIZE_T($sp)
463 .size _s390x_AES_encrypt,.-_s390x_AES_encrypt
467 .type AES_Td,\@object
472 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
473 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
474 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
475 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
476 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
477 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
478 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
479 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
480 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
481 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
482 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
483 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
484 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
485 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
486 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
487 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
488 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
489 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
490 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
491 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
492 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
493 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
494 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
495 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
496 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
497 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
498 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
499 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
500 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
501 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
502 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
503 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
504 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
505 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
506 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
507 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
508 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
509 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
510 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
511 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
512 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
513 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
514 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
515 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
516 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
517 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
518 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
519 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
520 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
521 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
522 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
523 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
524 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
525 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
526 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
527 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
528 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
529 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
530 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
531 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
532 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
533 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
534 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
535 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
538 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
539 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
540 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
541 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
542 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
543 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
544 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
545 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
546 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
547 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
548 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
549 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
550 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
551 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
552 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
553 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
554 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
555 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
556 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
557 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
558 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
559 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
560 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
561 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
562 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
563 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
564 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
565 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
566 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
567 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
568 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
569 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
570 .size AES_Td,.-AES_Td
572 # void AES_decrypt(const unsigned char *inp, unsigned char *out,
573 # const AES_KEY *key) {
575 .type AES_decrypt,\@function
578 $code.=<<___ if (!$softonly);
587 lghi %r3,16 # single block length
588 .long 0xb92e0042 # km %r4,%r2
589 brc 1,.-4 # can this happen?
595 stm${g} %r3,$ra,3*$SIZE_T($sp)
603 bras $ra,_s390x_AES_decrypt
605 l${g} $out,3*$SIZE_T($sp)
611 lm${g} %r6,$ra,6*$SIZE_T($sp)
613 .size AES_decrypt,.-AES_decrypt
615 .type _s390x_AES_decrypt,\@function
618 st${g} $ra,15*$SIZE_T($sp)
624 llill $mask,`0xff<<3`
638 sllg $i1,$s1,`0+3` # i0
647 l $s0,0($s0,$tbl) # Td0[s0>>24]
648 l $t1,3($t1,$tbl) # Td1[s0>>16]
649 l $t2,2($t2,$tbl) # Td2[s0>>8]
650 l $t3,1($t3,$tbl) # Td3[s0>>0]
652 x $s0,1($i1,$tbl) # Td3[s1>>0]
653 l $s1,0($s1,$tbl) # Td0[s1>>24]
654 x $t2,3($i2,$tbl) # Td1[s1>>16]
655 x $t3,2($i3,$tbl) # Td2[s1>>8]
657 srlg $i1,$s2,`8-3` # i0
658 sllg $i2,$s2,`0+3` # i1
667 srlg $ra,$s3,`8-3` # i1
668 srlg $t1,$s3,`16-3` # i0
673 x $s0,2($i1,$tbl) # Td2[s2>>8]
674 x $s1,1($i2,$tbl) # Td3[s2>>0]
675 l $s2,0($s2,$tbl) # Td0[s2>>24]
676 x $t3,3($i3,$tbl) # Td1[s2>>16]
678 sllg $i3,$s3,`0+3` # i2
689 x $s0,3($t1,$tbl) # Td1[s3>>16]
690 x $s1,2($ra,$tbl) # Td2[s3>>8]
691 x $s2,1($i3,$tbl) # Td3[s3>>0]
692 l $s3,0($s3,$tbl) # Td0[s3>>24]
695 brct $rounds,.Ldec_loop
698 l $t1,`2048+0`($tbl) # prefetch Td4
699 l $t2,`2048+64`($tbl)
700 l $t3,`2048+128`($tbl)
701 l $i1,`2048+192`($tbl)
718 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
719 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
720 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
722 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
726 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
727 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
728 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
730 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
744 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
745 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
746 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
747 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
762 l${g} $ra,15*$SIZE_T($sp)
767 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
768 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
770 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
771 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
785 .size _s390x_AES_decrypt,.-_s390x_AES_decrypt
789 # void AES_set_encrypt_key(const unsigned char *in, int bits,
791 .globl AES_set_encrypt_key
792 .type AES_set_encrypt_key,\@function
795 _s390x_AES_set_encrypt_key:
817 $code.=<<___ if (!$softonly);
818 # convert bits to km(c) code, [128,192,256]->[18,19,20]
825 larl %r1,OPENSSL_s390xcap_P
828 ng %r0,S390X_KM(%r1) # check availability of both km...
829 ng %r0,S390X_KMC(%r1) # ...and kmc support for given key length
832 lmg %r0,%r1,0($inp) # just copy 128 bits...
842 1: st $bits,236($key) # save bits [for debugging purposes]
844 st %r5,240($key) # save km(c) code
851 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
853 larl $tbl,AES_Te+2048
872 llgfr $t2,$s3 # temp=rk[3]
886 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
887 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
888 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
889 icm $t2,1,0($i3) # Te4[rk[3]>>24]
890 x $t2,256($t3,$tbl) # rcon[i]
891 xr $s0,$t2 # rk[4]=rk[0]^...
892 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
893 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
894 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
896 llgfr $t2,$s3 # temp=rk[3]
908 la $key,16($key) # key+=4
910 brct $rounds,.L128_loop
913 lm${g} %r4,%r13,4*$SIZE_T($sp)
945 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
946 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
947 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
948 icm $t1,1,0($i3) # Te4[rk[5]>>24]
949 x $t1,256($t3,$tbl) # rcon[i]
950 xr $s0,$t1 # rk[6]=rk[0]^...
951 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
952 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
953 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
959 brct $rounds,.L192_continue
962 lm${g} %r4,%r13,4*$SIZE_T($sp)
968 x $t1,16($key) # rk[10]=rk[4]^rk[9]
970 x $t1,20($key) # rk[11]=rk[5]^rk[10]
980 la $key,24($key) # key+=6
1009 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1010 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1011 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1012 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1013 x $t1,256($t3,$tbl) # rcon[i]
1014 xr $s0,$t1 # rk[8]=rk[0]^...
1015 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1016 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1017 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1022 brct $rounds,.L256_continue
1025 lm${g} %r4,%r13,4*$SIZE_T($sp)
1030 lgr $t1,$s3 # temp=rk[11]
1041 llgc $t1,0($t1) # Te4[rk[11]>>0]
1042 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1043 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1044 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1045 x $t1,16($key) # rk[12]=rk[4]^...
1047 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1049 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1051 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1061 la $key,32($key) # key+=8
1068 .size AES_set_encrypt_key,.-AES_set_encrypt_key
1070 # void AES_set_decrypt_key(const unsigned char *in, int bits,
1072 .globl AES_set_decrypt_key
1073 .type AES_set_decrypt_key,\@function
1075 AES_set_decrypt_key:
1076 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1077 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1078 bras $ra,_s390x_AES_set_encrypt_key
1079 #l${g} $key,4*$SIZE_T($sp)
1080 l${g} $ra,14*$SIZE_T($sp)
1084 $code.=<<___ if (!$softonly);
1089 oill $t0,S390X_DECRYPT # set "decrypt" bit
1095 .Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1103 .Linv: lmg $s0,$s1,0($i1)
1115 llgf $rounds,240($key)
1117 sll $rounds,2 # (rounds-1)*4
1118 llilh $mask80,0x8080
1119 llilh $mask1b,0x1b1b
1120 llilh $maskfe,0xfefe
1126 .Lmix: l $s0,16($key) # tp1
1154 xr $s1,$s0 # tp2^tp1
1155 xr $s2,$s0 # tp4^tp1
1156 rll $s0,$s0,24 # = ROTATE(tp1,8)
1158 xr $s0,$s1 # ^=tp2^tp1
1159 xr $s1,$s3 # tp2^tp1^tp8
1160 xr $s0,$s2 # ^=tp4^tp1^tp8
1163 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1165 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1166 xr $s0,$s3 # ^= ROTATE(tp8,8)
1172 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1175 .size AES_set_decrypt_key,.-AES_set_decrypt_key
1178 ########################################################################
1179 # void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1180 # size_t length, const AES_KEY *key,
1181 # unsigned char *ivec, const int enc)
1184 my $out="%r4"; # length and out are swapped
1190 .globl AES_cbc_encrypt
1191 .type AES_cbc_encrypt,\@function
1194 xgr %r3,%r4 # flip %r3 and %r4, out and len
1198 $code.=<<___ if (!$softonly);
1203 lg %r0,0($ivp) # copy ivec
1205 stmg %r0,%r1,16($sp)
1206 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1207 stmg %r0,%r1,32($sp)
1208 lmg %r0,%r1,16($key)
1209 stmg %r0,%r1,48($sp)
1210 l %r0,240($key) # load kmc code
1211 lghi $key,15 # res=len%16, len-=res;
1214 la %r1,16($sp) # parameter block - ivec || key
1216 .long 0xb92f0042 # kmc %r4,%r2
1217 brc 1,.-4 # pay attention to "partial completion"
1221 lmg %r0,%r1,16($sp) # copy ivec to caller
1227 ahi $key,-1 # it's the way it's encoded in mvc
1228 tmll %r0,S390X_DECRYPT
1229 jnz .Lkmc_truncated_dec
1231 stg %r1,16*$SIZE_T($sp)
1232 stg %r1,16*$SIZE_T+8($sp)
1234 mvc 16*$SIZE_T(1,$sp),0($inp)
1236 la %r1,16($sp) # restore parameter block
1237 la $inp,16*$SIZE_T($sp)
1239 .long 0xb92f0042 # kmc %r4,%r2
1242 .Lkmc_truncated_dec:
1243 st${g} $out,4*$SIZE_T($sp)
1244 la $out,16*$SIZE_T($sp)
1246 .long 0xb92f0042 # kmc %r4,%r2
1247 l${g} $out,4*$SIZE_T($sp)
1249 mvc 0(1,$out),16*$SIZE_T($sp)
1256 stm${g} $key,$ra,5*$SIZE_T($sp)
1258 cl %r0,`$stdframe+$SIZE_T-4`($sp)
1270 brc 4,.Lcbc_enc_tail # if borrow
1272 stm${g} $inp,$out,2*$SIZE_T($sp)
1279 bras $ra,_s390x_AES_encrypt
1281 lm${g} $inp,$key,2*$SIZE_T($sp)
1293 brc 4,.Lcbc_enc_tail # if borrow
1297 l${g} $ivp,6*$SIZE_T($sp)
1303 lm${g} %r7,$ra,7*$SIZE_T($sp)
1310 stg $t0,16*$SIZE_T($sp)
1311 stg $t0,16*$SIZE_T+8($sp)
1313 mvc 16*$SIZE_T(1,$sp),0($inp)
1316 la $inp,16*$SIZE_T($sp)
1325 stmg $t0,$t1,16*$SIZE_T($sp)
1328 stm${g} $inp,$out,2*$SIZE_T($sp)
1335 bras $ra,_s390x_AES_decrypt
1337 lm${g} $inp,$key,2*$SIZE_T($sp)
1345 xg $s0,16*$SIZE_T($sp)
1346 xg $s2,16*$SIZE_T+8($sp)
1349 brc 4,.Lcbc_dec_tail # if borrow
1350 brc 2,.Lcbc_dec_done # if zero
1353 stmg $t0,$t1,16*$SIZE_T($sp)
1363 lm${g} %r6,$ra,6*$SIZE_T($sp)
1364 stmg $t0,$t1,0($ivp)
1371 stg $s0,16*$SIZE_T($sp)
1372 stg $s2,16*$SIZE_T+8($sp)
1374 mvc 0(1,$out),16*$SIZE_T($sp)
1377 .size AES_cbc_encrypt,.-AES_cbc_encrypt
1380 ########################################################################
1381 # void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1382 # size_t blocks, const AES_KEY *key,
1383 # const unsigned char *ivec)
1386 my $out="%r4"; # blocks and out are swapped
1388 my $key="%r5"; my $iv0="%r5";
1393 .globl AES_ctr32_encrypt
1394 .type AES_ctr32_encrypt,\@function
1397 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1400 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1402 $code.=<<___ if (!$softonly);
1408 st${g} $s2,10*$SIZE_T($sp)
1409 st${g} $s3,11*$SIZE_T($sp)
1411 clr $len,%r1 # does work even in 64-bit mode
1412 jle .Lctr32_nokma # kma is slower for <= 16 blocks
1414 larl %r1,OPENSSL_s390xcap_P
1418 ng $s3,S390X_KMA(%r1) # check kma capability vector
1421 l${g}hi %r1,-$stdframe-112
1423 la $sp,0(%r1,$sp) # prepare parameter block
1427 or %r0,%r1 # set HS and LAAD flags
1429 st${g} $s3,0($sp) # backchain
1430 la %r1,$stdframe($sp)
1432 lmg $s2,$s3,0($key) # copy key
1433 stg $s2,$stdframe+80($sp)
1434 stg $s3,$stdframe+88($sp)
1435 lmg $s2,$s3,16($key)
1436 stg $s2,$stdframe+96($sp)
1437 stg $s3,$stdframe+104($sp)
1439 lmg $s2,$s3,0($ivp) # copy iv
1440 stg $s2,$stdframe+64($sp)
1441 ahi $s3,-1 # kma requires counter-1
1442 stg $s3,$stdframe+72($sp)
1443 st $s3,$stdframe+12($sp) # copy counter
1448 .long 0xb929a042 # kma $out,$s2,$inp
1449 brc 1,.-4 # pay attention to "partial completion"
1451 stg %r0,$stdframe+80($sp) # wipe key
1452 stg %r0,$stdframe+88($sp)
1453 stg %r0,$stdframe+96($sp)
1454 stg %r0,$stdframe+104($sp)
1455 la $sp,$stdframe+112($sp)
1457 lm${g} $s2,$s3,10*$SIZE_T($sp)
1462 stm${g} %r6,$s1,6*$SIZE_T($sp)
1465 la %r1,0($key) # %r1 is permanent copy of $key
1466 lg $iv0,0($ivp) # load ivec
1469 # prepare and allocate stack frame at the top of 4K page
1470 # with 1K reserved for eventual signal handling
1471 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1475 ngr $s0,$s1 # align at page boundary
1476 slgr $fp,$s0 # total buffer size
1478 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1479 slgr $fp,$s1 # deduct reservation to get usable buffer size
1480 # buffer size is at least 256 and at most 3072+256-16
1482 la $sp,1024($s0) # alloca
1483 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1484 st${g} $s2,0($sp) # back-chain
1485 st${g} $fp,$SIZE_T($sp)
1488 brc 1,.Lctr32_hw_switch # not zero, no borrow
1489 algr $fp,$len # input is shorter than allocated buffer
1491 st${g} $fp,$SIZE_T($sp)
1495 $code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
1498 larl %r1,OPENSSL_s390xcap_P
1499 llihh %r0,0x8000 # check if kmctr supports the function code
1501 ng %r0,S390X_KMCTR(%r1) # check kmctr capability vector
1507 algr $out,$inp # restore $out
1508 lgr $s1,$len # $s1 undertakes $len
1509 j .Lctr32_kmctr_loop
1514 .Lctr32_kmctr_prepare:
1518 ahi $ivp,1 # 32-bit increment, preserves upper half
1519 brct $s3,.Lctr32_kmctr_prepare
1521 #la $inp,0($inp) # inp
1522 sllg $len,$fp,4 # len
1523 #la $out,0($out) # out
1525 .long 0xb92da042 # kmctr $out,$s2,$inp
1526 brc 1,.-4 # pay attention to "partial completion"
1529 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1532 brc 4+1,.Lctr32_kmctr_loop # not zero
1535 lm${g} %r6,$s3,6*$SIZE_T($sp)
1539 $code.=<<___ if (!$softonly);
1547 ahi $ivp,1 # 32-bit increment, preserves upper half
1548 brct $s3,.Lctr32_km_prepare
1550 la $s0,16($sp) # inp
1551 sllg $s1,$fp,4 # len
1552 la $s2,16($sp) # out
1553 .long 0xb92e00a8 # km %r10,%r8
1554 brc 1,.-4 # pay attention to "partial completion"
1564 stg $s0,0($out,$inp)
1565 stg $s1,8($out,$inp)
1567 brct $s3,.Lctr32_km_xor
1570 brc 1,.Lctr32_km_loop # not zero, no borrow
1573 brc 4+1,.Lctr32_km_loop # not zero
1576 l${g} $s1,$SIZE_T($sp)
1582 brct $s1,.Lctr32_km_zap
1585 lm${g} %r6,$s3,6*$SIZE_T($sp)
1591 stm${g} $key,$ra,5*$SIZE_T($sp)
1597 stm${g} $inp,$out,2*$SIZE_T($sp)
1602 st $t1,16*$SIZE_T($sp)
1605 bras $ra,_s390x_AES_encrypt
1607 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1608 llgf $t1,16*$SIZE_T($sp)
1616 ahi $t1,1 # 32-bit increment
1617 brct $len,.Lctr32_loop
1619 lm${g} %r6,$ra,6*$SIZE_T($sp)
1621 .size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1625 ########################################################################
1626 # void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
1627 # size_t len, const AES_KEY *key1, const AES_KEY *key2,
1628 # const unsigned char iv[16]);
1632 my $out="%r4"; # len and out are swapped
1634 my $key1="%r5"; # $i1
1635 my $key2="%r6"; # $i2
1637 my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1640 .type _s390x_xts_km,\@function
1645 llgfr $s0,%r0 # put aside the function code
1648 larl %r1,OPENSSL_s390xcap_P
1650 srlg %r0,%r0,32($s1) # check for 32+function code
1651 ng %r0,S390X_KM(%r1) # check km capability vector
1652 lgr %r0,$s0 # restore the function code
1653 la %r1,0($key1) # restore $key1
1656 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1659 oill %r0,32 # switch to xts function code
1661 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1662 la %r1,$tweak-16($sp)
1663 slgr %r1,$s1 # parameter block position
1664 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1665 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1666 # yes, it contains junk and overlaps
1667 # with the tweak in 128-bit case.
1668 # it's done to avoid conditional
1670 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1672 .long 0xb92e0042 # km %r4,%r2
1673 brc 1,.-4 # pay attention to "partial completion"
1675 lrvg $s0,$tweak+0($sp) # load the last tweak
1676 lrvg $s1,$tweak+8($sp)
1677 stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
1679 nill %r0,0xffdf # switch back to original function code
1680 la %r1,0($key1) # restore pointer to $key1
1683 llgc $len,2*$SIZE_T-1($sp)
1684 nill $len,0x0f # $len%=16
1691 # prepare and allocate stack frame at the top of 4K page
1692 # with 1K reserved for eventual signal handling
1693 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1697 ngr $s0,$s1 # align at page boundary
1698 slgr $fp,$s0 # total buffer size
1700 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1701 slgr $fp,$s1 # deduct reservation to get usable buffer size
1702 # buffer size is at least 256 and at most 3072+256-16
1704 la $sp,1024($s0) # alloca
1705 nill $fp,0xfff0 # round to 16*n
1706 st${g} $s2,0($sp) # back-chain
1707 nill $len,0xfff0 # redundant
1708 st${g} $fp,$SIZE_T($sp)
1711 brc 1,.Lxts_km_go # not zero, no borrow
1712 algr $fp,$len # input is shorter than allocated buffer
1714 st${g} $fp,$SIZE_T($sp)
1717 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1718 lrvg $s1,$tweak+8($s2)
1720 la $s2,16($sp) # vector of ascending tweak values
1731 srag $i2,$s1,63 # broadcast upper bit
1737 lrvgr $i1,$s0 # flip byte order
1743 stg $i1,0($out,$inp)
1744 stg $i2,8($out,$inp)
1746 brct $s3,.Lxts_km_prepare
1748 slgr $inp,$fp # rewind $inp
1751 .long 0xb92e00aa # km $s2,$s2
1752 brc 1,.-4 # pay attention to "partial completion"
1762 stg $i1,0($out,$inp)
1763 stg $i2,8($out,$inp)
1765 brct $s3,.Lxts_km_xor
1768 brc 1,.Lxts_km_loop # not zero, no borrow
1771 brc 4+1,.Lxts_km_loop # not zero
1773 l${g} $i1,0($sp) # back-chain
1774 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1781 brct $fp,.Lxts_km_zap
1784 llgc $len,2*$SIZE_T-1($i1)
1785 nill $len,0x0f # $len%=16
1788 # generate one more tweak...
1790 srag $i2,$s1,63 # broadcast upper bit
1796 ltr $len,$len # clear zero flag
1798 .size _s390x_xts_km,.-_s390x_xts_km
1800 .globl AES_xts_encrypt
1801 .type AES_xts_encrypt,\@function
1804 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1808 $code.=<<___ if ($SIZE_T==4);
1812 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1813 srag $len,$len,4 # formally wrong, because it expands
1814 # sign byte, but who can afford asking
1815 # to process more than 2^63-1 bytes?
1816 # I use it, because it sets condition
1818 bcr 8,$ra # abort if zero (i.e. less than 16)
1820 $code.=<<___ if (!$softonly);
1824 jl .Lxts_enc_software
1826 st${g} $ra,5*$SIZE_T($sp)
1827 stm${g} %r6,$s3,6*$SIZE_T($sp)
1829 sllg $len,$len,4 # $len&=~15
1832 # generate the tweak value
1833 l${g} $s3,$stdframe($sp) # pointer to iv
1838 la %r1,0($key2) # $key2 is not needed anymore
1839 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1840 brc 1,.-4 # can this happen?
1843 la %r1,0($key1) # $key1 is not needed anymore
1844 bras $ra,_s390x_xts_km
1845 jz .Lxts_enc_km_done
1847 aghi $inp,-16 # take one step back
1848 la $i3,0($out,$inp) # put aside real $out
1851 llgc $i2,0($out,$inp)
1852 stc $i1,0($out,$inp)
1853 stc $i2,16($out,$inp)
1855 brct $len,.Lxts_enc_km_steal
1859 lrvgr $i1,$s0 # flip byte order
1865 .long 0xb92e00aa # km $s2,$s2
1866 brc 1,.-4 # can this happen?
1867 lrvgr $i1,$s0 # flip byte order
1875 stg $sp,$tweak+0($sp) # wipe tweak
1876 stg $sp,$tweak+8($sp)
1877 l${g} $ra,5*$SIZE_T($sp)
1878 lm${g} %r6,$s3,6*$SIZE_T($sp)
1884 stm${g} %r6,$ra,6*$SIZE_T($sp)
1888 l${g} $s3,$stdframe($sp) # ivp
1889 llgf $s0,0($s3) # load iv
1893 stm${g} %r2,%r5,2*$SIZE_T($sp)
1896 bras $ra,_s390x_AES_encrypt # generate the tweak
1897 lm${g} %r2,%r5,2*$SIZE_T($sp)
1898 stm $s0,$s3,$tweak($sp) # save the tweak
1903 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1904 lrvg $s3,$tweak+8($sp)
1906 srag %r0,$s3,63 # broadcast upper bit
1911 lrvgr $s1,$s1 # flip byte order
1913 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1914 stg $s1,$tweak+0($sp) # save the tweak
1917 stg $s3,$tweak+8($sp)
1919 la $inp,16($inp) # $inp+=16
1921 x $s0,0($inp) # ^=*($inp)
1925 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1927 bras $ra,_s390x_AES_encrypt
1928 lm${g} %r2,%r5,2*$SIZE_T($sp)
1929 x $s0,$tweak+0($sp) # ^=tweak
1932 x $s3,$tweak+12($sp)
1936 st $s3,12($out,$inp)
1937 brct${g} $len,.Lxts_enc_loop
1939 llgc $len,`2*$SIZE_T-1`($sp)
1940 nill $len,0x0f # $len%16
1943 la $i3,0($inp,$out) # put aside real $out
1946 llgc %r1,0($out,$inp)
1947 stc %r0,0($out,$inp)
1948 stc %r1,16($out,$inp)
1950 brct $len,.Lxts_enc_steal
1951 la $out,0($i3) # restore real $out
1953 # generate last tweak...
1954 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1955 lrvg $s3,$tweak+8($sp)
1957 srag %r0,$s3,63 # broadcast upper bit
1962 lrvgr $s1,$s1 # flip byte order
1964 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1965 stg $s1,$tweak+0($sp) # save the tweak
1968 stg $s3,$tweak+8($sp)
1971 x $s0,0($out) # ^=*(inp)|stolen cipher-text; $out points at the block holding the stolen tail here
1975 st${g} $out,4*$SIZE_T($sp)
1977 bras $ra,_s390x_AES_encrypt
1978 l${g} $out,4*$SIZE_T($sp)
1979 x $s0,`$tweak+0`($sp) # ^=tweak
1980 x $s1,`$tweak+4`($sp)
1981 x $s2,`$tweak+8`($sp)
1982 x $s3,`$tweak+12`($sp)
1875 stg $sp,$tweak+0($sp) # wipe tweak
1876 stg $sp,$tweak+8($sp) # fix: was "$twesk" — an undefined Perl variable that interpolates to
	                      # an empty string, emitting "stg $sp,+8($sp)" and leaving the upper
	                      # 8 bytes of the tweak un-wiped on the stack (key-material hygiene bug)
1991 lm${g} %r6,$ra,6*$SIZE_T($sp)
1993 .size AES_xts_encrypt,.-AES_xts_encrypt
1995 # void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
1996 # size_t len, const AES_KEY *key1, const AES_KEY *key2,
1997 # const unsigned char iv[16]);
2000 .globl AES_xts_decrypt
2001 .type AES_xts_decrypt,\@function
2004 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
2008 $code.=<<___ if ($SIZE_T==4);
2012 st${g} $len,1*$SIZE_T($sp) # save copy of $len
2014 bcr 4,$ra # abort if less than zero. formally
2015 # wrong, because $len is unsigned,
2016 # but who can afford asking to
2017 # process more than 2^63-1 bytes?
2019 jnz .Lxts_dec_proceed
2023 $code.=<<___ if (!$softonly);
2027 jl .Lxts_dec_software
2029 st${g} $ra,5*$SIZE_T($sp)
2030 stm${g} %r6,$s3,6*$SIZE_T($sp)
2032 nill $len,0xfff0 # $len&=~15
2035 # generate the tweak value
2036 l${g} $s3,$stdframe($sp) # pointer to iv
2041 la %r1,0($key2) # $key2 is not needed past this point
2042 .long 0xb92e00aa # km $s2,$s2, generate the tweak
2043 brc 1,.-4 # can this happen?
2046 la %r1,0($key1) # $key1 is not needed anymore
2049 jz .Lxts_dec_km_short
2050 bras $ra,_s390x_xts_km
2051 jz .Lxts_dec_km_done
2053 lrvgr $s2,$s0 # make copy in reverse byte order
2055 j .Lxts_dec_km_2ndtweak
2058 llgc $len,`2*$SIZE_T-1`($sp)
2059 nill $len,0x0f # $len%=16
2060 lrvg $s0,$tweak+0($sp) # load the tweak
2061 lrvg $s1,$tweak+8($sp)
2062 lrvgr $s2,$s0 # make copy in reverse byte order
2065 .Lxts_dec_km_2ndtweak:
2067 srag $i2,$s1,63 # broadcast upper bit
2072 lrvgr $i1,$s0 # flip byte order
2077 stg $i1,0($out,$inp)
2078 stg $i2,8($out,$inp)
2081 .long 0xb92e0066 # km $i2,$i2
2082 brc 1,.-4 # can this happen?
2087 stg $i1,0($out,$inp)
2088 stg $i2,8($out,$inp)
2090 la $i3,0($out,$inp) # put aside real $out
2093 llgc $i2,0($out,$inp)
2094 stc $i1,0($out,$inp)
2095 stc $i2,16($out,$inp)
2097 brct $len,.Lxts_dec_km_steal
2107 .long 0xb92e0088 # km $s0,$s0
2108 brc 1,.-4 # can this happen?
2114 stg $sp,$tweak+0($sp) # wipe tweak
2115 stg $sp,$tweak+8($sp)
2116 l${g} $ra,5*$SIZE_T($sp)
2117 lm${g} %r6,$s3,6*$SIZE_T($sp)
2123 stm${g} %r6,$ra,6*$SIZE_T($sp)
2128 l${g} $s3,$stdframe($sp) # ivp
2129 llgf $s0,0($s3) # load iv
2133 stm${g} %r2,%r5,2*$SIZE_T($sp)
2136 bras $ra,_s390x_AES_encrypt # generate the tweak
2137 lm${g} %r2,%r5,2*$SIZE_T($sp)
2140 stm $s0,$s3,$tweak($sp) # save the tweak
2146 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2147 lrvg $s3,$tweak+8($sp)
2149 srag %r0,$s3,63 # broadcast upper bit
2154 lrvgr $s1,$s1 # flip byte order
2156 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2157 stg $s1,$tweak+0($sp) # save the tweak
2160 stg $s3,$tweak+8($sp)
2163 x $s0,0($inp) # tweak^=*(inp)
2167 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2169 bras $ra,_s390x_AES_decrypt
2170 lm${g} %r2,%r5,2*$SIZE_T($sp)
2171 x $s0,$tweak+0($sp) # ^=tweak
2174 x $s3,$tweak+12($sp)
2178 st $s3,12($out,$inp)
2180 brct${g} $len,.Lxts_dec_loop
2182 llgc $len,`2*$SIZE_T-1`($sp)
2183 nill $len,0x0f # $len%16
2186 # generate pair of tweaks...
2187 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2188 lrvg $s3,$tweak+8($sp)
2190 srag %r0,$s3,63 # broadcast upper bit
2195 lrvgr $i2,$s1 # flip byte order
2197 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2198 j .Lxts_dec_2ndtweak
2202 llgc $len,`2*$SIZE_T-1`($sp)
2203 nill $len,0x0f # $len%16
2204 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2205 lrvg $s3,$tweak+8($sp)
2208 srag %r0,$s3,63 # broadcast upper bit
2213 lrvgr $s1,$s1 # flip byte order
2215 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2216 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2219 stg $s3,$tweak-16+8($sp)
2222 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2226 stm${g} %r2,%r3,2*$SIZE_T($sp)
2228 bras $ra,_s390x_AES_decrypt
2229 lm${g} %r2,%r5,2*$SIZE_T($sp)
2230 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2231 x $s1,$tweak-16+4($sp)
2232 x $s2,$tweak-16+8($sp)
2233 x $s3,$tweak-16+12($sp)
2237 st $s3,12($out,$inp)
2239 la $i3,0($out,$inp) # put aside real $out
2242 llgc %r1,0($out,$inp)
2243 stc %r0,0($out,$inp)
2244 stc %r1,16($out,$inp)
2246 brct $len,.Lxts_dec_steal
2247 la $out,0($i3) # restore real $out
2249 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2250 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2254 st${g} $out,4*$SIZE_T($sp)
2256 bras $ra,_s390x_AES_decrypt
2257 l${g} $out,4*$SIZE_T($sp)
2258 x $s0,$tweak+0($sp) # ^=tweak
2261 x $s3,$tweak+12($sp)
2266 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2267 stg $sp,$tweak-16+8($sp)
2269 stg $sp,$tweak+0($sp) # wipe tweak
2270 stg $sp,$tweak+8($sp) # fix: was "$twesk" — an undefined Perl variable that interpolates to
	                      # an empty string, emitting "stg $sp,+8($sp)" and leaving the upper
	                      # 8 bytes of the tweak un-wiped on the stack (key-material hygiene bug)
2271 lm${g} %r6,$ra,6*$SIZE_T($sp)
2273 .size AES_xts_decrypt,.-AES_xts_decrypt
2277 .string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2280 $code =~ s/\`([^\`]*)\`/eval $1/gem;
2282 close STDOUT; # force flush