crypto/openssl/crypto/bn/asm/ppc.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2004-2019 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 # Implemented as a Perl wrapper as we want to support several different
  10 # architectures with a single file. We pick up the target based on the
  11 # file name we are asked to generate.
  12 #
  13 # It should be noted though that this perl code is nothing like
  14 # <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
  15 # as pre-processor to cover for platform differences in name decoration,
  16 # linker tables, 32-/64-bit instruction sets...
  17 #
  18 # As you might know there are several PowerPC ABIs in use. Most notably
  19 # Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
  20 # are similar enough to implement leaf(!) functions, which would be ABI
  21 # neutral. And that's what you find here: ABI neutral leaf functions.
  22 # In case you wonder what that is...
  23 #
  24 #       AIX performance
  25 #
  26 #       MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
  27 #
  28 #       The following is the performance of 32-bit compiler
  29 #       generated code:
  30 #
  31 #       OpenSSL 0.9.6c 21 dec 2001
  32 #       built on: Tue Jun 11 11:06:51 EDT 2002
  33 #       options:bn(64,32) ...
  34 #compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
  35 #                  sign    verify    sign/s verify/s
  36 #rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
  37 #rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
  38 #rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
  39 #rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
  40 #dsa  512 bits   0.0087s   0.0106s    114.3     94.5
  41 #dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
  42 #
  43 #       Same benchmark with this assembler code:
  44 #
  45 #rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
  46 #rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
  47 #rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
  48 #rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
  49 #dsa  512 bits   0.0052s   0.0062s    191.6    162.0
  50 #dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
  51 #
  52 #       Number of operations per second increases by almost 75%
  53 #
  54 #       Here are performance numbers for 64-bit compiler
  55 #       generated code:
  56 #
  57 #       OpenSSL 0.9.6g [engine] 9 Aug 2002
  58 #       built on: Fri Apr 18 16:59:20 EDT 2003
  59 #       options:bn(64,64) ...
  60 #       compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
  61 #                  sign    verify    sign/s verify/s
  62 #rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
  63 #rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
  64 #rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
  65 #rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
  66 #dsa  512 bits   0.0026s   0.0032s    382.5    313.7
  67 #dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
  68 #
  69 #       Same benchmark with this assembler code:
  70 #
  71 #rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
  72 #rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
  73 #rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
  74 #rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
  75 #dsa  512 bits   0.0016s   0.0020s    610.7    507.1
  76 #dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
  77 #
  78 #       Again, performance increases by about 75%
  79 #
  80 #       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
  81 #       OpenSSL 0.9.7c 30 Sep 2003
  82 #
  83 #       Original code.
  84 #
  85 #rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
  86 #rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
  87 #rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
  88 #rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
  89 #dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
  90 #dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
  91 #dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
  92 #
  93 #       Same benchmark with this assembler code:
  94 #
  95 #rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
  96 #rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
  97 #rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
  98 #rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
  99 #dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
 100 #dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
 101 #dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
 102 #
 103 #        Performance increase of ~60%
 104 #        Based on submission from Suresh N. Chari of IBM
 105
  106 $flavour = shift;         # target flavour (1st command-line argument); selects the 32- or 64-bit mnemonic set below
  107
  108 if ($flavour =~ /32/) {   # checked first: a name containing 32 gets the 32-bit set even if it also contains 64
  109         $BITS=  32;       # word size in bits
  110         $BNSZ=  $BITS/8;  # word size in bytes; scales the N*$BNSZ load/store offsets in the assembly below
  111         $ISA=   "\"ppc\"";        # ISA name, double quotes included in the value
  112
  113         $LD=    "lwz";          # load
  114         $LDU=   "lwzu";         # load and update
  115         $ST=    "stw";          # store
  116         $STU=   "stwu";         # store and update
  117         $UMULL= "mullw";        # unsigned multiply low
  118         $UMULH= "mulhwu";       # unsigned multiply high
  119         $UDIV=  "divwu";        # unsigned divide
  120         $UCMPI= "cmplwi";       # unsigned compare with immediate
  121         $UCMP=  "cmplw";        # unsigned compare
  122         $CNTLZ= "cntlzw";       # count leading zeros
  123         $SHL=   "slw";          # shift left
  124         $SHR=   "srw";          # unsigned shift right
  125         $SHRI=  "srwi";         # unsigned shift right by immediate
  126         $SHLI=  "slwi";         # shift left by immediate
  127         $CLRU=  "clrlwi";       # clear upper bits
  128         $INSR=  "insrwi";       # insert right
  129         $ROTL=  "rotlwi";       # rotate left by immediate
  130         $TR=    "tw";           # conditional trap
  131 } elsif ($flavour =~ /64/) {      # 64-bit target
  132         $BITS=  64;       # word size in bits
  133         $BNSZ=  $BITS/8;  # word size in bytes; scales the N*$BNSZ load/store offsets in the assembly below
  134         $ISA=   "\"ppc64\"";      # ISA name, double quotes included in the value
  135
  136         # same as above, but 64-bit mnemonics...
  137         $LD=    "ld";           # load
  138         $LDU=   "ldu";          # load and update
  139         $ST=    "std";          # store
  140         $STU=   "stdu";         # store and update
  141         $UMULL= "mulld";        # unsigned multiply low
  142         $UMULH= "mulhdu";       # unsigned multiply high
  143         $UDIV=  "divdu";        # unsigned divide
  144         $UCMPI= "cmpldi";       # unsigned compare with immediate
  145         $UCMP=  "cmpld";        # unsigned compare
  146         $CNTLZ= "cntlzd";       # count leading zeros
  147         $SHL=   "sld";          # shift left
  148         $SHR=   "srd";          # unsigned shift right
  149         $SHRI=  "srdi";         # unsigned shift right by immediate
  150         $SHLI=  "sldi";         # shift left by immediate
  151         $CLRU=  "clrldi";       # clear upper bits
  152         $INSR=  "insrdi";       # insert right
  153         $ROTL=  "rotldi";       # rotate left by immediate
  154         $TR=    "td";           # conditional trap
  155 } else { die "nonsense $flavour"; }       # flavour names neither 32 nor 64: refuse to generate anything
 156
 157 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 158 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 159 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 160 die "can't locate ppc-xlate.pl";
 161
 162 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 163
 164 $data=<<EOF;
 165 #--------------------------------------------------------------------
 166 #
 167 #
 168 #
 169 #
 170 #       File:           ppc32.s
 171 #
 172 #       Created by:     Suresh Chari
 173 #                       IBM Thomas J. Watson Research Library
 174 #                       Hawthorne, NY
 175 #
 176 #
 177 #       Description:    Optimized assembly routines for OpenSSL crypto
 178 #                       on the 32 bitPowerPC platform.
 179 #
 180 #
 181 #       Version History
 182 #
 183 #       2. Fixed bn_add,bn_sub and bn_div_words, added comments,
 184 #          cleaned up code. Also made a single version which can
 185 #          be used for both the AIX and Linux compilers. See NOTE
 186 #          below.
 187 #                               12/05/03                Suresh Chari
 188 #                       (with lots of help from)        Andy Polyakov
 189 ##
 190 #       1. Initial version      10/20/02                Suresh Chari
 191 #
 192 #
 193 #       The following file works for the xlc,cc
 194 #       and gcc compilers.
 195 #
 196 #       NOTE:   To get the file to link correctly with the gcc compiler
 197 #               you have to change the names of the routines and remove
 198 #               the first .(dot) character. This should automatically
 199 #               be done in the build process.
 200 #
 201 #       Hand optimized assembly code for the following routines
 202 #
 203 #       bn_sqr_comba4
 204 #       bn_sqr_comba8
 205 #       bn_mul_comba4
 206 #       bn_mul_comba8
 207 #       bn_sub_words
 208 #       bn_add_words
 209 #       bn_div_words
 210 #       bn_sqr_words
 211 #       bn_mul_words
 212 #       bn_mul_add_words
 213 #
 214 #       NOTE:   It is possible to optimize this code more for
 215 #       specific PowerPC or Power architectures. On the Northstar
 216 #       architecture the optimizations in this file do
 217 #        NOT provide much improvement.
 218 #
 219 #       If you have comments or suggestions to improve code send
 220 #       me a note at schari\@us.ibm.com
 221 #
 222 #--------------------------------------------------------------------------
 223 #
 224 #       Defines to be used in the assembly code.
 225 #
 226 #.set r0,0      # we use it as storage for value of 0
 227 #.set SP,1      # preserved
 228 #.set RTOC,2    # preserved
 229 #.set r3,3      # 1st argument/return value
 230 #.set r4,4      # 2nd argument/volatile register
 231 #.set r5,5      # 3rd argument/volatile register
 232 #.set r6,6      # ...
 233 #.set r7,7
 234 #.set r8,8
 235 #.set r9,9
 236 #.set r10,10
 237 #.set r11,11
 238 #.set r12,12
 239 #.set r13,13    # not used, nor any other "below" it...
 240
 241 #       Declare function names to be global
 242 #       NOTE:   For gcc these names MUST be changed to remove
 243 #               the first . i.e. for example change ".bn_sqr_comba4"
 244 #               to "bn_sqr_comba4". This should be automatically done
 245 #               in the build.
 246
 247         .globl  .bn_sqr_comba4
 248         .globl  .bn_sqr_comba8
 249         .globl  .bn_mul_comba4
 250         .globl  .bn_mul_comba8
 251         .globl  .bn_sub_words
 252         .globl  .bn_add_words
 253         .globl  .bn_div_words
 254         .globl  .bn_sqr_words
 255         .globl  .bn_mul_words
 256         .globl  .bn_mul_add_words
 257
 258 # .text section
 259
 260         .machine        "any"
 261         .text
 262
 263 #
 264 #       NOTE:   The following label name should be changed to
 265 #               "bn_sqr_comba4" i.e. remove the first dot
 266 #               for the gcc compiler. This should be automatically
 267 #               done in the build
 268 #
 269
 270 .align  4
 271 .bn_sqr_comba4:
 272 #
 273 # Optimized version of bn_sqr_comba4.
 274 #
 275 # void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 276 # r3 contains r
 277 # r4 contains a
 278 #
 279 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
 280 #
 281 # r5,r6 are the two BN_ULONGs being multiplied.
 282 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
 283 # r9,r10, r11 are the equivalents of c1,c2, c3.
 284 # Here's the assembly
 285 #
 286 #
 287         xor             r0,r0,r0                # set r0 = 0. Used in the addze
 288                                                 # instructions below
 289
 290                                                 #sqr_add_c(a,0,c1,c2,c3)
 291         $LD             r5,`0*$BNSZ`(r4)
 292         $UMULL          r9,r5,r5
 293         $UMULH          r10,r5,r5               #in first iteration. No need
 294                                                 #to add since c1=c2=c3=0.
 295                                                 # Note c3(r11) is NOT set to 0
 296                                                 # but will be.
 297
 298         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
 299                                                 # sqr_add_c2(a,1,0,c2,c3,c1);
 300         $LD             r6,`1*$BNSZ`(r4)
 301         $UMULL          r7,r5,r6
 302         $UMULH          r8,r5,r6
 303
 304         addc            r7,r7,r7                # compute (r7,r8)=2*(r7,r8)
 305         adde            r8,r8,r8
 306         addze           r9,r0                   # catch carry if any.
 307                                                 # r9= r0(=0) and carry
 308
 309         addc            r10,r7,r10              # now add to temp result.
 310         addze           r11,r8                  # r8 added to r11 which is 0
 311         addze           r9,r9
 312
 313         $ST             r10,`1*$BNSZ`(r3)       #r[1]=c2;
 314                                                 #sqr_add_c(a,1,c3,c1,c2)
 315         $UMULL          r7,r6,r6
 316         $UMULH          r8,r6,r6
 317         addc            r11,r7,r11
 318         adde            r9,r8,r9
 319         addze           r10,r0
 320                                                 #sqr_add_c2(a,2,0,c3,c1,c2)
 321         $LD             r6,`2*$BNSZ`(r4)
 322         $UMULL          r7,r5,r6
 323         $UMULH          r8,r5,r6
 324
 325         addc            r7,r7,r7
 326         adde            r8,r8,r8
 327         addze           r10,r10
 328
 329         addc            r11,r7,r11
 330         adde            r9,r8,r9
 331         addze           r10,r10
 332         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3
 333                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
 334         $LD             r6,`3*$BNSZ`(r4)
 335         $UMULL          r7,r5,r6
 336         $UMULH          r8,r5,r6
 337         addc            r7,r7,r7
 338         adde            r8,r8,r8
 339         addze           r11,r0
 340
 341         addc            r9,r7,r9
 342         adde            r10,r8,r10
 343         addze           r11,r11
 344                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
 345         $LD             r5,`1*$BNSZ`(r4)
 346         $LD             r6,`2*$BNSZ`(r4)
 347         $UMULL          r7,r5,r6
 348         $UMULH          r8,r5,r6
 349
 350         addc            r7,r7,r7
 351         adde            r8,r8,r8
 352         addze           r11,r11
 353         addc            r9,r7,r9
 354         adde            r10,r8,r10
 355         addze           r11,r11
 356         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1
 357                                                 #sqr_add_c(a,2,c2,c3,c1);
 358         $UMULL          r7,r6,r6
 359         $UMULH          r8,r6,r6
 360         addc            r10,r7,r10
 361         adde            r11,r8,r11
 362         addze           r9,r0
 363                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
 364         $LD             r6,`3*$BNSZ`(r4)
 365         $UMULL          r7,r5,r6
 366         $UMULH          r8,r5,r6
 367         addc            r7,r7,r7
 368         adde            r8,r8,r8
 369         addze           r9,r9
 370
 371         addc            r10,r7,r10
 372         adde            r11,r8,r11
 373         addze           r9,r9
 374         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2
 375                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
 376         $LD             r5,`2*$BNSZ`(r4)
 377         $UMULL          r7,r5,r6
 378         $UMULH          r8,r5,r6
 379         addc            r7,r7,r7
 380         adde            r8,r8,r8
 381         addze           r10,r0
 382
 383         addc            r11,r7,r11
 384         adde            r9,r8,r9
 385         addze           r10,r10
 386         $ST             r11,`5*$BNSZ`(r3)       #r[5] = c3
 387                                                 #sqr_add_c(a,3,c1,c2,c3);
 388         $UMULL          r7,r6,r6
 389         $UMULH          r8,r6,r6
 390         addc            r9,r7,r9
 391         adde            r10,r8,r10
 392
 393         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1
 394         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2
 395         blr
 396         .long   0
 397         .byte   0,12,0x14,0,0,0,2,0
 398         .long   0
 399 .size   .bn_sqr_comba4,.-.bn_sqr_comba4
 400
 401 #
 402 #       NOTE:   The following label name should be changed to
 403 #               "bn_sqr_comba8" i.e. remove the first dot
 404 #               for the gcc compiler. This should be automatically
 405 #               done in the build
 406 #
 407
 408 .align  4
 409 .bn_sqr_comba8:
 410 #
 411 # This is an optimized version of the bn_sqr_comba8 routine.
 412 # Tightly uses the adde instruction
 413 #
 414 #
 415 # void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 416 # r3 contains r
 417 # r4 contains a
 418 #
 419 # Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
 420 #
 421 # r5,r6 are the two BN_ULONGs being multiplied.
 422 # r7,r8 are the results of the 32x32 giving 64 bit multiply.
 423 # r9,r10, r11 are the equivalents of c1,c2, c3.
 424 #
 425 # Possible optimization of loading all 8 longs of a into registers
 426 # doesn't provide any speedup
 427 #
 428
 429         xor             r0,r0,r0                #set r0 = 0.Used in addze
 430                                                 #instructions below.
 431
 432                                                 #sqr_add_c(a,0,c1,c2,c3);
 433         $LD             r5,`0*$BNSZ`(r4)
 434         $UMULL          r9,r5,r5                #1st iteration: no carries.
 435         $UMULH          r10,r5,r5
 436         $ST             r9,`0*$BNSZ`(r3)        # r[0]=c1;
 437                                                 #sqr_add_c2(a,1,0,c2,c3,c1);
 438         $LD             r6,`1*$BNSZ`(r4)
 439         $UMULL          r7,r5,r6
 440         $UMULH          r8,r5,r6
 441
 442         addc            r10,r7,r10              #add the two register number
 443         adde            r11,r8,r0               # (r8,r7) to the three register
 444         addze           r9,r0                   # number (r9,r11,r10).NOTE:r0=0
 445
 446         addc            r10,r7,r10              #add the two register number
 447         adde            r11,r8,r11              # (r8,r7) to the three register
 448         addze           r9,r9                   # number (r9,r11,r10).
 449
 450         $ST             r10,`1*$BNSZ`(r3)       # r[1]=c2
 451
 452                                                 #sqr_add_c(a,1,c3,c1,c2);
 453         $UMULL          r7,r6,r6
 454         $UMULH          r8,r6,r6
 455         addc            r11,r7,r11
 456         adde            r9,r8,r9
 457         addze           r10,r0
 458                                                 #sqr_add_c2(a,2,0,c3,c1,c2);
 459         $LD             r6,`2*$BNSZ`(r4)
 460         $UMULL          r7,r5,r6
 461         $UMULH          r8,r5,r6
 462
 463         addc            r11,r7,r11
 464         adde            r9,r8,r9
 465         addze           r10,r10
 466
 467         addc            r11,r7,r11
 468         adde            r9,r8,r9
 469         addze           r10,r10
 470
 471         $ST             r11,`2*$BNSZ`(r3)       #r[2]=c3
 472                                                 #sqr_add_c2(a,3,0,c1,c2,c3);
 473         $LD             r6,`3*$BNSZ`(r4)        #r6 = a[3]. r5 is already a[0].
 474         $UMULL          r7,r5,r6
 475         $UMULH          r8,r5,r6
 476
 477         addc            r9,r7,r9
 478         adde            r10,r8,r10
 479         addze           r11,r0
 480
 481         addc            r9,r7,r9
 482         adde            r10,r8,r10
 483         addze           r11,r11
 484                                                 #sqr_add_c2(a,2,1,c1,c2,c3);
 485         $LD             r5,`1*$BNSZ`(r4)
 486         $LD             r6,`2*$BNSZ`(r4)
 487         $UMULL          r7,r5,r6
 488         $UMULH          r8,r5,r6
 489
 490         addc            r9,r7,r9
 491         adde            r10,r8,r10
 492         addze           r11,r11
 493
 494         addc            r9,r7,r9
 495         adde            r10,r8,r10
 496         addze           r11,r11
 497
 498         $ST             r9,`3*$BNSZ`(r3)        #r[3]=c1;
 499                                                 #sqr_add_c(a,2,c2,c3,c1);
 500         $UMULL          r7,r6,r6
 501         $UMULH          r8,r6,r6
 502
 503         addc            r10,r7,r10
 504         adde            r11,r8,r11
 505         addze           r9,r0
 506                                                 #sqr_add_c2(a,3,1,c2,c3,c1);
 507         $LD             r6,`3*$BNSZ`(r4)
 508         $UMULL          r7,r5,r6
 509         $UMULH          r8,r5,r6
 510
 511         addc            r10,r7,r10
 512         adde            r11,r8,r11
 513         addze           r9,r9
 514
 515         addc            r10,r7,r10
 516         adde            r11,r8,r11
 517         addze           r9,r9
 518                                                 #sqr_add_c2(a,4,0,c2,c3,c1);
 519         $LD             r5,`0*$BNSZ`(r4)
 520         $LD             r6,`4*$BNSZ`(r4)
 521         $UMULL          r7,r5,r6
 522         $UMULH          r8,r5,r6
 523
 524         addc            r10,r7,r10
 525         adde            r11,r8,r11
 526         addze           r9,r9
 527
 528         addc            r10,r7,r10
 529         adde            r11,r8,r11
 530         addze           r9,r9
 531         $ST             r10,`4*$BNSZ`(r3)       #r[4]=c2;
 532                                                 #sqr_add_c2(a,5,0,c3,c1,c2);
 533         $LD             r6,`5*$BNSZ`(r4)
 534         $UMULL          r7,r5,r6
 535         $UMULH          r8,r5,r6
 536
 537         addc            r11,r7,r11
 538         adde            r9,r8,r9
 539         addze           r10,r0
 540
 541         addc            r11,r7,r11
 542         adde            r9,r8,r9
 543         addze           r10,r10
 544                                                 #sqr_add_c2(a,4,1,c3,c1,c2);
 545         $LD             r5,`1*$BNSZ`(r4)
 546         $LD             r6,`4*$BNSZ`(r4)
 547         $UMULL          r7,r5,r6
 548         $UMULH          r8,r5,r6
 549
 550         addc            r11,r7,r11
 551         adde            r9,r8,r9
 552         addze           r10,r10
 553
 554         addc            r11,r7,r11
 555         adde            r9,r8,r9
 556         addze           r10,r10
 557                                                 #sqr_add_c2(a,3,2,c3,c1,c2);
 558         $LD             r5,`2*$BNSZ`(r4)
 559         $LD             r6,`3*$BNSZ`(r4)
 560         $UMULL          r7,r5,r6
 561         $UMULH          r8,r5,r6
 562
 563         addc            r11,r7,r11
 564         adde            r9,r8,r9
 565         addze           r10,r10
 566
 567         addc            r11,r7,r11
 568         adde            r9,r8,r9
 569         addze           r10,r10
 570         $ST             r11,`5*$BNSZ`(r3)       #r[5]=c3;
 571                                                 #sqr_add_c(a,3,c1,c2,c3);
 572         $UMULL          r7,r6,r6
 573         $UMULH          r8,r6,r6
 574         addc            r9,r7,r9
 575         adde            r10,r8,r10
 576         addze           r11,r0
 577                                                 #sqr_add_c2(a,4,2,c1,c2,c3);
 578         $LD             r6,`4*$BNSZ`(r4)
 579         $UMULL          r7,r5,r6
 580         $UMULH          r8,r5,r6
 581
 582         addc            r9,r7,r9
 583         adde            r10,r8,r10
 584         addze           r11,r11
 585
 586         addc            r9,r7,r9
 587         adde            r10,r8,r10
 588         addze           r11,r11
 589                                                 #sqr_add_c2(a,5,1,c1,c2,c3);
 590         $LD             r5,`1*$BNSZ`(r4)
 591         $LD             r6,`5*$BNSZ`(r4)
 592         $UMULL          r7,r5,r6
 593         $UMULH          r8,r5,r6
 594
 595         addc            r9,r7,r9
 596         adde            r10,r8,r10
 597         addze           r11,r11
 598
 599         addc            r9,r7,r9
 600         adde            r10,r8,r10
 601         addze           r11,r11
 602                                                 #sqr_add_c2(a,6,0,c1,c2,c3);
 603         $LD             r5,`0*$BNSZ`(r4)
 604         $LD             r6,`6*$BNSZ`(r4)
 605         $UMULL          r7,r5,r6
 606         $UMULH          r8,r5,r6
 607         addc            r9,r7,r9
 608         adde            r10,r8,r10
 609         addze           r11,r11
 610         addc            r9,r7,r9
 611         adde            r10,r8,r10
 612         addze           r11,r11
 613         $ST             r9,`6*$BNSZ`(r3)        #r[6]=c1;
 614                                                 #sqr_add_c2(a,7,0,c2,c3,c1);
 615         $LD             r6,`7*$BNSZ`(r4)
 616         $UMULL          r7,r5,r6
 617         $UMULH          r8,r5,r6
 618
 619         addc            r10,r7,r10
 620         adde            r11,r8,r11
 621         addze           r9,r0
 622         addc            r10,r7,r10
 623         adde            r11,r8,r11
 624         addze           r9,r9
 625                                                 #sqr_add_c2(a,6,1,c2,c3,c1);
 626         $LD             r5,`1*$BNSZ`(r4)
 627         $LD             r6,`6*$BNSZ`(r4)
 628         $UMULL          r7,r5,r6
 629         $UMULH          r8,r5,r6
 630
 631         addc            r10,r7,r10
 632         adde            r11,r8,r11
 633         addze           r9,r9
 634         addc            r10,r7,r10
 635         adde            r11,r8,r11
 636         addze           r9,r9
 637                                                 #sqr_add_c2(a,5,2,c2,c3,c1);
 638         $LD             r5,`2*$BNSZ`(r4)
 639         $LD             r6,`5*$BNSZ`(r4)
 640         $UMULL          r7,r5,r6
 641         $UMULH          r8,r5,r6
 642         addc            r10,r7,r10
 643         adde            r11,r8,r11
 644         addze           r9,r9
 645         addc            r10,r7,r10
 646         adde            r11,r8,r11
 647         addze           r9,r9
 648                                                 #sqr_add_c2(a,4,3,c2,c3,c1);
 649         $LD             r5,`3*$BNSZ`(r4)
 650         $LD             r6,`4*$BNSZ`(r4)
 651         $UMULL          r7,r5,r6
 652         $UMULH          r8,r5,r6
 653
 654         addc            r10,r7,r10
 655         adde            r11,r8,r11
 656         addze           r9,r9
 657         addc            r10,r7,r10
 658         adde            r11,r8,r11
 659         addze           r9,r9
 660         $ST             r10,`7*$BNSZ`(r3)       #r[7]=c2;
 661                                                 #sqr_add_c(a,4,c3,c1,c2);
 662         $UMULL          r7,r6,r6
 663         $UMULH          r8,r6,r6
 664         addc            r11,r7,r11
 665         adde            r9,r8,r9
 666         addze           r10,r0
 667                                                 #sqr_add_c2(a,5,3,c3,c1,c2);
 668         $LD             r6,`5*$BNSZ`(r4)
 669         $UMULL          r7,r5,r6
 670         $UMULH          r8,r5,r6
 671         addc            r11,r7,r11
 672         adde            r9,r8,r9
 673         addze           r10,r10
 674         addc            r11,r7,r11
 675         adde            r9,r8,r9
 676         addze           r10,r10
 677                                                 #sqr_add_c2(a,6,2,c3,c1,c2);
 678         $LD             r5,`2*$BNSZ`(r4)
 679         $LD             r6,`6*$BNSZ`(r4)
 680         $UMULL          r7,r5,r6
 681         $UMULH          r8,r5,r6
 682         addc            r11,r7,r11
 683         adde            r9,r8,r9
 684         addze           r10,r10
 685
 686         addc            r11,r7,r11
 687         adde            r9,r8,r9
 688         addze           r10,r10
 689                                                 #sqr_add_c2(a,7,1,c3,c1,c2);
 690         $LD             r5,`1*$BNSZ`(r4)
 691         $LD             r6,`7*$BNSZ`(r4)
 692         $UMULL          r7,r5,r6
 693         $UMULH          r8,r5,r6
 694         addc            r11,r7,r11
 695         adde            r9,r8,r9
 696         addze           r10,r10
 697         addc            r11,r7,r11
 698         adde            r9,r8,r9
 699         addze           r10,r10
 700         $ST             r11,`8*$BNSZ`(r3)       #r[8]=c3;
 701                                                 #sqr_add_c2(a,7,2,c1,c2,c3);
 702         $LD             r5,`2*$BNSZ`(r4)
 703         $UMULL          r7,r5,r6
 704         $UMULH          r8,r5,r6
 705
 706         addc            r9,r7,r9
 707         adde            r10,r8,r10
 708         addze           r11,r0
 709         addc            r9,r7,r9
 710         adde            r10,r8,r10
 711         addze           r11,r11
 712                                                 #sqr_add_c2(a,6,3,c1,c2,c3);
 713         $LD             r5,`3*$BNSZ`(r4)
 714         $LD             r6,`6*$BNSZ`(r4)
 715         $UMULL          r7,r5,r6
 716         $UMULH          r8,r5,r6
 717         addc            r9,r7,r9
 718         adde            r10,r8,r10
 719         addze           r11,r11
 720         addc            r9,r7,r9
 721         adde            r10,r8,r10
 722         addze           r11,r11
 723                                                 #sqr_add_c2(a,5,4,c1,c2,c3);
 724         $LD             r5,`4*$BNSZ`(r4)
 725         $LD             r6,`5*$BNSZ`(r4)
 726         $UMULL          r7,r5,r6
 727         $UMULH          r8,r5,r6
 728         addc            r9,r7,r9
 729         adde            r10,r8,r10
 730         addze           r11,r11
 731         addc            r9,r7,r9
 732         adde            r10,r8,r10
 733         addze           r11,r11
 734         $ST             r9,`9*$BNSZ`(r3)        #r[9]=c1;
 735                                                 #sqr_add_c(a,5,c2,c3,c1);
 736         $UMULL          r7,r6,r6
 737         $UMULH          r8,r6,r6
 738         addc            r10,r7,r10
 739         adde            r11,r8,r11
 740         addze           r9,r0
 741                                                 #sqr_add_c2(a,6,4,c2,c3,c1);
 742         $LD             r6,`6*$BNSZ`(r4)
 743         $UMULL          r7,r5,r6
 744         $UMULH          r8,r5,r6
 745         addc            r10,r7,r10
 746         adde            r11,r8,r11
 747         addze           r9,r9
 748         addc            r10,r7,r10
 749         adde            r11,r8,r11
 750         addze           r9,r9
 751                                                 #sqr_add_c2(a,7,3,c2,c3,c1);
 752         $LD             r5,`3*$BNSZ`(r4)
 753         $LD             r6,`7*$BNSZ`(r4)
 754         $UMULL          r7,r5,r6
 755         $UMULH          r8,r5,r6
 756         addc            r10,r7,r10
 757         adde            r11,r8,r11
 758         addze           r9,r9
 759         addc            r10,r7,r10
 760         adde            r11,r8,r11
 761         addze           r9,r9
 762         $ST             r10,`10*$BNSZ`(r3)      #r[10]=c2;
 763                                                 #sqr_add_c2(a,7,4,c3,c1,c2);
 764         $LD             r5,`4*$BNSZ`(r4)
 765         $UMULL          r7,r5,r6
 766         $UMULH          r8,r5,r6
 767         addc            r11,r7,r11
 768         adde            r9,r8,r9
 769         addze           r10,r0
 770         addc            r11,r7,r11
 771         adde            r9,r8,r9
 772         addze           r10,r10
 773                                                 #sqr_add_c2(a,6,5,c3,c1,c2);
 774         $LD             r5,`5*$BNSZ`(r4)
 775         $LD             r6,`6*$BNSZ`(r4)
 776         $UMULL          r7,r5,r6
 777         $UMULH          r8,r5,r6
 778         addc            r11,r7,r11
 779         adde            r9,r8,r9
 780         addze           r10,r10
 781         addc            r11,r7,r11
 782         adde            r9,r8,r9
 783         addze           r10,r10
 784         $ST             r11,`11*$BNSZ`(r3)      #r[11]=c3;
 785                                                 #sqr_add_c(a,6,c1,c2,c3);
 786         $UMULL          r7,r6,r6
 787         $UMULH          r8,r6,r6
 788         addc            r9,r7,r9
 789         adde            r10,r8,r10
 790         addze           r11,r0
 791                                                 #sqr_add_c2(a,7,5,c1,c2,c3)
 792         $LD             r6,`7*$BNSZ`(r4)
 793         $UMULL          r7,r5,r6
 794         $UMULH          r8,r5,r6
 795         addc            r9,r7,r9
 796         adde            r10,r8,r10
 797         addze           r11,r11
 798         addc            r9,r7,r9
 799         adde            r10,r8,r10
 800         addze           r11,r11
 801         $ST             r9,`12*$BNSZ`(r3)       #r[12]=c1;
 802
 803                                                 #sqr_add_c2(a,7,6,c2,c3,c1)
 804         $LD             r5,`6*$BNSZ`(r4)
 805         $UMULL          r7,r5,r6
 806         $UMULH          r8,r5,r6
 807         addc            r10,r7,r10
 808         adde            r11,r8,r11
 809         addze           r9,r0
 810         addc            r10,r7,r10
 811         adde            r11,r8,r11
 812         addze           r9,r9
 813         $ST             r10,`13*$BNSZ`(r3)      #r[13]=c2;
 814                                                 #sqr_add_c(a,7,c3,c1,c2);
 815         $UMULL          r7,r6,r6
 816         $UMULH          r8,r6,r6
 817         addc            r11,r7,r11
 818         adde            r9,r8,r9
 819         $ST             r11,`14*$BNSZ`(r3)      #r[14]=c3;
 820         $ST             r9, `15*$BNSZ`(r3)      #r[15]=c1;
 821
 822
 823         blr
 824         .long   0
 825         .byte   0,12,0x14,0,0,0,2,0
 826         .long   0
 827 .size   .bn_sqr_comba8,.-.bn_sqr_comba8
 828
 829 #
 830 #       NOTE:   The following label name should be changed to
 831 #               "bn_mul_comba4" i.e. remove the first dot
 832 #               for the gcc compiler. This should be automatically
 833 #               done in the build
 834 #
 835
 836 .align  4
 837 .bn_mul_comba4:
 838 # Column-wise (comba) multiply of two 4-word numbers: r[0..7] = a[0..3] * b[0..3].
 839 # This is an optimized, fully unrolled version of the bn_mul_comba4 routine.
 840 # For each k every product a[i]*b[j] with i+j == k is accumulated, then r[k] is stored.
 841 # void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 842 # r3 contains r (words 0..7 are written)
 843 # r4 contains a (words 0..3 are read)
 844 # r5 contains b (words 0..3 are read)
 845 # r6, r7 are the 2 BN_ULONGs being multiplied.
 846 # r8, r9 are the low (UMULL) and high (UMULH) words of their product.
 847 # r10, r11, r12 are the equivalents of c1, c2, and c3; the roles rotate after every stored word.
 848 # r0 is held at zero so that addze can start a fresh carry word.
 849         xor     r0,r0,r0                #r0=0. Used in addze below.
 850                                         #mul_add_c(a[0],b[0],c1,c2,c3);
 851         $LD     r6,`0*$BNSZ`(r4)
 852         $LD     r7,`0*$BNSZ`(r5)
 853         $UMULL  r10,r6,r7
 854         $UMULH  r11,r6,r7
 855         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1
 856                                         #mul_add_c(a[0],b[1],c2,c3,c1);
 857         $LD     r7,`1*$BNSZ`(r5)
 858         $UMULL  r8,r6,r7
 859         $UMULH  r9,r6,r7
 860         addc    r11,r8,r11
 861         adde    r12,r9,r0               #c3 = high word + carry (r12 holds nothing yet)
 862         addze   r10,r0                  #c1 = 0 + carry, i.e. a fresh third word
 863                                         #mul_add_c(a[1],b[0],c2,c3,c1);
 864         $LD     r6, `1*$BNSZ`(r4)
 865         $LD     r7, `0*$BNSZ`(r5)
 866         $UMULL  r8,r6,r7
 867         $UMULH  r9,r6,r7
 868         addc    r11,r8,r11
 869         adde    r12,r9,r12
 870         addze   r10,r10
 871         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2
 872                                         #mul_add_c(a[2],b[0],c3,c1,c2);
 873         $LD     r6,`2*$BNSZ`(r4)        #r7 still holds b[0]
 874         $UMULL  r8,r6,r7
 875         $UMULH  r9,r6,r7
 876         addc    r12,r8,r12
 877         adde    r10,r9,r10
 878         addze   r11,r0
 879                                         #mul_add_c(a[1],b[1],c3,c1,c2);
 880         $LD     r6,`1*$BNSZ`(r4)
 881         $LD     r7,`1*$BNSZ`(r5)
 882         $UMULL  r8,r6,r7
 883         $UMULH  r9,r6,r7
 884         addc    r12,r8,r12
 885         adde    r10,r9,r10
 886         addze   r11,r11
 887                                         #mul_add_c(a[0],b[2],c3,c1,c2);
 888         $LD     r6,`0*$BNSZ`(r4)
 889         $LD     r7,`2*$BNSZ`(r5)
 890         $UMULL  r8,r6,r7
 891         $UMULH  r9,r6,r7
 892         addc    r12,r8,r12
 893         adde    r10,r9,r10
 894         addze   r11,r11
 895         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3
 896                                         #mul_add_c(a[0],b[3],c1,c2,c3);
 897         $LD     r7,`3*$BNSZ`(r5)        #r6 still holds a[0]
 898         $UMULL  r8,r6,r7
 899         $UMULH  r9,r6,r7
 900         addc    r10,r8,r10
 901         adde    r11,r9,r11
 902         addze   r12,r0
 903                                         #mul_add_c(a[1],b[2],c1,c2,c3);
 904         $LD     r6,`1*$BNSZ`(r4)
 905         $LD     r7,`2*$BNSZ`(r5)
 906         $UMULL  r8,r6,r7
 907         $UMULH  r9,r6,r7
 908         addc    r10,r8,r10
 909         adde    r11,r9,r11
 910         addze   r12,r12
 911                                         #mul_add_c(a[2],b[1],c1,c2,c3);
 912         $LD     r6,`2*$BNSZ`(r4)
 913         $LD     r7,`1*$BNSZ`(r5)
 914         $UMULL  r8,r6,r7
 915         $UMULH  r9,r6,r7
 916         addc    r10,r8,r10
 917         adde    r11,r9,r11
 918         addze   r12,r12
 919                                         #mul_add_c(a[3],b[0],c1,c2,c3);
 920         $LD     r6,`3*$BNSZ`(r4)
 921         $LD     r7,`0*$BNSZ`(r5)
 922         $UMULL  r8,r6,r7
 923         $UMULH  r9,r6,r7
 924         addc    r10,r8,r10
 925         adde    r11,r9,r11
 926         addze   r12,r12
 927         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1
 928                                         #mul_add_c(a[3],b[1],c2,c3,c1);
 929         $LD     r7,`1*$BNSZ`(r5)        #r6 still holds a[3]
 930         $UMULL  r8,r6,r7
 931         $UMULH  r9,r6,r7
 932         addc    r11,r8,r11
 933         adde    r12,r9,r12
 934         addze   r10,r0
 935                                         #mul_add_c(a[2],b[2],c2,c3,c1);
 936         $LD     r6,`2*$BNSZ`(r4)
 937         $LD     r7,`2*$BNSZ`(r5)
 938         $UMULL  r8,r6,r7
 939         $UMULH  r9,r6,r7
 940         addc    r11,r8,r11
 941         adde    r12,r9,r12
 942         addze   r10,r10
 943                                         #mul_add_c(a[1],b[3],c2,c3,c1);
 944         $LD     r6,`1*$BNSZ`(r4)
 945         $LD     r7,`3*$BNSZ`(r5)
 946         $UMULL  r8,r6,r7
 947         $UMULH  r9,r6,r7
 948         addc    r11,r8,r11
 949         adde    r12,r9,r12
 950         addze   r10,r10
 951         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2
 952                                         #mul_add_c(a[2],b[3],c3,c1,c2);
 953         $LD     r6,`2*$BNSZ`(r4)        #r7 still holds b[3]
 954         $UMULL  r8,r6,r7
 955         $UMULH  r9,r6,r7
 956         addc    r12,r8,r12
 957         adde    r10,r9,r10
 958         addze   r11,r0
 959                                         #mul_add_c(a[3],b[2],c3,c1,c2);
 960         $LD     r6,`3*$BNSZ`(r4)
 961         $LD     r7,`2*$BNSZ`(r5)
 962         $UMULL  r8,r6,r7
 963         $UMULH  r9,r6,r7
 964         addc    r12,r8,r12
 965         adde    r10,r9,r10
 966         addze   r11,r11
 967         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3
 968                                         #mul_add_c(a[3],b[3],c1,c2,c3);
 969         $LD     r7,`3*$BNSZ`(r5)        #r6 still holds a[3]
 970         $UMULL  r8,r6,r7
 971         $UMULH  r9,r6,r7
 972         addc    r10,r8,r10
 973         adde    r11,r9,r11
 974                                         #last column: no third word is kept, the result ends at r[7]
 975         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1
 976         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2
 977         blr
 978         .long   0
 979         .byte   0,12,0x14,0,0,0,3,0     # NOTE(review): presumably a traceback table, 3 looks like the argument count -- confirm
 980         .long   0
 981 .size   .bn_mul_comba4,.-.bn_mul_comba4
 982
 983 #
 984 #       NOTE:   The following label name should be changed to
 985 #               "bn_mul_comba8" i.e. remove the first dot
 986 #               for the gcc compiler. This should be automatically
 987 #               done in the build
 988 #
 989
 990 .align  4
 991 .bn_mul_comba8:
 992 # Column-wise (comba) multiply of two 8-word numbers: r[0..15] = a[0..7] * b[0..7].
 993 # Optimized, fully unrolled version of the bn_mul_comba8 routine.
 994 # For each k every product a[i]*b[j] with i+j == k is accumulated, then r[k] is stored.
 995 # void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 996 # r3 contains r (words 0..15 are written)
 997 # r4 contains a (words 0..7 are read)
 998 # r5 contains b (words 0..7 are read)
 999 # r6, r7 are the 2 BN_ULONGs being multiplied.
1000 # r8, r9 are the low (UMULL) and high (UMULH) words of their product.
1001 # r10, r11, r12 are the equivalents of c1, c2, and c3; the roles rotate after every stored word.
1002 # r0 is held at zero so that addze can start a fresh carry word.
1003         xor     r0,r0,r0                #r0=0. Used in addze below.
1004
1005                                         #mul_add_c(a[0],b[0],c1,c2,c3);
1006         $LD     r6,`0*$BNSZ`(r4)        #a[0]
1007         $LD     r7,`0*$BNSZ`(r5)        #b[0]
1008         $UMULL  r10,r6,r7
1009         $UMULH  r11,r6,r7
1010         $ST     r10,`0*$BNSZ`(r3)       #r[0]=c1;
1011                                         #mul_add_c(a[0],b[1],c2,c3,c1);
1012         $LD     r7,`1*$BNSZ`(r5)
1013         $UMULL  r8,r6,r7
1014         $UMULH  r9,r6,r7
1015         addc    r11,r11,r8
1016         addze   r12,r9                  # c3 = high word + carry; r12 was never zeroed, so it is not an addend here.
1017         addze   r10,r0                  # c1 = 0 + carry, i.e. a fresh third word
1018                                         #mul_add_c(a[1],b[0],c2,c3,c1);
1019         $LD     r6,`1*$BNSZ`(r4)
1020         $LD     r7,`0*$BNSZ`(r5)
1021         $UMULL  r8,r6,r7
1022         $UMULH  r9,r6,r7
1023         addc    r11,r11,r8
1024         adde    r12,r12,r9
1025         addze   r10,r10
1026         $ST     r11,`1*$BNSZ`(r3)       #r[1]=c2;
1027                                         #mul_add_c(a[2],b[0],c3,c1,c2);
1028         $LD     r6,`2*$BNSZ`(r4)        #r7 still holds b[0]
1029         $UMULL  r8,r6,r7
1030         $UMULH  r9,r6,r7
1031         addc    r12,r12,r8
1032         adde    r10,r10,r9
1033         addze   r11,r0
1034                                         #mul_add_c(a[1],b[1],c3,c1,c2);
1035         $LD     r6,`1*$BNSZ`(r4)
1036         $LD     r7,`1*$BNSZ`(r5)
1037         $UMULL  r8,r6,r7
1038         $UMULH  r9,r6,r7
1039         addc    r12,r12,r8
1040         adde    r10,r10,r9
1041         addze   r11,r11
1042                                         #mul_add_c(a[0],b[2],c3,c1,c2);
1043         $LD     r6,`0*$BNSZ`(r4)
1044         $LD     r7,`2*$BNSZ`(r5)
1045         $UMULL  r8,r6,r7
1046         $UMULH  r9,r6,r7
1047         addc    r12,r12,r8
1048         adde    r10,r10,r9
1049         addze   r11,r11
1050         $ST     r12,`2*$BNSZ`(r3)       #r[2]=c3;
1051                                         #mul_add_c(a[0],b[3],c1,c2,c3);
1052         $LD     r7,`3*$BNSZ`(r5)        #r6 still holds a[0]
1053         $UMULL  r8,r6,r7
1054         $UMULH  r9,r6,r7
1055         addc    r10,r10,r8
1056         adde    r11,r11,r9
1057         addze   r12,r0
1058                                         #mul_add_c(a[1],b[2],c1,c2,c3);
1059         $LD     r6,`1*$BNSZ`(r4)
1060         $LD     r7,`2*$BNSZ`(r5)
1061         $UMULL  r8,r6,r7
1062         $UMULH  r9,r6,r7
1063         addc    r10,r10,r8
1064         adde    r11,r11,r9
1065         addze   r12,r12
1066
1067                                         #mul_add_c(a[2],b[1],c1,c2,c3);
1068         $LD     r6,`2*$BNSZ`(r4)
1069         $LD     r7,`1*$BNSZ`(r5)
1070         $UMULL  r8,r6,r7
1071         $UMULH  r9,r6,r7
1072         addc    r10,r10,r8
1073         adde    r11,r11,r9
1074         addze   r12,r12
1075                                         #mul_add_c(a[3],b[0],c1,c2,c3);
1076         $LD     r6,`3*$BNSZ`(r4)
1077         $LD     r7,`0*$BNSZ`(r5)
1078         $UMULL  r8,r6,r7
1079         $UMULH  r9,r6,r7
1080         addc    r10,r10,r8
1081         adde    r11,r11,r9
1082         addze   r12,r12
1083         $ST     r10,`3*$BNSZ`(r3)       #r[3]=c1;
1084                                         #mul_add_c(a[4],b[0],c2,c3,c1);
1085         $LD     r6,`4*$BNSZ`(r4)        #r7 still holds b[0]
1086         $UMULL  r8,r6,r7
1087         $UMULH  r9,r6,r7
1088         addc    r11,r11,r8
1089         adde    r12,r12,r9
1090         addze   r10,r0
1091                                         #mul_add_c(a[3],b[1],c2,c3,c1);
1092         $LD     r6,`3*$BNSZ`(r4)
1093         $LD     r7,`1*$BNSZ`(r5)
1094         $UMULL  r8,r6,r7
1095         $UMULH  r9,r6,r7
1096         addc    r11,r11,r8
1097         adde    r12,r12,r9
1098         addze   r10,r10
1099                                         #mul_add_c(a[2],b[2],c2,c3,c1);
1100         $LD     r6,`2*$BNSZ`(r4)
1101         $LD     r7,`2*$BNSZ`(r5)
1102         $UMULL  r8,r6,r7
1103         $UMULH  r9,r6,r7
1104         addc    r11,r11,r8
1105         adde    r12,r12,r9
1106         addze   r10,r10
1107                                         #mul_add_c(a[1],b[3],c2,c3,c1);
1108         $LD     r6,`1*$BNSZ`(r4)
1109         $LD     r7,`3*$BNSZ`(r5)
1110         $UMULL  r8,r6,r7
1111         $UMULH  r9,r6,r7
1112         addc    r11,r11,r8
1113         adde    r12,r12,r9
1114         addze   r10,r10
1115                                         #mul_add_c(a[0],b[4],c2,c3,c1);
1116         $LD     r6,`0*$BNSZ`(r4)
1117         $LD     r7,`4*$BNSZ`(r5)
1118         $UMULL  r8,r6,r7
1119         $UMULH  r9,r6,r7
1120         addc    r11,r11,r8
1121         adde    r12,r12,r9
1122         addze   r10,r10
1123         $ST     r11,`4*$BNSZ`(r3)       #r[4]=c2;
1124                                         #mul_add_c(a[0],b[5],c3,c1,c2);
1125         $LD     r7,`5*$BNSZ`(r5)        #r6 still holds a[0]
1126         $UMULL  r8,r6,r7
1127         $UMULH  r9,r6,r7
1128         addc    r12,r12,r8
1129         adde    r10,r10,r9
1130         addze   r11,r0
1131                                         #mul_add_c(a[1],b[4],c3,c1,c2);
1132         $LD     r6,`1*$BNSZ`(r4)
1133         $LD     r7,`4*$BNSZ`(r5)
1134         $UMULL  r8,r6,r7
1135         $UMULH  r9,r6,r7
1136         addc    r12,r12,r8
1137         adde    r10,r10,r9
1138         addze   r11,r11
1139                                         #mul_add_c(a[2],b[3],c3,c1,c2);
1140         $LD     r6,`2*$BNSZ`(r4)
1141         $LD     r7,`3*$BNSZ`(r5)
1142         $UMULL  r8,r6,r7
1143         $UMULH  r9,r6,r7
1144         addc    r12,r12,r8
1145         adde    r10,r10,r9
1146         addze   r11,r11
1147                                         #mul_add_c(a[3],b[2],c3,c1,c2);
1148         $LD     r6,`3*$BNSZ`(r4)
1149         $LD     r7,`2*$BNSZ`(r5)
1150         $UMULL  r8,r6,r7
1151         $UMULH  r9,r6,r7
1152         addc    r12,r12,r8
1153         adde    r10,r10,r9
1154         addze   r11,r11
1155                                         #mul_add_c(a[4],b[1],c3,c1,c2);
1156         $LD     r6,`4*$BNSZ`(r4)
1157         $LD     r7,`1*$BNSZ`(r5)
1158         $UMULL  r8,r6,r7
1159         $UMULH  r9,r6,r7
1160         addc    r12,r12,r8
1161         adde    r10,r10,r9
1162         addze   r11,r11
1163                                         #mul_add_c(a[5],b[0],c3,c1,c2);
1164         $LD     r6,`5*$BNSZ`(r4)
1165         $LD     r7,`0*$BNSZ`(r5)
1166         $UMULL  r8,r6,r7
1167         $UMULH  r9,r6,r7
1168         addc    r12,r12,r8
1169         adde    r10,r10,r9
1170         addze   r11,r11
1171         $ST     r12,`5*$BNSZ`(r3)       #r[5]=c3;
1172                                         #mul_add_c(a[6],b[0],c1,c2,c3);
1173         $LD     r6,`6*$BNSZ`(r4)        #r7 still holds b[0]
1174         $UMULL  r8,r6,r7
1175         $UMULH  r9,r6,r7
1176         addc    r10,r10,r8
1177         adde    r11,r11,r9
1178         addze   r12,r0
1179                                         #mul_add_c(a[5],b[1],c1,c2,c3);
1180         $LD     r6,`5*$BNSZ`(r4)
1181         $LD     r7,`1*$BNSZ`(r5)
1182         $UMULL  r8,r6,r7
1183         $UMULH  r9,r6,r7
1184         addc    r10,r10,r8
1185         adde    r11,r11,r9
1186         addze   r12,r12
1187                                         #mul_add_c(a[4],b[2],c1,c2,c3);
1188         $LD     r6,`4*$BNSZ`(r4)
1189         $LD     r7,`2*$BNSZ`(r5)
1190         $UMULL  r8,r6,r7
1191         $UMULH  r9,r6,r7
1192         addc    r10,r10,r8
1193         adde    r11,r11,r9
1194         addze   r12,r12
1195                                         #mul_add_c(a[3],b[3],c1,c2,c3);
1196         $LD     r6,`3*$BNSZ`(r4)
1197         $LD     r7,`3*$BNSZ`(r5)
1198         $UMULL  r8,r6,r7
1199         $UMULH  r9,r6,r7
1200         addc    r10,r10,r8
1201         adde    r11,r11,r9
1202         addze   r12,r12
1203                                         #mul_add_c(a[2],b[4],c1,c2,c3);
1204         $LD     r6,`2*$BNSZ`(r4)
1205         $LD     r7,`4*$BNSZ`(r5)
1206         $UMULL  r8,r6,r7
1207         $UMULH  r9,r6,r7
1208         addc    r10,r10,r8
1209         adde    r11,r11,r9
1210         addze   r12,r12
1211                                         #mul_add_c(a[1],b[5],c1,c2,c3);
1212         $LD     r6,`1*$BNSZ`(r4)
1213         $LD     r7,`5*$BNSZ`(r5)
1214         $UMULL  r8,r6,r7
1215         $UMULH  r9,r6,r7
1216         addc    r10,r10,r8
1217         adde    r11,r11,r9
1218         addze   r12,r12
1219                                         #mul_add_c(a[0],b[6],c1,c2,c3);
1220         $LD     r6,`0*$BNSZ`(r4)
1221         $LD     r7,`6*$BNSZ`(r5)
1222         $UMULL  r8,r6,r7
1223         $UMULH  r9,r6,r7
1224         addc    r10,r10,r8
1225         adde    r11,r11,r9
1226         addze   r12,r12
1227         $ST     r10,`6*$BNSZ`(r3)       #r[6]=c1;
1228                                         #mul_add_c(a[0],b[7],c2,c3,c1);
1229         $LD     r7,`7*$BNSZ`(r5)        #r6 still holds a[0]
1230         $UMULL  r8,r6,r7
1231         $UMULH  r9,r6,r7
1232         addc    r11,r11,r8
1233         adde    r12,r12,r9
1234         addze   r10,r0
1235                                         #mul_add_c(a[1],b[6],c2,c3,c1);
1236         $LD     r6,`1*$BNSZ`(r4)
1237         $LD     r7,`6*$BNSZ`(r5)
1238         $UMULL  r8,r6,r7
1239         $UMULH  r9,r6,r7
1240         addc    r11,r11,r8
1241         adde    r12,r12,r9
1242         addze   r10,r10
1243                                         #mul_add_c(a[2],b[5],c2,c3,c1);
1244         $LD     r6,`2*$BNSZ`(r4)
1245         $LD     r7,`5*$BNSZ`(r5)
1246         $UMULL  r8,r6,r7
1247         $UMULH  r9,r6,r7
1248         addc    r11,r11,r8
1249         adde    r12,r12,r9
1250         addze   r10,r10
1251                                         #mul_add_c(a[3],b[4],c2,c3,c1);
1252         $LD     r6,`3*$BNSZ`(r4)
1253         $LD     r7,`4*$BNSZ`(r5)
1254         $UMULL  r8,r6,r7
1255         $UMULH  r9,r6,r7
1256         addc    r11,r11,r8
1257         adde    r12,r12,r9
1258         addze   r10,r10
1259                                         #mul_add_c(a[4],b[3],c2,c3,c1);
1260         $LD     r6,`4*$BNSZ`(r4)
1261         $LD     r7,`3*$BNSZ`(r5)
1262         $UMULL  r8,r6,r7
1263         $UMULH  r9,r6,r7
1264         addc    r11,r11,r8
1265         adde    r12,r12,r9
1266         addze   r10,r10
1267                                         #mul_add_c(a[5],b[2],c2,c3,c1);
1268         $LD     r6,`5*$BNSZ`(r4)
1269         $LD     r7,`2*$BNSZ`(r5)
1270         $UMULL  r8,r6,r7
1271         $UMULH  r9,r6,r7
1272         addc    r11,r11,r8
1273         adde    r12,r12,r9
1274         addze   r10,r10
1275                                         #mul_add_c(a[6],b[1],c2,c3,c1);
1276         $LD     r6,`6*$BNSZ`(r4)
1277         $LD     r7,`1*$BNSZ`(r5)
1278         $UMULL  r8,r6,r7
1279         $UMULH  r9,r6,r7
1280         addc    r11,r11,r8
1281         adde    r12,r12,r9
1282         addze   r10,r10
1283                                         #mul_add_c(a[7],b[0],c2,c3,c1);
1284         $LD     r6,`7*$BNSZ`(r4)
1285         $LD     r7,`0*$BNSZ`(r5)
1286         $UMULL  r8,r6,r7
1287         $UMULH  r9,r6,r7
1288         addc    r11,r11,r8
1289         adde    r12,r12,r9
1290         addze   r10,r10
1291         $ST     r11,`7*$BNSZ`(r3)       #r[7]=c2;
1292                                         #mul_add_c(a[7],b[1],c3,c1,c2);
1293         $LD     r7,`1*$BNSZ`(r5)        #r6 still holds a[7]
1294         $UMULL  r8,r6,r7
1295         $UMULH  r9,r6,r7
1296         addc    r12,r12,r8
1297         adde    r10,r10,r9
1298         addze   r11,r0
1299                                         #mul_add_c(a[6],b[2],c3,c1,c2);
1300         $LD     r6,`6*$BNSZ`(r4)
1301         $LD     r7,`2*$BNSZ`(r5)
1302         $UMULL  r8,r6,r7
1303         $UMULH  r9,r6,r7
1304         addc    r12,r12,r8
1305         adde    r10,r10,r9
1306         addze   r11,r11
1307                                         #mul_add_c(a[5],b[3],c3,c1,c2);
1308         $LD     r6,`5*$BNSZ`(r4)
1309         $LD     r7,`3*$BNSZ`(r5)
1310         $UMULL  r8,r6,r7
1311         $UMULH  r9,r6,r7
1312         addc    r12,r12,r8
1313         adde    r10,r10,r9
1314         addze   r11,r11
1315                                         #mul_add_c(a[4],b[4],c3,c1,c2);
1316         $LD     r6,`4*$BNSZ`(r4)
1317         $LD     r7,`4*$BNSZ`(r5)
1318         $UMULL  r8,r6,r7
1319         $UMULH  r9,r6,r7
1320         addc    r12,r12,r8
1321         adde    r10,r10,r9
1322         addze   r11,r11
1323                                         #mul_add_c(a[3],b[5],c3,c1,c2);
1324         $LD     r6,`3*$BNSZ`(r4)
1325         $LD     r7,`5*$BNSZ`(r5)
1326         $UMULL  r8,r6,r7
1327         $UMULH  r9,r6,r7
1328         addc    r12,r12,r8
1329         adde    r10,r10,r9
1330         addze   r11,r11
1331                                         #mul_add_c(a[2],b[6],c3,c1,c2);
1332         $LD     r6,`2*$BNSZ`(r4)
1333         $LD     r7,`6*$BNSZ`(r5)
1334         $UMULL  r8,r6,r7
1335         $UMULH  r9,r6,r7
1336         addc    r12,r12,r8
1337         adde    r10,r10,r9
1338         addze   r11,r11
1339                                         #mul_add_c(a[1],b[7],c3,c1,c2);
1340         $LD     r6,`1*$BNSZ`(r4)
1341         $LD     r7,`7*$BNSZ`(r5)
1342         $UMULL  r8,r6,r7
1343         $UMULH  r9,r6,r7
1344         addc    r12,r12,r8
1345         adde    r10,r10,r9
1346         addze   r11,r11
1347         $ST     r12,`8*$BNSZ`(r3)       #r[8]=c3;
1348                                         #mul_add_c(a[2],b[7],c1,c2,c3);
1349         $LD     r6,`2*$BNSZ`(r4)        #r7 still holds b[7]
1350         $UMULL  r8,r6,r7
1351         $UMULH  r9,r6,r7
1352         addc    r10,r10,r8
1353         adde    r11,r11,r9
1354         addze   r12,r0
1355                                         #mul_add_c(a[3],b[6],c1,c2,c3);
1356         $LD     r6,`3*$BNSZ`(r4)
1357         $LD     r7,`6*$BNSZ`(r5)
1358         $UMULL  r8,r6,r7
1359         $UMULH  r9,r6,r7
1360         addc    r10,r10,r8
1361         adde    r11,r11,r9
1362         addze   r12,r12
1363                                         #mul_add_c(a[4],b[5],c1,c2,c3);
1364         $LD     r6,`4*$BNSZ`(r4)
1365         $LD     r7,`5*$BNSZ`(r5)
1366         $UMULL  r8,r6,r7
1367         $UMULH  r9,r6,r7
1368         addc    r10,r10,r8
1369         adde    r11,r11,r9
1370         addze   r12,r12
1371                                         #mul_add_c(a[5],b[4],c1,c2,c3);
1372         $LD     r6,`5*$BNSZ`(r4)
1373         $LD     r7,`4*$BNSZ`(r5)
1374         $UMULL  r8,r6,r7
1375         $UMULH  r9,r6,r7
1376         addc    r10,r10,r8
1377         adde    r11,r11,r9
1378         addze   r12,r12
1379                                         #mul_add_c(a[6],b[3],c1,c2,c3);
1380         $LD     r6,`6*$BNSZ`(r4)
1381         $LD     r7,`3*$BNSZ`(r5)
1382         $UMULL  r8,r6,r7
1383         $UMULH  r9,r6,r7
1384         addc    r10,r10,r8
1385         adde    r11,r11,r9
1386         addze   r12,r12
1387                                         #mul_add_c(a[7],b[2],c1,c2,c3);
1388         $LD     r6,`7*$BNSZ`(r4)
1389         $LD     r7,`2*$BNSZ`(r5)
1390         $UMULL  r8,r6,r7
1391         $UMULH  r9,r6,r7
1392         addc    r10,r10,r8
1393         adde    r11,r11,r9
1394         addze   r12,r12
1395         $ST     r10,`9*$BNSZ`(r3)       #r[9]=c1;
1396                                         #mul_add_c(a[7],b[3],c2,c3,c1);
1397         $LD     r7,`3*$BNSZ`(r5)        #r6 still holds a[7]
1398         $UMULL  r8,r6,r7
1399         $UMULH  r9,r6,r7
1400         addc    r11,r11,r8
1401         adde    r12,r12,r9
1402         addze   r10,r0
1403                                         #mul_add_c(a[6],b[4],c2,c3,c1);
1404         $LD     r6,`6*$BNSZ`(r4)
1405         $LD     r7,`4*$BNSZ`(r5)
1406         $UMULL  r8,r6,r7
1407         $UMULH  r9,r6,r7
1408         addc    r11,r11,r8
1409         adde    r12,r12,r9
1410         addze   r10,r10
1411                                         #mul_add_c(a[5],b[5],c2,c3,c1);
1412         $LD     r6,`5*$BNSZ`(r4)
1413         $LD     r7,`5*$BNSZ`(r5)
1414         $UMULL  r8,r6,r7
1415         $UMULH  r9,r6,r7
1416         addc    r11,r11,r8
1417         adde    r12,r12,r9
1418         addze   r10,r10
1419                                         #mul_add_c(a[4],b[6],c2,c3,c1);
1420         $LD     r6,`4*$BNSZ`(r4)
1421         $LD     r7,`6*$BNSZ`(r5)
1422         $UMULL  r8,r6,r7
1423         $UMULH  r9,r6,r7
1424         addc    r11,r11,r8
1425         adde    r12,r12,r9
1426         addze   r10,r10
1427                                         #mul_add_c(a[3],b[7],c2,c3,c1);
1428         $LD     r6,`3*$BNSZ`(r4)
1429         $LD     r7,`7*$BNSZ`(r5)
1430         $UMULL  r8,r6,r7
1431         $UMULH  r9,r6,r7
1432         addc    r11,r11,r8
1433         adde    r12,r12,r9
1434         addze   r10,r10
1435         $ST     r11,`10*$BNSZ`(r3)      #r[10]=c2;
1436                                         #mul_add_c(a[4],b[7],c3,c1,c2);
1437         $LD     r6,`4*$BNSZ`(r4)        #r7 still holds b[7]
1438         $UMULL  r8,r6,r7
1439         $UMULH  r9,r6,r7
1440         addc    r12,r12,r8
1441         adde    r10,r10,r9
1442         addze   r11,r0
1443                                         #mul_add_c(a[5],b[6],c3,c1,c2);
1444         $LD     r6,`5*$BNSZ`(r4)
1445         $LD     r7,`6*$BNSZ`(r5)
1446         $UMULL  r8,r6,r7
1447         $UMULH  r9,r6,r7
1448         addc    r12,r12,r8
1449         adde    r10,r10,r9
1450         addze   r11,r11
1451                                         #mul_add_c(a[6],b[5],c3,c1,c2);
1452         $LD     r6,`6*$BNSZ`(r4)
1453         $LD     r7,`5*$BNSZ`(r5)
1454         $UMULL  r8,r6,r7
1455         $UMULH  r9,r6,r7
1456         addc    r12,r12,r8
1457         adde    r10,r10,r9
1458         addze   r11,r11
1459                                         #mul_add_c(a[7],b[4],c3,c1,c2);
1460         $LD     r6,`7*$BNSZ`(r4)
1461         $LD     r7,`4*$BNSZ`(r5)
1462         $UMULL  r8,r6,r7
1463         $UMULH  r9,r6,r7
1464         addc    r12,r12,r8
1465         adde    r10,r10,r9
1466         addze   r11,r11
1467         $ST     r12,`11*$BNSZ`(r3)      #r[11]=c3;
1468                                         #mul_add_c(a[7],b[5],c1,c2,c3);
1469         $LD     r7,`5*$BNSZ`(r5)        #r6 still holds a[7]
1470         $UMULL  r8,r6,r7
1471         $UMULH  r9,r6,r7
1472         addc    r10,r10,r8
1473         adde    r11,r11,r9
1474         addze   r12,r0
1475                                         #mul_add_c(a[6],b[6],c1,c2,c3);
1476         $LD     r6,`6*$BNSZ`(r4)
1477         $LD     r7,`6*$BNSZ`(r5)
1478         $UMULL  r8,r6,r7
1479         $UMULH  r9,r6,r7
1480         addc    r10,r10,r8
1481         adde    r11,r11,r9
1482         addze   r12,r12
1483                                         #mul_add_c(a[5],b[7],c1,c2,c3);
1484         $LD     r6,`5*$BNSZ`(r4)
1485         $LD     r7,`7*$BNSZ`(r5)
1486         $UMULL  r8,r6,r7
1487         $UMULH  r9,r6,r7
1488         addc    r10,r10,r8
1489         adde    r11,r11,r9
1490         addze   r12,r12
1491         $ST     r10,`12*$BNSZ`(r3)      #r[12]=c1;
1492                                         #mul_add_c(a[6],b[7],c2,c3,c1);
1493         $LD     r6,`6*$BNSZ`(r4)        #r7 still holds b[7]
1494         $UMULL  r8,r6,r7
1495         $UMULH  r9,r6,r7
1496         addc    r11,r11,r8
1497         adde    r12,r12,r9
1498         addze   r10,r0
1499                                         #mul_add_c(a[7],b[6],c2,c3,c1);
1500         $LD     r6,`7*$BNSZ`(r4)
1501         $LD     r7,`6*$BNSZ`(r5)
1502         $UMULL  r8,r6,r7
1503         $UMULH  r9,r6,r7
1504         addc    r11,r11,r8
1505         adde    r12,r12,r9
1506         addze   r10,r10
1507         $ST     r11,`13*$BNSZ`(r3)      #r[13]=c2;
1508                                         #mul_add_c(a[7],b[7],c3,c1,c2);
1509         $LD     r7,`7*$BNSZ`(r5)        #r6 still holds a[7]
1510         $UMULL  r8,r6,r7
1511         $UMULH  r9,r6,r7
1512         addc    r12,r12,r8
1513         adde    r10,r10,r9              #last column: no third word is kept, the result ends at r[15]
1514         $ST     r12,`14*$BNSZ`(r3)      #r[14]=c3;
1515         $ST     r10,`15*$BNSZ`(r3)      #r[15]=c1;
1516         blr
1517         .long   0
1518         .byte   0,12,0x14,0,0,0,3,0     # NOTE(review): presumably a traceback table, 3 looks like the argument count -- confirm
1519         .long   0
1520 .size   .bn_mul_comba8,.-.bn_mul_comba8
1521
1522 #
1523 #       NOTE:   The following label name should be changed to
1524 #               "bn_sub_words" i.e. remove the first dot
1525 #               for the gcc compiler. This should be automatically
1526 #               done in the build
1527 #
1528 #
1529 .align  4
1530 .bn_sub_words:
1531 #
1532 #       Handcoded version of bn_sub_words
1533 #       r[i] = a[i] - b[i] - borrow for i = 0..n-1; the borrow chains through the carry bit.
1534 #BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1535 #       Returns (in r3) 1 if the last word borrowed, else 0; n = 0 returns 0 and touches no memory.
1536 #       r3 = r
1537 #       r4 = a
1538 #       r5 = b
1539 #       r6 = n
1540 #       r7, r8 hold the current words of a and b; r0 is kept at zero.
1541 #       Note:   No loop unrolling done since this is not a performance
1542 #               critical loop.
1543
1544         xor     r0,r0,r0        #set r0 = 0
1545 #
1546 #       check for r6 = 0 AND set carry bit (carry set means "no borrow").
1547 #
1548         subfc.  r7,r0,r6        # If r6 is 0 then result is 0.
1549                                 # if r6 > 0 then result !=0
1550                                 # In either case carry bit is set.
1551         beq     Lppcasm_sub_adios
1552         addi    r4,r4,-$BNSZ    # step all three pointers back one word:
1553         addi    r3,r3,-$BNSZ    # the loop uses the update-form LDU/STU, which
1554         addi    r5,r5,-$BNSZ    # advance the pointer before each access.
1555         mtctr   r6              # CTR = n, counted down by bdnz
1556 Lppcasm_sub_mainloop:
1557         $LDU    r7,$BNSZ(r4)
1558         $LDU    r8,$BNSZ(r5)
1559         subfe   r6,r8,r7        # r6 = r7+carry bit + onescomplement(r8)
1560                                 # if carry = 1 this is r7-r8. Else it
1561                                 # is r7-r8 -1 as we need.
1562         $STU    r6,$BNSZ(r3)
1563         bdnz    Lppcasm_sub_mainloop
1564 Lppcasm_sub_adios:
1565         subfze  r3,r0           # if carry bit is set then r3 = 0 else -1
1566         andi.   r3,r3,1         # keep only last bit: 0 = no borrow, 1 = borrow.
1567         blr
1568         .long   0
1569         .byte   0,12,0x14,0,0,0,4,0     # NOTE(review): presumably a traceback table, 4 looks like the argument count -- confirm
1570         .long   0
1571 .size   .bn_sub_words,.-.bn_sub_words
1572
1573 #
1574 #       NOTE:   The following label name should be changed to
1575 #               "bn_add_words" i.e. remove the first dot
1576 #               for the gcc compiler. This should be automatically
1577 #               done in the build
1578 #
1579
1580 .align  4
1581 .bn_add_words:
1582 #
1583 #       Handcoded version of bn_add_words
1584 #       r[i] = a[i] + b[i] + carry for i = 0..n-1; the carry chains through the carry bit.
1585 #BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1586 #       Returns (in r3) the carry out of the last word (0 or 1); n = 0 returns 0 and touches no memory.
1587 #       r3 = r
1588 #       r4 = a
1589 #       r5 = b
1590 #       r6 = n
1591 #       r7, r8 hold the current words of a and b; r0 is kept at zero.
1592 #       Note:   No loop unrolling done since this is not a performance
1593 #               critical loop.
1594
1595         xor     r0,r0,r0
1596 #
1597 #       check for r6 = 0: with CTR = 0 the bdnz loop below would not stop after zero iterations.
1598 #
1599         addic.  r6,r6,0         #test r6 and clear carry bit.
1600         beq     Lppcasm_add_adios
1601         addi    r4,r4,-$BNSZ    # step all three pointers back one word:
1602         addi    r3,r3,-$BNSZ    # the loop uses the update-form LDU/STU, which
1603         addi    r5,r5,-$BNSZ    # advance the pointer before each access.
1604         mtctr   r6              # CTR = n, counted down by bdnz
1605 Lppcasm_add_mainloop:
1606         $LDU    r7,$BNSZ(r4)
1607         $LDU    r8,$BNSZ(r5)
1608         adde    r8,r7,r8        # r8 = a[i] + b[i] + carry in; sets carry out
1609         $STU    r8,$BNSZ(r3)
1610         bdnz    Lppcasm_add_mainloop
1611 Lppcasm_add_adios:
1612         addze   r3,r0                   #return carry bit.
1613         blr
1614         .long   0
1615         .byte   0,12,0x14,0,0,0,4,0     # NOTE(review): presumably a traceback table, 4 looks like the argument count -- confirm
1616         .long   0
1617 .size   .bn_add_words,.-.bn_add_words
1618
1619 #
1620 #       NOTE:   The following label name should be changed to
1621 #               "bn_div_words" i.e. remove the first dot
1622 #               for the gcc compiler. This should be automatically
1623 #               done in the build
1624 #
1625
1626 .align  4
1627 .bn_div_words:
1628 #       Divide the two-word value h:l by d; the quotient is returned in r3.
1629 #       This is a cleaned up version of code generated by
1630 #       the AIX compiler. The only optimization is to use
1631 #       the PPC instruction to count leading zeros instead
1632 #       of call to num_bits_word. Since this was compiled
1633 #       only at level -O2 we can possibly squeeze it more?
1634 #       If d is zero the routine returns -1 (all ones) without dividing.
1635 #       r3 = h
1636 #       r4 = l
1637 #       r5 = d
1638
1639         $UCMPI  0,r5,0                  # compare r5 and 0
1640         bne     Lppcasm_div1            # proceed if d!=0
1641         li      r3,-1                   # d=0 return -1
1642         blr
1643 Lppcasm_div1:
1644         xor     r0,r0,r0                #r0=0; later accumulates ret (the upper quotient half)
1645         li      r8,$BITS
1646         $CNTLZ. r7,r5                   #r7 = num leading 0s in d. Record form: CR0 reflects r7.
1647         beq     Lppcasm_div2            #proceed if no leading zeros
1648         subf    r8,r7,r8                #r8 = BN_num_bits_word(d), i.e. word size minus r7
1649         $SHR.   r9,r3,r8                #are there any bits above r8'th? r9 = h >> r8
1650         $TR     16,r9,r0                #if there are, signal to dump core... NOTE(review): TO=16 is the signed less-than condition and r0 is 0, so after a logical right shift this may never fire - confirm
1651 Lppcasm_div2:
1652         $UCMP   0,r3,r5                 #h>=d?
1653         blt     Lppcasm_div3            #goto Lppcasm_div3 if not
1654         subf    r3,r5,r3                #h-=d ;
1655 Lppcasm_div3:                           #r7 = leading zero count of d = BN_BITS2 - BN_num_bits_word(d) = shift count i
1656         cmpi    0,0,r7,0                # is (i == 0)?
1657         beq     Lppcasm_div4            # d already normalized: skip the shifts
1658         $SHL    r3,r3,r7                # h = (h<< i)
1659         $SHR    r8,r4,r8                # r8 = l >> (BN_BITS2-i); r8 held BN_BITS2-i on entry
1660         $SHL    r5,r5,r7                # d<<=i
1661         or      r3,r3,r8                # h = (h<<i)|(l>>(BN_BITS2-i))
1662         $SHL    r4,r4,r7                # l <<=i
1663 Lppcasm_div4:
1664         $SHRI   r9,r5,`$BITS/2`         # r9 = dh
1665                                         # dl will be computed when needed
1666                                         # as it saves registers.
1667         li      r6,2                    #r6=2: one pass per half-word quotient digit
1668         mtctr   r6                      #counter will be in count.
1669 Lppcasm_divouterloop:
1670         $SHRI   r8,r3,`$BITS/2`         #r8 = (h>>BN_BITS4)
1671         $SHRI   r11,r4,`$BITS/2`        #r11= (l&BN_MASK2h)>>BN_BITS4
1672                                         # compute here for innerloop.
1673         $UCMP   0,r8,r9                 # is (h>>BN_BITS4)==dh
1674         bne     Lppcasm_div5            # goto Lppcasm_div5 if not
1675
1676         li      r8,-1                   # all ones, masked to the low half below
1677         $CLRU   r8,r8,`$BITS/2`         #q = BN_MASK2l
1678         b       Lppcasm_div6
1679 Lppcasm_div5:
1680         $UDIV   r8,r3,r9                #q = h/dh
1681 Lppcasm_div6:
1682         $UMULL  r12,r9,r8               #th = q*dh
1683         $CLRU   r10,r5,`$BITS/2`        #r10=dl
1684         $UMULL  r6,r8,r10               #tl = q*dl
1685
1686 Lppcasm_divinnerloop:                   # lower the estimate q until tl fits
1687         subf    r10,r12,r3              #t = h -th
1688         $SHRI   r7,r10,`$BITS/2`        #r7 = t>>BN_BITS4, nonzero iff t has bits in its upper half
1689         addic.  r7,r7,0                 #test if r7 == 0. CR0 is consumed by the bne below.
1690                                         # now want to compute
1691                                         # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1692                                         # the following 2 instructions do that
1693         $SHLI   r7,r10,`$BITS/2`        # r7 = (t<<BN_BITS4)
1694         or      r7,r7,r11               # r7|=((l&BN_MASK2h)>>BN_BITS4)
1695         $UCMP   cr1,r6,r7               # compare (tl <= r7); result goes to cr1
1696         bne     Lppcasm_divinnerexit    # exit if t had upper-half bits (CR0 from the addic. above)
1697         ble     cr1,Lppcasm_divinnerexit        # exit if tl <= r7
1698         addi    r8,r8,-1                #q--
1699         subf    r12,r9,r12              #th -=dh
1700         $CLRU   r10,r5,`$BITS/2`        #r10=dl. t is no longer needed in loop.
1701         subf    r6,r10,r6               #tl -=dl
1702         b       Lppcasm_divinnerloop
1703 Lppcasm_divinnerexit:
1704         $SHRI   r10,r6,`$BITS/2`        #t=(tl>>BN_BITS4)
1705         $SHLI   r11,r6,`$BITS/2`        #tl=(tl<<BN_BITS4)&BN_MASK2h;
1706         $UCMP   cr1,r4,r11              # compare l and tl
1707         add     r12,r12,r10             # th+=t
1708         bge     cr1,Lppcasm_div7        # if (l>=tl) goto Lppcasm_div7
1709         addi    r12,r12,1               # th++ (borrow, because l < tl)
1710 Lppcasm_div7:
1711         subf    r11,r11,r4              #r11=l-tl
1712         $UCMP   cr1,r3,r12              #compare h and th
1713         bge     cr1,Lppcasm_div8        #if (h>=th) goto Lppcasm_div8
1714         addi    r8,r8,-1                # q--
1715         add     r3,r5,r3                # h+=d
1716 Lppcasm_div8:
1717         subf    r12,r12,r3              #r12 = h-th
1718         $SHLI   r4,r11,`$BITS/2`        #l=(l&BN_MASK2l)<<BN_BITS4, where l is the l-tl value in r11
1719                                         # want to compute
1720                                         # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1721                                         # the following 2 instructions will do this.
1722         $INSR   r11,r12,`$BITS/2`,`$BITS/2`     # r11 is the value we want rotated $BITS/2.
1723         $ROTL   r3,r11,`$BITS/2`        # rotate by $BITS/2 and store in r3
1724         bdz     Lppcasm_div9            #if (count==0) break ;
1725         $SHLI   r0,r8,`$BITS/2`         #ret =q<<BN_BITS4 (first pass only: upper quotient half)
1726         b       Lppcasm_divouterloop
1727 Lppcasm_div9:
1728         or      r3,r8,r0                # return ret | q
1729         blr
1730         .long   0
1731         .byte   0,12,0x14,0,0,0,3,0     # NOTE(review): looks like a traceback table; the 3 presumably matches the three arguments - confirm
1732         .long   0
1733 .size   .bn_div_words,.-.bn_div_words
1734
1735 #
1736 #       NOTE:   The following label name should be changed to
1737 #               "bn_sqr_words" i.e. remove the first dot
1738 #               for the gcc compiler. This should be automatically
1739 #               done in the build
1740 #
1741 .align  4
1742 .bn_sqr_words:
1743 #
1744 #       Optimized version of bn_sqr_words
1745 #       For each of the n input words, stores its two-word square into r.
1746 #       void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1747 #
1748 #       r3 = r  (receives two words per input word)
1749 #       r4 = a
1750 #       r5 = n  (word count, moved into CTR)
1751 #
1752 #       r6 = a[i].
1753 #       r7,r8 = product (r7 is stored first, then r8; presumably low then high word per the UMULL/UMULH names).
1754 #
1755 #       No unrolling done here. Not performance critical.
1756
1757         addic.  r5,r5,0                 #test r5 (record form sets CR0).
1758         beq     Lppcasm_sqr_adios       # n == 0: nothing to do
1759         addi    r4,r4,-$BNSZ            # step both pointers back one word because the
1760         addi    r3,r3,-$BNSZ            # loop body addresses memory at pointer+BNSZ
1761         mtctr   r5                      # CTR = n iterations
1762 Lppcasm_sqr_mainloop:
1763                                         #sqr(r[0],r[1],a[0]);
1764         $LDU    r6,$BNSZ(r4)            # r6 = next word of a
1765         $UMULL  r7,r6,r6                # first product word of r6*r6
1766         $UMULH  r8,r6,r6                # second product word of r6*r6
1767         $STU    r7,$BNSZ(r3)            # store r7 to the next word of r
1768         $STU    r8,$BNSZ(r3)            # store r8 to the word after it
1769         bdnz    Lppcasm_sqr_mainloop    # loop while --CTR != 0
1770 Lppcasm_sqr_adios:
1771         blr
1772         .long   0
1773         .byte   0,12,0x14,0,0,0,3,0     # NOTE(review): looks like a traceback table; the 3 presumably matches the three arguments - confirm
1774         .long   0
1775 .size   .bn_sqr_words,.-.bn_sqr_words
1776
1777 #
1778 #       NOTE:   The following label name should be changed to
1779 #               "bn_mul_words" i.e. remove the first dot
1780 #               for the gcc compiler. This should be automatically
1781 #               done in the build
1782 #
1783
1784 .align  4
1785 .bn_mul_words:
1786 # Multiply num words of ap by w, store the result words in rp, return the carry word.
1787 # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1788 #
1789 # r3 = rp
1790 # r4 = ap
1791 # r5 = num
1792 # r6 = w
1793         xor     r0,r0,r0                # r0 = 0 (not referenced again in this routine)
1794         xor     r12,r12,r12             # used for carry
1795         rlwinm. r7,r5,30,2,31           # num >> 2 (number of 4-word groups); record form sets CR0
1796         beq     Lppcasm_mw_REM          # fewer than 4 words: only the remainder code runs
1797         mtctr   r7                      # CTR = number of 4-word groups
1798 Lppcasm_mw_LOOP:
1799                                         #mul(rp[0],ap[0],w,c1);
1800         $LD     r8,`0*$BNSZ`(r4)
1801         $UMULL  r9,r6,r8
1802         $UMULH  r10,r6,r8
1803         addc    r9,r9,r12               # add carry word from the previous group; sets the carry bit
1804         #addze  r10,r10                 #carry is NOT ignored.
1805                                         #will be taken care of
1806                                         #in second spin below
1807                                         #using adde.
1808         $ST     r9,`0*$BNSZ`(r3)
1809                                         #mul(rp[1],ap[1],w,c1);
1810         $LD     r8,`1*$BNSZ`(r4)
1811         $UMULL  r11,r6,r8
1812         $UMULH  r12,r6,r8
1813         adde    r11,r11,r10             # add previous high word r10 plus the pending carry bit
1814         #addze  r12,r12
1815         $ST     r11,`1*$BNSZ`(r3)
1816                                         #mul(rp[2],ap[2],w,c1);
1817         $LD     r8,`2*$BNSZ`(r4)
1818         $UMULL  r9,r6,r8
1819         $UMULH  r10,r6,r8
1820         adde    r9,r9,r12               # add previous high word r12 plus the pending carry bit
1821         #addze  r10,r10
1822         $ST     r9,`2*$BNSZ`(r3)
1823                                         #mul(rp[3],ap[3],w,c1);
1824         $LD     r8,`3*$BNSZ`(r4)
1825         $UMULL  r11,r6,r8
1826         $UMULH  r12,r6,r8
1827         adde    r11,r11,r10             # add previous high word r10 plus the pending carry bit
1828         addze   r12,r12                 #this spin we collect carry into
1829                                         #r12
1830         $ST     r11,`3*$BNSZ`(r3)
1831
1832         addi    r3,r3,`4*$BNSZ`         # advance rp by four words
1833         addi    r4,r4,`4*$BNSZ`         # advance ap by four words
1834         bdnz    Lppcasm_mw_LOOP
1835
1836 Lppcasm_mw_REM:
1837         andi.   r5,r5,0x3               # r5 = num & 3, the 0..3 leftover words
1838         beq     Lppcasm_mw_OVER
1839                                         #mul(rp[0],ap[0],w,c1);
1840         $LD     r8,`0*$BNSZ`(r4)
1841         $UMULL  r9,r6,r8
1842         $UMULH  r10,r6,r8
1843         addc    r9,r9,r12               # add incoming carry word
1844         addze   r10,r10                 # fold the carry bit into the high word
1845         $ST     r9,`0*$BNSZ`(r3)
1846         addi    r12,r10,0               # r12 = r10: carry word for the next step / return
1847
1848         addi    r5,r5,-1
1849         cmpli   0,0,r5,0                # any leftover words remaining?
1850         beq     Lppcasm_mw_OVER
1851
1852
1853                                         #mul(rp[1],ap[1],w,c1);
1854         $LD     r8,`1*$BNSZ`(r4)
1855         $UMULL  r9,r6,r8
1856         $UMULH  r10,r6,r8
1857         addc    r9,r9,r12               # add incoming carry word
1858         addze   r10,r10                 # fold the carry bit into the high word
1859         $ST     r9,`1*$BNSZ`(r3)
1860         addi    r12,r10,0               # r12 = r10: carry word for the next step / return
1861
1862         addi    r5,r5,-1
1863         cmpli   0,0,r5,0                # any leftover words remaining?
1864         beq     Lppcasm_mw_OVER
1865
1866                                         #mul(rp[2],ap[2],w,c1);
1867         $LD     r8,`2*$BNSZ`(r4)
1868         $UMULL  r9,r6,r8
1869         $UMULH  r10,r6,r8
1870         addc    r9,r9,r12               # add incoming carry word
1871         addze   r10,r10                 # fold the carry bit into the high word
1872         $ST     r9,`2*$BNSZ`(r3)
1873         addi    r12,r10,0               # r12 = r10: carry word to return
1874
1875 Lppcasm_mw_OVER:
1876         addi    r3,r12,0                # return the carry word in r3
1877         blr
1878         .long   0
1879         .byte   0,12,0x14,0,0,0,4,0     # NOTE(review): looks like a traceback table; the 4 presumably matches the four arguments - confirm
1880         .long   0
1881 .size   .bn_mul_words,.-.bn_mul_words
1882
1883 #
1884 #       NOTE:   The following label name should be changed to
1885 #               "bn_mul_add_words" i.e. remove the first dot
1886 #               for the gcc compiler. This should be automatically
1887 #               done in the build
1888 #
1889
1890 .align  4
1891 .bn_mul_add_words:
1892 # Multiply num words of ap by w, add the products into rp in place, return the carry word.
1893 # BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1894 #
1895 # r3 = rp
1896 # r4 = ap
1897 # r5 = num
1898 # r6 = w
1899 #
1900 # empirical evidence suggests that unrolled version performs best!!
1901 #
1902         xor     r0,r0,r0                #r0 = 0 (not referenced again in this routine)
1903         xor     r12,r12,r12             #r12 = 0 . used for carry
1904         rlwinm. r7,r5,30,2,31           # num >> 2 (number of 4-word groups); record form sets CR0
1905         beq     Lppcasm_maw_leftover    # if (num < 4) go Lppcasm_maw_leftover
1906         mtctr   r7                      # CTR = number of 4-word groups
1907 Lppcasm_maw_mainloop:
1908                                         #mul_add(rp[0],ap[0],w,c1);
1909         $LD     r8,`0*$BNSZ`(r4)
1910         $LD     r11,`0*$BNSZ`(r3)
1911         $UMULL  r9,r6,r8
1912         $UMULH  r10,r6,r8
1913         addc    r9,r9,r12               #r12 is carry.
1914         addze   r10,r10                 # fold that carry bit into the high word
1915         addc    r9,r9,r11               # add the old rp[0]; its carry bit stays pending
1916         #addze  r10,r10
1917                                         #the above instruction addze
1918                                         #is NOT needed. Carry will NOT
1919                                         #be ignored. It's not affected
1920                                         #by multiply and will be collected
1921                                         #in the next spin
1922         $ST     r9,`0*$BNSZ`(r3)
1923
1924                                         #mul_add(rp[1],ap[1],w,c1);
1925         $LD     r8,`1*$BNSZ`(r4)
1926         $LD     r9,`1*$BNSZ`(r3)
1927         $UMULL  r11,r6,r8
1928         $UMULH  r12,r6,r8
1929         adde    r11,r11,r10             #r10 is carry; adde also adds the pending carry bit.
1930         addze   r12,r12                 # fold that carry bit into the high word
1931         addc    r11,r11,r9              # add the old rp[1]; its carry bit stays pending
1932         #addze  r12,r12
1933         $ST     r11,`1*$BNSZ`(r3)
1934
1935                                         #mul_add(rp[2],ap[2],w,c1);
1936         $LD     r8,`2*$BNSZ`(r4)
1937         $UMULL  r9,r6,r8
1938         $LD     r11,`2*$BNSZ`(r3)
1939         $UMULH  r10,r6,r8
1940         adde    r9,r9,r12               # add previous high word plus the pending carry bit
1941         addze   r10,r10                 # fold that carry bit into the high word
1942         addc    r9,r9,r11               # add the old rp[2]; its carry bit stays pending
1943         #addze  r10,r10
1944         $ST     r9,`2*$BNSZ`(r3)
1945
1946                                         #mul_add(rp[3],ap[3],w,c1);
1947         $LD     r8,`3*$BNSZ`(r4)
1948         $UMULL  r11,r6,r8
1949         $LD     r9,`3*$BNSZ`(r3)
1950         $UMULH  r12,r6,r8
1951         adde    r11,r11,r10             # add previous high word plus the pending carry bit
1952         addze   r12,r12                 # fold that carry bit into the high word
1953         addc    r11,r11,r9              # add the old rp[3]
1954         addze   r12,r12                 # last word of the group: collect the carry bit now, r12 carries over
1955         $ST     r11,`3*$BNSZ`(r3)
1956         addi    r3,r3,`4*$BNSZ`         # advance rp by four words
1957         addi    r4,r4,`4*$BNSZ`         # advance ap by four words
1958         bdnz    Lppcasm_maw_mainloop
1959
1960 Lppcasm_maw_leftover:
1961         andi.   r5,r5,0x3               # r5 = num & 3, the 0..3 leftover words
1962         beq     Lppcasm_maw_adios
1963         addi    r3,r3,-$BNSZ            # step both pointers back one word because the
1964         addi    r4,r4,-$BNSZ            # leftover code addresses memory at pointer+BNSZ
1965                                         #mul_add(rp[0],ap[0],w,c1);
1966         mtctr   r5                      # CTR = leftover count, tested by the bdz instructions below
1967         $LDU    r8,$BNSZ(r4)
1968         $UMULL  r9,r6,r8
1969         $UMULH  r10,r6,r8
1970         $LDU    r11,$BNSZ(r3)
1971         addc    r9,r9,r11               # add the old rp word
1972         addze   r10,r10                 # fold the carry bit into the high word
1973         addc    r9,r9,r12               # add the incoming carry word
1974         addze   r12,r10                 # r12 = high word + carry bit: carry word for the next step
1975         $ST     r9,0(r3)                # LDU above presumably advanced r3, so offset 0 is the word just loaded
1976
1977         bdz     Lppcasm_maw_adios       # done if --CTR == 0
1978                                         #mul_add(rp[1],ap[1],w,c1);
1979         $LDU    r8,$BNSZ(r4)
1980         $UMULL  r9,r6,r8
1981         $UMULH  r10,r6,r8
1982         $LDU    r11,$BNSZ(r3)
1983         addc    r9,r9,r11               # add the old rp word
1984         addze   r10,r10                 # fold the carry bit into the high word
1985         addc    r9,r9,r12               # add the incoming carry word
1986         addze   r12,r10                 # r12 = high word + carry bit: carry word for the next step
1987         $ST     r9,0(r3)
1988
1989         bdz     Lppcasm_maw_adios       # done if --CTR == 0
1990                                         #mul_add(rp[2],ap[2],w,c1);
1991         $LDU    r8,$BNSZ(r4)
1992         $UMULL  r9,r6,r8
1993         $UMULH  r10,r6,r8
1994         $LDU    r11,$BNSZ(r3)
1995         addc    r9,r9,r11               # add the old rp word
1996         addze   r10,r10                 # fold the carry bit into the high word
1997         addc    r9,r9,r12               # add the incoming carry word
1998         addze   r12,r10                 # r12 = high word + carry bit: carry word to return
1999         $ST     r9,0(r3)
2000
2001 Lppcasm_maw_adios:
2002         addi    r3,r12,0                # return the carry word in r3
2003         blr
2004         .long   0
2005         .byte   0,12,0x14,0,0,0,4,0     # NOTE(review): looks like a traceback table; the 4 presumably matches the four arguments - confirm
2006         .long   0
2007 .size   .bn_mul_add_words,.-.bn_mul_add_words
2008         .align  4
2009 EOF
2010 $data =~ s/\`([^\`]*)\`/eval $1/gem;    # replace every backtick-quoted span in the template with the result of evaluating it as Perl
2011 print $data;                            # emit the generated assembly on STDOUT
2012 close STDOUT or die "error closing STDOUT: $!";    # buffered write errors surface at close; fail loudly instead of leaving truncated output