crypto/openssl/crypto/bn/asm/sparcv8plus.S

   1 .ident  "sparcv8plus.s, Version 1.4"
   2 .ident  "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
   3
   4 /*
   5  * ====================================================================
   6  * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   7  * project.
   8  *
   9  * Rights for redistribution and usage in source and binary forms are
  10  * granted according to the OpenSSL license. Warranty of any kind is
  11  * disclaimed.
  12  * ====================================================================
  13  */
  14
  15 /*
  16  * This is my modest contributon to OpenSSL project (see
  17  * http://www.openssl.org/ for more information about it) and is
  18  * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
  19  * module. For updates see http://fy.chalmers.se/~appro/hpe/.
  20  *
  21  * Questions-n-answers.
  22  *
  23  * Q. How to compile?
  24  * A. With SC4.x/SC5.x:
  25  *
  26  *      cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  27  *
  28  *    and with gcc:
  29  *
  30  *      gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
  31  *
  32  *    or if above fails (it does if you have gas installed):
  33  *
  34  *      gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
  35  *
  36  *    Quick-n-dirty way to fuse the module into the library.
  37  *    Provided that the library is already configured and built
  38  *    (in 0.9.2 case with no-asm option):
  39  *
  40  *      # cd crypto/bn
  41  *      # cp /some/place/bn_asm.sparc.v8plus.S .
  42  *      # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
  43  *      # make
  44  *      # cd ../..
  45  *      # make; make test
  46  *
  47  *    Quick-n-dirty way to get rid of it:
  48  *
  49  *      # cd crypto/bn
  50  *      # touch bn_asm.c
  51  *      # make
  52  *      # cd ../..
  53  *      # make; make test
  54  *
  55  * Q. V8plus achitecture? What kind of beast is that?
  56  * A. Well, it's rather a programming model than an architecture...
  57  *    It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under
  58  *    special conditions, namely when kernel doesn't preserve upper
  59  *    32 bits of otherwise 64-bit registers during a context switch.
  60  *
  61  * Q. Why just UltraSPARC? What about SuperSPARC?
  62  * A. Original release did target UltraSPARC only. Now SuperSPARC
  63  *    version is provided along. Both version share bn_*comba[48]
  64  *    implementations (see comment later in code for explanation).
  65  *    But what's so special about this UltraSPARC implementation?
  66  *    Why didn't I let compiler do the job? Trouble is that most of
  67  *    available compilers (well, SC5.0 is the only exception) don't
  68  *    attempt to take advantage of UltraSPARC's 64-bitness under
  69  *    32-bit kernels even though it's perfectly possible (see next
  70  *    question).
  71  *
  72  * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
  73  *    doesn't work?
  74  * A. You can't adress *all* registers as 64-bit wide:-( The catch is
  75  *    that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
  76  *    preserved if you're in a leaf function, i.e. such never calling
  77  *    any other functions. All functions in this module are leaf and
  78  *    10 registers is a handful. And as a matter of fact none-"comba"
  79  *    routines don't require even that much and I could even afford to
  80  *    not allocate own stack frame for 'em:-)
  81  *
  82  * Q. What about 64-bit kernels?
  83  * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
  84  *    under evaluation and development...
  85  *
  86  * Q. What about shared libraries?
  87  * A. What about 'em? Kidding again:-) Code does *not* contain any
  88  *    code position dependencies and it's safe to include it into
  89  *    shared library as is.
  90  *
  91  * Q. How much faster does it go?
  92  * A. Do you have a good benchmark? In either case below is what I
  93  *    experience with crypto/bn/expspeed.c test program:
  94  *
  95  *      v8plus module on U10/300MHz against bn_asm.c compiled with:
  96  *
  97  *      cc-5.0 -xarch=v8plus -xO5 -xdepend      +7-12%
  98  *      cc-4.2 -xarch=v8plus -xO5 -xdepend      +25-35%
  99  *      egcs-1.1.2 -mcpu=ultrasparc -O3         +35-45%
 100  *
 101  *      v8 module on SS10/60MHz against bn_asm.c compiled with:
 102  *
 103  *      cc-5.0 -xarch=v8 -xO5 -xdepend          +7-10%
 104  *      cc-4.2 -xarch=v8 -xO5 -xdepend          +10%
 105  *      egcs-1.1.2 -mv8 -O3                     +35-45%
 106  *
 107  *    As you can see it's damn hard to beat the new Sun C compiler
 108  *    and it's in first place GNU C users who will appreciate this
 109  *    assembler implementation:-)
 110  */
 111
 112 /*
 113  * Revision history.
 114  *
 115  * 1.0  - initial release;
 116  * 1.1  - new loop unrolling model(*);
 117  *      - some more fine tuning;
 118  * 1.2  - made gas friendly;
 119  *      - updates to documentation concerning v9;
 120  *      - new performance comparison matrix;
 121  * 1.3  - fixed problem with /usr/ccs/lib/cpp;
 122  * 1.4  - native V9 bn_*_comba[48] implementation (15% more efficient)
 123  *        resulting in slight overall performance kick;
 124  *      - some retunes;
 125  *      - support for GNU as added;
 126  *
 127  * (*)  Originally unrolled loop looked like this:
 128  *          for (;;) {
 129  *              op(p+0); if (--n==0) break;
 130  *              op(p+1); if (--n==0) break;
 131  *              op(p+2); if (--n==0) break;
 132  *              op(p+3); if (--n==0) break;
 133  *              p+=4;
 134  *          }
 135  *      I unroll according to following:
 136  *          while (n&~3) {
 137  *              op(p+0); op(p+1); op(p+2); op(p+3);
 138  *              p+=4; n=-4;
 139  *          }
 140  *          if (n) {
 141  *              op(p+0); if (--n==0) return;
 142  *              op(p+2); if (--n==0) return;
 143  *              op(p+3); return;
 144  *          }
 145  */
 146
 147 /*
 148  * GNU assembler can't stand stuw:-(
 149  */
 150 #define stuw st
 151
 152 .section        ".text",#alloc,#execinstr
 153 .file           "bn_asm.sparc.v8plus.S"
 154
 155 .align  32
 156
 157 .global bn_mul_add_words
 158 /*
 159  * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 160  * BN_ULONG *rp,*ap;
 161  * int num;
 162  * BN_ULONG w;
 163  */
 164 bn_mul_add_words:
 165         sra     %o2,%g0,%o2     ! signx %o2
 166         brgz,a  %o2,.L_bn_mul_add_words_proceed
 167         lduw    [%o1],%g2
 168         retl
 169         clr     %o0
 170         nop
 171         nop
 172         nop
 173
 174 .L_bn_mul_add_words_proceed:
 175         srl     %o3,%g0,%o3     ! clruw %o3
 176         andcc   %o2,-4,%g0
 177         bz,pn   %icc,.L_bn_mul_add_words_tail
 178         clr     %o5
 179
 180 .L_bn_mul_add_words_loop:       ! wow! 32 aligned!
 181         lduw    [%o0],%g1
 182         lduw    [%o1+4],%g3
 183         mulx    %o3,%g2,%g2
 184         add     %g1,%o5,%o4
 185         nop
 186         add     %o4,%g2,%o4
 187         stuw    %o4,[%o0]
 188         srlx    %o4,32,%o5
 189
 190         lduw    [%o0+4],%g1
 191         lduw    [%o1+8],%g2
 192         mulx    %o3,%g3,%g3
 193         add     %g1,%o5,%o4
 194         dec     4,%o2
 195         add     %o4,%g3,%o4
 196         stuw    %o4,[%o0+4]
 197         srlx    %o4,32,%o5
 198
 199         lduw    [%o0+8],%g1
 200         lduw    [%o1+12],%g3
 201         mulx    %o3,%g2,%g2
 202         add     %g1,%o5,%o4
 203         inc     16,%o1
 204         add     %o4,%g2,%o4
 205         stuw    %o4,[%o0+8]
 206         srlx    %o4,32,%o5
 207
 208         lduw    [%o0+12],%g1
 209         mulx    %o3,%g3,%g3
 210         add     %g1,%o5,%o4
 211         inc     16,%o0
 212         add     %o4,%g3,%o4
 213         andcc   %o2,-4,%g0
 214         stuw    %o4,[%o0-4]
 215         srlx    %o4,32,%o5
 216         bnz,a,pt        %icc,.L_bn_mul_add_words_loop
 217         lduw    [%o1],%g2
 218
 219         brnz,a,pn       %o2,.L_bn_mul_add_words_tail
 220         lduw    [%o1],%g2
 221 .L_bn_mul_add_words_return:
 222         retl
 223         mov     %o5,%o0
 224
 225 .L_bn_mul_add_words_tail:
 226         lduw    [%o0],%g1
 227         mulx    %o3,%g2,%g2
 228         add     %g1,%o5,%o4
 229         dec     %o2
 230         add     %o4,%g2,%o4
 231         srlx    %o4,32,%o5
 232         brz,pt  %o2,.L_bn_mul_add_words_return
 233         stuw    %o4,[%o0]
 234
 235         lduw    [%o1+4],%g2
 236         lduw    [%o0+4],%g1
 237         mulx    %o3,%g2,%g2
 238         add     %g1,%o5,%o4
 239         dec     %o2
 240         add     %o4,%g2,%o4
 241         srlx    %o4,32,%o5
 242         brz,pt  %o2,.L_bn_mul_add_words_return
 243         stuw    %o4,[%o0+4]
 244
 245         lduw    [%o1+8],%g2
 246         lduw    [%o0+8],%g1
 247         mulx    %o3,%g2,%g2
 248         add     %g1,%o5,%o4
 249         add     %o4,%g2,%o4
 250         stuw    %o4,[%o0+8]
 251         retl
 252         srlx    %o4,32,%o0
 253
 254 .type   bn_mul_add_words,#function
 255 .size   bn_mul_add_words,(.-bn_mul_add_words)
 256
 257 .align  32
 258
 259 .global bn_mul_words
 260 /*
 261  * BN_ULONG bn_mul_words(rp,ap,num,w)
 262  * BN_ULONG *rp,*ap;
 263  * int num;
 264  * BN_ULONG w;
 265  */
 266 bn_mul_words:
 267         sra     %o2,%g0,%o2     ! signx %o2
 268         brgz,a  %o2,.L_bn_mul_words_proceeed
 269         lduw    [%o1],%g2
 270         retl
 271         clr     %o0
 272         nop
 273         nop
 274         nop
 275
 276 .L_bn_mul_words_proceeed:
 277         srl     %o3,%g0,%o3     ! clruw %o3
 278         andcc   %o2,-4,%g0
 279         bz,pn   %icc,.L_bn_mul_words_tail
 280         clr     %o5
 281
 282 .L_bn_mul_words_loop:           ! wow! 32 aligned!
 283         lduw    [%o1+4],%g3
 284         mulx    %o3,%g2,%g2
 285         add     %g2,%o5,%o4
 286         nop
 287         stuw    %o4,[%o0]
 288         srlx    %o4,32,%o5
 289
 290         lduw    [%o1+8],%g2
 291         mulx    %o3,%g3,%g3
 292         add     %g3,%o5,%o4
 293         dec     4,%o2
 294         stuw    %o4,[%o0+4]
 295         srlx    %o4,32,%o5
 296
 297         lduw    [%o1+12],%g3
 298         mulx    %o3,%g2,%g2
 299         add     %g2,%o5,%o4
 300         inc     16,%o1
 301         stuw    %o4,[%o0+8]
 302         srlx    %o4,32,%o5
 303
 304         mulx    %o3,%g3,%g3
 305         add     %g3,%o5,%o4
 306         inc     16,%o0
 307         stuw    %o4,[%o0-4]
 308         srlx    %o4,32,%o5
 309         andcc   %o2,-4,%g0
 310         bnz,a,pt        %icc,.L_bn_mul_words_loop
 311         lduw    [%o1],%g2
 312         nop
 313         nop
 314
 315         brnz,a,pn       %o2,.L_bn_mul_words_tail
 316         lduw    [%o1],%g2
 317 .L_bn_mul_words_return:
 318         retl
 319         mov     %o5,%o0
 320
 321 .L_bn_mul_words_tail:
 322         mulx    %o3,%g2,%g2
 323         add     %g2,%o5,%o4
 324         dec     %o2
 325         srlx    %o4,32,%o5
 326         brz,pt  %o2,.L_bn_mul_words_return
 327         stuw    %o4,[%o0]
 328
 329         lduw    [%o1+4],%g2
 330         mulx    %o3,%g2,%g2
 331         add     %g2,%o5,%o4
 332         dec     %o2
 333         srlx    %o4,32,%o5
 334         brz,pt  %o2,.L_bn_mul_words_return
 335         stuw    %o4,[%o0+4]
 336
 337         lduw    [%o1+8],%g2
 338         mulx    %o3,%g2,%g2
 339         add     %g2,%o5,%o4
 340         stuw    %o4,[%o0+8]
 341         retl
 342         srlx    %o4,32,%o0
 343
 344 .type   bn_mul_words,#function
 345 .size   bn_mul_words,(.-bn_mul_words)
 346
 347 .align  32
 348 .global bn_sqr_words
 349 /*
 350  * void bn_sqr_words(r,a,n)
 351  * BN_ULONG *r,*a;
 352  * int n;
 353  */
 354 bn_sqr_words:
 355         sra     %o2,%g0,%o2     ! signx %o2
 356         brgz,a  %o2,.L_bn_sqr_words_proceeed
 357         lduw    [%o1],%g2
 358         retl
 359         clr     %o0
 360         nop
 361         nop
 362         nop
 363
 364 .L_bn_sqr_words_proceeed:
 365         andcc   %o2,-4,%g0
 366         nop
 367         bz,pn   %icc,.L_bn_sqr_words_tail
 368         nop
 369
 370 .L_bn_sqr_words_loop:           ! wow! 32 aligned!
 371         lduw    [%o1+4],%g3
 372         mulx    %g2,%g2,%o4
 373         stuw    %o4,[%o0]
 374         srlx    %o4,32,%o5
 375         stuw    %o5,[%o0+4]
 376         nop
 377
 378         lduw    [%o1+8],%g2
 379         mulx    %g3,%g3,%o4
 380         dec     4,%o2
 381         stuw    %o4,[%o0+8]
 382         srlx    %o4,32,%o5
 383         stuw    %o5,[%o0+12]
 384
 385         lduw    [%o1+12],%g3
 386         mulx    %g2,%g2,%o4
 387         srlx    %o4,32,%o5
 388         stuw    %o4,[%o0+16]
 389         inc     16,%o1
 390         stuw    %o5,[%o0+20]
 391
 392         mulx    %g3,%g3,%o4
 393         inc     32,%o0
 394         stuw    %o4,[%o0-8]
 395         srlx    %o4,32,%o5
 396         andcc   %o2,-4,%g2
 397         stuw    %o5,[%o0-4]
 398         bnz,a,pt        %icc,.L_bn_sqr_words_loop
 399         lduw    [%o1],%g2
 400         nop
 401
 402         brnz,a,pn       %o2,.L_bn_sqr_words_tail
 403         lduw    [%o1],%g2
 404 .L_bn_sqr_words_return:
 405         retl
 406         clr     %o0
 407
 408 .L_bn_sqr_words_tail:
 409         mulx    %g2,%g2,%o4
 410         dec     %o2
 411         stuw    %o4,[%o0]
 412         srlx    %o4,32,%o5
 413         brz,pt  %o2,.L_bn_sqr_words_return
 414         stuw    %o5,[%o0+4]
 415
 416         lduw    [%o1+4],%g2
 417         mulx    %g2,%g2,%o4
 418         dec     %o2
 419         stuw    %o4,[%o0+8]
 420         srlx    %o4,32,%o5
 421         brz,pt  %o2,.L_bn_sqr_words_return
 422         stuw    %o5,[%o0+12]
 423
 424         lduw    [%o1+8],%g2
 425         mulx    %g2,%g2,%o4
 426         srlx    %o4,32,%o5
 427         stuw    %o4,[%o0+16]
 428         stuw    %o5,[%o0+20]
 429         retl
 430         clr     %o0
 431
 432 .type   bn_sqr_words,#function
 433 .size   bn_sqr_words,(.-bn_sqr_words)
 434
 435 .align  32
 436 .global bn_div_words
 437 /*
 438  * BN_ULONG bn_div_words(h,l,d)
 439  * BN_ULONG h,l,d;
 440  */
 441 bn_div_words:
 442         sllx    %o0,32,%o0
 443         or      %o0,%o1,%o0
 444         udivx   %o0,%o2,%o0
 445         retl
 446         srl     %o0,%g0,%o0     ! clruw %o0
 447
 448 .type   bn_div_words,#function
 449 .size   bn_div_words,(.-bn_div_words)
 450
 451 .align  32
 452
 453 .global bn_add_words
 454 /*
 455  * BN_ULONG bn_add_words(rp,ap,bp,n)
 456  * BN_ULONG *rp,*ap,*bp;
 457  * int n;
 458  */
 459 bn_add_words:
 460         sra     %o3,%g0,%o3     ! signx %o3
 461         brgz,a  %o3,.L_bn_add_words_proceed
 462         lduw    [%o1],%o4
 463         retl
 464         clr     %o0
 465
 466 .L_bn_add_words_proceed:
 467         andcc   %o3,-4,%g0
 468         bz,pn   %icc,.L_bn_add_words_tail
 469         addcc   %g0,0,%g0       ! clear carry flag
 470
 471 .L_bn_add_words_loop:           ! wow! 32 aligned!
 472         dec     4,%o3
 473         lduw    [%o2],%o5
 474         lduw    [%o1+4],%g1
 475         lduw    [%o2+4],%g2
 476         lduw    [%o1+8],%g3
 477         lduw    [%o2+8],%g4
 478         addccc  %o5,%o4,%o5
 479         stuw    %o5,[%o0]
 480
 481         lduw    [%o1+12],%o4
 482         lduw    [%o2+12],%o5
 483         inc     16,%o1
 484         addccc  %g1,%g2,%g1
 485         stuw    %g1,[%o0+4]
 486
 487         inc     16,%o2
 488         addccc  %g3,%g4,%g3
 489         stuw    %g3,[%o0+8]
 490
 491         inc     16,%o0
 492         addccc  %o5,%o4,%o5
 493         stuw    %o5,[%o0-4]
 494         and     %o3,-4,%g1
 495         brnz,a,pt       %g1,.L_bn_add_words_loop
 496         lduw    [%o1],%o4
 497
 498         brnz,a,pn       %o3,.L_bn_add_words_tail
 499         lduw    [%o1],%o4
 500 .L_bn_add_words_return:
 501         clr     %o0
 502         retl
 503         movcs   %icc,1,%o0
 504         nop
 505
 506 .L_bn_add_words_tail:
 507         lduw    [%o2],%o5
 508         dec     %o3
 509         addccc  %o5,%o4,%o5
 510         brz,pt  %o3,.L_bn_add_words_return
 511         stuw    %o5,[%o0]
 512
 513         lduw    [%o1+4],%o4
 514         lduw    [%o2+4],%o5
 515         dec     %o3
 516         addccc  %o5,%o4,%o5
 517         brz,pt  %o3,.L_bn_add_words_return
 518         stuw    %o5,[%o0+4]
 519
 520         lduw    [%o1+8],%o4
 521         lduw    [%o2+8],%o5
 522         addccc  %o5,%o4,%o5
 523         stuw    %o5,[%o0+8]
 524         clr     %o0
 525         retl
 526         movcs   %icc,1,%o0
 527
 528 .type   bn_add_words,#function
 529 .size   bn_add_words,(.-bn_add_words)
 530
 531 .global bn_sub_words
 532 /*
 533  * BN_ULONG bn_sub_words(rp,ap,bp,n)
 534  * BN_ULONG *rp,*ap,*bp;
 535  * int n;
 536  */
 537 bn_sub_words:
 538         sra     %o3,%g0,%o3     ! signx %o3
 539         brgz,a  %o3,.L_bn_sub_words_proceed
 540         lduw    [%o1],%o4
 541         retl
 542         clr     %o0
 543
 544 .L_bn_sub_words_proceed:
 545         andcc   %o3,-4,%g0
 546         bz,pn   %icc,.L_bn_sub_words_tail
 547         addcc   %g0,0,%g0       ! clear carry flag
 548
 549 .L_bn_sub_words_loop:           ! wow! 32 aligned!
 550         dec     4,%o3
 551         lduw    [%o2],%o5
 552         lduw    [%o1+4],%g1
 553         lduw    [%o2+4],%g2
 554         lduw    [%o1+8],%g3
 555         lduw    [%o2+8],%g4
 556         subccc  %o4,%o5,%o5
 557         stuw    %o5,[%o0]
 558
 559         lduw    [%o1+12],%o4
 560         lduw    [%o2+12],%o5
 561         inc     16,%o1
 562         subccc  %g1,%g2,%g2
 563         stuw    %g2,[%o0+4]
 564
 565         inc     16,%o2
 566         subccc  %g3,%g4,%g4
 567         stuw    %g4,[%o0+8]
 568
 569         inc     16,%o0
 570         subccc  %o4,%o5,%o5
 571         stuw    %o5,[%o0-4]
 572         and     %o3,-4,%g1
 573         brnz,a,pt       %g1,.L_bn_sub_words_loop
 574         lduw    [%o1],%o4
 575
 576         brnz,a,pn       %o3,.L_bn_sub_words_tail
 577         lduw    [%o1],%o4
 578 .L_bn_sub_words_return:
 579         clr     %o0
 580         retl
 581         movcs   %icc,1,%o0
 582         nop
 583
 584 .L_bn_sub_words_tail:           ! wow! 32 aligned!
 585         lduw    [%o2],%o5
 586         dec     %o3
 587         subccc  %o4,%o5,%o5
 588         brz,pt  %o3,.L_bn_sub_words_return
 589         stuw    %o5,[%o0]
 590
 591         lduw    [%o1+4],%o4
 592         lduw    [%o2+4],%o5
 593         dec     %o3
 594         subccc  %o4,%o5,%o5
 595         brz,pt  %o3,.L_bn_sub_words_return
 596         stuw    %o5,[%o0+4]
 597
 598         lduw    [%o1+8],%o4
 599         lduw    [%o2+8],%o5
 600         subccc  %o4,%o5,%o5
 601         stuw    %o5,[%o0+8]
 602         clr     %o0
 603         retl
 604         movcs   %icc,1,%o0
 605
 606 .type   bn_sub_words,#function
 607 .size   bn_sub_words,(.-bn_sub_words)
 608
 609 /*
 610  * Code below depends on the fact that upper parts of the %l0-%l7
 611  * and %i0-%i7 are zeroed by kernel after context switch. In
 612  * previous versions this comment stated that "the trouble is that
 613  * it's not feasible to implement the mumbo-jumbo in less V9
 614  * instructions:-(" which apparently isn't true thanks to
 615  * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 616  * results not from the shorter code, but from elimination of
 617  * multicycle none-pairable 'rd %y,%rd' instructions.
 618  *
 619  *                                                      Andy.
 620  */
 621
 622 #define FRAME_SIZE      -96
 623
 624 /*
 625  * Here is register usage map for *all* routines below.
 626  */
 627 #define t_1     %o0
 628 #define t_2     %o1
 629 #define c_12    %o2
 630 #define c_3     %o3
 631
 632 #define ap(I)   [%i1+4*I]
 633 #define bp(I)   [%i2+4*I]
 634 #define rp(I)   [%i0+4*I]
 635
 636 #define a_0     %l0
 637 #define a_1     %l1
 638 #define a_2     %l2
 639 #define a_3     %l3
 640 #define a_4     %l4
 641 #define a_5     %l5
 642 #define a_6     %l6
 643 #define a_7     %l7
 644
 645 #define b_0     %i3
 646 #define b_1     %i4
 647 #define b_2     %i5
 648 #define b_3     %o4
 649 #define b_4     %o5
 650 #define b_5     %o7
 651 #define b_6     %g1
 652 #define b_7     %g4
 653
 654 .align  32
 655 .global bn_mul_comba8
 656 /*
 657  * void bn_mul_comba8(r,a,b)
 658  * BN_ULONG *r,*a,*b;
 659  */
 660 bn_mul_comba8:
 661         save    %sp,FRAME_SIZE,%sp
 662         mov     1,t_2
 663         lduw    ap(0),a_0
 664         sllx    t_2,32,t_2
 665         lduw    bp(0),b_0       !=
 666         lduw    bp(1),b_1
 667         mulx    a_0,b_0,t_1     !mul_add_c(a[0],b[0],c1,c2,c3);
 668         srlx    t_1,32,c_12
 669         stuw    t_1,rp(0)       !=!r[0]=c1;
 670
 671         lduw    ap(1),a_1
 672         mulx    a_0,b_1,t_1     !mul_add_c(a[0],b[1],c2,c3,c1);
 673         addcc   c_12,t_1,c_12
 674         clr     c_3             !=
 675         bcs,a   %xcc,.+8
 676         add     c_3,t_2,c_3
 677         lduw    ap(2),a_2
 678         mulx    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
 679         addcc   c_12,t_1,t_1
 680         bcs,a   %xcc,.+8
 681         add     c_3,t_2,c_3
 682         srlx    t_1,32,c_12     !=
 683         stuw    t_1,rp(1)       !r[1]=c2;
 684         or      c_12,c_3,c_12
 685
 686         mulx    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
 687         addcc   c_12,t_1,c_12   !=
 688         clr     c_3
 689         bcs,a   %xcc,.+8
 690         add     c_3,t_2,c_3
 691         lduw    bp(2),b_2       !=
 692         mulx    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
 693         addcc   c_12,t_1,c_12
 694         bcs,a   %xcc,.+8
 695         add     c_3,t_2,c_3     !=
 696         lduw    bp(3),b_3
 697         mulx    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
 698         addcc   c_12,t_1,t_1
 699         bcs,a   %xcc,.+8        !=
 700         add     c_3,t_2,c_3
 701         srlx    t_1,32,c_12
 702         stuw    t_1,rp(2)       !r[2]=c3;
 703         or      c_12,c_3,c_12   !=
 704
 705         mulx    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
 706         addcc   c_12,t_1,c_12
 707         clr     c_3
 708         bcs,a   %xcc,.+8        !=
 709         add     c_3,t_2,c_3
 710         mulx    a_1,b_2,t_1     !=!mul_add_c(a[1],b[2],c1,c2,c3);
 711         addcc   c_12,t_1,c_12
 712         bcs,a   %xcc,.+8        !=
 713         add     c_3,t_2,c_3
 714         lduw    ap(3),a_3
 715         mulx    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
 716         addcc   c_12,t_1,c_12   !=
 717         bcs,a   %xcc,.+8
 718         add     c_3,t_2,c_3
 719         lduw    ap(4),a_4
 720         mulx    a_3,b_0,t_1     !=!mul_add_c(a[3],b[0],c1,c2,c3);!=
 721         addcc   c_12,t_1,t_1
 722         bcs,a   %xcc,.+8
 723         add     c_3,t_2,c_3
 724         srlx    t_1,32,c_12     !=
 725         stuw    t_1,rp(3)       !r[3]=c1;
 726         or      c_12,c_3,c_12
 727
 728         mulx    a_4,b_0,t_1     !mul_add_c(a[4],b[0],c2,c3,c1);
 729         addcc   c_12,t_1,c_12   !=
 730         clr     c_3
 731         bcs,a   %xcc,.+8
 732         add     c_3,t_2,c_3
 733         mulx    a_3,b_1,t_1     !=!mul_add_c(a[3],b[1],c2,c3,c1);
 734         addcc   c_12,t_1,c_12
 735         bcs,a   %xcc,.+8
 736         add     c_3,t_2,c_3
 737         mulx    a_2,b_2,t_1     !=!mul_add_c(a[2],b[2],c2,c3,c1);
 738         addcc   c_12,t_1,c_12
 739         bcs,a   %xcc,.+8
 740         add     c_3,t_2,c_3
 741         lduw    bp(4),b_4       !=
 742         mulx    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
 743         addcc   c_12,t_1,c_12
 744         bcs,a   %xcc,.+8
 745         add     c_3,t_2,c_3     !=
 746         lduw    bp(5),b_5
 747         mulx    a_0,b_4,t_1     !mul_add_c(a[0],b[4],c2,c3,c1);
 748         addcc   c_12,t_1,t_1
 749         bcs,a   %xcc,.+8        !=
 750         add     c_3,t_2,c_3
 751         srlx    t_1,32,c_12
 752         stuw    t_1,rp(4)       !r[4]=c2;
 753         or      c_12,c_3,c_12   !=
 754
 755         mulx    a_0,b_5,t_1     !mul_add_c(a[0],b[5],c3,c1,c2);
 756         addcc   c_12,t_1,c_12
 757         clr     c_3
 758         bcs,a   %xcc,.+8        !=
 759         add     c_3,t_2,c_3
 760         mulx    a_1,b_4,t_1     !mul_add_c(a[1],b[4],c3,c1,c2);
 761         addcc   c_12,t_1,c_12
 762         bcs,a   %xcc,.+8        !=
 763         add     c_3,t_2,c_3
 764         mulx    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
 765         addcc   c_12,t_1,c_12
 766         bcs,a   %xcc,.+8        !=
 767         add     c_3,t_2,c_3
 768         mulx    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
 769         addcc   c_12,t_1,c_12
 770         bcs,a   %xcc,.+8        !=
 771         add     c_3,t_2,c_3
 772         lduw    ap(5),a_5
 773         mulx    a_4,b_1,t_1     !mul_add_c(a[4],b[1],c3,c1,c2);
 774         addcc   c_12,t_1,c_12   !=
 775         bcs,a   %xcc,.+8
 776         add     c_3,t_2,c_3
 777         lduw    ap(6),a_6
 778         mulx    a_5,b_0,t_1     !=!mul_add_c(a[5],b[0],c3,c1,c2);
 779         addcc   c_12,t_1,t_1
 780         bcs,a   %xcc,.+8
 781         add     c_3,t_2,c_3
 782         srlx    t_1,32,c_12     !=
 783         stuw    t_1,rp(5)       !r[5]=c3;
 784         or      c_12,c_3,c_12
 785
 786         mulx    a_6,b_0,t_1     !mul_add_c(a[6],b[0],c1,c2,c3);
 787         addcc   c_12,t_1,c_12   !=
 788         clr     c_3
 789         bcs,a   %xcc,.+8
 790         add     c_3,t_2,c_3
 791         mulx    a_5,b_1,t_1     !=!mul_add_c(a[5],b[1],c1,c2,c3);
 792         addcc   c_12,t_1,c_12
 793         bcs,a   %xcc,.+8
 794         add     c_3,t_2,c_3
 795         mulx    a_4,b_2,t_1     !=!mul_add_c(a[4],b[2],c1,c2,c3);
 796         addcc   c_12,t_1,c_12
 797         bcs,a   %xcc,.+8
 798         add     c_3,t_2,c_3
 799         mulx    a_3,b_3,t_1     !=!mul_add_c(a[3],b[3],c1,c2,c3);
 800         addcc   c_12,t_1,c_12
 801         bcs,a   %xcc,.+8
 802         add     c_3,t_2,c_3
 803         mulx    a_2,b_4,t_1     !=!mul_add_c(a[2],b[4],c1,c2,c3);
 804         addcc   c_12,t_1,c_12
 805         bcs,a   %xcc,.+8
 806         add     c_3,t_2,c_3
 807         lduw    bp(6),b_6       !=
 808         mulx    a_1,b_5,t_1     !mul_add_c(a[1],b[5],c1,c2,c3);
 809         addcc   c_12,t_1,c_12
 810         bcs,a   %xcc,.+8
 811         add     c_3,t_2,c_3     !=
 812         lduw    bp(7),b_7
 813         mulx    a_0,b_6,t_1     !mul_add_c(a[0],b[6],c1,c2,c3);
 814         addcc   c_12,t_1,t_1
 815         bcs,a   %xcc,.+8        !=
 816         add     c_3,t_2,c_3
 817         srlx    t_1,32,c_12
 818         stuw    t_1,rp(6)       !r[6]=c1;
 819         or      c_12,c_3,c_12   !=
 820
 821         mulx    a_0,b_7,t_1     !mul_add_c(a[0],b[7],c2,c3,c1);
 822         addcc   c_12,t_1,c_12
 823         clr     c_3
 824         bcs,a   %xcc,.+8        !=
 825         add     c_3,t_2,c_3
 826         mulx    a_1,b_6,t_1     !mul_add_c(a[1],b[6],c2,c3,c1);
 827         addcc   c_12,t_1,c_12
 828         bcs,a   %xcc,.+8        !=
 829         add     c_3,t_2,c_3
 830         mulx    a_2,b_5,t_1     !mul_add_c(a[2],b[5],c2,c3,c1);
 831         addcc   c_12,t_1,c_12
 832         bcs,a   %xcc,.+8        !=
 833         add     c_3,t_2,c_3
 834         mulx    a_3,b_4,t_1     !mul_add_c(a[3],b[4],c2,c3,c1);
 835         addcc   c_12,t_1,c_12
 836         bcs,a   %xcc,.+8        !=
 837         add     c_3,t_2,c_3
 838         mulx    a_4,b_3,t_1     !mul_add_c(a[4],b[3],c2,c3,c1);
 839         addcc   c_12,t_1,c_12
 840         bcs,a   %xcc,.+8        !=
 841         add     c_3,t_2,c_3
 842         mulx    a_5,b_2,t_1     !mul_add_c(a[5],b[2],c2,c3,c1);
 843         addcc   c_12,t_1,c_12
 844         bcs,a   %xcc,.+8        !=
 845         add     c_3,t_2,c_3
 846         lduw    ap(7),a_7
 847         mulx    a_6,b_1,t_1     !=!mul_add_c(a[6],b[1],c2,c3,c1);
 848         addcc   c_12,t_1,c_12
 849         bcs,a   %xcc,.+8
 850         add     c_3,t_2,c_3
 851         mulx    a_7,b_0,t_1     !=!mul_add_c(a[7],b[0],c2,c3,c1);
 852         addcc   c_12,t_1,t_1
 853         bcs,a   %xcc,.+8
 854         add     c_3,t_2,c_3
 855         srlx    t_1,32,c_12     !=
 856         stuw    t_1,rp(7)       !r[7]=c2;
 857         or      c_12,c_3,c_12
 858
 859         mulx    a_7,b_1,t_1     !=!mul_add_c(a[7],b[1],c3,c1,c2);
 860         addcc   c_12,t_1,c_12
 861         clr     c_3
 862         bcs,a   %xcc,.+8
 863         add     c_3,t_2,c_3     !=
 864         mulx    a_6,b_2,t_1     !mul_add_c(a[6],b[2],c3,c1,c2);
 865         addcc   c_12,t_1,c_12
 866         bcs,a   %xcc,.+8
 867         add     c_3,t_2,c_3     !=
 868         mulx    a_5,b_3,t_1     !mul_add_c(a[5],b[3],c3,c1,c2);
 869         addcc   c_12,t_1,c_12
 870         bcs,a   %xcc,.+8
 871         add     c_3,t_2,c_3     !=
 872         mulx    a_4,b_4,t_1     !mul_add_c(a[4],b[4],c3,c1,c2);
 873         addcc   c_12,t_1,c_12
 874         bcs,a   %xcc,.+8
 875         add     c_3,t_2,c_3     !=
 876         mulx    a_3,b_5,t_1     !mul_add_c(a[3],b[5],c3,c1,c2);
 877         addcc   c_12,t_1,c_12
 878         bcs,a   %xcc,.+8
 879         add     c_3,t_2,c_3     !=
 880         mulx    a_2,b_6,t_1     !mul_add_c(a[2],b[6],c3,c1,c2);
 881         addcc   c_12,t_1,c_12
 882         bcs,a   %xcc,.+8
 883         add     c_3,t_2,c_3     !=
 884         mulx    a_1,b_7,t_1     !mul_add_c(a[1],b[7],c3,c1,c2);
 885         addcc   c_12,t_1,t_1
 886         bcs,a   %xcc,.+8
 887         add     c_3,t_2,c_3     !=
 888         srlx    t_1,32,c_12
 889         stuw    t_1,rp(8)       !r[8]=c3;
 890         or      c_12,c_3,c_12
 891
 892         mulx    a_2,b_7,t_1     !=!mul_add_c(a[2],b[7],c1,c2,c3);
 893         addcc   c_12,t_1,c_12
 894         clr     c_3
 895         bcs,a   %xcc,.+8
 896         add     c_3,t_2,c_3     !=
 897         mulx    a_3,b_6,t_1     !mul_add_c(a[3],b[6],c1,c2,c3);
 898         addcc   c_12,t_1,c_12
 899         bcs,a   %xcc,.+8        !=
 900         add     c_3,t_2,c_3
 901         mulx    a_4,b_5,t_1     !mul_add_c(a[4],b[5],c1,c2,c3);
 902         addcc   c_12,t_1,c_12
 903         bcs,a   %xcc,.+8        !=
 904         add     c_3,t_2,c_3
 905         mulx    a_5,b_4,t_1     !mul_add_c(a[5],b[4],c1,c2,c3);
 906         addcc   c_12,t_1,c_12
 907         bcs,a   %xcc,.+8        !=
 908         add     c_3,t_2,c_3
 909         mulx    a_6,b_3,t_1     !mul_add_c(a[6],b[3],c1,c2,c3);
 910         addcc   c_12,t_1,c_12
 911         bcs,a   %xcc,.+8        !=
 912         add     c_3,t_2,c_3
 913         mulx    a_7,b_2,t_1     !mul_add_c(a[7],b[2],c1,c2,c3);
 914         addcc   c_12,t_1,t_1
 915         bcs,a   %xcc,.+8        !=
 916         add     c_3,t_2,c_3
 917         srlx    t_1,32,c_12
 918         stuw    t_1,rp(9)       !r[9]=c1;
 919         or      c_12,c_3,c_12   !=
 920
 921         mulx    a_7,b_3,t_1     !mul_add_c(a[7],b[3],c2,c3,c1);
 922         addcc   c_12,t_1,c_12
 923         clr     c_3
 924         bcs,a   %xcc,.+8        !=
 925         add     c_3,t_2,c_3
 926         mulx    a_6,b_4,t_1     !mul_add_c(a[6],b[4],c2,c3,c1);
 927         addcc   c_12,t_1,c_12
 928         bcs,a   %xcc,.+8        !=
 929         add     c_3,t_2,c_3
 930         mulx    a_5,b_5,t_1     !mul_add_c(a[5],b[5],c2,c3,c1);
 931         addcc   c_12,t_1,c_12
 932         bcs,a   %xcc,.+8        !=
 933         add     c_3,t_2,c_3
 934         mulx    a_4,b_6,t_1     !mul_add_c(a[4],b[6],c2,c3,c1);
 935         addcc   c_12,t_1,c_12
 936         bcs,a   %xcc,.+8        !=
 937         add     c_3,t_2,c_3
 938         mulx    a_3,b_7,t_1     !mul_add_c(a[3],b[7],c2,c3,c1);
 939         addcc   c_12,t_1,t_1
 940         bcs,a   %xcc,.+8        !=
 941         add     c_3,t_2,c_3
 942         srlx    t_1,32,c_12
 943         stuw    t_1,rp(10)      !r[10]=c2;
 944         or      c_12,c_3,c_12   !=
 945
 946         mulx    a_4,b_7,t_1     !mul_add_c(a[4],b[7],c3,c1,c2);
 947         addcc   c_12,t_1,c_12
 948         clr     c_3
 949         bcs,a   %xcc,.+8        !=
 950         add     c_3,t_2,c_3
 951         mulx    a_5,b_6,t_1     !mul_add_c(a[5],b[6],c3,c1,c2);
 952         addcc   c_12,t_1,c_12
 953         bcs,a   %xcc,.+8        !=
 954         add     c_3,t_2,c_3
 955         mulx    a_6,b_5,t_1     !mul_add_c(a[6],b[5],c3,c1,c2);
 956         addcc   c_12,t_1,c_12
 957         bcs,a   %xcc,.+8        !=
 958         add     c_3,t_2,c_3
 959         mulx    a_7,b_4,t_1     !mul_add_c(a[7],b[4],c3,c1,c2);
 960         addcc   c_12,t_1,t_1
 961         bcs,a   %xcc,.+8        !=
 962         add     c_3,t_2,c_3
 963         srlx    t_1,32,c_12
 964         stuw    t_1,rp(11)      !r[11]=c3;
 965         or      c_12,c_3,c_12   !=
 966
 967         mulx    a_7,b_5,t_1     !mul_add_c(a[7],b[5],c1,c2,c3);
 968         addcc   c_12,t_1,c_12
 969         clr     c_3
 970         bcs,a   %xcc,.+8        !=
 971         add     c_3,t_2,c_3
 972         mulx    a_6,b_6,t_1     !mul_add_c(a[6],b[6],c1,c2,c3);
 973         addcc   c_12,t_1,c_12
 974         bcs,a   %xcc,.+8        !=
 975         add     c_3,t_2,c_3
 976         mulx    a_5,b_7,t_1     !mul_add_c(a[5],b[7],c1,c2,c3);
 977         addcc   c_12,t_1,t_1
 978         bcs,a   %xcc,.+8        !=
 979         add     c_3,t_2,c_3
 980         srlx    t_1,32,c_12
 981         stuw    t_1,rp(12)      !r[12]=c1;
 982         or      c_12,c_3,c_12   !=
 983
 984         mulx    a_6,b_7,t_1     !mul_add_c(a[6],b[7],c2,c3,c1);
 985         addcc   c_12,t_1,c_12
 986         clr     c_3
 987         bcs,a   %xcc,.+8        !=
 988         add     c_3,t_2,c_3
 989         mulx    a_7,b_6,t_1     !mul_add_c(a[7],b[6],c2,c3,c1);
 990         addcc   c_12,t_1,t_1
 991         bcs,a   %xcc,.+8        !=
 992         add     c_3,t_2,c_3
 993         srlx    t_1,32,c_12
 994         st      t_1,rp(13)      !r[13]=c2;
 995         or      c_12,c_3,c_12   !=
 996
 997         mulx    a_7,b_7,t_1     !mul_add_c(a[7],b[7],c3,c1,c2);
 998         addcc   c_12,t_1,t_1
 999         srlx    t_1,32,c_12     !=
1000         stuw    t_1,rp(14)      !r[14]=c3;
1001         stuw    c_12,rp(15)     !r[15]=c1;
1002
1003         ret
1004         restore %g0,%g0,%o0     !=
1005
1006 .type   bn_mul_comba8,#function
1007 .size   bn_mul_comba8,(.-bn_mul_comba8)
1008
1009 .align  32
1010
1011 .global bn_mul_comba4
1012 /*
1013  * void bn_mul_comba4(r,a,b)
1014  * BN_ULONG *r,*a,*b;
1015  */
1016 bn_mul_comba4:
1017         save    %sp,FRAME_SIZE,%sp
1018         lduw    ap(0),a_0
1019         mov     1,t_2
1020         lduw    bp(0),b_0
1021         sllx    t_2,32,t_2      !=
1022         lduw    bp(1),b_1
1023         mulx    a_0,b_0,t_1     !mul_add_c(a[0],b[0],c1,c2,c3);
1024         srlx    t_1,32,c_12
1025         stuw    t_1,rp(0)       !=!r[0]=c1;
1026
1027         lduw    ap(1),a_1
1028         mulx    a_0,b_1,t_1     !mul_add_c(a[0],b[1],c2,c3,c1);
1029         addcc   c_12,t_1,c_12
1030         clr     c_3             !=
1031         bcs,a   %xcc,.+8
1032         add     c_3,t_2,c_3
1033         lduw    ap(2),a_2
1034         mulx    a_1,b_0,t_1     !=!mul_add_c(a[1],b[0],c2,c3,c1);
1035         addcc   c_12,t_1,t_1
1036         bcs,a   %xcc,.+8
1037         add     c_3,t_2,c_3
1038         srlx    t_1,32,c_12     !=
1039         stuw    t_1,rp(1)       !r[1]=c2;
1040         or      c_12,c_3,c_12
1041
1042         mulx    a_2,b_0,t_1     !mul_add_c(a[2],b[0],c3,c1,c2);
1043         addcc   c_12,t_1,c_12   !=
1044         clr     c_3
1045         bcs,a   %xcc,.+8
1046         add     c_3,t_2,c_3
1047         lduw    bp(2),b_2       !=
1048         mulx    a_1,b_1,t_1     !mul_add_c(a[1],b[1],c3,c1,c2);
1049         addcc   c_12,t_1,c_12
1050         bcs,a   %xcc,.+8
1051         add     c_3,t_2,c_3     !=
1052         lduw    bp(3),b_3
1053         mulx    a_0,b_2,t_1     !mul_add_c(a[0],b[2],c3,c1,c2);
1054         addcc   c_12,t_1,t_1
1055         bcs,a   %xcc,.+8        !=
1056         add     c_3,t_2,c_3
1057         srlx    t_1,32,c_12
1058         stuw    t_1,rp(2)       !r[2]=c3;
1059         or      c_12,c_3,c_12   !=
1060
1061         mulx    a_0,b_3,t_1     !mul_add_c(a[0],b[3],c1,c2,c3);
1062         addcc   c_12,t_1,c_12
1063         clr     c_3
1064         bcs,a   %xcc,.+8        !=
1065         add     c_3,t_2,c_3
1066         mulx    a_1,b_2,t_1     !mul_add_c(a[1],b[2],c1,c2,c3);
1067         addcc   c_12,t_1,c_12
1068         bcs,a   %xcc,.+8        !=
1069         add     c_3,t_2,c_3
1070         lduw    ap(3),a_3
1071         mulx    a_2,b_1,t_1     !mul_add_c(a[2],b[1],c1,c2,c3);
1072         addcc   c_12,t_1,c_12   !=
1073         bcs,a   %xcc,.+8
1074         add     c_3,t_2,c_3
1075         mulx    a_3,b_0,t_1     !mul_add_c(a[3],b[0],c1,c2,c3);!=
1076         addcc   c_12,t_1,t_1    !=
1077         bcs,a   %xcc,.+8
1078         add     c_3,t_2,c_3
1079         srlx    t_1,32,c_12
1080         stuw    t_1,rp(3)       !=!r[3]=c1;
1081         or      c_12,c_3,c_12
1082
1083         mulx    a_3,b_1,t_1     !mul_add_c(a[3],b[1],c2,c3,c1);
1084         addcc   c_12,t_1,c_12
1085         clr     c_3             !=
1086         bcs,a   %xcc,.+8
1087         add     c_3,t_2,c_3
1088         mulx    a_2,b_2,t_1     !mul_add_c(a[2],b[2],c2,c3,c1);
1089         addcc   c_12,t_1,c_12   !=
1090         bcs,a   %xcc,.+8
1091         add     c_3,t_2,c_3
1092         mulx    a_1,b_3,t_1     !mul_add_c(a[1],b[3],c2,c3,c1);
1093         addcc   c_12,t_1,t_1    !=
1094         bcs,a   %xcc,.+8
1095         add     c_3,t_2,c_3
1096         srlx    t_1,32,c_12
1097         stuw    t_1,rp(4)       !=!r[4]=c2;
1098         or      c_12,c_3,c_12
1099
1100         mulx    a_2,b_3,t_1     !mul_add_c(a[2],b[3],c3,c1,c2);
1101         addcc   c_12,t_1,c_12
1102         clr     c_3             !=
1103         bcs,a   %xcc,.+8
1104         add     c_3,t_2,c_3
1105         mulx    a_3,b_2,t_1     !mul_add_c(a[3],b[2],c3,c1,c2);
1106         addcc   c_12,t_1,t_1    !=
1107         bcs,a   %xcc,.+8
1108         add     c_3,t_2,c_3
1109         srlx    t_1,32,c_12
1110         stuw    t_1,rp(5)       !=!r[5]=c3;
1111         or      c_12,c_3,c_12
1112
1113         mulx    a_3,b_3,t_1     !mul_add_c(a[3],b[3],c1,c2,c3);
1114         addcc   c_12,t_1,t_1
1115         srlx    t_1,32,c_12     !=
1116         stuw    t_1,rp(6)       !r[6]=c1;
1117         stuw    c_12,rp(7)      !r[7]=c2;
1118
1119         ret
1120         restore %g0,%g0,%o0
1121
1122 .type   bn_mul_comba4,#function
1123 .size   bn_mul_comba4,(.-bn_mul_comba4)
1124
1125 .align  32
1126
1127 .global bn_sqr_comba8
1128 bn_sqr_comba8:
1129         save    %sp,FRAME_SIZE,%sp
1130         mov     1,t_2
1131         lduw    ap(0),a_0
1132         sllx    t_2,32,t_2
1133         lduw    ap(1),a_1
1134         mulx    a_0,a_0,t_1     !sqr_add_c(a,0,c1,c2,c3);
1135         srlx    t_1,32,c_12
1136         stuw    t_1,rp(0)       !r[0]=c1;
1137
1138         lduw    ap(2),a_2
1139         mulx    a_0,a_1,t_1     !=!sqr_add_c2(a,1,0,c2,c3,c1);
1140         addcc   c_12,t_1,c_12
1141         clr     c_3
1142         bcs,a   %xcc,.+8
1143         add     c_3,t_2,c_3
1144         addcc   c_12,t_1,t_1
1145         bcs,a   %xcc,.+8
1146         add     c_3,t_2,c_3
1147         srlx    t_1,32,c_12
1148         stuw    t_1,rp(1)       !r[1]=c2;
1149         or      c_12,c_3,c_12
1150
1151         mulx    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
1152         addcc   c_12,t_1,c_12
1153         clr     c_3
1154         bcs,a   %xcc,.+8
1155         add     c_3,t_2,c_3
1156         addcc   c_12,t_1,c_12
1157         bcs,a   %xcc,.+8
1158         add     c_3,t_2,c_3
1159         lduw    ap(3),a_3
1160         mulx    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
1161         addcc   c_12,t_1,t_1
1162         bcs,a   %xcc,.+8
1163         add     c_3,t_2,c_3
1164         srlx    t_1,32,c_12
1165         stuw    t_1,rp(2)       !r[2]=c3;
1166         or      c_12,c_3,c_12
1167
1168         mulx    a_0,a_3,t_1     !sqr_add_c2(a,3,0,c1,c2,c3);
1169         addcc   c_12,t_1,c_12
1170         clr     c_3
1171         bcs,a   %xcc,.+8
1172         add     c_3,t_2,c_3
1173         addcc   c_12,t_1,c_12
1174         bcs,a   %xcc,.+8
1175         add     c_3,t_2,c_3
1176         lduw    ap(4),a_4
1177         mulx    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
1178         addcc   c_12,t_1,c_12
1179         bcs,a   %xcc,.+8
1180         add     c_3,t_2,c_3
1181         addcc   c_12,t_1,t_1
1182         bcs,a   %xcc,.+8
1183         add     c_3,t_2,c_3
1184         srlx    t_1,32,c_12
1185         st      t_1,rp(3)       !r[3]=c1;
1186         or      c_12,c_3,c_12
1187
1188         mulx    a_4,a_0,t_1     !sqr_add_c2(a,4,0,c2,c3,c1);
1189         addcc   c_12,t_1,c_12
1190         clr     c_3
1191         bcs,a   %xcc,.+8
1192         add     c_3,t_2,c_3
1193         addcc   c_12,t_1,c_12
1194         bcs,a   %xcc,.+8
1195         add     c_3,t_2,c_3
1196         mulx    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
1197         addcc   c_12,t_1,c_12
1198         bcs,a   %xcc,.+8
1199         add     c_3,t_2,c_3
1200         addcc   c_12,t_1,c_12
1201         bcs,a   %xcc,.+8
1202         add     c_3,t_2,c_3
1203         lduw    ap(5),a_5
1204         mulx    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
1205         addcc   c_12,t_1,t_1
1206         bcs,a   %xcc,.+8
1207         add     c_3,t_2,c_3
1208         srlx    t_1,32,c_12
1209         stuw    t_1,rp(4)       !r[4]=c2;
1210         or      c_12,c_3,c_12
1211
1212         mulx    a_0,a_5,t_1     !sqr_add_c2(a,5,0,c3,c1,c2);
1213         addcc   c_12,t_1,c_12
1214         clr     c_3
1215         bcs,a   %xcc,.+8
1216         add     c_3,t_2,c_3
1217         addcc   c_12,t_1,c_12
1218         bcs,a   %xcc,.+8
1219         add     c_3,t_2,c_3
1220         mulx    a_1,a_4,t_1     !sqr_add_c2(a,4,1,c3,c1,c2);
1221         addcc   c_12,t_1,c_12
1222         bcs,a   %xcc,.+8
1223         add     c_3,t_2,c_3
1224         addcc   c_12,t_1,c_12
1225         bcs,a   %xcc,.+8
1226         add     c_3,t_2,c_3
1227         lduw    ap(6),a_6
1228         mulx    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
1229         addcc   c_12,t_1,c_12
1230         bcs,a   %xcc,.+8
1231         add     c_3,t_2,c_3
1232         addcc   c_12,t_1,t_1
1233         bcs,a   %xcc,.+8
1234         add     c_3,t_2,c_3
1235         srlx    t_1,32,c_12
1236         stuw    t_1,rp(5)       !r[5]=c3;
1237         or      c_12,c_3,c_12
1238
1239         mulx    a_6,a_0,t_1     !sqr_add_c2(a,6,0,c1,c2,c3);
1240         addcc   c_12,t_1,c_12
1241         clr     c_3
1242         bcs,a   %xcc,.+8
1243         add     c_3,t_2,c_3
1244         addcc   c_12,t_1,c_12
1245         bcs,a   %xcc,.+8
1246         add     c_3,t_2,c_3
1247         mulx    a_5,a_1,t_1     !sqr_add_c2(a,5,1,c1,c2,c3);
1248         addcc   c_12,t_1,c_12
1249         bcs,a   %xcc,.+8
1250         add     c_3,t_2,c_3
1251         addcc   c_12,t_1,c_12
1252         bcs,a   %xcc,.+8
1253         add     c_3,t_2,c_3
1254         mulx    a_4,a_2,t_1     !sqr_add_c2(a,4,2,c1,c2,c3);
1255         addcc   c_12,t_1,c_12
1256         bcs,a   %xcc,.+8
1257         add     c_3,t_2,c_3
1258         addcc   c_12,t_1,c_12
1259         bcs,a   %xcc,.+8
1260         add     c_3,t_2,c_3
1261         lduw    ap(7),a_7
1262         mulx    a_3,a_3,t_1     !=!sqr_add_c(a,3,c1,c2,c3);
1263         addcc   c_12,t_1,t_1
1264         bcs,a   %xcc,.+8
1265         add     c_3,t_2,c_3
1266         srlx    t_1,32,c_12
1267         stuw    t_1,rp(6)       !r[6]=c1;
1268         or      c_12,c_3,c_12
1269
1270         mulx    a_0,a_7,t_1     !sqr_add_c2(a,7,0,c2,c3,c1);
1271         addcc   c_12,t_1,c_12
1272         clr     c_3
1273         bcs,a   %xcc,.+8
1274         add     c_3,t_2,c_3
1275         addcc   c_12,t_1,c_12
1276         bcs,a   %xcc,.+8
1277         add     c_3,t_2,c_3
1278         mulx    a_1,a_6,t_1     !sqr_add_c2(a,6,1,c2,c3,c1);
1279         addcc   c_12,t_1,c_12
1280         bcs,a   %xcc,.+8
1281         add     c_3,t_2,c_3
1282         addcc   c_12,t_1,c_12
1283         bcs,a   %xcc,.+8
1284         add     c_3,t_2,c_3
1285         mulx    a_2,a_5,t_1     !sqr_add_c2(a,5,2,c2,c3,c1);
1286         addcc   c_12,t_1,c_12
1287         bcs,a   %xcc,.+8
1288         add     c_3,t_2,c_3
1289         addcc   c_12,t_1,c_12
1290         bcs,a   %xcc,.+8
1291         add     c_3,t_2,c_3
1292         mulx    a_3,a_4,t_1     !sqr_add_c2(a,4,3,c2,c3,c1);
1293         addcc   c_12,t_1,c_12
1294         bcs,a   %xcc,.+8
1295         add     c_3,t_2,c_3
1296         addcc   c_12,t_1,t_1
1297         bcs,a   %xcc,.+8
1298         add     c_3,t_2,c_3
1299         srlx    t_1,32,c_12
1300         stuw    t_1,rp(7)       !r[7]=c2;
1301         or      c_12,c_3,c_12
1302
1303         mulx    a_7,a_1,t_1     !sqr_add_c2(a,7,1,c3,c1,c2);
1304         addcc   c_12,t_1,c_12
1305         clr     c_3
1306         bcs,a   %xcc,.+8
1307         add     c_3,t_2,c_3
1308         addcc   c_12,t_1,c_12
1309         bcs,a   %xcc,.+8
1310         add     c_3,t_2,c_3
1311         mulx    a_6,a_2,t_1     !sqr_add_c2(a,6,2,c3,c1,c2);
1312         addcc   c_12,t_1,c_12
1313         bcs,a   %xcc,.+8
1314         add     c_3,t_2,c_3
1315         addcc   c_12,t_1,c_12
1316         bcs,a   %xcc,.+8
1317         add     c_3,t_2,c_3
1318         mulx    a_5,a_3,t_1     !sqr_add_c2(a,5,3,c3,c1,c2);
1319         addcc   c_12,t_1,c_12
1320         bcs,a   %xcc,.+8
1321         add     c_3,t_2,c_3
1322         addcc   c_12,t_1,c_12
1323         bcs,a   %xcc,.+8
1324         add     c_3,t_2,c_3
1325         mulx    a_4,a_4,t_1     !sqr_add_c(a,4,c3,c1,c2);
1326         addcc   c_12,t_1,t_1
1327         bcs,a   %xcc,.+8
1328         add     c_3,t_2,c_3
1329         srlx    t_1,32,c_12
1330         stuw    t_1,rp(8)       !r[8]=c3;
1331         or      c_12,c_3,c_12
1332
1333         mulx    a_2,a_7,t_1     !sqr_add_c2(a,7,2,c1,c2,c3);
1334         addcc   c_12,t_1,c_12
1335         clr     c_3
1336         bcs,a   %xcc,.+8
1337         add     c_3,t_2,c_3
1338         addcc   c_12,t_1,c_12
1339         bcs,a   %xcc,.+8
1340         add     c_3,t_2,c_3
1341         mulx    a_3,a_6,t_1     !sqr_add_c2(a,6,3,c1,c2,c3);
1342         addcc   c_12,t_1,c_12
1343         bcs,a   %xcc,.+8
1344         add     c_3,t_2,c_3
1345         addcc   c_12,t_1,c_12
1346         bcs,a   %xcc,.+8
1347         add     c_3,t_2,c_3
1348         mulx    a_4,a_5,t_1     !sqr_add_c2(a,5,4,c1,c2,c3);
1349         addcc   c_12,t_1,c_12
1350         bcs,a   %xcc,.+8
1351         add     c_3,t_2,c_3
1352         addcc   c_12,t_1,t_1
1353         bcs,a   %xcc,.+8
1354         add     c_3,t_2,c_3
1355         srlx    t_1,32,c_12
1356         stuw    t_1,rp(9)       !r[9]=c1;
1357         or      c_12,c_3,c_12
1358
1359         mulx    a_7,a_3,t_1     !sqr_add_c2(a,7,3,c2,c3,c1);
1360         addcc   c_12,t_1,c_12
1361         clr     c_3
1362         bcs,a   %xcc,.+8
1363         add     c_3,t_2,c_3
1364         addcc   c_12,t_1,c_12
1365         bcs,a   %xcc,.+8
1366         add     c_3,t_2,c_3
1367         mulx    a_6,a_4,t_1     !sqr_add_c2(a,6,4,c2,c3,c1);
1368         addcc   c_12,t_1,c_12
1369         bcs,a   %xcc,.+8
1370         add     c_3,t_2,c_3
1371         addcc   c_12,t_1,c_12
1372         bcs,a   %xcc,.+8
1373         add     c_3,t_2,c_3
1374         mulx    a_5,a_5,t_1     !sqr_add_c(a,5,c2,c3,c1);
1375         addcc   c_12,t_1,t_1
1376         bcs,a   %xcc,.+8
1377         add     c_3,t_2,c_3
1378         srlx    t_1,32,c_12
1379         stuw    t_1,rp(10)      !r[10]=c2;
1380         or      c_12,c_3,c_12
1381
1382         mulx    a_4,a_7,t_1     !sqr_add_c2(a,7,4,c3,c1,c2);
1383         addcc   c_12,t_1,c_12
1384         clr     c_3
1385         bcs,a   %xcc,.+8
1386         add     c_3,t_2,c_3
1387         addcc   c_12,t_1,c_12
1388         bcs,a   %xcc,.+8
1389         add     c_3,t_2,c_3
1390         mulx    a_5,a_6,t_1     !sqr_add_c2(a,6,5,c3,c1,c2);
1391         addcc   c_12,t_1,c_12
1392         bcs,a   %xcc,.+8
1393         add     c_3,t_2,c_3
1394         addcc   c_12,t_1,t_1
1395         bcs,a   %xcc,.+8
1396         add     c_3,t_2,c_3
1397         srlx    t_1,32,c_12
1398         stuw    t_1,rp(11)      !r[11]=c3;
1399         or      c_12,c_3,c_12
1400
1401         mulx    a_7,a_5,t_1     !sqr_add_c2(a,7,5,c1,c2,c3);
1402         addcc   c_12,t_1,c_12
1403         clr     c_3
1404         bcs,a   %xcc,.+8
1405         add     c_3,t_2,c_3
1406         addcc   c_12,t_1,c_12
1407         bcs,a   %xcc,.+8
1408         add     c_3,t_2,c_3
1409         mulx    a_6,a_6,t_1     !sqr_add_c(a,6,c1,c2,c3);
1410         addcc   c_12,t_1,t_1
1411         bcs,a   %xcc,.+8
1412         add     c_3,t_2,c_3
1413         srlx    t_1,32,c_12
1414         stuw    t_1,rp(12)      !r[12]=c1;
1415         or      c_12,c_3,c_12
1416
1417         mulx    a_6,a_7,t_1     !sqr_add_c2(a,7,6,c2,c3,c1);
1418         addcc   c_12,t_1,c_12
1419         clr     c_3
1420         bcs,a   %xcc,.+8
1421         add     c_3,t_2,c_3
1422         addcc   c_12,t_1,t_1
1423         bcs,a   %xcc,.+8
1424         add     c_3,t_2,c_3
1425         srlx    t_1,32,c_12
1426         stuw    t_1,rp(13)      !r[13]=c2;
1427         or      c_12,c_3,c_12
1428
1429         mulx    a_7,a_7,t_1     !sqr_add_c(a,7,c3,c1,c2);
1430         addcc   c_12,t_1,t_1
1431         srlx    t_1,32,c_12
1432         stuw    t_1,rp(14)      !r[14]=c3;
1433         stuw    c_12,rp(15)     !r[15]=c1;
1434
1435         ret
1436         restore %g0,%g0,%o0
1437
1438 .type   bn_sqr_comba8,#function
1439 .size   bn_sqr_comba8,(.-bn_sqr_comba8)
1440
1441 .align  32
1442
1443 .global bn_sqr_comba4
1444 /*
1445  * void bn_sqr_comba4(r,a)
1446  * BN_ULONG *r,*a;
1447  */
1448 bn_sqr_comba4:
1449         save    %sp,FRAME_SIZE,%sp
1450         mov     1,t_2
1451         lduw    ap(0),a_0
1452         sllx    t_2,32,t_2
1453         lduw    ap(1),a_1
1454         mulx    a_0,a_0,t_1     !sqr_add_c(a,0,c1,c2,c3);
1455         srlx    t_1,32,c_12
1456         stuw    t_1,rp(0)       !r[0]=c1;
1457
1458         lduw    ap(2),a_2
1459         mulx    a_0,a_1,t_1     !sqr_add_c2(a,1,0,c2,c3,c1);
1460         addcc   c_12,t_1,c_12
1461         clr     c_3
1462         bcs,a   %xcc,.+8
1463         add     c_3,t_2,c_3
1464         addcc   c_12,t_1,t_1
1465         bcs,a   %xcc,.+8
1466         add     c_3,t_2,c_3
1467         srlx    t_1,32,c_12
1468         stuw    t_1,rp(1)       !r[1]=c2;
1469         or      c_12,c_3,c_12
1470
1471         mulx    a_2,a_0,t_1     !sqr_add_c2(a,2,0,c3,c1,c2);
1472         addcc   c_12,t_1,c_12
1473         clr     c_3
1474         bcs,a   %xcc,.+8
1475         add     c_3,t_2,c_3
1476         addcc   c_12,t_1,c_12
1477         bcs,a   %xcc,.+8
1478         add     c_3,t_2,c_3
1479         lduw    ap(3),a_3
1480         mulx    a_1,a_1,t_1     !sqr_add_c(a,1,c3,c1,c2);
1481         addcc   c_12,t_1,t_1
1482         bcs,a   %xcc,.+8
1483         add     c_3,t_2,c_3
1484         srlx    t_1,32,c_12
1485         stuw    t_1,rp(2)       !r[2]=c3;
1486         or      c_12,c_3,c_12
1487
1488         mulx    a_0,a_3,t_1     !sqr_add_c2(a,3,0,c1,c2,c3);
1489         addcc   c_12,t_1,c_12
1490         clr     c_3
1491         bcs,a   %xcc,.+8
1492         add     c_3,t_2,c_3
1493         addcc   c_12,t_1,c_12
1494         bcs,a   %xcc,.+8
1495         add     c_3,t_2,c_3
1496         mulx    a_1,a_2,t_1     !sqr_add_c2(a,2,1,c1,c2,c3);
1497         addcc   c_12,t_1,c_12
1498         bcs,a   %xcc,.+8
1499         add     c_3,t_2,c_3
1500         addcc   c_12,t_1,t_1
1501         bcs,a   %xcc,.+8
1502         add     c_3,t_2,c_3
1503         srlx    t_1,32,c_12
1504         stuw    t_1,rp(3)       !r[3]=c1;
1505         or      c_12,c_3,c_12
1506
1507         mulx    a_3,a_1,t_1     !sqr_add_c2(a,3,1,c2,c3,c1);
1508         addcc   c_12,t_1,c_12
1509         clr     c_3
1510         bcs,a   %xcc,.+8
1511         add     c_3,t_2,c_3
1512         addcc   c_12,t_1,c_12
1513         bcs,a   %xcc,.+8
1514         add     c_3,t_2,c_3
1515         mulx    a_2,a_2,t_1     !sqr_add_c(a,2,c2,c3,c1);
1516         addcc   c_12,t_1,t_1
1517         bcs,a   %xcc,.+8
1518         add     c_3,t_2,c_3
1519         srlx    t_1,32,c_12
1520         stuw    t_1,rp(4)       !r[4]=c2;
1521         or      c_12,c_3,c_12
1522
1523         mulx    a_2,a_3,t_1     !sqr_add_c2(a,3,2,c3,c1,c2);
1524         addcc   c_12,t_1,c_12
1525         clr     c_3
1526         bcs,a   %xcc,.+8
1527         add     c_3,t_2,c_3
1528         addcc   c_12,t_1,t_1
1529         bcs,a   %xcc,.+8
1530         add     c_3,t_2,c_3
1531         srlx    t_1,32,c_12
1532         stuw    t_1,rp(5)       !r[5]=c3;
1533         or      c_12,c_3,c_12
1534
1535         mulx    a_3,a_3,t_1     !sqr_add_c(a,3,c1,c2,c3);
1536         addcc   c_12,t_1,t_1
1537         srlx    t_1,32,c_12
1538         stuw    t_1,rp(6)       !r[6]=c1;
1539         stuw    c_12,rp(7)      !r[7]=c2;
1540
1541         ret
1542         restore %g0,%g0,%o0
1543
1544 .type   bn_sqr_comba4,#function
1545 .size   bn_sqr_comba4,(.-bn_sqr_comba4)
1546
1547 .align  32