sys/crypto/skein/amd64/skein_block_asm.s

   1 #
   2 #----------------------------------------------------------------
   3 # 64-bit x86 assembler code (gnu as) for Skein block functions
   4 #
   5 # Author: Doug Whiting, Hifn/Exar
   6 #
   7 # This code is released to the public domain.
   8 #----------------------------------------------------------------
   9 # $FreeBSD$
  10 #
  11     .text
  12     .altmacro                               #the macros below use altmacro argument forms (%expr and <...>)
  13     .psize 0,128                            #list file has no page boundaries
  14 # bit mask values naming the three block sizes
  15 _MASK_ALL_  =  (256+512+1024)               #all three algorithm bits
  16 _MAX_FRAME_ =  240                          #NOTE(review): not referenced in the visible part of this file
  17 #
  18 ################# _USE_ASM_ selects which block functions are assembled (tested with .if _USE_ASM_ & size)
  19 .ifndef SKEIN_USE_ASM
  20 _USE_ASM_         = _MASK_ALL_              #default: all three block sizes
  21 .else
  22 _USE_ASM_         = SKEIN_USE_ASM           #mask supplied by the build
  23 .endif
  24 #################
  25 .ifndef SKEIN_LOOP                          #configure loop unrolling
  26 _SKEIN_LOOP       =   2                     #default is fully unrolled for 256/512, twice for 1024
  27 .else
  28 _SKEIN_LOOP       = SKEIN_LOOP
  29   .irp _NN_,%_SKEIN_LOOP                #only display loop unrolling if default changed on command line
  30 .print  "+++ SKEIN_LOOP = \_NN_"
  31   .endr
  32 .endif
  33 # the unroll counts (0 --> fully unrolled): one decimal digit of _SKEIN_LOOP per block size
  34 SKEIN_UNROLL_256  = (_SKEIN_LOOP / 100) % 10   #hundreds digit
  35 SKEIN_UNROLL_512  = (_SKEIN_LOOP /  10) % 10   #tens digit
  36 SKEIN_UNROLL_1024 = (_SKEIN_LOOP      ) % 10   #ones digit
  37 # SKEIN_ASM_UNROLL = sum of the block sizes whose unroll count is 0 (used as a bit mask of fully unrolled sizes)
  38 SKEIN_ASM_UNROLL  = 0
  39   .irp _NN_,256,512,1024
  40     .if (SKEIN_UNROLL_\_NN_) == 0
  41 SKEIN_ASM_UNROLL  = SKEIN_ASM_UNROLL + \_NN_
  42     .endif
  43   .endr
  44 #################
  45 # round counts per block size: fixed defaults, or derived from the decimal digits of SKEIN_ROUNDS
  46 .ifndef SKEIN_ROUNDS
  47 ROUNDS_256  =   72
  48 ROUNDS_512  =   72
  49 ROUNDS_1024 =   80
  50 .else
  51 ROUNDS_256  = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)   #hundreds digit d gives 8*(((d+5) % 10)+5): d=9 is 72, d=0 is 80
  52 ROUNDS_512  = 8*((((SKEIN_ROUNDS /  10) + 5) % 10) + 5)   #tens digit, same mapping
  53 ROUNDS_1024 = 8*((((SKEIN_ROUNDS      ) + 5) % 10) + 5)   #ones digit, same mapping
  54 # only display rounds if default size is changed on command line
  55 .irp _NN_,256,512,1024
  56   .if _USE_ASM_ && \_NN_
  57     .irp _RR_,%(ROUNDS_\_NN_)
  58       .if _NN_ < 1024
  59 .print  "+++ SKEIN_ROUNDS_\_NN_  = \_RR_"
  60       .else
  61 .print  "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
  62       .endif
  63     .endr
  64   .endif
  65 .endr
  66 .endif
  67 #################
  68 # _SKEIN_CODE_SIZE is 1 when SKEIN_CODE_SIZE or SKEIN_PERF is defined; it gates the *_CodeSize and *_Unroll_Cnt routines
  69 .ifdef SKEIN_CODE_SIZE
  70 _SKEIN_CODE_SIZE = (1)
  71 .else
  72 .ifdef  SKEIN_PERF                           #use code size if SKEIN_PERF is defined
  73 _SKEIN_CODE_SIZE = (1)
  74 .else
  75 _SKEIN_CODE_SIZE = (0)                       #neither symbol defined: helper routines are left out
  76 .endif
  77 .endif
  78 #
  79 #################
  80 #
  81 .ifdef SKEIN_DEBUG                          #build asked for the debug hooks
  82 _SKEIN_DEBUG      = 1
  83 .else                                       #normal build: the debug macros are defined empty
  84 _SKEIN_DEBUG      = 0
  85 .endif
  86 #################
  87 #
  88 # define offsets of fields in hash context structure
  89 # (each offset is built from the previous one: BCNT = 8, TWEAK = 16, X_VARS = 32)
  90 HASH_BITS   =   0                   #bits of hash output
  91 BCNT        =   8 + HASH_BITS       #number of bytes in BUFFER[]
  92 TWEAK       =   8 + BCNT            #tweak values[0..1]
  93 X_VARS      =  16 + TWEAK           #chaining vars
  94 #
  95 #(Note: buffer[] in context structure is NOT needed here :-)
  96 #
  97 KW_PARITY   =   0x1BD11BDAA9FC1A22  #overall parity of key schedule words
  98 FIRST_MASK  =   ~ (1 <<  6)         #NOTE(review): not referenced in the visible part of this file
  99 FIRST_MASK64=   ~ (1 << 62)         #ANDed with T[1] at the end of each block (clears bit 62)
 100 #
 101 # rotation constants for Skein, named RC_<blockBits>_<round mod 8>_<mix index>; RotL64 builds these names
 102 #
 103 RC_256_0_0  = 14
 104 RC_256_0_1  = 16
 105
 106 RC_256_1_0  = 52
 107 RC_256_1_1  = 57
 108
 109 RC_256_2_0  = 23
 110 RC_256_2_1  = 40
 111
 112 RC_256_3_0  =  5
 113 RC_256_3_1  = 37
 114
 115 RC_256_4_0  = 25
 116 RC_256_4_1  = 33
 117
 118 RC_256_5_0  = 46
 119 RC_256_5_1  = 12
 120
 121 RC_256_6_0  = 58
 122 RC_256_6_1  = 22
 123
 124 RC_256_7_0  = 32
 125 RC_256_7_1  = 32
 126
 127 RC_512_0_0  = 46
 128 RC_512_0_1  = 36
 129 RC_512_0_2  = 19
 130 RC_512_0_3  = 37
 131
 132 RC_512_1_0  = 33
 133 RC_512_1_1  = 27
 134 RC_512_1_2  = 14
 135 RC_512_1_3  = 42
 136
 137 RC_512_2_0  = 17
 138 RC_512_2_1  = 49
 139 RC_512_2_2  = 36
 140 RC_512_2_3  = 39
 141
 142 RC_512_3_0  = 44
 143 RC_512_3_1  =  9
 144 RC_512_3_2  = 54
 145 RC_512_3_3  = 56
 146
 147 RC_512_4_0  = 39
 148 RC_512_4_1  = 30
 149 RC_512_4_2  = 34
 150 RC_512_4_3  = 24
 151
 152 RC_512_5_0  = 13
 153 RC_512_5_1  = 50
 154 RC_512_5_2  = 10
 155 RC_512_5_3  = 17
 156
 157 RC_512_6_0  = 25
 158 RC_512_6_1  = 29
 159 RC_512_6_2  = 39
 160 RC_512_6_3  = 43
 161
 162 RC_512_7_0  =  8
 163 RC_512_7_1  = 35
 164 RC_512_7_2  = 56
 165 RC_512_7_3  = 22
 166
 167 RC_1024_0_0 = 24
 168 RC_1024_0_1 = 13
 169 RC_1024_0_2 =  8
 170 RC_1024_0_3 = 47
 171 RC_1024_0_4 =  8
 172 RC_1024_0_5 = 17
 173 RC_1024_0_6 = 22
 174 RC_1024_0_7 = 37
 175
 176 RC_1024_1_0 = 38
 177 RC_1024_1_1 = 19
 178 RC_1024_1_2 = 10
 179 RC_1024_1_3 = 55
 180 RC_1024_1_4 = 49
 181 RC_1024_1_5 = 18
 182 RC_1024_1_6 = 23
 183 RC_1024_1_7 = 52
 184
 185 RC_1024_2_0 = 33
 186 RC_1024_2_1 =  4
 187 RC_1024_2_2 = 51
 188 RC_1024_2_3 = 13
 189 RC_1024_2_4 = 34
 190 RC_1024_2_5 = 41
 191 RC_1024_2_6 = 59
 192 RC_1024_2_7 = 17
 193
 194 RC_1024_3_0 =  5
 195 RC_1024_3_1 = 20
 196 RC_1024_3_2 = 48
 197 RC_1024_3_3 = 41
 198 RC_1024_3_4 = 47
 199 RC_1024_3_5 = 28
 200 RC_1024_3_6 = 16
 201 RC_1024_3_7 = 25
 202
 203 RC_1024_4_0 = 41
 204 RC_1024_4_1 =  9
 205 RC_1024_4_2 = 37
 206 RC_1024_4_3 = 31
 207 RC_1024_4_4 = 12
 208 RC_1024_4_5 = 47
 209 RC_1024_4_6 = 44
 210 RC_1024_4_7 = 30
 211
 212 RC_1024_5_0 = 16
 213 RC_1024_5_1 = 34
 214 RC_1024_5_2 = 56
 215 RC_1024_5_3 = 51
 216 RC_1024_5_4 =  4
 217 RC_1024_5_5 = 53
 218 RC_1024_5_6 = 42
 219 RC_1024_5_7 = 41
 220
 221 RC_1024_6_0 = 31
 222 RC_1024_6_1 = 44
 223 RC_1024_6_2 = 47
 224 RC_1024_6_3 = 46
 225 RC_1024_6_4 = 19
 226 RC_1024_6_5 = 42
 227 RC_1024_6_6 = 44
 228 RC_1024_6_7 = 25
 229
 230 RC_1024_7_0 =  9
 231 RC_1024_7_1 = 48
 232 RC_1024_7_2 = 35
 233 RC_1024_7_3 = 52
 234 RC_1024_7_4 = 23
 235 RC_1024_7_5 = 31
 236 RC_1024_7_6 = 37
 237 RC_1024_7_7 = 20
 238 # MACRO: RotL64 -- rotate a 64-bit register left by one of the RC_ constants
 239 #  Input:  reg (register name without the percent sign), block size, round number (0..7), mix index
 240 # Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
 241 #
 242 .macro RotL64   reg,BLK_SIZE,ROUND_NUM,MIX_NUM
 243 _RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM   #look up the constant by its pasted-together name
 244   .if _RCNT_  #is there anything to do? (a zero count emits no instruction)
 245     rolq    $_RCNT_,%\reg
 246   .endif
 247 .endm
 248 #
 249 #----------------------------------------------------------------
 250 #
 251 # MACROS: define local vars and configure stack
 252 #
 253 #----------------------------------------------------------------
 254 # declare allocated space on the stack: localName becomes the current offset, which then advances by localSize bytes
 255 .macro StackVar localName,localSize
 256 \localName  =   _STK_OFFS_              #symbol = offset of this variable
 257 _STK_OFFS_  =   _STK_OFFS_+(\localSize) #next variable starts after it
 258 .endm #StackVar
 259 #
 260 #----------------------------------------------------------------
 261 # Inputs: BLK_BITS = 256/512/1024, KS_CNT = key schedule slots for looping code, debugCnt = optional debug word count
 262 # MACRO: Configure stack frame, allocate local vars
 263 # Emits: pushes of the six callee-saved registers, the rsp adjustment, rbp setup, and stores of rdi/rsi/rdx/rcx
 264 .macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
 265     WCNT    =    (\BLK_BITS)/64
 266 #
 267 _PushCnt_   =   0                   #save nonvolatile regs on stack
 268   .irp _reg_,rbp,rbx,r12,r13,r14,r15
 269        pushq    %\_reg_
 270 _PushCnt_ = _PushCnt_ + 1           #track count to keep alignment
 271   .endr
 272 #
 273 _STK_OFFS_  =   0                   #starting offset from rsp
 274     #---- local  variables         #<-- rsp
 275     StackVar    X_stk  ,8*(WCNT)    #local context vars
 276     StackVar    ksTwk  ,8*3         #key schedule: tweak words
 277     StackVar    ksKey  ,8*(WCNT)+8  #key schedule: key   words
 278   .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
 279     StackVar    ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
 280   .endif
 281     StackVar    Wcopy  ,8*(WCNT)    #copy of input block
 282   .if _SKEIN_DEBUG
 283   .if \debugCnt + 0                 #temp location for debug X[] info
 284     StackVar    xDebug_\BLK_BITS ,8*(\debugCnt)
 285   .endif
 286   .endif
 287 _ALIGN_PAD_ = 8 + ((8*_PushCnt_ + _STK_OFFS_) % 16)   #8 when pushes+locals so far are 0 mod 16, else 16
 288     StackVar    align16,_ALIGN_PAD_ #pad so retAddr+pushes+locals total 0 mod 16 (rsp 16-byte aligned after subq)
 289 tmpStk_\BLK_BITS = align16          #scratch slot, always defined
 290     #(the earlier test took the sum mod 8, which is always 0 for these sizes, so it never checked alignment)
 291     #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
 292     StackVar    ctxPtr ,8           #context ptr
 293     StackVar    blkPtr ,8           #pointer to block data
 294     StackVar    blkCnt ,8           #number of full blocks to process
 295     StackVar    bitAdd ,8           #bit count to add to tweak
 296 LOCAL_SIZE  =   _STK_OFFS_          #size of "local" vars
 297     #----
 298     StackVar    savRegs,8*_PushCnt_ #saved registers
 299     StackVar    retAddr,8           #return address
 300     #---- caller's stack frame (aligned mod 16)
 301 #
 302 # set up the stack frame pointer (rbp)
 303 #
 304 FRAME_OFFS  =   ksTwk + 128         #allow short (negative) offset to ksTwk, kwKey
 305   .if FRAME_OFFS > _STK_OFFS_       #keep rbp in the "locals" range
 306 FRAME_OFFS  =      _STK_OFFS_
 307   .endif
 308 F_O         =   -FRAME_OFFS         #add F_O to a variable offset to address it from rbp instead of rsp
 309 #
 310   #put some useful defines in the .lst file (for grep)
 311 __STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
 312 __STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
 313 __STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
 314 #
 315 # Notes on stack frame setup:
 316 #   * the most frequently used variable is X_stk[], based at [rsp+0]
 317 #   * the next most used is the key schedule arrays, ksKey and ksTwk
 318 #       so rbp is "centered" there, allowing short offsets to the key
 319 #       schedule even in 1024-bit Skein case
 320 #   * the Wcopy variables are infrequently accessed, but they have long
 321 #       offsets from both rsp and rbp only in the 1024-bit case.
 322 #   * all other local vars and calling parameters can be accessed
 323 #       with short offsets, except in the 1024-bit case
 324 #
 325     subq    $LOCAL_SIZE,%rsp        #make room for the locals
 326     leaq    FRAME_OFFS(%rsp),%rbp   #maximize use of short offsets
 327     movq    %rdi, ctxPtr+F_O(%rbp)  #save caller's parameters on the stack
 328     movq    %rsi, blkPtr+F_O(%rbp)
 329     movq    %rdx, blkCnt+F_O(%rbp)
 330     movq    %rcx, bitAdd+F_O(%rbp)
 331 #
 332 .endm #Setup_Stack
 333 #
 334 #----------------------------------------------------------------
 335 # MACRO: Reset_Stack -- undo Setup_Stack: drop the locals, then pop the saved registers in reverse push order
 336 .macro Reset_Stack
 337     addq    $LOCAL_SIZE,%rsp        #get rid of locals (wipe??)
 338   .irp _reg_,r15,r14,r13,r12,rbx,rbp
 339     popq    %\_reg_                 #restore caller's regs
 340 _PushCnt_ = _PushCnt_ - 1
 341   .endr
 342   .if _PushCnt_                     #assemble-time check: pops must balance the pushes counted in Setup_Stack
 343     .error  "Mismatched push/pops?"
 344   .endif
 345 .endm # Reset_Stack
 346 #
 347 #----------------------------------------------------------------
 348 # macros to help debug internals
 349 # (when _SKEIN_DEBUG is 0 both macros are defined empty, so call sites need no conditionals)
 350 .if _SKEIN_DEBUG
 351     .extern  Skein_Show_Block     #calls to C routines
 352     .extern  Skein_Show_Round
 353 # round "numbers" at or above SKEIN_RND_SPECIAL mark events rather than real rounds
 354 SKEIN_RND_SPECIAL       =   1000
 355 SKEIN_RND_KEY_INITIAL   =   SKEIN_RND_SPECIAL+0
 356 SKEIN_RND_KEY_INJECT    =   SKEIN_RND_SPECIAL+1
 357 SKEIN_RND_FEED_FWD      =   SKEIN_RND_SPECIAL+2
 358 # MACRO: Skein_Debug_Block -- call Skein_Show_Block with all volatile registers preserved
 359 .macro Skein_Debug_Block BLK_BITS
 360 #
 361 #void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
 362 #                     const u08b_t *blkPtr, const u64b_t *wPtr,
 363 #                     const u64b_t *ksPtr,const u64b_t *tsPtr)
 364 #
 365 _NN_ = 0
 366   .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
 367     pushq   %\_reg_                 #save all volatile regs on stack before the call
 368 _NN_ = _NN_ + 1
 369   .endr
 370     # get and push call parameters
 371     movq    $\BLK_BITS      ,%rdi   #bits
 372     movq    ctxPtr+F_O(%rbp),%rsi   #h (pointer)
 373     leaq    X_VARS    (%rsi),%rdx   #X (pointer)
 374     movq    blkPtr+F_O(%rbp),%rcx   #blkPtr
 375     leaq    Wcopy +F_O(%rbp),%r8    #wPtr
 376     leaq    ksKey +F_O(%rbp),%r9    #key pointer
 377     leaq    ksTwk +F_O(%rbp),%rax   #tweak pointer
 378     pushq   %rax                    #   (pass on the stack)
 379     call    Skein_Show_Block        #call external debug handler
 380     addq    $8*1,%rsp               #discard parameters on stack
 381   .if (_NN_ % 2 ) == 0              #check stack alignment (odd register count plus one pushed parameter)
 382     .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS"
 383   .endif
 384   .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
 385     popq    %\_reg_                 #restore regs
 386 _NN_ = _NN_ - 1
 387   .endr
 388   .if _NN_
 389     .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS"
 390   .endif
 391 .endm # Skein_Debug_Block
 392 #
 393 # the macro to "call" to debug a round: sets rdx to the round number R, calls Skein_Debug_Round_<BLK_BITS>, then runs afterOp
 394 #
 395 .macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
 396     # call the appropriate (local) debug "function"
 397     pushq   %rdx                    #save rdx, so we can use it for round "number"
 398   .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
 399     movq    $\R,%rdx
 400   .else                             #compute round number using edi
 401 _rOffs_ = \RDI_OFFS + 0
 402    .if \BLK_BITS == 1024
 403     movq    rIdx_offs+8(%rsp),%rdx  #get rIdx off the stack (adjust for pushq rdx above)
 404     leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
 405    .else
 406     leaq    1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
 407    .endif
 408   .endif
 409     call    Skein_Debug_Round_\BLK_BITS
 410     popq    %rdx                    #restore original rdx value
 411 #
 412     afterOp
 413 .endm  #  Skein_Debug_Round
 414 .else  #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
 415 .macro Skein_Debug_Block BLK_BITS
 416 .endm
 417 #
 418 .macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
 419 .endm
 420 #
 421 .endif # _SKEIN_DEBUG
 422 # MACRO: addReg -- dstReg += source register, whose name is srcReg_A and srcReg_B pasted together
 423 #----------------------------------------------------------------
 424 #   nonzero immOffs: lea that also adds the constant.  nonzero useAddOp, or ASM_NO_LEA defined: addq.  otherwise: lea.
 425 .macro  addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
 426   .if \immOffs + 0                  #constant to fold in: use the three-input lea form
 427        leaq    \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
 428   .elseif (\useAddOp + 0)           #caller explicitly asked for add
 429        addq    %\srcReg_A\srcReg_B,%\dstReg
 430   .else
 431     .ifdef ASM_NO_LEA               #build-time opt-out of lea
 432        addq    %\srcReg_A\srcReg_B,%\dstReg
 433     .else                           #lea seems to be faster on Core 2 Duo CPUs!
 434        leaq   (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
 435     .endif
 436   .endif
 437 .endm
 438 # MACRO: xorReg -- dstReg ^= source register, whose name is srcReg_A and srcReg_B pasted together
 439 # keep Intel-style ordering here, to match addReg
 440 .macro  xorReg dstReg,srcReg_A,srcReg_B
 441         xorq   %\srcReg_A\srcReg_B,%\dstReg
 442 .endm
 443 #
 444 #----------------------------------------------------------------
 445 # MACRO: C_label -- define lName and _lName as global labels at the current location
 446 .macro C_label lName
 447     .global  \lName, _\lName    #export both spellings, to work across linkage conventions
 448 _\lName:
 449  \lName:
 450     # (both labels mark the same address)
 451 .endm
 452 #
 453 #=================================== Skein_256 =============================================
 454 #
 455 .if _USE_ASM_ & 256
 456 #
 457 # void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
 458 # arguments arrive in rdi, rsi, rdx, rcx (in that order) and are saved in the frame by Setup_Stack
 459 #################
 460 #
 461 # code
 462 #
 463 C_label Skein_256_Process_Block
 464     Setup_Stack 256,((ROUNDS_256/8)+1)
 465     movq    TWEAK+8(%rdi),%r14       #r14 = ctx T[1], read once before the block loop
 466     jmp     Skein_256_block_loop     #jump over the alignment padding
 467     .p2align 4
 468     # main hash loop for Skein_256
 469 Skein_256_block_loop:
 470     #
 471     # general register usage:
 472     #   RAX..RDX        = X0..X3
 473     #   R08..R12        = ks[0..4]
 474     #   R13..R15        = ts[0..2]
 475     #   RSP, RBP        = stack/frame pointers
 476     #   RDI             = round counter or context pointer
 477     #   RSI             = temp
 478     #
 479     movq    TWEAK+0(%rdi)     ,%r13
 480     addq    bitAdd+F_O(%rbp)  ,%r13  #computed updated tweak value T0
 481     movq    %r14              ,%r15
 482     xorq    %r13              ,%r15  #r15 = T0 ^ T1, so r13..r15 now hold the tweak schedule
 483
 484     movq    $KW_PARITY        ,%r12
 485     movq       X_VARS+ 0(%rdi),%r8
 486     movq       X_VARS+ 8(%rdi),%r9
 487     movq       X_VARS+16(%rdi),%r10
 488     movq       X_VARS+24(%rdi),%r11
 489     movq    %r13,TWEAK+0(%rdi)       #save updated tweak value ctx->h.T[0]
 490     xorq    %r8               ,%r12  #start accumulating overall parity
 491
 492     movq    blkPtr +F_O(%rbp) ,%rsi  #rsi --> input block
 493     xorq    %r9               ,%r12
 494     movq     0(%rsi)          ,%rax  #get X[0..3]
 495     xorq    %r10              ,%r12
 496     movq     8(%rsi)          ,%rbx
 497     xorq    %r11              ,%r12  #r12 = KW_PARITY ^ ks[0] ^ ks[1] ^ ks[2] ^ ks[3]
 498     movq    16(%rsi)          ,%rcx
 499     movq    24(%rsi)          ,%rdx
 500
 501     movq    %rax,Wcopy+ 0+F_O(%rbp)  #save copy of input block
 502     movq    %rbx,Wcopy+ 8+F_O(%rbp)
 503     movq    %rcx,Wcopy+16+F_O(%rbp)
 504     movq    %rdx,Wcopy+24+F_O(%rbp)
 505
 506     addq    %r8 ,%rax                #initial key injection
 507     addq    %r9 ,%rbx
 508     addq    %r10,%rcx
 509     addq    %r11,%rdx
 510     addq    %r13,%rbx                #X1 += T0
 511     addq    %r14,%rcx                #X2 += T1
 512
 513 .if _SKEIN_DEBUG
 514     movq    %r14,TWEAK+ 8(%rdi)      #save updated tweak T[1] (start bit cleared?)
 515     movq    %r8 ,ksKey+ 0+F_O(%rbp)  #save key schedule on stack for Skein_Debug_Block
 516     movq    %r9 ,ksKey+ 8+F_O(%rbp)
 517     movq    %r10,ksKey+16+F_O(%rbp)
 518     movq    %r11,ksKey+24+F_O(%rbp)
 519     movq    %r12,ksKey+32+F_O(%rbp)
 520
 521     movq    %r13,ksTwk+ 0+F_O(%rbp)
 522     movq    %r14,ksTwk+ 8+F_O(%rbp)
 523     movq    %r15,ksTwk+16+F_O(%rbp)
 524
 525     movq    %rax,X_stk + 0(%rsp)     #save X[] on stack for Skein_Debug_Block
 526     movq    %rbx,X_stk + 8(%rsp)
 527     movq    %rcx,X_stk +16(%rsp)
 528     movq    %rdx,X_stk +24(%rsp)
 529
 530     Skein_Debug_Block 256            #debug dump
 531     Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
 532 .endif
 533 #
 534 .if ((SKEIN_ASM_UNROLL & 256) == 0)
 535     movq    %r8 ,ksKey+40+F_O(%rbp)  #save key schedule on stack for looping code (ks[0] goes in slot 5, after slots 1..4)
 536     movq    %r9 ,ksKey+ 8+F_O(%rbp)
 537     movq    %r10,ksKey+16+F_O(%rbp)
 538     movq    %r11,ksKey+24+F_O(%rbp)
 539     movq    %r12,ksKey+32+F_O(%rbp)
 540
 541     movq    %r13,ksTwk+24+F_O(%rbp)  #ts[0] goes in slot 3, after slots 1..2
 542     movq    %r14,ksTwk+ 8+F_O(%rbp)
 543     movq    %r15,ksTwk+16+F_O(%rbp)
 544 .endif
 545     addq    $WCNT*8,%rsi             #skip the block
 546     movq    %rsi,blkPtr  +F_O(%rbp)  #update block pointer
 547     #
 548     # now the key schedule is computed. Start the rounds
 549     #
 550 .if SKEIN_ASM_UNROLL & 256
 551 _UNROLL_CNT =   ROUNDS_256/8
 552 .else
 553 _UNROLL_CNT =   SKEIN_UNROLL_256
 554   .if ((ROUNDS_256/8) % _UNROLL_CNT)
 555     .error "Invalid SKEIN_UNROLL_256"
 556   .endif
 557     xorq    %rdi,%rdi                #rdi = iteration count
 558 Skein_256_round_loop:
 559 .endif
 560 _Rbase_ = 0
 561 .rept _UNROLL_CNT*2                  #each repetition is four rounds plus one key injection
 562     # all X and ks vars in regs      # (ops to "rotate" ks vars, via mem, if not unrolled)
 563     # round 4*_RBase_ + 0
 564     addReg  rax, rbx
 565     RotL64  rbx, 256,%((4*_Rbase_+0) % 8),0
 566     addReg  rcx, rdx
 567                 .if (SKEIN_ASM_UNROLL & 256) == 0
 568                     movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
 569                 .endif
 570     xorReg  rbx, rax
 571     RotL64  rdx, 256,%((4*_Rbase_+0) % 8),1
 572     xorReg  rdx, rcx
 573   .if SKEIN_ASM_UNROLL & 256
 574     .irp _r0_,%( 8+(_Rbase_+3) % 5)
 575     .irp _r1_,%(13+(_Rbase_+2) % 3)
 576       leaq   (%r\_r0_,%r\_r1_),%rdi    #precompute key injection value for %rcx
 577     .endr
 578     .endr
 579   .endif
 580                 .if (SKEIN_ASM_UNROLL & 256) == 0
 581                     movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
 582                 .endif
 583     Skein_Debug_Round 256,%(4*_Rbase_+1)
 584
 585     # round 4*_Rbase_ + 1
 586     addReg  rax, rdx
 587     RotL64  rdx, 256,%((4*_Rbase_+1) % 8),0
 588     xorReg  rdx, rax
 589                 .if (SKEIN_ASM_UNROLL & 256) == 0
 590                     movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
 591                 .endif
 592     addReg  rcx, rbx
 593     RotL64  rbx, 256,%((4*_Rbase_+1) % 8),1
 594     xorReg  rbx, rcx
 595                 .if (SKEIN_ASM_UNROLL & 256) == 0
 596                     movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
 597                 .endif
 598     Skein_Debug_Round 256,%(4*_Rbase_+2)
 599  .if SKEIN_ASM_UNROLL & 256
 600     .irp _r0_,%( 8+(_Rbase_+2) % 5)
 601     .irp _r1_,%(13+(_Rbase_+1) % 3)
 602       leaq   (%r\_r0_,%r\_r1_),%rsi     #precompute key injection value for %rbx
 603     .endr
 604     .endr
 605  .endif
 606     # round 4*_Rbase_ + 2
 607     addReg  rax, rbx
 608     RotL64  rbx, 256,%((4*_Rbase_+2) % 8),0
 609     addReg  rcx, rdx
 610                 .if (SKEIN_ASM_UNROLL & 256) == 0
 611                     movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
 612                 .endif
 613     xorReg  rbx, rax
 614     RotL64  rdx, 256,%((4*_Rbase_+2) % 8),1
 615     xorReg  rdx, rcx
 616                 .if (SKEIN_ASM_UNROLL & 256) == 0
 617                     movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8)  #"rotate" the key
 618                     leaq 1(%r11,%rdi),%r11               #precompute key + tweak
 619                 .endif
 620     Skein_Debug_Round 256,%(4*_Rbase_+3)
 621     # round 4*_Rbase_ + 3
 622     addReg  rax, rdx
 623     RotL64  rdx, 256,%((4*_Rbase_+3) % 8),0
 624     addReg  rcx, rbx
 625                 .if (SKEIN_ASM_UNROLL & 256) == 0
 626                     addq      ksTwk+8*2+F_O(%rbp,%rdi,8),%r10  #precompute key + tweak
 627                     movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8)       #"rotate" the tweak
 628                 .endif
 629     xorReg  rdx, rax
 630     RotL64  rbx, 256,%((4*_Rbase_+3) % 8),1
 631     xorReg  rbx, rcx
 632     Skein_Debug_Round 256,%(4*_Rbase_+4)
 633                 .if (SKEIN_ASM_UNROLL & 256) == 0
 634                     addReg r9 ,r13           #precompute key+tweak
 635                 .endif
 636       #inject key schedule words
 637 _Rbase_ = _Rbase_+1
 638   .if SKEIN_ASM_UNROLL & 256
 639     addReg    rax,r,%(8+((_Rbase_+0) % 5))
 640     addReg    rbx,rsi
 641     addReg    rcx,rdi
 642     addReg    rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
 643   .else
 644     incq      %rdi
 645     addReg    rax,r8
 646     addReg    rcx,r10
 647     addReg    rbx,r9
 648     addReg    rdx,r11
 649   .endif
 650     Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
 651 .endr #rept _UNROLL_CNT
 652 #
 653 .if (SKEIN_ASM_UNROLL & 256) == 0
 654     cmpq    $2*(ROUNDS_256/8),%rdi   #rdi counts key injections, two per eight rounds
 655     jb      Skein_256_round_loop
 656 .endif # (SKEIN_ASM_UNROLL & 256) == 0
 657     movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
 658
 659     #----------------------------
 660     # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..3}
 661     movq    $FIRST_MASK64 ,%r14              #r14 = mask with bit 62 clear
 662     xorq    Wcopy + 0+F_O (%rbp),%rax
 663     xorq    Wcopy + 8+F_O (%rbp),%rbx
 664     xorq    Wcopy +16+F_O (%rbp),%rcx
 665     xorq    Wcopy +24+F_O (%rbp),%rdx
 666     andq    TWEAK + 8     (%rdi),%r14        #r14 = ctx T[1] with bit 62 cleared, the T1 used by the next block
 667     movq    %rax,X_VARS+ 0(%rdi)             #store final result
 668     movq    %rbx,X_VARS+ 8(%rdi)
 669     movq    %rcx,X_VARS+16(%rdi)
 670     movq    %rdx,X_VARS+24(%rdi)
 671
 672     Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
 673
 674     # go back for more blocks, if needed
 675     decq    blkCnt+F_O(%rbp)
 676     jnz     Skein_256_block_loop
 677     movq    %r14,TWEAK + 8(%rdi)             #write the final T[1] back to the context
 678     Reset_Stack
 679     ret
 680 Skein_256_Process_Block_End:
 681
 682   .if _SKEIN_DEBUG
 683 Skein_Debug_Round_256:               #here with rdx == round "number" from macro
 684     pushq   %rsi                     #save two regs for BLK_BITS-specific parms
 685     pushq   %rdi
 686     movq    24(%rsp),%rdi            #get back original rdx (pushed on stack in macro call) to rdi (above rdi, rsi and the return address)
 687     movq    %rax,X_stk+ 0+F_O(%rbp)  #save X[] state on stack so debug routines can access it
 688     movq    %rbx,X_stk+ 8+F_O(%rbp)  #(use FP_ since rsp has changed!)
 689     movq    %rcx,X_stk+16+F_O(%rbp)
 690     movq    %rdi,X_stk+24+F_O(%rbp)  #X3 is the original rdx value fetched above
 691
 692     movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
 693     movq    $256,%rdi                #now <rdi,rsi,rdx> are set for the call
 694     jmp     Skein_Debug_Round_Common #shared tail, defined outside the visible part of this file
 695   .endif
 696 # optional helpers: return the code size in bytes and the configured unroll count in rax
 697 .if _SKEIN_CODE_SIZE
 698 C_label  Skein_256_Process_Block_CodeSize
 699     movq    $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
 700     ret
 701 #
 702 C_label Skein_256_Unroll_Cnt
 703   .if _UNROLL_CNT <> ROUNDS_256/8
 704     movq    $_UNROLL_CNT,%rax
 705   .else
 706     xorq    %rax,%rax                #fully unrolled is reported as 0
 707   .endif
 708     ret
 709 .endif
 710 #
 711 .endif #_USE_ASM_ & 256
 712 #
 713 #=================================== Skein_512 =============================================
 714 #
 715 .if _USE_ASM_ & 512
 716 #
 717 # void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
 718 #
 719 # X[i] == %r[8+i]          #register assignments for X[] values during rounds (i=0..7)
 720 #
 721 #################
 722 # MACRO: one round for 512-bit blocks
 723 #   rn0..rn7 = register numbers of the four mix pairs, _Rn_ = round number, op1..op4 = optional instructions run after each mix
 724 .macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
 725 #
 726     addReg      r\rn0, r\rn1
 727     RotL64      r\rn1, 512,%((_Rn_) % 8),0
 728     xorReg      r\rn1, r\rn0
 729             op1
 730     addReg      r\rn2, r\rn3
 731     RotL64      r\rn3, 512,%((_Rn_) % 8),1
 732     xorReg      r\rn3, r\rn2
 733             op2
 734     addReg      r\rn4, r\rn5
 735     RotL64      r\rn5, 512,%((_Rn_) % 8),2
 736     xorReg      r\rn5, r\rn4
 737             op3
 738     addReg      r\rn6, r\rn7
 739     RotL64      r\rn7, 512,%((_Rn_) % 8),3
 740     xorReg      r\rn7, r\rn6
 741             op4
 742     Skein_Debug_Round 512,%(_Rn_+1),-4
 743 #
 744 .endm #R_512_OneRound
 745 #
 746 #################
 747 # MACRO: four rounds for 512-bit blocks, followed by one key injection
 748 #
 749 .macro R_512_FourRounds _RR_    #RR = base round number (callers pass a multiple of 4)
 750   .if (SKEIN_ASM_UNROLL && 512)
 751     # here for fully unrolled case.
 752     _II_ = ((_RR_)/4) + 1       #key injection counter
 753     R_512_OneRound  8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
 754     R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
 755     R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
 756     R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
 757     # inject the key schedule
 758     addq    ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
 759     addReg   r11, rax
 760     addq    ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
 761     addReg   r12, rbx
 762     addq    ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
 763     addReg   r13, rcx
 764     addReg   r14, rdx
 765     addReg   r15, rsi,,,(_II_)
 766   .else
 767     # here for looping case                                                    #"rotate" key/tweak schedule (move up on stack)
 768     incq    %rdi                 #bump key injection counter
 769     R_512_OneRound  8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq      ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq      ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
 770     R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8)     >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
 771     R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq      ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq      ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
 772     R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq      ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
 773     # inject the key schedule
 774     addq    ksKey+8*0+F_O(%rbp,%rdi,8),%r8
 775     addReg   r11, rax
 776     addReg   r12, rbx
 777     addq    ksKey+8*1+F_O(%rbp,%rdi,8),%r9
 778     addReg   r13, rcx
 779     addReg   r14, rdx
 780     addq    ksKey+8*2+F_O(%rbp,%rdi,8),%r10
 781     addReg   r15, rsi
 782     addReg   r15, rdi              #inject the round number
 783   .endif
 784
 785     #show the result of the key injection
 786     Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
 787 .endm #R_512_FourRounds
 788 #
 789 #################
 790 # instantiated code (arguments arrive in rdi, rsi, rdx, rcx and are saved in the frame by Setup_Stack)
 791 #
 792 C_label Skein_512_Process_Block
 793     Setup_Stack 512,ROUNDS_512/8
 794     movq    TWEAK+ 8(%rdi),%rbx       #rbx = ctx T[1], read once before the block loop
 795     jmp     Skein_512_block_loop      #jump over the alignment padding
 796     .p2align 4
 797     # main hash loop for Skein_512
 798 Skein_512_block_loop:
 799     # general register usage:
 800     #   RAX..RDX       = temps for key schedule pre-loads
 801     #   R8 ..R15       = X0..X7
 802     #   RSP, RBP       = stack/frame pointers
 803     #   RDI            = round counter or context pointer
 804     #   RSI            = temp
 805     #
 806     movq    TWEAK +  0(%rdi),%rax
 807     addq    bitAdd+F_O(%rbp),%rax     #computed updated tweak value T0
 808     movq    %rbx,%rcx
 809     xorq    %rax,%rcx                 #%rax/%rbx/%rcx = tweak schedule
 810     movq    %rax,TWEAK+ 0    (%rdi)   #save updated tweak value ctx->h.T[0]
 811     movq    %rax,ksTwk+ 0+F_O(%rbp)
 812     movq    $KW_PARITY,%rdx
 813     movq    blkPtr +F_O(%rbp),%rsi    #%rsi --> input block
 814     movq    %rbx,ksTwk+ 8+F_O(%rbp)
 815     movq    %rcx,ksTwk+16+F_O(%rbp)
 816     .irp _Rn_,8,9,10,11,12,13,14,15
 817       movq  X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_
 818       xorq  %r\_Rn_,%rdx              #compute overall parity
 819       movq  %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp)
 820     .endr                             #load state into %r8 ..%r15, compute parity
 821       movq  %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
 822
 823     addReg   r13,rax                  #precompute key injection for tweak
 824     addReg   r14, rbx                 #r14 += T1
 825 .if _SKEIN_DEBUG
 826     movq    %rbx,TWEAK+ 8(%rdi)       #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
 827 .endif
 828     movq     0(%rsi),%rax             #load input block
 829     movq     8(%rsi),%rbx
 830     movq    16(%rsi),%rcx
 831     movq    24(%rsi),%rdx
 832     addReg   r8 , rax                 #do initial key injection
 833     addReg   r9 , rbx
 834     movq    %rax,Wcopy+ 0+F_O(%rbp)   #keep local copy for feedforward
 835     movq    %rbx,Wcopy+ 8+F_O(%rbp)
 836     addReg   r10, rcx
 837     addReg   r11, rdx
 838     movq    %rcx,Wcopy+16+F_O(%rbp)
 839     movq    %rdx,Wcopy+24+F_O(%rbp)
 840
 841     movq    32(%rsi),%rax             #second half of the input block
 842     movq    40(%rsi),%rbx
 843     movq    48(%rsi),%rcx
 844     movq    56(%rsi),%rdx
 845     addReg   r12, rax
 846     addReg   r13, rbx
 847     addReg   r14, rcx
 848     addReg   r15, rdx
 849     movq    %rax,Wcopy+32+F_O(%rbp)
 850     movq    %rbx,Wcopy+40+F_O(%rbp)
 851     movq    %rcx,Wcopy+48+F_O(%rbp)
 852     movq    %rdx,Wcopy+56+F_O(%rbp)
 853
 854 .if _SKEIN_DEBUG
 855     .irp _Rn_,8,9,10,11,12,13,14,15   #save values on stack for debug output
 856       movq  %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp)
 857     .endr
 858
 859     Skein_Debug_Block 512             #debug dump
 860     Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
 861 .endif
 862     addq    $8*WCNT,%rsi              #skip the block
 863     movq    %rsi,blkPtr+F_O(%rbp)     #update block pointer
 864     #
 865     #################
 866     # now the key schedule is computed. Start the rounds
 867     #
 868 .if SKEIN_ASM_UNROLL & 512
 869 _UNROLL_CNT =   ROUNDS_512/8
 870 .else
 871 _UNROLL_CNT =   SKEIN_UNROLL_512
 872   .if ((ROUNDS_512/8) % _UNROLL_CNT)
 873     .error "Invalid SKEIN_UNROLL_512"
 874   .endif
 875     xorq    %rdi,%rdi                 #rdi = round counter
 876 Skein_512_round_loop:
 877 .endif
 878 #
 879 _Rbase_ = 0
 880 .rept _UNROLL_CNT*2                   #each repetition is four rounds plus one key injection
 881       R_512_FourRounds %(4*_Rbase_+00)
 882 _Rbase_ = _Rbase_+1
 883 .endr #rept _UNROLL_CNT
 884 #
 885 .if (SKEIN_ASM_UNROLL & 512) == 0
 886     cmpq    $2*(ROUNDS_512/8),%rdi    #rdi counts key injections, two per eight rounds
 887     jb      Skein_512_round_loop
 888     movq    ctxPtr +F_O(%rbp),%rdi           #restore rdi --> context
 889 .endif
 890     # end of rounds
 891     #################
 892     # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..7}
 893     .irp _Rn_,8,9,10,11,12,13,14,15
 894   .if (_Rn_ == 8)
 895     movq    $FIRST_MASK64,%rbx        #rbx = mask with bit 62 clear
 896   .endif
 897       xorq  Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_  #feedforward XOR
 898       movq  %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi)     #and store result
 899   .if (_Rn_ == 14)
 900     andq    TWEAK+ 8(%rdi),%rbx       #rbx = ctx T[1] with bit 62 cleared, the T1 used by the next block
 901   .endif
 902     .endr
 903     Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
 904
 905     # go back for more blocks, if needed
 906     decq    blkCnt+F_O(%rbp)
 907     jnz     Skein_512_block_loop
 908     movq    %rbx,TWEAK + 8(%rdi)      #write the final T[1] back to the context
 909
 910     Reset_Stack
 911     ret
 912 Skein_512_Process_Block_End:
 913 # Skein_Debug_Round_512: debug-only entry; spills r8..r15 into X_stk[0..7], then tail-jumps to Skein_Debug_Round_Common
 914   .if _SKEIN_DEBUG
 915 # call here with rdx  = "round number" (the two pushes below create the saved rsi/rdi slots the common routine pops)
 916 Skein_Debug_Round_512:
 917     pushq   %rsi                     #save two regs for BLK_BITS-specific parms
 918     pushq   %rdi                     #  (Skein_Debug_Round_Common pops rdi, then rsi, before its ret)
 919   .irp _Rn_,8,9,10,11,12,13,14,15    #save X[] state on stack so debug routines can access it
 920     movq    %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp)
 921   .endr
 922     movq    ctxPtr+F_O(%rbp),%rsi    #ctx_hdr_ptr
 923     movq    $512,%rdi                #now <rdi,rsi,rdx> are set for the call
 924     jmp     Skein_Debug_Round_Common #tail jump: the common routine's ret goes back to our caller
 925   .endif
 926 #
 927 .if _SKEIN_CODE_SIZE                   #optional size/unroll query helpers
 928 C_label Skein_512_Process_Block_CodeSize
 929     movq    $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax   #rax = End label minus entry label (bytes)
 930     ret
 931 # Skein_512_Unroll_Cnt: rax = _UNROLL_CNT, or 0 when it equals ROUNDS_512/8 (all rounds emitted inline)
 932 C_label Skein_512_Unroll_Cnt
 933   .if _UNROLL_CNT == (ROUNDS_512/8)    #no round loop was generated?
 934     xorq    %rax,%rax                  #  then report zero
 935   .else
 936     movq    $_UNROLL_CNT,%rax          #  else report the loop unroll count
 937   .endif
 938     ret
 939 .endif
 940 #
 941 .endif # _USE_ASM_ & 512
 942 #
 943 #=================================== Skein1024 =============================================
 944 .if _USE_ASM_ & 1024
 945 #
 946 # void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd);
 947 #
 948 #################
 949 # use details of permutation to make register assignments
 950 # (o1K_<reg> = index of the X[] word kept in <reg>; the code below pastes the register name onto o1K_ to form stack/context offsets)
 951 o1K_rdi =  0        #offsets in X[] associated with each register
 952 o1K_rsi =  1
 953 o1K_rbp =  2
 954 o1K_rax =  3
 955 o1K_rcx =  4        #rcx is "shared" with X6, since X4/X6 alternate
 956 o1K_rbx =  5
 957 o1K_rdx =  7        #(no entry for word 6: it is either in rcx or in its X_stk slot)
 958 o1K_r8  =  8
 959 o1K_r9  =  9
 960 o1K_r10 = 10
 961 o1K_r11 = 11
 962 o1K_r12 = 12
 963 o1K_r13 = 13
 964 o1K_r14 = 14
 965 o1K_r15 = 15
 966 #
 967 rIdx_offs = tmpStk_1024      #stack slot for the key-injection index used when the 1024-bit rounds are looped
 968 # MACRO: one MIX on the word pair (w0,w1) held in reg0/reg1; on rounds 3 mod 4 also inject key/tweak; finally emit op1
 969 .macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
 970     addReg      \reg0 , \reg1                      #perform the MIX
 971     RotL64      \reg1 , 1024,%((_RN0_) % 8),_Rn1_  #  (round mod 8 and mix slot are handed to RotL64)
 972     xorReg      \reg1 , \reg0
 973 .if ((_RN0_) & 3) == 3          #time to do key injection? (bitwise AND: a logical AND could never equal 3)
 974  .if _SKEIN_DEBUG
 975     movq       %\reg0 , xDebug_1024+8*w0(%rsp)     #save intermediate values for Debug_Round
 976     movq       %\reg1 , xDebug_1024+8*w1(%rsp)     # (before inline key injection)
 977  .endif
 978 _II_ = ((_RN0_)/4)+1            #injection count
 979  .if SKEIN_ASM_UNROLL & 1024    #here to do fully unrolled key injection (test only the 1024 bit of the mask)
 980     addq        ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0
 981     addq        ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1
 982   .if     w1 == 13                                 #tweak injection
 983     addq        ksTwk+ 8*((_II_+ 0) %  3)(%rsp),%\reg1
 984   .elseif w0 == 14
 985     addq        ksTwk+ 8*((_II_+ 1) %  3)(%rsp),%\reg0
 986   .elseif w1 == 15
 987     addq        $_II_, %\reg1                      #(injection counter)
 988   .endif
 989  .else                          #here to do looping  key injection
 990   .if  (w0 == 0)
 991     movq        %rdi, X_stk+8*w0(%rsp)             #if so, store N0 so we can use reg as index
 992     movq         rIdx_offs(%rsp),%rdi              #get the injection counter index into rdi
 993   .else
 994     addq         ksKey+8+8*w0(%rsp,%rdi,8),%\reg0  #even key injection
 995   .endif
 996   .if     w1 == 13                                 #tweak injection
 997     addq         ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
 998   .elseif w0 == 14
 999     addq         ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
1000   .elseif w1 == 15
1001     addReg      \reg1,rdi,,,1                      #(injection counter)
1002   .endif
1003     addq         ksKey+8+8*w1(%rsp,%rdi,8),%\reg1  #odd key injection
1004  .endif
1005 .endif
1006     # insert the op provided, if any
1007     op1
1008 .endm
1009 #################
1010 # MACRO: four rounds for 1024-bit blocks
1011 #   (8 MIXes per round; the looping build then rotates the key schedule and injects X0)
1012 .macro r1024_FourRounds _RR_    #RR = base round number (0 mod 4)
1013     # should be here with X4 set properly, X6 stored on stack
1014 _Rn_ = (_RR_) + 0
1015         r1024_Mix  0, 1,rdi,rsi,_Rn_,0
1016         r1024_Mix  2, 3,rbp,rax,_Rn_,1
1017         r1024_Mix  4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
1018         r1024_Mix  8, 9,r8 ,r9 ,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
1019         r1024_Mix 10,11,r10,r11,_Rn_,5
1020         r1024_Mix 12,13,r12,r13,_Rn_,6
1021         r1024_Mix  6, 7,rcx,rdx,_Rn_,3
1022         r1024_Mix 14,15,r14,r15,_Rn_,7
1023     .if _SKEIN_DEBUG
1024       Skein_Debug_Round 1024,%(_Rn_+1)
1025     .endif
1026 _Rn_ = (_RR_) + 1
1027         r1024_Mix  0, 9,rdi,r9 ,_Rn_,0
1028         r1024_Mix  2,13,rbp,r13,_Rn_,1
1029         r1024_Mix  6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
1030         r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
1031         r1024_Mix 12, 3,r12,rax,_Rn_,5
1032         r1024_Mix 14, 5,r14,rbx,_Rn_,6
1033         r1024_Mix  4,15,rcx,r15,_Rn_,3
1034         r1024_Mix  8, 1,r8 ,rsi,_Rn_,7
1035     .if _SKEIN_DEBUG
1036       Skein_Debug_Round 1024,%(_Rn_+1)
1037     .endif
1038 _Rn_ = (_RR_) + 2
1039         r1024_Mix  0, 7,rdi,rdx,_Rn_,0
1040         r1024_Mix  2, 5,rbp,rbx,_Rn_,1
1041         r1024_Mix  4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)>       #save X4  on  stack (x4/x6 alternate)
1042         r1024_Mix 12,15,r12,r15,_Rn_,4,<movq      X_stk+8*6(%rsp),%rcx>  #load X6 from stack
1043         r1024_Mix 14,13,r14,r13,_Rn_,5
1044         r1024_Mix  8,11,r8 ,r11,_Rn_,6
1045         r1024_Mix  6, 1,rcx,rsi,_Rn_,3
1046         r1024_Mix 10, 9,r10,r9 ,_Rn_,7
1047     .if _SKEIN_DEBUG
1048       Skein_Debug_Round 1024,%(_Rn_+1)
1049     .endif
1050 _Rn_ = (_RR_) + 3
1051         r1024_Mix  0,15,rdi,r15,_Rn_,0
1052         r1024_Mix  2,11,rbp,r11,_Rn_,1
1053         r1024_Mix  6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)>       #save X6  on  stack (x4/x6 alternate)
1054         r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq      X_stk+8*4(%rsp),%rcx>  #load X4 from stack
1055         r1024_Mix  8, 5,r8 ,rbx,_Rn_,5
1056         r1024_Mix 10, 3,r10,rax,_Rn_,6
1057         r1024_Mix  4, 9,rcx,r9 ,_Rn_,3
1058         r1024_Mix 12, 7,r12,rdx,_Rn_,7
1059     .if _SKEIN_DEBUG
1060       Skein_Debug_Round 1024,%(_Rn_+1)
1061     .endif
1062
1063   .if (SKEIN_ASM_UNROLL & 1024) == 0            #here with rdi == rIdx, X0 on stack (must match the test in r1024_Mix)
1064     #"rotate" the key schedule on the stack
1065 i8 = o1K_r8
1066 i0 = o1K_rdi
1067     movq    %r8 , X_stk+8*i8(%rsp)              #free up a register (save it on the stack)
1068     movq          ksKey+8* 0(%rsp,%rdi,8),%r8   #get  key  word
1069     movq    %r8 , ksKey+8*17(%rsp,%rdi,8)       #rotate key (must do key first or tweak clobbers it!)
1070     movq          ksTwk+8* 0(%rsp,%rdi,8),%r8   #get tweak word
1071     movq    %r8 , ksTwk+8* 3(%rsp,%rdi,8)       #rotate tweak (onto the stack)
1072     movq          X_stk+8*i8(%rsp)       ,%r8   #get the reg back
1073     incq    %rdi                                #bump the index
1074     movq    %rdi, rIdx_offs (%rsp)              #save rdi again
1075     movq          ksKey+8*i0(%rsp,%rdi,8),%rdi  #get the key schedule word for X0 back
1076     addq          X_stk+8*i0(%rsp)       ,%rdi  #perform the X0 key injection
1077   .endif
1078     #show the result of the key injection
1079     Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
1080 .endm #r1024_FourRounds
1081 #
1082 ################
1083 # code: Skein1024_Process_Block entry point (C prototype is given at the top of this section)
1084 #   per block: update tweak T0, build key/tweak schedule on the stack, run the rounds, XOR the msg copy back into ctx->X[]
1085 C_label Skein1024_Process_Block
1086 #
1087     Setup_Stack 1024,ROUNDS_1024/8,WCNT
1088     movq    TWEAK+ 8(%rdi),%r9             #r9 = tweak word at TWEAK+8; stays live across block iterations
1089     jmp     Skein1024_block_loop
1090     # main hash loop for Skein1024
1091     .p2align 4
1092 Skein1024_block_loop:
1093     # register usage (here: RDI --> context, RBP --> frame base for the F_O offsets, R9 = tweak word from TWEAK+8):
1094     #   RSP              = stack pointer
1095     #   RDI,RSI,RBP,RAX  = X0..X3 during the rounds; RCX = X4/X6 (alternating); RBX = X5; RDX = X7
1096     #   R8 ..R15         = X8..X15    (state words)
1097     #   (RBP is reused for X2, so the post-round code addresses the frame via RSP until RBP is rebuilt)
1098     #
1099   .if (SKEIN_ASM_UNROLL & 1024) == 0
1100     xorq    %rax,%rax                      #init loop index on the stack
1101     movq    %rax,rIdx_offs(%rsp)
1102   .endif
1103     movq         TWEAK+     0(%rdi),%r8
1104     addq         bitAdd+  F_O(%rbp),%r8    #computed updated tweak value T0
1105     movq    %r9 ,%r10
1106     xorq    %r8 ,%r10                      #%r8/%r9/%r10 = tweak schedule (third word = r8 xor r9)
1107     movq    %r8 ,TWEAK+     0(%rdi)        #save updated tweak value ctx->h.T[0]
1108     movq    %r8 ,ksTwk+ 0+F_O(%rbp)
1109     movq    %r9 ,ksTwk+ 8+F_O(%rbp)        #keep values in %r8 ,%r9  for initial tweak injection below
1110     movq    %r10,ksTwk+16+F_O(%rbp)
1111   .if _SKEIN_DEBUG
1112     movq    %r9 ,TWEAK+     8(%rdi)        #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
1113   .endif
1114     movq         blkPtr +F_O(%rbp),%rsi    # rsi --> input block
1115     movq        $KW_PARITY        ,%rax    #overall key schedule parity
1116
1117     # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
1118     .irp _rN_,0,1,2,3,4,6                  #process the "initial" words, using r14/r15 as temps
1119       movq       X_VARS+8*_rN_(%rdi),%r14  #get state word
1120       movq              8*_rN_(%rsi),%r15  #get msg   word
1121       xorq  %r14,%rax                      #update key schedule overall parity
1122       movq  %r14,ksKey +8*_rN_+F_O(%rbp)   #save key schedule word on stack
1123       movq  %r15,Wcopy +8*_rN_+F_O(%rbp)   #save local msg Wcopy
1124       addq  %r15,%r14                      #do the initial key injection
1125       movq  %r14,X_stk +8*_rN_    (%rsp)   #save initial state var on stack
1126     .endr
1127     # now process the rest, using the "real" registers
1128     #     (MUST do it in reverse order to inject tweaks r8/r9 first)
1129     .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
1130 _oo_ = o1K_\_rr_                           #offset associated with the register
1131       movq  X_VARS+8*_oo_(%rdi),%\_rr_     #get key schedule word from context
1132       movq         8*_oo_(%rsi),%rcx       #get next input msg word
1133       movq  %\_rr_, ksKey +8*_oo_(%rsp)    #save key schedule on stack
1134       xorq  %\_rr_, %rax                   #accumulate key schedule parity
1135       movq  %rcx,Wcopy+8*_oo_+F_O(%rbp)    #save copy of msg word for feedforward
1136       addq  %rcx,%\_rr_                    #do the initial  key  injection
1137       .if    _oo_ == 13                    #do the initial tweak injection
1138         addReg _rr_,r8                     #          (only in words 13/14)
1139       .elseif _oo_ == 14
1140         addReg _rr_,r9
1141       .endif
1142     .endr
1143     movq    %rax,ksKey+8*WCNT+F_O(%rbp)    #save key schedule parity
1144 .if _SKEIN_DEBUG
1145     Skein_Debug_Block 1024                 #initial debug dump
1146 .endif
1147     addq     $8*WCNT,%rsi                  #bump the msg ptr
1148     movq     %rsi,blkPtr+F_O(%rbp)         #save bumped msg ptr
1149     # re-load words 0..4 from stack, enter the main loop
1150     .irp _rr_,rdi,rsi,rbp,rax,rcx          #(no need to re-load x6, already on stack)
1151       movq  X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
1152     .endr
1153 .if _SKEIN_DEBUG
1154     Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL        #show state after initial key injection
1155 .endif
1156     #
1157     #################
1158     # now the key schedule is computed. Start the rounds
1159     #
1160 .if SKEIN_ASM_UNROLL & 1024
1161 _UNROLL_CNT =   ROUNDS_1024/8
1162 .else
1163 _UNROLL_CNT =   SKEIN_UNROLL_1024
1164   .if ((ROUNDS_1024/8) % _UNROLL_CNT)
1165     .error "Invalid SKEIN_UNROLL_1024"
1166   .endif
1167 Skein1024_round_loop:
1168 .endif
1169 #
1170 _Rbase_ = 0
1171 .rept _UNROLL_CNT*2                        #implement the rounds, 4 at a time
1172       r1024_FourRounds %(4*_Rbase_+00)
1173 _Rbase_ = _Rbase_+1
1174 .endr #rept _UNROLL_CNT
1175 #
1176 .if (SKEIN_ASM_UNROLL & 1024) == 0
1177     cmpq    $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done (tmpStk_1024 is the rIdx_offs slot)
1178     jb      Skein1024_round_loop
1179 .endif
1180     # end of rounds
1181     #################
1182     #
1183     # feedforward:   ctx->X[i] = X[i] ^ w[i], {i=0..15}
1184     movq    %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
1185     movq       ctxPtr(%rsp),%rdx
1186
1187     .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15   #do all but x6,x7
1188 _oo_ = o1K_\_rr_
1189       xorq  Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
1190       movq  %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
1191       .if (_oo_ ==  9)
1192         movq   $FIRST_MASK64 ,%r9      #x9 is stored, so r9 is free: start forming the masked tweak word
1193       .endif
1194       .if (_oo_ == 14)
1195         andq   TWEAK+ 8(%rdx),%r9      #r9 = tweak word at TWEAK+8 masked with FIRST_MASK64
1196       .endif
1197     .endr
1198     #
1199     movq         X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
1200     movq         X_stk +8*7(%rsp),%rbx
1201     xorq         Wcopy +8*6(%rsp),%rax
1202     xorq         Wcopy +8*7(%rsp),%rbx
1203     movq    %rax,X_VARS+8*6(%rdx)
1204     decq             blkCnt(%rsp)      #set zero flag iff done
1205     movq    %rbx,X_VARS+8*7(%rdx)
1206
1207     Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
1208     # go back for more blocks, if needed
1209     movq             ctxPtr(%rsp),%rdi #don't muck with the flags here!
1210     lea          FRAME_OFFS(%rsp),%rbp #rebuild the frame base (rbp held X2 in the rounds); lea leaves flags alone
1211     jnz     Skein1024_block_loop
1212     movq    %r9 ,TWEAK+   8(%rdx)      #last block done: write the masked tweak word back to the context
1213     Reset_Stack
1214     ret
1215 #
1216 Skein1024_Process_Block_End:
1217 # Skein_Debug_Round_1024: debug-only entry; copies the register-resident X[] words into X_stk[], then tail-jumps to the common dumper
1218 .if _SKEIN_DEBUG
1219 Skein_Debug_Round_1024:
1220     # call here with rdx  = "round number",
1221 _SP_OFFS_ = 8*2                     #stack "offset" here: rdx, return addr
1222     #
1223   #save rest of X[] state on stack so debug routines can access it
1224   .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
1225     movq    %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
1226   .endr
1227     # Figure out what to do with x0 (rdi).  When rdx == 0 mod 4, it's already on stack
1228     cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always save
1229     jae     save_x0
1230     testq   $3,%rdx                 #otherwise only if rdx != 0 mod 4
1231     jz      save_x0_not
1232 save_x0:
1233     movq    %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
1234 save_x0_not:
1235     #figure out the x4/x6 swapping state and save the correct one!
1236     cmpq    $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
1237     jae     save_x4
1238     testq   $1,%rdx                  #and even ones have x4 in rcx as well
1239     jz      save_x4
1240     movq    %rcx,X_stk+8*6+_SP_OFFS_(%rsp)  #odd round number: rcx currently holds x6
1241     jmp     debug_1024_go
1242 save_x4:
1243     movq    %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
1244 debug_1024_go:
1245     #now all is saved in Xstk[] except for rdx
1246     push    %rsi                    #save two regs for BLK_BITS-specific parms
1247     push    %rdi
1248 _SP_OFFS_ = _SP_OFFS_ + 16          #adjust stack offset accordingly (now 32)
1249
1250     movq    _SP_OFFS_-8(%rsp),%rsi  #get back original %rdx (pushed on stack in macro call)
1251     movq    %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
1252
1253     movq    ctxPtr+_SP_OFFS_(%rsp),%rsi  #rsi = ctx_hdr_ptr
1254     movq    $1024,%rdi                   #rdi = block size
1255     jmp     Skein_Debug_Round_Common     #tail jump: its ret returns to our caller
1256 .endif
1257 #
1258 .if _SKEIN_CODE_SIZE                   #optional size/unroll query helpers
1259 C_label Skein1024_Process_Block_CodeSize
1260     movq    $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax   #rax = End label minus entry label (bytes)
1261     ret
1262 # Skein1024_Unroll_Cnt: rax = _UNROLL_CNT, or 0 when it equals ROUNDS_1024/8 (all rounds emitted inline)
1263 C_label Skein1024_Unroll_Cnt
1264   .if _UNROLL_CNT == (ROUNDS_1024/8)   #no round loop was generated?
1265     xorq    %rax,%rax                  #  then report zero
1266   .else
1267     movq    $_UNROLL_CNT,%rax          #  else report the loop unroll count
1268   .endif
1269     ret
1270 .endif
1271 #
1272 .endif # _USE_ASM_ and 1024
1273 #
1274 .if _SKEIN_DEBUG
1275 #----------------------------------------------------------------
1276 #local debug routine to set up for calls to:
1277 #  void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
1278 #                       [       rdi                        rsi   rdx              rcx]
1279 #
1280 # here with %rdx = round number
1281 #           %rsi = ctx_hdr_ptr
1282 #           %rdi = block size (256/512/1024)
1283 # on stack: saved rdi, saved rsi, retAddr, saved rdx
1284 # on exit:  rdi/rsi popped back; the saved rdx slot is left in place (presumably popped by the invoking macro, not visible here)
1285 Skein_Debug_Round_Common:
1286 _SP_OFFS_ = 32                        #account for four words on stack already
1287   .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15  #save the rest of the regs
1288     pushq %\_rr_
1289 _SP_OFFS_ = _SP_OFFS_+8
1290   .endr
1291   .if (_SP_OFFS_ % 16)                # make sure stack is still 16-byte aligned here
1292     .error  "Debug_Round_Common: stack alignment"
1293   .endif
1294     # compute %rcx  = ptr to the X[] array on the stack (final parameter to call)
1295     leaq    X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
1296     cmpq    $SKEIN_RND_FEED_FWD,%rdx   #special handling for feedforward "round"?
1297     jnz     _got_rcxA
1298     leaq    X_VARS(%rsi),%rcx          #feedforward: show the values just written to the context
1299 _got_rcxA:
1300   .if _USE_ASM_ & 1024
1301     # special handling for 1024-bit case
1302     #    (for rounds right before with key injection:
1303     #        use xDebug_1024[] instead of X_stk[])
1304     cmpq    $SKEIN_RND_SPECIAL,%rdx
1305     jae     _got_rcxB               #special round codes keep the pointer chosen above
1306     orq     %rdx,%rdx
1307     jz      _got_rcxB               #round number 0 keeps X_stk[]
1308     test    $3,%rdx
1309     jne     _got_rcxB               #only round numbers that are 0 mod 4 have xDebug_1024[] data
1310     cmp     $1024,%rdi              #only 1024-bit(s) for now
1311     jne     _got_rcxB
1312     leaq    xDebug_1024+_SP_OFFS_(%rsp),%rcx
1313 _got_rcxB:
1314   .endif
1315     call    Skein_Show_Round        #call external debug handler
1316
1317   .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax  #restore regs
1318     popq  %\_rr_
1319 _SP_OFFS_ = _SP_OFFS_-8
1320   .endr
1321   .if _SP_OFFS_ - 32                  # assemble-time check that pushes and pops balance
1322     .error   "Debug_Round_Common: push/pop misalignment!"
1323   .endif
1324     popq    %rdi
1325     popq    %rsi
1326     ret
1327 .endif
1328 #----------------------------------------------------------------
1329     .section .note.GNU-stack,"",@progbits
1330
1331     .end