2 #----------------------------------------------------------------
3 # 64-bit x86 assembler code (gnu as) for Skein block functions
5 # Author: Doug Whiting, Hifn/Exar
7 # This code is released to the public domain.
8 #----------------------------------------------------------------
# Build-time configuration. Optional symbols consulted below: SKEIN_USE_ASM,
# SKEIN_LOOP, SKEIN_ROUNDS, SKEIN_CODE_SIZE and SKEIN_PERF.
13 .psize 0,128 #list file has no page boundaries
15 _MASK_ALL_ = (256+512+1024) #all three algorithm bits
20 _USE_ASM_ = _MASK_ALL_
22 _USE_ASM_ = SKEIN_USE_ASM
25 .ifndef SKEIN_LOOP #configure loop unrolling
26 _SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024
28 _SKEIN_LOOP = SKEIN_LOOP
29 .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line
30 .print "+++ SKEIN_LOOP = \_NN_"
33 # the unroll counts (0 --> fully unrolled)
# (one decimal digit of _SKEIN_LOOP per block size: hundreds=256, tens=512, units=1024)
34 SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10
35 SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10
36 SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10
39 .irp _NN_,256,512,1024
40 .if (SKEIN_UNROLL_\_NN_) == 0
41 SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_ #add this size's bit when its unroll count is 0
# round counts: each decimal digit d of SKEIN_ROUNDS selects 8*(((d+5) % 10)+5) rounds
# (hundreds digit -> 256, tens digit -> 512, units digit -> 1024)
51 ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
52 ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5)
53 ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)
54 # only display rounds if default size is changed on command line
55 .irp _NN_,256,512,1024
56 .if _USE_ASM_ && \_NN_
57 .irp _RR_,%(ROUNDS_\_NN_)
59 .print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
61 .print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
# _SKEIN_CODE_SIZE gates emission of the *_CodeSize / *_Unroll_Cnt helper routines
69 .ifdef SKEIN_CODE_SIZE
70 _SKEIN_CODE_SIZE = (1)
72 .ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined
73 _SKEIN_CODE_SIZE = (1)
75 _SKEIN_CODE_SIZE = (0)
88 # define offsets of fields in hash context structure
# (each offset is expressed relative to the offset of the preceding field)
90 HASH_BITS = 0 #bits of hash output
91 BCNT = 8 + HASH_BITS #number of bytes in BUFFER[]
92 TWEAK = 8 + BCNT #tweak values[0..1]
93 X_VARS = 16 + TWEAK #chaining vars
95 #(Note: buffer[] in context structure is NOT needed here :-)
97 KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words
98 FIRST_MASK = ~ (1 << 6) #all bits set except bit 6
99 FIRST_MASK64= ~ (1 << 62) #all bits set except bit 62; ANDed with the word at TWEAK+8 in the feedforward code
101 # rotation constants for Skein
240 # Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
# The rotation count _RCNT_ is read from the symbol whose name is built from the
# BLK_SIZE, ROUND_NUM and MIX_NUM arguments; the guarded body is assembled only
# when that count is non-zero.
242 .macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
243 _RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM
244 .if _RCNT_ #is there anything to do?
249 #----------------------------------------------------------------
251 # MACROS: define local vars and configure stack
253 #----------------------------------------------------------------
254 # declare allocated space on the stack
# localName is defined as the current running offset _STK_OFFS_, which is then
# advanced by localSize bytes (so variables are laid out in declaration order).
255 .macro StackVar localName,localSize
256 \localName = _STK_OFFS_
257 _STK_OFFS_ = _STK_OFFS_+(\localSize)
260 #----------------------------------------------------------------
262 # MACRO: Configure stack frame, allocate local vars
# Arguments:
#   BLK_BITS = block size in bits (WCNT = BLK_BITS/64 words)
#   KS_CNT   = sizes the ksRot area (16*KS_CNT bytes) when the SKEIN_ASM_UNROLL
#              test on BLK_BITS below evaluates to 0
#   debugCnt = if non-zero, reserves 8*debugCnt bytes named xDebug_<BLK_BITS>
# The emitted code reserves LOCAL_SIZE bytes, points rbp at FRAME_OFFS(%rsp)
# and stores the four argument registers (rdi, rsi, rdx, rcx) in the frame.
# NOTE(review): F_O is defined outside this view -- presumably -FRAME_OFFS; confirm.
264 .macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
265 WCNT = (\BLK_BITS)/64
267 _PushCnt_ = 0 #save nonvolatile regs on stack
268 .irp _reg_,rbp,rbx,r12,r13,r14,r15
270 _PushCnt_ = _PushCnt_ + 1 #track count to keep alignment
273 _STK_OFFS_ = 0 #starting offset from rsp
274 #---- local variables #<-- rsp
275 StackVar X_stk ,8*(WCNT) #local context vars
276 StackVar ksTwk ,8*3 #key schedule: tweak words
277 StackVar ksKey ,8*(WCNT)+8 #key schedule: key words
278 .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
279 StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
281 StackVar Wcopy ,8*(WCNT) #copy of input block
283 .if \debugCnt + 0 #temp location for debug X[] info
284 StackVar xDebug_\BLK_BITS ,8*(\debugCnt)
287 .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
288 StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?)
289 tmpStk_\BLK_BITS = align16 #use this
291 #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
292 StackVar ctxPtr ,8 #context ptr
293 StackVar blkPtr ,8 #pointer to block data
294 StackVar blkCnt ,8 #number of full blocks to process
295 StackVar bitAdd ,8 #bit count to add to tweak
296 LOCAL_SIZE = _STK_OFFS_ #size of "local" vars
298 StackVar savRegs,8*_PushCnt_ #saved registers
299 StackVar retAddr,8 #return address
300 #---- caller's stack frame (aligned mod 16)
302 # set up the stack frame pointer (rbp)
304 FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, ksKey
305 .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range
306 FRAME_OFFS = _STK_OFFS_
310 #put some useful defines in the .lst file (for grep)
311 __STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
312 __STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
313 __STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
315 # Notes on stack frame setup:
316 # * the most frequently used variable is X_stk[], based at [rsp+0]
317 # * the next most used is the key schedule arrays, ksKey and ksTwk
318 # so rbp is "centered" there, allowing short offsets to the key
319 # schedule even in 1024-bit Skein case
320 # * the Wcopy variables are infrequently accessed, but they have long
321 # offsets from both rsp and rbp only in the 1024-bit case.
322 # * all other local vars and calling parameters can be accessed
323 # with short offsets, except in the 1024-bit case
325 subq $LOCAL_SIZE,%rsp #make room for the locals
326 leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets
327 movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack
328 movq %rsi, blkPtr+F_O(%rbp)
329 movq %rdx, blkCnt+F_O(%rbp)
330 movq %rcx, bitAdd+F_O(%rbp)
334 #----------------------------------------------------------------
# Frame teardown (the enclosing .macro line is outside this view): releases the
# locals, then pops the registers in the reverse of the order listed in Setup_Stack.
337 addq $LOCAL_SIZE,%rsp #get rid of locals (wipe??)
338 .irp _reg_,r15,r14,r13,r12,rbx,rbp
339 popq %\_reg_ #restore caller's regs
340 _PushCnt_ = _PushCnt_ - 1 #undo the count kept by Setup_Stack
343 .error "Mismatched push/pops?"
347 #----------------------------------------------------------------
348 # macros to help debug internals
351 .extern Skein_Show_Block #calls to C routines
352 .extern Skein_Show_Round
# pseudo round numbers (>= SKEIN_RND_SPECIAL) passed to Skein_Debug_Round for non-round events
354 SKEIN_RND_SPECIAL = 1000
355 SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
356 SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
357 SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
# Skein_Debug_Block BLK_BITS: saves the volatile registers, loads the arguments of
# the C prototype quoted below (six in registers, the tweak pointer on the stack),
# calls Skein_Show_Block, then restores the registers in reverse order.
359 .macro Skein_Debug_Block BLK_BITS
361 #void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
362 # const u08b_t *blkPtr, const u64b_t *wPtr,
363 # const u64b_t *ksPtr,const u64b_t *tsPtr)
366 .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
367 pushq %\_reg_ #save all volatile regs on stack before the call
370 # get and push call parameters
371 movq $\BLK_BITS ,%rdi #bits
372 movq ctxPtr+F_O(%rbp),%rsi #h (pointer)
373 leaq X_VARS (%rsi),%rdx #X (pointer)
374 movq blkPtr+F_O(%rbp),%rcx #blkPtr
375 leaq Wcopy +F_O(%rbp),%r8 #wPtr
376 leaq ksKey +F_O(%rbp),%r9 #key pointer
377 leaq ksTwk +F_O(%rbp),%rax #tweak pointer
378 pushq %rax # (pass on the stack)
379 call Skein_Show_Block #call external debug handler
380 addq $8*1,%rsp #discard parameters on stack
381 .if (_NN_ % 2 ) == 0 #check stack alignment
382 .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS"
384 .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
385 popq %\_reg_ #restore regs
389 .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS"
391 .endm # Skein_Debug_Block
393 # the macro to "call" to debug a round
# Arguments:
#   BLK_BITS = block size; selects the local routine Skein_Debug_Round_<BLK_BITS>
#   R        = round number, or one of the SKEIN_RND_* special values
#   RDI_OFFS = optional offset folded into the computed round number (looping case)
#   afterOp  = its use is not visible in this view
# rdx is saved before the call and restored afterwards.
395 .macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
396 # call the appropriate (local) debug "function"
397 pushq %rdx #save rdx, so we can use it for round "number"
398 .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
400 .else #compute round number using rdi
401 _rOffs_ = \RDI_OFFS + 0
402 .if \BLK_BITS == 1024
403 movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above)
404 leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
406 leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
409 call Skein_Debug_Round_\BLK_BITS
410 popq %rdx #restore original rdx value
413 .endm # Skein_Debug_Round
414 .else #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
# same macro names and parameter lists as the debug versions, so call sites need no conditionals
415 .macro Skein_Debug_Block BLK_BITS
418 .macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
421 .endif # _SKEIN_DEBUG
423 #----------------------------------------------------------------
# addReg: dstReg += <source register> (+ immOffs in the first lea form).
# The source register name is formed by concatenating srcReg_A and srcReg_B, so
# callers may pass either a full name (srcReg_B empty) or a prefix plus a number.
# useAddOp and ASM_NO_LEA select between the leaq and addq encodings.
425 .macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
427 leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
428 .elseif ((\useAddOp + 0) == 0)
429 .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs!
430 leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
432 addq %\srcReg_A\srcReg_B,%\dstReg
435 addq %\srcReg_A\srcReg_B,%\dstReg
439 # keep Intel-style ordering here, to match addReg
# xorReg: dstReg ^= register named by concatenating srcReg_A and srcReg_B
440 .macro xorReg dstReg,srcReg_A,srcReg_B
441 xorq %\srcReg_A\srcReg_B,%\dstReg
444 #----------------------------------------------------------------
# (the enclosing .macro line is outside this view) defines a label named by the lName argument
447 \lName: #use both "genders" to work across linkage conventions
453 #=================================== Skein_256 =============================================
457 # void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
# Arguments arrive in rdi, rsi, rdx, rcx and are saved in the frame by Setup_Stack.
# Per block: update the tweak, load the chaining vars and the input block, run the
# rounds with key injection, then XOR the saved input copy into the state
# (feedforward) and store it back at X_VARS in the context.
463 C_label Skein_256_Process_Block
464 Setup_Stack 256,((ROUNDS_256/8)+1)
465 movq TWEAK+8(%rdi),%r14
466 jmp Skein_256_block_loop
468 # main hash loop for Skein_256
469 Skein_256_block_loop:
471 # general register usage:
473 # R08..R12 = ks[0..4]
474 # R13..R15 = ts[0..2]
475 # RSP, RBP = stack/frame pointers
476 # RDI = round counter or context pointer
479 movq TWEAK+0(%rdi) ,%r13
480 addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0
482 xorq %r13 ,%r15 #now %r13.%r15 is set as the tweak
484 movq $KW_PARITY ,%r12
485 movq X_VARS+ 0(%rdi),%r8
486 movq X_VARS+ 8(%rdi),%r9
487 movq X_VARS+16(%rdi),%r10
488 movq X_VARS+24(%rdi),%r11
489 movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0]
490 xorq %r8 ,%r12 #start accumulating overall parity
492 movq blkPtr +F_O(%rbp) ,%rsi #rsi --> input block
494 movq 0(%rsi) ,%rax #get X[0..3]
501 movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block
502 movq %rbx,Wcopy+ 8+F_O(%rbp)
503 movq %rcx,Wcopy+16+F_O(%rbp)
504 movq %rdx,Wcopy+24+F_O(%rbp)
506 addq %r8 ,%rax #initial key injection
514 movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?)
515 movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block
516 movq %r9 ,ksKey+ 8+F_O(%rbp)
517 movq %r10,ksKey+16+F_O(%rbp)
518 movq %r11,ksKey+24+F_O(%rbp)
519 movq %r12,ksKey+32+F_O(%rbp)
521 movq %r13,ksTwk+ 0+F_O(%rbp)
522 movq %r14,ksTwk+ 8+F_O(%rbp)
523 movq %r15,ksTwk+16+F_O(%rbp)
525 movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block
526 movq %rbx,X_stk + 8(%rsp)
527 movq %rcx,X_stk +16(%rsp)
528 movq %rdx,X_stk +24(%rsp)
530 Skein_Debug_Block 256 #debug dump
531 Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
534 .if ((SKEIN_ASM_UNROLL & 256) == 0)
# looping case: the round loop below only reads ksKey/ksTwk at index rdi+1 and up,
# so ks[0] (r8) is stored at slot 5 and ts[0] (r13) at slot 3
535 movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code
536 movq %r9 ,ksKey+ 8+F_O(%rbp)
537 movq %r10,ksKey+16+F_O(%rbp)
538 movq %r11,ksKey+24+F_O(%rbp)
539 movq %r12,ksKey+32+F_O(%rbp)
541 movq %r13,ksTwk+24+F_O(%rbp)
542 movq %r14,ksTwk+ 8+F_O(%rbp)
543 movq %r15,ksTwk+16+F_O(%rbp)
545 addq $WCNT*8,%rsi #skip the block
546 movq %rsi,blkPtr +F_O(%rbp) #update block pointer
548 # now the key schedule is computed. Start the rounds
550 .if SKEIN_ASM_UNROLL & 256
551 _UNROLL_CNT = ROUNDS_256/8
553 _UNROLL_CNT = SKEIN_UNROLL_256
554 .if ((ROUNDS_256/8) % _UNROLL_CNT)
555 .error "Invalid SKEIN_UNROLL_256"
557 xorq %rdi,%rdi #rdi = iteration count
558 Skein_256_round_loop:
562 # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled)
563 # round 4*_RBase_ + 0
565 RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0
567 .if (SKEIN_ASM_UNROLL & 256) == 0
568 movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
571 RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1
573 .if SKEIN_ASM_UNROLL & 256
574 .irp _r0_,%( 8+(_Rbase_+3) % 5)
575 .irp _r1_,%(13+(_Rbase_+2) % 3)
576 leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx
580 .if (SKEIN_ASM_UNROLL & 256) == 0
581 movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
583 Skein_Debug_Round 256,%(4*_Rbase_+1)
585 # round 4*_Rbase_ + 1
587 RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0
589 .if (SKEIN_ASM_UNROLL & 256) == 0
590 movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
593 RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1
595 .if (SKEIN_ASM_UNROLL & 256) == 0
596 movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
598 Skein_Debug_Round 256,%(4*_Rbase_+2)
599 .if SKEIN_ASM_UNROLL & 256
600 .irp _r0_,%( 8+(_Rbase_+2) % 5)
601 .irp _r1_,%(13+(_Rbase_+1) % 3)
602 leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx
606 # round 4*_Rbase_ + 2
608 RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0
610 .if (SKEIN_ASM_UNROLL & 256) == 0
611 movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
614 RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1
616 .if (SKEIN_ASM_UNROLL & 256) == 0
617 movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key
618 leaq 1(%r11,%rdi),%r11 #precompute key + (rdi+1)
620 Skein_Debug_Round 256,%(4*_Rbase_+3)
621 # round 4*_Rbase_ + 3
623 RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0
625 .if (SKEIN_ASM_UNROLL & 256) == 0
626 addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak
627 movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak
630 RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1
632 Skein_Debug_Round 256,%(4*_Rbase_+4)
633 .if (SKEIN_ASM_UNROLL & 256) == 0
634 addReg r9 ,r13 #precompute key+tweak
636 #inject key schedule words
638 .if SKEIN_ASM_UNROLL & 256
639 addReg rax,r,%(8+((_Rbase_+0) % 5))
642 addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
650 Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
651 .endr #rept _UNROLL_CNT
653 .if (SKEIN_ASM_UNROLL & 256) == 0
654 cmpq $2*(ROUNDS_256/8),%rdi
655 jb Skein_256_round_loop
656 .endif # (SKEIN_ASM_UNROLL & 256) == 0
657 movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context
659 #----------------------------
660 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
661 movq $FIRST_MASK64 ,%r14
662 xorq Wcopy + 0+F_O (%rbp),%rax
663 xorq Wcopy + 8+F_O (%rbp),%rbx
664 xorq Wcopy +16+F_O (%rbp),%rcx
665 xorq Wcopy +24+F_O (%rbp),%rdx
666 andq TWEAK + 8 (%rdi),%r14 #r14 = T[1] with bit 62 cleared
667 movq %rax,X_VARS+ 0(%rdi) #store final result
668 movq %rbx,X_VARS+ 8(%rdi)
669 movq %rcx,X_VARS+16(%rdi)
670 movq %rdx,X_VARS+24(%rdi)
672 Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
674 # go back for more blocks, if needed
675 decq blkCnt+F_O(%rbp)
676 jnz Skein_256_block_loop
677 movq %r14,TWEAK + 8(%rdi) #write the masked T[1] back once all blocks are done
680 Skein_256_Process_Block_End:
# Local debug helper for 256-bit blocks: copies X[0..3] into X_stk[] (the caller's
# rdx is fetched from the stack, where the Skein_Debug_Round macro pushed it),
# then loads rsi/rdi and jumps to Skein_Debug_Round_Common.
683 Skein_Debug_Round_256: #here with rdx == round "number" from macro
684 pushq %rsi #save two regs for BLK_BITS-specific parms
686 movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi
687 movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it
688 movq %rbx,X_stk+ 8+F_O(%rbp) #(use FP_ since rsp has changed!)
689 movq %rcx,X_stk+16+F_O(%rbp)
690 movq %rdi,X_stk+24+F_O(%rbp)
692 movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
693 movq $256,%rdi #now <rdi,rsi,rdx> are set for the call
694 jmp Skein_Debug_Round_Common
# helper: loads rax with the byte distance between the start and end labels of Skein_256_Process_Block
698 C_label Skein_256_Process_Block_CodeSize
699 movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
# helper: loads rax with _UNROLL_CNT when it differs from the full count ROUNDS_256/8
702 C_label Skein_256_Unroll_Cnt
703 .if _UNROLL_CNT <> ROUNDS_256/8
704 movq $_UNROLL_CNT,%rax
711 .endif #_USE_ASM_ & 256
713 #=================================== Skein_512 =============================================
717 # void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
719 # X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7)
722 # MACRO: one round for 512-bit blocks
# rn0..rn7 are register numbers (used as %r<n>); the odd-position ones (rn1, rn3,
# rn5, rn7) are rotated by the constants for round (_Rn_ % 8), mix 0..3.
# op1..op4 are optional caller-supplied instructions; where they are placed is
# not visible in this view.
724 .macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
727 RotL64 r\rn1, 512,%((_Rn_) % 8),0
731 RotL64 r\rn3, 512,%((_Rn_) % 8),1
735 RotL64 r\rn5, 512,%((_Rn_) % 8),2
739 RotL64 r\rn7, 512,%((_Rn_) % 8),3
742 Skein_Debug_Round 512,%(_Rn_+1),-4
744 .endm #R_512_OneRound
747 # MACRO: four rounds for 512-bit blocks
# Invokes R_512_OneRound four times, passing key/tweak pre-load instructions as the
# op arguments, then injects the key schedule. Two variants: fully unrolled
# (constant ksKey/ksTwk indices derived from _II_) and looping (indices relative
# to the injection counter in rdi, with the schedule "rotated" on the stack).
749 .macro R_512_FourRounds _RR_ #RR = base round number (0 mod 4)
750 .if (SKEIN_ASM_UNROLL && 512)
751 # here for fully unrolled case.
752 _II_ = ((_RR_)/4) + 1 #key injection counter
753 R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
754 R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
755 R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
756 R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
757 # inject the key schedule
758 addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
760 addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
762 addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
765 addReg r15, rsi,,,(_II_)
767 # here for looping case #"rotate" key/tweak schedule (move up on stack)
768 incq %rdi #bump key injection counter
769 R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
770 R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
771 R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
772 R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
773 # inject the key schedule
774 addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8
777 addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9
780 addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10
782 addReg r15, rdi #inject the round number
785 #show the result of the key injection
786 Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
787 .endm #R_512_FourRounds
# Skein_512_Process_Block: same overall flow as the C prototype describes --
# per block, update the tweak, load the eight chaining vars into r8..r15 (saving
# them as the key schedule), inject the input block, run the rounds, then XOR the
# saved input copy into the state and store it at X_VARS in the context.
792 C_label Skein_512_Process_Block
793 Setup_Stack 512,ROUNDS_512/8
794 movq TWEAK+ 8(%rdi),%rbx
795 jmp Skein_512_block_loop
797 # main hash loop for Skein_512
798 Skein_512_block_loop:
799 # general register usage:
800 # RAX..RDX = temps for key schedule pre-loads
802 # RSP, RBP = stack/frame pointers
803 # RDI = round counter or context pointer
806 movq TWEAK + 0(%rdi),%rax
807 addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0
809 xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule
810 movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0]
811 movq %rax,ksTwk+ 0+F_O(%rbp)
813 movq blkPtr +F_O(%rbp),%rsi #%rsi --> input block
814 movq %rbx,ksTwk+ 8+F_O(%rbp)
815 movq %rcx,ksTwk+16+F_O(%rbp)
816 .irp _Rn_,8,9,10,11,12,13,14,15
817 movq X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_
818 xorq %r\_Rn_,%rdx #compute overall parity
819 movq %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp)
820 .endr #load state into %r8 ..%r15, compute parity
821 movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
823 addReg r13,rax #precompute key injection for tweak
826 movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
828 movq 0(%rsi),%rax #load input block
832 addReg r8 , rax #do initial key injection
834 movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward
835 movq %rbx,Wcopy+ 8+F_O(%rbp)
838 movq %rcx,Wcopy+16+F_O(%rbp)
839 movq %rdx,Wcopy+24+F_O(%rbp)
849 movq %rax,Wcopy+32+F_O(%rbp)
850 movq %rbx,Wcopy+40+F_O(%rbp)
851 movq %rcx,Wcopy+48+F_O(%rbp)
852 movq %rdx,Wcopy+56+F_O(%rbp)
855 .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output
856 movq %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp)
859 Skein_Debug_Block 512 #debug dump
860 Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
862 addq $8*WCNT,%rsi #skip the block
863 movq %rsi,blkPtr+F_O(%rbp) #update block pointer
866 # now the key schedule is computed. Start the rounds
868 .if SKEIN_ASM_UNROLL & 512
869 _UNROLL_CNT = ROUNDS_512/8
871 _UNROLL_CNT = SKEIN_UNROLL_512
872 .if ((ROUNDS_512/8) % _UNROLL_CNT)
873 .error "Invalid SKEIN_UNROLL_512"
875 xorq %rdi,%rdi #rdi = round counter
876 Skein_512_round_loop:
881 R_512_FourRounds %(4*_Rbase_+00)
883 .endr #rept _UNROLL_CNT
885 .if (SKEIN_ASM_UNROLL & 512) == 0
886 cmpq $2*(ROUNDS_512/8),%rdi
887 jb Skein_512_round_loop
888 movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context
892 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
893 .irp _Rn_,8,9,10,11,12,13,14,15
895 movq $FIRST_MASK64,%rbx
897 xorq Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR
898 movq %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi) #and store result
900 andq TWEAK+ 8(%rdi),%rbx #rbx = T[1] with bit 62 cleared
903 Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
905 # go back for more blocks, if needed
906 decq blkCnt+F_O(%rbp)
907 jnz Skein_512_block_loop
908 movq %rbx,TWEAK + 8(%rdi) #write the masked T[1] back once all blocks are done
912 Skein_512_Process_Block_End:
915 # call here with rdx = "round number"
# copies r8..r15 into X_stk[0..7], loads rsi/rdi and jumps to Skein_Debug_Round_Common
916 Skein_Debug_Round_512:
917 pushq %rsi #save two regs for BLK_BITS-specific parms
919 .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it
920 movq %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp)
922 movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
923 movq $512,%rdi #now <rdi,rsi,rdx> are set for the call
924 jmp Skein_Debug_Round_Common
# helper: loads rax with the byte distance between the start and end labels of Skein_512_Process_Block
928 C_label Skein_512_Process_Block_CodeSize
929 movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
# helper: loads rax with _UNROLL_CNT when it differs from the full count ROUNDS_512/8
932 C_label Skein_512_Unroll_Cnt
933 .if _UNROLL_CNT <> (ROUNDS_512/8)
934 movq $_UNROLL_CNT,%rax
941 .endif # _USE_ASM_ & 512
943 #=================================== Skein1024 =============================================
946 # void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
949 # use details of permutation to make register assignments
# o1K_<reg> = index in X[] of the state word associated with <reg>
951 o1K_rdi = 0 #offsets in X[] associated with each register
955 o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate
# the looping round index lives in the tmpStk_1024 stack slot defined by Setup_Stack
967 rIdx_offs = tmpStk_1024
# r1024_Mix: one MIX step on state words w0/w1 held in reg0/reg1:
#   reg0 += reg1, then reg1 is rotated by the constant for round (_RN0_ % 8), mix _Rn1_.
# On the last round of each group of four ((_RN0_ & 3) == 3 is the intent of the
# test below) the key schedule is injected as well: either with constant indices
# (fully unrolled) or indexed by the injection counter kept at rIdx_offs (looping).
# op1 is an optional caller-supplied instruction.
969 .macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
970 addReg \reg0 , \reg1 #perform the MIX
971 RotL64 \reg1 , 1024,%((_RN0_) % 8),_Rn1_
973 .if ((_RN0_) && 3) == 3 #time to do key injection?
975 movq %\reg0 , xDebug_1024+8*w0(%rsp) #save intermediate values for Debug_Round
976 movq %\reg1 , xDebug_1024+8*w1(%rsp) # (before inline key injection)
978 _II_ = ((_RN0_)/4)+1 #injection count
979 .if SKEIN_ASM_UNROLL && 1024 #here to do fully unrolled key injection
980 addq ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0
981 addq ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1
982 .if w1 == 13 #tweak injection
983 addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1
985 addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0
987 addq $_II_, %\reg1 #(injection counter)
989 .else #here to do looping key injection
991 movq %rdi, X_stk+8*w0(%rsp) #if so, store N0 so we can use reg as index
992 movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi
994 addq ksKey+8+8*w0(%rsp,%rdi,8),%\reg0 #even key injection
996 .if w1 == 13 #tweak injection
997 addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
999 addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
1001 addReg \reg1,rdi,,,1 #(injection counter)
1003 addq ksKey+8+8*w1(%rsp,%rdi,8),%\reg1 #odd key injection
1006 # insert the op provided, if any
1010 # MACRO: four rounds for 1024-bit blocks
# Each round is eight r1024_Mix invocations; the word/register pairing changes from
# round to round. rcx alternately holds X4 and X6, the other one being parked in
# X_stk[] by the op arguments passed to r1024_Mix. In the looping case the key
# and tweak schedules are then "rotated" on the stack and the index is bumped.
1012 .macro r1024_FourRounds _RR_ #RR = base round number (0 mod 4)
1013 # should be here with X4 set properly, X6 stored on stack
1015 r1024_Mix 0, 1,rdi,rsi,_Rn_,0
1016 r1024_Mix 2, 3,rbp,rax,_Rn_,1
1017 r1024_Mix 4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
1018 r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
1019 r1024_Mix 10,11,r10,r11,_Rn_,5
1020 r1024_Mix 12,13,r12,r13,_Rn_,6
1021 r1024_Mix 6, 7,rcx,rdx,_Rn_,3
1022 r1024_Mix 14,15,r14,r15,_Rn_,7
1024 Skein_Debug_Round 1024,%(_Rn_+1)
1027 r1024_Mix 0, 9,rdi,r9 ,_Rn_,0
1028 r1024_Mix 2,13,rbp,r13,_Rn_,1
1029 r1024_Mix 6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
1030 r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
1031 r1024_Mix 12, 3,r12,rax,_Rn_,5
1032 r1024_Mix 14, 5,r14,rbx,_Rn_,6
1033 r1024_Mix 4,15,rcx,r15,_Rn_,3
1034 r1024_Mix 8, 1,r8 ,rsi,_Rn_,7
1036 Skein_Debug_Round 1024,%(_Rn_+1)
1039 r1024_Mix 0, 7,rdi,rdx,_Rn_,0
1040 r1024_Mix 2, 5,rbp,rbx,_Rn_,1
1041 r1024_Mix 4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
1042 r1024_Mix 12,15,r12,r15,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
1043 r1024_Mix 14,13,r14,r13,_Rn_,5
1044 r1024_Mix 8,11,r8 ,r11,_Rn_,6
1045 r1024_Mix 6, 1,rcx,rsi,_Rn_,3
1046 r1024_Mix 10, 9,r10,r9 ,_Rn_,7
1048 Skein_Debug_Round 1024,%(_Rn_+1)
1051 r1024_Mix 0,15,rdi,r15,_Rn_,0
1052 r1024_Mix 2,11,rbp,r11,_Rn_,1
1053 r1024_Mix 6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
1054 r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
1055 r1024_Mix 8, 5,r8 ,rbx,_Rn_,5
1056 r1024_Mix 10, 3,r10,rax,_Rn_,6
1057 r1024_Mix 4, 9,rcx,r9 ,_Rn_,3
1058 r1024_Mix 12, 7,r12,rdx,_Rn_,7
1060 Skein_Debug_Round 1024,%(_Rn_+1)
1063 .if (SKEIN_ASM_UNROLL && 1024) == 0 #here with rdi == rIdx, X0 on stack
1064 #"rotate" the key schedule on the stack
1067 movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack)
1068 movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word
1069 movq %r8 , ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!)
1070 movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word
1071 movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack)
1072 movq X_stk+8*i8(%rsp) ,%r8 #get the reg back
1073 incq %rdi #bump the index
1074 movq %rdi, rIdx_offs (%rsp) #save rdi again
1075 movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back
1076 addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection
1078 #show the result of the key injection
1079 Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
1080 .endm #r1024_FourRounds
# Skein1024_Process_Block: all sixteen state words are kept in registers except
# where noted, so rbp is used for state during the rounds and the frame is
# addressed via rsp there; rbp is re-derived from rsp before looping back.
1085 C_label Skein1024_Process_Block
1087 Setup_Stack 1024,ROUNDS_1024/8,WCNT
1088 movq TWEAK+ 8(%rdi),%r9
1089 jmp Skein1024_block_loop
1090 # main hash loop for Skein1024
1092 Skein1024_block_loop:
1093 # general register usage:
1094 # RSP = stack pointer
1095 # RAX..RDX,RSI,RDI = X1, X3..X7 (state words)
1096 # R8 ..R15 = X8..X15 (state words)
1097 # RBP = temp (used for X0 and X2)
1099 .if (SKEIN_ASM_UNROLL & 1024) == 0
1100 xorq %rax,%rax #init loop index on the stack
1101 movq %rax,rIdx_offs(%rsp)
1103 movq TWEAK+ 0(%rdi),%r8
1104 addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0
1106 xorq %r8 ,%r10 #%r8/%r9/%r10 = tweak schedule
1107 movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0]
1108 movq %r8 ,ksTwk+ 0+F_O(%rbp)
1109 movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below
1110 movq %r10,ksTwk+16+F_O(%rbp)
1112 movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
1114 movq blkPtr +F_O(%rbp),%rsi # rsi --> input block
1115 movq $KW_PARITY ,%rax #overall key schedule parity
1117 # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
1118 .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps
1119 movq X_VARS+8*_rN_(%rdi),%r14 #get state word
1120 movq 8*_rN_(%rsi),%r15 #get msg word
1121 xorq %r14,%rax #update key schedule overall parity
1122 movq %r14,ksKey +8*_rN_+F_O(%rbp) #save key schedule word on stack
1123 movq %r15,Wcopy +8*_rN_+F_O(%rbp) #save local msg Wcopy
1124 addq %r15,%r14 #do the initial key injection
1125 movq %r14,X_stk +8*_rN_ (%rsp) #save initial state var on stack
1127 # now process the rest, using the "real" registers
1128 # (MUST do it in reverse order to inject tweaks r8/r9 first)
1129 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
1130 _oo_ = o1K_\_rr_ #offset associated with the register
1131 movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context
1132 movq 8*_oo_(%rsi),%rcx #get next input msg word
1133 movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack
1134 xorq %\_rr_, %rax #accumulate key schedule parity
1135 movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward
1136 addq %rcx,%\_rr_ #do the initial key injection
1137 .if _oo_ == 13 #do the initial tweak injection
1138 addReg _rr_,r8 # (only in words 13/14)
1143 movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity
1145 Skein_Debug_Block 1024 #initial debug dump
1147 addq $8*WCNT,%rsi #bump the msg ptr
1148 movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr
1149 # re-load words 0..4 from stack, enter the main loop
1150 .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack)
1151 movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
1154 Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection
1158 # now the key schedule is computed. Start the rounds
1160 .if SKEIN_ASM_UNROLL & 1024
1161 _UNROLL_CNT = ROUNDS_1024/8
1163 _UNROLL_CNT = SKEIN_UNROLL_1024
1164 .if ((ROUNDS_1024/8) % _UNROLL_CNT)
1165 .error "Invalid SKEIN_UNROLL_1024"
1167 Skein1024_round_loop:
1171 .rept _UNROLL_CNT*2 #implement the rounds, 4 at a time
1172 r1024_FourRounds %(4*_Rbase_+00)
1174 .endr #rept _UNROLL_CNT
1176 .if (SKEIN_ASM_UNROLL & 1024) == 0
1177 cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done
1178 jb Skein1024_round_loop
1183 # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
1184 movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
1185 movq ctxPtr(%rsp),%rdx
1187 .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7
1189 xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
1190 movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
1192 movq $FIRST_MASK64 ,%r9
1195 andq TWEAK+ 8(%rdx),%r9 #r9 = T[1] with bit 62 cleared
1199 movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
1200 movq X_stk +8*7(%rsp),%rbx
1201 xorq Wcopy +8*6(%rsp),%rax
1202 xorq Wcopy +8*7(%rsp),%rbx
1203 movq %rax,X_VARS+8*6(%rdx)
1204 decq blkCnt(%rsp) #set zero flag iff done
1205 movq %rbx,X_VARS+8*7(%rdx)
1207 Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
1208 # go back for more blocks, if needed
1209 movq ctxPtr(%rsp),%rdi #don't muck with the flags here!
1210 lea FRAME_OFFS(%rsp),%rbp
1211 jnz Skein1024_block_loop
1212 movq %r9 ,TWEAK+ 8(%rdx) #write the masked T[1] back once all blocks are done
1216 Skein1024_Process_Block_End:
# Local debug helper for 1024-bit blocks: writes the register-resident state words
# into X_stk[] (addressed from rsp, adjusted by _SP_OFFS_ for what has been pushed),
# then loads rsi/rdi and jumps to Skein_Debug_Round_Common.
1219 Skein_Debug_Round_1024:
1220 # call here with rdx = "round number",
1221 _SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr
1223 #save rest of X[] state on stack so debug routines can access it
1224 .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
1225 movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
1227 # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack
1228 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save
1230 testq $3,%rdx #otherwise only if rdx != 0 mod 4
1233 movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
1235 #figure out the x4/x6 swapping state and save the correct one!
1236 cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
1238 testq $1,%rdx #and even ones have r4 as well
1240 movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp)
1243 movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
1245 #now all is saved in Xstk[] except for rdx
1246 push %rsi #save two regs for BLK_BITS-specific parms
1248 _SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32)
1250 movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call)
1251 movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
1253 movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr
1254 movq $1024,%rdi #rdi = block size
1255 jmp Skein_Debug_Round_Common
1258 .if _SKEIN_CODE_SIZE
# helper: loads rax with the byte distance between the start and end labels of Skein1024_Process_Block
1259 C_label Skein1024_Process_Block_CodeSize
1260 movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
# helper: loads rax with _UNROLL_CNT when it differs from the full count ROUNDS_1024/8
1263 C_label Skein1024_Unroll_Cnt
1264 .if _UNROLL_CNT <> (ROUNDS_1024/8)
1265 movq $_UNROLL_CNT,%rax
1272 .endif # _USE_ASM_ and 1024
1275 #----------------------------------------------------------------
1276 #local debug routine to set up for calls to:
1277 # void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
1278 # [ rdi rsi rdx rcx]
1280 # here with %rdx = round number
1281 # %rsi = ctx_hdr_ptr
1282 # %rdi = block size (256/512/1024)
1283 # on stack: saved rdi, saved rsi, retAddr, saved rdx
# _SP_OFFS_ tracks how many bytes sit between rsp and the caller's frame so that
# X_stk / xDebug_1024 can still be addressed from rsp after the pushes below.
1285 Skein_Debug_Round_Common:
1286 _SP_OFFS_ = 32 #account for four words on stack already
1287 .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs
1289 _SP_OFFS_ = _SP_OFFS_+8
1291 .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here
1292 .error "Debug_Round_Common: stack alignment"
1294 # compute %rcx = ptr to the X[] array on the stack (final parameter to call)
1295 leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
1296 cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"?
1298 leaq X_VARS(%rsi),%rcx
1300 .if _USE_ASM_ & 1024
1301 # special handling for 1024-bit case
1302 # (for rounds right before with key injection:
1303 # use xDebug_1024[] instead of X_stk[])
1304 cmpq $SKEIN_RND_SPECIAL,%rdx
1305 jae _got_rcxB #special round: keep rcx (falling through means a normal round)
1307 jz _got_rcxB #just before key injection
1310 cmp $1024,%rdi #only 1024-bit(s) for now
1312 leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx
1315 call Skein_Show_Round #call external debug handler
1317 .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs
1319 _SP_OFFS_ = _SP_OFFS_-8
1322 .error "Debug_Round_Common: push/pop misalignment!"
1328 #----------------------------------------------------------------
1329 .section .note.GNU-stack,"",@progbits