sys/sun4v/cddl/t1_copy.S

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information:  Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22
  23 /*
  24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  25  * Use is subject to license terms.
  26  */
  27
  28 #include <machine/asm.h>
  29 __FBSDID("$FreeBSD$")
  30
  31 #include <machine/asi.h>
  32 #include <machine/asmacros.h>
  33 #include <machine/ktr.h>
  34 #include <machine/pstate.h>
  35 #include <machine/trap.h>
  36 #include <machine/tstate.h>
  37 #include <machine/wstate.h>
  38 #include <machine/hypervisorvar.h>
  39
  40        .register %g2,#ignore
  41        .register %g3,#ignore
  42        .register %g6,#ignore
  43        .register %g7,#ignore
  44
  45
  46 /*
  47  * ALIGN_DATA: one shift-and-merge step for the unaligned source loops.
  48  * Computes w0 = (w0 << shl) | (w1 >> shr) and w1 = (w1 << shl) | (w2 >> shr).
  49  * w2 is only read, so it can feed the next merge; scratch is clobbered.
  50  */
  51 #define ALIGN_DATA(w0, w1, w2, shl, shr, scratch)               \
  52         sllx    w0, shl, w0                                     ;\
  53         srlx    w1, shr, scratch                                ;\
  54         or      w0, scratch, w0                                 ;\
  55         sllx    w1, shl, w1                                     ;\
  56         srlx    w2, shr, scratch                                ;\
  57         or      w1, scratch, w1
  58 /*
  59  * ALIGN_DATA_EW: merge two source words into one aligned double word,
  60  * w0 = (w0 << shl) | (w1 >> shr).  w1 is left unchanged; scratch is clobbered.
  61  */
  62 #define ALIGN_DATA_EW(w0, w1, shl, shr, scratch)                \
  63         sllx    w0, shl, w0                                     ;\
  64         srlx    w1, shr, scratch                                ;\
  65         or      w0, scratch, w0
  66
  67
  68
  69
  70
  71 /*
  72  * DGDEF, DGDEF2 and DGDEF3 emit the header of a global data object in
  73  * the ".data" section; the initial value follows the macro invocation.
  74  *
  75  * DGDEF(sym)                     4-byte object, 4-byte aligned.
  76  * DGDEF2(sym, nbytes)            nbytes object, **NO** alignment directive
  77  *                                (best suited to byte arrays).
  78  * DGDEF3(sym, nbytes, boundary)  nbytes object, aligned to boundary.
  79  * The macros only record nbytes via .size; they reserve no storage.
  80  */
  81 #define DGDEF2(sym, nbytes) \
  82         .section        ".data" ;  \
  83         .global sym     ;  \
  84         .type   sym, @object ;  \
  85         .size   sym, nbytes;  \
  86 sym:
  87
  88 #define DGDEF3(sym, nbytes, boundary) \
  89         .section        ".data" ;  \
  90         .align  boundary        ;  \
  91         .global sym     ;  \
  92         .type   sym, @object ;  \
  93         .size   sym, nbytes;  \
  94 sym:
  95
  96 #define DGDEF(sym)      DGDEF3(sym, 4, 4)
  97
/*
 * Four word-sized globals, hw_copy_limit_1/2/4/8, initialised to 0x100,
 * 0x200, 0x400 and 0x400.  Nothing in the code visible here reads them.
 * NOTE(review): presumably per-alignment size thresholds for the
 * hardware-assisted copy - confirm against the consumers.
 * NOTE(review): the ".align 64" below is issued while .data is still the
 * current section (DGDEF switches to it), i.e. before the switch to .text;
 * confirm whether it was meant to align the code instead.
 */
  98 .align  4
  99 DGDEF(hw_copy_limit_1)
 100 .word   0x100
 101 DGDEF(hw_copy_limit_2)
 102 .word   0x200
 103 DGDEF(hw_copy_limit_4)
 104 .word   0x400
 105 DGDEF(hw_copy_limit_8)
 106 .word   0x400
 107 .align  64
 108 .section ".text"
 109
 110
 111 #if defined(lint)
 112
 113 /*ARGSUSED*/
 114 void
 115 ovbcopy(const void *from, void *to, size_t count)
 116 {}
 117
 118 #else   /* lint */
 119
/*
 * bcopy: %o0 = from, %o1 = to, %o2 = count.  Leaf routine (no save).
 *
 * A zero count returns immediately.  When count <= |from - to| the two
 * regions cannot overlap and control transfers to novbcopy.  Otherwise
 * the copy is done a byte at a time, backwards if from < to and forwards
 * if not, so overlapping regions are copied correctly.
 *
 * The subcc that computes (from - to) sits in the delay slot of the
 * annulled count check, so it only executes when the count is non-zero.
 *
 * NOTE(review): the lint prototype above is named ovbcopy while the
 * assembly entry point is bcopy - confirm which name is intended.
 */
 120 ENTRY(bcopy)
 121         tst     %o2                     ! check count
 122         bgu,a   %xcc, 1f                ! nothing to do or bad arguments
 123         subcc   %o0, %o1, %o3           ! difference of from and to address
 124
 125         retl                            ! return
 126           nop
 127 1:
 128         bneg,a  %xcc, 2f
 129         neg     %o3                     ! if < 0, make it positive
 130 2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
 131         bleu    %xcc, novbcopy          ! if size <= abs(diff): use bcopy,
 132           nop
 133         cmp     %o0, %o1                ! compare from and to addresses
 134         blu     %xcc, ov_bkwd           ! if from < to, copy backwards
 135           nop
 136         !
 137         ! Copy forwards.
 138         !
 139 ov_fwd:
 140         ldub    [%o0], %o3              ! read from address
 141         inc     %o0                     ! inc from address
 142         stb     %o3, [%o1]              ! write to address
 143         deccc   %o2                     ! dec count
 144         bgu     %xcc, ov_fwd            ! loop till done
 145           inc   %o1                     ! inc to address
 146
 147         retl                            ! return
 148         nop
 149         !
 150         ! Copy backwards.
 151         !
 152 ov_bkwd:
 153         deccc   %o2                     ! dec count
 154         ldub    [%o0 + %o2], %o3        ! get byte at end of src
 155         bgu     %xcc, ov_bkwd           ! loop till done
 156           stb   %o3, [%o1 + %o2]        ! delay slot, store at end of dst
 157
 158         retl                            ! return
 159         nop
 160 END(bcopy)
 161
 162 #endif  /* lint */
 163
 164
 165
 166 /*
 167  * Copy a block of storage - must not overlap (from + len <= to).
 168  */
/*
 * Register use after the save below: %i0 = from, %i1 = to, %i2 = count.
 * do_copy picks a strategy:
 *   count < 12          -> bytecp (byte loop)
 *   count < 128         -> bcb_punt (word/double word copies)
 *   |to - from| < 256   -> bcb_punt
 *   otherwise           -> falls through to do_blockcopy
 * The "#if 0" section is a disabled runtime check of use_hw_bcopy.
 */
 169 ENTRY(novbcopy)
 170
 171         save    %sp, -SA(MINFRAME), %sp
 172
 173 do_copy:
 174         cmp     %i2, 12                 ! for small counts
 175         blu     %xcc, bytecp            ! just copy bytes
 176           nop
 177
 178         cmp     %i2, 128                ! for less than 128 bytes
 179         blu,pn  %xcc, bcb_punt          ! no block st/quad ld
 180           nop
 181 #if 0
 182         set     use_hw_bcopy, %o2
 183         ld      [%o2], %o2
 184         tst     %o2
 185         bz      bcb_punt
 186           nop
 187 #endif
 188         subcc   %i1, %i0, %i3
 189         bneg,a,pn %xcc, 1f
 190           neg   %i3
 191 1:
 192         /*
 193          * Compare against 256 since we should be checking block addresses
 194          * and (dest & ~63) - (src & ~63) can be 3 blocks even if
 195          * src = dest + (64 * 3) + 63.
 196          */
 197         cmp     %i3, 256
 198         blu,pn  %xcc, bcb_punt
 199           nop
 200
 201         /*
 202          * Copies that reach here have at least 2 blocks of data to copy.
 203          */
/*
 * From here on %i0 = destination and %i1 = source (swapped below).
 * If the destination is not 64-byte aligned, 64 - (dst & 0x3f) bytes are
 * copied one at a time and subtracted from the count in %i2.  The sub in
 * the delay slot of the bz also runs when the branch is taken; chksrc
 * recomputes %i3, so that result is discarded on the aligned path.
 */
 204 do_blockcopy:
 205         ! Swap src/dst since the code below is memcpy code
 206         ! and memcpy/bcopy have different calling sequences
 207         mov     %i1, %i5
 208         mov     %i0, %i1
 209         mov     %i5, %i0
 210
 211         andcc   %i0, 0x3f, %i3          ! is dst aligned on a 64 bytes
 212         bz      %xcc, chksrc            ! dst is already double aligned
 213           sub   %i3, 0x40, %i3
 214         neg     %i3                     ! bytes till dst 64 bytes aligned
 215         sub     %i2, %i3, %i2           ! update i2 with new count
 216
 217 1:      ldub    [%i1], %i4
 218         stb     %i4, [%i0]
 219         inc     %i1
 220         deccc   %i3
 221         bgu     %xcc, 1b
 222           inc   %i0
 223
 224         ! Now Destination is block (64 bytes) aligned
/*
 * chksrc: split the count into whole 64-byte blocks (%i3) and a residue
 * (%i2), select ASI_LDSTBI_P for the ldda/stxa accesses that follow, and
 * dispatch on the source offset within its 16-byte quadword (%o2):
 *   0   -> blkcpy
 *   > 8 -> cpy_upper_double
 *   < 8 -> cpy_lower_double
 *   8   -> falls through to the loop0 path
 */
 225 chksrc:
 226         andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
 227         sub     %i2, %i3, %i2           ! Residue bytes in %i2
 228
 229         wr      %g0, ASI_LDSTBI_P, %asi
 230
 231         andcc   %i1, 0xf, %o2           ! is src quadword aligned
 232         bz,pn   %xcc, blkcpy            ! src offset in %o2
 233         nop
 234         cmp     %o2, 0x8
 235         bg      cpy_upper_double
 236         nop
 237         bl      cpy_lower_double
 238         nop
 239
 240         ! Falls through when source offset is equal to 8 i.e.
 241         ! source is double word aligned.
 242         ! In this case no shift/merge of data is required
/*
 * %i1 is first rounded down to a 16-byte boundary.  Each ldda fills an
 * even/odd register pair; because the wanted data starts 8 bytes into
 * the quadword, the stores take %l3,%l4 and then %l5,%l2 alternately.
 * The last ldda of each pass reads at %i1+0x40, which becomes offset 0
 * of the next pass once %i1 has been advanced.
 * %l0 tracks the 64-byte-aligned source address used for prefetch, and
 * %o2 (the original offset) is added back to %i1 on the way to blkdone.
 */
 243         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
 244         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
 245         prefetch [%l0+0x0], #one_read
 246         ldda    [%i1+0x0]%asi, %l2
 247 loop0:
 248         ldda    [%i1+0x10]%asi, %l4
 249         prefetch [%l0+0x40], #one_read
 250
 251         stxa    %l3, [%i0+0x0]%asi
 252         stxa    %l4, [%i0+0x8]%asi
 253
 254         ldda    [%i1+0x20]%asi, %l2
 255         stxa    %l5, [%i0+0x10]%asi
 256         stxa    %l2, [%i0+0x18]%asi
 257
 258         ldda    [%i1+0x30]%asi, %l4
 259         stxa    %l3, [%i0+0x20]%asi
 260         stxa    %l4, [%i0+0x28]%asi
 261
 262         ldda    [%i1+0x40]%asi, %l2
 263         stxa    %l5, [%i0+0x30]%asi
 264         stxa    %l2, [%i0+0x38]%asi
 265
 266         add     %l0, 0x40, %l0
 267         add     %i1, 0x40, %i1
 268         subcc   %i3, 0x40, %i3
 269         bgu,pt  %xcc, loop0
 270           add   %i0, 0x40, %i0
 271         ba      blkdone
 272         add     %i1, %o2, %i1           ! increment the source by src offset
 273                                         ! the src offset was stored in %o2
 274
 275 cpy_lower_double:
/*
 * Reached from chksrc when the source offset within its quadword is
 * below 8 (and non-zero).  %o0 = offset * 8 is the left shift and
 * %o1 = 64 - %o0 the right shift handed to ALIGN_DATA, with %l6 as the
 * scratch register.  %o2 keeps the byte offset so it can be added back
 * to %i1 in the delay slot of the branch to blkdone.
 */
 276         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
 277         sll     %o2, 3, %o0             ! %o0 left shift
 278         mov     0x40, %o1
 279         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
 280         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
 281         prefetch [%l0+0x0], #one_read
 282         ldda    [%i1+0x0]%asi, %l2      ! partial data in %l2 and %l3 has
 283                                         ! complete data
 284 loop1:
 285         ldda    [%i1+0x10]%asi, %l4     ! %l4 has partial data for this read.
 286         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
 287                                                         ! into %l2 and %l3
 288         prefetch [%l0+0x40], #one_read
 289         stxa    %l2, [%i0+0x0]%asi
 290         stxa    %l3, [%i0+0x8]%asi
 291
 292         ldda    [%i1+0x20]%asi, %l2
 293         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
 294         stxa    %l4, [%i0+0x10]%asi                     ! %l4 from previous read
 295         stxa    %l5, [%i0+0x18]%asi                     ! into %l4 and %l5
 296
 297         ! Repeat the same for next 32 bytes.
 298
 299         ldda    [%i1+0x30]%asi, %l4
 300         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
 301         stxa    %l2, [%i0+0x20]%asi
 302         stxa    %l3, [%i0+0x28]%asi
 303
 304         ldda    [%i1+0x40]%asi, %l2
 305         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
 306         stxa    %l4, [%i0+0x30]%asi
 307         stxa    %l5, [%i0+0x38]%asi
 308
 309         add     %l0, 0x40, %l0
 310         add     %i1, 0x40, %i1
 311         subcc   %i3, 0x40, %i3
 312         bgu,pt  %xcc, loop1
 313           add   %i0, 0x40, %i0
 314         ba      blkdone
 315         add     %i1, %o2, %i1           ! increment the source by src offset
 316                                         ! the src offset was stored in %o2
 317
 318 cpy_upper_double:
/*
 * Reached from chksrc when the source offset within its quadword is
 * above 8.  The left shift in %o0 is (offset - 8) * 8 and the right shift
 * in %o1 is 64 - %o0.  The first double word of the initial ldda (%l2)
 * is not used; merging starts with %l3.  %l6 is the ALIGN_DATA scratch
 * register and %o2 is added back to %i1 on the way to blkdone.
 */
 319         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
 320         mov     0x8, %o0
 321         sub     %o2, %o0, %o0
 322         sll     %o0, 3, %o0             ! %o0 left shift
 323         mov     0x40, %o1
 324         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
 325         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
 326         prefetch [%l0+0x0], #one_read
 327         ldda    [%i1+0x0]%asi, %l2      ! partial data in %l3 for this read and
 328                                         ! no data in %l2
 329 loop2:
 330         ldda    [%i1+0x10]%asi, %l4     ! %l4 has complete data and %l5 has
 331                                         ! partial
 332         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
 333                                                         ! into %l3 and %l4
 334         prefetch [%l0+0x40], #one_read
 335         stxa    %l3, [%i0+0x0]%asi
 336         stxa    %l4, [%i0+0x8]%asi
 337
 338         ldda    [%i1+0x20]%asi, %l2
 339         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
 340         stxa    %l5, [%i0+0x10]%asi                     ! %l5 from previous read
 341         stxa    %l2, [%i0+0x18]%asi                     ! into %l5 and %l2
 342
 343         ! Repeat the same for next 32 bytes.
 344
 345         ldda    [%i1+0x30]%asi, %l4
 346         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
 347         stxa    %l3, [%i0+0x20]%asi
 348         stxa    %l4, [%i0+0x28]%asi
 349
 350         ldda    [%i1+0x40]%asi, %l2
 351         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
 352         stxa    %l5, [%i0+0x30]%asi
 353         stxa    %l2, [%i0+0x38]%asi
 354
 355         add     %l0, 0x40, %l0
 356         add     %i1, 0x40, %i1
 357         subcc   %i3, 0x40, %i3
 358         bgu,pt  %xcc, loop2
 359           add   %i0, 0x40, %i0
 360         ba      blkdone
 361         add     %i1, %o2, %i1           ! increment the source by src offset
 362                                         ! the src offset was stored in %o2
 363
 364
 365         ! Both Source and Destination are block aligned.
 366         ! Do fast copy using ASI_LDSTBI_P
/*
 * Reached from chksrc when (src & 0xf) == 0.  Each pass loads 64 bytes
 * with four ldda into %l0-%l7 and writes them with eight stxa, then
 * advances %i1 and %i0 by 0x40 until the block count in %i3 is used up.
 * Execution then falls through into blkdone.
 * NOTE(review): the branch into here only tests 16-byte source
 * alignment, while the comment above says block aligned - confirm.
 */
 367 blkcpy:
 368         prefetch [%i1+0x0], #one_read
 369 1:
 370         ldda    [%i1+0x0]%asi, %l0
 371         ldda    [%i1+0x10]%asi, %l2
 372         prefetch [%i1+0x40], #one_read
 373
 374         stxa    %l0, [%i0+0x0]%asi
 375         ldda    [%i1+0x20]%asi, %l4
 376         ldda    [%i1+0x30]%asi, %l6
 377
 378         stxa    %l1, [%i0+0x8]%asi
 379         stxa    %l2, [%i0+0x10]%asi
 380         stxa    %l3, [%i0+0x18]%asi
 381         stxa    %l4, [%i0+0x20]%asi
 382         stxa    %l5, [%i0+0x28]%asi
 383         stxa    %l6, [%i0+0x30]%asi
 384         stxa    %l7, [%i0+0x38]%asi
 385
 386         add     %i1, 0x40, %i1
 387         subcc   %i3, 0x40, %i3
 388         bgu,pt  %xcc, 1b
 389           add   %i0, 0x40, %i0
 390
 391 blkdone:
/*
 * Common tail of the block-copy paths (%i0 = destination, %i1 = source):
 * copy the %i2 residue bytes one at a time, if any, then issue
 * membar #Sync and return 0 in the caller's %o0.
 */
 392         tst     %i2
 393         bz,pt   %xcc, blkexit
 394         nop
 395
 396 residue:
 397         ldub    [%i1], %i4
 398         stb     %i4, [%i0]
 399         inc     %i1
 400         deccc   %i2
 401         bgu     %xcc, residue
 402           inc   %i0
 403
 404 blkexit:
 405         membar  #Sync                           ! sync error barrier
 406         ret
 407         restore %g0, 0, %o0
 408
 409 bcb_punt:
/*
 * Non-block copy.  On this path %i0 = source and %i1 = destination (the
 * swap done in do_blockcopy has not happened).  If source and destination
 * agree in their low three bits control goes to aldoubcp; if they agree
 * in their low two bits it goes to alwordcp; otherwise the shift-and-merge
 * word copy below is used.
 *
 * Condition codes are carried across instructions here: the btst in the
 * delay slot of the alwordcp branch is what the later "bz align_dst_only"
 * tests, and the btst in that branch's delay slot is what
 * "bz align_src_only" tests (mov and clr leave the condition codes alone).
 *
 * The branches to local labels 2 and 3 further down target labels that
 * are defined after align_dst_only.
 */
 410         !
 411         ! use aligned transfers where possible
 412         !
 413         xor     %i0, %i1, %o4           ! xor from and to address
 414         btst    7, %o4                  ! if lower three bits zero
 415         bz      aldoubcp                ! can align on double boundary
 416           nop   ! assembler complaints about label
 417
 418         xor     %i0, %i1, %o4           ! xor from and to address
 419         btst    3, %o4                  ! if lower two bits zero
 420         bz      alwordcp                ! can align on word boundary
 421         btst    3, %i0                  ! delay slot, from address unaligned?
 422         !
 423         ! use aligned reads and writes where possible
 424         ! this differs from wordcp in that it copes
 425         ! with odd alignment between source and destnation
 426         ! using word reads and writes with the proper shifts
 427         ! in between to align transfers to and from memory
 428         ! i0 - src address, i1 - dest address, i2 - count
 429         ! i3, i4 - tmps for used generating complete word
 430         ! i5 (word to write)
 431         ! l0 size in bits of upper part of source word (US)
 432         ! l1 size in bits of lower part of source word (LS = 32 - US)
 433         ! l2 size in bits of upper part of destination word (UD)
 434         ! l3 size in bits of lower part of destination word (LD = 32 - UD)
 435         ! l4 number of bytes leftover after aligned transfers complete
 436         ! l5 the number 32
 437         !
 438         mov     32, %l5                 ! load an oft-needed constant
 439         bz      align_dst_only
 440           btst  3, %i1                  ! is destnation address aligned?
 441         clr     %i4                     ! clear registers used in either case
 442         bz      align_src_only
 443           clr   %l0
 444         !
 445         ! both source and destination addresses are unaligned
 446         !
 447 1:                                      ! align source
 448         ldub    [%i0], %i3              ! read a byte from source address
 449         add     %i0, 1, %i0             ! increment source address
 450         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
 451         btst    3, %i0                  ! is source aligned?
 452         add     %l0, 8, %l0             ! increment size of upper source (US)
 453         bnz,a   1b
 454           sll   %i4, 8, %i4             ! make room for next byte
 455
 456         sub     %l5, %l0, %l1           ! generate shift left count (LS)
 457         sll     %i4, %l1, %i4           ! prepare to get rest
 458         ld      [%i0], %i3              ! read a word
 459         add     %i0, 4, %i0             ! increment source address
 460         srl     %i3, %l0, %i5           ! upper src bits into lower dst bits
 461         or      %i4, %i5, %i5           ! merge
 462         mov     24, %l3                 ! align destination
 463 1:
 464         srl     %i5, %l3, %i4           ! prepare to write a single byte
 465         stb     %i4, [%i1]              ! write a byte
 466         add     %i1, 1, %i1             ! increment destination address
 467         sub     %i2, 1, %i2             ! decrement count
 468         btst    3, %i1                  ! is destination aligned?
 469         bnz,a   1b
 470           sub   %l3, 8, %l3             ! delay slot, decrement shift count (LD)
 471         sub     %l5, %l3, %l2           ! generate shift left count (UD)
 472         sll     %i5, %l2, %i5           ! move leftover into upper bytes
 473         cmp     %l2, %l0                ! cmp # reqd to fill dst w old src left
 474         bgu     %xcc, more_needed       ! need more to fill than we have
 475           nop
 476
 477         sll     %i3, %l1, %i3           ! clear upper used byte(s)
 478         srl     %i3, %l1, %i3
 479         ! get the odd bytes between alignments
 480         sub     %l0, %l2, %l0           ! regenerate shift count
 481         sub     %l5, %l0, %l1           ! generate new shift left count (LS)
 482         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
 483         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
 484         srl     %i3, %l0, %i4
 485         or      %i5, %i4, %i5
 486         st      %i5, [%i1]              ! write a word
 487         subcc   %i2, 4, %i2             ! decrement count
 488         bz      %xcc, unalign_out
 489           add   %i1, 4, %i1             ! increment destination address
 490
 491         b       2f
 492         sll     %i3, %l1, %i5           ! get leftover into upper bits
 493 more_needed:
 494         sll     %i3, %l0, %i3           ! save remaining byte(s)
 495         srl     %i3, %l0, %i3
 496         sub     %l2, %l0, %l1           ! regenerate shift count
 497         sub     %l5, %l1, %l0           ! generate new shift left count
 498         sll     %i3, %l1, %i4           ! move to fill empty space
 499         b       3f
 500         or      %i5, %i4, %i5           ! merge to complete word
 501         !
 502         ! the source address is aligned and destination is not
 503         !
/*
 * align_dst_only reads one source word and stores its leading bytes until
 * the destination is word aligned, then falls into the shared code at
 * xfer.  xfer is also the branch target of align_src_only.  Local labels
 * 3 and 2 are branch targets for the both-unaligned code (the "b 3f" in
 * more_needed and the "b 2f" just before it).
 *
 * The loop at local label 2 reads a source word, merges its upper part
 * with the leftover in %i5, stores one destination word and keeps the
 * remainder as the next leftover.  unalign_out then writes bytes still
 * held in %i3 and hands anything left to dbytecp, after converting %i0 to
 * (src - dst) as dbytecp expects.
 */
 504 align_dst_only:
 505         ld      [%i0], %i4              ! read a word
 506         add     %i0, 4, %i0             ! increment source address
 507         mov     24, %l0                 ! initial shift alignment count
 508 1:
 509         srl     %i4, %l0, %i3           ! prepare to write a single byte
 510         stb     %i3, [%i1]              ! write a byte
 511         add     %i1, 1, %i1             ! increment destination address
 512         sub     %i2, 1, %i2             ! decrement count
 513         btst    3, %i1                  ! is destination aligned?
 514         bnz,a   1b
 515           sub   %l0, 8, %l0             ! delay slot, decrement shift count
 516 xfer:
 517         sub     %l5, %l0, %l1           ! generate shift left count
 518         sll     %i4, %l1, %i5           ! get leftover
 519 3:
 520         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
 521         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
 522 2:
 523         ld      [%i0], %i3              ! read a source word
 524         add     %i0, 4, %i0             ! increment source address
 525         srl     %i3, %l0, %i4           ! upper src bits into lower dst bits
 526         or      %i5, %i4, %i5           ! merge with upper dest bits (leftover)
 527         st      %i5, [%i1]              ! write a destination word
 528         subcc   %i2, 4, %i2             ! decrement count
 529         bz      %xcc, unalign_out       ! check if done
 530           add   %i1, 4, %i1             ! increment destination address
 531         b       2b                      ! loop
 532         sll     %i3, %l1, %i5           ! get leftover
 533 unalign_out:
 534         tst     %l4                     ! any bytes leftover?
 535         bz      %xcc, cpdone
 536           nop
 537 1:
 538         sub     %l0, 8, %l0             ! decrement shift
 539         srl     %i3, %l0, %i4           ! upper src byte into lower dst byte
 540         stb     %i4, [%i1]              ! write a byte
 541         subcc   %l4, 1, %l4             ! decrement count
 542         bz      %xcc, cpdone            ! done?
 543         add     %i1, 1, %i1             ! increment destination
 544         tst     %l0                     ! any more previously read bytes
 545         bnz     %xcc, 1b                ! we have leftover bytes
 546           mov   %l4, %i2                ! delay slot, mv cnt where dbytecp wants
 547         b       dbytecp                 ! let dbytecp do the rest
 548           sub   %i0, %i1, %i0           ! i0 gets the difference of src and dst
 549         !
 550         ! the destination address is aligned and the source is not
 551         !
/*
 * Accumulate source bytes into %i4, counting 8 bits per byte in %l0,
 * until the source address is word aligned, then branch to xfer.
 * %i4 and %l0 were cleared by the code that branches here.
 */
 552 align_src_only:
 553         ldub    [%i0], %i3              ! read a byte from source address
 554         add     %i0, 1, %i0             ! increment source address
 555         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
 556         btst    3, %i0                  ! is source aligned?
 557         add     %l0, 8, %l0             ! increment shift count (US)
 558         bnz,a   align_src_only
 559           sll   %i4, 8, %i4             ! make room for next byte
 560         b,a     xfer
 561         !
 562         ! if from address unaligned for double-word moves,
 563         ! move bytes till it is, if count is < 56 it could take
 564         ! longer to align the thing than to do the transfer
 565         ! in word size chunks right away
 566         !
/*
 * aldoubcp: counts below 56 are redirected to alwordcp.  Otherwise
 * alignit is called with mask 7 in %o0; it byte-copies up to an 8-byte
 * boundary and returns the aligned byte count in %i3.  The loop then
 * moves double words, addressing the source as %i0 + %i1 with
 * %i0 = src - dst.  Fewer than 4 leftover bytes go to dbytecp; otherwise
 * execution falls into wordcp.
 */
 567 aldoubcp:
 568         cmp     %i2, 56                 ! if count < 56, use wordcp, it takes
 569         blu,a   %xcc, alwordcp          ! longer to align doubles than words
 570           mov   3, %o0                  ! mask for word alignment
 571         call    alignit                 ! copy bytes until aligned
 572         mov     7, %o0                  ! mask for double alignment
 573           !
 574         ! source and destination are now double-word aligned
 575         ! i3 has aligned count returned by alignit
 576         !
 577         and     %i2, 7, %i2             ! unaligned leftover count
 578         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
 579 5:
 580         ldx     [%i0+%i1], %o4          ! read from address
 581         stx     %o4, [%i1]              ! write at destination address
 582         subcc   %i3, 8, %i3             ! dec count
 583         bgu     %xcc, 5b
 584           add   %i1, 8, %i1             ! delay slot, inc to address
 585         cmp     %i2, 4                  ! see if we can copy a word
 586         blu     %xcc, dbytecp           ! if 3 or less bytes use bytecp
 587           nop
 588         !
 589         ! for leftover bytes we fall into wordcp, if needed
 590         !
/*
 * wordcp expects %i0 = src - dst, %i1 = dst and %i3 = the number of
 * bytes to move as words.  The loop body runs before %i3 is tested, so
 * at least one word is always copied.  The count modulo 4 is left in %i2
 * for dbytecp.
 */
 591 wordcp:
 592         and     %i2, 3, %i2             ! unaligned leftover count
 593 5:
 594         ld      [%i0+%i1], %o4          ! read from address
 595         st      %o4, [%i1]              ! write at destination address
 596         subcc   %i3, 4, %i3             ! dec count
 597         bgu     %xcc, 5b
 598           add   %i1, 4, %i1             ! delay slot, inc to address
 599         b,a     dbytecp
 600
 601         ! we come here to align copies on word boundaries
/*
 * alwordcp: call alignit with mask 3 in %o0, then convert %i0 to
 * (src - dst) in the delay slot of the branch to wordcp.
 */
 602 alwordcp:
 603         call    alignit         ! go word-align it
 604           mov   3, %o0                  ! bits that must be zero to be aligned
 605         b       wordcp
 606           sub   %i0, %i1, %i0           ! i0 gets the difference of src and dst
 607
 608         !
 609         ! byte copy, works with any alignment
 610         !
/*
 * bytecp converts %i0 to (src - dst) and enters the loop at dbytecp.
 * The loop is entered in the middle: dbytecp decrements the count in %i2,
 * and the annulled branch performs the load only when another byte
 * remains; the store for that byte happens at local label 1.
 * cpdone issues membar #Sync and returns 0 in the caller's %o0.
 */
 611 bytecp:
 612         b       dbytecp
 613           sub   %i0, %i1, %i0           ! i0 gets difference of src and dst
 614
 615         !
 616         ! differenced byte copy, works with any alignment
 617         ! assumes dest in %i1 and (source - dest) in %i0
 618         !
 619 1:
 620         stb     %o4, [%i1]              ! write to address
 621         inc     %i1                     ! inc to address
 622 dbytecp:
 623         deccc   %i2                     ! dec count
 624         bgeu,a  %xcc, 1b                ! loop till done
 625         ldub    [%i0+%i1], %o4          ! read from address
 626 cpdone:
 627         membar  #Sync                           ! sync error barrier
 628         ret
 629           restore %g0, 0, %o0           ! return (0)
 630
 631 /*
 632  * Common code used to align transfers on word and doubleword
 633  * boundaries.  Aligns source and destination and returns a count
 634  * of aligned bytes to transfer in %i3
 635  */
/*
 * Called with the alignment mask in %o0 (3 or 7 at the call sites in
 * this routine).  It returns with retl, so it works directly on the
 * caller's %i0 (source), %i1 (destination) and %i2 (count).
 * Only the source address is tested against the mask; the callers branch
 * here only after checking that source and destination share the same
 * low address bits.  The entry point is the alignit label; the byte copy
 * at local label 1 is reached through the annulled branch, whose delay
 * slot loads the byte.  %o4 is used as the byte temporary.
 */
 636 1:
 637         inc     %i0                     ! inc from
 638         stb     %o4, [%i1]              ! write a byte
 639         inc     %i1                     ! inc to
 640         dec     %i2                     ! dec count
 641 alignit:
 642         btst    %o0, %i0                ! %o0 is bit mask to check for alignment
 643         bnz,a   1b
 644           ldub  [%i0], %o4              ! read next byte
 645
 646         retl
 647         andn    %i2, %o0, %i3           ! return size of aligned bytes
 648 END(novbcopy)
 649
 650
 651 /*
 652  * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 653  * at least 256 bytes in length using Niagara's block stores/quad store.
 654  * If the criteria for using this routine are not met then it calls bzero
 655  * and returns 1.  Otherwise 0 is returned indicating success.
 656  * Caller is responsible for ensuring use_hw_bzero is true and that
 657  * kpreempt_disable() has been called.
 658  */
 659 #ifdef lint
 660 /*ARGSUSED*/
 661 int
 662 hwblkclr(void *addr, size_t len)
 663 {
 664         return(0);
 665 }
 666 #else /* lint */
 667         ! %i0 - start address
 668         ! %i1 - length of region (multiple of 64)
 669
/*
 * Flow: three checks are made (address 64-byte aligned, length >= 0x100,
 * length a multiple of 64).  If any of them fails, control reaches local
 * label 1, which calls bzero(addr, len) and returns 1.  Otherwise
 * pz_doblock clears 256 bytes per pass while more than 256 bytes remain,
 * the loop at local label 3 clears what is left 64 bytes at a time, and
 * pz_finish issues membar #Sync and returns 0.
 *
 * The wr to %asi sits in the delay slot of a non-annulled branch, so it
 * also executes on the punt path; hwblkclr itself does not restore %asi.
 */
 670 ENTRY(hwblkclr)
 671         save    %sp, -SA(MINFRAME), %sp
 672
 673         ! Must be block-aligned
 674         andcc   %i0, 0x3f, %g0
 675         bnz,pn  %xcc, 1f
 676           nop
 677
 678         ! ... and must be 256 bytes or more
 679         cmp     %i1, 0x100
 680         blu,pn  %xcc, 1f
 681           nop
 682
 683         ! ... and length must be a multiple of 64
 684         andcc   %i1, 0x3f, %g0
 685         bz,pn   %xcc, pz_doblock
 686         wr      %g0, ASI_LDSTBI_P, %asi
 687
 688 1:      ! punt, call bzero but notify the caller that bzero was used
 689         mov     %i0, %o0
 690         call    bzero
 691           mov   %i1, %o1
 692         ret
 693           restore       %g0, 1, %o0     ! return (1) - did not use block operations
 694
/*
 * Each pass first stores to offsets 0x0, 0x40, 0x80 and 0xc0, i.e. the
 * first double word of each of the four 64-byte blocks, and then fills in
 * the remaining double words of those blocks.  The loop repeats only
 * while the remaining length is greater than 0x100; a remainder of
 * exactly 0x100 is finished by the 64-byte loop at local label 3.
 */
 695         ! Already verified that there are at least 256 bytes to set
 696 pz_doblock:
 697         stxa    %g0, [%i0+0x0]%asi
 698         stxa    %g0, [%i0+0x40]%asi
 699         stxa    %g0, [%i0+0x80]%asi
 700         stxa    %g0, [%i0+0xc0]%asi
 701
 702         stxa    %g0, [%i0+0x8]%asi
 703         stxa    %g0, [%i0+0x10]%asi
 704         stxa    %g0, [%i0+0x18]%asi
 705         stxa    %g0, [%i0+0x20]%asi
 706         stxa    %g0, [%i0+0x28]%asi
 707         stxa    %g0, [%i0+0x30]%asi
 708         stxa    %g0, [%i0+0x38]%asi
 709
 710         stxa    %g0, [%i0+0x48]%asi
 711         stxa    %g0, [%i0+0x50]%asi
 712         stxa    %g0, [%i0+0x58]%asi
 713         stxa    %g0, [%i0+0x60]%asi
 714         stxa    %g0, [%i0+0x68]%asi
 715         stxa    %g0, [%i0+0x70]%asi
 716         stxa    %g0, [%i0+0x78]%asi
 717
 718         stxa    %g0, [%i0+0x88]%asi
 719         stxa    %g0, [%i0+0x90]%asi
 720         stxa    %g0, [%i0+0x98]%asi
 721         stxa    %g0, [%i0+0xa0]%asi
 722         stxa    %g0, [%i0+0xa8]%asi
 723         stxa    %g0, [%i0+0xb0]%asi
 724         stxa    %g0, [%i0+0xb8]%asi
 725
 726         stxa    %g0, [%i0+0xc8]%asi
 727         stxa    %g0, [%i0+0xd0]%asi
 728         stxa    %g0, [%i0+0xd8]%asi
 729         stxa    %g0, [%i0+0xe0]%asi
 730         stxa    %g0, [%i0+0xe8]%asi
 731         stxa    %g0, [%i0+0xf0]%asi
 732         stxa    %g0, [%i0+0xf8]%asi
 733
 734         sub     %i1, 0x100, %i1
 735         cmp     %i1, 0x100
 736         bgu,pt  %xcc, pz_doblock
 737           add   %i0, 0x100, %i0
 738
 739 2:
 740         ! Check if more than 64 bytes to set
 741         cmp     %i1,0x40
 742         blu     %xcc, pz_finish
 743           nop
 744
 745 3:
 746         stxa    %g0, [%i0+0x0]%asi
 747         stxa    %g0, [%i0+0x8]%asi
 748         stxa    %g0, [%i0+0x10]%asi
 749         stxa    %g0, [%i0+0x18]%asi
 750         stxa    %g0, [%i0+0x20]%asi
 751         stxa    %g0, [%i0+0x28]%asi
 752         stxa    %g0, [%i0+0x30]%asi
 753         stxa    %g0, [%i0+0x38]%asi
 754
 755         subcc   %i1, 0x40, %i1
 756         bgu,pt  %xcc, 3b
 757           add   %i0, 0x40, %i0
 758
 759 pz_finish:
 760         membar  #Sync
 761         ret
 762           restore       %g0, 0, %o0             ! return (bzero or not)
 763 END(hwblkclr)
 764 #endif  /* lint */
 765
 766 #if defined(lint)
 767
 768 /* ARGSUSED */
 769 void
 770 bzero(void *addr, size_t count)
 771 {}
 772
 773 #else   /* lint */
 774 /* bzero: %o0 = addr, %o1 = byte count.  Zeroes the range and returns 0 in %o0. */
 775 ENTRY(bzero)
 776         wr      %g0, ASI_P, %asi        ! every store below goes through %asi
 777         ! Scratch registers used below: %o3 and %o4 (plus the condition codes).
 778         cmp     %o1, 7
 779         blu,pn  %xcc, byteclr           ! count < 7: byte loop only
 780           nop
 781
 782         cmp     %o1, 15
 783         blu,pn  %xcc, wdalign           ! 7 <= count < 15: word-sized path
 784           nop
 785
 786         andcc   %o0, 7, %o3             ! is addr aligned on an 8 byte boundary?
 787         bz,pt   %xcc, blkalign          ! already double aligned
 788           sub   %o3, 8, %o3             ! -(bytes till double aligned)
 789         add     %o1, %o3, %o1           ! update o1 with new count
 790
 791 1:
 792         stba    %g0, [%o0]%asi          ! clear one byte
 793         inccc   %o3                     ! negative offset counts up toward zero
 794         bl,pt   %xcc, 1b                ! loop while it is still negative
 795           inc   %o0                     ! (delay slot) advance to the next byte
 796
 797         ! Now address is double aligned
 798 blkalign:
 799         cmp     %o1, 0x80               ! check if there are 128 bytes to set
 800         blu,pn  %xcc, bzero_small       ! fewer than 128: plain 8-byte stores
 801           mov   %o1, %o3                ! (delay slot) bzero_small takes the count in %o3
 802 #if 0
 803         sethi   %hi(use_hw_bzero), %o2
 804         ld      [%o2 + %lo(use_hw_bzero)], %o2
 805         tst     %o2
 806         bz      %xcc, bzero_small
 807           mov   %o1, %o3
 808 #endif
 809         rd      %asi, %o3               ! remember the ASI in use so far
 810         wr      %g0, ASI_LDSTBI_P, %asi ! block stores below use ASI_LDSTBI_P
 811         cmp     %o3, ASI_P
 812         bne,a   %xcc, algnblk           ! taken only if %asi was not ASI_P (entry sets ASI_P)
 813           wr    %g0, ASI_LDSTBI_AIUS, %asi      ! annulled unless the branch is taken
 814
 815 algnblk:
 816         andcc   %o0, 0x3f, %o3          ! is block aligned?
 817         bz,pt   %xcc, bzero_blk
 818           sub   %o3, 0x40, %o3          ! -(bytes till block aligned)
 819         add     %o1, %o3, %o1           ! o1 is the remainder
 820
 821         ! Clear -(%o3) bytes till block aligned
 822 1:
 823         stxa    %g0, [%o0]%asi          ! 8 bytes at a time (addr is already 8-byte aligned)
 824         addcc   %o3, 8, %o3
 825         bl,pt   %xcc, 1b                ! loop while the negative offset has not reached zero
 826           add   %o0, 8, %o0
 827
 828 bzero_blk:
 829         and     %o1, 0x3f, %o3          ! calc bytes left after blk clear
 830         andn    %o1, 0x3f, %o4          ! calc size of blocks in bytes
 831
 832         cmp     %o4, 0x100              ! 256 bytes or more
 833         blu,pn  %xcc, 3f                ! less than 256: go to the 64-byte loop
 834           nop
 835         ! Main loop: clear 256 bytes (four 64-byte blocks) per pass.
 836 2:
 837         stxa    %g0, [%o0+0x0]%asi
 838         stxa    %g0, [%o0+0x40]%asi
 839         stxa    %g0, [%o0+0x80]%asi
 840         stxa    %g0, [%o0+0xc0]%asi
 841         ! The first doubleword of each of the four blocks is stored above; the rest follow.
 842         stxa    %g0, [%o0+0x8]%asi
 843         stxa    %g0, [%o0+0x10]%asi
 844         stxa    %g0, [%o0+0x18]%asi
 845         stxa    %g0, [%o0+0x20]%asi
 846         stxa    %g0, [%o0+0x28]%asi
 847         stxa    %g0, [%o0+0x30]%asi
 848         stxa    %g0, [%o0+0x38]%asi
 849
 850         stxa    %g0, [%o0+0x48]%asi
 851         stxa    %g0, [%o0+0x50]%asi
 852         stxa    %g0, [%o0+0x58]%asi
 853         stxa    %g0, [%o0+0x60]%asi
 854         stxa    %g0, [%o0+0x68]%asi
 855         stxa    %g0, [%o0+0x70]%asi
 856         stxa    %g0, [%o0+0x78]%asi
 857
 858         stxa    %g0, [%o0+0x88]%asi
 859         stxa    %g0, [%o0+0x90]%asi
 860         stxa    %g0, [%o0+0x98]%asi
 861         stxa    %g0, [%o0+0xa0]%asi
 862         stxa    %g0, [%o0+0xa8]%asi
 863         stxa    %g0, [%o0+0xb0]%asi
 864         stxa    %g0, [%o0+0xb8]%asi
 865
 866         stxa    %g0, [%o0+0xc8]%asi
 867         stxa    %g0, [%o0+0xd0]%asi
 868         stxa    %g0, [%o0+0xd8]%asi
 869         stxa    %g0, [%o0+0xe0]%asi
 870         stxa    %g0, [%o0+0xe8]%asi
 871         stxa    %g0, [%o0+0xf0]%asi
 872         stxa    %g0, [%o0+0xf8]%asi
 873
 874         sub     %o4, 0x100, %o4
 875         cmp     %o4, 0x100
 876         bgu,pt  %xcc, 2b                ! repeat while more than 256 bytes of blocks remain
 877           add   %o0, 0x100, %o0         ! (delay slot) advance past the 256 bytes just cleared
 878
 879 3:
 880         ! ... check if 64 bytes to set
 881         cmp     %o4, 0x40
 882         blu     %xcc, bzero_blk_done
 883           nop
 884
 885 4:
 886         stxa    %g0, [%o0+0x0]%asi
 887         stxa    %g0, [%o0+0x8]%asi
 888         stxa    %g0, [%o0+0x10]%asi
 889         stxa    %g0, [%o0+0x18]%asi
 890         stxa    %g0, [%o0+0x20]%asi
 891         stxa    %g0, [%o0+0x28]%asi
 892         stxa    %g0, [%o0+0x30]%asi
 893         stxa    %g0, [%o0+0x38]%asi
 894
 895         subcc   %o4, 0x40, %o4
 896         bgu,pt  %xcc, 3b                ! more block bytes left: recheck at 3
 897           add   %o0, 0x40, %o0
 898
 899 bzero_blk_done:
 900         membar  #Sync
 901         !
 902         ! Undo asi register setting.
 903         !
 904         rd      %asi, %o4
 905         wr      %g0, ASI_P, %asi
 906         cmp     %o4, ASI_LDSTBI_P
 907         bne,a   %xcc, bzero_small       ! taken only if %asi had been switched to ASI_LDSTBI_AIUS
 908           wr    %g0, ASI_AIUS, %asi     ! annulled unless the branch is taken
 909
 910 bzero_small:
 911         ! Set the remaining doubles
 912         subcc   %o3, 8, %o3             ! Can we store any doubles?
 913         blu,pn  %xcc, byteclr
 914           and   %o1, 7, %o1             ! calc bytes left after doubles
 915
 916 dbclr:
 917         stxa    %g0, [%o0]%asi          ! Clear the doubles
 918         subcc   %o3, 8, %o3
 919         bgeu,pt %xcc, dbclr             ! loop while the subtraction did not borrow
 920           add   %o0, 8, %o0
 921
 922         ba      byteclr
 923           nop
 924
 925 wdalign:
 926         andcc   %o0, 3, %o3             ! is addr aligned on a word boundary?
 927         bz,pn   %xcc, wdclr
 928         andn    %o1, 3, %o3             ! (delay slot, runs on both paths) word sized count in %o3
 929
 930         dec     %o1                     ! decrement count
 931         stba    %g0, [%o0]%asi          ! clear a byte
 932         ba      wdalign
 933           inc   %o0                     ! next byte
 934
 935 wdclr:
 936         sta     %g0, [%o0]%asi          ! 4-byte clearing loop
 937         subcc   %o3, 4, %o3
 938         bnz,pt  %xcc, wdclr             ! until the word sized count reaches zero
 939           inc   4, %o0
 940
 941         and     %o1, 3, %o1             ! leftover count, if any
 942
 943 byteclr:
 944         ! Set the leftover bytes
 945         brz     %o1, bzero_exit         ! nothing left to clear
 946         nop
 947
 948 7:
 949         deccc   %o1                     ! byte clearing loop
 950         stba    %g0, [%o0]%asi
 951         bgu,pt  %xcc, 7b                ! loop until the count reaches zero
 952           inc   %o0
 953
 954 bzero_exit:
 955         retl
 956           clr   %o0                     ! return (0)
 957
 958 END(bzero)
 959 #endif  /* lint */
 960
 961
 962 #if 0
 963 #define SMALL_LIMIT 7   /* copyin counts up to this fall straight into the dcibcp byte loop */
 964 #if defined(lint)
 965
 966 /*ARGSUSED*/
 967 int
 968 copyin(const void *uaddr, void *kaddr, size_t count)
 969 { return (0); } /* lint-only stub; the assembly version is in the #else branch */
 970
 971 #else   /* lint */
 972
 973 ENTRY(copyin)
 974         ! copyin: %o0 = uaddr (read through ASI_AIUS), %o1 = kaddr, %o2 = count.
 975         ! Check the length and bail if zero.
 976         ! Every return path visible here returns 0 in %o0.
 977         tst     %o2
 978         bnz,pt  %xcc, 1f
 979           nop
 980         retl
 981           clr   %o0
 982 1:
 983 #if 0   /* lofault handler setup is compiled out; label 1 must stay outside this block */
 984         sethi   %hi(copyio_fault), %o4
 985         or      %o4, %lo(copyio_fault), %o4
 986         sethi   %hi(copyio_fault_nowindow), %o3
 987         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
 988         or      %o3, %lo(copyio_fault_nowindow), %o3
 989         membar  #Sync
 990         stn     %o3, [THREAD_REG + T_LOFAULT]
 991
 992         mov     %o0, SAVE_SRC
 993         mov     %o1, SAVE_DST
 994         mov     %o2, SAVE_COUNT
 995 #endif
 996         !
 997         ! Check to see if we're more than SMALL_LIMIT.
 998         !
 999         subcc   %o2, SMALL_LIMIT, %o3
1000         bgu,a,pt %xcc, dci_ns
1001           or    %o0, %o1, %o3           ! (annulled delay slot) combined alignment bits of src and dst
1002         !
1003         ! What was previously ".small_copyin"
1004         !
1005 dcibcp:
1006         sub     %g0, %o2, %o3           ! setup for copy loop
1007         add     %o0, %o2, %o0
1008         add     %o1, %o2, %o1
1009         ba,pt   %xcc, dcicl
1010         lduba   [%o0 + %o3]ASI_AIUS, %o4        ! (delay slot) preload the first byte
1011         !
1012         ! %o0 and %o1 point at the end and remain pointing at the end
1013         ! of their buffers. We pull things out by adding %o3 (which is
1014         ! the negation of the length) to the buffer end which gives us
1015         ! the current location in the buffers. By incrementing %o3 we walk
1016         ! through both buffers without having to bump each buffer's
1017         ! pointer. A very fast 4 instruction loop.
1018         !
1019         .align 16
1020 dcicl:
1021         stb     %o4, [%o1 + %o3]
1022         inccc   %o3
1023         bl,a,pt %xcc, dcicl
1024         lduba   [%o0 + %o3]ASI_AIUS, %o4        ! (annulled delay slot) next byte, only while looping
1025         !
1026         ! We're done. Go home.
1027         !
1028         membar  #Sync
1029         retl
1030           clr   %o0
1031         !
1032         ! Try aligned copies from here.
1033         !
1034 dci_ns:
1035         !
1036         ! See if we're single byte aligned. If we are, check the
1037         ! limit for single byte copies. If we're smaller, or equal,
1038         ! bounce to the byte for byte copy loop. Otherwise do it in
1039         ! HW (if enabled).
1040         !
1041         btst    1, %o3
1042         bz,a,pt %icc, dcih8
1043         btst    7, %o3                  ! (annulled delay slot) sets the flags tested at dcih8
1044         !
1045         ! We're single byte aligned.
1046         !
1047         sethi   %hi(hw_copy_limit_1), %o3
1048         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
1049         !
1050         ! Is HW copy on? If not do everything byte for byte.
1051         !
1052         tst     %o3
1053         bz,pn   %icc, dcibcp
1054           subcc %o3, %o2, %o3           ! (delay slot) limit - size
1055         !
1056         ! Are we bigger than the HW limit? If not
1057         ! go to byte for byte.
1058         !
1059         bge,pt  %xcc, dcibcp
1060           nop
1061         !
1062         ! We're big enough and copy is on. Do it with HW.
1063         !
1064         ba,pt   %xcc, big_copyin
1065         nop
1066 dcih8:
1067         !
1068         ! 8 byte aligned?
1069         !
1070         bnz,a   %xcc, dcih4
1071         btst    3, %o3                  ! (annulled delay slot) sets the flags tested at dcih4
1072         !
1073         ! We're eight byte aligned.
1074         !
1075         sethi   %hi(hw_copy_limit_8), %o3
1076         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
1077         !
1078         ! Is HW assist on? If not, do it with the aligned copy.
1079         !
1080         tst     %o3
1081         bz,pn   %icc, dcis8
1082         subcc   %o3, %o2, %o3           ! (delay slot) limit - size
1083         bge     %xcc, dcis8
1084         nop
1085         ba,pt   %xcc, big_copyin
1086         nop
1087 dcis8:
1088         !
1089         ! Housekeeping for copy loops. Uses same idea as in the byte for
1090         ! byte copy loop above.
1091         !
1092         add     %o0, %o2, %o0
1093         add     %o1, %o2, %o1
1094         sub     %g0, %o2, %o3
1095         ba,pt   %xcc, didebc
1096         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
1097         !
1098         ! 4 byte aligned?
1099         !
1100 dcih4:
1101         bnz     %xcc, dcih2
1102         sethi   %hi(hw_copy_limit_4), %o3
1103         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
1104         !
1105         ! Is HW assist on? If not, do it with the aligned copy.
1106         !
1107         tst     %o3
1108         bz,pn   %icc, dcis4
1109           subcc %o3, %o2, %o3
1110         !
1111         ! The result is negative only if our size is larger than hw_copy_limit_4.
1112         !
1113         bge     %xcc, dcis4
1114           nop
1115         ba,pt   %xcc, big_copyin
1116           nop
1117 dcis4:
1118         !
1119         ! Housekeeping for copy loops. Uses same idea as in the byte
1120         ! for byte copy loop above.
1121         !
1122         add     %o0, %o2, %o0
1123         add     %o1, %o2, %o1
1124         sub     %g0, %o2, %o3
1125         ba,pt   %xcc, didfbc
1126           srl   %o2, 2, %o2             ! Number of 4 byte chunks to copy
1127 dcih2:
1128         !
1129         ! We're two byte aligned. Check for "smallness"
1130         ! done in delay at .dcih4
1131         !
1132         bleu,pt %xcc, dcis2
1133         sethi   %hi(hw_copy_limit_2), %o3
1134         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
1135         !
1136         ! Is HW assist on? If not, do it with the aligned copy.
1137         !
1138         tst     %o3
1139         bz,pn   %icc, dcis2
1140           subcc %o3, %o2, %o3
1141         !
1142         ! Are we larger than the HW limit?
1143         !
1144         bge     %xcc, dcis2
1145         nop
1146         !
1147         ! HW assist is on and we're large enough to use it.
1148         !
1149         ba,pt   %xcc, big_copyin
1150         nop
1151         !
1152         ! Housekeeping for copy loops. Uses same idea as in the byte
1153         ! for byte copy loop above.
1154         !
1155 dcis2:
1156         add     %o0, %o2, %o0
1157         add     %o1, %o2, %o1
1158         sub     %g0, %o2, %o3
1159         ba,pt   %xcc, didtbc
1160         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
1161         !
1162 small_copyin:
1163         !
1164         ! Why are we doing this AGAIN? There are certain conditions in
1165         ! big copyin that will cause us to forgo the HW assisted copies
1166         ! and bounce back to a non-hw assisted copy. This dispatches
1167         ! those copies. Note that we branch around this in the main line
1168         ! code.
1169         !
1170         ! We make no check for limits or HW enablement here. We've
1171         ! already been told that we're a poster child so just go off
1172         ! and do it.
1173         !
1174         or      %o0, %o1, %o3
1175         btst    1, %o3
1176         bnz     %icc, dcibcp            ! Most likely
1177           btst  7, %o3
1178         bz      %icc, dcis8
1179           btst  3, %o3
1180         bz      %icc, dcis4
1181           nop
1182         ba,pt   %xcc, dcis2
1183           nop
1184         !
1185         ! Eight byte aligned copies. A steal from the original .small_copyin
1186         ! with modifications. %o2 is number of 8 byte chunks to copy. When
1187         ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
1188         ! to copy.
1189         !
1190         .align 32
1191 didebc:
1192         ldxa    [%o0 + %o3]ASI_AIUS, %o4
1193         deccc   %o2
1194         stx     %o4, [%o1 + %o3]
1195         bg,pt   %xcc, didebc
1196         addcc   %o3, 8, %o3
1197         !
1198         ! End of copy loop. Most 8 byte aligned copies end here.
1199         !
1200         bz,pt   %xcc, dcifh
1201         nop
1202         !
1203         ! Something is left. Do it byte for byte.
1204         !
1205         ba,pt   %xcc, dcicl
1206         lduba   [%o0 + %o3]ASI_AIUS, %o4
1207         !
1208         ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
1209         !
1210         .align 32
1211 didfbc:
1212         lduwa   [%o0 + %o3]ASI_AIUS, %o4
1213         deccc   %o2
1214         st      %o4, [%o1 + %o3]
1215         bg,pt   %xcc, didfbc
1216         addcc   %o3, 4, %o3
1217         !
1218         ! End of copy loop. Most 4 byte aligned copies end here.
1219         !
1220         bz,pt   %xcc, dcifh
1221         nop
1222         !
1223         ! Something is left. Do it byte for byte.
1224         !
1225         ba,pt   %xcc, dcicl
1226         lduba   [%o0 + %o3]ASI_AIUS, %o4
1227         !
1228         ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
1229         ! copy.
1230         !
1231         .align 32
1232 didtbc:
1233         lduha   [%o0 + %o3]ASI_AIUS, %o4
1234         deccc   %o2
1235         sth     %o4, [%o1 + %o3]
1236         bg,pt   %xcc, didtbc
1237           addcc %o3, 2, %o3
1238         !
1239         ! End of copy loop. Most 2 byte aligned copies end here.
1240         !
1241         bz,pt   %xcc, dcifh
1242           nop
1243         !
1244         ! Deal with the last byte
1245         !
1246         lduba   [%o0 + %o3]ASI_AIUS, %o4
1247         stb     %o4, [%o1 + %o3]
1248 dcifh:
1249         membar  #Sync
1250         retl
1251           clr   %o0
1252
1253 big_copyin:
1254         !
1255         ! We're going off to do a block copy.
1256         ! Switch fault handlers and grab a window. We
1257         ! don't do a membar #Sync since we've done only
1258         ! kernel data to this point.
1259         !
1260         save    %sp, -SA(MINFRAME), %sp
1261
1262         ! Copy in that reach here are larger than 256 bytes. The
1263         ! hw_copy_limit_1 is set to 256. Never set this limit less
1264         ! than 128 bytes.
1265 do_blockcopyin:
1266
1267         ! Swap src/dst since the code below is memcpy code
1268         ! and memcpy/bcopy have different calling sequences
1269         mov     %i1, %i5
1270         mov     %i0, %i1
1271         mov     %i5, %i0
1272
1273         andcc   %i0, 7, %i3             ! is dst double aligned
1274         bz      %xcc, copyin_blkcpy
1275           sub   %i3, 8, %i3
1276         neg     %i3                     ! bytes till double aligned
1277         sub     %i2, %i3, %i2           ! update %i2 with new count
1278
1279         ! Align Destination on double-word boundary
1280
1281 1:      lduba   [%i1]ASI_AIUS, %i4
1282         inc     %i1
1283         stb     %i4, [%i0]
1284         deccc   %i3
1285         bgu     %xcc, 1b
1286           inc   %i0
1287
1288 copyin_blkcpy:
1289         andcc   %i0, 63, %i3
1290         bz,pn   %xcc, copyin_blalign    ! now block aligned
1291         sub     %i3, 64, %i3
1292         neg     %i3                     ! bytes till block aligned
1293         sub     %i2, %i3, %i2           ! update %i2 with new count
1294
1295         ! Copy %i3 bytes till dst is block (64 byte) aligned. use
1296         ! double word copies.
1297
1298         andcc   %i1, 7, %g1             ! is src aligned on a 8 bytes
1299         bz      %xcc, ci_dbcopy         ! %g1 has source offset (last 3-bits)
1300         sll     %g1, 3, %l1             ! left shift
1301         mov     0x40, %l2
1302         sub     %l2, %l1, %l2           ! right shift = (64 - left shift)
1303
1304         ! Now use double word copies to align destination.
1305 ci_double:
1306         sub     %i1, %g1, %i1           ! align the src at 8 bytes.
1307         ldxa    [%i1]ASI_AIUS, %o2
1308 2:
1309         add     %i1, 0x8, %i1
1310         ldxa    [%i1]ASI_AIUS, %o4
1311         ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3)
1312         stx     %o2, [%i0]
1313         mov     %o4, %o2
1314         subcc   %i3, 0x8, %i3
1315         bgu,pt  %xcc, 2b
1316         add     %i0, 0x8, %i0
1317         ba      copyin_blalign
1318         add     %i1, %g1, %i1
1319
1320         ! Both source and destination are double aligned.
1321         ! No shift and merge of data required in this case.
1322 ci_dbcopy:
1323         ldxa    [%i1]ASI_AIUS, %o2
1324         stx     %o2, [%i0]
1325         add     %i1, 0x8, %i1
1326         subcc   %i3, 0x8, %i3
1327         bgu,pt  %xcc, ci_dbcopy
1328         add     %i0, 0x8, %i0
1329
1330 copyin_blalign:
1331         andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
1332         sub     %i2, %i3, %i2           ! Residue bytes in %i2
1333
1334         wr      %g0, ASI_LDSTBI_P, %asi
1335
1336         andcc   %i1, 0xf, %o2           ! is src quadword aligned
1337         bz,pn   %xcc, ci_blkcpy         ! src offset in %o2 (last 4-bits)
1338           nop
1339         cmp     %o2, 0x8
1340         bg      ci_upper_double
1341           nop
1342         bl      ci_lower_double
1343           nop
1344
1345         ! Falls through when source offset is equal to 8 i.e.
1346         ! source is double word aligned.
1347         ! In this case no shift/merge of data is required
1348
1349         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
1350         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
1351         prefetch [%l0+0x0], #one_read
1352         ldda    [%i1]ASI_LDSTBI_AIUS, %l2
1353 ci_loop0:
1354         add     %i1, 0x10, %i1
1355         ldda    [%i1]ASI_LDSTBI_AIUS, %l4
1356
1357         prefetch [%l0+0x40], #one_read
1358
1359         stxa    %l3, [%i0+0x0]%asi
1360         stxa    %l4, [%i0+0x8]%asi
1361
1362         add     %i1, 0x10, %i1
1363         ldda    [%i1]ASI_LDSTBI_AIUS, %l2
1364
1365         stxa    %l5, [%i0+0x10]%asi
1366         stxa    %l2, [%i0+0x18]%asi
1367
1368         add     %i1, 0x10, %i1
1369         ldda    [%i1]ASI_LDSTBI_AIUS, %l4
1370
1371         stxa    %l3, [%i0+0x20]%asi
1372         stxa    %l4, [%i0+0x28]%asi
1373
1374         add     %i1, 0x10, %i1
1375         ldda    [%i1]ASI_LDSTBI_AIUS, %l2
1376
1377         stxa    %l5, [%i0+0x30]%asi
1378         stxa    %l2, [%i0+0x38]%asi
1379
1380         add     %l0, 0x40, %l0
1381         subcc   %i3, 0x40, %i3
1382         bgu,pt  %xcc, ci_loop0
1383           add   %i0, 0x40, %i0
1384         ba      ci_blkdone
1385           add   %i1, %o2, %i1           ! increment the source by src offset
1386                                         ! the src offset was stored in %o2
1387
1388 ci_lower_double:
1389
1390         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
1391         sll     %o2, 3, %o0             ! %o0 left shift
1392         mov     0x40, %o1
1393         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
1394         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
1395         prefetch [%l0+0x0], #one_read
1396         ldda    [%i1]ASI_LDSTBI_AIUS, %l2       ! partial data in %l2
1397                                                         ! and %l3 has complete
1398                                                         ! data
1399 ci_loop1:
1400         add     %i1, 0x10, %i1
1401         ldda    [%i1]ASI_LDSTBI_AIUS, %l4       ! %l4 has partial data
1402                                                         ! for this read.
1403         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
1404                                                         ! into %l2 and %l3
1405
1406         prefetch [%l0+0x40], #one_read
1407
1408         stxa    %l2, [%i0+0x0]%asi
1409         stxa    %l3, [%i0+0x8]%asi
1410
1411         add     %i1, 0x10, %i1
1412         ldda    [%i1]ASI_LDSTBI_AIUS, %l2
1413         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
1414                                                         ! %l4 from previous read
1415                                                         ! into %l4 and %l5
1416         stxa    %l4, [%i0+0x10]%asi
1417         stxa    %l5, [%i0+0x18]%asi
1418
1419         ! Repeat the same for next 32 bytes.
1420
1421         add     %i1, 0x10, %i1
1422         ldda    [%i1]ASI_LDSTBI_AIUS, %l4
1423         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
1424
1425         stxa    %l2, [%i0+0x20]%asi
1426         stxa    %l3, [%i0+0x28]%asi
1427
1428         add     %i1, 0x10, %i1
1429         ldda    [%i1]ASI_LDSTBI_AIUS, %l2
1430         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
1431
1432         stxa    %l4, [%i0+0x30]%asi
1433         stxa    %l5, [%i0+0x38]%asi
1434
1435         add     %l0, 0x40, %l0
1436         subcc   %i3, 0x40, %i3
1437         bgu,pt  %xcc, ci_loop1
1438           add   %i0, 0x40, %i0
1439         ba      ci_blkdone
1440           add   %i1, %o2, %i1           ! increment the source by src offset
1441                                         ! the src offset was stored in %o2
1442
1443 ci_upper_double:
1444
1445         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
1446         sub     %o2, 0x8, %o0
1447         sll     %o0, 3, %o0             ! %o0 left shift
1448         mov     0x40, %o1
1449         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
1450         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
1451         prefetch [%l0+0x0], #one_read
1452         ldda    [%i1]ASI_LDSTBI_AIUS, %l2       ! partial data in %l3
1453                                                         ! for this read and
1454                                                         ! no data in %l2
1455 ci_loop2:
1456         add     %i1, 0x10, %i1
1457         ldda    [%i1]ASI_LDSTBI_AIUS, %l4       ! %l4 has complete data
1458                                                         ! and %l5 has partial
1459         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
1460                                                         ! into %l3 and %l4
1461         prefetch [%l0+0x40], #one_read
1462
1463         stxa    %l3, [%i0+0x0]%asi
1464         stxa    %l4, [%i0+0x8]%asi
1465
1466         add     %i1, 0x10, %i1
1467         ldda    [%i1]ASI_LDSTBI_AIUS, %l2
1468         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
1469                                                         ! %l5 from previous read
1470                                                         ! into %l5 and %l2
1471
1472         stxa    %l5, [%i0+0x10]%asi
1473         stxa    %l2, [%i0+0x18]%asi
1474
1475         ! Repeat the same for next 32 bytes.
1476
1477         add     %i1, 0x10, %i1
1478         ldda    [%i1]ASI_LDSTBI_AIUS, %l4
1479         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
1480
1481         stxa    %l3, [%i0+0x20]%asi
1482         stxa    %l4, [%i0+0x28]%asi
1483
1484         add     %i1, 0x10, %i1
1485         ldda    [%i1]ASI_LDSTBI_AIUS, %l2
1486         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
1487
1488         stxa    %l5, [%i0+0x30]%asi
1489         stxa    %l2, [%i0+0x38]%asi
1490
1491         add     %l0, 0x40, %l0
1492         subcc   %i3, 0x40, %i3
1493         bgu,pt  %xcc, ci_loop2
1494           add   %i0, 0x40, %i0
1495         ba      ci_blkdone
1496           add   %i1, %o2, %i1           ! increment the source by src offset
1497                                         ! the src offset was stored in %o2
1498
1499
1500         ! Do fast copy using ASI_LDSTBI_P
1501 ci_blkcpy:
1502
1503         andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
1504         prefetch [%o0+0x0], #one_read
1505 1:
1506         ldda    [%i1]ASI_LDSTBI_AIUS, %l0
1507         add     %i1, 0x10, %i1
1508         ldda    [%i1]ASI_LDSTBI_AIUS, %l2
1509         add     %i1, 0x10, %i1
1510
1511         prefetch [%o0+0x40], #one_read
1512
1513         stxa    %l0, [%i0+0x0]%asi
1514
1515         ldda    [%i1]ASI_LDSTBI_AIUS, %l4
1516         add     %i1, 0x10, %i1
1517         ldda    [%i1]ASI_LDSTBI_AIUS, %l6
1518         add     %i1, 0x10, %i1
1519
1520         stxa    %l1, [%i0+0x8]%asi
1521         stxa    %l2, [%i0+0x10]%asi
1522         stxa    %l3, [%i0+0x18]%asi
1523         stxa    %l4, [%i0+0x20]%asi
1524         stxa    %l5, [%i0+0x28]%asi
1525         stxa    %l6, [%i0+0x30]%asi
1526         stxa    %l7, [%i0+0x38]%asi
1527
1528         add     %o0, 0x40, %o0
1529         subcc   %i3, 0x40, %i3
1530         bgu,pt  %xcc, 1b
1531           add   %i0, 0x40, %i0
1532
1533 ci_blkdone:
1534         membar  #Sync
1535
1536         ! Copy as much rest of the data as double word copy.
1537 ci_dwcp:
1538         cmp     %i2, 0x8                ! Not enough bytes to copy as double
1539         blu     %xcc, ci_dbdone
1540           nop
1541
1542         andn    %i2, 0x7, %i3           ! %i3 count is multiple of 8 bytes size
1543         sub     %i2, %i3, %i2           ! Residue bytes in %i2
1544
1545         andcc   %i1, 7, %g1             ! is src aligned on a 8 bytes
1546         bz      %xcc, ci_cpy_db
1547           nop
1548
1549         sll     %g1, 3, %l0             ! left shift
1550         mov     0x40, %l1
1551         sub     %l1, %l0, %l1           ! right shift = (64 - left shift)
1552
1553 ci_cpy_dbwd:
1554         sub     %i1, %g1, %i1           ! align the src at 8 bytes.
1555         ldxa    [%i1]ASI_AIUS, %o2
1556 3:
1557         add     %i1, 0x8, %i1
1558         ldxa    [%i1]ASI_AIUS, %o4
1559         ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3)
1560         stx     %o2, [%i0]
1561         mov     %o4, %o2
1562         subcc   %i3, 0x8, %i3
1563         bgu,pt  %xcc, 3b
1564           add   %i0, 0x8, %i0
1565         ba      ci_dbdone
1566           add   %i1, %g1, %i1
1567
1568 ci_cpy_db:
1569         ldxa    [%i1]ASI_AIUS, %o2
1570         stx     %o2, [%i0]
1571         add     %i1, 0x8, %i1
1572         subcc   %i3, 0x8, %i3
1573         bgu,pt  %xcc, ci_cpy_db
1574         add     %i0, 0x8, %i0
1575
1576 ci_dbdone:
1577         tst     %i2
1578         bz,pt   %xcc, copyin_exit
1579           nop
1580
1581         ! Copy the residue as byte copy
1582 ci_residue:
1583         lduba   [%i1]ASI_AIUS, %i4
1584         stb     %i4, [%i0]
1585         inc     %i1
1586         deccc   %i2
1587         bgu     %xcc, ci_residue
1588           inc   %i0
1589
1590 copyin_exit:
1591         membar  #Sync
1592         ret
1593           restore       %g0, 0, %o0
1594 END(copyin)
1595
1596 #endif  /* lint */
1597 #endif
1598