test/CodeGen/AMDGPU/vector-alloca.ll

   1 ; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
   2 ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
   3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
   4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
   5 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
   6 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
   7
   8 ; OPT-LABEL: @vector_read(
   9 ; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
  10 ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
  11
  12 ; FUNC-LABEL: {{^}}vector_read:
  13 ; EG: MOV
  14 ; EG: MOV
  15 ; EG: MOV
  16 ; EG: MOV
  17 ; EG: MOVA_INT
  18 define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
     ; Fills a 4-element i32 stack array with the constants 0, 1, 2, 3, reads
     ; one element at the run-time %index, and writes it to %out. The OPT
     ; checks above expect this to become an extractelement from the constant
     ; vector <0, 1, 2, 3> followed by the store to %out.
  19 entry:
  20   %tmp = alloca [4 x i32]
       ; Constant-index pointers to elements 0..3 of the array.
  21   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
  22   %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
  23   %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
  24   %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
       ; Initialize each element with its own index.
  25   store i32 0, i32* %x
  26   store i32 1, i32* %y
  27   store i32 2, i32* %z
  28   store i32 3, i32* %w
       ; Dynamically indexed read: this is the access the test is about.
  29   %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
  30   %tmp2 = load i32, i32* %tmp1
  31   store i32 %tmp2, i32 addrspace(1)* %out
  32   ret void
  33 }
  34
  35 ; OPT-LABEL: @vector_write(
  36 ; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
  37 ; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
  38 ; OPT: store i32 %1, i32 addrspace(1)* %out, align 4
  39
  40 ; FUNC-LABEL: {{^}}vector_write:
  41 ; EG: MOV
  42 ; EG: MOV
  43 ; EG: MOV
  44 ; EG: MOV
  45 ; EG: MOVA_INT
  46 ; EG: MOVA_INT
  47 define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
     ; Zero-initializes a 4-element i32 stack array, writes 1 at the run-time
     ; %w_index, reads back the element at the run-time %r_index, and stores
     ; it to %out. The OPT checks above expect an insertelement into
     ; zeroinitializer followed by an extractelement and the store to %out.
  48 entry:
  49   %tmp = alloca [4 x i32]
       ; Constant-index pointers to elements 0..3 of the array.
  50   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
  51   %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
  52   %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
  53   %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
       ; Clear all four elements.
  54   store i32 0, i32* %x
  55   store i32 0, i32* %y
  56   store i32 0, i32* %z
  57   store i32 0, i32* %w
       ; Dynamically indexed write of the value 1.
  58   %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %w_index
  59   store i32 1, i32* %tmp1
       ; Dynamically indexed read, possibly of the element just written.
  60   %tmp2 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %r_index
  61   %tmp3 = load i32, i32* %tmp2
  62   store i32 %tmp3, i32 addrspace(1)* %out
  63   ret void
  64 }
  65
  66 ; This test should be optimized to:
  67 ; store i32 0, i32 addrspace(1)* %out
  68
  69 ; OPT-LABEL: @bitcast_gep(
  70 ; OPT: store i32 0, i32 addrspace(1)* %out, align 4
  71
  72 ; FUNC-LABEL: {{^}}bitcast_gep:
  73 ; EG: STORE_RAW
  74 define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
     ; Zero-initializes a 4-element i32 stack array, then reads it through a
     ; pointer to element 1 that has been bitcast back to an array pointer.
     ; Every element holds 0, so the OPT check expects a plain store of 0 to
     ; %out. %w_index and %r_index are not used in the body.
  75 entry:
  76   %tmp = alloca [4 x i32]
       ; Constant-index pointers to elements 0..3 of the array.
  77   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
  78   %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
  79   %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
  80   %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
       ; Clear all four elements.
  81   store i32 0, i32* %x
  82   store i32 0, i32* %y
  83   store i32 0, i32* %z
  84   store i32 0, i32* %w
       ; Pointer to element 1, reinterpreted as a pointer to a [4 x i32] array;
       ; the GEP through the cast then addresses that same element again.
  85   %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
  86   %tmp2 = bitcast i32* %tmp1 to [4 x i32]*
  87   %tmp3 = getelementptr [4 x i32], [4 x i32]* %tmp2, i32 0, i32 0
  88   %tmp4 = load i32, i32* %tmp3
  89   store i32 %tmp4, i32 addrspace(1)* %out
  90   ret void
  91 }
  92
  93 ; OPT-LABEL: @vector_read_bitcast_gep(
  94 ; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
  95 ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
  96 define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
     ; Same shape as a dynamically indexed read of a 4-element i32 stack
     ; array, except that element 0 is written through a float* bitcast of
     ; its pointer. The OPT checks above expect an extractelement from
     ; <1065353216, 1, 2, 3>; 1065353216 (0x3F800000) is the bit pattern of
     ; float 1.0.
  97 entry:
  98   %tmp = alloca [4 x i32]
       ; Constant-index pointers to elements 0..3 of the array.
  99   %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
 100   %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
 101   %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
 102   %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
       ; Element 0 is stored as a float via a bitcast of its element pointer.
 103   %bc = bitcast i32* %x to float*
 104   store float 1.0, float* %bc
 105   store i32 1, i32* %y
 106   store i32 2, i32* %z
 107   store i32 3, i32* %w
       ; Dynamically indexed i32 read of the array.
 108   %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
 109   %tmp2 = load i32, i32* %tmp1
 110   store i32 %tmp2, i32 addrspace(1)* %out
 111   ret void
 112 }
 113
 114 ; FIXME: Should be able to promote this. Instcombine should fold the
 115 ; cast in the hasOneUse case so it might not matter in practice
 116
 117 ; OPT-LABEL: @vector_read_bitcast_alloca(
 118 ; OPT: alloca [4 x float]
 119 ; OPT: store float
 120 ; OPT: store float
 121 ; OPT: store float
 122 ; OPT: store float
 123 ; OPT: load float
 124 define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
     ; Allocates a [4 x i32] stack array but accesses it only through a bitcast
     ; of the whole alloca to [4 x float]*. Per the FIXME above, this is not
     ; promoted: the OPT checks expect an alloca (as [4 x float]), four float
     ; stores, and a float load to remain.
 125 entry:
 126   %tmp = alloca [4 x i32]
       ; Reinterpret the whole array; every access below goes through this cast.
 127   %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]*
 128   %x = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 0
 129   %y = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 1
 130   %z = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 2
 131   %w = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 3
       ; The OPT checks only match "store float", not the stored values.
 132   store float 0.0, float* %x
 133   store float 1.0, float* %y
 134   store float 2.0, float* %z
 135   store float 4.0, float* %w
       ; Dynamically indexed float read through the cast pointer.
 136   %tmp1 = getelementptr inbounds [4 x float], [4 x float]* %tmp.bc, i32 0, i32 %index
 137   %tmp2 = load float, float* %tmp1
 138   store float %tmp2, float addrspace(1)* %out
 139   ret void
 140 }