; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s

; Restrict maximum branch to between +7 and -8 dwords
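;
; Roughly, the arithmetic behind that range: with 4 branch bits, the
; s_cbranch_* simm16 offset (counted in dwords from the instruction
; following the branch) covers [-8, +7] dwords, i.e. -32 to +28 bytes,
; so any fall-through block of 8 or more dwords pushes a short forward
; branch out of range and forces relaxation.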

; s.sleep is used because it is always a 4 byte instruction. Inline asm
; always assumes each instruction is the maximum size.
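; Each v_nop_e64 below is the 64-bit VOP3 encoding of v_nop, so every
; nop accounts for 8 bytes in the block-size comments on the asm blocks.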
declare void @llvm.amdgcn.s.sleep(i32) #0

declare i32 @llvm.amdgcn.workitem.id.x() #1

; GCN-LABEL: {{^}}uniform_conditional_max_short_forward_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN: s_cmp_eq_u32 [[CND]], 0
; GCN-NEXT: s_cbranch_scc1 [[BB3:BB[0-9]+_[0-9]+]]

; GCN-NEXT: ; BB#1: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_sleep 0

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
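
; The fall-through block here is 3 v_nop_e64 (24 bytes) plus s_sleep
; (4 bytes); by the arithmetic above this is the largest block the
; restricted short branch can still hop over, so no expansion happens.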
define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
; 24 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  call void @llvm.amdgcn.s.sleep(i32 0)
  br label %bb3

bb3:
  store volatile i32 %cnd, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN: s_cmp_eq_u32 [[CND]], 0
; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc
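;
; This is the long-branch expansion checked throughout this file:
; s_getpc_b64 materializes the address of the instruction that follows
; it, the s_add_u32/s_addc_u32 pair applies the signed byte distance
; [[ENDBB]]-([[LONG_JUMP]]+4) (the +4 accounts for the 4-byte
; s_getpc_b64 itself), and s_setpc_b64 performs the indirect jump.
; vcc is used as the scratch register pair for the target address.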

; GCN-NEXT: [[LONGBB]]:
; GCN-NEXT: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[ENDBB]]:
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb0:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch

bb2:
; 32 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %cnd, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN-DAG: v_cmp_eq_f32_e64 vcc, [[CND]], 0
; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[LONGBB]]:
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64

; GCN: [[ENDBB]]:
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
bb0:
  %cmp = fcmp oeq float %cnd, 0.0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
  v_nop_e64
  v_nop_e64
  v_nop_e64
  v_nop_e64
  ", ""() #0
  br label %bb3

bb3:
  store volatile float %cnd, float addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}min_long_forward_vbranch:

; GCN: buffer_load_dword
; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]]

; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64

; GCN: s_or_b64 exec, exec, [[SAVE]]
; GCN: buffer_store_dword
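;
; Unlike the uniform cases above, this branch is divergent: it is
; lowered to the s_and_saveexec_b64/s_xor_b64 exec-mask sequence, so any
; inserted far jump must leave exec untouched. It can still use vcc as
; scratch, since vcc is dead once the divergent branch has been lowered.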
define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
  %load = load volatile i32, i32 addrspace(1)* %gep
  %cmp = icmp eq i32 %load, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
  v_nop_e64
  v_nop_e64
  v_nop_e64
  v_nop_e64
  ", ""() #0
  br label %bb3

bb3:
  store volatile i32 %load, i32 addrspace(1)* %gep
  ret void
}

; GCN-LABEL: {{^}}long_backward_sbranch:
; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}}

; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1
; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10

; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONG_JUMP]]+4)-[[LOOPBB]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc
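;
; A backward branch has a negative distance, so the expansion uses the
; subtracting pair instead: ([[LONG_JUMP]]+4)-[[LOOPBB]] is the positive
; byte distance back from the end of the s_getpc_b64 to the loop header,
; applied with s_sub_u32/s_subb_u32.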

; GCN-NEXT: [[ENDBB]]:
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
bb:
  br label %bb2

bb2:
  %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
; 24 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  %inc = add nsw i32 %loop.idx, 1 ; add cost 4
  %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
  br i1 %cmp, label %bb2, label %bb3 ; -

bb3:
  ret void
}

; Requires expansion of the unconditional branch from %bb2 to %bb4 (and
; expansion of the conditional branch from %bb to %bb3).

; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch:
; GCN: s_cmp_eq_u32
; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB2]]: ; %bb2
; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
; GCN: buffer_store_dword [[BB2_K]]
; GCN: s_waitcnt vmcnt(0)

; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB4:BB[0-9]_[0-9]+]]-([[LONG_JUMP1]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN: [[BB3]]: ; %bb3
; GCN: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND

; GCN-NEXT: [[BB4]]: ; %bb4
; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
; GCN: buffer_store_dword [[BB4_K]]
; GCN: s_endpgm
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
bb0:
  %tmp = icmp ne i32 %arg1, 0
  br i1 %tmp, label %bb2, label %bb3

bb2:
  store volatile i32 17, i32 addrspace(1)* undef
  br label %bb4

bb3:
; 32 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb4

bb4:
  store volatile i32 63, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch:
; GCN-NEXT: ; BB#0: ; %entry

; GCN-NEXT: [[LOOP:BB[0-9]_[0-9]+]]: ; %loop
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
entry:
  br label %loop

loop:
; 32 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %loop
}

; Expansion of the branch from %bb1 to %bb3 introduces the need to expand
; the branch from %bb0 to %bb2.
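;
; That is, relaxation must iterate: rewriting one short branch as the
; multi-instruction getpc/add/setpc sequence grows the function, which
; can push a previously in-range branch out of range, so the pass keeps
; expanding until it reaches a fixed point.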

; GCN-LABEL: {{^}}expand_requires_expand:
; GCN-NEXT: ; BB#0: ; %bb0

; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 0{{$}}
; GCN-NEXT: s_cbranch_scc0 [[BB1:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONGBB0:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB0]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB1]]: ; %bb1
; GCN-NEXT: s_load_dword
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s{{[0-9]+}}, 3{{$}}
; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]_[0-9]+]]

; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: ; %bb1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[BB2]]: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
bb0:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %cmp0 = icmp slt i32 %cond0, 0
  br i1 %cmp0, label %bb2, label %bb1

bb1:
  %val = load volatile i32, i32 addrspace(2)* undef
  %cmp1 = icmp eq i32 %val, 3
  br i1 %cmp1, label %bb3, label %bb2

bb2:
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
; These NOPs prevent tail-duplication-based outlining from firing, which
; would defeat the need to expand the branches and thus defeat this test.
  call void asm sideeffect
   "v_nop_e64", ""() #0
  call void asm sideeffect
   "v_nop_e64", ""() #0
  ret void
}

; Requires expansion of the required skip branch.
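;
; The "skip branch" is the s_cbranch_execz inserted to jump over a
; divergent region when no lanes are active; once the region it skips
; grows past the branch range, it needs the same expansion as any other
; short branch.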

; GCN-LABEL: {{^}}uniform_inside_divergent:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[IF]]: ; %if
; GCN: buffer_store_dword

; GCN: s_cbranch_scc1 [[ENDIF]]

; GCN-NEXT: ; BB#2: ; %if_uniform
; GCN: buffer_store_dword
; GCN: s_waitcnt vmcnt(0)

; GCN-NEXT: [[ENDIF]]: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN-NEXT: s_sleep 5
define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %d_cmp = icmp ult i32 %tid, 16
  br i1 %d_cmp, label %if, label %endif

if:
  store i32 0, i32 addrspace(1)* %out
  %u_cmp = icmp eq i32 %cond, 0
  br i1 %u_cmp, label %if_uniform, label %endif

if_uniform:
  store i32 1, i32 addrspace(1)* %out
  br label %endif

endif:
; Layout can remove the split branch if it can copy the return block.
; This call makes the return block long enough that it doesn't get copied.
  call void @llvm.amdgcn.s.sleep(i32 5)
  ret void
}

; GCN-LABEL: {{^}}analyze_mask_branch:
; GCN: v_cmp_lt_f32_e32 vcc
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]]
; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]]
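;
; The mask branch to [[RET]] is out of range and cannot simply have its
; condition reversed, so relaxation emits s_cbranch_execz to a small
; block containing the far jump to [[RET]] and reaches the loop body
; with an unconditional s_branch instead.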

; GCN-NEXT: [[BRANCH_SKIP]]: ; %entry
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[RET]]-([[BRANCH_SKIP]]+4)
; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[LOOP_BODY]]: ; %loop_body
; GCN: s_mov_b64 vcc, -1{{$}}
; GCN: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND
; GCN-NEXT: s_cbranch_vccz [[RET]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
; GCN-NEXT: s_getpc_b64 vcc
; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP_BODY]]
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
; GCN-NEXT: s_setpc_b64 vcc

; GCN-NEXT: [[RET]]: ; %Flow
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @analyze_mask_branch() #0 {
entry:
  %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
  %cmp0 = fcmp ogt float %reg, 0.000000e+00
  br i1 %cmp0, label %loop, label %ret

loop:
  %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64", ""() #0
  %cmp1 = fcmp olt float %phi, 8.0
  br i1 %cmp1, label %loop_body, label %ret

loop_body:
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %loop

ret:
  store volatile i32 7, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}long_branch_hang:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:

; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
; GCN: s_setpc_b64

; GCN-NEXT: [[LONG_BR_0]]:
; GCN-DAG: v_cmp_lt_i32
; GCN-DAG: v_cmp_gt_i32
; GCN: s_cbranch_vccnz

; GCN: [[LONG_BR_DEST0]]
; GCN: v_cmp_ne_u32_e32
; GCN-NEXT: s_cbranch_vccz

; GCN: s_endpgm
define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
bb:
  %tmp = icmp slt i32 %arg2, 9
  %tmp6 = icmp eq i32 %arg1, 0
  %tmp7 = icmp sgt i32 %arg4, 0
  %tmp8 = icmp sgt i32 %arg4, 5
  br i1 %tmp8, label %bb9, label %bb13

bb9:                                              ; preds = %bb
  %tmp10 = and i1 %tmp7, %tmp
  %tmp11 = icmp slt i32 %arg3, %arg4
  %tmp12 = or i1 %tmp11, %tmp7
  br i1 %tmp12, label %bb19, label %bb14

bb13:                                             ; preds = %bb
  br i1 %tmp6, label %bb19, label %bb14

bb14:                                             ; preds = %bb13, %bb9
  %tmp15 = icmp slt i32 %arg3, %arg4
  %tmp16 = or i1 %tmp15, %tmp
  %tmp17 = and i1 %tmp6, %tmp16
  %tmp18 = zext i1 %tmp17 to i32
  br label %bb19

bb19:                                             ; preds = %bb14, %bb13, %bb9
  %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ]
  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %arg5
  store i32 %tmp20, i32 addrspace(1)* %tmp21, align 4
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }