test/CodeGen/AMDGPU/branch-relaxation.ll

   1 ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s
   2 ; Restrict maximum branch to between +7 and -8 dwords
   3
   4 ; s_sleep is used to emit an instruction that is always 4 bytes. Inline asm
   5 ; is always assumed to use the maximum size for each instruction.
   6 declare void @llvm.amdgcn.s.sleep(i32) #0
   7
   8 declare i32 @llvm.amdgcn.workitem.id.x() #1
   9
  10 ; Three inline-asm v_nop_e64 plus one s_sleep sit between the conditional branch and %bb3; the checks expect a plain s_cbranch_scc1 with no s_getpc_b64/s_setpc_b64 sequence.
  11 ; GCN-LABEL: {{^}}uniform_conditional_max_short_forward_branch:
  12 ; GCN: s_load_dword [[CND:s[0-9]+]]
  13 ; GCN: s_cmp_eq_u32 [[CND]], 0
  14 ; GCN-NEXT: s_cbranch_scc1 [[BB3:BB[0-9]+_[0-9]+]]
  15
  16
  17 ; GCN-NEXT: ; BB#1: ; %bb2
  18 ; GCN-NEXT: ;;#ASMSTART
  19 ; GCN-NEXT: v_nop_e64
  20 ; GCN-NEXT: v_nop_e64
  21 ; GCN-NEXT: v_nop_e64
  22 ; GCN-NEXT: ;;#ASMEND
  23 ; GCN-NEXT: s_sleep 0
  24
  25 ; GCN-NEXT: [[BB3]]: ; %bb3
  26 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
  27 ; GCN: buffer_store_dword [[V_CND]]
  28 ; GCN: s_endpgm
  29 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
  30 bb:
  31   %cmp = icmp eq i32 %cnd, 0
  32   br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
  33
  34 bb2:
  35 ; 24 bytes: three inline-asm instructions, each counted at the maximum size
  36   call void asm sideeffect
  37    "v_nop_e64
  38     v_nop_e64
  39     v_nop_e64", ""() #0
  40   call void @llvm.amdgcn.s.sleep(i32 0) ; always 4 bytes
  41   br label %bb3
  42
  43 bb3:
  44   store volatile i32 %cnd, i32 addrspace(1)* %arg ; volatile store keeps %bb3 non-empty
  45   ret void
  46 }
  47 ; Four inline-asm v_nop_e64 (32 bytes) sit between the conditional branch and %bb3; the checks expect s_cbranch_scc0 to skip over an s_getpc_b64/s_add_u32/s_addc_u32/s_setpc_b64 sequence that jumps to %bb3.
  48 ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_branch:
  49 ; GCN: s_load_dword [[CND:s[0-9]+]]
  50 ; GCN: s_cmp_eq_u32 [[CND]], 0
  51 ; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]]
  52
  53 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
  54 ; GCN-NEXT: s_getpc_b64 vcc
  55 ; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
  56 ; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
  57 ; GCN-NEXT: s_setpc_b64 vcc
  58
  59 ; GCN-NEXT: [[LONGBB]]:
  60 ; GCN-NEXT: ;;#ASMSTART
  61 ; GCN: v_nop_e64
  62 ; GCN: v_nop_e64
  63 ; GCN: v_nop_e64
  64 ; GCN: v_nop_e64
  65 ; GCN-NEXT: ;;#ASMEND
  66
  67 ; GCN-NEXT: [[ENDBB]]:
  68 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
  69 ; GCN: buffer_store_dword [[V_CND]]
  70 ; GCN: s_endpgm
  71 define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
  72 bb0:
  73   %cmp = icmp eq i32 %cnd, 0
  74   br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
  75
  76 bb2:
  77 ; 32 bytes: four inline-asm instructions, each counted at the maximum size
  78   call void asm sideeffect
  79    "v_nop_e64
  80     v_nop_e64
  81     v_nop_e64
  82     v_nop_e64", ""() #0
  83   br label %bb3
  84
  85 bb3:
  86   store volatile i32 %cnd, i32 addrspace(1)* %arg ; volatile store keeps %bb3 non-empty
  87   ret void
  88 }
  89 ; Forward branch over 32 bytes of inline asm where the condition is a float compare: the checks expect v_cmp_eq_f32_e64 into vcc and s_cbranch_vccz skipping over the s_getpc_b64/s_setpc_b64 long-jump sequence.
  90 ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
  91 ; GCN: s_load_dword [[CND:s[0-9]+]]
  92 ; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
  93 ; GCN-DAG: v_cmp_eq_f32_e64 vcc, [[CND]], 0
  94 ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]
  95
  96 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
  97 ; GCN-NEXT: s_getpc_b64 vcc
  98 ; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
  99 ; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
 100 ; GCN-NEXT: s_setpc_b64 vcc
 101
 102 ; GCN-NEXT: [[LONGBB]]:
 103 ; GCN: v_nop_e64
 104 ; GCN: v_nop_e64
 105 ; GCN: v_nop_e64
 106 ; GCN: v_nop_e64
 107
 108 ; GCN: [[ENDBB]]:
 109 ; GCN: buffer_store_dword [[V_CND]]
 110 ; GCN: s_endpgm
 111 define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
 112 bb0:
 113   %cmp = fcmp oeq float %cnd, 0.0 ; float compare, so the condition is produced in vcc
 114   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
 115
 116 bb2:
 117   call void asm sideeffect " ; 32 bytes
 118     v_nop_e64
 119     v_nop_e64
 120     v_nop_e64
 121     v_nop_e64", ""() #0
 122   br label %bb3
 123
 124 bb3:
 125   store volatile float %cnd, float addrspace(1)* %arg ; volatile store keeps %bb3 non-empty
 126   ret void
 127 }
 128 ; Branch on a value loaded per workitem: the checks expect exec masking (s_and_saveexec_b64 / s_xor_b64 / s_or_b64) around the four v_nop_e64 and do not list an s_getpc_b64/s_setpc_b64 sequence.
 129 ; GCN-LABEL: {{^}}min_long_forward_vbranch:
 130
 131 ; GCN: buffer_load_dword
 132 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 133 ; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
 134 ; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]]
 135
 136 ; GCN: v_nop_e64
 137 ; GCN: v_nop_e64
 138 ; GCN: v_nop_e64
 139 ; GCN: v_nop_e64
 140
 141 ; GCN: s_or_b64 exec, exec, [[SAVE]]
 142 ; GCN: buffer_store_dword
 143 ; GCN: s_endpgm
 144 define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
 145 bb:
 146   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 147   %tid.ext = zext i32 %tid to i64
 148   %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext ; address indexed by the workitem id
 149   %load = load volatile i32, i32 addrspace(1)* %gep
 150   %cmp = icmp eq i32 %load, 0
 151   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
 152
 153 bb2:
 154   call void asm sideeffect " ; 32 bytes
 155     v_nop_e64
 156     v_nop_e64
 157     v_nop_e64
 158     v_nop_e64", ""() #0
 159   br label %bb3
 160
 161 bb3:
 162   store volatile i32 %load, i32 addrspace(1)* %gep ; writes the loaded value back to the same address
 163   ret void
 164 }
 165 ; Loop whose back edge has to jump backward over 24 bytes of inline asm: the checks expect s_cbranch_scc0 to the exit block followed by an s_getpc_b64/s_sub_u32/s_subb_u32/s_setpc_b64 sequence back to the loop header.
 166 ; GCN-LABEL: {{^}}long_backward_sbranch:
 167 ; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}}
 168
 169 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2
 170 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 171 ; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1
 172 ; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10
 173
 174 ; GCN-NEXT: ;;#ASMSTART
 175 ; GCN-NEXT: v_nop_e64
 176 ; GCN-NEXT: v_nop_e64
 177 ; GCN-NEXT: v_nop_e64
 178 ; GCN-NEXT: ;;#ASMEND
 179
 180 ; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]]
 181
 182 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
 183 ; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
 184 ; GCN-NEXT: s_getpc_b64 vcc
 185 ; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONG_JUMP]]+4)-[[LOOPBB]]
 186 ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
 187 ; GCN-NEXT: s_setpc_b64 vcc
 188
 189 ; GCN-NEXT: [[ENDBB]]:
 190 ; GCN-NEXT: s_endpgm
 191 define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
 192 bb:
 193   br label %bb2
 194
 195 bb2:
 196   %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
 197    ; 24 bytes
 198   call void asm sideeffect
 199    "v_nop_e64
 200     v_nop_e64
 201     v_nop_e64", ""() #0
 202   %inc = add nsw i32 %loop.idx, 1 ; add cost 4
 203   %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
 204   br i1 %cmp, label %bb2, label %bb3 ; backward branch to the loop header %bb2
 205
 206 bb3:
 207   ret void
 208 }
 209
 210 ; Requires expansion of the unconditional branch from %bb2 to %bb4 (and
 211 ; expansion of the conditional branch from %bb0 to %bb3).
 212 ; The checks expect two long jumps: one from %bb0 to %bb3 and one from %bb2 over the 32-byte asm in %bb3 to %bb4. Every label capture accepts a multi-digit function index.
 213 ; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch:
 214 ; GCN: s_cmp_eq_u32
 215 ; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]+_[0-9]+]]
 216
 217 ; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0
 218 ; GCN-NEXT: s_getpc_b64 vcc
 219 ; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONG_JUMP0]]+4)
 220 ; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
 221 ; GCN-NEXT: s_setpc_b64 vcc
 222
 223 ; GCN-NEXT: [[BB2]]: ; %bb2
 224 ; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
 225 ; GCN: buffer_store_dword [[BB2_K]]
 226
 227 ; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
 228 ; GCN-NEXT: s_getpc_b64 vcc
 229 ; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB4:BB[0-9]+_[0-9]+]]-([[LONG_JUMP1]]+4)
 230 ; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
 231 ; GCN-NEXT: s_setpc_b64 vcc
 232
 233 ; GCN: [[BB3]]: ; %bb3
 234 ; GCN: v_nop_e64
 235 ; GCN: v_nop_e64
 236 ; GCN: v_nop_e64
 237 ; GCN: v_nop_e64
 238 ; GCN: ;;#ASMEND
 239
 240 ; GCN-NEXT: [[BB4]]: ; %bb4
 241 ; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
 242 ; GCN: buffer_store_dword [[BB4_K]]
 243 ; GCN-NEXT: s_endpgm
 244 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
 245 define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 246 bb0:
 247   %tmp = icmp ne i32 %arg1, 0
 248   br i1 %tmp, label %bb2, label %bb3
 249
 250 bb2:
 251   store volatile i32 17, i32 addrspace(1)* undef ; marker store identifying %bb2 in the output
 252   br label %bb4 ; unconditional branch that has to skip the asm in %bb3
 253
 254 bb3:
 255   ; 32 byte asm
 256   call void asm sideeffect
 257    "v_nop_e64
 258     v_nop_e64
 259     v_nop_e64
 260     v_nop_e64", ""() #0
 261   br label %bb4
 262
 263 bb4:
 264   store volatile i32 63, i32 addrspace(1)* %arg ; marker store identifying %bb4 in the output
 265   ret void
 266 }
 267 ; Infinite loop whose unconditional back edge jumps backward over 32 bytes of inline asm: the checks expect an s_getpc_b64/s_sub_u32/s_subb_u32/s_setpc_b64 sequence back to the loop header, with the function-end label directly after it.
 268 ; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch:
 269 ; GCN-NEXT: ; BB#0: ; %entry
 270
 271 ; GCN-NEXT: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
 272 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 273 ; GCN-NEXT: ;;#ASMSTART
 274 ; GCN-NEXT: v_nop_e64
 275 ; GCN-NEXT: v_nop_e64
 276 ; GCN-NEXT: v_nop_e64
 277 ; GCN-NEXT: v_nop_e64
 278 ; GCN-NEXT: ;;#ASMEND
 279
 280 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
 281 ; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1
 282 ; GCN-NEXT: s_getpc_b64 vcc
 283 ; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP]]
 284 ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
 285 ; GCN-NEXT: s_setpc_b64 vcc
 286 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
 287 define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 288 entry:
 289   br label %loop
 290
 291 loop:
 292   ; 32 byte asm
 293   call void asm sideeffect
 294    "v_nop_e64
 295     v_nop_e64
 296     v_nop_e64
 297     v_nop_e64", ""() #0
 298   br label %loop ; unconditional back edge; the function never returns
 299 }
 300
 301 ; Expansion of branch from %bb1 to %bb3 introduces need to expand
 302 ; branch from %bb0 to %bb2
 303 ; The checks expect both conditional branches to be expanded into long jumps. The %bb1 branch reuses the BB2 capture, so both branches to %bb2 are pinned to the same label.
 304 ; GCN-LABEL: {{^}}expand_requires_expand:
 305 ; GCN-NEXT: ; BB#0: ; %bb0
 306 ; GCN: s_load_dword
 307 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 0{{$}}
 308 ; GCN-NEXT: s_cbranch_scc0 [[BB1:BB[0-9]+_[0-9]+]]
 309
 310 ; GCN-NEXT: [[LONGBB0:BB[0-9]+_[0-9]+]]: ; %bb0
 311 ; GCN-NEXT: s_getpc_b64 vcc
 312 ; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]+_[0-9]+]]-([[LONGBB0]]+4)
 313 ; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
 314 ; GCN-NEXT: s_setpc_b64 vcc
 315
 316 ; GCN-NEXT: [[BB1]]: ; %bb1
 317 ; GCN-NEXT: s_load_dword
 318 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 319 ; GCN-NEXT: s_cmp_eq_u32 s{{[0-9]+}}, 3{{$}}
 320 ; GCN-NEXT: s_cbranch_scc0 [[BB2]]
 321
 322 ; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: ; %bb1
 323 ; GCN-NEXT: s_getpc_b64 vcc
 324 ; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
 325 ; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
 326 ; GCN-NEXT: s_setpc_b64 vcc
 327
 328 ; GCN-NEXT: [[BB2]]: ; %bb2
 329 ; GCN-NEXT: ;;#ASMSTART
 330 ; GCN-NEXT: v_nop_e64
 331 ; GCN-NEXT: v_nop_e64
 332 ; GCN-NEXT: v_nop_e64
 333 ; GCN-NEXT: v_nop_e64
 334 ; GCN-NEXT: ;;#ASMEND
 335
 336 ; GCN-NEXT: [[BB3]]: ; %bb3
 337 ; GCN-NEXT: ;;#ASMSTART
 338 ; GCN-NEXT: v_nop_e64
 339 ; GCN-NEXT: ;;#ASMEND
 340 ; GCN-NEXT: ;;#ASMSTART
 341 ; GCN-NEXT: v_nop_e64
 342 ; GCN-NEXT: ;;#ASMEND
 343 ; GCN-NEXT: s_endpgm
 344 define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 345 bb0:
 346   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 347   %cmp0 = icmp slt i32 %cond0, 0
 348   br i1 %cmp0, label %bb2, label %bb1 ; first branch to %bb2
 349
 350 bb1:
 351   %val = load volatile i32, i32 addrspace(2)* undef
 352   %cmp1 = icmp eq i32 %val, 3
 353   br i1 %cmp1, label %bb3, label %bb2 ; second branch to %bb2; %bb3 lies beyond the asm in %bb2
 354
 355 bb2:
 356   call void asm sideeffect
 357    "v_nop_e64
 358     v_nop_e64
 359     v_nop_e64
 360     v_nop_e64", ""() #0
 361   br label %bb3
 362
 363 bb3:
 364 ; These NOPs prevent tail-duplication-based outlining
 365 ; from firing, which defeats the need to expand the branches and this test.
 366   call void asm sideeffect
 367    "v_nop_e64", ""() #0
 368   call void asm sideeffect
 369    "v_nop_e64", ""() #0
 370   ret void
 371 }
 372
 373 ; Requires expansion of the required skip branch.
 374 ; A uniform branch nested inside a branch on the workitem id: the checks expect s_cbranch_execnz to skip over a long jump emitted in %entry. NOTE(review): BB2 is captured but never reused; presumably it is the %endif label - confirm before tightening it.
 375 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 376 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 377 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
 378 ; GCN-NEXT: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 379 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
 380 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
 381
 382 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry
 383 ; GCN-NEXT: s_getpc_b64 vcc
 384 ; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]+_[0-9]+]]-([[LONGBB]]+4)
 385 ; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
 386 ; GCN-NEXT: s_setpc_b64 vcc
 387
 388 ; GCN-NEXT: [[IF]]: ; %if
 389 ; GCN: buffer_store_dword
 390 ; GCN: s_cmp_lg_u32
 391 ; GCN: s_cbranch_scc1 [[ENDIF]]
 392
 393 ; GCN-NEXT: ; BB#2: ; %if_uniform
 394 ; GCN: buffer_store_dword
 395
 396 ; GCN-NEXT: [[ENDIF]]: ; %endif
 397 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 398 ; GCN-NEXT: s_sleep 5
 399 ; GCN-NEXT: s_endpgm
 400 define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
 401 entry:
 402   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 403   %d_cmp = icmp ult i32 %tid, 16 ; condition depends on the workitem id
 404   br i1 %d_cmp, label %if, label %endif
 405
 406 if:
 407   store i32 0, i32 addrspace(1)* %out
 408   %u_cmp = icmp eq i32 %cond, 0 ; condition depends only on the kernel argument
 409   br i1 %u_cmp, label %if_uniform, label %endif
 410
 411 if_uniform:
 412   store i32 1, i32 addrspace(1)* %out
 413   br label %endif
 414
 415 endif:
 416   ; layout can remove the split branch if it can copy the return block.
 417   ; This call makes the return block long enough that it doesn't get copied.
 418   call void @llvm.amdgcn.s.sleep(i32 5)
 419   ret void
 420 }
 421
 422 ; si_mask_branch
 423 ; s_cbranch_execz
 424 ; s_branch
 425 ; Branch on a VGPR value into a loop: the checks expect s_cbranch_execz to a block holding a long jump to the return block, an s_branch to the loop body, and a long backward jump for the loop back edge.
 426 ; GCN-LABEL: {{^}}analyze_mask_branch:
 427 ; GCN: v_cmp_lt_f32_e32 vcc
 428 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
 429 ; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]]
 430 ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 431 ; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]]
 432 ; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]]
 433
 434 ; GCN-NEXT: [[BRANCH_SKIP]]: ; %entry
 435 ; GCN-NEXT: s_getpc_b64 vcc
 436 ; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[RET]]-([[BRANCH_SKIP]]+4)
 437 ; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
 438 ; GCN-NEXT: s_setpc_b64 vcc
 439
 440 ; GCN-NEXT: [[LOOP_BODY]]: ; %loop_body
 441 ; GCN: s_mov_b64 vcc, -1{{$}}
 442 ; GCN: ;;#ASMSTART
 443 ; GCN: v_nop_e64
 444 ; GCN: v_nop_e64
 445 ; GCN: v_nop_e64
 446 ; GCN: v_nop_e64
 447 ; GCN: v_nop_e64
 448 ; GCN: v_nop_e64
 449 ; GCN: ;;#ASMEND
 450 ; GCN-NEXT: s_cbranch_vccz [[RET]]
 451
 452 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
 453 ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
 454 ; GCN-NEXT: s_getpc_b64 vcc
 455 ; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP_BODY]]
 456 ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
 457 ; GCN-NEXT: s_setpc_b64 vcc
 458
 459 ; GCN-NEXT: [[RET]]: ; %Flow
 460 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 461 ; GCN: buffer_store_dword
 462 ; GCN-NEXT: s_endpgm
 463 define amdgpu_kernel void @analyze_mask_branch() #0 {
 464 entry:
 465   %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"() ; value produced in a VGPR by inline asm
 466   %cmp0 = fcmp ogt float %reg, 0.000000e+00
 467   br i1 %cmp0, label %loop, label %ret
 468
 469 loop:
 470   %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
 471   call void asm sideeffect
 472     "v_nop_e64
 473      v_nop_e64", ""() #0
 474   %cmp1 = fcmp olt float %phi, 8.0
 475   br i1 %cmp1, label %loop_body, label %ret
 476
 477 loop_body:
 478   call void asm sideeffect
 479   "v_nop_e64
 480    v_nop_e64
 481    v_nop_e64
 482    v_nop_e64", ""() #0
 483   br label %loop ; loop back edge
 484
 485 ret:
 486   store volatile i32 7, i32 addrspace(1)* undef ; marker store identifying the return block in the output
 487   ret void
 488 }
 489 ; Several blocks joining at %bb19: the checks expect an s_cbranch_scc1/s_branch pair followed by multiple s_setpc_b64 long jumps before s_endpgm. NOTE(review): the name suggests a regression test for a hang - confirm against the commit history.
 490 ; GCN-LABEL: {{^}}long_branch_hang:
 491 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
 492 ; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
 493 ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
 494 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
 495
 496 ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
 497 ; GCN: s_setpc_b64
 498
 499 ; GCN-NEXT: [[LONG_BR_0]]:
 500 ; GCN-DAG: v_cmp_lt_i32
 501 ; GCN-DAG: v_cmp_gt_i32
 502 ; GCN: s_cbranch_vccnz
 503
 504 ; GCN: s_setpc_b64
 505 ; GCN: s_setpc_b64
 506
 507 ; GCN: [[LONG_BR_DEST0]]
 508 ; GCN: v_cmp_ne_u32_e32
 509 ; GCN-NEXT: s_cbranch_vccz
 510 ; GCN: s_setpc_b64
 511
 512 ; GCN: s_endpgm
 513 define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
 514 bb:
 515   %tmp = icmp slt i32 %arg2, 9
 516   %tmp6 = icmp eq i32 %arg1, 0
 517   %tmp7 = icmp sgt i32 %arg4, 0
 518   %tmp8 = icmp sgt i32 %arg4, 5
 519   br i1 %tmp8, label %bb9, label %bb13
 520
 521 bb9:                                              ; preds = %bb
 522   %tmp10 = and i1 %tmp7, %tmp
 523   %tmp11 = icmp slt i32 %arg3, %arg4
 524   %tmp12 = or i1 %tmp11, %tmp7
 525   br i1 %tmp12, label %bb19, label %bb14
 526
 527 bb13:                                             ; preds = %bb
 528   br i1 %tmp6, label %bb19, label %bb14
 529
 530 bb14:                                             ; preds = %bb13, %bb9
 531   %tmp15 = icmp slt i32 %arg3, %arg4
 532   %tmp16 = or i1 %tmp15, %tmp
 533   %tmp17 = and i1 %tmp6, %tmp16
 534   %tmp18 = zext i1 %tmp17 to i32
 535   br label %bb19
 536
 537 bb19:                                             ; preds = %bb14, %bb13, %bb9
 538   %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ] ; only the %bb14 path supplies a defined value
 539   %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %arg5
 540   store i32 %tmp20, i32 addrspace(1)* %tmp21, align 4
 541   ret void
 542 }
 543
 544 attributes #0 = { nounwind }
 545 attributes #1 = { nounwind readnone }