1 // Test target codegen - host bc file has to be created first.
2 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
3 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
4 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
5 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
6 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
7 // expected-no-diagnostics
11 // Check for the data transfer medium in shared memory used to transfer the reduction list to the first warp.
12 // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
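//
// On NVPTX the shared address space is numbered 3; conceptually the medium is
// one 32-bit slot per warp of a full thread block. A minimal CUDA-flavored
// sketch (the name "transfer_medium" is hypothetical, chosen for
// illustration):
//
//   __shared__ int32_t transfer_medium[32];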
14 // Check that the execution mode of all 3 target regions is set to SPMD mode.
15 // CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
16 // CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
17 // CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
27 #pragma omp target parallel reduction(+: e)
32 #pragma omp target parallel reduction(^: c) reduction(*: d)
38 #pragma omp target parallel reduction(|: a) reduction(max: b)
50 a += ftemplate<char>(n);
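//
// The body of the template function containing these regions is elided in
// this excerpt. Pieced together from the pragmas above and the CHECK lines
// below, it presumably reads roughly as follows (a hedged sketch; K stands
// for the fmul constant of the second region, which the checks do not pin
// down):
//
//   template <typename tx>
//   tx ftemplate(int n) {
//     int a; short b; tx c; float d; double e;
//
//   #pragma omp target parallel reduction(+: e)
//     { e += 5; }
//
//   #pragma omp target parallel reduction(^: c) reduction(*: d)
//     { c ^= 2; d *= K; }
//
//   #pragma omp target parallel reduction(|: a) reduction(max: b)
//     { a |= 1; b = 99 > b ? 99 : b; }
//
//     // ...
//   }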
55 // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
57 // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
58 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
59 // CHECK: br label {{%?}}[[EXECUTE:.+]]
61 // CHECK: [[EXECUTE]]
62 // CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
63 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
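//
// In outline, each of the three SPMD kernels follows the same lifecycle. A
// hedged sketch using the entry points checked above (the parameter names are
// assumptions):
//
//   __kmpc_spmd_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1,
//                           /*RequiresDataSharing=*/0);
//   __kmpc_data_sharing_init_stack_spmd();
//   pfn(...);                                 // the outlined parallel region
//   __kmpc_spmd_kernel_deinit_v2(/*RequiresOMPRuntime=*/1);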
66 // CHECK: define internal void [[PFN]](
67 // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
68 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
69 // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
70 // CHECK: store double [[ADD]], double* [[E]], align
71 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
72 // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
73 // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
74 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
75 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
76 // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
77 // CHECK: br i1 [[CMP]], label
79 // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
80 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
81 // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
82 // CHECK: store double [[ADD]], double* [[E_IN]], align
83 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
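//
// The prototype implied by the reduce call above (a sketch; the parameter
// names are assumptions, the types mirror the operands in the check line):
//
//   int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
//       ident_t *loc, int32_t gtid, int32_t num_vars, size_t reduce_size,
//       void *reduce_data,
//       void (*shuffle_reduce_fn)(void *, int16_t, int16_t, int16_t),
//       void (*inter_warp_copy_fn)(void *, int32_t));
//
// A return value of 1 selects the thread that folds the team-reduced value
// back into the original variable and then calls
// __kmpc_nvptx_end_reduce_nowait().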
89 // Reduction function
90 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
91 // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
92 // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
93 // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
95 // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
96 // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
97 // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
99 // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
100 // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
101 // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
102 // CHECK: store double [[RES]], double* [[VAR_LHS]],
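//
// In C++ terms the function checked above amounts to the following sketch
// (the names are made up; the element type is double because the clause is
// reduction(+: e)):
//
//   static void reduction_fn(void *lhs_list, void *rhs_list) {
//     double *lhs = (double *)((void **)lhs_list)[0];
//     double *rhs = (double *)((void **)rhs_list)[0];
//     *lhs = *lhs + *rhs;  // the '+' combiner
//   }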
106 // Shuffle and reduce function
107 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
108 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
109 // CHECK: [[REMOTE_ELT:%.+]] = alloca double
111 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
112 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
113 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
115 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
116 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
117 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
118 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
120 // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
121 // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
122 // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
123 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
124 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
125 // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
127 // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
128 // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
129 // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
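//
// The double travels between lanes bit-for-bit: it is reinterpreted as i64
// and pushed through the runtime's shuffle helper. The helpers implied by the
// calls in this test (a sketch; parameter names are assumptions):
//
//   int32_t __kmpc_shuffle_int32(int32_t val, int16_t lane_offset,
//                                int16_t warp_size);
//   int64_t __kmpc_shuffle_int64(int64_t val, int16_t lane_offset,
//                                int16_t warp_size);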
131 // Condition to reduce
132 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
134 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
135 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
136 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
138 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
139 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
140 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
141 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
142 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
143 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
145 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
146 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
147 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
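//
// Spelled out as C++ (a sketch; laneid, laneoffset and algver are the three
// i16 arguments of the shuffle-and-reduce function):
//
//   bool should_reduce =
//       algver == 0 ||                           // full-warp reduction
//       (algver == 1 && laneid < laneoffset) ||  // contiguous lower half
//       (algver == 2 && (laneid & 1) == 0        // dispersed: even lanes
//                    && laneoffset > 0);         // with a non-zero offset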
149 // CHECK: [[DO_REDUCE]]
150 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
151 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
152 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
153 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
155 // CHECK: [[REDUCE_ELSE]]
156 // CHECK: br label {{%?}}[[REDUCE_CONT]]
158 // CHECK: [[REDUCE_CONT]]
159 // Now check if we should just copy over the remote reduction list
160 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
161 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
162 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
163 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
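//
// That is, with the same names as in the sketch above:
//
//   bool should_copy = algver == 1 && laneid >= laneoffset;
//
// In the contiguous-half scheme the upper-half lanes adopt the remote value
// instead of reducing it.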
165 // CHECK: [[DO_COPY]]
166 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
167 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
168 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
169 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
170 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
171 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
172 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
173 // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
174 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
176 // CHECK: [[COPY_ELSE]]
177 // CHECK: br label {{%?}}[[COPY_CONT]]
179 // CHECK: [[COPY_CONT]]
183 // Inter warp copy function
184 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
185 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
186 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
187 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
188 // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
190 // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
191 // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
192 // CHECK: br i1 [[DONE_COPY]], label
193 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
194 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
195 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
198 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
199 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
200 // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
201 // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
203 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
204 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
205 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
206 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
208 // CHECK: [[COPY_ELSE]]
209 // CHECK: br label {{%?}}[[COPY_CONT]]
211 // Barrier after copy to shared memory storage medium.
212 // CHECK: [[COPY_CONT]]
213 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
214 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
217 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
218 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
220 // CHECK: [[DO_READ]]
221 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
222 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
223 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
224 // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
225 // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
226 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
227 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
228 // CHECK: br label {{%?}}[[READ_CONT:.+]]
230 // CHECK: [[READ_ELSE]]
231 // CHECK: br label {{%?}}[[READ_CONT]]
233 // CHECK: [[READ_CONT]]
234 // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
235 // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
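//
// Assembled, the loop these checks trace looks as follows (a hedged sketch;
// lane_id is tid & 31, warp_id is tid >> 5, and a double moves as two 32-bit
// chunks, hence the bound of 2; transfer_medium is the hypothetical name from
// the sketch at the top of the file):
//
//   for (int32_t cnt = 0; cnt < 2; ++cnt) {
//     __kmpc_barrier(...);
//     if (lane_id == 0)              // each warp master publishes its chunk
//       transfer_medium[warp_id] = ((int32_t *)elt)[cnt];
//     __kmpc_barrier(...);
//     if (tid < active_warps)        // warp 0 gathers one chunk per warp
//       ((int32_t *)elt)[cnt] = transfer_medium[tid];
//   }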
248 // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
250 // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
251 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
252 // CHECK: br label {{%?}}[[EXECUTE:.+]]
254 // CHECK: [[EXECUTE]]
255 // CHECK: {{call|invoke}} void [[PFN1:@.+]](i32*
256 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
259 // CHECK: define internal void [[PFN1]](
260 // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
261 // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
262 // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
263 // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
264 // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
265 // CHECK: store i8 [[TRUNC]], i8* [[C]], align
266 // CHECK: [[DV:%.+]] = load float, float* [[D]], align
267 // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
268 // CHECK: store float [[MUL]], float* [[D]], align
269 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
270 // CHECK: store i8* [[C]], i8** [[PTR1]], align
271 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
272 // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8*
273 // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
274 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
275 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
276 // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
277 // CHECK: br i1 [[CMP]], label
278 // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
279 // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
280 // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
281 // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
282 // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
283 // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
284 // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
285 // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
286 // CHECK: [[DV:%.+]] = load float, float* [[D]], align
287 // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
288 // CHECK: store float [[MUL]], float* [[D_IN]], align
289 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
295 // Reduction function
296 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
297 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
298 // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
300 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
301 // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
303 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
304 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
305 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
307 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
308 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
309 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
311 // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
312 // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
313 // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
314 // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
315 // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
316 // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
317 // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
319 // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
320 // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
321 // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
322 // CHECK: store float [[RES]], float* [[VAR2_LHS]],
326 // Shuffle and reduce function
327 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
328 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
329 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
330 // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
332 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
333 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
334 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
336 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
337 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
338 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
339 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
341 // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
342 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
343 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
344 // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
345 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
347 // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
348 // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
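//
// Sub-word elements ride the 32-bit shuffle: the char is sign-extended to
// i32, shuffled, and truncated back. A one-line sketch with the helper named
// above:
//
//   remote_c = (char)__kmpc_shuffle_int32((int32_t)c, lane_offset, warp_size);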
350 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
351 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
352 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
353 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
355 // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
356 // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
357 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
358 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
359 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
360 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
362 // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
363 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
364 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
366 // Condition to reduce
367 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
369 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
370 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
371 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
373 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
374 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
375 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
376 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
377 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
378 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
380 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
381 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
382 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
384 // CHECK: [[DO_REDUCE]]
385 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
386 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
387 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
388 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
390 // CHECK: [[REDUCE_ELSE]]
391 // CHECK: br label {{%?}}[[REDUCE_CONT]]
393 // CHECK: [[REDUCE_CONT]]
394 // Now check if we should just copy over the remote reduction list
395 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
396 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
397 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
398 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
400 // CHECK: [[DO_COPY]]
401 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
402 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
403 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
404 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
405 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
406 // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
408 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
409 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
410 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
411 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
412 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
413 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
414 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
415 // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
416 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
418 // CHECK: [[COPY_ELSE]]
419 // CHECK: br label {{%?}}[[COPY_CONT]]
421 // CHECK: [[COPY_CONT]]
425 // Inter warp copy function
426 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
427 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
428 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
429 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
430 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
431 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
432 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
435 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
436 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
438 // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
439 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
440 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
441 // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
442 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
444 // CHECK: [[COPY_ELSE]]
445 // CHECK: br label {{%?}}[[COPY_CONT]]
447 // Barrier after copy to shared memory storage medium.
448 // CHECK: [[COPY_CONT]]
449 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
450 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
453 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
454 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
456 // CHECK: [[DO_READ]]
457 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
458 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
459 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
460 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
461 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
462 // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
463 // CHECK: br label {{%?}}[[READ_CONT:.+]]
465 // CHECK: [[READ_ELSE]]
466 // CHECK: br label {{%?}}[[READ_CONT]]
468 // CHECK: [[READ_CONT]]
469 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
470 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
471 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
474 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
475 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
476 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
478 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
479 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
480 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
481 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
483 // CHECK: [[COPY_ELSE]]
484 // CHECK: br label {{%?}}[[COPY_CONT]]
486 // Barrier after copy to shared memory storage medium.
487 // CHECK: [[COPY_CONT]]
488 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
489 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
492 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
493 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
495 // CHECK: [[DO_READ]]
496 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
497 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
498 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
499 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
500 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
501 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
502 // CHECK: br label {{%?}}[[READ_CONT:.+]]
504 // CHECK: [[READ_ELSE]]
505 // CHECK: br label {{%?}}[[READ_CONT]]
507 // CHECK: [[READ_CONT]]
519 // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
521 // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 0)
522 // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
523 // CHECK: br label {{%?}}[[EXECUTE:.+]]
525 // CHECK: [[EXECUTE]]
526 // CHECK: {{call|invoke}} void [[PFN2:@.+]](i32*
527 // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
530 // CHECK: define internal void [[PFN2]](
531 // CHECK: store i32 0, i32* [[A:%.+]], align
532 // CHECK: store i16 -32768, i16* [[B:%.+]], align
533 // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A]], align
534 // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
535 // CHECK: store i32 [[OR]], i32* [[A]], align
536 // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
537 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
538 // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
539 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
542 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
544 // CHECK: [[MAX_ELSE]]
545 // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
546 // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
547 // CHECK: br label {{%?}}[[MAX_CONT]]
549 // CHECK: [[MAX_CONT]]
550 // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
551 // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
552 // CHECK: store i16 [[TRUNC]], i16* [[B]], align
553 // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
554 // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
555 // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
556 // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1
557 // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
558 // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
559 // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
560 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
561 // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
562 // CHECK: br i1 [[CMP]], label
564 // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
565 // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
566 // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
567 // CHECK: store i32 [[OR]], i32* [[A_IN]], align
568 // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
569 // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
570 // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
571 // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
572 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
573 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
576 // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
577 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
579 // CHECK: [[MAX_ELSE]]
580 // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
581 // CHECK: br label {{%?}}[[MAX_CONT]]
583 // CHECK: [[MAX_CONT]]
584 // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
585 // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
586 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
592 // Reduction function
593 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
594 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
595 // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
596 // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
598 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
599 // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
600 // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
602 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
603 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
604 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
606 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
607 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
608 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
610 // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
611 // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
612 // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
613 // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
615 // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
616 // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
617 // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
618 // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
620 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
621 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
624 // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
625 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
627 // CHECK: [[MAX_ELSE]]
628 // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
629 // CHECK: br label {{%?}}[[MAX_CONT]]
631 // CHECK: [[MAX_CONT]]
632 // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
633 // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
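//
// That is, for reduction(max: b) the combiner is a signed compare over the
// widened values followed by a select (a sketch; names are made up):
//
//   short l = *lhs, r = *rhs;
//   *lhs = (int)l > (int)r ? l : r;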
637 // Shuffle and reduce function
638 // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
639 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
640 // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
641 // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
643 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
644 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
645 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
647 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
648 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
649 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
650 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
651 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
653 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
654 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
655 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
657 // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
658 // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
659 // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
661 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
662 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
663 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
664 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
665 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
667 // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
668 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
669 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
670 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
671 // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
673 // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
674 // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
675 // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
677 // Condition to reduce
678 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
680 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
681 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
682 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
684 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
685 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
686 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
687 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
688 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
689 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
691 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
692 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
693 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
695 // CHECK: [[DO_REDUCE]]
696 // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
697 // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
698 // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
699 // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
701 // CHECK: [[REDUCE_ELSE]]
702 // CHECK: br label {{%?}}[[REDUCE_CONT]]
704 // CHECK: [[REDUCE_CONT]]
705 // Now check if we should just copy over the remote reduction list
706 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
707 // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
708 // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
709 // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
711 // CHECK: [[DO_COPY]]
712 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
713 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
714 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
715 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
716 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
717 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
718 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
719 // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
721 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
722 // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
723 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
724 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
725 // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
726 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
727 // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
728 // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
729 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
731 // CHECK: [[COPY_ELSE]]
732 // CHECK: br label {{%?}}[[COPY_CONT]]
734 // CHECK: [[COPY_CONT]]
738 // Inter warp copy function
739 // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32)
740 // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
741 // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
742 // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
743 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
744 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
745 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
748 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
749 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
750 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
752 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
753 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
754 // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
755 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
757 // CHECK: [[COPY_ELSE]]
758 // CHECK: br label {{%?}}[[COPY_CONT]]
760 // Barrier after copy to shared memory storage medium.
761 // CHECK: [[COPY_CONT]]
762 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
763 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
766 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
767 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
769 // CHECK: [[DO_READ]]
770 // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
771 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
772 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
773 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
774 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
775 // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
776 // CHECK: br label {{%?}}[[READ_CONT:.+]]
778 // CHECK: [[READ_ELSE]]
779 // CHECK: br label {{%?}}[[READ_CONT]]
781 // CHECK: [[READ_CONT]]
782 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
783 // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
784 // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
787 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
788 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
789 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
791 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
792 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
793 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
794 // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
795 // CHECK: br label {{%?}}[[COPY_CONT:.+]]
797 // CHECK: [[COPY_ELSE]]
798 // CHECK: br label {{%?}}[[COPY_CONT]]
800 // Barrier after copy to shared memory storage medium.
801 // CHECK: [[COPY_CONT]]
802 // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
803 // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
806 // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
807 // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
809 // CHECK: [[DO_READ]]
810 // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
811 // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
812 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
813 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
814 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
815 // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
816 // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
817 // CHECK: br label {{%?}}[[READ_CONT:.+]]
819 // CHECK: [[READ_ELSE]]
820 // CHECK: br label {{%?}}[[READ_CONT]]
822 // CHECK: [[READ_CONT]]
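//
// Note that a 16-bit element still occupies a full 32-bit slot of the
// transfer medium; the slot is merely reinterpreted on both sides (a sketch
// reusing the hypothetical transfer_medium):
//
//   *(volatile int16_t *)&transfer_medium[warp_id] = b;  // warp master write
//   b = *(volatile int16_t *)&transfer_medium[tid];      // warp 0 read-back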