1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s
; Kernel with no explicit arguments (attribute group #0) that reads the first
; dword at the pointer returned by llvm.amdgcn.implicitarg.ptr.
; Expected output: the kernarg segment pointer SGPR is enabled, the reported
; kernarg segment size is 0 for the amdhsa run and 16 for the mesa3d run, and
; the amdhsa run loads from s[4:5] at offset 0x0.
4 ; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
5 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
7 ; HSA: kernarg_segment_byte_size = 0
8 ; MESA: kernarg_segment_byte_size = 16
10 ; HSA: s_load_dword s0, s[4:5], 0x0
11 define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
12 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
13 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; The loaded value has no visible use; presumably the load is volatile so the
; access still shows up in the generated code - TODO confirm.
14 %load = load volatile i32, i32 addrspace(4)* %cast
; Same body as the no-argument kernel case, but with attribute group #1, which
; sets "amdgpu-implicitarg-num-bytes" to "48".
; Expected output: the amdhsa run reports a kernarg segment size of 48 (the
; value of that attribute), the mesa3d run still reports 16, and the amdhsa
; run loads from s[4:5] at offset 0x0.
18 ; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
19 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
21 ; HSA: kernarg_segment_byte_size = 48
22 ; MESA: kernarg_segment_byte_size = 16
24 ; HSA: s_load_dword s0, s[4:5], 0x0
25 define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
26 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
27 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; Volatile load of the first dword behind the implicit argument pointer.
28 %load = load volatile i32, i32 addrspace(4)* %cast
; Kernel with one unnamed [112 x i8] explicit argument (attribute group #0)
; that loads a dword through llvm.amdgcn.implicitarg.ptr.
; Expected output: kernarg segment size 112 on the amdhsa run and 128 on the
; mesa3d run (112 plus the 16 reported for the no-argument kernel).
; The amdhsa load uses offset 0x1c (28); that matches 112 bytes if the printed
; offset is counted in dwords - TODO confirm for the default subtarget.
32 ; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
33 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
35 ; HSA: kernarg_segment_byte_size = 112
36 ; MESA: kernarg_segment_byte_size = 128
38 ; HSA: s_load_dword s0, s[4:5], 0x1c
39 define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
40 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
41 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; Volatile load of the first dword behind the implicit argument pointer.
42 %load = load volatile i32, i32 addrspace(4)* %cast
; Kernel with one unnamed [112 x i8] explicit argument and attribute group #1
; ("amdgpu-implicitarg-num-bytes" = "48").
; Expected output: kernarg segment size 160 on the amdhsa run (112 + 48) and
; 128 on the mesa3d run; the amdhsa load again uses offset 0x1c, the same
; offset as the variant without the attribute.
46 ; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
47 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
49 ; HSA: kernarg_segment_byte_size = 160
50 ; MESA: kernarg_segment_byte_size = 128
52 ; HSA: s_load_dword s0, s[4:5], 0x1c
53 define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
54 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
55 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; Volatile load of the first dword behind the implicit argument pointer.
56 %load = load volatile i32, i32 addrspace(4)* %cast
; Non-kernel function (no amdgpu_kernel calling convention) that loads a dword
; through llvm.amdgcn.implicitarg.ptr.
; Expected output: s6/s7 are copied into v0/v1 and the load is addressed by
; v[0:1]; the mesa3d run uses buffer_load_dword with s[8:11] and addr64, the
; amdhsa run uses flat_load_dword. Presumably s[6:7] is where the implicit
; argument pointer is passed to a callee - TODO confirm.
; The embedded line numbering skips values in this region, so the directive
; that the final s_setpc_b64 check must directly follow may not be visible.
60 ; GCN-LABEL: {{^}}func_implicitarg_ptr:
62 ; MESA: v_mov_b32_e32 v0, s6
63 ; MESA: v_mov_b32_e32 v1, s7
64 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
65 ; HSA: v_mov_b32_e32 v0, s6
66 ; HSA: v_mov_b32_e32 v1, s7
67 ; HSA: flat_load_dword v0, v[0:1]
69 ; GCN-NEXT: s_setpc_b64
70 define void @func_implicitarg_ptr() #0 {
71 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
72 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; Volatile load of the first dword behind the implicit argument pointer.
73 %load = load volatile i32, i32 addrspace(4)* %cast
; Non-kernel function with the same visible body and the same expected output
; as @func_implicitarg_ptr: s6/s7 copied into v0/v1, then a buffer load on the
; mesa3d run or a flat load on the amdhsa run.
; NOTE(review): despite the opencl_ name this uses attribute group #0, whereas
; the opencl_ kernels in this file use #1 - confirm this is intentional.
77 ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
79 ; MESA: v_mov_b32_e32 v0, s6
80 ; MESA: v_mov_b32_e32 v1, s7
81 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
82 ; HSA: v_mov_b32_e32 v0, s6
83 ; HSA: v_mov_b32_e32 v1, s7
84 ; HSA: flat_load_dword v0, v[0:1]
86 ; GCN-NEXT: s_setpc_b64
87 define void @opencl_func_implicitarg_ptr() #0 {
88 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
89 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; Volatile load of the first dword behind the implicit argument pointer.
90 %load = load volatile i32, i32 addrspace(4)* %cast
; Kernel with no explicit arguments (attribute group #0) that calls
; @func_implicitarg_ptr instead of using the intrinsic directly.
; Expected output: kernarg segment size 0 on the amdhsa run and 16 on the
; mesa3d run, and s[4:5] copied unchanged into s[6:7] (the register pair that
; @func_implicitarg_ptr is expected to read its address from).
94 ; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
95 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
96 ; HSA: kernarg_segment_byte_size = 0
97 ; MESA: kernarg_segment_byte_size = 16
98 ; GCN: s_mov_b64 s[6:7], s[4:5]
100 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
101 call void @func_implicitarg_ptr()
; Kernel with no explicit arguments and attribute group #1
; ("amdgpu-implicitarg-num-bytes" = "48") that calls @func_implicitarg_ptr.
; Expected output: kernarg segment size 48 on the amdhsa run and 16 on the
; mesa3d run, and s[4:5] copied unchanged into s[6:7].
; NOTE(review): the callee is @func_implicitarg_ptr, not
; @opencl_func_implicitarg_ptr - confirm this is intentional.
105 ; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
106 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
107 ; HSA: kernarg_segment_byte_size = 48
108 ; MESA: kernarg_segment_byte_size = 16
109 ; GCN: s_mov_b64 s[6:7], s[4:5]
111 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
112 call void @func_implicitarg_ptr()
; Kernel with one unnamed [112 x i8] explicit argument (attribute group #0)
; that calls @func_implicitarg_ptr.
; Expected output: kernarg segment size 112 on the amdhsa run and 128 on the
; mesa3d run, and s[6:7] formed as s[4:5] + 0x70 (112, the size of the
; explicit argument) with an add / add-with-carry pair.
; NOTE(review): the two per-target s_add_u32 checks are textually identical;
; the opencl_ variant of this test uses a single shared-prefix check instead.
116 ; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
117 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
118 ; HSA: kernarg_segment_byte_size = 112
119 ; MESA: kernarg_segment_byte_size = 128
121 ; HSA: s_add_u32 s6, s4, 0x70
122 ; MESA: s_add_u32 s6, s4, 0x70
124 ; GCN: s_addc_u32 s7, s5, 0{{$}}
126 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
127 call void @func_implicitarg_ptr()
; Kernel with one unnamed [112 x i8] explicit argument and attribute group #1
; ("amdgpu-implicitarg-num-bytes" = "48") that calls @func_implicitarg_ptr.
; Expected output: kernarg segment size 160 on the amdhsa run (112 + 48) and
; 128 on the mesa3d run, and s[6:7] formed as s[4:5] + 0x70 (112) on both
; targets with an add / add-with-carry pair.
131 ; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
132 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
133 ; HSA: kernarg_segment_byte_size = 160
134 ; MESA: kernarg_segment_byte_size = 128
136 ; GCN: s_add_u32 s6, s4, 0x70
138 ; GCN: s_addc_u32 s7, s5, 0{{$}}
140 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
141 call void @func_implicitarg_ptr()
; Non-kernel function (attribute group #0) that calls @func_implicitarg_ptr,
; i.e. a function-to-function call of a user of the implicit argument pointer.
; Only the label check is visible here; the embedded line numbering skips
; values between the label and the definition, so further checks may exist
; outside this view.
145 ; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
149 define void @func_call_implicitarg_ptr_func() #0 {
150 call void @func_implicitarg_ptr()
; Non-kernel function that calls @func_implicitarg_ptr. Only the label check
; is visible here; the embedded line numbering skips values between the label
; and the definition, so further checks may exist outside this view.
; NOTE(review): despite the opencl_ name this uses attribute group #0 and
; calls the non-opencl callee - confirm this is intentional.
154 ; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
158 define void @opencl_func_call_implicitarg_ptr_func() #0 {
159 call void @func_implicitarg_ptr()
; Non-kernel function that uses both llvm.amdgcn.kernarg.segment.ptr and
; llvm.amdgcn.implicitarg.ptr and performs one volatile dword load through
; each, kernarg segment pointer first.
; Expected output on the mesa3d run: s6..s9 are copied into v0..v3, then two
; buffer loads addressed by v[0:1] and v[2:3]. On the amdhsa run: s6/s7 are
; copied into v[0:1] for a flat load, then s8/s9 are copied into v[0:1] for a
; second flat load. Both runs end with a wait on vmcnt(0).
; Given the order of the loads, s[6:7] presumably carries the kernarg segment
; pointer and s[8:9] the implicit argument pointer - TODO confirm.
163 ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
165 ; MESA: v_mov_b32_e32 v0, s6
166 ; MESA: v_mov_b32_e32 v1, s7
167 ; MESA: v_mov_b32_e32 v2, s8
168 ; MESA: v_mov_b32_e32 v3, s9
169 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
170 ; HSA: v_mov_b32_e32 v0, s6
171 ; HSA: v_mov_b32_e32 v1, s7
172 ; HSA: flat_load_dword v0, v[0:1]
173 ; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64
174 ; HSA: v_mov_b32_e32 v0, s8
175 ; HSA: v_mov_b32_e32 v1, s9
176 ; HSA: flat_load_dword v0, v[0:1]
178 ; GCN: s_waitcnt vmcnt(0)
179 define void @func_kernarg_implicitarg_ptr() #0 {
180 %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
181 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
182 %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
183 %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; First load reads through the kernarg segment pointer, second through the
; implicit argument pointer; both are volatile.
184 %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
185 %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
; Non-kernel function with the same visible body and the same expected output
; as @func_kernarg_implicitarg_ptr: one volatile dword load through
; llvm.amdgcn.kernarg.segment.ptr followed by one through
; llvm.amdgcn.implicitarg.ptr, addressed from s[6:7] and s[8:9] respectively.
; NOTE(review): despite the opencl_ name this uses attribute group #0 rather
; than #1 - confirm this is intentional.
189 ; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
191 ; MESA: v_mov_b32_e32 v0, s6
192 ; MESA: v_mov_b32_e32 v1, s7
193 ; MESA: v_mov_b32_e32 v2, s8
194 ; MESA: v_mov_b32_e32 v3, s9
195 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
196 ; HSA: v_mov_b32_e32 v0, s6
197 ; HSA: v_mov_b32_e32 v1, s7
198 ; HSA: flat_load_dword v0, v[0:1]
199 ; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64
200 ; HSA: v_mov_b32_e32 v0, s8
201 ; HSA: v_mov_b32_e32 v1, s9
202 ; HSA: flat_load_dword v0, v[0:1]
204 ; GCN: s_waitcnt vmcnt(0)
205 define void @opencl_func_kernarg_implicitarg_ptr() #0 {
206 %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
207 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
208 %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
209 %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; First load reads through the kernarg segment pointer, second through the
; implicit argument pointer; both are volatile.
210 %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
211 %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
; Kernel with one unnamed [112 x i8] explicit argument (attribute group #0)
; that calls @func_kernarg_implicitarg_ptr, a callee using both the kernarg
; segment pointer and the implicit argument pointer.
; Expected output: s[4:5] copied into s[6:7], then s[8:9] formed as
; s[6:7] + 0x70 (112, the size of the explicit argument) with an add /
; add-with-carry pair.
215 ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
216 ; GCN: s_mov_b64 s[6:7], s[4:5]
217 ; GCN: s_add_u32 s8, s6, 0x70
218 ; GCN: s_addc_u32 s9, s7, 0
220 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
221 call void @func_kernarg_implicitarg_ptr()
; Kernel taking a <16 x i32> followed by an i32, with attribute group #1
; ("amdgpu-implicitarg-num-bytes" = "48"), that loads a dword through
; llvm.amdgcn.implicitarg.ptr.
; Expected output: kernarg segment size 120 on the amdhsa run and 84 on the
; mesa3d run, with kernarg_segment_alignment = 6 on both.
; Assuming the explicit arguments occupy 64 + 4 = 68 bytes, 84 is 68 + 16 and
; 120 is 72 + 48, i.e. the sizes are not padded out to the 64-byte vector
; alignment as the test name suggests - TODO confirm the exact layout rule.
; The alignment value 6 is presumably a log2 encoding (64 bytes) - TODO confirm.
225 ; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
226 ; HSA: kernarg_segment_byte_size = 120
227 ; MESA: kernarg_segment_byte_size = 84
228 ; GCN: kernarg_segment_alignment = 6
229 define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
230 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
231 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
; Volatile load of the first dword behind the implicit argument pointer.
232 %load = load volatile i32, i32 addrspace(4)* %cast
; Intrinsics exercised by this test; each takes no arguments and returns an
; i8 pointer in addrspace(4).
236 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
237 declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
; Attribute groups:
;   #0 - nounwind noinline, with no implicit-argument size attribute.
;   #1 - same as #0 plus "amdgpu-implicitarg-num-bytes" set to "48"; used by
;        the kernels whose amdhsa kernarg size checks include 48 extra bytes.
;   #2 - attributes of the two intrinsic declarations.
239 attributes #0 = { nounwind noinline }
240 attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
241 attributes #2 = { nounwind readnone speculatable }