1 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
2 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
3 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
4 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
6 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
7 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
8 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
9 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
11 ; Test all permutations of: fp32 denormals (on/off), fp contraction mode
; (-fp-contract=on for the *-STRICT prefixes, -fp-contract=fast for the
; *-CONTRACT prefixes), and fmaf speed (fast/slow).
;
; NOTE(review): none of the RUN lines above enables the VI, VI-FLUSH or
; VI-DENORM prefixes, so the check lines below that use them are never
; verified as the file stands.
13 target triple = "amdgcn--"
; Intrinsics used by the tests; each declaration carries attribute group #1.
; NOTE(review): llvm.fmuladd.f16 is declared but not called by any test
; visible here.
16 declare i32 @llvm.amdgcn.workitem.id.x() #1
17 declare float @llvm.fmuladd.f32(float, float, float) #1
18 declare half @llvm.fmuladd.f16(half, half, half) #1
19 declare float @llvm.fabs.f32(float) #1
; Calls llvm.fmuladd.f32 on three floats loaded from %in1, %in2 and %in3 and
; stores the result to %out.
; Expected selection: v_mac_f32 when fp32 denormals are flushed; with
; denormals enabled, v_fma_f32 if fmaf is fast, otherwise v_mul_f32 followed
; by v_add_f32.
21 ; GCN-LABEL: {{^}}fmuladd_f32:
22 ; GCN-FLUSH: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
24 ; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
26 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
27 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
28 define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
29 float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
30 %r0 = load float, float addrspace(1)* %in1
31 %r1 = load float, float addrspace(1)* %in2
32 %r2 = load float, float addrspace(1)* %in3
33 %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
34 store float %r3, float addrspace(1)* %out
; Separate fmul + fadd (no fmuladd intrinsic) on three volatile loads.
; Flush configurations expect v_mac_f32. With denormals enabled, only the
; -fp-contract=fast runs with fast fmaf expect v_fma_f32; the slow-fmaf
; contract runs and all -fp-contract=on runs expect v_mul_f32 + v_add_f32.
38 ; GCN-LABEL: {{^}}fmul_fadd_f32:
39 ; GCN-FLUSH: v_mac_f32
41 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32
43 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
44 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
46 ; GCN-DENORM-STRICT: v_mul_f32_e32
47 ; GCN-DENORM-STRICT: v_add_f32_e32
48 define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
49 float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
50 %r0 = load volatile float, float addrspace(1)* %in1
51 %r1 = load volatile float, float addrspace(1)* %in2
52 %r2 = load volatile float, float addrspace(1)* %in3
53 %mul = fmul float %r0, %r1
54 %add = fadd float %mul, %r2
55 store float %add, float addrspace(1)* %out
; llvm.fmuladd.f32(2.0, a, b), where a and b are volatile loads of two
; consecutive floats starting at %out + tid; the result is stored back to
; %out + tid. %in is unused.
; Flush configurations expect v_mac_f32 accumulating into b's register with
; 2.0 as an inline operand. With denormals enabled, fast fmaf expects
; v_fma_f32; slow fmaf expects a + a followed by an add of b.
59 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
60 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
61 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
63 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
64 ; SI-FLUSH: buffer_store_dword [[R2]]
65 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
67 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
69 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
70 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
72 ; SI-DENORM: buffer_store_dword [[RESULT]]
73 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
74 define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
75 %tid = call i32 @llvm.amdgcn.workitem.id.x()
76 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
77 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
78 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
80 %r1 = load volatile float, float addrspace(1)* %gep.0
81 %r2 = load volatile float, float addrspace(1)* %gep.1
83 %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
84 store float %r3, float addrspace(1)* %gep.out
; llvm.fmuladd.f32(a, 2.0, b): same as the 2.0-first variant but with the
; constant as the second multiplicand. a and b are volatile loads of two
; consecutive floats starting at %out + tid. %in is unused.
; Flush configurations expect v_mac_f32 with 2.0 as an inline operand. With
; denormals enabled, fast fmaf expects v_fma_f32; slow fmaf expects a + a
; followed by an add of b.
88 ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
89 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
90 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
92 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
93 ; SI-FLUSH: buffer_store_dword [[R2]]
94 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
96 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
98 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
99 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
101 ; SI-DENORM: buffer_store_dword [[RESULT]]
102 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
103 define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
104 %tid = call i32 @llvm.amdgcn.workitem.id.x()
105 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
106 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
107 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
109 %r1 = load volatile float, float addrspace(1)* %gep.0
110 %r2 = load volatile float, float addrspace(1)* %gep.1
112 %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
113 store float %r3, float addrspace(1)* %gep.out
; (a + a) + b written as two separate fadds, with a and b volatile-loaded
; from two consecutive floats starting at %out + tid. %in1 and %in2 are unused.
; Flush configurations expect the pair folded into v_mac_f32 with 2.0.
; With denormals enabled, only -fp-contract=fast with fast fmaf expects
; v_fma_f32; the slow-fmaf contract runs and the -fp-contract=on runs expect
; two v_add_f32 instructions.
117 ; GCN-LABEL: {{^}}fadd_a_a_b_f32:
118 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
119 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
121 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
122 ; SI-FLUSH: buffer_store_dword [[R2]]
123 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
125 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
127 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
128 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
130 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
131 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
133 ; SI-DENORM: buffer_store_dword [[RESULT]]
134 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
135 define void @fadd_a_a_b_f32(float addrspace(1)* %out,
136 float addrspace(1)* %in1,
137 float addrspace(1)* %in2) #0 {
138 %tid = call i32 @llvm.amdgcn.workitem.id.x()
139 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
140 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
141 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
143 %r0 = load volatile float, float addrspace(1)* %gep.0
144 %r1 = load volatile float, float addrspace(1)* %gep.1
146 %add.0 = fadd float %r0, %r0
147 %add.1 = fadd float %add.0, %r1
148 store float %add.1, float addrspace(1)* %gep.out
; b + (a + a): the commuted form of fadd_a_a_b_f32, with b as the left operand
; of the outer fadd. a and b are volatile-loaded from two consecutive floats
; starting at %out + tid. %in1 and %in2 are unused.
; Expectations match the uncommuted test, except that the final v_add_f32 in
; the unfused denormal cases is checked with the temporary first and b second.
152 ; GCN-LABEL: {{^}}fadd_b_a_a_f32:
153 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
154 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
156 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
157 ; SI-FLUSH: buffer_store_dword [[R2]]
158 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
160 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
162 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
163 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
165 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
166 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
168 ; SI-DENORM: buffer_store_dword [[RESULT]]
169 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
170 define void @fadd_b_a_a_f32(float addrspace(1)* %out,
171 float addrspace(1)* %in1,
172 float addrspace(1)* %in2) #0 {
173 %tid = call i32 @llvm.amdgcn.workitem.id.x()
174 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
175 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
176 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
178 %r0 = load volatile float, float addrspace(1)* %gep.0
179 %r1 = load volatile float, float addrspace(1)* %gep.1
181 %add.0 = fadd float %r0, %r0
182 %add.1 = fadd float %r1, %add.0
183 store float %add.1, float addrspace(1)* %gep.out
; llvm.fmuladd.f32(-2.0, a, b), with a and b volatile-loaded from two
; consecutive floats starting at %out + tid. %in is unused.
; Flush configurations expect v_mac_f32 with -2.0 as an inline operand (no
; store check is present for them). With denormals enabled, fast fmaf expects
; v_fma_f32 with -2.0; slow fmaf expects a multiply by -2.0 followed by an
; add of b.
187 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
188 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
189 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
190 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
192 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
194 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]]
195 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
197 ; SI-DENORM: buffer_store_dword [[RESULT]]
198 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
199 define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
200 %tid = call i32 @llvm.amdgcn.workitem.id.x()
201 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
202 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
203 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
205 %r1 = load volatile float, float addrspace(1)* %gep.0
206 %r2 = load volatile float, float addrspace(1)* %gep.1
208 %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
209 store float %r3, float addrspace(1)* %gep.out
; llvm.fmuladd.f32(-2.0, -a, b), where -a is written as fsub -0.0, a.
; a and b are volatile-loaded from two consecutive floats starting at
; %out + tid. %in is unused.
; Flush configurations expect both negations to cancel into v_mac_f32 with
; +2.0. With denormals enabled, fast fmaf expects v_fma_f32 with a negated a
; operand and -2.0; slow fmaf expects a + a followed by an add of b.
213 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
214 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
215 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
217 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
218 ; SI-FLUSH: buffer_store_dword [[R2]]
219 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
221 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
223 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
224 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
226 ; SI-DENORM: buffer_store_dword [[RESULT]]
227 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
228 define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
229 %tid = call i32 @llvm.amdgcn.workitem.id.x()
230 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
231 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
232 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
234 %r1 = load volatile float, float addrspace(1)* %gep.0
235 %r2 = load volatile float, float addrspace(1)* %gep.1
237 %r1.fneg = fsub float -0.000000e+00, %r1
239 %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
240 store float %r3, float addrspace(1)* %gep.out
; llvm.fmuladd.f32(2.0, -a, b), where -a is written as fsub -0.0, a.
; a and b are volatile-loaded from two consecutive floats starting at
; %out + tid. %in is unused.
; Flush configurations expect the negation folded into the constant
; (v_mac_f32 with -2.0). With denormals enabled, fast fmaf expects v_fma_f32
; with a negated a operand; slow fmaf expects a multiply by -2.0 followed by
; an add of b.
244 ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
245 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
246 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
248 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
249 ; SI-FLUSH: buffer_store_dword [[R2]]
250 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
252 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
254 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]]
255 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
257 ; SI-DENORM: buffer_store_dword [[RESULT]]
258 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
259 define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
260 %tid = call i32 @llvm.amdgcn.workitem.id.x()
261 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
262 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
263 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
265 %r1 = load volatile float, float addrspace(1)* %gep.0
266 %r2 = load volatile float, float addrspace(1)* %gep.1
268 %r1.fneg = fsub float -0.000000e+00, %r1
270 %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
271 store float %r3, float addrspace(1)* %gep.out
; llvm.fmuladd.f32(2.0, a, -b), where -b is written as fsub -0.0, b.
; a and b are volatile-loaded from two consecutive floats starting at
; %out + tid. %in is unused.
; Flush configurations expect v_mad_f32 with a negated addend. With denormals
; enabled, fast fmaf expects v_fma_f32 with a negated addend; slow fmaf
; expects a + a followed by v_subrev_f32 of b.
275 ; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
276 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
277 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
278 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
279 ; SI-FLUSH: buffer_store_dword [[RESULT]]
280 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
282 ; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
284 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
285 ; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
287 ; SI-DENORM: buffer_store_dword [[RESULT]]
288 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
289 define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
290 %tid = call i32 @llvm.amdgcn.workitem.id.x()
291 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
292 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
293 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
295 %r1 = load volatile float, float addrspace(1)* %gep.0
296 %r2 = load volatile float, float addrspace(1)* %gep.1
298 %r2.fneg = fsub float -0.000000e+00, %r2
300 %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
301 store float %r3, float addrspace(1)* %gep.out
; a * b - c written as separate fmul + fsub. a, b and c are volatile loads of
; three consecutive floats starting at %ptr + tid; the result is stored to
; %out + tid.
; Flush configurations expect v_mad_f32 with a negated addend. With denormals
; enabled, only -fp-contract=fast with fast fmaf expects v_fma_f32; the other
; denormal configurations expect v_mul_f32 followed by v_subrev_f32.
305 ; GCN-LABEL: {{^}}mad_sub_f32:
306 ; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
307 ; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
308 ; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
309 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
311 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
313 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
314 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
316 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
317 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
319 ; SI: buffer_store_dword [[RESULT]]
320 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
321 define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
322 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
323 %tid.ext = sext i32 %tid to i64
324 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
325 %add1 = add i64 %tid.ext, 1
326 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
327 %add2 = add i64 %tid.ext, 2
328 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
329 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
330 %a = load volatile float, float addrspace(1)* %gep0, align 4
331 %b = load volatile float, float addrspace(1)* %gep1, align 4
332 %c = load volatile float, float addrspace(1)* %gep2, align 4
333 %mul = fmul float %a, %b
334 %sub = fsub float %mul, %c
335 store float %sub, float addrspace(1)* %outgep, align 4
; c - a * b written as separate fmul + fsub (the operands of the subtraction
; are swapped relative to mad_sub_f32). a, b and c are volatile loads of three
; consecutive floats starting at %ptr + tid.
; Flush configurations expect v_mad_f32 with a negated a operand. With
; denormals enabled, only -fp-contract=fast with fast fmaf expects v_fma_f32;
; the other denormal configurations expect v_mul_f32 followed by v_subrev_f32.
339 ; GCN-LABEL: {{^}}mad_sub_inv_f32:
340 ; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
341 ; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
342 ; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
344 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
346 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
348 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
349 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
351 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
352 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
354 ; SI: buffer_store_dword [[RESULT]]
355 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
356 define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
357 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
358 %tid.ext = sext i32 %tid to i64
359 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
360 %add1 = add i64 %tid.ext, 1
361 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
362 %add2 = add i64 %tid.ext, 2
363 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
364 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
365 %a = load volatile float, float addrspace(1)* %gep0, align 4
366 %b = load volatile float, float addrspace(1)* %gep1, align 4
367 %c = load volatile float, float addrspace(1)* %gep2, align 4
368 %mul = fmul float %a, %b
369 %sub = fsub float %c, %mul
370 store float %sub, float addrspace(1)* %outgep, align 4
; a * b - |c|, with |c| produced by llvm.fabs.f32. a, b and c are volatile
; loads of three consecutive floats starting at %ptr + tid.
; Flush configurations expect v_mad_f32 with the addend carrying both the
; neg and abs source modifiers. With denormals enabled, only
; -fp-contract=fast with fast fmaf expects v_fma_f32; the other denormal
; configurations expect v_mul_f32 followed by v_sub_f32_e64 with |c|.
374 ; GCN-LABEL: {{^}}mad_sub_fabs_f32:
375 ; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
376 ; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
377 ; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
378 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
380 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
382 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
383 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
385 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
386 ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
388 ; SI: buffer_store_dword [[RESULT]]
389 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
390 define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
391 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
392 %tid.ext = sext i32 %tid to i64
393 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
394 %add1 = add i64 %tid.ext, 1
395 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
396 %add2 = add i64 %tid.ext, 2
397 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
398 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
399 %a = load volatile float, float addrspace(1)* %gep0, align 4
400 %b = load volatile float, float addrspace(1)* %gep1, align 4
401 %c = load volatile float, float addrspace(1)* %gep2, align 4
402 %c.abs = call float @llvm.fabs.f32(float %c) #0
403 %mul = fmul float %a, %b
404 %sub = fsub float %mul, %c.abs
405 store float %sub, float addrspace(1)* %outgep, align 4
; |c| - a * b, with |c| produced by llvm.fabs.f32 (the operands of the
; subtraction are swapped relative to mad_sub_fabs_f32). a, b and c are
; volatile loads of three consecutive floats starting at %ptr + tid.
; Flush configurations expect v_mad_f32 with a negated a operand and an
; abs-modified addend. With denormals enabled, only -fp-contract=fast with
; fast fmaf expects v_fma_f32; the other denormal configurations expect
; v_mul_f32 followed by v_sub_f32_e64 with |c| as the first source.
409 ; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
410 ; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
411 ; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
412 ; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
413 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
415 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
417 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
418 ; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
420 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
421 ; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
423 ; SI: buffer_store_dword [[RESULT]]
424 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
425 define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
426 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
427 %tid.ext = sext i32 %tid to i64
428 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
429 %add1 = add i64 %tid.ext, 1
430 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
431 %add2 = add i64 %tid.ext, 2
432 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
433 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
434 %a = load volatile float, float addrspace(1)* %gep0, align 4
435 %b = load volatile float, float addrspace(1)* %gep1, align 4
436 %c = load volatile float, float addrspace(1)* %gep2, align 4
437 %c.abs = call float @llvm.fabs.f32(float %c) #0
438 %mul = fmul float %a, %b
439 %sub = fsub float %c.abs, %mul
440 store float %sub, float addrspace(1)* %outgep, align 4
; (-a) * (-b) + c, with both negations written as fsub -0.0, x. a, b and c are
; volatile loads of three consecutive floats starting at %ptr + tid.
; Every configuration expects the two negations to cancel. Flush expects
; v_mac_f32 accumulating into c's register; with denormals enabled, only
; -fp-contract=fast with fast fmaf expects v_fma_f32, and the other denormal
; configurations expect v_mul_f32 followed by v_add_f32.
; Note that the value named %sub below is produced by an fadd.
444 ; GCN-LABEL: {{^}}neg_neg_mad_f32:
445 ; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
446 ; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
447 ; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
449 ; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]]
450 ; SI-FLUSH: buffer_store_dword [[REGC]]
451 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
453 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
455 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
456 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
458 ; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
459 ; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
461 ; SI-DENORM: buffer_store_dword [[RESULT]]
462 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
463 define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
464 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
465 %tid.ext = sext i32 %tid to i64
466 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
467 %add1 = add i64 %tid.ext, 1
468 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
469 %add2 = add i64 %tid.ext, 2
470 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
471 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
472 %a = load volatile float, float addrspace(1)* %gep0, align 4
473 %b = load volatile float, float addrspace(1)* %gep1, align 4
474 %c = load volatile float, float addrspace(1)* %gep2, align 4
475 %nega = fsub float -0.000000e+00, %a
476 %negb = fsub float -0.000000e+00, %b
477 %mul = fmul float %nega, %negb
478 %sub = fadd float %mul, %c
479 store float %sub, float addrspace(1)* %outgep, align 4
; a * |b| - c, with |b| produced by llvm.fabs.f32. a, b and c are volatile
; loads of three consecutive floats starting at %ptr + tid.
; Flush configurations expect v_mad_f32 with an abs modifier on b and a
; negated addend. With denormals enabled, only -fp-contract=fast with fast
; fmaf expects v_fma_f32; the other denormal configurations expect
; v_mul_f32_e64 (carrying the abs modifier) followed by v_subrev_f32.
483 ; GCN-LABEL: {{^}}mad_fabs_sub_f32:
484 ; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
485 ; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
486 ; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
487 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
489 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
491 ; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
492 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
494 ; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
495 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
497 ; SI: buffer_store_dword [[RESULT]]
498 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
499 define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
500 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
501 %tid.ext = sext i32 %tid to i64
502 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
503 %add1 = add i64 %tid.ext, 1
504 %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
505 %add2 = add i64 %tid.ext, 2
506 %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
507 %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
508 %a = load volatile float, float addrspace(1)* %gep0, align 4
509 %b = load volatile float, float addrspace(1)* %gep1, align 4
510 %c = load volatile float, float addrspace(1)* %gep2, align 4
511 %b.abs = call float @llvm.fabs.f32(float %b) #0
512 %mul = fmul float %a, %b.abs
513 %sub = fsub float %mul, %c
514 store float %sub, float addrspace(1)* %outgep, align 4
; c - (a + a) written as fadd + fsub, where a and c are volatile loads of two
; consecutive floats starting at %out + tid. %in is unused.
; Flush configurations expect v_mac_f32 with -2.0. With denormals enabled,
; only -fp-contract=fast with fast fmaf expects v_fma_f32 with -2.0; the other
; denormal configurations expect v_add_f32 followed by v_subrev_f32.
518 ; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
519 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
520 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
521 ; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
522 ; SI-FLUSH: buffer_store_dword [[R2]]
523 ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
525 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
527 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
528 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
530 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
531 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
533 ; SI-DENORM: buffer_store_dword [[RESULT]]
534 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
535 define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
536 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
537 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
538 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
539 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
541 %r1 = load volatile float, float addrspace(1)* %gep.0
542 %r2 = load volatile float, float addrspace(1)* %gep.1
544 %add = fadd float %r1, %r1
545 %r3 = fsub float %r2, %add
547 store float %r3, float addrspace(1)* %gep.out
; (a + a) - c written as fadd + fsub, where a and c are volatile loads of two
; consecutive floats starting at %out + tid. %in is unused.
; Flush configurations expect v_mad_f32 with 2.0 and a negated addend. With
; denormals enabled, only -fp-contract=fast with fast fmaf expects v_fma_f32;
; the other denormal configurations expect v_add_f32 followed by v_subrev_f32.
551 ; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
552 ; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
553 ; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
554 ; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
556 ; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
558 ; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
559 ; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
561 ; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
562 ; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
564 ; SI: buffer_store_dword [[RESULT]]
565 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
566 define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
567 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
568 %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
569 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
570 %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
572 %r1 = load volatile float, float addrspace(1)* %gep.0
573 %r2 = load volatile float, float addrspace(1)* %gep.1
575 %add = fadd float %r1, %r1
576 %r3 = fsub float %add, %r2
578 store float %r3, float addrspace(1)* %gep.out
; Attribute groups: #0 is attached to every test function definition (and to
; some call sites) in this file; #1 is attached to the intrinsic declarations.
582 attributes #0 = { nounwind }
583 attributes #1 = { nounwind readnone }