//===- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that expands pseudo instructions into target
// instructions to allow proper scheduling and other late optimizations. This
// pass should be run after register allocation but before the post-regalloc
// scheduling pass.
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>
#include <utility>

using namespace llvm;
// Human-readable pass name, used both for pass registration and as the value
// returned by getPassName().
#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"
49 class AArch64ExpandPseudo : public MachineFunctionPass {
51 const AArch64InstrInfo *TII;
55 AArch64ExpandPseudo() : MachineFunctionPass(ID) {
56 initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry());
59 bool runOnMachineFunction(MachineFunction &Fn) override;
61 StringRef getPassName() const override { return AARCH64_EXPAND_PSEUDO_NAME; }
64 bool expandMBB(MachineBasicBlock &MBB);
65 bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
66 MachineBasicBlock::iterator &NextMBBI);
67 bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
70 bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
71 unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
72 unsigned ExtendImm, unsigned ZeroReg,
73 MachineBasicBlock::iterator &NextMBBI);
74 bool expandCMP_SWAP_128(MachineBasicBlock &MBB,
75 MachineBasicBlock::iterator MBBI,
76 MachineBasicBlock::iterator &NextMBBI);
79 } // end anonymous namespace
81 char AArch64ExpandPseudo::ID = 0;
83 INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
84 AARCH64_EXPAND_PSEUDO_NAME, false, false)
86 /// \brief Transfer implicit operands on the pseudo instruction to the
87 /// instructions created from the expansion.
88 static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
89 MachineInstrBuilder &DefMI) {
90 const MCInstrDesc &Desc = OldMI.getDesc();
91 for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e;
93 const MachineOperand &MO = OldMI.getOperand(i);
94 assert(MO.isReg() && MO.getReg());
/// \brief Helper function which extracts the specified 16-bit chunk from a
/// 64-bit value.
///
/// \param Imm      The 64-bit value to read from.
/// \param ChunkIdx Index of the chunk (0 = bits [15:0] ... 3 = bits [63:48]).
/// \returns The selected chunk in the low 16 bits of the result.
static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
  assert(ChunkIdx < 4 && "Out of range chunk index specified!");

  return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
}
110 /// \brief Helper function which replicates a 16-bit chunk within a 64-bit
111 /// value. Indices correspond to element numbers in a v4i16.
112 static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
113 assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
114 const unsigned ShiftAmt = ToIdx * 16;
116 // Replicate the source chunk to the destination position.
117 const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
118 // Clear the destination chunk.
119 Imm &= ~(0xFFFFLL << ShiftAmt);
120 // Insert the replicated chunk.
124 /// \brief Helper function which tries to materialize a 64-bit value with an
125 /// ORR + MOVK instruction sequence.
126 static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
127 MachineBasicBlock &MBB,
128 MachineBasicBlock::iterator &MBBI,
129 const AArch64InstrInfo *TII, unsigned ChunkIdx) {
130 assert(ChunkIdx < 4 && "Out of range chunk index specified!");
131 const unsigned ShiftAmt = ChunkIdx * 16;
134 if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
135 // Create the ORR-immediate instruction.
136 MachineInstrBuilder MIB =
137 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
138 .add(MI.getOperand(0))
139 .addReg(AArch64::XZR)
142 // Create the MOVK instruction.
143 const unsigned Imm16 = getChunk(UImm, ChunkIdx);
144 const unsigned DstReg = MI.getOperand(0).getReg();
145 const bool DstIsDead = MI.getOperand(0).isDead();
146 MachineInstrBuilder MIB1 =
147 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
148 .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
151 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
153 transferImpOps(MI, MIB, MIB1);
154 MI.eraseFromParent();
161 /// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
162 /// can be materialized with an ORR instruction.
163 static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
164 Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
166 return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
169 /// \brief Check for identical 16-bit chunks within the constant and if so
170 /// materialize them with a single ORR instruction. The remaining one or two
171 /// 16-bit chunks will be materialized with MOVK instructions.
173 /// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
174 /// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
175 /// an ORR instruction.
176 static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
177 MachineBasicBlock &MBB,
178 MachineBasicBlock::iterator &MBBI,
179 const AArch64InstrInfo *TII) {
180 using CountMap = DenseMap<uint64_t, unsigned>;
184 // Scan the constant and count how often every chunk occurs.
185 for (unsigned Idx = 0; Idx < 4; ++Idx)
186 ++Counts[getChunk(UImm, Idx)];
188 // Traverse the chunks to find one which occurs more than once.
189 for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
190 Chunk != End; ++Chunk) {
191 const uint64_t ChunkVal = Chunk->first;
192 const unsigned Count = Chunk->second;
194 uint64_t Encoding = 0;
196 // We are looking for chunks which have two or three instances and can be
197 // materialized with an ORR instruction.
198 if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
201 const bool CountThree = Count == 3;
202 // Create the ORR-immediate instruction.
203 MachineInstrBuilder MIB =
204 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
205 .add(MI.getOperand(0))
206 .addReg(AArch64::XZR)
209 const unsigned DstReg = MI.getOperand(0).getReg();
210 const bool DstIsDead = MI.getOperand(0).isDead();
212 unsigned ShiftAmt = 0;
214 // Find the first chunk not materialized with the ORR instruction.
215 for (; ShiftAmt < 64; ShiftAmt += 16) {
216 Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
218 if (Imm16 != ChunkVal)
222 // Create the first MOVK instruction.
223 MachineInstrBuilder MIB1 =
224 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
226 RegState::Define | getDeadRegState(DstIsDead && CountThree))
229 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
231 // In case we have three instances the whole constant is now materialized
234 transferImpOps(MI, MIB, MIB1);
235 MI.eraseFromParent();
239 // Find the remaining chunk which needs to be materialized.
240 for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
241 Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
243 if (Imm16 != ChunkVal)
247 // Create the second MOVK instruction.
248 MachineInstrBuilder MIB2 =
249 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
250 .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
253 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
255 transferImpOps(MI, MIB, MIB2);
256 MI.eraseFromParent();
263 /// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
264 /// starts a contiguous sequence of ones if we look at the bits from the LSB
266 static bool isStartChunk(uint64_t Chunk) {
267 if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
270 return isMask_64(~Chunk);
273 /// \brief Check whether this chunk matches the pattern '0...1...' This pattern
274 /// ends a contiguous sequence of ones if we look at the bits from the LSB
276 static bool isEndChunk(uint64_t Chunk) {
277 if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
280 return isMask_64(Chunk);
/// \brief Clear or set all bits in the chunk at the given index.
///
/// \param Imm   The 64-bit value to modify.
/// \param Idx   Index of the 16-bit chunk to update (0 = lowest chunk).
/// \param Clear If true the chunk becomes 0x0000, otherwise 0xFFFF.
/// \returns The updated value.
static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
  const uint64_t Mask = 0xFFFF;

  if (Clear)
    // Clear chunk in the immediate.
    Imm &= ~(Mask << (Idx * 16));
  else
    // Set all bits in the immediate for the particular chunk.
    Imm |= Mask << (Idx * 16);

  return Imm;
}
297 /// \brief Check whether the constant contains a sequence of contiguous ones,
298 /// which might be interrupted by one or two chunks. If so, materialize the
299 /// sequence of contiguous ones with an ORR instruction.
300 /// Materialize the chunks which are either interrupting the sequence or outside
301 /// of the sequence with a MOVK instruction.
303 /// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
304 /// which ends the sequence (0...1...). Then we are looking for constants which
305 /// contain at least one S and E chunk.
306 /// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
308 /// We are also looking for constants like |S|A|B|E| where the contiguous
309 /// sequence of ones wraps around the MSB into the LSB.
310 static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
311 MachineBasicBlock &MBB,
312 MachineBasicBlock::iterator &MBBI,
313 const AArch64InstrInfo *TII) {
314 const int NotSet = -1;
315 const uint64_t Mask = 0xFFFF;
317 int StartIdx = NotSet;
319 // Try to find the chunks which start/end a contiguous sequence of ones.
320 for (int Idx = 0; Idx < 4; ++Idx) {
321 int64_t Chunk = getChunk(UImm, Idx);
322 // Sign extend the 16-bit chunk to 64-bit.
323 Chunk = (Chunk << 48) >> 48;
325 if (isStartChunk(Chunk))
327 else if (isEndChunk(Chunk))
331 // Early exit in case we can't find a start/end chunk.
332 if (StartIdx == NotSet || EndIdx == NotSet)
335 // Outside of the contiguous sequence of ones everything needs to be zero.
336 uint64_t Outside = 0;
337 // Chunks between the start and end chunk need to have all their bits set.
338 uint64_t Inside = Mask;
340 // If our contiguous sequence of ones wraps around from the MSB into the LSB,
341 // just swap indices and pretend we are materializing a contiguous sequence
342 // of zeros surrounded by a contiguous sequence of ones.
343 if (StartIdx > EndIdx) {
344 std::swap(StartIdx, EndIdx);
345 std::swap(Outside, Inside);
348 uint64_t OrrImm = UImm;
349 int FirstMovkIdx = NotSet;
350 int SecondMovkIdx = NotSet;
352 // Find out which chunks we need to patch up to obtain a contiguous sequence
354 for (int Idx = 0; Idx < 4; ++Idx) {
355 const uint64_t Chunk = getChunk(UImm, Idx);
357 // Check whether we are looking at a chunk which is not part of the
358 // contiguous sequence of ones.
359 if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
360 OrrImm = updateImm(OrrImm, Idx, Outside == 0);
362 // Remember the index we need to patch.
363 if (FirstMovkIdx == NotSet)
368 // Check whether we are looking a chunk which is part of the contiguous
370 } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
371 OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
373 // Remember the index we need to patch.
374 if (FirstMovkIdx == NotSet)
380 assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
382 // Create the ORR-immediate instruction.
383 uint64_t Encoding = 0;
384 AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
385 MachineInstrBuilder MIB =
386 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
387 .add(MI.getOperand(0))
388 .addReg(AArch64::XZR)
391 const unsigned DstReg = MI.getOperand(0).getReg();
392 const bool DstIsDead = MI.getOperand(0).isDead();
394 const bool SingleMovk = SecondMovkIdx == NotSet;
395 // Create the first MOVK instruction.
396 MachineInstrBuilder MIB1 =
397 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
399 RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
401 .addImm(getChunk(UImm, FirstMovkIdx))
403 AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));
405 // Early exit in case we only need to emit a single MOVK instruction.
407 transferImpOps(MI, MIB, MIB1);
408 MI.eraseFromParent();
412 // Create the second MOVK instruction.
413 MachineInstrBuilder MIB2 =
414 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
415 .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
417 .addImm(getChunk(UImm, SecondMovkIdx))
419 AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));
421 transferImpOps(MI, MIB, MIB2);
422 MI.eraseFromParent();
426 /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
427 /// real move-immediate instructions to synthesize the immediate.
428 bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
429 MachineBasicBlock::iterator MBBI,
431 MachineInstr &MI = *MBBI;
432 unsigned DstReg = MI.getOperand(0).getReg();
433 uint64_t Imm = MI.getOperand(1).getImm();
434 const unsigned Mask = 0xFFFF;
436 if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
437 // Useless def, and we don't want to risk creating an invalid ORR (which
438 // would really write to sp).
439 MI.eraseFromParent();
443 // Try a MOVI instruction (aka ORR-immediate with the zero register).
444 uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
446 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
447 unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
448 MachineInstrBuilder MIB =
449 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
450 .add(MI.getOperand(0))
451 .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
453 transferImpOps(MI, MIB, MIB);
454 MI.eraseFromParent();
458 // Scan the immediate and count the number of 16-bit chunks which are either
459 // all ones or all zeros.
460 unsigned OneChunks = 0;
461 unsigned ZeroChunks = 0;
462 for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
463 const unsigned Chunk = (Imm >> Shift) & Mask;
470 // Since we can't materialize the constant with a single ORR instruction,
471 // let's see whether we can materialize 3/4 of the constant with an ORR
472 // instruction and use an additional MOVK instruction to materialize the
475 // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|.
477 // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR,
478 // we would create the following instruction sequence:
480 // ORR x0, xzr, |A|X|A|X|
481 // MOVK x0, |B|, LSL #16
483 // Only look at 64-bit constants which can't be materialized with a single
484 // instruction e.g. which have less than either three all zero or all one
487 // Ignore 32-bit constants here, they always can be materialized with a
488 // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
489 // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
490 // Thus we fall back to the default code below which in the best case creates
491 // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
493 if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
494 // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
496 if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
497 // See if we can come up with a constant which can be materialized with
498 // ORR-immediate by replicating element 3 into element 1.
499 uint64_t OrrImm = replicateChunk(UImm, 3, 1);
500 if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
503 // See if we can come up with a constant which can be materialized with
504 // ORR-immediate by replicating element 1 into element 3.
505 OrrImm = replicateChunk(UImm, 1, 3);
506 if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
509 // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
511 } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
512 // See if we can come up with a constant which can be materialized with
513 // ORR-immediate by replicating element 2 into element 0.
514 uint64_t OrrImm = replicateChunk(UImm, 2, 0);
515 if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
518 // See if we can come up with a constant which can be materialized with
519 // ORR-immediate by replicating element 1 into element 3.
520 OrrImm = replicateChunk(UImm, 0, 2);
521 if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
526 // Check for identical 16-bit chunks within the constant and if so materialize
527 // them with a single ORR instruction. The remaining one or two 16-bit chunks
528 // will be materialized with MOVK instructions.
529 if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
532 // Check whether the constant contains a sequence of contiguous ones, which
533 // might be interrupted by one or two chunks. If so, materialize the sequence
534 // of contiguous ones with an ORR instruction. Materialize the chunks which
535 // are either interrupting the sequence or outside of the sequence with a
537 if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
540 // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
541 // more MOVK instructions to insert additional 16-bit portions into the
545 // Use MOVN to materialize the high bits if we have more all one chunks
546 // than all zero chunks.
547 if (OneChunks > ZeroChunks) {
554 Imm &= (1LL << 32) - 1;
555 FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
557 FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
559 unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN
560 unsigned LastShift = 0; // LSL amount for last MOVK
562 unsigned LZ = countLeadingZeros(Imm);
563 unsigned TZ = countTrailingZeros(Imm);
564 Shift = (TZ / 16) * 16;
565 LastShift = ((63 - LZ) / 16) * 16;
567 unsigned Imm16 = (Imm >> Shift) & Mask;
568 bool DstIsDead = MI.getOperand(0).isDead();
569 MachineInstrBuilder MIB1 =
570 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
571 .addReg(DstReg, RegState::Define |
572 getDeadRegState(DstIsDead && Shift == LastShift))
574 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
576 // If a MOVN was used for the high bits of a negative value, flip the rest
577 // of the bits back for use with MOVK.
581 if (Shift == LastShift) {
582 transferImpOps(MI, MIB1, MIB1);
583 MI.eraseFromParent();
587 MachineInstrBuilder MIB2;
588 unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
589 while (Shift < LastShift) {
591 Imm16 = (Imm >> Shift) & Mask;
592 if (Imm16 == (isNeg ? Mask : 0))
593 continue; // This 16-bit portion is already set correctly.
594 MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
597 getDeadRegState(DstIsDead && Shift == LastShift))
600 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
603 transferImpOps(MI, MIB1, MIB2);
604 MI.eraseFromParent();
608 bool AArch64ExpandPseudo::expandCMP_SWAP(
609 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
610 unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
611 MachineBasicBlock::iterator &NextMBBI) {
612 MachineInstr &MI = *MBBI;
613 DebugLoc DL = MI.getDebugLoc();
614 const MachineOperand &Dest = MI.getOperand(0);
615 unsigned StatusReg = MI.getOperand(1).getReg();
616 bool StatusDead = MI.getOperand(1).isDead();
617 // Duplicating undef operands into 2 instructions does not guarantee the same
618 // value on both; However undef should be replaced by xzr anyway.
619 assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
620 unsigned AddrReg = MI.getOperand(2).getReg();
621 unsigned DesiredReg = MI.getOperand(3).getReg();
622 unsigned NewReg = MI.getOperand(4).getReg();
624 MachineFunction *MF = MBB.getParent();
625 auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
626 auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
627 auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
629 MF->insert(++MBB.getIterator(), LoadCmpBB);
630 MF->insert(++LoadCmpBB->getIterator(), StoreBB);
631 MF->insert(++StoreBB->getIterator(), DoneBB);
635 // ldaxr xDest, [xAddr]
636 // cmp xDest, xDesired
639 BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg)
640 .addImm(0).addImm(0);
641 BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
643 BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
644 .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
647 BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
648 .addImm(AArch64CC::NE)
650 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill);
651 LoadCmpBB->addSuccessor(DoneBB);
652 LoadCmpBB->addSuccessor(StoreBB);
655 // stlxr wStatus, xNew, [xAddr]
656 // cbnz wStatus, .Lloadcmp
657 BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
660 BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
661 .addReg(StatusReg, getKillRegState(StatusDead))
663 StoreBB->addSuccessor(LoadCmpBB);
664 StoreBB->addSuccessor(DoneBB);
666 DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
667 DoneBB->transferSuccessors(&MBB);
669 MBB.addSuccessor(LoadCmpBB);
671 NextMBBI = MBB.end();
672 MI.eraseFromParent();
674 // Recompute livein lists.
675 LivePhysRegs LiveRegs;
676 computeAndAddLiveIns(LiveRegs, *DoneBB);
677 computeAndAddLiveIns(LiveRegs, *StoreBB);
678 computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
679 // Do an extra pass around the loop to get loop carried registers right.
680 StoreBB->clearLiveIns();
681 computeAndAddLiveIns(LiveRegs, *StoreBB);
682 LoadCmpBB->clearLiveIns();
683 computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
688 bool AArch64ExpandPseudo::expandCMP_SWAP_128(
689 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
690 MachineBasicBlock::iterator &NextMBBI) {
691 MachineInstr &MI = *MBBI;
692 DebugLoc DL = MI.getDebugLoc();
693 MachineOperand &DestLo = MI.getOperand(0);
694 MachineOperand &DestHi = MI.getOperand(1);
695 unsigned StatusReg = MI.getOperand(2).getReg();
696 bool StatusDead = MI.getOperand(2).isDead();
697 // Duplicating undef operands into 2 instructions does not guarantee the same
698 // value on both; However undef should be replaced by xzr anyway.
699 assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
700 unsigned AddrReg = MI.getOperand(3).getReg();
701 unsigned DesiredLoReg = MI.getOperand(4).getReg();
702 unsigned DesiredHiReg = MI.getOperand(5).getReg();
703 unsigned NewLoReg = MI.getOperand(6).getReg();
704 unsigned NewHiReg = MI.getOperand(7).getReg();
706 MachineFunction *MF = MBB.getParent();
707 auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
708 auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
709 auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
711 MF->insert(++MBB.getIterator(), LoadCmpBB);
712 MF->insert(++LoadCmpBB->getIterator(), StoreBB);
713 MF->insert(++StoreBB->getIterator(), DoneBB);
716 // ldaxp xDestLo, xDestHi, [xAddr]
717 // cmp xDestLo, xDesiredLo
718 // sbcs xDestHi, xDesiredHi
720 BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
721 .addReg(DestLo.getReg(), RegState::Define)
722 .addReg(DestHi.getReg(), RegState::Define)
724 BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
725 .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
726 .addReg(DesiredLoReg)
728 BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
729 .addUse(AArch64::WZR)
730 .addUse(AArch64::WZR)
731 .addImm(AArch64CC::EQ);
732 BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
733 .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
734 .addReg(DesiredHiReg)
736 BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
737 .addUse(StatusReg, RegState::Kill)
738 .addUse(StatusReg, RegState::Kill)
739 .addImm(AArch64CC::EQ);
740 BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
741 .addUse(StatusReg, getKillRegState(StatusDead))
743 LoadCmpBB->addSuccessor(DoneBB);
744 LoadCmpBB->addSuccessor(StoreBB);
747 // stlxp wStatus, xNewLo, xNewHi, [xAddr]
748 // cbnz wStatus, .Lloadcmp
749 BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
753 BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
754 .addReg(StatusReg, getKillRegState(StatusDead))
756 StoreBB->addSuccessor(LoadCmpBB);
757 StoreBB->addSuccessor(DoneBB);
759 DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
760 DoneBB->transferSuccessors(&MBB);
762 MBB.addSuccessor(LoadCmpBB);
764 NextMBBI = MBB.end();
765 MI.eraseFromParent();
767 // Recompute liveness bottom up.
768 LivePhysRegs LiveRegs;
769 computeAndAddLiveIns(LiveRegs, *DoneBB);
770 computeAndAddLiveIns(LiveRegs, *StoreBB);
771 computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
772 // Do an extra pass in the loop to get the loop carried dependencies right.
773 StoreBB->clearLiveIns();
774 computeAndAddLiveIns(LiveRegs, *StoreBB);
775 LoadCmpBB->clearLiveIns();
776 computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
781 /// \brief If MBBI references a pseudo instruction that should be expanded here,
782 /// do the expansion and return true. Otherwise return false.
783 bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
784 MachineBasicBlock::iterator MBBI,
785 MachineBasicBlock::iterator &NextMBBI) {
786 MachineInstr &MI = *MBBI;
787 unsigned Opcode = MI.getOpcode();
792 case AArch64::ADDWrr:
793 case AArch64::SUBWrr:
794 case AArch64::ADDXrr:
795 case AArch64::SUBXrr:
796 case AArch64::ADDSWrr:
797 case AArch64::SUBSWrr:
798 case AArch64::ADDSXrr:
799 case AArch64::SUBSXrr:
800 case AArch64::ANDWrr:
801 case AArch64::ANDXrr:
802 case AArch64::BICWrr:
803 case AArch64::BICXrr:
804 case AArch64::ANDSWrr:
805 case AArch64::ANDSXrr:
806 case AArch64::BICSWrr:
807 case AArch64::BICSXrr:
808 case AArch64::EONWrr:
809 case AArch64::EONXrr:
810 case AArch64::EORWrr:
811 case AArch64::EORXrr:
812 case AArch64::ORNWrr:
813 case AArch64::ORNXrr:
814 case AArch64::ORRWrr:
815 case AArch64::ORRXrr: {
817 switch (MI.getOpcode()) {
820 case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break;
821 case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break;
822 case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break;
823 case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break;
824 case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break;
825 case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break;
826 case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break;
827 case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break;
828 case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break;
829 case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break;
830 case AArch64::BICWrr: Opcode = AArch64::BICWrs; break;
831 case AArch64::BICXrr: Opcode = AArch64::BICXrs; break;
832 case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break;
833 case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break;
834 case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break;
835 case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break;
836 case AArch64::EONWrr: Opcode = AArch64::EONWrs; break;
837 case AArch64::EONXrr: Opcode = AArch64::EONXrs; break;
838 case AArch64::EORWrr: Opcode = AArch64::EORWrs; break;
839 case AArch64::EORXrr: Opcode = AArch64::EORXrs; break;
840 case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break;
841 case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break;
842 case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break;
843 case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break;
845 MachineInstrBuilder MIB1 =
846 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
847 MI.getOperand(0).getReg())
848 .add(MI.getOperand(1))
849 .add(MI.getOperand(2))
850 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
851 transferImpOps(MI, MIB1, MIB1);
852 MI.eraseFromParent();
856 case AArch64::LOADgot: {
857 // Expand into ADRP + LDR.
858 unsigned DstReg = MI.getOperand(0).getReg();
859 const MachineOperand &MO1 = MI.getOperand(1);
860 unsigned Flags = MO1.getTargetFlags();
861 MachineInstrBuilder MIB1 =
862 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
863 MachineInstrBuilder MIB2 =
864 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
865 .add(MI.getOperand(0))
868 if (MO1.isGlobal()) {
869 MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
870 MIB2.addGlobalAddress(MO1.getGlobal(), 0,
871 Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
872 } else if (MO1.isSymbol()) {
873 MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE);
874 MIB2.addExternalSymbol(MO1.getSymbolName(),
875 Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
877 assert(MO1.isCPI() &&
878 "Only expect globals, externalsymbols, or constant pools");
879 MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
880 Flags | AArch64II::MO_PAGE);
881 MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
882 Flags | AArch64II::MO_PAGEOFF |
886 transferImpOps(MI, MIB1, MIB2);
887 MI.eraseFromParent();
891 case AArch64::MOVaddr:
892 case AArch64::MOVaddrJT:
893 case AArch64::MOVaddrCP:
894 case AArch64::MOVaddrBA:
895 case AArch64::MOVaddrTLS:
896 case AArch64::MOVaddrEXT: {
897 // Expand into ADRP + ADD.
898 unsigned DstReg = MI.getOperand(0).getReg();
899 MachineInstrBuilder MIB1 =
900 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
901 .add(MI.getOperand(1));
903 MachineInstrBuilder MIB2 =
904 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
905 .add(MI.getOperand(0))
907 .add(MI.getOperand(2))
910 transferImpOps(MI, MIB1, MIB2);
911 MI.eraseFromParent();
914 case AArch64::MOVbaseTLS: {
915 unsigned DstReg = MI.getOperand(0).getReg();
916 auto SysReg = AArch64SysReg::TPIDR_EL0;
917 MachineFunction *MF = MBB.getParent();
918 if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
919 MF->getTarget().getCodeModel() == CodeModel::Kernel)
920 SysReg = AArch64SysReg::TPIDR_EL1;
921 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
923 MI.eraseFromParent();
927 case AArch64::MOVi32imm:
928 return expandMOVImm(MBB, MBBI, 32);
929 case AArch64::MOVi64imm:
930 return expandMOVImm(MBB, MBBI, 64);
931 case AArch64::RET_ReallyLR: {
932 // Hiding the LR use with RET_ReallyLR may lead to extra kills in the
933 // function and missing live-ins. We are fine in practice because callee
934 // saved register handling ensures the register value is restored before
935 // RET, but we need the undef flag here to appease the MachineVerifier
937 MachineInstrBuilder MIB =
938 BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
939 .addReg(AArch64::LR, RegState::Undef);
940 transferImpOps(MI, MIB, MIB);
941 MI.eraseFromParent();
944 case AArch64::CMP_SWAP_8:
945 return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB,
947 AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0),
948 AArch64::WZR, NextMBBI);
949 case AArch64::CMP_SWAP_16:
950 return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH,
952 AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0),
953 AArch64::WZR, NextMBBI);
954 case AArch64::CMP_SWAP_32:
955 return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW,
957 AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
958 AArch64::WZR, NextMBBI);
959 case AArch64::CMP_SWAP_64:
960 return expandCMP_SWAP(MBB, MBBI,
961 AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs,
962 AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
963 AArch64::XZR, NextMBBI);
964 case AArch64::CMP_SWAP_128:
965 return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
967 case AArch64::AESMCrrTied:
968 case AArch64::AESIMCrrTied: {
969 MachineInstrBuilder MIB =
970 BuildMI(MBB, MBBI, MI.getDebugLoc(),
971 TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr :
973 .add(MI.getOperand(0))
974 .add(MI.getOperand(1));
975 transferImpOps(MI, MIB, MIB);
976 MI.eraseFromParent();
983 /// \brief Iterate over the instructions in basic block MBB and expand any
984 /// pseudo instructions. Return true if anything was modified.
985 bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
986 bool Modified = false;
988 MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
990 MachineBasicBlock::iterator NMBBI = std::next(MBBI);
991 Modified |= expandMI(MBB, MBBI, NMBBI);
998 bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
999 TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
1001 bool Modified = false;
1002 for (auto &MBB : MF)
1003 Modified |= expandMBB(MBB);
1007 /// \brief Returns an instance of the pseudo instruction expansion pass.
1008 FunctionPass *llvm::createAArch64ExpandPseudoPass() {
1009 return new AArch64ExpandPseudo();