contrib/llvm/patches/patch-08-llvm-r230348-arm-fix-bad-ha.diff

   1 Pull in r230348 from upstream llvm trunk (by Tim Northover):
   2
   3   ARM: treat [N x i32] and [N x i64] as AAPCS composite types
   4
   5   The logic is almost there already, with our special homogeneous
   6   aggregate handling. Tweaking it like this allows front-ends to emit
   7   AAPCS compliant code without ever having to count registers or add
   8   discarded padding arguments.
   9
  10   Only arrays of i32 and i64 are needed to model AAPCS rules, but I
  11   decided to apply the logic to all integer arrays for more consistency.
  12
  13 This fixes a possible "Unexpected member type for HA" error when
  14 compiling lib/msun/bsdsrc/b_tgamma.c for armv6.
  15
  16 Reported by:    Jakub Palider <jpa@semihalf.com>
  17
  18 Introduced here: https://svnweb.freebsd.org/changeset/base/280400
  19
  20 Index: include/llvm/CodeGen/CallingConvLower.h
  21 ===================================================================
  22 --- include/llvm/CodeGen/CallingConvLower.h
  23 +++ include/llvm/CodeGen/CallingConvLower.h
  24 @@ -122,8 +122,8 @@ class CCValAssign {
  25    // There is no need to differentiate between a pending CCValAssign and other
  26    // kinds, as they are stored in a different list.
  27    static CCValAssign getPending(unsigned ValNo, MVT ValVT, MVT LocVT,
  28 -                                LocInfo HTP) {
  29 -    return getReg(ValNo, ValVT, 0, LocVT, HTP);
  30 +                                LocInfo HTP, unsigned ExtraInfo = 0) {
  31 +    return getReg(ValNo, ValVT, ExtraInfo, LocVT, HTP);
  32    }
  33
  34    void convertToReg(unsigned RegNo) {
  35 @@ -146,6 +146,7 @@ class CCValAssign {
  36
  37    unsigned getLocReg() const { assert(isRegLoc()); return Loc; }
  38    unsigned getLocMemOffset() const { assert(isMemLoc()); return Loc; }
  39 +  unsigned getExtraInfo() const { return Loc; }
  40    MVT getLocVT() const { return LocVT; }
  41
  42    LocInfo getLocInfo() const { return HTP; }
  43 Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
  44 ===================================================================
  45 --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
  46 +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
  47 @@ -7429,11 +7429,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLo
  48        }
  49        if (Args[i].isNest)
  50          Flags.setNest();
  51 -      if (NeedsRegBlock) {
  52 +      if (NeedsRegBlock)
  53          Flags.setInConsecutiveRegs();
  54 -        if (Value == NumValues - 1)
  55 -          Flags.setInConsecutiveRegsLast();
  56 -      }
  57        Flags.setOrigAlign(OriginalAlignment);
  58
  59        MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
  60 @@ -7482,6 +7479,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLo
  61          CLI.Outs.push_back(MyFlags);
  62          CLI.OutVals.push_back(Parts[j]);
  63        }
  64 +
  65 +      if (NeedsRegBlock && Value == NumValues - 1)
  66 +        CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
  67      }
  68    }
  69
  70 @@ -7697,11 +7697,8 @@ void SelectionDAGISel::LowerArguments(const Functi
  71        }
  72        if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
  73          Flags.setNest();
  74 -      if (NeedsRegBlock) {
  75 +      if (NeedsRegBlock)
  76          Flags.setInConsecutiveRegs();
  77 -        if (Value == NumValues - 1)
  78 -          Flags.setInConsecutiveRegsLast();
  79 -      }
  80        Flags.setOrigAlign(OriginalAlignment);
  81
  82        MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
  83 @@ -7716,6 +7713,8 @@ void SelectionDAGISel::LowerArguments(const Functi
  84            MyFlags.Flags.setOrigAlign(1);
  85          Ins.push_back(MyFlags);
  86        }
  87 +      if (NeedsRegBlock && Value == NumValues - 1)
  88 +        Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
  89        PartBase += VT.getStoreSize();
  90      }
  91    }
  92 Index: lib/Target/ARM/ARMCallingConv.h
  93 ===================================================================
  94 --- lib/Target/ARM/ARMCallingConv.h
  95 +++ lib/Target/ARM/ARMCallingConv.h
  96 @@ -160,6 +160,8 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &V
  97                                     State);
  98  }
  99
 100 +static const uint16_t RRegList[] = { ARM::R0,  ARM::R1,  ARM::R2,  ARM::R3 };
 101 +
 102  static const uint16_t SRegList[] = { ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
 103                                       ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
 104                                       ARM::S8,  ARM::S9,  ARM::S10, ARM::S11,
 105 @@ -168,81 +170,114 @@ static const uint16_t DRegList[] = { ARM::D0, ARM:
 106                                       ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
 107  static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
 108
 109 +
 110  // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
 111  // has InConsecutiveRegs set, and that the last member also has
 112  // InConsecutiveRegsLast set. We must process all members of the HA before
 113  // we can allocate it, as we need to know the total number of registers that
 114  // will be needed in order to (attempt to) allocate a contiguous block.
 115 -static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
 116 -                                   CCValAssign::LocInfo &LocInfo,
 117 -                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
 118 -  SmallVectorImpl<CCValAssign> &PendingHAMembers = State.getPendingLocs();
 119 +static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
 120 +                                          MVT &LocVT,
 121 +                                          CCValAssign::LocInfo &LocInfo,
 122 +                                          ISD::ArgFlagsTy &ArgFlags,
 123 +                                          CCState &State) {
 124 +  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
 125
 126    // AAPCS HFAs must have 1-4 elements, all of the same type
 127 -  assert(PendingHAMembers.size() < 4);
 128 -  if (PendingHAMembers.size() > 0)
 129 -    assert(PendingHAMembers[0].getLocVT() == LocVT);
 130 +  if (PendingMembers.size() > 0)
 131 +    assert(PendingMembers[0].getLocVT() == LocVT);
 132
 133    // Add the argument to the list to be allocated once we know the size of the
 134 -  // HA
 135 -  PendingHAMembers.push_back(
 136 -      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
 137 +  // aggregate. Store the type's required alignment as extra info for later: in
 138 +  // the [N x i64] case all trace has been removed by the time we actually get
 139 +  // to do allocation.
 140 +  PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
 141 +                                                   ArgFlags.getOrigAlign()));
 142
 143 -  if (ArgFlags.isInConsecutiveRegsLast()) {
 144 -    assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 4 &&
 145 -           "Homogeneous aggregates must have between 1 and 4 members");
 146 +  if (!ArgFlags.isInConsecutiveRegsLast())
 147 +    return true;
 148
 149 -    // Try to allocate a contiguous block of registers, each of the correct
 150 -    // size to hold one member.
 151 -    ArrayRef<uint16_t> RegList;
 152 -    switch (LocVT.SimpleTy) {
 153 -    case MVT::f32:
 154 -      RegList = SRegList;
 155 -      break;
 156 -    case MVT::f64:
 157 -      RegList = DRegList;
 158 -      break;
 159 -    case MVT::v2f64:
 160 -      RegList = QRegList;
 161 -      break;
 162 -    default:
 163 -      llvm_unreachable("Unexpected member type for HA");
 164 -      break;
 165 -    }
 166 +  // Try to allocate a contiguous block of registers, each of the correct
 167 +  // size to hold one member.
 168 +  unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U);
 169
 170 -    unsigned RegResult =
 171 -        State.AllocateRegBlock(RegList, PendingHAMembers.size());
 172 +  ArrayRef<uint16_t> RegList;
 173 +  switch (LocVT.SimpleTy) {
 174 +  case MVT::i32: {
 175 +    RegList = RRegList;
 176 +    unsigned RegIdx = State.getFirstUnallocated(RegList.data(), RegList.size());
 177
 178 -    if (RegResult) {
 179 -      for (SmallVectorImpl<CCValAssign>::iterator It = PendingHAMembers.begin();
 180 -           It != PendingHAMembers.end(); ++It) {
 181 -        It->convertToReg(RegResult);
 182 -        State.addLoc(*It);
 183 -        ++RegResult;
 184 -      }
 185 -      PendingHAMembers.clear();
 186 -      return true;
 187 -    }
 188 +    // First consume all registers that would give an unaligned object. Whether
 189 +    // we go on stack or in regs, no-one will be using them in future.
 190 +    unsigned RegAlign = RoundUpToAlignment(Align, 4) / 4;
 191 +    while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
 192 +      State.AllocateReg(RegList[RegIdx++]);
 193
 194 -    // Register allocation failed, fall back to the stack
 195 +    break;
 196 +  }
 197 +  case MVT::f32:
 198 +    RegList = SRegList;
 199 +    break;
 200 +  case MVT::f64:
 201 +    RegList = DRegList;
 202 +    break;
 203 +  case MVT::v2f64:
 204 +    RegList = QRegList;
 205 +    break;
 206 +  default:
 207 +    llvm_unreachable("Unexpected member type for block aggregate");
 208 +    break;
 209 +  }
 210
 211 -    // Mark all VFP regs as unavailable (AAPCS rule C.2.vfp)
 212 -    for (unsigned regNo = 0; regNo < 16; ++regNo)
 213 -      State.AllocateReg(SRegList[regNo]);
 214 +  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
 215 +  if (RegResult) {
 216 +    for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin();
 217 +         It != PendingMembers.end(); ++It) {
 218 +      It->convertToReg(RegResult);
 219 +      State.addLoc(*It);
 220 +      ++RegResult;
 221 +    }
 222 +    PendingMembers.clear();
 223 +    return true;
 224 +  }
 225
 226 -    unsigned Size = LocVT.getSizeInBits() / 8;
 227 -    unsigned Align = std::min(Size, 8U);
 228 +  // Register allocation failed, we'll be needing the stack
 229 +  unsigned Size = LocVT.getSizeInBits() / 8;
 230 +  if (LocVT == MVT::i32 && State.getNextStackOffset() == 0) {
 231 +    // If nothing else has used the stack until this point, a non-HFA aggregate
 232 +    // can be split between regs and stack.
 233 +    unsigned RegIdx = State.getFirstUnallocated(RegList.data(), RegList.size());
 234 +    for (auto &It : PendingMembers) {
 235 +      if (RegIdx >= RegList.size())
 236 +        It.convertToMem(State.AllocateStack(Size, Size));
 237 +      else
 238 +        It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
 239
 240 -    for (auto It : PendingHAMembers) {
 241 -      It.convertToMem(State.AllocateStack(Size, Align));
 242        State.addLoc(It);
 243      }
 244 +    PendingMembers.clear();
 245 +    return true;
 246 +  } else if (LocVT != MVT::i32)
 247 +    RegList = SRegList;
 248
 249 -    // All pending members have now been allocated
 250 -    PendingHAMembers.clear();
 251 +  // Mark all regs as unavailable (AAPCS rule C.2.vfp for VFP, C.6 for core)
 252 +  for (auto Reg : RegList)
 253 +    State.AllocateReg(Reg);
 254 +
 255 +  for (auto &It : PendingMembers) {
 256 +    It.convertToMem(State.AllocateStack(Size, Align));
 257 +    State.addLoc(It);
 258 +
 259 +    // After the first item has been allocated, the rest are packed as tightly
 260 +    // as possible. (E.g. an incoming i64 would have starting Align of 8, but
 261 +    // we'll be allocating a bunch of i32 slots).
 262 +    Align = Size;
 263    }
 264
 265 -  // This will be allocated by the last member of the HA
 266 +  // All pending members have now been allocated
 267 +  PendingMembers.clear();
 268 +
 269 +  // This will be allocated by the last member of the aggregate
 270    return true;
 271  }
 272
 273 Index: lib/Target/ARM/ARMCallingConv.td
 274 ===================================================================
 275 --- lib/Target/ARM/ARMCallingConv.td
 276 +++ lib/Target/ARM/ARMCallingConv.td
 277 @@ -175,7 +175,7 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
 278    CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 279
 280    // HFAs are passed in a contiguous block of registers, or on the stack
 281 -  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_HA">>,
 282 +  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
 283
 284    CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
 285    CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
 286 Index: lib/Target/ARM/ARMISelLowering.cpp
 287 ===================================================================
 288 --- lib/Target/ARM/ARMISelLowering.cpp
 289 +++ lib/Target/ARM/ARMISelLowering.cpp
 290 @@ -11285,7 +11285,9 @@ static bool isHomogeneousAggregate(Type *Ty, HABas
 291    return (Members > 0 && Members <= 4);
 292  }
 293
 294 -/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate.
 295 +/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
 296 +/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
 297 +/// passing according to AAPCS rules.
 298  bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
 299      Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
 300    if (getEffectiveCallingConv(CallConv, isVarArg) !=
 301 @@ -11294,7 +11296,9 @@ bool ARMTargetLowering::functionArgumentNeedsConse
 302
 303    HABaseType Base = HA_UNKNOWN;
 304    uint64_t Members = 0;
 305 -  bool result = isHomogeneousAggregate(Ty, Base, Members);
 306 -  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump());
 307 -  return result;
 308 +  bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
 309 +  DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
 310 +
 311 +  bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
 312 +  return IsHA || IsIntArray;
 313  }
 314 Index: test/CodeGen/ARM/aggregate-padding.ll
 315 ===================================================================
 316 --- test/CodeGen/ARM/aggregate-padding.ll
 317 +++ test/CodeGen/ARM/aggregate-padding.ll
 318 @@ -0,0 +1,101 @@
 319 +; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s
 320 +
 321 +; [2 x i64] should be contiguous when split (e.g. we shouldn't try to align all
 322 +; i32 components to 64 bits). Also makes sure i64 based types are properly
 323 +; aligned on the stack.
 324 +define i64 @test_i64_contiguous_on_stack([8 x double], float, i32 %in, [2 x i64] %arg) nounwind {
 325 +; CHECK-LABEL: test_i64_contiguous_on_stack:
 326 +; CHECK-DAG: ldr [[LO0:r[0-9]+]], [sp, #8]
 327 +; CHECK-DAG: ldr [[HI0:r[0-9]+]], [sp, #12]
 328 +; CHECK-DAG: ldr [[LO1:r[0-9]+]], [sp, #16]
 329 +; CHECK-DAG: ldr [[HI1:r[0-9]+]], [sp, #20]
 330 +; CHECK: adds r0, [[LO0]], [[LO1]]
 331 +; CHECK: adc r1, [[HI0]], [[HI1]]
 332 +
 333 +  %val1 = extractvalue [2 x i64] %arg, 0
 334 +  %val2 = extractvalue [2 x i64] %arg, 1
 335 +  %sum = add i64 %val1, %val2
 336 +  ret i64 %sum
 337 +}
 338 +
 339 +; [2 x i64] should look for 4 regs, not 8 (which might happen if the
 340 +; i64 -> i32, i32 split wasn't handled correctly).
 341 +define i64 @test_2xi64_uses_4_regs([8 x double], float, [2 x i64] %arg) nounwind {
 342 +; CHECK-LABEL: test_2xi64_uses_4_regs:
 343 +; CHECK-DAG: mov r0, r2
 344 +; CHECK-DAG: mov r1, r3
 345 +
 346 +  %val = extractvalue [2 x i64] %arg, 1
 347 +  ret i64 %val
 348 +}
 349 +
 350 +; An aggregate should be able to split between registers and stack if there is
 351 +; nothing else on the stack.
 352 +define i32 @test_aggregates_split([8 x double], i32, [4 x i32] %arg) nounwind {
 353 +; CHECK-LABEL: test_aggregates_split:
 354 +; CHECK: ldr [[VAL3:r[0-9]+]], [sp]
 355 +; CHECK: add r0, r1, [[VAL3]]
 356 +
 357 +  %val0 = extractvalue [4 x i32] %arg, 0
 358 +  %val3 = extractvalue [4 x i32] %arg, 3
 359 +  %sum = add i32 %val0, %val3
 360 +  ret i32 %sum
 361 +}
 362 +
 363 +; If an aggregate has to be moved entirely onto the stack, nothing should be
 364 +; able to use r0-r3 any more. Also checks that [2 x i64] is properly aligned when
 365 +; it uses regs.
 366 +define i32 @test_no_int_backfilling([8 x double], float, i32, [2 x i64], i32 %arg) nounwind {
 367 +; CHECK-LABEL: test_no_int_backfilling:
 368 +; CHECK: ldr r0, [sp, #24]
 369 +  ret i32 %arg
 370 +}
 371 +
 372 +; Even if the argument was successfully allocated as reg block, there should be
 373 +; no backfilling to r1.
 374 +define i32 @test_no_int_backfilling_regsonly(i32, [1 x i64], i32 %arg) {
 375 +; CHECK-LABEL: test_no_int_backfilling_regsonly:
 376 +; CHECK: ldr r0, [sp]
 377 +  ret i32 %arg
 378 +}
 379 +
 380 +; If an aggregate has to be moved entirely onto the stack, nothing should be
 381 +; able to use the VFP registers (s0-s15) any more.
 382 +define float @test_no_float_backfilling([7 x double], [4 x i32], i32, [4 x double], float %arg) nounwind {
 383 +; CHECK-LABEL: test_no_float_backfilling:
 384 +; CHECK: vldr s0, [sp, #40]
 385 +  ret float %arg
 386 +}
 387 +
 388 +; They're a bit pointless, but types like [N x i8] should work as well.
 389 +define i8 @test_i8_in_regs(i32, [3 x i8] %arg) {
 390 +; CHECK-LABEL: test_i8_in_regs:
 391 +; CHECK: add r0, r1, r3
 392 +  %val0 = extractvalue [3 x i8] %arg, 0
 393 +  %val2 = extractvalue [3 x i8] %arg, 2
 394 +  %sum = add i8 %val0, %val2
 395 +  ret i8 %sum
 396 +}
 397 +
 398 +define i16 @test_i16_split(i32, i32, [3 x i16] %arg) {
 399 +; CHECK-LABEL: test_i16_split:
 400 +; CHECK: ldrh [[VAL2:r[0-9]+]], [sp]
 401 +; CHECK: add r0, r2, [[VAL2]]
 402 +  %val0 = extractvalue [3 x i16] %arg, 0
 403 +  %val2 = extractvalue [3 x i16] %arg, 2
 404 +  %sum = add i16 %val0, %val2
 405 +  ret i16 %sum
 406 +}
 407 +
 408 +; Beware: on the stack each i16 still gets a 32-bit slot, the array is not
 409 +; packed.
 410 +define i16 @test_i16_forced_stack([8 x double], double, i32, i32, [3 x i16] %arg) {
 411 +; CHECK-LABEL: test_i16_forced_stack:
 412 +; CHECK-DAG: ldrh [[VAL0:r[0-9]+]], [sp, #8]
 413 +; CHECK-DAG: ldrh [[VAL2:r[0-9]+]], [sp, #16]
 414 +; CHECK: add r0, [[VAL0]], [[VAL2]]
 415 +  %val0 = extractvalue [3 x i16] %arg, 0
 416 +  %val2 = extractvalue [3 x i16] %arg, 2
 417 +  %sum = add i16 %val0, %val2
 418 +  ret i16 %sum
 419 +}