contrib/llvm/lib/Target/CellSPU/SPUISelLowering.cpp

   1 //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
   2 //                     The LLVM Compiler Infrastructure
   3 //
   4 // This file is distributed under the University of Illinois Open Source
   5 // License. See LICENSE.TXT for details.
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file implements the SPUTargetLowering class.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "SPUISelLowering.h"
  14 #include "SPUTargetMachine.h"
  15 #include "SPUFrameLowering.h"
  16 #include "SPUMachineFunction.h"
  17 #include "llvm/Constants.h"
  18 #include "llvm/Function.h"
  19 #include "llvm/Intrinsics.h"
  20 #include "llvm/CallingConv.h"
  21 #include "llvm/Type.h"
  22 #include "llvm/CodeGen/CallingConvLower.h"
  23 #include "llvm/CodeGen/MachineFrameInfo.h"
  24 #include "llvm/CodeGen/MachineFunction.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  29 #include "llvm/Target/TargetOptions.h"
  30 #include "llvm/ADT/VectorExtras.h"
  31 #include "llvm/Support/Debug.h"
  32 #include "llvm/Support/ErrorHandling.h"
  33 #include "llvm/Support/MathExtras.h"
  34 #include "llvm/Support/raw_ostream.h"
  35 #include <map>
  36
  37 using namespace llvm;
  38
  39 // Used in getTargetNodeName() below
  40 namespace {
  41   std::map<unsigned, const char *> node_names;
  42
  43   // Byte offset of the preferred slot (counted from the MSB)
  44   int prefslotOffset(EVT VT) {
  45     int retval=0;
  46     if (VT==MVT::i1) retval=3;
  47     if (VT==MVT::i8) retval=3;
  48     if (VT==MVT::i16) retval=2;
  49
  50     return retval;
  51   }
  52
  53   //! Expand a library call into an actual call DAG node
  54   /*!
  55    \note
  56    This code is taken from SelectionDAGLegalize, since it is not exposed as
  57    part of the LLVM SelectionDAG API.
  58    */
  59
  60   SDValue
  61   ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
  62                 bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
  63     // The input chain to this libcall is the entry node of the function.
  64     // Legalizing the call will automatically add the previous call to the
  65     // dependence.
  66     SDValue InChain = DAG.getEntryNode();
  67
  68     TargetLowering::ArgListTy Args;
  69     TargetLowering::ArgListEntry Entry;
  70     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
  71       EVT ArgVT = Op.getOperand(i).getValueType();
  72       const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  73       Entry.Node = Op.getOperand(i);
  74       Entry.Ty = ArgTy;
  75       Entry.isSExt = isSigned;
  76       Entry.isZExt = !isSigned;
  77       Args.push_back(Entry);
  78     }
  79     SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
  80                                            TLI.getPointerTy());
  81
  82     // Splice the libcall in wherever FindInputOutputChains tells us to.
  83     const Type *RetTy =
  84                 Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
  85     std::pair<SDValue, SDValue> CallInfo =
  86             TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
  87                             0, TLI.getLibcallCallingConv(LC), false,
  88                             /*isReturnValueUsed=*/true,
  89                             Callee, Args, DAG, Op.getDebugLoc());
  90
  91     return CallInfo.first;
  92   }
  93 }
  94
  95 SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  96   : TargetLowering(TM, new TargetLoweringObjectFileELF()),
  97     SPUTM(TM) {
  98
  99   // Use _setjmp/_longjmp instead of setjmp/longjmp.
 100   setUseUnderscoreSetJmp(true);
 101   setUseUnderscoreLongJmp(true);
 102
 103   // Set RTLIB libcall names as used by SPU:
 104   setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
 105
 106   // Set up the SPU's register classes:
 107   addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
 108   addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
 109   addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
 110   addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
 111   addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
 112   addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
 113   addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
 114
 115   // SPU has no sign or zero extended loads for i1, i8, i16:
 116   setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
 117   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
 118   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
 119
 120   setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
 121   setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
 122
 123   setTruncStoreAction(MVT::i128, MVT::i64, Expand);
 124   setTruncStoreAction(MVT::i128, MVT::i32, Expand);
 125   setTruncStoreAction(MVT::i128, MVT::i16, Expand);
 126   setTruncStoreAction(MVT::i128, MVT::i8, Expand);
 127
 128   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 129
 130   // SPU constant load actions are custom lowered:
 131   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 132   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 133
 134   // SPU's loads and stores have to be custom lowered:
 135   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
 136        ++sctype) {
 137     MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
 138
 139     setOperationAction(ISD::LOAD,   VT, Custom);
 140     setOperationAction(ISD::STORE,  VT, Custom);
 141     setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
 142     setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
 143     setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
 144
 145     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
 146       MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
 147       setTruncStoreAction(VT, StoreVT, Expand);
 148     }
 149   }
 150
 151   for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
 152        ++sctype) {
 153     MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;
 154
 155     setOperationAction(ISD::LOAD,   VT, Custom);
 156     setOperationAction(ISD::STORE,  VT, Custom);
 157
 158     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
 159       MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
 160       setTruncStoreAction(VT, StoreVT, Expand);
 161     }
 162   }
 163
 164   // Expand the jumptable branches
 165   setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
 166   setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
 167
 168   // Custom lower SELECT_CC for most cases, but expand by default
 169   setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
 170   setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
 171   setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
 172   setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
 173   setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
 174
 175   // SPU has no intrinsics for these particular operations:
 176   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
 177
 178   // SPU has no division/remainder instructions
 179   setOperationAction(ISD::SREM,    MVT::i8,   Expand);
 180   setOperationAction(ISD::UREM,    MVT::i8,   Expand);
 181   setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
 182   setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
 183   setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
 184   setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
 185   setOperationAction(ISD::SREM,    MVT::i16,  Expand);
 186   setOperationAction(ISD::UREM,    MVT::i16,  Expand);
 187   setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
 188   setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
 189   setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
 190   setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
 191   setOperationAction(ISD::SREM,    MVT::i32,  Expand);
 192   setOperationAction(ISD::UREM,    MVT::i32,  Expand);
 193   setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
 194   setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
 195   setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
 196   setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
 197   setOperationAction(ISD::SREM,    MVT::i64,  Expand);
 198   setOperationAction(ISD::UREM,    MVT::i64,  Expand);
 199   setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
 200   setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
 201   setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
 202   setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
 203   setOperationAction(ISD::SREM,    MVT::i128, Expand);
 204   setOperationAction(ISD::UREM,    MVT::i128, Expand);
 205   setOperationAction(ISD::SDIV,    MVT::i128, Expand);
 206   setOperationAction(ISD::UDIV,    MVT::i128, Expand);
 207   setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
 208   setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
 209
 210   // We don't support sin/cos/sqrt/fmod
 211   setOperationAction(ISD::FSIN , MVT::f64, Expand);
 212   setOperationAction(ISD::FCOS , MVT::f64, Expand);
 213   setOperationAction(ISD::FREM , MVT::f64, Expand);
 214   setOperationAction(ISD::FSIN , MVT::f32, Expand);
 215   setOperationAction(ISD::FCOS , MVT::f32, Expand);
 216   setOperationAction(ISD::FREM , MVT::f32, Expand);
 217
 218   // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
 219   // for f32!)
 220   setOperationAction(ISD::FSQRT, MVT::f64, Expand);
 221   setOperationAction(ISD::FSQRT, MVT::f32, Expand);
 222
 223   setOperationAction(ISD::FMA, MVT::f64, Expand);
 224   setOperationAction(ISD::FMA, MVT::f32, Expand);
 225
 226   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 227   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 228
 229   // SPU can do rotate right and left, so legalize it... but customize for i8
 230   // because instructions don't exist.
 231
 232   // FIXME: Change from "expand" to appropriate type once ROTR is supported in
 233   //        .td files.
 234   setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
 235   setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
 236   setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);
 237
 238   setOperationAction(ISD::ROTL, MVT::i32,    Legal);
 239   setOperationAction(ISD::ROTL, MVT::i16,    Legal);
 240   setOperationAction(ISD::ROTL, MVT::i8,     Custom);
 241
 242   // SPU has no native version of shift left/right for i8
 243   setOperationAction(ISD::SHL,  MVT::i8,     Custom);
 244   setOperationAction(ISD::SRL,  MVT::i8,     Custom);
 245   setOperationAction(ISD::SRA,  MVT::i8,     Custom);
 246
 247   // Make these operations legal and handle them during instruction selection:
 248   setOperationAction(ISD::SHL,  MVT::i64,    Legal);
 249   setOperationAction(ISD::SRL,  MVT::i64,    Legal);
 250   setOperationAction(ISD::SRA,  MVT::i64,    Legal);
 251
 252   // Custom lower i8, i32 and i64 multiplications
 253   setOperationAction(ISD::MUL,  MVT::i8,     Custom);
 254   setOperationAction(ISD::MUL,  MVT::i32,    Legal);
 255   setOperationAction(ISD::MUL,  MVT::i64,    Legal);
 256
 257   // Expand double-width multiplication
 258   // FIXME: It would probably be reasonable to support some of these operations
 259   setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
 260   setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
 261   setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
 262   setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
 263   setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
 264   setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
 265   setOperationAction(ISD::MULHU,     MVT::i16, Expand);
 266   setOperationAction(ISD::MULHS,     MVT::i16, Expand);
 267   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
 268   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 269   setOperationAction(ISD::MULHU,     MVT::i32, Expand);
 270   setOperationAction(ISD::MULHS,     MVT::i32, Expand);
 271   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
 272   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 273   setOperationAction(ISD::MULHU,     MVT::i64, Expand);
 274   setOperationAction(ISD::MULHS,     MVT::i64, Expand);
 275
 276   // Need to custom handle (some) common i8, i64 math ops
 277   setOperationAction(ISD::ADD,  MVT::i8,     Custom);
 278   setOperationAction(ISD::ADD,  MVT::i64,    Legal);
 279   setOperationAction(ISD::SUB,  MVT::i8,     Custom);
 280   setOperationAction(ISD::SUB,  MVT::i64,    Legal);
 281
 282   // SPU does not have BSWAP. It does have i32 support CTLZ.
 283   // CTPOP has to be custom lowered.
 284   setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
 285   setOperationAction(ISD::BSWAP, MVT::i64,   Expand);
 286
 287   setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
 288   setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
 289   setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
 290   setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
 291   setOperationAction(ISD::CTPOP, MVT::i128,  Expand);
 292
 293   setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
 294   setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
 295   setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
 296   setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
 297   setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
 298
 299   setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
 300   setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
 301   setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
 302   setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
 303   setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
 304
 305   // SPU has a version of select that implements (a&~c)|(b&c), just like
 306   // select ought to work:
 307   setOperationAction(ISD::SELECT, MVT::i8,   Legal);
 308   setOperationAction(ISD::SELECT, MVT::i16,  Legal);
 309   setOperationAction(ISD::SELECT, MVT::i32,  Legal);
 310   setOperationAction(ISD::SELECT, MVT::i64,  Legal);
 311
 312   setOperationAction(ISD::SETCC, MVT::i8,    Legal);
 313   setOperationAction(ISD::SETCC, MVT::i16,   Legal);
 314   setOperationAction(ISD::SETCC, MVT::i32,   Legal);
 315   setOperationAction(ISD::SETCC, MVT::i64,   Legal);
 316   setOperationAction(ISD::SETCC, MVT::f64,   Custom);
 317
 318   // Custom lower i128 -> i64 truncates
 319   setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
 320
 321   // Custom lower i32/i64 -> i128 sign extend
 322   setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);
 323
 324   setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
 325   setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
 326   setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
 327   setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
 328   // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
 329   // to expand to a libcall, hence the custom lowering:
 330   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 331   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 332   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
 333   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
 334   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
 335   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);
 336
 337   // FDIV on SPU requires custom lowering
 338   setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall
 339
 340   // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
 341   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 342   setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
 343   setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
 344   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 345   setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
 346   setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
 347   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 348   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 349
 350   setOperationAction(ISD::BITCAST, MVT::i32, Legal);
 351   setOperationAction(ISD::BITCAST, MVT::f32, Legal);
 352   setOperationAction(ISD::BITCAST, MVT::i64, Legal);
 353   setOperationAction(ISD::BITCAST, MVT::f64, Legal);
 354
 355   // We cannot sextinreg(i1).  Expand to shifts.
 356   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 357
 358   // We want to legalize GlobalAddress and ConstantPool nodes into the
 359   // appropriate instructions to materialize the address.
 360   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
 361        ++sctype) {
 362     MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
 363
 364     setOperationAction(ISD::GlobalAddress,  VT, Custom);
 365     setOperationAction(ISD::ConstantPool,   VT, Custom);
 366     setOperationAction(ISD::JumpTable,      VT, Custom);
 367   }
 368
 369   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 370   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 371
 372   // Use the default implementation.
 373   setOperationAction(ISD::VAARG             , MVT::Other, Expand);
 374   setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
 375   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 376   setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
 377   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
 378   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
 379   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);
 380
 381   // Cell SPU has instructions for converting between i64 and fp.
 382   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 383   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 384
 385   // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
 386   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
 387
 388   // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
 389   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
 390
 391   // First set operation action for all vector types to expand. Then we
 392   // will selectively turn on ones that can be effectively codegen'd.
 393   addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
 394   addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
 395   addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
 396   addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
 397   addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
 398   addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
 399
 400   for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
 401        i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
 402     MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
 403
 404     // add/sub are legal for all supported vector VT's.
 405     setOperationAction(ISD::ADD,     VT, Legal);
 406     setOperationAction(ISD::SUB,     VT, Legal);
 407     // mul has to be custom lowered.
 408     setOperationAction(ISD::MUL,     VT, Legal);
 409
 410     setOperationAction(ISD::AND,     VT, Legal);
 411     setOperationAction(ISD::OR,      VT, Legal);
 412     setOperationAction(ISD::XOR,     VT, Legal);
 413     setOperationAction(ISD::LOAD,    VT, Custom);
 414     setOperationAction(ISD::SELECT,  VT, Legal);
 415     setOperationAction(ISD::STORE,   VT, Custom);
 416
 417     // These operations need to be expanded:
 418     setOperationAction(ISD::SDIV,    VT, Expand);
 419     setOperationAction(ISD::SREM,    VT, Expand);
 420     setOperationAction(ISD::UDIV,    VT, Expand);
 421     setOperationAction(ISD::UREM,    VT, Expand);
 422
 423     // Custom lower build_vector, constant pool spills, insert and
 424     // extract vector elements:
 425     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 426     setOperationAction(ISD::ConstantPool, VT, Custom);
 427     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
 428     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 429     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 430     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 431   }
 432
 433   setOperationAction(ISD::AND, MVT::v16i8, Custom);
 434   setOperationAction(ISD::OR,  MVT::v16i8, Custom);
 435   setOperationAction(ISD::XOR, MVT::v16i8, Custom);
 436   setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
 437
 438   setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
 439
 440   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 441
 442   setStackPointerRegisterToSaveRestore(SPU::R1);
 443
 444   // We have target-specific dag combine patterns for the following nodes:
 445   setTargetDAGCombine(ISD::ADD);
 446   setTargetDAGCombine(ISD::ZERO_EXTEND);
 447   setTargetDAGCombine(ISD::SIGN_EXTEND);
 448   setTargetDAGCombine(ISD::ANY_EXTEND);
 449
 450   setMinFunctionAlignment(3);
 451
 452   computeRegisterProperties();
 453
 454   // Set pre-RA register scheduler default to BURR, which produces slightly
 455   // better code than the default (could also be TDRR, but TargetLowering.h
 456   // needs a mod to support that model):
 457   setSchedulingPreference(Sched::RegPressure);
 458 }
 459
 460 const char *
 461 SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
 462 {
 463   if (node_names.empty()) {
 464     node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
 465     node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
 466     node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
 467     node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
 468     node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
 469     node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
 470     node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
 471     node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
 472     node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
 473     node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
 474     node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
 475     node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
 476     node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
 477     node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
 478     node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
 479     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
 480     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
 481     node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
 482     node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
 483             "SPUISD::ROTBYTES_LEFT_BITS";
 484     node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
 485     node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
 486     node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
 487     node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
 488     node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
 489   }
 490
 491   std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
 492
 493   return ((i != node_names.end()) ? i->second : 0);
 494 }
 495
 496 //===----------------------------------------------------------------------===//
 497 // Return the Cell SPU's SETCC result type
 498 //===----------------------------------------------------------------------===//
 499
 500 MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
 501   // i8, i16 and i32 are valid SETCC result types
 502   MVT::SimpleValueType retval;
 503
 504   switch(VT.getSimpleVT().SimpleTy){
 505     case MVT::i1:
 506     case MVT::i8:
 507       retval = MVT::i8; break;
 508     case MVT::i16:
 509       retval = MVT::i16; break;
 510     case MVT::i32:
 511     default:
 512       retval = MVT::i32;
 513   }
 514   return retval;
 515 }
 516
 517 //===----------------------------------------------------------------------===//
 518 // Calling convention code:
 519 //===----------------------------------------------------------------------===//
 520
 521 #include "SPUGenCallingConv.inc"
 522
 523 //===----------------------------------------------------------------------===//
 524 //  LowerOperation implementation
 525 //===----------------------------------------------------------------------===//
 526
 527 /// Custom lower loads for CellSPU
 528 /*!
 529  All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 530  within a 16-byte block, we have to rotate to extract the requested element.
 531
 532  For extending loads, we also want to ensure that the following sequence is
 533  emitted, e.g. for MVT::f32 extending load to MVT::f64:
 534
 535 \verbatim
 536 %1  v16i8,ch = load
 537 %2  v16i8,ch = rotate %1
  538 %3  v4f32,ch = bitconvert %2
  539 %4  f32      = vec2prefslot %3
 540 %5  f64      = fp_extend %4
 541 \endverbatim
 542 */
 543 static SDValue
 544 LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 545   LoadSDNode *LN = cast<LoadSDNode>(Op);
 546   SDValue the_chain = LN->getChain();
 547   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 548   EVT InVT = LN->getMemoryVT();
 549   EVT OutVT = Op.getValueType();
 550   ISD::LoadExtType ExtType = LN->getExtensionType();
 551   unsigned alignment = LN->getAlignment();
 552   int pso = prefslotOffset(InVT);
 553   DebugLoc dl = Op.getDebugLoc();
 554   EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
 555                                                   (128 / InVT.getSizeInBits()));
 556
 557   // two sanity checks
 558   assert( LN->getAddressingMode() == ISD::UNINDEXED
 559           && "we should get only UNINDEXED adresses");
 560   // clean aligned loads can be selected as-is
 561   if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
 562     return SDValue();
 563
 564   // Get pointerinfos to the memory chunk(s) that contain the data to load
 565   uint64_t mpi_offset = LN->getPointerInfo().Offset;
 566   mpi_offset -= mpi_offset%16;
 567   MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
 568   MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
 569
 570   SDValue result;
 571   SDValue basePtr = LN->getBasePtr();
 572   SDValue rotate;
 573
 574   if ((alignment%16) == 0) {
 575     ConstantSDNode *CN;
 576
 577     // Special cases for a known aligned load to simplify the base pointer
 578     // and the rotation amount:
 579     if (basePtr.getOpcode() == ISD::ADD
 580         && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
 581       // Known offset into basePtr
 582       int64_t offset = CN->getSExtValue();
 583       int64_t rotamt = int64_t((offset & 0xf) - pso);
 584
 585       if (rotamt < 0)
 586         rotamt += 16;
 587
 588       rotate = DAG.getConstant(rotamt, MVT::i16);
 589
 590       // Simplify the base pointer for this case:
 591       basePtr = basePtr.getOperand(0);
 592       if ((offset & ~0xf) > 0) {
 593         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 594                               basePtr,
 595                               DAG.getConstant((offset & ~0xf), PtrVT));
 596       }
 597     } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
 598                || (basePtr.getOpcode() == SPUISD::IndirectAddr
 599                    && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
 600                    && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
 601       // Plain aligned a-form address: rotate into preferred slot
 602       // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
 603       int64_t rotamt = -pso;
 604       if (rotamt < 0)
 605         rotamt += 16;
 606       rotate = DAG.getConstant(rotamt, MVT::i16);
 607     } else {
 608       // Offset the rotate amount by the basePtr and the preferred slot
 609       // byte offset
 610       int64_t rotamt = -pso;
 611       if (rotamt < 0)
 612         rotamt += 16;
 613       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
 614                            basePtr,
 615                            DAG.getConstant(rotamt, PtrVT));
 616     }
 617   } else {
 618     // Unaligned load: must be more pessimistic about addressing modes:
 619     if (basePtr.getOpcode() == ISD::ADD) {
 620       MachineFunction &MF = DAG.getMachineFunction();
 621       MachineRegisterInfo &RegInfo = MF.getRegInfo();
 622       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
 623       SDValue Flag;
 624
 625       SDValue Op0 = basePtr.getOperand(0);
 626       SDValue Op1 = basePtr.getOperand(1);
 627
 628       if (isa<ConstantSDNode>(Op1)) {
 629         // Convert the (add <ptr>, <const>) to an indirect address contained
 630         // in a register. Note that this is done because we need to avoid
 631         // creating a 0(reg) d-form address due to the SPU's block loads.
 632         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 633         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
 634         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
 635       } else {
 636         // Convert the (add <arg1>, <arg2>) to an indirect address, which
 637         // will likely be lowered as a reg(reg) x-form address.
 638         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 639       }
 640     } else {
 641       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 642                             basePtr,
 643                             DAG.getConstant(0, PtrVT));
 644    }
 645
 646     // Offset the rotate amount by the basePtr and the preferred slot
 647     // byte offset
 648     rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
 649                          basePtr,
 650                          DAG.getConstant(-pso, PtrVT));
 651   }
 652
 653   // Do the load as a i128 to allow possible shifting
 654   SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
 655                        lowMemPtr,
 656                        LN->isVolatile(), LN->isNonTemporal(), 16);
 657
 658   // When the size is not greater than alignment we get all data with just
 659   // one load
 660   if (alignment >= InVT.getSizeInBits()/8) {
 661     // Update the chain
 662     the_chain = low.getValue(1);
 663
 664     // Rotate into the preferred slot:
 665     result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
 666                          low.getValue(0), rotate);
 667
 668     // Convert the loaded v16i8 vector to the appropriate vector type
 669     // specified by the operand:
 670     EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
 671                                  InVT, (128 / InVT.getSizeInBits()));
 672     result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
 673                          DAG.getNode(ISD::BITCAST, dl, vecVT, result));
 674   }
 675   // When alignment is less than the size, we might need (known only at
 676   // run-time) two loads
 677   // TODO: if the memory address is composed only from constants, we have
 678   // extra kowledge, and might avoid the second load
 679   else {
 680     // storage position offset from lower 16 byte aligned memory chunk
 681     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
 682                                   basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
 683     // get a registerfull of ones. (this implementation is a workaround: LLVM
 684     // cannot handle 128 bit signed int constants)
 685     SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
 686     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
 687
 688     SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
 689                                DAG.getNode(ISD::ADD, dl, PtrVT,
 690                                            basePtr,
 691                                            DAG.getConstant(16, PtrVT)),
 692                                highMemPtr,
 693                                LN->isVolatile(), LN->isNonTemporal(), 16);
 694
 695     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
 696                                                               high.getValue(1));
 697
 698     // Shift the (possible) high part right to compensate for the misalignment.
 699     // if there is no highpart (i.e. value is i64 and offset is 4), this
 700     // will zero out the high value.
 701     high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
 702                                      DAG.getNode(ISD::SUB, dl, MVT::i32,
 703                                                  DAG.getConstant( 16, MVT::i32),
 704                                                  offset
 705                                                 ));
 706
 707     // Shift the low similarly
 708     // TODO: add SPUISD::SHL_BYTES
 709     low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
 710
 711     // Merge the two parts
 712     result = DAG.getNode(ISD::BITCAST, dl, vecVT,
 713                           DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
 714
 715     if (!InVT.isVector()) {
 716       result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
 717      }
 718
 719   }
 720     // Handle extending loads by extending the scalar result:
 721     if (ExtType == ISD::SEXTLOAD) {
 722       result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
 723     } else if (ExtType == ISD::ZEXTLOAD) {
 724       result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
 725     } else if (ExtType == ISD::EXTLOAD) {
 726       unsigned NewOpc = ISD::ANY_EXTEND;
 727
 728       if (OutVT.isFloatingPoint())
 729         NewOpc = ISD::FP_EXTEND;
 730
 731       result = DAG.getNode(NewOpc, dl, OutVT, result);
 732     }
 733
 734     SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
 735     SDValue retops[2] = {
 736       result,
 737       the_chain
 738     };
 739
 740     result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
 741                          retops, sizeof(retops) / sizeof(retops[0]));
 742     return result;
 743 }
 744
 745 /// Custom lower stores for CellSPU
 746 /*!
 747  All CellSPU stores are aligned to 16-byte boundaries, so for elements
 748  within a 16-byte block, we have to generate a shuffle to insert the
 749  requested element into its place, then store the resulting block.
 750  */
 751 static SDValue
 752 LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 753   StoreSDNode *SN = cast<StoreSDNode>(Op);
 754   SDValue Value = SN->getValue();
 755   EVT VT = Value.getValueType();
 756   EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
 757   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 758   DebugLoc dl = Op.getDebugLoc();
 759   unsigned alignment = SN->getAlignment();
 760   SDValue result;
 761   EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
 762                                                  (128 / StVT.getSizeInBits()));
 763   // Get pointerinfos to the memory chunk(s) that contain the data to load
 764   uint64_t mpi_offset = SN->getPointerInfo().Offset;
 765   mpi_offset -= mpi_offset%16;
 766   MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
 767   MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);
 768
 769
 770   // two sanity checks
 771   assert( SN->getAddressingMode() == ISD::UNINDEXED
 772           && "we should get only UNINDEXED adresses");
 773   // clean aligned loads can be selected as-is
 774   if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
 775     return SDValue();
 776
 777   SDValue alignLoadVec;
 778   SDValue basePtr = SN->getBasePtr();
 779   SDValue the_chain = SN->getChain();
 780   SDValue insertEltOffs;
 781
 782   if ((alignment%16) == 0) {
 783     ConstantSDNode *CN;
 784     // Special cases for a known aligned load to simplify the base pointer
 785     // and insertion byte:
 786     if (basePtr.getOpcode() == ISD::ADD
 787         && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
 788       // Known offset into basePtr
 789       int64_t offset = CN->getSExtValue();
 790
 791       // Simplify the base pointer for this case:
 792       basePtr = basePtr.getOperand(0);
 793       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 794                                   basePtr,
 795                                   DAG.getConstant((offset & 0xf), PtrVT));
 796
 797       if ((offset & ~0xf) > 0) {
 798         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 799                               basePtr,
 800                               DAG.getConstant((offset & ~0xf), PtrVT));
 801       }
 802     } else {
 803       // Otherwise, assume it's at byte 0 of basePtr
 804       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 805                                   basePtr,
 806                                   DAG.getConstant(0, PtrVT));
 807       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 808                                   basePtr,
 809                                   DAG.getConstant(0, PtrVT));
 810     }
 811   } else {
 812     // Unaligned load: must be more pessimistic about addressing modes:
 813     if (basePtr.getOpcode() == ISD::ADD) {
 814       MachineFunction &MF = DAG.getMachineFunction();
 815       MachineRegisterInfo &RegInfo = MF.getRegInfo();
 816       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
 817       SDValue Flag;
 818
 819       SDValue Op0 = basePtr.getOperand(0);
 820       SDValue Op1 = basePtr.getOperand(1);
 821
 822       if (isa<ConstantSDNode>(Op1)) {
 823         // Convert the (add <ptr>, <const>) to an indirect address contained
 824         // in a register. Note that this is done because we need to avoid
 825         // creating a 0(reg) d-form address due to the SPU's block loads.
 826         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 827         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
 828         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
 829       } else {
 830         // Convert the (add <arg1>, <arg2>) to an indirect address, which
 831         // will likely be lowered as a reg(reg) x-form address.
 832         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 833       }
 834     } else {
 835       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 836                             basePtr,
 837                             DAG.getConstant(0, PtrVT));
 838     }
 839
 840     // Insertion point is solely determined by basePtr's contents
 841     insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
 842                                 basePtr,
 843                                 DAG.getConstant(0, PtrVT));
 844   }
 845
 846   // Load the lower part of the memory to which to store.
 847   SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
 848                           lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);
 849
 850   // if we don't need to store over the 16 byte boundary, one store suffices
 851   if (alignment >= StVT.getSizeInBits()/8) {
 852     // Update the chain
 853     the_chain = low.getValue(1);
 854
 855     LoadSDNode *LN = cast<LoadSDNode>(low);
 856     SDValue theValue = SN->getValue();
 857
 858     if (StVT != VT
 859         && (theValue.getOpcode() == ISD::AssertZext
 860             || theValue.getOpcode() == ISD::AssertSext)) {
 861       // Drill down and get the value for zero- and sign-extended
 862       // quantities
 863       theValue = theValue.getOperand(0);
 864     }
 865
 866     // If the base pointer is already a D-form address, then just create
 867     // a new D-form address with a slot offset and the orignal base pointer.
 868     // Otherwise generate a D-form address with the slot offset relative
 869     // to the stack pointer, which is always aligned.
 870 #if !defined(NDEBUG)
 871       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
 872         errs() << "CellSPU LowerSTORE: basePtr = ";
 873         basePtr.getNode()->dump(&DAG);
 874         errs() << "\n";
 875       }
 876 #endif
 877
 878     SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
 879                                       insertEltOffs);
 880     SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
 881                                       theValue);
 882
 883     result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
 884                          vectorizeOp, low,
 885                          DAG.getNode(ISD::BITCAST, dl,
 886                                      MVT::v4i32, insertEltOp));
 887
 888     result = DAG.getStore(the_chain, dl, result, basePtr,
 889                           lowMemPtr,
 890                           LN->isVolatile(), LN->isNonTemporal(),
 891                           16);
 892
 893   }
 894   // do the store when it might cross the 16 byte memory access boundary.
 895   else {
 896     // TODO issue a warning if SN->isVolatile()== true? This is likely not
 897     // what the user wanted.
 898
 899     // address offset from nearest lower 16byte alinged address
 900     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
 901                                     SN->getBasePtr(),
 902                                     DAG.getConstant(0xf, MVT::i32));
 903     // 16 - offset
 904     SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
 905                                            DAG.getConstant( 16, MVT::i32),
 906                                            offset);
 907     // 16 - sizeof(Value)
 908     SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
 909                                      DAG.getConstant( 16, MVT::i32),
 910                                      DAG.getConstant( VT.getSizeInBits()/8,
 911                                                       MVT::i32));
 912     // get a registerfull of ones
 913     SDValue ones = DAG.getConstant(-1, MVT::v4i32);
 914     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
 915
 916     // Create the 128 bit masks that have ones where the data to store is
 917     // located.
 918     SDValue lowmask, himask;
 919     // if the value to store don't fill up the an entire 128 bits, zero
 920     // out the last bits of the mask so that only the value we want to store
 921     // is masked.
 922     // this is e.g. in the case of store i32, align 2
 923     if (!VT.isVector()){
 924       Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
 925       lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
 926       lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
 927                                                                surplus);
 928       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
 929       Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
 930
 931     }
 932     else {
 933       lowmask = ones;
 934       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
 935     }
 936     // this will zero, if there are no data that goes to the high quad
 937     himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
 938                                                             offset_compl);
 939     lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
 940                                                              offset);
 941
 942     // Load in the old data and zero out the parts that will be overwritten with
 943     // the new data to store.
 944     SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
 945                                DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
 946                                            DAG.getConstant( 16, PtrVT)),
 947                                highMemPtr,
 948                                SN->isVolatile(), SN->isNonTemporal(), 16);
 949     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
 950                                                               hi.getValue(1));
 951
 952     low = DAG.getNode(ISD::AND, dl, MVT::i128,
 953                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
 954                         DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
 955     hi = DAG.getNode(ISD::AND, dl, MVT::i128,
 956                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
 957                         DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));
 958
 959     // Shift the Value to store into place. rlow contains the parts that go to
 960     // the lower memory chunk, rhi has the parts that go to the upper one.
 961     SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
 962     rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
 963     SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
 964                                                             offset_compl);
 965
 966     // Merge the old data and the new data and store the results
 967     // Need to convert vectors here to integer as 'OR'ing floats assert
 968     rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
 969                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
 970                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
 971     rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
 972                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
 973                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));
 974
 975     low = DAG.getStore(the_chain, dl, rlow, basePtr,
 976                           lowMemPtr,
 977                           SN->isVolatile(), SN->isNonTemporal(), 16);
 978     hi  = DAG.getStore(the_chain, dl, rhi,
 979                             DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
 980                                         DAG.getConstant( 16, PtrVT)),
 981                             highMemPtr,
 982                             SN->isVolatile(), SN->isNonTemporal(), 16);
 983     result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
 984                                                            hi.getValue(0));
 985   }
 986
 987   return result;
 988 }
 989
 990 //! Generate the address of a constant pool entry.
 991 static SDValue
 992 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 993   EVT PtrVT = Op.getValueType();
 994   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
 995   const Constant *C = CP->getConstVal();
 996   SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
 997   SDValue Zero = DAG.getConstant(0, PtrVT);
 998   const TargetMachine &TM = DAG.getTarget();
 999   // FIXME there is no actual debug info here
1000   DebugLoc dl = Op.getDebugLoc();
1001
1002   if (TM.getRelocationModel() == Reloc::Static) {
1003     if (!ST->usingLargeMem()) {
1004       // Just return the SDValue with the constant pool address in it.
1005       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
1006     } else {
1007       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
1008       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
1009       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1010     }
1011   }
1012
1013   llvm_unreachable("LowerConstantPool: Relocation model other than static"
1014                    " not supported.");
1015   return SDValue();
1016 }
1017
1018 //! Alternate entry point for generating the address of a constant pool entry
1019 SDValue
1020 SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
1021   return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
1022 }
1023
1024 static SDValue
1025 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1026   EVT PtrVT = Op.getValueType();
1027   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
1028   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
1029   SDValue Zero = DAG.getConstant(0, PtrVT);
1030   const TargetMachine &TM = DAG.getTarget();
1031   // FIXME there is no actual debug info here
1032   DebugLoc dl = Op.getDebugLoc();
1033
1034   if (TM.getRelocationModel() == Reloc::Static) {
1035     if (!ST->usingLargeMem()) {
1036       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
1037     } else {
1038       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
1039       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
1040       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1041     }
1042   }
1043
1044   llvm_unreachable("LowerJumpTable: Relocation model other than static"
1045                    " not supported.");
1046   return SDValue();
1047 }
1048
1049 static SDValue
1050 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1051   EVT PtrVT = Op.getValueType();
1052   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
1053   const GlobalValue *GV = GSDN->getGlobal();
1054   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
1055                                           PtrVT, GSDN->getOffset());
1056   const TargetMachine &TM = DAG.getTarget();
1057   SDValue Zero = DAG.getConstant(0, PtrVT);
1058   // FIXME there is no actual debug info here
1059   DebugLoc dl = Op.getDebugLoc();
1060
1061   if (TM.getRelocationModel() == Reloc::Static) {
1062     if (!ST->usingLargeMem()) {
1063       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
1064     } else {
1065       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
1066       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
1067       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1068     }
1069   } else {
1070     report_fatal_error("LowerGlobalAddress: Relocation model other than static"
1071                       "not supported.");
1072     /*NOTREACHED*/
1073   }
1074
1075   return SDValue();
1076 }
1077
1078 //! Custom lower double precision floating point constants
1079 static SDValue
1080 LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
1081   EVT VT = Op.getValueType();
1082   // FIXME there is no actual debug info here
1083   DebugLoc dl = Op.getDebugLoc();
1084
1085   if (VT == MVT::f64) {
1086     ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
1087
1088     assert((FP != 0) &&
1089            "LowerConstantFP: Node is not ConstantFPSDNode");
1090
1091     uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
1092     SDValue T = DAG.getConstant(dbits, MVT::i64);
1093     SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
1094     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
1095                        DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
1096   }
1097
1098   return SDValue();
1099 }
1100
1101 SDValue
1102 SPUTargetLowering::LowerFormalArguments(SDValue Chain,
1103                                         CallingConv::ID CallConv, bool isVarArg,
1104                                         const SmallVectorImpl<ISD::InputArg>
1105                                           &Ins,
1106                                         DebugLoc dl, SelectionDAG &DAG,
1107                                         SmallVectorImpl<SDValue> &InVals)
1108                                           const {
1109
1110   MachineFunction &MF = DAG.getMachineFunction();
1111   MachineFrameInfo *MFI = MF.getFrameInfo();
1112   MachineRegisterInfo &RegInfo = MF.getRegInfo();
1113   SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
1114
1115   unsigned ArgOffset = SPUFrameLowering::minStackSize();
1116   unsigned ArgRegIdx = 0;
1117   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1118
1119   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1120
1121   SmallVector<CCValAssign, 16> ArgLocs;
1122   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1123                  getTargetMachine(), ArgLocs, *DAG.getContext());
1124   // FIXME: allow for other calling conventions
1125   CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
1126
1127   // Add DAG nodes to load the arguments or copy them out of registers.
1128   for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
1129     EVT ObjectVT = Ins[ArgNo].VT;
1130     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
1131     SDValue ArgVal;
1132     CCValAssign &VA = ArgLocs[ArgNo];
1133
1134     if (VA.isRegLoc()) {
1135       const TargetRegisterClass *ArgRegClass;
1136
1137       switch (ObjectVT.getSimpleVT().SimpleTy) {
1138       default:
1139         report_fatal_error("LowerFormalArguments Unhandled argument type: " +
1140                            Twine(ObjectVT.getEVTString()));
1141       case MVT::i8:
1142         ArgRegClass = &SPU::R8CRegClass;
1143         break;
1144       case MVT::i16:
1145         ArgRegClass = &SPU::R16CRegClass;
1146         break;
1147       case MVT::i32:
1148         ArgRegClass = &SPU::R32CRegClass;
1149         break;
1150       case MVT::i64:
1151         ArgRegClass = &SPU::R64CRegClass;
1152         break;
1153       case MVT::i128:
1154         ArgRegClass = &SPU::GPRCRegClass;
1155         break;
1156       case MVT::f32:
1157         ArgRegClass = &SPU::R32FPRegClass;
1158         break;
1159       case MVT::f64:
1160         ArgRegClass = &SPU::R64FPRegClass;
1161         break;
1162       case MVT::v2f64:
1163       case MVT::v4f32:
1164       case MVT::v2i64:
1165       case MVT::v4i32:
1166       case MVT::v8i16:
1167       case MVT::v16i8:
1168         ArgRegClass = &SPU::VECREGRegClass;
1169         break;
1170       }
1171
1172       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
1173       RegInfo.addLiveIn(VA.getLocReg(), VReg);
1174       ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
1175       ++ArgRegIdx;
1176     } else {
1177       // We need to load the argument to a virtual register if we determined
1178       // above that we ran out of physical registers of the appropriate type
1179       // or we're forced to do vararg
1180       int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
1181       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1182       ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
1183                            false, false, 0);
1184       ArgOffset += StackSlotSize;
1185     }
1186
1187     InVals.push_back(ArgVal);
1188     // Update the chain
1189     Chain = ArgVal.getOperand(0);
1190   }
1191
1192   // vararg handling:
1193   if (isVarArg) {
1194     // FIXME: we should be able to query the argument registers from
1195     //        tablegen generated code.
1196     static const unsigned ArgRegs[] = {
1197       SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
1198       SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
1199       SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
1200       SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
1201       SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
1202       SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
1203       SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
1204       SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
1205       SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
1206       SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
1207       SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
1208     };
1209     // size of ArgRegs array
1210     unsigned NumArgRegs = 77;
1211
1212     // We will spill (79-3)+1 registers to the stack
1213     SmallVector<SDValue, 79-3+1> MemOps;
1214
1215     // Create the frame slot
1216     for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
1217       FuncInfo->setVarArgsFrameIndex(
1218         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
1219       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
1220       unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
1221       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
1222       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
1223                                    false, false, 0);
1224       Chain = Store.getOperand(0);
1225       MemOps.push_back(Store);
1226
1227       // Increment address by stack slot size for the next stored argument
1228       ArgOffset += StackSlotSize;
1229     }
1230     if (!MemOps.empty())
1231       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1232                           &MemOps[0], MemOps.size());
1233   }
1234
1235   return Chain;
1236 }
1237
1238 /// isLSAAddress - Return the immediate to use if the specified
1239 /// value is representable as a LSA address.
1240 static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
1241   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
1242   if (!C) return 0;
1243
1244   int Addr = C->getZExtValue();
1245   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
1246       (Addr << 14 >> 14) != Addr)
1247     return 0;  // Top 14 bits have to be sext of immediate.
1248
1249   return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
1250 }
1251
1252 SDValue
1253 SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1254                              CallingConv::ID CallConv, bool isVarArg,
1255                              bool &isTailCall,
1256                              const SmallVectorImpl<ISD::OutputArg> &Outs,
1257                              const SmallVectorImpl<SDValue> &OutVals,
1258                              const SmallVectorImpl<ISD::InputArg> &Ins,
1259                              DebugLoc dl, SelectionDAG &DAG,
1260                              SmallVectorImpl<SDValue> &InVals) const {
1261   // CellSPU target does not yet support tail call optimization.
1262   isTailCall = false;
1263
1264   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
1265   unsigned NumOps     = Outs.size();
1266   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1267
1268   SmallVector<CCValAssign, 16> ArgLocs;
1269   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1270                  getTargetMachine(), ArgLocs, *DAG.getContext());
1271   // FIXME: allow for other calling conventions
1272   CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
1273
1274   const unsigned NumArgRegs = ArgLocs.size();
1275
1276
1277   // Handy pointer type
1278   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1279
1280   // Set up a copy of the stack pointer for use loading and storing any
1281   // arguments that may not fit in the registers available for argument
1282   // passing.
1283   SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);
1284
1285   // Figure out which arguments are going to go in registers, and which in
1286   // memory.
1287   unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
1288   unsigned ArgRegIdx = 0;
1289
1290   // Keep track of registers passing arguments
1291   std::vector<std::pair<unsigned, SDValue> > RegsToPass;
1292   // And the arguments passed on the stack
1293   SmallVector<SDValue, 8> MemOpChains;
1294
1295   for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
1296     SDValue Arg = OutVals[ArgRegIdx];
1297     CCValAssign &VA = ArgLocs[ArgRegIdx];
1298
1299     // PtrOff will be used to store the current argument to the stack if a
1300     // register cannot be found for it.
1301     SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
1302     PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
1303
1304     switch (Arg.getValueType().getSimpleVT().SimpleTy) {
1305     default: llvm_unreachable("Unexpected ValueType for argument!");
1306     case MVT::i8:
1307     case MVT::i16:
1308     case MVT::i32:
1309     case MVT::i64:
1310     case MVT::i128:
1311     case MVT::f32:
1312     case MVT::f64:
1313     case MVT::v2i64:
1314     case MVT::v2f64:
1315     case MVT::v4f32:
1316     case MVT::v4i32:
1317     case MVT::v8i16:
1318     case MVT::v16i8:
1319       if (ArgRegIdx != NumArgRegs) {
1320         RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1321       } else {
1322         MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
1323                                            MachinePointerInfo(),
1324                                            false, false, 0));
1325         ArgOffset += StackSlotSize;
1326       }
1327       break;
1328     }
1329   }
1330
1331   // Accumulate how many bytes are to be pushed on the stack, including the
1332   // linkage area, and parameter passing area.  According to the SPU ABI,
1333   // we minimally need space for [LR] and [SP].
1334   unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();
1335
1336   // Insert a call sequence start
1337   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
1338                                                             true));
1339
1340   if (!MemOpChains.empty()) {
1341     // Adjust the stack pointer for the stack arguments.
1342     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1343                         &MemOpChains[0], MemOpChains.size());
1344   }
1345
1346   // Build a sequence of copy-to-reg nodes chained together with token chain
1347   // and flag operands which copy the outgoing args into the appropriate regs.
1348   SDValue InFlag;
1349   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1350     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1351                              RegsToPass[i].second, InFlag);
1352     InFlag = Chain.getValue(1);
1353   }
1354
1355   SmallVector<SDValue, 8> Ops;
1356   unsigned CallOpc = SPUISD::CALL;
1357
1358   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1359   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1360   // node so that legalize doesn't hack it.
1361   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1362     const GlobalValue *GV = G->getGlobal();
1363     EVT CalleeVT = Callee.getValueType();
1364     SDValue Zero = DAG.getConstant(0, PtrVT);
1365     SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);
1366
1367     if (!ST->usingLargeMem()) {
1368       // Turn calls to targets that are defined (i.e., have bodies) into BRSL
1369       // style calls, otherwise, external symbols are BRASL calls. This assumes
1370       // that declared/defined symbols are in the same compilation unit and can
1371       // be reached through PC-relative jumps.
1372       //
1373       // NOTE:
1374       // This may be an unsafe assumption for JIT and really large compilation
1375       // units.
1376       if (GV->isDeclaration()) {
1377         Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
1378       } else {
1379         Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
1380       }
1381     } else {
1382       // "Large memory" mode: Turn all calls into indirect calls with a X-form
1383       // address pairs:
1384       Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
1385     }
1386   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1387     EVT CalleeVT = Callee.getValueType();
1388     SDValue Zero = DAG.getConstant(0, PtrVT);
1389     SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
1390         Callee.getValueType());
1391
1392     if (!ST->usingLargeMem()) {
1393       Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
1394     } else {
1395       Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
1396     }
1397   } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
1398     // If this is an absolute destination address that appears to be a legal
1399     // local store address, use the munged value.
1400     Callee = SDValue(Dest, 0);
1401   }
1402
1403   Ops.push_back(Chain);
1404   Ops.push_back(Callee);
1405
1406   // Add argument registers to the end of the list so that they are known live
1407   // into the call.
1408   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1409     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1410                                   RegsToPass[i].second.getValueType()));
1411
1412   if (InFlag.getNode())
1413     Ops.push_back(InFlag);
1414   // Returns a chain and a flag for retval copy to use.
1415   Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
1416                       &Ops[0], Ops.size());
1417   InFlag = Chain.getValue(1);
1418
1419   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
1420                              DAG.getIntPtrConstant(0, true), InFlag);
1421   if (!Ins.empty())
1422     InFlag = Chain.getValue(1);
1423
1424   // If the function returns void, just return the chain.
1425   if (Ins.empty())
1426     return Chain;
1427
1428   // Now handle the return value(s)
1429   SmallVector<CCValAssign, 16> RVLocs;
1430   CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1431                     getTargetMachine(), RVLocs, *DAG.getContext());
1432   CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
1433
1434
1435   // If the call has results, copy the values out of the ret val registers.
1436   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1437     CCValAssign VA = RVLocs[i];
1438
1439     SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1440                                      InFlag);
1441     Chain = Val.getValue(1);
1442     InFlag = Val.getValue(2);
1443     InVals.push_back(Val);
1444    }
1445
1446   return Chain;
1447 }
1448
1449 SDValue
1450 SPUTargetLowering::LowerReturn(SDValue Chain,
1451                                CallingConv::ID CallConv, bool isVarArg,
1452                                const SmallVectorImpl<ISD::OutputArg> &Outs,
1453                                const SmallVectorImpl<SDValue> &OutVals,
1454                                DebugLoc dl, SelectionDAG &DAG) const {
1455
1456   SmallVector<CCValAssign, 16> RVLocs;
1457   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1458                  getTargetMachine(), RVLocs, *DAG.getContext());
1459   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
1460
1461   // If this is the first return lowered for this function, add the regs to the
1462   // liveout set for the function.
1463   if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1464     for (unsigned i = 0; i != RVLocs.size(); ++i)
1465       DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1466   }
1467
1468   SDValue Flag;
1469
1470   // Copy the result values into the output registers.
1471   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1472     CCValAssign &VA = RVLocs[i];
1473     assert(VA.isRegLoc() && "Can only return in registers!");
1474     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
1475                              OutVals[i], Flag);
1476     Flag = Chain.getValue(1);
1477   }
1478
1479   if (Flag.getNode())
1480     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
1481   else
1482     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
1483 }
1484
1485
1486 //===----------------------------------------------------------------------===//
1487 // Vector related lowering:
1488 //===----------------------------------------------------------------------===//
1489
1490 static ConstantSDNode *
1491 getVecImm(SDNode *N) {
1492   SDValue OpVal(0, 0);
1493
1494   // Check to see if this buildvec has a single non-undef value in its elements.
1495   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
1496     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
1497     if (OpVal.getNode() == 0)
1498       OpVal = N->getOperand(i);
1499     else if (OpVal != N->getOperand(i))
1500       return 0;
1501   }
1502
1503   if (OpVal.getNode() != 0) {
1504     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
1505       return CN;
1506     }
1507   }
1508
1509   return 0;
1510 }
1511
1512 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
1513 /// and the value fits into an unsigned 18-bit constant, and if so, return the
1514 /// constant
1515 SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
1516                               EVT ValueType) {
1517   if (ConstantSDNode *CN = getVecImm(N)) {
1518     uint64_t Value = CN->getZExtValue();
1519     if (ValueType == MVT::i64) {
1520       uint64_t UValue = CN->getZExtValue();
1521       uint32_t upper = uint32_t(UValue >> 32);
1522       uint32_t lower = uint32_t(UValue);
1523       if (upper != lower)
1524         return SDValue();
1525       Value = Value >> 32;
1526     }
1527     if (Value <= 0x3ffff)
1528       return DAG.getTargetConstant(Value, ValueType);
1529   }
1530
1531   return SDValue();
1532 }
1533
1534 /// get_vec_i16imm - Test if this vector is a vector filled with the same value
1535 /// and the value fits into a signed 16-bit constant, and if so, return the
1536 /// constant
1537 SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
1538                               EVT ValueType) {
1539   if (ConstantSDNode *CN = getVecImm(N)) {
1540     int64_t Value = CN->getSExtValue();
1541     if (ValueType == MVT::i64) {
1542       uint64_t UValue = CN->getZExtValue();
1543       uint32_t upper = uint32_t(UValue >> 32);
1544       uint32_t lower = uint32_t(UValue);
1545       if (upper != lower)
1546         return SDValue();
1547       Value = Value >> 32;
1548     }
1549     if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
1550       return DAG.getTargetConstant(Value, ValueType);
1551     }
1552   }
1553
1554   return SDValue();
1555 }
1556
1557 /// get_vec_i10imm - Test if this vector is a vector filled with the same value
1558 /// and the value fits into a signed 10-bit constant, and if so, return the
1559 /// constant
1560 SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
1561                               EVT ValueType) {
1562   if (ConstantSDNode *CN = getVecImm(N)) {
1563     int64_t Value = CN->getSExtValue();
1564     if (ValueType == MVT::i64) {
1565       uint64_t UValue = CN->getZExtValue();
1566       uint32_t upper = uint32_t(UValue >> 32);
1567       uint32_t lower = uint32_t(UValue);
1568       if (upper != lower)
1569         return SDValue();
1570       Value = Value >> 32;
1571     }
1572     if (isInt<10>(Value))
1573       return DAG.getTargetConstant(Value, ValueType);
1574   }
1575
1576   return SDValue();
1577 }
1578
1579 /// get_vec_i8imm - Test if this vector is a vector filled with the same value
1580 /// and the value fits into a signed 8-bit constant, and if so, return the
1581 /// constant.
1582 ///
1583 /// @note: The incoming vector is v16i8 because that's the only way we can load
1584 /// constant vectors. Thus, we test to see if the upper and lower bytes are the
1585 /// same value.
1586 SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
1587                              EVT ValueType) {
1588   if (ConstantSDNode *CN = getVecImm(N)) {
1589     int Value = (int) CN->getZExtValue();
1590     if (ValueType == MVT::i16
1591         && Value <= 0xffff                 /* truncated from uint64_t */
1592         && ((short) Value >> 8) == ((short) Value & 0xff))
1593       return DAG.getTargetConstant(Value & 0xff, ValueType);
1594     else if (ValueType == MVT::i8
1595              && (Value & 0xff) == Value)
1596       return DAG.getTargetConstant(Value, ValueType);
1597   }
1598
1599   return SDValue();
1600 }
1601
1602 /// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
1603 /// and the value fits into a signed 16-bit constant, and if so, return the
1604 /// constant
1605 SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
1606                                EVT ValueType) {
1607   if (ConstantSDNode *CN = getVecImm(N)) {
1608     uint64_t Value = CN->getZExtValue();
1609     if ((ValueType == MVT::i32
1610           && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
1611         || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
1612       return DAG.getTargetConstant(Value >> 16, ValueType);
1613   }
1614
1615   return SDValue();
1616 }
1617
1618 /// get_v4i32_imm - Catch-all for general 32-bit constant vectors
1619 SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
1620   if (ConstantSDNode *CN = getVecImm(N)) {
1621     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
1622   }
1623
1624   return SDValue();
1625 }
1626
1627 /// get_v4i32_imm - Catch-all for general 64-bit constant vectors
1628 SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
1629   if (ConstantSDNode *CN = getVecImm(N)) {
1630     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
1631   }
1632
1633   return SDValue();
1634 }
1635
1636 //! Lower a BUILD_VECTOR instruction creatively:
1637 static SDValue
1638 LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
1639   EVT VT = Op.getValueType();
1640   EVT EltVT = VT.getVectorElementType();
1641   DebugLoc dl = Op.getDebugLoc();
1642   BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
1643   assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
1644   unsigned minSplatBits = EltVT.getSizeInBits();
1645
1646   if (minSplatBits < 16)
1647     minSplatBits = 16;
1648
1649   APInt APSplatBits, APSplatUndef;
1650   unsigned SplatBitSize;
1651   bool HasAnyUndefs;
1652
1653   if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
1654                             HasAnyUndefs, minSplatBits)
1655       || minSplatBits < SplatBitSize)
1656     return SDValue();   // Wasn't a constant vector or splat exceeded min
1657
1658   uint64_t SplatBits = APSplatBits.getZExtValue();
1659
1660   switch (VT.getSimpleVT().SimpleTy) {
1661   default:
1662     report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
1663                        Twine(VT.getEVTString()));
1664     /*NOTREACHED*/
1665   case MVT::v4f32: {
1666     uint32_t Value32 = uint32_t(SplatBits);
1667     assert(SplatBitSize == 32
1668            && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
1669     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1670     SDValue T = DAG.getConstant(Value32, MVT::i32);
1671     return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
1672                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
1673     break;
1674   }
1675   case MVT::v2f64: {
1676     uint64_t f64val = uint64_t(SplatBits);
1677     assert(SplatBitSize == 64
1678            && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
1679     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1680     SDValue T = DAG.getConstant(f64val, MVT::i64);
1681     return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
1682                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
1683     break;
1684   }
1685   case MVT::v16i8: {
1686    // 8-bit constants have to be expanded to 16-bits
1687    unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
1688    SmallVector<SDValue, 8> Ops;
1689
1690    Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
1691    return DAG.getNode(ISD::BITCAST, dl, VT,
1692                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
1693   }
1694   case MVT::v8i16: {
1695     unsigned short Value16 = SplatBits;
1696     SDValue T = DAG.getConstant(Value16, EltVT);
1697     SmallVector<SDValue, 8> Ops;
1698
1699     Ops.assign(8, T);
1700     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
1701   }
1702   case MVT::v4i32: {
1703     SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
1704     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
1705   }
1706   case MVT::v2i64: {
1707     return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
1708   }
1709   }
1710
1711   return SDValue();
1712 }
1713
1714 /*!
1715  */
1716 SDValue
1717 SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
1718                      DebugLoc dl) {
1719   uint32_t upper = uint32_t(SplatVal >> 32);
1720   uint32_t lower = uint32_t(SplatVal);
1721
1722   if (upper == lower) {
1723     // Magic constant that can be matched by IL, ILA, et. al.
1724     SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
1725     return DAG.getNode(ISD::BITCAST, dl, OpVT,
1726                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1727                                    Val, Val, Val, Val));
1728   } else {
1729     bool upper_special, lower_special;
1730
1731     // NOTE: This code creates common-case shuffle masks that can be easily
1732     // detected as common expressions. It is not attempting to create highly
1733     // specialized masks to replace any and all 0's, 0xff's and 0x80's.
1734
1735     // Detect if the upper or lower half is a special shuffle mask pattern:
1736     upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
1737     lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
1738
1739     // Both upper and lower are special, lower to a constant pool load:
1740     if (lower_special && upper_special) {
1741       SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64);
1742       return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
1743                          SplatValCN, SplatValCN);
1744     }
1745
1746     SDValue LO32;
1747     SDValue HI32;
1748     SmallVector<SDValue, 16> ShufBytes;
1749     SDValue Result;
1750
1751     // Create lower vector if not a special pattern
1752     if (!lower_special) {
1753       SDValue LO32C = DAG.getConstant(lower, MVT::i32);
1754       LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
1755                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1756                                      LO32C, LO32C, LO32C, LO32C));
1757     }
1758
1759     // Create upper vector if not a special pattern
1760     if (!upper_special) {
1761       SDValue HI32C = DAG.getConstant(upper, MVT::i32);
1762       HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
1763                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1764                                      HI32C, HI32C, HI32C, HI32C));
1765     }
1766
1767     // If either upper or lower are special, then the two input operands are
1768     // the same (basically, one of them is a "don't care")
1769     if (lower_special)
1770       LO32 = HI32;
1771     if (upper_special)
1772       HI32 = LO32;
1773
1774     for (int i = 0; i < 4; ++i) {
1775       uint64_t val = 0;
1776       for (int j = 0; j < 4; ++j) {
1777         SDValue V;
1778         bool process_upper, process_lower;
1779         val <<= 8;
1780         process_upper = (upper_special && (i & 1) == 0);
1781         process_lower = (lower_special && (i & 1) == 1);
1782
1783         if (process_upper || process_lower) {
1784           if ((process_upper && upper == 0)
1785                   || (process_lower && lower == 0))
1786             val |= 0x80;
1787           else if ((process_upper && upper == 0xffffffff)
1788                   || (process_lower && lower == 0xffffffff))
1789             val |= 0xc0;
1790           else if ((process_upper && upper == 0x80000000)
1791                   || (process_lower && lower == 0x80000000))
1792             val |= (j == 0 ? 0xe0 : 0x80);
1793         } else
1794           val |= i * 4 + j + ((i & 1) * 16);
1795       }
1796
1797       ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
1798     }
1799
1800     return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
1801                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1802                                    &ShufBytes[0], ShufBytes.size()));
1803   }
1804 }
1805
1806 /// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
1807 /// which the Cell can operate. The code inspects V3 to ascertain whether the
1808 /// permutation vector, V3, is monotonically increasing with one "exception"
1809 /// element, e.g., (0, 1, _, 3). If this is the case, then generate a
1810 /// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
1811 /// In either case, the net result is going to eventually invoke SHUFB to
1812 /// permute/shuffle the bytes from V1 and V2.
1813 /// \note
1814 /// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
1815 /// control word for byte/halfword/word insertion. This takes care of a single
1816 /// element move from V2 into V1.
1817 /// \note
1818 /// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
1819 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
1820   const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
1821   SDValue V1 = Op.getOperand(0);
1822   SDValue V2 = Op.getOperand(1);
1823   DebugLoc dl = Op.getDebugLoc();
1824
1825   if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
1826
1827   // If we have a single element being moved from V1 to V2, this can be handled
1828   // using the C*[DX] compute mask instructions, but the vector elements have
1829   // to be monotonically increasing with one exception element, and the source
1830   // slot of the element to move must be the same as the destination.
1831   EVT VecVT = V1.getValueType();
1832   EVT EltVT = VecVT.getVectorElementType();
1833   unsigned EltsFromV2 = 0;
1834   unsigned V2EltOffset = 0;
1835   unsigned V2EltIdx0 = 0;
1836   unsigned CurrElt = 0;
1837   unsigned MaxElts = VecVT.getVectorNumElements();
1838   unsigned PrevElt = 0;
1839   bool monotonic = true;
1840   bool rotate = true;
1841   int rotamt=0;
1842   EVT maskVT;             // which of the c?d instructions to use
1843
1844   if (EltVT == MVT::i8) {
1845     V2EltIdx0 = 16;
1846     maskVT = MVT::v16i8;
1847   } else if (EltVT == MVT::i16) {
1848     V2EltIdx0 = 8;
1849     maskVT = MVT::v8i16;
1850   } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
1851     V2EltIdx0 = 4;
1852     maskVT = MVT::v4i32;
1853   } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
1854     V2EltIdx0 = 2;
1855     maskVT = MVT::v2i64;
1856   } else
1857     llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");
1858
1859   for (unsigned i = 0; i != MaxElts; ++i) {
1860     if (SVN->getMaskElt(i) < 0)
1861       continue;
1862
1863     unsigned SrcElt = SVN->getMaskElt(i);
1864
1865     if (monotonic) {
1866       if (SrcElt >= V2EltIdx0) {
1867         // TODO: optimize for the monotonic case when several consecutive
1868         // elements are taken form V2. Do we ever get such a case?
1869         if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
1870           V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
1871         else
1872           monotonic = false;
1873         ++EltsFromV2;
1874       } else if (CurrElt != SrcElt) {
1875         monotonic = false;
1876       }
1877
1878       ++CurrElt;
1879     }
1880
1881     if (rotate) {
1882       if (PrevElt > 0 && SrcElt < MaxElts) {
1883         if ((PrevElt == SrcElt - 1)
1884             || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
1885           PrevElt = SrcElt;
1886         } else {
1887           rotate = false;
1888         }
1889       } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
1890         // First time or after a "wrap around"
1891         rotamt = SrcElt-i;
1892         PrevElt = SrcElt;
1893       } else {
1894         // This isn't a rotation, takes elements from vector 2
1895         rotate = false;
1896       }
1897     }
1898   }
1899
1900   if (EltsFromV2 == 1 && monotonic) {
1901     // Compute mask and shuffle
1902     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1903
1904     // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
1905     // R1 ($sp) is used here only as it is guaranteed to have last bits zero
1906     SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
1907                                 DAG.getRegister(SPU::R1, PtrVT),
1908                                 DAG.getConstant(V2EltOffset, MVT::i32));
1909     SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
1910                                      maskVT, Pointer);
1911
1912     // Use shuffle mask in SHUFB synthetic instruction:
1913     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
1914                        ShufMaskOp);
1915   } else if (rotate) {
1916     if (rotamt < 0)
1917       rotamt +=MaxElts;
1918     rotamt *= EltVT.getSizeInBits()/8;
1919     return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
1920                        V1, DAG.getConstant(rotamt, MVT::i16));
1921   } else {
1922    // Convert the SHUFFLE_VECTOR mask's input element units to the
1923    // actual bytes.
1924     unsigned BytesPerElement = EltVT.getSizeInBits()/8;
1925
1926     SmallVector<SDValue, 16> ResultMask;
1927     for (unsigned i = 0, e = MaxElts; i != e; ++i) {
1928       unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);
1929
1930       for (unsigned j = 0; j < BytesPerElement; ++j)
1931         ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
1932     }
1933     SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
1934                                     &ResultMask[0], ResultMask.size());
1935     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
1936   }
1937 }
1938
1939 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
1940   SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
1941   DebugLoc dl = Op.getDebugLoc();
1942
1943   if (Op0.getNode()->getOpcode() == ISD::Constant) {
1944     // For a constant, build the appropriate constant vector, which will
1945     // eventually simplify to a vector register load.
1946
1947     ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
1948     SmallVector<SDValue, 16> ConstVecValues;
1949     EVT VT;
1950     size_t n_copies;
1951
1952     // Create a constant vector:
1953     switch (Op.getValueType().getSimpleVT().SimpleTy) {
1954     default: llvm_unreachable("Unexpected constant value type in "
1955                               "LowerSCALAR_TO_VECTOR");
1956     case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
1957     case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
1958     case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
1959     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
1960     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
1961     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
1962     }
1963
1964     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
1965     for (size_t j = 0; j < n_copies; ++j)
1966       ConstVecValues.push_back(CValue);
1967
1968     return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
1969                        &ConstVecValues[0], ConstVecValues.size());
1970   } else {
1971     // Otherwise, copy the value from one register to another:
1972     switch (Op0.getValueType().getSimpleVT().SimpleTy) {
1973     default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
1974     case MVT::i8:
1975     case MVT::i16:
1976     case MVT::i32:
1977     case MVT::i64:
1978     case MVT::f32:
1979     case MVT::f64:
1980       return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
1981     }
1982   }
1983
1984   return SDValue();
1985 }
1986
1987 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
1988   EVT VT = Op.getValueType();
1989   SDValue N = Op.getOperand(0);
1990   SDValue Elt = Op.getOperand(1);
1991   DebugLoc dl = Op.getDebugLoc();
1992   SDValue retval;
1993
1994   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
1995     // Constant argument:
1996     int EltNo = (int) C->getZExtValue();
1997
1998     // sanity checks:
1999     if (VT == MVT::i8 && EltNo >= 16)
2000       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
2001     else if (VT == MVT::i16 && EltNo >= 8)
2002       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
2003     else if (VT == MVT::i32 && EltNo >= 4)
2004       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
2005     else if (VT == MVT::i64 && EltNo >= 2)
2006       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
2007
2008     if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
2009       // i32 and i64: Element 0 is the preferred slot
2010       return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
2011     }
2012
2013     // Need to generate shuffle mask and extract:
2014     int prefslot_begin = -1, prefslot_end = -1;
2015     int elt_byte = EltNo * VT.getSizeInBits() / 8;
2016
2017     switch (VT.getSimpleVT().SimpleTy) {
2018     default:
2019       assert(false && "Invalid value type!");
2020     case MVT::i8: {
2021       prefslot_begin = prefslot_end = 3;
2022       break;
2023     }
2024     case MVT::i16: {
2025       prefslot_begin = 2; prefslot_end = 3;
2026       break;
2027     }
2028     case MVT::i32:
2029     case MVT::f32: {
2030       prefslot_begin = 0; prefslot_end = 3;
2031       break;
2032     }
2033     case MVT::i64:
2034     case MVT::f64: {
2035       prefslot_begin = 0; prefslot_end = 7;
2036       break;
2037     }
2038     }
2039
2040     assert(prefslot_begin != -1 && prefslot_end != -1 &&
2041            "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
2042
2043     unsigned int ShufBytes[16] = {
2044       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
2045     };
2046     for (int i = 0; i < 16; ++i) {
2047       // zero fill uppper part of preferred slot, don't care about the
2048       // other slots:
2049       unsigned int mask_val;
2050       if (i <= prefslot_end) {
2051         mask_val =
2052           ((i < prefslot_begin)
2053            ? 0x80
2054            : elt_byte + (i - prefslot_begin));
2055
2056         ShufBytes[i] = mask_val;
2057       } else
2058         ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
2059     }
2060
2061     SDValue ShufMask[4];
2062     for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
2063       unsigned bidx = i * 4;
2064       unsigned int bits = ((ShufBytes[bidx] << 24) |
2065                            (ShufBytes[bidx+1] << 16) |
2066                            (ShufBytes[bidx+2] << 8) |
2067                            ShufBytes[bidx+3]);
2068       ShufMask[i] = DAG.getConstant(bits, MVT::i32);
2069     }
2070
2071     SDValue ShufMaskVec =
2072       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2073                   &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
2074
2075     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
2076                          DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
2077                                      N, N, ShufMaskVec));
2078   } else {
2079     // Variable index: Rotate the requested element into slot 0, then replicate
2080     // slot 0 across the vector
2081     EVT VecVT = N.getValueType();
2082     if (!VecVT.isSimple() || !VecVT.isVector()) {
2083       report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
2084                         "vector type!");
2085     }
2086
2087     // Make life easier by making sure the index is zero-extended to i32
2088     if (Elt.getValueType() != MVT::i32)
2089       Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
2090
2091     // Scale the index to a bit/byte shift quantity
2092     APInt scaleFactor =
2093             APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
2094     unsigned scaleShift = scaleFactor.logBase2();
2095     SDValue vecShift;
2096
2097     if (scaleShift > 0) {
2098       // Scale the shift factor:
2099       Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
2100                         DAG.getConstant(scaleShift, MVT::i32));
2101     }
2102
2103     vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
2104
2105     // Replicate the bytes starting at byte 0 across the entire vector (for
2106     // consistency with the notion of a unified register set)
2107     SDValue replicate;
2108
2109     switch (VT.getSimpleVT().SimpleTy) {
2110     default:
2111       report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
2112                         "type");
2113       /*NOTREACHED*/
2114     case MVT::i8: {
2115       SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
2116       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2117                               factor, factor, factor, factor);
2118       break;
2119     }
2120     case MVT::i16: {
2121       SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
2122       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2123                               factor, factor, factor, factor);
2124       break;
2125     }
2126     case MVT::i32:
2127     case MVT::f32: {
2128       SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
2129       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2130                               factor, factor, factor, factor);
2131       break;
2132     }
2133     case MVT::i64:
2134     case MVT::f64: {
2135       SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
2136       SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
2137       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2138                               loFactor, hiFactor, loFactor, hiFactor);
2139       break;
2140     }
2141     }
2142
2143     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
2144                          DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2145                                      vecShift, vecShift, replicate));
2146   }
2147
2148   return retval;
2149 }
2150
2151 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
       // Custom lowering for ISD::INSERT_VECTOR_ELT.
       //
       // Operand 0 is the vector, operand 1 the scalar to insert and operand 2
       // the lane index, which has to be either a constant or undef. The node is
       // rewritten as a SPUISD::SHUFB that merges the scalar (wrapped in a
       // SCALAR_TO_VECTOR) into the vector under a SPUISD::SHUFFLE_MASK built
       // from the lane's byte offset.
2152   SDValue VecOp = Op.getOperand(0);
2153   SDValue ValOp = Op.getOperand(1);
2154   SDValue IdxOp = Op.getOperand(2);
2155   DebugLoc dl = Op.getDebugLoc();
2156   EVT VT = Op.getValueType();
2157   EVT eltVT = ValOp.getValueType();
2158
2159   // use 0 when the lane to insert to is 'undef'
2160   int64_t Offset=0;
2161   if (IdxOp.getOpcode() != ISD::UNDEF) {
2162     ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
         // NOTE(review): cast<> is not expected to yield a null pointer, so this
         // assert looks redundant; a dyn_cast<> would be needed for it to fire.
2163     assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
         // Byte offset of the lane: lane index times the element size in bytes.
2164     Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
2165   }
2166
2167   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2168   // Use $sp ($1) because it's always 16-byte aligned and it's available:
2169   SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
2170                                 DAG.getRegister(SPU::R1, PtrVT),
2171                                 DAG.getConstant(Offset, PtrVT));
2172   // widen the mask when dealing with half vectors
       // (maskVT keeps VT's element type but always has 128 / element-bits lanes)
2173   EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
2174                                 128/ VT.getVectorElementType().getSizeInBits());
2175   SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
2176
       // SHUFB operands: the new scalar as a vector, the original vector, and the
       // shuffle mask reinterpreted as v4i32.
2177   SDValue result =
2178     DAG.getNode(SPUISD::SHUFB, dl, VT,
2179                 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
2180                 VecOp,
2181                 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));
2182
2183   return result;
2184 }
2185
2186 static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
2187                            const TargetLowering &TLI)
2188 {
       // Lowers an i8 arithmetic, shift or rotate node by widening its operands
       // to i16, performing the same opcode at i16 and truncating the result
       // back to i8.
       //
       // Op   The node to lower; its value type is asserted to be MVT::i8.
       // Opc  The ISD opcode that selects the widening strategy below.
       // TLI  Target lowering info, used only to query the shift-amount type.
       //
       // Every handled opcode returns from inside the switch; any other opcode
       // reaches llvm_unreachable.
2189   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
2190   DebugLoc dl = Op.getDebugLoc();
2191   EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
2192
2193   assert(Op.getValueType() == MVT::i8);
2194   switch (Opc) {
2195   default:
2196     llvm_unreachable("Unhandled i8 math operator");
2197     /*NOTREACHED*/
2198     break;
2199   case ISD::ADD: {
2200     // 8-bit addition: Promote the arguments up to 16-bits and truncate
2201     // the result:
2202     SDValue N1 = Op.getOperand(1);
2203     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2204     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2205     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2206                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2207
2208   }
2209
2210   case ISD::SUB: {
2211     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
2212     // the result:
2213     SDValue N1 = Op.getOperand(1);
2214     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2215     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2216     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2217                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2218   }
2219   case ISD::ROTR:
2220   case ISD::ROTL: {
         // 8-bit rotate: zero-extend the value, bring the rotate amount to the
         // shift-amount type, and rotate a 16-bit value holding two copies of
         // the byte.
2221     SDValue N1 = Op.getOperand(1);
2222     EVT N1VT = N1.getValueType();
2223
2224     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2225     if (!N1VT.bitsEq(ShiftVT)) {
           // Narrower amounts are zero-extended, wider ones truncated.
2226       unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
2227                        ? ISD::ZERO_EXTEND
2228                        : ISD::TRUNCATE;
2229       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2230     }
2231
2232     // Replicate lower 8-bits into upper 8:
2233     SDValue ExpandArg =
2234       DAG.getNode(ISD::OR, dl, MVT::i16, N0,
2235                   DAG.getNode(ISD::SHL, dl, MVT::i16,
2236                               N0, DAG.getConstant(8, MVT::i32)));
2237
2238     // Truncate back down to i8
2239     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2240                        DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
2241   }
2242   case ISD::SRL:
2243   case ISD::SHL: {
         // 8-bit logical shifts: zero-extend the value so that no set bits are
         // shifted in from the upper byte, then shift at i16 and truncate.
2244     SDValue N1 = Op.getOperand(1);
2245     EVT N1VT = N1.getValueType();
2246
2247     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2248     if (!N1VT.bitsEq(ShiftVT)) {
2249       unsigned N1Opc = ISD::ZERO_EXTEND;
2250
2251       if (N1.getValueType().bitsGT(ShiftVT))
2252         N1Opc = ISD::TRUNCATE;
2253
2254       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2255     }
2256
2257     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2258                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2259   }
2260   case ISD::SRA: {
         // 8-bit arithmetic shift: sign-extend the value so that the i16 shift
         // brings in copies of the original sign bit.
2261     SDValue N1 = Op.getOperand(1);
2262     EVT N1VT = N1.getValueType();
2263
2264     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2265     if (!N1VT.bitsEq(ShiftVT)) {
           // Unlike the other shifts, a narrower amount is sign-extended here.
2266       unsigned N1Opc = ISD::SIGN_EXTEND;
2267
2268       if (N1VT.bitsGT(ShiftVT))
2269         N1Opc = ISD::TRUNCATE;
2270       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2271     }
2272
2273     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2274                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2275   }
2276   case ISD::MUL: {
         // 8-bit multiply: sign-extend both operands, multiply at i16, truncate.
2277     SDValue N1 = Op.getOperand(1);
2278
2279     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2280     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2281     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2282                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2283     break;
2284   }
2285   }
2286
       // Not reached by any of the cases above; each one returns or is unreachable.
2287   return SDValue();
2288 }
2289
2290 //! Lower byte immediate operations for v16i8 vectors:
     /*!
       Looks for a constant-splat BUILD_VECTOR among the two operands of \a Op
       (seeing through a single BITCAST). When one is found, the operation is
       re-emitted with the other operand first and a 16-element BUILD_VECTOR of
       i8 target constants, holding the low byte of the splat value, second.

       \param Op  The binary node to lower; LowerOperation passes AND, OR and XOR.
       \param DAG The current DAG
       \return The rewritten node, or \a Op itself when no suitable splat exists.
      */
2291 static SDValue
2292 LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
2293   SDValue ConstVec;
2294   SDValue Arg;
2295   EVT VT = Op.getValueType();
2296   DebugLoc dl = Op.getDebugLoc();
2297
       // Start by assuming operand 0 is the constant vector and operand 1 the
       // variable argument; swap the roles if that does not work out.
2298   ConstVec = Op.getOperand(0);
2299   Arg = Op.getOperand(1);
2300   if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
2301     if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
           // Look through the bitcast on operand 0.
2302       ConstVec = ConstVec.getOperand(0);
2303     } else {
           // Operand 0 is neither a BUILD_VECTOR nor a bitcast: try operand 1,
           // again looking through one bitcast.
2304       ConstVec = Op.getOperand(1);
2305       Arg = Op.getOperand(0);
2306       if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
2307         ConstVec = ConstVec.getOperand(0);
2308       }
2309     }
2310   }
2311
2312   if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
2313     BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
2314     assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");
2315
2316     APInt APSplatBits, APSplatUndef;
2317     unsigned SplatBitSize;
2318     bool HasAnyUndefs;
2319     unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
2320
         // Accept only a constant splat whose splat width is at least the
         // element width of the result type.
2321     if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
2322                               HasAnyUndefs, minSplatBits)
2323         && minSplatBits <= SplatBitSize) {
2324       uint64_t SplatBits = APSplatBits.getZExtValue();
           // Only the low byte of the splat value is used as the immediate.
2325       SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
2326
           // NOTE(review): 16 elements are emitted unconditionally, which assumes
           // VT is v16i8 as the header comment says - confirm against callers.
2327       SmallVector<SDValue, 16> tcVec;
2328       tcVec.assign(16, tc);
2329       return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
2330                          DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
2331     }
2332   }
2333
2334   // These operations (AND, OR, XOR) are legal, they just couldn't be custom
2335   // lowered.  Return the operation, rather than a null SDValue.
2336   return Op;
2337 }
2338
2339 //! Custom lowering for CTPOP (count population)
2340 /*!
2341   Custom lowering code that counts the number ones in the input
2342   operand. SPU has such an instruction, but it counts the number of
2343   ones per byte, which then have to be accumulated.
2344 */
2345 static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
2346   EVT VT = Op.getValueType();
2347   EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
2348                                VT, (128 / VT.getSizeInBits()));
2349   DebugLoc dl = Op.getDebugLoc();
2350
2351   switch (VT.getSimpleVT().SimpleTy) {
2352   default:
2353     assert(false && "Invalid value type!");
2354   case MVT::i8: {
2355     SDValue N = Op.getOperand(0);
2356     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2357
2358     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2359     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2360
2361     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
2362   }
2363
2364   case MVT::i16: {
2365     MachineFunction &MF = DAG.getMachineFunction();
2366     MachineRegisterInfo &RegInfo = MF.getRegInfo();
2367
2368     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
2369
2370     SDValue N = Op.getOperand(0);
2371     SDValue Elt0 = DAG.getConstant(0, MVT::i16);
2372     SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
2373     SDValue Shift1 = DAG.getConstant(8, MVT::i32);
2374
2375     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2376     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2377
2378     // CNTB_result becomes the chain to which all of the virtual registers
2379     // CNTB_reg, SUM1_reg become associated:
2380     SDValue CNTB_result =
2381       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
2382
2383     SDValue CNTB_rescopy =
2384       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2385
2386     SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
2387
2388     return DAG.getNode(ISD::AND, dl, MVT::i16,
2389                        DAG.getNode(ISD::ADD, dl, MVT::i16,
2390                                    DAG.getNode(ISD::SRL, dl, MVT::i16,
2391                                                Tmp1, Shift1),
2392                                    Tmp1),
2393                        Mask0);
2394   }
2395
2396   case MVT::i32: {
2397     MachineFunction &MF = DAG.getMachineFunction();
2398     MachineRegisterInfo &RegInfo = MF.getRegInfo();
2399
2400     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2401     unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2402
2403     SDValue N = Op.getOperand(0);
2404     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2405     SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
2406     SDValue Shift1 = DAG.getConstant(16, MVT::i32);
2407     SDValue Shift2 = DAG.getConstant(8, MVT::i32);
2408
2409     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2410     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2411
2412     // CNTB_result becomes the chain to which all of the virtual registers
2413     // CNTB_reg, SUM1_reg become associated:
2414     SDValue CNTB_result =
2415       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
2416
2417     SDValue CNTB_rescopy =
2418       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2419
2420     SDValue Comp1 =
2421       DAG.getNode(ISD::SRL, dl, MVT::i32,
2422                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
2423                   Shift1);
2424
2425     SDValue Sum1 =
2426       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
2427                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
2428
2429     SDValue Sum1_rescopy =
2430       DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
2431
2432     SDValue Comp2 =
2433       DAG.getNode(ISD::SRL, dl, MVT::i32,
2434                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
2435                   Shift2);
2436     SDValue Sum2 =
2437       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
2438                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
2439
2440     return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
2441   }
2442
2443   case MVT::i64:
2444     break;
2445   }
2446
2447   return SDValue();
2448 }
2449
2450 //! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
2451 /*!
2452  f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
2453  All conversions to i64 are expanded to a libcall.

      \param Op  The FP_TO_SINT / FP_TO_UINT node
      \param DAG The current DAG
      \param TLI The SPU target lowering, forwarded to ExpandLibCall
      \return The libcall expansion, or \a Op itself when no libcall is needed.
2454  */
2455 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2456                               const SPUTargetLowering &TLI) {
2457   EVT OpVT = Op.getValueType();
2458   SDValue Op0 = Op.getOperand(0);
2459   EVT Op0VT = Op0.getValueType();
2460
2461   if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
2462       || OpVT == MVT::i64) {
2463     // Convert f32 / f64 to i32 / i64 via libcall.
         // The signed or unsigned libcall is picked from the node's opcode.
2464     RTLIB::Libcall LC =
2465             (Op.getOpcode() == ISD::FP_TO_SINT)
2466              ? RTLIB::getFPTOSINT(Op0VT, OpVT)
2467              : RTLIB::getFPTOUINT(Op0VT, OpVT);
2468     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
         // Dummy receives ExpandLibCall's by-reference output, which is not
         // used here.
2469     SDValue Dummy;
2470     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2471   }
2472
       // Remaining conversions are returned as-is.
2473   return Op;
2474 }
2475
2476 //! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
2477 /*!
2478  i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
2479  All conversions from i64 are expanded to a libcall.

      \param Op  The SINT_TO_FP / UINT_TO_FP node
      \param DAG The current DAG
      \param TLI The SPU target lowering, forwarded to ExpandLibCall
      \return The libcall expansion, or \a Op itself when no libcall is needed.
2480  */
2481 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2482                               const SPUTargetLowering &TLI) {
2483   EVT OpVT = Op.getValueType();
2484   SDValue Op0 = Op.getOperand(0);
2485   EVT Op0VT = Op0.getValueType();
2486
2487   if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
2488       || Op0VT == MVT::i64) {
2489     // Convert i32 to f64, or i64 to any floating point type, via libcall:
         // The signed or unsigned libcall is picked from the node's opcode.
2490     RTLIB::Libcall LC =
2491             (Op.getOpcode() == ISD::SINT_TO_FP)
2492              ? RTLIB::getSINTTOFP(Op0VT, OpVT)
2493              : RTLIB::getUINTTOFP(Op0VT, OpVT);
2494     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
         // Dummy receives ExpandLibCall's by-reference output, which is not
         // used here.
2495     SDValue Dummy;
2496     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2497   }
2498
       // Remaining conversions are returned as-is.
2499   return Op;
2500 }
2501
2502 //! Lower ISD::SETCC
2503 /*!
2504  This handles MVT::f64 (double floating point) condition lowering

      The f64 operands are bitcast to i64 and compared as integers:
      - SETO is emitted as the bitwise inverse of a SETUO comparison.
      - SETUO is emitted as two i32 comparisons on the halves of the lhs.
      - All other supported conditions remap both operands with a select
        between the raw bits and (0x8000000000000000 - bits), compare the
        results with a signed integer condition, and, for ordered conditions,
        AND the outcome with SETO checks on both operands.

      Unsupported condition codes end in report_fatal_error.

      \param Op  The SETCC node; its lhs is asserted to be of type f64
      \param DAG The current DAG
      \param TLI Target lowering info, used for the setcc result type
      \return The lowered comparison
2505  */
2506 static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
2507                           const TargetLowering &TLI) {
2508   CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
2509   DebugLoc dl = Op.getDebugLoc();
2510   assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
2511
2512   SDValue lhs = Op.getOperand(0);
2513   SDValue rhs = Op.getOperand(1);
2514   EVT lhsVT = lhs.getValueType();
2515   assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
2516
2517   EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
       // NOTE(review): ccResultOnes is not referenced again in this function;
       // the SETO branch builds its own all-ones value.
2518   APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2519   EVT IntVT(MVT::i64);
2520
2521   // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
2522   // selected to a NOP:
       // NOTE(review): the node built below is ISD::SRL, not SRA - confirm that
       // the same selection applies.
2523   SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
       // Upper 32 bits of the lhs bit pattern.
2524   SDValue lhsHi32 =
2525           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2526                       DAG.getNode(ISD::SRL, dl, IntVT,
2527                                   i64lhs, DAG.getConstant(32, MVT::i32)));
       // Upper 32 bits with the top (sign) bit cleared.
2528   SDValue lhsHi32abs =
2529           DAG.getNode(ISD::AND, dl, MVT::i32,
2530                       lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
       // Lower 32 bits of the lhs bit pattern.
2531   SDValue lhsLo32 =
2532           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
2533
2534   // SETO and SETUO only use the lhs operand:
2535   if (CC->get() == ISD::SETO) {
2536     // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
2537     // SETUO
2538     APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2539     return DAG.getNode(ISD::XOR, dl, ccResultVT,
2540                        DAG.getSetCC(dl, ccResultVT,
2541                                     lhs, DAG.getConstantFP(0.0, lhsVT),
2542                                     ISD::SETUO),
2543                        DAG.getConstant(ccResultAllOnes, ccResultVT));
2544   } else if (CC->get() == ISD::SETUO) {
2545     // Evaluates to true if Op0 is [SQ]NaN
         // Emitted as (hi32abs >= 0x7ff00000) AND (lo32 > 0), both signed i32
         // comparisons.
         // NOTE(review): a bit pattern whose low word is zero or has its top
         // bit set fails the SETGT test even when the high word is above
         // 0x7ff00000 - confirm this is the intended NaN check.
2546     return DAG.getNode(ISD::AND, dl, ccResultVT,
2547                        DAG.getSetCC(dl, ccResultVT,
2548                                     lhsHi32abs,
2549                                     DAG.getConstant(0x7ff00000, MVT::i32),
2550                                     ISD::SETGE),
2551                        DAG.getSetCC(dl, ccResultVT,
2552                                     lhsLo32,
2553                                     DAG.getConstant(0, MVT::i32),
2554                                     ISD::SETGT));
2555   }
2556
2557   SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
       // Upper 32 bits of the rhs bit pattern.
2558   SDValue rhsHi32 =
2559           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2560                       DAG.getNode(ISD::SRL, dl, IntVT,
2561                                   i64rhs, DAG.getConstant(32, MVT::i32)));
2562
2563   // If a value is negative, subtract from the sign magnitude constant:
2564   SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
2565
2566   // Convert the sign-magnitude representation into 2's complement:
       // The select condition is the high word shifted right arithmetically by
       // 31; a set sign bit picks (signMag2TC - bits), otherwise the raw bits.
2567   SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2568                                       lhsHi32, DAG.getConstant(31, MVT::i32));
2569   SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
2570   SDValue lhsSelect =
2571           DAG.getNode(ISD::SELECT, dl, IntVT,
2572                       lhsSelectMask, lhsSignMag2TC, i64lhs);
2573
2574   SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2575                                       rhsHi32, DAG.getConstant(31, MVT::i32));
2576   SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
2577   SDValue rhsSelect =
2578           DAG.getNode(ISD::SELECT, dl, IntVT,
2579                       rhsSelectMask, rhsSignMag2TC, i64rhs);
2580
2581   unsigned compareOp;
2582
       // Ordered and unordered variants of a condition map to the same signed
       // integer comparison; the difference is handled after the switch.
2583   switch (CC->get()) {
2584   case ISD::SETOEQ:
2585   case ISD::SETUEQ:
2586     compareOp = ISD::SETEQ; break;
2587   case ISD::SETOGT:
2588   case ISD::SETUGT:
2589     compareOp = ISD::SETGT; break;
2590   case ISD::SETOGE:
2591   case ISD::SETUGE:
2592     compareOp = ISD::SETGE; break;
2593   case ISD::SETOLT:
2594   case ISD::SETULT:
2595     compareOp = ISD::SETLT; break;
2596   case ISD::SETOLE:
2597   case ISD::SETULE:
2598     compareOp = ISD::SETLE; break;
2599   case ISD::SETUNE:
2600   case ISD::SETONE:
2601     compareOp = ISD::SETNE; break;
2602   default:
2603     report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
2604   }
2605
2606   SDValue result =
2607           DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
2608                        (ISD::CondCode) compareOp);
2609
       // presumably 0x8 is the "unordered" bit of the ISD::CondCode encoding -
       // verify against the enum definition.
2610   if ((CC->get() & 0x8) == 0) {
2611     // Ordered comparison:
         // Despite their names, lhsNaN and rhsNaN are SETO comparisons, i.e.
         // they are built to be true when the operand is not a NaN.
2612     SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
2613                                   lhs, DAG.getConstantFP(0.0, MVT::f64),
2614                                   ISD::SETO);
2615     SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
2616                                   rhs, DAG.getConstantFP(0.0, MVT::f64),
2617                                   ISD::SETO);
2618     SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
2619
2620     result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
2621   }
2622
2623   return result;
2624 }
2625
2626 //! Lower ISD::SELECT_CC
2627 /*!
2628   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
2629   SELB instruction.
2630
2631   \note Need to revisit this in the future: if the code path through the true
2632   and false value computations is longer than the latency of a branch (6
2633   cycles), then it would be more advantageous to branch and insert a new basic
2634   block and branch on the condition. However, this code does not make that
2635   assumption, given the simplistic uses so far.

       \param Op  The SELECT_CC node: (lhs, rhs, true value, false value, condition)
       \param DAG The current DAG
       \param TLI Target lowering info, used for the setcc result type
       \return A SPUISD::SELB node driven by an ISD::SETCC of lhs and rhs
2636  */
2637
2638 static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
2639                               const TargetLowering &TLI) {
2640   EVT VT = Op.getValueType();
2641   SDValue lhs = Op.getOperand(0);
2642   SDValue rhs = Op.getOperand(1);
2643   SDValue trueval = Op.getOperand(2);
2644   SDValue falseval = Op.getOperand(3);
2645   SDValue condition = Op.getOperand(4);
2646   DebugLoc dl = Op.getDebugLoc();
2647
2648   // NOTE: SELB's arguments: $rA, $rB, $mask
2649   //
2650   // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
2651   // where bits in $mask are 1. CCond will be inverted, having 1s where the
2652   // condition was true and 0s where the condition was false. Hence, the
2653   // arguments to SELB get reversed.
2654
2655   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
2656   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
2657   // with another "cannot select select_cc" assert:
2658
       // The setcc result type is queried with the select's own value type.
2659   SDValue compare = DAG.getNode(ISD::SETCC, dl,
2660                                 TLI.getSetCCResultType(Op.getValueType()),
2661                                 lhs, rhs, condition);
       // falseval is passed as $rA and trueval as $rB, per the note above.
2662   return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
2663 }
2664
2665 //! Custom lower ISD::TRUNCATE
     /*!
       Only the i128 -> i64 truncation is handled: the low doubleword is moved
       into place with a SPUISD::SHUFB and then read back with
       SPUISD::VEC2PREFSLOT. Every other truncation yields a null SDValue.

       \param Op  The TRUNCATE node
       \param DAG The current DAG
       \return The lowered value, or a null SDValue when nothing was done.
      */
2666 static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
2667 {
2668   // Type to truncate to
2669   EVT VT = Op.getValueType();
2670   MVT simpleVT = VT.getSimpleVT();
       // 128-bit vector type with elements of the destination type.
2671   EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
2672                                VT, (128 / VT.getSizeInBits()));
2673   DebugLoc dl = Op.getDebugLoc();
2674
2675   // Type to truncate from
2676   SDValue Op0 = Op.getOperand(0);
2677   EVT Op0VT = Op0.getValueType();
2678
2679   if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
2680     // Create shuffle mask, least significant doubleword of quadword
         // (byte indices 0x08..0x0f, repeated for both halves of the result)
2681     unsigned maskHigh = 0x08090a0b;
2682     unsigned maskLow = 0x0c0d0e0f;
2683     // Use a shuffle to perform the truncation
2684     SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2685                                    DAG.getConstant(maskHigh, MVT::i32),
2686                                    DAG.getConstant(maskLow, MVT::i32),
2687                                    DAG.getConstant(maskHigh, MVT::i32),
2688                                    DAG.getConstant(maskLow, MVT::i32));
2689
         // The i128 source is passed as both SHUFB inputs.
2690     SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2691                                        Op0, Op0, shufMask);
2692
2693     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
2694   }
2695
2696   return SDValue();             // Leave the truncate unmolested
2697 }
2698
2699 /*!
2700  * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
2701  * algorithm is to duplicate the sign bit using rotmai to generate at
2702  * least one byte full of sign bits. Then propagate the "sign-byte" into
2703  * the leftmost words and the i64/i32 into the rightmost words using shufb.
2704  *
      * i8 and i16 sources are first sign-extended to i32 and then treated as i32.
      *
2705  * @param Op The sext operand
2706  * @param DAG The current DAG
2707  * @return The SDValue with the entire instruction sequence
2708  */
2709 static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
2710 {
2711   DebugLoc dl = Op.getDebugLoc();
2712
2713   // Type to extend to
2714   MVT OpVT = Op.getValueType().getSimpleVT();
2715
2716   // Type to extend from
2717   SDValue Op0 = Op.getOperand(0);
2718   MVT Op0VT = Op0.getValueType().getSimpleVT();
2719
2720   // extend i8 & i16 via i32
2721   if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
2722     Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
2723     Op0VT = MVT::i32;
2724   }
2725
2726   // The type to extend to needs to be a i128 and
2727   // the type to extend from needs to be i64 or i32.
2728   assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
2729           "LowerSIGN_EXTEND: input and/or output operand have wrong size");
2730
2731   // Create shuffle mask
       // presumably byte index 0x10 picks the first byte of the second SHUFB
       // operand (the shifted value holding sign bits), while 0x00..0x07 pick
       // bytes of the first operand - confirm against the shufb definition.
2732   unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
2733   unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
2734   unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
2735   SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2736                                  DAG.getConstant(mask1, MVT::i32),
2737                                  DAG.getConstant(mask1, MVT::i32),
2738                                  DAG.getConstant(mask2, MVT::i32),
2739                                  DAG.getConstant(mask3, MVT::i32));
2740
2741   // Word wise arithmetic right shift to generate at least one byte
2742   // that contains sign bits.
       // The shift is done on a vector type matching the source width: v2i64
       // for an i64 source, v4i32 otherwise.
2743   MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
2744   SDValue sraVal = DAG.getNode(ISD::SRA,
2745                  dl,
2746                  mvt,
2747                  DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
2748                  DAG.getConstant(31, MVT::i32));
2749
2750   // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
       // NOTE(review): the machine node is created with result type Op0VT and
       // the GPRC register class id - confirm this is what the comment above
       // means by "reinterpret as a i128".
2751   SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
2752                                         dl, Op0VT, Op0,
2753                                         DAG.getTargetConstant(
2754                                                   SPU::GPRCRegClass.getID(),
2755                                                   MVT::i32)), 0);
2756   // Shuffle bytes - Copy the sign bits into the upper 64 bits
2757   // and the input value into the lower 64 bits.
2758   SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
2759         extended, sraVal, shufMask);
       // Present the shuffled vector as the requested i128 scalar.
2760   return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
2761 }
2762
2763 //! Custom (target-specific) lowering entry point
2764 /*!
2765   This is where LLVM's DAG selection process calls to do target-specific
2766   lowering of nodes.
2767  */
2768 SDValue
2769 SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
2770 {
2771   unsigned Opc = (unsigned) Op.getOpcode();
2772   EVT VT = Op.getValueType();
2773
2774   switch (Opc) {
2775   default: {
2776 #ifndef NDEBUG
2777     errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
2778     errs() << "Op.getOpcode() = " << Opc << "\n";
2779     errs() << "*Op.getNode():\n";
2780     Op.getNode()->dump();
2781 #endif
2782     llvm_unreachable(0);
2783   }
2784   case ISD::LOAD:
2785   case ISD::EXTLOAD:
2786   case ISD::SEXTLOAD:
2787   case ISD::ZEXTLOAD:
2788     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
2789   case ISD::STORE:
2790     return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
2791   case ISD::ConstantPool:
2792     return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
2793   case ISD::GlobalAddress:
2794     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
2795   case ISD::JumpTable:
2796     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
2797   case ISD::ConstantFP:
2798     return LowerConstantFP(Op, DAG);
2799
2800   // i8, i64 math ops:
2801   case ISD::ADD:
2802   case ISD::SUB:
2803   case ISD::ROTR:
2804   case ISD::ROTL:
2805   case ISD::SRL:
2806   case ISD::SHL:
2807   case ISD::SRA: {
2808     if (VT == MVT::i8)
2809       return LowerI8Math(Op, DAG, Opc, *this);
2810     break;
2811   }
2812
2813   case ISD::FP_TO_SINT:
2814   case ISD::FP_TO_UINT:
2815     return LowerFP_TO_INT(Op, DAG, *this);
2816
2817   case ISD::SINT_TO_FP:
2818   case ISD::UINT_TO_FP:
2819     return LowerINT_TO_FP(Op, DAG, *this);
2820
2821   // Vector-related lowering.
2822   case ISD::BUILD_VECTOR:
2823     return LowerBUILD_VECTOR(Op, DAG);
2824   case ISD::SCALAR_TO_VECTOR:
2825     return LowerSCALAR_TO_VECTOR(Op, DAG);
2826   case ISD::VECTOR_SHUFFLE:
2827     return LowerVECTOR_SHUFFLE(Op, DAG);
2828   case ISD::EXTRACT_VECTOR_ELT:
2829     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2830   case ISD::INSERT_VECTOR_ELT:
2831     return LowerINSERT_VECTOR_ELT(Op, DAG);
2832
2833   // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
2834   case ISD::AND:
2835   case ISD::OR:
2836   case ISD::XOR:
2837     return LowerByteImmed(Op, DAG);
2838
2839   // Vector and i8 multiply:
2840   case ISD::MUL:
2841     if (VT == MVT::i8)
2842       return LowerI8Math(Op, DAG, Opc, *this);
2843
2844   case ISD::CTPOP:
2845     return LowerCTPOP(Op, DAG);
2846
2847   case ISD::SELECT_CC:
2848     return LowerSELECT_CC(Op, DAG, *this);
2849
2850   case ISD::SETCC:
2851     return LowerSETCC(Op, DAG, *this);
2852
2853   case ISD::TRUNCATE:
2854     return LowerTRUNCATE(Op, DAG);
2855
2856   case ISD::SIGN_EXTEND:
2857     return LowerSIGN_EXTEND(Op, DAG);
2858   }
2859
2860   return SDValue();
2861 }
2862
2863 void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
2864                                            SmallVectorImpl<SDValue>&Results,
2865                                            SelectionDAG &DAG) const
2866 {
       // Intentionally a no-op: the only code in the body is compiled out with
       // "#if 0", so nothing is ever appended to Results and N, Results and DAG
       // go unused.
2867 #if 0
2868   unsigned Opc = (unsigned) N->getOpcode();
2869   EVT OpVT = N->getValueType(0);
2870
2871   switch (Opc) {
2872   default: {
2873     errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
2874     errs() << "Op.getOpcode() = " << Opc << "\n";
2875     errs() << "*Op.getNode():\n";
2876     N->dump();
2877     abort();
2878     /*NOTREACHED*/
2879   }
2880   }
2881 #endif
2882
2883   /* Otherwise, return unchanged */
2884 }
2885
2886 //===----------------------------------------------------------------------===//
2887 // Target Optimization Hooks
2888 //===----------------------------------------------------------------------===//
2889
2890 SDValue
2891 SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
2892 {
2893 #if 0
2894   TargetMachine &TM = getTargetMachine();
2895 #endif
2896   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
2897   SelectionDAG &DAG = DCI.DAG;
2898   SDValue Op0 = N->getOperand(0);       // everything has at least one operand
2899   EVT NodeVT = N->getValueType(0);      // The node's value type
2900   EVT Op0VT = Op0.getValueType();       // The first operand's result
2901   SDValue Result;                       // Initially, empty result
2902   DebugLoc dl = N->getDebugLoc();
2903
2904   switch (N->getOpcode()) {
2905   default: break;
2906   case ISD::ADD: {
2907     SDValue Op1 = N->getOperand(1);
2908
2909     if (Op0.getOpcode() == SPUISD::IndirectAddr
2910         || Op1.getOpcode() == SPUISD::IndirectAddr) {
2911       // Normalize the operands to reduce repeated code
2912       SDValue IndirectArg = Op0, AddArg = Op1;
2913
2914       if (Op1.getOpcode() == SPUISD::IndirectAddr) {
2915         IndirectArg = Op1;
2916         AddArg = Op0;
2917       }
2918
2919       if (isa<ConstantSDNode>(AddArg)) {
2920         ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
2921         SDValue IndOp1 = IndirectArg.getOperand(1);
2922
2923         if (CN0->isNullValue()) {
2924           // (add (SPUindirect <arg>, <arg>), 0) ->
2925           // (SPUindirect <arg>, <arg>)
2926
2927 #if !defined(NDEBUG)
2928           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2929             errs() << "\n"
2930                  << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
2931                  << "With:    (SPUindirect <arg>, <arg>)\n";
2932           }
2933 #endif
2934
2935           return IndirectArg;
2936         } else if (isa<ConstantSDNode>(IndOp1)) {
2937           // (add (SPUindirect <arg>, <const>), <const>) ->
2938           // (SPUindirect <arg>, <const + const>)
2939           ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
2940           int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
2941           SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
2942
2943 #if !defined(NDEBUG)
2944           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2945             errs() << "\n"
2946                  << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
2947                  << "), " << CN0->getSExtValue() << ")\n"
2948                  << "With:    (SPUindirect <arg>, "
2949                  << combinedConst << ")\n";
2950           }
2951 #endif
2952
2953           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
2954                              IndirectArg, combinedValue);
2955         }
2956       }
2957     }
2958     break;
2959   }
2960   case ISD::SIGN_EXTEND:
2961   case ISD::ZERO_EXTEND:
2962   case ISD::ANY_EXTEND: {
2963     if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
2964       // (any_extend (SPUextract_elt0 <arg>)) ->
2965       // (SPUextract_elt0 <arg>)
2966       // Types must match, however...
2967 #if !defined(NDEBUG)
2968       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2969         errs() << "\nReplace: ";
2970         N->dump(&DAG);
2971         errs() << "\nWith:    ";
2972         Op0.getNode()->dump(&DAG);
2973         errs() << "\n";
2974       }
2975 #endif
2976
2977       return Op0;
2978     }
2979     break;
2980   }
2981   case SPUISD::IndirectAddr: {
2982     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
2983       ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
2984       if (CN != 0 && CN->isNullValue()) {
2985         // (SPUindirect (SPUaform <addr>, 0), 0) ->
2986         // (SPUaform <addr>, 0)
2987
2988         DEBUG(errs() << "Replace: ");
2989         DEBUG(N->dump(&DAG));
2990         DEBUG(errs() << "\nWith:    ");
2991         DEBUG(Op0.getNode()->dump(&DAG));
2992         DEBUG(errs() << "\n");
2993
2994         return Op0;
2995       }
2996     } else if (Op0.getOpcode() == ISD::ADD) {
2997       SDValue Op1 = N->getOperand(1);
2998       if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
2999         // (SPUindirect (add <arg>, <arg>), 0) ->
3000         // (SPUindirect <arg>, <arg>)
3001         if (CN1->isNullValue()) {
3002
3003 #if !defined(NDEBUG)
3004           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
3005             errs() << "\n"
3006                  << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
3007                  << "With:    (SPUindirect <arg>, <arg>)\n";
3008           }
3009 #endif
3010
3011           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
3012                              Op0.getOperand(0), Op0.getOperand(1));
3013         }
3014       }
3015     }
3016     break;
3017   }
3018   case SPUISD::SHL_BITS:
3019   case SPUISD::SHL_BYTES:
3020   case SPUISD::ROTBYTES_LEFT: {
3021     SDValue Op1 = N->getOperand(1);
3022
3023     // Kill degenerate vector shifts:
3024     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
3025       if (CN->isNullValue()) {
3026         Result = Op0;
3027       }
3028     }
3029     break;
3030   }
3031   case SPUISD::PREFSLOT2VEC: {
3032     switch (Op0.getOpcode()) {
3033     default:
3034       break;
3035     case ISD::ANY_EXTEND:
3036     case ISD::ZERO_EXTEND:
3037     case ISD::SIGN_EXTEND: {
3038       // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
3039       // <arg>
3040       // but only if the SPUprefslot2vec and <arg> types match.
3041       SDValue Op00 = Op0.getOperand(0);
3042       if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
3043         SDValue Op000 = Op00.getOperand(0);
3044         if (Op000.getValueType() == NodeVT) {
3045           Result = Op000;
3046         }
3047       }
3048       break;
3049     }
3050     case SPUISD::VEC2PREFSLOT: {
3051       // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
3052       // <arg>
3053       Result = Op0.getOperand(0);
3054       break;
3055     }
3056     }
3057     break;
3058   }
3059   }
3060
3061   // Otherwise, return unchanged.
3062 #ifndef NDEBUG
3063   if (Result.getNode()) {
3064     DEBUG(errs() << "\nReplace.SPU: ");
3065     DEBUG(N->dump(&DAG));
3066     DEBUG(errs() << "\nWith:        ");
3067     DEBUG(Result.getNode()->dump(&DAG));
3068     DEBUG(errs() << "\n");
3069   }
3070 #endif
3071
3072   return Result;
3073 }
3074
3075 //===----------------------------------------------------------------------===//
3076 // Inline Assembly Support
3077 //===----------------------------------------------------------------------===//
3078
3079 /// getConstraintType - Given a constraint letter, return the type of
3080 /// constraint it is for this target.
3081 SPUTargetLowering::ConstraintType
3082 SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
3083   if (ConstraintLetter.size() == 1) {
3084     switch (ConstraintLetter[0]) {
3085     default: break;
3086     case 'b':
3087     case 'r':
3088     case 'f':
3089     case 'v':
3090     case 'y':
3091       return C_RegisterClass;
3092     }
3093   }
3094   return TargetLowering::getConstraintType(ConstraintLetter);
3095 }
3096
3097 /// Examine constraint type and operand type and determine a weight value.
3098 /// This object must already have been set up with the operand type
3099 /// and the current alternative constraint selected.
3100 TargetLowering::ConstraintWeight
3101 SPUTargetLowering::getSingleConstraintMatchWeight(
3102     AsmOperandInfo &info, const char *constraint) const {
3103   ConstraintWeight weight = CW_Invalid;
3104   Value *CallOperandVal = info.CallOperandVal;
3105     // If we don't have a value, we can't do a match,
3106     // but allow it at the lowest weight.
3107   if (CallOperandVal == NULL)
3108     return CW_Default;
3109   // Look at the constraint type.
3110   switch (*constraint) {
3111   default:
3112     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
3113     break;
3114     //FIXME: Seems like the supported constraint letters were just copied
3115     // from PPC, as the following doesn't correspond to the GCC docs.
3116     // I'm leaving it so until someone adds the corresponding lowering support.
3117   case 'b':
3118   case 'r':
3119   case 'f':
3120   case 'd':
3121   case 'v':
3122   case 'y':
3123     weight = CW_Register;
3124     break;
3125   }
3126   return weight;
3127 }
3128
3129 std::pair<unsigned, const TargetRegisterClass*>
3130 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
3131                                                 EVT VT) const
3132 {
3133   if (Constraint.size() == 1) {
3134     // GCC RS6000 Constraint Letters
3135     switch (Constraint[0]) {
3136     case 'b':   // R1-R31
3137     case 'r':   // R0-R31
3138       if (VT == MVT::i64)
3139         return std::make_pair(0U, SPU::R64CRegisterClass);
3140       return std::make_pair(0U, SPU::R32CRegisterClass);
3141     case 'f':
3142       if (VT == MVT::f32)
3143         return std::make_pair(0U, SPU::R32FPRegisterClass);
3144       else if (VT == MVT::f64)
3145         return std::make_pair(0U, SPU::R64FPRegisterClass);
3146       break;
3147     case 'v':
3148       return std::make_pair(0U, SPU::GPRCRegisterClass);
3149     }
3150   }
3151
3152   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
3153 }
3154
3155 //! Compute used/known bits for a SPU operand
3156 void
3157 SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
3158                                                   const APInt &Mask,
3159                                                   APInt &KnownZero,
3160                                                   APInt &KnownOne,
3161                                                   const SelectionDAG &DAG,
3162                                                   unsigned Depth ) const {
3163 #if 0
3164   const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
3165
3166   switch (Op.getOpcode()) {
3167   default:
3168     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
3169     break;
3170   case CALL:
3171   case SHUFB:
3172   case SHUFFLE_MASK:
3173   case CNTB:
3174   case SPUISD::PREFSLOT2VEC:
3175   case SPUISD::LDRESULT:
3176   case SPUISD::VEC2PREFSLOT:
3177   case SPUISD::SHLQUAD_L_BITS:
3178   case SPUISD::SHLQUAD_L_BYTES:
3179   case SPUISD::VEC_ROTL:
3180   case SPUISD::VEC_ROTR:
3181   case SPUISD::ROTBYTES_LEFT:
3182   case SPUISD::SELECT_MASK:
3183   case SPUISD::SELB:
3184   }
3185 #endif
3186 }
3187
3188 unsigned
3189 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
3190                                                    unsigned Depth) const {
3191   switch (Op.getOpcode()) {
3192   default:
3193     return 1;
3194
3195   case ISD::SETCC: {
3196     EVT VT = Op.getValueType();
3197
3198     if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
3199       VT = MVT::i32;
3200     }
3201     return VT.getSizeInBits();
3202   }
3203   }
3204 }
3205
3206 // LowerAsmOperandForConstraint
3207 void
3208 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
3209                                                 std::string &Constraint,
3210                                                 std::vector<SDValue> &Ops,
3211                                                 SelectionDAG &DAG) const {
3212   // Default, for the time being, to the base class handler
3213   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3214 }
3215
3216 /// isLegalAddressImmediate - Return true if the integer value can be used
3217 /// as the offset of the target addressing mode.
3218 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
3219                                                 const Type *Ty) const {
3220   // SPU's addresses are 256K:
3221   return (V > -(1 << 18) && V < (1 << 18) - 1);
3222 }
3223
3224 bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
3225   return false;
3226 }
3227
3228 bool
3229 SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
3230   // The SPU target isn't yet aware of offsets.
3231   return false;
3232 }
3233
3234 // can we compare to Imm without writing it into a register?
3235 bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
3236   //ceqi, cgti, etc. all take s10 operand
3237   return isInt<10>(Imm);
3238 }
3239
3240 bool
3241 SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
3242                                          const Type * ) const{
3243
3244   // A-form: 18bit absolute address.
3245   if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
3246     return true;
3247
3248   // D-form: reg + 14bit offset
3249   if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
3250     return true;
3251
3252   // X-form: reg+reg
3253   if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
3254     return true;
3255
3256   return false;
3257 }