Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright 2011 Christoph Bumiller
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice shall be included in
  12.  * all copies or substantial portions of the Software.
  13.  *
  14.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18.  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19.  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20.  * OTHER DEALINGS IN THE SOFTWARE.
  21.  */
  22.  
  23. #include "nv50/codegen/nv50_ir.h"
  24. #include "nv50/codegen/nv50_ir_build_util.h"
  25.  
  26. #include "nv50_ir_target_nv50.h"
  27.  
  28. namespace nv50_ir {
  29.  
  30. // nv50 doesn't support 32 bit integer multiplication
  31. //
  32. //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
  33. // -------------------
  34. //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
  35. // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
  36. //       al*bl
  37. //    ah*bl 00
  38. //
  39. // fffe0001 + fffe0001
  40. static bool
  41. expandIntegerMUL(BuildUtil *bld, Instruction *mul)
  42. {
  43.    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
  44.  
  45.    DataType fTy = mul->sType; // full type
  46.    DataType hTy;
  47.    switch (fTy) {
  48.    case TYPE_S32: hTy = TYPE_S16; break;
  49.    case TYPE_U32: hTy = TYPE_U16; break;
  50.    case TYPE_U64: hTy = TYPE_U32; break;
  51.    case TYPE_S64: hTy = TYPE_S32; break;
  52.    default:
  53.       return false;
  54.    }
  55.    unsigned int fullSize = typeSizeof(fTy);
  56.    unsigned int halfSize = typeSizeof(hTy);
  57.  
  58.    Instruction *i[9];
  59.  
  60.    bld->setPosition(mul, true);
  61.  
  62.    Value *a[2], *b[2];
  63.    Value *c[2];
  64.    Value *t[4];
  65.    for (int j = 0; j < 4; ++j)
  66.       t[j] = bld->getSSA(fullSize);
  67.  
  68.    // split sources into halves
  69.    i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
  70.    i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
  71.  
  72.    i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
  73.    i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
  74.    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
  75.    i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
  76.  
  77.    if (highResult) {
  78.       Value *r[3];
  79.       Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
  80.       c[0] = bld->getSSA(1, FILE_FLAGS);
  81.       c[1] = bld->getSSA(1, FILE_FLAGS);
  82.       for (int j = 0; j < 3; ++j)
  83.          r[j] = bld->getSSA(fullSize);
  84.  
  85.       i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
  86.       i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
  87.       bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
  88.       i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
  89.  
  90.       // set carry defs / sources
  91.       i[3]->setFlagsDef(1, c[0]);
  92.       i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
  93.       i[6]->setPredicate(CC_C, c[0]);
  94.       i[5]->setFlagsSrc(3, c[1]);
  95.    } else {
  96.       bld->mkMov(mul->getDef(0), t[3]);
  97.    }
  98.    delete_Instruction(bld->getProgram(), mul);
  99.  
  100.    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
  101.       if (i[j])
  102.          i[j]->sType = hTy;
  103.  
  104.    return true;
  105. }
  106.  
  // Two bit selectors for the per-lane operation of a quadop.
  #define QOP_ADD  0
  #define QOP_SUBR 1
  #define QOP_SUB  2
  #define QOP_MOV2 3

  // Packs four lane selectors into one value; the arguments are given in the
  // order
  //             UL UR LL LR
  // and end up in bits 7:6, 5:4, 3:2 and 1:0 respectively.
  #define QUADOP(q, r, s, t) \
     ((((((QOP_##q << 2) | QOP_##r) << 2) | QOP_##s) << 2) | QOP_##t)
  116.  
  117. class NV50LegalizePostRA : public Pass
  118. {
  119. private:
  120.    virtual bool visit(Function *);
  121.    virtual bool visit(BasicBlock *);
  122.  
  123.    void handlePRERET(FlowInstruction *);
  124.    void replaceZero(Instruction *);
  125.  
  126.    LValue *r63;
  127. };
  128.  
  129. bool
  130. NV50LegalizePostRA::visit(Function *fn)
  131. {
  132.    Program *prog = fn->getProgram();
  133.  
  134.    r63 = new_LValue(fn, FILE_GPR);
  135.    r63->reg.data.id = 63;
  136.  
  137.    // this is actually per-program, but we can do it all on visiting main()
  138.    std::list<Instruction *> *outWrites =
  139.       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
  140.  
  141.    if (outWrites) {
  142.       for (std::list<Instruction *>::iterator it = outWrites->begin();
  143.            it != outWrites->end(); ++it)
  144.          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
  145.       // instructions will be deleted on exit
  146.       outWrites->clear();
  147.    }
  148.  
  149.    return true;
  150. }
  151.  
  152. void
  153. NV50LegalizePostRA::replaceZero(Instruction *i)
  154. {
  155.    for (int s = 0; i->srcExists(s); ++s) {
  156.       ImmediateValue *imm = i->getSrc(s)->asImm();
  157.       if (imm && imm->reg.data.u64 == 0)
  158.          i->setSrc(s, r63);
  159.    }
  160. }
  161.  
  162. // Emulate PRERET: jump to the target and call to the origin from there
  163. //
  164. // WARNING: atm only works if BBs are affected by at most a single PRERET
  165. //
  166. // BB:0
  167. // preret BB:3
  168. // (...)
  169. // BB:3
  170. // (...)
  171. //             --->
  172. // BB:0
  173. // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
  174. // (...)
  175. // BB:3
  176. // bra BB:3 + n1 (skip the call)
  177. // call BB:0 + n2 (skip bra at beginning of BB:0)
  178. // (...)
  179. void
  180. NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
  181. {
  182.    BasicBlock *bbE = pre->bb;
  183.    BasicBlock *bbT = pre->target.bb;
  184.  
  185.    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
  186.    bbE->remove(pre);
  187.    bbE->insertHead(pre);
  188.  
  189.    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
  190.    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
  191.  
  192.    bbT->insertHead(call);
  193.    bbT->insertHead(skip);
  194.  
  195.    // NOTE: maybe split blocks to prevent the instructions from moving ?
  196.  
  197.    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
  198.    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
  199. }
  200.  
  201. bool
  202. NV50LegalizePostRA::visit(BasicBlock *bb)
  203. {
  204.    Instruction *i, *next;
  205.  
  206.    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
  207.    for (i = bb->getFirst(); i; i = next) {
  208.       next = i->next;
  209.       if (i->isNop()) {
  210.          bb->remove(i);
  211.       } else
  212.       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
  213.          handlePRERET(i->asFlow());
  214.       } else {
  215.          // TODO: We will want to do this before register allocation,
  216.          // since have to use a $c register for the carry flag.
  217.          if (typeSizeof(i->dType) == 8) {
  218.             Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
  219.             if (hi)
  220.                next = hi;
  221.          }
  222.  
  223.          if (i->op != OP_MOV && i->op != OP_PFETCH &&
  224.              i->op != OP_BAR &&
  225.              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
  226.             replaceZero(i);
  227.       }
  228.    }
  229.    if (!bb->getEntry())
  230.       return true;
  231.  
  232.    return true;
  233. }
  234.  
  235. class NV50LegalizeSSA : public Pass
  236. {
  237. public:
  238.    NV50LegalizeSSA(Program *);
  239.  
  240.    virtual bool visit(BasicBlock *bb);
  241.  
  242. private:
  243.    void propagateWriteToOutput(Instruction *);
  244.    void handleDIV(Instruction *);
  245.    void handleMOD(Instruction *);
  246.    void handleMUL(Instruction *);
  247.    void handleAddrDef(Instruction *);
  248.  
  249.    inline bool isARL(const Instruction *) const;
  250.  
  251.    BuildUtil bld;
  252.  
  253.    std::list<Instruction *> *outWrites;
  254. };
  255.  
  256. NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
  257. {
  258.    bld.setProgram(prog);
  259.  
  260.    if (prog->optLevel >= 2 &&
  261.        (prog->getType() == Program::TYPE_GEOMETRY ||
  262.         prog->getType() == Program::TYPE_VERTEX))
  263.       outWrites =
  264.          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
  265.    else
  266.       outWrites = NULL;
  267. }
  268.  
  269. void
  270. NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
  271. {
  272.    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
  273.       return;
  274.  
  275.    // check def instruction can store
  276.    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
  277.  
  278.    // TODO: move exports (if beneficial) in common opt pass
  279.    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
  280.       return;
  281.    for (int s = 0; di->srcExists(s); ++s)
  282.       if (di->src(s).getFile() == FILE_IMMEDIATE)
  283.          return;
  284.  
  285.    // We cannot set defs to non-lvalues before register allocation, so
  286.    // save & remove (to save registers) the exports and replace later.
  287.    outWrites->push_back(st);
  288.    st->bb->remove(st);
  289. }
  290.  
  291. bool
  292. NV50LegalizeSSA::isARL(const Instruction *i) const
  293. {
  294.    ImmediateValue imm;
  295.  
  296.    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
  297.       return false;
  298.    if (!i->src(1).getImmediate(imm))
  299.       return false;
  300.    return imm.isInteger(0);
  301. }
  302.  
  303. void
  304. NV50LegalizeSSA::handleAddrDef(Instruction *i)
  305. {
  306.    Instruction *arl;
  307.  
  308.    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
  309.  
  310.    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
  311.    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
  312.       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
  313.          return;
  314.       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
  315.          return;
  316.    }
  317.  
  318.    // turn $a sources into $r sources (can't operate on $a)
  319.    for (int s = 0; i->srcExists(s); ++s) {
  320.       Value *a = i->getSrc(s);
  321.       Value *r;
  322.       if (a->reg.file == FILE_ADDRESS) {
  323.          if (a->getInsn() && isARL(a->getInsn())) {
  324.             i->setSrc(s, a->getInsn()->getSrc(0));
  325.          } else {
  326.             bld.setPosition(i, false);
  327.             r = bld.getSSA();
  328.             bld.mkMov(r, a);
  329.             i->setSrc(s, r);
  330.          }
  331.       }
  332.    }
  333.    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
  334.       return;
  335.  
  336.    // turn result back into $a
  337.    bld.setPosition(i, true);
  338.    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
  339.    i->setDef(0, arl->getSrc(0));
  340. }
  341.  
  342. void
  343. NV50LegalizeSSA::handleMUL(Instruction *mul)
  344. {
  345.    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
  346.       return;
  347.    Value *def = mul->getDef(0);
  348.    Value *pred = mul->getPredicate();
  349.    CondCode cc = mul->cc;
  350.    if (pred)
  351.       mul->setPredicate(CC_ALWAYS, NULL);
  352.  
  353.    if (mul->op == OP_MAD) {
  354.       Instruction *add = mul;
  355.       bld.setPosition(add, false);
  356.       Value *res = cloneShallow(func, mul->getDef(0));
  357.       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
  358.       add->op = OP_ADD;
  359.       add->setSrc(0, mul->getDef(0));
  360.       add->setSrc(1, add->getSrc(2));
  361.       for (int s = 2; add->srcExists(s); ++s)
  362.          add->setSrc(s, NULL);
  363.       mul->subOp = add->subOp;
  364.       add->subOp = 0;
  365.    }
  366.    expandIntegerMUL(&bld, mul);
  367.    if (pred)
  368.       def->getInsn()->setPredicate(cc, pred);
  369. }
  370.  
  371. // Use f32 division: first compute an approximate result, use it to reduce
  372. // the dividend, which should then be representable as f32, divide the reduced
  373. // dividend, and add the quotients.
  374. void
  375. NV50LegalizeSSA::handleDIV(Instruction *div)
  376. {
  377.    const DataType ty = div->sType;
  378.  
  379.    if (ty != TYPE_U32 && ty != TYPE_S32)
  380.       return;
  381.  
  382.    Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
  383.  
  384.    bld.setPosition(div, false);
  385.  
  386.    Value *a, *af = bld.getSSA();
  387.    Value *b, *bf = bld.getSSA();
  388.  
  389.    bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
  390.    bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
  391.  
  392.    if (isSignedType(ty)) {
  393.       af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
  394.       bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
  395.       a = bld.getSSA();
  396.       b = bld.getSSA();
  397.       bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
  398.       bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
  399.    } else {
  400.       a = div->getSrc(0);
  401.       b = div->getSrc(1);
  402.    }
  403.  
  404.    bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
  405.    bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
  406.  
  407.    bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
  408.    bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
  409.  
  410.    // get error of 1st result
  411.    expandIntegerMUL(&bld,
  412.       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
  413.    bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
  414.  
  415.    bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
  416.  
  417.    bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
  418.    bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
  419.       ->rnd = ROUND_Z;
  420.    bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
  421.  
  422.    // correction: if modulus >= divisor, add 1
  423.    expandIntegerMUL(&bld,
  424.       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
  425.    bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
  426.    bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
  427.    if (!isSignedType(ty)) {
  428.       div->op = OP_SUB;
  429.       div->setSrc(0, q);
  430.       div->setSrc(1, s);
  431.    } else {
  432.       t = q;
  433.       bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
  434.       s = bld.getSSA();
  435.       t = bld.getSSA();
  436.       // fix the sign
  437.       bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
  438.          ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
  439.       bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
  440.       bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
  441.  
  442.       div->op = OP_UNION;
  443.       div->setSrc(0, s);
  444.       div->setSrc(1, t);
  445.    }
  446. }
  447.  
  448. void
  449. NV50LegalizeSSA::handleMOD(Instruction *mod)
  450. {
  451.    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
  452.       return;
  453.    bld.setPosition(mod, false);
  454.  
  455.    Value *q = bld.getSSA();
  456.    Value *m = bld.getSSA();
  457.  
  458.    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
  459.    handleDIV(q->getInsn());
  460.  
  461.    bld.setPosition(mod, false);
  462.    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
  463.  
  464.    mod->op = OP_SUB;
  465.    mod->setSrc(1, m);
  466. }
  467.  
  468. bool
  469. NV50LegalizeSSA::visit(BasicBlock *bb)
  470. {
  471.    Instruction *insn, *next;
  472.    // skipping PHIs (don't pass them to handleAddrDef) !
  473.    for (insn = bb->getEntry(); insn; insn = next) {
  474.       next = insn->next;
  475.  
  476.       switch (insn->op) {
  477.       case OP_EXPORT:
  478.          if (outWrites)
  479.             propagateWriteToOutput(insn);
  480.          break;
  481.       case OP_DIV:
  482.          handleDIV(insn);
  483.          break;
  484.       case OP_MOD:
  485.          handleMOD(insn);
  486.          break;
  487.       case OP_MAD:
  488.       case OP_MUL:
  489.          handleMUL(insn);
  490.          break;
  491.       default:
  492.          break;
  493.       }
  494.  
  495.       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
  496.          handleAddrDef(insn);
  497.    }
  498.    return true;
  499. }
  500.  
  501. class NV50LoweringPreSSA : public Pass
  502. {
  503. public:
  504.    NV50LoweringPreSSA(Program *);
  505.  
  506. private:
  507.    virtual bool visit(Instruction *);
  508.    virtual bool visit(Function *);
  509.  
  510.    bool handleRDSV(Instruction *);
  511.    bool handleWRSV(Instruction *);
  512.  
  513.    bool handleEXPORT(Instruction *);
  514.  
  515.    bool handleDIV(Instruction *);
  516.    bool handleSQRT(Instruction *);
  517.    bool handlePOW(Instruction *);
  518.  
  519.    bool handleSET(Instruction *);
  520.    bool handleSLCT(CmpInstruction *);
  521.    bool handleSELP(Instruction *);
  522.  
  523.    bool handleTEX(TexInstruction *);
  524.    bool handleTXB(TexInstruction *); // I really
  525.    bool handleTXL(TexInstruction *); // hate
  526.    bool handleTXD(TexInstruction *); // these 3
  527.  
  528.    bool handleCALL(Instruction *);
  529.    bool handlePRECONT(Instruction *);
  530.    bool handleCONT(Instruction *);
  531.  
  532.    void checkPredicate(Instruction *);
  533.  
  534. private:
  535.    const Target *const targ;
  536.  
  537.    BuildUtil bld;
  538.  
  539.    Value *tid;
  540. };
  541.  
  542. NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
  543.    targ(prog->getTarget()), tid(NULL)
  544. {
  545.    bld.setProgram(prog);
  546. }
  547.  
  548. bool
  549. NV50LoweringPreSSA::visit(Function *f)
  550. {
  551.    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
  552.  
  553.    if (prog->getType() == Program::TYPE_COMPUTE) {
  554.       // Add implicit "thread id" argument in $r0 to the function
  555.       Value *arg = new_LValue(func, FILE_GPR);
  556.       arg->reg.data.id = 0;
  557.       f->ins.push_back(arg);
  558.  
  559.       bld.setPosition(root, false);
  560.       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
  561.    }
  562.  
  563.    return true;
  564. }
  565.  
  566. bool
  567. NV50LoweringPreSSA::handleTEX(TexInstruction *i)
  568. {
  569.    const int arg = i->tex.target.getArgCount();
  570.    const int dref = arg;
  571.    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
  572.  
  573.    // dref comes before bias/lod
  574.    if (i->tex.target.isShadow())
  575.       if (i->op == OP_TXB || i->op == OP_TXL)
  576.          i->swapSources(dref, lod);
  577.  
  578.    // array index must be converted to u32
  579.    if (i->tex.target.isArray()) {
  580.       Value *layer = i->getSrc(arg - 1);
  581.       LValue *src = new_LValue(func, FILE_GPR);
  582.       bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
  583.       bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
  584.       i->setSrc(arg - 1, src);
  585.  
  586.       if (i->tex.target.isCube()) {
  587.          std::vector<Value *> acube, a2d;
  588.          int c;
  589.  
  590.          acube.resize(4);
  591.          for (c = 0; c < 4; ++c)
  592.             acube[c] = i->getSrc(c);
  593.          a2d.resize(4);
  594.          for (c = 0; c < 3; ++c)
  595.             a2d[c] = new_LValue(func, FILE_GPR);
  596.          a2d[3] = NULL;
  597.  
  598.          bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
  599.                    a2d, acube)->asTex()->tex.mask = 0x7;
  600.  
  601.          for (c = 0; c < 3; ++c)
  602.             i->setSrc(c, a2d[c]);
  603.          i->setSrc(c, NULL);
  604.          for (; i->srcExists(c + 1); ++c)
  605.             i->setSrc(c, i->getSrc(c + 1));
  606.  
  607.          i->tex.target = i->tex.target.isShadow() ?
  608.             TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
  609.       }
  610.    }
  611.  
  612.    // texel offsets are 3 immediate fields in the instruction,
  613.    // nv50 cannot do textureGatherOffsets
  614.    assert(i->tex.useOffsets <= 1);
  615.  
  616.    return true;
  617. }
  618.  
  619. // Bias must be equal for all threads of a quad or lod calculation will fail.
  620. //
  621. // The lanes of a quad are grouped by the bit in the condition register they
  622. // have set, which is selected by differing bias values.
  623. // Move the input values for TEX into a new register set for each group and
  624. // execute TEX only for a specific group.
  625. // We always need to use 4 new registers for the inputs/outputs because the
  626. // implicitly calculated derivatives must be correct.
  627. //
  628. // TODO: move to SSA phase so we can easily determine whether bias is constant
  629. bool
  630. NV50LoweringPreSSA::handleTXB(TexInstruction *i)
  631. {
  632.    const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
  633.    int l, d;
  634.  
  635.    handleTEX(i);
  636.    Value *bias = i->getSrc(i->tex.target.getArgCount());
  637.    if (bias->isUniform())
  638.       return true;
  639.  
  640.    Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
  641.                                  bld.loadImm(NULL, 1));
  642.    bld.setPosition(cond, false);
  643.  
  644.    for (l = 1; l < 4; ++l) {
  645.       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
  646.       Value *bit = bld.getSSA();
  647.       Value *pred = bld.getScratch(1, FILE_FLAGS);
  648.       Value *imm = bld.loadImm(NULL, (1 << l));
  649.       bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
  650.       bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
  651.       cond->setSrc(l, bit);
  652.    }
  653.    Value *flags = bld.getScratch(1, FILE_FLAGS);
  654.    bld.setPosition(cond, true);
  655.    bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
  656.  
  657.    Instruction *tex[4];
  658.    for (l = 0; l < 4; ++l) {
  659.       (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
  660.       bld.insert(tex[l]);
  661.    }
  662.  
  663.    Value *res[4][4];
  664.    for (d = 0; i->defExists(d); ++d)
  665.       res[0][d] = tex[0]->getDef(d);
  666.    for (l = 1; l < 4; ++l) {
  667.       for (d = 0; tex[l]->defExists(d); ++d) {
  668.          res[l][d] = cloneShallow(func, res[0][d]);
  669.          bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
  670.       }
  671.    }
  672.  
  673.    for (d = 0; i->defExists(d); ++d) {
  674.       Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
  675.       for (l = 0; l < 4; ++l)
  676.          dst->setSrc(l, res[l][d]);
  677.    }
  678.    delete_Instruction(prog, i);
  679.    return true;
  680. }
  681.  
  682. // LOD must be equal for all threads of a quad.
  683. // Unlike with TXB, here we can just diverge since there's no LOD calculation
  684. // that would require all 4 threads' sources to be set up properly.
  685. bool
  686. NV50LoweringPreSSA::handleTXL(TexInstruction *i)
  687. {
  688.    handleTEX(i);
  689.    Value *lod = i->getSrc(i->tex.target.getArgCount());
  690.    if (lod->isUniform())
  691.       return true;
  692.  
  693.    BasicBlock *currBB = i->bb;
  694.    BasicBlock *texiBB = i->bb->splitBefore(i, false);
  695.    BasicBlock *joinBB = i->bb->splitAfter(i);
  696.  
  697.    bld.setPosition(currBB, true);
  698.    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
  699.  
  700.    for (int l = 0; l <= 3; ++l) {
  701.       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
  702.       Value *pred = bld.getScratch(1, FILE_FLAGS);
  703.       bld.setPosition(currBB, true);
  704.       bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
  705.       bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
  706.       currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
  707.       if (l <= 2) {
  708.          BasicBlock *laneBB = new BasicBlock(func);
  709.          currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
  710.          currBB = laneBB;
  711.       }
  712.    }
  713.    bld.setPosition(joinBB, false);
  714.    bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
  715.    return true;
  716. }
  717.  
  718. bool
  719. NV50LoweringPreSSA::handleTXD(TexInstruction *i)
  720. {
  721.    static const uint8_t qOps[4][2] =
  722.    {
  723.       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
  724.       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
  725.       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
  726.       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
  727.    };
  728.    Value *def[4][4];
  729.    Value *crd[3];
  730.    Instruction *tex;
  731.    Value *zero = bld.loadImm(bld.getSSA(), 0);
  732.    int l, c;
  733.    const int dim = i->tex.target.getDim();
  734.  
  735.    handleTEX(i);
  736.    i->op = OP_TEX; // no need to clone dPdx/dPdy later
  737.  
  738.    for (c = 0; c < dim; ++c)
  739.       crd[c] = bld.getScratch();
  740.  
  741.    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
  742.    for (l = 0; l < 4; ++l) {
  743.       // mov coordinates from lane l to all lanes
  744.       for (c = 0; c < dim; ++c)
  745.          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
  746.       // add dPdx from lane l to lanes dx
  747.       for (c = 0; c < dim; ++c)
  748.          bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
  749.       // add dPdy from lane l to lanes dy
  750.       for (c = 0; c < dim; ++c)
  751.          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
  752.       // texture
  753.       bld.insert(tex = cloneForward(func, i));
  754.       for (c = 0; c < dim; ++c)
  755.          tex->setSrc(c, crd[c]);
  756.       // save results
  757.       for (c = 0; i->defExists(c); ++c) {
  758.          Instruction *mov;
  759.          def[c][l] = bld.getSSA();
  760.          mov = bld.mkMov(def[c][l], tex->getDef(c));
  761.          mov->fixed = 1;
  762.          mov->lanes = 1 << l;
  763.       }
  764.    }
  765.    bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
  766.  
  767.    for (c = 0; i->defExists(c); ++c) {
  768.       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
  769.       for (l = 0; l < 4; ++l)
  770.          u->setSrc(l, def[c][l]);
  771.    }
  772.  
  773.    i->bb->remove(i);
  774.    return true;
  775. }
  776.  
  777. bool
  778. NV50LoweringPreSSA::handleSET(Instruction *i)
  779. {
  780.    if (i->dType == TYPE_F32) {
  781.       bld.setPosition(i, true);
  782.       i->dType = TYPE_U32;
  783.       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
  784.       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
  785.    }
  786.    return true;
  787. }
  788.  
  789. bool
  790. NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
  791. {
  792.    Value *src0 = bld.getSSA();
  793.    Value *src1 = bld.getSSA();
  794.    Value *pred = bld.getScratch(1, FILE_FLAGS);
  795.  
  796.    Value *v0 = i->getSrc(0);
  797.    Value *v1 = i->getSrc(1);
  798.    // XXX: these probably shouldn't be immediates in the first place ...
  799.    if (v0->asImm())
  800.       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
  801.    if (v1->asImm())
  802.       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
  803.  
  804.    bld.setPosition(i, true);
  805.    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
  806.    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
  807.    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
  808.  
  809.    bld.setPosition(i, false);
  810.    i->op = OP_SET;
  811.    i->setFlagsDef(0, pred);
  812.    i->dType = TYPE_U8;
  813.    i->setSrc(0, i->getSrc(2));
  814.    i->setSrc(2, NULL);
  815.    i->setSrc(1, bld.loadImm(NULL, 0));
  816.  
  817.    return true;
  818. }
  819.  
  820. bool
  821. NV50LoweringPreSSA::handleSELP(Instruction *i)
  822. {
  823.    Value *src0 = bld.getSSA();
  824.    Value *src1 = bld.getSSA();
  825.  
  826.    Value *v0 = i->getSrc(0);
  827.    Value *v1 = i->getSrc(1);
  828.    if (v0->asImm())
  829.       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
  830.    if (v1->asImm())
  831.       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
  832.  
  833.    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
  834.    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
  835.    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
  836.    delete_Instruction(prog, i);
  837.    return true;
  838. }
  839.  
  840. bool
  841. NV50LoweringPreSSA::handleWRSV(Instruction *i)
  842. {
  843.    Symbol *sym = i->getSrc(0)->asSym();
  844.  
  845.    // these are all shader outputs, $sreg are not writeable
  846.    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
  847.    if (addr >= 0x400)
  848.       return false;
  849.    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
  850.  
  851.    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
  852.  
  853.    bld.getBB()->remove(i);
  854.    return true;
  855. }
  856.  
  857. bool
  858. NV50LoweringPreSSA::handleCALL(Instruction *i)
  859. {
  860.    if (prog->getType() == Program::TYPE_COMPUTE) {
  861.       // Add implicit "thread id" argument in $r0 to the function
  862.       i->setSrc(i->srcCount(), tid);
  863.    }
  864.    return true;
  865. }
  866.  
  867. bool
  868. NV50LoweringPreSSA::handlePRECONT(Instruction *i)
  869. {
  870.    delete_Instruction(prog, i);
  871.    return true;
  872. }
  873.  
  874. bool
  875. NV50LoweringPreSSA::handleCONT(Instruction *i)
  876. {
  877.    i->op = OP_BRA;
  878.    return true;
  879. }
  880.  
  881. bool
  882. NV50LoweringPreSSA::handleRDSV(Instruction *i)
  883. {
  884.    Symbol *sym = i->getSrc(0)->asSym();
  885.    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
  886.    Value *def = i->getDef(0);
  887.    SVSemantic sv = sym->reg.data.sv.sv;
  888.    int idx = sym->reg.data.sv.index;
  889.  
  890.    if (addr >= 0x400) // mov $sreg
  891.       return true;
  892.  
  893.    switch (sv) {
  894.    case SV_POSITION:
  895.       assert(prog->getType() == Program::TYPE_FRAGMENT);
  896.       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
  897.       break;
  898.    case SV_FACE:
  899.       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
  900.       if (i->dType == TYPE_F32) {
  901.          bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
  902.          bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
  903.       }
  904.       break;
  905.    case SV_NCTAID:
  906.    case SV_CTAID:
  907.    case SV_NTID:
  908.       if ((sv == SV_NCTAID && idx >= 2) ||
  909.           (sv == SV_NTID && idx >= 3)) {
  910.          bld.mkMov(def, bld.mkImm(1));
  911.       } else if (sv == SV_CTAID && idx >= 2) {
  912.          bld.mkMov(def, bld.mkImm(0));
  913.       } else {
  914.          Value *x = bld.getSSA(2);
  915.          bld.mkOp1(OP_LOAD, TYPE_U16, x,
  916.                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
  917.          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
  918.       }
  919.       break;
  920.    case SV_TID:
  921.       if (idx == 0) {
  922.          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
  923.       } else if (idx == 1) {
  924.          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
  925.          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
  926.       } else if (idx == 2) {
  927.          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
  928.       } else {
  929.          bld.mkMov(def, bld.mkImm(0));
  930.       }
  931.       break;
  932.    default:
  933.       bld.mkFetch(i->getDef(0), i->dType,
  934.                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
  935.       break;
  936.    }
  937.    bld.getBB()->remove(i);
  938.    return true;
  939. }
  940.  
  941. bool
  942. NV50LoweringPreSSA::handleDIV(Instruction *i)
  943. {
  944.    if (!isFloatType(i->dType))
  945.       return true;
  946.    bld.setPosition(i, false);
  947.    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
  948.    i->op = OP_MUL;
  949.    i->setSrc(1, rcp->getDef(0));
  950.    return true;
  951. }
  952.  
  953. bool
  954. NV50LoweringPreSSA::handleSQRT(Instruction *i)
  955. {
  956.    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
  957.                                 bld.getSSA(), i->getSrc(0));
  958.    i->op = OP_MUL;
  959.    i->setSrc(1, rsq->getDef(0));
  960.  
  961.    return true;
  962. }
  963.  
  964. bool
  965. NV50LoweringPreSSA::handlePOW(Instruction *i)
  966. {
  967.    LValue *val = bld.getScratch();
  968.  
  969.    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
  970.    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
  971.    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
  972.  
  973.    i->op = OP_EX2;
  974.    i->setSrc(0, val);
  975.    i->setSrc(1, NULL);
  976.  
  977.    return true;
  978. }
  979.  
  980. bool
  981. NV50LoweringPreSSA::handleEXPORT(Instruction *i)
  982. {
  983.    if (prog->getType() == Program::TYPE_FRAGMENT) {
  984.       if (i->getIndirect(0, 0)) {
  985.          // TODO: redirect to l[] here, load to GPRs at exit
  986.          return false;
  987.       } else {
  988.          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
  989.  
  990.          i->op = OP_MOV;
  991.          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
  992.          i->src(0).set(i->src(1));
  993.          i->setSrc(1, NULL);
  994.          i->setDef(0, new_LValue(func, FILE_GPR));
  995.          i->getDef(0)->reg.data.id = id;
  996.  
  997.          prog->maxGPR = MAX2(prog->maxGPR, id);
  998.       }
  999.    }
  1000.    return true;
  1001. }
  1002.  
  1003. // Set flags according to predicate and make the instruction read $cX.
  1004. void
  1005. NV50LoweringPreSSA::checkPredicate(Instruction *insn)
  1006. {
  1007.    Value *pred = insn->getPredicate();
  1008.    Value *cdst;
  1009.  
  1010.    if (!pred || pred->reg.file == FILE_FLAGS)
  1011.       return;
  1012.    cdst = bld.getSSA(1, FILE_FLAGS);
  1013.  
  1014.    bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, bld.loadImm(NULL, 0), pred);
  1015.  
  1016.    insn->setPredicate(insn->cc, cdst);
  1017. }
  1018.  
  1019. //
  1020. // - add quadop dance for texturing
  1021. // - put FP outputs in GPRs
  1022. // - convert instruction sequences
  1023. //
  1024. bool
  1025. NV50LoweringPreSSA::visit(Instruction *i)
  1026. {
  1027.    bld.setPosition(i, false);
  1028.  
  1029.    if (i->cc != CC_ALWAYS)
  1030.       checkPredicate(i);
  1031.  
  1032.    switch (i->op) {
  1033.    case OP_TEX:
  1034.    case OP_TXF:
  1035.    case OP_TXG:
  1036.       return handleTEX(i->asTex());
  1037.    case OP_TXB:
  1038.       return handleTXB(i->asTex());
  1039.    case OP_TXL:
  1040.       return handleTXL(i->asTex());
  1041.    case OP_TXD:
  1042.       return handleTXD(i->asTex());
  1043.    case OP_EX2:
  1044.       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
  1045.       i->setSrc(0, i->getDef(0));
  1046.       break;
  1047.    case OP_SET:
  1048.       return handleSET(i);
  1049.    case OP_SLCT:
  1050.       return handleSLCT(i->asCmp());
  1051.    case OP_SELP:
  1052.       return handleSELP(i);
  1053.    case OP_POW:
  1054.       return handlePOW(i);
  1055.    case OP_DIV:
  1056.       return handleDIV(i);
  1057.    case OP_SQRT:
  1058.       return handleSQRT(i);
  1059.    case OP_EXPORT:
  1060.       return handleEXPORT(i);
  1061.    case OP_RDSV:
  1062.       return handleRDSV(i);
  1063.    case OP_WRSV:
  1064.       return handleWRSV(i);
  1065.    case OP_CALL:
  1066.       return handleCALL(i);
  1067.    case OP_PRECONT:
  1068.       return handlePRECONT(i);
  1069.    case OP_CONT:
  1070.       return handleCONT(i);
  1071.    default:
  1072.       break;
  1073.    }
  1074.    return true;
  1075. }
  1076.  
  1077. bool
  1078. TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
  1079. {
  1080.    bool ret = false;
  1081.  
  1082.    if (stage == CG_STAGE_PRE_SSA) {
  1083.       NV50LoweringPreSSA pass(prog);
  1084.       ret = pass.run(prog, false, true);
  1085.    } else
  1086.    if (stage == CG_STAGE_SSA) {
  1087.       if (!prog->targetPriv)
  1088.          prog->targetPriv = new std::list<Instruction *>();
  1089.       NV50LegalizeSSA pass(prog);
  1090.       ret = pass.run(prog, false, true);
  1091.    } else
  1092.    if (stage == CG_STAGE_POST_RA) {
  1093.       NV50LegalizePostRA pass;
  1094.       ret = pass.run(prog, false, true);
  1095.       if (prog->targetPriv)
  1096.          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
  1097.    }
  1098.    return ret;
  1099. }
  1100.  
  1101. } // namespace nv50_ir
  1102.