Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright 2011 Christoph Bumiller
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice shall be included in
  12.  * all copies or substantial portions of the Software.
  13.  *
  14.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18.  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19.  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20.  * OTHER DEALINGS IN THE SOFTWARE.
  21.  */
  22.  
  23. #include "codegen/nv50_ir.h"
  24. #include "codegen/nv50_ir_build_util.h"
  25.  
  26. #include "codegen/nv50_ir_target_nv50.h"
  27.  
  28. namespace nv50_ir {
  29.  
  30. // nv50 doesn't support 32 bit integer multiplication
  31. //
  32. //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
  33. // -------------------
  34. //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
  35. // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
  36. //       al*bl
  37. //    ah*bl 00
  38. //
  39. // fffe0001 + fffe0001
  40. //
  41. // Note that this sort of splitting doesn't work for signed values, so we
  42. // compute the sign on those manually and then perform an unsigned multiply.
  43. static bool
  44. expandIntegerMUL(BuildUtil *bld, Instruction *mul)
  45. {
  46.    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
  47.  
  48.    DataType fTy; // full type
  49.    switch (mul->sType) {
  50.    case TYPE_S32: fTy = TYPE_U32; break;
  51.    case TYPE_S64: fTy = TYPE_U64; break;
  52.    default: fTy = mul->sType; break;
  53.    }
  54.  
  55.    DataType hTy; // half type
  56.    switch (fTy) {
  57.    case TYPE_U32: hTy = TYPE_U16; break;
  58.    case TYPE_U64: hTy = TYPE_U32; break;
  59.    default:
  60.       return false;
  61.    }
  62.    unsigned int fullSize = typeSizeof(fTy);
  63.    unsigned int halfSize = typeSizeof(hTy);
  64.  
  65.    Instruction *i[9];
  66.  
  67.    bld->setPosition(mul, true);
  68.  
  69.    Value *s[2];
  70.    Value *a[2], *b[2];
  71.    Value *t[4];
  72.    for (int j = 0; j < 4; ++j)
  73.       t[j] = bld->getSSA(fullSize);
  74.  
  75.    s[0] = mul->getSrc(0);
  76.    s[1] = mul->getSrc(1);
  77.  
  78.    if (isSignedType(mul->sType)) {
  79.       s[0] = bld->getSSA(fullSize);
  80.       s[1] = bld->getSSA(fullSize);
  81.       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
  82.       bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
  83.    }
  84.  
  85.    // split sources into halves
  86.    i[0] = bld->mkSplit(a, halfSize, s[0]);
  87.    i[1] = bld->mkSplit(b, halfSize, s[1]);
  88.  
  89.    i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
  90.    i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
  91.    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
  92.    i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
  93.  
  94.    if (highResult) {
  95.       Value *c[2];
  96.       Value *r[5];
  97.       Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
  98.       c[0] = bld->getSSA(1, FILE_FLAGS);
  99.       c[1] = bld->getSSA(1, FILE_FLAGS);
  100.       for (int j = 0; j < 5; ++j)
  101.          r[j] = bld->getSSA(fullSize);
  102.  
  103.       i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
  104.       i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
  105.       bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
  106.       bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
  107.       i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
  108.  
  109.       // set carry defs / sources
  110.       i[3]->setFlagsDef(1, c[0]);
  111.       // actual result required in negative case, but ignored for
  112.       // unsigned. for some reason the compiler ends up dropping the whole
  113.       // instruction if the destination is unused but the flags are.
  114.       if (isSignedType(mul->sType))
  115.          i[4]->setFlagsDef(1, c[1]);
  116.       else
  117.          i[4]->setFlagsDef(0, c[1]);
  118.       i[6]->setPredicate(CC_C, c[0]);
  119.       i[5]->setFlagsSrc(3, c[1]);
  120.  
  121.       if (isSignedType(mul->sType)) {
  122.          Value *cc[2];
  123.          Value *rr[7];
  124.          Value *one = bld->getSSA(fullSize);
  125.          bld->loadImm(one, 1);
  126.          for (int j = 0; j < 7; j++)
  127.             rr[j] = bld->getSSA(fullSize);
  128.  
  129.          // NOTE: this logic uses predicates because splitting basic blocks is
  130.          // ~impossible during the SSA phase. The RA relies on a correlation
  131.          // between edge order and phi node sources.
  132.  
  133.          // Set the sign of the result based on the inputs
  134.          bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
  135.             ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
  136.  
  137.          // 1s complement of 64-bit value
  138.          bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
  139.             ->setPredicate(CC_S, cc[0]);
  140.          bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
  141.             ->setPredicate(CC_S, cc[0]);
  142.  
  143.          // add to low 32-bits, keep track of the carry
  144.          Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
  145.          n->setPredicate(CC_S, cc[0]);
  146.          n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
  147.  
  148.          // If there was a carry, add 1 to the upper 32 bits
  149.          // XXX: These get executed even if they shouldn't be
  150.          bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
  151.             ->setPredicate(CC_C, cc[1]);
  152.          bld->mkMov(rr[3], rr[0])
  153.             ->setPredicate(CC_NC, cc[1]);
  154.          bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
  155.  
  156.          // Merge the results from the negative and non-negative paths
  157.          bld->mkMov(rr[5], rr[4])
  158.             ->setPredicate(CC_S, cc[0]);
  159.          bld->mkMov(rr[6], r[4])
  160.             ->setPredicate(CC_NS, cc[0]);
  161.          bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
  162.       } else {
  163.          bld->mkMov(mul->getDef(0), r[4]);
  164.       }
  165.    } else {
  166.       bld->mkMov(mul->getDef(0), t[3]);
  167.    }
  168.    delete_Instruction(bld->getProgram(), mul);
  169.  
  170.    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
  171.       if (i[j])
  172.          i[j]->sType = hTy;
  173.  
  174.    return true;
  175. }
  176.  
  177. #define QOP_ADD  0
  178. #define QOP_SUBR 1
  179. #define QOP_SUB  2
  180. #define QOP_MOV2 3
  181.  
  182. //             UL UR LL LR
  183. #define QUADOP(q, r, s, t)            \
  184.    ((QOP_##q << 6) | (QOP_##r << 4) | \
  185.     (QOP_##s << 2) | (QOP_##t << 0))
  186.  
  187. class NV50LegalizePostRA : public Pass
  188. {
  189. private:
  190.    virtual bool visit(Function *);
  191.    virtual bool visit(BasicBlock *);
  192.  
  193.    void handlePRERET(FlowInstruction *);
  194.    void replaceZero(Instruction *);
  195.  
  196.    LValue *r63;
  197. };
  198.  
  199. bool
  200. NV50LegalizePostRA::visit(Function *fn)
  201. {
  202.    Program *prog = fn->getProgram();
  203.  
  204.    r63 = new_LValue(fn, FILE_GPR);
  205.    r63->reg.data.id = 63;
  206.  
  207.    // this is actually per-program, but we can do it all on visiting main()
  208.    std::list<Instruction *> *outWrites =
  209.       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
  210.  
  211.    if (outWrites) {
  212.       for (std::list<Instruction *>::iterator it = outWrites->begin();
  213.            it != outWrites->end(); ++it)
  214.          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
  215.       // instructions will be deleted on exit
  216.       outWrites->clear();
  217.    }
  218.  
  219.    return true;
  220. }
  221.  
  222. void
  223. NV50LegalizePostRA::replaceZero(Instruction *i)
  224. {
  225.    for (int s = 0; i->srcExists(s); ++s) {
  226.       ImmediateValue *imm = i->getSrc(s)->asImm();
  227.       if (imm && imm->reg.data.u64 == 0)
  228.          i->setSrc(s, r63);
  229.    }
  230. }
  231.  
  232. // Emulate PRERET: jump to the target and call to the origin from there
  233. //
  234. // WARNING: atm only works if BBs are affected by at most a single PRERET
  235. //
  236. // BB:0
  237. // preret BB:3
  238. // (...)
  239. // BB:3
  240. // (...)
  241. //             --->
  242. // BB:0
  243. // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
  244. // (...)
  245. // BB:3
  246. // bra BB:3 + n1 (skip the call)
  247. // call BB:0 + n2 (skip bra at beginning of BB:0)
  248. // (...)
  249. void
  250. NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
  251. {
  252.    BasicBlock *bbE = pre->bb;
  253.    BasicBlock *bbT = pre->target.bb;
  254.  
  255.    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
  256.    bbE->remove(pre);
  257.    bbE->insertHead(pre);
  258.  
  259.    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
  260.    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
  261.  
  262.    bbT->insertHead(call);
  263.    bbT->insertHead(skip);
  264.  
  265.    // NOTE: maybe split blocks to prevent the instructions from moving ?
  266.  
  267.    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
  268.    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
  269. }
  270.  
  271. bool
  272. NV50LegalizePostRA::visit(BasicBlock *bb)
  273. {
  274.    Instruction *i, *next;
  275.  
  276.    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
  277.    for (i = bb->getFirst(); i; i = next) {
  278.       next = i->next;
  279.       if (i->isNop()) {
  280.          bb->remove(i);
  281.       } else
  282.       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
  283.          handlePRERET(i->asFlow());
  284.       } else {
  285.          // TODO: We will want to do this before register allocation,
  286.          // since have to use a $c register for the carry flag.
  287.          if (typeSizeof(i->dType) == 8) {
  288.             Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
  289.             if (hi)
  290.                next = hi;
  291.          }
  292.  
  293.          if (i->op != OP_MOV && i->op != OP_PFETCH &&
  294.              i->op != OP_BAR &&
  295.              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
  296.             replaceZero(i);
  297.       }
  298.    }
  299.    if (!bb->getEntry())
  300.       return true;
  301.  
  302.    return true;
  303. }
  304.  
  305. class NV50LegalizeSSA : public Pass
  306. {
  307. public:
  308.    NV50LegalizeSSA(Program *);
  309.  
  310.    virtual bool visit(BasicBlock *bb);
  311.  
  312. private:
  313.    void propagateWriteToOutput(Instruction *);
  314.    void handleDIV(Instruction *);
  315.    void handleMOD(Instruction *);
  316.    void handleMUL(Instruction *);
  317.    void handleAddrDef(Instruction *);
  318.  
  319.    inline bool isARL(const Instruction *) const;
  320.  
  321.    BuildUtil bld;
  322.  
  323.    std::list<Instruction *> *outWrites;
  324. };
  325.  
  326. NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
  327. {
  328.    bld.setProgram(prog);
  329.  
  330.    if (prog->optLevel >= 2 &&
  331.        (prog->getType() == Program::TYPE_GEOMETRY ||
  332.         prog->getType() == Program::TYPE_VERTEX))
  333.       outWrites =
  334.          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
  335.    else
  336.       outWrites = NULL;
  337. }
  338.  
  339. void
  340. NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
  341. {
  342.    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
  343.       return;
  344.  
  345.    // check def instruction can store
  346.    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
  347.  
  348.    // TODO: move exports (if beneficial) in common opt pass
  349.    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
  350.       return;
  351.  
  352.    for (int s = 0; di->srcExists(s); ++s)
  353.       if (di->src(s).getFile() == FILE_IMMEDIATE)
  354.          return;
  355.  
  356.    if (prog->getType() == Program::TYPE_GEOMETRY) {
  357.       // Only propagate output writes in geometry shaders when we can be sure
  358.       // that we are propagating to the same output vertex.
  359.       if (di->bb != st->bb)
  360.          return;
  361.       Instruction *i;
  362.       for (i = di; i != st; i = i->next) {
  363.          if (i->op == OP_EMIT || i->op == OP_RESTART)
  364.             return;
  365.       }
  366.       assert(i); // st after di
  367.    }
  368.  
  369.    // We cannot set defs to non-lvalues before register allocation, so
  370.    // save & remove (to save registers) the exports and replace later.
  371.    outWrites->push_back(st);
  372.    st->bb->remove(st);
  373. }
  374.  
  375. bool
  376. NV50LegalizeSSA::isARL(const Instruction *i) const
  377. {
  378.    ImmediateValue imm;
  379.  
  380.    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
  381.       return false;
  382.    if (!i->src(1).getImmediate(imm))
  383.       return false;
  384.    return imm.isInteger(0);
  385. }
  386.  
  387. void
  388. NV50LegalizeSSA::handleAddrDef(Instruction *i)
  389. {
  390.    Instruction *arl;
  391.  
  392.    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
  393.  
  394.    // PFETCH can always write to $a
  395.    if (i->op == OP_PFETCH)
  396.       return;
  397.    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
  398.    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
  399.       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
  400.          return;
  401.       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
  402.          return;
  403.    }
  404.  
  405.    // turn $a sources into $r sources (can't operate on $a)
  406.    for (int s = 0; i->srcExists(s); ++s) {
  407.       Value *a = i->getSrc(s);
  408.       Value *r;
  409.       if (a->reg.file == FILE_ADDRESS) {
  410.          if (a->getInsn() && isARL(a->getInsn())) {
  411.             i->setSrc(s, a->getInsn()->getSrc(0));
  412.          } else {
  413.             bld.setPosition(i, false);
  414.             r = bld.getSSA();
  415.             bld.mkMov(r, a);
  416.             i->setSrc(s, r);
  417.          }
  418.       }
  419.    }
  420.    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
  421.       return;
  422.  
  423.    // turn result back into $a
  424.    bld.setPosition(i, true);
  425.    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
  426.    i->setDef(0, arl->getSrc(0));
  427. }
  428.  
  429. void
  430. NV50LegalizeSSA::handleMUL(Instruction *mul)
  431. {
  432.    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
  433.       return;
  434.    Value *def = mul->getDef(0);
  435.    Value *pred = mul->getPredicate();
  436.    CondCode cc = mul->cc;
  437.    if (pred)
  438.       mul->setPredicate(CC_ALWAYS, NULL);
  439.  
  440.    if (mul->op == OP_MAD) {
  441.       Instruction *add = mul;
  442.       bld.setPosition(add, false);
  443.       Value *res = cloneShallow(func, mul->getDef(0));
  444.       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
  445.       add->op = OP_ADD;
  446.       add->setSrc(0, mul->getDef(0));
  447.       add->setSrc(1, add->getSrc(2));
  448.       for (int s = 2; add->srcExists(s); ++s)
  449.          add->setSrc(s, NULL);
  450.       mul->subOp = add->subOp;
  451.       add->subOp = 0;
  452.    }
  453.    expandIntegerMUL(&bld, mul);
  454.    if (pred)
  455.       def->getInsn()->setPredicate(cc, pred);
  456. }
  457.  
  458. // Use f32 division: first compute an approximate result, use it to reduce
  459. // the dividend, which should then be representable as f32, divide the reduced
  460. // dividend, and add the quotients.
  461. void
  462. NV50LegalizeSSA::handleDIV(Instruction *div)
  463. {
  464.    const DataType ty = div->sType;
  465.  
  466.    if (ty != TYPE_U32 && ty != TYPE_S32)
  467.       return;
  468.  
  469.    Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
  470.  
  471.    bld.setPosition(div, false);
  472.  
  473.    Value *a, *af = bld.getSSA();
  474.    Value *b, *bf = bld.getSSA();
  475.  
  476.    bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
  477.    bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
  478.  
  479.    if (isSignedType(ty)) {
  480.       af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
  481.       bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
  482.       a = bld.getSSA();
  483.       b = bld.getSSA();
  484.       bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
  485.       bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
  486.    } else {
  487.       a = div->getSrc(0);
  488.       b = div->getSrc(1);
  489.    }
  490.  
  491.    bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
  492.    bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
  493.  
  494.    bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
  495.    bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
  496.  
  497.    // get error of 1st result
  498.    expandIntegerMUL(&bld,
  499.       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
  500.    bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
  501.  
  502.    bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
  503.  
  504.    bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
  505.    bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
  506.       ->rnd = ROUND_Z;
  507.    bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
  508.  
  509.    // correction: if modulus >= divisor, add 1
  510.    expandIntegerMUL(&bld,
  511.       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
  512.    bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
  513.    bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
  514.    if (!isSignedType(ty)) {
  515.       div->op = OP_SUB;
  516.       div->setSrc(0, q);
  517.       div->setSrc(1, s);
  518.    } else {
  519.       t = q;
  520.       bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
  521.       s = bld.getSSA();
  522.       t = bld.getSSA();
  523.       // fix the sign
  524.       bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
  525.          ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
  526.       bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
  527.       bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
  528.  
  529.       div->op = OP_UNION;
  530.       div->setSrc(0, s);
  531.       div->setSrc(1, t);
  532.    }
  533. }
  534.  
  535. void
  536. NV50LegalizeSSA::handleMOD(Instruction *mod)
  537. {
  538.    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
  539.       return;
  540.    bld.setPosition(mod, false);
  541.  
  542.    Value *q = bld.getSSA();
  543.    Value *m = bld.getSSA();
  544.  
  545.    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
  546.    handleDIV(q->getInsn());
  547.  
  548.    bld.setPosition(mod, false);
  549.    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
  550.  
  551.    mod->op = OP_SUB;
  552.    mod->setSrc(1, m);
  553. }
  554.  
  555. bool
  556. NV50LegalizeSSA::visit(BasicBlock *bb)
  557. {
  558.    Instruction *insn, *next;
  559.    // skipping PHIs (don't pass them to handleAddrDef) !
  560.    for (insn = bb->getEntry(); insn; insn = next) {
  561.       next = insn->next;
  562.  
  563.       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
  564.          handleAddrDef(insn);
  565.  
  566.       switch (insn->op) {
  567.       case OP_EXPORT:
  568.          if (outWrites)
  569.             propagateWriteToOutput(insn);
  570.          break;
  571.       case OP_DIV:
  572.          handleDIV(insn);
  573.          break;
  574.       case OP_MOD:
  575.          handleMOD(insn);
  576.          break;
  577.       case OP_MAD:
  578.       case OP_MUL:
  579.          handleMUL(insn);
  580.          break;
  581.       default:
  582.          break;
  583.       }
  584.    }
  585.    return true;
  586. }
  587.  
  588. class NV50LoweringPreSSA : public Pass
  589. {
  590. public:
  591.    NV50LoweringPreSSA(Program *);
  592.  
  593. private:
  594.    virtual bool visit(Instruction *);
  595.    virtual bool visit(Function *);
  596.  
  597.    bool handleRDSV(Instruction *);
  598.    bool handleWRSV(Instruction *);
  599.  
  600.    bool handlePFETCH(Instruction *);
  601.    bool handleEXPORT(Instruction *);
  602.    bool handleLOAD(Instruction *);
  603.  
  604.    bool handleDIV(Instruction *);
  605.    bool handleSQRT(Instruction *);
  606.    bool handlePOW(Instruction *);
  607.  
  608.    bool handleSET(Instruction *);
  609.    bool handleSLCT(CmpInstruction *);
  610.    bool handleSELP(Instruction *);
  611.  
  612.    bool handleTEX(TexInstruction *);
  613.    bool handleTXB(TexInstruction *); // I really
  614.    bool handleTXL(TexInstruction *); // hate
  615.    bool handleTXD(TexInstruction *); // these 3
  616.    bool handleTXLQ(TexInstruction *);
  617.  
  618.    bool handleCALL(Instruction *);
  619.    bool handlePRECONT(Instruction *);
  620.    bool handleCONT(Instruction *);
  621.  
  622.    void checkPredicate(Instruction *);
  623.    void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
  624.    void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
  625.  
  626. private:
  627.    const Target *const targ;
  628.  
  629.    BuildUtil bld;
  630.  
  631.    Value *tid;
  632. };
  633.  
  634. NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
  635.    targ(prog->getTarget()), tid(NULL)
  636. {
  637.    bld.setProgram(prog);
  638. }
  639.  
  640. bool
  641. NV50LoweringPreSSA::visit(Function *f)
  642. {
  643.    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
  644.  
  645.    if (prog->getType() == Program::TYPE_COMPUTE) {
  646.       // Add implicit "thread id" argument in $r0 to the function
  647.       Value *arg = new_LValue(func, FILE_GPR);
  648.       arg->reg.data.id = 0;
  649.       f->ins.push_back(arg);
  650.  
  651.       bld.setPosition(root, false);
  652.       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
  653.    }
  654.  
  655.    return true;
  656. }
  657.  
  658. void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
  659.                                        Value **ms_x, Value **ms_y) {
  660.    // This loads the texture-indexed ms setting from the constant buffer
  661.    Value *tmp = new_LValue(func, FILE_GPR);
  662.    uint8_t b = prog->driver->io.resInfoCBSlot;
  663.    off += prog->driver->io.suInfoBase;
  664.    if (prog->getType() > Program::TYPE_VERTEX)
  665.       off += 16 * 2 * 4;
  666.    if (prog->getType() > Program::TYPE_GEOMETRY)
  667.       off += 16 * 2 * 4;
  668.    *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
  669.                              FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
  670.    *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
  671.                              FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
  672.    *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
  673. }
  674.  
  675. void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
  676.    // Given a MS level, and a sample id, compute the delta x/y
  677.    uint8_t b = prog->driver->io.msInfoCBSlot;
  678.    Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
  679.  
  680.    // The required information is at mslevel * 16 * 4 + sample * 8
  681.    // = (mslevel * 8 + sample) * 8
  682.    bld.mkOp2(OP_SHL,
  683.              TYPE_U32,
  684.              off,
  685.              bld.mkOp2v(OP_ADD, TYPE_U32, t,
  686.                         bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
  687.                         s),
  688.              bld.mkImm(3));
  689.    *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
  690.                            FILE_MEMORY_CONST, b, TYPE_U32,
  691.                            prog->driver->io.msInfoBase), off);
  692.    *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
  693.                            FILE_MEMORY_CONST, b, TYPE_U32,
  694.                            prog->driver->io.msInfoBase + 4), off);
  695. }
  696.  
  697. bool
  698. NV50LoweringPreSSA::handleTEX(TexInstruction *i)
  699. {
  700.    const int arg = i->tex.target.getArgCount();
  701.    const int dref = arg;
  702.    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
  703.  
  704.    // handle MS, which means looking up the MS params for this texture, and
  705.    // adjusting the input coordinates to point at the right sample.
  706.    if (i->tex.target.isMS()) {
  707.       Value *x = i->getSrc(0);
  708.       Value *y = i->getSrc(1);
  709.       Value *s = i->getSrc(arg - 1);
  710.       Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
  711.          *ms, *ms_x, *ms_y, *dx, *dy;
  712.  
  713.       i->tex.target.clearMS();
  714.  
  715.       loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
  716.       loadMsInfo(ms, s, &dx, &dy);
  717.  
  718.       bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
  719.       bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
  720.       bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
  721.       bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
  722.       i->setSrc(0, tx);
  723.       i->setSrc(1, ty);
  724.       i->setSrc(arg - 1, bld.loadImm(NULL, 0));
  725.    }
  726.  
  727.    // dref comes before bias/lod
  728.    if (i->tex.target.isShadow())
  729.       if (i->op == OP_TXB || i->op == OP_TXL)
  730.          i->swapSources(dref, lod);
  731.  
  732.    if (i->tex.target.isArray()) {
  733.       if (i->op != OP_TXF) {
  734.          // array index must be converted to u32, but it's already an integer
  735.          // for TXF
  736.          Value *layer = i->getSrc(arg - 1);
  737.          LValue *src = new_LValue(func, FILE_GPR);
  738.          bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
  739.          bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
  740.          i->setSrc(arg - 1, src);
  741.       }
  742.       if (i->tex.target.isCube() && i->srcCount() > 4) {
  743.          std::vector<Value *> acube, a2d;
  744.          int c;
  745.  
  746.          acube.resize(4);
  747.          for (c = 0; c < 4; ++c)
  748.             acube[c] = i->getSrc(c);
  749.          a2d.resize(4);
  750.          for (c = 0; c < 3; ++c)
  751.             a2d[c] = new_LValue(func, FILE_GPR);
  752.          a2d[3] = NULL;
  753.  
  754.          bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
  755.                    a2d, acube)->asTex()->tex.mask = 0x7;
  756.  
  757.          for (c = 0; c < 3; ++c)
  758.             i->setSrc(c, a2d[c]);
  759.          for (; i->srcExists(c + 1); ++c)
  760.             i->setSrc(c, i->getSrc(c + 1));
  761.          i->setSrc(c, NULL);
  762.          assert(c <= 4);
  763.  
  764.          i->tex.target = i->tex.target.isShadow() ?
  765.             TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
  766.       }
  767.    }
  768.  
  769.    // texel offsets are 3 immediate fields in the instruction,
  770.    // nv50 cannot do textureGatherOffsets
  771.    assert(i->tex.useOffsets <= 1);
  772.    if (i->tex.useOffsets) {
  773.       for (int c = 0; c < 3; ++c) {
  774.          ImmediateValue val;
  775.          if (!i->offset[0][c].getImmediate(val))
  776.             assert(!"non-immediate offset");
  777.          i->tex.offset[c] = val.reg.data.u32;
  778.          i->offset[0][c].set(NULL);
  779.       }
  780.    }
  781.  
  782.    return true;
  783. }
  784.  
  785. // Bias must be equal for all threads of a quad or lod calculation will fail.
  786. //
  787. // The lanes of a quad are grouped by the bit in the condition register they
  788. // have set, which is selected by differing bias values.
  789. // Move the input values for TEX into a new register set for each group and
  790. // execute TEX only for a specific group.
  791. // We always need to use 4 new registers for the inputs/outputs because the
  792. // implicitly calculated derivatives must be correct.
  793. //
  794. // TODO: move to SSA phase so we can easily determine whether bias is constant
  795. bool
  796. NV50LoweringPreSSA::handleTXB(TexInstruction *i)
  797. {
  798.    const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
  799.    int l, d;
  800.  
  801.    // We can't actually apply bias *and* do a compare for a cube
  802.    // texture. Since the compare has to be done before the filtering, just
  803.    // drop the bias on the floor.
  804.    if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
  805.       i->op = OP_TEX;
  806.       i->setSrc(3, i->getSrc(4));
  807.       i->setSrc(4, NULL);
  808.       return handleTEX(i);
  809.    }
  810.  
  811.    handleTEX(i);
  812.    Value *bias = i->getSrc(i->tex.target.getArgCount());
  813.    if (bias->isUniform())
  814.       return true;
  815.  
  816.    Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
  817.                                  bld.loadImm(NULL, 1));
  818.    bld.setPosition(cond, false);
  819.  
  820.    for (l = 1; l < 4; ++l) {
  821.       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
  822.       Value *bit = bld.getSSA();
  823.       Value *pred = bld.getScratch(1, FILE_FLAGS);
  824.       Value *imm = bld.loadImm(NULL, (1 << l));
  825.       bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
  826.       bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
  827.       cond->setSrc(l, bit);
  828.    }
  829.    Value *flags = bld.getScratch(1, FILE_FLAGS);
  830.    bld.setPosition(cond, true);
  831.    bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
  832.  
  833.    Instruction *tex[4];
  834.    for (l = 0; l < 4; ++l) {
  835.       (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
  836.       bld.insert(tex[l]);
  837.    }
  838.  
  839.    Value *res[4][4];
  840.    for (d = 0; i->defExists(d); ++d)
  841.       res[0][d] = tex[0]->getDef(d);
  842.    for (l = 1; l < 4; ++l) {
  843.       for (d = 0; tex[l]->defExists(d); ++d) {
  844.          res[l][d] = cloneShallow(func, res[0][d]);
  845.          bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
  846.       }
  847.    }
  848.  
  849.    for (d = 0; i->defExists(d); ++d) {
  850.       Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
  851.       for (l = 0; l < 4; ++l)
  852.          dst->setSrc(l, res[l][d]);
  853.    }
  854.    delete_Instruction(prog, i);
  855.    return true;
  856. }
  857.  
  858. // LOD must be equal for all threads of a quad.
  859. // Unlike with TXB, here we can just diverge since there's no LOD calculation
  860. // that would require all 4 threads' sources to be set up properly.
  861. bool
  862. NV50LoweringPreSSA::handleTXL(TexInstruction *i)
  863. {
  864.    handleTEX(i);
  865.    Value *lod = i->getSrc(i->tex.target.getArgCount());
  866.    if (lod->isUniform())
  867.       return true;
  868.  
  869.    BasicBlock *currBB = i->bb;
  870.    BasicBlock *texiBB = i->bb->splitBefore(i, false);
  871.    BasicBlock *joinBB = i->bb->splitAfter(i);
  872.  
  873.    bld.setPosition(currBB, true);
  874.    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
  875.  
  876.    for (int l = 0; l <= 3; ++l) {
  877.       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
  878.       Value *pred = bld.getScratch(1, FILE_FLAGS);
  879.       bld.setPosition(currBB, true);
  880.       bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
  881.       bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
  882.       currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
  883.       if (l <= 2) {
  884.          BasicBlock *laneBB = new BasicBlock(func);
  885.          currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
  886.          currBB = laneBB;
  887.       }
  888.    }
  889.    bld.setPosition(joinBB, false);
  890.    bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
  891.    return true;
  892. }
  893.  
  894. bool
  895. NV50LoweringPreSSA::handleTXD(TexInstruction *i)
  896. {
  897.    static const uint8_t qOps[4][2] =
  898.    {
  899.       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
  900.       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
  901.       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
  902.       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
  903.    };
  904.    Value *def[4][4];
  905.    Value *crd[3];
  906.    Instruction *tex;
  907.    Value *zero = bld.loadImm(bld.getSSA(), 0);
  908.    int l, c;
  909.    const int dim = i->tex.target.getDim();
  910.  
  911.    handleTEX(i);
  912.    i->op = OP_TEX; // no need to clone dPdx/dPdy later
  913.  
  914.    for (c = 0; c < dim; ++c)
  915.       crd[c] = bld.getScratch();
  916.  
  917.    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
  918.    for (l = 0; l < 4; ++l) {
  919.       // mov coordinates from lane l to all lanes
  920.       for (c = 0; c < dim; ++c)
  921.          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
  922.       // add dPdx from lane l to lanes dx
  923.       for (c = 0; c < dim; ++c)
  924.          bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
  925.       // add dPdy from lane l to lanes dy
  926.       for (c = 0; c < dim; ++c)
  927.          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
  928.       // texture
  929.       bld.insert(tex = cloneForward(func, i));
  930.       for (c = 0; c < dim; ++c)
  931.          tex->setSrc(c, crd[c]);
  932.       // save results
  933.       for (c = 0; i->defExists(c); ++c) {
  934.          Instruction *mov;
  935.          def[c][l] = bld.getSSA();
  936.          mov = bld.mkMov(def[c][l], tex->getDef(c));
  937.          mov->fixed = 1;
  938.          mov->lanes = 1 << l;
  939.       }
  940.    }
  941.    bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
  942.  
  943.    for (c = 0; i->defExists(c); ++c) {
  944.       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
  945.       for (l = 0; l < 4; ++l)
  946.          u->setSrc(l, def[c][l]);
  947.    }
  948.  
  949.    i->bb->remove(i);
  950.    return true;
  951. }
  952.  
  953. bool
  954. NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
  955. {
  956.    handleTEX(i);
  957.    bld.setPosition(i, true);
  958.  
  959.    /* The returned values are not quite what we want:
  960.     * (a) convert from s32 to f32
  961.     * (b) multiply by 1/256
  962.     */
  963.    for (int def = 0; def < 2; ++def) {
  964.       if (!i->defExists(def))
  965.          continue;
  966.       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
  967.       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
  968.                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
  969.    }
  970.    return true;
  971. }
  972.  
  973. bool
  974. NV50LoweringPreSSA::handleSET(Instruction *i)
  975. {
  976.    if (i->dType == TYPE_F32) {
  977.       bld.setPosition(i, true);
  978.       i->dType = TYPE_U32;
  979.       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
  980.       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
  981.    }
  982.    return true;
  983. }
  984.  
  985. bool
  986. NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
  987. {
  988.    Value *src0 = bld.getSSA();
  989.    Value *src1 = bld.getSSA();
  990.    Value *pred = bld.getScratch(1, FILE_FLAGS);
  991.  
  992.    Value *v0 = i->getSrc(0);
  993.    Value *v1 = i->getSrc(1);
  994.    // XXX: these probably shouldn't be immediates in the first place ...
  995.    if (v0->asImm())
  996.       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
  997.    if (v1->asImm())
  998.       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
  999.  
  1000.    bld.setPosition(i, true);
  1001.    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
  1002.    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
  1003.    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
  1004.  
  1005.    bld.setPosition(i, false);
  1006.    i->op = OP_SET;
  1007.    i->setFlagsDef(0, pred);
  1008.    i->dType = TYPE_U8;
  1009.    i->setSrc(0, i->getSrc(2));
  1010.    i->setSrc(2, NULL);
  1011.    i->setSrc(1, bld.loadImm(NULL, 0));
  1012.  
  1013.    return true;
  1014. }
  1015.  
  1016. bool
  1017. NV50LoweringPreSSA::handleSELP(Instruction *i)
  1018. {
  1019.    Value *src0 = bld.getSSA();
  1020.    Value *src1 = bld.getSSA();
  1021.  
  1022.    Value *v0 = i->getSrc(0);
  1023.    Value *v1 = i->getSrc(1);
  1024.    if (v0->asImm())
  1025.       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
  1026.    if (v1->asImm())
  1027.       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
  1028.  
  1029.    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
  1030.    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
  1031.    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
  1032.    delete_Instruction(prog, i);
  1033.    return true;
  1034. }
  1035.  
  1036. bool
  1037. NV50LoweringPreSSA::handleWRSV(Instruction *i)
  1038. {
  1039.    Symbol *sym = i->getSrc(0)->asSym();
  1040.  
  1041.    // these are all shader outputs, $sreg are not writeable
  1042.    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
  1043.    if (addr >= 0x400)
  1044.       return false;
  1045.    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
  1046.  
  1047.    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
  1048.  
  1049.    bld.getBB()->remove(i);
  1050.    return true;
  1051. }
  1052.  
  1053. bool
  1054. NV50LoweringPreSSA::handleCALL(Instruction *i)
  1055. {
  1056.    if (prog->getType() == Program::TYPE_COMPUTE) {
  1057.       // Add implicit "thread id" argument in $r0 to the function
  1058.       i->setSrc(i->srcCount(), tid);
  1059.    }
  1060.    return true;
  1061. }
  1062.  
  1063. bool
  1064. NV50LoweringPreSSA::handlePRECONT(Instruction *i)
  1065. {
  1066.    delete_Instruction(prog, i);
  1067.    return true;
  1068. }
  1069.  
  1070. bool
  1071. NV50LoweringPreSSA::handleCONT(Instruction *i)
  1072. {
  1073.    i->op = OP_BRA;
  1074.    return true;
  1075. }
  1076.  
  1077. bool
  1078. NV50LoweringPreSSA::handleRDSV(Instruction *i)
  1079. {
  1080.    Symbol *sym = i->getSrc(0)->asSym();
  1081.    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
  1082.    Value *def = i->getDef(0);
  1083.    SVSemantic sv = sym->reg.data.sv.sv;
  1084.    int idx = sym->reg.data.sv.index;
  1085.  
  1086.    if (addr >= 0x400) // mov $sreg
  1087.       return true;
  1088.  
  1089.    switch (sv) {
  1090.    case SV_POSITION:
  1091.       assert(prog->getType() == Program::TYPE_FRAGMENT);
  1092.       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
  1093.       break;
  1094.    case SV_FACE:
  1095.       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
  1096.       if (i->dType == TYPE_F32) {
  1097.          bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
  1098.          bld.mkOp1(OP_NEG, TYPE_S32, def, def);
  1099.          bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
  1100.       }
  1101.       break;
  1102.    case SV_NCTAID:
  1103.    case SV_CTAID:
  1104.    case SV_NTID:
  1105.       if ((sv == SV_NCTAID && idx >= 2) ||
  1106.           (sv == SV_NTID && idx >= 3)) {
  1107.          bld.mkMov(def, bld.mkImm(1));
  1108.       } else if (sv == SV_CTAID && idx >= 2) {
  1109.          bld.mkMov(def, bld.mkImm(0));
  1110.       } else {
  1111.          Value *x = bld.getSSA(2);
  1112.          bld.mkOp1(OP_LOAD, TYPE_U16, x,
  1113.                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
  1114.          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
  1115.       }
  1116.       break;
  1117.    case SV_TID:
  1118.       if (idx == 0) {
  1119.          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
  1120.       } else if (idx == 1) {
  1121.          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
  1122.          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
  1123.       } else if (idx == 2) {
  1124.          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
  1125.       } else {
  1126.          bld.mkMov(def, bld.mkImm(0));
  1127.       }
  1128.       break;
  1129.    case SV_SAMPLE_POS: {
  1130.       Value *off = new_LValue(func, FILE_ADDRESS);
  1131.       bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
  1132.       bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
  1133.       bld.mkLoad(TYPE_F32,
  1134.                  def,
  1135.                  bld.mkSymbol(
  1136.                        FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
  1137.                        TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
  1138.                  off);
  1139.       break;
  1140.    }
  1141.    default:
  1142.       bld.mkFetch(i->getDef(0), i->dType,
  1143.                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
  1144.       break;
  1145.    }
  1146.    bld.getBB()->remove(i);
  1147.    return true;
  1148. }
  1149.  
  1150. bool
  1151. NV50LoweringPreSSA::handleDIV(Instruction *i)
  1152. {
  1153.    if (!isFloatType(i->dType))
  1154.       return true;
  1155.    bld.setPosition(i, false);
  1156.    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
  1157.    i->op = OP_MUL;
  1158.    i->setSrc(1, rcp->getDef(0));
  1159.    return true;
  1160. }
  1161.  
  1162. bool
  1163. NV50LoweringPreSSA::handleSQRT(Instruction *i)
  1164. {
  1165.    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
  1166.                                 bld.getSSA(), i->getSrc(0));
  1167.    i->op = OP_MUL;
  1168.    i->setSrc(1, rsq->getDef(0));
  1169.  
  1170.    return true;
  1171. }
  1172.  
  1173. bool
  1174. NV50LoweringPreSSA::handlePOW(Instruction *i)
  1175. {
  1176.    LValue *val = bld.getScratch();
  1177.  
  1178.    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
  1179.    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
  1180.    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
  1181.  
  1182.    i->op = OP_EX2;
  1183.    i->setSrc(0, val);
  1184.    i->setSrc(1, NULL);
  1185.  
  1186.    return true;
  1187. }
  1188.  
  1189. bool
  1190. NV50LoweringPreSSA::handleEXPORT(Instruction *i)
  1191. {
  1192.    if (prog->getType() == Program::TYPE_FRAGMENT) {
  1193.       if (i->getIndirect(0, 0)) {
  1194.          // TODO: redirect to l[] here, load to GPRs at exit
  1195.          return false;
  1196.       } else {
  1197.          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
  1198.  
  1199.          i->op = OP_MOV;
  1200.          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
  1201.          i->src(0).set(i->src(1));
  1202.          i->setSrc(1, NULL);
  1203.          i->setDef(0, new_LValue(func, FILE_GPR));
  1204.          i->getDef(0)->reg.data.id = id;
  1205.  
  1206.          prog->maxGPR = MAX2(prog->maxGPR, id);
  1207.       }
  1208.    }
  1209.    return true;
  1210. }
  1211.  
  1212. // Handle indirect addressing in geometry shaders:
  1213. //
  1214. // ld $r0 a[$a1][$a2+k] ->
  1215. // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
  1216. //
  1217. bool
  1218. NV50LoweringPreSSA::handleLOAD(Instruction *i)
  1219. {
  1220.    ValueRef src = i->src(0);
  1221.  
  1222.    if (src.isIndirect(1)) {
  1223.       assert(prog->getType() == Program::TYPE_GEOMETRY);
  1224.       Value *addr = i->getIndirect(0, 1);
  1225.  
  1226.       if (src.isIndirect(0)) {
  1227.          // base address is in an address register, so move to a GPR
  1228.          Value *base = bld.getScratch();
  1229.          bld.mkMov(base, addr);
  1230.  
  1231.          Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
  1232.          Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
  1233.          Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
  1234.                                     i->getIndirect(0, 0), bld.mkImm(2));
  1235.  
  1236.          // Calculate final address: addr = base + attr*vstride; use 16-bit
  1237.          // multiplication since 32-bit would be lowered to multiple
  1238.          // instructions, and we only need the low 16 bits of the result
  1239.          Value *a[2], *b[2];
  1240.          bld.mkSplit(a, 2, attrib);
  1241.          bld.mkSplit(b, 2, vstride);
  1242.          Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
  1243.                                  base);
  1244.  
  1245.          // move address from GPR into an address register
  1246.          addr = bld.getSSA(2, FILE_ADDRESS);
  1247.          bld.mkMov(addr, sum);
  1248.       }
  1249.  
  1250.       i->setIndirect(0, 1, NULL);
  1251.       i->setIndirect(0, 0, addr);
  1252.    }
  1253.  
  1254.    return true;
  1255. }
  1256.  
  1257. bool
  1258. NV50LoweringPreSSA::handlePFETCH(Instruction *i)
  1259. {
  1260.    assert(prog->getType() == Program::TYPE_GEOMETRY);
  1261.  
  1262.    // NOTE: cannot use getImmediate here, not in SSA form yet, move to
  1263.    // later phase if that assertion ever triggers:
  1264.  
  1265.    ImmediateValue *imm = i->getSrc(0)->asImm();
  1266.    assert(imm);
  1267.  
  1268.    assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
  1269.  
  1270.    if (i->srcExists(1)) {
  1271.       // indirect addressing of vertex in primitive space
  1272.  
  1273.       LValue *val = bld.getScratch();
  1274.       Value *ptr = bld.getSSA(2, FILE_ADDRESS);
  1275.       bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
  1276.       bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
  1277.  
  1278.       // NOTE: PFETCH directly to an $aX only works with direct addressing
  1279.       i->op = OP_SHL;
  1280.       i->setSrc(0, val);
  1281.       i->setSrc(1, bld.mkImm(0));
  1282.    }
  1283.  
  1284.    return true;
  1285. }
  1286.  
  1287. // Set flags according to predicate and make the instruction read $cX.
  1288. void
  1289. NV50LoweringPreSSA::checkPredicate(Instruction *insn)
  1290. {
  1291.    Value *pred = insn->getPredicate();
  1292.    Value *cdst;
  1293.  
  1294.    // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
  1295.    if (!pred ||
  1296.        pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
  1297.       return;
  1298.  
  1299.    cdst = bld.getSSA(1, FILE_FLAGS);
  1300.  
  1301.    bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
  1302.  
  1303.    insn->setPredicate(insn->cc, cdst);
  1304. }
  1305.  
  1306. //
  1307. // - add quadop dance for texturing
  1308. // - put FP outputs in GPRs
  1309. // - convert instruction sequences
  1310. //
  1311. bool
  1312. NV50LoweringPreSSA::visit(Instruction *i)
  1313. {
  1314.    bld.setPosition(i, false);
  1315.  
  1316.    if (i->cc != CC_ALWAYS)
  1317.       checkPredicate(i);
  1318.  
  1319.    switch (i->op) {
  1320.    case OP_TEX:
  1321.    case OP_TXF:
  1322.    case OP_TXG:
  1323.       return handleTEX(i->asTex());
  1324.    case OP_TXB:
  1325.       return handleTXB(i->asTex());
  1326.    case OP_TXL:
  1327.       return handleTXL(i->asTex());
  1328.    case OP_TXD:
  1329.       return handleTXD(i->asTex());
  1330.    case OP_TXLQ:
  1331.       return handleTXLQ(i->asTex());
  1332.    case OP_EX2:
  1333.       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
  1334.       i->setSrc(0, i->getDef(0));
  1335.       break;
  1336.    case OP_SET:
  1337.       return handleSET(i);
  1338.    case OP_SLCT:
  1339.       return handleSLCT(i->asCmp());
  1340.    case OP_SELP:
  1341.       return handleSELP(i);
  1342.    case OP_POW:
  1343.       return handlePOW(i);
  1344.    case OP_DIV:
  1345.       return handleDIV(i);
  1346.    case OP_SQRT:
  1347.       return handleSQRT(i);
  1348.    case OP_EXPORT:
  1349.       return handleEXPORT(i);
  1350.    case OP_LOAD:
  1351.       return handleLOAD(i);
  1352.    case OP_RDSV:
  1353.       return handleRDSV(i);
  1354.    case OP_WRSV:
  1355.       return handleWRSV(i);
  1356.    case OP_CALL:
  1357.       return handleCALL(i);
  1358.    case OP_PRECONT:
  1359.       return handlePRECONT(i);
  1360.    case OP_CONT:
  1361.       return handleCONT(i);
  1362.    case OP_PFETCH:
  1363.       return handlePFETCH(i);
  1364.    default:
  1365.       break;
  1366.    }
  1367.    return true;
  1368. }
  1369.  
  1370. bool
  1371. TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
  1372. {
  1373.    bool ret = false;
  1374.  
  1375.    if (stage == CG_STAGE_PRE_SSA) {
  1376.       NV50LoweringPreSSA pass(prog);
  1377.       ret = pass.run(prog, false, true);
  1378.    } else
  1379.    if (stage == CG_STAGE_SSA) {
  1380.       if (!prog->targetPriv)
  1381.          prog->targetPriv = new std::list<Instruction *>();
  1382.       NV50LegalizeSSA pass(prog);
  1383.       ret = pass.run(prog, false, true);
  1384.    } else
  1385.    if (stage == CG_STAGE_POST_RA) {
  1386.       NV50LegalizePostRA pass;
  1387.       ret = pass.run(prog, false, true);
  1388.       if (prog->targetPriv)
  1389.          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
  1390.    }
  1391.    return ret;
  1392. }
  1393.  
  1394. } // namespace nv50_ir
  1395.