/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
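// For example, QUADOP(MOV2, ADD, MOV2, ADD) encodes to
// (3 << 6) | (0 << 4) | (3 << 2) | 0 = 0xcc: MOV2 in the upper-left and
// lower-left lanes, ADD in the upper-right and lower-right ones.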
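// Lower 32-bit integer DIV/MOD to a call into a builtin library routine,
// since there is no native integer-division instruction here. The operands
// are moved into $r0/$r1, the result is picked up from $r0 (quotient) or
// $r1 (remainder), and the clobber masks cover whatever else the builtin
// may overwrite.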
void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;
   Value *def[2];

   bld.setPosition(i, false);
   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.

   bld.setPosition(i, false);

   // 1. Take the source and split it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}

void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
   assert(i->sType == TYPE_F32);

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->sType == TYPE_F32) {
         if (prog->getType() != Program::TYPE_COMPUTE)
            handleFTZ(i);
         continue;
      }
      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      default:
         break;
      }
   }
   return true;
}

NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0)
{
}

bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

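// Record usei as a first use of the TEX's results, keeping the list minimal:
// a use that is dominated by an already-recorded use is redundant, and any
// recorded use dominated by the new one is dropped in its favour.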
void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *insn)
{
   bool add = true;
   for (std::list<TexUse>::iterator it = uses.begin();
        it != uses.end();) {
      if (insnDominatedBy(usei, it->insn)) {
         add = false;
         break;
      }
      if (insnDominatedBy(it->insn, usei))
         it = uses.erase(it);
      else
         ++it;
   }
   if (add)
      uses.push_back(TexUse(usei, insn));
}

void
NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
                                        Instruction *insn,
                                        const BasicBlock *term,
                                        std::list<TexUse> &uses)
{
   while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
      insn = insn->getSrc(0)->getUniqueInsn();

   if (!insn->bb->reachableBy(texi->bb, term))
      return;

   switch (insn->op) {
   /* Values not connected to the tex's definition through any of these should
    * not be conflicting.
    */
   case OP_SPLIT:
   case OP_MERGE:
   case OP_PHI:
   case OP_UNION:
      /* recurse again */
      for (int s = 0; insn->srcExists(s); ++s)
         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
                             uses);
      break;
   default:
      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
      addTexUse(uses, insn, texi);
      break;
   }
}

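// Walk the uses of the TEX's outputs through pseudo ops (SPLIT/MERGE/PHI/
// UNION and no-op moves, which never show up in the machine code) and
// collect the instructions that actually consume the values first.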
void
NVC0LegalizePostRA::findFirstUses(
      const Instruction *texi,
      const Instruction *insn,
      std::list<TexUse> &uses,
      std::tr1::unordered_set<const Instruction *>& visited)
{
   for (int d = 0; insn->defExists(d); ++d) {
      Value *v = insn->getDef(d);
      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
         Instruction *usei = (*u)->getInsn();

         // NOTE: In case of a loop that overwrites a value but never uses
         // it, it can happen that we have a cycle of uses that consists only
         // of phis and no-op moves and will thus cause an infinite loop here
         // since these are not considered actual uses.
         // The most obvious (and perhaps the only) way to prevent this is to
         // remember which instructions we've already visited.

         if (visited.find(usei) != visited.end())
            continue;

         visited.insert(usei);

         if (usei->op == OP_PHI || usei->op == OP_UNION) {
            // need a barrier before WAW cases
            for (int s = 0; usei->srcExists(s); ++s) {
               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
               if (defi && &usei->src(s) != *u)
                  findOverwritingDefs(texi, defi, usei->bb, uses);
            }
         }

         if (usei->op == OP_SPLIT ||
             usei->op == OP_MERGE ||
             usei->op == OP_PHI ||
             usei->op == OP_UNION) {
            // these uses don't manifest in the machine code
            findFirstUses(texi, usei, uses, visited);
         } else
         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
            findFirstUses(texi, usei, uses, visited);
         } else {
            addTexUse(uses, usei, insn);
         }
      }
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      std::tr1::unordered_set<const Instruction *> visited;
      findFirstUses(texes[i], texes[i], uses[i], visited);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}

bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
   carry->reg.data.id = 0;

   return true;
}

void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm && imm->reg.data.u64 == 0)
         i->setSrc(s, rZero);
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
   gMemBase = NULL;
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

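// Load the texture handle word for the given slot from the driver's resource
// info constant buffer, optionally indexed by ptr.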
inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.resInfoCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         Value *hnd = loadTexHandle(
               bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                          i->getIndirectR(), bld.mkImm(2)),
               i->tex.r);
         i->tex.r = 0xff;
         i->tex.s = 0x1f;
         i->setIndirectR(hnd);
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      for (int s = dim; s >= 1; --s)
         i->setSrc(s, i->getSrc(s - 1));
      i->setSrc(0, arrayIndex);

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  offs[n / 2] = i->offset[n][c].get();
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   if (chipset >= NVISA_GK104_CHIPSET) {
      //
      // If TEX requires more than 4 sources, the 2nd register tuple must be
      // aligned to 4, even if it consists of just a single 4-byte register.
      //
      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
      //
      int s = i->srcCount(0xff, true);
      if (s > 4 && s < 7) {
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 7 - s);
         while (s < 7)
            i->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

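// Emulate TXD with a plain TEX per quad lane: for each lane l, broadcast
// that lane's coordinates to the whole quad, shift the right/bottom lanes by
// lane l's dPdx/dPdy via quad ops, sample, and keep only lane l's result.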
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();
   const int array = i->tex.target.isArray();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim();
   unsigned arg = txd->tex.target.getArgCount();
   unsigned expected_args = arg;
   const int chipset = prog->getTarget()->getChipset();

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
         expected_args++;
      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
         expected_args++;
   } else {
      if (txd->tex.useOffsets)
         expected_args++;
      if (!txd->tex.target.isArray() && (
                txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
         expected_args++;
   }

   if (expected_args > 4 ||
       dim > 2 ||
       txd->tex.target.isShadow() ||
       txd->tex.target.isCube())
      txd->op = OP_TEX;

   handleTEX(txd);
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   if (txd->op == OP_TEX)
      return handleManualTXD(txd);

   assert(arg == expected_args);
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }
   return true;
}

bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   // TODO: indirect resource/sampler index
   return true;
}

bool
NVC0LoweringPass::handleTXLQ(TexInstruction *i)
{
   /* The outputs are inverted compared to what the TGSI instruction
    * expects. Take that into account in the mask.
    */
   assert((i->tex.mask & ~3) == 0);
   if (i->tex.mask == 1)
      i->tex.mask = 2;
   else if (i->tex.mask == 2)
      i->tex.mask = 1;
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s16/u16 to f32
    * (b) multiply by 1/256
    */
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      enum DataType type = TYPE_S16;
      if (i->tex.mask == 2 || def > 0)
         type = TYPE_U16;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   if (i->tex.mask == 3) {
      LValue *t = new_LValue(func, FILE_GPR);
      bld.mkMov(t, i->getDef(0));
      bld.mkMov(i->getDef(0), i->getDef(1));
      bld.mkMov(i->getDef(1), t);
   }
   return true;
}

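// Lower atomics on shared/local memory to global atomics: the effective
// address is the region's base (SV_SBASE/SV_LBASE) plus the original offset,
// and the symbol is retyped to FILE_MEMORY_GLOBAL.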
bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      sv = SV_SBASE;
      break;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
      return true;
   }
   Value *base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
   Value *ptr = atom->getIndirect(0, 0);

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 0, base);

   return true;
}

bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
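      // Emit a cache-control op first (subOp IV), presumably to invalidate
      // the cached line so the CAS/EXCH operates on coherent memory.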
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
      Value *dreg = bld.getSSA(8);
      bld.setPosition(cas, false);
      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
      cas->setSrc(1, dreg);
   }

   return true;
}

inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.resInfoCBSlot;
   off += prog->driver->io.suInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.msInfoCBSlot;
   off += prog->driver->io.msInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

/* On nvc0, surface info is obtained via the surface binding points passed
 * to the SULD/SUST instructions.
 * On nve4, surface info is stored in c[] and is used by various special
 * instructions, e.g. for clamping coordinates or generating an address.
 * They couldn't just have added an equivalent to TIC now, couldn't they ?
 */
#define NVE4_SU_INFO_ADDR   0x00
#define NVE4_SU_INFO_FMT    0x04
#define NVE4_SU_INFO_DIM_X  0x08
#define NVE4_SU_INFO_PITCH  0x0c
#define NVE4_SU_INFO_DIM_Y  0x10
#define NVE4_SU_INFO_ARRAY  0x14
#define NVE4_SU_INFO_DIM_Z  0x18
#define NVE4_SU_INFO_UNK1C  0x1c
#define NVE4_SU_INFO_WIDTH  0x20
#define NVE4_SU_INFO_HEIGHT 0x24
#define NVE4_SU_INFO_DEPTH  0x28
#define NVE4_SU_INFO_TARGET 0x2c
#define NVE4_SU_INFO_CALL   0x30
#define NVE4_SU_INFO_RAW_X  0x34
#define NVE4_SU_INFO_MS_X   0x38
#define NVE4_SU_INFO_MS_Y   0x3c

#define NVE4_SU_INFO__STRIDE 0x40

#define NVE4_SU_INFO_DIM(i)  (0x08 + (i) * 8)
#define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
#define NVE4_SU_INFO_MS(i)   (0x38 + (i) * 4)

static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
{
   switch (su->tex.target.getEnum()) {
   case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
   case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
                                   NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
                                   NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   default:
      assert(0);
      return 0;
   }
}

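// Lower an MS texel fetch to a 2D/2D-array access: x/y are scaled by the
// per-surface sample-grid shifts from the resource info, and the sample
// index selects an x/y offset pair from the driver's MS info table.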
void
NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
{
   const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
   const int arg = tex->tex.target.getArgCount();

   if (tex->tex.target == TEX_TARGET_2D_MS)
      tex->tex.target = TEX_TARGET_2D;
   else
   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
      tex->tex.target = TEX_TARGET_2D_ARRAY;
   else
      return;

   Value *x = tex->getSrc(0);
   Value *y = tex->getSrc(1);
   Value *s = tex->getSrc(arg - 1);

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();

   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);

   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));

   Value *dx = loadMsInfo32(ts, 0x0);
   Value *dy = loadMsInfo32(ts, 0x4);

   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);

   tex->setSrc(0, tx);
   tex->setSrc(1, ty);
   tex->moveSources(arg, -1);
}

// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int idx = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() ? 1 : 0);
   const uint16_t base = idx * NVE4_SU_INFO__STRIDE;
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;
   Value *v;
   Value *src[3];
   Value *bf, *eau, *off;
   Value *addr, *pred;

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      src[c] = bld.getScratch();
      if (c == 0 && raw)
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
      else
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, c);
   }
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray()) {
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = su->tex.target.isArray() ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         bf = src[0];
      } else {
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray()) {
            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray()) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         lo = zero;
         bld.mkMov(off, bf);
      }
      //  bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);
}

void
NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
{
   processSurfaceCoordsNVE4(su);

   // Who do we hate more ? The person who decided that nvc0's SULD doesn't
   // have to support conversion or the person who decided that, in OpenCL,
   // you don't have to specify the format here like you do in OpenGL ?

   if (su->op == OP_SULDP) {
      // We don't patch shaders. Ever.
      // You get an indirect call to our library blob here.
      // But at least it's uniform.
      FlowInstruction *call;
      LValue *p[3];
      LValue *r[5];
      uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL;

      for (int i = 0; i < 4; ++i)
         (r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i;
      for (int i = 0; i < 3; ++i)
         (p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i;
      (r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4;

      bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8);
      bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 1 : 0), TYPE_U8);
      bld.mkMov(p[0], su->getSrc(2), TYPE_U8);
      bld.mkMov(r[4], su->getSrc(0), TYPE_U64);
      bld.mkMov(r[2], su->getSrc(1), TYPE_U32);

      call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate());

      call->indirect = 1;
      call->absolute = 1;
      call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
                                   prog->driver->io.resInfoCBSlot, TYPE_U32,
                                   prog->driver->io.suInfoBase + base));
      call->setSrc(1, r[2]);
      call->setSrc(2, r[4]);
      for (int i = 0; i < 3; ++i)
         call->setSrc(3 + i, p[i]);
      for (int i = 0; i < 4; ++i) {
         call->setDef(i, r[i]);
         bld.mkMov(su->getDef(i), r[i]);
      }
      call->setDef(4, p[1]);
      delete_Instruction(bld.getProgram(), su);
      return; // su is gone now, must not be touched below
   }

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      // FIXME: for out of bounds access, destination value will be undefined !
      Value *pred = su->getSrc(2);
      CondCode cc = CC_NOT_P;
      if (su->getPredicate()) {
         pred = bld.getScratch(1, FILE_PREDICATE);
         cc = su->cc;
         if (cc == CC_NOT_P) {
            bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
         } else {
            bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
            pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT);
         }
      }
      Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0));
      red->subOp = su->subOp;
      if (!gMemBase)
         gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0);
      red->setSrc(0, gMemBase);
      red->setSrc(1, su->getSrc(3));
      if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(4));
      red->setIndirect(0, 0, su->getSrc(0));
      red->setPredicate(cc, pred);
      delete_Instruction(bld.getProgram(), su);
      handleCasExch(red, true);
   } else {
      su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
   }
}

bool
NVC0LoweringPass::handleWRSV(Instruction *i)
{
   Instruction *st;
   Symbol *sym;
   uint32_t addr;

   // must replace, $sreg are not writeable
   addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
   if (addr >= 0x400)
      return false;
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);

   st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
                    i->getSrc(1));
   st->perPatch = i->perPatch;

   bld.getBB()->remove(i);
   return true;
}

  1435. void
  1436. NVC0LoweringPass::readTessCoord(LValue *dst, int c)
  1437. {
  1438.    Value *laneid = bld.getSSA();
  1439.    Value *x, *y;
  1440.  
  1441.    bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
  1442.  
  1443.    if (c == 0) {
  1444.       x = dst;
  1445.       y = NULL;
  1446.    } else
  1447.    if (c == 1) {
  1448.       x = NULL;
  1449.       y = dst;
  1450.    } else {
  1451.       assert(c == 2);
  1452.       x = bld.getSSA();
  1453.       y = bld.getSSA();
  1454.    }
  1455.    if (x)
  1456.       bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
  1457.    if (y)
  1458.       bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
  1459.  
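           // The third tess coordinate is derived rather than fetched,
           // presumably relying on the barycentric property of triangle
           // domains that u + v + w = 1, so (a worked identity)
           //    w = 1.0 - u - v
           // which is exactly the ADD/SUB pair below.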
  1460.    if (c == 2) {
  1461.       bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
  1462.       bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
  1463.    }
  1464. }
  1465.  
  1466. bool
  1467. NVC0LoweringPass::handleRDSV(Instruction *i)
  1468. {
  1469.    Symbol *sym = i->getSrc(0)->asSym();
  1470.    const SVSemantic sv = sym->reg.data.sv.sv;
  1471.    Value *vtx = NULL;
  1472.    Instruction *ld;
  1473.    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
  1474.  
  1475.    if (addr >= 0x400) {
  1476.       // mov $sreg
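              // (an SV address of 0x400 or above appears to mean the value
              // lives in a hardware system register, so the OP_RDSV is kept
              // and emitted as a plain $sreg read later)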
  1477.       if (sym->reg.data.sv.index == 3) {
  1478.          // the TGSI backend may use the 4th component of TID, NTID, CTAID, NCTAID
  1479.          i->op = OP_MOV;
  1480.          i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
  1481.       }
  1482.       return true;
  1483.    }
  1484.  
  1485.    switch (sv) {
  1486.    case SV_POSITION:
  1487.       assert(prog->getType() == Program::TYPE_FRAGMENT);
  1488.       if (i->srcExists(1)) {
  1489.          // Pass offset through to the interpolation logic
  1490.          ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
  1491.                            i->getDef(0), addr, NULL);
  1492.          ld->setSrc(1, i->getSrc(1));
  1493.       } else {
  1494.          bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
  1495.       }
  1496.       break;
  1497.    case SV_FACE:
  1498.    {
  1499.       Value *face = i->getDef(0);
  1500.       bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
  1501.       if (i->dType == TYPE_F32) {
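                 // The flat FACE input is 0 or ~0 here (judging by this
                 // fixup); x|1 turns that into 1 or -1, integer negation
                 // flips it to -1 or +1, and the CVT produces -1.0f / +1.0f
                 // for the two cases.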
  1502.          bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
  1503.          bld.mkOp1(OP_NEG, TYPE_S32, face, face);
  1504.          bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
  1505.       }
  1506.    }
  1507.       break;
  1508.    case SV_TESS_COORD:
  1509.       assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
  1510.       readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
  1511.       break;
  1512.    case SV_NTID:
  1513.    case SV_NCTAID:
  1514.    case SV_GRIDID:
  1515.       assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
  1516.       if (sym->reg.data.sv.index == 3) {
  1517.          i->op = OP_MOV;
  1518.          i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
  1519.          return true;
  1520.       }
  1521.       addr += prog->driver->prop.cp.gridInfoBase;
  1522.       bld.mkLoad(TYPE_U32, i->getDef(0),
  1523.                  bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
  1524.       break;
  1525.    case SV_SAMPLE_INDEX:
  1526.       // TODO: Properly pass source as an address in the PIX address space
  1527.       // (which can be of the form [r0+offset]). But this is currently
  1528.       // unnecessary.
  1529.       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
  1530.       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
  1531.       break;
  1532.    case SV_SAMPLE_POS: {
  1533.       Value *off = new_LValue(func, FILE_GPR);
  1534.       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
  1535.       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
  1536.       bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
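              // off = sampleid * 8: each sample presumably occupies two
              // 32-bit floats in the driver's sample-position table, and the
              // constant part of the address (sampleInfoBase + 4 * index)
              // picks the x or y component within that pair.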
  1537.       bld.mkLoad(TYPE_F32,
  1538.                  i->getDef(0),
  1539.                  bld.mkSymbol(
  1540.                        FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
  1541.                        TYPE_U32, prog->driver->io.sampleInfoBase +
  1542.                        4 * sym->reg.data.sv.index),
  1543.                  off);
  1544.       break;
  1545.    }
  1546.    case SV_SAMPLE_MASK:
  1547.       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
  1548.       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
  1549.       break;
  1550.    default:
  1551.       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
  1552.          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
  1553.       ld = bld.mkFetch(i->getDef(0), i->dType,
  1554.                        FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
  1555.       ld->perPatch = i->perPatch;
  1556.       break;
  1557.    }
  1558.    bld.getBB()->remove(i);
  1559.    return true;
  1560. }
  1561.  
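        // Float division is lowered via the reciprocal identity (a sketch of
        // the math, not a claim of correctly-rounded IEEE division):
        //    a / b  ->  a * rcp(b)
        // Non-float types fall through untouched (the early return below).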
  1562. bool
  1563. NVC0LoweringPass::handleDIV(Instruction *i)
  1564. {
  1565.    if (!isFloatType(i->dType))
  1566.       return true;
  1567.    bld.setPosition(i, false);
  1568.    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
  1569.    i->op = OP_MUL;
  1570.    i->setSrc(1, rcp->getDef(0));
  1571.    return true;
  1572. }
  1573.  
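        // Float MOD is expanded with the textbook identity, again through RCP
        // (a sketch; accuracy is limited by the RCP/MUL chain):
        //    mod(a, b) = a - b * trunc(a * rcp(b))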
  1574. bool
  1575. NVC0LoweringPass::handleMOD(Instruction *i)
  1576. {
  1577.    if (!isFloatType(i->dType))
  1578.       return true;
  1579.    LValue *value = bld.getScratch(typeSizeof(i->dType));
  1580.    bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
  1581.    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
  1582.    bld.mkOp1(OP_TRUNC, i->dType, value, value);
  1583.    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
  1584.    i->op = OP_SUB;
  1585.    i->setSrc(1, value);
  1586.    return true;
  1587. }
  1588.  
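        // SQRT is built from RSQ (a sketch of the identity used below):
        //    sqrt(x) = x * rsq(x)
        // A predicate guards x <= 0: those lanes are forced to 0 instead of
        // the NaN that 0 * rsq(0) or a negative input would otherwise yield.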
  1589. bool
  1590. NVC0LoweringPass::handleSQRT(Instruction *i)
  1591. {
  1592.    Value *pred = bld.getSSA(1, FILE_PREDICATE);
  1593.    Value *zero = bld.getSSA();
  1594.    Instruction *rsq;
  1595.  
  1596.    bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0));
  1597.    if (i->dType == TYPE_F64)
  1598.       zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
  1599.    bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
  1600.    bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred);
  1601.    rsq = bld.mkOp1(OP_RSQ, i->dType,
  1602.                    bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
  1603.    rsq->setPredicate(CC_NOT_P, pred);
  1604.    i->op = OP_MUL;
  1605.    i->setSrc(1, rsq->getDef(0));
  1606.    i->setPredicate(CC_NOT_P, pred);
  1607.  
  1609.    return true;
  1610. }
  1611.  
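        // POW uses the classic exp/log decomposition (a sketch of the math):
        //    pow(x, y) = exp2(y * log2(x))
        // The dnz flag on the multiply presumably forces 0 * log2(x) to 0 so
        // that y == 0 yields pow == 1, and PREEX2 appears to be the fixup
        // this hardware wants in front of EX2.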
  1612. bool
  1613. NVC0LoweringPass::handlePOW(Instruction *i)
  1614. {
  1615.    LValue *val = bld.getScratch();
  1616.  
  1617.    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
  1618.    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
  1619.    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
  1620.  
  1621.    i->op = OP_EX2;
  1622.    i->setSrc(0, val);
  1623.    i->setSrc(1, NULL);
  1624.  
  1625.    return true;
  1626. }
  1627.  
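        // Two cases below: fragment-shader exports become final MOVs into
        // fixed GPRs (register id = output offset / 4, per the code), while
        // geometry-shader exports pick up the emit address as an extra
        // indirect source.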
  1628. bool
  1629. NVC0LoweringPass::handleEXPORT(Instruction *i)
  1630. {
  1631.    if (prog->getType() == Program::TYPE_FRAGMENT) {
  1632.       int id = i->getSrc(0)->reg.data.offset / 4;
  1633.  
  1634.       if (i->src(0).isIndirect(0)) // TODO, ugly
  1635.          return false;
  1636.       i->op = OP_MOV;
  1637.       i->subOp = NV50_IR_SUBOP_MOV_FINAL;
  1638.       i->src(0).set(i->src(1));
  1639.       i->setSrc(1, NULL);
  1640.       i->setDef(0, new_LValue(func, FILE_GPR));
  1641.       i->getDef(0)->reg.data.id = id;
  1642.  
  1643.       prog->maxGPR = MAX2(prog->maxGPR, id);
  1644.    } else
  1645.    if (prog->getType() == Program::TYPE_GEOMETRY) {
  1646.       i->setIndirect(0, 1, gpEmitAddress);
  1647.    }
  1648.    return true;
  1649. }
  1650.  
  1651. bool
  1652. NVC0LoweringPass::handleOUT(Instruction *i)
  1653. {
  1654.    Instruction *prev = i->prev;
  1655.    ImmediateValue stream, prevStream;
  1656.  
  1657.    // Only merge if the stream ids match. Also, note that the previous
  1658.    // instruction would have already been lowered, so we take arg1 from it.
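           // E.g. EMIT(stream s) directly followed by RESTART(stream s)
           // collapses into a single EMIT with NV50_IR_SUBOP_EMIT_RESTART.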
  1659.    if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
  1660.        i->src(0).getImmediate(stream) &&
  1661.        prev->src(1).getImmediate(prevStream) &&
  1662.        stream.reg.data.u32 == prevStream.reg.data.u32) {
  1663.       i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
  1664.       delete_Instruction(prog, i);
  1665.    } else {
  1666.       assert(gpEmitAddress);
  1667.       i->setDef(0, gpEmitAddress);
  1668.       i->setSrc(1, i->getSrc(0));
  1669.       i->setSrc(0, gpEmitAddress);
  1670.    }
  1671.    return true;
  1672. }
  1673.  
  1674. // Generate a binary predicate if an instruction is predicated by
  1675. // e.g. an f32 value.
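        // The CC_NEU ("not equal, unordered") compare means a NaN predicate
        // value also tests true: anything but exactly 0 enables the
        // instruction, which seems to be the intended float-to-bool
        // semantics.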
  1676. void
  1677. NVC0LoweringPass::checkPredicate(Instruction *insn)
  1678. {
  1679.    Value *pred = insn->getPredicate();
  1680.    Value *pdst;
  1681.  
  1682.    if (!pred || pred->reg.file == FILE_PREDICATE)
  1683.       return;
  1684.    pdst = new_LValue(func, FILE_PREDICATE);
  1685.  
  1686.    // CAUTION: don't use pdst->getInsn; the definition might not be unique.
  1687.    // Delay turning PSET(FSET(x,y),0) into PSET(x,y) until a later pass.
  1688.  
  1689.    bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
  1690.  
  1691.    insn->setPredicate(insn->cc, pdst);
  1692. }
  1693.  
  1694. //
  1695. // - add quadop dance for texturing
  1696. // - put FP outputs in GPRs
  1697. // - convert instruction sequences
  1698. //
  1699. bool
  1700. NVC0LoweringPass::visit(Instruction *i)
  1701. {
  1702.    bld.setPosition(i, false);
  1703.  
  1704.    if (i->cc != CC_ALWAYS)
  1705.       checkPredicate(i);
  1706.  
  1707.    switch (i->op) {
  1708.    case OP_TEX:
  1709.    case OP_TXB:
  1710.    case OP_TXL:
  1711.    case OP_TXF:
  1712.    case OP_TXG:
  1713.       return handleTEX(i->asTex());
  1714.    case OP_TXD:
  1715.       return handleTXD(i->asTex());
  1716.    case OP_TXLQ:
  1717.       return handleTXLQ(i->asTex());
  1718.    case OP_TXQ:
  1719.       return handleTXQ(i->asTex());
  1720.    case OP_EX2:
  1721.       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
  1722.       i->setSrc(0, i->getDef(0));
  1723.       break;
  1724.    case OP_POW:
  1725.       return handlePOW(i);
  1726.    case OP_DIV:
  1727.       return handleDIV(i);
  1728.    case OP_MOD:
  1729.       return handleMOD(i);
  1730.    case OP_SQRT:
  1731.       return handleSQRT(i);
  1732.    case OP_EXPORT:
  1733.       return handleEXPORT(i);
  1734.    case OP_EMIT:
  1735.    case OP_RESTART:
  1736.       return handleOUT(i);
  1737.    case OP_RDSV:
  1738.       return handleRDSV(i);
  1739.    case OP_WRSV:
  1740.       return handleWRSV(i);
  1741.    case OP_LOAD:
  1742.       if (i->src(0).getFile() == FILE_SHADER_INPUT) {
  1743.          if (prog->getType() == Program::TYPE_COMPUTE) {
  1744.             i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
  1745.             i->getSrc(0)->reg.fileIndex = 0;
  1746.          } else
  1747.          if (prog->getType() == Program::TYPE_GEOMETRY &&
  1748.              i->src(0).isIndirect(0)) {
  1749.             // XXX: this assumes vec4 units
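                    // (byte address = index << 4, i.e. one 16-byte vec4 slot
                    // per attribute unit)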
  1750.             Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
  1751.                                     i->getIndirect(0, 0), bld.mkImm(4));
  1752.             i->setIndirect(0, 0, ptr);
  1753.             i->op = OP_VFETCH;
  1754.          } else {
  1755.             i->op = OP_VFETCH;
  1756.             assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
  1757.          }
  1758.       } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
  1759.          if (i->src(0).isIndirect(1)) {
  1760.             Value *ptr;
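                    // Build a single packed pointer for indirectly-indexed
                    // constbufs: buffer index in the high half, byte offset
                    // in the low half. The 0x1010 immediate presumably
                    // encodes INSBF's width/offset pair (insert 0x10 bits at
                    // bit 0x10).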
  1761.             if (i->src(0).isIndirect(0))
  1762.                ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
  1763.                                 i->getIndirect(0, 1), bld.mkImm(0x1010),
  1764.                                 i->getIndirect(0, 0));
  1765.             else
  1766.                ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
  1767.                                 i->getIndirect(0, 1), bld.mkImm(16));
  1768.             i->setIndirect(0, 1, NULL);
  1769.             i->setIndirect(0, 0, ptr);
  1770.             i->subOp = NV50_IR_SUBOP_LDC_IS;
  1771.          }
  1772.       }
  1773.       break;
  1774.    case OP_ATOM:
  1775.    {
  1776.       const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
  1777.       handleATOM(i);
  1778.       handleCasExch(i, cctl);
  1779.    }
  1780.       break;
  1781.    case OP_SULDB:
  1782.    case OP_SULDP:
  1783.    case OP_SUSTB:
  1784.    case OP_SUSTP:
  1785.    case OP_SUREDB:
  1786.    case OP_SUREDP:
  1787.       if (targ->getChipset() >= NVISA_GK104_CHIPSET)
  1788.          handleSurfaceOpNVE4(i->asTex());
  1789.       break;
  1790.    default:
  1791.       break;
  1792.    }
  1793.    return true;
  1794. }
  1795.  
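        // Stage dispatch (a summary of the code below): lowering runs before
        // SSA construction, SSA-level legalization at the SSA stage, and the
        // post-RA fixups after register allocation; any other stage fails.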
  1796. bool
  1797. TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
  1798. {
  1799.    if (stage == CG_STAGE_PRE_SSA) {
  1800.       NVC0LoweringPass pass(prog);
  1801.       return pass.run(prog, false, true);
  1802.    } else
  1803.    if (stage == CG_STAGE_POST_RA) {
  1804.       NVC0LegalizePostRA pass(prog);
  1805.       return pass.run(prog, false, true);
  1806.    } else
  1807.    if (stage == CG_STAGE_SSA) {
  1808.       NVC0LegalizeSSA pass;
  1809.       return pass.run(prog, false, true);
  1810.    }
  1811.    return false;
  1812. }
  1813.  
  1814. } // namespace nv50_ir
  1815.