/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nv50/codegen/nv50_ir.h"
#include "nv50/codegen/nv50_ir_build_util.h"

#include "nv50_ir_target_nvc0.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
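// Each quad lane (upper-left, upper-right, lower-left, lower-right) gets a
// 2-bit opcode, e.g. QUADOP(MOV2, ADD, MOV2, ADD) ==
// (3 << 6) | (0 << 4) | (3 << 2) | (0 << 0) == 0xcc.  These values are
// passed to BuildUtil::mkQuadop in handleManualTXD below.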

class NVC0LegalizeSSA : public Pass
{
private:
   virtual bool visit(BasicBlock *);
   virtual bool visit(Function *);

   // we want to insert calls to the builtin library only after optimization
   void handleDIV(Instruction *); // integer division, modulus
   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt

private:
   BuildUtil bld;
};

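// Lower integer DIV/MOD to a call into the builtin library. The ABI, as set
// up below: dividend in $r0, divisor in $r1; the quotient comes back in $r0
// and the remainder in $r1 (OP_DIV copies def[0], OP_MOD def[1]). The
// clobber masks (0xe/0xd, i.e. $r0-$r3 minus the result register, plus the
// predicates) tell later passes what the builtin may overwrite.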
void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;
   Value *def[2];

   bld.setPosition(i, false);
   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   // TODO
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->dType == TYPE_F32)
         continue;
      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      default:
         break;
      }
   }
   return true;
}

class NVC0LegalizePostRA : public Pass
{
public:
   NVC0LegalizePostRA(const Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   void replaceZero(Instruction *);
   bool tryReplaceContWithBra(BasicBlock *);
   void propagateJoin(BasicBlock *);

   struct TexUse
   {
      TexUse(Instruction *use, const Instruction *tex)
         : insn(use), tex(tex), level(-1) { }
      Instruction *insn;
      const Instruction *tex; // or split / mov
      int level;
   };
   struct Limits
   {
      Limits() { }
      Limits(int min, int max) : min(min), max(max) { }
      int min, max;
   };
   bool insertTextureBarriers(Function *);
   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
   void findFirstUses(const Instruction *tex, const Instruction *def,
                      std::list<TexUse>&);
   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
                            const BasicBlock *term,
                            std::list<TexUse>&);
   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
   const Instruction *recurseDef(const Instruction *);

private:
   LValue *rZero;
   LValue *carry;
   const bool needTexBar;
};

NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : needTexBar(prog->getTarget()->getChipset() >= 0xe0)
{
}

bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

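// Keep the use list minimal: a use that is dominated by one we already
// recorded is covered by the barrier we'll insert in front of the recorded
// use, so it is not added; conversely, recorded uses dominated by the new
// use become redundant and are erased.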
void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *insn)
{
   bool add = true;
   for (std::list<TexUse>::iterator it = uses.begin();
        it != uses.end();) {
      if (insnDominatedBy(usei, it->insn)) {
         add = false;
         break;
      }
      if (insnDominatedBy(it->insn, usei))
         it = uses.erase(it);
      else
         ++it;
   }
   if (add)
      uses.push_back(TexUse(usei, insn));
}

void
NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
                                        Instruction *insn,
                                        const BasicBlock *term,
                                        std::list<TexUse> &uses)
{
   // look through register-to-register movs (getUniqueInsn may return NULL)
   while (insn && insn->op == OP_MOV &&
          insn->getDef(0)->equals(insn->getSrc(0)))
      insn = insn->getSrc(0)->getUniqueInsn();

   if (!insn || !insn->bb->reachableBy(texi->bb, term))
      return;

   switch (insn->op) {
   /* Values not connected to the tex's definition through any of these should
    * not be conflicting.
    */
   case OP_SPLIT:
   case OP_MERGE:
   case OP_PHI:
   case OP_UNION:
      /* recurse again */
      for (int s = 0; insn->srcExists(s); ++s)
         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
                             uses);
      break;
   default:
      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered?
      addTexUse(uses, insn, texi);
      break;
   }
}

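// Walk the def-use chains of a TEX's results to find the first instructions
// that actually read them in the final machine code, looking through the
// pseudo ops (SPLIT/MERGE/PHI/UNION) and non-final register-to-register MOVs
// that are emitted as nothing. For PHI/UNION the other sources are checked
// as well, since overwriting the merged register is a WAW hazard against the
// TEX result.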
void
NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
                                  const Instruction *insn,
                                  std::list<TexUse> &uses)
{
   for (int d = 0; insn->defExists(d); ++d) {
      Value *v = insn->getDef(d);
      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
         Instruction *usei = (*u)->getInsn();

         if (usei->op == OP_PHI || usei->op == OP_UNION) {
            // need a barrier before WAW cases
            for (int s = 0; usei->srcExists(s); ++s) {
               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
               if (defi && &usei->src(s) != *u)
                  findOverwritingDefs(texi, defi, usei->bb, uses);
            }
         }

         if (usei->op == OP_SPLIT ||
             usei->op == OP_MERGE ||
             usei->op == OP_PHI ||
             usei->op == OP_UNION) {
            // these uses don't manifest in the machine code
            findFirstUses(texi, usei, uses);
         } else
         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
            findFirstUses(texi, usei, uses);
         } else {
            addTexUse(uses, usei, insn);
         }
      }
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
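//
// E.g. for "tex0; tex1; use of tex0's result" in one block, the use gets
// level 1: a "texbar 1" in front of it waits until at most one TEX is still
// outstanding, which guarantees tex0 (the older one) has completed without
// also waiting for tex1.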
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i)
      findFirstUses(texes[i], texes[i], uses[i]);

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;
   uses = NULL;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3) {
      if (uses)
         delete[] uses;
      return true;
   }

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
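   // Roughly: limitS describes a block in isolation, with min = TEXes still
   // outstanding at the exit if none were outstanding at entry, and max = an
   // entry-independent cap on that count (INT_MAX if the block contains no
   // barrier). limitT/limitB are those bounds at block entry/exit once
   // predecessors are folded in.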

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   if (uses)
      delete[] uses;
   return true;
}

bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
   carry->reg.data.id = 0;

   return true;
}

void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm && imm->reg.data.u64 == 0)
         i->setSrc(s, rZero);
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register!
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

class NVC0LoweringPass : public Pass
{
public:
   NVC0LoweringPass(Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);
   virtual bool visit(Instruction *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleOUT(Instruction *);
   bool handleDIV(Instruction *);
   bool handleMOD(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);
   bool handleTEX(TexInstruction *);
   bool handleTXD(TexInstruction *);
   bool handleTXQ(TexInstruction *);
   bool handleManualTXD(TexInstruction *);
   bool handleATOM(Instruction *);
   bool handleCasExch(Instruction *, bool needCctl);
   void handleSurfaceOpNVE4(TexInstruction *);

   void checkPredicate(Instruction *);

   void readTessCoord(LValue *dst, int c);

   Value *loadResInfo32(Value *ptr, uint32_t off);
   Value *loadMsInfo32(Value *ptr, uint32_t off);
   Value *loadTexHandle(Value *ptr, unsigned int slot);

   void adjustCoordinatesMS(TexInstruction *);
   void processSurfaceCoordsNVE4(TexInstruction *);

private:
   const Target *const targ;

   BuildUtil bld;

   Symbol *gMemBase;
   LValue *gpEmitAddress;
};

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
   gMemBase = NULL;
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.resInfoCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

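// Note on the OP_INSBF uses below: the immediate is assumed to be
// (width << 8) | offset, i.e. 0x1400 merges the low 20 bits of the TIC
// handle into the TSC handle, and 0x0917 / 0x0710 insert the indirect
// TIC/TSC indices into the packed 0xttxsaaaa source word.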
// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);

   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         WARN("indirect TEX not implemented\n");
      }
      if (i->tex.r == i->tex.s) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, layer);
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      for (int s = dim; s >= 1; --s)
         i->setSrc(s, i->getSrc(s - 1));
      i->setSrc(0, arrayIndex);

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
      }

      i->setSrc(0, src);
   }

   // offset is last source (lod 1st, dc 2nd)
   if (i->tex.useOffsets) {
      uint32_t value = 0;
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->srcExists(s)) // move potential predicate out of the way
         i->moveSources(s, 1);
      for (n = 0; n < i->tex.useOffsets; ++n)
         for (c = 0; c < 3; ++c)
            value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4);
      i->setSrc(s, bld.loadImm(NULL, value));
   }

   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
      //
      // If TEX requires more than 4 sources, the 2nd register tuple must be
      // aligned to 4, even if it consists of just a single 4-byte register.
      //
      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
      //
      int s = i->srcCount(0xff, true);
      if (s > 4 && s < 7) {
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 7 - s);
         while (s < 7)
            i->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

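// Manual TXD: run the texture op once per quad lane. For lane l, quadops
// first broadcast lane l's coordinates to the whole quad, then add its dPdx
// to the +x lanes and its dPdy to the +y lanes, so the quad's implicit
// derivatives match the requested ones; each clone's results are saved with
// fixed per-lane MOVs (lanes = 1 << l) and recombined with OP_UNION.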
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim();
   int arg = txd->tex.target.getArgCount();

   handleTEX(txd);
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   if (dim > 2 ||
       txd->tex.target.isCube() ||
       arg > 4 ||
       txd->tex.target.isShadow())
      return handleManualTXD(txd);

   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }
   return true;
}

bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   // TODO: indirect resource/sampler index
   return true;
}

bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      sv = SV_SBASE;
      break;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
      return true;
   }
   Value *base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
   Value *ptr = atom->getIndirect(0, 0);

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 0, base);

   return true;
}

bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
      Value *dreg = bld.getSSA(8);
      bld.setPosition(cas, false);
      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
      cas->setSrc(1, dreg);
   }

   return true;
}

inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.resInfoCBSlot;
   off += prog->driver->io.suInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.msInfoCBSlot;
   off += prog->driver->io.msInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

/* On nvc0, surface info is obtained via the surface binding points passed
 * to the SULD/SUST instructions.
 * On nve4, surface info is stored in c[] and is used by various special
 * instructions, e.g. for clamping coordinates or generating an address.
 * They couldn't just have added an equivalent to TIC now, could they?
 */
#define NVE4_SU_INFO_ADDR   0x00
#define NVE4_SU_INFO_FMT    0x04
#define NVE4_SU_INFO_DIM_X  0x08
#define NVE4_SU_INFO_PITCH  0x0c
#define NVE4_SU_INFO_DIM_Y  0x10
#define NVE4_SU_INFO_ARRAY  0x14
#define NVE4_SU_INFO_DIM_Z  0x18
#define NVE4_SU_INFO_UNK1C  0x1c
#define NVE4_SU_INFO_WIDTH  0x20
#define NVE4_SU_INFO_HEIGHT 0x24
#define NVE4_SU_INFO_DEPTH  0x28
#define NVE4_SU_INFO_TARGET 0x2c
#define NVE4_SU_INFO_CALL   0x30
#define NVE4_SU_INFO_RAW_X  0x34
#define NVE4_SU_INFO_MS_X   0x38
#define NVE4_SU_INFO_MS_Y   0x3c

#define NVE4_SU_INFO__STRIDE 0x40

#define NVE4_SU_INFO_DIM(i)  (0x08 + (i) * 8)
#define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
#define NVE4_SU_INFO_MS(i)   (0x38 + (i) * 4)

static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
{
   switch (su->tex.target.getEnum()) {
   case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
   case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
                                   NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
                                   NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   default:
      assert(0);
      return 0;
   }
}

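// A multisampled surface is laid out like a larger single-sample surface:
// scale x/y by the sample grid (the MS_X/MS_Y info values are the shift
// amounts) and add the sample's x/y offset, looked up in the MS info table
// at (sample & 7) * 8 bytes.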
void
NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
{
   const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
   const int arg = tex->tex.target.getArgCount();

   if (tex->tex.target == TEX_TARGET_2D_MS)
      tex->tex.target = TEX_TARGET_2D;
   else
   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
      tex->tex.target = TEX_TARGET_2D_ARRAY;
   else
      return;

   Value *x = tex->getSrc(0);
   Value *y = tex->getSrc(1);
   Value *s = tex->getSrc(arg - 1);

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();

   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);

   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));

   Value *dx = loadMsInfo32(ts, 0x0);
   Value *dy = loadMsInfo32(ts, 0x4);

   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);

   tex->setSrc(0, tx);
   tex->setSrc(1, ty);
   tex->moveSources(arg, -1);
}

// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
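// Roughly: OP_SUCLAMP clamps each coordinate against the size stored in the
// surface info record and also yields the out-of-bounds predicate; OP_SUBFM
// computes the byte offset of the pixel within its memory tile ("bf");
// OP_SUEAU folds that together with the pixel offset and base address into
// the effective address upper part ("eau").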
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int idx = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() ? 1 : 0);
   const uint16_t base = idx * NVE4_SU_INFO__STRIDE;
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;
   Value *v;
   Value *src[3];
   Value *bf, *eau, *off;
   Value *addr, *pred;

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      src[c] = bld.getScratch();
      if (c == 0 && raw)
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
      else
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, c);
   }
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray()) {
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = su->tex.target.isArray() ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         bf = src[0];
      } else {
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray()) {
            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray()) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         lo = zero;
         bld.mkMov(off, bf);
      }
      //  bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this?
      // XXX: does it matter that we don't mask high bytes in bf?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);
}

void
NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
{
   processSurfaceCoordsNVE4(su);

   // Who do we hate more? The person who decided that nvc0's SULD doesn't
   // have to support conversion or the person who decided that, in OpenCL,
   // you don't have to specify the format here like you do in OpenGL?

   if (su->op == OP_SULDP) {
      // We don't patch shaders. Ever.
      // You get an indirect call to our library blob here.
      // But at least it's uniform.
      FlowInstruction *call;
      LValue *p[3];
      LValue *r[5];
      uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL;

      for (int i = 0; i < 4; ++i)
         (r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i;
      for (int i = 0; i < 3; ++i)
         (p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i;
      (r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4;

      bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8);
      bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 1 : 0), TYPE_U8);
      bld.mkMov(p[0], su->getSrc(2), TYPE_U8);
      bld.mkMov(r[4], su->getSrc(0), TYPE_U64);
      bld.mkMov(r[2], su->getSrc(1), TYPE_U32);

      call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate());

      call->indirect = 1;
      call->absolute = 1;
      call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
                                   prog->driver->io.resInfoCBSlot, TYPE_U32,
                                   prog->driver->io.suInfoBase + base));
      call->setSrc(1, r[2]);
      call->setSrc(2, r[4]);
      for (int i = 0; i < 3; ++i)
         call->setSrc(3 + i, p[i]);
      for (int i = 0; i < 4; ++i) {
         call->setDef(i, r[i]);
         bld.mkMov(su->getDef(i), r[i]);
      }
      call->setDef(4, p[1]);
      delete_Instruction(bld.getProgram(), su);
      return; // su has been deleted, don't touch it below
   }

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      // FIXME: for out of bounds access, destination value will be undefined!
      Value *pred = su->getSrc(2);
      CondCode cc = CC_NOT_P;
      if (su->getPredicate()) {
         pred = bld.getScratch(1, FILE_PREDICATE);
         cc = su->cc;
         if (cc == CC_NOT_P) {
            bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
         } else {
            bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
            pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT);
         }
      }
      Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0));
      red->subOp = su->subOp;
      if (!gMemBase)
         gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0);
      red->setSrc(0, gMemBase);
      red->setSrc(1, su->getSrc(3));
      if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(4));
      red->setIndirect(0, 0, su->getSrc(0));
      red->setPredicate(cc, pred);
      delete_Instruction(bld.getProgram(), su);
      handleCasExch(red, true);
   } else {
      su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
   }
}

bool
NVC0LoweringPass::handleWRSV(Instruction *i)
{
   Instruction *st;
   Symbol *sym;
   uint32_t addr;

   // must replace, $sreg are not writeable
   addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
   if (addr >= 0x400)
      return false;
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);

   st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
                    i->getSrc(1));
   st->perPatch = i->perPatch;

   bld.getBB()->remove(i);
   return true;
}

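// Only the u and v tessellation coordinates are stored (at shader outputs
// 0x2f0/0x2f4, indexed by lane); the third coordinate is reconstructed as
// 1 - u - v, the remaining barycentric weight for triangle domains.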
void
NVC0LoweringPass::readTessCoord(LValue *dst, int c)
{
   Value *laneid = bld.getSSA();
   Value *x, *y;

   bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));

   if (c == 0) {
      x = dst;
      y = NULL;
   } else
   if (c == 1) {
      x = NULL;
      y = dst;
   } else {
      assert(c == 2);
      x = bld.getSSA();
      y = bld.getSSA();
   }
   if (x)
      bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
   if (y)
      bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);

   if (c == 2) {
      bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
      bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
   }
}

bool
NVC0LoweringPass::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   const SVSemantic sv = sym->reg.data.sv.sv;
   Value *vtx = NULL;
   Instruction *ld;
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);

   if (addr >= 0x400) {
      // mov $sreg
      if (sym->reg.data.sv.index == 3) {
         // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
      }
      return true;
   }

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
   {
      Value *face = i->getDef(0);
      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
      }
   }
      break;
   case SV_TESS_COORD:
      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
      break;
   case SV_NTID:
   case SV_NCTAID:
   case SV_GRIDID:
      assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
      if (sym->reg.data.sv.index == 3) {
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
         return true;
      }
      addr += prog->driver->prop.cp.gridInfoBase;
      bld.mkLoad(TYPE_U32, i->getDef(0),
                 bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
      break;
   default:
      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      ld = bld.mkFetch(i->getDef(0), i->dType,
                       FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
      ld->perPatch = i->perPatch;
      break;
   }
   bld.getBB()->remove(i);
   return true;
}

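// a / b -> a * rcp(b) for floats; integer DIV/MOD was already lowered to a
// builtin call by NVC0LegalizeSSA.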
bool
NVC0LoweringPass::handleDIV(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   bld.setPosition(i, false);
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
   i->op = OP_MUL;
   i->setSrc(1, rcp->getDef(0));
   return true;
}

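// mod(a, b) -> a - b * trunc(a * rcp(b)), in f32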
bool
NVC0LoweringPass::handleMOD(Instruction *i)
{
   if (i->dType != TYPE_F32)
      return true;
   LValue *value = bld.getScratch();
   bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
   bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
   bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
   bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
   i->op = OP_SUB;
   i->setSrc(1, value);
   return true;
}

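// sqrt(x) -> x * rsq(x)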
bool
NVC0LoweringPass::handleSQRT(Instruction *i)
{
   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
                                bld.getSSA(), i->getSrc(0));
   i->op = OP_MUL;
   i->setSrc(1, rsq->getDef(0));

   return true;
}

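// pow(x, y) -> ex2(y * lg2(x)), with the PREEX2 fixup that EX2 expects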
bool
NVC0LoweringPass::handlePOW(Instruction *i)
{
   LValue *val = bld.getScratch();

   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);

   i->op = OP_EX2;
   i->setSrc(0, val);
   i->setSrc(1, NULL);

   return true;
}

bool
NVC0LoweringPass::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      int id = i->getSrc(0)->reg.data.offset / 4;

      if (i->src(0).isIndirect(0)) // TODO, ugly
         return false;
      i->op = OP_MOV;
      i->subOp = NV50_IR_SUBOP_MOV_FINAL;
      i->src(0).set(i->src(1));
      i->setSrc(1, NULL);
      i->setDef(0, new_LValue(func, FILE_GPR));
      i->getDef(0)->reg.data.id = id;

      prog->maxGPR = MAX2(prog->maxGPR, id);
   } else
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      i->setIndirect(0, 1, gpEmitAddress);
   }
   return true;
}

bool
NVC0LoweringPass::handleOUT(Instruction *i)
{
   if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
      delete_Instruction(prog, i);
   } else {
      assert(gpEmitAddress);
      i->setDef(0, gpEmitAddress);
      if (i->srcExists(0))
         i->setSrc(1, i->getSrc(0));
      i->setSrc(0, gpEmitAddress);
   }
   return true;
}

// Generate a binary predicate if an instruction is predicated by
// e.g. an f32 value.
void
NVC0LoweringPass::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *pdst;

   if (!pred || pred->reg.file == FILE_PREDICATE)
      return;
   pdst = new_LValue(func, FILE_PREDICATE);

   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
   //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass

   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, bld.mkImm(0), pred);

   insn->setPredicate(insn->cc, pdst);
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NVC0LoweringPass::visit(Instruction *i)
{
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      return handleEXPORT(i);
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_LOAD:
      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
         if (prog->getType() == Program::TYPE_COMPUTE) {
            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
            i->getSrc(0)->reg.fileIndex = 0;
         } else {
            i->op = OP_VFETCH;
            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
         }
      }
      break;
   case OP_ATOM:
   {
      const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
      handleATOM(i);
      handleCasExch(i, cctl);
   }
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      break;
   default:
      break;
   }
   return true;
}

bool
TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
{
   if (stage == CG_STAGE_PRE_SSA) {
      NVC0LoweringPass pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_POST_RA) {
      NVC0LegalizePostRA pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_SSA) {
      NVC0LegalizeSSA pass;
      return pass.run(prog, false, true);
   }
   return false;
}

} // namespace nv50_ir