Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
5564 serge 1
/*
2
 * Copyright 2011 Christoph Bumiller
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice shall be included in
12
 * all copies or substantial portions of the Software.
13
 *
14
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20
 * OTHER DEALINGS IN THE SOFTWARE.
21
 */
22
 
23
#include "codegen/nv50_ir.h"
24
#include "codegen/nv50_ir_build_util.h"
25
 
26
#include "codegen/nv50_ir_target_nv50.h"
27
 
28
namespace nv50_ir {
29
 
30
// nv50 doesn't support 32 bit integer multiplication
31
//
32
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
33
// -------------------
34
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
35
// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
36
//       al*bl
37
//    ah*bl 00
38
//
39
// fffe0001 + fffe0001
40
//
41
// Note that this sort of splitting doesn't work for signed values, so we
42
// compute the sign on those manually and then perform an unsigned multiply.
43
static bool
44
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
45
{
46
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
47
 
48
   DataType fTy; // full type
49
   switch (mul->sType) {
50
   case TYPE_S32: fTy = TYPE_U32; break;
51
   case TYPE_S64: fTy = TYPE_U64; break;
52
   default: fTy = mul->sType; break;
53
   }
54
 
55
   DataType hTy; // half type
56
   switch (fTy) {
57
   case TYPE_U32: hTy = TYPE_U16; break;
58
   case TYPE_U64: hTy = TYPE_U32; break;
59
   default:
60
      return false;
61
   }
62
   unsigned int fullSize = typeSizeof(fTy);
63
   unsigned int halfSize = typeSizeof(hTy);
64
 
65
   Instruction *i[9];
66
 
67
   bld->setPosition(mul, true);
68
 
69
   Value *s[2];
70
   Value *a[2], *b[2];
71
   Value *t[4];
72
   for (int j = 0; j < 4; ++j)
73
      t[j] = bld->getSSA(fullSize);
74
 
75
   s[0] = mul->getSrc(0);
76
   s[1] = mul->getSrc(1);
77
 
78
   if (isSignedType(mul->sType)) {
79
      s[0] = bld->getSSA(fullSize);
80
      s[1] = bld->getSSA(fullSize);
81
      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
82
      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
83
   }
84
 
85
   // split sources into halves
86
   i[0] = bld->mkSplit(a, halfSize, s[0]);
87
   i[1] = bld->mkSplit(b, halfSize, s[1]);
88
 
89
   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
90
   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
91
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
92
   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
93
 
94
   if (highResult) {
95
      Value *c[2];
96
      Value *r[5];
97
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
98
      c[0] = bld->getSSA(1, FILE_FLAGS);
99
      c[1] = bld->getSSA(1, FILE_FLAGS);
100
      for (int j = 0; j < 5; ++j)
101
         r[j] = bld->getSSA(fullSize);
102
 
103
      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
104
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
105
      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
106
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
107
      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
108
 
109
      // set carry defs / sources
110
      i[3]->setFlagsDef(1, c[0]);
111
      // actual result required in negative case, but ignored for
112
      // unsigned. for some reason the compiler ends up dropping the whole
113
      // instruction if the destination is unused but the flags are.
114
      if (isSignedType(mul->sType))
115
         i[4]->setFlagsDef(1, c[1]);
116
      else
117
         i[4]->setFlagsDef(0, c[1]);
118
      i[6]->setPredicate(CC_C, c[0]);
119
      i[5]->setFlagsSrc(3, c[1]);
120
 
121
      if (isSignedType(mul->sType)) {
122
         Value *cc[2];
123
         Value *rr[7];
124
         Value *one = bld->getSSA(fullSize);
125
         bld->loadImm(one, 1);
126
         for (int j = 0; j < 7; j++)
127
            rr[j] = bld->getSSA(fullSize);
128
 
129
         // NOTE: this logic uses predicates because splitting basic blocks is
130
         // ~impossible during the SSA phase. The RA relies on a correlation
131
         // between edge order and phi node sources.
132
 
133
         // Set the sign of the result based on the inputs
134
         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
135
            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
136
 
137
         // 1s complement of 64-bit value
138
         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
139
            ->setPredicate(CC_S, cc[0]);
140
         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
141
            ->setPredicate(CC_S, cc[0]);
142
 
143
         // add to low 32-bits, keep track of the carry
144
         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
145
         n->setPredicate(CC_S, cc[0]);
146
         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
147
 
148
         // If there was a carry, add 1 to the upper 32 bits
149
         // XXX: These get executed even if they shouldn't be
150
         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
151
            ->setPredicate(CC_C, cc[1]);
152
         bld->mkMov(rr[3], rr[0])
153
            ->setPredicate(CC_NC, cc[1]);
154
         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
155
 
156
         // Merge the results from the negative and non-negative paths
157
         bld->mkMov(rr[5], rr[4])
158
            ->setPredicate(CC_S, cc[0]);
159
         bld->mkMov(rr[6], r[4])
160
            ->setPredicate(CC_NS, cc[0]);
161
         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
162
      } else {
163
         bld->mkMov(mul->getDef(0), r[4]);
164
      }
165
   } else {
166
      bld->mkMov(mul->getDef(0), t[3]);
167
   }
168
   delete_Instruction(bld->getProgram(), mul);
169
 
170
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
171
      if (i[j])
172
         i[j]->sType = hTy;
173
 
174
   return true;
175
}
176
 
177
// Per-lane operation codes of a quad operation (2 bits each).
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs the four per-lane operation names into one byte; the arguments are
// the QOP_ suffixes, given in this lane order:
//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
186
 
187
class NV50LegalizePostRA : public Pass
188
{
189
private:
190
   virtual bool visit(Function *);
191
   virtual bool visit(BasicBlock *);
192
 
193
   void handlePRERET(FlowInstruction *);
194
   void replaceZero(Instruction *);
195
 
196
   LValue *r63;
197
};
198
 
199
bool
200
NV50LegalizePostRA::visit(Function *fn)
201
{
202
   Program *prog = fn->getProgram();
203
 
204
   r63 = new_LValue(fn, FILE_GPR);
205
   r63->reg.data.id = 63;
206
 
207
   // this is actually per-program, but we can do it all on visiting main()
208
   std::list *outWrites =
209
      reinterpret_cast *>(prog->targetPriv);
210
 
211
   if (outWrites) {
212
      for (std::list::iterator it = outWrites->begin();
213
           it != outWrites->end(); ++it)
214
         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
215
      // instructions will be deleted on exit
216
      outWrites->clear();
217
   }
218
 
219
   return true;
220
}
221
 
222
void
223
NV50LegalizePostRA::replaceZero(Instruction *i)
224
{
225
   for (int s = 0; i->srcExists(s); ++s) {
226
      ImmediateValue *imm = i->getSrc(s)->asImm();
227
      if (imm && imm->reg.data.u64 == 0)
228
         i->setSrc(s, r63);
229
   }
230
}
231
 
232
// Emulate PRERET: jump to the target and call to the origin from there
233
//
234
// WARNING: atm only works if BBs are affected by at most a single PRERET
235
//
236
// BB:0
237
// preret BB:3
238
// (...)
239
// BB:3
240
// (...)
241
//             --->
242
// BB:0
243
// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
244
// (...)
245
// BB:3
246
// bra BB:3 + n1 (skip the call)
247
// call BB:0 + n2 (skip bra at beginning of BB:0)
248
// (...)
249
void
250
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
251
{
252
   BasicBlock *bbE = pre->bb;
253
   BasicBlock *bbT = pre->target.bb;
254
 
255
   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
256
   bbE->remove(pre);
257
   bbE->insertHead(pre);
258
 
259
   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
260
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
261
 
262
   bbT->insertHead(call);
263
   bbT->insertHead(skip);
264
 
265
   // NOTE: maybe split blocks to prevent the instructions from moving ?
266
 
267
   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
268
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
269
}
270
 
271
bool
272
NV50LegalizePostRA::visit(BasicBlock *bb)
273
{
274
   Instruction *i, *next;
275
 
276
   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
277
   for (i = bb->getFirst(); i; i = next) {
278
      next = i->next;
279
      if (i->isNop()) {
280
         bb->remove(i);
281
      } else
282
      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
283
         handlePRERET(i->asFlow());
284
      } else {
285
         // TODO: We will want to do this before register allocation,
286
         // since have to use a $c register for the carry flag.
287
         if (typeSizeof(i->dType) == 8) {
288
            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
289
            if (hi)
290
               next = hi;
291
         }
292
 
293
         if (i->op != OP_MOV && i->op != OP_PFETCH &&
294
             i->op != OP_BAR &&
295
             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
296
            replaceZero(i);
297
      }
298
   }
299
   if (!bb->getEntry())
300
      return true;
301
 
302
   return true;
303
}
304
 
305
class NV50LegalizeSSA : public Pass
306
{
307
public:
308
   NV50LegalizeSSA(Program *);
309
 
310
   virtual bool visit(BasicBlock *bb);
311
 
312
private:
313
   void propagateWriteToOutput(Instruction *);
314
   void handleDIV(Instruction *);
315
   void handleMOD(Instruction *);
316
   void handleMUL(Instruction *);
317
   void handleAddrDef(Instruction *);
318
 
319
   inline bool isARL(const Instruction *) const;
320
 
321
   BuildUtil bld;
322
 
323
   std::list *outWrites;
324
};
325
 
326
// Binds the builder to @prog and enables output write propagation only for
// geometry and vertex programs compiled at optimization level 2 or higher.
NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
{
   bld.setProgram(prog);

   if (prog->optLevel >= 2 &&
       (prog->getType() == Program::TYPE_GEOMETRY ||
        prog->getType() == Program::TYPE_VERTEX))
      outWrites =
         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   else
      outWrites = NULL;
}
338
 
339
void
340
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
341
{
342
   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
343
      return;
344
 
345
   // check def instruction can store
346
   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
347
 
348
   // TODO: move exports (if beneficial) in common opt pass
349
   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
350
      return;
351
 
352
   for (int s = 0; di->srcExists(s); ++s)
353
      if (di->src(s).getFile() == FILE_IMMEDIATE)
354
         return;
355
 
356
   if (prog->getType() == Program::TYPE_GEOMETRY) {
357
      // Only propagate output writes in geometry shaders when we can be sure
358
      // that we are propagating to the same output vertex.
359
      if (di->bb != st->bb)
360
         return;
361
      Instruction *i;
362
      for (i = di; i != st; i = i->next) {
363
         if (i->op == OP_EMIT || i->op == OP_RESTART)
364
            return;
365
      }
366
      assert(i); // st after di
367
   }
368
 
369
   // We cannot set defs to non-lvalues before register allocation, so
370
   // save & remove (to save registers) the exports and replace later.
371
   outWrites->push_back(st);
372
   st->bb->remove(st);
373
}
374
 
375
bool
376
NV50LegalizeSSA::isARL(const Instruction *i) const
377
{
378
   ImmediateValue imm;
379
 
380
   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
381
      return false;
382
   if (!i->src(1).getImmediate(imm))
383
      return false;
384
   return imm.isInteger(0);
385
}
386
 
387
void
388
NV50LegalizeSSA::handleAddrDef(Instruction *i)
389
{
390
   Instruction *arl;
391
 
392
   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
393
 
394
   // PFETCH can always write to $a
395
   if (i->op == OP_PFETCH)
396
      return;
397
   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
398
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
399
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
400
         return;
401
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
402
         return;
403
   }
404
 
405
   // turn $a sources into $r sources (can't operate on $a)
406
   for (int s = 0; i->srcExists(s); ++s) {
407
      Value *a = i->getSrc(s);
408
      Value *r;
409
      if (a->reg.file == FILE_ADDRESS) {
410
         if (a->getInsn() && isARL(a->getInsn())) {
411
            i->setSrc(s, a->getInsn()->getSrc(0));
412
         } else {
413
            bld.setPosition(i, false);
414
            r = bld.getSSA();
415
            bld.mkMov(r, a);
416
            i->setSrc(s, r);
417
         }
418
      }
419
   }
420
   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
421
      return;
422
 
423
   // turn result back into $a
424
   bld.setPosition(i, true);
425
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
426
   i->setDef(0, arl->getSrc(0));
427
}
428
 
429
void
430
NV50LegalizeSSA::handleMUL(Instruction *mul)
431
{
432
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
433
      return;
434
   Value *def = mul->getDef(0);
435
   Value *pred = mul->getPredicate();
436
   CondCode cc = mul->cc;
437
   if (pred)
438
      mul->setPredicate(CC_ALWAYS, NULL);
439
 
440
   if (mul->op == OP_MAD) {
441
      Instruction *add = mul;
442
      bld.setPosition(add, false);
443
      Value *res = cloneShallow(func, mul->getDef(0));
444
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
445
      add->op = OP_ADD;
446
      add->setSrc(0, mul->getDef(0));
447
      add->setSrc(1, add->getSrc(2));
448
      for (int s = 2; add->srcExists(s); ++s)
449
         add->setSrc(s, NULL);
450
      mul->subOp = add->subOp;
451
      add->subOp = 0;
452
   }
453
   expandIntegerMUL(&bld, mul);
454
   if (pred)
455
      def->getInsn()->setPredicate(cc, pred);
456
}
457
 
458
// Use f32 division: first compute an approximate result, use it to reduce
459
// the dividend, which should then be representable as f32, divide the reduced
460
// dividend, and add the quotients.
461
void
462
NV50LegalizeSSA::handleDIV(Instruction *div)
463
{
464
   const DataType ty = div->sType;
465
 
466
   if (ty != TYPE_U32 && ty != TYPE_S32)
467
      return;
468
 
469
   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
470
 
471
   bld.setPosition(div, false);
472
 
473
   Value *a, *af = bld.getSSA();
474
   Value *b, *bf = bld.getSSA();
475
 
476
   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
477
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
478
 
479
   if (isSignedType(ty)) {
480
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
481
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
482
      a = bld.getSSA();
483
      b = bld.getSSA();
484
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
485
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
486
   } else {
487
      a = div->getSrc(0);
488
      b = div->getSrc(1);
489
   }
490
 
491
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
492
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
493
 
494
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
495
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
496
 
497
   // get error of 1st result
498
   expandIntegerMUL(&bld,
499
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
500
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
501
 
502
   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
503
 
504
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
505
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
506
      ->rnd = ROUND_Z;
507
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
508
 
509
   // correction: if modulus >= divisor, add 1
510
   expandIntegerMUL(&bld,
511
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
512
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
513
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
514
   if (!isSignedType(ty)) {
515
      div->op = OP_SUB;
516
      div->setSrc(0, q);
517
      div->setSrc(1, s);
518
   } else {
519
      t = q;
520
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
521
      s = bld.getSSA();
522
      t = bld.getSSA();
523
      // fix the sign
524
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
525
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
526
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
527
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
528
 
529
      div->op = OP_UNION;
530
      div->setSrc(0, s);
531
      div->setSrc(1, t);
532
   }
533
}
534
 
535
void
536
NV50LegalizeSSA::handleMOD(Instruction *mod)
537
{
538
   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
539
      return;
540
   bld.setPosition(mod, false);
541
 
542
   Value *q = bld.getSSA();
543
   Value *m = bld.getSSA();
544
 
545
   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
546
   handleDIV(q->getInsn());
547
 
548
   bld.setPosition(mod, false);
549
   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
550
 
551
   mod->op = OP_SUB;
552
   mod->setSrc(1, m);
553
}
554
 
555
bool
556
NV50LegalizeSSA::visit(BasicBlock *bb)
557
{
558
   Instruction *insn, *next;
559
   // skipping PHIs (don't pass them to handleAddrDef) !
560
   for (insn = bb->getEntry(); insn; insn = next) {
561
      next = insn->next;
562
 
563
      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
564
         handleAddrDef(insn);
565
 
566
      switch (insn->op) {
567
      case OP_EXPORT:
568
         if (outWrites)
569
            propagateWriteToOutput(insn);
570
         break;
571
      case OP_DIV:
572
         handleDIV(insn);
573
         break;
574
      case OP_MOD:
575
         handleMOD(insn);
576
         break;
577
      case OP_MAD:
578
      case OP_MUL:
579
         handleMUL(insn);
580
         break;
581
      default:
582
         break;
583
      }
584
   }
585
   return true;
586
}
587
 
588
class NV50LoweringPreSSA : public Pass
589
{
590
public:
591
   NV50LoweringPreSSA(Program *);
592
 
593
private:
594
   virtual bool visit(Instruction *);
595
   virtual bool visit(Function *);
596
 
597
   bool handleRDSV(Instruction *);
598
   bool handleWRSV(Instruction *);
599
 
600
   bool handlePFETCH(Instruction *);
601
   bool handleEXPORT(Instruction *);
602
   bool handleLOAD(Instruction *);
603
 
604
   bool handleDIV(Instruction *);
605
   bool handleSQRT(Instruction *);
606
   bool handlePOW(Instruction *);
607
 
608
   bool handleSET(Instruction *);
609
   bool handleSLCT(CmpInstruction *);
610
   bool handleSELP(Instruction *);
611
 
612
   bool handleTEX(TexInstruction *);
613
   bool handleTXB(TexInstruction *); // I really
614
   bool handleTXL(TexInstruction *); // hate
615
   bool handleTXD(TexInstruction *); // these 3
616
   bool handleTXLQ(TexInstruction *);
617
 
618
   bool handleCALL(Instruction *);
619
   bool handlePRECONT(Instruction *);
620
   bool handleCONT(Instruction *);
621
 
622
   void checkPredicate(Instruction *);
623
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
624
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
625
 
626
private:
627
   const Target *const targ;
628
 
629
   BuildUtil bld;
630
 
631
   Value *tid;
632
};
633
 
634
// Caches the program's target, clears tid and binds the builder to @prog.
NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
   targ(prog->getTarget()), tid(NULL)
{
   bld.setProgram(prog);
}
639
 
640
bool
641
NV50LoweringPreSSA::visit(Function *f)
642
{
643
   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
644
 
645
   if (prog->getType() == Program::TYPE_COMPUTE) {
646
      // Add implicit "thread id" argument in $r0 to the function
647
      Value *arg = new_LValue(func, FILE_GPR);
648
      arg->reg.data.id = 0;
649
      f->ins.push_back(arg);
650
 
651
      bld.setPosition(root, false);
652
      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
653
   }
654
 
655
   return true;
656
}
657
 
658
void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
659
                                       Value **ms_x, Value **ms_y) {
660
   // This loads the texture-indexed ms setting from the constant buffer
661
   Value *tmp = new_LValue(func, FILE_GPR);
662
   uint8_t b = prog->driver->io.resInfoCBSlot;
663
   off += prog->driver->io.suInfoBase;
664
   if (prog->getType() > Program::TYPE_VERTEX)
665
      off += 16 * 2 * 4;
666
   if (prog->getType() > Program::TYPE_GEOMETRY)
667
      off += 16 * 2 * 4;
668
   *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
669
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
670
   *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
671
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
672
   *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
673
}
674
 
675
// Emits loads of the sample position delta for a sample.
//
// @ms: MS level value (as produced by loadTexMsInfo())
// @s:  sample id
// @dx: receives the U32 loaded at msInfoBase + computed offset
// @dy: receives the U32 loaded at msInfoBase + 4 + computed offset
void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
   // Given a MS level, and a sample id, compute the delta x/y
   uint8_t b = prog->driver->io.msInfoCBSlot;
   Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);

   // The required information is at mslevel * 16 * 4 + sample * 8
   // = (mslevel * 8 + sample) * 8
   bld.mkOp2(OP_SHL,
             TYPE_U32,
             off,
             bld.mkOp2v(OP_ADD, TYPE_U32, t,
                        bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
                        s),
             bld.mkImm(3));
   *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase), off);
   *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase + 4), off);
}
696
 
697
bool
698
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
699
{
700
   const int arg = i->tex.target.getArgCount();
701
   const int dref = arg;
702
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
703
 
704
   // handle MS, which means looking up the MS params for this texture, and
705
   // adjusting the input coordinates to point at the right sample.
706
   if (i->tex.target.isMS()) {
707
      Value *x = i->getSrc(0);
708
      Value *y = i->getSrc(1);
709
      Value *s = i->getSrc(arg - 1);
710
      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
711
         *ms, *ms_x, *ms_y, *dx, *dy;
712
 
713
      i->tex.target.clearMS();
714
 
715
      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
716
      loadMsInfo(ms, s, &dx, &dy);
717
 
718
      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
719
      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
720
      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
721
      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
722
      i->setSrc(0, tx);
723
      i->setSrc(1, ty);
724
      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
725
   }
726
 
727
   // dref comes before bias/lod
728
   if (i->tex.target.isShadow())
729
      if (i->op == OP_TXB || i->op == OP_TXL)
730
         i->swapSources(dref, lod);
731
 
732
   if (i->tex.target.isArray()) {
733
      if (i->op != OP_TXF) {
734
         // array index must be converted to u32, but it's already an integer
735
         // for TXF
736
         Value *layer = i->getSrc(arg - 1);
737
         LValue *src = new_LValue(func, FILE_GPR);
738
         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
739
         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
740
         i->setSrc(arg - 1, src);
741
      }
742
      if (i->tex.target.isCube() && i->srcCount() > 4) {
743
         std::vector acube, a2d;
744
         int c;
745
 
746
         acube.resize(4);
747
         for (c = 0; c < 4; ++c)
748
            acube[c] = i->getSrc(c);
749
         a2d.resize(4);
750
         for (c = 0; c < 3; ++c)
751
            a2d[c] = new_LValue(func, FILE_GPR);
752
         a2d[3] = NULL;
753
 
754
         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
755
                   a2d, acube)->asTex()->tex.mask = 0x7;
756
 
757
         for (c = 0; c < 3; ++c)
758
            i->setSrc(c, a2d[c]);
759
         for (; i->srcExists(c + 1); ++c)
760
            i->setSrc(c, i->getSrc(c + 1));
761
         i->setSrc(c, NULL);
762
         assert(c <= 4);
763
 
764
         i->tex.target = i->tex.target.isShadow() ?
765
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
766
      }
767
   }
768
 
769
   // texel offsets are 3 immediate fields in the instruction,
770
   // nv50 cannot do textureGatherOffsets
771
   assert(i->tex.useOffsets <= 1);
772
   if (i->tex.useOffsets) {
773
      for (int c = 0; c < 3; ++c) {
774
         ImmediateValue val;
775
         if (!i->offset[0][c].getImmediate(val))
776
            assert(!"non-immediate offset");
777
         i->tex.offset[c] = val.reg.data.u32;
778
         i->offset[0][c].set(NULL);
779
      }
780
   }
781
 
782
   return true;
783
}
784
 
785
// Bias must be equal for all threads of a quad or lod calculation will fail.
786
//
787
// The lanes of a quad are grouped by the bit in the condition register they
788
// have set, which is selected by differing bias values.
789
// Move the input values for TEX into a new register set for each group and
790
// execute TEX only for a specific group.
791
// We always need to use 4 new registers for the inputs/outputs because the
792
// implicitly calculated derivatives must be correct.
793
//
794
// TODO: move to SSA phase so we can easily determine whether bias is constant
795
bool
796
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
797
{
798
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
799
   int l, d;
800
 
801
   // We can't actually apply bias *and* do a compare for a cube
802
   // texture. Since the compare has to be done before the filtering, just
803
   // drop the bias on the floor.
804
   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
805
      i->op = OP_TEX;
806
      i->setSrc(3, i->getSrc(4));
807
      i->setSrc(4, NULL);
808
      return handleTEX(i);
809
   }
810
 
811
   handleTEX(i);
812
   Value *bias = i->getSrc(i->tex.target.getArgCount());
813
   if (bias->isUniform())
814
      return true;
815
 
816
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
817
                                 bld.loadImm(NULL, 1));
818
   bld.setPosition(cond, false);
819
 
820
   for (l = 1; l < 4; ++l) {
821
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
822
      Value *bit = bld.getSSA();
823
      Value *pred = bld.getScratch(1, FILE_FLAGS);
824
      Value *imm = bld.loadImm(NULL, (1 << l));
825
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
826
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
827
      cond->setSrc(l, bit);
828
   }
829
   Value *flags = bld.getScratch(1, FILE_FLAGS);
830
   bld.setPosition(cond, true);
831
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
832
 
833
   Instruction *tex[4];
834
   for (l = 0; l < 4; ++l) {
835
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
836
      bld.insert(tex[l]);
837
   }
838
 
839
   Value *res[4][4];
840
   for (d = 0; i->defExists(d); ++d)
841
      res[0][d] = tex[0]->getDef(d);
842
   for (l = 1; l < 4; ++l) {
843
      for (d = 0; tex[l]->defExists(d); ++d) {
844
         res[l][d] = cloneShallow(func, res[0][d]);
845
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
846
      }
847
   }
848
 
849
   for (d = 0; i->defExists(d); ++d) {
850
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
851
      for (l = 0; l < 4; ++l)
852
         dst->setSrc(l, res[l][d]);
853
   }
854
   delete_Instruction(prog, i);
855
   return true;
856
}
857
 
858
// LOD must be equal for all threads of a quad.
859
// Unlike with TXB, here we can just diverge since there's no LOD calculation
860
// that would require all 4 threads' sources to be set up properly.
861
bool
862
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
863
{
864
   handleTEX(i);
865
   Value *lod = i->getSrc(i->tex.target.getArgCount());
866
   if (lod->isUniform())
867
      return true;
868
 
869
   BasicBlock *currBB = i->bb;
870
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
871
   BasicBlock *joinBB = i->bb->splitAfter(i);
872
 
873
   bld.setPosition(currBB, true);
874
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
875
 
876
   for (int l = 0; l <= 3; ++l) {
877
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
878
      Value *pred = bld.getScratch(1, FILE_FLAGS);
879
      bld.setPosition(currBB, true);
880
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
881
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
882
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
883
      if (l <= 2) {
884
         BasicBlock *laneBB = new BasicBlock(func);
885
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
886
         currBB = laneBB;
887
      }
888
   }
889
   bld.setPosition(joinBB, false);
890
   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
891
   return true;
892
}
893
 
894
bool
895
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
896
{
897
   static const uint8_t qOps[4][2] =
898
   {
899
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
900
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
901
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
902
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
903
   };
904
   Value *def[4][4];
905
   Value *crd[3];
906
   Instruction *tex;
907
   Value *zero = bld.loadImm(bld.getSSA(), 0);
908
   int l, c;
909
   const int dim = i->tex.target.getDim();
910
 
911
   handleTEX(i);
912
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
913
 
914
   for (c = 0; c < dim; ++c)
915
      crd[c] = bld.getScratch();
916
 
917
   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
918
   for (l = 0; l < 4; ++l) {
919
      // mov coordinates from lane l to all lanes
920
      for (c = 0; c < dim; ++c)
921
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
922
      // add dPdx from lane l to lanes dx
923
      for (c = 0; c < dim; ++c)
924
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
925
      // add dPdy from lane l to lanes dy
926
      for (c = 0; c < dim; ++c)
927
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
928
      // texture
929
      bld.insert(tex = cloneForward(func, i));
930
      for (c = 0; c < dim; ++c)
931
         tex->setSrc(c, crd[c]);
932
      // save results
933
      for (c = 0; i->defExists(c); ++c) {
934
         Instruction *mov;
935
         def[c][l] = bld.getSSA();
936
         mov = bld.mkMov(def[c][l], tex->getDef(c));
937
         mov->fixed = 1;
938
         mov->lanes = 1 << l;
939
      }
940
   }
941
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
942
 
943
   for (c = 0; i->defExists(c); ++c) {
944
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
945
      for (l = 0; l < 4; ++l)
946
         u->setSrc(l, def[c][l]);
947
   }
948
 
949
   i->bb->remove(i);
950
   return true;
951
}
952
 
953
bool
954
NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
955
{
956
   handleTEX(i);
957
   bld.setPosition(i, true);
958
 
959
   /* The returned values are not quite what we want:
960
    * (a) convert from s32 to f32
961
    * (b) multiply by 1/256
962
    */
963
   for (int def = 0; def < 2; ++def) {
964
      if (!i->defExists(def))
965
         continue;
966
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
967
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
968
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
969
   }
970
   return true;
971
}
972
 
973
bool
974
NV50LoweringPreSSA::handleSET(Instruction *i)
975
{
976
   if (i->dType == TYPE_F32) {
977
      bld.setPosition(i, true);
978
      i->dType = TYPE_U32;
979
      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
980
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
981
   }
982
   return true;
983
}
984
 
985
bool
986
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
987
{
988
   Value *src0 = bld.getSSA();
989
   Value *src1 = bld.getSSA();
990
   Value *pred = bld.getScratch(1, FILE_FLAGS);
991
 
992
   Value *v0 = i->getSrc(0);
993
   Value *v1 = i->getSrc(1);
994
   // XXX: these probably shouldn't be immediates in the first place ...
995
   if (v0->asImm())
996
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
997
   if (v1->asImm())
998
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
999
 
1000
   bld.setPosition(i, true);
1001
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
1002
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
1003
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1004
 
1005
   bld.setPosition(i, false);
1006
   i->op = OP_SET;
1007
   i->setFlagsDef(0, pred);
1008
   i->dType = TYPE_U8;
1009
   i->setSrc(0, i->getSrc(2));
1010
   i->setSrc(2, NULL);
1011
   i->setSrc(1, bld.loadImm(NULL, 0));
1012
 
1013
   return true;
1014
}
1015
 
1016
bool
1017
NV50LoweringPreSSA::handleSELP(Instruction *i)
1018
{
1019
   Value *src0 = bld.getSSA();
1020
   Value *src1 = bld.getSSA();
1021
 
1022
   Value *v0 = i->getSrc(0);
1023
   Value *v1 = i->getSrc(1);
1024
   if (v0->asImm())
1025
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1026
   if (v1->asImm())
1027
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1028
 
1029
   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1030
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1031
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1032
   delete_Instruction(prog, i);
1033
   return true;
1034
}
1035
 
1036
bool
1037
NV50LoweringPreSSA::handleWRSV(Instruction *i)
1038
{
1039
   Symbol *sym = i->getSrc(0)->asSym();
1040
 
1041
   // these are all shader outputs, $sreg are not writeable
1042
   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1043
   if (addr >= 0x400)
1044
      return false;
1045
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1046
 
1047
   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1048
 
1049
   bld.getBB()->remove(i);
1050
   return true;
1051
}
1052
 
1053
bool
1054
NV50LoweringPreSSA::handleCALL(Instruction *i)
1055
{
1056
   if (prog->getType() == Program::TYPE_COMPUTE) {
1057
      // Add implicit "thread id" argument in $r0 to the function
1058
      i->setSrc(i->srcCount(), tid);
1059
   }
1060
   return true;
1061
}
1062
 
1063
bool
1064
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
1065
{
1066
   delete_Instruction(prog, i);
1067
   return true;
1068
}
1069
 
1070
bool
1071
NV50LoweringPreSSA::handleCONT(Instruction *i)
1072
{
1073
   i->op = OP_BRA;
1074
   return true;
1075
}
1076
 
1077
bool
1078
NV50LoweringPreSSA::handleRDSV(Instruction *i)
1079
{
1080
   Symbol *sym = i->getSrc(0)->asSym();
1081
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1082
   Value *def = i->getDef(0);
1083
   SVSemantic sv = sym->reg.data.sv.sv;
1084
   int idx = sym->reg.data.sv.index;
1085
 
1086
   if (addr >= 0x400) // mov $sreg
1087
      return true;
1088
 
1089
   switch (sv) {
1090
   case SV_POSITION:
1091
      assert(prog->getType() == Program::TYPE_FRAGMENT);
1092
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1093
      break;
1094
   case SV_FACE:
1095
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1096
      if (i->dType == TYPE_F32) {
1097
         bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
1098
         bld.mkOp1(OP_NEG, TYPE_S32, def, def);
1099
         bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
1100
      }
1101
      break;
1102
   case SV_NCTAID:
1103
   case SV_CTAID:
1104
   case SV_NTID:
1105
      if ((sv == SV_NCTAID && idx >= 2) ||
1106
          (sv == SV_NTID && idx >= 3)) {
1107
         bld.mkMov(def, bld.mkImm(1));
1108
      } else if (sv == SV_CTAID && idx >= 2) {
1109
         bld.mkMov(def, bld.mkImm(0));
1110
      } else {
1111
         Value *x = bld.getSSA(2);
1112
         bld.mkOp1(OP_LOAD, TYPE_U16, x,
1113
                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1114
         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1115
      }
1116
      break;
1117
   case SV_TID:
1118
      if (idx == 0) {
1119
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1120
      } else if (idx == 1) {
1121
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1122
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1123
      } else if (idx == 2) {
1124
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1125
      } else {
1126
         bld.mkMov(def, bld.mkImm(0));
1127
      }
1128
      break;
1129
   case SV_SAMPLE_POS: {
1130
      Value *off = new_LValue(func, FILE_ADDRESS);
1131
      bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1132
      bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1133
      bld.mkLoad(TYPE_F32,
1134
                 def,
1135
                 bld.mkSymbol(
1136
                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
1137
                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1138
                 off);
1139
      break;
1140
   }
1141
   default:
1142
      bld.mkFetch(i->getDef(0), i->dType,
1143
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1144
      break;
1145
   }
1146
   bld.getBB()->remove(i);
1147
   return true;
1148
}
1149
 
1150
bool
1151
NV50LoweringPreSSA::handleDIV(Instruction *i)
1152
{
1153
   if (!isFloatType(i->dType))
1154
      return true;
1155
   bld.setPosition(i, false);
1156
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1157
   i->op = OP_MUL;
1158
   i->setSrc(1, rcp->getDef(0));
1159
   return true;
1160
}
1161
 
1162
bool
1163
NV50LoweringPreSSA::handleSQRT(Instruction *i)
1164
{
1165
   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
1166
                                bld.getSSA(), i->getSrc(0));
1167
   i->op = OP_MUL;
1168
   i->setSrc(1, rsq->getDef(0));
1169
 
1170
   return true;
1171
}
1172
 
1173
bool
1174
NV50LoweringPreSSA::handlePOW(Instruction *i)
1175
{
1176
   LValue *val = bld.getScratch();
1177
 
1178
   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1179
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1180
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1181
 
1182
   i->op = OP_EX2;
1183
   i->setSrc(0, val);
1184
   i->setSrc(1, NULL);
1185
 
1186
   return true;
1187
}
1188
 
1189
bool
1190
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1191
{
1192
   if (prog->getType() == Program::TYPE_FRAGMENT) {
1193
      if (i->getIndirect(0, 0)) {
1194
         // TODO: redirect to l[] here, load to GPRs at exit
1195
         return false;
1196
      } else {
1197
         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1198
 
1199
         i->op = OP_MOV;
1200
         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1201
         i->src(0).set(i->src(1));
1202
         i->setSrc(1, NULL);
1203
         i->setDef(0, new_LValue(func, FILE_GPR));
1204
         i->getDef(0)->reg.data.id = id;
1205
 
1206
         prog->maxGPR = MAX2(prog->maxGPR, id);
1207
      }
1208
   }
1209
   return true;
1210
}
1211
 
1212
// Handle indirect addressing in geometry shaders:
1213
//
1214
// ld $r0 a[$a1][$a2+k] ->
1215
// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1216
//
1217
bool
1218
NV50LoweringPreSSA::handleLOAD(Instruction *i)
1219
{
1220
   ValueRef src = i->src(0);
1221
 
1222
   if (src.isIndirect(1)) {
1223
      assert(prog->getType() == Program::TYPE_GEOMETRY);
1224
      Value *addr = i->getIndirect(0, 1);
1225
 
1226
      if (src.isIndirect(0)) {
1227
         // base address is in an address register, so move to a GPR
1228
         Value *base = bld.getScratch();
1229
         bld.mkMov(base, addr);
1230
 
1231
         Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1232
         Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1233
         Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1234
                                    i->getIndirect(0, 0), bld.mkImm(2));
1235
 
1236
         // Calculate final address: addr = base + attr*vstride; use 16-bit
1237
         // multiplication since 32-bit would be lowered to multiple
1238
         // instructions, and we only need the low 16 bits of the result
1239
         Value *a[2], *b[2];
1240
         bld.mkSplit(a, 2, attrib);
1241
         bld.mkSplit(b, 2, vstride);
1242
         Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1243
                                 base);
1244
 
1245
         // move address from GPR into an address register
1246
         addr = bld.getSSA(2, FILE_ADDRESS);
1247
         bld.mkMov(addr, sum);
1248
      }
1249
 
1250
      i->setIndirect(0, 1, NULL);
1251
      i->setIndirect(0, 0, addr);
1252
   }
1253
 
1254
   return true;
1255
}
1256
 
1257
bool
1258
NV50LoweringPreSSA::handlePFETCH(Instruction *i)
1259
{
1260
   assert(prog->getType() == Program::TYPE_GEOMETRY);
1261
 
1262
   // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1263
   // later phase if that assertion ever triggers:
1264
 
1265
   ImmediateValue *imm = i->getSrc(0)->asImm();
1266
   assert(imm);
1267
 
1268
   assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
1269
 
1270
   if (i->srcExists(1)) {
1271
      // indirect addressing of vertex in primitive space
1272
 
1273
      LValue *val = bld.getScratch();
1274
      Value *ptr = bld.getSSA(2, FILE_ADDRESS);
1275
      bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
1276
      bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
1277
 
1278
      // NOTE: PFETCH directly to an $aX only works with direct addressing
1279
      i->op = OP_SHL;
1280
      i->setSrc(0, val);
1281
      i->setSrc(1, bld.mkImm(0));
1282
   }
1283
 
1284
   return true;
1285
}
1286
 
1287
// Set flags according to predicate and make the instruction read $cX.
1288
void
1289
NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1290
{
1291
   Value *pred = insn->getPredicate();
1292
   Value *cdst;
1293
 
1294
   // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1295
   if (!pred ||
1296
       pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
1297
      return;
1298
 
1299
   cdst = bld.getSSA(1, FILE_FLAGS);
1300
 
1301
   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
1302
 
1303
   insn->setPredicate(insn->cc, cdst);
1304
}
1305
 
1306
//
1307
// - add quadop dance for texturing
1308
// - put FP outputs in GPRs
1309
// - convert instruction sequences
1310
//
1311
bool
1312
NV50LoweringPreSSA::visit(Instruction *i)
1313
{
1314
   bld.setPosition(i, false);
1315
 
1316
   if (i->cc != CC_ALWAYS)
1317
      checkPredicate(i);
1318
 
1319
   switch (i->op) {
1320
   case OP_TEX:
1321
   case OP_TXF:
1322
   case OP_TXG:
1323
      return handleTEX(i->asTex());
1324
   case OP_TXB:
1325
      return handleTXB(i->asTex());
1326
   case OP_TXL:
1327
      return handleTXL(i->asTex());
1328
   case OP_TXD:
1329
      return handleTXD(i->asTex());
1330
   case OP_TXLQ:
1331
      return handleTXLQ(i->asTex());
1332
   case OP_EX2:
1333
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1334
      i->setSrc(0, i->getDef(0));
1335
      break;
1336
   case OP_SET:
1337
      return handleSET(i);
1338
   case OP_SLCT:
1339
      return handleSLCT(i->asCmp());
1340
   case OP_SELP:
1341
      return handleSELP(i);
1342
   case OP_POW:
1343
      return handlePOW(i);
1344
   case OP_DIV:
1345
      return handleDIV(i);
1346
   case OP_SQRT:
1347
      return handleSQRT(i);
1348
   case OP_EXPORT:
1349
      return handleEXPORT(i);
1350
   case OP_LOAD:
1351
      return handleLOAD(i);
1352
   case OP_RDSV:
1353
      return handleRDSV(i);
1354
   case OP_WRSV:
1355
      return handleWRSV(i);
1356
   case OP_CALL:
1357
      return handleCALL(i);
1358
   case OP_PRECONT:
1359
      return handlePRECONT(i);
1360
   case OP_CONT:
1361
      return handleCONT(i);
1362
   case OP_PFETCH:
1363
      return handlePFETCH(i);
1364
   default:
1365
      break;
1366
   }
1367
   return true;
1368
}
1369
 
1370
bool
1371
TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1372
{
1373
   bool ret = false;
1374
 
1375
   if (stage == CG_STAGE_PRE_SSA) {
1376
      NV50LoweringPreSSA pass(prog);
1377
      ret = pass.run(prog, false, true);
1378
   } else
1379
   if (stage == CG_STAGE_SSA) {
1380
      if (!prog->targetPriv)
1381
         prog->targetPriv = new std::list();
1382
      NV50LegalizeSSA pass(prog);
1383
      ret = pass.run(prog, false, true);
1384
   } else
1385
   if (stage == CG_STAGE_POST_RA) {
1386
      NV50LegalizePostRA pass;
1387
      ret = pass.run(prog, false, true);
1388
      if (prog->targetPriv)
1389
         delete reinterpret_cast *>(prog->targetPriv);
1390
   }
1391
   return ret;
1392
}
1393
 
1394
} // namespace nv50_ir