Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4358 Serge 1
/*
2
 * Copyright 2011 Christoph Bumiller
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice shall be included in
12
 * all copies or substantial portions of the Software.
13
 *
14
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20
 * OTHER DEALINGS IN THE SOFTWARE.
21
 */
22
 
23
#include "nv50/codegen/nv50_ir.h"
24
#include "nv50/codegen/nv50_ir_build_util.h"
25
 
26
#include "nv50_ir_target_nv50.h"
27
 
28
namespace nv50_ir {
29
 
30
// nv50 doesn't support 32 bit integer multiplication
31
//
32
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
33
// -------------------
34
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
35
// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
36
//       al*bl
37
//    ah*bl 00
38
//
39
// fffe0001 + fffe0001
40
static bool
41
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
42
{
43
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
44
 
45
   DataType fTy = mul->sType; // full type
46
   DataType hTy;
47
   switch (fTy) {
48
   case TYPE_S32: hTy = TYPE_S16; break;
49
   case TYPE_U32: hTy = TYPE_U16; break;
50
   case TYPE_U64: hTy = TYPE_U32; break;
51
   case TYPE_S64: hTy = TYPE_S32; break;
52
   default:
53
      return false;
54
   }
55
   unsigned int fullSize = typeSizeof(fTy);
56
   unsigned int halfSize = typeSizeof(hTy);
57
 
58
   Instruction *i[9];
59
 
60
   bld->setPosition(mul, true);
61
 
62
   Value *a[2], *b[2];
63
   Value *c[2];
64
   Value *t[4];
65
   for (int j = 0; j < 4; ++j)
66
      t[j] = bld->getSSA(fullSize);
67
 
68
   // split sources into halves
69
   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
70
   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
71
 
72
   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
73
   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
74
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
75
   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
76
 
77
   if (highResult) {
78
      Value *r[3];
79
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
80
      c[0] = bld->getSSA(1, FILE_FLAGS);
81
      c[1] = bld->getSSA(1, FILE_FLAGS);
82
      for (int j = 0; j < 3; ++j)
83
         r[j] = bld->getSSA(fullSize);
84
 
85
      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
86
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
87
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
88
      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
89
 
90
      // set carry defs / sources
91
      i[3]->setFlagsDef(1, c[0]);
92
      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
93
      i[6]->setPredicate(CC_C, c[0]);
94
      i[5]->setFlagsSrc(3, c[1]);
95
   } else {
96
      bld->mkMov(mul->getDef(0), t[3]);
97
   }
98
   delete_Instruction(bld->getProgram(), mul);
99
 
100
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
101
      if (i[j])
102
         i[j]->sType = hTy;
103
 
104
   return true;
105
}
106
 
107
// Per-lane operation selectors used to build a quad operation code;
// each one occupies two bits of the packed value.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four per-lane selectors (given without the QOP_ prefix) into one
// 8 bit value: q in bits 7:6, r in bits 5:4, s in bits 3:2, t in bits 1:0.
//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
116
 
117
class NV50LegalizePostRA : public Pass
118
{
119
private:
120
   virtual bool visit(Function *);
121
   virtual bool visit(BasicBlock *);
122
 
123
   void handlePRERET(FlowInstruction *);
124
   void replaceZero(Instruction *);
125
 
126
   LValue *r63;
127
};
128
 
129
bool
130
NV50LegalizePostRA::visit(Function *fn)
131
{
132
   Program *prog = fn->getProgram();
133
 
134
   r63 = new_LValue(fn, FILE_GPR);
135
   r63->reg.data.id = 63;
136
 
137
   // this is actually per-program, but we can do it all on visiting main()
138
   std::list *outWrites =
139
      reinterpret_cast *>(prog->targetPriv);
140
 
141
   if (outWrites) {
142
      for (std::list::iterator it = outWrites->begin();
143
           it != outWrites->end(); ++it)
144
         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
145
      // instructions will be deleted on exit
146
      outWrites->clear();
147
   }
148
 
149
   return true;
150
}
151
 
152
void
153
NV50LegalizePostRA::replaceZero(Instruction *i)
154
{
155
   for (int s = 0; i->srcExists(s); ++s) {
156
      ImmediateValue *imm = i->getSrc(s)->asImm();
157
      if (imm && imm->reg.data.u64 == 0)
158
         i->setSrc(s, r63);
159
   }
160
}
161
 
162
// Emulate PRERET: jump to the target and call to the origin from there
163
//
164
// WARNING: atm only works if BBs are affected by at most a single PRERET
165
//
166
// BB:0
167
// preret BB:3
168
// (...)
169
// BB:3
170
// (...)
171
//             --->
172
// BB:0
173
// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
174
// (...)
175
// BB:3
176
// bra BB:3 + n1 (skip the call)
177
// call BB:0 + n2 (skip bra at beginning of BB:0)
178
// (...)
179
void
180
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
181
{
182
   BasicBlock *bbE = pre->bb;
183
   BasicBlock *bbT = pre->target.bb;
184
 
185
   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
186
   bbE->remove(pre);
187
   bbE->insertHead(pre);
188
 
189
   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
190
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
191
 
192
   bbT->insertHead(call);
193
   bbT->insertHead(skip);
194
 
195
   // NOTE: maybe split blocks to prevent the instructions from moving ?
196
 
197
   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
198
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
199
}
200
 
201
bool
202
NV50LegalizePostRA::visit(BasicBlock *bb)
203
{
204
   Instruction *i, *next;
205
 
206
   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
207
   for (i = bb->getFirst(); i; i = next) {
208
      next = i->next;
209
      if (i->isNop()) {
210
         bb->remove(i);
211
      } else
212
      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
213
         handlePRERET(i->asFlow());
214
      } else {
215
         // TODO: We will want to do this before register allocation,
216
         // since have to use a $c register for the carry flag.
217
         if (typeSizeof(i->dType) == 8) {
218
            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
219
            if (hi)
220
               next = hi;
221
         }
222
 
223
         if (i->op != OP_MOV && i->op != OP_PFETCH &&
224
             i->op != OP_BAR &&
225
             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
226
            replaceZero(i);
227
      }
228
   }
229
   if (!bb->getEntry())
230
      return true;
231
 
232
   return true;
233
}
234
 
235
class NV50LegalizeSSA : public Pass
236
{
237
public:
238
   NV50LegalizeSSA(Program *);
239
 
240
   virtual bool visit(BasicBlock *bb);
241
 
242
private:
243
   void propagateWriteToOutput(Instruction *);
244
   void handleDIV(Instruction *);
245
   void handleMOD(Instruction *);
246
   void handleMUL(Instruction *);
247
   void handleAddrDef(Instruction *);
248
 
249
   inline bool isARL(const Instruction *) const;
250
 
251
   BuildUtil bld;
252
 
253
   std::list *outWrites;
254
};
255
 
256
// Binds the builder to @prog and decides whether output writes are collected:
// only for geometry and vertex programs at optLevel >= 2, in which case the
// list stored in prog->targetPriv is used; otherwise outWrites is NULL.
NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
{
   bld.setProgram(prog);

   if (prog->optLevel >= 2 &&
       (prog->getType() == Program::TYPE_GEOMETRY ||
        prog->getType() == Program::TYPE_VERTEX))
      outWrites =
         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   else
      outWrites = NULL;
}
268
 
269
void
270
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
271
{
272
   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
273
      return;
274
 
275
   // check def instruction can store
276
   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
277
 
278
   // TODO: move exports (if beneficial) in common opt pass
279
   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
280
      return;
281
   for (int s = 0; di->srcExists(s); ++s)
282
      if (di->src(s).getFile() == FILE_IMMEDIATE)
283
         return;
284
 
285
   // We cannot set defs to non-lvalues before register allocation, so
286
   // save & remove (to save registers) the exports and replace later.
287
   outWrites->push_back(st);
288
   st->bb->remove(st);
289
}
290
 
291
bool
292
NV50LegalizeSSA::isARL(const Instruction *i) const
293
{
294
   ImmediateValue imm;
295
 
296
   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
297
      return false;
298
   if (!i->src(1).getImmediate(imm))
299
      return false;
300
   return imm.isInteger(0);
301
}
302
 
303
void
304
NV50LegalizeSSA::handleAddrDef(Instruction *i)
305
{
306
   Instruction *arl;
307
 
308
   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
309
 
310
   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
311
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
312
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
313
         return;
314
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
315
         return;
316
   }
317
 
318
   // turn $a sources into $r sources (can't operate on $a)
319
   for (int s = 0; i->srcExists(s); ++s) {
320
      Value *a = i->getSrc(s);
321
      Value *r;
322
      if (a->reg.file == FILE_ADDRESS) {
323
         if (a->getInsn() && isARL(a->getInsn())) {
324
            i->setSrc(s, a->getInsn()->getSrc(0));
325
         } else {
326
            bld.setPosition(i, false);
327
            r = bld.getSSA();
328
            bld.mkMov(r, a);
329
            i->setSrc(s, r);
330
         }
331
      }
332
   }
333
   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
334
      return;
335
 
336
   // turn result back into $a
337
   bld.setPosition(i, true);
338
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
339
   i->setDef(0, arl->getSrc(0));
340
}
341
 
342
void
343
NV50LegalizeSSA::handleMUL(Instruction *mul)
344
{
345
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
346
      return;
347
   Value *def = mul->getDef(0);
348
   Value *pred = mul->getPredicate();
349
   CondCode cc = mul->cc;
350
   if (pred)
351
      mul->setPredicate(CC_ALWAYS, NULL);
352
 
353
   if (mul->op == OP_MAD) {
354
      Instruction *add = mul;
355
      bld.setPosition(add, false);
356
      Value *res = cloneShallow(func, mul->getDef(0));
357
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
358
      add->op = OP_ADD;
359
      add->setSrc(0, mul->getDef(0));
360
      add->setSrc(1, add->getSrc(2));
361
      for (int s = 2; add->srcExists(s); ++s)
362
         add->setSrc(s, NULL);
363
      mul->subOp = add->subOp;
364
      add->subOp = 0;
365
   }
366
   expandIntegerMUL(&bld, mul);
367
   if (pred)
368
      def->getInsn()->setPredicate(cc, pred);
369
}
370
 
371
// Use f32 division: first compute an approximate result, use it to reduce
372
// the dividend, which should then be representable as f32, divide the reduced
373
// dividend, and add the quotients.
374
void
375
NV50LegalizeSSA::handleDIV(Instruction *div)
376
{
377
   const DataType ty = div->sType;
378
 
379
   if (ty != TYPE_U32 && ty != TYPE_S32)
380
      return;
381
 
382
   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
383
 
384
   bld.setPosition(div, false);
385
 
386
   Value *a, *af = bld.getSSA();
387
   Value *b, *bf = bld.getSSA();
388
 
389
   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
390
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
391
 
392
   if (isSignedType(ty)) {
393
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
394
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
395
      a = bld.getSSA();
396
      b = bld.getSSA();
397
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
398
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
399
   } else {
400
      a = div->getSrc(0);
401
      b = div->getSrc(1);
402
   }
403
 
404
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
405
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
406
 
407
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
408
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
409
 
410
   // get error of 1st result
411
   expandIntegerMUL(&bld,
412
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
413
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
414
 
415
   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
416
 
417
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
418
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
419
      ->rnd = ROUND_Z;
420
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
421
 
422
   // correction: if modulus >= divisor, add 1
423
   expandIntegerMUL(&bld,
424
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
425
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
426
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
427
   if (!isSignedType(ty)) {
428
      div->op = OP_SUB;
429
      div->setSrc(0, q);
430
      div->setSrc(1, s);
431
   } else {
432
      t = q;
433
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
434
      s = bld.getSSA();
435
      t = bld.getSSA();
436
      // fix the sign
437
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
438
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
439
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
440
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
441
 
442
      div->op = OP_UNION;
443
      div->setSrc(0, s);
444
      div->setSrc(1, t);
445
   }
446
}
447
 
448
void
449
NV50LegalizeSSA::handleMOD(Instruction *mod)
450
{
451
   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
452
      return;
453
   bld.setPosition(mod, false);
454
 
455
   Value *q = bld.getSSA();
456
   Value *m = bld.getSSA();
457
 
458
   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
459
   handleDIV(q->getInsn());
460
 
461
   bld.setPosition(mod, false);
462
   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
463
 
464
   mod->op = OP_SUB;
465
   mod->setSrc(1, m);
466
}
467
 
468
bool
469
NV50LegalizeSSA::visit(BasicBlock *bb)
470
{
471
   Instruction *insn, *next;
472
   // skipping PHIs (don't pass them to handleAddrDef) !
473
   for (insn = bb->getEntry(); insn; insn = next) {
474
      next = insn->next;
475
 
476
      switch (insn->op) {
477
      case OP_EXPORT:
478
         if (outWrites)
479
            propagateWriteToOutput(insn);
480
         break;
481
      case OP_DIV:
482
         handleDIV(insn);
483
         break;
484
      case OP_MOD:
485
         handleMOD(insn);
486
         break;
487
      case OP_MAD:
488
      case OP_MUL:
489
         handleMUL(insn);
490
         break;
491
      default:
492
         break;
493
      }
494
 
495
      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
496
         handleAddrDef(insn);
497
   }
498
   return true;
499
}
500
 
501
class NV50LoweringPreSSA : public Pass
502
{
503
public:
504
   NV50LoweringPreSSA(Program *);
505
 
506
private:
507
   virtual bool visit(Instruction *);
508
   virtual bool visit(Function *);
509
 
510
   bool handleRDSV(Instruction *);
511
   bool handleWRSV(Instruction *);
512
 
513
   bool handleEXPORT(Instruction *);
514
 
515
   bool handleDIV(Instruction *);
516
   bool handleSQRT(Instruction *);
517
   bool handlePOW(Instruction *);
518
 
519
   bool handleSET(Instruction *);
520
   bool handleSLCT(CmpInstruction *);
521
   bool handleSELP(Instruction *);
522
 
523
   bool handleTEX(TexInstruction *);
524
   bool handleTXB(TexInstruction *); // I really
525
   bool handleTXL(TexInstruction *); // hate
526
   bool handleTXD(TexInstruction *); // these 3
527
 
528
   bool handleCALL(Instruction *);
529
   bool handlePRECONT(Instruction *);
530
   bool handleCONT(Instruction *);
531
 
532
   void checkPredicate(Instruction *);
533
 
534
private:
535
   const Target *const targ;
536
 
537
   BuildUtil bld;
538
 
539
   Value *tid;
540
};
541
 
542
// Remembers the target of @prog, starts without a thread id value and binds
// the builder to @prog.
NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog)
   : targ(prog->getTarget()),
     tid(NULL)
{
   bld.setProgram(prog);
}
547
 
548
bool
549
NV50LoweringPreSSA::visit(Function *f)
550
{
551
   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
552
 
553
   if (prog->getType() == Program::TYPE_COMPUTE) {
554
      // Add implicit "thread id" argument in $r0 to the function
555
      Value *arg = new_LValue(func, FILE_GPR);
556
      arg->reg.data.id = 0;
557
      f->ins.push_back(arg);
558
 
559
      bld.setPosition(root, false);
560
      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
561
   }
562
 
563
   return true;
564
}
565
 
566
bool
567
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
568
{
569
   const int arg = i->tex.target.getArgCount();
570
   const int dref = arg;
571
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
572
 
573
   // dref comes before bias/lod
574
   if (i->tex.target.isShadow())
575
      if (i->op == OP_TXB || i->op == OP_TXL)
576
         i->swapSources(dref, lod);
577
 
578
   // array index must be converted to u32
579
   if (i->tex.target.isArray()) {
580
      Value *layer = i->getSrc(arg - 1);
581
      LValue *src = new_LValue(func, FILE_GPR);
582
      bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
583
      bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
584
      i->setSrc(arg - 1, src);
585
 
586
      if (i->tex.target.isCube()) {
587
         std::vector acube, a2d;
588
         int c;
589
 
590
         acube.resize(4);
591
         for (c = 0; c < 4; ++c)
592
            acube[c] = i->getSrc(c);
593
         a2d.resize(4);
594
         for (c = 0; c < 3; ++c)
595
            a2d[c] = new_LValue(func, FILE_GPR);
596
         a2d[3] = NULL;
597
 
598
         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
599
                   a2d, acube)->asTex()->tex.mask = 0x7;
600
 
601
         for (c = 0; c < 3; ++c)
602
            i->setSrc(c, a2d[c]);
603
         i->setSrc(c, NULL);
604
         for (; i->srcExists(c + 1); ++c)
605
            i->setSrc(c, i->getSrc(c + 1));
606
 
607
         i->tex.target = i->tex.target.isShadow() ?
608
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
609
      }
610
   }
611
 
612
   // texel offsets are 3 immediate fields in the instruction,
613
   // nv50 cannot do textureGatherOffsets
614
   assert(i->tex.useOffsets <= 1);
615
 
616
   return true;
617
}
618
 
619
// Bias must be equal for all threads of a quad or lod calculation will fail.
620
//
621
// The lanes of a quad are grouped by the bit in the condition register they
622
// have set, which is selected by differing bias values.
623
// Move the input values for TEX into a new register set for each group and
624
// execute TEX only for a specific group.
625
// We always need to use 4 new registers for the inputs/outputs because the
626
// implicitly calculated derivatives must be correct.
627
//
628
// TODO: move to SSA phase so we can easily determine whether bias is constant
629
bool
630
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
631
{
632
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
633
   int l, d;
634
 
635
   handleTEX(i);
636
   Value *bias = i->getSrc(i->tex.target.getArgCount());
637
   if (bias->isUniform())
638
      return true;
639
 
640
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
641
                                 bld.loadImm(NULL, 1));
642
   bld.setPosition(cond, false);
643
 
644
   for (l = 1; l < 4; ++l) {
645
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
646
      Value *bit = bld.getSSA();
647
      Value *pred = bld.getScratch(1, FILE_FLAGS);
648
      Value *imm = bld.loadImm(NULL, (1 << l));
649
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
650
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
651
      cond->setSrc(l, bit);
652
   }
653
   Value *flags = bld.getScratch(1, FILE_FLAGS);
654
   bld.setPosition(cond, true);
655
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
656
 
657
   Instruction *tex[4];
658
   for (l = 0; l < 4; ++l) {
659
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
660
      bld.insert(tex[l]);
661
   }
662
 
663
   Value *res[4][4];
664
   for (d = 0; i->defExists(d); ++d)
665
      res[0][d] = tex[0]->getDef(d);
666
   for (l = 1; l < 4; ++l) {
667
      for (d = 0; tex[l]->defExists(d); ++d) {
668
         res[l][d] = cloneShallow(func, res[0][d]);
669
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
670
      }
671
   }
672
 
673
   for (d = 0; i->defExists(d); ++d) {
674
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
675
      for (l = 0; l < 4; ++l)
676
         dst->setSrc(l, res[l][d]);
677
   }
678
   delete_Instruction(prog, i);
679
   return true;
680
}
681
 
682
// LOD must be equal for all threads of a quad.
683
// Unlike with TXB, here we can just diverge since there's no LOD calculation
684
// that would require all 4 threads' sources to be set up properly.
685
bool
686
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
687
{
688
   handleTEX(i);
689
   Value *lod = i->getSrc(i->tex.target.getArgCount());
690
   if (lod->isUniform())
691
      return true;
692
 
693
   BasicBlock *currBB = i->bb;
694
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
695
   BasicBlock *joinBB = i->bb->splitAfter(i);
696
 
697
   bld.setPosition(currBB, true);
698
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
699
 
700
   for (int l = 0; l <= 3; ++l) {
701
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
702
      Value *pred = bld.getScratch(1, FILE_FLAGS);
703
      bld.setPosition(currBB, true);
704
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
705
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
706
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
707
      if (l <= 2) {
708
         BasicBlock *laneBB = new BasicBlock(func);
709
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
710
         currBB = laneBB;
711
      }
712
   }
713
   bld.setPosition(joinBB, false);
714
   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
715
   return true;
716
}
717
 
718
bool
719
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
720
{
721
   static const uint8_t qOps[4][2] =
722
   {
723
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
724
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
725
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
726
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
727
   };
728
   Value *def[4][4];
729
   Value *crd[3];
730
   Instruction *tex;
731
   Value *zero = bld.loadImm(bld.getSSA(), 0);
732
   int l, c;
733
   const int dim = i->tex.target.getDim();
734
 
735
   handleTEX(i);
736
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
737
 
738
   for (c = 0; c < dim; ++c)
739
      crd[c] = bld.getScratch();
740
 
741
   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
742
   for (l = 0; l < 4; ++l) {
743
      // mov coordinates from lane l to all lanes
744
      for (c = 0; c < dim; ++c)
745
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
746
      // add dPdx from lane l to lanes dx
747
      for (c = 0; c < dim; ++c)
748
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
749
      // add dPdy from lane l to lanes dy
750
      for (c = 0; c < dim; ++c)
751
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
752
      // texture
753
      bld.insert(tex = cloneForward(func, i));
754
      for (c = 0; c < dim; ++c)
755
         tex->setSrc(c, crd[c]);
756
      // save results
757
      for (c = 0; i->defExists(c); ++c) {
758
         Instruction *mov;
759
         def[c][l] = bld.getSSA();
760
         mov = bld.mkMov(def[c][l], tex->getDef(c));
761
         mov->fixed = 1;
762
         mov->lanes = 1 << l;
763
      }
764
   }
765
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
766
 
767
   for (c = 0; i->defExists(c); ++c) {
768
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
769
      for (l = 0; l < 4; ++l)
770
         u->setSrc(l, def[c][l]);
771
   }
772
 
773
   i->bb->remove(i);
774
   return true;
775
}
776
 
777
bool
778
NV50LoweringPreSSA::handleSET(Instruction *i)
779
{
780
   if (i->dType == TYPE_F32) {
781
      bld.setPosition(i, true);
782
      i->dType = TYPE_U32;
783
      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
784
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
785
   }
786
   return true;
787
}
788
 
789
bool
790
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
791
{
792
   Value *src0 = bld.getSSA();
793
   Value *src1 = bld.getSSA();
794
   Value *pred = bld.getScratch(1, FILE_FLAGS);
795
 
796
   Value *v0 = i->getSrc(0);
797
   Value *v1 = i->getSrc(1);
798
   // XXX: these probably shouldn't be immediates in the first place ...
799
   if (v0->asImm())
800
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
801
   if (v1->asImm())
802
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
803
 
804
   bld.setPosition(i, true);
805
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
806
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
807
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
808
 
809
   bld.setPosition(i, false);
810
   i->op = OP_SET;
811
   i->setFlagsDef(0, pred);
812
   i->dType = TYPE_U8;
813
   i->setSrc(0, i->getSrc(2));
814
   i->setSrc(2, NULL);
815
   i->setSrc(1, bld.loadImm(NULL, 0));
816
 
817
   return true;
818
}
819
 
820
bool
821
NV50LoweringPreSSA::handleSELP(Instruction *i)
822
{
823
   Value *src0 = bld.getSSA();
824
   Value *src1 = bld.getSSA();
825
 
826
   Value *v0 = i->getSrc(0);
827
   Value *v1 = i->getSrc(1);
828
   if (v0->asImm())
829
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
830
   if (v1->asImm())
831
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
832
 
833
   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
834
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
835
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
836
   delete_Instruction(prog, i);
837
   return true;
838
}
839
 
840
bool
841
NV50LoweringPreSSA::handleWRSV(Instruction *i)
842
{
843
   Symbol *sym = i->getSrc(0)->asSym();
844
 
845
   // these are all shader outputs, $sreg are not writeable
846
   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
847
   if (addr >= 0x400)
848
      return false;
849
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
850
 
851
   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
852
 
853
   bld.getBB()->remove(i);
854
   return true;
855
}
856
 
857
bool
858
NV50LoweringPreSSA::handleCALL(Instruction *i)
859
{
860
   if (prog->getType() == Program::TYPE_COMPUTE) {
861
      // Add implicit "thread id" argument in $r0 to the function
862
      i->setSrc(i->srcCount(), tid);
863
   }
864
   return true;
865
}
866
 
867
bool
868
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
869
{
870
   delete_Instruction(prog, i);
871
   return true;
872
}
873
 
874
bool
875
NV50LoweringPreSSA::handleCONT(Instruction *i)
876
{
877
   i->op = OP_BRA;
878
   return true;
879
}
880
 
881
bool
882
NV50LoweringPreSSA::handleRDSV(Instruction *i)
883
{
884
   Symbol *sym = i->getSrc(0)->asSym();
885
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
886
   Value *def = i->getDef(0);
887
   SVSemantic sv = sym->reg.data.sv.sv;
888
   int idx = sym->reg.data.sv.index;
889
 
890
   if (addr >= 0x400) // mov $sreg
891
      return true;
892
 
893
   switch (sv) {
894
   case SV_POSITION:
895
      assert(prog->getType() == Program::TYPE_FRAGMENT);
896
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
897
      break;
898
   case SV_FACE:
899
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
900
      if (i->dType == TYPE_F32) {
901
         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
902
         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
903
      }
904
      break;
905
   case SV_NCTAID:
906
   case SV_CTAID:
907
   case SV_NTID:
908
      if ((sv == SV_NCTAID && idx >= 2) ||
909
          (sv == SV_NTID && idx >= 3)) {
910
         bld.mkMov(def, bld.mkImm(1));
911
      } else if (sv == SV_CTAID && idx >= 2) {
912
         bld.mkMov(def, bld.mkImm(0));
913
      } else {
914
         Value *x = bld.getSSA(2);
915
         bld.mkOp1(OP_LOAD, TYPE_U16, x,
916
                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
917
         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
918
      }
919
      break;
920
   case SV_TID:
921
      if (idx == 0) {
922
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
923
      } else if (idx == 1) {
924
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
925
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
926
      } else if (idx == 2) {
927
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
928
      } else {
929
         bld.mkMov(def, bld.mkImm(0));
930
      }
931
      break;
932
   default:
933
      bld.mkFetch(i->getDef(0), i->dType,
934
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
935
      break;
936
   }
937
   bld.getBB()->remove(i);
938
   return true;
939
}
940
 
941
bool
942
NV50LoweringPreSSA::handleDIV(Instruction *i)
943
{
944
   if (!isFloatType(i->dType))
945
      return true;
946
   bld.setPosition(i, false);
947
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
948
   i->op = OP_MUL;
949
   i->setSrc(1, rcp->getDef(0));
950
   return true;
951
}
952
 
953
bool
954
NV50LoweringPreSSA::handleSQRT(Instruction *i)
955
{
956
   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
957
                                bld.getSSA(), i->getSrc(0));
958
   i->op = OP_MUL;
959
   i->setSrc(1, rsq->getDef(0));
960
 
961
   return true;
962
}
963
 
964
bool
965
NV50LoweringPreSSA::handlePOW(Instruction *i)
966
{
967
   LValue *val = bld.getScratch();
968
 
969
   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
970
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
971
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
972
 
973
   i->op = OP_EX2;
974
   i->setSrc(0, val);
975
   i->setSrc(1, NULL);
976
 
977
   return true;
978
}
979
 
980
bool
981
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
982
{
983
   if (prog->getType() == Program::TYPE_FRAGMENT) {
984
      if (i->getIndirect(0, 0)) {
985
         // TODO: redirect to l[] here, load to GPRs at exit
986
         return false;
987
      } else {
988
         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
989
 
990
         i->op = OP_MOV;
991
         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
992
         i->src(0).set(i->src(1));
993
         i->setSrc(1, NULL);
994
         i->setDef(0, new_LValue(func, FILE_GPR));
995
         i->getDef(0)->reg.data.id = id;
996
 
997
         prog->maxGPR = MAX2(prog->maxGPR, id);
998
      }
999
   }
1000
   return true;
1001
}
1002
 
1003
// Set flags according to predicate and make the instruction read $cX.
1004
void
1005
NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1006
{
1007
   Value *pred = insn->getPredicate();
1008
   Value *cdst;
1009
 
1010
   if (!pred || pred->reg.file == FILE_FLAGS)
1011
      return;
1012
   cdst = bld.getSSA(1, FILE_FLAGS);
1013
 
1014
   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, bld.loadImm(NULL, 0), pred);
1015
 
1016
   insn->setPredicate(insn->cc, cdst);
1017
}
1018
 
1019
//
1020
// - add quadop dance for texturing
1021
// - put FP outputs in GPRs
1022
// - convert instruction sequences
1023
//
1024
bool
1025
NV50LoweringPreSSA::visit(Instruction *i)
1026
{
1027
   bld.setPosition(i, false);
1028
 
1029
   if (i->cc != CC_ALWAYS)
1030
      checkPredicate(i);
1031
 
1032
   switch (i->op) {
1033
   case OP_TEX:
1034
   case OP_TXF:
1035
   case OP_TXG:
1036
      return handleTEX(i->asTex());
1037
   case OP_TXB:
1038
      return handleTXB(i->asTex());
1039
   case OP_TXL:
1040
      return handleTXL(i->asTex());
1041
   case OP_TXD:
1042
      return handleTXD(i->asTex());
1043
   case OP_EX2:
1044
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1045
      i->setSrc(0, i->getDef(0));
1046
      break;
1047
   case OP_SET:
1048
      return handleSET(i);
1049
   case OP_SLCT:
1050
      return handleSLCT(i->asCmp());
1051
   case OP_SELP:
1052
      return handleSELP(i);
1053
   case OP_POW:
1054
      return handlePOW(i);
1055
   case OP_DIV:
1056
      return handleDIV(i);
1057
   case OP_SQRT:
1058
      return handleSQRT(i);
1059
   case OP_EXPORT:
1060
      return handleEXPORT(i);
1061
   case OP_RDSV:
1062
      return handleRDSV(i);
1063
   case OP_WRSV:
1064
      return handleWRSV(i);
1065
   case OP_CALL:
1066
      return handleCALL(i);
1067
   case OP_PRECONT:
1068
      return handlePRECONT(i);
1069
   case OP_CONT:
1070
      return handleCONT(i);
1071
   default:
1072
      break;
1073
   }
1074
   return true;
1075
}
1076
 
1077
bool
1078
TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1079
{
1080
   bool ret = false;
1081
 
1082
   if (stage == CG_STAGE_PRE_SSA) {
1083
      NV50LoweringPreSSA pass(prog);
1084
      ret = pass.run(prog, false, true);
1085
   } else
1086
   if (stage == CG_STAGE_SSA) {
1087
      if (!prog->targetPriv)
1088
         prog->targetPriv = new std::list();
1089
      NV50LegalizeSSA pass(prog);
1090
      ret = pass.run(prog, false, true);
1091
   } else
1092
   if (stage == CG_STAGE_POST_RA) {
1093
      NV50LegalizePostRA pass;
1094
      ret = pass.run(prog, false, true);
1095
      if (prog->targetPriv)
1096
         delete reinterpret_cast *>(prog->targetPriv);
1097
   }
1098
   return ret;
1099
}
1100
 
1101
} // namespace nv50_ir