Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4358 Serge 1
/*
 * Copyright 2010 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
22
 
23
#include "nv50_program.h"
24
#include "nv50_context.h"
25
 
26
#include "codegen/nv50_ir_driver.h"
27
 
28
/* Number of set bits in the low nibble of val; bits 4 and above are ignored. */
static INLINE unsigned
bitcount4(const uint32_t val)
{
   /* Indexed by nibble value 0x0..0xf: how many of its four bits are set. */
   static const uint8_t nibble_bits[16] = {
      0, 1, 1, 2,
      1, 2, 2, 3,
      1, 2, 2, 3,
      2, 3, 3, 4
   };
   const uint32_t nibble = val & 0xf;

   return nibble_bits[nibble];
}
35
 
36
static int
37
nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
38
{
39
   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
40
   unsigned i, n, c;
41
 
42
   n = 0;
43
   for (i = 0; i < info->numInputs; ++i) {
44
      prog->in[i].id = i;
45
      prog->in[i].sn = info->in[i].sn;
46
      prog->in[i].si = info->in[i].si;
47
      prog->in[i].hw = n;
48
      prog->in[i].mask = info->in[i].mask;
49
 
50
      prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
51
 
52
      for (c = 0; c < 4; ++c)
53
         if (info->in[i].mask & (1 << c))
54
            info->in[i].slot[c] = n++;
55
   }
56
   prog->in_nr = info->numInputs;
57
 
58
   for (i = 0; i < info->numSysVals; ++i) {
59
      switch (info->sv[i].sn) {
60
      case TGSI_SEMANTIC_INSTANCEID:
61
         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
62
         continue;
63
      case TGSI_SEMANTIC_VERTEXID:
64
         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
65
         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_UNK12;
66
         continue;
67
      default:
68
         break;
69
      }
70
   }
71
 
72
   /*
73
    * Corner case: VP has no inputs, but we will still need to submit data to
74
    * draw it. HW will shout at us and won't draw anything if we don't enable
75
    * any input, so let's just pretend it's the first one.
76
    */
77
   if (prog->vp.attrs[0] == 0 &&
78
       prog->vp.attrs[1] == 0 &&
79
       prog->vp.attrs[2] == 0)
80
      prog->vp.attrs[0] |= 0xf;
81
 
82
   /* VertexID before InstanceID */
83
   if (info->io.vertexId < info->numSysVals)
84
      info->sv[info->io.vertexId].slot[0] = n++;
85
   if (info->io.instanceId < info->numSysVals)
86
      info->sv[info->io.instanceId].slot[0] = n++;
87
 
88
   n = 0;
89
   for (i = 0; i < info->numOutputs; ++i) {
90
      switch (info->out[i].sn) {
91
      case TGSI_SEMANTIC_PSIZE:
92
         prog->vp.psiz = i;
93
         break;
94
      case TGSI_SEMANTIC_CLIPDIST:
95
         prog->vp.clpd[info->out[i].si] = n;
96
         break;
97
      case TGSI_SEMANTIC_EDGEFLAG:
98
         prog->vp.edgeflag = i;
99
         break;
100
      case TGSI_SEMANTIC_BCOLOR:
101
         prog->vp.bfc[info->out[i].si] = i;
102
         break;
103
      default:
104
         break;
105
      }
106
      prog->out[i].id = i;
107
      prog->out[i].sn = info->out[i].sn;
108
      prog->out[i].si = info->out[i].si;
109
      prog->out[i].hw = n;
110
      prog->out[i].mask = info->out[i].mask;
111
 
112
      for (c = 0; c < 4; ++c)
113
         if (info->out[i].mask & (1 << c))
114
            info->out[i].slot[c] = n++;
115
   }
116
   prog->out_nr = info->numOutputs;
117
   prog->max_out = n;
118
 
119
   if (prog->vp.psiz < info->numOutputs)
120
      prog->vp.psiz = prog->out[prog->vp.psiz].hw;
121
 
122
   return 0;
123
}
124
 
125
static int
126
nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
127
{
128
   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
129
   unsigned i, n, m, c;
130
   unsigned nvary;
131
   unsigned nflat;
132
   unsigned nintp = 0;
133
 
134
   /* count recorded non-flat inputs */
135
   for (m = 0, i = 0; i < info->numInputs; ++i) {
136
      switch (info->in[i].sn) {
137
      case TGSI_SEMANTIC_POSITION:
138
      case TGSI_SEMANTIC_FACE:
139
         continue;
140
      default:
141
         m += info->in[i].flat ? 0 : 1;
142
         break;
143
      }
144
   }
145
   /* careful: id may be != i in info->in[prog->in[i].id] */
146
 
147
   /* Fill prog->in[] so that non-flat inputs are first and
148
    * kick out special inputs that don't use the RESULT_MAP.
149
    */
150
   for (n = 0, i = 0; i < info->numInputs; ++i) {
151
      if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
152
         prog->fp.interp |= info->in[i].mask << 24;
153
         for (c = 0; c < 4; ++c)
154
            if (info->in[i].mask & (1 << c))
155
               info->in[i].slot[c] = nintp++;
156
      } else
157
      if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
158
         info->in[i].slot[0] = 255;
159
      } else {
160
         unsigned j = info->in[i].flat ? m++ : n++;
161
 
162
         if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
163
            prog->vp.bfc[info->in[i].si] = j;
164
 
165
         prog->in[j].id = i;
166
         prog->in[j].mask = info->in[i].mask;
167
         prog->in[j].sn = info->in[i].sn;
168
         prog->in[j].si = info->in[i].si;
169
         prog->in[j].linear = info->in[i].linear;
170
 
171
         prog->in_nr++;
172
      }
173
   }
174
   if (!(prog->fp.interp & (8 << 24))) {
175
      ++nintp;
176
      prog->fp.interp |= 8 << 24;
177
   }
178
 
179
   for (i = 0; i < prog->in_nr; ++i) {
180
      int j = prog->in[i].id;
181
 
182
      prog->in[i].hw = nintp;
183
      for (c = 0; c < 4; ++c)
184
         if (prog->in[i].mask & (1 << c))
185
            info->in[j].slot[c] = nintp++;
186
   }
187
   /* (n == m) if m never increased, i.e. no flat inputs */
188
   nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
189
   nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
190
   nvary = nintp - nflat;
191
 
192
   prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
193
   prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
194
 
195
   /* put front/back colors right after HPOS */
196
   prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
197
   for (i = 0; i < 2; ++i)
198
      if (prog->vp.bfc[i] < 0xff)
199
         prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
200
 
201
   /* FP outputs */
202
 
203
   if (info->prop.fp.numColourResults > 1)
204
      prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
205
 
206
   for (i = 0; i < info->numOutputs; ++i) {
207
      prog->out[i].id = i;
208
      prog->out[i].sn = info->out[i].sn;
209
      prog->out[i].si = info->out[i].si;
210
      prog->out[i].mask = info->out[i].mask;
211
 
212
      if (i == info->io.fragDepth || i == info->io.sampleMask)
213
         continue;
214
      prog->out[i].hw = info->out[i].si * 4;
215
 
216
      for (c = 0; c < 4; ++c)
217
         info->out[i].slot[c] = prog->out[i].hw + c;
218
 
219
      prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
220
   }
221
 
222
   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
223
      info->out[info->io.sampleMask].slot[0] = prog->max_out++;
224
 
225
   if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
226
      info->out[info->io.fragDepth].slot[2] = prog->max_out++;
227
 
228
   if (!prog->max_out)
229
      prog->max_out = 4;
230
 
231
   return 0;
232
}
233
 
234
static int
235
nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
236
{
237
   switch (info->type) {
238
   case PIPE_SHADER_VERTEX:
239
      return nv50_vertprog_assign_slots(info);
240
   case PIPE_SHADER_GEOMETRY:
241
      return nv50_vertprog_assign_slots(info);
242
   case PIPE_SHADER_FRAGMENT:
243
      return nv50_fragprog_assign_slots(info);
244
   default:
245
      return -1;
246
   }
247
}
248
 
249
static struct nv50_stream_output_state *
250
nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
251
                                  const struct pipe_stream_output_info *pso)
252
{
253
   struct nv50_stream_output_state *so;
254
   unsigned b, i, c;
255
   unsigned base[4];
256
 
257
   so = MALLOC_STRUCT(nv50_stream_output_state);
258
   if (!so)
259
      return NULL;
260
   memset(so->map, 0xff, sizeof(so->map));
261
 
262
   for (b = 0; b < 4; ++b)
263
      so->num_attribs[b] = 0;
264
   for (i = 0; i < pso->num_outputs; ++i) {
265
      unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
266
      b = pso->output[i].output_buffer;
267
      assert(b < 4);
268
      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
269
   }
270
 
271
   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
272
 
273
   so->stride[0] = pso->stride[0] * 4;
274
   base[0] = 0;
275
   for (b = 1; b < 4; ++b) {
276
      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
277
      so->stride[b] = so->num_attribs[b] * 4;
278
      if (so->num_attribs[b])
279
         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
280
      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
281
   }
282
   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
283
      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
284
      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
285
   }
286
 
287
   so->map_size = base[3] + so->num_attribs[3];
288
 
289
   for (i = 0; i < pso->num_outputs; ++i) {
290
      const unsigned s = pso->output[i].start_component;
291
      const unsigned p = pso->output[i].dst_offset;
292
      const unsigned r = pso->output[i].register_index;
293
      b = pso->output[i].output_buffer;
294
 
295
      for (c = 0; c < pso->output[i].num_components; ++c)
296
         so->map[base[b] + p + c] = info->out[r].slot[s + c];
297
   }
298
 
299
   return so;
300
}
301
 
302
boolean
303
nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
304
{
305
   struct nv50_ir_prog_info *info;
306
   int ret;
307
   const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
308
 
309
   info = CALLOC_STRUCT(nv50_ir_prog_info);
310
   if (!info)
311
      return FALSE;
312
 
313
   info->type = prog->type;
314
   info->target = chipset;
315
   info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
316
   info->bin.source = (void *)prog->pipe.tokens;
317
 
318
   info->io.ucpCBSlot = 15;
319
   info->io.ucpBase = 0;
320
   info->io.genUserClip = prog->vp.clpd_nr;
321
 
322
   info->assignSlots = nv50_program_assign_varying_slots;
323
 
324
   prog->vp.bfc[0] = 0xff;
325
   prog->vp.bfc[1] = 0xff;
326
   prog->vp.edgeflag = 0xff;
327
   prog->vp.clpd[0] = map_undef;
328
   prog->vp.clpd[1] = map_undef;
329
   prog->vp.psiz = map_undef;
330
   prog->gp.primid = 0x80;
331
 
332
   info->driverPriv = prog;
333
 
334
#ifdef DEBUG
335
   info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
336
   info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
337
#else
338
   info->optLevel = 3;
339
#endif
340
 
341
   ret = nv50_ir_generate_code(info);
342
   if (ret) {
343
      NOUVEAU_ERR("shader translation failed: %i\n", ret);
344
      goto out;
345
   }
346
   FREE(info->bin.syms);
347
 
348
   prog->code = info->bin.code;
349
   prog->code_size = info->bin.codeSize;
350
   prog->fixups = info->bin.relocData;
351
   prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
352
   prog->tls_space = info->bin.tlsSpace;
353
 
354
   if (prog->type == PIPE_SHADER_FRAGMENT) {
355
      if (info->prop.fp.writesDepth) {
356
         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
357
         prog->fp.flags[1] = 0x11;
358
      }
359
      if (info->prop.fp.usesDiscard)
360
         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
361
   }
362
 
363
   if (prog->pipe.stream_output.num_outputs)
364
      prog->so = nv50_program_create_strmout_state(info,
365
                                                   &prog->pipe.stream_output);
366
 
367
out:
368
   FREE(info);
369
   return !ret;
370
}
371
 
372
boolean
373
nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
374
{
375
   struct nouveau_heap *heap;
376
   int ret;
377
   uint32_t size = align(prog->code_size, 0x40);
378
 
379
   switch (prog->type) {
380
   case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
381
   case PIPE_SHADER_GEOMETRY: heap = nv50->screen->fp_code_heap; break;
382
   case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break;
383
   default:
384
      assert(!"invalid program type");
385
      return FALSE;
386
   }
387
 
388
   ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
389
   if (ret) {
390
      /* Out of space: evict everything to compactify the code segment, hoping
391
       * the working set is much smaller and drifts slowly. Improve me !
392
       */
393
      while (heap->next) {
394
         struct nv50_program *evict = heap->next->priv;
395
         if (evict)
396
            nouveau_heap_free(&evict->mem);
397
      }
398
      debug_printf("WARNING: out of code space, evicting all shaders.\n");
399
      ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
400
      if (ret) {
401
         NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
402
         return FALSE;
403
      }
404
   }
405
   prog->code_base = prog->mem->start;
406
 
407
   ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
408
   if (ret < 0)
409
      return FALSE;
410
   if (ret > 0)
411
      nv50->state.new_tls_space = TRUE;
412
 
413
   if (prog->fixups)
414
      nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
415
 
416
   nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
417
                       (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
418
                       NOUVEAU_BO_VRAM, prog->code_size, prog->code);
419
 
420
   BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
421
   PUSH_DATA (nv50->base.pushbuf, 0);
422
 
423
   return TRUE;
424
}
425
 
426
void
427
nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
428
{
429
   const struct pipe_shader_state pipe = p->pipe;
430
   const ubyte type = p->type;
431
 
432
   if (p->mem)
433
      nouveau_heap_free(&p->mem);
434
 
435
   FREE(p->code);
436
 
437
   FREE(p->fixups);
438
 
439
   FREE(p->so);
440
 
441
   memset(p, 0, sizeof(*p));
442
 
443
   p->pipe = pipe;
444
   p->type = type;
445
}