Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
5564 serge 1
/*
2
 * Copyright (C) 2008 Nicolai Haehnle.
3
 *
4
 * All Rights Reserved.
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining
7
 * a copy of this software and associated documentation files (the
8
 * "Software"), to deal in the Software without restriction, including
9
 * without limitation the rights to use, copy, modify, merge, publish,
10
 * distribute, sublicense, and/or sell copies of the Software, and to
11
 * permit persons to whom the Software is furnished to do so, subject to
12
 * the following conditions:
13
 *
14
 * The above copyright notice and this permission notice (including the
15
 * next paragraph) shall be included in all copies or substantial
16
 * portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21
 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
 *
26
 */
27
 
28
/**
29
 * @file
30
 *
31
 * Shareable transformations that transform "special" ALU instructions
32
 * into ALU instructions that are supported by hardware.
33
 *
34
 */
35
 
36
#include "radeon_program_alu.h"
37
 
38
#include "radeon_compiler.h"
39
#include "radeon_compiler_util.h"
40
 
41
 
42
static struct rc_instruction *emit1(
43
	struct radeon_compiler * c, struct rc_instruction * after,
44
	rc_opcode Opcode, struct rc_sub_instruction * base,
45
	struct rc_dst_register DstReg, struct rc_src_register SrcReg)
46
{
47
	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
48
 
49
	if (base) {
50
		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
51
	}
52
 
53
	fpi->U.I.Opcode = Opcode;
54
	fpi->U.I.DstReg = DstReg;
55
	fpi->U.I.SrcReg[0] = SrcReg;
56
	return fpi;
57
}
58
 
59
static struct rc_instruction *emit2(
60
	struct radeon_compiler * c, struct rc_instruction * after,
61
	rc_opcode Opcode, struct rc_sub_instruction * base,
62
	struct rc_dst_register DstReg,
63
	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
64
{
65
	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
66
 
67
	if (base) {
68
		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
69
	}
70
 
71
	fpi->U.I.Opcode = Opcode;
72
	fpi->U.I.DstReg = DstReg;
73
	fpi->U.I.SrcReg[0] = SrcReg0;
74
	fpi->U.I.SrcReg[1] = SrcReg1;
75
	return fpi;
76
}
77
 
78
static struct rc_instruction *emit3(
79
	struct radeon_compiler * c, struct rc_instruction * after,
80
	rc_opcode Opcode, struct rc_sub_instruction * base,
81
	struct rc_dst_register DstReg,
82
	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
83
	struct rc_src_register SrcReg2)
84
{
85
	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
86
 
87
	if (base) {
88
		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
89
	}
90
 
91
	fpi->U.I.Opcode = Opcode;
92
	fpi->U.I.DstReg = DstReg;
93
	fpi->U.I.SrcReg[0] = SrcReg0;
94
	fpi->U.I.SrcReg[1] = SrcReg1;
95
	fpi->U.I.SrcReg[2] = SrcReg2;
96
	return fpi;
97
}
98
 
99
static struct rc_dst_register dstregtmpmask(int index, int mask)
100
{
101
	struct rc_dst_register dst = {0, 0, 0};
102
	dst.File = RC_FILE_TEMPORARY;
103
	dst.Index = index;
104
	dst.WriteMask = mask;
105
	return dst;
106
}
107
 
108
static const struct rc_src_register builtin_zero = {
109
	.File = RC_FILE_NONE,
110
	.Index = 0,
111
	.Swizzle = RC_SWIZZLE_0000
112
};
113
static const struct rc_src_register builtin_one = {
114
	.File = RC_FILE_NONE,
115
	.Index = 0,
116
	.Swizzle = RC_SWIZZLE_1111
117
};
118
 
119
static const struct rc_src_register builtin_half = {
120
	.File = RC_FILE_NONE,
121
	.Index = 0,
122
	.Swizzle = RC_SWIZZLE_HHHH
123
};
124
 
125
static const struct rc_src_register srcreg_undefined = {
126
	.File = RC_FILE_NONE,
127
	.Index = 0,
128
	.Swizzle = RC_SWIZZLE_XYZW
129
};
130
 
131
static struct rc_src_register srcreg(int file, int index)
132
{
133
	struct rc_src_register src = srcreg_undefined;
134
	src.File = file;
135
	src.Index = index;
136
	return src;
137
}
138
 
139
static struct rc_src_register srcregswz(int file, int index, int swz)
140
{
141
	struct rc_src_register src = srcreg_undefined;
142
	src.File = file;
143
	src.Index = index;
144
	src.Swizzle = swz;
145
	return src;
146
}
147
 
148
static struct rc_src_register absolute(struct rc_src_register reg)
149
{
150
	struct rc_src_register newreg = reg;
151
	newreg.Abs = 1;
152
	newreg.Negate = RC_MASK_NONE;
153
	return newreg;
154
}
155
 
156
static struct rc_src_register negate(struct rc_src_register reg)
157
{
158
	struct rc_src_register newreg = reg;
159
	newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
160
	return newreg;
161
}
162
 
163
/**
 * Return a copy of @p reg whose swizzle is the existing swizzle combined
 * with the per-channel selectors @p x, @p y, @p z, @p w by
 * combine_swizzles4().
 */
static struct rc_src_register swizzle(struct rc_src_register reg,
		rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
{
	/* reg is passed by value, so it can be edited in place. */
	reg.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
	return reg;
}
170
 
171
static struct rc_src_register swizzle_smear(struct rc_src_register reg,
172
		rc_swizzle x)
173
{
174
	return swizzle(reg, x, x, x, x);
175
}
176
 
177
static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
178
{
179
	return swizzle_smear(reg, RC_SWIZZLE_X);
180
}
181
 
182
static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
183
{
184
	return swizzle_smear(reg, RC_SWIZZLE_Y);
185
}
186
 
187
static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
188
{
189
	return swizzle_smear(reg, RC_SWIZZLE_Z);
190
}
191
 
192
static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
193
{
194
	return swizzle_smear(reg, RC_SWIZZLE_W);
195
}
196
 
197
static int is_dst_safe_to_reuse(struct rc_instruction *inst)
198
{
199
	const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
200
	unsigned i;
201
 
202
	assert(info->HasDstReg);
203
 
204
	if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
205
		return 0;
206
 
207
	for (i = 0; i < info->NumSrcRegs; i++) {
208
		if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
209
		    inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
210
			return 0;
211
	}
212
 
213
	return 1;
214
}
215
 
216
static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
217
					       struct rc_instruction *inst)
218
{
219
	unsigned tmp;
220
 
221
	if (is_dst_safe_to_reuse(inst))
222
		tmp = inst->U.I.DstReg.Index;
223
	else
224
		tmp = rc_find_free_temporary(c);
225
 
226
	return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
227
}
228
 
229
static void transform_ABS(struct radeon_compiler* c,
230
	struct rc_instruction* inst)
231
{
232
	struct rc_src_register src = inst->U.I.SrcReg[0];
233
	src.Abs = 1;
234
	src.Negate = RC_MASK_NONE;
235
	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
236
	rc_remove_instruction(inst);
237
}
238
 
239
static void transform_CEIL(struct radeon_compiler* c,
240
	struct rc_instruction* inst)
241
{
242
	/* Assuming:
243
	 *     ceil(x) = -floor(-x)
244
	 *
245
	 * After inlining floor:
246
	 *     ceil(x) = -(-x-frac(-x))
247
	 *
248
	 * After simplification:
249
	 *     ceil(x) = x+frac(-x)
250
	 */
251
 
252
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
253
	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
254
	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
255
		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
256
	rc_remove_instruction(inst);
257
}
258
 
259
static void transform_CLAMP(struct radeon_compiler *c,
260
	struct rc_instruction *inst)
261
{
262
	/* CLAMP dst, src, min, max
263
	 *    into:
264
	 * MIN tmp, src, max
265
	 * MAX dst, tmp, min
266
	 */
267
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
268
	emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
269
		inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
270
	emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
271
		srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
272
	rc_remove_instruction(inst);
273
}
274
 
275
static void transform_DP2(struct radeon_compiler* c,
276
	struct rc_instruction* inst)
277
{
278
	struct rc_src_register src0 = inst->U.I.SrcReg[0];
279
	struct rc_src_register src1 = inst->U.I.SrcReg[1];
280
	src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
281
	src0.Swizzle &= ~(63 << (3 * 2));
282
	src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
283
	src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
284
	src1.Swizzle &= ~(63 << (3 * 2));
285
	src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
286
	emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
287
	rc_remove_instruction(inst);
288
}
289
 
290
static void transform_DPH(struct radeon_compiler* c,
291
	struct rc_instruction* inst)
292
{
293
	struct rc_src_register src0 = inst->U.I.SrcReg[0];
294
	src0.Negate &= ~RC_MASK_W;
295
	src0.Swizzle &= ~(7 << (3 * 3));
296
	src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
297
	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
298
	rc_remove_instruction(inst);
299
}
300
 
301
/**
302
 * [1, src0.y*src1.y, src0.z, src1.w]
303
 * So basically MUL with lotsa swizzling.
304
 */
305
static void transform_DST(struct radeon_compiler* c,
306
	struct rc_instruction* inst)
307
{
308
	emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
309
		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
310
		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
311
	rc_remove_instruction(inst);
312
}
313
 
314
static void transform_FLR(struct radeon_compiler* c,
315
	struct rc_instruction* inst)
316
{
317
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
318
	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
319
	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
320
		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
321
	rc_remove_instruction(inst);
322
}
323
 
324
static void transform_TRUNC(struct radeon_compiler* c,
325
	struct rc_instruction* inst)
326
{
327
	/* Definition of trunc:
328
	 *   trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)
329
	 *
330
	 * The multiplication by sgn(x) can be simplified using CMP:
331
	 *   y * sgn(x) = (x < 0 ? -y : y)
332
	 */
333
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
334
	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));
335
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),
336
	      negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
337
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],
338
	      negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));
339
	rc_remove_instruction(inst);
340
}
341
 
342
/**
343
 * Definition of LIT (from ARB_fragment_program):
344
 *
345
 *  tmp = VectorLoad(op0);
346
 *  if (tmp.x < 0) tmp.x = 0;
347
 *  if (tmp.y < 0) tmp.y = 0;
348
 *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
349
 *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
350
 *  result.x = 1.0;
351
 *  result.y = tmp.x;
352
 *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
353
 *  result.w = 1.0;
354
 *
355
 * The longest path of computation is the one leading to result.z,
356
 * consisting of 5 operations. This implementation of LIT takes
357
 * 5 slots, if the subsequent optimization passes are clever enough
358
 * to pair instructions correctly.
359
 */
360
static void transform_LIT(struct radeon_compiler* c,
361
	struct rc_instruction* inst)
362
{
363
	unsigned int constant;
364
	unsigned int constant_swizzle;
365
	unsigned int temp;
366
	struct rc_src_register srctemp;
367
 
368
	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
369
 
370
	if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
371
		struct rc_instruction * inst_mov;
372
 
373
		inst_mov = emit1(c, inst,
374
			RC_OPCODE_MOV, 0, inst->U.I.DstReg,
375
			srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
376
 
377
		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
378
		inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
379
		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
380
	}
381
 
382
	temp = inst->U.I.DstReg.Index;
383
	srctemp = srcreg(RC_FILE_TEMPORARY, temp);
384
 
385
	/* tmp.x = max(0.0, Src.x); */
386
	/* tmp.y = max(0.0, Src.y); */
387
	/* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
388
	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
389
		dstregtmpmask(temp, RC_MASK_XYW),
390
		inst->U.I.SrcReg[0],
391
		swizzle(srcreg(RC_FILE_CONSTANT, constant),
392
			RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
393
	emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
394
		dstregtmpmask(temp, RC_MASK_Z),
395
		swizzle_wwww(srctemp),
396
		negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
397
 
398
	/* tmp.w = Pow(tmp.y, tmp.w) */
399
	emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
400
		dstregtmpmask(temp, RC_MASK_W),
401
		swizzle_yyyy(srctemp));
402
	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
403
		dstregtmpmask(temp, RC_MASK_W),
404
		swizzle_wwww(srctemp),
405
		swizzle_zzzz(srctemp));
406
	emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
407
		dstregtmpmask(temp, RC_MASK_W),
408
		swizzle_wwww(srctemp));
409
 
410
	/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
411
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
412
		dstregtmpmask(temp, RC_MASK_Z),
413
		negate(swizzle_xxxx(srctemp)),
414
		swizzle_wwww(srctemp),
415
		builtin_zero);
416
 
417
	/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
418
	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
419
		dstregtmpmask(temp, RC_MASK_XYW),
420
		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
421
 
422
	rc_remove_instruction(inst);
423
}
424
 
425
static void transform_LRP(struct radeon_compiler* c,
426
	struct rc_instruction* inst)
427
{
428
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
429
 
430
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
431
		dst,
432
		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
433
	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
434
		inst->U.I.DstReg,
435
		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
436
 
437
	rc_remove_instruction(inst);
438
}
439
 
440
static void transform_POW(struct radeon_compiler* c,
441
	struct rc_instruction* inst)
442
{
443
	struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
444
	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
445
	tempdst.WriteMask = RC_MASK_W;
446
	tempsrc.Swizzle = RC_SWIZZLE_WWWW;
447
 
448
	emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
449
	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
450
	emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
451
 
452
	rc_remove_instruction(inst);
453
}
454
 
455
/* dst = ROUND(src) :
456
 *   add = src + .5
457
 *   frac = FRC(add)
458
 *   dst = add - frac
459
 *
460
 * According to the GLSL spec, the implementor can decide which way to round
461
 * when the fraction is .5.  We round down for .5.
462
 *
463
 */
464
static void transform_ROUND(struct radeon_compiler* c,
465
	struct rc_instruction* inst)
466
{
467
	unsigned int mask = inst->U.I.DstReg.WriteMask;
468
	unsigned int frac_index, add_index;
469
	struct rc_dst_register frac_dst, add_dst;
470
	struct rc_src_register frac_src, add_src;
471
 
472
	/* add = src + .5 */
473
	add_index = rc_find_free_temporary(c);
474
	add_dst = dstregtmpmask(add_index, mask);
475
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
476
								builtin_half);
477
	add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
478
 
479
 
480
	/* frac = FRC(add) */
481
	frac_index = rc_find_free_temporary(c);
482
	frac_dst = dstregtmpmask(frac_index, mask);
483
	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
484
	frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
485
 
486
	/* dst = add - frac */
487
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
488
						add_src, negate(frac_src));
489
	rc_remove_instruction(inst);
490
}
491
 
492
static void transform_RSQ(struct radeon_compiler* c,
493
	struct rc_instruction* inst)
494
{
495
	inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
496
}
497
 
498
static void transform_SEQ(struct radeon_compiler* c,
499
	struct rc_instruction* inst)
500
{
501
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
502
 
503
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
504
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
505
		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
506
 
507
	rc_remove_instruction(inst);
508
}
509
 
510
static void transform_SFL(struct radeon_compiler* c,
511
	struct rc_instruction* inst)
512
{
513
	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
514
	rc_remove_instruction(inst);
515
}
516
 
517
static void transform_SGE(struct radeon_compiler* c,
518
	struct rc_instruction* inst)
519
{
520
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
521
 
522
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
523
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
524
		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
525
 
526
	rc_remove_instruction(inst);
527
}
528
 
529
static void transform_SGT(struct radeon_compiler* c,
530
	struct rc_instruction* inst)
531
{
532
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
533
 
534
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
535
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
536
		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
537
 
538
	rc_remove_instruction(inst);
539
}
540
 
541
static void transform_SLE(struct radeon_compiler* c,
542
	struct rc_instruction* inst)
543
{
544
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
545
 
546
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
547
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
548
		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
549
 
550
	rc_remove_instruction(inst);
551
}
552
 
553
static void transform_SLT(struct radeon_compiler* c,
554
	struct rc_instruction* inst)
555
{
556
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
557
 
558
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
559
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
560
		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
561
 
562
	rc_remove_instruction(inst);
563
}
564
 
565
static void transform_SNE(struct radeon_compiler* c,
566
	struct rc_instruction* inst)
567
{
568
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
569
 
570
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
571
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
572
		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
573
 
574
	rc_remove_instruction(inst);
575
}
576
 
577
static void transform_SSG(struct radeon_compiler* c,
578
	struct rc_instruction* inst)
579
{
580
	/* result = sign(x)
581
	 *
582
	 *   CMP tmp0, -x, 1, 0
583
	 *   CMP tmp1, x, 1, 0
584
	 *   ADD result, tmp0, -tmp1;
585
	 */
586
	struct rc_dst_register dst0;
587
	unsigned tmp1;
588
 
589
	/* 0 < x */
590
	dst0 = try_to_reuse_dst(c, inst);
591
	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
592
	      dst0,
593
	      negate(inst->U.I.SrcReg[0]),
594
	      builtin_one,
595
	      builtin_zero);
596
 
597
	/* x < 0 */
598
	tmp1 = rc_find_free_temporary(c);
599
	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
600
	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
601
	      inst->U.I.SrcReg[0],
602
	      builtin_one,
603
	      builtin_zero);
604
 
605
	/* Either both are zero, or one of them is one and the other is zero. */
606
	/* result = tmp0 - tmp1 */
607
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
608
	      inst->U.I.DstReg,
609
	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
610
	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
611
 
612
	rc_remove_instruction(inst);
613
}
614
 
615
static void transform_SUB(struct radeon_compiler* c,
616
	struct rc_instruction* inst)
617
{
618
	inst->U.I.Opcode = RC_OPCODE_ADD;
619
	inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
620
}
621
 
622
static void transform_SWZ(struct radeon_compiler* c,
623
	struct rc_instruction* inst)
624
{
625
	inst->U.I.Opcode = RC_OPCODE_MOV;
626
}
627
 
628
static void transform_XPD(struct radeon_compiler* c,
629
	struct rc_instruction* inst)
630
{
631
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
632
 
633
	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
634
		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
635
		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
636
	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
637
		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
638
		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
639
		negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
640
 
641
	rc_remove_instruction(inst);
642
}
643
 
644
 
645
/**
646
 * Can be used as a transformation for @ref radeonClauseLocalTransform,
647
 * no userData necessary.
648
 *
649
 * Eliminates the following ALU instructions:
650
 *  ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
651
 * using:
652
 *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
653
 *
654
 * Transforms RSQ to Radeon's native RSQ by explicitly setting
655
 * absolute value.
656
 *
657
 * @note should be applicable to R300 and R500 fragment programs.
658
 */
659
int radeonTransformALU(
660
	struct radeon_compiler * c,
661
	struct rc_instruction* inst,
662
	void* unused)
663
{
664
	switch(inst->U.I.Opcode) {
665
	case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
666
	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
667
	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
668
	case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
669
	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
670
	case RC_OPCODE_DST: transform_DST(c, inst); return 1;
671
	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
672
	case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
673
	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
674
	case RC_OPCODE_POW: transform_POW(c, inst); return 1;
675
	case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
676
	case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
677
	case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
678
	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
679
	case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
680
	case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
681
	case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
682
	case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
683
	case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
684
	case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
685
	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
686
	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
687
	case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
688
	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
689
	default:
690
		return 0;
691
	}
692
}
693
 
694
 
695
static void transform_r300_vertex_ABS(struct radeon_compiler* c,
696
	struct rc_instruction* inst)
697
{
698
	/* Note: r500 can take absolute values, but r300 cannot. */
699
	inst->U.I.Opcode = RC_OPCODE_MAX;
700
	inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
701
	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
702
}
703
 
704
static void transform_r300_vertex_CMP(struct radeon_compiler* c,
705
	struct rc_instruction* inst)
706
{
707
	/* There is no decent CMP available, so let's rig one up.
708
	 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
709
	 * The following sequence consumes zero to two temps and two extra slots
710
	 * (the second temp and the second slot is consumed by transform_LRP),
711
	 * but should be equivalent:
712
	 *
713
	 * SLT tmp0, src0, 0.0
714
	 * LRP dst, tmp0, src1, src2
715
	 *
716
	 * Yes, I know, I'm a mad scientist. ~ C. & M. */
717
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
718
 
719
	/* SLT tmp0, src0, 0.0 */
720
	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
721
		dst,
722
		inst->U.I.SrcReg[0], builtin_zero);
723
 
724
	/* LRP dst, tmp0, src1, src2 */
725
	transform_LRP(c,
726
		emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
727
		      inst->U.I.DstReg,
728
		      srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
729
 
730
	rc_remove_instruction(inst);
731
}
732
 
733
static void transform_r300_vertex_DP2(struct radeon_compiler* c,
734
	struct rc_instruction* inst)
735
{
736
	struct rc_instruction *next_inst = inst->Next;
737
	transform_DP2(c, inst);
738
	next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
739
}
740
 
741
static void transform_r300_vertex_DP3(struct radeon_compiler* c,
742
	struct rc_instruction* inst)
743
{
744
	struct rc_src_register src0 = inst->U.I.SrcReg[0];
745
	struct rc_src_register src1 = inst->U.I.SrcReg[1];
746
	src0.Negate &= ~RC_MASK_W;
747
	src0.Swizzle &= ~(7 << (3 * 3));
748
	src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
749
	src1.Negate &= ~RC_MASK_W;
750
	src1.Swizzle &= ~(7 << (3 * 3));
751
	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
752
	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
753
	rc_remove_instruction(inst);
754
}
755
 
756
static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
757
	struct rc_instruction* inst)
758
{
759
	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
760
	unsigned constant_swizzle;
761
	int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
762
							 0.0000000000000000001,
763
							 &constant_swizzle);
764
 
765
	/* MOV dst, src */
766
	dst.WriteMask = RC_MASK_XYZW;
767
	emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
768
		dst,
769
		inst->U.I.SrcReg[0]);
770
 
771
	/* MAX dst.y, src, 0.00...001 */
772
	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
773
		dstregtmpmask(dst.Index, RC_MASK_Y),
774
		srcreg(RC_FILE_TEMPORARY, dst.Index),
775
		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
776
 
777
	inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
778
}
779
 
780
static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
781
	struct rc_instruction *inst)
782
{
783
	/* x = y  <==>  x >= y && y >= x */
784
	int tmp = rc_find_free_temporary(c);
785
 
786
	/* x <= y */
787
	emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
788
	      dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
789
	      inst->U.I.SrcReg[0],
790
	      inst->U.I.SrcReg[1]);
791
 
792
	/* y <= x */
793
	emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
794
	      inst->U.I.DstReg,
795
	      inst->U.I.SrcReg[1],
796
	      inst->U.I.SrcReg[0]);
797
 
798
	/* x && y  =  x * y */
799
	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
800
	      inst->U.I.DstReg,
801
	      srcreg(RC_FILE_TEMPORARY, tmp),
802
	      srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
803
 
804
	rc_remove_instruction(inst);
805
}
806
 
807
static void transform_r300_vertex_SNE(struct radeon_compiler *c,
808
	struct rc_instruction *inst)
809
{
810
	/* x != y  <==>  x < y || y < x */
811
	int tmp = rc_find_free_temporary(c);
812
 
813
	/* x < y */
814
	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
815
	      dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
816
	      inst->U.I.SrcReg[0],
817
	      inst->U.I.SrcReg[1]);
818
 
819
	/* y < x */
820
	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
821
	      inst->U.I.DstReg,
822
	      inst->U.I.SrcReg[1],
823
	      inst->U.I.SrcReg[0]);
824
 
825
	/* x || y  =  max(x, y) */
826
	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
827
	      inst->U.I.DstReg,
828
	      srcreg(RC_FILE_TEMPORARY, tmp),
829
	      srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
830
 
831
	rc_remove_instruction(inst);
832
}
833
 
834
static void transform_r300_vertex_SGT(struct radeon_compiler* c,
835
	struct rc_instruction* inst)
836
{
837
	/* x > y  <==>  -x < -y */
838
	inst->U.I.Opcode = RC_OPCODE_SLT;
839
	inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
840
	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
841
}
842
 
843
static void transform_r300_vertex_SLE(struct radeon_compiler* c,
844
	struct rc_instruction* inst)
845
{
846
	/* x <= y  <==>  -x >= -y */
847
	inst->U.I.Opcode = RC_OPCODE_SGE;
848
	inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
849
	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
850
}
851
 
852
static void transform_r300_vertex_SSG(struct radeon_compiler* c,
853
	struct rc_instruction* inst)
854
{
855
	/* result = sign(x)
856
	 *
857
	 *   SLT tmp0, 0, x;
858
	 *   SLT tmp1, x, 0;
859
	 *   ADD result, tmp0, -tmp1;
860
	 */
861
	struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
862
	unsigned tmp1;
863
 
864
	/* 0 < x */
865
	dst0 = try_to_reuse_dst(c, inst);
866
	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
867
	      dst0,
868
	      builtin_zero,
869
	      inst->U.I.SrcReg[0]);
870
 
871
	/* x < 0 */
872
	tmp1 = rc_find_free_temporary(c);
873
	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
874
	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
875
	      inst->U.I.SrcReg[0],
876
	      builtin_zero);
877
 
878
	/* Either both are zero, or one of them is one and the other is zero. */
879
	/* result = tmp0 - tmp1 */
880
	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
881
	      inst->U.I.DstReg,
882
	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
883
	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
884
 
885
	rc_remove_instruction(inst);
886
}
887
 
888
static void transform_vertex_TRUNC(struct radeon_compiler* c,
889
	struct rc_instruction* inst)
890
{
891
	struct rc_instruction *next = inst->Next;
892
 
893
	/* next->Prev is removed after each transformation and replaced
894
	 * by a new instruction. */
895
	transform_TRUNC(c, next->Prev);
896
	transform_r300_vertex_CMP(c, next->Prev);
897
}
898
 
899
/**
900
 * For use with rc_local_transform, this transforms non-native ALU
901
 * instructions of the r300 up to r500 vertex engine.
902
 */
903
int r300_transform_vertex_alu(
904
	struct radeon_compiler * c,
905
	struct rc_instruction* inst,
906
	void* unused)
907
{
908
	switch(inst->U.I.Opcode) {
909
	case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
910
	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
911
	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
912
	case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
913
	case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
914
	case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
915
	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
916
	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
917
	case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
918
	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
919
	case RC_OPCODE_SEQ:
920
		if (!c->is_r500) {
921
			transform_r300_vertex_SEQ(c, inst);
922
			return 1;
923
		}
924
		return 0;
925
	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
926
	case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
927
	case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
928
	case RC_OPCODE_SNE:
929
		if (!c->is_r500) {
930
			transform_r300_vertex_SNE(c, inst);
931
			return 1;
932
		}
933
		return 0;
934
	case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
935
	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
936
	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
937
	case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
938
	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
939
	default:
940
		return 0;
941
	}
942
}
943
 
944
static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
945
{
946
	static const float SinCosConsts[2][4] = {
947
		{
948
			1.273239545,		/* 4/PI */
949
			-0.405284735,		/* -4/(PI*PI) */
950
			3.141592654,		/* PI */
951
			0.2225			/* weight */
952
		},
953
		{
954
			0.75,
955
			0.5,
956
			0.159154943,		/* 1/(2*PI) */
957
			6.283185307		/* 2*PI */
958
		}
959
	};
960
	int i;
961
 
962
	for(i = 0; i < 2; ++i)
963
		constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
964
}
965
 
966
/**
967
 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
968
 *
969
 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
970
 * MAD tmp.x, tmp.y, |src|, tmp.x
971
 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
972
 * MAD dest, tmp.y, weight, tmp.x
973
 */
974
static void sin_approx(
975
	struct radeon_compiler* c, struct rc_instruction * inst,
976
	struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
977
{
978
	unsigned int tempreg = rc_find_free_temporary(c);
979
 
980
	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
981
		swizzle_xxxx(src),
982
		srcreg(RC_FILE_CONSTANT, constants[0]));
983
	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
984
		swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
985
		absolute(swizzle_xxxx(src)),
986
		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
987
	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
988
		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
989
		absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
990
		negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
991
	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
992
		swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
993
		swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
994
		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
995
}
996
 
997
/**
998
 * Translate the trigonometric functions COS, SIN, and SCS
999
 * using only the basic instructions
1000
 *  MOV, ADD, MUL, MAD, FRC
1001
 */
1002
int r300_transform_trig_simple(struct radeon_compiler* c,
1003
	struct rc_instruction* inst,
1004
	void* unused)
1005
{
1006
	unsigned int constants[2];
1007
	unsigned int tempreg;
1008
 
1009
	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1010
	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1011
	    inst->U.I.Opcode != RC_OPCODE_SCS)
1012
		return 0;
1013
 
1014
	tempreg = rc_find_free_temporary(c);
1015
 
1016
	sincos_constants(c, constants);
1017
 
1018
	if (inst->U.I.Opcode == RC_OPCODE_COS) {
1019
		/* MAD tmp.x, src, 1/(2*PI), 0.75 */
1020
		/* FRC tmp.x, tmp.x */
1021
		/* MAD tmp.z, tmp.x, 2*PI, -PI */
1022
		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1023
			swizzle_xxxx(inst->U.I.SrcReg[0]),
1024
			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1025
			swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
1026
		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1027
			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1028
		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1029
			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1030
			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1031
			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1032
 
1033
		sin_approx(c, inst, inst->U.I.DstReg,
1034
			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1035
			constants);
1036
	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1037
		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1038
			swizzle_xxxx(inst->U.I.SrcReg[0]),
1039
			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1040
			swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
1041
		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1042
			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1043
		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1044
			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1045
			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1046
			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1047
 
1048
		sin_approx(c, inst, inst->U.I.DstReg,
1049
			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1050
			constants);
1051
	} else {
1052
		struct rc_dst_register dst;
1053
 
1054
		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1055
			swizzle_xxxx(inst->U.I.SrcReg[0]),
1056
			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1057
			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
1058
		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1059
			srcreg(RC_FILE_TEMPORARY, tempreg));
1060
		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1061
			srcreg(RC_FILE_TEMPORARY, tempreg),
1062
			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1063
			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1064
 
1065
		dst = inst->U.I.DstReg;
1066
 
1067
		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
1068
		sin_approx(c, inst, dst,
1069
			swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
1070
			constants);
1071
 
1072
		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
1073
		sin_approx(c, inst, dst,
1074
			swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
1075
			constants);
1076
	}
1077
 
1078
	rc_remove_instruction(inst);
1079
 
1080
	return 1;
1081
}
1082
 
1083
static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
1084
	struct rc_instruction *inst,
1085
	unsigned srctmp)
1086
{
1087
	if (inst->U.I.Opcode == RC_OPCODE_COS) {
1088
		emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1089
			srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1090
	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1091
		emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1092
			inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1093
	} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1094
		struct rc_dst_register moddst = inst->U.I.DstReg;
1095
 
1096
		if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1097
			moddst.WriteMask = RC_MASK_X;
1098
			emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
1099
				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1100
		}
1101
		if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1102
			moddst.WriteMask = RC_MASK_Y;
1103
			emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
1104
				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1105
		}
1106
	}
1107
 
1108
	rc_remove_instruction(inst);
1109
}
1110
 
1111
 
1112
/**
1113
 * Transform the trigonometric functions COS, SIN, and SCS
1114
 * to include pre-scaling by 1/(2*PI) and taking the fractional
1115
 * part, so that the input to COS and SIN is always in the range [0,1).
1116
 * SCS is replaced by one COS and one SIN instruction.
1117
 *
1118
 * @warning This transformation implicitly changes the semantics of SIN and COS!
1119
 */
1120
int radeonTransformTrigScale(struct radeon_compiler* c,
1121
	struct rc_instruction* inst,
1122
	void* unused)
1123
{
1124
	static const float RCP_2PI = 0.15915494309189535;
1125
	unsigned int temp;
1126
	unsigned int constant;
1127
	unsigned int constant_swizzle;
1128
 
1129
	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1130
	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1131
	    inst->U.I.Opcode != RC_OPCODE_SCS)
1132
		return 0;
1133
 
1134
	temp = rc_find_free_temporary(c);
1135
	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1136
 
1137
	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1138
		swizzle_xxxx(inst->U.I.SrcReg[0]),
1139
		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1140
	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1141
		srcreg(RC_FILE_TEMPORARY, temp));
1142
 
1143
	r300_transform_SIN_COS_SCS(c, inst, temp);
1144
	return 1;
1145
}
1146
 
1147
/**
1148
 * Transform the trigonometric functions COS, SIN, and SCS
1149
 * so that the input to COS and SIN is always in the range [-PI, PI].
1150
 * SCS is replaced by one COS and one SIN instruction.
1151
 */
1152
int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1153
	struct rc_instruction *inst,
1154
	void *unused)
1155
{
1156
	static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1157
	unsigned int temp;
1158
	unsigned int constant;
1159
 
1160
	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1161
	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1162
	    inst->U.I.Opcode != RC_OPCODE_SCS)
1163
		return 0;
1164
 
1165
	/* Repeat x in the range [-PI, PI]:
1166
	 *
1167
	 *   repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1168
	 */
1169
 
1170
	temp = rc_find_free_temporary(c);
1171
	constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1172
 
1173
	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1174
		swizzle_xxxx(inst->U.I.SrcReg[0]),
1175
		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1176
		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1177
	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1178
		srcreg(RC_FILE_TEMPORARY, temp));
1179
	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1180
		srcreg(RC_FILE_TEMPORARY, temp),
1181
		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1182
		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1183
 
1184
	r300_transform_SIN_COS_SCS(c, inst, temp);
1185
	return 1;
1186
}
1187
 
1188
/**
1189
 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1190
 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1191
 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1192
 *
1193
 * @warning This explicitly changes the form of DDX and DDY!
1194
 */
1195
 
1196
int radeonTransformDeriv(struct radeon_compiler* c,
1197
	struct rc_instruction* inst,
1198
	void* unused)
1199
{
1200
	if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1201
		return 0;
1202
 
1203
	inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1204
	inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1205
 
1206
	return 1;
1207
}
1208
 
1209
/**
1210
 * IF Temp[0].x -> IF Temp[0].x
1211
 * ...          -> ...
1212
 * KILL         -> KIL -abs(Temp[0].x)
1213
 * ...          -> ...
1214
 * ENDIF        -> ENDIF
1215
 *
1216
 * === OR ===
1217
 *
1218
 * IF Temp[0].x -\
1219
 * KILL         - > KIL -abs(Temp[0].x)
1220
 * ENDIF        -/
1221
 *
1222
 * === OR ===
1223
 *
1224
 * IF Temp[0].x -> IF Temp[0].x
1225
 * ...          -> ...
1226
 * ELSE         -> ELSE
1227
 * ...	        -> ...
1228
 * KILL	        -> KIL -abs(Temp[0].x)
1229
 * ...          -> ...
1230
 * ENDIF        -> ENDIF
1231
 *
1232
 * === OR ===
1233
 *
1234
 * KILL         -> KIL -none.1111
1235
 *
1236
 * This needs to be done in its own pass, because it might modify the
1237
 * instructions before and after KILL.
1238
 */
1239
void rc_transform_KILL(struct radeon_compiler * c, void *user)
1240
{
1241
	struct rc_instruction * inst;
1242
	for (inst = c->Program.Instructions.Next;
1243
			inst != &c->Program.Instructions; inst = inst->Next) {
1244
		struct rc_instruction * if_inst;
1245
		unsigned in_if = 0;
1246
 
1247
		if (inst->U.I.Opcode != RC_OPCODE_KILP)
1248
			continue;
1249
 
1250
		for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
1251
						if_inst = if_inst->Prev) {
1252
 
1253
			if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
1254
				in_if = 1;
1255
				break;
1256
			}
1257
		}
1258
 
1259
		inst->U.I.Opcode = RC_OPCODE_KIL;
1260
 
1261
		if (!in_if) {
1262
			inst->U.I.SrcReg[0] = negate(builtin_one);
1263
		} else {
1264
			/* This should work even if the KILP is inside the ELSE
1265
			 * block, because -0.0 is considered negative. */
1266
			inst->U.I.SrcReg[0] =
1267
				negate(absolute(if_inst->U.I.SrcReg[0]));
1268
 
1269
			if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1270
				&& inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1271
 
1272
				/* Optimize the special case:
1273
				 * IF Temp[0].x
1274
				 * KILP
1275
				 * ENDIF
1276
				 */
1277
 
1278
				/* Remove IF */
1279
				rc_remove_instruction(inst->Prev);
1280
				/* Remove ENDIF */
1281
				rc_remove_instruction(inst->Next);
1282
			}
1283
		}
1284
	}
1285
}
1286
 
1287
int rc_force_output_alpha_to_one(struct radeon_compiler *c,
1288
				 struct rc_instruction *inst, void *data)
1289
{
1290
	struct r300_fragment_program_compiler *fragc = (struct r300_fragment_program_compiler*)c;
1291
	const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
1292
	unsigned tmp;
1293
 
1294
	if (!info->HasDstReg || inst->U.I.DstReg.File != RC_FILE_OUTPUT ||
1295
	    inst->U.I.DstReg.Index == fragc->OutputDepth)
1296
		return 1;
1297
 
1298
	tmp = rc_find_free_temporary(c);
1299
 
1300
	/* Insert MOV after inst, set alpha to 1. */
1301
	emit1(c, inst, RC_OPCODE_MOV, 0, inst->U.I.DstReg,
1302
	      srcregswz(RC_FILE_TEMPORARY, tmp, RC_SWIZZLE_XYZ1));
1303
 
1304
	/* Re-route the destination of inst to the source of mov. */
1305
	inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
1306
	inst->U.I.DstReg.Index = tmp;
1307
 
1308
	/* Move the saturate output modifier to the MOV instruction
1309
	 * (for better copy propagation). */
1310
	inst->Next->U.I.SaturateMode = inst->U.I.SaturateMode;
1311
	inst->U.I.SaturateMode = RC_SATURATE_NONE;
1312
	return 1;
1313
}