Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright (C) 2008 Nicolai Haehnle.
  3.  *
  4.  * All Rights Reserved.
  5.  *
  6.  * Permission is hereby granted, free of charge, to any person obtaining
  7.  * a copy of this software and associated documentation files (the
  8.  * "Software"), to deal in the Software without restriction, including
  9.  * without limitation the rights to use, copy, modify, merge, publish,
  10.  * distribute, sublicense, and/or sell copies of the Software, and to
  11.  * permit persons to whom the Software is furnished to do so, subject to
  12.  * the following conditions:
  13.  *
  14.  * The above copyright notice and this permission notice (including the
  15.  * next paragraph) shall be included in all copies or substantial
  16.  * portions of the Software.
  17.  *
  18.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19.  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20.  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21.  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22.  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23.  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24.  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25.  *
  26.  */
  27.  
  28. /**
  29.  * @file
  30.  *
  31.  * Shareable transformations that transform "special" ALU instructions
  32.  * into ALU instructions that are supported by hardware.
  33.  *
  34.  */
  35.  
  36. #include "radeon_program_alu.h"
  37.  
  38. #include "radeon_compiler.h"
  39. #include "radeon_compiler_util.h"
  40.  
  41.  
  42. static struct rc_instruction *emit1(
  43.         struct radeon_compiler * c, struct rc_instruction * after,
  44.         rc_opcode Opcode, struct rc_sub_instruction * base,
  45.         struct rc_dst_register DstReg, struct rc_src_register SrcReg)
  46. {
  47.         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
  48.  
  49.         if (base) {
  50.                 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
  51.         }
  52.  
  53.         fpi->U.I.Opcode = Opcode;
  54.         fpi->U.I.DstReg = DstReg;
  55.         fpi->U.I.SrcReg[0] = SrcReg;
  56.         return fpi;
  57. }
  58.  
  59. static struct rc_instruction *emit2(
  60.         struct radeon_compiler * c, struct rc_instruction * after,
  61.         rc_opcode Opcode, struct rc_sub_instruction * base,
  62.         struct rc_dst_register DstReg,
  63.         struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
  64. {
  65.         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
  66.  
  67.         if (base) {
  68.                 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
  69.         }
  70.  
  71.         fpi->U.I.Opcode = Opcode;
  72.         fpi->U.I.DstReg = DstReg;
  73.         fpi->U.I.SrcReg[0] = SrcReg0;
  74.         fpi->U.I.SrcReg[1] = SrcReg1;
  75.         return fpi;
  76. }
  77.  
  78. static struct rc_instruction *emit3(
  79.         struct radeon_compiler * c, struct rc_instruction * after,
  80.         rc_opcode Opcode, struct rc_sub_instruction * base,
  81.         struct rc_dst_register DstReg,
  82.         struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
  83.         struct rc_src_register SrcReg2)
  84. {
  85.         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
  86.  
  87.         if (base) {
  88.                 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
  89.         }
  90.  
  91.         fpi->U.I.Opcode = Opcode;
  92.         fpi->U.I.DstReg = DstReg;
  93.         fpi->U.I.SrcReg[0] = SrcReg0;
  94.         fpi->U.I.SrcReg[1] = SrcReg1;
  95.         fpi->U.I.SrcReg[2] = SrcReg2;
  96.         return fpi;
  97. }
  98.  
  99. static struct rc_dst_register dstregtmpmask(int index, int mask)
  100. {
  101.         struct rc_dst_register dst = {0, 0, 0};
  102.         dst.File = RC_FILE_TEMPORARY;
  103.         dst.Index = index;
  104.         dst.WriteMask = mask;
  105.         return dst;
  106. }
  107.  
  108. static const struct rc_src_register builtin_zero = {
  109.         .File = RC_FILE_NONE,
  110.         .Index = 0,
  111.         .Swizzle = RC_SWIZZLE_0000
  112. };
  113. static const struct rc_src_register builtin_one = {
  114.         .File = RC_FILE_NONE,
  115.         .Index = 0,
  116.         .Swizzle = RC_SWIZZLE_1111
  117. };
  118.  
  119. static const struct rc_src_register builtin_half = {
  120.         .File = RC_FILE_NONE,
  121.         .Index = 0,
  122.         .Swizzle = RC_SWIZZLE_HHHH
  123. };
  124.  
  125. static const struct rc_src_register srcreg_undefined = {
  126.         .File = RC_FILE_NONE,
  127.         .Index = 0,
  128.         .Swizzle = RC_SWIZZLE_XYZW
  129. };
  130.  
  131. static struct rc_src_register srcreg(int file, int index)
  132. {
  133.         struct rc_src_register src = srcreg_undefined;
  134.         src.File = file;
  135.         src.Index = index;
  136.         return src;
  137. }
  138.  
  139. static struct rc_src_register srcregswz(int file, int index, int swz)
  140. {
  141.         struct rc_src_register src = srcreg_undefined;
  142.         src.File = file;
  143.         src.Index = index;
  144.         src.Swizzle = swz;
  145.         return src;
  146. }
  147.  
  148. static struct rc_src_register absolute(struct rc_src_register reg)
  149. {
  150.         struct rc_src_register newreg = reg;
  151.         newreg.Abs = 1;
  152.         newreg.Negate = RC_MASK_NONE;
  153.         return newreg;
  154. }
  155.  
  156. static struct rc_src_register negate(struct rc_src_register reg)
  157. {
  158.         struct rc_src_register newreg = reg;
  159.         newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
  160.         return newreg;
  161. }
  162.  
  163. static struct rc_src_register swizzle(struct rc_src_register reg,
  164.                 rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
  165. {
  166.         struct rc_src_register swizzled = reg;
  167.         swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
  168.         return swizzled;
  169. }
  170.  
  171. static struct rc_src_register swizzle_smear(struct rc_src_register reg,
  172.                 rc_swizzle x)
  173. {
  174.         return swizzle(reg, x, x, x, x);
  175. }
  176.  
  177. static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
  178. {
  179.         return swizzle_smear(reg, RC_SWIZZLE_X);
  180. }
  181.  
  182. static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
  183. {
  184.         return swizzle_smear(reg, RC_SWIZZLE_Y);
  185. }
  186.  
  187. static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
  188. {
  189.         return swizzle_smear(reg, RC_SWIZZLE_Z);
  190. }
  191.  
  192. static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
  193. {
  194.         return swizzle_smear(reg, RC_SWIZZLE_W);
  195. }
  196.  
  197. static int is_dst_safe_to_reuse(struct rc_instruction *inst)
  198. {
  199.         const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
  200.         unsigned i;
  201.  
  202.         assert(info->HasDstReg);
  203.  
  204.         if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
  205.                 return 0;
  206.  
  207.         for (i = 0; i < info->NumSrcRegs; i++) {
  208.                 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
  209.                     inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
  210.                         return 0;
  211.         }
  212.  
  213.         return 1;
  214. }
  215.  
  216. static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
  217.                                                struct rc_instruction *inst)
  218. {
  219.         unsigned tmp;
  220.  
  221.         if (is_dst_safe_to_reuse(inst))
  222.                 tmp = inst->U.I.DstReg.Index;
  223.         else
  224.                 tmp = rc_find_free_temporary(c);
  225.  
  226.         return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
  227. }
  228.  
  229. static void transform_ABS(struct radeon_compiler* c,
  230.         struct rc_instruction* inst)
  231. {
  232.         struct rc_src_register src = inst->U.I.SrcReg[0];
  233.         src.Abs = 1;
  234.         src.Negate = RC_MASK_NONE;
  235.         emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
  236.         rc_remove_instruction(inst);
  237. }
  238.  
  239. static void transform_CEIL(struct radeon_compiler* c,
  240.         struct rc_instruction* inst)
  241. {
  242.         /* Assuming:
  243.          *     ceil(x) = -floor(-x)
  244.          *
  245.          * After inlining floor:
  246.          *     ceil(x) = -(-x-frac(-x))
  247.          *
  248.          * After simplification:
  249.          *     ceil(x) = x+frac(-x)
  250.          */
  251.  
  252.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  253.         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
  254.         emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
  255.                 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
  256.         rc_remove_instruction(inst);
  257. }
  258.  
  259. static void transform_CLAMP(struct radeon_compiler *c,
  260.         struct rc_instruction *inst)
  261. {
  262.         /* CLAMP dst, src, min, max
  263.          *    into:
  264.          * MIN tmp, src, max
  265.          * MAX dst, tmp, min
  266.          */
  267.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  268.         emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
  269.                 inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
  270.         emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
  271.                 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
  272.         rc_remove_instruction(inst);
  273. }
  274.  
  275. static void transform_DP2(struct radeon_compiler* c,
  276.         struct rc_instruction* inst)
  277. {
  278.         struct rc_src_register src0 = inst->U.I.SrcReg[0];
  279.         struct rc_src_register src1 = inst->U.I.SrcReg[1];
  280.         src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
  281.         src0.Swizzle &= ~(63 << (3 * 2));
  282.         src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
  283.         src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
  284.         src1.Swizzle &= ~(63 << (3 * 2));
  285.         src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
  286.         emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
  287.         rc_remove_instruction(inst);
  288. }
  289.  
  290. static void transform_DPH(struct radeon_compiler* c,
  291.         struct rc_instruction* inst)
  292. {
  293.         struct rc_src_register src0 = inst->U.I.SrcReg[0];
  294.         src0.Negate &= ~RC_MASK_W;
  295.         src0.Swizzle &= ~(7 << (3 * 3));
  296.         src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
  297.         emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
  298.         rc_remove_instruction(inst);
  299. }
  300.  
  301. /**
  302.  * [1, src0.y*src1.y, src0.z, src1.w]
  303.  * So basically MUL with lotsa swizzling.
  304.  */
  305. static void transform_DST(struct radeon_compiler* c,
  306.         struct rc_instruction* inst)
  307. {
  308.         emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
  309.                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
  310.                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
  311.         rc_remove_instruction(inst);
  312. }
  313.  
  314. static void transform_FLR(struct radeon_compiler* c,
  315.         struct rc_instruction* inst)
  316. {
  317.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  318.         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
  319.         emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
  320.                 inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
  321.         rc_remove_instruction(inst);
  322. }
  323.  
  324. static void transform_TRUNC(struct radeon_compiler* c,
  325.         struct rc_instruction* inst)
  326. {
  327.         /* Definition of trunc:
  328.          *   trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)
  329.          *
  330.          * The multiplication by sgn(x) can be simplified using CMP:
  331.          *   y * sgn(x) = (x < 0 ? -y : y)
  332.          */
  333.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  334.         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));
  335.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),
  336.               negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
  337.         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],
  338.               negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));
  339.         rc_remove_instruction(inst);
  340. }
  341.  
  342. /**
  343.  * Definition of LIT (from ARB_fragment_program):
  344.  *
  345.  *  tmp = VectorLoad(op0);
  346.  *  if (tmp.x < 0) tmp.x = 0;
  347.  *  if (tmp.y < 0) tmp.y = 0;
  348.  *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
  349.  *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
  350.  *  result.x = 1.0;
  351.  *  result.y = tmp.x;
  352.  *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
  353.  *  result.w = 1.0;
  354.  *
  355.  * The longest path of computation is the one leading to result.z,
  356.  * consisting of 5 operations. This implementation of LIT takes
  357.  * 5 slots, if the subsequent optimization passes are clever enough
  358.  * to pair instructions correctly.
  359.  */
  360. static void transform_LIT(struct radeon_compiler* c,
  361.         struct rc_instruction* inst)
  362. {
  363.         unsigned int constant;
  364.         unsigned int constant_swizzle;
  365.         unsigned int temp;
  366.         struct rc_src_register srctemp;
  367.  
  368.         constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
  369.  
  370.         if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
  371.                 struct rc_instruction * inst_mov;
  372.  
  373.                 inst_mov = emit1(c, inst,
  374.                         RC_OPCODE_MOV, 0, inst->U.I.DstReg,
  375.                         srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
  376.  
  377.                 inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
  378.                 inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
  379.                 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
  380.         }
  381.  
  382.         temp = inst->U.I.DstReg.Index;
  383.         srctemp = srcreg(RC_FILE_TEMPORARY, temp);
  384.  
  385.         /* tmp.x = max(0.0, Src.x); */
  386.         /* tmp.y = max(0.0, Src.y); */
  387.         /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
  388.         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
  389.                 dstregtmpmask(temp, RC_MASK_XYW),
  390.                 inst->U.I.SrcReg[0],
  391.                 swizzle(srcreg(RC_FILE_CONSTANT, constant),
  392.                         RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
  393.         emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
  394.                 dstregtmpmask(temp, RC_MASK_Z),
  395.                 swizzle_wwww(srctemp),
  396.                 negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
  397.  
  398.         /* tmp.w = Pow(tmp.y, tmp.w) */
  399.         emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
  400.                 dstregtmpmask(temp, RC_MASK_W),
  401.                 swizzle_yyyy(srctemp));
  402.         emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
  403.                 dstregtmpmask(temp, RC_MASK_W),
  404.                 swizzle_wwww(srctemp),
  405.                 swizzle_zzzz(srctemp));
  406.         emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
  407.                 dstregtmpmask(temp, RC_MASK_W),
  408.                 swizzle_wwww(srctemp));
  409.  
  410.         /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
  411.         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
  412.                 dstregtmpmask(temp, RC_MASK_Z),
  413.                 negate(swizzle_xxxx(srctemp)),
  414.                 swizzle_wwww(srctemp),
  415.                 builtin_zero);
  416.  
  417.         /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
  418.         emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
  419.                 dstregtmpmask(temp, RC_MASK_XYW),
  420.                 swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
  421.  
  422.         rc_remove_instruction(inst);
  423. }
  424.  
  425. static void transform_LRP(struct radeon_compiler* c,
  426.         struct rc_instruction* inst)
  427. {
  428.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  429.  
  430.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
  431.                 dst,
  432.                 inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
  433.         emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
  434.                 inst->U.I.DstReg,
  435.                 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
  436.  
  437.         rc_remove_instruction(inst);
  438. }
  439.  
  440. static void transform_POW(struct radeon_compiler* c,
  441.         struct rc_instruction* inst)
  442. {
  443.         struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
  444.         struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
  445.         tempdst.WriteMask = RC_MASK_W;
  446.         tempsrc.Swizzle = RC_SWIZZLE_WWWW;
  447.  
  448.         emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
  449.         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
  450.         emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
  451.  
  452.         rc_remove_instruction(inst);
  453. }
  454.  
  455. /* dst = ROUND(src) :
  456.  *   add = src + .5
  457.  *   frac = FRC(add)
  458.  *   dst = add - frac
  459.  *
  460.  * According to the GLSL spec, the implementor can decide which way to round
  461.  * when the fraction is .5.  We round down for .5.
  462.  *
  463.  */
  464. static void transform_ROUND(struct radeon_compiler* c,
  465.         struct rc_instruction* inst)
  466. {
  467.         unsigned int mask = inst->U.I.DstReg.WriteMask;
  468.         unsigned int frac_index, add_index;
  469.         struct rc_dst_register frac_dst, add_dst;
  470.         struct rc_src_register frac_src, add_src;
  471.  
  472.         /* add = src + .5 */
  473.         add_index = rc_find_free_temporary(c);
  474.         add_dst = dstregtmpmask(add_index, mask);
  475.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
  476.                                                                 builtin_half);
  477.         add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
  478.  
  479.  
  480.         /* frac = FRC(add) */
  481.         frac_index = rc_find_free_temporary(c);
  482.         frac_dst = dstregtmpmask(frac_index, mask);
  483.         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
  484.         frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
  485.  
  486.         /* dst = add - frac */
  487.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
  488.                                                 add_src, negate(frac_src));
  489.         rc_remove_instruction(inst);
  490. }
  491.  
  492. static void transform_RSQ(struct radeon_compiler* c,
  493.         struct rc_instruction* inst)
  494. {
  495.         inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
  496. }
  497.  
  498. static void transform_SEQ(struct radeon_compiler* c,
  499.         struct rc_instruction* inst)
  500. {
  501.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  502.  
  503.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
  504.         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
  505.                 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
  506.  
  507.         rc_remove_instruction(inst);
  508. }
  509.  
  510. static void transform_SFL(struct radeon_compiler* c,
  511.         struct rc_instruction* inst)
  512. {
  513.         emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
  514.         rc_remove_instruction(inst);
  515. }
  516.  
  517. static void transform_SGE(struct radeon_compiler* c,
  518.         struct rc_instruction* inst)
  519. {
  520.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  521.  
  522.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
  523.         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
  524.                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
  525.  
  526.         rc_remove_instruction(inst);
  527. }
  528.  
  529. static void transform_SGT(struct radeon_compiler* c,
  530.         struct rc_instruction* inst)
  531. {
  532.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  533.  
  534.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
  535.         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
  536.                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
  537.  
  538.         rc_remove_instruction(inst);
  539. }
  540.  
  541. static void transform_SLE(struct radeon_compiler* c,
  542.         struct rc_instruction* inst)
  543. {
  544.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  545.  
  546.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
  547.         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
  548.                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
  549.  
  550.         rc_remove_instruction(inst);
  551. }
  552.  
  553. static void transform_SLT(struct radeon_compiler* c,
  554.         struct rc_instruction* inst)
  555. {
  556.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  557.  
  558.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
  559.         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
  560.                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
  561.  
  562.         rc_remove_instruction(inst);
  563. }
  564.  
  565. static void transform_SNE(struct radeon_compiler* c,
  566.         struct rc_instruction* inst)
  567. {
  568.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  569.  
  570.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
  571.         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
  572.                 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
  573.  
  574.         rc_remove_instruction(inst);
  575. }
  576.  
  577. static void transform_SSG(struct radeon_compiler* c,
  578.         struct rc_instruction* inst)
  579. {
  580.         /* result = sign(x)
  581.          *
  582.          *   CMP tmp0, -x, 1, 0
  583.          *   CMP tmp1, x, 1, 0
  584.          *   ADD result, tmp0, -tmp1;
  585.          */
  586.         struct rc_dst_register dst0;
  587.         unsigned tmp1;
  588.  
  589.         /* 0 < x */
  590.         dst0 = try_to_reuse_dst(c, inst);
  591.         emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
  592.               dst0,
  593.               negate(inst->U.I.SrcReg[0]),
  594.               builtin_one,
  595.               builtin_zero);
  596.  
  597.         /* x < 0 */
  598.         tmp1 = rc_find_free_temporary(c);
  599.         emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
  600.               dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
  601.               inst->U.I.SrcReg[0],
  602.               builtin_one,
  603.               builtin_zero);
  604.  
  605.         /* Either both are zero, or one of them is one and the other is zero. */
  606.         /* result = tmp0 - tmp1 */
  607.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
  608.               inst->U.I.DstReg,
  609.               srcreg(RC_FILE_TEMPORARY, dst0.Index),
  610.               negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
  611.  
  612.         rc_remove_instruction(inst);
  613. }
  614.  
  615. static void transform_SUB(struct radeon_compiler* c,
  616.         struct rc_instruction* inst)
  617. {
  618.         inst->U.I.Opcode = RC_OPCODE_ADD;
  619.         inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
  620. }
  621.  
  622. static void transform_SWZ(struct radeon_compiler* c,
  623.         struct rc_instruction* inst)
  624. {
  625.         inst->U.I.Opcode = RC_OPCODE_MOV;
  626. }
  627.  
  628. static void transform_XPD(struct radeon_compiler* c,
  629.         struct rc_instruction* inst)
  630. {
  631.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  632.  
  633.         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
  634.                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
  635.                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
  636.         emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
  637.                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
  638.                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
  639.                 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
  640.  
  641.         rc_remove_instruction(inst);
  642. }
  643.  
  644.  
  645. /**
  646.  * Can be used as a transformation for @ref radeonClauseLocalTransform,
  647.  * no userData necessary.
  648.  *
  649.  * Eliminates the following ALU instructions:
  650.  *  ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
  651.  * using:
  652.  *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
  653.  *
  654.  * Transforms RSQ to Radeon's native RSQ by explicitly setting
  655.  * absolute value.
  656.  *
  657.  * @note should be applicable to R300 and R500 fragment programs.
  658.  */
  659. int radeonTransformALU(
  660.         struct radeon_compiler * c,
  661.         struct rc_instruction* inst,
  662.         void* unused)
  663. {
  664.         switch(inst->U.I.Opcode) {
  665.         case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
  666.         case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
  667.         case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
  668.         case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
  669.         case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
  670.         case RC_OPCODE_DST: transform_DST(c, inst); return 1;
  671.         case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
  672.         case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
  673.         case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
  674.         case RC_OPCODE_POW: transform_POW(c, inst); return 1;
  675.         case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
  676.         case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
  677.         case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
  678.         case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
  679.         case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
  680.         case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
  681.         case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
  682.         case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
  683.         case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
  684.         case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
  685.         case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
  686.         case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
  687.         case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
  688.         case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
  689.         default:
  690.                 return 0;
  691.         }
  692. }
  693.  
  694.  
  695. static void transform_r300_vertex_ABS(struct radeon_compiler* c,
  696.         struct rc_instruction* inst)
  697. {
  698.         /* Note: r500 can take absolute values, but r300 cannot. */
  699.         inst->U.I.Opcode = RC_OPCODE_MAX;
  700.         inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
  701.         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
  702. }
  703.  
  704. static void transform_r300_vertex_CMP(struct radeon_compiler* c,
  705.         struct rc_instruction* inst)
  706. {
  707.         /* There is no decent CMP available, so let's rig one up.
  708.          * CMP is defined as dst = src0 < 0.0 ? src1 : src2
  709.          * The following sequence consumes zero to two temps and two extra slots
  710.          * (the second temp and the second slot is consumed by transform_LRP),
  711.          * but should be equivalent:
  712.          *
  713.          * SLT tmp0, src0, 0.0
  714.          * LRP dst, tmp0, src1, src2
  715.          *
  716.          * Yes, I know, I'm a mad scientist. ~ C. & M. */
  717.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  718.  
  719.         /* SLT tmp0, src0, 0.0 */
  720.         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
  721.                 dst,
  722.                 inst->U.I.SrcReg[0], builtin_zero);
  723.  
  724.         /* LRP dst, tmp0, src1, src2 */
  725.         transform_LRP(c,
  726.                 emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
  727.                       inst->U.I.DstReg,
  728.                       srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
  729.  
  730.         rc_remove_instruction(inst);
  731. }
  732.  
  733. static void transform_r300_vertex_DP2(struct radeon_compiler* c,
  734.         struct rc_instruction* inst)
  735. {
  736.         struct rc_instruction *next_inst = inst->Next;
  737.         transform_DP2(c, inst);
  738.         next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
  739. }
  740.  
  741. static void transform_r300_vertex_DP3(struct radeon_compiler* c,
  742.         struct rc_instruction* inst)
  743. {
  744.         struct rc_src_register src0 = inst->U.I.SrcReg[0];
  745.         struct rc_src_register src1 = inst->U.I.SrcReg[1];
  746.         src0.Negate &= ~RC_MASK_W;
  747.         src0.Swizzle &= ~(7 << (3 * 3));
  748.         src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
  749.         src1.Negate &= ~RC_MASK_W;
  750.         src1.Swizzle &= ~(7 << (3 * 3));
  751.         src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
  752.         emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
  753.         rc_remove_instruction(inst);
  754. }
  755.  
  756. static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
  757.         struct rc_instruction* inst)
  758. {
  759.         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
  760.         unsigned constant_swizzle;
  761.         int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
  762.                                                          0.0000000000000000001,
  763.                                                          &constant_swizzle);
  764.  
  765.         /* MOV dst, src */
  766.         dst.WriteMask = RC_MASK_XYZW;
  767.         emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
  768.                 dst,
  769.                 inst->U.I.SrcReg[0]);
  770.  
  771.         /* MAX dst.y, src, 0.00...001 */
  772.         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
  773.                 dstregtmpmask(dst.Index, RC_MASK_Y),
  774.                 srcreg(RC_FILE_TEMPORARY, dst.Index),
  775.                 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
  776.  
  777.         inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
  778. }
  779.  
  780. static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
  781.         struct rc_instruction *inst)
  782. {
  783.         /* x = y  <==>  x >= y && y >= x */
  784.         int tmp = rc_find_free_temporary(c);
  785.  
  786.         /* x <= y */
  787.         emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
  788.               dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
  789.               inst->U.I.SrcReg[0],
  790.               inst->U.I.SrcReg[1]);
  791.  
  792.         /* y <= x */
  793.         emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
  794.               inst->U.I.DstReg,
  795.               inst->U.I.SrcReg[1],
  796.               inst->U.I.SrcReg[0]);
  797.  
  798.         /* x && y  =  x * y */
  799.         emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
  800.               inst->U.I.DstReg,
  801.               srcreg(RC_FILE_TEMPORARY, tmp),
  802.               srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
  803.  
  804.         rc_remove_instruction(inst);
  805. }
  806.  
  807. static void transform_r300_vertex_SNE(struct radeon_compiler *c,
  808.         struct rc_instruction *inst)
  809. {
  810.         /* x != y  <==>  x < y || y < x */
  811.         int tmp = rc_find_free_temporary(c);
  812.  
  813.         /* x < y */
  814.         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
  815.               dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
  816.               inst->U.I.SrcReg[0],
  817.               inst->U.I.SrcReg[1]);
  818.  
  819.         /* y < x */
  820.         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
  821.               inst->U.I.DstReg,
  822.               inst->U.I.SrcReg[1],
  823.               inst->U.I.SrcReg[0]);
  824.  
  825.         /* x || y  =  max(x, y) */
  826.         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
  827.               inst->U.I.DstReg,
  828.               srcreg(RC_FILE_TEMPORARY, tmp),
  829.               srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
  830.  
  831.         rc_remove_instruction(inst);
  832. }
  833.  
  834. static void transform_r300_vertex_SGT(struct radeon_compiler* c,
  835.         struct rc_instruction* inst)
  836. {
  837.         /* x > y  <==>  -x < -y */
  838.         inst->U.I.Opcode = RC_OPCODE_SLT;
  839.         inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
  840.         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
  841. }
  842.  
  843. static void transform_r300_vertex_SLE(struct radeon_compiler* c,
  844.         struct rc_instruction* inst)
  845. {
  846.         /* x <= y  <==>  -x >= -y */
  847.         inst->U.I.Opcode = RC_OPCODE_SGE;
  848.         inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
  849.         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
  850. }
  851.  
  852. static void transform_r300_vertex_SSG(struct radeon_compiler* c,
  853.         struct rc_instruction* inst)
  854. {
  855.         /* result = sign(x)
  856.          *
  857.          *   SLT tmp0, 0, x;
  858.          *   SLT tmp1, x, 0;
  859.          *   ADD result, tmp0, -tmp1;
  860.          */
  861.         struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
  862.         unsigned tmp1;
  863.  
  864.         /* 0 < x */
  865.         dst0 = try_to_reuse_dst(c, inst);
  866.         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
  867.               dst0,
  868.               builtin_zero,
  869.               inst->U.I.SrcReg[0]);
  870.  
  871.         /* x < 0 */
  872.         tmp1 = rc_find_free_temporary(c);
  873.         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
  874.               dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
  875.               inst->U.I.SrcReg[0],
  876.               builtin_zero);
  877.  
  878.         /* Either both are zero, or one of them is one and the other is zero. */
  879.         /* result = tmp0 - tmp1 */
  880.         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
  881.               inst->U.I.DstReg,
  882.               srcreg(RC_FILE_TEMPORARY, dst0.Index),
  883.               negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
  884.  
  885.         rc_remove_instruction(inst);
  886. }
  887.  
  888. static void transform_vertex_TRUNC(struct radeon_compiler* c,
  889.         struct rc_instruction* inst)
  890. {
  891.         struct rc_instruction *next = inst->Next;
  892.  
  893.         /* next->Prev is removed after each transformation and replaced
  894.          * by a new instruction. */
  895.         transform_TRUNC(c, next->Prev);
  896.         transform_r300_vertex_CMP(c, next->Prev);
  897. }
  898.  
  899. /**
  900.  * For use with rc_local_transform, this transforms non-native ALU
  901.  * instructions of the r300 up to r500 vertex engine.
  902.  */
  903. int r300_transform_vertex_alu(
  904.         struct radeon_compiler * c,
  905.         struct rc_instruction* inst,
  906.         void* unused)
  907. {
  908.         switch(inst->U.I.Opcode) {
  909.         case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
  910.         case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
  911.         case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
  912.         case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
  913.         case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
  914.         case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
  915.         case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
  916.         case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
  917.         case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
  918.         case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
  919.         case RC_OPCODE_SEQ:
  920.                 if (!c->is_r500) {
  921.                         transform_r300_vertex_SEQ(c, inst);
  922.                         return 1;
  923.                 }
  924.                 return 0;
  925.         case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
  926.         case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
  927.         case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
  928.         case RC_OPCODE_SNE:
  929.                 if (!c->is_r500) {
  930.                         transform_r300_vertex_SNE(c, inst);
  931.                         return 1;
  932.                 }
  933.                 return 0;
  934.         case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
  935.         case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
  936.         case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
  937.         case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
  938.         case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
  939.         default:
  940.                 return 0;
  941.         }
  942. }
  943.  
  944. static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
  945. {
  946.         static const float SinCosConsts[2][4] = {
  947.                 {
  948.                         1.273239545,            /* 4/PI */
  949.                         -0.405284735,           /* -4/(PI*PI) */
  950.                         3.141592654,            /* PI */
  951.                         0.2225                  /* weight */
  952.                 },
  953.                 {
  954.                         0.75,
  955.                         0.5,
  956.                         0.159154943,            /* 1/(2*PI) */
  957.                         6.283185307             /* 2*PI */
  958.                 }
  959.         };
  960.         int i;
  961.  
  962.         for(i = 0; i < 2; ++i)
  963.                 constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
  964. }
  965.  
  966. /**
  967.  * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
  968.  *
  969.  * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
  970.  * MAD tmp.x, tmp.y, |src|, tmp.x
  971.  * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
  972.  * MAD dest, tmp.y, weight, tmp.x
  973.  */
  974. static void sin_approx(
  975.         struct radeon_compiler* c, struct rc_instruction * inst,
  976.         struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
  977. {
  978.         unsigned int tempreg = rc_find_free_temporary(c);
  979.  
  980.         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
  981.                 swizzle_xxxx(src),
  982.                 srcreg(RC_FILE_CONSTANT, constants[0]));
  983.         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
  984.                 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
  985.                 absolute(swizzle_xxxx(src)),
  986.                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
  987.         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
  988.                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
  989.                 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
  990.                 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
  991.         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
  992.                 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
  993.                 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
  994.                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
  995. }
  996.  
  997. /**
  998.  * Translate the trigonometric functions COS, SIN, and SCS
  999.  * using only the basic instructions
  1000.  *  MOV, ADD, MUL, MAD, FRC
  1001.  */
  1002. int r300_transform_trig_simple(struct radeon_compiler* c,
  1003.         struct rc_instruction* inst,
  1004.         void* unused)
  1005. {
  1006.         unsigned int constants[2];
  1007.         unsigned int tempreg;
  1008.  
  1009.         if (inst->U.I.Opcode != RC_OPCODE_COS &&
  1010.             inst->U.I.Opcode != RC_OPCODE_SIN &&
  1011.             inst->U.I.Opcode != RC_OPCODE_SCS)
  1012.                 return 0;
  1013.  
  1014.         tempreg = rc_find_free_temporary(c);
  1015.  
  1016.         sincos_constants(c, constants);
  1017.  
  1018.         if (inst->U.I.Opcode == RC_OPCODE_COS) {
  1019.                 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
  1020.                 /* FRC tmp.x, tmp.x */
  1021.                 /* MAD tmp.z, tmp.x, 2*PI, -PI */
  1022.                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
  1023.                         swizzle_xxxx(inst->U.I.SrcReg[0]),
  1024.                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
  1025.                         swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
  1026.                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
  1027.                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
  1028.                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
  1029.                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
  1030.                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
  1031.                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
  1032.  
  1033.                 sin_approx(c, inst, inst->U.I.DstReg,
  1034.                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
  1035.                         constants);
  1036.         } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
  1037.                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
  1038.                         swizzle_xxxx(inst->U.I.SrcReg[0]),
  1039.                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
  1040.                         swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
  1041.                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
  1042.                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
  1043.                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
  1044.                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
  1045.                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
  1046.                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
  1047.  
  1048.                 sin_approx(c, inst, inst->U.I.DstReg,
  1049.                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
  1050.                         constants);
  1051.         } else {
  1052.                 struct rc_dst_register dst;
  1053.  
  1054.                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
  1055.                         swizzle_xxxx(inst->U.I.SrcReg[0]),
  1056.                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
  1057.                         swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
  1058.                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
  1059.                         srcreg(RC_FILE_TEMPORARY, tempreg));
  1060.                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
  1061.                         srcreg(RC_FILE_TEMPORARY, tempreg),
  1062.                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
  1063.                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
  1064.  
  1065.                 dst = inst->U.I.DstReg;
  1066.  
  1067.                 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
  1068.                 sin_approx(c, inst, dst,
  1069.                         swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
  1070.                         constants);
  1071.  
  1072.                 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
  1073.                 sin_approx(c, inst, dst,
  1074.                         swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
  1075.                         constants);
  1076.         }
  1077.  
  1078.         rc_remove_instruction(inst);
  1079.  
  1080.         return 1;
  1081. }
  1082.  
  1083. static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
  1084.         struct rc_instruction *inst,
  1085.         unsigned srctmp)
  1086. {
  1087.         if (inst->U.I.Opcode == RC_OPCODE_COS) {
  1088.                 emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
  1089.                         srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
  1090.         } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
  1091.                 emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
  1092.                         inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
  1093.         } else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
  1094.                 struct rc_dst_register moddst = inst->U.I.DstReg;
  1095.  
  1096.                 if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
  1097.                         moddst.WriteMask = RC_MASK_X;
  1098.                         emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
  1099.                                 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
  1100.                 }
  1101.                 if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
  1102.                         moddst.WriteMask = RC_MASK_Y;
  1103.                         emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
  1104.                                 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
  1105.                 }
  1106.         }
  1107.  
  1108.         rc_remove_instruction(inst);
  1109. }
  1110.  
  1111.  
  1112. /**
  1113.  * Transform the trigonometric functions COS, SIN, and SCS
  1114.  * to include pre-scaling by 1/(2*PI) and taking the fractional
  1115.  * part, so that the input to COS and SIN is always in the range [0,1).
  1116.  * SCS is replaced by one COS and one SIN instruction.
  1117.  *
  1118.  * @warning This transformation implicitly changes the semantics of SIN and COS!
  1119.  */
  1120. int radeonTransformTrigScale(struct radeon_compiler* c,
  1121.         struct rc_instruction* inst,
  1122.         void* unused)
  1123. {
  1124.         static const float RCP_2PI = 0.15915494309189535;
  1125.         unsigned int temp;
  1126.         unsigned int constant;
  1127.         unsigned int constant_swizzle;
  1128.  
  1129.         if (inst->U.I.Opcode != RC_OPCODE_COS &&
  1130.             inst->U.I.Opcode != RC_OPCODE_SIN &&
  1131.             inst->U.I.Opcode != RC_OPCODE_SCS)
  1132.                 return 0;
  1133.  
  1134.         temp = rc_find_free_temporary(c);
  1135.         constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
  1136.  
  1137.         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
  1138.                 swizzle_xxxx(inst->U.I.SrcReg[0]),
  1139.                 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
  1140.         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
  1141.                 srcreg(RC_FILE_TEMPORARY, temp));
  1142.  
  1143.         r300_transform_SIN_COS_SCS(c, inst, temp);
  1144.         return 1;
  1145. }
  1146.  
  1147. /**
  1148.  * Transform the trigonometric functions COS, SIN, and SCS
  1149.  * so that the input to COS and SIN is always in the range [-PI, PI].
  1150.  * SCS is replaced by one COS and one SIN instruction.
  1151.  */
  1152. int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
  1153.         struct rc_instruction *inst,
  1154.         void *unused)
  1155. {
  1156.         static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
  1157.         unsigned int temp;
  1158.         unsigned int constant;
  1159.  
  1160.         if (inst->U.I.Opcode != RC_OPCODE_COS &&
  1161.             inst->U.I.Opcode != RC_OPCODE_SIN &&
  1162.             inst->U.I.Opcode != RC_OPCODE_SCS)
  1163.                 return 0;
  1164.  
  1165.         /* Repeat x in the range [-PI, PI]:
  1166.          *
  1167.          *   repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
  1168.          */
  1169.  
  1170.         temp = rc_find_free_temporary(c);
  1171.         constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
  1172.  
  1173.         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
  1174.                 swizzle_xxxx(inst->U.I.SrcReg[0]),
  1175.                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
  1176.                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
  1177.         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
  1178.                 srcreg(RC_FILE_TEMPORARY, temp));
  1179.         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
  1180.                 srcreg(RC_FILE_TEMPORARY, temp),
  1181.                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
  1182.                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
  1183.  
  1184.         r300_transform_SIN_COS_SCS(c, inst, temp);
  1185.         return 1;
  1186. }
  1187.  
  1188. /**
  1189.  * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
  1190.  * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
  1191.  * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
  1192.  *
  1193.  * @warning This explicitly changes the form of DDX and DDY!
  1194.  */
  1195.  
  1196. int radeonTransformDeriv(struct radeon_compiler* c,
  1197.         struct rc_instruction* inst,
  1198.         void* unused)
  1199. {
  1200.         if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
  1201.                 return 0;
  1202.  
  1203.         inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
  1204.         inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
  1205.  
  1206.         return 1;
  1207. }
  1208.  
  1209. /**
  1210.  * IF Temp[0].x -> IF Temp[0].x
  1211.  * ...          -> ...
  1212.  * KILL         -> KIL -abs(Temp[0].x)
  1213.  * ...          -> ...
  1214.  * ENDIF        -> ENDIF
  1215.  *
  1216.  * === OR ===
  1217.  *
  1218.  * IF Temp[0].x -\
  1219.  * KILL         - > KIL -abs(Temp[0].x)
  1220.  * ENDIF        -/
  1221.  *
  1222.  * === OR ===
  1223.  *
  1224.  * IF Temp[0].x -> IF Temp[0].x
  1225.  * ...          -> ...
  1226.  * ELSE         -> ELSE
  1227.  * ...          -> ...
  1228.  * KILL         -> KIL -abs(Temp[0].x)
  1229.  * ...          -> ...
  1230.  * ENDIF        -> ENDIF
  1231.  *
  1232.  * === OR ===
  1233.  *
  1234.  * KILL         -> KIL -none.1111
  1235.  *
  1236.  * This needs to be done in its own pass, because it might modify the
  1237.  * instructions before and after KILL.
  1238.  */
  1239. void rc_transform_KILL(struct radeon_compiler * c, void *user)
  1240. {
  1241.         struct rc_instruction * inst;
  1242.         for (inst = c->Program.Instructions.Next;
  1243.                         inst != &c->Program.Instructions; inst = inst->Next) {
  1244.                 struct rc_instruction * if_inst;
  1245.                 unsigned in_if = 0;
  1246.  
  1247.                 if (inst->U.I.Opcode != RC_OPCODE_KILP)
  1248.                         continue;
  1249.  
  1250.                 for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
  1251.                                                 if_inst = if_inst->Prev) {
  1252.  
  1253.                         if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
  1254.                                 in_if = 1;
  1255.                                 break;
  1256.                         }
  1257.                 }
  1258.  
  1259.                 inst->U.I.Opcode = RC_OPCODE_KIL;
  1260.  
  1261.                 if (!in_if) {
  1262.                         inst->U.I.SrcReg[0] = negate(builtin_one);
  1263.                 } else {
  1264.                         /* This should work even if the KILP is inside the ELSE
  1265.                          * block, because -0.0 is considered negative. */
  1266.                         inst->U.I.SrcReg[0] =
  1267.                                 negate(absolute(if_inst->U.I.SrcReg[0]));
  1268.  
  1269.                         if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
  1270.                                 && inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
  1271.  
  1272.                                 /* Optimize the special case:
  1273.                                  * IF Temp[0].x
  1274.                                  * KILP
  1275.                                  * ENDIF
  1276.                                  */
  1277.  
  1278.                                 /* Remove IF */
  1279.                                 rc_remove_instruction(inst->Prev);
  1280.                                 /* Remove ENDIF */
  1281.                                 rc_remove_instruction(inst->Next);
  1282.                         }
  1283.                 }
  1284.         }
  1285. }
  1286.  
  1287. int rc_force_output_alpha_to_one(struct radeon_compiler *c,
  1288.                                  struct rc_instruction *inst, void *data)
  1289. {
  1290.         struct r300_fragment_program_compiler *fragc = (struct r300_fragment_program_compiler*)c;
  1291.         const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
  1292.         unsigned tmp;
  1293.  
  1294.         if (!info->HasDstReg || inst->U.I.DstReg.File != RC_FILE_OUTPUT ||
  1295.             inst->U.I.DstReg.Index == fragc->OutputDepth)
  1296.                 return 1;
  1297.  
  1298.         tmp = rc_find_free_temporary(c);
  1299.  
  1300.         /* Insert MOV after inst, set alpha to 1. */
  1301.         emit1(c, inst, RC_OPCODE_MOV, 0, inst->U.I.DstReg,
  1302.               srcregswz(RC_FILE_TEMPORARY, tmp, RC_SWIZZLE_XYZ1));
  1303.  
  1304.         /* Re-route the destination of inst to the source of mov. */
  1305.         inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
  1306.         inst->U.I.DstReg.Index = tmp;
  1307.  
  1308.         /* Move the saturate output modifier to the MOV instruction
  1309.          * (for better copy propagation). */
  1310.         inst->Next->U.I.SaturateMode = inst->U.I.SaturateMode;
  1311.         inst->U.I.SaturateMode = RC_SATURATE_NONE;
  1312.         return 1;
  1313. }
  1314.