Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1.  
  2. /* FF is big and ugly so feel free to write lines as long as you like.
  3.  * Aieeeeeeeee !
  4.  *
  5.  * Let me make that clearer:
  6.  * Aieeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ! !! !!!
  7.  */
  8.  
  9. #include "device9.h"
  10. #include "basetexture9.h"
  11. #include "vertexdeclaration9.h"
  12. #include "vertexshader9.h"
  13. #include "pixelshader9.h"
  14. #include "nine_ff.h"
  15. #include "nine_defines.h"
  16. #include "nine_helpers.h"
  17. #include "nine_pipe.h"
  18. #include "nine_dump.h"
  19.  
  20. #include "pipe/p_context.h"
  21. #include "tgsi/tgsi_ureg.h"
  22. #include "tgsi/tgsi_dump.h"
  23. #include "util/u_box.h"
  24. #include "util/u_hash_table.h"
  25.  
  26. #define NINE_TGSI_LAZY_DEVS 1
  27.  
  28. #define DBG_CHANNEL DBG_FF
  29.  
  30. #define NINE_FF_NUM_VS_CONST 256
  31. #define NINE_FF_NUM_PS_CONST 24
  32.  
  33. #define NINED3DTSS_TCI_DISABLE                       0
  34. #define NINED3DTSS_TCI_PASSTHRU                      1
  35. #define NINED3DTSS_TCI_CAMERASPACENORMAL             2
  36. #define NINED3DTSS_TCI_CAMERASPACEPOSITION           3
  37. #define NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR   4
  38. #define NINED3DTSS_TCI_SPHEREMAP                     5
  39.  
  /* Plain four-component float vector (x, y, z, w). */
  struct fvec4
  {
      float x;
      float y;
      float z;
      float w;
  };
  44.  
  /* Key identifying one fixed-function vertex shader variant.
   *
   * The bit-fields describe the state the generated shader depends on;
   * value32[] / value64[] alias the same storage so the key can be hashed
   * and compared as raw words. The padN members round each group up to a
   * full 32-bit word.
   */
  struct nine_ff_vs_key
  {
      union {
          struct {
              /* --- word 0: general vertex processing state --- */
              uint32_t position_t          : 1; /* vertex input is POSITIONT rather than POSITION */
              uint32_t lighting            : 1;
              uint32_t darkness            : 1; /* lighting enabled but no active lights */
              uint32_t localviewer         : 1;
              uint32_t vertexpointsize     : 1; /* point size comes from a PSIZE vertex input */
              uint32_t pointscale          : 1;
              uint32_t vertexblend         : 3; /* number of world matrices blended (0 = off) */
              uint32_t vertexblend_indexed : 1; /* matrices selected through BLENDINDICES */
              uint32_t vertextween         : 1;
              /* material colour source: 0 = material, 1 = color1, 2 = color2 */
              uint32_t mtl_diffuse         : 2;
              uint32_t mtl_ambient         : 2;
              uint32_t mtl_specular        : 2;
              uint32_t mtl_emissive        : 2;
              uint32_t fog_mode            : 2;
              uint32_t fog_range           : 1;
              uint32_t color0in_one        : 1; /* no COLOR0 input: use constant 1.0 instead */
              uint32_t color1in_one        : 1; /* no COLOR1 input: use constant 1.0 instead */
              uint32_t pad1                : 8;

              /* --- word 1: texcoord generation mode, 8 * 3 bits --- */
              uint32_t tc_gen              : 24;
              uint32_t pad2                : 8;

              /* --- word 2: texcoord input index, 3 bits per stage --- */
              uint32_t tc_idx              : 24;
              uint32_t pad3                : 8;

              /* --- word 3: texcoord dimension, 8 * 3 bits --- */
              uint32_t tc_dim              : 24;
              uint32_t pad4                : 8;
          };
          /* Raw views of the key.
           * Don't forget to resize VertexShader9.ff_key when this grows. */
          uint64_t value64[2];
          uint32_t value32[4];
      };
  };
  78.  
  /* Key identifying one fixed-function pixel shader variant.
   *
   * Texture stage state, packed per stage into ts[]:
   *
   * colorop    D3DTOP  5 bit
   * alphaop    D3DTOP  5 bit
   * colorarg0  D3DTA   3 bit
   * colorarg1  D3DTA   3 bit
   * colorarg2  D3DTA   3 bit
   * alphaarg0  D3DTA   3 bit
   * alphaarg1  D3DTA   3 bit
   * alphaarg2  D3DTA   3 bit
   * resultarg  D3DTA   1 bit (CURRENT:0 or TEMP:1)
   * textarget          2 bit (1D/2D/3D/CUBE)
   * projected          1 bit
   * ===========================
   *                   32 bit per stage
   *
   * value32[] / value64[] alias the whole key so it can be hashed and
   * compared as raw words.
   */
  struct nine_ff_ps_key
  {
      union {
          struct {
              /* One 32-bit word for each of the 8 texture stages. */
              struct {
                  uint32_t colorop   : 5;
                  uint32_t alphaop   : 5;
                  uint32_t colorarg0 : 3;
                  uint32_t colorarg1 : 3;
                  uint32_t colorarg2 : 3;
                  uint32_t alphaarg0 : 3;
                  uint32_t alphaarg1 : 3;
                  uint32_t alphaarg2 : 3;
                  uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                  uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
                  uint32_t projected : 1;
                  /* that's 32 bit exactly */
              } ts[8];

              /* Global state; with these bits the key spans 9 32-bit words. */
              uint32_t fog      : 1; /* for vFog with programmable VS */
              uint32_t fog_mode : 2;
              uint32_t specular : 1;

              /* NOTE(review): presumably the extra high bits (bit 4 / bit 5)
               * of the stage arguments, indexed by argument number with one
               * bit per stage - confirm against the code that fills the key.
               * With these the key spans 11 32-bit words plus a byte. */
              uint8_t colorarg_b4[3];
              uint8_t colorarg_b5[3];
              uint8_t alphaarg_b4[3];
          };
          /* Raw views of the key.
           * Don't forget to resize PixelShader9.ff_key when this grows. */
          uint64_t value64[6];
          uint32_t value32[12];
      };
  };
  123.  
  124. static unsigned nine_ff_vs_key_hash(void *key)
  125. {
  126.     struct nine_ff_vs_key *vs = key;
  127.     unsigned i;
  128.     uint32_t hash = vs->value32[0];
  129.     for (i = 1; i < Elements(vs->value32); ++i)
  130.         hash ^= vs->value32[i];
  131.     return hash;
  132. }
  133. static int nine_ff_vs_key_comp(void *key1, void *key2)
  134. {
  135.     struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
  136.     struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;
  137.  
  138.     return memcmp(a->value64, b->value64, sizeof(a->value64));
  139. }
  140. static unsigned nine_ff_ps_key_hash(void *key)
  141. {
  142.     struct nine_ff_ps_key *ps = key;
  143.     unsigned i;
  144.     uint32_t hash = ps->value32[0];
  145.     for (i = 1; i < Elements(ps->value32); ++i)
  146.         hash ^= ps->value32[i];
  147.     return hash;
  148. }
  149. static int nine_ff_ps_key_comp(void *key1, void *key2)
  150. {
  151.     struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
  152.     struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;
  153.  
  154.     return memcmp(a->value64, b->value64, sizeof(a->value64));
  155. }
  156. static unsigned nine_ff_fvf_key_hash(void *key)
  157. {
  158.     return *(DWORD *)key;
  159. }
  160. static int nine_ff_fvf_key_comp(void *key1, void *key2)
  161. {
  162.     return *(DWORD *)key1 != *(DWORD *)key2;
  163. }
  164.  
  165. static void nine_ff_prune_vs(struct NineDevice9 *);
  166. static void nine_ff_prune_ps(struct NineDevice9 *);
  167.  
  168. static void nine_ureg_tgsi_dump(struct ureg_program *ureg, boolean override)
  169. {
  170.     if (debug_get_bool_option("NINE_FF_DUMP", FALSE) || override) {
  171.         unsigned count;
  172.         const struct tgsi_token *toks = ureg_get_tokens(ureg, &count);
  173.         tgsi_dump(toks, 0);
  174.         ureg_free_tokens(toks);
  175.     }
  176. }
  177.  
  /* Single-component broadcast of a source register. */
  #define _XXXX(r) ureg_scalar(r, TGSI_SWIZZLE_X)
  #define _YYYY(r) ureg_scalar(r, TGSI_SWIZZLE_Y)
  #define _ZZZZ(r) ureg_scalar(r, TGSI_SWIZZLE_Z)
  #define _WWWW(r) ureg_scalar(r, TGSI_SWIZZLE_W)

  /* Identity swizzle, for symmetry with the macros above. */
  #define _XYZW(r) (r)

  /* Same broadcasts, but taking a destination register that is first
   * converted with ureg_src(). */
  #define _X(r) _XXXX(ureg_src(r))
  #define _Y(r) _YYYY(ureg_src(r))
  #define _Z(r) _ZZZZ(ureg_src(r))
  #define _W(r) _WWWW(ureg_src(r))

  /* Constant i of the current light, addressed indirectly through AL.x.
   * AL should contain base address of lights table. */
  #define LIGHT_CONST(i) ureg_src_indirect(ureg_DECL_constant(ureg, i), _X(AL))

  /* Material constants start at constant slot 19. */
  #define MATERIAL_CONST(i) ureg_DECL_constant(ureg, 19 + (i))

  /* Constant slot n; expects a variable named `ureg` in scope. */
  #define _CONST(n) ureg_DECL_constant(ureg, n)
  198.  
  199. /* VS FF constants layout:
  200.  *
  201.  * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
  202.  * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
  203.  * CONST[ 8..11] D3DTS_VIEW * D3DTS_PROJECTION
  204.  * CONST[12..15] D3DTS_VIEW
  205.  * CONST[16..18] Normal matrix
  206.  *
  207.  * CONST[19]      MATERIAL.Emissive + Material.Ambient * RS.Ambient
  208.  * CONST[20]      MATERIAL.Diffuse
  209.  * CONST[21]      MATERIAL.Ambient
  210.  * CONST[22]      MATERIAL.Specular
  211.  * CONST[23].x___ MATERIAL.Power
  212.  * CONST[24]      MATERIAL.Emissive
  213.  * CONST[25]      RS.Ambient
  214.  *
  215.  * CONST[26].x___ RS.PointSizeMin
  216.  * CONST[26]._y__ RS.PointSizeMax
  217.  * CONST[26].__z_ RS.PointSize
  218.  * CONST[26].___w RS.PointScaleA
  219.  * CONST[27].x___ RS.PointScaleB
  220.  * CONST[27]._y__ RS.PointScaleC
  221.  *
  222.  * CONST[28].x___ RS.FogEnd
  223.  * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
  224.  * CONST[28].__z_ RS.FogDensity
  225.  * CONST[29]      RS.FogColor
  226.  
  227.  * CONST[30].x___ TWEENFACTOR
  228.  *
  229.  * CONST[32].x___ LIGHT[0].Type
  230.  * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
  231.  * CONST[33]      LIGHT[0].Diffuse
  232.  * CONST[34]      LIGHT[0].Specular
  233.  * CONST[35]      LIGHT[0].Ambient
  234.  * CONST[36].xyz_ LIGHT[0].Position
  235.  * CONST[36].___w LIGHT[0].Range
  236.  * CONST[37].xyz_ LIGHT[0].Direction
  237.  * CONST[37].___w LIGHT[0].Falloff
  238.  * CONST[38].x___ cos(LIGHT[0].Theta / 2)
  239.  * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
  240.  * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(Light[0].Phi / 2))
  241.  * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
  242.  * CONST[39].___w 1 if this is the last active light, 0 if not
  243.  * CONST[40]      LIGHT[1]
  244.  * CONST[48]      LIGHT[2]
  245.  * CONST[56]      LIGHT[3]
  246.  * CONST[64]      LIGHT[4]
  247.  * CONST[72]      LIGHT[5]
  248.  * CONST[80]      LIGHT[6]
  249.  * CONST[88]      LIGHT[7]
  250.  * NOTE: no lighting code is generated if there are no active lights
  251.  *
  252.  * CONST[100].x___ Viewport 2/width
  253.  * CONST[100]._y__ Viewport 2/height
  254.  * CONST[100].__z_ Viewport 1/(zmax - zmin)
  255.  * CONST[101].x___ Viewport x0
  256.  * CONST[101]._y__ Viewport y0
  257.  * CONST[101].__z_ Viewport z0
  258.  *
  259.  * CONST[128..131] D3DTS_TEXTURE0
  260.  * CONST[132..135] D3DTS_TEXTURE1
  261.  * CONST[136..139] D3DTS_TEXTURE2
  262.  * CONST[140..143] D3DTS_TEXTURE3
  263.  * CONST[144..147] D3DTS_TEXTURE4
  264.  * CONST[148..151] D3DTS_TEXTURE5
  265.  * CONST[152..155] D3DTS_TEXTURE6
  266.  * CONST[156..159] D3DTS_TEXTURE7
  267.  *
  268.  * CONST[224] D3DTS_WORLDMATRIX[0]
  269.  * CONST[228] D3DTS_WORLDMATRIX[1]
  270.  * ...
  271.  * CONST[252] D3DTS_WORLDMATRIX[7]
  272.  */
  273. struct vs_build_ctx
  274. {
  275.     struct ureg_program *ureg;
  276.     const struct nine_ff_vs_key *key;
  277.  
  278.     uint16_t input[PIPE_MAX_ATTRIBS];
  279.     unsigned num_inputs;
  280.  
  281.     struct ureg_src aVtx;
  282.     struct ureg_src aNrm;
  283.     struct ureg_src aCol[2];
  284.     struct ureg_src aTex[8];
  285.     struct ureg_src aPsz;
  286.     struct ureg_src aInd;
  287.     struct ureg_src aWgt;
  288.  
  289.     struct ureg_src aVtx1; /* tweening */
  290.     struct ureg_src aNrm1;
  291.  
  292.     struct ureg_src mtlA;
  293.     struct ureg_src mtlD;
  294.     struct ureg_src mtlS;
  295.     struct ureg_src mtlE;
  296. };
  297.  
  298. static INLINE unsigned
  299. get_texcoord_sn(struct pipe_screen *screen)
  300. {
  301.     if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
  302.         return TGSI_SEMANTIC_TEXCOORD;
  303.     return TGSI_SEMANTIC_GENERIC;
  304. }
  305.  
  306. static INLINE struct ureg_src
  307. build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
  308. {
  309.     const unsigned i = vs->num_inputs++;
  310.     assert(i < PIPE_MAX_ATTRIBS);
  311.     vs->input[i] = ndecl;
  312.     return ureg_DECL_vs_input(vs->ureg, i);
  313. }
  314.  
  315. /* NOTE: dst may alias src */
  316. static INLINE void
  317. ureg_normalize3(struct ureg_program *ureg,
  318.                 struct ureg_dst dst, struct ureg_src src,
  319.                 struct ureg_dst tmp)
  320. {
  321. #ifdef NINE_TGSI_LAZY_DEVS
  322.     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
  323.  
  324.     ureg_DP3(ureg, tmp_x, src, src);
  325.     ureg_RSQ(ureg, tmp_x, _X(tmp));
  326.     ureg_MUL(ureg, dst, src, _X(tmp));
  327. #else
  328.     ureg_NRM(ureg, dst, src);
  329. #endif
  330. }
  331.  
  332. static void *
  333. nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
  334. {
  335.     const struct nine_ff_vs_key *key = vs->key;
  336.     struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_VERTEX);
  337.     struct ureg_dst oPos, oCol[2], oTex[8], oPsz, oFog;
  338.     struct ureg_dst rCol[2]; /* oCol if no fog, TEMP otherwise */
  339.     struct ureg_dst rVtx, rNrm;
  340.     struct ureg_dst r[8];
  341.     struct ureg_dst AR;
  342.     struct ureg_dst tmp, tmp_x, tmp_z;
  343.     unsigned i, c;
  344.     unsigned label[32], l = 0;
  345.     unsigned num_r = 8;
  346.     boolean need_rNrm = key->lighting || key->pointscale;
  347.     boolean need_rVtx = key->lighting || key->fog_mode;
  348.     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
  349.  
  350.     vs->ureg = ureg;
  351.  
  352.     /* Check which inputs we should transform. */
  353.     for (i = 0; i < 8 * 3; i += 3) {
  354.         switch ((key->tc_gen >> i) & 0x3) {
  355.         case NINED3DTSS_TCI_CAMERASPACENORMAL:
  356.             need_rNrm = TRUE;
  357.             break;
  358.         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
  359.             need_rVtx = TRUE;
  360.             break;
  361.         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
  362.             need_rVtx = need_rNrm = TRUE;
  363.             break;
  364.         default:
  365.             break;
  366.         }
  367.     }
  368.  
  369.     /* Declare and record used inputs (needed for linkage with vertex format):
  370.      * (texture coordinates handled later)
  371.      */
  372.     vs->aVtx = build_vs_add_input(vs,
  373.         key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);
  374.  
  375.     if (need_rNrm)
  376.         vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);
  377.  
  378.     vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
  379.     vs->aCol[1] = ureg_imm1f(ureg, 1.0f);
  380.  
  381.     if (key->lighting || key->darkness) {
  382.         const unsigned mask = key->mtl_diffuse | key->mtl_specular |
  383.                               key->mtl_ambient | key->mtl_emissive;
  384.         if ((mask & 0x1) && !key->color0in_one)
  385.             vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
  386.         if ((mask & 0x2) && !key->color1in_one)
  387.             vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
  388.  
  389.         vs->mtlD = MATERIAL_CONST(1);
  390.         vs->mtlA = MATERIAL_CONST(2);
  391.         vs->mtlS = MATERIAL_CONST(3);
  392.         vs->mtlE = MATERIAL_CONST(5);
  393.         if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
  394.         if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
  395.         if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
  396.         if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
  397.         if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
  398.         if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
  399.         if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
  400.         if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
  401.     } else {
  402.         if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
  403.         if (!key->color1in_one) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
  404.     }
  405.  
  406.     if (key->vertexpointsize)
  407.         vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
  408.  
  409.     if (key->vertexblend_indexed)
  410.         vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
  411.     if (key->vertexblend)
  412.         vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
  413.     if (key->vertextween) {
  414.         vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
  415.         vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
  416.     }
  417.  
  418.     /* Declare outputs:
  419.      */
  420.     oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
  421.     oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
  422.     oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
  423.  
  424.     if (key->vertexpointsize || key->pointscale) {
  425.         oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0, TGSI_WRITEMASK_X);
  426.         oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
  427.     }
  428.     if (key->fog_mode) {
  429.         /* We apply fog to the vertex colors, oFog is for programmable shaders only ?
  430.          */
  431.         oFog = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_FOG, 0, TGSI_WRITEMASK_X);
  432.         oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
  433.     }
  434.  
  435.     /* Declare TEMPs:
  436.      */
  437.     for (i = 0; i < num_r; ++i)
  438.         r[i] = ureg_DECL_local_temporary(ureg);
  439.     tmp = r[0];
  440.     tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
  441.     tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
  442.     if (key->lighting || key->vertexblend)
  443.         AR = ureg_DECL_address(ureg);
  444.  
  445.     if (key->fog_mode) {
  446.         rCol[0] = r[2];
  447.         rCol[1] = r[3];
  448.     } else {
  449.         rCol[0] = oCol[0];
  450.         rCol[1] = oCol[1];
  451.     }
  452.  
  453.     rVtx = ureg_writemask(r[1], TGSI_WRITEMASK_XYZ);
  454.     rNrm = ureg_writemask(r[2], TGSI_WRITEMASK_XYZ);
  455.  
  456.     /* === Vertex transformation / vertex blending:
  457.      */
  458.     if (key->vertextween) {
  459.         assert(!key->vertexblend);
  460.         ureg_LRP(ureg, r[2], _XXXX(_CONST(30)), vs->aVtx, vs->aVtx1);
  461.         if (need_rNrm)
  462.             ureg_LRP(ureg, r[3], _XXXX(_CONST(30)), vs->aNrm, vs->aNrm1);
  463.         vs->aVtx = ureg_src(r[2]);
  464.         vs->aNrm = ureg_src(r[3]);
  465.     }
  466.  
  467.     if (key->vertexblend) {
  468.         struct ureg_src cWM[4];
  469.  
  470.         for (i = 224; i <= 255; ++i)
  471.             ureg_DECL_constant(ureg, i);
  472.  
  473.         /* translate world matrix index to constant file index */
  474.         if (key->vertexblend_indexed) {
  475.             ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 224.0f));
  476.             ureg_ARL(ureg, AR, ureg_src(tmp));
  477.         }
  478.         for (i = 0; i < key->vertexblend; ++i) {
  479.             for (c = 0; c < 4; ++c) {
  480.                 cWM[c] = ureg_src_register(TGSI_FILE_CONSTANT, (224 + i * 4) * !key->vertexblend_indexed + c);
  481.                 if (key->vertexblend_indexed)
  482.                     cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
  483.             }
  484.             /* multiply by WORLD(index) */
  485.             ureg_MUL(ureg, r[0], _XXXX(vs->aVtx), cWM[0]);
  486.             ureg_MAD(ureg, r[0], _YYYY(vs->aVtx), cWM[1], ureg_src(r[0]));
  487.             ureg_MAD(ureg, r[0], _ZZZZ(vs->aVtx), cWM[2], ureg_src(r[0]));
  488.             ureg_MAD(ureg, r[0], _WWWW(vs->aVtx), cWM[3], ureg_src(r[0]));
  489.  
  490.             /* accumulate weighted position value */
  491.             if (i)
  492.                 ureg_MAD(ureg, r[2], ureg_src(r[0]), ureg_scalar(vs->aWgt, i), ureg_src(r[2]));
  493.             else
  494.                 ureg_MUL(ureg, r[2], ureg_src(r[0]), ureg_scalar(vs->aWgt, 0));
  495.         }
  496.         /* multiply by VIEW_PROJ */
  497.         ureg_MUL(ureg, r[0], _X(r[2]), _CONST(8));
  498.         ureg_MAD(ureg, r[0], _Y(r[2]), _CONST(9),  ureg_src(r[0]));
  499.         ureg_MAD(ureg, r[0], _Z(r[2]), _CONST(10), ureg_src(r[0]));
  500.         ureg_MAD(ureg, oPos, _W(r[2]), _CONST(11), ureg_src(r[0]));
  501.  
  502.         if (need_rVtx)
  503.             vs->aVtx = ureg_src(r[2]);
  504.     } else
  505.     if (key->position_t && device->driver_caps.window_space_position_support) {
  506.         ureg_MOV(ureg, oPos, vs->aVtx);
  507.     } else if (key->position_t) {
  508.         /* vs->aVtx contains the coordinates buffer wise.
  509.         * later in the pipeline, clipping, viewport and division
  510.         * by w (rhw = 1/w) are going to be applied, so do the reverse
  511.         * of these transformations (except clipping) to have the good
  512.         * position at the end.*/
  513.         ureg_MOV(ureg, tmp, vs->aVtx);
  514.         /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
  515.         ureg_SUB(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(101));
  516.         ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
  517.         ureg_SUB(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 1.0f));
  518.         /* Y needs to be reversed */
  519.         ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
  520.         /* inverse rhw */
  521.         ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
  522.         /* multiply X, Y, Z by w */
  523.         ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
  524.         ureg_MOV(ureg, oPos, ureg_src(tmp));
  525.     } else {
  526.         /* position = vertex * WORLD_VIEW_PROJ */
  527.         ureg_MUL(ureg, r[0], _XXXX(vs->aVtx), _CONST(0));
  528.         ureg_MAD(ureg, r[0], _YYYY(vs->aVtx), _CONST(1), ureg_src(r[0]));
  529.         ureg_MAD(ureg, r[0], _ZZZZ(vs->aVtx), _CONST(2), ureg_src(r[0]));
  530.         ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(r[0]));
  531.     }
  532.  
  533.     if (need_rVtx) {
  534.         ureg_MUL(ureg, rVtx, _XXXX(vs->aVtx), _CONST(4));
  535.         ureg_MAD(ureg, rVtx, _YYYY(vs->aVtx), _CONST(5), ureg_src(rVtx));
  536.         ureg_MAD(ureg, rVtx, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(rVtx));
  537.         ureg_MAD(ureg, rVtx, _WWWW(vs->aVtx), _CONST(7), ureg_src(rVtx));
  538.     }
  539.     if (need_rNrm) {
  540.         ureg_MUL(ureg, rNrm, _XXXX(vs->aNrm), _CONST(16));
  541.         ureg_MAD(ureg, rNrm, _YYYY(vs->aNrm), _CONST(17), ureg_src(rNrm));
  542.         ureg_MAD(ureg, rNrm, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(rNrm));
  543.         ureg_normalize3(ureg, rNrm, ureg_src(rNrm), tmp);
  544.     }
  545.     /* NOTE: don't use vs->aVtx, vs->aNrm after this line */
  546.  
  547.     /* === Process point size:
  548.      */
  549.     if (key->vertexpointsize) {
  550.         struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
  551. #ifdef NINE_TGSI_LAZY_DEVS
  552.         struct ureg_dst tmp_clamp = ureg_DECL_temporary(ureg);
  553.  
  554.         ureg_MAX(ureg, tmp_clamp, vs->aPsz, _XXXX(cPsz1));
  555.         ureg_MIN(ureg, oPsz, ureg_src(tmp_clamp), _YYYY(cPsz1));
  556.         ureg_release_temporary(ureg, tmp_clamp);
  557. #else
  558.         ureg_CLAMP(ureg, oPsz, vs->aPsz, _XXXX(cPsz1), _YYYY(cPsz1));
  559. #endif
  560.     } else if (key->pointscale) {
  561.         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
  562.         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
  563.         struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
  564.         struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
  565.  
  566.         ureg_DP3(ureg, tmp_x, ureg_src(r[1]), ureg_src(r[1]));
  567.         ureg_SQRT(ureg, tmp_y, _X(tmp));
  568.         ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
  569.         ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
  570.         ureg_RCP(ureg, tmp_x, ureg_src(tmp));
  571.         ureg_MUL(ureg, tmp_x, ureg_src(tmp), _ZZZZ(cPsz1));
  572. #ifdef NINE_TGSI_LAZY_DEVS
  573.         struct ureg_dst tmp_clamp = ureg_DECL_temporary(ureg);
  574.  
  575.         ureg_MAX(ureg, tmp_clamp, _X(tmp), _XXXX(cPsz1));
  576.         ureg_MIN(ureg, oPsz, ureg_src(tmp_clamp), _YYYY(cPsz1));
  577.         ureg_release_temporary(ureg, tmp_clamp);
  578. #else
  579.         ureg_CLAMP(ureg, oPsz, _X(tmp), _XXXX(cPsz1), _YYYY(cPsz1));
  580. #endif
  581.     }
  582.  
  583.     /* Texture coordinate generation:
  584.      * XXX: D3DTTFF_PROJECTED, transform matrix
  585.      */
  586.     for (i = 0; i < 8; ++i) {
  587.         struct ureg_dst dst[5];
  588.         struct ureg_src src;
  589.         unsigned c;
  590.         const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
  591.         const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
  592.         const unsigned dim = (key->tc_dim >> (i * 3)) & 0x7;
  593.  
  594.         if (tci == NINED3DTSS_TCI_DISABLE)
  595.             continue;
  596.         oTex[i] = ureg_DECL_output(ureg, texcoord_sn, i);
  597.  
  598.         if (tci == NINED3DTSS_TCI_PASSTHRU)
  599.             vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
  600.  
  601.         if (!dim) {
  602.             dst[c = 4] = oTex[i];
  603.         } else {
  604.             dst[4] = r[5];
  605.             src = ureg_src(dst[4]);
  606.             for (c = 0; c < (dim - 1); ++c)
  607.                 dst[c] = ureg_writemask(tmp, (1 << dim) - 1);
  608.             dst[c] = ureg_writemask(oTex[i], (1 << dim) - 1);
  609.         }
  610.  
  611.         switch (tci) {
  612.         case NINED3DTSS_TCI_PASSTHRU:
  613.             ureg_MOV(ureg, dst[4], vs->aTex[idx]);
  614.             break;
  615.         case NINED3DTSS_TCI_CAMERASPACENORMAL:
  616.             assert(dim <= 3);
  617.             ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rNrm));
  618.             ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
  619.             break;
  620.         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
  621.             ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rVtx));
  622.             ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
  623.             break;
  624.         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
  625.             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
  626.             ureg_DP3(ureg, tmp_x, ureg_src(rVtx), ureg_src(rNrm));
  627.             ureg_MUL(ureg, tmp, ureg_src(rNrm), _X(tmp));
  628.             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
  629.             ureg_SUB(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rVtx), ureg_src(tmp));
  630.             ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
  631.             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
  632.             break;
  633.         case NINED3DTSS_TCI_SPHEREMAP:
  634.             assert(!"TODO");
  635.             break;
  636.         default:
  637.             break;
  638.         }
  639.         if (!dim)
  640.             continue;
  641.         dst[c].WriteMask = ~dst[c].WriteMask;
  642.         if (dst[c].WriteMask)
  643.             ureg_MOV(ureg, dst[c], src); /* store untransformed components */
  644.         dst[c].WriteMask = ~dst[c].WriteMask;
  645.         if (dim > 0) ureg_MUL(ureg, dst[0], _XXXX(src), _CONST(128 + i * 4));
  646.         if (dim > 1) ureg_MAD(ureg, dst[1], _YYYY(src), _CONST(129 + i * 4), ureg_src(tmp));
  647.         if (dim > 2) ureg_MAD(ureg, dst[2], _ZZZZ(src), _CONST(130 + i * 4), ureg_src(tmp));
  648.         if (dim > 3) ureg_MAD(ureg, dst[3], _WWWW(src), _CONST(131 + i * 4), ureg_src(tmp));
  649.     }
  650.  
  651.     /* === Lighting:
  652.      *
  653.      * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
  654.      * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
  655.      * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
  656.      *
  657.      * vec3 normal = normalize(in.Normal * NormalMatrix);
  658.      * vec3 hitDir = light.direction;
  659.      * float atten = 1.0;
  660.      *
  661.      * if (light.type != DIRECTIONAL)
  662.      * {
  663.      *     vec3 hitVec = light.position - eyeVertex;
  664.      *     float d = length(hitVec);
  665.      *     hitDir = hitVec / d;
  666.      *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
  667.      * }
  668.      *
  669.      * if (light.type == SPOTLIGHT)
  670.      * {
  671.      *     float rho = dp3(-hitVec, light.direction);
  672.      *     if (rho < cos(light.phi / 2))
  673.      *         atten = 0;
  674.      *     if (rho < cos(light.theta / 2))
  675.      *         atten *= pow(some_func(rho), light.falloff);
  676.      * }
  677.      *
  678.      * float nDotHit = dp3_sat(normal, hitVec);
  679.      * float powFact = 0.0;
  680.      *
  681.      * if (nDotHit > 0.0)
  682.      * {
  683.      *     vec3 midVec = normalize(hitDir + eye);
  684.      *     float nDotMid = dp3_sat(normal, midVec);
  685.      *     pFact = pow(nDotMid, material.power);
  686.      * }
  687.      *
  688.      * ambient += light.ambient * atten;
  689.      * diffuse += light.diffuse * atten * nDotHit;
  690.      * specular += light.specular * atten * powFact;
  691.      */
  692.     if (key->lighting) {
  693.         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
  694.  
  695.         struct ureg_dst rAtt = ureg_writemask(r[1], TGSI_WRITEMASK_W);
  696.         struct ureg_dst rHit = ureg_writemask(r[3], TGSI_WRITEMASK_XYZ);
  697.         struct ureg_dst rMid = ureg_writemask(r[4], TGSI_WRITEMASK_XYZ);
  698.  
  699.         struct ureg_dst rCtr = ureg_writemask(r[2], TGSI_WRITEMASK_W);
  700.  
  701.         struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);
  702.  
  703.         /* Light.*.Alpha is not used. */
  704.         struct ureg_dst rD = ureg_writemask(r[5], TGSI_WRITEMASK_XYZ);
  705.         struct ureg_dst rA = ureg_writemask(r[6], TGSI_WRITEMASK_XYZ);
  706.         struct ureg_dst rS = ureg_writemask(r[7], TGSI_WRITEMASK_XYZ);
  707.  
  708.         struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));
  709.  
  710.         struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
  711.         struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
  712.         struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
  713.         struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
  714.         struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
  715.         struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
  716.         struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
  717.         struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
  718.         struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
  719.         struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
  720.         struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
  721.         struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
  722.         struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
  723.         struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
  724.         struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));
  725.  
  726.         const unsigned loop_label = l++;
  727.  
  728.         ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
  729.         ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
  730.         ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
  731.         ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
  732.         rD = ureg_saturate(rD);
  733.         rA = ureg_saturate(rA);
  734.         rS = ureg_saturate(rS);
  735.  
  736.  
  737.         /* loop management */
  738.         ureg_BGNLOOP(ureg, &label[loop_label]);
  739.         ureg_ARL(ureg, AL, _W(rCtr));
  740.  
  741.         /* if (not DIRECTIONAL light): */
  742.         ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
  743.         ureg_MOV(ureg, rHit, ureg_negate(cLDir));
  744.         ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
  745.         ureg_IF(ureg, _X(tmp), &label[l++]);
  746.         {
  747.             /* hitDir = light.position - eyeVtx
  748.              * d = length(hitDir)
  749.              * hitDir /= d
  750.              */
  751.             ureg_SUB(ureg, rHit, cLPos, ureg_src(rVtx));
  752.             ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
  753.             ureg_RSQ(ureg, tmp_y, _X(tmp));
  754.             ureg_MUL(ureg, rHit, ureg_src(rHit), _Y(tmp)); /* normalize */
  755.             ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
  756.  
  757.             /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
  758.             ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
  759.             ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
  760.             ureg_RCP(ureg, rAtt, _W(rAtt));
  761.             /* cut-off if distance exceeds Light.Range */
  762.             ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
  763.             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
  764.         }
  765.         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
  766.         ureg_ENDIF(ureg);
  767.  
  768.         /* if (SPOT light) */
  769.         ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
  770.         ureg_IF(ureg, _X(tmp), &label[l++]);
  771.         {
  772.             /* rho = dp3(-hitDir, light.spotDir)
  773.              *
  774.              * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
  775.              *     spotAtt = 1
  776.              * else
  777.              * if (rho <= light.cphi2)
  778.              *     spotAtt = 0
  779.              * else
  780.              *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
  781.              */
  782.             ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
  783.             ureg_SUB(ureg, tmp_x, _Y(tmp), cLPhi);
  784.             ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
  785.             ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
  786.             ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
  787.             ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
  788.             ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
  789.             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
  790.         }
  791.         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
  792.         ureg_ENDIF(ureg);
  793.  
  794.         /* directional factors, let's not use LIT because of clarity */
  795.         ureg_DP3(ureg, ureg_saturate(tmp_x), ureg_src(rNrm), ureg_src(rHit));
  796.         ureg_MOV(ureg, tmp_y, ureg_imm1f(ureg, 0.0f));
  797.         ureg_IF(ureg, _X(tmp), &label[l++]);
  798.         {
  799.             /* midVec = normalize(hitDir + eyeDir) */
  800.             if (key->localviewer) {
  801.                 ureg_normalize3(ureg, rMid, ureg_src(rVtx), tmp);
  802.                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
  803.             } else {
  804.                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, 1.0f));
  805.             }
  806.             ureg_normalize3(ureg, rMid, ureg_src(rMid), tmp);
  807.             ureg_DP3(ureg, ureg_saturate(tmp_y), ureg_src(rNrm), ureg_src(rMid));
  808.             ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);
  809.  
  810.             ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
  811.             ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
  812.             ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
  813.             ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
  814.         }
  815.         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
  816.         ureg_ENDIF(ureg);
  817.  
  818.         ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */
  819.  
  820.         /* break if this was the last light */
  821.         ureg_IF(ureg, cLLast, &label[l++]);
  822.         ureg_BRK(ureg);
  823.         ureg_ENDIF(ureg);
  824.         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
  825.  
  826.         ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
  827.         ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
  828.         ureg_ENDLOOP(ureg, &label[loop_label]);
  829.  
  830.         /* Set alpha factors of illumination to 1.0 for the multiplications. */
  831.         rD.WriteMask = TGSI_WRITEMASK_W; rD.Saturate = 0;
  832.         rS.WriteMask = TGSI_WRITEMASK_W; rS.Saturate = 0;
  833.         rA.WriteMask = TGSI_WRITEMASK_W; rA.Saturate = 0;
  834.         ureg_MOV(ureg, rD, ureg_imm1f(ureg, 1.0f));
  835.         ureg_MOV(ureg, rS, ureg_imm1f(ureg, 1.0f));
  836.  
  837.         /* Apply to material:
  838.          *
  839.          * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
  840.          *           material.ambient * ambient +
  841.          *           material.diffuse * diffuse +
  842.          * oCol[1] = material.specular * specular;
  843.          */
  844.         if (key->mtl_emissive == 0 && key->mtl_ambient == 0) {
  845.             ureg_MOV(ureg, rA, ureg_imm1f(ureg, 1.0f));
  846.             ureg_MAD(ureg, tmp, ureg_src(rA), vs->mtlA, _CONST(19));
  847.         } else {
  848.             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
  849.             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
  850.             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W  ), vs->mtlA, vs->mtlE);
  851.         }
  852.         ureg_MAD(ureg, rCol[0], ureg_src(rD), vs->mtlD, ureg_src(tmp));
  853.         ureg_MUL(ureg, rCol[1], ureg_src(rS), vs->mtlS);
  854.     } else
  855.     /* COLOR */
  856.     if (key->darkness) {
  857.         if (key->mtl_emissive == 0 && key->mtl_ambient == 0) {
  858.             ureg_MAD(ureg, rCol[0], vs->mtlD, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f), _CONST(19));
  859.         } else {
  860.             ureg_MAD(ureg, ureg_writemask(rCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
  861.             ureg_ADD(ureg, ureg_writemask(tmp,     TGSI_WRITEMASK_W), vs->mtlA, vs->mtlE);
  862.             ureg_ADD(ureg, ureg_writemask(rCol[0], TGSI_WRITEMASK_W), vs->mtlD, _W(tmp));
  863.         }
  864.         ureg_MUL(ureg, rCol[1], ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f), vs->mtlS);
  865.     } else {
  866.         ureg_MOV(ureg, rCol[0], vs->aCol[0]);
  867.         ureg_MOV(ureg, rCol[1], vs->aCol[1]);
  868.     }
  869.  
  870.     /* === Process fog.
  871.      *
  872.      * exp(x) = ex2(log2(e) * x)
  873.      */
  874.     if (key->fog_mode) {
  875.         /* Fog doesn't affect alpha, TODO: combine with light code output */
  876.         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), _W(rCol[0]));
  877.         ureg_MOV(ureg, ureg_writemask(oCol[1], TGSI_WRITEMASK_W), _W(rCol[1]));
  878.  
  879.         if (key->position_t) {
  880.             ureg_MOV(ureg, ureg_saturate(tmp_x), ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
  881.         } else
  882.         if (key->fog_range) {
  883.             ureg_DP3(ureg, tmp_x, ureg_src(rVtx), ureg_src(rVtx));
  884.             ureg_RSQ(ureg, tmp_z, _X(tmp));
  885.             ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
  886.         } else {
  887.             ureg_MOV(ureg, tmp_z, ureg_abs(_Z(rVtx)));
  888.         }
  889.  
  890.         if (key->fog_mode == D3DFOG_EXP) {
  891.             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
  892.             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
  893.             ureg_EX2(ureg, tmp_x, _X(tmp));
  894.         } else
  895.         if (key->fog_mode == D3DFOG_EXP2) {
  896.             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
  897.             ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
  898.             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
  899.             ureg_EX2(ureg, tmp_x, _X(tmp));
  900.         } else
  901.         if (key->fog_mode == D3DFOG_LINEAR && !key->position_t) {
  902.             ureg_SUB(ureg, tmp_x, _XXXX(_CONST(28)), _Z(tmp));
  903.             ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
  904.         }
  905.         ureg_MOV(ureg, oFog, _X(tmp));
  906.         ureg_LRP(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _X(tmp), ureg_src(rCol[0]), _CONST(29));
  907.         ureg_LRP(ureg, ureg_writemask(oCol[1], TGSI_WRITEMASK_XYZ), _X(tmp), ureg_src(rCol[1]), _CONST(29));
  908.     }
  909.  
  910.     if (key->position_t && device->driver_caps.window_space_position_support)
  911.         ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
  912.  
  913.     ureg_END(ureg);
  914.     nine_ureg_tgsi_dump(ureg, FALSE);
  915.     return ureg_create_shader_and_destroy(ureg, device->pipe);
  916. }
  917.  
  918. /* PS FF constants layout:
  919.  *
  920.  * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
  921.  * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
  922.  * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
  923.  * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
  924.  * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
  925.  * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
  926.  * CONST[16..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
  927.  *
  928.  * CONST[20] D3DRS_TEXTUREFACTOR
  929.  * CONST[21] D3DRS_FOGCOLOR
  930.  * CONST[22].x___ RS.FogEnd
  931.  * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
  932.  * CONST[22].__z_ RS.FogDensity
  933.  */
  934. struct ps_build_ctx
  935. {
  936.     struct ureg_program *ureg;
  937.  
  938.     struct ureg_src vC[2]; /* DIFFUSE, SPECULAR */
  939.     struct ureg_src vT[8]; /* TEXCOORD[i] */
  940.     struct ureg_dst r[6];  /* TEMPs */
  941.     struct ureg_dst rCur; /* D3DTA_CURRENT */
  942.     struct ureg_dst rMod;
  943.     struct ureg_src rCurSrc;
  944.     struct ureg_dst rTmp; /* D3DTA_TEMP */
  945.     struct ureg_src rTmpSrc;
  946.     struct ureg_dst rTex;
  947.     struct ureg_src rTexSrc;
  948.     struct ureg_src cBEM[8];
  949.     struct ureg_src s[8];
  950.  
  951.     struct {
  952.         unsigned index;
  953.         unsigned index_pre_mod;
  954.         unsigned num_regs;
  955.     } stage;
  956. };
  957.  
  958. static struct ureg_src
  959. ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
  960. {
  961.     struct ureg_src reg;
  962.  
  963.     switch (ta & D3DTA_SELECTMASK) {
  964.     case D3DTA_CONSTANT:
  965.         reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
  966.         break;
  967.     case D3DTA_CURRENT:
  968.         reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
  969.         break;
  970.     case D3DTA_DIFFUSE:
  971.         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_PERSPECTIVE);
  972.         break;
  973.     case D3DTA_SPECULAR:
  974.         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
  975.         break;
  976.     case D3DTA_TEMP:
  977.         reg = ps->rTmpSrc;
  978.         break;
  979.     case D3DTA_TEXTURE:
  980.         reg = ps->rTexSrc;
  981.         break;
  982.     case D3DTA_TFACTOR:
  983.         reg = ureg_DECL_constant(ps->ureg, 20);
  984.         break;
  985.     default:
  986.         assert(0);
  987.         reg = ureg_src_undef();
  988.         break;
  989.     }
  990.     if (ta & D3DTA_COMPLEMENT) {
  991.         struct ureg_dst dst = ps->r[ps->stage.num_regs++];
  992.         ureg_SUB(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), reg);
  993.         reg = ureg_src(dst);
  994.     }
  995.     if (ta & D3DTA_ALPHAREPLICATE)
  996.         reg = _WWWW(reg);
  997.     return reg;
  998. }
  999.  
  1000. static struct ureg_dst
  1001. ps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
  1002. {
  1003.     assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));
  1004.  
  1005.     switch (ta & D3DTA_SELECTMASK) {
  1006.     case D3DTA_CURRENT:
  1007.         return ps->rCur;
  1008.     case D3DTA_TEMP:
  1009.         return ps->rTmp;
  1010.     default:
  1011.         assert(0);
  1012.         return ureg_dst_undef();
  1013.     }
  1014. }
  1015.  
  1016. static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
  1017. {
  1018.     switch (top) {
  1019.     case D3DTOP_DISABLE:
  1020.         return 0x0;
  1021.     case D3DTOP_SELECTARG1:
  1022.     case D3DTOP_PREMODULATE:
  1023.         return 0x2;
  1024.     case D3DTOP_SELECTARG2:
  1025.         return 0x4;
  1026.     case D3DTOP_MULTIPLYADD:
  1027.     case D3DTOP_LERP:
  1028.         return 0x7;
  1029.     default:
  1030.         return 0x6;
  1031.     }
  1032. }
  1033.  
  1034. static INLINE boolean
  1035. is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
  1036. {
  1037.     return !dst.WriteMask ||
  1038.         (dst.File == src.File &&
  1039.          dst.Index == src.Index &&
  1040.          !dst.Indirect &&
  1041.          !dst.Saturate &&
  1042.          !src.Indirect &&
  1043.          !src.Negate &&
  1044.          !src.Absolute &&
  1045.          (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
  1046.          (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
  1047.          (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
  1048.          (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));
  1049.  
  1050. }
  1051.  
  1052. static void
  1053. ps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
  1054. {
  1055.     struct ureg_program *ureg = ps->ureg;
  1056.     struct ureg_dst tmp = ps->r[ps->stage.num_regs];
  1057.     struct ureg_dst tmp2 = ps->r[ps->stage.num_regs+1];
  1058.     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
  1059.  
  1060.     tmp.WriteMask = dst.WriteMask;
  1061.  
  1062.     if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
  1063.         top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
  1064.         top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
  1065.         top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
  1066.         top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
  1067.         top != D3DTOP_LERP)
  1068.         dst = ureg_saturate(dst);
  1069.  
  1070.     switch (top) {
  1071.     case D3DTOP_SELECTARG1:
  1072.         if (!is_MOV_no_op(dst, arg[1]))
  1073.             ureg_MOV(ureg, dst, arg[1]);
  1074.         break;
  1075.     case D3DTOP_SELECTARG2:
  1076.         if (!is_MOV_no_op(dst, arg[2]))
  1077.             ureg_MOV(ureg, dst, arg[2]);
  1078.         break;
  1079.     case D3DTOP_MODULATE:
  1080.         ureg_MUL(ureg, dst, arg[1], arg[2]);
  1081.         break;
  1082.     case D3DTOP_MODULATE2X:
  1083.         ureg_MUL(ureg, tmp, arg[1], arg[2]);
  1084.         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
  1085.         break;
  1086.     case D3DTOP_MODULATE4X:
  1087.         ureg_MUL(ureg, tmp, arg[1], arg[2]);
  1088.         ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
  1089.         break;
  1090.     case D3DTOP_ADD:
  1091.         ureg_ADD(ureg, dst, arg[1], arg[2]);
  1092.         break;
  1093.     case D3DTOP_ADDSIGNED:
  1094.         ureg_ADD(ureg, tmp, arg[1], arg[2]);
  1095.         ureg_SUB(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
  1096.         break;
  1097.     case D3DTOP_ADDSIGNED2X:
  1098.         ureg_ADD(ureg, tmp, arg[1], arg[2]);
  1099.         ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
  1100.         break;
  1101.     case D3DTOP_SUBTRACT:
  1102.         ureg_SUB(ureg, dst, arg[1], arg[2]);
  1103.         break;
  1104.     case D3DTOP_ADDSMOOTH:
  1105.         ureg_SUB(ureg, tmp, ureg_imm1f(ureg, 1.0f), arg[1]);
  1106.         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
  1107.         break;
  1108.     case D3DTOP_BLENDDIFFUSEALPHA:
  1109.         ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
  1110.         break;
  1111.     case D3DTOP_BLENDTEXTUREALPHA:
  1112.         /* XXX: alpha taken from previous stage, texture or result ? */
  1113.         ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
  1114.         break;
  1115.     case D3DTOP_BLENDFACTORALPHA:
  1116.         ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
  1117.         break;
  1118.     case D3DTOP_BLENDTEXTUREALPHAPM:
  1119.         ureg_SUB(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), _W(ps->rTex));
  1120.         ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
  1121.         break;
  1122.     case D3DTOP_BLENDCURRENTALPHA:
  1123.         ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
  1124.         break;
  1125.     case D3DTOP_PREMODULATE:
  1126.         ureg_MOV(ureg, dst, arg[1]);
  1127.         ps->stage.index_pre_mod = ps->stage.index + 1;
  1128.         break;
  1129.     case D3DTOP_MODULATEALPHA_ADDCOLOR:
  1130.         ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
  1131.         break;
  1132.     case D3DTOP_MODULATECOLOR_ADDALPHA:
  1133.         ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
  1134.         break;
  1135.     case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
  1136.         ureg_SUB(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), _WWWW(arg[1]));
  1137.         ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
  1138.         break;
  1139.     case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
  1140.         ureg_SUB(ureg, tmp, ureg_imm1f(ureg, 1.0f), arg[1]);
  1141.         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
  1142.         break;
  1143.     case D3DTOP_BUMPENVMAP:
  1144.         break;
  1145.     case D3DTOP_BUMPENVMAPLUMINANCE:
  1146.         break;
  1147.     case D3DTOP_DOTPRODUCT3:
  1148.         ureg_SUB(ureg, tmp, arg[1], ureg_imm4f(ureg,0.5,0.5,0.5,0.5));
  1149.         ureg_SUB(ureg, tmp2, arg[2] , ureg_imm4f(ureg,0.5,0.5,0.5,0.5));
  1150.         ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
  1151.         ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
  1152.         break;
  1153.     case D3DTOP_MULTIPLYADD:
  1154.         ureg_MAD(ureg, dst, arg[1], arg[2], arg[0]);
  1155.         break;
  1156.     case D3DTOP_LERP:
  1157.         ureg_LRP(ureg, dst, arg[0], arg[1], arg[2]);
  1158.         break;
  1159.     case D3DTOP_DISABLE:
  1160.         /* no-op ? */
  1161.         break;
  1162.     default:
  1163.         assert(!"invalid D3DTOP");
  1164.         break;
  1165.     }
  1166. }
  1167.  
  1168. static void *
  1169. nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
  1170. {
  1171.     struct ps_build_ctx ps;
  1172.     struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
  1173.     struct ureg_dst oCol;
  1174.     unsigned i, s;
  1175.     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
  1176.  
  1177.     memset(&ps, 0, sizeof(ps));
  1178.     ps.ureg = ureg;
  1179.     ps.stage.index_pre_mod = -1;
  1180.  
  1181.     ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_PERSPECTIVE);
  1182.  
  1183.     /* Declare all TEMPs we might need, serious drivers have a register allocator. */
  1184.     for (i = 0; i < Elements(ps.r); ++i)
  1185.         ps.r[i] = ureg_DECL_local_temporary(ureg);
  1186.     ps.rCur = ps.r[0];
  1187.     ps.rTmp = ps.r[1];
  1188.     ps.rTex = ps.r[2];
  1189.     ps.rCurSrc = ureg_src(ps.rCur);
  1190.     ps.rTmpSrc = ureg_src(ps.rTmp);
  1191.     ps.rTexSrc = ureg_src(ps.rTex);
  1192.  
  1193.     for (s = 0; s < 8; ++s) {
  1194.         ps.s[s] = ureg_src_undef();
  1195.  
  1196.         if (key->ts[s].colorop != D3DTOP_DISABLE) {
  1197.             if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
  1198.                 key->ts[s].colorarg1 == D3DTA_SPECULAR ||
  1199.                 key->ts[s].colorarg2 == D3DTA_SPECULAR)
  1200.                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
  1201.  
  1202.             if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
  1203.                 key->ts[s].colorarg1 == D3DTA_TEXTURE ||
  1204.                 key->ts[s].colorarg2 == D3DTA_TEXTURE) {
  1205.                 ps.s[s] = ureg_DECL_sampler(ureg, s);
  1206.                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
  1207.             }
  1208.             if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
  1209.                       key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
  1210.                 ps.s[s] = ureg_DECL_sampler(ureg, s);
  1211.         }
  1212.  
  1213.         if (key->ts[s].alphaop != D3DTOP_DISABLE) {
  1214.             if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
  1215.                 key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
  1216.                 key->ts[s].alphaarg2 == D3DTA_SPECULAR)
  1217.                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
  1218.  
  1219.             if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
  1220.                 key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
  1221.                 key->ts[s].alphaarg2 == D3DTA_TEXTURE) {
  1222.                 ps.s[s] = ureg_DECL_sampler(ureg, s);
  1223.                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
  1224.             }
  1225.         }
  1226.     }
  1227.     if (key->specular)
  1228.         ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_PERSPECTIVE);
  1229.  
  1230.     oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
  1231.  
  1232.     if (key->ts[0].colorop == D3DTOP_DISABLE &&
  1233.         key->ts[0].alphaop == D3DTOP_DISABLE)
  1234.         ureg_MOV(ureg, ps.rCur, ps.vC[0]);
  1235.     /* Or is it undefined then ? */
  1236.  
  1237.     /* Run stages.
  1238.      */
  1239.     for (s = 0; s < 8; ++s) {
  1240.         unsigned colorarg[3];
  1241.         unsigned alphaarg[3];
  1242.         const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
  1243.         const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
  1244.         struct ureg_dst dst;
  1245.         struct ureg_src arg[3];
  1246.  
  1247.         if (key->ts[s].colorop == D3DTOP_DISABLE &&
  1248.             key->ts[s].alphaop == D3DTOP_DISABLE)
  1249.             continue;
  1250.         ps.stage.index = s;
  1251.         ps.stage.num_regs = 3;
  1252.  
  1253.         DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
  1254.             nine_D3DTOP_to_str(key->ts[s].colorop),
  1255.             nine_D3DTOP_to_str(key->ts[s].alphaop));
  1256.  
  1257.         if (!ureg_src_is_undef(ps.s[s])) {
  1258.             unsigned target;
  1259.             switch (key->ts[s].textarget) {
  1260.             case 0: target = TGSI_TEXTURE_1D; break;
  1261.             case 1: target = TGSI_TEXTURE_2D; break;
  1262.             case 2: target = TGSI_TEXTURE_3D; break;
  1263.             case 3: target = TGSI_TEXTURE_CUBE; break;
  1264.             /* this is a 2 bit bitfield, do I really need a default case ? */
  1265.             }
  1266.  
  1267.             /* sample the texture */
  1268.             if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
  1269.                 key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
  1270.             }
  1271.             if (key->ts[s].projected)
  1272.                 ureg_TXP(ureg, ps.rTex, target, ps.vT[s], ps.s[s]);
  1273.             else
  1274.                 ureg_TEX(ureg, ps.rTex, target, ps.vT[s], ps.s[s]);
  1275.         }
  1276.  
  1277.         if (s == 0 &&
  1278.             (key->ts[0].resultarg != 0 /* not current */ ||
  1279.              key->ts[0].colorop == D3DTOP_DISABLE ||
  1280.              key->ts[0].alphaop == D3DTOP_DISABLE ||
  1281.              key->ts[0].colorop == D3DTOP_BLENDCURRENTALPHA ||
  1282.              key->ts[0].alphaop == D3DTOP_BLENDCURRENTALPHA ||
  1283.              key->ts[0].colorarg0 == D3DTA_CURRENT ||
  1284.              key->ts[0].colorarg1 == D3DTA_CURRENT ||
  1285.              key->ts[0].colorarg2 == D3DTA_CURRENT ||
  1286.              key->ts[0].alphaarg0 == D3DTA_CURRENT ||
  1287.              key->ts[0].alphaarg1 == D3DTA_CURRENT ||
  1288.              key->ts[0].alphaarg2 == D3DTA_CURRENT)
  1289.            ) {
  1290.             /* Initialize D3DTA_CURRENT.
  1291.              * (Yes we can do this before the loop but not until
  1292.              *  NVE4 has an instruction scheduling pass.)
  1293.              */
  1294.             ureg_MOV(ureg, ps.rCur, ps.vC[0]);
  1295.         }
  1296.  
  1297.         dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);
  1298.  
  1299.         if (ps.stage.index_pre_mod == ps.stage.index) {
  1300.             ps.rMod = ps.r[ps.stage.num_regs++];
  1301.             ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
  1302.         }
  1303.  
  1304.         colorarg[0] = (key->ts[s].colorarg0 | ((key->colorarg_b4[0] >> s) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
  1305.         colorarg[1] = (key->ts[s].colorarg1 | ((key->colorarg_b4[1] >> s) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
  1306.         colorarg[2] = (key->ts[s].colorarg2 | ((key->colorarg_b4[2] >> s) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
  1307.         alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
  1308.         alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
  1309.         alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;
  1310.  
  1311.         if (key->ts[s].colorop != key->ts[s].alphaop ||
  1312.             colorarg[0] != alphaarg[0] ||
  1313.             colorarg[1] != alphaarg[1] ||
  1314.             colorarg[2] != alphaarg[2])
  1315.             dst.WriteMask = TGSI_WRITEMASK_XYZ;
  1316.  
  1317.         if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
  1318.         if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
  1319.         if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
  1320.         ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);
  1321.  
  1322.         if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
  1323.             dst.WriteMask = TGSI_WRITEMASK_W;
  1324.  
  1325.             if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
  1326.             if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
  1327.             if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
  1328.             ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
  1329.         }
  1330.     }
  1331.  
  1332.     if (key->specular)
  1333.         ureg_ADD(ureg, ps.rCur, ps.rCurSrc, ps.vC[1]);
  1334.  
  1335.     /* Fog.
  1336.      */
  1337.     if (key->fog_mode) {
  1338.         struct ureg_src vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0, TGSI_INTERPOLATE_LINEAR);
  1339.         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
  1340.         if (key->fog_mode == D3DFOG_EXP) {
  1341.             ureg_MUL(ureg, rFog, _ZZZZ(vPos), _ZZZZ(_CONST(22)));
  1342.             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
  1343.             ureg_EX2(ureg, rFog, _X(rFog));
  1344.         } else
  1345.         if (key->fog_mode == D3DFOG_EXP2) {
  1346.             ureg_MUL(ureg, rFog, _ZZZZ(vPos), _ZZZZ(_CONST(22)));
  1347.             ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
  1348.             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
  1349.             ureg_EX2(ureg, rFog, _X(rFog));
  1350.         } else
  1351.         if (key->fog_mode == D3DFOG_LINEAR) {
  1352.             ureg_SUB(ureg, rFog, _XXXX(_CONST(22)), _ZZZZ(vPos));
  1353.             ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
  1354.         }
  1355.         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
  1356.         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
  1357.     } else
  1358.     if (key->fog) {
  1359.         struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_FOG, 0, TGSI_INTERPOLATE_PERSPECTIVE);
  1360.         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
  1361.         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
  1362.     } else {
  1363.         ureg_MOV(ureg, oCol, ps.rCurSrc);
  1364.     }
  1365.  
  1366.     ureg_END(ureg);
  1367.     nine_ureg_tgsi_dump(ureg, FALSE);
  1368.     return ureg_create_shader_and_destroy(ureg, device->pipe);
  1369. }
  1370.  
  1371. static struct NineVertexShader9 *
  1372. nine_ff_get_vs(struct NineDevice9 *device)
  1373. {
  1374.     const struct nine_state *state = &device->state;
  1375.     struct NineVertexShader9 *vs;
  1376.     enum pipe_error err;
  1377.     struct vs_build_ctx bld;
  1378.     struct nine_ff_vs_key key;
  1379.     unsigned s, i;
  1380.     char input_texture_coord[8];
  1381.  
  1382.     assert(sizeof(key) <= sizeof(key.value32));
  1383.  
  1384.     memset(&key, 0, sizeof(key));
  1385.     memset(&bld, 0, sizeof(bld));
  1386.     memset(&input_texture_coord, 0, sizeof(input_texture_coord));
  1387.  
  1388.     bld.key = &key;
  1389.  
  1390.     /* FIXME: this shouldn't be NULL, but it is on init */
  1391.     if (state->vdecl) {
  1392.         key.color0in_one = 1;
  1393.         key.color1in_one = 1;
  1394.         for (i = 0; i < state->vdecl->nelems; i++) {
  1395.             uint16_t usage = state->vdecl->usage_map[i];
  1396.             if (usage == NINE_DECLUSAGE_POSITIONT)
  1397.                 key.position_t = 1;
  1398.             else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
  1399.                 key.color0in_one = 0;
  1400.             else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
  1401.                 key.color1in_one = 0;
  1402.             else if (usage == NINE_DECLUSAGE_PSIZE)
  1403.                 key.vertexpointsize = 1;
  1404.             else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
  1405.                 s = usage / NINE_DECLUSAGE_COUNT;
  1406.                 if (s < 8)
  1407.                     input_texture_coord[s] = 1;
  1408.                 else
  1409.                     DBG("FF given texture coordinate >= 8. Ignoring\n");
  1410.             }
  1411.         }
  1412.     }
  1413.     if (!key.vertexpointsize)
  1414.         key.pointscale = !!state->rs[D3DRS_POINTSCALEENABLE];
  1415.  
  1416.     key.lighting = !!state->rs[D3DRS_LIGHTING] &&  state->ff.num_lights_active;
  1417.     key.darkness = !!state->rs[D3DRS_LIGHTING] && !state->ff.num_lights_active;
  1418.     if (key.position_t) {
  1419.         key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
  1420.         key.lighting = 0;
  1421.     }
  1422.     if ((key.lighting | key.darkness) && state->rs[D3DRS_COLORVERTEX]) {
  1423.         key.mtl_diffuse = state->rs[D3DRS_DIFFUSEMATERIALSOURCE];
  1424.         key.mtl_ambient = state->rs[D3DRS_AMBIENTMATERIALSOURCE];
  1425.         key.mtl_specular = state->rs[D3DRS_SPECULARMATERIALSOURCE];
  1426.         key.mtl_emissive = state->rs[D3DRS_EMISSIVEMATERIALSOURCE];
  1427.     }
  1428.     key.fog_mode = state->rs[D3DRS_FOGENABLE] ? state->rs[D3DRS_FOGVERTEXMODE] : 0;
  1429.     if (key.fog_mode)
  1430.         key.fog_range = !key.position_t && state->rs[D3DRS_RANGEFOGENABLE];
  1431.  
  1432.     if (state->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
  1433.         key.vertexblend_indexed = !!state->rs[D3DRS_INDEXEDVERTEXBLENDENABLE];
  1434.  
  1435.         switch (state->rs[D3DRS_VERTEXBLEND]) {
  1436.         case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
  1437.         case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
  1438.         case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
  1439.         case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
  1440.         case D3DVBF_TWEENING: key.vertextween = 1; break;
  1441.         default:
  1442.             assert(!"invalid D3DVBF");
  1443.             break;
  1444.         }
  1445.     }
  1446.  
  1447.     for (s = 0; s < 8; ++s) {
  1448.         unsigned gen = (state->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
  1449.         unsigned dim = MIN2(state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7, 4);
  1450.  
  1451.         if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
  1452.             gen = NINED3DTSS_TCI_PASSTHRU;
  1453.  
  1454.         if (!input_texture_coord[s] && gen == NINED3DTSS_TCI_PASSTHRU)
  1455.             gen = NINED3DTSS_TCI_DISABLE;
  1456.  
  1457.         key.tc_gen |= gen << (s * 3);
  1458.         key.tc_idx |= (state->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7) << (s * 3);
  1459.         key.tc_dim |= dim << (s * 3);
  1460.     }
  1461.  
  1462.     vs = util_hash_table_get(device->ff.ht_vs, &key);
  1463.     if (vs)
  1464.         return vs;
  1465.     NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));
  1466.  
  1467.     nine_ff_prune_vs(device);
  1468.     if (vs) {
  1469.         unsigned n;
  1470.  
  1471.         memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
  1472.  
  1473.         err = util_hash_table_set(device->ff.ht_vs, &vs->ff_key, vs);
  1474.         assert(err == PIPE_OK);
  1475.         device->ff.num_vs++;
  1476.         NineUnknown_ConvertRefToBind(NineUnknown(vs));
  1477.  
  1478.         vs->num_inputs = bld.num_inputs;
  1479.         for (n = 0; n < bld.num_inputs; ++n)
  1480.             vs->input_map[n].ndecl = bld.input[n];
  1481.  
  1482.         vs->position_t = key.position_t;
  1483.         vs->point_size = key.vertexpointsize | key.pointscale;
  1484.     }
  1485.     return vs;
  1486. }
  1487.  
  1488. static struct NinePixelShader9 *
  1489. nine_ff_get_ps(struct NineDevice9 *device)
  1490. {
  1491.     struct nine_state *state = &device->state;
  1492.     struct NinePixelShader9 *ps;
  1493.     enum pipe_error err;
  1494.     struct nine_ff_ps_key key;
  1495.     unsigned s;
  1496.     uint8_t sampler_mask = 0;
  1497.  
  1498.     assert(sizeof(key) <= sizeof(key.value32));
  1499.  
  1500.     memset(&key, 0, sizeof(key));
  1501.     for (s = 0; s < 8; ++s) {
  1502.         key.ts[s].colorop = state->ff.tex_stage[s][D3DTSS_COLOROP];
  1503.         key.ts[s].alphaop = state->ff.tex_stage[s][D3DTSS_ALPHAOP];
  1504.         /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages. */
  1505.         /* ALPHAOP cannot be disabled if COLOROP is enabled. */
  1506.         if (key.ts[s].colorop == D3DTOP_DISABLE) {
  1507.             key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
  1508.             break;
  1509.         }
  1510.  
  1511.         if (!state->texture[s] &&
  1512.             state->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE) {
  1513.             /* This should also disable the stage. */
  1514.             key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
  1515.             break;
  1516.         }
  1517.  
  1518.         if (state->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE)
  1519.             sampler_mask |= (1 << s);
  1520.  
  1521.         if (key.ts[s].colorop != D3DTOP_DISABLE) {
  1522.             uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
  1523.             if (used_c & 0x1) key.ts[s].colorarg0 = state->ff.tex_stage[s][D3DTSS_COLORARG0];
  1524.             if (used_c & 0x2) key.ts[s].colorarg1 = state->ff.tex_stage[s][D3DTSS_COLORARG1];
  1525.             if (used_c & 0x4) key.ts[s].colorarg2 = state->ff.tex_stage[s][D3DTSS_COLORARG2];
  1526.             if (used_c & 0x1) key.colorarg_b4[0] |= (state->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) << s;
  1527.             if (used_c & 0x1) key.colorarg_b5[0] |= (state->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) << s;
  1528.             if (used_c & 0x2) key.colorarg_b4[1] |= (state->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) << s;
  1529.             if (used_c & 0x2) key.colorarg_b5[1] |= (state->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) << s;
  1530.             if (used_c & 0x4) key.colorarg_b4[2] |= (state->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) << s;
  1531.             if (used_c & 0x4) key.colorarg_b5[2] |= (state->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) << s;
  1532.         }
  1533.         if (key.ts[s].alphaop != D3DTOP_DISABLE) {
  1534.             uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
  1535.             if (used_a & 0x1) key.ts[s].alphaarg0 = state->ff.tex_stage[s][D3DTSS_ALPHAARG0];
  1536.             if (used_a & 0x2) key.ts[s].alphaarg1 = state->ff.tex_stage[s][D3DTSS_ALPHAARG1];
  1537.             if (used_a & 0x4) key.ts[s].alphaarg2 = state->ff.tex_stage[s][D3DTSS_ALPHAARG2];
  1538.             if (used_a & 0x1) key.alphaarg_b4[0] |= (state->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) << s;
  1539.             if (used_a & 0x2) key.alphaarg_b4[1] |= (state->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) << s;
  1540.             if (used_a & 0x4) key.alphaarg_b4[2] |= (state->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) << s;
  1541.         }
  1542.         key.ts[s].resultarg = state->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
  1543.  
  1544.         key.ts[s].projected = !!(state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & D3DTTFF_PROJECTED);
  1545.  
  1546.         if (state->texture[s]) {
  1547.             switch (state->texture[s]->base.type) {
  1548.             case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
  1549.             case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
  1550.             case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
  1551.             default:
  1552.                 assert(!"unexpected texture type");
  1553.                 break;
  1554.             }
  1555.         } else {
  1556.             key.ts[s].textarget = 1;
  1557.         }
  1558.     }
  1559.     for (; s < 8; ++s)
  1560.         key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
  1561.     if (state->rs[D3DRS_FOGENABLE])
  1562.         key.fog_mode = state->rs[D3DRS_FOGTABLEMODE];
  1563.  
  1564.     ps = util_hash_table_get(device->ff.ht_ps, &key);
  1565.     if (ps)
  1566.         return ps;
  1567.     NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));
  1568.  
  1569.     nine_ff_prune_ps(device);
  1570.     if (ps) {
  1571.         memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
  1572.  
  1573.         err = util_hash_table_set(device->ff.ht_ps, &ps->ff_key, ps);
  1574.         assert(err == PIPE_OK);
  1575.         device->ff.num_ps++;
  1576.         NineUnknown_ConvertRefToBind(NineUnknown(ps));
  1577.  
  1578.         ps->rt_mask = 0x1;
  1579.         ps->sampler_mask = sampler_mask;
  1580.     }
  1581.     return ps;
  1582. }
  1583.  
  1584. #define GET_D3DTS(n) nine_state_access_transform(state, D3DTS_##n, FALSE)
  1585. #define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1 << ((D3DTS_##n) % 32)))
  1586. static void
  1587. nine_ff_load_vs_transforms(struct NineDevice9 *device)
  1588. {
  1589.     struct nine_state *state = &device->state;
  1590.     D3DMATRIX T;
  1591.     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
  1592.     unsigned i;
  1593.  
  1594.     /* TODO: make this nicer, and only upload the ones we need */
  1595.     /* TODO: use ff.vs_const as storage of W, V, P matrices */
  1596.  
  1597.     if (IS_D3DTS_DIRTY(state, WORLD) ||
  1598.         IS_D3DTS_DIRTY(state, VIEW) ||
  1599.         IS_D3DTS_DIRTY(state, PROJECTION)) {
  1600.         /* WVP, WV matrices */
  1601.         nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
  1602.         nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));
  1603.  
  1604.         /* normal matrix == transpose(inverse(WV)) */
  1605.         nine_d3d_matrix_inverse_3x3(&T, &M[1]);
  1606.         nine_d3d_matrix_transpose(&M[4], &T);
  1607.  
  1608.         /* VP matrix */
  1609.         nine_d3d_matrix_matrix_mul(&M[2], GET_D3DTS(VIEW), GET_D3DTS(PROJECTION));
  1610.  
  1611.         /* V and W matrix */
  1612.         M[3] = *GET_D3DTS(VIEW);
  1613.         M[56] = *GET_D3DTS(WORLD);
  1614.     }
  1615.  
  1616.     if (state->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
  1617.         /* load other world matrices */
  1618.         for (i = 1; i <= 7; ++i)
  1619.             M[56 + i] = *GET_D3DTS(WORLDMATRIX(i));
  1620.     }
  1621.  
  1622.     device->ff.vs_const[30 * 4] = asfloat(state->rs[D3DRS_TWEENFACTOR]);
  1623. }
  1624.  
  1625. static void
  1626. nine_ff_load_lights(struct NineDevice9 *device)
  1627. {
  1628.     struct nine_state *state = &device->state;
  1629.     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
  1630.     unsigned l;
  1631.  
  1632.     if (state->changed.group & NINE_STATE_FF_MATERIAL) {
  1633.         const D3DMATERIAL9 *mtl = &state->ff.material;
  1634.  
  1635.         memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
  1636.         memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
  1637.         memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
  1638.         dst[23].x = mtl->Power;
  1639.         memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
  1640.         d3dcolor_to_rgba(&dst[25].x, state->rs[D3DRS_AMBIENT]);
  1641.         dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
  1642.         dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
  1643.         dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
  1644.         dst[19].w = mtl->Ambient.a + mtl->Emissive.a;
  1645.     }
  1646.  
  1647.     if (!(state->changed.group & NINE_STATE_FF_LIGHTING))
  1648.         return;
  1649.  
  1650.     for (l = 0; l < state->ff.num_lights_active; ++l) {
  1651.         const D3DLIGHT9 *light = &state->ff.light[state->ff.active_light[l]];
  1652.  
  1653.         dst[32 + l * 8].x = light->Type;
  1654.         dst[32 + l * 8].y = light->Attenuation0;
  1655.         dst[32 + l * 8].z = light->Attenuation1;
  1656.         dst[32 + l * 8].w = light->Attenuation2;
  1657.         memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
  1658.         memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
  1659.         memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
  1660.         nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
  1661.         nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
  1662.         dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
  1663.         dst[37 + l * 8].w = light->Falloff;
  1664.         dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
  1665.         dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
  1666.         dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
  1667.         dst[39 + l * 8].w = (l + 1) == state->ff.num_lights_active;
  1668.     }
  1669. }
  1670.  
  1671. static void
  1672. nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
  1673. {
  1674.     const struct nine_state *state = &device->state;
  1675.     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
  1676.  
  1677.     if (!(state->changed.group & NINE_STATE_FF_OTHER))
  1678.         return;
  1679.     dst[26].x = asfloat(state->rs[D3DRS_POINTSIZE_MIN]);
  1680.     dst[26].y = asfloat(state->rs[D3DRS_POINTSIZE_MAX]);
  1681.     dst[26].z = asfloat(state->rs[D3DRS_POINTSIZE]);
  1682.     dst[26].w = asfloat(state->rs[D3DRS_POINTSCALE_A]);
  1683.     dst[27].x = asfloat(state->rs[D3DRS_POINTSCALE_B]);
  1684.     dst[27].y = asfloat(state->rs[D3DRS_POINTSCALE_C]);
  1685.     dst[28].x = asfloat(state->rs[D3DRS_FOGEND]);
  1686.     dst[28].y = 1.0f / (asfloat(state->rs[D3DRS_FOGEND]) - asfloat(state->rs[D3DRS_FOGSTART]));
  1687.     if (isinf(dst[28].y))
  1688.         dst[28].y = 0.0f;
  1689.     dst[28].z = asfloat(state->rs[D3DRS_FOGDENSITY]);
  1690.     d3dcolor_to_rgba(&dst[29].x, state->rs[D3DRS_FOGCOLOR]);
  1691. }
  1692.  
  1693. static void
  1694. nine_ff_load_tex_matrices(struct NineDevice9 *device)
  1695. {
  1696.     struct nine_state *state = &device->state;
  1697.     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
  1698.     unsigned s;
  1699.  
  1700.     if (!(state->ff.changed.transform[0] & 0xff0000))
  1701.         return;
  1702.     for (s = 0; s < 8; ++s) {
  1703.         if (IS_D3DTS_DIRTY(state, TEXTURE0 + s))
  1704.             M[32 + s] = *nine_state_access_transform(state, D3DTS_TEXTURE0 + s, FALSE);
  1705.     }
  1706. }
  1707.  
  1708. static void
  1709. nine_ff_load_ps_params(struct NineDevice9 *device)
  1710. {
  1711.     const struct nine_state *state = &device->state;
  1712.     struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
  1713.     unsigned s;
  1714.  
  1715.     if (!(state->changed.group & (NINE_STATE_FF_PSSTAGES | NINE_STATE_FF_OTHER)))
  1716.         return;
  1717.  
  1718.     for (s = 0; s < 8; ++s)
  1719.         d3dcolor_to_rgba(&dst[s].x, state->ff.tex_stage[s][D3DTSS_CONSTANT]);
  1720.  
  1721.     for (s = 0; s < 8; ++s) {
  1722.         dst[8 + s].x = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
  1723.         dst[8 + s].y = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
  1724.         dst[8 + s].z = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
  1725.         dst[8 + s].w = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
  1726.         if (s & 1) {
  1727.             dst[8 + s / 2].z = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
  1728.             dst[8 + s / 2].w = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
  1729.         } else {
  1730.             dst[8 + s / 2].x = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
  1731.             dst[8 + s / 2].y = asfloat(state->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
  1732.         }
  1733.     }
  1734.  
  1735.     d3dcolor_to_rgba(&dst[20].x, state->rs[D3DRS_TEXTUREFACTOR]);
  1736.     d3dcolor_to_rgba(&dst[21].x, state->rs[D3DRS_FOGCOLOR]);
  1737.     dst[22].x = asfloat(state->rs[D3DRS_FOGEND]);
  1738.     dst[22].y = 1.0f / (asfloat(state->rs[D3DRS_FOGEND]) - asfloat(state->rs[D3DRS_FOGSTART]));
  1739.     dst[22].z = asfloat(state->rs[D3DRS_FOGDENSITY]);
  1740. }
  1741.  
  1742. static void
  1743. nine_ff_load_viewport_info(struct NineDevice9 *device)
  1744. {
  1745.     D3DVIEWPORT9 *viewport = &device->state.viewport;
  1746.     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
  1747.     float diffZ = viewport->MaxZ - viewport->MinZ;
  1748.  
  1749.     /* Note: the other functions avoids to fill the const again if nothing changed.
  1750.      * But we don't have much to fill, and adding code to allow that may be complex
  1751.      * so just fill it always */
  1752.     dst[100].x = 2.0f / (float)(viewport->Width);
  1753.     dst[100].y = 2.0f / (float)(viewport->Height);
  1754.     dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
  1755.     dst[101].x = (float)(viewport->X);
  1756.     dst[101].y = (float)(viewport->Y);
  1757.     dst[101].z = (float)(viewport->MinZ);
  1758. }
  1759.  
  1760. void
  1761. nine_ff_update(struct NineDevice9 *device)
  1762. {
  1763.     struct pipe_context *pipe = device->pipe;
  1764.     struct nine_state *state = &device->state;
  1765.  
  1766.     DBG("vs=%p ps=%p\n", device->state.vs, device->state.ps);
  1767.  
  1768.     /* NOTE: the only reference belongs to the hash table */
  1769.     if (!device->state.vs)
  1770.         device->ff.vs = nine_ff_get_vs(device);
  1771.     if (!device->state.ps)
  1772.         device->ff.ps = nine_ff_get_ps(device);
  1773.  
  1774.     if (!device->state.vs) {
  1775.         if (device->state.ff.clobber.vs_const) {
  1776.             device->state.ff.clobber.vs_const = FALSE;
  1777.             device->state.changed.group |=
  1778.                 NINE_STATE_FF_VSTRANSF |
  1779.                 NINE_STATE_FF_MATERIAL |
  1780.                 NINE_STATE_FF_LIGHTING |
  1781.                 NINE_STATE_FF_OTHER;
  1782.             device->state.ff.changed.transform[0] |= 0xff000c;
  1783.             device->state.ff.changed.transform[8] |= 0xff;
  1784.         }
  1785.         nine_ff_load_vs_transforms(device);
  1786.         nine_ff_load_tex_matrices(device);
  1787.         nine_ff_load_lights(device);
  1788.         nine_ff_load_point_and_fog_params(device);
  1789.         nine_ff_load_viewport_info(device);
  1790.  
  1791.         memset(state->ff.changed.transform, 0, sizeof(state->ff.changed.transform));
  1792.  
  1793.         device->state.changed.group |= NINE_STATE_VS;
  1794.         device->state.changed.group |= NINE_STATE_VS_CONST;
  1795.  
  1796.         if (device->prefer_user_constbuf) {
  1797.             struct pipe_context *pipe = device->pipe;
  1798.             struct pipe_constant_buffer cb;
  1799.             cb.buffer_offset = 0;
  1800.             cb.buffer = NULL;
  1801.             cb.user_buffer = device->ff.vs_const;
  1802.             cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
  1803.             pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &cb);
  1804.         } else {
  1805.             struct pipe_box box;
  1806.             u_box_1d(0, NINE_FF_NUM_VS_CONST * 4 * sizeof(float), &box);
  1807.             pipe->transfer_inline_write(pipe, device->constbuf_vs, 0,
  1808.                                         0, &box,
  1809.                                         device->ff.vs_const, 0, 0);
  1810.             nine_ranges_insert(&device->state.changed.vs_const_f, 0, NINE_FF_NUM_VS_CONST,
  1811.                                &device->range_pool);
  1812.         }
  1813.     }
  1814.  
  1815.     if (!device->state.ps) {
  1816.         if (device->state.ff.clobber.ps_const) {
  1817.             device->state.ff.clobber.ps_const = FALSE;
  1818.             device->state.changed.group |=
  1819.                 NINE_STATE_FF_PSSTAGES |
  1820.                 NINE_STATE_FF_OTHER;
  1821.         }
  1822.         nine_ff_load_ps_params(device);
  1823.  
  1824.         device->state.changed.group |= NINE_STATE_PS;
  1825.         device->state.changed.group |= NINE_STATE_PS_CONST;
  1826.  
  1827.         if (device->prefer_user_constbuf) {
  1828.             struct pipe_context *pipe = device->pipe;
  1829.             struct pipe_constant_buffer cb;
  1830.             cb.buffer_offset = 0;
  1831.             cb.buffer = NULL;
  1832.             cb.user_buffer = device->ff.ps_const;
  1833.             cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
  1834.             pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &cb);
  1835.         } else {
  1836.             struct pipe_box box;
  1837.             u_box_1d(0, NINE_FF_NUM_PS_CONST * 4 * sizeof(float), &box);
  1838.             pipe->transfer_inline_write(pipe, device->constbuf_ps, 0,
  1839.                                         0, &box,
  1840.                                         device->ff.ps_const, 0, 0);
  1841.             nine_ranges_insert(&device->state.changed.ps_const_f, 0, NINE_FF_NUM_PS_CONST,
  1842.                                &device->range_pool);
  1843.         }
  1844.     }
  1845.  
  1846.     device->state.changed.group &= ~NINE_STATE_FF;
  1847. }
  1848.  
  1849.  
  1850. boolean
  1851. nine_ff_init(struct NineDevice9 *device)
  1852. {
  1853.     device->ff.ht_vs = util_hash_table_create(nine_ff_vs_key_hash,
  1854.                                               nine_ff_vs_key_comp);
  1855.     device->ff.ht_ps = util_hash_table_create(nine_ff_ps_key_hash,
  1856.                                               nine_ff_ps_key_comp);
  1857.  
  1858.     device->ff.ht_fvf = util_hash_table_create(nine_ff_fvf_key_hash,
  1859.                                                nine_ff_fvf_key_comp);
  1860.  
  1861.     device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
  1862.     device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));
  1863.  
  1864.     return device->ff.ht_vs && device->ff.ht_ps &&
  1865.         device->ff.ht_fvf &&
  1866.         device->ff.vs_const && device->ff.ps_const;
  1867. }
  1868.  
  1869. static enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
  1870. {
  1871.     NineUnknown_Unbind(NineUnknown(value));
  1872.     return PIPE_OK;
  1873. }
  1874.  
  1875. void
  1876. nine_ff_fini(struct NineDevice9 *device)
  1877. {
  1878.     if (device->ff.ht_vs) {
  1879.         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
  1880.         util_hash_table_destroy(device->ff.ht_vs);
  1881.     }
  1882.     if (device->ff.ht_ps) {
  1883.         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
  1884.         util_hash_table_destroy(device->ff.ht_ps);
  1885.     }
  1886.     if (device->ff.ht_fvf) {
  1887.         util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
  1888.         util_hash_table_destroy(device->ff.ht_fvf);
  1889.     }
  1890.     device->ff.vs = NULL; /* destroyed by unbinding from hash table */
  1891.     device->ff.ps = NULL;
  1892.  
  1893.     FREE(device->ff.vs_const);
  1894.     FREE(device->ff.ps_const);
  1895. }
  1896.  
  1897. static void
  1898. nine_ff_prune_vs(struct NineDevice9 *device)
  1899. {
  1900.     if (device->ff.num_vs > 100) {
  1901.         /* could destroy the bound one here, so unbind */
  1902.         device->pipe->bind_vs_state(device->pipe, NULL);
  1903.         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
  1904.         util_hash_table_clear(device->ff.ht_vs);
  1905.         device->ff.num_vs = 0;
  1906.         device->state.changed.group |= NINE_STATE_VS;
  1907.     }
  1908. }
  1909. static void
  1910. nine_ff_prune_ps(struct NineDevice9 *device)
  1911. {
  1912.     if (device->ff.num_ps > 100) {
  1913.         /* could destroy the bound one here, so unbind */
  1914.         device->pipe->bind_fs_state(device->pipe, NULL);
  1915.         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
  1916.         util_hash_table_clear(device->ff.ht_ps);
  1917.         device->ff.num_ps = 0;
  1918.         device->state.changed.group |= NINE_STATE_PS;
  1919.     }
  1920. }
  1921.  
  1922. /* ========================================================================== */
  1923.  
  1924. /* Matrix multiplication:
  1925.  *
  1926.  * in memory: 0 1 2 3 (row major)
  1927.  *            4 5 6 7
  1928.  *            8 9 a b
  1929.  *            c d e f
  1930.  *
  1931.  *    cA cB cC cD
  1932.  * r0             = (r0 * cA) (r0 * cB) . .
  1933.  * r1             = (r1 * cA) (r1 * cB)
  1934.  * r2             = (r2 * cA) .
  1935.  * r3             = (r3 * cA) .
  1936.  *
  1937.  *               r: (11) (12) (13) (14)
  1938.  *                  (21) (22) (23) (24)
  1939.  *                  (31) (32) (33) (34)
  1940.  *                  (41) (42) (43) (44)
  1941.  * l: (11 12 13 14)
  1942.  *    (21 22 23 24)
  1943.  *    (31 32 33 34)
  1944.  *    (41 42 43 44)
  1945.  *
  1946.  * v: (x  y  z  1 )
  1947.  *
  1948.  * t.xyzw = MUL(v.xxxx, r[0]);
  1949.  * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
  1950.  * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
  1951.  * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
  1952.  *
  1953.  * v.x = DP4(v, c[0]);
  1954.  * v.y = DP4(v, c[1]);
  1955.  * v.z = DP4(v, c[2]);
  1956.  * v.w = DP4(v, c[3]) = 1
  1957.  */
  1958.  
  1959. /*
  1960. static void
  1961. nine_D3DMATRIX_print(const D3DMATRIX *M)
  1962. {
  1963.     DBG("\n(%f %f %f %f)\n"
  1964.         "(%f %f %f %f)\n"
  1965.         "(%f %f %f %f)\n"
  1966.         "(%f %f %f %f)\n",
  1967.         M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
  1968.         M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
  1969.         M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
  1970.         M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
  1971. }
  1972. */
  1973.  
  1974. static INLINE float
  1975. nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
  1976. {
  1977.     return A->m[r][0] * B->m[0][c] +
  1978.            A->m[r][1] * B->m[1][c] +
  1979.            A->m[r][2] * B->m[2][c] +
  1980.            A->m[r][3] * B->m[3][c];
  1981. }
  1982.  
  1983. static INLINE float
  1984. nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
  1985. {
  1986.     return v->x * M->m[0][c] +
  1987.            v->y * M->m[1][c] +
  1988.            v->z * M->m[2][c] +
  1989.            1.0f * M->m[3][c];
  1990. }
  1991.  
  1992. static INLINE float
  1993. nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
  1994. {
  1995.     return v->x * M->m[0][c] +
  1996.            v->y * M->m[1][c] +
  1997.            v->z * M->m[2][c];
  1998. }
  1999.  
  2000. void
  2001. nine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
  2002. {
  2003.     D->_11 = nine_DP4_row_col(L, 0, R, 0);
  2004.     D->_12 = nine_DP4_row_col(L, 0, R, 1);
  2005.     D->_13 = nine_DP4_row_col(L, 0, R, 2);
  2006.     D->_14 = nine_DP4_row_col(L, 0, R, 3);
  2007.  
  2008.     D->_21 = nine_DP4_row_col(L, 1, R, 0);
  2009.     D->_22 = nine_DP4_row_col(L, 1, R, 1);
  2010.     D->_23 = nine_DP4_row_col(L, 1, R, 2);
  2011.     D->_24 = nine_DP4_row_col(L, 1, R, 3);
  2012.  
  2013.     D->_31 = nine_DP4_row_col(L, 2, R, 0);
  2014.     D->_32 = nine_DP4_row_col(L, 2, R, 1);
  2015.     D->_33 = nine_DP4_row_col(L, 2, R, 2);
  2016.     D->_34 = nine_DP4_row_col(L, 2, R, 3);
  2017.  
  2018.     D->_41 = nine_DP4_row_col(L, 3, R, 0);
  2019.     D->_42 = nine_DP4_row_col(L, 3, R, 1);
  2020.     D->_43 = nine_DP4_row_col(L, 3, R, 2);
  2021.     D->_44 = nine_DP4_row_col(L, 3, R, 3);
  2022. }
  2023.  
  2024. void
  2025. nine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
  2026. {
  2027.     d->x = nine_DP4_vec_col(v, M, 0);
  2028.     d->y = nine_DP4_vec_col(v, M, 1);
  2029.     d->z = nine_DP4_vec_col(v, M, 2);
  2030. }
  2031.  
  2032. void
  2033. nine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
  2034. {
  2035.     d->x = nine_DP3_vec_col(v, M, 0);
  2036.     d->y = nine_DP3_vec_col(v, M, 1);
  2037.     d->z = nine_DP3_vec_col(v, M, 2);
  2038. }
  2039.  
  2040. void
  2041. nine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
  2042. {
  2043.     unsigned i, j;
  2044.     for (i = 0; i < 4; ++i)
  2045.     for (j = 0; j < 4; ++j)
  2046.         D->m[i][j] = M->m[j][i];
  2047. }
  2048.  
  2049. #define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
  2050.     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
  2051.     if (t > 0.0f) pos += t; else neg += t; } while(0)
  2052.  
  2053. #define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
  2054.     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
  2055.     if (t > 0.0f) neg -= t; else pos -= t; } while(0)
  2056. float
  2057. nine_d3d_matrix_det(const D3DMATRIX *M)
  2058. {
  2059.     float pos = 0.0f;
  2060.     float neg = 0.0f;
  2061.  
  2062.     _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
  2063.     _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
  2064.     _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);
  2065.  
  2066.     _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
  2067.     _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
  2068.     _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);
  2069.  
  2070.     _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
  2071.     _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
  2072.     _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);
  2073.  
  2074.     _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
  2075.     _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
  2076.     _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);
  2077.  
  2078.     _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
  2079.     _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
  2080.     _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);
  2081.  
  2082.     _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
  2083.     _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
  2084.     _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);
  2085.  
  2086.     _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
  2087.     _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
  2088.     _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);
  2089.  
  2090.     _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
  2091.     _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
  2092.     _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);
  2093.  
  2094.     return pos + neg;
  2095. }
  2096.  
  2097. /* XXX: Probably better to just use src/mesa/math/m_matrix.c because
  2098.  * I have no idea where this code came from.
  2099.  */
  2100. void
  2101. nine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
  2102. {
  2103.     int i, k;
  2104.     float det;
  2105.  
  2106.     D->m[0][0] =
  2107.         M->m[1][1] * M->m[2][2] * M->m[3][3] -
  2108.         M->m[1][1] * M->m[3][2] * M->m[2][3] -
  2109.         M->m[1][2] * M->m[2][1] * M->m[3][3] +
  2110.         M->m[1][2] * M->m[3][1] * M->m[2][3] +
  2111.         M->m[1][3] * M->m[2][1] * M->m[3][2] -
  2112.         M->m[1][3] * M->m[3][1] * M->m[2][2];
  2113.  
  2114.     D->m[0][1] =
  2115.        -M->m[0][1] * M->m[2][2] * M->m[3][3] +
  2116.         M->m[0][1] * M->m[3][2] * M->m[2][3] +
  2117.         M->m[0][2] * M->m[2][1] * M->m[3][3] -
  2118.         M->m[0][2] * M->m[3][1] * M->m[2][3] -
  2119.         M->m[0][3] * M->m[2][1] * M->m[3][2] +
  2120.         M->m[0][3] * M->m[3][1] * M->m[2][2];
  2121.  
  2122.     D->m[0][2] =
  2123.         M->m[0][1] * M->m[1][2] * M->m[3][3] -
  2124.         M->m[0][1] * M->m[3][2] * M->m[1][3] -
  2125.         M->m[0][2] * M->m[1][1] * M->m[3][3] +
  2126.         M->m[0][2] * M->m[3][1] * M->m[1][3] +
  2127.         M->m[0][3] * M->m[1][1] * M->m[3][2] -
  2128.         M->m[0][3] * M->m[3][1] * M->m[1][2];
  2129.  
  2130.     D->m[0][3] =
  2131.        -M->m[0][1] * M->m[1][2] * M->m[2][3] +
  2132.         M->m[0][1] * M->m[2][2] * M->m[1][3] +
  2133.         M->m[0][2] * M->m[1][1] * M->m[2][3] -
  2134.         M->m[0][2] * M->m[2][1] * M->m[1][3] -
  2135.         M->m[0][3] * M->m[1][1] * M->m[2][2] +
  2136.         M->m[0][3] * M->m[2][1] * M->m[1][2];
  2137.  
  2138.     D->m[1][0] =
  2139.        -M->m[1][0] * M->m[2][2] * M->m[3][3] +
  2140.         M->m[1][0] * M->m[3][2] * M->m[2][3] +
  2141.         M->m[1][2] * M->m[2][0] * M->m[3][3] -
  2142.         M->m[1][2] * M->m[3][0] * M->m[2][3] -
  2143.         M->m[1][3] * M->m[2][0] * M->m[3][2] +
  2144.         M->m[1][3] * M->m[3][0] * M->m[2][2];
  2145.  
  2146.     D->m[1][1] =
  2147.         M->m[0][0] * M->m[2][2] * M->m[3][3] -
  2148.         M->m[0][0] * M->m[3][2] * M->m[2][3] -
  2149.         M->m[0][2] * M->m[2][0] * M->m[3][3] +
  2150.         M->m[0][2] * M->m[3][0] * M->m[2][3] +
  2151.         M->m[0][3] * M->m[2][0] * M->m[3][2] -
  2152.         M->m[0][3] * M->m[3][0] * M->m[2][2];
  2153.  
  2154.     D->m[1][2] =
  2155.        -M->m[0][0] * M->m[1][2] * M->m[3][3] +
  2156.         M->m[0][0] * M->m[3][2] * M->m[1][3] +
  2157.         M->m[0][2] * M->m[1][0] * M->m[3][3] -
  2158.         M->m[0][2] * M->m[3][0] * M->m[1][3] -
  2159.         M->m[0][3] * M->m[1][0] * M->m[3][2] +
  2160.         M->m[0][3] * M->m[3][0] * M->m[1][2];
  2161.  
  2162.     D->m[1][3] =
  2163.         M->m[0][0] * M->m[1][2] * M->m[2][3] -
  2164.         M->m[0][0] * M->m[2][2] * M->m[1][3] -
  2165.         M->m[0][2] * M->m[1][0] * M->m[2][3] +
  2166.         M->m[0][2] * M->m[2][0] * M->m[1][3] +
  2167.         M->m[0][3] * M->m[1][0] * M->m[2][2] -
  2168.         M->m[0][3] * M->m[2][0] * M->m[1][2];
  2169.  
  2170.     D->m[2][0] =
  2171.         M->m[1][0] * M->m[2][1] * M->m[3][3] -
  2172.         M->m[1][0] * M->m[3][1] * M->m[2][3] -
  2173.         M->m[1][1] * M->m[2][0] * M->m[3][3] +
  2174.         M->m[1][1] * M->m[3][0] * M->m[2][3] +
  2175.         M->m[1][3] * M->m[2][0] * M->m[3][1] -
  2176.         M->m[1][3] * M->m[3][0] * M->m[2][1];
  2177.  
  2178.     D->m[2][1] =
  2179.        -M->m[0][0] * M->m[2][1] * M->m[3][3] +
  2180.         M->m[0][0] * M->m[3][1] * M->m[2][3] +
  2181.         M->m[0][1] * M->m[2][0] * M->m[3][3] -
  2182.         M->m[0][1] * M->m[3][0] * M->m[2][3] -
  2183.         M->m[0][3] * M->m[2][0] * M->m[3][1] +
  2184.         M->m[0][3] * M->m[3][0] * M->m[2][1];
  2185.  
  2186.     D->m[2][2] =
  2187.         M->m[0][0] * M->m[1][1] * M->m[3][3] -
  2188.         M->m[0][0] * M->m[3][1] * M->m[1][3] -
  2189.         M->m[0][1] * M->m[1][0] * M->m[3][3] +
  2190.         M->m[0][1] * M->m[3][0] * M->m[1][3] +
  2191.         M->m[0][3] * M->m[1][0] * M->m[3][1] -
  2192.         M->m[0][3] * M->m[3][0] * M->m[1][1];
  2193.  
  2194.     D->m[2][3] =
  2195.        -M->m[0][0] * M->m[1][1] * M->m[2][3] +
  2196.         M->m[0][0] * M->m[2][1] * M->m[1][3] +
  2197.         M->m[0][1] * M->m[1][0] * M->m[2][3] -
  2198.         M->m[0][1] * M->m[2][0] * M->m[1][3] -
  2199.         M->m[0][3] * M->m[1][0] * M->m[2][1] +
  2200.         M->m[0][3] * M->m[2][0] * M->m[1][1];
  2201.  
  2202.     D->m[3][0] =
  2203.        -M->m[1][0] * M->m[2][1] * M->m[3][2] +
  2204.         M->m[1][0] * M->m[3][1] * M->m[2][2] +
  2205.         M->m[1][1] * M->m[2][0] * M->m[3][2] -
  2206.         M->m[1][1] * M->m[3][0] * M->m[2][2] -
  2207.         M->m[1][2] * M->m[2][0] * M->m[3][1] +
  2208.         M->m[1][2] * M->m[3][0] * M->m[2][1];
  2209.  
  2210.     D->m[3][1] =
  2211.         M->m[0][0] * M->m[2][1] * M->m[3][2] -
  2212.         M->m[0][0] * M->m[3][1] * M->m[2][2] -
  2213.         M->m[0][1] * M->m[2][0] * M->m[3][2] +
  2214.         M->m[0][1] * M->m[3][0] * M->m[2][2] +
  2215.         M->m[0][2] * M->m[2][0] * M->m[3][1] -
  2216.         M->m[0][2] * M->m[3][0] * M->m[2][1];
  2217.  
  2218.     D->m[3][2] =
  2219.        -M->m[0][0] * M->m[1][1] * M->m[3][2] +
  2220.         M->m[0][0] * M->m[3][1] * M->m[1][2] +
  2221.         M->m[0][1] * M->m[1][0] * M->m[3][2] -
  2222.         M->m[0][1] * M->m[3][0] * M->m[1][2] -
  2223.         M->m[0][2] * M->m[1][0] * M->m[3][1] +
  2224.         M->m[0][2] * M->m[3][0] * M->m[1][1];
  2225.  
  2226.     D->m[3][3] =
  2227.         M->m[0][0] * M->m[1][1] * M->m[2][2] -
  2228.         M->m[0][0] * M->m[2][1] * M->m[1][2] -
  2229.         M->m[0][1] * M->m[1][0] * M->m[2][2] +
  2230.         M->m[0][1] * M->m[2][0] * M->m[1][2] +
  2231.         M->m[0][2] * M->m[1][0] * M->m[2][1] -
  2232.         M->m[0][2] * M->m[2][0] * M->m[1][1];
  2233.  
  2234.     det =
  2235.         M->m[0][0] * D->m[0][0] +
  2236.         M->m[1][0] * D->m[0][1] +
  2237.         M->m[2][0] * D->m[0][2] +
  2238.         M->m[3][0] * D->m[0][3];
  2239.  
  2240.     det = 1.0 / det;
  2241.  
  2242.     for (i = 0; i < 4; i++)
  2243.     for (k = 0; k < 4; k++)
  2244.         D->m[i][k] *= det;
  2245.  
  2246. #ifdef DEBUG
  2247.     {
  2248.         D3DMATRIX I;
  2249.  
  2250.         nine_d3d_matrix_matrix_mul(&I, D, M);
  2251.  
  2252.         for (i = 0; i < 4; ++i)
  2253.         for (k = 0; k < 4; ++k)
  2254.             if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
  2255.                 DBG("Matrix inversion check FAILED !\n");
  2256.     }
  2257. #endif
  2258. }
  2259.  
  2260. /* TODO: don't use 4x4 inverse, unless this gets all nicely inlined ? */
  2261. void
  2262. nine_d3d_matrix_inverse_3x3(D3DMATRIX *D, const D3DMATRIX *M)
  2263. {
  2264.     D3DMATRIX T;
  2265.     unsigned i, j;
  2266.  
  2267.     for (i = 0; i < 3; ++i)
  2268.     for (j = 0; j < 3; ++j)
  2269.         T.m[i][j] = M->m[i][j];
  2270.     for (i = 0; i < 3; ++i) {
  2271.         T.m[i][3] = 0.0f;
  2272.         T.m[3][i] = 0.0f;
  2273.     }
  2274.     T.m[3][3] = 1.0f;
  2275.  
  2276.     nine_d3d_matrix_inverse(D, &T);
  2277. }
  2278.