Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. #include <float.h>
  2. #include "pipe/p_context.h"
  3. #include "pipe/p_defines.h"
  4. #include "pipe/p_state.h"
  5. #include "util/u_dynarray.h"
  6. #include "util/u_inlines.h"
  7. #include "util/u_debug.h"
  8.  
  9. #include "pipe/p_shader_tokens.h"
  10. #include "tgsi/tgsi_parse.h"
  11. #include "tgsi/tgsi_util.h"
  12. #include "tgsi/tgsi_dump.h"
  13. #include "tgsi/tgsi_ureg.h"
  14.  
  15. #include "nouveau_debug.h"
  16. #include "nv_object.xml.h"
  17. #include "nv30/nv30-40_3d.xml.h"
  18. #include "nv30/nvfx_shader.h"
  19. #include "nv30/nv30_state.h"
  20.  
/* Transient per-compile state used while translating one TGSI fragment
 * program into NV30/NV40 fragment program instructions.
 */
struct nvfx_fpc {
   struct nv30_fragprog *fp;   /* the program being assembled */

   unsigned max_temps;                  /* hardware temp register limit */
   unsigned long long r_temps;          /* bitmask of allocated temp regs */
   unsigned long long r_temps_discard;  /* temps to free in release_temps() */
   struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS]; /* regs for TGSI outputs */
   struct nvfx_reg r_input[PIPE_MAX_SHADER_INPUTS];   /* regs for TGSI inputs */
   struct nvfx_reg *r_temp;             /* regs for TGSI temporaries */

   int num_regs;      /* highest temp register index written + 1 (see emit_dst) */

   unsigned inst_offset;  /* word offset of the instruction being emitted */
   unsigned have_const;   /* current insn already grew its const/imm word group */
   unsigned is_nv4x;      /* nonzero when compiling for nv4x-class hardware */

   struct util_dynarray imm_data;  /* packed vec4 float immediates, 16 bytes each */

   struct nvfx_reg* r_imm;  /* regs referencing entries of imm_data */
   unsigned nr_imm;         /* number of immediates in r_imm */

   struct util_dynarray if_stack;  /* inst offsets of IFs awaiting ELSE/ENDIF patching */
   //struct util_dynarray loop_stack;
   struct util_dynarray label_relocs;  /* nvfx_relocation: branch offsets to patch */
};
  46.  
  47. static INLINE struct nvfx_reg
  48. temp(struct nvfx_fpc *fpc)
  49. {
  50.    int idx = __builtin_ctzll(~fpc->r_temps);
  51.  
  52.    if (idx >= fpc->max_temps) {
  53.       NOUVEAU_ERR("out of temps!!\n");
  54.       assert(0);
  55.       return nvfx_reg(NVFXSR_TEMP, 0);
  56.    }
  57.  
  58.    fpc->r_temps |= (1ULL << idx);
  59.    fpc->r_temps_discard |= (1ULL << idx);
  60.    return nvfx_reg(NVFXSR_TEMP, idx);
  61. }
  62.  
  63. static INLINE void
  64. release_temps(struct nvfx_fpc *fpc)
  65. {
  66.    fpc->r_temps &= ~fpc->r_temps_discard;
  67.    fpc->r_temps_discard = 0ULL;
  68. }
  69.  
  70. static inline struct nvfx_reg
  71. nvfx_fp_imm(struct nvfx_fpc *fpc, float a, float b, float c, float d)
  72. {
  73.    float v[4] = {a, b, c, d};
  74.    int idx = fpc->imm_data.size >> 4;
  75.  
  76.    memcpy(util_dynarray_grow(&fpc->imm_data, sizeof(float) * 4), v, 4 * sizeof(float));
  77.    return nvfx_reg(NVFXSR_IMM, idx);
  78. }
  79.  
  80. static void
  81. grow_insns(struct nvfx_fpc *fpc, int size)
  82. {
  83.    struct nv30_fragprog *fp = fpc->fp;
  84.  
  85.    fp->insn_len += size;
  86.    fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
  87. }
  88.  
/* Encode source operand "src" into source slot "pos" (0..2) of the
 * instruction currently being assembled at fpc->inst_offset.
 *
 * Immediate and constant sources append one extra 4-word group to the
 * instruction (shared by all sources of the insn, guarded by have_const):
 * immediates copy their float data in, constants record a patch location
 * in fp->consts and zero the words for upload-time filling.
 */
static void
emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src)
{
   struct nv30_fragprog *fp = fpc->fp;
   uint32_t *hw = &fp->insn[fpc->inst_offset];
   uint32_t sr = 0;

   switch (src.reg.type) {
   case NVFXSR_INPUT:
      sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
      /* the input selector lives in word 0, not in the source word */
      hw[0] |= (src.reg.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
      break;
   case NVFXSR_OUTPUT:
      sr |= NVFX_FP_REG_SRC_HALF;
      /* fall-through */
   case NVFXSR_TEMP:
      sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
      sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT);
      break;
   case NVFXSR_IMM:
      if (!fpc->have_const) {
         grow_insns(fpc, 4);
         /* grow_insns() may realloc fp->insn, so refetch hw */
         hw = &fp->insn[fpc->inst_offset];
         fpc->have_const = 1;
      }

      /* copy the immediate's four floats into the extra word group */
      memcpy(&fp->insn[fpc->inst_offset + 4],
            (float*)fpc->imm_data.data + src.reg.index * 4,
            sizeof(uint32_t) * 4);

      sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
      break;
   case NVFXSR_CONST:
      if (!fpc->have_const) {
         grow_insns(fpc, 4);
         /* grow_insns() may realloc fp->insn, so refetch hw */
         hw = &fp->insn[fpc->inst_offset];
         fpc->have_const = 1;
      }

      {
         struct nv30_fragprog_data *fpd;

         /* record where constant "index" must be patched in at upload */
         fp->consts = realloc(fp->consts, ++fp->nr_consts *
                    sizeof(*fpd));
         fpd = &fp->consts[fp->nr_consts - 1];
         fpd->offset = fpc->inst_offset + 4;
         fpd->index = src.reg.index;
         memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
      }

      sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
      break;
   case NVFXSR_NONE:
      sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
      break;
   default:
      assert(0);
   }

   if (src.negate)
      sr |= NVFX_FP_REG_NEGATE;

   /* abs modifier is a per-source bit in word 1, not in the source word */
   if (src.abs)
      hw[1] |= (1 << (29 + pos));

   sr |= ((src.swz[0] << NVFX_FP_REG_SWZ_X_SHIFT) |
          (src.swz[1] << NVFX_FP_REG_SWZ_Y_SHIFT) |
          (src.swz[2] << NVFX_FP_REG_SWZ_Z_SHIFT) |
          (src.swz[3] << NVFX_FP_REG_SWZ_W_SHIFT));

   /* source words are hw[1..3] for pos 0..2 */
   hw[pos + 1] |= sr;
}
  161.  
/* Encode destination register "dst" into the instruction at
 * fpc->inst_offset, tracking the highest temp register used in num_regs.
 */
static void
emit_dst(struct nvfx_fpc *fpc, struct nvfx_reg dst)
{
   struct nv30_fragprog *fp = fpc->fp;
   uint32_t *hw = &fp->insn[fpc->inst_offset];

   switch (dst.type) {
   case NVFXSR_OUTPUT:
      if (dst.index == 1)
         /* NOTE(review): output 1 looks like the depth result; the
          * exact meaning of these fp_control bits should be confirmed
          * against the NV30_3D_FP_CONTROL definitions */
         fp->fp_control |= 0x0000000e;
      else {
         /* other outputs are addressed as half registers, hence the
          * doubled index */
         hw[0] |= NVFX_FP_OP_OUT_REG_HALF;
         dst.index <<= 1;
      }
      /* fall-through */
   case NVFXSR_TEMP:
      if (fpc->num_regs < (dst.index + 1))
         fpc->num_regs = dst.index + 1;
      break;
   case NVFXSR_NONE:
      /* bit 30: no destination register is written */
      hw[0] |= (1 << 30);
      break;
   default:
      assert(0);
   }

   hw[0] |= (dst.index << NVFX_FP_OP_OUT_REG_SHIFT);
}
  190.  
/* Append instruction "insn" to the program being built: reserve four
 * zeroed 32-bit words, pack the opcode/mask/scale/saturate/condition
 * fields, then encode the destination and all three sources.  emit_src()
 * may grow the buffer again for const/immediate data, so everything is
 * addressed relative to fpc->inst_offset.
 */
static void
nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
{
   struct nv30_fragprog *fp = fpc->fp;
   uint32_t *hw;

   fpc->inst_offset = fp->insn_len;
   fpc->have_const = 0;
   grow_insns(fpc, 4);
   hw = &fp->insn[fpc->inst_offset];
   memset(hw, 0, sizeof(uint32_t) * 4);

   /* the hardware must be told when a program can discard fragments */
   if (insn.op == NVFX_FP_OP_OPCODE_KIL)
      fp->fp_control |= NV30_3D_FP_CONTROL_USES_KIL;
   hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT);
   hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT);
   hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT);

   if (insn.sat)
      hw[0] |= NVFX_FP_OP_OUT_SAT;

   /* condition-code update/test state */
   if (insn.cc_update)
      hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
   hw[1] |= (insn.cc_test << NVFX_FP_OP_COND_SHIFT);
   hw[1] |= ((insn.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
        (insn.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
        (insn.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
        (insn.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));

   /* texture unit; arith instructions use -1 (no unit) */
   if(insn.unit >= 0)
   {
      hw[0] |= (insn.unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
   }

   emit_dst(fpc, insn.dst);
   emit_src(fpc, 0, insn.src[0]);
   emit_src(fpc, 1, insn.src[1]);
   emit_src(fpc, 2, insn.src[2]);
}
  230.  
/* Build an ALU nvfx_insn: s = saturate, o = opcode suffix (pasted onto
 * NVFX_FP_OP_OPCODE_), d = dest reg, m = write mask, s0..s2 = sources.
 * The texture unit is -1 (none). */
#define arith(s,o,d,m,s0,s1,s2) \
       nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
                       (d), (m), (s0), (s1), (s2))

/* Build a texture nvfx_insn for unit u.  s1 and s2 are accepted for
 * symmetry with arith() but IGNORED: "none" — which must be in scope at
 * the expansion site — is passed in their place. */
#define tex(s,o,u,d,m,s0,s1,s2) \
   nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
                   (d), (m), (s0), none, none)
  238.  
/* IF src.x != 0, as TGSI specifies.
 *
 * Emits a cc-updating MOV of src followed by an NV40 IF branch testing
 * cc.x != 0.  The else/endif offsets in hw[2]/hw[3] are left zero and
 * patched when the matching ELSE/ENDIF is reached, via the instruction
 * offset pushed onto if_stack.
 */
static void
nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
{
   const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
   struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
   uint32_t *hw;
   insn.cc_update = 1;
   nvfx_fp_emit(fpc, insn);

   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
   hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
      NV40_FP_OP_OUT_NONE |
      (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
   /* All condition swizzle fields are 0 (= X), i.e. a .xxxx swizzle, so
    * only cc.x is tested */
   hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
         (0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
         (0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
         (0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
         (NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
   hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset, patched later */
   hw[3] = 0; /* | endif_offset, patched later */
   util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
}
  266.  
/* Emit an NV40 CAL: unconditional subroutine call to label "target".
 * The call offset in hw[2] is unknown at this point, so a relocation is
 * recorded in label_relocs to be resolved once all labels are known.
 */
static void
nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
{
        struct nvfx_relocation reloc;
        uint32_t *hw;
        fpc->inst_offset = fpc->fp->insn_len;
        grow_insns(fpc, 4);
        hw = &fpc->fp->insn[fpc->inst_offset];
        hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
        /* unconditional: condition TR (always true), identity swizzle */
        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
                        (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset, patched via reloc below */
        hw[3] = 0;
        reloc.target = target;
        reloc.location = fpc->inst_offset + 2;
        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
}
  287.  
/* Emit an unconditional NV40 RET, returning from a CAL subroutine. */
static void
nv40_fp_ret(struct nvfx_fpc *fpc)
{
   uint32_t *hw;
   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
   /* unconditional: condition TR (always true), identity swizzle */
   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
         (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* RET takes no target offset */
   hw[3] = 0;
}
  303.  
/* Emit an NV40 REP: repeat the following block "count" times ("count" is
 * written into all three hardware count fields).  The end-of-loop offset
 * in hw[3] is unknown here, so a relocation pointing at label "target" is
 * recorded in label_relocs for later patching.
 */
static void
nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
{
        struct nvfx_relocation reloc;
        uint32_t *hw;
        fpc->inst_offset = fpc->fp->insn_len;
        grow_insns(fpc, 4);
        hw = &fpc->fp->insn[fpc->inst_offset];
        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
        hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
                        NV40_FP_OP_OUT_NONE |
                        (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
        /* unconditional: condition TR (always true), identity swizzle */
        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
                        (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
                        (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
                        (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
                        (count << NV40_FP_OP_REP_COUNT3_SHIFT);
        hw[3] = 0; /* | end_offset, patched via reloc below */
        reloc.target = target;
        reloc.location = fpc->inst_offset + 3;
        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
        //util_dynarray_append(&fpc->loop_stack, unsigned, target);
}
  329.  
#if 0
/* documentation only */
/* warning: this only works forward, and probably only if not inside any IF */
/* Forward branch implemented via the IF mechanism: both the else and the
 * endif offsets are relocated to the same "target" label. */
static void
nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
{
        struct nvfx_relocation reloc;
        uint32_t *hw;
        fpc->inst_offset = fpc->fp->insn_len;
        grow_insns(fpc, 4);
        hw = &fpc->fp->insn[fpc->inst_offset];
        /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
        hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
                NV40_FP_OP_OUT_NONE |
                (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
        /* condition FL (always false), so the "else" path is always taken */
        hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
                        (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
        hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
        hw[3] = 0; /* | endif_offset */
        reloc.target = target;
        reloc.location = fpc->inst_offset + 2;
        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
        reloc.target = target;
        reloc.location = fpc->inst_offset + 3;
        util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
}
#endif
  358.  
/* Emit an unconditional NV40 BRK, breaking out of the innermost REP loop. */
static void
nv40_fp_brk(struct nvfx_fpc *fpc)
{
   uint32_t *hw;
   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
      NV40_FP_OP_OUT_NONE;
   /* unconditional: condition TR (always true), identity swizzle */
   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
         (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
   hw[3] = 0;
}
  375.  
  376. static INLINE struct nvfx_src
  377. tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
  378. {
  379.    struct nvfx_src src;
  380.  
  381.    switch (fsrc->Register.File) {
  382.    case TGSI_FILE_INPUT:
  383.       src.reg = fpc->r_input[fsrc->Register.Index];
  384.       break;
  385.    case TGSI_FILE_CONSTANT:
  386.       src.reg = nvfx_reg(NVFXSR_CONST, fsrc->Register.Index);
  387.       break;
  388.    case TGSI_FILE_IMMEDIATE:
  389.       assert(fsrc->Register.Index < fpc->nr_imm);
  390.       src.reg = fpc->r_imm[fsrc->Register.Index];
  391.       break;
  392.    case TGSI_FILE_TEMPORARY:
  393.       src.reg = fpc->r_temp[fsrc->Register.Index];
  394.       break;
  395.    /* NV40 fragprog result regs are just temps, so this is simple */
  396.    case TGSI_FILE_OUTPUT:
  397.       src.reg = fpc->r_result[fsrc->Register.Index];
  398.       break;
  399.    default:
  400.       NOUVEAU_ERR("bad src file\n");
  401.       src.reg.index = 0;
  402.       src.reg.type = 0;
  403.       break;
  404.    }
  405.  
  406.    src.abs = fsrc->Register.Absolute;
  407.    src.negate = fsrc->Register.Negate;
  408.    src.swz[0] = fsrc->Register.SwizzleX;
  409.    src.swz[1] = fsrc->Register.SwizzleY;
  410.    src.swz[2] = fsrc->Register.SwizzleZ;
  411.    src.swz[3] = fsrc->Register.SwizzleW;
  412.    src.indirect = 0;
  413.    src.indirect_reg = 0;
  414.    src.indirect_swz = 0;
  415.    return src;
  416. }
  417.  
  418. static INLINE struct nvfx_reg
  419. tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
  420.    switch (fdst->Register.File) {
  421.    case TGSI_FILE_OUTPUT:
  422.       return fpc->r_result[fdst->Register.Index];
  423.    case TGSI_FILE_TEMPORARY:
  424.       return fpc->r_temp[fdst->Register.Index];
  425.    case TGSI_FILE_NULL:
  426.       return nvfx_reg(NVFXSR_NONE, 0);
  427.    default:
  428.       NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
  429.       return nvfx_reg(NVFXSR_NONE, 0);
  430.    }
  431. }
  432.  
  433. static INLINE int
  434. tgsi_mask(uint tgsi)
  435. {
  436.    int mask = 0;
  437.  
  438.    if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_FP_MASK_X;
  439.    if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_FP_MASK_Y;
  440.    if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_FP_MASK_Z;
  441.    if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_FP_MASK_W;
  442.    return mask;
  443. }
  444.  
  445. static boolean
  446. nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
  447.             const struct tgsi_full_instruction *finst)
  448. {
  449.    const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
  450.    struct nvfx_insn insn;
  451.    struct nvfx_src src[3], tmp;
  452.    struct nvfx_reg dst;
  453.    int mask, sat, unit = 0;
  454.    int ai = -1, ci = -1, ii = -1;
  455.    int i;
  456.  
  457.    if (finst->Instruction.Opcode == TGSI_OPCODE_END)
  458.       return TRUE;
  459.  
  460.    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
  461.       const struct tgsi_full_src_register *fsrc;
  462.  
  463.       fsrc = &finst->Src[i];
  464.       if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
  465.          src[i] = tgsi_src(fpc, fsrc);
  466.       }
  467.    }
  468.  
  469.    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
  470.       const struct tgsi_full_src_register *fsrc;
  471.  
  472.       fsrc = &finst->Src[i];
  473.  
  474.       switch (fsrc->Register.File) {
  475.       case TGSI_FILE_INPUT:
  476.          if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG && (0
  477.                || fsrc->Register.SwizzleX == PIPE_SWIZZLE_ALPHA
  478.                || fsrc->Register.SwizzleY == PIPE_SWIZZLE_ALPHA
  479.                || fsrc->Register.SwizzleZ == PIPE_SWIZZLE_ALPHA
  480.                || fsrc->Register.SwizzleW == PIPE_SWIZZLE_ALPHA
  481.                )) {
  482.             /* hardware puts 0 in fogcoord.w, but GL/Gallium want 1 there */
  483.             struct nvfx_src addend = nvfx_src(nvfx_fp_imm(fpc, 0, 0, 0, 1));
  484.             addend.swz[0] = fsrc->Register.SwizzleX;
  485.             addend.swz[1] = fsrc->Register.SwizzleY;
  486.             addend.swz[2] = fsrc->Register.SwizzleZ;
  487.             addend.swz[3] = fsrc->Register.SwizzleW;
  488.             src[i] = nvfx_src(temp(fpc));
  489.             nvfx_fp_emit(fpc, arith(0, ADD, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), addend, none));
  490.          } else if (ai == -1 || ai == fsrc->Register.Index) {
  491.             ai = fsrc->Register.Index;
  492.             src[i] = tgsi_src(fpc, fsrc);
  493.          } else {
  494.             src[i] = nvfx_src(temp(fpc));
  495.             nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
  496.          }
  497.          break;
  498.       case TGSI_FILE_CONSTANT:
  499.          if ((ci == -1 && ii == -1) ||
  500.              ci == fsrc->Register.Index) {
  501.             ci = fsrc->Register.Index;
  502.             src[i] = tgsi_src(fpc, fsrc);
  503.          } else {
  504.             src[i] = nvfx_src(temp(fpc));
  505.             nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
  506.          }
  507.          break;
  508.       case TGSI_FILE_IMMEDIATE:
  509.          if ((ci == -1 && ii == -1) ||
  510.              ii == fsrc->Register.Index) {
  511.             ii = fsrc->Register.Index;
  512.             src[i] = tgsi_src(fpc, fsrc);
  513.          } else {
  514.             src[i] = nvfx_src(temp(fpc));
  515.             nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
  516.          }
  517.          break;
  518.       case TGSI_FILE_TEMPORARY:
  519.          /* handled above */
  520.          break;
  521.       case TGSI_FILE_SAMPLER:
  522.          unit = fsrc->Register.Index;
  523.          break;
  524.       case TGSI_FILE_OUTPUT:
  525.          break;
  526.       default:
  527.          NOUVEAU_ERR("bad src file\n");
  528.          return FALSE;
  529.       }
  530.    }
  531.  
  532.    dst  = tgsi_dst(fpc, &finst->Dst[0]);
  533.    mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
  534.    sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
  535.  
  536.    switch (finst->Instruction.Opcode) {
  537.    case TGSI_OPCODE_ABS:
  538.       nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, abs(src[0]), none, none));
  539.       break;
  540.    case TGSI_OPCODE_ADD:
  541.       nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
  542.       break;
  543.    case TGSI_OPCODE_CEIL:
  544.       tmp = nvfx_src(temp(fpc));
  545.       nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, neg(src[0]), none, none));
  546.       nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, neg(tmp), none, none));
  547.       break;
  548.    case TGSI_OPCODE_CMP:
  549.       insn = arith(0, MOV, none.reg, mask, src[0], none, none);
  550.       insn.cc_update = 1;
  551.       nvfx_fp_emit(fpc, insn);
  552.  
  553.       insn = arith(sat, MOV, dst, mask, src[2], none, none);
  554.       insn.cc_test = NVFX_COND_GE;
  555.       nvfx_fp_emit(fpc, insn);
  556.  
  557.       insn = arith(sat, MOV, dst, mask, src[1], none, none);
  558.       insn.cc_test = NVFX_COND_LT;
  559.       nvfx_fp_emit(fpc, insn);
  560.       break;
  561.    case TGSI_OPCODE_COS:
  562.       nvfx_fp_emit(fpc, arith(sat, COS, dst, mask, src[0], none, none));
  563.       break;
  564.    case TGSI_OPCODE_DDX:
  565.       if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
  566.          tmp = nvfx_src(temp(fpc));
  567.          nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
  568.          nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
  569.          nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
  570.          nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
  571.       } else {
  572.          nvfx_fp_emit(fpc, arith(sat, DDX, dst, mask, src[0], none, none));
  573.       }
  574.       break;
  575.    case TGSI_OPCODE_DDY:
  576.       if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
  577.          tmp = nvfx_src(temp(fpc));
  578.          nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
  579.          nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
  580.          nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
  581.          nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
  582.       } else {
  583.          nvfx_fp_emit(fpc, arith(sat, DDY, dst, mask, src[0], none, none));
  584.       }
  585.       break;
  586.    case TGSI_OPCODE_DP2:
  587.       tmp = nvfx_src(temp(fpc));
  588.       nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none));
  589.       nvfx_fp_emit(fpc, arith(0, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
  590.       break;
  591.    case TGSI_OPCODE_DP3:
  592.       nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none));
  593.       break;
  594.    case TGSI_OPCODE_DP4:
  595.       nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
  596.       break;
  597.    case TGSI_OPCODE_DPH:
  598.       tmp = nvfx_src(temp(fpc));
  599.       nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[1], none));
  600.       nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(src[1], W, W, W, W), none));
  601.       break;
  602.    case TGSI_OPCODE_DST:
  603.       nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
  604.       break;
  605.    case TGSI_OPCODE_EX2:
  606.       nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none));
  607.       break;
  608.    case TGSI_OPCODE_FLR:
  609.       nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none));
  610.       break;
  611.    case TGSI_OPCODE_FRC:
  612.       nvfx_fp_emit(fpc, arith(sat, FRC, dst, mask, src[0], none, none));
  613.       break;
  614.    case TGSI_OPCODE_KILL:
  615.       nvfx_fp_emit(fpc, arith(0, KIL, none.reg, 0, none, none, none));
  616.       break;
  617.    case TGSI_OPCODE_KILL_IF:
  618.       insn = arith(0, MOV, none.reg, NVFX_FP_MASK_ALL, src[0], none, none);
  619.       insn.cc_update = 1;
  620.       nvfx_fp_emit(fpc, insn);
  621.  
  622.       insn = arith(0, KIL, none.reg, 0, none, none, none);
  623.       insn.cc_test = NVFX_COND_LT;
  624.       nvfx_fp_emit(fpc, insn);
  625.       break;
  626.    case TGSI_OPCODE_LG2:
  627.       nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none));
  628.       break;
  629.    case TGSI_OPCODE_LIT:
  630.       if(!fpc->is_nv4x)
  631.          nvfx_fp_emit(fpc, arith(sat, LIT_NV30, dst, mask, src[0], none, none));
  632.       else {
  633.          /* we use FLT_MIN, so that log2 never gives -infinity, and thus multiplication by
  634.           * specular 0 always gives 0, so that ex2 gives 1, to satisfy the 0^0 = 1 requirement
  635.           *
  636.           * NOTE: if we start using half precision, we might need an fp16 FLT_MIN here instead
  637.           */
  638.          struct nvfx_src maxs = nvfx_src(nvfx_fp_imm(fpc, 0, FLT_MIN, 0, 0));
  639.          tmp = nvfx_src(temp(fpc));
  640.          if (ci>= 0 || ii >= 0) {
  641.             nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, maxs, none, none));
  642.             maxs = tmp;
  643.          }
  644.          nvfx_fp_emit(fpc, arith(0, MAX, tmp.reg, NVFX_FP_MASK_Y | NVFX_FP_MASK_W, swz(src[0], X, X, X, Y), swz(maxs, X, X, Y, Y), none));
  645.          nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), none, none));
  646.          nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), swz(src[0], W, W, W, W), none));
  647.          nvfx_fp_emit(fpc, arith(sat, LITEX2_NV40, dst, mask, swz(tmp, Y, Y, W, W), none, none));
  648.       }
  649.       break;
  650.    case TGSI_OPCODE_LRP:
  651.       if(!fpc->is_nv4x)
  652.          nvfx_fp_emit(fpc, arith(sat, LRP_NV30, dst, mask, src[0], src[1], src[2]));
  653.       else {
  654.          tmp = nvfx_src(temp(fpc));
  655.          nvfx_fp_emit(fpc, arith(0, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
  656.          nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], tmp));
  657.       }
  658.       break;
  659.    case TGSI_OPCODE_MAD:
  660.       nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], src[2]));
  661.       break;
  662.    case TGSI_OPCODE_MAX:
  663.       nvfx_fp_emit(fpc, arith(sat, MAX, dst, mask, src[0], src[1], none));
  664.       break;
  665.    case TGSI_OPCODE_MIN:
  666.       nvfx_fp_emit(fpc, arith(sat, MIN, dst, mask, src[0], src[1], none));
  667.       break;
  668.    case TGSI_OPCODE_MOV:
  669.       nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, src[0], none, none));
  670.       break;
  671.    case TGSI_OPCODE_MUL:
  672.       nvfx_fp_emit(fpc, arith(sat, MUL, dst, mask, src[0], src[1], none));
  673.       break;
  674.    case TGSI_OPCODE_NOP:
  675.       break;
  676.    case TGSI_OPCODE_POW:
  677.       if(!fpc->is_nv4x)
  678.          nvfx_fp_emit(fpc, arith(sat, POW_NV30, dst, mask, src[0], src[1], none));
  679.       else {
  680.          tmp = nvfx_src(temp(fpc));
  681.          nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
  682.          nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
  683.          nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, swz(tmp, X, X, X, X), none, none));
  684.       }
  685.       break;
  686.    case TGSI_OPCODE_RCP:
  687.       nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
  688.       break;
  689.    case TGSI_OPCODE_RSQ:
  690.       if(!fpc->is_nv4x)
  691.          nvfx_fp_emit(fpc, arith(sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none));
  692.       else {
  693.          tmp = nvfx_src(temp(fpc));
  694.          insn = arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, abs(swz(src[0], X, X, X, X)), none, none);
  695.          insn.scale = NVFX_FP_OP_DST_SCALE_INV_2X;
  696.          nvfx_fp_emit(fpc, insn);
  697.          nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, neg(swz(tmp, X, X, X, X)), none, none));
  698.       }
  699.       break;
  700.    case TGSI_OPCODE_SCS:
  701.       /* avoid overwriting the source */
  702.       if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X)
  703.       {
  704.          if (mask & NVFX_FP_MASK_X)
  705.             nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
  706.          if (mask & NVFX_FP_MASK_Y)
  707.             nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
  708.       }
  709.       else
  710.       {
  711.          if (mask & NVFX_FP_MASK_Y)
  712.             nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
  713.          if (mask & NVFX_FP_MASK_X)
  714.             nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
  715.       }
  716.       break;
  717.    case TGSI_OPCODE_SEQ:
  718.       nvfx_fp_emit(fpc, arith(sat, SEQ, dst, mask, src[0], src[1], none));
  719.       break;
  720.    case TGSI_OPCODE_SGE:
  721.       nvfx_fp_emit(fpc, arith(sat, SGE, dst, mask, src[0], src[1], none));
  722.       break;
  723.    case TGSI_OPCODE_SGT:
  724.       nvfx_fp_emit(fpc, arith(sat, SGT, dst, mask, src[0], src[1], none));
  725.       break;
  726.    case TGSI_OPCODE_SIN:
  727.       nvfx_fp_emit(fpc, arith(sat, SIN, dst, mask, src[0], none, none));
  728.       break;
  729.    case TGSI_OPCODE_SLE:
  730.       nvfx_fp_emit(fpc, arith(sat, SLE, dst, mask, src[0], src[1], none));
  731.       break;
  732.    case TGSI_OPCODE_SLT:
  733.       nvfx_fp_emit(fpc, arith(sat, SLT, dst, mask, src[0], src[1], none));
  734.       break;
  735.    case TGSI_OPCODE_SNE:
  736.       nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none));
  737.       break;
  738.    case TGSI_OPCODE_SSG:
  739.    {
  740.       struct nvfx_src minones = swz(nvfx_src(nvfx_fp_imm(fpc, -1, -1, -1, -1)), X, X, X, X);
  741.  
  742.       insn = arith(sat, MOV, dst, mask, src[0], none, none);
  743.       insn.cc_update = 1;
  744.       nvfx_fp_emit(fpc, insn);
  745.  
  746.       insn = arith(0, STR, dst, mask, none, none, none);
  747.       insn.cc_test = NVFX_COND_GT;
  748.       nvfx_fp_emit(fpc, insn);
  749.  
  750.       if(!sat) {
  751.          insn = arith(0, MOV, dst, mask, minones, none, none);
  752.          insn.cc_test = NVFX_COND_LT;
  753.          nvfx_fp_emit(fpc, insn);
  754.       }
  755.       break;
  756.    }
  757.    case TGSI_OPCODE_SUB:
  758.       nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], neg(src[1]), none));
  759.       break;
  760.    case TGSI_OPCODE_TEX:
  761.       nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
  762.       break;
  763.         case TGSI_OPCODE_TRUNC:
  764.                 tmp = nvfx_src(temp(fpc));
  765.                 insn = arith(0, MOV, none.reg, mask, src[0], none, none);
  766.                 insn.cc_update = 1;
  767.                 nvfx_fp_emit(fpc, insn);
  768.  
  769.                 nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none));
  770.                 nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, tmp, none, none));
  771.  
  772.                 insn = arith(sat, MOV, dst, mask, neg(tmp), none, none);
  773.                 insn.cc_test = NVFX_COND_LT;
  774.                 nvfx_fp_emit(fpc, insn);
  775.                 break;
  776.         case TGSI_OPCODE_TXB:
  777.                 nvfx_fp_emit(fpc, tex(sat, TXB, unit, dst, mask, src[0], none, none));
  778.                 break;
  779.         case TGSI_OPCODE_TXL:
  780.                 if(fpc->is_nv4x)
  781.                         nvfx_fp_emit(fpc, tex(sat, TXL_NV40, unit, dst, mask, src[0], none, none));
  782.                 else /* unsupported on nv30, use TEX and hope they like it */
  783.                         nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
  784.                 break;
  785.         case TGSI_OPCODE_TXP:
  786.                 nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none));
  787.                 break;
  788.    case TGSI_OPCODE_XPD:
  789.       tmp = nvfx_src(temp(fpc));
  790.       nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
  791.       nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
  792.       break;
  793.  
  794.    case TGSI_OPCODE_IF:
  795.       // MOVRC0 R31 (TR0.xyzw), R<src>:
  796.       // IF (NE.xxxx) ELSE <else> END <end>
  797.       if(!fpc->is_nv4x)
  798.          goto nv3x_cflow;
  799.       nv40_fp_if(fpc, src[0]);
  800.       break;
  801.  
  802.    case TGSI_OPCODE_ELSE:
  803.    {
  804.       uint32_t *hw;
  805.       if(!fpc->is_nv4x)
  806.          goto nv3x_cflow;
  807.       assert(util_dynarray_contains(&fpc->if_stack, unsigned));
  808.       hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
  809.       hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
  810.       break;
  811.    }
  812.  
  813.    case TGSI_OPCODE_ENDIF:
  814.    {
  815.       uint32_t *hw;
  816.       if(!fpc->is_nv4x)
  817.          goto nv3x_cflow;
  818.       assert(util_dynarray_contains(&fpc->if_stack, unsigned));
  819.       hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
  820.       if(!hw[2])
  821.          hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
  822.       hw[3] = fpc->fp->insn_len;
  823.       break;
  824.    }
  825.  
  826.    case TGSI_OPCODE_BGNSUB:
  827.    case TGSI_OPCODE_ENDSUB:
  828.       /* nothing to do here */
  829.       break;
  830.  
  831.    case TGSI_OPCODE_CAL:
  832.       if(!fpc->is_nv4x)
  833.          goto nv3x_cflow;
  834.       nv40_fp_cal(fpc, finst->Label.Label);
  835.       break;
  836.  
  837.    case TGSI_OPCODE_RET:
  838.       if(!fpc->is_nv4x)
  839.          goto nv3x_cflow;
  840.       nv40_fp_ret(fpc);
  841.       break;
  842.  
  843.    case TGSI_OPCODE_BGNLOOP:
  844.       if(!fpc->is_nv4x)
  845.          goto nv3x_cflow;
  846.       /* TODO: we should support using two nested REPs to allow a > 255 iteration count */
  847.       nv40_fp_rep(fpc, 255, finst->Label.Label);
  848.       break;
  849.  
  850.    case TGSI_OPCODE_ENDLOOP:
  851.       break;
  852.  
  853.    case TGSI_OPCODE_BRK:
  854.       if(!fpc->is_nv4x)
  855.          goto nv3x_cflow;
  856.       nv40_fp_brk(fpc);
  857.       break;
  858.  
  859.    case TGSI_OPCODE_CONT:
  860.    {
  861.       static int warned = 0;
  862.       if(!warned) {
  863.          NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
  864.          warned = 1;
  865.       }
  866.       break;
  867.    }
  868.  
  869.         default:
  870.       NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
  871.       return FALSE;
  872.    }
  873.  
  874. out:
  875.    release_temps(fpc);
  876.    return TRUE;
  877. nv3x_cflow:
  878.    {
  879.       static int warned = 0;
  880.       if(!warned) {
  881.          NOUVEAU_ERR(
  882.                "Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
  883.                "If rendering is incorrect, try to disable GLSL support in the application.\n");
  884.          warned = 1;
  885.       }
  886.    }
  887.    goto out;
  888. }
  889.  
  890. static boolean
  891. nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc,
  892.                                const struct tgsi_full_declaration *fdec)
  893. {
  894.    unsigned idx = fdec->Range.First;
  895.    unsigned hw;
  896.  
  897.    switch (fdec->Semantic.Name) {
  898.    case TGSI_SEMANTIC_POSITION:
  899.       hw = NVFX_FP_OP_INPUT_SRC_POSITION;
  900.       break;
  901.    case TGSI_SEMANTIC_COLOR:
  902.       hw = NVFX_FP_OP_INPUT_SRC_COL0 + fdec->Semantic.Index;
  903.       break;
  904.    case TGSI_SEMANTIC_FOG:
  905.       hw = NVFX_FP_OP_INPUT_SRC_FOGC;
  906.       break;
  907.    case TGSI_SEMANTIC_FACE:
  908.       hw = NV40_FP_OP_INPUT_SRC_FACING;
  909.       break;
  910.    case TGSI_SEMANTIC_TEXCOORD:
  911.       assert(fdec->Semantic.Index < 8);
  912.       fpc->fp->texcoord[fdec->Semantic.Index] = fdec->Semantic.Index;
  913.       fpc->fp->texcoords |= (1 << fdec->Semantic.Index);
  914.       fpc->fp->vp_or |= (0x00004000 << fdec->Semantic.Index);
  915.       hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic.Index);
  916.       break;
  917.    case TGSI_SEMANTIC_GENERIC:
  918.    case TGSI_SEMANTIC_PCOORD:
  919.       /* will be assigned to remaining TC slots later */
  920.       return TRUE;
  921.    default:
  922.       assert(0);
  923.       return FALSE;
  924.    }
  925.  
  926.    fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
  927.    return TRUE;
  928. }
  929.  
  930. static boolean
  931. nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc,
  932.                              const struct tgsi_full_declaration *fdec)
  933. {
  934.    unsigned num_texcoords = fpc->is_nv4x ? 10 : 8;
  935.    unsigned idx = fdec->Range.First;
  936.    unsigned hw;
  937.  
  938.    switch (fdec->Semantic.Name) {
  939.    case TGSI_SEMANTIC_GENERIC:
  940.    case TGSI_SEMANTIC_PCOORD:
  941.       for (hw = 0; hw < num_texcoords; hw++) {
  942.          if (fpc->fp->texcoord[hw] == 0xffff) {
  943.             if (hw <= 7) {
  944.                fpc->fp->texcoords |= (0x1 << hw);
  945.                fpc->fp->vp_or |= (0x00004000 << hw);
  946.             } else {
  947.                fpc->fp->vp_or |= (0x00001000 << (hw - 8));
  948.             }
  949.             if (fdec->Semantic.Name == TGSI_SEMANTIC_PCOORD) {
  950.                fpc->fp->texcoord[hw] = 0xfffe;
  951.                fpc->fp->point_sprite_control |= (0x00000100 << hw);
  952.             } else {
  953.                fpc->fp->texcoord[hw] = fdec->Semantic.Index + 8;
  954.             }
  955.             hw = NVFX_FP_OP_INPUT_SRC_TC(hw);
  956.             fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
  957.             return TRUE;
  958.          }
  959.       }
  960.       return FALSE;
  961.    default:
  962.       return TRUE;
  963.    }
  964. }
  965.  
  966. static boolean
  967. nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc,
  968.             const struct tgsi_full_declaration *fdec)
  969. {
  970.    unsigned idx = fdec->Range.First;
  971.    unsigned hw;
  972.  
  973.    switch (fdec->Semantic.Name) {
  974.    case TGSI_SEMANTIC_POSITION:
  975.       hw = 1;
  976.       break;
  977.    case TGSI_SEMANTIC_COLOR:
  978.       hw = ~0;
  979.       switch (fdec->Semantic.Index) {
  980.       case 0: hw = 0; break;
  981.       case 1: hw = 2; break;
  982.       case 2: hw = 3; break;
  983.       case 3: hw = 4; break;
  984.       }
  985.       if(hw > ((fpc->is_nv4x) ? 4 : 2)) {
  986.          NOUVEAU_ERR("bad rcol index\n");
  987.          return FALSE;
  988.       }
  989.       break;
  990.    default:
  991.       NOUVEAU_ERR("bad output semantic\n");
  992.       return FALSE;
  993.    }
  994.  
  995.    fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
  996.    fpc->r_temps |= (1ULL << hw);
  997.    return TRUE;
  998. }
  999.  
  1000. static boolean
  1001. nvfx_fragprog_prepare(struct nvfx_fpc *fpc)
  1002. {
  1003.    struct tgsi_parse_context p;
  1004.    int high_temp = -1, i;
  1005.  
  1006.    fpc->r_imm = CALLOC(fpc->fp->info.immediate_count, sizeof(struct nvfx_reg));
  1007.  
  1008.    tgsi_parse_init(&p, fpc->fp->pipe.tokens);
  1009.    while (!tgsi_parse_end_of_tokens(&p)) {
  1010.       const union tgsi_full_token *tok = &p.FullToken;
  1011.  
  1012.       tgsi_parse_token(&p);
  1013.       switch(tok->Token.Type) {
  1014.       case TGSI_TOKEN_TYPE_DECLARATION:
  1015.       {
  1016.          const struct tgsi_full_declaration *fdec;
  1017.          fdec = &p.FullToken.FullDeclaration;
  1018.          switch (fdec->Declaration.File) {
  1019.          case TGSI_FILE_INPUT:
  1020.             if (!nvfx_fragprog_parse_decl_input(fpc, fdec))
  1021.                goto out_err;
  1022.             break;
  1023.          case TGSI_FILE_OUTPUT:
  1024.             if (!nvfx_fragprog_parse_decl_output(fpc, fdec))
  1025.                goto out_err;
  1026.             break;
  1027.          case TGSI_FILE_TEMPORARY:
  1028.             if (fdec->Range.Last > high_temp) {
  1029.                high_temp =
  1030.                   fdec->Range.Last;
  1031.             }
  1032.             break;
  1033.          default:
  1034.             break;
  1035.          }
  1036.       }
  1037.          break;
  1038.       case TGSI_TOKEN_TYPE_IMMEDIATE:
  1039.       {
  1040.          struct tgsi_full_immediate *imm;
  1041.  
  1042.          imm = &p.FullToken.FullImmediate;
  1043.          assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
  1044.          assert(fpc->nr_imm < fpc->fp->info.immediate_count);
  1045.  
  1046.          fpc->r_imm[fpc->nr_imm++] = nvfx_fp_imm(fpc, imm->u[0].Float, imm->u[1].Float, imm->u[2].Float, imm->u[3].Float);
  1047.          break;
  1048.       }
  1049.       default:
  1050.          break;
  1051.       }
  1052.    }
  1053.    tgsi_parse_free(&p);
  1054.  
  1055.    tgsi_parse_init(&p, fpc->fp->pipe.tokens);
  1056.    while (!tgsi_parse_end_of_tokens(&p)) {
  1057.       const struct tgsi_full_declaration *fdec;
  1058.       tgsi_parse_token(&p);
  1059.       switch(p.FullToken.Token.Type) {
  1060.       case TGSI_TOKEN_TYPE_DECLARATION:
  1061.          fdec = &p.FullToken.FullDeclaration;
  1062.          switch (fdec->Declaration.File) {
  1063.          case TGSI_FILE_INPUT:
  1064.             if (!nvfx_fragprog_assign_generic(fpc, fdec))
  1065.                goto out_err;
  1066.             break;
  1067.          default:
  1068.             break;
  1069.          }
  1070.          break;
  1071.       default:
  1072.          break;
  1073.       }
  1074.    }
  1075.    tgsi_parse_free(&p);
  1076.  
  1077.    if (++high_temp) {
  1078.       fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
  1079.       for (i = 0; i < high_temp; i++)
  1080.          fpc->r_temp[i] = temp(fpc);
  1081.       fpc->r_temps_discard = 0ULL;
  1082.    }
  1083.  
  1084.    return TRUE;
  1085.  
  1086. out_err:
  1087.    FREE(fpc->r_temp);
  1088.    fpc->r_temp = NULL;
  1089.  
  1090.    tgsi_parse_free(&p);
  1091.    return FALSE;
  1092. }
  1093.  
/* Setting NVFX_DUMP_FP=1 in the environment dumps the TGSI source and the
 * generated hardware instructions after each successful translation. */
DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
  1095.  
  1096. void
  1097. _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
  1098. {
  1099.    struct tgsi_parse_context parse;
  1100.    struct nvfx_fpc *fpc = NULL;
  1101.    struct util_dynarray insns;
  1102.  
  1103.    fp->translated = FALSE;
  1104.    fp->point_sprite_control = 0;
  1105.    fp->vp_or = 0;
  1106.  
  1107.    fpc = CALLOC_STRUCT(nvfx_fpc);
  1108.    if (!fpc)
  1109.       goto out_err;
  1110.  
  1111.    fpc->is_nv4x = (oclass >= NV40_3D_CLASS) ? ~0 : 0;
  1112.    fpc->max_temps = fpc->is_nv4x ? 48 : 32;
  1113.    fpc->fp = fp;
  1114.    fpc->num_regs = 2;
  1115.    memset(fp->texcoord, 0xff, sizeof(fp->texcoord));
  1116.  
  1117.    if (fp->info.properties[TGSI_PROPERTY_FS_COORD_ORIGIN])
  1118.       fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_ORIGIN_INVERTED;
  1119.    if (fp->info.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER])
  1120.       fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_CENTER_INTEGER;
  1121.    if (fp->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
  1122.       fp->rt_enable |= NV30_3D_RT_ENABLE_MRT;
  1123.  
  1124.    if (!nvfx_fragprog_prepare(fpc))
  1125.       goto out_err;
  1126.  
  1127.    tgsi_parse_init(&parse, fp->pipe.tokens);
  1128.    util_dynarray_init(&insns);
  1129.  
  1130.    while (!tgsi_parse_end_of_tokens(&parse)) {
  1131.       tgsi_parse_token(&parse);
  1132.  
  1133.       switch (parse.FullToken.Token.Type) {
  1134.       case TGSI_TOKEN_TYPE_INSTRUCTION:
  1135.       {
  1136.          const struct tgsi_full_instruction *finst;
  1137.  
  1138.          util_dynarray_append(&insns, unsigned, fp->insn_len);
  1139.          finst = &parse.FullToken.FullInstruction;
  1140.          if (!nvfx_fragprog_parse_instruction(fpc, finst))
  1141.             goto out_err;
  1142.       }
  1143.          break;
  1144.       default:
  1145.          break;
  1146.       }
  1147.    }
  1148.    util_dynarray_append(&insns, unsigned, fp->insn_len);
  1149.  
  1150.    for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
  1151.    {
  1152.       struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)fpc->label_relocs.data + i);
  1153.       fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
  1154.    }
  1155.    util_dynarray_fini(&insns);
  1156.  
  1157.    if(!fpc->is_nv4x)
  1158.       fp->fp_control |= (fpc->num_regs-1)/2;
  1159.    else
  1160.       fp->fp_control |= fpc->num_regs << NV40_3D_FP_CONTROL_TEMP_COUNT__SHIFT;
  1161.  
  1162.    /* Terminate final instruction */
  1163.    if(fp->insn)
  1164.       fp->insn[fpc->inst_offset] |= 0x00000001;
  1165.  
  1166.    /* Append NOP + END instruction for branches to the end of the program */
  1167.    fpc->inst_offset = fp->insn_len;
  1168.    grow_insns(fpc, 4);
  1169.    fp->insn[fpc->inst_offset + 0] = 0x00000001;
  1170.    fp->insn[fpc->inst_offset + 1] = 0x00000000;
  1171.    fp->insn[fpc->inst_offset + 2] = 0x00000000;
  1172.    fp->insn[fpc->inst_offset + 3] = 0x00000000;
  1173.  
  1174.    if(debug_get_option_nvfx_dump_fp())
  1175.    {
  1176.       debug_printf("\n");
  1177.       tgsi_dump(fp->pipe.tokens, 0);
  1178.  
  1179.       debug_printf("\n%s fragment program:\n", fpc->is_nv4x ? "nv4x" : "nv3x");
  1180.       for (unsigned i = 0; i < fp->insn_len; i += 4)
  1181.          debug_printf("%3u: %08x %08x %08x %08x\n", i >> 2, fp->insn[i], fp->insn[i + 1], fp->insn[i + 2], fp->insn[i + 3]);
  1182.       debug_printf("\n");
  1183.    }
  1184.  
  1185.    fp->translated = TRUE;
  1186.  
  1187. out:
  1188.    tgsi_parse_free(&parse);
  1189.    if(fpc)
  1190.    {
  1191.       FREE(fpc->r_temp);
  1192.       FREE(fpc->r_imm);
  1193.       util_dynarray_fini(&fpc->if_stack);
  1194.       util_dynarray_fini(&fpc->label_relocs);
  1195.       util_dynarray_fini(&fpc->imm_data);
  1196.       //util_dynarray_fini(&fpc->loop_stack);
  1197.       FREE(fpc);
  1198.    }
  1199.  
  1200.    return;
  1201.  
  1202. out_err:
  1203.    _debug_printf("Error: failed to compile this fragment program:\n");
  1204.    tgsi_dump(fp->pipe.tokens, 0);
  1205.    goto out;
  1206. }
  1207.