/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Code generation for the whole fragment pipeline.
 *
 * The fragment pipeline consists of the following stages:
 * - early depth test
 * - fragment shader
 * - alpha test
 * - depth/stencil test
 * - blending
 *
 * This file has only the glue to assemble the fragment pipeline.  The actual
 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
 * lp_bld_*.[ch] files, in a completely generic and reusable way.  Here we
 * muster the LLVM JIT execution engine to create a function that follows an
 * established binary interface and that can be called from C directly.
 *
 * A big source of complexity here is that we often want to run different
 * stages with different precisions and data types.  For example, the
 * fragment shader typically needs to be done in floats, but the
 * depth/stencil test and blending are better done in the types that most
 * closely match the depth/stencil and color buffers, respectively.
 *
 * Since the width of a SIMD vector register stays the same regardless of the
 * element type, different types imply different numbers of elements, so we
 * must code generate more instances of the stages with larger types to be
 * able to feed/consume the stages with smaller types.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
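
/*
 * For example (illustrative numbers, not specific to this file): with
 * 128-bit SIMD registers a 32-bit float type holds 4 elements while an
 * 8-bit unorm type holds 16, so a single instance of an 8-bit blend stage
 * consumes the output of four 4-wide float shader stage instances.
 */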

#include <limits.h>
#include "pipe/p_defines.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_pointer.h"
#include "util/u_format.h"
#include "util/u_dump.h"
#include "util/u_string.h"
#include "util/u_simple_list.h"
#include "util/u_dual_blend.h"
#include "os/os_time.h"
#include "pipe/p_shader_tokens.h"
#include "draw/draw_context.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_scan.h"
#include "tgsi/tgsi_parse.h"
#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_tgsi.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_pack.h"
#include "gallivm/lp_bld_format.h"
#include "gallivm/lp_bld_quad.h"

#include "lp_bld_alpha.h"
#include "lp_bld_blend.h"
#include "lp_bld_depth.h"
#include "lp_bld_interp.h"
#include "lp_context.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_setup.h"
#include "lp_state.h"
#include "lp_tex_sample.h"
#include "lp_flush.h"
#include "lp_state_fs.h"
#include "lp_rast.h"


/** Fragment shader number (for debugging) */
static unsigned fs_no = 0;

/**
 * Expand the relevant bits of mask_input to an n*4-dword mask for the
 * n*4 pixels in n 2x2 quads.  This will set the n*4 elements of the
 * quad mask vector to 0 or ~0.
 * Quads are grouped {0,1} and {2,3} in two-quad mode, hence only 0 and 2
 * are valid first_quad arguments when the fs length is 8.
 *
 * \param first_quad  which quad(s) of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(struct gallivm_state *gallivm,
                   struct lp_type fs_type,
                   unsigned first_quad,
                   LLVMValueRef mask_input) /* int32 */
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type mask_type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMValueRef bits[16];
   LLVMValueRef mask;
   int shift, i;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length <= Elements(bits));
   mask_type = lp_int_type(fs_type);

   /*
    * mask_input >>= (quad * 4)
    */
   switch (first_quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      assert(fs_type.length == 4);
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      assert(fs_type.length == 4);
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder,
                              mask_input,
                              LLVMConstInt(i32t, shift, 0),
                              "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */
   mask = lp_build_broadcast(gallivm,
                             lp_build_vec_type(gallivm, mask_type),
                             mask_input);

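   /*
    * Bit layout (as implied by the shifts above, a sketch): mask_input packs
    * the 4x4 stamp row-major with one bit per pixel, so a 2x2 quad whose
    * top-left pixel is bit j owns bits {j, j+1, j+4, j+5}; the j computed
    * below selects that base bit for each quad of the fs vector.
    */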
   for (i = 0; i < fs_type.length / 4; i++) {
      unsigned j = 2 * (i % 2) + (i / 2) * 8;
      bits[4*i + 0] = LLVMConstInt(i32t, 1 << (j + 0), 0);
      bits[4*i + 1] = LLVMConstInt(i32t, 1 << (j + 1), 0);
      bits[4*i + 2] = LLVMConstInt(i32t, 1 << (j + 4), 0);
      bits[4*i + 3] = LLVMConstInt(i32t, 1 << (j + 5), 0);
   }
   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), "");

   /*
    * mask = mask != 0 ? ~0 : 0
    */
   mask = lp_build_compare(gallivm,
                           mask_type, PIPE_FUNC_NOTEQUAL,
                           mask,
                           lp_build_const_int_vec(gallivm, mask_type, 0));

   return mask;
}


#define EARLY_DEPTH_TEST  0x1
#define LATE_DEPTH_TEST   0x2
#define EARLY_DEPTH_WRITE 0x4
#define LATE_DEPTH_WRITE  0x8

static int
find_output_by_semantic( const struct tgsi_shader_info *info,
                         unsigned semantic,
                         unsigned index )
{
   int i;

   for (i = 0; i < info->num_outputs; i++)
      if (info->output_semantic_name[i] == semantic &&
          info->output_semantic_index[i] == index)
         return i;

   return -1;
}


/**
 * Generate the fragment shader, depth/stencil test, and alpha tests.
 */
static void
generate_fs_loop(struct gallivm_state *gallivm,
                 struct lp_fragment_shader *shader,
                 const struct lp_fragment_shader_variant_key *key,
                 LLVMBuilderRef builder,
                 struct lp_type type,
                 LLVMValueRef context_ptr,
                 LLVMValueRef num_loop,
                 struct lp_build_interp_soa_context *interp,
                 struct lp_build_sampler_soa *sampler,
                 LLVMValueRef mask_store,
                 LLVMValueRef (*out_color)[4],
                 LLVMValueRef depth_ptr,
                 LLVMValueRef depth_stride,
                 LLVMValueRef facing,
                 LLVMValueRef thread_data_ptr)
{
   const struct util_format_description *zs_format_desc = NULL;
   const struct tgsi_token *tokens = shader->base.tokens;
   LLVMTypeRef vec_type;
   LLVMValueRef mask_ptr, mask_val;
   LLVMValueRef consts_ptr;
   LLVMValueRef z;
   LLVMValueRef z_value, s_value;
   LLVMValueRef z_fb, s_fb;
   LLVMValueRef stencil_refs[2];
   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
   struct lp_build_for_loop_state loop_state;
   struct lp_build_mask_context mask;
   /*
    * TODO: figure out if the simple_shader optimization is really worthwhile
    * to keep.  Disabled because it may hide some real bugs in the
    * (depth/stencil) code since tests tend to take another codepath than
    * real shaders.
    */
   boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
                            shader->info.base.num_inputs < 3 &&
                            shader->info.base.num_instructions < 8) && 0;
   const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
                                     util_blend_state_is_dual(&key->blend, 0);
   unsigned attrib;
   unsigned chan;
   unsigned cbuf;
   unsigned depth_mode;

   struct lp_bld_tgsi_system_values system_values;

   memset(&system_values, 0, sizeof(system_values));

   if (key->depth.enabled ||
       key->stencil[0].enabled) {

      zs_format_desc = util_format_description(key->zsbuf_format);
      assert(zs_format_desc);

      if (!shader->info.base.writes_z) {
         if (key->alpha.enabled || shader->info.base.uses_kill) {
            /* With alpha test and kill, we can do the depth test early
             * and hopefully eliminate some quads.  But we need to do a
             * special deferred depth write once the final mask value
             * is known.  This only works though if there's either no
             * stencil test or the stencil value isn't written.
             */
            if (key->stencil[0].enabled && (key->stencil[0].writemask ||
                                            (key->stencil[1].enabled &&
                                             key->stencil[1].writemask)))
               depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
            else
               depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
         }
         else
            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
      }
      else {
         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
      }

      if (!(key->depth.enabled && key->depth.writemask) &&
          !(key->stencil[0].enabled && (key->stencil[0].writemask ||
                                        (key->stencil[1].enabled &&
                                         key->stencil[1].writemask))))
         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
   }
   else {
      depth_mode = 0;
   }
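
   /*
    * Summary of the above: the depth test runs early when the shader does
    * not write Z (and written stencil values do not interfere); the depth
    * write is deferred until after the shader when alpha test or kill can
    * still shrink the mask; and both write bits are dropped when neither
    * depth nor stencil is actually written.
    */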


   stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
   stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);

   vec_type = lp_build_vec_type(gallivm, type);

   consts_ptr = lp_jit_context_constants(gallivm, context_ptr);

   lp_build_for_loop_begin(&loop_state, gallivm,
                           lp_build_const_int32(gallivm, 0),
                           LLVMIntULT,
                           num_loop,
                           lp_build_const_int32(gallivm, 1));

   mask_ptr = LLVMBuildGEP(builder, mask_store,
                           &loop_state.counter, 1, "mask_ptr");
   mask_val = LLVMBuildLoad(builder, mask_ptr, "");

   memset(outputs, 0, sizeof outputs);

   for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
                                                       lp_build_vec_type(gallivm,
                                                                         type),
                                                       num_loop, "color");
      }
   }
   if (dual_source_blend) {
      assert(key->nr_cbufs <= 1);
      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[1][chan] = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm,
                                                                      type),
                                                    num_loop, "color1");
      }
   }


   /* 'mask' will control execution based on each quad's pixel alive/killed state */
   lp_build_mask_begin(&mask, gallivm, type, mask_val);

   if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
      lp_build_mask_check(&mask);

   lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter);
   z = interp->pos[2];

   if (depth_mode & EARLY_DEPTH_TEST) {
      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);
      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  &mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !simple_shader);

      if (depth_mode & EARLY_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
      /*
       * Note that when stencil is enabled, the mask check must come after
       * the depth/stencil write, not after the stencil test; otherwise new
       * stencil values may not get written if all fragments were killed by
       * the depth/stencil test.
       */
      if (!simple_shader && key->stencil[0].enabled)
         lp_build_mask_check(&mask);
   }

   lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);

   /* Build the actual shader */
   lp_build_tgsi_soa(gallivm, tokens, type, &mask,
                     consts_ptr, &system_values,
                     interp->inputs,
                     outputs, sampler, &shader->info.base, NULL);

   /* Alpha test */
   if (key->alpha.enabled) {
      int color0 = find_output_by_semantic(&shader->info.base,
                                           TGSI_SEMANTIC_COLOR,
                                           0);

      if (color0 != -1 && outputs[color0][3]) {
         const struct util_format_description *cbuf_format_desc;
         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
         LLVMValueRef alpha_ref_value;

         alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_ptr);
         alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);

         cbuf_format_desc = util_format_description(key->cbuf_format[0]);

         lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
                             &mask, alpha, alpha_ref_value,
                             (depth_mode & LATE_DEPTH_TEST) != 0);
      }
   }

   /* Late Z test */
   if (depth_mode & LATE_DEPTH_TEST) {
      int pos0 = find_output_by_semantic(&shader->info.base,
                                         TGSI_SEMANTIC_POSITION,
                                         0);

      if (pos0 != -1 && outputs[pos0][2]) {
         z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
      }

      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);

      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  &mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !simple_shader);
      /* Late Z write */
      if (depth_mode & LATE_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
   }
   else if ((depth_mode & EARLY_DEPTH_TEST) &&
            (depth_mode & LATE_DEPTH_WRITE))
   {
      /* Need to apply a reduced mask to the depth write.  Reload the
       * depth value, update it from z_value/s_value with the new mask
       * value, and write that out.
       */
      lp_build_depth_stencil_write_swizzled(gallivm, type,
                                            zs_format_desc, key->resource_1d,
                                            &mask, z_fb, s_fb, loop_state.counter,
                                            depth_ptr, depth_stride,
                                            z_value, s_value);
   }


   /* Color write */
   for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
   {
      unsigned cbuf = shader->info.base.output_semantic_index[attrib];
      if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) &&
           ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)))
      {
         for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
            if(outputs[attrib][chan]) {
               /* XXX: just initialize outputs to point at colors[] and
                * skip this.
                */
               LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
               LLVMValueRef color_ptr;
               color_ptr = LLVMBuildGEP(builder, out_color[cbuf][chan],
                                        &loop_state.counter, 1, "");
               lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
               LLVMBuildStore(builder, out, color_ptr);
            }
         }
      }
   }

   if (key->occlusion_count) {
      LLVMValueRef counter = lp_jit_thread_data_counter(gallivm, thread_data_ptr);
      lp_build_name(counter, "counter");
      lp_build_occlusion_count(gallivm, type,
                               lp_build_mask_value(&mask), counter);
   }

   mask_val = lp_build_mask_end(&mask);
   LLVMBuildStore(builder, mask_val, mask_ptr);
   lp_build_for_loop_end(&loop_state);
}


/**
 * This function reorders pixels from the fragment shader SoA layout to
 * the memory AoS layout.
 *
 * The fragment shader outputs pixels in small 2x2 blocks,
 *  e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
 *
 * However, in memory pixels are stored in rows,
 *  e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
 *
 * @param type            fragment shader type (4x or 8x float)
 * @param num_fs          number of fs_src vectors
 * @param dst_channels    number of output channels
 * @param fs_src          output from fragment shader
 * @param dst             pointer to store result
 * @param pad_inline      is channel padding inline or at end of row
 * @return                the number of dsts
 */
static int
generate_fs_twiddle(struct gallivm_state *gallivm,
                    struct lp_type type,
                    unsigned num_fs,
                    unsigned dst_channels,
                    LLVMValueRef fs_src[][4],
                    LLVMValueRef* dst,
                    bool pad_inline)
{
   LLVMValueRef src[16];

   bool swizzle_pad;
   bool twiddle;
   bool split;

   unsigned pixels = type.length / 4;
   unsigned reorder_group;
   unsigned src_channels;
   unsigned src_count;
   unsigned i;

   src_channels = dst_channels < 3 ? dst_channels : 4;
   src_count = num_fs * src_channels;

   assert(pixels == 2 || pixels == 1);
   assert(num_fs * src_channels <= Elements(src));

   /*
    * Transpose from SoA -> AoS
    */
   for (i = 0; i < num_fs; ++i) {
      lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]);
   }

   /*
    * Pick transformation options
    */
   swizzle_pad = false;
   twiddle = false;
   split = false;
   reorder_group = 0;

   if (dst_channels == 1) {
      twiddle = true;

      if (pixels == 2) {
         split = true;
      }
   } else if (dst_channels == 2) {
      if (pixels == 1) {
         reorder_group = 1;
      }
   } else if (dst_channels > 2) {
      if (pixels == 1) {
         reorder_group = 2;
      } else {
         twiddle = true;
      }

      if (!pad_inline && dst_channels == 3 && pixels > 1) {
         swizzle_pad = true;
      }
   }

   /*
    * Split the src in half
    */
   if (split) {
      for (i = num_fs; i > 0; --i) {
         src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
         src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
      }

      src_count *= 2;
      type.length = 4;
   }

   /*
    * Ensure pixels are in memory order
    */
   if (reorder_group) {
      /* Twiddle pixels by reordering the array, e.g.:
       *
       * src_count =  8 -> 0 2 1 3 4 6 5 7
       * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
       */
      const unsigned reorder_sw[] = { 0, 2, 1, 3 };

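      /*
       * Worked example (from the src_count = 16 mapping above, with
       * reorder_group = 2): i = 2 gives group = 1, block = 0,
       * j = reorder_sw[1] * 2 + 0 = 4, so dst[2] = src[4] and likewise
       * dst[3] = src[5], swapping pixel pairs into row order.
       */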
      for (i = 0; i < src_count; ++i) {
         unsigned group = i / reorder_group;
         unsigned block = (group / 4) * 4 * reorder_group;
         unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
         dst[i] = src[j];
      }
   } else if (twiddle) {
      /* Twiddle pixels across elements of array */
      lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
   } else {
      /* Do nothing */
      memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
   }

   /*
    * Moves any padding between pixels to the end
    * e.g. RGBXRGBX -> RGBRGBXX
    */
   if (swizzle_pad) {
      unsigned char swizzles[16];
      unsigned elems = pixels * dst_channels;

      for (i = 0; i < type.length; ++i) {
         if (i < elems)
            swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
         else
            swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
      }

      for (i = 0; i < src_count; ++i) {
         dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length);
      }
   }

   return src_count;
}


/**
 * Load an unswizzled block of pixels from memory
 */
static void
load_unswizzled_block(struct gallivm_state *gallivm,
                      LLVMValueRef base_ptr,
                      LLVMValueRef stride,
                      unsigned block_width,
                      unsigned block_height,
                      LLVMValueRef* dst,
                      struct lp_type dst_type,
                      unsigned dst_count,
                      unsigned dst_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned row_size = dst_count / block_height;
   unsigned i;

   /* Ensure block exactly fits into dst */
   assert((block_width * block_height) % dst_count == 0);

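   /*
    * Addressing sketch (illustrative numbers): with a 4x4 block read as
    * four 4x32-bit vectors, row_size is 1, so bx is always 0 and by is
    * i * stride, i.e. one aligned vector load per row of the block.
    */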
   for (i = 0; i < dst_count; ++i) {
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef dst_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
      dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");

      dst[i] = LLVMBuildLoad(builder, dst_ptr, "");

      lp_set_load_alignment(dst[i], dst_alignment);
   }
}


/**
 * Store an unswizzled block of pixels to memory
 */
static void
store_unswizzled_block(struct gallivm_state *gallivm,
                       LLVMValueRef base_ptr,
                       LLVMValueRef stride,
                       unsigned block_width,
                       unsigned block_height,
                       LLVMValueRef* src,
                       struct lp_type src_type,
                       unsigned src_count,
                       unsigned src_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned row_size = src_count / block_height;
   unsigned i;

   /* Ensure src exactly fits into block */
   assert((block_width * block_height) % src_count == 0);

   for (i = 0; i < src_count; ++i) {
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef src_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
      src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");

      src_ptr = LLVMBuildStore(builder, src[i], src_ptr);

      lp_set_store_alignment(src_ptr, src_alignment);
   }
}


/**
 * Checks if a format description is an arithmetic format,
 * i.e. a format which has irregular channel sizes such as R3_G3_B2 or
 * R5_G6_B5.
 */
static INLINE boolean
is_arithmetic_format(const struct util_format_description *format_desc)
{
   boolean arith = false;
   unsigned i;

   for (i = 0; i < format_desc->nr_channels; ++i) {
      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
      arith |= (format_desc->channel[i].size % 8) != 0;
   }

   return arith;
}


/**
 * Checks if this format requires special handling due to required expansion
 * to floats for blending, and furthermore has "natural" packed AoS -> unpacked
 * SoA conversion.
 */
static INLINE boolean
format_expands_to_float_soa(const struct util_format_description *format_desc)
{
   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
      return true;
   }
   return false;
}


/**
 * Retrieves the type representing the memory layout for a format
 *
 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
 */
static INLINE void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                             struct lp_type* type)
{
   unsigned i;
   unsigned chan;

   if (format_expands_to_float_soa(format_desc)) {
      /* just make this a 32bit uint */
      type->floating = false;
      type->fixed = false;
      type->sign = false;
      type->norm = false;
      type->width = 32;
      type->length = 1;
      return;
   }

   for (i = 0; i < 4; i++)
      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
         break;
   chan = i;

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm     = format_desc->channel[chan].normalized;

   if (is_arithmetic_format(format_desc)) {
      type->width = 0;
      type->length = 1;

      for (i = 0; i < format_desc->nr_channels; ++i) {
         type->width += format_desc->channel[i].size;
      }
   } else {
      type->width = format_desc->channel[chan].size;
      type->length = format_desc->nr_channels;
   }
}


/**
 * Retrieves the type for a format which is usable in the blending code.
 *
 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
 */
static INLINE void
lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
                               struct lp_type* type)
{
   unsigned i;
   unsigned chan;

   if (format_expands_to_float_soa(format_desc)) {
      /* always use ordinary floats for blending */
      type->floating = true;
      type->fixed = false;
      type->sign = true;
      type->norm = false;
      type->width = 32;
      type->length = 4;
      return;
   }

   for (i = 0; i < 4; i++)
      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
         break;
   chan = i;

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm     = format_desc->channel[chan].normalized;
   type->width    = format_desc->channel[chan].size;
   type->length   = format_desc->nr_channels;

   for (i = 1; i < format_desc->nr_channels; ++i) {
      if (format_desc->channel[i].size > type->width)
         type->width = format_desc->channel[i].size;
   }

   if (type->floating) {
      type->width = 32;
   } else {
      if (type->width <= 8) {
         type->width = 8;
      } else if (type->width <= 16) {
         type->width = 16;
      } else {
         type->width = 32;
      }
   }

   if (is_arithmetic_format(format_desc) && type->length == 3) {
      type->length = 4;
   }
}


/**
 * Scale a normalized value from src_bits to dst_bits
 */
static INLINE LLVMValueRef
scale_bits(struct gallivm_state *gallivm,
           int src_bits,
           int dst_bits,
           LLVMValueRef src,
           struct lp_type src_type)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef result = src;

   if (dst_bits < src_bits) {
      /* Scale down by LShr */
      result = LLVMBuildLShr(builder,
                             src,
                             lp_build_const_int_vec(gallivm, src_type, src_bits - dst_bits),
                             "");
   } else if (dst_bits > src_bits) {
      /* Scale up bits */
      int db = dst_bits - src_bits;

      /* Shift left by difference in bits */
      result = LLVMBuildShl(builder,
                            src,
                            lp_build_const_int_vec(gallivm, src_type, db),
                            "");

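      /*
       * Fill the now-vacant low bits by replicating the source so the
       * maximum value stays maximal.  Sketch with assumed sizes: scaling
       * 5 -> 8 bits gives db = 3, and 0b11111 becomes
       * (0b11111 << 3) | (0b11111 >> 2) = 0xff.
       */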
      if (db < src_bits) {
         /* Enough bits in src to fill the remainder */
         LLVMValueRef lower = LLVMBuildLShr(builder,
                                            src,
                                            lp_build_const_int_vec(gallivm, src_type, src_bits - db),
                                            "");

         result = LLVMBuildOr(builder, result, lower, "");
      } else if (db > src_bits) {
         /* Need to repeatedly copy src bits to fill remainder in dst */
         unsigned n;

         for (n = src_bits; n < dst_bits; n *= 2) {
            LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);

            result = LLVMBuildOr(builder,
                                 result,
                                 LLVMBuildLShr(builder, result, shuv, ""),
                                 "");
         }
      }
   }

   return result;
}


/**
 * Convert from memory format to blending format
 *
 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
 */
static void
convert_to_blend_type(struct gallivm_state *gallivm,
                      unsigned block_size,
                      const struct util_format_description *src_fmt,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef* src, // and dst
                      unsigned num_srcs)
{
   LLVMValueRef *dst = src;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type blend_type;
   struct lp_type mem_type;
   unsigned i, j, k;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * Fully custom path for packed floats and srgb formats - none of the
    * functions below would do anything useful, and given the lp_type
    * representation they can't be fixed.  Should really have some SoA blend
    * path for these kinds of formats rather than hacking them in here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      LLVMValueRef tmpsrc[4];
      /*
       * This is pretty suboptimal; for this case blending in SoA would be
       * much better, since conversion gets us SoA values which we then need
       * to convert back.
       */
      assert(src_type.width == 32);
      assert(dst_type.floating);
      assert(dst_type.width == 32);
      assert(dst_type.length % 4 == 0);
      assert(num_srcs % 4 == 0);

      for (i = 0; i < 4; i++) {
         tmpsrc[i] = src[i];
      }
      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4];
         LLVMValueRef tmps = tmpsrc[i];
         if (dst_type.length == 8) {
            LLVMValueRef shuffles[8];
            unsigned j;
            /* fetch was 4 values but need 8-wide output values */
            tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
            /*
             * for 8-wide aos transpose would give us wrong order not matching
             * incoming converted fs values and mask. ARGH.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j] = lp_build_const_int32(gallivm, j * 2);
               shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
            }
            tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
                                          LLVMConstVector(shuffles, 8), "");
         }
         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
         }
         else {
            lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
         }
         lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
      }
      return;
   }

   lp_mem_type_from_format_desc(src_fmt, &mem_type);
   lp_blend_type_from_format_desc(src_fmt, &blend_type);

   /* Is the format arithmetic */
   is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
   is_arith &= !(mem_type.width == 16 && mem_type.floating);

   /* Pad if necessary */
   if (!is_arith && src_type.length < dst_type.length) {
      for (i = 0; i < num_srcs; ++i) {
         dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
      }

      src_type.length = dst_type.length;
   }

   /* Special case for half-floats */
   if (mem_type.width == 16 && mem_type.floating) {
      assert(blend_type.width == 32 && blend_type.floating);
      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
      is_arith = false;
   }

   if (!is_arith) {
      return;
   }

   src_type.width = blend_type.width * blend_type.length;
   blend_type.length *= pixels;
   src_type.length *= pixels / (src_type.length / mem_type.length);

   for (i = 0; i < num_srcs; ++i) {
      LLVMValueRef chans[4];
      LLVMValueRef res = NULL;

      dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");

      for (j = 0; j < src_fmt->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = src_fmt->channel[j].shift;
#ifdef PIPE_ARCH_LITTLE_ENDIAN
         unsigned from_lsb = j;
#else
         unsigned from_lsb = src_fmt->nr_channels - j - 1;
#endif

         for (k = 0; k < src_fmt->channel[j].size; ++k) {
            mask |= 1 << k;
         }

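         /*
          * Sketch for R5G6B5 (assumed example): channel 1 has size 6 and
          * shift 5, so mask = 0x3f; the value is shifted down by sa,
          * masked, scaled up to blend_type.width bits, then moved into
          * its own lane at from_lsb * blend_type.width.
          */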
         /* Extract bits from source */
         chans[j] = LLVMBuildLShr(builder,
                                  dst[i],
                                  lp_build_const_int_vec(gallivm, src_type, sa),
                                  "");

         chans[j] = LLVMBuildAnd(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, mask),
                                 "");

         /* Scale bits */
         if (src_type.norm) {
            chans[j] = scale_bits(gallivm, src_fmt->channel[j].size,
                                  blend_type.width, chans[j], src_type);
         }

         /* Insert bits into correct position */
         chans[j] = LLVMBuildShl(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
                                 "");

         if (j == 0) {
            res = chans[j];
         } else {
            res = LLVMBuildOr(builder, res, chans[j], "");
         }
      }

      dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
   }
}


/**
 * Convert from blending format to memory format
 *
 * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
 */
static void
convert_from_blend_type(struct gallivm_state *gallivm,
                        unsigned block_size,
                        const struct util_format_description *src_fmt,
                        struct lp_type src_type,
                        struct lp_type dst_type,
                        LLVMValueRef* src, // and dst
                        unsigned num_srcs)
{
   LLVMValueRef* dst = src;
   unsigned i, j, k;
   struct lp_type mem_type;
   struct lp_type blend_type;
   LLVMBuilderRef builder = gallivm->builder;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * Fully custom path for packed floats and srgb formats - none of the
    * functions below would do anything useful, and given the lp_type
    * representation they can't be fixed.  Should really have some SoA blend
    * path for these kinds of formats rather than hacking them in here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      /*
       * This is pretty suboptimal; for this case blending in SoA would be
       * much better - we need to transpose the AoS values back to SoA
       * values for conversion/packing.
       */
      assert(src_type.floating);
      assert(src_type.width == 32);
      assert(src_type.length % 4 == 0);
      assert(dst_type.width == 32);

      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4], tmpdst;
         lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
         /* really really need SoA here */

         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
         }
         else {
            tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
                                                   src_type, tmpsoa);
         }

         if (src_type.length == 8) {
            LLVMValueRef tmpaos, shuffles[8];
            unsigned j;
            /*
             * for 8-wide aos transpose has given us wrong order not matching
             * output order. HMPF. Also need to split the output values
             * manually.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j * 2] = lp_build_const_int32(gallivm, j);
               shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
            }
            tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
                                            LLVMConstVector(shuffles, 8), "");
            src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
            src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
         }
         else {
            src[i] = tmpdst;
         }
      }
      return;
   }

   lp_mem_type_from_format_desc(src_fmt, &mem_type);
   lp_blend_type_from_format_desc(src_fmt, &blend_type);

   is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);

   /* Special case for half-floats */
   if (mem_type.width == 16 && mem_type.floating) {
      int length = dst_type.length;
      assert(blend_type.width == 32 && blend_type.floating);

      dst_type.length = src_type.length;

      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);

      dst_type.length = length;
      is_arith = false;
   }

   /* Remove any padding */
   if (!is_arith && (src_type.length % mem_type.length)) {
      src_type.length -= (src_type.length % mem_type.length);

      for (i = 0; i < num_srcs; ++i) {
         dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
      }
   }

   /* No bit arithmetic to do */
   if (!is_arith) {
      return;
   }

   src_type.length = pixels;
   src_type.width = blend_type.length * blend_type.width;
   dst_type.length = pixels;

   for (i = 0; i < num_srcs; ++i) {
      LLVMValueRef chans[4];
      LLVMValueRef res = NULL;

      dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");

      for (j = 0; j < src_fmt->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = src_fmt->channel[j].shift;
#ifdef PIPE_ARCH_LITTLE_ENDIAN
         unsigned from_lsb = j;
#else
         unsigned from_lsb = src_fmt->nr_channels - j - 1;
#endif

         assert(blend_type.width > src_fmt->channel[j].size);

         for (k = 0; k < blend_type.width; ++k) {
            mask |= 1 << k;
         }

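         /*
          * Inverse of convert_to_blend_type (sketch): each channel is
          * pulled from its own blend-width lane at from_lsb * width,
          * scaled back down to channel[j].size bits, and shifted to its
          * packed position sa.
          */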
         /* Extract bits */
         chans[j] = LLVMBuildLShr(builder,
                                  dst[i],
                                  lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
                                  "");

         chans[j] = LLVMBuildAnd(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, mask),
                                 "");

         /* Scale down bits */
         if (src_type.norm) {
            chans[j] = scale_bits(gallivm, blend_type.width,
                                  src_fmt->channel[j].size, chans[j], src_type);
         }

         /* Insert bits */
         chans[j] = LLVMBuildShl(builder,
                                 chans[j],
                                 lp_build_const_int_vec(gallivm, src_type, sa),
                                 "");

         sa += src_fmt->channel[j].size;

         if (j == 0) {
            res = chans[j];
         } else {
            res = LLVMBuildOr(builder, res, chans[j], "");
         }
      }

      assert (dst_type.width != 24);

      dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
   }
}


/**
 * Convert alpha to the same blend type as src
 */
static void
convert_alpha(struct gallivm_state *gallivm,
              struct lp_type row_type,
              struct lp_type alpha_type,
              const unsigned block_size,
              const unsigned block_height,
              const unsigned src_count,
              const unsigned dst_channels,
              const bool pad_inline,
              LLVMValueRef* src_alpha)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned i, j;
   unsigned length = row_type.length;
   row_type.length = alpha_type.length;

   /* Twiddle the alpha to match pixels */
   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);

   /*
    * TODO: this should use a single lp_build_conv call for the
    * src_count == 1 && dst_channels == 1 case (dropping the concat below)
    */
   for (i = 0; i < block_height; ++i) {
      lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
   }

   alpha_type = row_type;
   row_type.length = length;

   /* If there's only one channel we only need the single alpha value per pixel */
   if (src_count == 1 && dst_channels == 1) {

      lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height, src_alpha, src_count);
   } else {
      /* If there are more srcs than rows then we need to split alpha up */
      if (src_count > block_height) {
         for (i = src_count; i > 0; --i) {
            unsigned pixels = block_size / src_count;
            unsigned idx = i - 1;

            src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
                                                    (idx * pixels) % 4, pixels);
         }
      }

      /* If there is a src for each pixel broadcast the alpha across the whole row */
      if (src_count == block_size) {
         for (i = 0; i < src_count; ++i) {
            src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
         }
      } else {
         unsigned pixels = block_size / src_count;
         unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
         unsigned alpha_span = 1;
         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

         /* Check if we need 2 src_alphas for our shuffles */
         if (pixels > alpha_type.length) {
            alpha_span = 2;
         }

         /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
         for (j = 0; j < row_type.length; ++j) {
            if (j < pixels * channels) {
               shuffles[j] = lp_build_const_int32(gallivm, j / channels);
            } else {
               shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
            }
         }

         for (i = 0; i < src_count; ++i) {
            unsigned idx1 = i, idx2 = i;

            if (alpha_span > 1){
               idx1 *= alpha_span;
               idx2 = idx1 + 1;
            }

            src_alpha[i] = LLVMBuildShuffleVector(builder,
                                                  src_alpha[idx1],
                                                  src_alpha[idx2],
                                                  LLVMConstVector(shuffles, row_type.length),
                                                  "");
         }
      }
   }
}


/**
 * Generates the blend function for unswizzled colour buffers.
 * Also generates the read & write from the colour buffer.
 */
static void
generate_unswizzled_blend(struct gallivm_state *gallivm,
                          unsigned rt,
                          struct lp_fragment_shader_variant *variant,
                          enum pipe_format out_format,
                          unsigned int num_fs,
                          struct lp_type fs_type,
                          LLVMValueRef* fs_mask,
                          LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
                          LLVMValueRef context_ptr,
                          LLVMValueRef color_ptr,
                          LLVMValueRef stride,
                          unsigned partial_mask,
                          boolean do_branch)
{
   const unsigned alpha_channel = 3;
   const unsigned block_width = LP_RASTER_BLOCK_SIZE;
   const unsigned block_height = LP_RASTER_BLOCK_SIZE;
   const unsigned block_size = block_width * block_height;
   const unsigned lp_integer_vector_width = 128;

   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
   LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
   LLVMValueRef src_alpha[4 * 4];
   LLVMValueRef src1_alpha[4 * 4];
   LLVMValueRef src_mask[4 * 4];
   LLVMValueRef src[4 * 4];
   LLVMValueRef src1[4 * 4];
   LLVMValueRef dst[4 * 4];
   LLVMValueRef blend_color;
   LLVMValueRef blend_alpha;
   LLVMValueRef i32_zero;
   LLVMValueRef check_mask;
   LLVMValueRef undef_src_val;

   struct lp_build_mask_context mask_ctx;
   struct lp_type mask_type;
   struct lp_type blend_type;
   struct lp_type row_type;
   struct lp_type dst_type;

   unsigned char swizzle[TGSI_NUM_CHANNELS];
   unsigned vector_width;
   unsigned src_channels = TGSI_NUM_CHANNELS;
   unsigned dst_channels;
   unsigned dst_count;
   unsigned src_count;
   unsigned i, j;

   const struct util_format_description* out_format_desc = util_format_description(out_format);

   unsigned dst_alignment;

   bool pad_inline = is_arithmetic_format(out_format_desc);
   bool has_alpha = false;
   const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable &&
                                     util_blend_state_is_dual(&variant->key.blend, 0);

   const boolean is_1d = variant->key.resource_1d;
   unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;

   mask_type = lp_int32_vec4_type();
   mask_type.length = fs_type.length;

   for (i = num_fs; i < num_fullblock_fs; i++) {
      fs_mask[i] = lp_build_zero(gallivm, mask_type);
   }
  1428.  
  1429.    /* Do not bother executing code when the mask is empty. */
  1430.    if (do_branch) {
  1431.       check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
  1432.  
  1433.       for (i = 0; i < num_fullblock_fs; ++i) {
  1434.          check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
  1435.       }
  1436.  
  1437.       lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
  1438.       lp_build_mask_check(&mask_ctx);
  1439.    }
  1440.  
  1441.    partial_mask |= !variant->opaque;
  1442.    i32_zero = lp_build_const_int32(gallivm, 0);
  1443.  
  1444. #if HAVE_LLVM < 0x0302
  1445.    /*
  1446.     * undef triggers a crash in LLVMBuildTrunc in convert_from_blend_type in some
  1447.     * cases (seen with r10g10b10a2, 128-bit wide vectors) (only used for the 1d case).
  1448.     */
  1449.    undef_src_val = lp_build_zero(gallivm, fs_type);
  1450. #else
  1451.    undef_src_val = lp_build_undef(gallivm, fs_type);
  1452. #endif
  1453.  
  1454.  
  1455.    /* Get type from output format */
  1456.    lp_blend_type_from_format_desc(out_format_desc, &row_type);
  1457.    lp_mem_type_from_format_desc(out_format_desc, &dst_type);
  1458.  
  1459.    row_type.length = fs_type.length;
  1460.    vector_width    = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
  1461.  
  1462.    /* Compute correct swizzle and count channels */
  1463.    memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
  1464.    dst_channels = 0;
  1465.  
  1466.    for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
  1467.       /* Ensure channel is used */
  1468.       if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
  1469.          continue;
  1470.       }
  1471.  
  1472.       /* Ensure not already written to (happens e.g. with GL_ALPHA) */
  1473.       if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
  1474.          continue;
  1475.       }
  1476.  
  1477.       /* Ensure we haven't already found all channels */
  1478.       if (dst_channels >= out_format_desc->nr_channels) {
  1479.          continue;
  1480.       }
  1481.  
  1482.       swizzle[out_format_desc->swizzle[i]] = i;
  1483.       ++dst_channels;
  1484.  
  1485.       if (i == alpha_channel) {
  1486.          has_alpha = true;
  1487.       }
  1488.    }
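           /*
            * E.g. (illustrative): for PIPE_FORMAT_B8G8R8A8_UNORM the format
            * swizzle is { 2, 1, 0, 3 }, so this yields swizzle = { 2, 1, 0, 3 },
            * dst_channels = 4 and has_alpha = true: the shader's blue output is
            * the first channel written to memory.
            */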
  1489.  
  1490.    if (format_expands_to_float_soa(out_format_desc)) {
  1491.       /*
  1492.        * The code above can't work for layout_other.
  1493.        * For srgb it would sort of work, but we short-circuit swizzles etc.,
  1494.        * as that is done as part of unpack / pack.
  1495.        */
  1496.       dst_channels = 4; /* HACK: not really 4 channels, but we need 4 for the transpose later */
  1497.       has_alpha = true;
  1498.       swizzle[0] = 0;
  1499.       swizzle[1] = 1;
  1500.       swizzle[2] = 2;
  1501.       swizzle[3] = 3;
  1502.       pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
  1503.    }
  1504.  
  1505.    /* If 3 channels then pad to include alpha for 4 element transpose */
  1506.    if (dst_channels == 3 && !has_alpha) {
  1507.       for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
  1508.          if (swizzle[i] > TGSI_NUM_CHANNELS)
  1509.             swizzle[i] = 3;
  1510.       }
  1511.       if (out_format_desc->nr_channels == 4) {
  1512.          dst_channels = 4;
  1513.       }
  1514.    }
  1515.  
  1516.    /*
  1517.     * Load shader output
  1518.     */
  1519.    for (i = 0; i < num_fullblock_fs; ++i) {
  1520.       /* Always load alpha for use in blending */
  1521.       LLVMValueRef alpha;
  1522.       if (i < num_fs) {
  1523.          alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], "");
  1524.       }
  1525.       else {
  1526.          alpha = undef_src_val;
  1527.       }
  1528.  
  1529.       /* Load each channel */
  1530.       for (j = 0; j < dst_channels; ++j) {
  1531.          assert(swizzle[j] < 4);
  1532.          if (i < num_fs) {
  1533.             fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], "");
  1534.          }
  1535.          else {
  1536.             fs_src[i][j] = undef_src_val;
  1537.          }
  1538.       }
  1539.  
  1540.       /* If 3 channels then pad to include alpha for 4 element transpose */
  1541.       /*
  1542.        * XXX If we include that here maybe could actually use it instead of
  1543.        * separate alpha for blending?
  1544.        */
  1545.       if (dst_channels == 3 && !has_alpha) {
  1546.          fs_src[i][3] = alpha;
  1547.       }
  1548.  
  1549.       /* We split the row_mask and row_alpha as we want 128-bit interleave */
  1550.       if (fs_type.length == 8) {
  1551.          src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels);
  1552.          src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels);
  1553.  
  1554.          src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
  1555.          src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
  1556.       } else {
  1557.          src_mask[i] = fs_mask[i];
  1558.          src_alpha[i] = alpha;
  1559.       }
  1560.    }
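           /*
            * E.g. (illustrative): with 8-wide AVX vectors each fs_mask[i] and
            * alpha is split into two 4-wide halves above, giving one
            * src_mask/src_alpha entry per 128-bit row for the twiddling below.
            */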
  1561.    if (dual_source_blend) {
  1562.       /* same as above except different src/dst, skip masks and comments... */
  1563.       for (i = 0; i < num_fullblock_fs; ++i) {
  1564.          LLVMValueRef alpha;
  1565.          if (i < num_fs) {
  1566.             alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], "");
  1567.          }
  1568.          else {
  1569.             alpha = undef_src_val;
  1570.          }
  1571.  
  1572.          for (j = 0; j < dst_channels; ++j) {
  1573.             assert(swizzle[j] < 4);
  1574.             if (i < num_fs) {
  1575.                fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], "");
  1576.             }
  1577.             else {
  1578.                fs_src1[i][j] = undef_src_val;
  1579.             }
  1580.          }
  1581.          if (dst_channels == 3 && !has_alpha) {
  1582.             fs_src1[i][3] = alpha;
  1583.          }
  1584.          if (fs_type.length == 8) {
  1585.             src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
  1586.             src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
  1587.          } else {
  1588.             src1_alpha[i] = alpha;
  1589.          }
  1590.       }
  1591.    }
  1592.  
  1593.    if (util_format_is_pure_integer(out_format)) {
  1594.       /*
  1595.        * In this case fs_type was really ints or uints disguised as floats,
  1596.        * fix that up now.
  1597.        */
  1598.       fs_type.floating = 0;
  1599.       fs_type.sign = dst_type.sign;
  1600.       for (i = 0; i < num_fullblock_fs; ++i) {
  1601.          for (j = 0; j < dst_channels; ++j) {
  1602.             fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
  1603.                                             lp_build_vec_type(gallivm, fs_type), "");
  1604.          }
  1605.          if (dst_channels == 3 && !has_alpha) {
  1606.             fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
  1607.                                             lp_build_vec_type(gallivm, fs_type), "");
  1608.          }
  1609.       }
  1610.    }
  1611.  
  1612.    /*
  1613.     * Pixel twiddle from fragment shader order to memory order
  1614.     */
  1615.    src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
  1616.                                    dst_channels, fs_src, src, pad_inline);
  1617.    if (dual_source_blend) {
  1618.       generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
  1619.                           fs_src1, src1, pad_inline);
  1620.    }
  1621.  
  1622.    src_channels = dst_channels < 3 ? dst_channels : 4;
  1623.    if (src_count != num_fullblock_fs * src_channels) {
  1624.       unsigned ds = src_count / (num_fullblock_fs * src_channels);
  1625.       row_type.length /= ds;
  1626.       fs_type.length = row_type.length;
  1627.    }
  1628.  
  1629.    blend_type = row_type;
  1630.    mask_type.length = 4;
  1631.  
  1632.    /* Convert src to row_type */
  1633.    if (dual_source_blend) {
  1634.       struct lp_type old_row_type = row_type;
  1635.       lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
  1636.       src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type, src1, src_count, src1);
  1637.    }
  1638.    else {
  1639.       src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
  1640.    }
  1641.  
  1642.    /* If the rows are not an SSE vector, combine them to become SSE size! */
  1643.    if ((row_type.width * row_type.length) % 128) {
  1644.       unsigned bits = row_type.width * row_type.length;
  1645.       unsigned combined;
  1646.  
  1647.       assert(src_count >= (vector_width / bits));
  1648.  
  1649.       dst_count = src_count / (vector_width / bits);
  1650.  
  1651.       combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count);
  1652.       if (dual_source_blend) {
  1653.          lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
  1654.       }
  1655.  
  1656.       row_type.length *= combined;
  1657.       src_count /= combined;
  1658.  
  1659.       bits = row_type.width * row_type.length;
  1660.       assert(bits == 128 || bits == 256);
  1661.    }
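           /*
            * Worked example (illustrative): an 8-bit x 8 row is only 64 bits
            * wide, so with vector_width = 128 pairs of rows are concatenated:
            * dst_count = src_count / 2, row_type.length doubles and src_count
            * halves, leaving full 128-bit rows for the code below.
            */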
  1662.  
  1663.  
  1664.    /*
  1665.     * Blend Colour conversion
  1666.     */
  1667.    blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
  1668.    blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
  1669.    blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), "");
  1670.  
  1671.    /* Convert */
  1672.    lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
  1673.  
  1674.    if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
  1675.       /*
  1676.        * since blending is done with floats, there was no conversion.
  1677.        * However, the rules for fixed-point renderbuffers still
  1678.        * apply; that is, we must clamp inputs to 0.0/1.0.
  1679.        * (This would apply to separate alpha conversion too but we currently
  1680.        * force has_alpha to be true.)
  1681.        * TODO: should skip this with "fake" blend, since post-blend conversion
  1682.        * will clamp anyway.
  1683.        * TODO: could also skip this if fragment color clamping is enabled. We
  1684.        * don't support it natively, however, so it gets baked into the shader
  1685.        * and we can't really tell here.
  1686.        */
  1687.       struct lp_build_context f32_bld;
  1688.       assert(row_type.floating);
  1689.       lp_build_context_init(&f32_bld, gallivm, row_type);
  1690.       for (i = 0; i < src_count; i++) {
  1691.          src[i] = lp_build_clamp(&f32_bld, src[i], f32_bld.zero, f32_bld.one);
  1692.       }
  1693.       if (dual_source_blend) {
  1694.          for (i = 0; i < src_count; i++) {
  1695.             src1[i] = lp_build_clamp(&f32_bld, src1[i], f32_bld.zero, f32_bld.one);
  1696.          }
  1697.       }
  1698.       /* probably can't be different from row_type, but better safe than sorry... */
  1699.       lp_build_context_init(&f32_bld, gallivm, blend_type);
  1700.       blend_color = lp_build_clamp(&f32_bld, blend_color, f32_bld.zero, f32_bld.one);
  1701.    }
  1702.  
  1703.    /* Extract alpha */
  1704.    blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3));
  1705.  
  1706.    /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
  1707.    pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width;
  1708.    if (pad_inline) {
  1709.       /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
  1710.       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length);
  1711.    } else {
  1712.       /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
  1713.       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length);
  1714.    }
  1715.  
  1716.    /*
  1717.     * Mask conversion
  1718.     */
  1719.    lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], block_height, &src_mask[0]);
  1720.  
  1721.    if (src_count < block_height) {
  1722.       lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
  1723.    } else if (src_count > block_height) {
  1724.       for (i = src_count; i > 0; --i) {
  1725.          unsigned pixels = block_size / src_count;
  1726.          unsigned idx = i - 1;
  1727.  
  1728.          src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4],
  1729.                                                 (idx * pixels) % 4, pixels);
  1730.       }
  1731.    }
  1732.  
  1733.    assert(mask_type.width == 32);
  1734.  
  1735.    for (i = 0; i < src_count; ++i) {
  1736.       unsigned pixels = block_size / src_count;
  1737.       unsigned pixel_width = row_type.width * dst_channels;
  1738.  
  1739.       if (pixel_width == 24) {
  1740.          mask_type.width = 8;
  1741.          mask_type.length = vector_width / mask_type.width;
  1742.       } else {
  1743.          mask_type.length = pixels;
  1744.          mask_type.width = row_type.width * dst_channels;
  1745.  
  1746.          src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
  1747.  
  1748.          mask_type.length *= dst_channels;
  1749.          mask_type.width /= dst_channels;
  1750.       }
  1751.  
  1752.       src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
  1753.       src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
  1754.    }
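           /*
            * Worked example (illustrative): for an 8-bit RGBA target with 4
            * pixels per row, the 4 x i32 per-pixel mask is bitcast to 16 x i8,
            * replicating each pixel's all-ones/all-zeros mask across its four
            * channel bytes before padding to row_type.length.
            */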
  1755.  
  1756.    /*
  1757.     * Alpha conversion
  1758.     */
  1759.    if (!has_alpha) {
  1760.       struct lp_type alpha_type = fs_type;
  1761.       alpha_type.length = 4;
  1762.       convert_alpha(gallivm, row_type, alpha_type,
  1763.                     block_size, block_height,
  1764.                     src_count, dst_channels,
  1765.                     pad_inline, src_alpha);
  1766.       if (dual_source_blend) {
  1767.          convert_alpha(gallivm, row_type, alpha_type,
  1768.                        block_size, block_height,
  1769.                        src_count, dst_channels,
  1770.                        pad_inline, src1_alpha);
  1771.       }
  1772.    }
  1773.  
  1774.  
  1775.    /*
  1776.     * Load dst from memory
  1777.     */
  1778.    if (src_count < block_height) {
  1779.       dst_count = block_height;
  1780.    } else {
  1781.       dst_count = src_count;
  1782.    }
  1783.  
  1784.    dst_type.length *= block_size / dst_count;
  1785.  
  1786.    if (format_expands_to_float_soa(out_format_desc)) {
  1787.       /*
  1788.        * we need multiple values at once for the conversion, so we may as well
  1789.        * load them vectorized here too instead of concatenating later.
  1790.        * (We still need concatenation later for 8-wide vectors.)
  1791.        */
  1792.       dst_count = block_height;
  1793.       dst_type.length = block_width;
  1794.    }
  1795.  
  1796.    /*
  1797.     * Compute the alignment of the destination pointer in bytes
  1798.     * We fetch 1-4 pixels; if the format has pot alignment then those fetches
  1799.     * are always aligned by MIN2(16, fetch_width), except for buffers (not
  1800.     * 1d textures, but we can't distinguish them here), so we need to stick
  1801.     * with per-pixel alignment in that case.
  1802.     */
  1803.    if (is_1d) {
  1804.       dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
  1805.    }
  1806.    else {
  1807.       dst_alignment = dst_type.length * dst_type.width / 8;
  1808.    }
  1809.    /* Force power-of-two alignment by keeping only the least significant set bit */
  1810.    dst_alignment = 1 << (ffs(dst_alignment) - 1);
  1811.    /*
  1812.     * Resource base and stride pointers are aligned to 16 bytes, so that's
  1813.     * the maximum alignment we can guarantee
  1814.     */
  1815.    dst_alignment = MIN2(16, dst_alignment);
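           /*
            * E.g. (illustrative): a 4 x 32-bit row load gives dst_alignment =
            * 16 bytes; a 1d R8G8B8A8 target gives (32 + 7) / 8 = 4 bytes per
            * pixel. Both are already powers of two and within the 16-byte cap.
            */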
  1816.  
  1817.    if (is_1d) {
  1818.       load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
  1819.                             dst, dst_type, dst_count / 4, dst_alignment);
  1820.       for (i = dst_count / 4; i < dst_count; i++) {
  1821.          dst[i] = lp_build_undef(gallivm, dst_type);
  1822.       }
  1823.  
  1824.    }
  1825.    else {
  1826.       load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
  1827.                             dst, dst_type, dst_count, dst_alignment);
  1828.    }
  1829.  
  1830.  
  1831.    /*
  1832.     * Convert from dst/output format to src/blending format.
  1833.     *
  1834.     * This is necessary as we can only read 1 row from memory at a time,
  1835.     * so the minimum dst_count we can ever have at this point is 4.
  1836.     *
  1837.     * With R8 format, for example, all 16 pixels fit in a single 128-bit
  1838.     * vector; this takes the 4 dsts and combines them into 1 src so we can
  1839.     * perform blending on all 16 pixels in that vector at once.
  1840.     */
  1841.    if (dst_count > src_count) {
  1842.       lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count);
  1843.    }
  1844.  
  1845.    /*
  1846.     * Blending
  1847.     */
  1848.    /* XXX this is broken for RGB8 formats -
  1849.     * they get expanded from 12 to 16 elements (to include alpha)
  1850.     * by convert_to_blend_type then reduced to 15 instead of 12
  1851.     * by convert_from_blend_type (a simple fix though breaks A8...).
  1852.     * R16G16B16 also crashes, though differently; seemingly something goes
  1853.     * wrong inside llvm's handling of npot vector sizes.
  1854.     * It seems some cleanup could be done here (like skipping conversion/blend
  1855.     * when not needed).
  1856.     */
  1857.    convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, row_type, dst, src_count);
  1858.  
  1859.    /*
  1860.     * FIXME: Really should get logic ops / masks out of generic blend / row
  1861.     * format. Logic ops will definitely not work on the blend float format
  1862.     * used for SRGB here, and I think OpenGL expects this to work correctly
  1863.     * (that is, incoming values converted to srgb, then the logic op applied).
  1864.     */
  1865.    for (i = 0; i < src_count; ++i) {
  1866.       dst[i] = lp_build_blend_aos(gallivm,
  1867.                                   &variant->key.blend,
  1868.                                   out_format,
  1869.                                   row_type,
  1870.                                   rt,
  1871.                                   src[i],
  1872.                                   has_alpha ? NULL : src_alpha[i],
  1873.                                   src1[i],
  1874.                                   has_alpha ? NULL : src1_alpha[i],
  1875.                                   dst[i],
  1876.                                   partial_mask ? src_mask[i] : NULL,
  1877.                                   blend_color,
  1878.                                   has_alpha ? NULL : blend_alpha,
  1879.                                   swizzle,
  1880.                                   pad_inline ? 4 : dst_channels);
  1881.    }
  1882.  
  1883.    convert_from_blend_type(gallivm, block_size, out_format_desc, row_type, dst_type, dst, src_count);
  1884.  
  1885.    /* Split the blend rows back to memory rows */
  1886.    if (dst_count > src_count) {
  1887.       row_type.length = dst_type.length * (dst_count / src_count);
  1888.  
  1889.       if (src_count == 1) {
  1890.          dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
  1891.          dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
  1892.  
  1893.          row_type.length /= 2;
  1894.          src_count *= 2;
  1895.       }
  1896.  
  1897.       dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2);
  1898.       dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
  1899.       dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
  1900.       dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
  1901.  
  1902.       row_type.length /= 2;
  1903.       src_count *= 2;
  1904.    }
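           /*
            * E.g. (illustrative): if one blended vector covered the whole block
            * (src_count == 1, dst_count == 4), the first split above halves it
            * into dst[0]/dst[1] and the second halves those again, restoring
            * the four memory rows loaded earlier.
            */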
  1905.  
  1906.    /*
  1907.     * Store blend result to memory
  1908.     */
  1909.    if (is_1d) {
  1910.       store_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
  1911.                              dst, dst_type, dst_count / 4, dst_alignment);
  1912.    }
  1913.    else {
  1914.       store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
  1915.                              dst, dst_type, dst_count, dst_alignment);
  1916.    }
  1917.  
  1918.    if (do_branch) {
  1919.       lp_build_mask_end(&mask_ctx);
  1920.    }
  1921. }
  1922.  
  1923.  
  1924. /**
  1925.  * Generate the runtime callable function for the whole fragment pipeline.
  1926.  * Note that the function which we generate operates on a block of 16
  1927.  * pixels at a time.  The block contains 2x2 quads.  Each quad contains
  1928.  * 2x2 pixels.
  1929.  */
  1930. static void
  1931. generate_fragment(struct llvmpipe_context *lp,
  1932.                   struct lp_fragment_shader *shader,
  1933.                   struct lp_fragment_shader_variant *variant,
  1934.                   unsigned partial_mask)
  1935. {
  1936.    struct gallivm_state *gallivm = variant->gallivm;
  1937.    const struct lp_fragment_shader_variant_key *key = &variant->key;
  1938.    struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
  1939.    char func_name[256];
  1940.    struct lp_type fs_type;
  1941.    struct lp_type blend_type;
  1942.    LLVMTypeRef fs_elem_type;
  1943.    LLVMTypeRef blend_vec_type;
  1944.    LLVMTypeRef arg_types[13];
  1945.    LLVMTypeRef func_type;
  1946.    LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
  1947.    LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
  1948.    LLVMValueRef context_ptr;
  1949.    LLVMValueRef x;
  1950.    LLVMValueRef y;
  1951.    LLVMValueRef a0_ptr;
  1952.    LLVMValueRef dadx_ptr;
  1953.    LLVMValueRef dady_ptr;
  1954.    LLVMValueRef color_ptr_ptr;
  1955.    LLVMValueRef stride_ptr;
  1956.    LLVMValueRef depth_ptr;
  1957.    LLVMValueRef depth_stride;
  1958.    LLVMValueRef mask_input;
  1959.    LLVMValueRef thread_data_ptr;
  1960.    LLVMBasicBlockRef block;
  1961.    LLVMBuilderRef builder;
  1962.    struct lp_build_sampler_soa *sampler;
  1963.    struct lp_build_interp_soa_context interp;
  1964.    LLVMValueRef fs_mask[16 / 4];
  1965.    LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
  1966.    LLVMValueRef function;
  1967.    LLVMValueRef facing;
  1968.    unsigned num_fs;
  1969.    unsigned i;
  1970.    unsigned chan;
  1971.    unsigned cbuf;
  1972.    boolean cbuf0_write_all;
  1973.    const boolean dual_source_blend = key->blend.rt[0].blend_enable &&
  1974.                                      util_blend_state_is_dual(&key->blend, 0);
  1975.  
  1976.    assert(lp_native_vector_width / 32 >= 4);
  1977.  
  1978.    /* Adjust color input interpolation according to flatshade state:
  1979.     */
  1980.    memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
  1981.    for (i = 0; i < shader->info.base.num_inputs; i++) {
  1982.       if (inputs[i].interp == LP_INTERP_COLOR) {
  1983.          if (key->flatshade)
  1984.             inputs[i].interp = LP_INTERP_CONSTANT;
  1985.          else
  1986.             inputs[i].interp = LP_INTERP_PERSPECTIVE;
  1987.       }
  1988.    }
  1989.  
  1990.    /* check if writes to cbuf[0] are to be copied to all cbufs */
  1991.    cbuf0_write_all = FALSE;
  1992.    for (i = 0; i < shader->info.base.num_properties; i++) {
  1993.       if (shader->info.base.properties[i].name ==
  1994.           TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS) {
  1995.          cbuf0_write_all = TRUE;
  1996.          break;
  1997.       }
  1998.    }
  1999.  
  2000.    /* TODO: actually pick these based on the fs and color buffer
  2001.     * characteristics. */
  2002.  
  2003.    memset(&fs_type, 0, sizeof fs_type);
  2004.    fs_type.floating = TRUE;      /* floating point values */
  2005.    fs_type.sign = TRUE;          /* values are signed */
  2006.    fs_type.norm = FALSE;         /* values are not limited to [0,1] or [-1,1] */
  2007.    fs_type.width = 32;           /* 32-bit float */
  2008.    fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
  2009.  
  2010.    memset(&blend_type, 0, sizeof blend_type);
  2011.    blend_type.floating = FALSE; /* values are integers */
  2012.    blend_type.sign = FALSE;     /* values are unsigned */
  2013.    blend_type.norm = TRUE;      /* values are in [0,1] or [-1,1] */
  2014.    blend_type.width = 8;        /* 8-bit ubyte values */
  2015.    blend_type.length = 16;      /* 16 elements per vector */
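           /*
            * E.g. (illustrative): with 128-bit SSE vectors fs_type is 4 x f32
            * and blend_type is 16 x unorm8, i.e. one 128-bit row holding four
            * RGBA8 pixels.
            */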
  2016.  
  2017.    /*
  2018.     * Generate the function prototype. Any change here must be reflected in
  2019.     * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
  2020.     */
  2021.  
  2022.    fs_elem_type = lp_build_elem_type(gallivm, fs_type);
  2023.  
  2024.    blend_vec_type = lp_build_vec_type(gallivm, blend_type);
  2025.  
  2026.    util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
  2027.                  shader->no, variant->no, partial_mask ? "partial" : "whole");
  2028.  
  2029.    arg_types[0] = variant->jit_context_ptr_type;       /* context */
  2030.    arg_types[1] = int32_type;                          /* x */
  2031.    arg_types[2] = int32_type;                          /* y */
  2032.    arg_types[3] = int32_type;                          /* facing */
  2033.    arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
  2034.    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
  2035.    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
  2036.    arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
  2037.    arg_types[8] = LLVMPointerType(int8_type, 0);       /* depth */
  2038.    arg_types[9] = int32_type;                          /* mask_input */
  2039.    arg_types[10] = variant->jit_thread_data_ptr_type;  /* per thread data */
  2040.    arg_types[11] = LLVMPointerType(int32_type, 0);     /* stride */
  2041.    arg_types[12] = int32_type;                         /* depth_stride */
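           /*
            * For reference, the matching C-side prototype is roughly as follows
            * (a sketch; lp_jit.h holds the authoritative lp_jit_frag_func
            * typedef):
            *
            *    void (*lp_jit_frag_func)(const struct lp_jit_context *context,
            *                             uint32_t x, uint32_t y, uint32_t facing,
            *                             const void *a0, const void *dadx,
            *                             const void *dady, uint8_t **color,
            *                             uint8_t *depth, uint32_t mask_input,
            *                             struct lp_jit_thread_data *thread_data,
            *                             uint32_t *stride, uint32_t depth_stride);
            */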
  2042.  
  2043.    func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
  2044.                                 arg_types, Elements(arg_types), 0);
  2045.  
  2046.    function = LLVMAddFunction(gallivm->module, func_name, func_type);
  2047.    LLVMSetFunctionCallConv(function, LLVMCCallConv);
  2048.  
  2049.    variant->function[partial_mask] = function;
  2050.  
  2051.    /* XXX: need to propagate noalias down into color param now we are
  2052.     * passing a pointer-to-pointer?
  2053.     */
  2054.    for(i = 0; i < Elements(arg_types); ++i)
  2055.       if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
  2056.          LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);
  2057.  
  2058.    context_ptr  = LLVMGetParam(function, 0);
  2059.    x            = LLVMGetParam(function, 1);
  2060.    y            = LLVMGetParam(function, 2);
  2061.    facing       = LLVMGetParam(function, 3);
  2062.    a0_ptr       = LLVMGetParam(function, 4);
  2063.    dadx_ptr     = LLVMGetParam(function, 5);
  2064.    dady_ptr     = LLVMGetParam(function, 6);
  2065.    color_ptr_ptr = LLVMGetParam(function, 7);
  2066.    depth_ptr    = LLVMGetParam(function, 8);
  2067.    mask_input   = LLVMGetParam(function, 9);
  2068.    thread_data_ptr  = LLVMGetParam(function, 10);
  2069.    stride_ptr   = LLVMGetParam(function, 11);
  2070.    depth_stride = LLVMGetParam(function, 12);
  2071.  
  2072.    lp_build_name(context_ptr, "context");
  2073.    lp_build_name(x, "x");
  2074.    lp_build_name(y, "y");
  2075.    lp_build_name(a0_ptr, "a0");
  2076.    lp_build_name(dadx_ptr, "dadx");
  2077.    lp_build_name(dady_ptr, "dady");
  2078.    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
  2079.    lp_build_name(depth_ptr, "depth");
  2080.    lp_build_name(thread_data_ptr, "thread_data");
  2081.    lp_build_name(mask_input, "mask_input");
  2082.    lp_build_name(stride_ptr, "stride_ptr");
  2083.    lp_build_name(depth_stride, "depth_stride");
  2084.  
  2085.    /*
  2086.     * Function body
  2087.     */
  2088.  
  2089.    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
  2090.    builder = gallivm->builder;
  2091.    assert(builder);
  2092.    LLVMPositionBuilderAtEnd(builder, block);
  2093.  
  2094.    /* code generated texture sampling */
  2095.    sampler = lp_llvm_sampler_soa_create(key->state, context_ptr);
  2096.  
  2097.    num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
  2098.    /* for 1d resources only run "upper half" of stamp */
  2099.    if (key->resource_1d)
  2100.       num_fs /= 2;
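           /*
            * E.g. (illustrative): fs_type.length == 4 (SSE) gives num_fs = 4
            * iterations per 4x4 stamp, length == 8 (AVX) gives 2, and 1d
            * resources halve that again.
            */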
  2101.  
  2102.    {
  2103.       LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
  2104.       LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
  2105.       LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type,
  2106.                                                       num_loop, "mask_store");
  2107.       LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
  2108.  
  2109.       /*
  2110.        * The shader input interpolation info is not explicitly baked into the
  2111.        * shader key, but everything it derives from (TGSI, and flatshade) is
  2112.        * already included in the shader key.
  2113.        */
  2114.       lp_build_interp_soa_init(&interp,
  2115.                                gallivm,
  2116.                                shader->info.base.num_inputs,
  2117.                                inputs,
  2118.                                shader->info.base.pixel_center_integer,
  2119.                                builder, fs_type,
  2120.                                a0_ptr, dadx_ptr, dady_ptr,
  2121.                                x, y);
  2122.  
  2123.       for (i = 0; i < num_fs; i++) {
  2124.          LLVMValueRef mask;
  2125.          LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
  2126.          LLVMValueRef mask_ptr = LLVMBuildGEP(builder, mask_store,
  2127.                                               &indexi, 1, "mask_ptr");
  2128.  
  2129.          if (partial_mask) {
  2130.             mask = generate_quad_mask(gallivm, fs_type,
  2131.                                       i*fs_type.length/4, mask_input);
  2132.          }
  2133.          else {
  2134.             mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
  2135.          }
  2136.          LLVMBuildStore(builder, mask, mask_ptr);
  2137.       }
  2138.  
  2139.       generate_fs_loop(gallivm,
  2140.                        shader, key,
  2141.                        builder,
  2142.                        fs_type,
  2143.                        context_ptr,
  2144.                        num_loop,
  2145.                        &interp,
  2146.                        sampler,
  2147.                        mask_store, /* output */
  2148.                        color_store,
  2149.                        depth_ptr,
  2150.                        depth_stride,
  2151.                        facing,
  2152.                        thread_data_ptr);
  2153.  
  2154.       for (i = 0; i < num_fs; i++) {
  2155.          LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
  2156.          LLVMValueRef ptr = LLVMBuildGEP(builder, mask_store,
  2157.                                          &indexi, 1, "");
  2158.          fs_mask[i] = LLVMBuildLoad(builder, ptr, "mask");
  2159.          /* XXX: this is messy; we need to reorganize things here */
  2160.          for (cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
  2161.             for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
  2162.                ptr = LLVMBuildGEP(builder,
  2163.                                   color_store[cbuf * !cbuf0_write_all][chan],
  2164.                                   &indexi, 1, "");
  2165.                fs_out_color[cbuf][chan][i] = ptr;
  2166.             }
  2167.          }
  2168.          if (dual_source_blend) {
  2169.             /* we only support one dual-source blend target, hence always use output 1 */
  2170.             for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
  2171.                ptr = LLVMBuildGEP(builder,
  2172.                                   color_store[1][chan],
  2173.                                   &indexi, 1, "");
  2174.                fs_out_color[1][chan][i] = ptr;
  2175.             }
  2176.          }
  2177.       }
  2178.    }
  2179.  
  2180.    sampler->destroy(sampler);
  2181.  
  2182.    /* Loop over color outputs / color buffers to do blending.
  2183.     */
  2184.    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
  2185.       LLVMValueRef color_ptr;
  2186.       LLVMValueRef stride;
  2187.       LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
  2188.  
  2189.       boolean do_branch = ((key->depth.enabled
  2190.                             || key->stencil[0].enabled
  2191.                             || key->alpha.enabled)
  2192.                            && !shader->info.base.uses_kill);
  2193.  
  2194.       color_ptr = LLVMBuildLoad(builder,
  2195.                                 LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""),
  2196.                                 "");
  2197.  
  2198.       lp_build_name(color_ptr, "color_ptr%d", cbuf);
  2199.  
  2200.       stride = LLVMBuildLoad(builder,
  2201.                              LLVMBuildGEP(builder, stride_ptr, &index, 1, ""),
  2202.                              "");
  2203.  
  2204.       generate_unswizzled_blend(gallivm, cbuf, variant, key->cbuf_format[cbuf],
  2205.                                 num_fs, fs_type, fs_mask, fs_out_color,
  2206.                                 context_ptr, color_ptr, stride, partial_mask, do_branch);
  2207.    }
  2208.  
  2209.    LLVMBuildRetVoid(builder);
  2210.  
  2211.    gallivm_verify_function(gallivm, function);
  2212.  
  2213.    variant->nr_instrs += lp_build_count_instructions(function);
  2214. }
  2215.  
  2216.  
  2217. static void
  2218. dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
  2219. {
  2220.    unsigned i;
  2221.  
  2222.    debug_printf("fs variant %p:\n", (void *) key);
  2223.  
  2224.    if (key->flatshade) {
  2225.       debug_printf("flatshade = 1\n");
  2226.    }
  2227.    for (i = 0; i < key->nr_cbufs; ++i) {
  2228.       debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
  2229.    }
  2230.    if (key->depth.enabled) {
  2231.       debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
  2232.       debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
  2233.       debug_printf("depth.writemask = %u\n", key->depth.writemask);
  2234.    }
  2235.  
  2236.    for (i = 0; i < 2; ++i) {
  2237.       if (key->stencil[i].enabled) {
  2238.          debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE));
  2239.          debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE));
  2240.          debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE));
  2241.          debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE));
  2242.          debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
  2243.          debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
  2244.       }
  2245.    }
  2246.  
  2247.    if (key->alpha.enabled) {
  2248.       debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
  2249.    }
  2250.  
  2251.    if (key->occlusion_count) {
  2252.       debug_printf("occlusion_count = 1\n");
  2253.    }
  2254.  
  2255.    if (key->blend.logicop_enable) {
  2256.       debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
  2257.    }
  2258.    else if (key->blend.rt[0].blend_enable) {
  2259.       debug_printf("blend.rgb_func = %s\n",   util_dump_blend_func  (key->blend.rt[0].rgb_func, TRUE));
  2260.       debug_printf("blend.rgb_src_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
  2261.       debug_printf("blend.rgb_dst_factor = %s\n",   util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
  2262.       debug_printf("blend.alpha_func = %s\n",       util_dump_blend_func  (key->blend.rt[0].alpha_func, TRUE));
  2263.       debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
  2264.       debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
  2265.    }
  2266.    debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
  2267.    for (i = 0; i < key->nr_samplers; ++i) {
  2268.       const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state;
  2269.       debug_printf("sampler[%u] = \n", i);
  2270.       debug_printf("  .wrap = %s %s %s\n",
  2271.                    util_dump_tex_wrap(sampler->wrap_s, TRUE),
  2272.                    util_dump_tex_wrap(sampler->wrap_t, TRUE),
  2273.                    util_dump_tex_wrap(sampler->wrap_r, TRUE));
  2274.       debug_printf("  .min_img_filter = %s\n",
  2275.                    util_dump_tex_filter(sampler->min_img_filter, TRUE));
  2276.       debug_printf("  .min_mip_filter = %s\n",
  2277.                    util_dump_tex_mipfilter(sampler->min_mip_filter, TRUE));
  2278.       debug_printf("  .mag_img_filter = %s\n",
  2279.                    util_dump_tex_filter(sampler->mag_img_filter, TRUE));
  2280.       if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
  2281.          debug_printf("  .compare_func = %s\n", util_dump_func(sampler->compare_func, TRUE));
  2282.       debug_printf("  .normalized_coords = %u\n", sampler->normalized_coords);
  2283.       debug_printf("  .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
  2284.       debug_printf("  .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
  2285.       debug_printf("  .apply_min_lod = %u\n", sampler->apply_min_lod);
  2286.       debug_printf("  .apply_max_lod = %u\n", sampler->apply_max_lod);
  2287.    }
  2288.    for (i = 0; i < key->nr_sampler_views; ++i) {
  2289.       const struct lp_static_texture_state *texture = &key->state[i].texture_state;
  2290.       debug_printf("texture[%u] = \n", i);
  2291.       debug_printf("  .format = %s\n",
  2292.                    util_format_name(texture->format));
  2293.       debug_printf("  .target = %s\n",
  2294.                    util_dump_tex_target(texture->target, TRUE));
  2295.       debug_printf("  .level_zero_only = %u\n",
  2296.                    texture->level_zero_only);
  2297.       debug_printf("  .pot = %u %u %u\n",
  2298.                    texture->pot_width,
  2299.                    texture->pot_height,
  2300.                    texture->pot_depth);
  2301.    }
  2302. }
  2303.  
  2304.  
  2305. void
  2306. lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant)
  2307. {
  2308.    debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
  2309.                 variant->shader->no, variant->no);
  2310.    tgsi_dump(variant->shader->base.tokens, 0);
  2311.    dump_fs_variant_key(&variant->key);
  2312.    debug_printf("variant->opaque = %u\n", variant->opaque);
  2313.    debug_printf("\n");
  2314. }
  2315.  
  2316.  
  2317. /**
  2318.  * Generate a new fragment shader variant from the shader code and
  2319.  * other state indicated by the key.
  2320.  */
  2321. static struct lp_fragment_shader_variant *
  2322. generate_variant(struct llvmpipe_context *lp,
  2323.                  struct lp_fragment_shader *shader,
  2324.                  const struct lp_fragment_shader_variant_key *key)
  2325. {
  2326.    struct lp_fragment_shader_variant *variant;
  2327.    const struct util_format_description *cbuf0_format_desc;
  2328.    boolean fullcolormask;
  2329.  
  2330.    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
  2331.    if(!variant)
  2332.       return NULL;
  2333.  
  2334.    variant->gallivm = gallivm_create();
  2335.    if (!variant->gallivm) {
  2336.       FREE(variant);
  2337.       return NULL;
  2338.    }
  2339.  
  2340.    variant->shader = shader;
  2341.    variant->list_item_global.base = variant;
  2342.    variant->list_item_local.base = variant;
  2343.    variant->no = shader->variants_created++;
  2344.  
  2345.    memcpy(&variant->key, key, shader->variant_key_size);
  2346.  
  2347.    /*
  2348.     * Determine whether we are touching all channels in the color buffer.
  2349.     */
  2350.    fullcolormask = FALSE;
  2351.    if (key->nr_cbufs == 1) {
  2352.       cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
  2353.       fullcolormask = util_format_colormask_full(cbuf0_format_desc, key->blend.rt[0].colormask);
  2354.    }
  2355.  
  2356.    variant->opaque =
  2357.          !key->blend.logicop_enable &&
  2358.          !key->blend.rt[0].blend_enable &&
  2359.          fullcolormask &&
  2360.          !key->stencil[0].enabled &&
  2361.          !key->alpha.enabled &&
  2362.          !key->depth.enabled &&
  2363.          !shader->info.base.uses_kill
  2364.          ? TRUE : FALSE;
  2365.  
  2366.    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
  2367.       lp_debug_fs_variant(variant);
  2368.    }
  2369.  
  2370.    lp_jit_init_types(variant);
  2371.    
  2372.    if (variant->jit_function[RAST_EDGE_TEST] == NULL)
  2373.       generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
  2374.  
  2375.    if (variant->jit_function[RAST_WHOLE] == NULL) {
  2376.       if (variant->opaque) {
  2377.          /* Specialized shader, which doesn't need to read the color buffer. */
  2378.          generate_fragment(lp, shader, variant, RAST_WHOLE);
  2379.       }
  2380.    }
  2381.  
  2382.    /*
  2383.     * Compile everything
  2384.     */
  2385.  
  2386.    gallivm_compile_module(variant->gallivm);
  2387.  
  2388.    if (variant->function[RAST_EDGE_TEST]) {
  2389.       variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
  2390.             gallivm_jit_function(variant->gallivm,
  2391.                                  variant->function[RAST_EDGE_TEST]);
  2392.    }
  2393.  
  2394.    if (variant->function[RAST_WHOLE]) {
  2395.          variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
  2396.                gallivm_jit_function(variant->gallivm,
  2397.                                     variant->function[RAST_WHOLE]);
  2398.    } else if (!variant->jit_function[RAST_WHOLE]) {
  2399.       variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
  2400.    }
  2401.  
  2402.    return variant;
  2403. }
  2404.  
  2405.  
  2406. static void *
  2407. llvmpipe_create_fs_state(struct pipe_context *pipe,
  2408.                          const struct pipe_shader_state *templ)
  2409. {
  2410.    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
  2411.    struct lp_fragment_shader *shader;
  2412.    int nr_samplers;
  2413.    int nr_sampler_views;
  2414.    int i;
  2415.  
  2416.    shader = CALLOC_STRUCT(lp_fragment_shader);
  2417.    if (!shader)
  2418.       return NULL;
  2419.  
  2420.    shader->no = fs_no++;
  2421.    make_empty_list(&shader->variants);
  2422.  
  2423.    /* get/save the summary info for this shader */
  2424.    lp_build_tgsi_info(templ->tokens, &shader->info);
  2425.  
  2426.    /* we need to keep a local copy of the tokens */
  2427.    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
  2428.  
  2429.    shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
  2430.    if (shader->draw_data == NULL) {
  2431.       FREE((void *) shader->base.tokens);
  2432.       FREE(shader);
  2433.       return NULL;
  2434.    }
  2435.  
  2436.    nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
  2437.    nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
  2438.  
  2439.    shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
  2440.                                      state[MAX2(nr_samplers, nr_sampler_views)]);
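           /*
            * The variant key is variable-size: it ends with one state entry per
            * sampler / sampler view actually used, so only this prefix needs to
            * be stored and compared (cf. the variant_key_size memcpy in
            * generate_variant above).
            */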
  2441.  
  2442.    for (i = 0; i < shader->info.base.num_inputs; i++) {
  2443.       shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i];
  2444.       shader->inputs[i].cyl_wrap = shader->info.base.input_cylindrical_wrap[i];
  2445.  
  2446.       switch (shader->info.base.input_interpolate[i]) {
  2447.       case TGSI_INTERPOLATE_CONSTANT:
  2448.          shader->inputs[i].interp = LP_INTERP_CONSTANT;
  2449.          break;
  2450.       case TGSI_INTERPOLATE_LINEAR:
  2451.          shader->inputs[i].interp = LP_INTERP_LINEAR;
  2452.          break;
  2453.       case TGSI_INTERPOLATE_PERSPECTIVE:
  2454.          shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
  2455.          break;
  2456.       case TGSI_INTERPOLATE_COLOR:
  2457.          shader->inputs[i].interp = LP_INTERP_COLOR;
  2458.          break;
  2459.       default:
  2460.          assert(0);
  2461.          break;
  2462.       }
  2463.  
  2464.       switch (shader->info.base.input_semantic_name[i]) {
  2465.       case TGSI_SEMANTIC_FACE:
  2466.          shader->inputs[i].interp = LP_INTERP_FACING;
  2467.          break;
  2468.       case TGSI_SEMANTIC_POSITION:
  2469.          /* Position was already emitted above
  2470.           */
  2471.          shader->inputs[i].interp = LP_INTERP_POSITION;
  2472.          shader->inputs[i].src_index = 0;
  2473.          continue;
  2474.       }
  2475.  
  2476.       shader->inputs[i].src_index = i+1;
  2477.    }
  2478.  
  2479.    if (LP_DEBUG & DEBUG_TGSI) {
  2480.       unsigned attrib;
  2481.       debug_printf("llvmpipe: Create fragment shader #%u %p:\n",
  2482.                    shader->no, (void *) shader);
  2483.       tgsi_dump(templ->tokens, 0);
  2484.       debug_printf("usage masks:\n");
  2485.       for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
  2486.          unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
  2487.          debug_printf("  IN[%u].%s%s%s%s\n",
  2488.                       attrib,
  2489.                       usage_mask & TGSI_WRITEMASK_X ? "x" : "",
  2490.                       usage_mask & TGSI_WRITEMASK_Y ? "y" : "",
  2491.                       usage_mask & TGSI_WRITEMASK_Z ? "z" : "",
  2492.                       usage_mask & TGSI_WRITEMASK_W ? "w" : "");
  2493.       }
  2494.       debug_printf("\n");
  2495.    }
  2496.  
  2497.    return shader;
  2498. }
  2499.  
  2500.  
  2501. static void
  2502. llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
  2503. {
  2504.    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
  2505.  
  2506.    if (llvmpipe->fs == fs)
  2507.       return;
  2508.  
  2509.    llvmpipe->fs = (struct lp_fragment_shader *) fs;
  2510.  
  2511.    draw_bind_fragment_shader(llvmpipe->draw,
  2512.                              (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL));
  2513.  
  2514.    llvmpipe->dirty |= LP_NEW_FS;
  2515. }
  2516.  
  2517.  
  2518. /**
  2519.  * Remove shader variant from two lists: the shader's variant list
  2520.  * and the context's variant list.
  2521.  */
  2522. void
  2523. llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
  2524.                                struct lp_fragment_shader_variant *variant)
  2525. {
  2526.    unsigned i;
  2527.  
  2528.    if (gallivm_debug & GALLIVM_DEBUG_IR) {
  2529.       debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached"
  2530.                    " #%u v total cached #%u\n",
  2531.                    variant->shader->no,
  2532.                    variant->no,
  2533.                    variant->shader->variants_created,
  2534.                    variant->shader->variants_cached,
  2535.                    lp->nr_fs_variants);
  2536.    }
  2537.  
  2538.    /* free all the variant's JIT'd functions */
  2539.    for (i = 0; i < Elements(variant->function); i++) {
  2540.       if (variant->function[i]) {
  2541.          gallivm_free_function(variant->gallivm,
  2542.                                variant->function[i],
  2543.                                variant->jit_function[i]);
  2544.       }
  2545.    }
  2546.  
  2547.    gallivm_destroy(variant->gallivm);
  2548.  
  2549.    /* remove from shader's list */
  2550.    remove_from_list(&variant->list_item_local);
  2551.    variant->shader->variants_cached--;
  2552.  
  2553.    /* remove from context's list */
  2554.    remove_from_list(&variant->list_item_global);
  2555.    lp->nr_fs_variants--;
  2556.    lp->nr_fs_instrs -= variant->nr_instrs;
  2557.  
  2558.    FREE(variant);
  2559. }
  2560.  
  2561.  
  2562. static void
  2563. llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
  2564. {
  2565.    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
  2566.    struct lp_fragment_shader *shader = fs;
  2567.    struct lp_fs_variant_list_item *li;
  2568.  
  2569.    assert(fs != llvmpipe->fs);
  2570.  
  2571.    /*
  2572.     * XXX: we need to flush the context until we have some sort of reference
  2573.     * counting in fragment shaders, as they may still be binned.
  2574.     * Flushing alone might not be sufficient; we need to wait on it too.
  2575.     */
  2576.    llvmpipe_finish(pipe, __FUNCTION__);
  2577.  
  2578.    /* Delete all the variants */
  2579.    li = first_elem(&shader->variants);
  2580.    while(!at_end(&shader->variants, li)) {
  2581.       struct lp_fs_variant_list_item *next = next_elem(li);
  2582.       llvmpipe_remove_shader_variant(llvmpipe, li->base);
  2583.       li = next;
  2584.    }
  2585.  
  2586.    /* Delete draw module's data */
  2587.    draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
  2588.  
  2589.    assert(shader->variants_cached == 0);
  2590.    FREE((void *) shader->base.tokens);
  2591.    FREE(shader);
  2592. }
  2593.  
  2594.  
  2595.  
  2596. static void
  2597. llvmpipe_set_constant_buffer(struct pipe_context *pipe,
  2598.                              uint shader, uint index,
  2599.                              struct pipe_constant_buffer *cb)
  2600. {
  2601.    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
  2602.    struct pipe_resource *constants = cb ? cb->buffer : NULL;
  2603.  
  2604.    assert(shader < PIPE_SHADER_TYPES);
  2605.    assert(index < Elements(llvmpipe->constants[shader]));
  2606.  
  2607.    /* note: reference counting */
  2608.    util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb);
  2609.  
  2610.    if (shader == PIPE_SHADER_VERTEX ||
  2611.        shader == PIPE_SHADER_GEOMETRY) {
  2612.       /* Pass the constants to the 'draw' module */
  2613.       const unsigned size = cb ? cb->buffer_size : 0;
  2614.       const ubyte *data;
  2615.  
  2616.       if (constants) {
  2617.          data = (ubyte *) llvmpipe_resource_data(constants);
  2618.       }
  2619.       else if (cb && cb->user_buffer) {
  2620.          data = (ubyte *) cb->user_buffer;
  2621.       }
  2622.       else {
  2623.          data = NULL;
  2624.       }
  2625.  
  2626.       if (data)
  2627.          data += cb->buffer_offset;
  2628.  
  2629.       draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
  2630.                                       index, data, size);
  2631.    }
  2632.  
  2633.    llvmpipe->dirty |= LP_NEW_CONSTANTS;
  2634.  
  2635.    if (cb && cb->user_buffer) {
  2636.       pipe_resource_reference(&constants, NULL);
  2637.    }
  2638. }
  2639.  
  2640.  
  2641. /**
  2642.  * Return the blend factor equivalent to a destination alpha of one.
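 *
 * (Illustrative note: this applies e.g. when the colour buffer format stores
 * no alpha, so destination alpha effectively reads as 1.0 and the factors
 * can be simplified up front.)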
  2643.  */
  2644. static INLINE unsigned
  2645. force_dst_alpha_one(unsigned factor, boolean clamped_zero)
  2646. {
  2647.    switch(factor) {
  2648.    case PIPE_BLENDFACTOR_DST_ALPHA:
  2649.       return PIPE_BLENDFACTOR_ONE;
  2650.    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
  2651.       return PIPE_BLENDFACTOR_ZERO;
  2652.    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
  2653.       if (clamped_zero)
  2654.          return PIPE_BLENDFACTOR_ZERO;
  2655.       else
  2656.          return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
  2657.    }
  2658.  
  2659.    return factor;
  2660. }
  2661.  
  2662.  
  2663. /**
  2664.  * We need to generate several variants of the fragment pipeline to match
  2665.  * all the combinations of the contributing state atoms.
  2666.  *
  2667.  * TODO: there is actually no reason to tie this to context state -- the
  2668.  * generated code could be cached globally in the screen.
  2669.  */
  2670. static void
  2671. make_variant_key(struct llvmpipe_context *lp,
  2672.                  struct lp_fragment_shader *shader,
  2673.                  struct lp_fragment_shader_variant_key *key)
  2674. {
  2675.    unsigned i;
  2676.  
  2677.    memset(key, 0, shader->variant_key_size);
  2678.  
  2679.    if (lp->framebuffer.zsbuf) {
  2680.       enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
  2681.       const struct util_format_description *zsbuf_desc =
  2682.          util_format_description(zsbuf_format);
  2683.  
  2684.       if (lp->depth_stencil->depth.enabled &&
  2685.           util_format_has_depth(zsbuf_desc)) {
  2686.          key->zsbuf_format = zsbuf_format;
  2687.          memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
  2688.       }
  2689.       if (lp->depth_stencil->stencil[0].enabled &&
  2690.           util_format_has_stencil(zsbuf_desc)) {
  2691.          key->zsbuf_format = zsbuf_format;
  2692.          memcpy(&key->stencil, &lp->depth_stencil->stencil, sizeof key->stencil);
  2693.       }
  2694.       if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
  2695.          key->resource_1d = TRUE;
  2696.       }
  2697.    }
  2698.  
  2699.    /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */
  2700.    if (!lp->framebuffer.nr_cbufs ||
  2701.        !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
  2702.       key->alpha.enabled = lp->depth_stencil->alpha.enabled;
  2703.    }
  2704.    if(key->alpha.enabled)
  2705.       key->alpha.func = lp->depth_stencil->alpha.func;
  2706.    /* alpha.ref_value is passed in jit_context */
  2707.  
  2708.    key->flatshade = lp->rasterizer->flatshade;
  2709.    if (lp->active_occlusion_queries) {
  2710.       key->occlusion_count = TRUE;
  2711.    }
  2712.  
  2713.    if (lp->framebuffer.nr_cbufs) {
  2714.       memcpy(&key->blend, lp->blend, sizeof key->blend);
  2715.    }
  2716.  
  2717.    key->nr_cbufs = lp->framebuffer.nr_cbufs;
  2718.  
  2719.    if (!key->blend.independent_blend_enable) {
  2720.       /* We always need independent blend; otherwise the fixups below won't work. */
  2721.       for (i = 1; i < key->nr_cbufs; i++) {
  2722.          memcpy(&key->blend.rt[i], &key->blend.rt[0], sizeof(key->blend.rt[0]));
  2723.       }
  2724.       key->blend.independent_blend_enable = 1;
  2725.    }
  2726.  
  2727.    for (i = 0; i < lp->framebuffer.nr_cbufs; i++) {
  2728.       enum pipe_format format = lp->framebuffer.cbufs[i]->format;
  2729.       struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
  2730.       const struct util_format_description *format_desc;
  2731.  
  2732.       key->cbuf_format[i] = format;
  2733.  
  2734.       /*
  2735.        * Figure out if this is a 1d resource. Note that OpenGL allows crazy
  2736.        * mixing of 2d textures with height 1 and 1d textures, so make sure
  2737.        * we pick 1d if any cbuf or zsbuf is 1d.
  2738.        */
  2739.       if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
  2740.          key->resource_1d = TRUE;
  2741.       }
  2742.  
  2743.       format_desc = util_format_description(format);
  2744.       assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
  2745.              format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
  2746.  
  2747.       /*
  2748.        * Mask out color channels not present in the color buffer.
  2749.        */
  2750.       blend_rt->colormask &= util_format_colormask(format_desc);
  2751.  
  2752.       /*
  2753.        * Disable blend for integer formats.
  2754.        */
  2755.       if (util_format_is_pure_integer(format)) {
  2756.          blend_rt->blend_enable = 0;
  2757.       }
  2758.  
  2759.       /*
  2760.        * Our swizzled render tiles always have an alpha channel, but the linear
  2761.        * render target format often does not, so here we force the dst alpha
  2762.        * to be one.
  2763.        *
  2764.        * This is not a mere optimization. Wrong results will be produced if the
  2765.        * dst alpha is used, the dst format does not have alpha, and the previous
  2766.        * rendering was not flushed from the swizzled to linear buffer. For
  2767.        * example, NonPowTwo DCT.
  2768.        *
  2769.        * TODO: This should be generalized to all channels for better
  2770.        * performance, but only alpha causes correctness issues.
  2771.        *
  2772.        * Also, force the rgb and alpha funcs/factors to match, to make AoS blending easier.
  2773.        */
  2774.       if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W ||
  2775.           format_desc->swizzle[3] == format_desc->swizzle[0]) {
  2776.          /* Doesn't cover mixed snorm/unorm but can't render to them anyway */
  2777.          boolean clamped_zero = !util_format_is_float(format) &&
  2778.                                 !util_format_is_snorm(format);
  2779.          blend_rt->rgb_src_factor   = force_dst_alpha_one(blend_rt->rgb_src_factor,
  2780.                                                           clamped_zero);
  2781.          blend_rt->rgb_dst_factor   = force_dst_alpha_one(blend_rt->rgb_dst_factor,
  2782.                                                           clamped_zero);
  2783.          blend_rt->alpha_func       = blend_rt->rgb_func;
  2784.          blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
  2785.          blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
  2786.       }
  2787.    }
  2788.  
  2789.    /* This value will be the same for all the variants of a given shader:
  2790.     */
  2791.    key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
  2792.  
  2793.    for(i = 0; i < key->nr_samplers; ++i) {
  2794.       if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
  2795.          lp_sampler_static_sampler_state(&key->state[i].sampler_state,
  2796.                                          lp->samplers[PIPE_SHADER_FRAGMENT][i]);
  2797.       }
  2798.    }
  2799.  
  2800.    /*
  2801.     * XXX: If TGSI_FILE_SAMPLER_VIEW exists, assume all texture opcodes
  2802.     * are dx10-style? Can't really have mixed opcodes, at least not
  2803.     * if we want to skip the holes here (without rescanning tgsi).
  2804.     */
  2805.    if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
  2806.       key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
  2807.       for(i = 0; i < key->nr_sampler_views; ++i) {
  2808.          if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
  2809.             lp_sampler_static_texture_state(&key->state[i].texture_state,
  2810.                                             lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
  2811.          }
  2812.       }
  2813.    }
  2814.    else {
  2815.       key->nr_sampler_views = key->nr_samplers;
  2816.       for(i = 0; i < key->nr_sampler_views; ++i) {
  2817.          if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
  2818.             lp_sampler_static_texture_state(&key->state[i].texture_state,
  2819.                                             lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
  2820.          }
  2821.       }
  2822.    }
  2823. }
  2824.  
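/*
 * A side note on why make_variant_key() starts with memset(): variant
 * lookup in llvmpipe_update_fs() compares keys with memcmp(), so any
 * padding bytes the compiler inserts into the key struct must be
 * deterministic. A hypothetical illustration:
 */
#if 0
struct padded_key {
   char a;     /* typically followed by 3 padding bytes */
   int b;
};

static boolean
padded_keys_equal(const struct padded_key *x, const struct padded_key *y)
{
   /* only safe if both structs were zeroed before their fields were set */
   return memcmp(x, y, sizeof *x) == 0;
}
#endif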
  2825.  
  2826.  
  2827. /**
  2828.  * Update fragment shader state.  This is called just prior to drawing
  2829.  * something when some fragment-related state has changed.
  2830.  */
  2831. void
  2832. llvmpipe_update_fs(struct llvmpipe_context *lp)
  2833. {
  2834.    struct lp_fragment_shader *shader = lp->fs;
  2835.    struct lp_fragment_shader_variant_key key;
  2836.    struct lp_fragment_shader_variant *variant = NULL;
  2837.    struct lp_fs_variant_list_item *li;
  2838.  
  2839.    make_variant_key(lp, shader, &key);
  2840.  
  2841.    /* Search the variants for one which matches the key */
  2842.    li = first_elem(&shader->variants);
  2843.    while(!at_end(&shader->variants, li)) {
  2844.       if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) {
  2845.          variant = li->base;
  2846.          break;
  2847.       }
  2848.       li = next_elem(li);
  2849.    }
  2850.  
  2851.    if (variant) {
  2852.       /* Move this variant to the head of the list to implement LRU
  2853.        * deletion of shader variants when we have too many.
  2854.        */
  2855.       move_to_head(&lp->fs_variants_list, &variant->list_item_global);
  2856.    }
  2857.    else {
  2858.       /* variant not found, create it now */
  2859.       int64_t t0, t1, dt;
  2860.       unsigned i;
  2861.       unsigned variants_to_cull;
  2862.  
  2863.       if (0) {
  2864.          debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
  2865.                       lp->nr_fs_variants,
  2866.                       lp->nr_fs_instrs,
  2867.                       lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
  2868.       }
  2869.  
  2870.       /* First, check if we've exceeded the max number of shader variants.
  2871.        * If so, free 25% of them (the least recently used ones).
  2872.        */
  2873.       variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 4 : 0;
  2874.  
  2875.       if (variants_to_cull ||
  2876.           lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
  2877.          struct pipe_context *pipe = &lp->pipe;
  2878.  
  2879.          /*
  2880.           * XXX: we need to flush the context until we have some sort of
  2881.           * reference counting in fragment shaders, as they may still be binned.
  2882.           * Flushing alone might not be sufficient; we need to wait on it too.
  2883.           */
  2884.          llvmpipe_finish(pipe, __FUNCTION__);
  2885.  
  2886.          /*
  2887.           * We need to re-check lp->nr_fs_variants because an arbitrarily large
  2888.           * number of shader variants (potentially all of them) could be
  2889.           * pending for destruction on flush.
  2890.           */
  2891.  
  2892.          for (i = 0; i < variants_to_cull || lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS; i++) {
  2893.             struct lp_fs_variant_list_item *item;
  2894.             if (is_empty_list(&lp->fs_variants_list)) {
  2895.                break;
  2896.             }
  2897.             item = last_elem(&lp->fs_variants_list);
  2898.             assert(item);
  2899.             assert(item->base);
  2900.             llvmpipe_remove_shader_variant(lp, item->base);
  2901.          }
  2902.       }
  2903.  
  2904.       /*
  2905.        * Generate the new variant.
  2906.        */
  2907.       t0 = os_time_get();
  2908.       variant = generate_variant(lp, shader, &key);
  2909.       t1 = os_time_get();
  2910.       dt = t1 - t0;
  2911.       LP_COUNT_ADD(llvm_compile_time, dt);
  2912.       LP_COUNT_ADD(nr_llvm_compiles, 2);  /* two compiles: one with, one without the in/out test */
  2913.  
  2914.       llvmpipe_variant_count++;
  2915.  
  2916.       /* Put the new variant into the list */
  2917.       if (variant) {
  2918.          insert_at_head(&shader->variants, &variant->list_item_local);
  2919.          insert_at_head(&lp->fs_variants_list, &variant->list_item_global);
  2920.          lp->nr_fs_variants++;
  2921.          lp->nr_fs_instrs += variant->nr_instrs;
  2922.          shader->variants_cached++;
  2923.       }
  2924.    }
  2925.  
  2926.    /* Bind this variant */
  2927.    lp_setup_set_fs_variant(lp->setup, variant);
  2928. }
  2929.  
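/*
 * The lookup-or-compile pattern above, reduced to a sketch. The helper
 * names here (find_by_memcmp, evict_tail, compile_variant) and the cache
 * struct are hypothetical stand-ins, not llvmpipe functions.
 */
#if 0
static struct variant *
get_variant(struct cache *cache, const struct key *key)
{
   struct variant *v = find_by_memcmp(cache, key);   /* linear search */
   if (v) {
      move_to_head(&cache->lru, &v->item);   /* LRU: hits stay near the head */
      return v;
   }
   if (cache->count >= cache->max)
      evict_tail(cache, cache->max / 4);     /* trim 25%, LRU entries first */
   v = compile_variant(key);                 /* the expensive JIT step */
   insert_at_head(&cache->lru, &v->item);
   cache->count++;
   return v;
}
#endif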
  2930.  
  2931.  
  2932.  
  2933.  
  2934. void
  2935. llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
  2936. {
  2937.    llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
  2938.    llvmpipe->pipe.bind_fs_state   = llvmpipe_bind_fs_state;
  2939.    llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
  2940.  
  2941.    llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
  2942. }
  2943.
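/*
 * A minimal (hypothetical) lifecycle through the hooks installed above,
 * assuming "tokens" holds a valid TGSI fragment shader; the state tracker
 * only ever sees the vtable entries, never the llvmpipe_* functions.
 */
#if 0
static void
fs_lifecycle(struct pipe_context *pipe, const struct tgsi_token *tokens)
{
   struct pipe_shader_state state;
   void *fs;

   memset(&state, 0, sizeof state);
   state.tokens = tokens;

   fs = pipe->create_fs_state(pipe, &state);   /* llvmpipe_create_fs_state */
   pipe->bind_fs_state(pipe, fs);              /* llvmpipe_bind_fs_state */
   /* ... draw calls; llvmpipe_update_fs() selects or builds a variant ... */
   pipe->bind_fs_state(pipe, NULL);
   pipe->delete_fs_state(pipe, fs);            /* llvmpipe_delete_fs_state */
}
#endif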