/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"




/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}
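/*
 * Note: these two helpers are only invoked for blocks already known to be
 * fully covered, so no per-pixel coverage mask is computed for them.
 */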

#if !defined(PIPE_ARCH_SSE)

static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   int mask = 0;

   int c0 = c;
   int c1 = c0 + dcdy;
   int c2 = c1 + dcdy;
   int c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}
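/*
 * How the bit trick above works: for a negative edge value the arithmetic
 * right shift (c >> 31) yields all ones (on the usual two's-complement
 * targets this code assumes), so ANDing with (1 << i) sets exactly bit i
 * of the mask.  Roughly equivalent scalar form:
 *
 *    for (i = 0; i < 16; i++)
 *       mask |= (value[i] < 0) << i;
 *
 * A set bit therefore marks a pixel that fails this edge test.
 */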


static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}
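/*
 * build_masks() evaluates the edge at two corner offsets at once: the
 * caller (in lp_rast_tri_tmp.h) is expected to pass c biased by the
 * trivial-reject corner offset and cdiff as the distance to the
 * trivial-accept corner, so *outmask accumulates 4x4 blocks that are
 * fully outside the edge while *partmask accumulates blocks that are not
 * fully inside (i.e. partially covered).
 */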

void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}

void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}
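/*
 * Without SSE, the specialized *_16 and *_4 entry points simply fall back
 * to the generic plane-walking rasterizers instantiated below from
 * lp_rast_tri_tmp.h, after restoring plane_mask to "all planes"; in the
 * binned *_16 commands that field is apparently reused to carry the block
 * position (see the SSE variants below), so it must be rewritten before
 * the generic path consumes it as an actual plane mask.
 */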

#else
#include <emmintrin.h>
#include "util/u_sse.h"


static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
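/*
 * The pack/movemask idiom used above and throughout the SSE path:
 * _mm_packs_epi32 and _mm_packs_epi16 narrow with signed saturation, so
 * the sign of each 32-bit edge value survives down to 8 bits, and a
 * single _mm_movemask_epi8 then gathers all 16 sign bits in the same bit
 * layout that the scalar build_mask_linear() produces.
 */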


static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}

static INLINE unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}
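/*
 * sign_bits4() factors out the offset/pack/movemask step for callers that
 * already hold the four row vectors of edge values; it is presumably used
 * by the templated rasterizers in lp_rast_tri_tmp.h included at the end
 * of this file.
 */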


#define NR_PLANES 3







void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx.
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Scale the trivial-reject corner offsets up to a full 4x4 block
    * (eo * 4).
    */
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

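   /* Walk the 16x16 region as a 4x4 grid of 4x4 blocks: i steps in y,
    * j steps in x.  For each block, cx + rej4 tests the three edge values
    * against the scaled trivial-reject offsets; if no sign bit is set, no
    * plane rejects the block and a full 16-bit coverage mask is built,
    * otherwise the block is skipped entirely.
    */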
   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

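            /* A set bit in the mask means the pixel fails at least one
             * edge test; mask == 0xffff therefore means the block is
             * entirely outside, so nr only advances (keeping the entry)
             * for blocks with some coverage.  The unconditional stores
             * keep the inner loop branch-light.
             */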
            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

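   /* Shade the surviving blocks in a second pass.  The mask is inverted
    * because lp_rast_shade_quads_mask() expects set bits to mark pixels
    * that should be shaded, while the bits gathered above mark pixels
    * that failed an edge test.
    */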
   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}





void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx.
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison.
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}
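/*
 * lp_rast_triangle_3_4 is the single-block analogue of the 16x16 version
 * above: it evaluates all three edges for one 4x4 block in parallel and
 * shades directly.  No trivial-reject test is needed here; the binner has
 * presumably already classified this block as partially covered.
 */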

#undef NR_PLANES
#endif




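/*
 * Instantiate the generic plane-walking rasterizers for 1..8 planes:
 * each include of lp_rast_tri_tmp.h expands TAG(x) to x##_N, producing
 * entry points such as lp_rast_triangle_1 .. lp_rast_triangle_8.  When
 * TRI_16 (or TRI_4) is defined, the template also appears to emit a
 * 16x16 (or 4x4) specialization under that name; for the 3-plane case
 * those defines are commented out because hand-coded versions are
 * provided above.
 */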
#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
#define TRI_16 lp_rast_triangle_4_16
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
