/*
 * Mesa 3-D graphics library
 *
 * Copyright 2012 Intel Corporation
 * Copyright 2013 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chad Versace <chad.versace@linux.intel.com>
 *    Frank Henigman <fjhenigman@google.com>
 */

#include <string.h>

#include "util/macros.h"

#include "brw_context.h"
#include "intel_tiled_memcpy.h"

#ifdef __SSSE3__
#include <tmmintrin.h>
#endif

#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions.  Width and span are in bytes; height is in rows of
 * pixels (i.e. unitless).  A "span" is the maximum number of bytes we can
 * copy from linear to tiled without needing to calculate a new destination
 * address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;
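
/* For reference, both geometries describe the same 4 KB tile:
 * xtile_width * xtile_height = 512 * 8 = 4096 bytes, and
 * ytile_width * ytile_height = 128 * 32 = 4096 bytes.
 */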

#ifdef __SSSE3__
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/* NOTE: dst must be 16-byte aligned. src may be unaligned. */
#define rgba8_copy_16_aligned_dst(dst, src)                            \
   _mm_store_si128((__m128i *)(dst),                                   \
                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
                                    *(__m128i *) rgba8_permutation))

/* NOTE: src must be 16-byte aligned. dst may be unaligned. */
#define rgba8_copy_16_aligned_src(dst, src)                            \
   _mm_storeu_si128((__m128i *)(dst),                                  \
                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
                                     *(__m128i *) rgba8_permutation))
#endif
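
/* A sketch of what the shuffle computes: byte i of each 16-byte result is
 * taken from index rgba8_permutation[i] of the source, so every 4-byte
 * pixel {R,G,B,A} is rewritten as {B,G,R,A}.  For example the source bytes
 * {0x11,0x22,0x33,0x44} come out as {0x33,0x22,0x11,0x44}.
 */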

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   uint8_t const *s = src;

#ifdef __SSSE3__
   if (bytes == 16) {
      assert(!(((uintptr_t)dst) & 0xf));
      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
      return dst;
   }

   if (bytes == 64) {
      assert(!(((uintptr_t)dst) & 0xf));
      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
      rgba8_copy_16_aligned_dst(d+16, s+16);
      rgba8_copy_16_aligned_dst(d+32, s+32);
      rgba8_copy_16_aligned_dst(d+48, s+48);
      return dst;
   }
#endif

   while (bytes >= 4) {
      d[0] = s[2];
      d[1] = s[1];
      d[2] = s[0];
      d[3] = s[3];
      d += 4;
      s += 4;
      bytes -= 4;
   }
   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   uint8_t const *s = src;

#ifdef __SSSE3__
   if (bytes == 16) {
      assert(!(((uintptr_t)src) & 0xf));
      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
      return dst;
   }

   if (bytes == 64) {
      assert(!(((uintptr_t)src) & 0xf));
      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
      rgba8_copy_16_aligned_src(d+16, s+16);
      rgba8_copy_16_aligned_src(d+32, s+32);
      rgba8_copy_16_aligned_src(d+48, s+48);
      return dst;
   }
#endif

   while (bytes >= 4) {
      d[0] = s[2];
      d[1] = s[1];
      d[2] = s[0];
      d[3] = s[3];
      d += 4;
      s += 4;
      bytes -= 4;
   }
   return dst;
}
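
/* Usage sketch (illustrative, not driver code): converting one 16-pixel
 * row of RGBA data into a 16-byte-aligned BGRA buffer.  'pixels_in' and
 * 'pixels_out' are hypothetical names.
 *
 *    uint8_t pixels_in[64];                             // RGBA, any alignment
 *    uint8_t pixels_out[64] __attribute__((aligned(16)));
 *    rgba8_copy_aligned_dst(pixels_out, pixels_in, sizeof(pixels_in));
 *
 * With bytes == 64 and SSSE3 available this takes the four-shuffle fast
 * path; otherwise it falls through to the per-pixel swizzling loop.
 */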

/**
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest linear
 * stretch within a tile) and the middle must equal a whole number of spans.
 * Ranges may be empty.  The region copied must land entirely within one tile.
 * 'dst' is the start of the tile and 'src' is the corresponding
 * address to copy from, though copying begins at (x0, y0).
 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
 * Swizzling flips bit 6 in the copy destination offset, when certain other
 * bits are set in it.
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             mem_copy_fn mem_copy);
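
/* Worked example of the three-part split (illustration only): copying
 * bytes [5, 200) of an X-tile row, where xtile_span == 64, gives
 * x0 = 5, x1 = 64, x2 = 192, x3 = 200.  [5,64) and [192,200) are the short
 * unaligned edges and [64,192) is exactly two whole spans.
 */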

/**
 * Copy texture data from linear to X tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}
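
/* Swizzle illustration (not driver code): with swizzle_bit == 1<<6 and
 * yo == 512 (row 1 of an X tile), bit 9 of the offset is set and bit 10 is
 * clear, so swizzle == ((512 >> 3) ^ (512 >> 4)) & 64 == (64 ^ 32) & 64 == 64
 * and bit 6 of every destination offset in that row gets flipped.  For
 * yo == 1536 bits 9 and 10 are both set, their xor is zero, and nothing is
 * flipped.
 */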

/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}
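
/* Offset illustration (not driver code): with ytile_span == 16 and
 * bytes_per_column == 16 * 32 == 512, byte x == 35 of a row lands at
 * xo = 35 % 16 + (35 / 16) * 512 = 3 + 1024 = 1027 within the tile,
 * i.e. byte 3 of column 2.
 */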

/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy)
{
   /* The copy source offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy source offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}
/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the source offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy source offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the source offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}


/**
 * Copy texture data from linear to X tile layout, faster.
 *
 * Same as \ref linear_to_xtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_dst)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_dst)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy);
}
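
/* Note on the technique: comparing 'mem_copy' against known functions and
 * re-passing it as a compile-time-visible constant lets the compiler
 * specialize linear_to_xtiled() for each case, and FLATTEN (defined in
 * util/macros.h, typically __attribute__((flatten)) where supported) asks
 * it to inline the whole call tree.  The trailing call is a dead-code
 * fallback: every branch above either returns or is declared unreachable.
 */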

/**
 * Copy texture data from linear to Y tile layout, faster.
 *
 * Same as \ref linear_to_ytiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_dst)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_dst)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy);
}

/**
 * Copy texture data from X tile layout to linear, faster.
 *
 * Same as \ref xtiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_src)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_src)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   }
   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy);
}

/**
 * Copy texture data from Y tile layout to linear, faster.
 *
 * Same as \ref ytiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_src)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_src)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   }
   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy);
}

/**
 * Copy from linear to tiled texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the start of the texture and 'src' is the corresponding
 * address to copy from, though copying begins at (xt1, yt1).
 */
void
linear_to_tiled(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                uint32_t dst_pitch, int32_t src_pitch,
                bool has_swizzling,
                uint32_t tiling,
                mem_copy_fn mem_copy)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == I915_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = linear_to_xtiled_faster;
   } else if (tiling == I915_TILING_Y) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = linear_to_ytiled_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t) xt * th + (ptrdiff_t) yt * dst_pitch,
                   src + (ptrdiff_t) xt      + (ptrdiff_t) yt * src_pitch,
                   src_pitch,
                   swizzle_bit,
                   mem_copy);
      }
   }
}
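
/* Usage sketch (hypothetical, not driver code): uploading a 64x64-pixel
 * RGBA8 region whose top-left pixel is (16, 8) into a non-swizzled Y-tiled
 * surface with a tiled pitch of 'surf_pitch' bytes.  X coordinates are in
 * bytes, hence the scaling by 4 bytes per pixel.
 *
 *    linear_to_tiled(16 * 4, (16 + 64) * 4,            // xt1, xt2 (bytes)
 *                    8, 8 + 64,                        // yt1, yt2 (rows)
 *                    tiled_buf,
 *                    linear_buf - (16 * 4) - 8 * (ptrdiff_t) linear_pitch,
 *                    surf_pitch, linear_pitch,
 *                    false, I915_TILING_Y, memcpy);
 *
 * Per the comment above, 'src' must correspond to the start of the texture
 * even though copying begins at (xt1, yt1), hence the offset subtraction
 * when 'linear_buf' points at the region itself.
 */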

/**
 * Copy from tiled to linear texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'src' is the start of the texture and 'dst' is the corresponding
 * address to copy to, though copying begins at (xt1, yt1).
 */
void
tiled_to_linear(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                int32_t dst_pitch, uint32_t src_pitch,
                bool has_swizzling,
                uint32_t tiling,
                mem_copy_fn mem_copy)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == I915_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = xtiled_to_linear_faster;
   } else if (tiling == I915_TILING_Y) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = ytiled_to_linear_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the source tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t) xt      + (ptrdiff_t) yt * dst_pitch,
                   src + (ptrdiff_t) xt * th + (ptrdiff_t) yt * src_pitch,
                   dst_pitch,
                   swizzle_bit,
                   mem_copy);
      }
   }
}
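
/* Readback sketch (hypothetical, mirroring the upload example above): the
 * same 64x64-pixel RGBA8 region pulled back out of the tiled surface:
 *
 *    tiled_to_linear(16 * 4, (16 + 64) * 4, 8, 8 + 64,
 *                    linear_buf - (16 * 4) - 8 * (ptrdiff_t) linear_pitch,
 *                    tiled_buf,
 *                    linear_pitch, surf_pitch,
 *                    false, I915_TILING_Y, memcpy);
 */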

/**
 * Determine which copy function to use for the given format combination.
 *
 * The only two possible copy functions which are ever returned are a
 * direct memcpy and an RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
 * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
 * symmetric), it doesn't matter whether the copy is from the tiled image
 * to the untiled or vice versa.  The copy function required is the same in
 * either case so this function can be used.
 *
 * \param[in]  tiledFormat The format of the tiled image
 * \param[in]  format      The GL format of the client data
 * \param[in]  type        The GL type of the client data
 * \param[out] mem_copy    Will be set to one of either the standard
 *                         library's memcpy or a different copy function
 *                         that performs an RGBA to BGRA conversion
 * \param[out] cpp         Number of bytes per pixel
 * \param[in]  direction   Whether the data is being uploaded to or
 *                         downloaded from the tiled image, which selects the
 *                         alignment-specific RGBA <-> BGRA copy variant
 *
 * \return true if the format and type combination are valid
 */
bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,
                      enum intel_memcpy_direction direction)
{
   *mem_copy = NULL; /* stays NULL, so we return false below, for
                      * combinations that match none of the cases */

   if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
       !(format == GL_RGBA || format == GL_BGRA))
      return false; /* Invalid type/format combination */

   if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
       (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
      *cpp = 1;
      *mem_copy = memcpy;
   } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM)) {
      *cpp = 4;
      if (format == GL_BGRA) {
         *mem_copy = memcpy;
      } else if (format == GL_RGBA) {
         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
                                               : rgba8_copy_aligned_src;
      }
   } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
      *cpp = 4;
      if (format == GL_BGRA) {
         /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
          * use the same function.
          */
         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
                                               : rgba8_copy_aligned_src;
      } else if (format == GL_RGBA) {
         *mem_copy = memcpy;
      }
   }

   if (!(*mem_copy))
      return false;

   return true;
}
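
/* Usage sketch (hypothetical, not driver code): selecting the copy function
 * for a glReadPixels-style download from a BGRA surface requested as RGBA.
 * INTEL_DOWNLOAD is assumed to be the counterpart of INTEL_UPLOAD in
 * enum intel_memcpy_direction.
 *
 *    mem_copy_fn mem_copy = NULL;
 *    uint32_t cpp;
 *    if (intel_get_memcpy(MESA_FORMAT_B8G8R8A8_UNORM, GL_RGBA,
 *                         GL_UNSIGNED_BYTE, &mem_copy, &cpp,
 *                         INTEL_DOWNLOAD))
 *       ... pass mem_copy and cpp through to tiled_to_linear() ...
 *
 * Here the BGRA/RGBA mismatch makes intel_get_memcpy() pick
 * rgba8_copy_aligned_src and set cpp to 4.
 */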