; (Page chrome from the Kolibri OS Subversion web viewer, not part of the
;  original FFmpeg source — kept here as comments so the file assembles.)
; Subversion Repositories Kolibri OS
; Rev
; Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* SIMD-optimized IDCT-related routines
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2003-2013 Michael Niedermayer
  5. ;* Copyright (c) 2013 Daniel Kang
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23.  
  24. %include "libavutil/x86/x86util.asm"
  25.  
  26. SECTION_RODATA
  27.  
  28. cextern pb_80
  29.  
  30. SECTION .text
  31.  
  32. ;--------------------------------------------------------------------------
  33. ;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
  34. ;                                  ptrdiff_t line_size)
  35. ;--------------------------------------------------------------------------
  36.  
;--------------------------------------------------------------------------
; PUT_SIGNED_PIXELS_CLAMPED_HALF %1
; Emits code for one half (four output rows of 8 pixels) of
; put_signed_pixels_clamped.
; %1 = byte offset into the int16 coefficient block (0 = rows 0-3,
;      64 = rows 4-7; 64 bytes = 4 rows * 8 coeffs * sizeof(int16)).
; Expects (set up by the PUT_SIGNED_PIXELS_CLAMPED wrapper macro):
;   m0      = [pb_80] bias constant (presumably 0x80 in every byte, per the
;             FFmpeg pb_80 naming convention — defined outside this file)
;   blockq  = const int16_t *block
;   pixelsq = uint8_t *pixels, start of the current 4-row group
;   lsizeq  = line_size,  lsize3q = line_size * 3
;--------------------------------------------------------------------------
%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
    ; Load coefficient rows, skipping every other register-width chunk so
    ; that each packsswb below can merge the skipped chunk from memory.
    mova     m1, [blockq+mmsize*0+%1]
    mova     m2, [blockq+mmsize*2+%1]
%if mmsize == 8
    ; MMX: a register holds only 4 int16, so four regs cover the 4 rows.
    mova     m3, [blockq+mmsize*4+%1]
    mova     m4, [blockq+mmsize*6+%1]
%endif
    ; Saturate int16 -> signed int8 ([-128,127]) while packing pairs of
    ; chunks into full registers of bytes.
    packsswb m1, [blockq+mmsize*1+%1]
    packsswb m2, [blockq+mmsize*3+%1]
%if mmsize == 8
    packsswb m3, [blockq+mmsize*5+%1]
    packsswb m4, [blockq+mmsize*7+%1]
%endif
    ; Adding the m0 bias flips the sign bit, re-mapping the saturated
    ; signed range onto the unsigned pixel range [0,255].
    paddb    m1, m0
    paddb    m2, m0
%if mmsize == 8
    paddb    m3, m0
    paddb    m4, m0
    ; MMX: one 8-byte register per output row.
    movq     [pixelsq+lsizeq*0], m1
    movq     [pixelsq+lsizeq*1], m2
    movq     [pixelsq+lsizeq*2], m3
    movq     [pixelsq+lsize3q ], m4
%else
    ; SSE2: each xmm register holds two 8-byte rows; store the low half
    ; with movq and the high half with movhps.
    movq     [pixelsq+lsizeq*0], m1
    movhps   [pixelsq+lsizeq*1], m1
    movq     [pixelsq+lsizeq*2], m2
    movhps   [pixelsq+lsize3q ], m2
%endif
%endmacro
  66.  
;--------------------------------------------------------------------------
; PUT_SIGNED_PIXELS_CLAMPED %1
; Instantiates put_signed_pixels_clamped (8x8 block) for the instruction
; set selected by the preceding INIT_MMX/INIT_XMM.
; %1 = number of xmm registers used (x264asm bookkeeping for WIN64 xmm
;      save/restore; 0 for the MMX version).
; C signature: void (const int16_t *block, uint8_t *pixels,
;                    ptrdiff_t line_size)
; 3 args, 4 GPRs (lsize3 is an internal scratch register).
;--------------------------------------------------------------------------
%macro PUT_SIGNED_PIXELS_CLAMPED 1
cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
    mova     m0, [pb_80]                  ; bias constant, kept live in m0
    lea      lsize3q, [lsizeq*3]          ; precompute 3*line_size (rows 3/7)
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0      ; rows 0-3
    lea      pixelsq, [pixelsq+lsizeq*4]  ; advance output to row 4
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64     ; rows 4-7 (64-byte block offset)
    RET
%endmacro
  76.  
; Emit ff_put_signed_pixels_clamped_{mmx,sse2}.
INIT_MMX mmx
PUT_SIGNED_PIXELS_CLAMPED 0     ; MMX: no xmm registers used
INIT_XMM sse2
PUT_SIGNED_PIXELS_CLAMPED 3     ; SSE2: uses xmm0-xmm2
  81.  
  82. ;--------------------------------------------------------------------------
  83. ; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
  84. ;                            ptrdiff_t line_size);
  85. ;--------------------------------------------------------------------------
; %1 = block offset in bytes (0 = rows 0-3, 64 = rows 4-7)
; Emits code for one half (four output rows of 8 pixels) of
; put_pixels_clamped. Same structure as PUT_SIGNED_PIXELS_CLAMPED_HALF,
; but packuswb saturates int16 directly to unsigned [0,255], so no bias
; register is needed.
; Expects blockq/pixelsq/lsizeq/lsize3q as set up by the wrapper macro.
%macro PUT_PIXELS_CLAMPED_HALF 1
    ; Load every other register-width chunk; packuswb merges the rest.
    mova     m0, [blockq+mmsize*0+%1]
    mova     m1, [blockq+mmsize*2+%1]
%if mmsize == 8
    ; MMX: a register holds only 4 int16, so four regs cover the 4 rows.
    mova     m2, [blockq+mmsize*4+%1]
    mova     m3, [blockq+mmsize*6+%1]
%endif
    ; Saturate int16 -> uint8 ([0,255]) while packing to bytes.
    packuswb m0, [blockq+mmsize*1+%1]
    packuswb m1, [blockq+mmsize*3+%1]
%if mmsize == 8
    packuswb m2, [blockq+mmsize*5+%1]
    packuswb m3, [blockq+mmsize*7+%1]
    ; MMX: one 8-byte register per output row.
    movq           [pixelsq], m0
    movq    [lsizeq+pixelsq], m1
    movq  [2*lsizeq+pixelsq], m2
    movq   [lsize3q+pixelsq], m3
%else
    ; SSE2: each xmm register holds two rows; movq stores the low half,
    ; movhps the high half.
    movq           [pixelsq], m0
    movhps  [lsizeq+pixelsq], m0
    movq  [2*lsizeq+pixelsq], m1
    movhps [lsize3q+pixelsq], m1
%endif
%endmacro
  110.  
;--------------------------------------------------------------------------
; PUT_PIXELS_CLAMPED
; Instantiates put_pixels_clamped (8x8 block) for the instruction set
; selected by the preceding INIT_MMX/INIT_XMM.
; C signature: void (const int16_t *block, uint8_t *pixels,
;                    ptrdiff_t line_size)
; 3 args, 4 GPRs (lsize3 is internal scratch), 2 xmm registers.
;--------------------------------------------------------------------------
%macro PUT_PIXELS_CLAMPED 0
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
    lea lsize3q, [lsizeq*3]          ; precompute 3*line_size (rows 3/7)
    PUT_PIXELS_CLAMPED_HALF 0        ; rows 0-3
    lea pixelsq, [pixelsq+lsizeq*4]  ; advance output to row 4
    PUT_PIXELS_CLAMPED_HALF 64       ; rows 4-7 (64-byte block offset)
    RET
%endmacro
  119.  
; Emit ff_put_pixels_clamped_{mmx,sse2}.
INIT_MMX mmx
PUT_PIXELS_CLAMPED
INIT_XMM sse2
PUT_PIXELS_CLAMPED
  124.  
  125. ;--------------------------------------------------------------------------
  126. ; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
  127. ;                            ptrdiff_t line_size);
  128. ;--------------------------------------------------------------------------
; %1 = block offset in bytes (two rows per invocation: 2 * 8 * 2 = 32)
; One-arg overload of ADD_PIXELS_CLAMPED (NASM selects %macro overloads by
; parameter count): emits code that adds two rows of int16 coefficients to
; two rows of 8 existing pixels, clamping the result to [0,255].
; Expects (set up by the zero-arg wrapper macro):
;   m4      = 0 (zero register for the byte->word unpacks)
;   blockq  = const int16_t *block
;   pixelsq = uint8_t *pixels, start of the current 2-row group
;   lsizeq  = line_size
%macro ADD_PIXELS_CLAMPED 1
    ; Load coefficients for the two rows.
    mova       m0, [blockq+mmsize*0+%1]
    mova       m1, [blockq+mmsize*1+%1]
%if mmsize == 8
    ; MMX: a register holds only 4 int16, so four regs cover the 2 rows.
    mova       m5, [blockq+mmsize*2+%1]
    mova       m6, [blockq+mmsize*3+%1]
%endif
    ; Load the two 8-pixel destination rows.
    movq       m2, [pixelsq]
    movq       m3, [pixelsq+lsizeq]
%if mmsize == 8
    ; Widen each pixel row from uint8 to int16 (low/high halves via
    ; punpcklbw/punpckhbw against zero), then saturating-add the coeffs.
    mova       m7, m2
    punpcklbw  m2, m4
    punpckhbw  m7, m4
    paddsw     m0, m2
    paddsw     m1, m7
    mova       m7, m3
    punpcklbw  m3, m4
    punpckhbw  m7, m4
    paddsw     m5, m3
    paddsw     m6, m7
%else
    ; SSE2: the full 8-pixel row fits in the low half of an xmm register,
    ; so a single punpcklbw widens the whole row.
    punpcklbw  m2, m4
    punpcklbw  m3, m4
    paddsw     m0, m2
    paddsw     m1, m3
%endif
    ; Saturate the int16 sums back to uint8 ([0,255]) and store.
    packuswb   m0, m1
%if mmsize == 8
    packuswb   m5, m6
    movq       [pixelsq], m0
    movq       [pixelsq+lsizeq], m5
%else
    ; SSE2: m0 now holds both output rows (low/high 8 bytes).
    movq       [pixelsq], m0
    movhps     [pixelsq+lsizeq], m0
%endif
%endmacro
  166.  
;--------------------------------------------------------------------------
; ADD_PIXELS_CLAMPED (zero-arg overload)
; Instantiates add_pixels_clamped (8x8 block) for the instruction set
; selected by the preceding INIT_MMX/INIT_XMM, invoking the one-arg
; overload above four times (two rows each, 32-byte coefficient stride).
; C signature: void (const int16_t *block, uint8_t *pixels,
;                    ptrdiff_t line_size)
; 3 args, 3 GPRs, 5 xmm registers.
;--------------------------------------------------------------------------
%macro ADD_PIXELS_CLAMPED 0
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
    pxor       m4, m4                    ; m4 = 0, used by the unpacks
    ADD_PIXELS_CLAMPED 0                 ; rows 0-1
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32                ; rows 2-3
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64                ; rows 4-5
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96                ; rows 6-7
    RET
%endmacro
  179.  
; Emit ff_add_pixels_clamped_{mmx,sse2}.
INIT_MMX mmx
ADD_PIXELS_CLAMPED
INIT_XMM sse2
ADD_PIXELS_CLAMPED
  184.