Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;*****************************************************************************
  2. ;* SSE2-optimized weighted prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  5. ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23.  
  24. %include "libavutil/x86/x86util.asm"
  25.  
  26. SECTION .text
  27.  
  28. ;-----------------------------------------------------------------------------
  29. ; biweight pred:
  30. ;
  31. ; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
  32. ;                               int height, int log2_denom, int weightd,
  33. ;                               int weights, int offset);
  34. ; and
  35. ; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
  36. ;                             int log2_denom, int weight, int offset);
  37. ;-----------------------------------------------------------------------------
  38.  
  ;-----------------------------------------------------------------------------
  ; WEIGHT_SETUP
  ; Loads the constants shared by every h264_weight_* row loop.
  ; In (per the prototype comment above; assumes x86inc maps rN to argument N):
  ;   r3d = log2_denom, r4d = weight, r5 = offset
  ; Out:
  ;   m3 = low 16 bits of weight replicated into every word lane
  ;   m5 = low 16 bits of ((2*offset + 1) << log2_denom) >> 1, replicated into
  ;        every word lane (for log2_denom >= 1 this equals
  ;        (offset << log2_denom) + (1 << (log2_denom - 1)))
  ;   m6 = log2_denom (shift count)
  ;   m7 = 0
  ; Clobbers: r5, flags
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_SETUP 0
      pxor       m7, m7               ; zero register for byte->word unpacking
      movd       m6, r3d              ; m6 = log2_denom
      movd       m3, r4d              ; m3 = weight (scalar)
      add        r5, r5
      inc        r5                   ; r5 = 2*offset + 1
      movd       m5, r5d
      pslld      m5, m6
      psrld      m5, 1                ; m5 = ((2*offset + 1) << log2_denom) >> 1
  %if mmsize != 16
      pshufw     m3, m3, 0            ; broadcast word 0 across the register
      pshufw     m5, m5, 0
  %else
      pshuflw    m3, m3, 0            ; broadcast word 0 across the low qword ...
      punpcklqdq m3, m3               ; ... then duplicate it into the high qword
      pshuflw    m5, m5, 0
      punpcklqdq m5, m5
  %endif
  %endmacro
  58.  
  ;-----------------------------------------------------------------------------
  ; WEIGHT_OP off_lo, off_hi
  ; Weights two half-register groups of pixels read from [r0+off_lo] and
  ; [r0+off_hi] (either offset may be an immediate or a register).
  ; Per pixel: clamp_u8((pixel * m3 + m5) >> m6), where the addition is a signed
  ; saturating 16-bit add and the shift is arithmetic.
  ; Expects m3/m5/m6/m7 as prepared by WEIGHT_SETUP.
  ; Out:      m0 = packed bytes, group off_lo in the low half, off_hi in the high
  ; Clobbers: m1
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_OP 2
      ; first group -> m0 (words)
      movh          m0, [r0+%1]
      punpcklbw     m0, m7            ; bytes -> words (m7 is zero)
      pmullw        m0, m3            ; * weight
      paddsw        m0, m5            ; + offset/rounding term
      psraw         m0, m6            ; >> log2_denom

      ; second group -> m1 (words)
      movh          m1, [r0+%2]
      punpcklbw     m1, m7
      pmullw        m1, m3
      paddsw        m1, m5
      psraw         m1, m6

      packuswb      m0, m1            ; words -> bytes with unsigned saturation
  %endmacro
  72.  
  ;-----------------------------------------------------------------------------
  ; h264_weight_16, MMXEXT flavour.
  ; Prototype as in the comment at the top of this section:
  ;   (uint8_t *dst, int stride, int height, int log2_denom, int weight,
  ;    int offset)
  ; Weights a 16-byte-wide block in place; each row is processed as two 8-byte
  ; halves.  r0 = current row, r1 = stride, r2d = rows left.
  ; NOTE(review): stride is documented as int but r1 is added at full register
  ; width - assumes the caller passes it sign-extended; confirm.
  ;-----------------------------------------------------------------------------
  INIT_MMX mmxext
  cglobal h264_weight_16, 6, 6, 0
      WEIGHT_SETUP
  .row_loop:
      WEIGHT_OP  0,  4                ; bytes 0..7 of the row
      mova       [r0], m0
      WEIGHT_OP  8, 12                ; bytes 8..15 of the row
      mova       [r0+8], m0
      add        r0, r1               ; advance to the next row
      dec        r2d
      jnz        .row_loop
      REP_RET
  85.  
  ;-----------------------------------------------------------------------------
  ; WEIGHT_FUNC_MM width, xmm_regs
  ; Emits h264_weight_<width> for the currently selected instruction set.
  ; One full SIMD register (mmsize bytes) is weighted per row, so <width> must
  ; equal mmsize.  xmm_regs is forwarded to cglobal as the vector-register count.
  ; r0 = current row, r1 = stride, r2d = rows left.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_FUNC_MM 2
  cglobal h264_weight_%1, 6, 6, %2
      WEIGHT_SETUP
  .row_loop:
      WEIGHT_OP  0, mmsize/2          ; low and high half of the row
      mova       [r0], m0
      add        r0, r1               ; advance to the next row
      dec        r2d
      jnz        .row_loop
      REP_RET
  %endmacro

  ; 8-pixel rows in one MMX register
  INIT_MMX mmxext
  WEIGHT_FUNC_MM  8, 0
  ; 16-pixel rows in one XMM register
  INIT_XMM sse2
  WEIGHT_FUNC_MM 16, 8
  102.  
  ;-----------------------------------------------------------------------------
  ; WEIGHT_FUNC_HALF_MM width, xmm_regs
  ; Emits h264_weight_<width> where a row is only half a SIMD register wide, so
  ; two consecutive rows are weighted per iteration.
  ; r0 = current row pair, r1 = stride, r2d = row pairs left, r3 = 2*stride.
  ; r3 is free for reuse because WEIGHT_SETUP has already copied log2_denom
  ; into m6.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_FUNC_HALF_MM 2
  cglobal h264_weight_%1, 6, 6, %2
      WEIGHT_SETUP
      lea        r3, [r1*2]           ; r3 = distance between row pairs
      sar        r2d, 1               ; height -> number of row pairs
  .pair_loop:
      WEIGHT_OP  0, r1                ; m0 = row 0 (low half) | row 1 (high half)
      movh       [r0], m0             ; store row 0
  %if mmsize != 16
      psrlq      m0, 32               ; bring row 1 down to the low dword
      movh       [r0+r1], m0
  %else
      movhps     [r0+r1], m0          ; store row 1 straight from the high qword
  %endif
      add        r0, r3
      dec        r2d
      jnz        .pair_loop
      REP_RET
  %endmacro

  ; 4-pixel rows, two per MMX register
  INIT_MMX mmxext
  WEIGHT_FUNC_HALF_MM 4, 0
  ; 8-pixel rows, two per XMM register
  INIT_XMM sse2
  WEIGHT_FUNC_HALF_MM 8, 8
  127.  
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_SETUP
  ; Loads the constants shared by every h264_biweight_* row loop.
  ; In (per the prototype comment above; assumes x86inc maps rN to argument N):
  ;   r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
  ; Out:
  ;   m6 = shift count: log2_denom + 1, or log2_denom when the weights were
  ;        halved (see below)
  ;   m5 = low 16 bits of (((offset + 1) | 1) << (log2_denom + 1)) >> 1 (or of
  ;        the halved equivalent), replicated into every word lane
  ;   non-SSSE3: m3 = weightd and m4 = weights in every word lane, m7 = 0
  ;   SSSE3:     m4 = (weightd, weights) byte pair in every word lane
  ; Clobbers: r4, r5, r6, off_regd (r7d on x86-64, r3d otherwise), flags, and
  ;           m0 in the SSSE3 variant.  Since r3d may be overwritten, the
  ;           functions using this macro reload the height from r3m afterwards.
  ;
  ; Every integer argument is a C int, so only the low 32 bits of its register
  ; are meaningful.  All scalar operations therefore use the dword views; in
  ; particular the "weight == 128" tests must not look at the upper half of a
  ; 64-bit register, which the calling convention leaves unspecified.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_SETUP 0
  %if ARCH_X86_64
  %define off_regd r7d
  %else
  %define off_regd r3d
  %endif
      mov  off_regd, r7m
      add  off_regd, 1
      or   off_regd, 1                ; off = (offset + 1) | 1
      add       r4d, 1                ; shift = log2_denom + 1
      cmp       r6d, 128
      je .nonnormal
      cmp       r5d, 128              ; dword compare, matching the r6d test
      jne .normal
  .nonnormal:
      ; One of the weights is exactly 128: halve both weights and the offset
      ; term and take back the extra shift bit.
      ; NOTE(review): presumably needed because 128 does not fit the signed
      ; byte weights consumed by pmaddubsw in BIWEIGHT_SSSE3_OP - confirm.
      sar       r5d, 1
      sar       r6d, 1
      sar  off_regd, 1
      sub       r4d, 1
  .normal:
  %if cpuflag(ssse3)
      movd       m4, r5d              ; weightd
      movd       m0, r6d              ; weights
  %else
      movd       m3, r5d              ; weightd
      movd       m4, r6d              ; weights
  %endif
      movd       m5, off_regd
      movd       m6, r4d              ; shift count for psraw
      pslld      m5, m6
      psrld      m5, 1                ; m5 = (off << shift) >> 1
  %if cpuflag(ssse3)
      punpcklbw  m4, m0               ; byte 0 = weightd, byte 1 = weights
      pshuflw    m4, m4, 0            ; broadcast word 0 across the low qword
      pshuflw    m5, m5, 0
      punpcklqdq m4, m4               ; ... and into the high qword
      punpcklqdq m5, m5

  %else
  %if mmsize == 16
      pshuflw    m3, m3, 0
      pshuflw    m4, m4, 0
      pshuflw    m5, m5, 0
      punpcklqdq m3, m3
      punpcklqdq m4, m4
      punpcklqdq m5, m5
  %else
      pshufw     m3, m3, 0
      pshufw     m4, m4, 0
      pshufw     m5, m5, 0
  %endif
      pxor       m7, m7               ; zero register for byte->word unpacking
  %endif
  %endmacro
  182.  
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_STEPA acc, tmp, off
  ; Loads half a register of pixels from [r0+off] and from [r1+off], widens
  ; them to words and forms  m<acc> = [r0+off]*m3 + [r1+off]*m4  with a signed
  ; saturating 16-bit add.  acc and tmp are register numbers and must differ.
  ; Expects m3/m4/m7 as prepared by the non-SSSE3 branch of BIWEIGHT_SETUP.
  ; Clobbers: m<tmp>
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_STEPA 3
      ; pixels at [r0+off], scaled by m3
      movh       m%1, [r0+%3]
      punpcklbw  m%1, m7              ; bytes -> words (m7 is zero)
      pmullw     m%1, m3

      ; pixels at [r1+off], scaled by m4
      movh       m%2, [r1+%3]
      punpcklbw  m%2, m7
      pmullw     m%2, m4

      paddsw     m%1, m%2             ; combine both weighted contributions
  %endmacro
  192.  
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_STEPB
  ; Finishes two word vectors produced by BIWEIGHT_STEPA (in m0 and m1): adds
  ; the term in m5 with signed saturation, shifts right arithmetically by m6
  ; and packs both into m0 as unsigned-saturated bytes (m0 words in the low
  ; half, m1 words in the high half).
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_STEPB 0
      paddsw     m0, m5
      psraw      m0, m6

      paddsw     m1, m5
      psraw      m1, m6

      packuswb   m0, m1
  %endmacro
  200.  
  ;-----------------------------------------------------------------------------
  ; h264_biweight_16, MMXEXT flavour.
  ; Prototype as in the comment at the top of this section:
  ;   (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom,
  ;    int weightd, int weights, int offset)
  ; Blends src into dst, 16 bytes per row, processed as four groups of four
  ; pixels.  r0 = dst row, r1 = src row, r2 = stride, r3d = rows left.
  ;-----------------------------------------------------------------------------
  INIT_MMX mmxext
  cglobal h264_biweight_16, 7, 8, 0
      BIWEIGHT_SETUP
      movifnidn  r3d, r3m             ; (re)load height; BIWEIGHT_SETUP may use r3d
  .row_loop:
      ; bytes 0..7
      BIWEIGHT_STEPA 0, 1, 0
      BIWEIGHT_STEPA 1, 2, 4
      BIWEIGHT_STEPB
      mova       [r0], m0
      ; bytes 8..15
      BIWEIGHT_STEPA 0, 1, 8
      BIWEIGHT_STEPA 1, 2, 12
      BIWEIGHT_STEPB
      mova       [r0+8], m0
      ; next row in both planes
      add        r1, r2
      add        r0, r2
      dec        r3d
      jnz        .row_loop
      REP_RET
  219.  
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_FUNC_MM width, xmm_regs
  ; Emits h264_biweight_<width> for the currently selected instruction set.
  ; One full SIMD register (mmsize bytes) of dst is produced per row, so
  ; <width> must equal mmsize.  xmm_regs is forwarded to cglobal.
  ; r0 = dst row, r1 = src row, r2 = stride, r3d = rows left.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_FUNC_MM 2
  cglobal h264_biweight_%1, 7, 8, %2
      BIWEIGHT_SETUP
      movifnidn  r3d, r3m             ; (re)load height; BIWEIGHT_SETUP may use r3d
  .row_loop:
      BIWEIGHT_STEPA 0, 1, 0          ; low half of the row  -> m0 (words)
      BIWEIGHT_STEPA 1, 2, mmsize/2   ; high half of the row -> m1 (words)
      BIWEIGHT_STEPB
      mova       [r0], m0
      ; next row in both planes
      add        r1, r2
      add        r0, r2
      dec        r3d
      jnz        .row_loop
      REP_RET
  %endmacro

  ; 8-pixel rows in one MMX register
  INIT_MMX mmxext
  BIWEIGHT_FUNC_MM  8, 0
  ; 16-pixel rows in one XMM register
  INIT_XMM sse2
  BIWEIGHT_FUNC_MM 16, 8
  240.  
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_FUNC_HALF_MM width, xmm_regs
  ; Emits h264_biweight_<width> where a row is only half a SIMD register wide,
  ; so two consecutive rows are blended per iteration.
  ; r0 = dst row pair, r1 = src row pair, r2 = stride, r3d = row pairs left,
  ; r4 = 2*stride (r4 is free: BIWEIGHT_SETUP has already moved the shift
  ; count into m6).
  ;
  ; height is a C int, so only r3d is meaningful.  The halving is done on the
  ; dword view: a full-width shift would move the unspecified bit 32 into
  ; bit 31 of the loop counter on x86-64.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_FUNC_HALF_MM 2
  cglobal h264_biweight_%1, 7, 8, %2
      BIWEIGHT_SETUP
      movifnidn r3d, r3m              ; (re)load height; BIWEIGHT_SETUP may use r3d
      sar       r3d, 1                ; height -> number of row pairs
      lea        r4, [r2*2]           ; distance between row pairs
  .nextrow:
      BIWEIGHT_STEPA 0, 1, 0          ; row 0 -> m0 (words)
      BIWEIGHT_STEPA 1, 2, r2         ; row 1 -> m1 (words)
      BIWEIGHT_STEPB                  ; m0 = row 0 (low half) | row 1 (high half)
      movh       [r0], m0             ; store row 0
  %if mmsize == 16
      movhps     [r0+r2], m0          ; store row 1 from the high qword
  %else
      psrlq      m0, 32               ; bring row 1 down to the low dword
      movh       [r0+r2], m0
  %endif
      add        r0, r4
      add        r1, r4
      dec        r3d
      jnz .nextrow
      REP_RET
  %endmacro

  ; 4-pixel rows, two per MMX register
  INIT_MMX mmxext
  BIWEIGHT_FUNC_HALF_MM 4, 0
  ; 8-pixel rows, two per XMM register
  INIT_XMM sse2
  BIWEIGHT_FUNC_HALF_MM 8, 8
  269.  
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_SSSE3_OP
  ; In:  m0, m2 = byte vectors to be multiplied pairwise against the byte
  ;      pairs in m4 (the callers interleave [r0] and [r1] pixels into them);
  ;      m4/m5/m6 as prepared by the SSSE3 branch of BIWEIGHT_SETUP.
  ; pmaddubsw multiplies each byte pair by the matching pair in m4 and sums
  ; the two products into one word; the m5 term is then added with signed
  ; saturation and the result is shifted right arithmetically by m6.
  ; Out: m0 = both results packed to unsigned-saturated bytes (m0 words in the
  ;      low half, m2 words in the high half).  Clobbers m2.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_SSSE3_OP 0
      pmaddubsw  m0, m4
      paddsw     m0, m5
      psraw      m0, m6

      pmaddubsw  m2, m4
      paddsw     m2, m5
      psraw      m2, m6

      packuswb   m0, m2
  %endmacro
  279.  
  ;-----------------------------------------------------------------------------
  ; h264_biweight_16, SSSE3 flavour (same prototype as the other biweight
  ; functions).  One 16-byte row per iteration: the [r0] and [r1] bytes are
  ; interleaved so that BIWEIGHT_SSSE3_OP can weight and sum each pair with a
  ; single pmaddubsw.
  ; r0 = dst row, r1 = src row, r2 = stride, r3d = rows left.
  ; Note: [r1] is used directly as a 16-byte memory operand of punpcklbw.
  ;-----------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal h264_biweight_16, 7, 8, 8
      BIWEIGHT_SETUP
      movifnidn  r3d, r3m             ; (re)load height; BIWEIGHT_SETUP may use r3d

  .row_loop:
      ; bytes 0..7: interleave [r0] and [r1] bytes into m0
      movh       m0, [r0]
      punpcklbw  m0, [r1]
      ; bytes 8..15: interleave into m2 (m3 is a temporary)
      movh       m2, [r0+8]
      movh       m3, [r1+8]
      punpcklbw  m2, m3
      BIWEIGHT_SSSE3_OP
      mova       [r0], m0
      ; next row in both planes
      add        r1, r2
      add        r0, r2
      dec        r3d
      jnz        .row_loop
      REP_RET
  298.  
  ;-----------------------------------------------------------------------------
  ; h264_biweight_8, SSSE3 flavour (same prototype as the other biweight
  ; functions).  Rows are 8 bytes wide, so two rows are blended per iteration.
  ; r0 = dst row pair, r1 = src row pair, r2 = stride, r3d = row pairs left,
  ; r4 = 2*stride (r4 is free: BIWEIGHT_SETUP has already moved the shift
  ; count into m6).
  ;
  ; height is a C int, so only r3d is meaningful.  The halving is done on the
  ; dword view: a full-width shift would move the unspecified bit 32 into
  ; bit 31 of the loop counter on x86-64.
  ;-----------------------------------------------------------------------------
  INIT_XMM ssse3
  cglobal h264_biweight_8, 7, 8, 8
      BIWEIGHT_SETUP
      movifnidn r3d, r3m              ; (re)load height; BIWEIGHT_SETUP may use r3d
      sar       r3d, 1                ; height -> number of row pairs
      lea        r4, [r2*2]           ; distance between row pairs

  .nextrow:
      movh       m0, [r0]             ; row 0 of [r0]
      movh       m1, [r1]             ; row 0 of [r1]
      movh       m2, [r0+r2]          ; row 1 of [r0]
      movh       m3, [r1+r2]          ; row 1 of [r1]
      punpcklbw  m0, m1               ; interleave the two sources byte by byte
      punpcklbw  m2, m3
      BIWEIGHT_SSSE3_OP               ; m0 = row 0 (low qword) | row 1 (high qword)
      movh       [r0], m0
      movhps     [r0+r2], m0
      add        r0, r4
      add        r1, r4
      dec        r3d
      jnz .nextrow
      REP_RET
  321.