Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. ;*****************************************************************************
  2. ;* MMXEXT/SSE2/SSSE3-optimized H.264 weighted prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  5. ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23.  
  24. %include "libavutil/x86/x86util.asm"
  25.  
  26. SECTION .text
  27.  
  28. ;-----------------------------------------------------------------------------
  29. ; biweight and weight pred:
  30. ;
  31. ; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
  32. ;                            int height, int log2_denom, int weightd,
  33. ;                            int weights, int offset);
  34. ; and
  35. ; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
  36. ;                          int log2_denom, int weight, int offset);
  37. ;-----------------------------------------------------------------------------
  38.  
  39. %macro WEIGHT_SETUP 0 ; prologue shared by the h264_weight_* functions: prepares m3, m5, m6, m7 for WEIGHT_OP
  40.     add        r5, r5 ; r5 = 2*r5 (r5 is presumably 'offset' per the prototype comment - confirm against x86inc arg mapping)
  41.     inc        r5 ; r5 = 2*offset + 1
  42.     movd       m3, r4d ; m3 low dword = r4d (presumably 'weight')
  43.     movd       m5, r5d ; m5 low dword = 2*offset + 1
  44.     movd       m6, r3d ; m6 = r3d (presumably 'log2_denom'); kept as the shift count for WEIGHT_OP
  45.     pslld      m5, m6 ; m5 = (2*offset + 1) << log2_denom
  46.     psrld      m5, 1 ; m5 = ((2*offset + 1) << log2_denom) >> 1, i.e. scaled offset plus rounding term
  47. %if mmsize == 16 ; XMM build: broadcast word 0 to all eight words
  48.     pshuflw    m3, m3, 0 ; replicate word 0 of m3 across the low four words
  49.     pshuflw    m5, m5, 0 ; same for m5
  50.     punpcklqdq m3, m3 ; copy the low quadword into the high quadword
  51.     punpcklqdq m5, m5 ; same for m5
  52. %else ; MMX build: broadcast word 0 to all four words
  53.     pshufw     m3, m3, 0
  54.     pshufw     m5, m5, 0
  55. %endif
  56.     pxor       m7, m7 ; m7 = 0, used by WEIGHT_OP to zero-extend bytes to words
  57. %endmacro
  58.  
  59. %macro WEIGHT_OP 2 ; %1, %2 = offsets from r0 of the two half-register loads; weighted result is left packed in m0 (clobbers m1)
  60.     movh          m0, [r0+%1] ; load first group of pixels (movh comes from the x86inc/x86util layer)
  61.     movh          m1, [r0+%2] ; load second group of pixels
  62.     punpcklbw     m0, m7 ; zero-extend bytes to words (m7 is zero after WEIGHT_SETUP)
  63.     punpcklbw     m1, m7
  64.     pmullw        m0, m3 ; multiply each pixel by the broadcast weight in m3
  65.     pmullw        m1, m3
  66.     paddsw        m0, m5 ; add the offset/rounding term in m5 with signed saturation
  67.     paddsw        m1, m5
  68.     psraw         m0, m6 ; arithmetic shift right by the count held in m6
  69.     psraw         m1, m6
  70.     packuswb      m0, m1 ; pack both word vectors to bytes with unsigned saturation: m0 words low, m1 words high
  71. %endmacro
  72.  
  73. INIT_MMX mmxext ; following code is assembled for MMX registers with the mmxext flag (x86inc macro)
  74. cglobal h264_weight_16, 6, 6, 0 ; weights a 16-byte-wide block in place at r0; cglobal operands are presumably args, GPRs, XMM regs - confirm in x86inc
  75.     WEIGHT_SETUP ; build weight (m3), offset/rounding (m5), shift count (m6), zero (m7)
  76. .nextrow: ; one 16-byte row per iteration
  77.     WEIGHT_OP 0,  4 ; weight bytes 0..7 of the row (two 4-byte loads) into m0
  78.     mova     [r0  ], m0 ; store them back over the source pixels
  79.     WEIGHT_OP 8, 12 ; weight bytes 8..15 of the row
  80.     mova     [r0+8], m0
  81.     add        r0, r1 ; advance to the next row; NOTE(review): r1 is 'int stride' in the prototype comment but is added full-width - confirm upper bits are valid on x86-64
  82.     dec        r2d ; r2d is the remaining row count
  83.     jnz .nextrow
  84.     REP_RET ; return macro supplied by x86inc
  85.  
  86. %macro WEIGHT_FUNC_MM 2 ; emits h264_weight_%1; %1 = width suffix of the function name, %2 = XMM register count handed to cglobal
  87. cglobal h264_weight_%1, 6, 6, %2
  88.     WEIGHT_SETUP ; build weight (m3), offset/rounding (m5), shift count (m6), zero (m7)
  89. .nextrow: ; one full-register-wide row per iteration
  90.     WEIGHT_OP 0, mmsize/2 ; weight the low and high halves of one mmsize-byte row into m0
  91.     mova     [r0], m0 ; overwrite the row in place
  92.     add        r0, r1 ; step to the next row by the stride in r1
  93.     dec        r2d ; r2d is the remaining row count
  94.     jnz .nextrow
  95.     REP_RET ; return macro supplied by x86inc
  96. %endmacro
  97.  
  98. INIT_MMX mmxext ; MMX registers (mmsize 8) for the next instantiation
  99. WEIGHT_FUNC_MM  8, 0 ; emits h264_weight_8 for mmxext, no XMM registers
  100. INIT_XMM sse2 ; XMM registers (mmsize 16) for the next instantiation
  101. WEIGHT_FUNC_MM 16, 8 ; emits h264_weight_16 for sse2, declaring 8 XMM registers
  102.  
  103. %macro WEIGHT_FUNC_HALF_MM 2 ; emits h264_weight_%1 for rows half a register wide, two rows per iteration; %2 = XMM register count for cglobal
  104. cglobal h264_weight_%1, 6, 6, %2
  105.     WEIGHT_SETUP ; build weight (m3), offset/rounding (m5), shift count (m6), zero (m7)
  106.     sar       r2d, 1 ; halve the row count: each iteration handles two rows
  107.     lea        r3, [r1*2] ; r3 = 2*stride; r3 is free because WEIGHT_SETUP already copied it into m6
  108. .nextrow:
  109.     WEIGHT_OP 0, r1 ; weight the row at [r0] and the row at [r0+r1]; result rows sit in the low/high halves of m0
  110.     movh     [r0], m0 ; store the first row from the low half of m0
  111. %if mmsize == 16
  112.     movhps   [r0+r1], m0 ; XMM: store the high 8 bytes as the second row
  113. %else
  114.     psrlq      m0, 32 ; MMX: shift the high 4 bytes down
  115.     movh     [r0+r1], m0 ; and store them as the second row
  116. %endif
  117.     add        r0, r3 ; advance two rows
  118.     dec        r2d ; r2d is the remaining row-pair count
  119.     jnz .nextrow
  120.     REP_RET ; return macro supplied by x86inc
  121. %endmacro
  122.  
  123. INIT_MMX mmxext ; MMX registers (mmsize 8) for the next instantiation
  124. WEIGHT_FUNC_HALF_MM 4, 0 ; emits h264_weight_4 for mmxext, no XMM registers
  125. INIT_XMM sse2 ; XMM registers (mmsize 16) for the next instantiation
  126. WEIGHT_FUNC_HALF_MM 8, 8 ; emits h264_weight_8 for sse2, declaring 8 XMM registers
  127.  
  128. %macro BIWEIGHT_SETUP 0 ; prologue shared by the h264_biweight_* functions: prepares weights, offset/rounding term (m5) and shift count (m6)
  129. %if ARCH_X86_64 ; pick the scratch register for 'offset': r7d on x86-64, r3d elsewhere (callers reload r3d afterwards)
  130. %define off_regd r7d
  131. %else
  132. %define off_regd r3d
  133. %endif
  134.     mov  off_regd, r7m ; fetch the eighth argument ('offset' per the prototype comment) from its r7m slot
  135.     add  off_regd, 1
  136.     or   off_regd, 1 ; off = (offset + 1) | 1
  137.     add        r4, 1 ; shift count = log2_denom + 1
  138.     cmp        r5d, 128 ; weightd is a 32-bit int: compare only the low dword so stale upper bits on x86-64 cannot defeat the test
  139.      jne .normal
  140.     sar        r5, 1 ; weightd == 128: halve both weights, the offset term and the shift count
  141.     sar        r6, 1 ; NOTE(review): presumably keeps the weight within signed-byte range for the pmaddubsw path - confirm
  142.     sar  off_regd, 1
  143.     sub        r4, 1
  144. .normal: ; colon added: a bare label line triggers NASM's orphan-label warning
  145. %if cpuflag(ssse3) ; ssse3: weights are combined into byte pairs in m4 (m0 is a temporary)
  146.     movd       m4, r5d
  147.     movd       m0, r6d
  148. %else ; otherwise: m3 = weightd, m4 = weights, one word each
  149.     movd       m3, r5d
  150.     movd       m4, r6d
  151. %endif
  152.     movd       m5, off_regd
  153.     movd       m6, r4d ; m6 = shift count, also used by the psraw in the per-row code
  154.     pslld      m5, m6 ; m5 = off << shift
  155.     psrld      m5, 1 ; m5 = (off << shift) >> 1
  156. %if cpuflag(ssse3)
  157.     punpcklbw  m4, m0 ; interleave low bytes: byte 0 = weightd, byte 1 = weights
  158.     pshuflw    m4, m4, 0 ; broadcast that byte pair across the low four words
  159.     pshuflw    m5, m5, 0
  160.     punpcklqdq m4, m4 ; and into the high quadword
  161.     punpcklqdq m5, m5
  162.  
  163. %else
  164. %if mmsize == 16 ; XMM build: broadcast word 0 of m3/m4/m5 to all eight words
  165.     pshuflw    m3, m3, 0
  166.     pshuflw    m4, m4, 0
  167.     pshuflw    m5, m5, 0
  168.     punpcklqdq m3, m3
  169.     punpcklqdq m4, m4
  170.     punpcklqdq m5, m5
  171. %else ; MMX build: broadcast word 0 to all four words
  172.     pshufw     m3, m3, 0
  173.     pshufw     m4, m4, 0
  174.     pshufw     m5, m5, 0
  175. %endif
  176.     pxor       m7, m7 ; m7 = 0, used by BIWEIGHT_STEPA to zero-extend bytes to words
  177. %endif
  178. %endmacro
  179.  
  180. %macro BIWEIGHT_STEPA 3 ; %1 = result register number, %2 = temporary register number, %3 = offset applied to both r0 and r1
  181.     movh       m%1, [r0+%3] ; pixels from the r0 buffer ('dst' per the prototype comment)
  182.     movh       m%2, [r1+%3] ; pixels from the r1 buffer ('src' per the prototype comment)
  183.     punpcklbw  m%1, m7 ; zero-extend bytes to words (m7 is zero after BIWEIGHT_SETUP's non-ssse3 path)
  184.     punpcklbw  m%2, m7
  185.     pmullw     m%1, m3 ; r0 pixels * weight in m3
  186.     pmullw     m%2, m4 ; r1 pixels * weight in m4
  187.     paddsw     m%1, m%2 ; m%1 = sum of both weighted terms, signed saturation
  188. %endmacro
  189.  
  190. %macro BIWEIGHT_STEPB 0 ; finishes two BIWEIGHT_STEPA sums held in m0 and m1; packed bytes are left in m0
  191.     paddsw     m0, m5 ; add the offset/rounding term with signed saturation
  192.     paddsw     m1, m5
  193.     psraw      m0, m6 ; arithmetic shift right by the count in m6
  194.     psraw      m1, m6
  195.     packuswb   m0, m1 ; pack to bytes with unsigned saturation: m0 words low, m1 words high
  196. %endmacro
  197.  
  198. INIT_MMX mmxext ; following code is assembled for MMX registers with the mmxext flag (x86inc macro)
  199. cglobal h264_biweight_16, 7, 8, 0 ; blends 16-byte-wide rows of r0 and r1 into r0; cglobal operands are presumably args, GPRs, XMM regs - confirm in x86inc
  200.     BIWEIGHT_SETUP ; build weights (m3, m4), offset/rounding (m5), shift count (m6), zero (m7)
  201.     movifnidn r3d, r3m ; (re)load r3d from its argument slot: BIWEIGHT_SETUP uses r3d as scratch when not ARCH_X86_64
  202. .nextrow: ; one 16-byte row per iteration, handled as two 8-byte halves
  203.     BIWEIGHT_STEPA 0, 1, 0 ; m0 = weighted sum for bytes 0..3
  204.     BIWEIGHT_STEPA 1, 2, 4 ; m1 = weighted sum for bytes 4..7 (m2 is the temporary)
  205.     BIWEIGHT_STEPB ; offset, shift and pack into m0
  206.     mova       [r0], m0 ; store bytes 0..7 back to r0
  207.     BIWEIGHT_STEPA 0, 1, 8 ; same again for bytes 8..11
  208.     BIWEIGHT_STEPA 1, 2, 12 ; and bytes 12..15
  209.     BIWEIGHT_STEPB
  210.     mova     [r0+8], m0 ; store bytes 8..15
  211.     add        r0, r2 ; advance both buffers by the stride in r2
  212.     add        r1, r2
  213.     dec        r3d ; r3d is the remaining row count
  214.     jnz .nextrow
  215.     REP_RET ; return macro supplied by x86inc
  216.  
  217. %macro BIWEIGHT_FUNC_MM 2 ; emits h264_biweight_%1; %1 = width suffix of the function name, %2 = XMM register count handed to cglobal
  218. cglobal h264_biweight_%1, 7, 8, %2
  219.     BIWEIGHT_SETUP ; build weights (m3, m4), offset/rounding (m5), shift count (m6), zero (m7)
  220.     movifnidn r3d, r3m ; (re)load r3d from its argument slot: BIWEIGHT_SETUP uses r3d as scratch when not ARCH_X86_64
  221. .nextrow: ; one full-register-wide row per iteration
  222.     BIWEIGHT_STEPA 0, 1, 0 ; m0 = weighted sum for the low half of the row
  223.     BIWEIGHT_STEPA 1, 2, mmsize/2 ; m1 = weighted sum for the high half (m2 is the temporary)
  224.     BIWEIGHT_STEPB ; offset, shift and pack into m0
  225.     mova       [r0], m0 ; store the blended row back to r0
  226.     add        r0, r2 ; advance both buffers by the stride in r2
  227.     add        r1, r2
  228.     dec        r3d ; r3d is the remaining row count
  229.     jnz .nextrow
  230.     REP_RET ; return macro supplied by x86inc
  231. %endmacro
  232.  
  233. INIT_MMX mmxext ; MMX registers (mmsize 8) for the next instantiation
  234. BIWEIGHT_FUNC_MM  8, 0 ; emits h264_biweight_8 for mmxext, no XMM registers
  235. INIT_XMM sse2 ; XMM registers (mmsize 16) for the next instantiation
  236. BIWEIGHT_FUNC_MM 16, 8 ; emits h264_biweight_16 for sse2, declaring 8 XMM registers
  237.  
  238. %macro BIWEIGHT_FUNC_HALF_MM 2 ; emits h264_biweight_%1 for rows half a register wide, two rows per iteration; %2 = XMM register count for cglobal
  239. cglobal h264_biweight_%1, 7, 8, %2
  240.     BIWEIGHT_SETUP ; build weights (m3, m4), offset/rounding (m5), shift count (m6), zero (m7)
  241.     movifnidn r3d, r3m ; (re)load r3d from its argument slot: BIWEIGHT_SETUP uses r3d as scratch when not ARCH_X86_64
  242.     sar        r3d, 1 ; halve the row count; 32-bit shift so stale upper bits of the int argument cannot leak into r3d (matches WEIGHT_FUNC_HALF_MM)
  243.     lea        r4, [r2*2] ; r4 = 2*stride; r4 is free because BIWEIGHT_SETUP already copied the shift count into m6
  244. .nextrow:
  245.     BIWEIGHT_STEPA 0, 1, 0 ; m0 = weighted sum for the row at offset 0
  246.     BIWEIGHT_STEPA 1, 2, r2 ; m1 = weighted sum for the row one stride below (m2 is the temporary)
  247.     BIWEIGHT_STEPB ; offset, shift and pack: first row in the low half of m0, second row in the high half
  248.     movh       [r0], m0 ; store the first row
  249. %if mmsize == 16
  250.     movhps     [r0+r2], m0 ; XMM: store the high 8 bytes as the second row
  251. %else
  252.     psrlq      m0, 32 ; MMX: shift the high 4 bytes down
  253.     movh       [r0+r2], m0 ; and store them as the second row
  254. %endif
  255.     add        r0, r4 ; advance both buffers by two rows
  256.     add        r1, r4
  257.     dec        r3d ; r3d is the remaining row-pair count
  258.     jnz .nextrow
  259.     REP_RET ; return macro supplied by x86inc
  260. %endmacro
  261.  
  262. INIT_MMX mmxext ; MMX registers (mmsize 8) for the next instantiation
  263. BIWEIGHT_FUNC_HALF_MM 4, 0 ; emits h264_biweight_4 for mmxext, no XMM registers
  264. INIT_XMM sse2 ; XMM registers (mmsize 16) for the next instantiation
  265. BIWEIGHT_FUNC_HALF_MM 8, 8 ; emits h264_biweight_8 for sse2, declaring 8 XMM registers
  266.  
  267. %macro BIWEIGHT_SSSE3_OP 0 ; expects byte-interleaved pixel pairs in m0 and m2; leaves the blended, packed bytes in m0
  268.     pmaddubsw  m0, m4 ; multiply unsigned pixel bytes by the signed weight bytes in m4 and add each adjacent pair into a word
  269.     pmaddubsw  m2, m4
  270.     paddsw     m0, m5 ; add the offset/rounding term with signed saturation
  271.     paddsw     m2, m5
  272.     psraw      m0, m6 ; arithmetic shift right by the count in m6
  273.     psraw      m2, m6
  274.     packuswb   m0, m2 ; pack to bytes with unsigned saturation: m0 words low, m2 words high
  275. %endmacro
  276.  
  277. INIT_XMM ssse3 ; following code is assembled for XMM registers with the ssse3 flag (x86inc macro)
  278. cglobal h264_biweight_16, 7, 8, 8 ; ssse3 version: blends 16-byte-wide rows of r0 and r1 into r0
  279.     BIWEIGHT_SETUP ; ssse3 path: m4 = interleaved weight bytes, m5 = offset/rounding, m6 = shift count
  280.     movifnidn r3d, r3m ; (re)load r3d from its argument slot: BIWEIGHT_SETUP uses r3d as scratch when not ARCH_X86_64
  281.  
  282. .nextrow: ; one 16-byte row per iteration
  283.     movh       m0, [r0] ; r0 bytes 0..7
  284.     movh       m2, [r0+8] ; r0 bytes 8..15
  285.     movh       m3, [r1+8] ; r1 bytes 8..15 (m3 is free here: the ssse3 setup keeps its weights in m4)
  286.     punpcklbw  m0, [r1] ; interleave r0/r1 bytes 0..7; full-width memory operand, so [r1] must be 16-byte aligned for this non-VEX form
  287.     punpcklbw  m2, m3 ; interleave r0/r1 bytes 8..15
  288.     BIWEIGHT_SSSE3_OP ; weight, offset, shift and pack the whole row into m0
  289.     mova       [r0], m0 ; store the blended row back to r0
  290.     add        r0, r2 ; advance both buffers by the stride in r2
  291.     add        r1, r2
  292.     dec        r3d ; r3d is the remaining row count
  293.     jnz .nextrow
  294.     REP_RET ; return macro supplied by x86inc
  295.  
  296. INIT_XMM ssse3 ; following code is assembled for XMM registers with the ssse3 flag (x86inc macro)
  297. cglobal h264_biweight_8, 7, 8, 8 ; ssse3 version: blends 8-byte-wide rows of r0 and r1 into r0, two rows per iteration
  298.     BIWEIGHT_SETUP ; ssse3 path: m4 = interleaved weight bytes, m5 = offset/rounding, m6 = shift count
  299.     movifnidn r3d, r3m ; (re)load r3d from its argument slot: BIWEIGHT_SETUP uses r3d as scratch when not ARCH_X86_64
  300.     sar        r3d, 1 ; halve the row count; 32-bit shift so stale upper bits of the int argument cannot leak into r3d
  301.     lea        r4, [r2*2] ; r4 = 2*stride; r4 is free because BIWEIGHT_SETUP already copied the shift count into m6
  302.  
  303. .nextrow:
  304.     movh       m0, [r0] ; first row, r0 buffer
  305.     movh       m1, [r1] ; first row, r1 buffer
  306.     movh       m2, [r0+r2] ; second row, r0 buffer
  307.     movh       m3, [r1+r2] ; second row, r1 buffer (m3 is free: the ssse3 setup keeps its weights in m4)
  308.     punpcklbw  m0, m1 ; interleave r0/r1 bytes of the first row
  309.     punpcklbw  m2, m3 ; interleave r0/r1 bytes of the second row
  310.     BIWEIGHT_SSSE3_OP ; weight, offset, shift and pack: first row in the low half of m0, second row in the high half
  311.     movh       [r0], m0 ; store the first row
  312.     movhps     [r0+r2], m0 ; store the high 8 bytes as the second row
  313.     add        r0, r4 ; advance both buffers by two rows
  314.     add        r1, r4
  315.     dec        r3d ; r3d is the remaining row-pair count
  316.     jnz .nextrow
  317.     REP_RET ; return macro supplied by x86inc
  318.