;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

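; pw_4 and pw_5 are shared libavutil constant vectors (the word 4 / 5 in
; every lane), used for the "+4" rounding term and the "*5" filter tap.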
cextern pw_4
cextern pw_5

section .text

; %1: punpck size suffix (bw), %2: dst_low, %3: dst_high (src), %4: zero
; zero-extends one vector from 8 to 16 bits
%macro UNPACK_8TO16 4
    mova      m%2, m%3
    punpckh%1 m%3, m%4
    punpckl%1 m%2, m%4
%endmacro

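; Stores four words from vector %5 to the memory destinations %1-%4.
; With SSE4, %6 is the pextrw index of the first word to extract;
; otherwise %6 is a temporary GPR used to shuttle the words to memory
; (and %5 is shifted in place to reach the upper words).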
%macro STORE_4_WORDS 6
%if cpuflag(sse4)
    pextrw %1, %5, %6+0
    pextrw %2, %5, %6+1
    pextrw %3, %5, %6+2
    pextrw %4, %5, %6+3
%else
    movd  %6d, %5
%if mmsize==16
    psrldq %5, 4
%else
    psrlq  %5, 32
%endif
    mov    %1, %6w
    shr    %6, 16
    mov    %2, %6w
    movd  %6d, %5
    mov    %3, %6w
    shr    %6, 16
    mov    %4, %6w
%endif
%endmacro

; in:  p1 p0 q0 q1, clobbers p0
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
%macro VC1_LOOP_FILTER_A0 4
    psubw  %1, %4        ; p1 - q1
    psubw  %2, %3        ; p0 - q0
    paddw  %1, %1        ; 2*(p1 - q1)
    pmullw %2, [pw_5]    ; 5*(p0 - q0)
    psubw  %1, %2
    paddw  %1, [pw_4]    ; rounding term
    psraw  %1, 3
%endmacro

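; Worked example (illustrative values): with p1=60, p0=50, q0=40, q1=30
; this computes (2*(60-30) - 5*(50-40) + 4) >> 3 = (60 - 50 + 4) >> 3 = 1.
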
; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5
; %1: size
; out: m0=p0' m1=q0'
%macro VC1_FILTER 1
    PABSW   m4, m7  ; a0 = FFABS(a0)
    PABSW   m3, m6  ; a1 = FFABS(a1)
    PABSW   m2, m5  ; a2 = FFABS(a2)
    mova    m6, m4
    pminsw  m3, m2  ; a3 = FFMIN(a1, a2)
    pcmpgtw m6, m3  ; if (a2 < a0 || a1 < a0)
    psubw   m3, m4
    pmullw  m3, [pw_5]   ; 5*(a3 - a0)
    PABSW   m2, m3
    psraw   m2, 3   ; abs(d/8)
    pxor    m7, m3  ; d_sign ^= a0_sign

    pxor    m5, m5
    movd    m3, r2d ; pq, replicated into all bytes by the caller
%if %1 > 4
    punpcklbw m3, m3
%endif
    punpcklbw m3, m5
    pcmpgtw m3, m4  ; if (a0 < pq)
    pand    m6, m3

    mova    m3, m0
    psubw   m3, m1  ; p0 - q0
    PABSW   m4, m3
    psraw   m4, 1   ; clip = FFABS(p0 - q0) >> 1
    pxor    m3, m7  ; d_sign ^ clip_sign
    psraw   m3, 15
    pminsw  m2, m4  ; min(d, clip)
    pcmpgtw m4, m5
    pand    m6, m4  ; filt3 (C return value)

; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
    pshuflw m4, m6, 0xaa
%if %1 > 4
    pshufhw m4, m4, 0xaa
%endif
%else
    pshufw  m4, m6, 0xaa
%endif
    pandn   m3, m4
    pand    m2, m6
    pand    m3, m2  ; d final

    psraw   m7, 15
    pxor    m3, m7  ; restore the sign of d
    psubw   m3, m7
    psubw   m0, m3  ; p0 -= d
    paddw   m1, m3  ; q0 += d
    packuswb m0, m0 ; clamp results back to bytes
    packuswb m1, m1
%endmacro
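
; The macro above vectorizes roughly this per-pixel logic (a sketch only;
; variable names are illustrative, following the scalar VC-1 in-loop filter):
;   if (a0 < pq && (a1 < a0 || a2 < a0)) {
;       d    = FFABS(5 * (FFMIN(a1, a2) - a0)) >> 3;
;       clip = FFABS(p0 - q0) >> 1;
;       if (clip && !(d_sign ^ clip_sign)) {
;           d  = FFMIN(d, clip);          // with the sign of d restored
;           p0 = av_clip_uint8(p0 - d);
;           q0 = av_clip_uint8(q0 + d);
;       }
;   }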

; 1st param: size of filter
; 2nd param: mov suffix equivalent to the filter size
%macro VC1_V_LOOP_FILTER 2
    pxor      m5, m5
    mov%2     m6, [r4]
    mov%2     m4, [r4+r1]
    mov%2     m7, [r4+2*r1]
    mov%2     m0, [r4+r3]
    punpcklbw m6, m5
    punpcklbw m4, m5
    punpcklbw m7, m5
    punpcklbw m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0   ; m6 = a1
    mov%2     m1, [r0]
    mov%2     m2, [r0+r1]
    punpcklbw m1, m5
    punpcklbw m2, m5
    mova      m4, m0
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2   ; m7 = a0
    mov%2     m3, [r0+2*r1]
    mov%2     m4, [r0+r3]
    punpcklbw m3, m5
    punpcklbw m4, m5
    mova      m5, m1
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4   ; m5 = a2

    VC1_FILTER %1
    mov%2 [r4+r3], m0
    mov%2 [r0],    m1
%endmacro
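
; With START_V_FILTER's setup (r4 = src - 4*stride, r3 = 3*stride), the macro
; above reads the eight rows src[-4*stride] .. src[3*stride] and writes back
; only p0 (src[-stride]) and q0 (src[0]).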

; 1st param: size of filter
;     NOTE: after UNPACK_8TO16, this many 8-bit values occupy half a register
; 2nd (optional) param: temp register to use for storing words
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    movq      m0, [r0     -4]
    movq      m1, [r0+  r1-4]
    movq      m2, [r0+2*r1-4]
    movq      m3, [r0+  r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    movq      m0, [r0     -4]
    movq      m4, [r0+  r1-4]
    movq      m1, [r0+2*r1-4]
    movq      m5, [r0+  r3-4]
    movq      m2, [r4     -4]
    movq      m6, [r4+  r1-4]
    movq      m3, [r4+2*r1-4]
    movq      m7, [r4+  r3-4]
    punpcklbw m0, m4
    punpcklbw m1, m5
    punpcklbw m2, m6
    punpcklbw m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    pxor      m5, m5

    UNPACK_8TO16 bw, 6, 0, 5
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1   ; m6 = a1
    UNPACK_8TO16 bw, 4, 2, 5
    mova    m0, m1                      ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2   ; m7 = a0
    UNPACK_8TO16 bw, 1, 3, 5
    mova    m5, m4
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3   ; m5 = a2
    SWAP 1, 4                           ; m1 = q0

    VC1_FILTER %1
    punpcklbw m0, m1
%if %0 > 1
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    psrldq m0, 4
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro
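
; The horizontal filter works on vertical edges: it loads pixel columns,
; transposes them so the edge runs horizontally, reuses the same word-wise
; filter, then scatters the two filtered columns back with STORE_4_WORDS.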
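
; Both prologues below set r3 = 3*stride and replicate the byte-sized pq
; value into every byte of r2d with imul r2, 0x01010101, so VC1_FILTER can
; broadcast it into vector lanes with movd + punpcklbw. START_V_FILTER also
; points r4 at src - 4*stride; START_H_FILTER points r4 at src + 4*stride
; for the size-8 case.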
%macro START_V_FILTER 0
    mov  r4, r0
    lea  r3, [4*r1]
    sub  r4, r3
    lea  r3, [r1+2*r1]
    imul r2, 0x01010101
%endmacro

%macro START_H_FILTER 1
    lea  r3, [r1+2*r1]
%if %1 > 4
    lea  r4, [r0+4*r1]
%endif
    imul r2, 0x01010101
%endmacro

%macro VC1_LF 0
cglobal vc1_v_loop_filter_internal
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal
    VC1_H_LOOP_FILTER 4, r4
    ret

; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    RET

; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    add  r4, 4
    add  r0, 4
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    lea  r0, [r0+4*r1]
    call vc1_h_loop_filter_internal
    RET
%endmacro

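; VC1_LF emits the MMX-size filters: the 8-pixel versions call the shared
; 4-pixel internal routine twice (4 columns right for the vertical filter,
; 4 rows down for the horizontal one).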
INIT_MMX mmxext
VC1_LF

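; The XMM variants below filter all 8 pixels in a single pass, so no
; internal-call splitting is needed.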
INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4
    RET

INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET