;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

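; pb_ef / pb_67 broadcast the last word of an xmm / mm register; in the two
; pshufb masks below, -1 bytes select zero, so they shift words left while
; clearing the vacated lanes (the building blocks of the prefix sum used by
; add_hfyu_left_pred_int16)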
pb_ef: times 8 db 14,15
pb_67: times 8 db  6, 7
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION .text

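; INT16_LOOP expands to a full add_int16/diff_int16 body: w is converted from
; samples to bytes, a scalar head loop peels words off the end until the
; remaining byte count is a multiple of 2*mmsize, and the main loop then
; processes two registers per iteration, masking every result to the stream's
; significant bits (m4).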
%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
    movd    m4, maskd
    SPLATW  m4, m4
    add     wd, wd
    test    wq, 2*mmsize - 1
    jz %%.tomainloop
    push  tmpq
%%.wordloop:
    sub     wq, 2
%ifidn %2, add
    mov   tmpw, [srcq+wq]
    add   tmpw, [dstq+wq]
%else
    mov   tmpw, [src1q+wq]
    sub   tmpw, [src2q+wq]
%endif
    and   tmpw, maskw
    mov     [dstq+wq], tmpw
    test    wq, 2*mmsize - 1
    jnz %%.wordloop
    pop   tmpq
%%.tomainloop:
%ifidn %2, add
    add     srcq, wq
%else
    add     src1q, wq
    add     src2q, wq
%endif
    add     dstq, wq
    neg     wq
    jz      %%.end
%%.loop:
%ifidn %2, add
    mov%1   m0, [srcq+wq]
    mov%1   m1, [dstq+wq]
    mov%1   m2, [srcq+wq+mmsize]
    mov%1   m3, [dstq+wq+mmsize]
%else
    mov%1   m0, [src1q+wq]
    mov%1   m1, [src2q+wq]
    mov%1   m2, [src1q+wq+mmsize]
    mov%1   m3, [src2q+wq+mmsize]
%endif
    p%2w    m0, m1
    p%2w    m2, m3
    pand    m0, m4
    pand    m2, m4
    mov%1   [dstq+wq]       , m0
    mov%1   [dstq+wq+mmsize], m2
    add     wq, 2*mmsize
    jl %%.loop
%%.end:
    RET
%endmacro

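; Rough C equivalent of the add_int16 expansions below (a sketch for
; reference; the scalar fallback lives in libavcodec/lossless_videodsp.c):
;
;   static void add_int16(uint16_t *dst, const uint16_t *src,
;                         unsigned mask, int w)
;   {
;       for (int i = 0; i < w; i++)
;           dst[i] = (dst[i] + src[i]) & mask;
;   }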
INIT_MMX mmx
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    INT16_LOOP a, add

INIT_XMM sse2
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    test srcq, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add

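; diff_int16 below is the encoder-side inverse of add_int16:
; dst[i] = (src1[i] - src2[i]) & mask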
INIT_MMX mmx
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
    INT16_LOOP a, sub

INIT_XMM sse2
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
    test src1q, mmsize-1
    jnz .unaligned
    test src2q, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, sub
.unaligned:
    INT16_LOOP u, sub


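; add_hfyu_left_pred_int16 undoes left prediction: each output word is the
; running sum of all input words plus the initial `left`, masked to the
; significant bits. Rough C equivalent (a sketch for reference):
;
;   static int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src,
;                                       unsigned mask, int w, unsigned left)
;   {
;       for (int i = 0; i < w; i++) {
;           left = (left + src[i]) & mask;
;           dst[i] = left;
;       }
;       return left; // last reconstructed sample, fed into the next call
;   }
;
; The macro below computes this prefix sum with a few shift-and-add steps per
; register instead of a serial carry chain.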
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add     wd, wd
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
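    ; prefix-sum the words within the register: pslld+paddw folds each even
    ; word into its odd neighbour, pshufb with m3 adds the low-pair total
    ; into the upper pair of each qword, and on xmm pshufb with m4 adds the
    ; low-qword total into the high qword; m0 carries the running total of
    ; all previous iterations, broadcast to every lane through m5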
    mov%2   m1, [srcq+wq]
    mova    m2, m1
    pslld   m1, 16
    paddw   m1, m2
    mova    m2, m1

    pshufb  m1, m3
    paddw   m1, m2
    pshufb  m0, m5
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4
    paddw   m1, m2
%endif
    paddw   m0, m1
    pand    m0, m7
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
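    ; past the loop wq is the (non-negative) overshoot in bytes; build a
    ; pshufb index from it that selects the last valid output word of m0
    ; and return that word in eax as the new `left`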
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova    m5, [pb_67]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    psllq   m0, 48
    movd    m7, maskm
    SPLATW  m7, m7
    ADD_HFYU_LEFT_LOOP_INT16 a, a

INIT_XMM sse4
cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
    mova    m5, [pb_ef]
    mova    m4, [pb_zzzzzzzz67676767]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    pslldq  m0, 14
    movd    m7, maskm
    SPLATW  m7, m7
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u

; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top)
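; Rough C equivalent (a sketch for reference; mid_pred() is FFmpeg's
; median-of-three helper from libavcodec/mathops.h):
;
;   static void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top,
;                                          const uint16_t *diff, unsigned mask,
;                                          int w, int *left, int *left_top)
;   {
;       unsigned l = *left, lt = *left_top;
;       for (int i = 0; i < w; i++) {
;           l = (mid_pred(l, top[i], (l + top[i] - lt) & mask) + diff[i]) & mask;
;           lt = top[i];
;           dst[i] = l;
;       }
;       *left     = l;
;       *left_top = lt;
;   }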
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add      wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
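    ; reconstruct the 4 words in the register one at a time: each median
    ; prediction needs the just-reconstructed sample to its left (mm3)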
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, word [dstq-2]
    mov [leftq], r2d
    movzx   r2d, word [topq-2]
    mov [left_topq], r2d
    RET

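; void sub_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top)
; (prototype inferred from the cglobal declaration below). This is the
; encoder-side forward pass: src1 is the previous (top) row, src2 the current
; row. Rough C equivalent (a sketch, reusing the mid_pred() helper above):
;
;   static void sub_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *src1,
;                                          const uint16_t *src2, unsigned mask,
;                                          int w, int *left, int *left_top)
;   {
;       unsigned l = *left, lt = *left_top;
;       for (int i = 0; i < w; i++) {
;           const unsigned pred = mid_pred(l, src1[i], (l + src1[i] - lt) & mask);
;           dst[i] = (src2[i] - pred) & mask;
;           lt = src1[i];
;           l  = src2[i];
;       }
;       *left     = l;
;       *left_top = lt;
;   }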
cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
    add      wd, wd
    movd    mm7, maskd
    SPLATW  mm7, mm7
    movq    mm0, [src1q]
    movq    mm2, [src2q]
    psllq   mm0, 16
    psllq   mm2, 16
    movd    mm6, [left_topq]
    por     mm0, mm6
    movd    mm6, [leftq]
    por     mm2, mm6
    xor     maskq, maskq
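    ; maskq is recycled as the byte index from here on; the mask value
    ; itself already lives in mm7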
.loop:
    movq    mm1, [src1q + maskq]
    movq    mm3, [src2q + maskq]
    movq    mm4, mm2
    psubw   mm2, mm0
    paddw   mm2, mm1
    pand    mm2, mm7
    movq    mm5, mm4
    pmaxsw  mm4, mm1
    pminsw  mm1, mm5
    pminsw  mm4, mm2
    pmaxsw  mm4, mm1
    psubw   mm3, mm4
    pand    mm3, mm7
    movq    [dstq + maskq], mm3
    add     maskq, 8
    movq    mm0, [src1q + maskq - 2]
    movq    mm2, [src2q + maskq - 2]
    cmp     maskq, wq
    jb .loop
    movzx maskd, word [src1q + wq - 2]
    mov [left_topq], maskd
    movzx maskd, word [src2q + wq - 2]
    mov [leftq], maskd
    RET