;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

SECTION .text

; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
;                                     const uint8_t *diff, int w,
;                                     int *left, int *left_top)
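;
; A rough scalar reference of what the SIMD loop below computes (a sketch,
; assuming FFmpeg's usual mid_pred(a,b,c) median-of-three helper; not the
; C fallback shipped with this file):
;
;   uint8_t l = *left, lt = *left_top;
;   for (int i = 0; i < w; i++) {
;       l      = mid_pred(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i];
;       lt     = top[i];
;       dst[i] = l;
;   }
;   *left = l; *left_top = lt;
;
; Each output byte depends on the previous one through l, so the %rep mmsize
; block resolves that dependency one byte at a time within each register.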
%macro HFYU_MEDIAN 0
cglobal add_hfyu_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    movu    m0, [topq]
    mova    m2, m0
    movd    m4, [left_topq]
    LSHIFT  m2, 1
    mova    m1, m0
    por     m4, m2
    movd    m3, [leftq]
    psubb   m0, m4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movu    m4, [topq+wq]
    mova    m0, m4
    LSHIFT  m4, 1
    por     m4, m1
    mova    m1, m0 ; t
    psubb   m0, m4 ; t-tl
.skip:
    movu    m2, [diffq+wq]
%assign i 0
%rep mmsize
    mova    m4, m0
    paddb   m4, m3 ; t-tl+l
    mova    m5, m3
    pmaxub  m3, m1
    pminub  m5, m1
    pminub  m3, m4
    pmaxub  m3, m5 ; median
    paddb   m3, m2 ; +residual
%if i==0
    mova    m7, m3
    LSHIFT  m7, mmsize-1
%else
    mova    m6, m3
    RSHIFT  m7, 1
    LSHIFT  m6, mmsize-1
    por     m7, m6
%endif
%if i<mmsize-1
    RSHIFT  m0, 1
    RSHIFT  m1, 1
    RSHIFT  m2, 1
%endif
%assign i i+1
%endrep
    movu [dstq+wq], m7
    add      wq, mmsize
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov [left_topq], r2d
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
HFYU_MEDIAN
%endif
INIT_XMM sse2
HFYU_MEDIAN


%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int ff_add_hfyu_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
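;
; The left prediction is, in rough scalar terms (a sketch; parameter names
; follow the prototype above, with left as the accumulator carried in from
; the previous call):
;
;   int acc = left;
;   for (int i = 0; i < w; i++) {
;       acc    = (acc + src[i]) & 0xFF;   // bytewise wrap-around running sum
;       dst[i] = acc;
;   }
;   return acc;
;
; ADD_HFYU_LEFT_LOOP vectorizes this running sum as a log2(mmsize)-step
; prefix sum: psllw/pshufb shift partial sums by 1, 2, 4 (and 8 for XMM)
; bytes while paddb accumulates them, and m0 broadcasts the previous block's
; last byte so the carry propagates into the next block.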
INIT_MMX ssse3
cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
    mova    m5, [pb_15]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

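; add_bytes is a plain bytewise accumulation; in rough scalar terms (a
; sketch, not the C fallback shipped with this file):
;
;   for (intptr_t i = 0; i < w; i++)
;       dst[i] += src[i];   // uint8_t arithmetic wraps mod 256
;
; The SIMD loop below handles 2*mmsize bytes per iteration, then mops up the
; remaining w % (2*mmsize) bytes one at a time.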
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov  sizeq, wq
    and  sizeq, -2*mmsize
    jz  .2
    add   dstq, sizeq
    add   srcq, sizeq
    neg  sizeq
.1:
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova   [dstq + sizeq], m0
    mova   [dstq + sizeq + mmsize], m1
    add  sizeq, 2*mmsize
    jl .1
.2:
    and     wq, 2*mmsize-1
    jz    .end
    add   dstq, wq
    add   srcq, wq
    neg     wq
.3:
    mov  sizeb, [srcq + wq]
    add [dstq + wq], sizeb
    inc     wq
    jl .3
.end:
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES
%endif
INIT_XMM sse2
ADD_BYTES

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
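;
; In rough scalar terms this is the same running left prediction applied per
; 4-byte BGRA pixel (a sketch; left points at the 4 carried-in channel
; values and is updated on return):
;
;   for (intptr_t i = 0; i < w; i++)
;       for (int c = 0; c < 4; c++)
;           dst[4 * i + c] = left[c] = left[c] + src[4 * i + c];
;
; The loop below does this with a per-dword prefix sum inside each register
; and carries the last output pixel across iterations in m0.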
%macro LEFT_BGR32 0
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl           wq, 2
    movd          m0, [leftq]
    lea         dstq, [dstq + wq]
    lea         srcq, [srcq + wq]
    LSHIFT        m0, mmsize-4
    neg           wq
.loop:
    movu          m1, [srcq+wq]
    mova          m2, m1
%if mmsize == 8
    punpckhdq     m0, m0
%endif
    LSHIFT        m1, 4
    paddb         m1, m2
%if mmsize == 16
    pshufd        m0, m0, q3333
    mova          m2, m1
    LSHIFT        m1, 8
    paddb         m1, m2
%endif
    paddb         m0, m1
    movu   [dstq+wq], m0
    add           wq, mmsize
    jl         .loop
    movd          m0, [dstq-4]
    movd     [leftq], m0
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
LEFT_BGR32
%endif
INIT_XMM sse2
LEFT_BGR32