; NOTE: this file was recovered from the KolibriOS Subversion repository's web
; viewer; the viewer chrome (revision links, "Blame | Last modification | View
; Log | RSS feed") and per-line numbering have been stripped to restore the
; original FFmpeg source.
;*****************************************************************************
;* x86-optimized functions for interlace filter
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

  23. %include "libavutil/x86/x86util.asm"
  24.  
  25. SECTION .text
  26.  
  27. %macro SSE_LINE_FN 2 ; 8 or 16, byte or word
  28. INIT_XMM sse2
  29. %if ARCH_X86_32
  30. %if %1 == 8
  31. cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref
  32. %else
  33. cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref
  34. %endif
  35.     mov       bufq, r0mp
  36.     mov       refq, r1mp
  37.     mov         wd, r2m
  38. %else
  39. cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2
  40. %endif
  41.     pxor        m6, m6
  42.     pxor        m7, m7
  43.     sub         wd, mmsize*2
  44.     jl .end
  45.  
  46. .loop:
  47.     movu        m0, [bufq+mmsize*0]
  48.     movu        m1, [bufq+mmsize*1]
  49.     movu        m2, [refq+mmsize*0]
  50.     movu        m3, [refq+mmsize*1]
  51. %if %1 == 8
  52.     add       bufq, mmsize*2
  53.     add       refq, mmsize*2
  54.     psubusb     m4, m0, m2
  55.     psubusb     m5, m1, m3
  56.     psubusb     m2, m0
  57.     psubusb     m3, m1
  58.     por         m2, m4
  59.     por         m3, m5
  60.     punpcklbw   m0, m2, m6
  61.     punpcklbw   m1, m3, m6
  62.     punpckhbw   m2, m6
  63.     punpckhbw   m3, m6
  64. %else
  65.     psubw       m0, m2
  66.     psubw       m1, m3
  67.     movu        m2, [bufq+mmsize*2]
  68.     movu        m3, [bufq+mmsize*3]
  69.     movu        m4, [refq+mmsize*2]
  70.     movu        m5, [refq+mmsize*3]
  71.     psubw       m2, m4
  72.     psubw       m3, m5
  73.     add       bufq, mmsize*4
  74.     add       refq, mmsize*4
  75. %endif
  76.     pmaddwd     m0, m0
  77.     pmaddwd     m1, m1
  78.     pmaddwd     m2, m2
  79.     pmaddwd     m3, m3
  80.     paddd       m0, m1
  81.     paddd       m2, m3
  82. %if %1 == 8
  83.     paddd       m7, m0
  84.     paddd       m7, m2
  85. %else
  86.     paddd       m0, m2
  87.     punpckldq   m2, m0, m6
  88.     punpckhdq   m0, m6
  89.     paddq       m7, m0
  90.     paddq       m7, m2
  91. %endif
  92.     sub         wd, mmsize*2
  93.     jge .loop
  94.  
  95. .end:
  96.     add         wd, mmsize*2
  97.     movhlps     m0, m7
  98. %if %1 == 8
  99.     paddd       m7, m0
  100.     pshufd      m0, m7, 1
  101.     paddd       m7, m0
  102.     movd       eax, m7
  103. %else
  104.     paddq       m7, m0
  105. %if ARCH_X86_32
  106.     movd       eax, m7
  107.     psrldq      m7, 4
  108.     movd       edx, m7
  109. %else
  110.     movq       rax, m7
  111. %endif
  112. %endif
  113.  
  114.     ; deal with cases where w % 32 != 0
  115.     test        wd, wd
  116.     jz .end_scalar
  117. .loop_scalar:
  118.     movzx     px1d, %2 [bufq+wq*(%1/8)-(%1/8)]
  119.     movzx     px2d, %2 [refq+wq*(%1/8)-(%1/8)]
  120.     sub       px1d, px2d
  121.     imul      px1d, px1d
  122. %if %1 == 8
  123.     add        eax, px1d
  124. %elif ARCH_X86_64
  125.     add        rax, px1q
  126. %else
  127.     add        eax, px1d
  128.     adc        edx, 0
  129. %endif
  130.     dec         wd
  131.     jg .loop_scalar
  132.  
  133. .end_scalar:
  134.     ; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero
  135.     RET
  136. %endmacro
  137.  
  138. INIT_XMM sse2
  139. SSE_LINE_FN  8, byte
  140. SSE_LINE_FN 16, word
  141.