Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;*****************************************************************************
  2. ;* x86-optimized functions for idet filter
  3. ;*
  4. ;* Copyright (C) 2014 Pascal Massimino (pascal.massimino@gmail.com)
  5. ;* Copyright (c) 2014 Neil Birkbeck (birkbeck@google.com)
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23.  
  24. %include "libavutil/x86/x86util.asm"
  25.  
  26. SECTION .text
  27.  
  28. ; Implementation that does 8-bytes at a time using single-word operations.
      ;
      ; IDET_FILTER_LINE <cpu>
      ;   %1 = instruction-set name forwarded to INIT_MMX.
      ;
      ; Emits idet_filter_line(a, b, c, width):
      ;   a, b, c  three byte buffers, all read at the same running offset
      ;   width    element count, compared as a signed 32-bit value
      ;   return   eax = total of |a[i] + c[i] - 2*b[i]| over every byte read
      ;            (assuming ABS2 and HADDD do what their names suggest, see
      ;            the notes next to them below)
      ;
      ; The loop tests width only after a pass, so one 8-byte group is always
      ; processed and the buffers are always consumed in whole 8-byte groups.
      ; NOTE(review): presumably callers guarantee width > 0 and readable
      ; memory up to the next multiple of 8 - confirm at the call site.
      ;
      ; cglobal, INIT_MMX, movu, ABS2, HADDD, CMP and RET are not defined in
      ; this file; presumably they come from the included
      ; libavutil/x86/x86util.asm.
  29. %macro IDET_FILTER_LINE 1
  30. INIT_MMX %1
      ; NOTE(review): by x86inc convention the numbers should mean 4 arguments,
      ; 5 general-purpose registers and 0 XMM registers, followed by the names
      ; given to those registers - confirm against x86inc.
  31. cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
  32.     xor       indexq, indexq          ; index = 0 (byte offset into a, b and c)
      ; m_zero stays all-zero and is the second operand of every unpack, which
      ; turns the unpacks into zero-extensions. m_sum is the running total.
  33. %define   m_zero m2
  34. %define   m_sum  m5
  35.     pxor      m_sum, m_sum
  36.     pxor      m_zero, m_zero
  37.  
  38. .loop:
  39.     movu      m0, [aq + indexq*1]     ; A bytes
  40.     punpckhbw m1, m0, m_zero          ; m1 = upper half of A as words
  41.     punpcklbw m0, m_zero              ; m0 = lower half of A as words
  42.  
  43.     movu      m3, [cq + indexq*1]     ; C bytes
  44.     punpckhbw m4, m3, m_zero          ; m4 = upper half of C as words
  45.     punpcklbw m3, m_zero              ; m3 = lower half of C as words
  46.  
      ; A + C is at most 510 per word, so the signed saturation never triggers.
  47.     paddsw    m1, m4
  48.     paddsw    m0, m3
  49.  
  50.     movu      m3, [bq + indexq*1]     ; B bytes (m3/m4 are reused)
  51.     punpckhbw m4, m3, m_zero          ; m4 = upper half of B as words
  52.     punpcklbw m3, m_zero              ; m3 = lower half of B as words
  53.  
  54.     paddw     m4, m4                  ; 2*B, upper half
  55.     paddw     m3, m3                  ; 2*B, lower half
  56.     psubsw    m1, m4                  ; A + C - 2*B, upper half (-510..510)
  57.     psubsw    m0, m3                  ; A + C - 2*B, lower half
  58.  
      ; NOTE(review): ABS2 is defined elsewhere; presumably it replaces m1 and
      ; m0 by their absolute values and uses m4/m3 as scratch - confirm.
  59.     ABS2      m1, m0, m4, m3
  60.  
  61.     paddw     m0, m1                  ; add upper-half words onto lower-half words
  62.     punpckhwd m1, m0, m_zero          ; widen the word sums to dwords ...
  63.     punpcklwd m0, m_zero
  64.  
  65.     paddd     m0, m1                  ; ... and combine both dword halves
  66.     paddd     m_sum, m0               ; accumulate into the running total
  67.  
  68.     add       indexq, 0x8             ; advance by the 8 bytes just consumed
  69.     CMP       widthd, indexd
  70.     jg        .loop                   ; repeat while width > index (signed)
  71.  
      ; NOTE(review): HADDD is defined elsewhere; presumably it adds the dword
      ; lanes of m_sum together, with m0 as scratch - confirm.
  72.     HADDD     m_sum, m0
  73.     movd      eax, m_sum              ; low dword of m_sum is the return value
  74.     RET
  75. %endmacro
  76.  
  77. %if ARCH_X86_32
      ; The MMX-register versions of idet_filter_line are assembled only when
      ; ARCH_X86_32 is set: one expansion for mmxext and one for plain mmx.
  78. IDET_FILTER_LINE mmxext
  79. IDET_FILTER_LINE mmx
  80. %endif
  81.  
  82. ;******************************************************************************
  83. ; 16bit implementation that does 4/8-pixels at a time
  84.  
  85. %macro PABS_DIFF_WD 3    ; a, b, junk   , output=a
      ; Per unsigned 16-bit lane, computes |a - b|, zero-extends the results to
      ; dwords and adds the upper-half dwords onto the lower-half dwords.
      ;   %1 (a)    in/out: ends up holding the packed dword sums
      ;   %2 (b)    input, overwritten: reused as scratch for the upper half
      ;   %3 (junk) scratch, overwritten
      ; m_zero must be %define'd to an all-zero register where the macro is
      ; expanded; it is not set up here.
  86.   psubusw   %3, %2, %1     ; junk = b - a, clamped at 0
  87.   psubusw   %1, %2         ; a    = a - b, clamped at 0
  88.   por       %1, %3         ; one of the two is 0, so the OR is |a - b|
  89.  
  90.   mova      %2, %1         ; keep a copy for the upper half
  91.   punpcklwd %1, m_zero     ; lower words -> dwords
  92.   punpckhwd %2, m_zero     ; upper words -> dwords
  93.   paddd     %1, %2         ; a = lower dwords + upper dwords
  94. %endmacro
  95.  
  96. %macro IDET_FILTER_LINE_16BIT 1   ; %1=increment (4 or 8 words)
      ; Emits idet_filter_line_16bit(a, b, c, width) for the instruction set
      ; selected by the INIT_* line that precedes the expansion.
      ;   a, b, c  three buffers of unsigned 16-bit elements (index is scaled by 2)
      ;   width    element count, compared as a signed 32-bit value
      ;   return   eax = total of |ab - bc| + |ba - cb| over every element read,
      ;            where xy = max(x - y, 0). Per element that sum is equal to
      ;            |a + c - 2*b|, computed without leaving the unsigned range.
      ;            (Assumes HADDD adds the dword lanes together - see below.)
      ;
      ; %1 is the number of elements consumed per pass. The loop tests width
      ; only after a pass, so one full group is always processed.
      ; NOTE(review): presumably callers guarantee width > 0 and readable
      ; memory up to the next multiple of %1 elements - confirm at the call site.
      ;
      ; NOTE(review): by x86inc convention the cglobal numbers should mean
      ; 4 arguments, 5 general-purpose registers and 8 vector registers - confirm.
  97. cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
  98.     xor       indexq, indexq         ; index = 0 (counted in elements)
      ; m_zero is consumed by PABS_DIFF_WD for its zero-extensions; m_sum holds
      ; the running total as packed dwords.
  99. %define m_zero m1
  100. %define m_sum  m0
  101.     pxor      m_sum, m_sum
  102.     pxor      m_zero, m_zero
  103.  
  104. .loop_16bit:
  105.     movu      m2, [bq + indexq * 2]  ; B
  106.     movu      m3, [aq + indexq * 2]  ; A
  107.     mova      m6, m2                 ; second copy of B, consumed by "bc"
  108.     psubusw   m5, m2, m3             ; ba
  109.  
  110.     movu      m4, [cq + indexq * 2]  ; C
  111.     add       indexq, %1             ; advance past the group just loaded
  112.     psubusw   m3, m2                 ; ab
  113.     CMP       indexd, widthd         ; flags are consumed by the jl below
  114.  
  115.     psubusw   m6, m4                 ; bc
  116.     psubusw   m4, m2                 ; cb
  117.  
      ; Nothing between the CMP above and the jl below writes the flags.
      ; m7 is only scratch for PABS_DIFF_WD; m6 and m4 are clobbered by it.
  118.     PABS_DIFF_WD   m3, m6, m7        ; |ab - bc|
  119.     PABS_DIFF_WD   m5, m4, m7        ; |ba - cb|
  120.     paddd          m_sum, m3
  121.     paddd          m_sum, m5
  122.     jl        .loop_16bit            ; repeat while index < width (signed)
  123.  
      ; NOTE(review): HADDD is defined elsewhere; presumably it adds the dword
      ; lanes of m_sum together, with m2 as scratch - confirm.
  124.     HADDD     m_sum, m2
  125.     movd      eax, m_sum             ; low dword of m_sum is the return value
  126.     RET
  127. %endmacro
  128.  
  129. INIT_XMM sse2
       ; SSE2 version of idet_filter_line_16bit: 8 elements per loop pass.
  130. IDET_FILTER_LINE_16BIT 8
  131. %if ARCH_X86_32
       ; MMX version, assembled only when ARCH_X86_32 is set: 4 elements per pass.
  132. INIT_MMX mmx
  133. IDET_FILTER_LINE_16BIT 4
  134. %endif
  135.  
  136. ;******************************************************************************
  137. ; SSE2 8-bit implementation that does 16-bytes at a time:
       ;
       ; idet_filter_line(a, b, c, width)
       ;   a, b, c  three byte buffers, all read at the same running offset
       ;   width    element count, compared as a signed 32-bit value
       ;   return   eax = low 32 bits of the total of |ab - bc| + |ba - cb| over
       ;            every byte read, where xy = max(x - y, 0). Per byte that sum
       ;            is equal to |a + c - 2*b|.
       ;
       ; The loop tests width only after a pass, so one 16-byte group is always
       ; processed and the buffers are always consumed in whole 16-byte groups.
       ; NOTE(review): presumably callers guarantee width > 0 and readable
       ; memory up to the next multiple of 16 - confirm at the call site.
  138.  
  139. INIT_XMM sse2
       ; Only index is needed besides the four arguments, so five
       ; general-purpose registers are requested. m0-m6 are used below, hence
       ; seven vector registers.
  140. cglobal idet_filter_line, 4, 5, 7, a, b, c, width, index
  141.     xor       indexq, indexq       ; index = 0 (byte offset into a, b and c)
  142.     pxor      m0, m0               ; accumulator for the |ab - bc| term
  143.     pxor      m1, m1               ; accumulator for the |ba - cb| term
  144.  
  145. .sse2_loop:
  146.     movu      m2, [bq + indexq*1]  ; B
  147.     movu      m3, [aq + indexq*1]  ; A
  148.     mova      m6, m2               ; second copy of B, consumed by "bc"
  149.     mova      m4, m3               ; second copy of A, consumed by "ab"
  150.     psubusb   m5, m2, m3           ; ba
  151.  
  152.     movu      m3, [cq + indexq*1]  ; C
  153.     add       indexq, 0x10         ; advance past the 16 bytes just loaded
  154.     psubusb   m4, m2               ; ab
  155.     CMP       indexd, widthd       ; flags are consumed by the jl below
  156.  
  157.     psubusb   m6, m3               ; bc
  158.     psubusb   m3, m2               ; cb
  159.  
       ; psadbw leaves one sum of absolute byte differences in each 64-bit half,
       ; so both accumulators carry two partial sums. Nothing between the CMP
       ; above and the jl below writes the flags.
  160.     psadbw    m4, m6               ; |ab - bc|
  161.     paddq     m0, m4
  162.     psadbw    m5, m3               ; |ba - cb|
  163.     paddq     m1, m5
  164.     jl       .sse2_loop            ; repeat while index < width (signed)
  165.  
  166.     paddq     m0, m1               ; merge the two accumulators
  167.     movhlps   m1, m0               ; bring the upper 64-bit half down ...
  168.     paddq     m0, m1               ; ... and add it to the lower half
  169.     movd      eax, m0              ; low 32 bits are the return value
  170.     RET
  171.