;*****************************************************************************
;* SIMD-optimized MPEG encoding functions
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1

SECTION .text
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; %1 = number of loops
; %2 = number of GPRs used
; %3 = number of lines advanced per loop iteration (unused by the mmx version)
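;
; For reference, the operation being vectorized here is, in scalar form
; (a minimal C sketch inferred from the prototype above, not part of the
; original file):
;
;     int sum = 0;
;     for (int i = 0; i < 16; i++, pix += line_size)
;         for (int j = 0; j < 16; j++)
;             sum += pix[j];
;     return sum;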
%macro PIX_SUM16 3
cglobal pix_sum16, 2, %2, 6
    movsxdifnidn r1, r1d
    mov          r2, %1
%if mmsize == 16
    lea          r3, [r1*3]
%endif
%if notcpuflag(xop)
    pxor         m5, m5          ; m5 = 0, for psadbw/punpck below
%endif
    pxor         m4, m4          ; m4 = running sum
.loop:
%if cpuflag(xop)
    ; vphaddubq horizontally adds the unsigned bytes of each quadword
    vphaddubq    m0, [r0]
    vphaddubq    m1, [r0+r1]
    vphaddubq    m2, [r0+r1*2]
    vphaddubq    m3, [r0+r3]
%else
    mova         m0, [r0]
%if mmsize == 8
    mova         m1, [r0+8]
%if cpuflag(mmxext)
    mova         m2, [r0+r1]
    mova         m3, [r0+r1+8]
%endif
%else ; sse2
    mova         m1, [r0+r1]
    mova         m2, [r0+r1*2]
    mova         m3, [r0+r3]
%endif
%if cpuflag(mmxext)
    ; psadbw against zero gives the byte sum of each 8-byte lane
    psadbw       m0, m5
    psadbw       m1, m5
    psadbw       m2, m5
    psadbw       m3, m5
%else ; mmx
    ; widen bytes to words so they can be summed without overflow
    punpckhbw    m2, m0, m5
    punpcklbw    m0, m5
    punpckhbw    m3, m1, m5
    punpcklbw    m1, m5
%endif ; cpuflag(mmxext)
%endif ; cpuflag(xop)
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m4, m3
%if cpuflag(mmxext)
    lea          r0, [r0+r1*%3]
%else
    add          r0, r1
%endif
    dec r2
    jne .loop
%if mmsize == 16
    ; fold the high quadword of the accumulator into the low one
    pshufd       m0, m4, q0032
    paddd        m4, m0
%elif notcpuflag(mmxext)
    HADDW        m4, m5          ; horizontal word sum for plain mmx
%endif
    movd        eax, m4
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
PIX_SUM16 16, 3, 0 ; 16 iterations, 1 line each
INIT_MMX mmxext
PIX_SUM16  8, 4, 2 ; 8 iterations, 2 lines each
%endif
INIT_XMM sse2
PIX_SUM16  4, 4, 4 ; 4 iterations, 4 lines each
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
PIX_SUM16  4, 4, 4
%endif

; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
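;
; For reference, the operation being vectorized here is, in scalar form
; (a minimal C sketch inferred from the prototype above, not part of the
; original file):
;
;     int sum = 0;
;     for (int i = 0; i < 16; i++, pix += line_size)
;         for (int j = 0; j < 16; j++)
;             sum += pix[j] * pix[j];
;     return sum;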
%macro PIX_NORM1 2
cglobal pix_norm1, 2, 3, %1
    movsxdifnidn r1, r1d
    mov          r2, %2
    pxor         m0, m0          ; m0 = 0, for the unpacks below
    pxor         m5, m5          ; m5 = running sum of squares
.loop:
    mova         m2, [r0+0]
%if mmsize == 8
    mova         m3, [r0+8]
%else
    mova         m3, [r0+r1]
%endif
    ; widen bytes to words, then pmaddwd squares adjacent words and
    ; adds them pairwise into dwords
    punpckhbw    m1, m2, m0
    punpcklbw    m2, m0
    punpckhbw    m4, m3, m0
    punpcklbw    m3, m0
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m5, m2
    paddd        m5, m4
%if mmsize == 8
    add          r0, r1
%else
    lea          r0, [r0+r1*2]
%endif
    dec r2
    jne .loop
    HADDD        m5, m1          ; horizontal dword sum
    movd        eax, m5
    RET
%endmacro

INIT_MMX mmx
PIX_NORM1 0, 16 ; 16 iterations, 1 line each
INIT_XMM sse2
PIX_NORM1 6, 8 ; 8 iterations, 2 lines each