; Source mirror header (Subversion repository viewer, Kolibri OS):
; Rev | Blame | Last modification | View Log | RSS feed
  1. ;******************************************************************************
  2. ;* optimized audio functions
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21.  
  22. %include "libavutil/x86/x86util.asm"
  23.  
  24. SECTION .text
  25.  
%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
; Returns the dot product sum(v1[i]*v2[i]) for i in [0, order) in eax.
; Processes 2*mmsize bytes per iteration; assumes 'order' (in elements)
; makes order*2 a multiple of 2*mmsize and is > 0 — TODO confirm with callers.
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1                         ; element count -> byte count (int16 = 2 bytes)
    add v1q, orderq                       ; point v1/v2 one past the end of the data,
    add v2q, orderq                       ; then walk up with a negative byte offset
    neg orderq
    pxor    m2, m2                        ; m2 = running dword accumulator, zeroed
.loop:
    movu    m0, [v1q + orderq]            ; unaligned loads: v1 need not be aligned
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]            ; pairwise int16 multiply, adjacent pairs
    pmaddwd m1, [v2q + orderq + mmsize]   ;   summed into int32 lanes
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2              ; advance offset; sets SF for jl below
    jl .loop                              ; loop while offset still negative
    HADDD   m2, m0                        ; x86util: horizontal add of m2's dwords into low dword (m0 = scratch)
    movd   eax, m2                        ; return value
%if mmsize == 8
    emms                                  ; MMX build only: restore x87 state for callers
%endif
    RET
%endmacro
  50.  
; Emit one copy of the function per ISA level: INIT_MMX/INIT_XMM (x86inc)
; select mmsize (8 vs 16) and map m0..mN onto mmx/xmm registers, so the
; same macro body yields ff_scalarproduct_int16_mmxext and _sse2.
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
  55.  
  56.  
  57. ;-----------------------------------------------------------------------------
  58. ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
  59. ;                           int32_t max, unsigned int len)
  60. ;-----------------------------------------------------------------------------
  61.  
  62. ; %1 = number of xmm registers used
  63. ; %2 = number of inline load/process/store loops per asm loop
  64. ; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
  65. ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
  66. ; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
; Clamp each int32 of src[0..len-1] into [min, max] and store to dst.
; Parameters are documented in the comment block above the macro.
; Assumes src/dst are mmsize-aligned (mova) and len is a multiple of
; mmsize*(%2+%3) — TODO confirm against callers.
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm                    ; float-compare variant (CLIPD_SSE2):
    cvtsi2ss  m5, maxm                    ;   bounds are converted to single precision
%else
    movd      m4, minm                    ; integer variant: bounds used as int32
    movd      m5, maxm
%endif
    SPLATD    m4                          ; broadcast min to every lane (x86util)
    SPLATD    m5                          ; broadcast max to every lane
.loop:
%assign %%i 0
%rep %2                                   ; %2 inline load/clip/store groups per loop
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
%if %3
    mova      m7,  [srcq+mmsize*(4+%%i)]  ; 8-register group (needs m7-m10, x86-64 only)
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova      m10, [srcq+mmsize*(7+%%i)]
%endif
    CLIPD  m0,  m4, m5, m6                ; x86util: clamp lanes to [m4, m5], m6 = scratch
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
%if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
%endif
    mova  [dstq+mmsize*(0+%%i)], m0
    mova  [dstq+mmsize*(1+%%i)], m1
    mova  [dstq+mmsize*(2+%%i)], m2
    mova  [dstq+mmsize*(3+%%i)], m3
%if %3
    mova  [dstq+mmsize*(4+%%i)], m7
    mova  [dstq+mmsize*(5+%%i)], m8
    mova  [dstq+mmsize*(6+%%i)], m9
    mova  [dstq+mmsize*(7+%%i)], m10
%endif
%assign %%i %%i+4*(%3+1)
%endrep
    add     srcq, mmsize*4*(%2+%3)        ; advance by bytes processed this iteration
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)          ; len counts int32 elements (bytes/4)
    jg .loop
    REP_RET                               ; x86inc: rep-prefixed ret where beneficial
%endmacro
  119.  
; Instantiate the clip function for each ISA, rebinding CLIPD (the per-ISA
; clamp macro from x86util) before each expansion.
INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int      ; integer-compare SSE2 variant, "_int" suffix
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1            ; float-compare SSE2 variant, 2x unrolled
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8                               ; m8 defined => 16 xmm regs (x86-64):
VECTOR_CLIP_INT32 11, 1, 1, 0           ;   use the 8-registers-per-group body
%else
VECTOR_CLIP_INT32 6, 1, 0, 0            ; x86-32 fallback: 4 registers per group
%endif
  134.  
  135. ;-----------------------------------------------------
  136. ;void ff_vector_clipf(float *dst, const float *src,
  137. ;                     float min, float max, int len)
  138. ;-----------------------------------------------------
INIT_XMM sse
; void ff_vector_clipf(float *dst, const float *src,
;                      float min, float max, int len)
; Clamp each float of src[0..len-1] into [min, max] and store to dst.
; Assumes src/dst are 16-byte aligned (mova) and len is a multiple of
; 4*mmsize/4 floats — TODO confirm against callers.
%if UNIX64
; SysV x86-64: min/max arrive in xmm0/xmm1 (already m0/m1), so only three
; GPR args are declared and no float setup is needed.
cglobal vector_clipf, 3,3,6, dst, src, len
%else
cglobal vector_clipf, 5,5,6, dst, src, min, max, len
%endif
%if WIN64
    SWAP 0, 2                  ; Win64: 3rd/4th (float) args arrive in xmm2/xmm3;
    SWAP 1, 3                  ;   move them into m0/m1
%elif ARCH_X86_32
    movss   m0, minm           ; x86-32: load min/max from their stack slots
    movss   m1, maxm
%endif
    SPLATD  m0                 ; broadcast min to all four lanes (x86util)
    SPLATD  m1                 ; broadcast max to all four lanes
        shl lend, 2            ; float count -> byte count
        add srcq, lenq         ; point one past the end of the buffers,
        add dstq, lenq         ;   then walk up with a negative offset
        neg lenq
.loop:
    mova    m2,  [srcq+lenq+mmsize*0]
    mova    m3,  [srcq+lenq+mmsize*1]
    mova    m4,  [srcq+lenq+mmsize*2]
    mova    m5,  [srcq+lenq+mmsize*3]
    maxps   m2, m0             ; clamp below: x = max(x, min)
    maxps   m3, m0
    maxps   m4, m0
    maxps   m5, m0
    minps   m2, m1             ; clamp above: x = min(x, max)
    minps   m3, m1
    minps   m4, m1
    minps   m5, m1
    mova    [dstq+lenq+mmsize*0], m2
    mova    [dstq+lenq+mmsize*1], m3
    mova    [dstq+lenq+mmsize*2], m4
    mova    [dstq+lenq+mmsize*3], m5
    add     lenq, mmsize*4     ; advance; sets SF for jl below
    jl .loop                   ; loop while offset still negative
    REP_RET
  178.