;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
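; Illustrative scalar equivalent (a hedged sketch, not FFmpeg's reference C
; code; the unrolled loop below consumes 64 bytes per iteration, so len is
; assumed to be a positive multiple of 16 and the buffers aligned as the
; mova loads require):
;
;   for (int i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[i];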
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0,   [src0q + lenq + (a+0)*mmsize]
    mova      m1,   [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
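; Illustrative scalar equivalent (hedged sketch; the loop below also steps in
; 64-byte chunks, so len is assumed to be a positive multiple of 16):
;
;   for (int i = 0; i < len; i++)
;       dst[i] += src[i] * mul;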

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
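; Illustrative scalar equivalent (hedged sketch; the SSE loop below handles
; one 16-byte vector per iteration, so len is assumed to be a positive
; multiple of 4):
;
;   for (int i = 0; i < len; i++)
;       dst[i] = src[i] * mul;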

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
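; Illustrative scalar equivalent (hedged sketch; the loop below stores
; 2*mmsize bytes per iteration, so len is assumed to be a positive multiple
; of 4 (SSE2) or 8 (AVX)):
;
;   for (int i = 0; i < len; i++)
;       dst[i] = src[i] * mul;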

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps       xm0, xm0
%if cpuflag(avx)
    vinsertf128   ym0, ym0, xm0, 1
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    mova   [dstq+lenq       ], m1
    mova   [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
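; Illustrative scalar equivalent (hedged sketch of the overlap-window
; operation as the mirrored loads/stores below suggest; len is assumed to be
; a positive multiple of the vector width, e.g. 4 floats for the SSE build):
;
;   dst += len; win += len; src0 += len;
;   for (int i = -len, j = len - 1; i < 0; i++, j--) {
;       float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
;       dst[i] = s0 * wj - s1 * wi;
;       dst[j] = s0 * wi + s1 * wj;
;   }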
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova      [dstq + lenq], m1
    mova      [dstq + len1q], m2
    sub       len1q, mmsize
    add       lenq,  mmsize
    jl .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
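; Illustrative scalar equivalent (hedged sketch; the loop below processes
; 2*mmsize bytes per iteration, so len is assumed to be a positive multiple
; of 8 (SSE) or 16 (AVX/FMA3)):
;
;   for (int i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[i] + src2[i];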
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
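; Illustrative scalar equivalent (hedged sketch; src1 is read forward while
; dst/src0 are walked backward below, so len is assumed to be a positive
; multiple of 8 (SSE) or 16 (AVX)):
;
;   for (int i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[len - 1 - i];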
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
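; Illustrative scalar equivalent (hedged sketch; the loop below accumulates
; four floats per iteration and folds the partial sums at the end, so len is
; assumed to be a positive multiple of 4 and rounding may differ slightly
; from this left-to-right order):
;
;   float sum = 0.0f;
;   for (int i = 0; i < len; i++)
;       sum += v1[i] * v2[i];
;   return sum;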
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl   offsetd, 2
    add       v1q, offsetq
    add       v2q, offsetq
    neg   offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
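; Illustrative scalar equivalent (hedged sketch; the loop below works on
; whole XMM vectors, so a non-zero len is assumed to be a multiple of 4):
;
;   for (int i = 0; i < len; i++) {
;       float t  = src0[i] - src1[i];
;       src0[i] += src1[i];
;       src1[i]  = t;
;   }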
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
.end:
    REP_RET