;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova      m0,   [src0q + lenq]
    mova      m1,   [src0q + lenq + mmsize]
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    sub       lenq, 2*mmsize
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif
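
; For reference, a minimal C sketch of what the SIMD loop above computes
; (the _c helper name is illustrative, not part of this file; len is
; assumed to be a multiple of the unroll width):
;
;     static void vector_fmul_c(float *dst, const float *src0,
;                               const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i];
;     }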

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,3, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    ; on Win64 the third (float) argument arrives in xmm2, not xmm0
    mova       xmm0, xmm2
%endif
    ; broadcast the scalar to all four lanes
    shufps     xmm0, xmm0, 0
%if cpuflag(avx)
    ; replicate into the upper 128-bit lane of the ymm register
    vinsertf128  m0, m0, xmm0, 1
%endif
%endif
    lea    lenq, [lend*4-2*mmsize]
.loop:
    mulps    m1, m0, [srcq+lenq       ]
    mulps    m2, m0, [srcq+lenq+mmsize]
    addps    m1, m1, [dstq+lenq       ]
    addps    m2, m2, [dstq+lenq+mmsize]
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    sub    lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
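
; For reference, a hypothetical scalar C equivalent of the loop above
; (the _c name is illustrative, not part of this file; len is assumed
; to be a multiple of the unroll width):
;
;     static void vector_fmac_scalar_c(float *dst, const float *src,
;                                      float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] += src[i] * mul;
;     }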

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR
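
; For reference, a hypothetical scalar C equivalent of the loop above
; (the _c name is illustrative, not part of this file; len is assumed
; to be a multiple of the vector width):
;
;     static void vector_fmul_scalar_c(float *dst, const float *src,
;                                      float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }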

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    ; on Win64 the double argument arrives in xmm2: duplicate the low
    ; double, replicate into the upper lane for AVX, then move it to m0
    movlhps      xmm2, xmm2
%if cpuflag(avx)
    vinsertf128  ymm2, ymm2, xmm2, 1
%endif
    SWAP 0, 2
%else
    ; on Unix64 the double argument arrives in xmm0
    movlhps      xmm0, xmm0
%if cpuflag(avx)
    vinsertf128  ymm0, ymm0, xmm0, 1
%endif
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    mova   [dstq+lenq       ], m1
    mova   [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif
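
; For reference, a hypothetical scalar C equivalent of the loop above
; (the _c name is illustrative, not part of this file; len is assumed
; to be a multiple of the unroll width):
;
;     static void vector_dmul_scalar_c(double *dst, const double *src,
;                                      double mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }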

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
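
; For reference, a hypothetical scalar C equivalent of the loop above
; (the _c name is illustrative, not part of this file; len is assumed
; to be a multiple of the unroll width):
;
;     static void vector_fmul_add_c(float *dst, const float *src0,
;                                   const float *src1, const float *src2,
;                                   int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i] + src2[i];
;     }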

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    ; vshufps cannot cross 128-bit lanes, so swap the two halves while
    ; loading and then reverse the four floats within each lane
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
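
; For reference, a hypothetical scalar C equivalent of the loop above,
; which walks src1 backwards while dst and src0 walk forwards (the _c
; name is illustrative, not part of this file; len is assumed to be a
; multiple of the unroll width):
;
;     static void vector_fmul_reverse_c(float *dst, const float *src0,
;                                       const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[len - 1 - i];
;     }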

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    ; point v1/v2 at the end of the arrays and walk a negative byte
    ; offset up towards zero
    neg   offsetq
    shl   offsetq, 2
    sub       v1q, offsetq
    sub       v2q, offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    ; horizontal sum of the four partial sums in xmm0
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    ; on x86_32 the float return value goes in st(0), so bounce it
    ; through the stack
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET
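
; For reference, a hypothetical scalar C equivalent of the function above
; (the _c name is illustrative, not part of this file; len is assumed to
; be a multiple of 4):
;
;     static float scalarproduct_float_c(const float *v1, const float *v2,
;                                        int len)
;     {
;         float p = 0.0f;
;         for (int i = 0; i < len; i++)
;             p += v1[i] * v2[i];
;         return p;
;     }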

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
.end:
    REP_RET
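
; For reference, a hypothetical scalar C equivalent of the function above
; (the _c name is illustrative, not part of this file; len is assumed to
; be a multiple of 4):
;
;     static void butterflies_float_c(float *src0, float *src1, int len)
;     {
;         for (int i = 0; i < len; i++) {
;             float t  = src0[i] - src1[i];
;             src0[i] += src1[i];
;             src1[i]  = t;
;         }
;     }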