[Web viewer header] KolibriOS Subversion repository — Rev | Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
  3. ;*
  4. ;* Copyright (C) 2015 James Almer
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22.  
  23. %include "libavutil/x86/x86util.asm"
  24.  
SECTION_RODATA

; Sign mask: the IEEE-754 sign bit is set in lanes 1 and 3 only.
; XORing a packed-single vector with this negates the odd-indexed
; lanes, i.e. applies the pattern (+1, -1, +1, -1) — hence the name.
; Used by the SSE1 path below to emulate SSE3's addsubps.
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

SECTION .text
  30.  
;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;
; dst[i] += src[i][0]^2 + src[i][1]^2  for i in [0, n)
; (power accumulation of complex (re, im) pairs)
; %1 = number of XMM registers to declare to cglobal (WIN64 save/restore)
;*************************************************************************
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
.loop:
    movaps m0, [srcq]             ; two (re, im) pairs
    movaps m1, [srcq+mmsize]      ; next two pairs
    mulps  m0, m0                 ; square every component
    mulps  m1, m1
%if cpuflag(sse3)
    haddps m0, m1                 ; horizontal add: re^2+im^2 for all 4 pairs
%else
    ; SSE1 fallback: emulate haddps with shuffles and adds
    movaps m3, m0
    movaps m4, m1
    shufps m3, m3, q0301          ; rotate so each im^2 lines up with its re^2
    shufps m4, m4, q0301
    addps  m0, m3                 ; pair sums now in even lanes of m0
    addps  m1, m4                 ; ... and of m1
    shufps m0, m1, q2020          ; gather the four pair sums into m0
%endif
    addps  m0, [dstq]             ; accumulate into dst
    movaps [dstq], m0
    add  dstq, mmsize             ; 4 floats of output...
    add  srcq, mmsize*2           ; ...consume 8 floats (4 pairs) of input
    sub    nd, mmsize/4           ; 4 pairs processed per iteration
    jg .loop
    REP_RET
%endmacro
  60.  
; Instantiate both flavours; cpuflag(sse3) selects haddps vs. the
; shuffle-based emulation inside the macro body.
INIT_XMM sse
PS_ADD_SQUARES 3
INIT_XMM sse3
PS_ADD_SQUARES 5
  65.  
;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
;                                   float *src1, int n);
;
; dst[i] = src0[i] * src1[i] — each complex (re, im) pair scaled by
; one real factor.
; NOTE(review): cglobal names the args src1/src2 while the prototype
; above says src0/src1 — same registers, just inconsistent naming.
;*******************************************************************
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n
    xor r4q, r4q                      ; r4 = byte offset into src1/dst

.loop:
    movu     m0, [src1q+r4q]          ; two (re, im) pairs
    movu     m1, [src1q+r4q+mmsize]   ; two more pairs
    mova     m2, [src2q]              ; four real scale factors f0..f3
    mova     m3, m2
    unpcklps m2, m2                   ; (f0, f0, f1, f1)
    unpckhps m3, m3                   ; (f2, f2, f3, f3)
    mulps    m0, m2                   ; scale both components of each pair
    mulps    m1, m3
    mova [dstq+r4q], m0
    mova [dstq+r4q+mmsize], m1
    add   src2q, mmsize               ; four reals consumed
    add     r4q, mmsize*2             ; eight floats (four pairs) advanced
    sub      nd, mmsize/4             ; four pairs per iteration
    jg .loop
    REP_RET
  90.  
;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
;                                   float h[2][4], float h_step[2][4],
;                                   int len);
;
; In-place stereo remix with per-sample interpolated 2x2 matrix.
; Each iteration first advances the coefficients h += h_step, then with
; h = (h0, h1, h2, h3) computes (both complex components at once):
;   l[i] = h0*l[i] + h2*r[i]
;   r[i] = h1*l[i] + h3*r[i]
;***********************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
    movaps   m0, [hq]             ; current coefficients (h0, h1, h2, h3)
    movaps   m1, [h_stepq]        ; per-sample coefficient increments
    cmp      nd, 0
    jle .ret                      ; nothing to do for len <= 0
    shl      nd, 3                ; n *= 8 = sizeof(float[2]) -> byte count
    add      lq, nq               ; point past the end...
    add      rq, nq
    neg      nq                   ; ...and walk up with a negative index

align 16
.loop:
    addps    m0, m1               ; h += h_step
    movddup  m2, [lq+nq]          ; (l_re, l_im, l_re, l_im)
    movddup  m3, [rq+nq]          ; (r_re, r_im, r_re, r_im)
    movaps   m4, m0
    movaps   m5, m0
    unpcklps m4, m4               ; (h0, h0, h1, h1)
    unpckhps m5, m5               ; (h2, h2, h3, h3)
    mulps    m2, m4               ; l*h0 | l*h1
    mulps    m3, m5               ; r*h2 | r*h3
    addps    m2, m3               ; low qword = new l, high qword = new r
    movsd  [lq+nq], m2            ; store new l pair
    movhps [rq+nq], m2            ; store new r pair
    add      nq, 8                ; next complex sample
    jl .loop
.ret:
    REP_RET
  125.  
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
;                                 const float (*filter)[8][2],
;                                 int stride, int n);
;*******************************************************************

; One folded step of the filter convolution: combines input pair %3 with
; its mirror pair (5 - %3) taken reversed from the far end of the window,
; then multiplies by the (re/im-swapped) filter coefficients for tap %3.
;   %1 = accumulator register for one lane interleaving
;   %2 = accumulator register for the re/im-swapped interleaving
;   %3 = tap index (0..2); for %3 > 0 results are summed into m0/m3
; Clobbers m1, m2, m4.  The SSE1 path additionally relies on m7 holding
; the ps_p1m1p1m1 sign mask to emulate addsubps.
%macro PS_HYBRID_ANALYSIS_LOOP 3
    movu     %1, [inq+mmsize*%3]        ; two input pairs at tap %3
    movu     m1, [inq+mmsize*(5-%3)+8]  ; mirrored pairs from the far end
%if cpuflag(sse3)
    pshufd   %2, %1, q2301              ; swap re/im within each pair
    pshufd   m4, m1, q0123              ; fully reverse the mirrored lanes
    pshufd   m1, m1, q1032              ; swap the two mirrored pairs
    pshufd   m2, [filterq+nq+mmsize*%3], q2301  ; re/im-swapped coeffs
    addsubps %2, m4                     ; even lanes -, odd lanes + (fold)
    addsubps %1, m1
%else
    ; SSE1 fallback: emulate pshufd with shufps and addsubps with a
    ; sign-mask xor (m7 = ps_p1m1p1m1) followed by subps.
    mova     m2, [filterq+nq+mmsize*%3]
    mova     %2, %1
    mova     m4, m1
    shufps   %2, %2, q2301
    shufps   m4, m4, q0123
    shufps   m1, m1, q1032
    shufps   m2, m2, q2301
    xorps    m4, m7                     ; negate odd lanes
    xorps    m1, m7
    subps    %2, m4                     ; a - (+/-b) == addsubps
    subps    %1, m1
%endif
    mulps    %2, m2                     ; scale both foldings by the coeffs
    mulps    %1, m2
%if %3
    addps    m3, %2                     ; accumulate taps 1,2 onto tap 0
    addps    m0, %1
%endif
%endmacro
  161.  
; Hybrid analysis filterbank: for each of n filters, convolve the input
; window with the (symmetric, folded) filter and store one complex output,
; advancing the output pointer by stride bytes per filter.
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd                ; SSE3 build: store low qword via movsd
%else
%define MOVH movlps               ; SSE1 has no movsd reg/mem preference
%endif
    shl strided, 3                ; stride in bytes (8 = sizeof(float[2]))
    shl nd, 6                     ; n *= 64 = sizeof(float[8][2]) per filter
    add filterq, nq               ; point past the end of the filter table...
    neg nq                        ; ...and index it negatively
    mova m7, [ps_p1m1p1m1]        ; sign mask for the SSE1 addsub emulation

align 16
.loop:
    ; Accumulate the three folded tap pairs; tap 0 initializes m0/m3,
    ; taps 1 and 2 (in m5/m6) are summed onto them inside the macro.
    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2

%if cpuflag(sse3)
    pshufd   m3, m3, q2301
    xorps    m0, m7               ; negate odd lanes before the hsub
    hsubps   m3, m0               ; horizontally combine the partial sums
    pshufd   m1, m3, q0020
    pshufd   m3, m3, q0031
    addps    m1, m3               ; m1 low qword = (re, im) result
    movsd    m2, [inq+6*8]        ; center-tap input pair in[6]
%else
    ; SSE1 equivalent of the hsubps reduction above
    mova     m1, m3
    mova     m2, m0
    shufps   m1, m1, q2301
    shufps   m2, m2, q2301
    subps    m1, m3
    addps    m2, m0
    unpcklps m3, m1, m2
    unpckhps m1, m2
    addps    m1, m3
    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
    movss    m3, [filterq+nq+8*6] ; center filter coefficient (real only)
    SPLATD   m3                   ; broadcast it to all four lanes
    mulps    m2, m3
    addps    m1, m2               ; add the center-tap contribution
    MOVH [outq], m1               ; store one (re, im) output pair
    add    outq, strideq          ; next output slot
    add      nq, 64               ; next filter (sizeof(float[8][2]))
    jl .loop
    REP_RET
%endmacro
  211.  
; Instantiate both flavours; cpuflag(sse3) inside the macros selects
; pshufd/addsubps/hsubps vs. the SSE1 shuffle + sign-mask emulation.
INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS
  216.