Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* FFT transform with SSE/3DNow optimizations
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2011 Vitor Sessak
  5. ;*
  6. ;* This algorithm (though not any of the implementation details) is
  7. ;* based on libdjbfft by D. J. Bernstein.
  8. ;*
  9. ;* This file is part of FFmpeg.
  10. ;*
  11. ;* FFmpeg is free software; you can redistribute it and/or
  12. ;* modify it under the terms of the GNU Lesser General Public
  13. ;* License as published by the Free Software Foundation; either
  14. ;* version 2.1 of the License, or (at your option) any later version.
  15. ;*
  16. ;* FFmpeg is distributed in the hope that it will be useful,
  17. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19. ;* Lesser General Public License for more details.
  20. ;*
  21. ;* You should have received a copy of the GNU Lesser General Public
  22. ;* License along with FFmpeg; if not, write to the Free Software
  23. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. ;******************************************************************************
  25.  
  26. ; These functions are not individually interchangeable with the C versions.
  27. ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
  28. ; in blocks as convenient to the vector size.
  29. ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
  30.  
  31. %include "libavutil/x86/x86util.asm"
  32.  
; 'pointer' reserves a native-word slot (8 bytes on x86-64, 4 on x86-32);
; used below so the struc offsets match the C struct on both ABIs.
  33. %if ARCH_X86_64
  34. %define pointer resq
  35. %else
  36. %define pointer resd
  37. %endif
  38.  
; Offsets into the C FFTContext. NOTE(review): field order is assumed to
; mirror the C declaration (libavcodec/fft.h) — keep the two in sync.
  39. struc FFTContext
  40.     .nbits:    resd 1
  41.     .reverse:  resd 1
  42.     .revtab:   pointer 1
  43.     .tmpbuf:   pointer 1
  44.     .mdctsize: resd 1
  45.     .mdctbits: resd 1
  46.     .tcos:     pointer 1
  47.     .tsin:     pointer 1
  48.     .fftperm:  pointer 1
  49.     .fftcalc:  pointer 1
  50.     .imdctcalc:pointer 1
  51.     .imdcthalf:pointer 1
  52. endstruc
  53.  
  54. SECTION_RODATA 32
  55.  
  56. %define M_SQRT1_2 0.70710678118654752440
  57. %define M_COS_PI_1_8 0.923879532511287
  58. %define M_COS_PI_3_8 0.38268343236509
  59.  
; Twiddle factors for the 16-point transforms, duplicated per 128-bit lane.
  60. ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
  61. ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
  62.  
; In the ps_* masks below, 1<<31 is the IEEE-754 single-precision sign bit:
; xor-ing a float with it negates it, xor with 0 leaves it unchanged.
  63. ps_root2: times 8 dd M_SQRT1_2
  64. ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
  65. ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
  66.  
; Dword-index tables consumed by vpermilps in T8_AVX.
  67. perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
  68. perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
  69. ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
  70. ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
  71. ps_m1p1: dd 1<<31, 0
  72.  
  73. cextern ps_neg
  74.  
; Declare the external twiddle tables cos_16 .. cos_65536 (13 of them).
  75. %assign i 16
  76. %rep 13
  77. cextern cos_ %+ i
  78. %assign i i<<1
  79. %endrep
  80.  
; Re-purpose 'pointer' as an initialized-data directive for dispatch tables.
  81. %if ARCH_X86_64
  82.     %define pointer dq
  83. %else
  84.     %define pointer dd
  85. %endif
  86.  
; IF0 swallows its argument line, IF1 emits it verbatim; invoked as IF%1 by
; PASS_SMALL/PASS_BIG to conditionally assemble loads/stores.
  87. %macro IF0 1+
  88. %endmacro
  89. %macro IF1 1+
  90.     %1
  91. %endmacro
  92.  
  93. SECTION .text
  94.  
; Radix-2 butterfly on 3DNow! registers: %1 = %3 + %4, %2 = %3 - %4,
; with %3 loaded from memory first.
  95. %macro T2_3DNOW 4 ; z0, z1, mem0, mem1
  96.     mova     %1, %3
  97.     mova     %2, %1
  98.     pfadd    %1, %4
  99.     pfsub    %2, %4
  100. %endmacro
  101.  
; Radix-4 butterfly. The movd/punpckhdq pair below swaps the two dwords of
; %3 by bouncing one half through scratch memory at [r0+8..15] — a PSWAPD
; substitute that also assembles on plain (non-extended) 3DNow!.
  102. %macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
  103.     mova     %5, %3
  104.     pfsub    %3, %4
  105.     pfadd    %5, %4 ; {t6,t5}
  106.     pxor     %3, [ps_m1p1] ; {t8,t7}
  107.     mova     %6, %1
  108.     movd [r0+12], %3        ; spill high half …
  109.     punpckhdq %3, [r0+8]    ; … and recombine swapped
  110.     pfadd    %1, %5 ; {r0,i0}
  111.     pfsub    %6, %5 ; {r2,i2}
  112.     mova     %4, %2
  113.     pfadd    %2, %3 ; {r1,i1}
  114.     pfsub    %4, %3 ; {r3,i3}
  115.     SWAP     %3, %6
  116. %endmacro
  117.  
  118. ;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
  119. ;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
  120. ;      %3, %4, %5 tmp
  121. ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
  122. ;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
; 8-point butterfly kernel on one ymm pair; the in/out lane layout is
; documented in the comment block just above. %3-%5 are scratch.
  123. %macro T8_AVX 5
  124.     vsubps     %5, %1, %2       ; v  = %1 - %2
  125.     vaddps     %3, %1, %2       ; w  = %1 + %2
  126.     vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
  127.     vpermilps  %2, %2, [perm1]
  128.     vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
  129.     vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
  130.     vsubps     %4, %5, %1       ; s = r - q
  131.     vaddps     %1, %5, %1       ; u = r + q
  132.     vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
  133.     vshufps    %5, %4, %1, 0xbb
  134.     vshufps    %3, %4, %1, 0xee
  135.     vperm2f128 %3, %3, %5, 0x13
  136.     vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
  137.     vshufps    %2, %1, %4, 0xdd
  138.     vshufps    %1, %1, %4, 0x88
  139.     vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
  140.     vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
  141.     vsubps     %5, %1, %3
  142.     vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
  143.     vsubps     %2, %4, %1       ; %2 = v - w
  144.     vaddps     %1, %4, %1       ; %1 = v + w
  145. %endmacro
  146.  
  147. ; In SSE mode do one fft4 transform
  148. ; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
  149. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
  150. ;
  151. ; In AVX mode do two fft4 transforms
  152. ; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
  153. ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
; Radix-4 butterfly; %3 is scratch. Per-step value layout is annotated on
; each line (tN = intermediate terms, rN/iN = output real/imag parts).
  154. %macro T4_SSE 3
  155.     subps    %3, %1, %2       ; {t3,t4,-t8,t7}
  156.     addps    %1, %1, %2       ; {t1,t2,t6,t5}
  157.     xorps    %3, %3, [ps_p1p1m1p1]  ; flip sign of -t8
  158.     shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
  159.     shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
  160.     subps    %3, %1, %2       ; {r2,i2,r3,i3}
  161.     addps    %1, %1, %2       ; {r0,i0,r1,i1}
  162.     shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
  163.     shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
  164. %endmacro
  165.  
  166. ; In SSE mode do one FFT8
  167. ; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
  168. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
  169. ;
  170. ; In AVX mode do two FFT8
  171. ; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
  172. ;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
  173. ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
  174. ;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
; FFT8 second stage combining the fft4 result (%1/%2) with the odd inputs
; (%3/%4); %5 and %6 are scratch. Layouts documented in the comment above.
  175. %macro T8_SSE 6
  176.     addps    %6, %3, %4       ; {t1,t2,t3,t4}
  177.     subps    %3, %3, %4       ; {r5,i5,r7,i7}
  178.     shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
  179.     mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
  180.     mulps    %4, %4, [ps_root2]
  181.     addps    %3, %3, %4       ; {t8,t7,ta,t9}
  182.     shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
  183.     shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
  184.     subps    %3, %6, %4       ; {t6,t5,tc,tb}
  185.     addps    %6, %6, %4       ; {t1,t2,t9,ta}
  186.     shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
  187.     shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
  188.     subps    %3, %1, %6       ; {r4,r5,r6,r7}
  189.     addps    %1, %1, %6       ; {r0,r1,r2,r3}
  190.     subps    %4, %2, %5       ; {i4,i5,i6,i7}
  191.     addps    %2, %2, %5       ; {i0,i1,i2,i3}
  192. %endmacro
  193.  
  194. ; scheduled for cpu-bound sizes
; One split-radix pass over the block at Z(0..7)/Z2(6..7), twiddled by
; wre (%2) and wim (%3). %1 selects via IF0/IF1 whether m4-m7 are loaded
; here or were left in registers by the caller. Instruction order is tuned
; for cpu-bound (small) transform sizes — do not reorder.
  195. %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
  196. IF%1 mova    m4, Z(4)
  197. IF%1 mova    m5, Z(5)
  198.     mova     m0, %2 ; wre
  199.     mova     m1, %3 ; wim
  200.     mulps    m2, m4, m0 ; r2*wre
  201. IF%1 mova    m6, Z2(6)
  202.     mulps    m3, m5, m1 ; i2*wim
  203. IF%1 mova    m7, Z2(7)
  204.     mulps    m4, m4, m1 ; r2*wim
  205.     mulps    m5, m5, m0 ; i2*wre
  206.     addps    m2, m2, m3 ; r2*wre + i2*wim
  207.     mulps    m3, m1, m7 ; i3*wim
  208.     subps    m5, m5, m4 ; i2*wre - r2*wim
  209.     mulps    m1, m1, m6 ; r3*wim
  210.     mulps    m4, m0, m6 ; r3*wre
  211.     mulps    m0, m0, m7 ; i3*wre
  212.     subps    m4, m4, m3 ; r3*wre - i3*wim
  213.     mova     m3, Z(0)
  214.     addps    m0, m0, m1 ; i3*wre + r3*wim
  215.     subps    m1, m4, m2 ; t3
  216.     addps    m4, m4, m2 ; t5
  217.     subps    m3, m3, m4 ; r2
  218.     addps    m4, m4, Z(0) ; r0
  219.     mova     m6, Z(2)
  220.     mova   Z(4), m3
  221.     mova   Z(0), m4
  222.     subps    m3, m5, m0 ; t4
  223.     subps    m4, m6, m3 ; r3
  224.     addps    m3, m3, m6 ; r1
  225.     mova  Z2(6), m4
  226.     mova   Z(2), m3
  227.     mova     m2, Z(3)
  228.     addps    m3, m5, m0 ; t6
  229.     subps    m2, m2, m1 ; i3
  230.     mova     m7, Z(1)
  231.     addps    m1, m1, Z(3) ; i1
  232.     mova  Z2(7), m2
  233.     mova   Z(3), m1
  234.     subps    m4, m7, m3 ; i2
  235.     addps    m3, m3, m7 ; i0
  236.     mova   Z(5), m4
  237.     mova   Z(1), m3
  238. %endmacro
  239.  
  240. ; scheduled to avoid store->load aliasing
; Same data flow as PASS_SMALL, but twiddles come from [wq]/[wq+o1q] and
; the schedule is arranged so stores never alias the following loads.
; %1 == 0 additionally interleaves real/imag results through the
; cpu-specific INTERL macro instead of storing them split.
  241. %macro PASS_BIG 1 ; (!interleave)
  242.     mova     m4, Z(4) ; r2
  243.     mova     m5, Z(5) ; i2
  244.     mova     m0, [wq] ; wre
  245.     mova     m1, [wq+o1q] ; wim
  246.     mulps    m2, m4, m0 ; r2*wre
  247.     mova     m6, Z2(6) ; r3
  248.     mulps    m3, m5, m1 ; i2*wim
  249.     mova     m7, Z2(7) ; i3
  250.     mulps    m4, m4, m1 ; r2*wim
  251.     mulps    m5, m5, m0 ; i2*wre
  252.     addps    m2, m2, m3 ; r2*wre + i2*wim
  253.     mulps    m3, m1, m7 ; i3*wim
  254.     mulps    m1, m1, m6 ; r3*wim
  255.     subps    m5, m5, m4 ; i2*wre - r2*wim
  256.     mulps    m4, m0, m6 ; r3*wre
  257.     mulps    m0, m0, m7 ; i3*wre
  258.     subps    m4, m4, m3 ; r3*wre - i3*wim
  259.     mova     m3, Z(0)
  260.     addps    m0, m0, m1 ; i3*wre + r3*wim
  261.     subps    m1, m4, m2 ; t3
  262.     addps    m4, m4, m2 ; t5
  263.     subps    m3, m3, m4 ; r2
  264.     addps    m4, m4, Z(0) ; r0
  265.     mova     m6, Z(2)
  266.     mova   Z(4), m3
  267.     mova   Z(0), m4
  268.     subps    m3, m5, m0 ; t4
  269.     subps    m4, m6, m3 ; r3
  270.     addps    m3, m3, m6 ; r1
  271. IF%1 mova Z2(6), m4
  272. IF%1 mova  Z(2), m3
  273.     mova     m2, Z(3)
  274.     addps    m5, m5, m0 ; t6
  275.     subps    m2, m2, m1 ; i3
  276.     mova     m7, Z(1)
  277.     addps    m1, m1, Z(3) ; i1
  278. IF%1 mova Z2(7), m2
  279. IF%1 mova  Z(3), m1
  280.     subps    m6, m7, m5 ; i2
  281.     addps    m5, m5, m7 ; i0
  282. IF%1 mova  Z(5), m6
  283. IF%1 mova  Z(1), m5
  284. %if %1==0
  285.     INTERL m1, m3, m7, Z, 2
  286.     INTERL m2, m4, m0, Z2, 6
  287.  
  288.     mova     m1, Z(0)
  289.     mova     m2, Z(4)
  290.  
  291.     INTERL m5, m1, m3, Z, 0
  292.     INTERL m6, m2, m7, Z, 4
  293. %endif
  294. %endmacro
  295.  
; Interleave dwords of %1/%2: %1 gets the low pair, %3 the high pair.
  296. %macro PUNPCK 3
  297.     mova      %3, %1
  298.     punpckldq %1, %2
  299.     punpckhdq %3, %2
  300. %endmacro
  301.  
; Block addressing for the standalone fftN_* routines: data is one
; contiguous array at r0, indexed in units of the vector width.
  302. %define Z(x) [r0+mmsize*x]
  303. %define Z2(x) [r0+mmsize*x]
  304. %define ZH(x) [r0+mmsize*x+mmsize/2]
  305.  
  306. INIT_YMM avx
  307.  
  308. %if HAVE_AVX_EXTERNAL
; In-place 8-point FFT on the block at [r0].
  309. align 16
  310. fft8_avx:
  311.     mova      m0, Z(0)
  312.     mova      m1, Z(1)
  313.     T8_AVX    m0, m1, m2, m3, m4
  314.     mova      Z(0), m0
  315.     mova      Z(1), m1
  316.     ret
  317.  
  318.  
; In-place 16-point FFT: fft8 on the even block, fft4 pair on the odd
; block, then one twiddled combine using ps_cos16_1/ps_cos16_2.
  319. align 16
  320. fft16_avx:
  321.     mova       m2, Z(2)
  322.     mova       m3, Z(3)
  323.     T4_SSE     m2, m3, m7
  324.  
  325.     mova       m0, Z(0)
  326.     mova       m1, Z(1)
  327.     T8_AVX     m0, m1, m4, m5, m7
  328.  
  329.     mova       m4, [ps_cos16_1]
  330.     mova       m5, [ps_cos16_2]
  331.     vmulps     m6, m2, m4
  332.     vmulps     m7, m3, m5
  333.     vaddps     m7, m7, m6
  334.     vmulps     m2, m2, m5
  335.     vmulps     m3, m3, m4
  336.     vsubps     m3, m3, m2
  337.     vblendps   m2, m7, m3, 0xf0
  338.     vperm2f128 m3, m7, m3, 0x21
  339.     vaddps     m4, m2, m3
  340.     vsubps     m2, m3, m2
  341.     vperm2f128 m2, m2, m2, 0x01
  342.     vsubps     m3, m1, m2
  343.     vaddps     m1, m1, m2
  344.     vsubps     m5, m0, m4
  345.     vaddps     m0, m0, m4
; Store 128-bit halves so reals and imaginaries land in the split layout.
  346.     vextractf128   Z(0), m0, 0
  347.     vextractf128  ZH(0), m1, 0
  348.     vextractf128   Z(1), m0, 1
  349.     vextractf128  ZH(1), m1, 1
  350.     vextractf128   Z(2), m5, 0
  351.     vextractf128  ZH(2), m3, 0
  352.     vextractf128   Z(3), m5, 1
  353.     vextractf128  ZH(3), m3, 1
  354.     ret
  355.  
; In-place 32-point FFT: recurse into fft16, transform the upper half,
; then finish with a PASS_SMALL using the cos_32 twiddle table.
  356. align 16
  357. fft32_avx:
  358.     call fft16_avx
  359.  
  360.     mova m0, Z(4)
  361.     mova m1, Z(5)
  362.  
  363.     T4_SSE      m0, m1, m4
  364.  
  365.     mova m2, Z(6)
  366.     mova m3, Z(7)
  367.  
  368.     T8_SSE      m0, m1, m2, m3, m4, m6
  369.     ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
  370.     ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
  371.  
  372.     vperm2f128  m4, m0, m2, 0x20
  373.     vperm2f128  m5, m1, m3, 0x20
  374.     vperm2f128  m6, m0, m2, 0x31
  375.     vperm2f128  m7, m1, m3, 0x31
  376.  
  377.     PASS_SMALL 0, [cos_32], [cos_32+32]
  378.  
  379.     ret
  380.  
; fft32 followed by a pass that re-interleaves the split re/im blocks back
; into {re,im} pairs, 32 complex values total.
  381. fft32_interleave_avx:
  382.     call fft32_avx
  383.     mov r2d, 32
  384. .deint_loop:
  385.     mova     m2, Z(0)
  386.     mova     m3, Z(1)
  387.     vunpcklps      m0, m2, m3
  388.     vunpckhps      m1, m2, m3
  389.     vextractf128   Z(0), m0, 0
  390.     vextractf128  ZH(0), m1, 0
  391.     vextractf128   Z(1), m0, 1
  392.     vextractf128  ZH(1), m1, 1
  393.     add r0, mmsize*2
  394.     sub r2d, mmsize/4
  395.     jg .deint_loop
  396.     ret
  397.  
  398. %endif
  399.  
  400. INIT_XMM sse
  401.  
; fft4 is identical for SSE and AVX (xmm-only), hence the aliased labels.
  402. align 16
  403. fft4_avx:
  404. fft4_sse:
  405.     mova     m0, Z(0)
  406.     mova     m1, Z(1)
  407.     T4_SSE   m0, m1, m2
  408.     mova   Z(0), m0
  409.     mova   Z(1), m1
  410.     ret
  411.  
; In-place 8-point FFT: fft4 on the first half, then the T8 combine.
  412. align 16
  413. fft8_sse:
  414.     mova     m0, Z(0)
  415.     mova     m1, Z(1)
  416.     T4_SSE   m0, m1, m2
  417.     mova     m2, Z(2)
  418.     mova     m3, Z(3)
  419.     T8_SSE   m0, m1, m2, m3, m4, m5
  420.     mova   Z(0), m0
  421.     mova   Z(1), m1
  422.     mova   Z(2), m2
  423.     mova   Z(3), m3
  424.     ret
  425.  
; In-place 16-point FFT: fft8 on the lower half, two fft4s on the upper
; half (results left in m4-m7), finished by PASS_SMALL with cos_16.
  426. align 16
  427. fft16_sse:
  428.     mova     m0, Z(0)
  429.     mova     m1, Z(1)
  430.     T4_SSE   m0, m1, m2
  431.     mova     m2, Z(2)
  432.     mova     m3, Z(3)
  433.     T8_SSE   m0, m1, m2, m3, m4, m5
  434.     mova     m4, Z(4)
  435.     mova     m5, Z(5)
  436.     mova   Z(0), m0
  437.     mova   Z(1), m1
  438.     mova   Z(2), m2
  439.     mova   Z(3), m3
  440.     T4_SSE   m4, m5, m6
  441.     mova     m6, Z2(6)
  442.     mova     m7, Z2(7)
  443.     T4_SSE   m6, m7, m0
  444.     PASS_SMALL 0, [cos_16], [cos_16+16]
  445.     ret
  446.  
  447.  
; 3DNow! fft4/fft8 kernels; assembled twice below (SUFFIX = _3dnow and
; _3dnowext) so PSWAPD can expand to the extended instruction when present.
  448. %macro FFT48_3DNOW 0
  449. align 16
  450. fft4 %+ SUFFIX:
  451.     T2_3DNOW m0, m1, Z(0), Z(1)
  452.     mova     m2, Z(2)
  453.     mova     m3, Z(3)
  454.     T4_3DNOW m0, m1, m2, m3, m4, m5
  455.     PUNPCK   m0, m1, m4
  456.     PUNPCK   m2, m3, m5
  457.     mova   Z(0), m0
  458.     mova   Z(1), m4
  459.     mova   Z(2), m2
  460.     mova   Z(3), m5
  461.     ret
  462.  
  463. align 16
  464. fft8 %+ SUFFIX:
  465.     T2_3DNOW m0, m1, Z(0), Z(1)
  466.     mova     m2, Z(2)
  467.     mova     m3, Z(3)
  468.     T4_3DNOW m0, m1, m2, m3, m4, m5
  469.     mova   Z(0), m0
  470.     mova   Z(2), m2
  471.     T2_3DNOW m4, m5,  Z(4),  Z(5)
  472.     T2_3DNOW m6, m7, Z2(6), Z2(7)
; Rotate the odd terms by sqrt(1/2)*(±1±i) before the final combine.
  473.     PSWAPD   m0, m5
  474.     PSWAPD   m2, m7
  475.     pxor     m0, [ps_m1p1]
  476.     pxor     m2, [ps_m1p1]
  477.     pfsub    m5, m0
  478.     pfadd    m7, m2
  479.     pfmul    m5, [ps_root2]
  480.     pfmul    m7, [ps_root2]
  481.     T4_3DNOW m1, m3, m5, m7, m0, m2
  482.     mova   Z(5), m5
  483.     mova  Z2(7), m7
  484.     mova     m0, Z(0)
  485.     mova     m2, Z(2)
  486.     T4_3DNOW m0, m2, m4, m6, m5, m7
; Final dword interleave into the block output layout.
  487.     PUNPCK   m0, m1, m5
  488.     PUNPCK   m2, m3, m7
  489.     mova   Z(0), m0
  490.     mova   Z(1), m5
  491.     mova   Z(2), m2
  492.     mova   Z(3), m7
  493.     PUNPCK   m4,  Z(5), m5
  494.     PUNPCK   m6, Z2(7), m7
  495.     mova   Z(4), m4
  496.     mova   Z(5), m5
  497.     mova  Z2(6), m6
  498.     mova  Z2(7), m7
  499.     ret
  500. %endmacro
  501.  
  502. %if ARCH_X86_32
  503. INIT_MMX 3dnowext
  504. FFT48_3DNOW
  505.  
  506. INIT_MMX 3dnow
  507. FFT48_3DNOW
  508. %endif
  509.  
; Addressing for the pass_* loops: zcq walks the data, o1q/o3q are the
; stride offsets computed in DECL_PASS below.
  510. %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
  511. %define Z2(x) [zcq + o3q + mmsize*(x&1)]
  512. %define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
  513. %define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
  514.  
; Declare a pass routine %1 whose loop body is the remaining macro args:
; args are (z, w, n); iterates n complex pairs, advancing data and twiddle
; pointers each trip.
  515. %macro DECL_PASS 2+ ; name, payload
  516. align 16
  517. %1:
  518. DEFINE_ARGS zc, w, n, o1, o3
  519.     lea o3q, [nq*3]
  520.     lea o1q, [nq*8]
  521.     shl o3q, 4
  522. .loop:
  523.     %2
  524.     add zcq, mmsize*2
  525.     add  wq, mmsize
  526.     sub  nd, mmsize/8
  527.     jg .loop
  528.     rep ret      ; 2-byte return; avoids a branch-target penalty on some AMD CPUs
  529. %endmacro
  530.  
; Tail-call into dispatch_tab<suffix>[nbits-2]; under PIC the table holds
; section-relative entries, so rebase by the section start ($$).
  531. %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
  532.     lea r2, [dispatch_tab%1]
  533.     mov r2, [r2 + (%2q-2)*gprsize]
  534. %ifdef PIC
  535.     lea r3, [$$]
  536.     add r2, r3
  537. %endif
  538.     call r2
  539. %endmacro ; FFT_DISPATCH
  540.  
  541. INIT_YMM avx
  542.  
  543. %if HAVE_AVX_EXTERNAL
; Interleave helper used by PASS_BIG (%1==0): merge re block %1 with im
; block %2 and store both 128-bit halves of each result.
  544. %macro INTERL_AVX 5
  545.     vunpckhps      %3, %2, %1
  546.     vunpcklps      %2, %2, %1
  547.     vextractf128   %4(%5), %2, 0
  548.     vextractf128  %4 %+ H(%5), %3, 0
  549.     vextractf128   %4(%5 + 1), %2, 1
  550.     vextractf128  %4 %+ H(%5 + 1), %3, 1
  551. %endmacro
  552.  
  553. %define INTERL INTERL_AVX
  554.  
  555. DECL_PASS pass_avx, PASS_BIG 1
  556. DECL_PASS pass_interleave_avx, PASS_BIG 0
  557.  
; ff_fft_calc_avx(FFTContext *s, FFTComplex *z):
; dispatch on s->nbits to the interleaving fftN chain.
  558. cglobal fft_calc, 2,5,8
  559.     mov     r3d, [r0 + FFTContext.nbits]
  560.     mov     r0, r1            ; r0 = z (data pointer expected by fftN)
  561.     mov     r1, r3
  562.     FFT_DISPATCH _interleave %+ SUFFIX, r1
  563.     REP_RET
  564.  
  565. %endif
  566.  
  567. INIT_XMM sse
  568.  
; SSE flavor of the interleave helper consumed by PASS_BIG 0.
  569. %macro INTERL_SSE 5
  570.     mova     %3, %2
  571.     unpcklps %2, %1
  572.     unpckhps %3, %1
  573.     mova  %4(%5), %2
  574.     mova  %4(%5+1), %3
  575. %endmacro
  576.  
  577. %define INTERL INTERL_SSE
  578.  
  579. DECL_PASS pass_sse, PASS_BIG 1
  580. DECL_PASS pass_interleave_sse, PASS_BIG 0
  581.  
; ff_fft_calc_<cpu>(FFTContext *s, FFTComplex *z): run the dispatch chain,
; then for small transforms (nbits <= 3+(mmsize/16)) the fftN kernels leave
; data block-split, so the .loop below re-interleaves it in place.
  582. %macro FFT_CALC_FUNC 0
  583. cglobal fft_calc, 2,5,8
  584.     mov     r3d, [r0 + FFTContext.nbits]
  585.     PUSH    r1
  586.     PUSH    r3
  587.     mov     r0, r1
  588.     mov     r1, r3
  589.     FFT_DISPATCH _interleave %+ SUFFIX, r1
  590.     POP     rcx              ; rcx = nbits (needed for the cl shift below)
  591.     POP     r4               ; r4 = z
  592.     cmp     rcx, 3+(mmsize/16)
  593.     jg      .end
  594.     mov     r2, -1
  595.     add     rcx, 3
  596.     shl     r2, cl           ; r2 = -(bytes in transform)
  597.     sub     r4, r2           ; r4 = end of data; loop counts r2 up to 0
  598. .loop:
  599. %if mmsize == 8
  600.     PSWAPD  m0, [r4 + r2 + 4]
  601.     mova [r4 + r2 + 4], m0
  602. %else
  603.     movaps   xmm0, [r4 + r2]
  604.     movaps   xmm1, xmm0
  605.     unpcklps xmm0, [r4 + r2 + 16]
  606.     unpckhps xmm1, [r4 + r2 + 16]
  607.     movaps   [r4 + r2],      xmm0
  608.     movaps   [r4 + r2 + 16], xmm1
  609. %endif
  610.     add      r2, mmsize*2
  611.     jl       .loop
  612. .end:
  613. %if cpuflag(3dnow)
  614.     femms                    ; leave MMX state clean for x87 users
  615.     RET
  616. %else
  617.     REP_RET
  618. %endif
  619. %endmacro
  620.  
  621. %if ARCH_X86_32
  622. INIT_MMX 3dnow
  623. FFT_CALC_FUNC
  624. INIT_MMX 3dnowext
  625. FFT_CALC_FUNC
  626. %endif
  627. INIT_XMM sse
  628. FFT_CALC_FUNC
  629.  
; ff_fft_permute_sse(FFTContext *s, FFTComplex *z):
; scatter z through s->revtab into s->tmpbuf (two complex values per
; iteration via movlps/movhps), then copy tmpbuf back over z.
  630. cglobal fft_permute, 2,7,1
  631.     mov     r4,  [r0 + FFTContext.revtab]
  632.     mov     r5,  [r0 + FFTContext.tmpbuf]
  633.     mov     ecx, [r0 + FFTContext.nbits]
  634.     mov     r2, 1
  635.     shl     r2, cl           ; r2 = n = 1 << nbits
  636.     xor     r0, r0           ; r0 = element index
  637. %if ARCH_X86_32
  638.     mov     r1, r1m
  639. %endif
  640. .loop:
  641.     movaps  xmm0, [r1 + 8*r0]
  642.     movzx   r6, word [r4 + 2*r0]
  643.     movzx   r3, word [r4 + 2*r0 + 2]
  644.     movlps  [r5 + 8*r6], xmm0
  645.     movhps  [r5 + 8*r3], xmm0
  646.     add     r0, 2
  647.     cmp     r0, r2
  648.     jl      .loop
  649.     shl     r2, 3            ; r2 = n * sizeof(FFTComplex)
  650.     add     r1, r2
  651.     add     r5, r2
  652.     neg     r2               ; count r2 from -size up to 0
  653. ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
  654. .loopcopy:
  655.     movaps  xmm0, [r5 + r2]
  656.     movaps  xmm1, [r5 + r2 + 16]
  657.     movaps  [r1 + r2], xmm0
  658.     movaps  [r1 + r2 + 16], xmm1
  659.     add     r2, 32
  660.     jl      .loopcopy
  661.     REP_RET
  662.  
; ff_imdct_calc_<cpu>(FFTContext *s, FFTSample *output, const FFTSample *input):
; call s->imdcthalf for the second half, then mirror/negate it into the
; first half (the .loop below reverses element order with PSWAPD/shufps
; and flips signs with ps_neg).
  663. %macro IMDCT_CALC_FUNC 0
  664. cglobal imdct_calc, 3,5,3
  665.     mov     r3d, [r0 + FFTContext.mdctsize]
  666.     mov     r4,  [r0 + FFTContext.imdcthalf]
  667.     add     r1,  r3
  668.     PUSH    r3
  669.     PUSH    r1
  670. %if ARCH_X86_32
  671.     push    r2               ; cdecl args for the indirect call
  672.     push    r1
  673.     push    r0
  674. %else
  675.     sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
  676. %endif
  677.     call    r4
  678. %if ARCH_X86_32
  679.     add     esp, 12
  680. %else
  681.     add     rsp, 8+32*WIN64
  682. %endif
  683.     POP     r1
  684.     POP     r3
  685.     lea     r0, [r1 + 2*r3]
  686.     mov     r2, r3
  687.     sub     r3, mmsize
  688.     neg     r2
  689.     mova    m2, [ps_neg]
  690. .loop:
  691. %if mmsize == 8
  692.     PSWAPD  m0, [r1 + r3]
  693.     PSWAPD  m1, [r0 + r2]
  694.     pxor    m0, m2
  695. %else
  696.     mova    m0, [r1 + r3]
  697.     mova    m1, [r0 + r2]
  698.     shufps  m0, m0, 0x1b     ; reverse the 4 floats
  699.     shufps  m1, m1, 0x1b
  700.     xorps   m0, m2           ; negate
  701. %endif
  702.     mova [r0 + r3], m1
  703.     mova [r1 + r2], m0
  704.     sub     r3, mmsize
  705.     add     r2, mmsize
  706.     jl      .loop
  707. %if cpuflag(3dnow)
  708.     femms
  709.     RET
  710. %else
  711.     REP_RET
  712. %endif
  713. %endmacro
  714.  
  715. %if ARCH_X86_32
  716. INIT_MMX 3dnow
  717. IMDCT_CALC_FUNC
  718. INIT_MMX 3dnowext
  719. IMDCT_CALC_FUNC
  720. %endif
  721.  
  722. INIT_XMM sse
  723. IMDCT_CALC_FUNC
  724.  
; Map the SSE mnemonics used inside the shared PASS_* macros onto their
; 3DNow!/MMX equivalents, then instantiate the 3DNow! pass routines.
; (The aliases are %undef'd again after DECL_FFT below.)
  725. %if ARCH_X86_32
  726. INIT_MMX 3dnow
  727. %define mulps pfmul
  728. %define addps pfadd
  729. %define subps pfsub
  730. %define unpcklps punpckldq
  731. %define unpckhps punpckhdq
  732. DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
  733. DECL_PASS pass_interleave_3dnow, PASS_BIG 0
  734. %define pass_3dnowext pass_3dnow
  735. %define pass_interleave_3dnowext pass_interleave_3dnow
  736. %endif
  737.  
; Under PIC the dispatch tables store section-relative addresses.
  738. %ifdef PIC
  739. %define SECTION_REL - $$
  740. %else
  741. %define SECTION_REL
  742. %endif
  743.  
; Generate fftN entry points for N = 2^(%1+1) .. 2^17 plus the dispatch
; table indexed by nbits-2. Each generated fftN runs fft(N/2) on the low
; half and two fft(N/4) on the high half, then tail-jumps into the
; matching pass_* routine with the cos_N twiddle table.
  744. %macro DECL_FFT 1-2 ; nbits, suffix
  745. %ifidn %0, 1
  746. %xdefine fullsuffix SUFFIX
  747. %else
  748. %xdefine fullsuffix %2 %+ SUFFIX
  749. %endif
  750. %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
  751. %if %1>=5
  752. %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
  753. %endif
  754. %if %1>=6
  755. %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
  756. %endif
  757.  
  758. %assign n 1<<%1
  759. %rep 17-%1
  760. %assign n2 n/2
  761. %assign n4 n/4
  762. %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
  763.  
  764. align 16
  765. fft %+ n %+ fullsuffix:
  766.     call fft %+ n2 %+ SUFFIX
  767.     add r0, n*4 - (n&(-2<<%1))
  768.     call fft %+ n4 %+ SUFFIX
  769.     add r0, n*2 - (n2&(-2<<%1))
  770.     call fft %+ n4 %+ SUFFIX
  771.     sub r0, n*6 + (n2&(-2<<%1))
  772.     lea r1, [cos_ %+ n]
  773.     mov r2d, n4/2
  774.     jmp pass %+ fullsuffix
  775.  
  776. %assign n n*2
  777. %endrep
  778. %undef n
  779.  
  780. align 8
  781. dispatch_tab %+ fullsuffix: pointer list_of_fft
  782. %endmacro ; DECL_FFT
  783.  
  784. %if HAVE_AVX_EXTERNAL
  785. INIT_YMM avx
  786. DECL_FFT 6
  787. DECL_FFT 6, _interleave
  788. %endif
  789. INIT_XMM sse
  790. DECL_FFT 5
  791. DECL_FFT 5, _interleave
  792. %if ARCH_X86_32
  793. INIT_MMX 3dnow
  794. DECL_FFT 4
  795. DECL_FFT 4, _interleave
  796. INIT_MMX 3dnowext
  797. DECL_FFT 4
  798. DECL_FFT 4, _interleave
  799. %endif
  800.  
; Drop the 3DNow! mnemonic aliases installed above.
  801. INIT_XMM sse
  802. %undef mulps
  803. %undef addps
  804. %undef subps
  805. %undef unpcklps
  806. %undef unpckhps
  807.  
; IMDCT pre-rotation: load input samples from both ends of the buffer and
; complex-multiply them by the tcos/tsin twiddles. 3DNow! path (mmsize==8)
; produces m0/m2; SSE path produces xmm0/xmm1 ready for revtab scatter.
  808. %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
  809. %if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
  810.     PSWAPD     m0, [%3+%2*4]
  811.     movq       m2, [%3+%1*4-8]
  812.     movq       m3, m0
  813.     punpckldq  m0, m2
  814.     punpckhdq  m2, m3
  815.     movd       m1, [%4+%1*2-4] ; tcos[j]
  816.     movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
  817.     punpckldq  m1, [%5+%1*2-4] ; tsin[j]
  818.     punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]
  819.  
  820.     mova       m4, m0
  821.     PSWAPD     m5, m1
  822.     pfmul      m0, m1
  823.     pfmul      m4, m5
  824.     mova       m6, m2
  825.     PSWAPD     m5, m3
  826.     pfmul      m2, m3
  827.     pfmul      m6, m5
  828. %if cpuflag(3dnowext)
; pfpnacc does the subtract/add pair in one instruction.
  829.     pfpnacc    m0, m4
  830.     pfpnacc    m2, m6
  831. %else
; Plain 3DNow!: emulate pfpnacc with shuffles and a sign flip via m7
; (m7 = ps_neg, loaded once by the imdct_half caller).
  832.     SBUTTERFLY dq, 0, 4, 1
  833.     SBUTTERFLY dq, 2, 6, 3
  834.     pxor       m4, m7
  835.     pxor       m6, m7
  836.     pfadd      m0, m4
  837.     pfadd      m2, m6
  838. %endif
  839. %else
; SSE: deinterleave even/odd input floats, then cross-multiply with the
; cos/sin vectors (complex multiply split into two real multiplies).
  840.     movaps   xmm0, [%3+%2*4]
  841.     movaps   xmm1, [%3+%1*4-0x10]
  842.     movaps   xmm2, xmm0
  843.     shufps   xmm0, xmm1, 0x88
  844.     shufps   xmm1, xmm2, 0x77
  845.     movlps   xmm4, [%4+%2*2]
  846.     movlps   xmm5, [%5+%2*2+0x0]
  847.     movhps   xmm4, [%4+%1*2-0x8]
  848.     movhps   xmm5, [%5+%1*2-0x8]
  849.     movaps   xmm2, xmm0
  850.     movaps   xmm3, xmm1
  851.     mulps    xmm0, xmm5
  852.     mulps    xmm1, xmm4
  853.     mulps    xmm2, xmm4
  854.     mulps    xmm3, xmm5
  855.     subps    xmm1, xmm0
  856.     addps    xmm2, xmm3
  857.     movaps   xmm0, xmm1
  858.     unpcklps xmm1, xmm2
  859.     unpckhps xmm0, xmm2
  860. %endif
  861. %endmacro
  862.  
; Complex multiply of the (%2,%3) register pair by the twiddles at
; [%5+%1] / [%6+%1]. Clobbers m6 and m7.
  863. %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
  864.     mulps      m6, %3, [%5+%1]
  865.     mulps      m7, %2, [%5+%1]
  866.     mulps      %2, %2, [%6+%1]
  867.     mulps      %3, %3, [%6+%1]
  868.     subps      %2, %2, m6
  869.     addps      %3, %3, m7
  870.%endmacro
  871.  
; IMDCT post-rotation + reordering, AVX: process 8 complex values from
; each end of the buffer per iteration, complex-multiplying by tcos/tsin
; (CMUL), reversing one stream, and interleaving the results back.
  872. %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
  873. .post:
  874.     vmovaps      ymm1,   [%3+%1*2]
  875.     vmovaps      ymm0,   [%3+%1*2+0x20]
  876.     vmovaps      ymm3,   [%3+%2*2]
  877.     vmovaps      ymm2,   [%3+%2*2+0x20]
  878.  
  879.     CMUL         %1, ymm0, ymm1, %3, %4, %5
  880.     CMUL         %2, ymm2, ymm3, %3, %4, %5
  881.     vshufps      ymm1, ymm1, ymm1, 0x1b   ; reverse within lanes …
  882.     vshufps      ymm3, ymm3, ymm3, 0x1b
  883.     vperm2f128   ymm1, ymm1, ymm1, 0x01   ; … then swap lanes = full reverse
  884.     vperm2f128   ymm3, ymm3, ymm3, 0x01
  885.     vunpcklps    ymm6, ymm2, ymm1
  886.     vunpckhps    ymm4, ymm2, ymm1
  887.     vunpcklps    ymm7, ymm0, ymm3
  888.     vunpckhps    ymm5, ymm0, ymm3
  889.  
  890.     vextractf128 [%3+%1*2],      ymm7, 0
  891.     vextractf128 [%3+%1*2+0x10], ymm5, 0
  892.     vextractf128 [%3+%1*2+0x20], ymm7, 1
  893.     vextractf128 [%3+%1*2+0x30], ymm5, 1
  894.  
  895.     vextractf128 [%3+%2*2],      ymm6, 0
  896.     vextractf128 [%3+%2*2+0x10], ymm4, 0
  897.     vextractf128 [%3+%2*2+0x20], ymm6, 1
  898.     vextractf128 [%3+%2*2+0x30], ymm4, 1
  899.     sub      %2,   0x20
  900.     add      %1,   0x20
  901.     jl       .post
  902. %endmacro
  903.  
; Same post-rotation for SSE: 4 complex values per end per iteration.
  904. %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
  905. .post:
  906.     movaps   xmm1, [%3+%1*2]
  907.     movaps   xmm0, [%3+%1*2+0x10]
  908.     CMUL     %1,   xmm0, xmm1, %3, %4, %5
  909.     movaps   xmm5, [%3+%2*2]
  910.     movaps   xmm4, [%3+%2*2+0x10]
  911.     CMUL     %2,   xmm4, xmm5, %3, %4, %5
  912.     shufps   xmm1, xmm1, 0x1b            ; reverse 4 floats
  913.     shufps   xmm5, xmm5, 0x1b
  914.     movaps   xmm6, xmm4
  915.     unpckhps xmm4, xmm1
  916.     unpcklps xmm6, xmm1
  917.     movaps   xmm2, xmm0
  918.     unpcklps xmm0, xmm5
  919.     unpckhps xmm2, xmm5
  920.     movaps   [%3+%2*2],      xmm6
  921.     movaps   [%3+%2*2+0x10], xmm4
  922.     movaps   [%3+%1*2],      xmm0
  923.     movaps   [%3+%1*2+0x10], xmm2
  924.     sub      %2,   0x10
  925.     add      %1,   0x10
  926.     jl       .post
  927. %endmacro
  928.  
; 3DNow! complex multiply: loads two packed values from [%1+%2*2] and
; multiplies by the twiddles at [%5+%2]/[%6+%2]. Clobbers m6 and m7.
  929. %macro CMUL_3DNOW 6
  930.     mova       m6, [%1+%2*2]
  931.     mova       %3, [%1+%2*2+8]
  932.     mova       %4, m6
  933.     mova       m7, %3
  934.     pfmul      m6, [%5+%2]
  935.     pfmul      %3, [%6+%2]
  936.     pfmul      %4, [%6+%2]
  937.     pfmul      m7, [%5+%2]
  938.     pfsub      %3, m6
  939.     pfadd      %4, m7
  940. %endmacro
  941.  
; 3DNow! post-rotation: two complex values from each end per iteration,
; scattered back dword-by-dword with movd/psrlq in reversed order.
  942. %macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
  943. .post:
  944.     CMUL_3DNOW %3, %1, m0, m1, %4, %5
  945.     CMUL_3DNOW %3, %2, m2, m3, %4, %5
  946.     movd  [%3+%1*2+ 0], m0
  947.     movd  [%3+%2*2+12], m1
  948.     movd  [%3+%2*2+ 0], m2
  949.     movd  [%3+%1*2+12], m3
  950.     psrlq      m0, 32         ; expose the high dword for the second movd
  951.     psrlq      m1, 32
  952.     psrlq      m2, 32
  953.     psrlq      m3, 32
  954.     movd  [%3+%1*2+ 8], m0
  955.     movd  [%3+%2*2+ 4], m1
  956.     movd  [%3+%2*2+ 8], m2
  957.     movd  [%3+%1*2+ 4], m3
  958.     sub        %2, 8
  959.     add        %1, 8
  960.     jl         .post
  961. %endmacro
  962.  
; ff_imdct_half_<cpu>(FFTContext *s, FFTSample *output, const FFTSample *input):
; pre-rotate the input (PREROTATER) scattering through s->revtab, run the
; in-place FFT via FFT_DISPATCH, then post-rotate with the macro passed as
; %1 (POSROTATESHUF / _3DNOW / _AVX). On x86-32 tcos/tsin/revtab pointers
; are spilled to the stack ([esp+8]/[esp+4]/[esp]) since registers run out.
  963. %macro DECL_IMDCT 1
  964. cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
  965. %if ARCH_X86_64
  966. %define rrevtab r7
  967. %define rtcos   r8
  968. %define rtsin   r9
  969. %else
  970. %define rrevtab r6
  971. %define rtsin   r6
  972. %define rtcos   r5
  973. %endif
  974.     mov   r3d, [r0+FFTContext.mdctsize]
  975.     add   r2, r3               ; r2 = input + n (n = mdctsize floats/4)
  976.     shr   r3, 1                ; r3 = n/2
  977.     mov   rtcos, [r0+FFTContext.tcos]
  978.     mov   rtsin, [r0+FFTContext.tsin]
  979.     add   rtcos, r3
  980.     add   rtsin, r3
  981. %if ARCH_X86_64 == 0
  982.     push  rtcos
  983.     push  rtsin
  984. %endif
  985.     shr   r3, 1                ; r3 = n/4
  986.     mov   rrevtab, [r0+FFTContext.revtab]
  987.     add   rrevtab, r3
  988. %if ARCH_X86_64 == 0
  989.     push  rrevtab
  990. %endif
  991.  
  992. %if mmsize == 8
  993.     sub   r3, 2
  994. %else
  995.     sub   r3, 4
  996. %endif
  997. %if ARCH_X86_64 || mmsize == 8
  998.     xor   r4, r4
  999.     sub   r4, r3               ; r4 = -r3 (mirror index)
  1000. %endif
  1001. %if notcpuflag(3dnowext) && mmsize == 8
  1002.     movd  m7, [ps_neg]         ; sign mask for the pfpnacc emulation
  1003. %endif
  1004. .pre:
  1005. %if ARCH_X86_64 == 0
  1006. ;unspill
  1007. %if mmsize != 8
  1008.     xor   r4, r4
  1009.     sub   r4, r3
  1010. %endif
  1011.     mov   rtcos, [esp+8]
  1012.     mov   rtsin, [esp+4]
  1013. %endif
  1014.  
  1015.     PREROTATER r4, r3, r2, rtcos, rtsin
; Scatter the pre-rotated values through revtab into the output buffer.
  1016. %if mmsize == 8
  1017.     mov    r6, [esp]                ; rrevtab = ptr+n8
  1018.     movzx  r5,  word [rrevtab+r4-2] ; rrevtab[j]
  1019.     movzx  r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
  1020.     mova [r1+r5*8], m0
  1021.     mova [r1+r6*8], m2
  1022.     add    r4, 2
  1023.     sub    r3, 2
  1024. %else
  1025. %if ARCH_X86_64
  1026.     movzx  r5,  word [rrevtab+r4-4]
  1027.     movzx  r6,  word [rrevtab+r4-2]
  1028.     movzx  r10, word [rrevtab+r3]
  1029.     movzx  r11, word [rrevtab+r3+2]
  1030.     movlps [r1+r5 *8], xmm0
  1031.     movhps [r1+r6 *8], xmm0
  1032.     movlps [r1+r10*8], xmm1
  1033.     movhps [r1+r11*8], xmm1
  1034.     add    r4, 4
  1035. %else
  1036.     mov    r6, [esp]
  1037.     movzx  r5, word [r6+r4-4]
  1038.     movzx  r4, word [r6+r4-2]
  1039.     movlps [r1+r5*8], xmm0
  1040.     movhps [r1+r4*8], xmm0
  1041.     movzx  r5, word [r6+r3]
  1042.     movzx  r4, word [r6+r3+2]
  1043.     movlps [r1+r5*8], xmm1
  1044.     movhps [r1+r4*8], xmm1
  1045. %endif
  1046.     sub    r3, 4
  1047. %endif
  1048.     jns    .pre
  1049.  
  1050.     mov  r5, r0                ; preserve s across the FFT call
  1051.     mov  r6, r1                ; preserve output
  1052.     mov  r0, r1
  1053.     mov  r1d, [r5+FFTContext.nbits]
  1054.  
  1055.     FFT_DISPATCH SUFFIX, r1
  1056.  
  1057.     mov  r0d, [r5+FFTContext.mdctsize]
  1058.     add  r6, r0                ; r6 = output + n (z+n8 base for post pass)
  1059.     shr  r0, 1
  1060. %if ARCH_X86_64 == 0
  1061. %define rtcos r2
  1062. %define rtsin r3
  1063.     mov  rtcos, [esp+8]
  1064.     mov  rtsin, [esp+4]
  1065. %endif
  1066.     neg  r0
  1067.     mov  r1, -mmsize
  1068.     sub  r1, r0
  1069.     %1 r0, r1, r6, rtcos, rtsin   ; expand the post-rotation loop
  1070. %if ARCH_X86_64 == 0
  1071.     add esp, 12                ; drop the three spilled pointers
  1072. %endif
  1073. %if mmsize == 8
  1074.     femms
  1075. %endif
  1076.     RET
  1077. %endmacro
  1078.  
  1079. DECL_IMDCT POSROTATESHUF
  1080.  
  1081. %if ARCH_X86_32
  1082. INIT_MMX 3dnow
  1083. DECL_IMDCT POSROTATESHUF_3DNOW
  1084.  
  1085. INIT_MMX 3dnowext
  1086. DECL_IMDCT POSROTATESHUF_3DNOW
  1087. %endif
  1088.  
  1089. INIT_YMM avx
  1090.  
  1091. %if HAVE_AVX_EXTERNAL
  1092. DECL_IMDCT POSROTATESHUF_AVX
  1093. %endif
  1094.