;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size,
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x for the 8-byte 3DNow registers)

%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

SECTION_RODATA 32

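; Partial mirror of the C-side FFTContext: field order and sizes are assumed to
; match libavcodec/fft.h so that the offsets used below stay correct.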
struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
    %define pointer dq
%else
    %define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION_TEXT

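; %1 = %3 + %4, %2 = %3 - %4: one radix-2 butterfly on a single packed {re,im}
; pair held in 8-byte 3DNow registers.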
%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

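; 4-point butterfly on packed {re,im} pairs (3DNow). The two halves of %3 are
; swapped by bouncing the low dword through [r0+12], which is presumably safe to
; clobber here since Z(1) has already been loaded at every call site.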
%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    movd [r0+12], %3
    punpckhdq %3, [r0+8]
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro

;  in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2]  ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k  = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1]  ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro

; In SSE mode do one FFT8 transform
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8 transforms
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

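; PASS_SMALL/PASS_BIG perform one combining step of the split-radix recursion:
; Z(0..3) hold {r0,i0,r1,i1} of the already transformed even half, Z(4), Z(5),
; Z2(6), Z2(7) hold {r2,i2,r3,i3} of the two odd quarters, and wre/wim are the
; twiddle factors; results are written back in place.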
; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova  Z2(6), m4
    mova   Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova  Z2(7), m2
    mova   Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova   Z(5), m4
    mova   Z(1), m3
%endmacro

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova   Z(4), m3
    mova   Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova  Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova  Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova  Z(5), m6
IF%1 mova  Z(1), m5
%if %1==0
    INTERL m1, m3, m7, Z, 2
    INTERL m2, m4, m0, Z2, 6

    mova     m1, Z(0)
    mova     m2, Z(4)

    INTERL m5, m1, m3, Z, 0
    INTERL m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
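; For the small unrolled FFTs below, r0 points at the FFTComplex buffer and the
; {re,im} blocks are addressed linearly; ZH(x) is the upper half of block x.
; Z()/Z2()/ZH() are redefined further down for the generic passes.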

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
    mova      m0, Z(0)
    mova      m1, Z(1)
    T8_AVX    m0, m1, m2, m3, m4
    mova      Z(0), m0
    mova      Z(1), m1
    ret


align 16
fft16_avx:
    mova       m2, Z(2)
    mova       m3, Z(3)
    T4_SSE     m2, m3, m7

    mova       m0, Z(0)
    mova       m1, Z(1)
    T8_AVX     m0, m1, m4, m5, m7

    mova       m4, [ps_cos16_1]
    mova       m5, [ps_cos16_2]
    vmulps     m6, m2, m4
    vmulps     m7, m3, m5
    vaddps     m7, m7, m6
    vmulps     m2, m2, m5
    vmulps     m3, m3, m4
    vsubps     m3, m3, m2
    vblendps   m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21
    vaddps     m4, m2, m3
    vsubps     m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps     m3, m1, m2
    vaddps     m1, m1, m2
    vsubps     m5, m0, m4
    vaddps     m0, m0, m4
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    vextractf128   Z(2), m5, 0
    vextractf128  ZH(2), m3, 0
    vextractf128   Z(3), m5, 1
    vextractf128  ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova m0, Z(4)
    mova m1, Z(5)

    T4_SSE      m0, m1, m4

    mova m2, Z(6)
    mova m3, Z(7)

    T8_SSE      m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128  m4, m0, m2, 0x20
    vperm2f128  m5, m1, m3, 0x20
    vperm2f128  m6, m0, m2, 0x31
    vperm2f128  m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]

    ret

fft32_interleave_avx:
    call fft32_avx
    mov r2d, 32
.deint_loop:
    mova     m2, Z(0)
    mova     m3, Z(1)
    vunpcklps      m0, m2, m3
    vunpckhps      m1, m2, m3
    vextractf128   Z(0), m0, 0
    vextractf128  ZH(0), m1, 0
    vextractf128   Z(1), m0, 1
    vextractf128  ZH(1), m1, 1
    add r0, mmsize*2
    sub r2d, mmsize/4
    jg .deint_loop
    ret

%endif

INIT_XMM sse

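; The small kernels below transform the buffer at r0 in place and leave the data
; in the blocked {4x re, 4x im} layout described at the top of the file.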
align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova   Z(0), m0
    mova   Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova   Z(0), m0
    mova   Z(1), m1
    mova   Z(2), m2
    mova   Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret


%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova   Z(0), m0
    mova   Z(1), m4
    mova   Z(2), m2
    mova   Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova   Z(0), m0
    mova   Z(2), m2
    T2_3DNOW m4, m5,  Z(4),  Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD   m0, m5
    PSWAPD   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova   Z(5), m5
    mova  Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova   Z(0), m0
    mova   Z(1), m5
    mova   Z(2), m2
    mova   Z(3), m7
    PUNPCK   m4,  Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova   Z(4), m4
    mova   Z(5), m5
    mova  Z2(6), m6
    mova  Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW

INIT_MMX 3dnow
FFT48_3DNOW
%endif

%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
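; Addressing for the generic passes: zcq walks the data while o1q*(x&6) and o3q
; select one of the four quarters of the current block, with mmsize*(x&1)
; picking the {re} or {im} vector inside it. Z2() presumably exists because a
; *6 scale cannot be encoded, so o3q holds the precomputed o1q*6.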

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea o3q, [nq*3]
    lea o1q, [nq*8]
    shl o3q, 4
.loop:
    %2
    add zcq, mmsize*2
    add  wq, mmsize
    sub  nd, mmsize/8
    jg .loop
    rep ret
%endmacro

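; Call the fft function for the given transform size: dispatch_tab%1 is indexed
; by nbits-2 (the first entry is the 4-point FFT); with PIC the table holds
; section-relative pointers that are rebased against $$ here.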
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH

INIT_YMM avx

%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
    vunpckhps      %3, %2, %1
    vunpcklps      %2, %2, %1
    vextractf128   %4(%5), %2, 0
    vextractf128  %4 %+ H(%5), %3, 0
    vextractf128   %4(%5 + 1), %2, 1
    vextractf128  %4 %+ H(%5 + 1), %3, 1
%endmacro

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

%endif

INIT_XMM sse

%macro INTERL_SSE 5
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova  %4(%5), %2
    mova  %4(%5+1), %3
%endmacro

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

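; fft_calc(FFTContext *s, FFTComplex *z): run the interleaved FFT dispatch on z.
; Sizes small enough to be handled entirely by the unrolled fft4/8/16 kernels
; come back in the blocked {re,im} layout, so the loop below interleaves them
; back into FFTComplex order.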
%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 3+(mmsize/16)
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
.loop:
%if mmsize == 8
    PSWAPD  m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
%endif
    add      r2, mmsize*2
    jl       .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC

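; fft_permute(FFTContext *s, FFTComplex *z): scatter z into tmpbuf through the
; revtab bit-reversal table, then copy the permuted data back over z.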
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
%if ARCH_X86_32
    mov     r1, r1m
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
    add     r1, r2
    add     r5, r2
    neg     r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps  xmm0, [r5 + r2]
    movaps  xmm1, [r5 + r2 + 16]
    movaps  [r1 + r2], xmm0
    movaps  [r1 + r2 + 16], xmm1
    add     r2, 32
    jl      .loopcopy
    REP_RET

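; imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input): compute
; the half IMDCT via the imdcthalf function pointer into the middle of the
; output buffer, then mirror it out to both ends (negating one side) to
; reconstruct the full-length IMDCT.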
%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
%if ARCH_X86_32
    push    r2
    push    r1
    push    r0
%else
    sub     rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, mmsize
    neg     r2
    mova    m2, [ps_m1m1m1m1]
.loop:
%if mmsize == 8
    PSWAPD  m0, [r1 + r3]
    PSWAPD  m1, [r0 + r2]
    pxor    m0, m2
%else
    mova    m0, [r1 + r3]
    mova    m1, [r0 + r2]
    shufps  m0, m0, 0x1b
    shufps  m1, m1, 0x1b
    xorps   m0, m2
%endif
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub     r3, mmsize
    add     r2, mmsize
    jl      .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif

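; Build the fft<N> entry points for N = 2^nbits up to 65536: each one is the
; split-radix recursion fft(N) = fft(N/2) + 2x fft(N/4) followed by a combining
; pass, and all addresses are collected into dispatch_tab for FFT_DISPATCH.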
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub r0, n*6 + (n2&(-2<<%1))
    lea r1, [cos_ %+ n]
    mov r2d, n4/2
    jmp pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

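; Pre-rotation step of imdct_half: complex-multiply two mirrored input pairs by
; the (tcos, tsin) twiddles. Results are left in m0/m2 (3DNow) or xmm0/xmm1
; (SSE), ready to be scattered into the FFT buffer via revtab.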
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD     m0, [%3+%2*4]
    movq       m2, [%3+%1*4-8]
    movq       m3, m0
    punpckldq  m0, m2
    punpckhdq  m2, m3
    movd       m1, [%4+%1*2-4] ; tcos[j]
    movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq  m1, [%5+%1*2-4] ; tsin[j]
    punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova       m4, m0
    PSWAPD     m5, m1
    pfmul      m0, m1
    pfmul      m4, m5
    mova       m6, m2
    PSWAPD     m5, m3
    pfmul      m2, m3
    pfmul      m6, m5
%if cpuflag(3dnowext)
    pfpnacc    m0, m4
    pfpnacc    m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor       m4, m7
    pxor       m6, m7
    pfadd      m0, m4
    pfadd      m2, m6
%endif
%else
    movaps   xmm0, [%3+%2*4]
    movaps   xmm1, [%3+%1*4-0x10]
    movaps   xmm2, xmm0
    shufps   xmm0, xmm1, 0x88
    shufps   xmm1, xmm2, 0x77
    movlps   xmm4, [%4+%2*2]
    movlps   xmm5, [%5+%2*2+0x0]
    movhps   xmm4, [%4+%1*2-0x8]
    movhps   xmm5, [%5+%1*2-0x8]
    movaps   xmm2, xmm0
    movaps   xmm3, xmm1
    mulps    xmm0, xmm5
    mulps    xmm1, xmm4
    mulps    xmm2, xmm4
    mulps    xmm3, xmm5
    subps    xmm1, xmm0
    addps    xmm2, xmm3
    movaps   xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endif
%endmacro

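; Complex multiply of the packed pair (%2, %3) by the twiddles loaded from
; [%5+%1] and [%6+%1]; m6 and m7 are clobbered as temporaries.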
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%endmacro

%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    vmovaps      ymm1,   [%3+%1*2]
    vmovaps      ymm0,   [%3+%1*2+0x20]
    vmovaps      ymm3,   [%3+%2*2]
    vmovaps      ymm2,   [%3+%2*2+0x20]

    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3

    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1

    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub      %2,   0x20
    add      %1,   0x20
    jl       .post
%endmacro

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1,   xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2,   xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2,   0x10
    add      %1,   0x10
    jl       .post
%endmacro

%macro CMUL_3DNOW 6
    mova       m6, [%1+%2*2]
    mova       %3, [%1+%2*2+8]
    mova       %4, m6
    mova       m7, %3
    pfmul      m6, [%5+%2]
    pfmul      %3, [%6+%2]
    pfmul      %4, [%6+%2]
    pfmul      m7, [%5+%2]
    pfsub      %3, m6
    pfadd      %4, m7
%endmacro

%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd  [%3+%1*2+ 0], m0
    movd  [%3+%2*2+12], m1
    movd  [%3+%2*2+ 0], m2
    movd  [%3+%1*2+12], m3
    psrlq      m0, 32
    psrlq      m1, 32
    psrlq      m2, 32
    psrlq      m3, 32
    movd  [%3+%1*2+ 8], m0
    movd  [%3+%2*2+ 4], m1
    movd  [%3+%2*2+ 8], m2
    movd  [%3+%1*2+ 4], m3
    sub        %2, 8
    add        %1, 8
    jl         .post
%endmacro

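; imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input):
; pre-rotate the input by tcos/tsin and scatter it into output in revtab order,
; run the in-place FFT on it, then apply the post-rotation and shuffle passed
; in as %1 (one of the POSROTATESHUF* macros above).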
%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov   r3d, [r0+FFTContext.mdctsize]
    add   r2, r3
    shr   r3, 1
    mov   rtcos, [r0+FFTContext.tcos]
    mov   rtsin, [r0+FFTContext.tsin]
    add   rtcos, r3
    add   rtsin, r3
%if ARCH_X86_64 == 0
    push  rtcos
    push  rtsin
%endif
    shr   r3, 1
    mov   rrevtab, [r0+FFTContext.revtab]
    add   rrevtab, r3
%if ARCH_X86_64 == 0
    push  rrevtab
%endif

%if mmsize == 8
    sub   r3, 2
%else
    sub   r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor   r4, r4
    sub   r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd  m7, [ps_m1m1m1m1]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor   r4, r4
    sub   r4, r3
%endif
    mov   rtcos, [esp+8]
    mov   rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov    r6, [esp]                ; rrevtab = ptr+n8
    movzx  r5,  word [rrevtab+r4-2] ; rrevtab[j]
    movzx  r6,  word [rrevtab+r3]   ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add    r4, 2
    sub    r3, 2
%else
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
%endif
    jns    .pre

    mov  r5, r0
    mov  r6, r1
    mov  r0, r1
    mov  r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov  r0d, [r5+FFTContext.mdctsize]
    add  r6, r0
    shr  r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov  rtcos, [esp+8]
    mov  rtsin, [esp+4]
%endif
    neg  r0
    mov  r1, -mmsize
    sub  r1, r0
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro

DECL_IMDCT POSROTATESHUF

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW

INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW
%endif

INIT_YMM avx

  1091. DECL_IMDCT POSROTATESHUF_AVX
  1092. %endif
  1093.