#ifndef AVCODEC_PPC_FFT_VSX_H
#define AVCODEC_PPC_FFT_VSX_H
/*
 * FFT transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"

#if HAVE_VSX

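/*
 * Entry points, implemented in fft_vsx.c: ff_fft_calc_interleave_vsx()
 * transforms data in the usual interleaved (re, im) element order, while
 * ff_fft_calc_vsx() is its non-interleaved counterpart (see pass_vsx()
 * below).
 *
 * Usage sketch (hypothetical caller; assumes the FFTContext API declared
 * in libavcodec/fft.h):
 *
 *     FFTContext ctx;
 *     ff_fft_init(&ctx, 4, 0);             // 2^4 = 16-point forward FFT
 *     ff_fft_permute(&ctx, z);             // bit-reversal reordering
 *     ff_fft_calc_interleave_vsx(&ctx, z); // in-place transform of z[0..15]
 */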
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);

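/* Byte offsets of the 2nd, 4th, ..., 14th complex element
 * (sizeof(FFTComplex) is 8 with single-precision FFTSample). */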
#define byte_2complex (2*sizeof(FFTComplex))
#define byte_4complex (4*sizeof(FFTComplex))
#define byte_6complex (6*sizeof(FFTComplex))
#define byte_8complex (8*sizeof(FFTComplex))
#define byte_10complex (10*sizeof(FFTComplex))
#define byte_12complex (12*sizeof(FFTComplex))
#define byte_14complex (14*sizeof(FFTComplex))

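/*
 * One split-radix recombination pass over interleaved data: merges the
 * sub-transforms at z, z+o1, z+o2 and z+o3 using the twiddle factors
 * wre[]/wim[], four complex elements (two vectors) per iteration; compare
 * the scalar pass() in libavcodec/fft_template.c.
 */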
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample *out = (FFTSample *)z;
    const FFTSample *wim = wre+o1;
    vec_f vz0, vzo1, vzo2, vzo3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
    vec_f y0, y1, y2, y3;
    vec_f y4, y5, y8, y9;
    vec_f y10, y13, y14, y15;
    vec_f y16, y17, y18, y19;
    vec_f y20, y21, y22, y23;
    vec_f wr1, wi1, wr0, wi0;
    vec_f wr2, wi2, wr3, wi3;
    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);
    vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
    vzo2plus1 = vec_ld(i2+16, &(out[0]));
    vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
    vzo3plus1 = vec_ld(i3+16, &(out[0]));
    vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
    vz0plus1 = vec_ld(16, &(out[0]));
    vzo1 = vec_ld(i1, &(out[0]));  // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
    vzo1plus1 = vec_ld(i1+16, &(out[0]));

    x0 = vec_add(vzo2, vzo3);
    x1 = vec_sub(vzo2, vzo3);
    y0 = vec_add(vzo2plus1, vzo3plus1);
    y1 = vec_sub(vzo2plus1, vzo3plus1);

    wr1 = vec_splats(wre[1]);
    wi1 = vec_splats(wim[-1]);
    wi2 = vec_splats(wim[-2]);
    wi3 = vec_splats(wim[-3]);
    wr2 = vec_splats(wre[2]);
    wr3 = vec_splats(wre[3]);

    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));

    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));

    ymulwi2 = vec_mul(y4, wi2);
    ymulwi3 = vec_mul(y5, wi3);
    x4 = vec_mul(x2, wr1);
    x5 = vec_mul(x3, wi1);
    y8 = vec_madd(y2, wr2, ymulwi2);
    y9 = vec_msub(y2, wr2, ymulwi2);
    x6 = vec_add(x4, x5);
    x7 = vec_sub(x4, x5);
    y13 = vec_madd(y3, wr3, ymulwi3);
    y14 = vec_msub(y3, wr3, ymulwi3);

    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));

    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

    x11 = vec_add(vz0, x9);
    x12 = vec_sub(vz0, x9);
    x13 = vec_add(vzo1, x10);
    x14 = vec_sub(vzo1, x10);

    y18 = vec_add(vz0plus1, y16);
    y19 = vec_sub(vz0plus1, y16);
    y20 = vec_add(vzo1plus1, y17);
    y21 = vec_sub(vzo1plus1, y17);

    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

    vec_st(x11, 0, &(out[0]));
    vec_st(y18, 16, &(out[0]));
    vec_st(x15, i1, &(out[0]));
    vec_st(y22, i1+16, &(out[0]));
    vec_st(x12, i2, &(out[0]));
    vec_st(y19, i2+16, &(out[0]));
    vec_st(x16, i3, &(out[0]));
    vec_st(y23, i3+16, &(out[0]));

    do {
        out += 8;
        wre += 4;
        wim -= 4;
        wr0 = vec_splats(wre[0]);
        wr1 = vec_splats(wre[1]);
        wi0 = vec_splats(wim[0]);
        wi1 = vec_splats(wim[-1]);

        wr2 = vec_splats(wre[2]);
        wr3 = vec_splats(wre[3]);
        wi2 = vec_splats(wim[-2]);
        wi3 = vec_splats(wim[-3]);

        vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
        vzo2plus1 = vec_ld(i2+16, &(out[0]));
        vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
        vzo3plus1 = vec_ld(i3+16, &(out[0]));
        vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
        vz0plus1 = vec_ld(16, &(out[0]));
        vzo1 = vec_ld(i1, &(out[0]));  // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
        vzo1plus1 = vec_ld(i1+16, &(out[0]));

        x0 = vec_add(vzo2, vzo3);
        x1 = vec_sub(vzo2, vzo3);

        y0 = vec_add(vzo2plus1, vzo3plus1);
        y1 = vec_sub(vzo2plus1, vzo3plus1);

        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));

        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
        xmulwi0 = vec_mul(x4, wi0);
        xmulwi1 = vec_mul(x5, wi1);

        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));

        x8 = vec_madd(x2, wr0, xmulwi0);
        x9 = vec_msub(x2, wr0, xmulwi0);
        ymulwi2 = vec_mul(y4, wi2);
        ymulwi3 = vec_mul(y5, wi3);

        x13 = vec_madd(x3, wr1, xmulwi1);
        x14 = vec_msub(x3, wr1, xmulwi1);

        y8 = vec_madd(y2, wr2, ymulwi2);
        y9 = vec_msub(y2, wr2, ymulwi2);
        y13 = vec_madd(y3, wr3, ymulwi3);
        y14 = vec_msub(y3, wr3, ymulwi3);

        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));

        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));

        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

        x18 = vec_add(vz0, x16);
        x19 = vec_sub(vz0, x16);
        x20 = vec_add(vzo1, x17);
        x21 = vec_sub(vzo1, x17);

        y18 = vec_add(vz0plus1, y16);
        y19 = vec_sub(vz0plus1, y16);
        y20 = vec_add(vzo1plus1, y17);
        y21 = vec_sub(vzo1plus1, y17);

        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));

        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

        vec_st(x18, 0, &(out[0]));
        vec_st(y18, 16, &(out[0]));
        vec_st(x22, i1, &(out[0]));
        vec_st(y22, i1+16, &(out[0]));
        vec_st(x19, i2, &(out[0]));
        vec_st(y19, i2+16, &(out[0]));
        vec_st(x23, i3, &(out[0]));
        vec_st(y23, i3+16, &(out[0]));
    } while (n-=2);
}

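/* 2-point transform: a single scalar butterfly; too small to benefit
 * from vectorization. */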
inline static void fft2_vsx_interleave(FFTComplex *z)
{
    FFTSample r1, i1;

    r1 = z[0].re - z[1].re;
    z[0].re += z[1].re;
    z[1].re = r1;

    i1 = z[0].im - z[1].im;
    z[0].im += z[1].im;
    z[1].im = i1;
}

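/* 4-point FFT on interleaved data: two radix-2 stages built from vector
 * permutes and add/sub. */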
inline static void fft4_vsx_interleave(FFTComplex *z)
{
    vec_f a, b, c, d;
    float *out = (float *)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);
    vec_st(a, 0, &(out[0]));
    vec_st(b, byte_2complex, &(out[0]));
}

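/* 8-point FFT on interleaved data; the only irrational twiddle component
 * needed is sqrthalf (cos(M_PI/4)), splatted into vc1. */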
inline static void fft8_vsx_interleave(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34;

    float *out = (float *)z;
    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));

    x4 = vec_add(x0, x1);
    x5 = vec_sub(x0, x1);
    x6 = vec_add(x2, x3);
    x7 = vec_sub(x2, x3);

    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));

    x12 = vec_add(x8, x9);
    x13 = vec_sub(x8, x9);
    x14 = vec_add(x10, x11);
    x15 = vec_sub(x10, x11);
    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
    x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
    x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i

    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
    x24 = vec_add(x22, x23);
    x25 = vec_sub(x22, x23);
    x26 = vec_mul(vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);

    x27 = vec_add(x21, x26); // z1.r  z7.r  z1.i  z3.i
    x28 = vec_sub(x21, x26); // z5.r  z3.r  z5.i  z7.i

    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2));  // z0.r  z0.i  z1.r  z1.i
    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3));  // z2.r  z2.i  z7.r  z3.i
    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2));  // z4.r  z4.i  z5.r  z5.i
    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3));  // z6.r  z6.i  z3.r  z7.i
    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));   // z2.r  z2.i  z3.r  z3.i
    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i

    vec_st(x29, 0, &(out[0]));
    vec_st(x33, byte_2complex, &(out[0]));
    vec_st(x31, byte_4complex, &(out[0]));
    vec_st(x34, byte_6complex, &(out[0]));
}

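/* 16-point FFT on interleaved data; vc1/vc2 splat ff_cos_16[1] and
 * ff_cos_16[3], i.e. cos(M_PI/8) and cos(3*M_PI/8). */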
inline static void fft16_vsx_interleave(FFTComplex *z)
{
    float *out = (float *)z;
    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34, x35;
    vec_f x36, x37, x38, x39;
    vec_f x40, x41, x42, x43;
    vec_f x44, x45, x46, x47;
    vec_f x48, x49, x50, x51;
    vec_f x52, x53, x54, x55;
    vec_f x56, x57, x58, x59;
    vec_f x60, x61, x62, x63;
    vec_f x64, x65, x66, x67;
    vec_f x68, x69, x70, x71;
    vec_f x72, x73, x74, x75;
    vec_f x76, x77, x78, x79;
    vec_f x80, x81, x82, x83;
    vec_f x84, x85, x86;

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz4 = vec_ld(byte_8complex, &(out[0]));
    vz5 = vec_ld(byte_10complex, &(out[0]));
    vz6 = vec_ld(byte_12complex, &(out[0]));
    vz7 = vec_ld(byte_14complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));

    x8 = vec_add(x0, x1);
    x9 = vec_sub(x0, x1);
    x10 = vec_add(x2, x3);
    x11 = vec_sub(x2, x3);

    x12 = vec_add(x4, x5);
    x13 = vec_sub(x4, x5);
    x14 = vec_add(x6, x7);
    x15 = vec_sub(x6, x7);

    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
    x20 = vec_perm(x12, x14, vcprm(0,1,s0,s1));
    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));

    x24 = vec_add(x16, x17);
    x25 = vec_sub(x16, x17);
    x26 = vec_add(x18, x19);
    x27 = vec_sub(x18, x19);
    x28 = vec_add(x20, x21);
    x29 = vec_sub(x20, x21);
    x30 = vec_add(x22, x23);
    x31 = vec_sub(x22, x23);

    x32 = vec_add(x24, x26);
    x33 = vec_sub(x24, x26);
    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));

    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
    x37 = vec_add(x35, x36);
    x38 = vec_sub(x35, x36);
    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));

    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
    x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
    x42 = vec_add(x40, x41);
    x43 = vec_sub(x40, x41);
    x44 = vec_mul(x42, vc0);
    x45 = vec_mul(x43, vc0);

    x46 = vec_add(x34, x39);  // z0.r  z0.i  z4.r  z4.i
    x47 = vec_sub(x34, x39);  // z8.r  z8.i  z12.r  z12.i

    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
    x50 = vec_add(x48, x49);
    x51 = vec_sub(x48, x49);
    x52 = vec_mul(x50, vc1);
    x53 = vec_mul(x50, vc2);
    x54 = vec_mul(x51, vc1);
    x55 = vec_mul(x51, vc2);

    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
    x58 = vec_add(x56, x57);
    x59 = vec_sub(x56, x57);

    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
    x62 = vec_add(x52, x61);
    x63 = vec_sub(x52, x61);
    x64 = vec_add(x60, x53);
    x65 = vec_sub(x60, x53);
    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));

    x68 = vec_add(x58, x66); // z1.r  z1.i  z3.r  z3.i
    x69 = vec_sub(x58, x66); // z9.r  z9.i  z11.r  z11.i
    x70 = vec_add(x59, x67); // z5.r  z5.i  z15.r  z15.i
    x71 = vec_sub(x59, x67); // z13.r  z13.i  z7.r  z7.i

    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
    x73 = vec_add(x25, x72);
    x74 = vec_sub(x25, x72);
    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
    x77 = vec_add(x75, x76); // z2.r  z2.i  z6.r  z6.i
    x78 = vec_sub(x75, x76); // z10.r  z10.i  z14.r  z14.i

    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
    vec_st(x79, 0, &(out[0]));
    vec_st(x80, byte_2complex, &(out[0]));
    vec_st(x81, byte_4complex, &(out[0]));
    vec_st(x82, byte_6complex, &(out[0]));
    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
    vec_st(x83, byte_8complex, &(out[0]));
    vec_st(x84, byte_10complex, &(out[0]));
    vec_st(x85, byte_12complex, &(out[0]));
    vec_st(x86, byte_14complex, &(out[0]));
}

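/*
 * Non-interleaved variants of the transforms above, used by
 * ff_fft_calc_vsx(); they keep the data in a different intra-vector
 * element order than the *_interleave versions (cf. the r0/i0 layout
 * comments in pass_vsx() below).
 */

/* 4-point FFT, non-interleaved layout. */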
inline static void fft4_vsx(FFTComplex *z)
{
    vec_f a, b, c, d;
    float *out = (float *)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,s0,1,s1));
    d = vec_perm(a, b, vcprm(2,s3,3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s2,s3));

    vec_st(c, 0, &(out[0]));
    vec_st(d, byte_2complex, &(out[0]));
}

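/* 8-point FFT, non-interleaved layout. */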
inline static void fft8_vsx(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7, vz8;

    float *out = (float *)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);
    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz8, vc2, vz3);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz4, vz6);
    vz3 = vec_sub(vz5, vz7);

    vz0 = vec_add(vz4, vz6);
    vz1 = vec_add(vz5, vz7);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
}

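/* 16-point FFT, non-interleaved layout; vc3..vc5 pack the remaining
 * twiddles: 0.92387953 = cos(M_PI/8), 0.38268343 = sin(M_PI/8). */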
inline static void fft16_vsx(FFTComplex *z)
{
    float *out = (float *)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};

    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f vz8, vz9, vz10, vz11;
    vec_f vz12, vz13;

    vz0 = vec_ld(byte_8complex, &(out[0]));
    vz1 = vec_ld(byte_10complex, &(out[0]));
    vz2 = vec_ld(byte_12complex, &(out[0]));
    vz3 = vec_ld(byte_14complex, &(out[0]));

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));

    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);
    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz12, vc2, vz3);
    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);
    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);

    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz8, vz10);
    vz3 = vec_sub(vz9, vz11);
    vz0 = vec_add(vz8, vz10);
    vz1 = vec_add(vz9, vz11);

    vz8 = vec_madd(vz4, vc3, vc0);
    vz9 = vec_madd(vz5, vc3, vc0);
    vz10 = vec_madd(vz6, vc3, vc0);
    vz11 = vec_madd(vz7, vc3, vc0);

    vz8 = vec_madd(vz5, vc4, vz8);
    vz9 = vec_madd(vz4, vc5, vz9);
    vz10 = vec_madd(vz7, vc5, vz10);
    vz11 = vec_madd(vz6, vc4, vz11);

    vz12 = vec_sub(vz10, vz8);
    vz10 = vec_add(vz10, vz8);

    vz13 = vec_sub(vz9, vz11);
    vz11 = vec_add(vz9, vz11);

    vz4 = vec_sub(vz0, vz10);
    vz0 = vec_add(vz0, vz10);

    vz7 = vec_sub(vz3, vz12);
    vz3 = vec_add(vz3, vz12);

    vz5 = vec_sub(vz1, vz11);
    vz1 = vec_add(vz1, vz11);

    vz6 = vec_sub(vz2, vz13);
    vz2 = vec_add(vz2, vz13);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    vec_st(vz4, byte_8complex, &(out[0]));
    vec_st(vz5, byte_10complex, &(out[0]));
    vec_st(vz6, byte_12complex, &(out[0]));
    vec_st(vz7, byte_14complex, &(out[0]));
}
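
/*
 * One split-radix recombination pass over non-interleaved data: each
 * quarter of the working set is held as four real parts followed by four
 * imaginary parts (the r0/i0 ... r3/i3 comments below).
 */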
inline static void pass_vsx(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample *out = (FFTSample *)z;
    const FFTSample *wim = wre+o1;
    vec_f v0, v1, v2, v3;
    vec_f v4, v5, v6, v7;
    vec_f v8, v9, v10, v11;
    vec_f v12, v13;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);

    v8 = vec_ld(0, &(wre[0]));
    v10 = vec_ld(0, &(wim[0]));
    v9 = vec_ld(0, &(wim[-4]));
    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

    v4 = vec_ld(i2, &(out[0]));
    v5 = vec_ld(i2+16, &(out[0]));
    v6 = vec_ld(i3, &(out[0]));
    v7 = vec_ld(i3+16, &(out[0]));
    v10 = vec_mul(v4, v8); // r2*wre
    v11 = vec_mul(v5, v8); // i2*wre
    v12 = vec_mul(v6, v8); // r3*wre
    v13 = vec_mul(v7, v8); // i3*wre

    v0 = vec_ld(0, &(out[0])); // r0
    v3 = vec_ld(i1+16, &(out[0])); // i1
    v10 = vec_madd(v5, v9, v10); // r2*wim
    v11 = vec_nmsub(v4, v9, v11); // i2*wim
    v12 = vec_nmsub(v7, v9, v12); // r3*wim
    v13 = vec_madd(v6, v9, v13); // i3*wim

    v1 = vec_ld(16, &(out[0])); // i0
    v2 = vec_ld(i1, &(out[0])); // r1
    v8 = vec_sub(v12, v10);
    v12 = vec_add(v12, v10);
    v9 = vec_sub(v11, v13);
    v13 = vec_add(v11, v13);
    v4 = vec_sub(v0, v12);
    v0 = vec_add(v0, v12);
    v7 = vec_sub(v3, v8);
    v3 = vec_add(v3, v8);

    vec_st(v0, 0, &(out[0])); // r0
    vec_st(v3, i1+16, &(out[0])); // i1
    vec_st(v4, i2, &(out[0])); // r2
    vec_st(v7, i3+16, &(out[0])); // i3

    v5 = vec_sub(v1, v13);
    v1 = vec_add(v1, v13);
    v6 = vec_sub(v2, v9);
    v2 = vec_add(v2, v9);

    vec_st(v1, 16, &(out[0])); // i0
    vec_st(v2, i1, &(out[0])); // r1
    vec_st(v5, i2+16, &(out[0])); // i2
    vec_st(v6, i3, &(out[0])); // r3

    do {
        out += 8;
        wre += 4;
        wim -= 4;

        v8 = vec_ld(0, &(wre[0]));
        v10 = vec_ld(0, &(wim[0]));
        v9 = vec_ld(0, &(wim[-4]));
        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

        v4 = vec_ld(i2, &(out[0])); // r2
        v5 = vec_ld(i2+16, &(out[0])); // i2
        v6 = vec_ld(i3, &(out[0])); // r3
        v7 = vec_ld(i3+16, &(out[0])); // i3
        v10 = vec_mul(v4, v8); // r2*wre
        v11 = vec_mul(v5, v8); // i2*wre
        v12 = vec_mul(v6, v8); // r3*wre
        v13 = vec_mul(v7, v8); // i3*wre

        v0 = vec_ld(0, &(out[0])); // r0
        v3 = vec_ld(i1+16, &(out[0])); // i1
        v10 = vec_madd(v5, v9, v10); // r2*wim
        v11 = vec_nmsub(v4, v9, v11); // i2*wim
        v12 = vec_nmsub(v7, v9, v12); // r3*wim
        v13 = vec_madd(v6, v9, v13); // i3*wim

        v1 = vec_ld(16, &(out[0])); // i0
        v2 = vec_ld(i1, &(out[0])); // r1
        v8 = vec_sub(v12, v10);
        v12 = vec_add(v12, v10);
        v9 = vec_sub(v11, v13);
        v13 = vec_add(v11, v13);
        v4 = vec_sub(v0, v12);
        v0 = vec_add(v0, v12);
        v7 = vec_sub(v3, v8);
        v3 = vec_add(v3, v8);

        vec_st(v0, 0, &(out[0])); // r0
        vec_st(v3, i1+16, &(out[0])); // i1
        vec_st(v4, i2, &(out[0])); // r2
        vec_st(v7, i3+16, &(out[0])); // i3

        v5 = vec_sub(v1, v13);
        v1 = vec_add(v1, v13);
        v6 = vec_sub(v2, v9);
        v2 = vec_add(v2, v9);

        vec_st(v1, 16, &(out[0])); // i0
        vec_st(v2, i1, &(out[0])); // r1
        vec_st(v5, i2+16, &(out[0])); // i2
        vec_st(v6, i3, &(out[0])); // r3
    } while (n-=2);
}

#endif /* HAVE_VSX */

#endif /* AVCODEC_PPC_FFT_VSX_H */