Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* 32 point SSE-optimized DCT transform
  3. ;* Copyright (c) 2010 Vitor Sessak
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21.  
  22. %include "libavutil/x86/x86util.asm"
  23.  
  ; Read-only multiplier table for the butterfly passes of the 32-point DCT.
  ; Every `dd` row below is four 32-bit floats (16 bytes). The code in this file
  ; addresses the table by byte offset:
  ;   +0, +16, +32, +48   pass 1
  ;   +64, +80            pass 2
  ;   +96   (+112)        pass 3
  ;   +128  (+144)        pass 4
  ;   +160  (+176)        pass 5
  ;   +192                BUTTERFLY3V (x86_64 pass 5)
  ; Offsets in parentheses are identical copies of the preceding row, so the
  ; 32-byte loads in the AVX function get the same four values in both 128-bit
  ; halves.
  24. SECTION_RODATA 32
  25.  
  26. align 32
  27. ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043 ; +0
  28.             dd   0.553104,  0.582935,  0.622504,  0.674808 ; +16
  29.             dd -10.190008, -3.407609, -2.057781, -1.484165 ; +32
  30.             dd  -1.169440, -0.972568, -0.839350, -0.744536 ; +48
  31.             dd   0.502419,  0.522499,  0.566944,  0.646822 ; +64
  32.             dd   0.788155,  1.060678,  1.722447,  5.101149 ; +80
  33.             dd   0.509796,  0.601345,  0.899976,  2.562916 ; +96
  34.             dd   0.509796,  0.601345,  0.899976,  2.562916 ; +112 (copy of +96)
  35.             dd   1.000000,  1.000000,  1.306563,  0.541196 ; +128
  36.             dd   1.000000,  1.000000,  1.306563,  0.541196 ; +144 (copy of +128)
  37.             dd   1.000000,  0.707107,  1.000000, -0.707107 ; +160
  38.             dd   1.000000,  0.707107,  1.000000, -0.707107 ; +176 (copy of +160)
  39.             dd   0.707107,  0.707107,  0.707107,  0.707107 ; +192
  40.  
  ; XOR mask with bit 31 (0x80000000) set in elements 2 and 3 of each group of
  ; four dwords and clear in elements 0 and 1. The functions below load it into a
  ; register and pass it to BUTTERFLY0, which applies it with xorps; pass 5
  ; reshuffles the loaded mask with shufps 0xcc before reuse.
  41. align 32
  42. ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
  43.  
  ; BUTTERFLY a, b, coef, tmp
  ;   tmp = a - b
  ;   b   = b + a
  ;   a   = tmp * coef
  ; On exit: b holds the sums, a the scaled differences, tmp the unscaled
  ; differences (clobbered). coef may be a register or a memory operand.
  ; Written in three-operand form; for the non-AVX instantiations this
  ; presumably relies on the instruction wrappers pulled in via x86util.asm
  ; (not visible here).
  44. %macro BUTTERFLY 4
  45.     subps  %4, %1, %2
  46.     addps  %2, %2, %1
  47.     mulps  %1, %4, %3
  48. %endmacro
  49.  
  ; BUTTERFLY0 a, mask, coef, tmp, imm
  ;   a = (shuffle(a, imm) + (a XOR mask)) * coef
  ; Both branches leave the same value in a. tmp is clobbered: it ends up holding
  ; the shuffled copy of a in the first branch and the unscaled sum in the second.
  ; mask and coef are only read.
  50. %macro BUTTERFLY0 5
  51. %if cpuflag(sse2) && notcpuflag(avx)
      ; SSE2 without AVX: pshufd copies and shuffles in one instruction.
  52.     pshufd %4, %1, %5
  53.     xorps  %1, %2
  54.     addps  %1, %4
  55.     mulps  %1, %3
  56. %else
      ; Every other instruction set this file instantiates: three-operand form.
  57.     shufps %4, %1, %1, %5
  58.     xorps  %1, %1, %2
  59.     addps  %4, %4, %1
  60.     mulps  %1, %4, %3
  61. %endif
  62. %endmacro
  63.  
  ; BUTTERFLY2 a, mask, coef, tmp
  ; BUTTERFLY0 with shuffle immediate 0x1b, which selects source elements
  ; 3,2,1,0 (reverses the four floats of each 128-bit group).
  64. %macro BUTTERFLY2 4
  65.     BUTTERFLY0 %1, %2, %3, %4, 0x1b
  66. %endmacro
  67.  
  ; BUTTERFLY3 a, mask, coef, tmp
  ; BUTTERFLY0 with shuffle immediate 0xb1, which selects source elements
  ; 1,0,3,2 (swaps the two floats of each adjacent pair).
  68. %macro BUTTERFLY3 4
  69.     BUTTERFLY0 %1, %2, %3, %4, 0xb1
  70. %endmacro
  71.  
  ; BUTTERFLY3V r1, r2, r3, r4, rtmp   (arguments are register NUMBERS, not names)
  ; Two vertical butterflies, both scaled by the row at ps_cos_vec+192:
  ;   m<r1> = a + b        m<r2> = (a - b) * c
  ;   m<r3> = p + q        m<r4> = (q - p) * c     <- note the opposite sign
  ; where a,b,p,q are the values of m<r1>..m<r4> on entry.
  ; SWAP comes from the included helper macros (not visible here) and is assumed
  ; to exchange which physical register the two numbers refer to; after it,
  ; m<r2> names the register holding a - b and m<rtmp> is free as scratch again.
  72. %macro BUTTERFLY3V 5
  73.     movaps m%5, m%1
  74.     addps  m%1, m%2
  75.     subps  m%5, m%2
  76.     SWAP %2, %5
  77.     mulps  m%2, [ps_cos_vec+192]
  78.     movaps m%5, m%3 ; rtmp = p
  79.     addps  m%3, m%4
  80.     subps  m%4, m%5 ; q - p
  81.     mulps  m%4, [ps_cos_vec+192]
  82. %endmacro
  83.  
  ; PASS6_AND_PERMUTE (no arguments)
  ; Scalar tail of the transform: a fixed sequence of single-float loads, adds
  ; and stores that rewrites the 128-byte buffer at outq in place.
  ;
  ; Inputs:
  ;   - the 32-bit elements at [outq+4] .. [outq+124];
  ;   - the low element of m0, m1, m4, m5 and m6, which are added to before they
  ;     are ever loaded here, so the caller must leave live values in them.
  ; Clobbers: tmpd and the low elements of m0-m7.
  ; Stores hit offsets 4..120 only; [outq+0] is not accessed and [outq+124] is
  ; only read.
  ;
  ; Elements that only change position are copied through the integer register
  ; tmpd with plain `mov`; everything else goes through movss/addss.
  ; The order is significant: many offsets are read again after an earlier store
  ; has already overwritten a different offset. Do not reorder.
  84. %macro PASS6_AND_PERMUTE 0
  85.     mov         tmpd, [outq+4] ; saved early, stored to [outq+64] below
  86.     movss         m7, [outq+72]
  87.     addss         m7, [outq+76]
  88.     movss         m3, [outq+56]
  89.     addss         m3, [outq+60]
  90.     addss         m4, m3
  91.     movss         m2, [outq+52]
  92.     addss         m2, m3
  93.     movss         m3, [outq+104]
  94.     addss         m3, [outq+108]
  95.     addss         m1, m3
  96.     addss         m5, m4
  97.     movss [outq+ 16], m1
  98.     movss         m1, [outq+100]
  99.     addss         m1, m3
  100.     movss         m3, [outq+40]
  101.     movss [outq+ 48], m1
  102.     addss         m3, [outq+44]
  103.     movss         m1, [outq+100]
  104.     addss         m4, m3
  105.     addss         m3, m2
  106.     addss         m1, [outq+108]
  107.     movss [outq+ 40], m3
  108.     addss         m2, [outq+36]
  109.     movss         m3, [outq+8]
  110.     movss [outq+ 56], m2
  111.     addss         m3, [outq+12]
  112.     movss [outq+ 32], m3
  113.     movss         m3, [outq+80]
  114.     movss [outq+  8], m5
  115.     movss [outq+ 80], m1
  116.     movss         m2, [outq+52]
  117.     movss         m5, [outq+120]
  118.     addss         m5, [outq+124]
  119.     movss         m1, [outq+64]
  120.     addss         m2, [outq+60]
  121.     addss         m0, m5
  122.     addss         m5, [outq+116]
  123.     mov    [outq+64], tmpd ; original [outq+4]
  124.     addss         m6, m0
  125.     addss         m1, m6
  126.     mov         tmpd, [outq+12]
  127.     mov   [outq+ 96], tmpd
  128.     movss [outq+  4], m1
  129.     movss         m1, [outq+24]
  130.     movss [outq+ 24], m4
  131.     movss         m4, [outq+88]
  132.     addss         m4, [outq+92]
  133.     addss         m3, m4
  134.     addss         m4, [outq+84]
  135.     mov         tmpd, [outq+108] ; stored to [outq+112] below
  136.     addss         m1, [outq+28]
  137.     addss         m0, m1
  138.     addss         m1, m5
  139.     addss         m6, m3
  140.     addss         m3, m0
  141.     addss         m0, m7
  142.     addss         m5, [outq+20]
  143.     addss         m7, m1
  144.     movss [outq+ 12], m6
  145.     mov   [outq+112], tmpd
  146.     movss         m6, [outq+28]
  147.     movss [outq+ 28], m0
  148.     movss         m0, [outq+36]
  149.     movss [outq+ 36], m7
  150.     addss         m1, m4
  151.     movss         m7, [outq+116]
  152.     addss         m0, m2
  153.     addss         m7, [outq+124]
  154.     movss [outq+ 72], m0
  155.     movss         m0, [outq+44]
  156.     addss         m2, m0
  157.     movss [outq+ 44], m1
  158.     movss [outq+ 88], m2
  159.     addss         m0, [outq+60]
  160.     mov         tmpd, [outq+60]
  161.     mov   [outq+120], tmpd
  162.     movss [outq+104], m0
  163.     addss         m4, m5
  164.     addss         m5, [outq+68]
  165.     movss  [outq+52], m4
  166.     movss  [outq+60], m5
  167.     movss         m4, [outq+68]
  168.     movss         m5, [outq+20]
  169.     movss [outq+ 20], m3
  170.     addss         m5, m7
  171.     addss         m7, m6
  172.     addss         m4, m5
  173.     movss         m2, [outq+84]
  174.     addss         m2, [outq+92]
  175.     addss         m5, m2
  176.     movss [outq+ 68], m4
  177.     addss         m2, m7
  178.     movss         m4, [outq+76]
  179.     movss [outq+ 84], m2
  180.     movss [outq+ 76], m5
  181.     addss         m7, m4
  182.     addss         m6, [outq+124]
  183.     addss         m4, m6
  184.     addss         m6, [outq+92]
  185.     movss [outq+100], m4
  186.     movss [outq+108], m6
  187.     movss         m6, [outq+92]
  188.     movss  [outq+92], m7
  189.     addss         m6, [outq+124]
  190.     movss [outq+116], m6
  191. %endmacro
  192.  
  ; AVX (256-bit) build of the 32-point DCT, assembled only when
  ; HAVE_AVX_EXTERNAL is set.
  ; Reads 32 floats from inq and writes 32 floats to outq. Both pointers are
  ; accessed with aligned 16/32-byte loads and stores.
  ; The `2,3,8` arguments of cglobal are presumably the argument / GPR / vector
  ; register counts of the x86inc convention (macro not visible here); the code
  ; uses out, in, tmp and m0-m7.
  193. INIT_YMM avx
  194. SECTION_TEXT
  195. %if HAVE_AVX_EXTERNAL
  196. ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
  197. cglobal dct32_float, 2,3,8, out, in, tmp
  198.     ; pass 1
  199.     vmovaps     m4, [inq+0]
      ; m5 = in[31], in[30], ..., in[24]: both halves of m5 are overwritten by
      ; the two inserts (its previous contents are irrelevant), then each
      ; 128-bit half is reversed by vshufps 0x1b.
  200.     vinsertf128 m5, m5, [inq+96], 1
  201.     vinsertf128 m5, m5, [inq+112], 0
  202.     vshufps     m5, m5, m5, 0x1b
  203.     BUTTERFLY   m4, m5, [ps_cos_vec], m6
  204.  
  205.     vmovaps     m2, [inq+64]
      ; m6 = in[15], in[14], ..., in[8], built the same way as m5 above.
  206.     vinsertf128 m6, m6, [inq+32], 1
  207.     vinsertf128 m6, m6, [inq+48], 0
  208.     vshufps     m6, m6, m6, 0x1b
  209.     BUTTERFLY   m2, m6, [ps_cos_vec+32], m0
  210.  
  211.     ; pass 2
  212.  
      ; Sums (m5, m6) and scaled differences (m4, m2) of pass 1 are paired up.
  213.     BUTTERFLY  m5, m6, [ps_cos_vec+64], m0
  214.     BUTTERFLY  m4, m2, [ps_cos_vec+64], m7
  215.  
  216.  
  217.     ; pass 3
      ; vperm2f128 0x31 gathers the upper 128-bit halves of the two sources,
      ; 0x20 the lower halves; the "upper" vector is then reversed per half.
  218.     vperm2f128  m3, m6, m4, 0x31
  219.     vperm2f128  m1, m6, m4, 0x20
  220.     vshufps     m3, m3, m3, 0x1b
  221.  
  222.     BUTTERFLY   m1, m3, [ps_cos_vec+96], m6
  223.  
  224.  
  225.     vperm2f128  m4, m5, m2, 0x20
  226.     vperm2f128  m5, m5, m2, 0x31
  227.     vshufps     m5, m5, m5, 0x1b
  228.  
  229.     BUTTERFLY   m4, m5, [ps_cos_vec+96], m6
  230.  
  231.     ; pass 4
      ; m6 = sign mask, m2 = multipliers, m7 = scratch for all four butterflies.
  232.     vmovaps m6, [ps_p1p1m1m1+0]
  233.     vmovaps m2, [ps_cos_vec+128]
  234.  
  235.     BUTTERFLY2  m5, m6, m2, m7
  236.     BUTTERFLY2  m4, m6, m2, m7
  237.     BUTTERFLY2  m1, m6, m2, m7
  238.     BUTTERFLY2  m3, m6, m2, m7
  239.  
  240.  
  241.     ; pass 5
      ; Shuffle 0xcc picks elements 0,3,0,3 of the pass-4 mask, so the sign bit
      ; ends up set in elements 1 and 3 of each 128-bit half.
  242.     vshufps m6, m6, m6, 0xcc
  243.     vmovaps m2, [ps_cos_vec+160]
  244.  
  245.     BUTTERFLY3  m5, m6, m2, m7
  246.     BUTTERFLY3  m4, m6, m2, m7
  247.     BUTTERFLY3  m1, m6, m2, m7
  248.     BUTTERFLY3  m3, m6, m2, m7
  249.  
      ; Write the pass-5 vectors to the output buffer. The upper halves of m3 and
      ; m1 are also copied into m6 and m0, because PASS6_AND_PERMUTE consumes the
      ; low elements of m0, m1, m4, m5 and m6 from registers.
  250.     vperm2f128  m6, m3, m3, 0x31
  251.     vmovaps [outq], m3
  252.  
  253.     vextractf128  [outq+64], m5, 1
  254.     vextractf128  [outq+32], m5, 0
  255.  
  256.     vextractf128  [outq+80], m4, 1
  257.     vextractf128  [outq+48], m4, 0
  258.  
  259.     vperm2f128  m0, m1, m1, 0x31
  260.     vmovaps [outq+96], m1
  261.  
      ; Clear the upper ymm halves before the scalar tail below; INIT_XMM with no
      ; cpu flag presumably makes it assemble as non-AVX SSE code (x86inc
      ; behaviour, not visible here).
  262.     vzeroupper
  263.  
  264.     ;    pass 6, no SIMD...
  265. INIT_XMM
  266.     PASS6_AND_PERMUTE
  267.     RET
  268. %endif
  269.  
  ; x86_64 branch: SPILL and UNSPILL are both aliases of SWAP, so "spilling"
  ; register N to slot M only exchanges register numbers N and M (slots 8-15 are
  ; the extra registers m8-m15) and touches no memory.
  270. %if ARCH_X86_64
  271. %define SPILL SWAP
  272. %define UNSPILL SWAP
  273.  
  ; PASS5 (x86_64 variant, no arguments)
  ; Runs right after pass 4 of DCT32_FUNC. The SWAP/PERMUTE lines only rename
  ; registers so that the eight pass-4 result vectors are addressed as m8-m15
  ; in the order the rest of the macro expects. Each group of four vectors is
  ; then transposed and run through BUTTERFLY3V, with m0 as scratch, followed by
  ; a few vector adds.
  ; SWAP, PERMUTE and TRANSPOSE4x4PS are defined outside this file; their exact
  ; semantics are assumed from their names - TODO confirm against x86inc.asm /
  ; x86util.asm.
  274. %macro PASS5 0
  275.     nop ; FIXME code alignment
  276.     SWAP 5, 8
  277.     SWAP 4, 12
  278.     SWAP 6, 14
  279.     SWAP 7, 13
  280.     SWAP 0, 15
  281.     PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
  282.     TRANSPOSE4x4PS 8, 9, 10, 11, 0
  283.     BUTTERFLY3V    8, 9, 10, 11, 0
  284.     addps   m10, m11
  285.     TRANSPOSE4x4PS 12, 13, 14, 15, 0
  286.     BUTTERFLY3V    12, 13, 14, 15, 0
  287.     addps   m14, m15
  288.     addps   m12, m14
  289.     addps   m14, m13
  290.     addps   m13, m15
  291. %endmacro
  292.  
  ; PASS6 (x86_64 variant, no arguments)
  ; Final additions and stores, working from m8-m15 as left by PASS5.
  ; Output offsets written, in order of appearance:
  ;   0x00,0x10,...,0x70   low element of m8..m15 (m15 is stored in full)
  ;   0x08,0x18,...,0x78   element 1 of each vector, each added to the next
  ;                        vector's element 1 (the last one is stored unsummed)
  ;   4,12,...,116         the remaining sums, produced by the %rep 15 loop
  ; pshuflw with immediate 0xe moves words 2-3 of the source (float element 1)
  ; into the low float of the destination, which movss/addss then operate on.
  ; pshuflw and pshufd are SSE2 instructions and are used here regardless of the
  ; cpuflag the enclosing function was instantiated with.
  ; SWAP and PERMUTE only rename registers (external macros, semantics assumed).
  293. %macro PASS6 0
  294.     SWAP 9, 12
  295.     SWAP 11, 14
  296.     movss [outq+0x00], m8
  297.     pshuflw m0, m8, 0xe
  298.     movss [outq+0x10], m9
  299.     pshuflw m1, m9, 0xe
  300.     movss [outq+0x20], m10
  301.     pshuflw m2, m10, 0xe
  302.     movss [outq+0x30], m11
  303.     pshuflw m3, m11, 0xe
  304.     movss [outq+0x40], m12
  305.     pshuflw m4, m12, 0xe
  306.     movss [outq+0x50], m13
  307.     pshuflw m5, m13, 0xe
  308.     movss [outq+0x60], m14
  309.     pshuflw m6, m14, 0xe
      ; Full 16-byte store: 0x74 and 0x78 are overwritten by later scalar stores
      ; in this macro, while 0x7c is written only here.
  310.     movaps [outq+0x70], m15
  311.     pshuflw m7, m15, 0xe
  312.     addss   m0, m1
  313.     addss   m1, m2
  314.     movss [outq+0x08], m0
  315.     addss   m2, m3
  316.     movss [outq+0x18], m1
  317.     addss   m3, m4
  318.     movss [outq+0x28], m2
  319.     addss   m4, m5
  320.     movss [outq+0x38], m3
  321.     addss   m5, m6
  322.     movss [outq+0x48], m4
  323.     addss   m6, m7
  324.     movss [outq+0x58], m5
  325.     movss [outq+0x68], m6
  326.     movss [outq+0x78], m7
  327.  
  328.     PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
      ; For each source vector (always named m1 thanks to the rotating SWAPs):
      ; movhlps puts its elements 2-3 into m0 and pshufd 3 puts its element 3
      ; into the low float of m1. From the second vector on, that element is
      ; also added into m15.
  329.     movhlps m0, m1
  330.     pshufd  m1, m1, 3
  331.     SWAP 0, 2, 4, 6, 8, 10, 12, 14
  332.     SWAP 1, 3, 5, 7, 9, 11, 13, 15
  333. %rep 7
  334.     movhlps m0, m1
  335.     pshufd  m1, m1, 3
  336.     addss   m15, m1
  337.     SWAP 0, 2, 4, 6, 8, 10, 12, 14
  338.     SWAP 1, 3, 5, 7, 9, 11, 13, 15
  339. %endrep
      ; Fifteen sums of neighbouring registers, stored at outq+4, +12, ..., +116.
  340. %assign i 4
  341. %rep 15
  342.     addss m0, m1
  343.     movss [outq+i], m0
  344.     SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  345.     %assign i i+8
  346. %endrep
  347. %endmacro
  348.  
  ; x86_32 branch: the code addresses only m0-m7 here, so slots 8-15 are backed
  ; by memory. Slot N lives at outq+(N-8)*16, i.e. the eight slots cover
  ; outq+0 .. outq+127 and the output buffer doubles as scratch space. The
  ; aligned movaps requires outq to be 16-byte aligned. SPILL does not modify
  ; the source register.
  349. %else ; ARCH_X86_32
  350. %macro SPILL 2 ; xmm#, mempos
  351.     movaps [outq+(%2-8)*16], m%1
  352. %endmacro
  353. %macro UNSPILL 2
  354.     movaps m%1, [outq+(%2-8)*16]
  355. %endmacro
  356.  
  ; x86_32: pass 6 is the scalar PASS6_AND_PERMUTE macro.
  357. %define PASS6 PASS6_AND_PERMUTE
  ; PASS5 (x86_32 variant, no arguments)
  ; Applies BUTTERFLY3 to the eight pass-4 vectors and stores every result into
  ; its slot in the output buffer (see SPILL), which PASS6_AND_PERMUTE reads.
  ; On entry m3 still holds the sign mask loaded in pass 4 of DCT32_FUNC.
  ; Registers keep their values after SPILL, so on exit m0, m1, m4, m5 and m6
  ; still hold the contents of slots 15, 14, 11, 10 and 9 respectively.
  358. %macro PASS5 0
  359.     movaps      m2, [ps_cos_vec+160]
  360.     shufps      m3, m3, 0xcc ; mask elements 0,3,0,3 -> sign bit set in elements 1 and 3
  361.  
  362.     BUTTERFLY3  m5, m3, m2, m1
  363.     SPILL 5, 8
  364.  
  365.     UNSPILL 1, 9
  366.     BUTTERFLY3  m1, m3, m2, m5
  367.     SPILL 1, 14
  368.  
  369.     BUTTERFLY3  m4, m3, m2, m5
  370.     SPILL 4, 12
  371.  
  372.     BUTTERFLY3  m7, m3, m2, m5
  373.     SPILL 7, 13
  374.  
      ; m7 has been stored, so it serves as the scratch register from here on.
  375.     UNSPILL 5, 10
  376.     BUTTERFLY3  m5, m3, m2, m7
  377.     SPILL 5, 10
  378.  
  379.     UNSPILL 4, 11
  380.     BUTTERFLY3  m4, m3, m2, m7
  381.     SPILL 4, 11
  382.  
  383.     BUTTERFLY3  m6, m3, m2, m7
  384.     SPILL 6, 9
  385.  
  386.     BUTTERFLY3  m0, m3, m2, m7
  387.     SPILL 0, 15
  388. %endmacro
  389. %endif
  390.  
  391.  
  ; DCT32_FUNC (no arguments): emits one dct32_float function for whatever
  ; instruction set the preceding INIT_XMM selected.
  ; Reads 32 floats from inq with aligned 16-byte accesses and writes 32 floats
  ; to outq. Passes 1-4 are written inline, with passes 1 and 2 interleaved;
  ; passes 5 and 6 come from the architecture-specific PASS5 / PASS6 macros.
  ; SPILL/UNSPILL slots 8-15 are register renames on x86_64 and 16-byte cells of
  ; the output buffer on x86_32, so on x86_32 outq is already written to during
  ; pass 2.
  ; m3 is the BUTTERFLY scratch in passes 1-3 and the sign mask in pass 4;
  ; m2 holds the current multiplier row from pass 2 on.
  392. ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
  393. %macro DCT32_FUNC 0
  394. cglobal dct32_float, 2, 3, 16, out, in, tmp
  395.     ; pass 1
  396.  
      ; in[0..3] against in[31..28] (LOAD_INV loads four floats reversed).
  397.     movaps      m0, [inq+0]
  398.     LOAD_INV    m1, [inq+112]
  399.     BUTTERFLY   m0, m1, [ps_cos_vec], m3
  400.  
  401.     movaps      m7, [inq+64]
  402.     LOAD_INV    m4, [inq+48]
  403.     BUTTERFLY   m7, m4, [ps_cos_vec+32], m3
  404.  
  405.     ; pass 2
  406.     movaps      m2, [ps_cos_vec+64]
  407.     BUTTERFLY   m1, m4, m2, m3
  408.     SPILL 1, 11
  409.     SPILL 4, 8
  410.  
  411.     ; pass 1
      ; Remaining half of pass 1: input offsets 16/96 and 80/32.
  412.     movaps      m1, [inq+16]
  413.     LOAD_INV    m6, [inq+96]
  414.     BUTTERFLY   m1, m6, [ps_cos_vec+16], m3
  415.  
  416.     movaps      m4, [inq+80]
  417.     LOAD_INV    m5, [inq+32]
  418.     BUTTERFLY   m4, m5, [ps_cos_vec+48], m3
  419.  
  420.     ; pass 2
      ; m2 still holds the row at ps_cos_vec+64 for this butterfly.
  421.     BUTTERFLY   m0, m7, m2, m3
  422.  
  423.     movaps      m2, [ps_cos_vec+80]
  424.     BUTTERFLY   m6, m5, m2, m3
  425.  
  426.     BUTTERFLY   m1, m4, m2, m3
  427.  
  428.     ; pass 3
      ; The second operand of each butterfly is reversed (shufps 0x1b) first.
  429.     movaps      m2, [ps_cos_vec+96]
  430.     shufps      m1, m1, 0x1b
  431.     BUTTERFLY   m0, m1, m2, m3
  432.     SPILL 0, 15
  433.     SPILL 1, 14
  434.  
  435.     UNSPILL 0, 8
  436.     shufps      m5, m5, 0x1b
  437.     BUTTERFLY   m0, m5, m2, m3
  438.  
  439.     UNSPILL 1, 11
  440.     shufps      m6, m6, 0x1b
  441.     BUTTERFLY   m1, m6, m2, m3
  442.     SPILL 1, 11
  443.  
  444.     shufps      m4, m4, 0x1b
  445.     BUTTERFLY   m7, m4, m2, m3
  446.  
  447.     ; pass 4
      ; m3 = sign mask, m2 = multipliers, m1 = scratch for every BUTTERFLY2.
  448.     movaps      m3, [ps_p1p1m1m1+0]
  449.     movaps      m2, [ps_cos_vec+128]
  450.  
  451.     BUTTERFLY2  m5, m3, m2, m1
  452.  
  453.     BUTTERFLY2  m0, m3, m2, m1
  454.     SPILL 0, 9
  455.  
  456.     BUTTERFLY2  m6, m3, m2, m1
  457.     SPILL 6, 10
  458.  
  459.     UNSPILL 0, 11
  460.     BUTTERFLY2  m0, m3, m2, m1
  461.     SPILL 0, 11
  462.  
  463.     BUTTERFLY2  m4, m3, m2, m1
  464.  
  465.     BUTTERFLY2  m7, m3, m2, m1
  466.  
  467.     UNSPILL 6, 14
  468.     BUTTERFLY2  m6, m3, m2, m1
  469.  
  470.     UNSPILL 0, 15
  471.     BUTTERFLY2  m0, m3, m2, m1
  472.  
  473.     PASS5
  474.     PASS6
  475.     RET
  476. %endmacro
  477.  
  ; LOAD_INV dst, src
  ; Loads four floats from src into dst in reversed element order (shuffle
  ; immediate 0x1b selects source elements 3,2,1,0).
  ; With SSE2 this is a single pshufd; plain SSE needs a movaps followed by an
  ; in-register shufps. The macro emits nothing if neither flag is set.
  478. %macro LOAD_INV 2
  479. %if cpuflag(sse2)
  480.     pshufd      %1, %2, 0x1b
  481. %elif cpuflag(sse)
  482.     movaps      %1, %2
  483.     shufps      %1, %1, 0x1b
  484. %endif
  485. %endmacro
  486.  
  ; Instantiate the function body twice, once per instruction set. The cpuflag
  ; chosen by INIT_XMM drives the %if branches in BUTTERFLY0 and LOAD_INV.
  ; NOTE(review): under ARCH_X86_64 the PASS6 macro in this file uses pshuflw and
  ; pshufd unconditionally. Those are SSE2 instructions, so the "sse"
  ; instantiation is not SSE-only on that architecture - confirm this is
  ; intended, or restrict the sse build to x86_32 together with its C-side user.
  487. INIT_XMM sse
  488. DCT32_FUNC
  489. INIT_XMM sse2
  490. DCT32_FUNC
  491.