Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;*****************************************************************************
  2. ;* x86-optimized AC-3 DSP functions
  3. ;* Copyright (c) 2011 Justin Ruggles
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21.  
  22. %include "libavutil/x86/x86util.asm"
  23.  
  24. SECTION_RODATA
  25.  
  26. ; 16777216.0f - used in ff_float_to_fixed24()
  27. pf_1_24: times 4 dd 0x4B800000
  28.  
  29. ; used in ff_ac3_compute_mantissa_size()
  30. cextern ac3_bap_bits
  31. pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
  32. pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
  33.  
  34. ; used in ff_ac3_extract_exponents()
  35. cextern pd_1
  36. pd_151: times 4 dd 151
  37.  
  38. ; used in ff_apply_window_int16()
  39. pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
  40. pd_16384: times 4 dd 16384
  41.  
  42. SECTION .text
  43.  
  44. ;-----------------------------------------------------------------------------
  45. ; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
  46. ;-----------------------------------------------------------------------------
  47.  
  48. %macro AC3_EXPONENT_MIN 0
  49. cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
  50.     shl  reuse_blksq, 8
  51.     jz .end
  52.     LOOP_ALIGN
  53. .nextexp:
  54.     mov      offsetq, reuse_blksq
  55.     mova          m0, [expq+offsetq]
  56.     sub      offsetq, 256
  57.     LOOP_ALIGN
  58. .nextblk:
  59.     PMINUB        m0, [expq+offsetq], m1
  60.     sub      offsetq, 256
  61.     jae .nextblk
  62.     mova      [expq], m0
  63.     add         expq, mmsize
  64.     sub        expnq, mmsize
  65.     jg .nextexp
  66. .end:
  67.     REP_RET
  68. %endmacro
  69.  
  70. %define LOOP_ALIGN
  71. INIT_MMX mmx
  72. AC3_EXPONENT_MIN
  73. %if HAVE_MMXEXT_EXTERNAL
  74. %define LOOP_ALIGN ALIGN 16
  75. INIT_MMX mmxext
  76. AC3_EXPONENT_MIN
  77. %endif
  78. %if HAVE_SSE2_EXTERNAL
  79. INIT_XMM sse2
  80. AC3_EXPONENT_MIN
  81. %endif
  82. %undef LOOP_ALIGN
  83.  
  84. ;-----------------------------------------------------------------------------
  85. ; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
  86. ;
  87. ; This function uses 2 different methods to calculate a valid result.
  88. ; 1) logical 'or' of abs of each element
  89. ;        This is used for ssse3 because of the pabsw instruction.
  90. ;        It is also used for mmx because of the lack of min/max instructions.
  91. ; 2) calculate min/max for the array, then or(abs(min),abs(max))
  92. ;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
  93. ;-----------------------------------------------------------------------------
  94.  
  95. ; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
  96. %macro OR_WORDS_HORIZ 2 ; src, tmp
  97. %if cpuflag(sse2)
  98.     movhlps     %2, %1
  99.     por         %1, %2
  100.     pshuflw     %2, %1, q0032
  101.     por         %1, %2
  102.     pshuflw     %2, %1, q0001
  103.     por         %1, %2
  104. %elif cpuflag(mmxext)
  105.     pshufw      %2, %1, q0032
  106.     por         %1, %2
  107.     pshufw      %2, %1, q0001
  108.     por         %1, %2
  109. %else ; mmx
  110.     movq        %2, %1
  111.     psrlq       %2, 32
  112.     por         %1, %2
  113.     movq        %2, %1
  114.     psrlq       %2, 16
  115.     por         %1, %2
  116. %endif
  117. %endmacro
  118.  
  119. %macro AC3_MAX_MSB_ABS_INT16 1
  120. cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
  121.     pxor        m2, m2
  122.     pxor        m3, m3
  123. .loop:
  124. %ifidn %1, min_max
  125.     mova        m0, [srcq]
  126.     mova        m1, [srcq+mmsize]
  127.     pminsw      m2, m0
  128.     pminsw      m2, m1
  129.     pmaxsw      m3, m0
  130.     pmaxsw      m3, m1
  131. %else ; or_abs
  132. %if notcpuflag(ssse3)
  133.     mova        m0, [srcq]
  134.     mova        m1, [srcq+mmsize]
  135.     ABS2        m0, m1, m3, m4
  136. %else ; ssse3
  137.     ; using memory args is faster for ssse3
  138.     pabsw       m0, [srcq]
  139.     pabsw       m1, [srcq+mmsize]
  140. %endif
  141.     por         m2, m0
  142.     por         m2, m1
  143. %endif
  144.     add       srcq, mmsize*2
  145.     sub       lend, mmsize
  146.     ja .loop
  147. %ifidn %1, min_max
  148.     ABS2        m2, m3, m0, m1
  149.     por         m2, m3
  150. %endif
  151.     OR_WORDS_HORIZ m2, m0
  152.     movd       eax, m2
  153.     and        eax, 0xFFFF
  154.     RET
  155. %endmacro
  156.  
  157. INIT_MMX mmx
  158. AC3_MAX_MSB_ABS_INT16 or_abs
  159. INIT_MMX mmxext
  160. AC3_MAX_MSB_ABS_INT16 min_max
  161. INIT_XMM sse2
  162. AC3_MAX_MSB_ABS_INT16 min_max
  163. INIT_XMM ssse3
  164. AC3_MAX_MSB_ABS_INT16 or_abs
  165.  
  166. ;-----------------------------------------------------------------------------
  167. ; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
  168. ;-----------------------------------------------------------------------------
  169.  
  170. %macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction, instruction set
  171. cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
  172.     movd      m0, shiftd
  173. .loop:
  174.     mova      m1, [srcq         ]
  175.     mova      m2, [srcq+mmsize  ]
  176.     mova      m3, [srcq+mmsize*2]
  177.     mova      m4, [srcq+mmsize*3]
  178.     %3        m1, m0
  179.     %3        m2, m0
  180.     %3        m3, m0
  181.     %3        m4, m0
  182.     mova  [srcq         ], m1
  183.     mova  [srcq+mmsize  ], m2
  184.     mova  [srcq+mmsize*2], m3
  185.     mova  [srcq+mmsize*3], m4
  186.     add     srcq, mmsize*4
  187.     sub     lend, mmsize*32/%2
  188.     ja .loop
  189. .end:
  190.     REP_RET
  191. %endmacro
  192.  
  193. ;-----------------------------------------------------------------------------
  194. ; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
  195. ;-----------------------------------------------------------------------------
  196.  
  197. INIT_MMX mmx
  198. AC3_SHIFT l, 16, psllw
  199. INIT_XMM sse2
  200. AC3_SHIFT l, 16, psllw
  201.  
  202. ;-----------------------------------------------------------------------------
  203. ; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
  204. ;-----------------------------------------------------------------------------
  205.  
  206. INIT_MMX mmx
  207. AC3_SHIFT r, 32, psrad
  208. INIT_XMM sse2
  209. AC3_SHIFT r, 32, psrad
  210.  
  211. ;-----------------------------------------------------------------------------
  212. ; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
  213. ;-----------------------------------------------------------------------------
  214.  
  215. ; The 3DNow! version is not bit-identical because pf2id uses truncation rather
  216. ; than round-to-nearest.
  217. INIT_MMX 3dnow
  218. cglobal float_to_fixed24, 3, 3, 0, dst, src, len
  219.     movq   m0, [pf_1_24]
  220. .loop:
  221.     movq   m1, [srcq   ]
  222.     movq   m2, [srcq+8 ]
  223.     movq   m3, [srcq+16]
  224.     movq   m4, [srcq+24]
  225.     pfmul  m1, m0
  226.     pfmul  m2, m0
  227.     pfmul  m3, m0
  228.     pfmul  m4, m0
  229.     pf2id  m1, m1
  230.     pf2id  m2, m2
  231.     pf2id  m3, m3
  232.     pf2id  m4, m4
  233.     movq  [dstq   ], m1
  234.     movq  [dstq+8 ], m2
  235.     movq  [dstq+16], m3
  236.     movq  [dstq+24], m4
  237.     add  srcq, 32
  238.     add  dstq, 32
  239.     sub  lend, 8
  240.     ja .loop
  241.     femms
  242.     RET
  243.  
  244. INIT_XMM sse
  245. cglobal float_to_fixed24, 3, 3, 3, dst, src, len
  246.     movaps     m0, [pf_1_24]
  247. .loop:
  248.     movaps     m1, [srcq   ]
  249.     movaps     m2, [srcq+16]
  250.     mulps      m1, m0
  251.     mulps      m2, m0
  252.     cvtps2pi  mm0, m1
  253.     movhlps    m1, m1
  254.     cvtps2pi  mm1, m1
  255.     cvtps2pi  mm2, m2
  256.     movhlps    m2, m2
  257.     cvtps2pi  mm3, m2
  258.     movq  [dstq   ], mm0
  259.     movq  [dstq+ 8], mm1
  260.     movq  [dstq+16], mm2
  261.     movq  [dstq+24], mm3
  262.     add      srcq, 32
  263.     add      dstq, 32
  264.     sub      lend, 8
  265.     ja .loop
  266.     emms
  267.     RET
  268.  
  269. INIT_XMM sse2
  270. cglobal float_to_fixed24, 3, 3, 9, dst, src, len
  271.     movaps     m0, [pf_1_24]
  272. .loop:
  273.     movaps     m1, [srcq    ]
  274.     movaps     m2, [srcq+16 ]
  275.     movaps     m3, [srcq+32 ]
  276.     movaps     m4, [srcq+48 ]
  277. %ifdef m8
  278.     movaps     m5, [srcq+64 ]
  279.     movaps     m6, [srcq+80 ]
  280.     movaps     m7, [srcq+96 ]
  281.     movaps     m8, [srcq+112]
  282. %endif
  283.     mulps      m1, m0
  284.     mulps      m2, m0
  285.     mulps      m3, m0
  286.     mulps      m4, m0
  287. %ifdef m8
  288.     mulps      m5, m0
  289.     mulps      m6, m0
  290.     mulps      m7, m0
  291.     mulps      m8, m0
  292. %endif
  293.     cvtps2dq   m1, m1
  294.     cvtps2dq   m2, m2
  295.     cvtps2dq   m3, m3
  296.     cvtps2dq   m4, m4
  297. %ifdef m8
  298.     cvtps2dq   m5, m5
  299.     cvtps2dq   m6, m6
  300.     cvtps2dq   m7, m7
  301.     cvtps2dq   m8, m8
  302. %endif
  303.     movdqa  [dstq    ], m1
  304.     movdqa  [dstq+16 ], m2
  305.     movdqa  [dstq+32 ], m3
  306.     movdqa  [dstq+48 ], m4
  307. %ifdef m8
  308.     movdqa  [dstq+64 ], m5
  309.     movdqa  [dstq+80 ], m6
  310.     movdqa  [dstq+96 ], m7
  311.     movdqa  [dstq+112], m8
  312.     add      srcq, 128
  313.     add      dstq, 128
  314.     sub      lenq, 32
  315. %else
  316.     add      srcq, 64
  317.     add      dstq, 64
  318.     sub      lenq, 16
  319. %endif
  320.     ja .loop
  321.     REP_RET
  322.  
  323. ;------------------------------------------------------------------------------
  324. ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
  325. ;------------------------------------------------------------------------------
  326.  
  327. %macro PHADDD4 2 ; xmm src, xmm tmp
  328.     movhlps  %2, %1
  329.     paddd    %1, %2
  330.     pshufd   %2, %1, 0x1
  331.     paddd    %1, %2
  332. %endmacro
  333.  
  334. INIT_XMM sse2
  335. cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
  336.     movdqa      m0, [mant_cntq      ]
  337.     movdqa      m1, [mant_cntq+ 1*16]
  338.     paddw       m0, [mant_cntq+ 2*16]
  339.     paddw       m1, [mant_cntq+ 3*16]
  340.     paddw       m0, [mant_cntq+ 4*16]
  341.     paddw       m1, [mant_cntq+ 5*16]
  342.     paddw       m0, [mant_cntq+ 6*16]
  343.     paddw       m1, [mant_cntq+ 7*16]
  344.     paddw       m0, [mant_cntq+ 8*16]
  345.     paddw       m1, [mant_cntq+ 9*16]
  346.     paddw       m0, [mant_cntq+10*16]
  347.     paddw       m1, [mant_cntq+11*16]
  348.     pmaddwd     m0, [ac3_bap_bits   ]
  349.     pmaddwd     m1, [ac3_bap_bits+16]
  350.     paddd       m0, m1
  351.     PHADDD4     m0, m1
  352.     movd      sumd, m0
  353.     movdqa      m3, [pw_bap_mul1]
  354.     movhpd      m0, [mant_cntq     +2]
  355.     movlpd      m0, [mant_cntq+1*32+2]
  356.     movhpd      m1, [mant_cntq+2*32+2]
  357.     movlpd      m1, [mant_cntq+3*32+2]
  358.     movhpd      m2, [mant_cntq+4*32+2]
  359.     movlpd      m2, [mant_cntq+5*32+2]
  360.     pmulhuw     m0, m3
  361.     pmulhuw     m1, m3
  362.     pmulhuw     m2, m3
  363.     paddusw     m0, m1
  364.     paddusw     m0, m2
  365.     pmaddwd     m0, [pw_bap_mul2]
  366.     PHADDD4     m0, m1
  367.     movd       eax, m0
  368.     add        eax, sumd
  369.     RET
  370.  
  371. ;------------------------------------------------------------------------------
  372. ; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
  373. ;------------------------------------------------------------------------------
  374.  
  375. %macro PABSD 1-2 ; src/dst, unused
  376. %if cpuflag(ssse3)
  377.     pabsd    %1, %1
  378. %else ; src/dst, tmp
  379.     pxor     %2, %2
  380.     pcmpgtd  %2, %1
  381.     pxor     %1, %2
  382.     psubd    %1, %2
  383. %endif
  384. %endmacro
  385.  
  386. %macro AC3_EXTRACT_EXPONENTS 0
  387. cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
  388.     add     expq, lenq
  389.     lea    coefq, [coefq+4*lenq]
  390.     neg     lenq
  391.     mova      m2, [pd_1]
  392.     mova      m3, [pd_151]
  393. .loop:
  394.     ; move 4 32-bit coefs to xmm0
  395.     mova      m0, [coefq+4*lenq]
  396.     ; absolute value
  397.     PABSD     m0, m1
  398.     ; convert to float and extract exponents
  399.     pslld     m0, 1
  400.     por       m0, m2
  401.     cvtdq2ps  m1, m0
  402.     psrld     m1, 23
  403.     mova      m0, m3
  404.     psubd     m0, m1
  405.     ; move the lowest byte in each of 4 dwords to the low dword
  406.     ; NOTE: We cannot just extract the low bytes with pshufb because the dword
  407.     ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
  408.     ;       clips this to 0, which is the correct exponent.
  409.     packssdw  m0, m0
  410.     packuswb  m0, m0
  411.     movd  [expq+lenq], m0
  412.  
  413.     add     lenq, 4
  414.     jl .loop
  415.     REP_RET
  416. %endmacro
  417.  
  418. %if HAVE_SSE2_EXTERNAL
  419. INIT_XMM sse2
  420. AC3_EXTRACT_EXPONENTS
  421. %endif
  422. %if HAVE_SSSE3_EXTERNAL
  423. INIT_XMM ssse3
  424. AC3_EXTRACT_EXPONENTS
  425. %endif
  426.  
  427. ;-----------------------------------------------------------------------------
  428. ; void ff_apply_window_int16(int16_t *output, const int16_t *input,
  429. ;                            const int16_t *window, unsigned int len)
  430. ;-----------------------------------------------------------------------------
  431.  
  432. %macro REVERSE_WORDS 1-2
  433. %if cpuflag(ssse3) && notcpuflag(atom)
  434.     pshufb  %1, %2
  435. %elif cpuflag(sse2)
  436.     pshuflw  %1, %1, 0x1B
  437.     pshufhw  %1, %1, 0x1B
  438.     pshufd   %1, %1, 0x4E
  439. %elif cpuflag(mmxext)
  440.     pshufw   %1, %1, 0x1B
  441. %endif
  442. %endmacro
  443.  
  444. %macro MUL16FIXED 3
  445. %if cpuflag(ssse3) ; dst, src, unused
  446. ; dst = ((dst * src) + (1<<14)) >> 15
  447.     pmulhrsw   %1, %2
  448. %elif cpuflag(mmxext) ; dst, src, temp
  449. ; dst = (dst * src) >> 15
  450. ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
  451. ; in from the pmullw result.
  452.     mova    %3, %1
  453.     pmulhw  %1, %2
  454.     pmullw  %3, %2
  455.     psrlw   %3, 15
  456.     psllw   %1, 1
  457.     por     %1, %3
  458. %endif
  459. %endmacro
  460.  
  461. %macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
  462. %if %1
  463. cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
  464. %else
  465. cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
  466. %endif
  467.     lea     offset2q, [offsetq-mmsize]
  468. %if cpuflag(ssse3) && notcpuflag(atom)
  469.     mova          m5, [pb_revwords]
  470.     ALIGN 16
  471. %elif %1
  472.     mova          m5, [pd_16384]
  473. %endif
  474. .loop:
  475. %if cpuflag(ssse3)
  476.     ; This version does the 16x16->16 multiplication in-place without expanding
  477.     ; to 32-bit. The ssse3 version is bit-identical.
  478.     mova          m0, [windowq+offset2q]
  479.     mova          m1, [ inputq+offset2q]
  480.     pmulhrsw      m1, m0
  481.     REVERSE_WORDS m0, m5
  482.     pmulhrsw      m0, [ inputq+offsetq ]
  483.     mova  [outputq+offset2q], m1
  484.     mova  [outputq+offsetq ], m0
  485. %elif %1
  486.     ; This version expands 16-bit to 32-bit, multiplies by the window,
  487.     ; adds 16384 for rounding, right shifts 15, then repacks back to words to
  488.     ; save to the output. The window is reversed for the second half.
  489.     mova          m3, [windowq+offset2q]
  490.     mova          m4, [ inputq+offset2q]
  491.     pxor          m0, m0
  492.     punpcklwd     m0, m3
  493.     punpcklwd     m1, m4
  494.     pmaddwd       m0, m1
  495.     paddd         m0, m5
  496.     psrad         m0, 15
  497.     pxor          m2, m2
  498.     punpckhwd     m2, m3
  499.     punpckhwd     m1, m4
  500.     pmaddwd       m2, m1
  501.     paddd         m2, m5
  502.     psrad         m2, 15
  503.     packssdw      m0, m2
  504.     mova  [outputq+offset2q], m0
  505.     REVERSE_WORDS m3
  506.     mova          m4, [ inputq+offsetq]
  507.     pxor          m0, m0
  508.     punpcklwd     m0, m3
  509.     punpcklwd     m1, m4
  510.     pmaddwd       m0, m1
  511.     paddd         m0, m5
  512.     psrad         m0, 15
  513.     pxor          m2, m2
  514.     punpckhwd     m2, m3
  515.     punpckhwd     m1, m4
  516.     pmaddwd       m2, m1
  517.     paddd         m2, m5
  518.     psrad         m2, 15
  519.     packssdw      m0, m2
  520.     mova  [outputq+offsetq], m0
  521. %else
  522.     ; This version does the 16x16->16 multiplication in-place without expanding
  523.     ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
  524.     ; therefore are not bit-identical to the C version.
  525.     mova          m0, [windowq+offset2q]
  526.     mova          m1, [ inputq+offset2q]
  527.     mova          m2, [ inputq+offsetq ]
  528.     MUL16FIXED    m1, m0, m3
  529.     REVERSE_WORDS m0
  530.     MUL16FIXED    m2, m0, m3
  531.     mova  [outputq+offset2q], m1
  532.     mova  [outputq+offsetq ], m2
  533. %endif
  534.     add      offsetd, mmsize
  535.     sub     offset2d, mmsize
  536.     jae .loop
  537.     REP_RET
  538. %endmacro
  539.  
  540. INIT_MMX mmxext
  541. APPLY_WINDOW_INT16 0
  542. INIT_XMM sse2
  543. APPLY_WINDOW_INT16 0
  544.  
  545. INIT_MMX mmxext
  546. APPLY_WINDOW_INT16 1
  547. INIT_XMM sse2
  548. APPLY_WINDOW_INT16 1
  549. INIT_XMM ssse3
  550. APPLY_WINDOW_INT16 1
  551. INIT_XMM ssse3, atom
  552. APPLY_WINDOW_INT16 1
  553.