Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. ;*****************************************************************************
  2. ;* x86-optimized AC-3 DSP utils
  3. ;* Copyright (c) 2011 Justin Ruggles
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21.  
  22. %include "libavutil/x86/x86util.asm"
  23.  
  24. SECTION_RODATA
  25.  
  26. ; 16777216.0f - used in ff_float_to_fixed24()
      ; 0x4B800000 is the IEEE-754 single-precision bit pattern of 2^24; it is
      ; stored four times so that one load fills every lane of a 16-byte register.
  27. pf_1_24: times 4 dd 0x4B800000
  28.  
  29. ; used in ff_ac3_compute_mantissa_size()
      ; ac3_bap_bits is defined outside this file; here it is only read as a
      ; pmaddwd multiplier table.
  30. cextern ac3_bap_bits
      ; pw_bap_mul1 is a pmulhuw scale (the high 16 bits of each product are kept):
      ; 21846 ~= 65536/3 divides a count by 3, 32768 = 65536/2 divides it by 2 and
      ; 0 clears the lane.  pw_bap_mul2 holds the per-lane weights applied by the
      ; pmaddwd that follows.  Both tables repeat a 4-word pattern twice because
      ; each register carries four counts from each of two rows of mant_cnt.
  31. pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
  32. pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
  33.  
  34. ; used in ff_ac3_extract_exponents()
      ; pd_1 is OR-ed into each doubled coefficient; pd_151 is the value the float
      ; exponent field is subtracted from.
  35. pd_1:   times 4 dd 1
  36. pd_151: times 4 dd 151
  37.  
  38. SECTION .text
  39.  
  40. ;-----------------------------------------------------------------------------
  41. ; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
      ;
      ; For every byte position below nb_coefs, replaces exp[i] with the unsigned
      ; minimum of exp[i], exp[i+256], ..., exp[i+256*num_reuse_blocks].
      ; Returns immediately, leaving exp untouched, when num_reuse_blocks is 0.
      ; Data is handled mmsize bytes at a time with mova, so the tail is rounded
      ; up to a whole vector (presumably exp is suitably aligned and padded --
      ; confirm against the callers).
      ;
      ; Both counts are C ints.  They are consumed through their 32-bit register
      ; views because the upper half of a 64-bit argument register is not
      ; guaranteed to be clean for an int argument; a 32-bit write zero-extends
      ; into the full register on x86-64.
  42. ;-----------------------------------------------------------------------------
  43.  
  44. %macro AC3_EXPONENT_MIN 0
  45. cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
  46.     shl  reuse_blksd, 8             ; byte offset of the last block (256 bytes each)
  47.     jz .end                         ; no reuse blocks: nothing to merge
  48.     LOOP_ALIGN
  49. .nextexp:
  50.     mov      offsetq, reuse_blksq
  51.     mova          m0, [expq+offsetq]    ; start from the last block
  52.     sub      offsetq, 256
  53.     LOOP_ALIGN
  54. .nextblk:
      ; m1 is handed to PMINUB as its scratch operand
  55.     PMINUB        m0, [expq+offsetq], m1
  56.     sub      offsetq, 256
  57.     jae .nextblk                    ; borrow occurs only after offset 0 was merged
  58.     mova      [expq], m0
  59.     add         expq, mmsize
  60.     sub        expnd, mmsize
  61.     jg .nextexp                     ; signed test: stop once nb_coefs is used up
  62. .end:
  63.     REP_RET
  64. %endmacro
  65.  
  66. %define LOOP_ALIGN
      ; Emits ff_ac3_exponent_min once per instruction set.  LOOP_ALIGN is empty
      ; for the plain MMX build, so its loop labels get no alignment padding.
  67. INIT_MMX mmx
  68. AC3_EXPONENT_MIN
  69. %if HAVE_MMXEXT_EXTERNAL
  70. %define LOOP_ALIGN ALIGN 16
  71. INIT_MMX mmxext
  72. AC3_EXPONENT_MIN
  73. %endif
  74. %if HAVE_SSE2_EXTERNAL
      ; NOTE(review): LOOP_ALIGN is only switched to ALIGN 16 inside the MMXEXT
      ; conditional above, so if that conditional is false the SSE2 build is
      ; assembled with the empty definition.
  75. INIT_XMM sse2
  76. AC3_EXPONENT_MIN
  77. %endif
  78. %undef LOOP_ALIGN
  79.  
  80. ;-----------------------------------------------------------------------------
  81. ; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
  82. ;
  83. ; This function uses 2 different methods to calculate a valid result.
  84. ; 1) logical 'or' of abs of each element
  85. ;        This is used for ssse3 because of the pabsw instruction.
  86. ;        It is also used for mmx because of the lack of min/max instructions.
  87. ; 2) calculate min/max for the array, then or(abs(min),abs(max))
  88. ;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
  89. ;-----------------------------------------------------------------------------
  90.  
  91. ; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
      ; Only the low word of the first operand is meaningful afterwards; the
      ; second operand is overwritten as scratch.
  92. %macro OR_WORDS_HORIZ 2 ; src, tmp
  93. %if cpuflag(sse2)
  94.     movhlps     %2, %1              ; tmp low half = src high half
  95.     por         %1, %2              ; 8 words folded into the low 4
  96.     pshuflw     %2, %1, q0032       ; bring words 3:2 down to 1:0
  97.     por         %1, %2              ; 4 words folded into the low 2
  98.     pshuflw     %2, %1, q0001       ; bring word 1 down to word 0
  99.     por         %1, %2              ; word 0 = OR of every word
  100. %elif cpuflag(mmxext)
  101.     pshufw      %2, %1, q0032       ; bring words 3:2 down to 1:0
  102.     por         %1, %2
  103.     pshufw      %2, %1, q0001       ; bring word 1 down to word 0
  104.     por         %1, %2              ; word 0 = OR of every word
  105. %else ; mmx
       ; no word shuffle available here: copy and shift right instead
  106.     movq        %2, %1
  107.     psrlq       %2, 32              ; words 3:2 -> 1:0
  108.     por         %1, %2
  109.     movq        %2, %1
  110.     psrlq       %2, 16              ; word 1 -> word 0
  111.     por         %1, %2              ; word 0 = OR of every word
  112. %endif
  113. %endmacro
  114.  
  115. %macro AC3_MAX_MSB_ABS_INT16 1
       ; %1 selects the method: "min_max" tracks the signed minimum and maximum
       ; and combines their absolute values at the end; anything else ("or_abs")
       ; ORs the absolute value of every element as it goes.
       ; The result is returned in eax, masked to 16 bits.
       ; Each iteration reads two full vectors (mmsize int16 values) with
       ; mova/pabsw memory operands -- presumably src is vector-aligned and len is
       ; a multiple of mmsize; confirm against the callers.
  116. cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
  117.     pxor        m2, m2              ; m2 = running minimum (min_max) or OR accumulator
  118.     pxor        m3, m3              ; m3 = running maximum (min_max only)
  119. .loop:
  120. %ifidn %1, min_max
  121.     mova        m0, [srcq]
  122.     mova        m1, [srcq+mmsize]
  123.     pminsw      m2, m0
  124.     pminsw      m2, m1
  125.     pmaxsw      m3, m0
  126.     pmaxsw      m3, m1
  127. %else ; or_abs
  128. %if notcpuflag(ssse3)
  129.     mova        m0, [srcq]
  130.     mova        m1, [srcq+mmsize]
       ; m3 and m4 are passed as the extra operands of ABS2 (presumably scratch);
       ; m3 has no other role on this path
  131.     ABS2        m0, m1, m3, m4
  132. %else ; ssse3
  133.     ; using memory args is faster for ssse3
  134.     pabsw       m0, [srcq]
  135.     pabsw       m1, [srcq+mmsize]
  136. %endif
  137.     por         m2, m0
  138.     por         m2, m1
  139. %endif
  140.     add       srcq, mmsize*2        ; two vectors consumed ...
  141.     sub       lend, mmsize          ; ... which is mmsize 16-bit elements
  142.     ja .loop                        ; unsigned: stop at zero or on borrow
  143. %ifidn %1, min_max
       ; fold the two extremes into one value: or(abs(min), abs(max))
  144.     ABS2        m2, m3, m0, m1
  145.     por         m2, m3
  146. %endif
  147.     OR_WORDS_HORIZ m2, m0
  148.     movd       eax, m2
  149.     and        eax, 0xFFFF          ; only the low word holds the combined result
  150.     RET
  151. %endmacro
  152.  
  153. INIT_MMX mmx
       ; One build of ff_ac3_max_msb_abs_int16 per instruction set; the argument
       ; picks the method: or_abs for mmx and ssse3, min_max for mmxext and sse2.
  154. AC3_MAX_MSB_ABS_INT16 or_abs
  155. INIT_MMX mmxext
  156. AC3_MAX_MSB_ABS_INT16 min_max
  157. INIT_XMM sse2
  158. AC3_MAX_MSB_ABS_INT16 min_max
  159. INIT_XMM ssse3
  160. AC3_MAX_MSB_ABS_INT16 or_abs
  161.  
  162. ;-----------------------------------------------------------------------------
  163. ; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
  164. ;-----------------------------------------------------------------------------
  165.  
  166. %macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
       ; Defines ac3_<%1>shift_int<%2>: shifts every element of src in place
       ; using the packed shift instruction %3.  The shift count is taken from
       ; the third argument and moved into m0 once, before the loop.
       ; Each iteration handles four vectors (4*mmsize bytes), i.e.
       ; mmsize*32/%2 elements of %2 bits each, using aligned mova accesses --
       ; presumably src is vector-aligned and len is a multiple of that step;
       ; confirm against the callers.
  167. cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
  168.     movd      m0, shiftd
  169. .loop:
  170.     mova      m1, [srcq         ]
  171.     mova      m2, [srcq+mmsize  ]
  172.     mova      m3, [srcq+mmsize*2]
  173.     mova      m4, [srcq+mmsize*3]
  174.     %3        m1, m0
  175.     %3        m2, m0
  176.     %3        m3, m0
  177.     %3        m4, m0
  178.     mova  [srcq         ], m1
  179.     mova  [srcq+mmsize  ], m2
  180.     mova  [srcq+mmsize*2], m3
  181.     mova  [srcq+mmsize*3], m4
  182.     add     srcq, mmsize*4
  183.     sub     lend, mmsize*32/%2      ; elements consumed by this iteration
  184.     ja .loop                        ; unsigned: stop at zero or on borrow
       ; NOTE(review): nothing in this macro jumps to .end
  185. .end:
  186.     REP_RET
  187. %endmacro
  188.  
  189. ;-----------------------------------------------------------------------------
  190. ; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
  191. ;-----------------------------------------------------------------------------
  192.  
  193. INIT_MMX mmx
       ; MMX and SSE2 builds of ff_ac3_lshift_int16: psllw shifts each 16-bit
       ; word left.
  194. AC3_SHIFT l, 16, psllw
  195. INIT_XMM sse2
  196. AC3_SHIFT l, 16, psllw
  197.  
  198. ;-----------------------------------------------------------------------------
  199. ; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
  200. ;-----------------------------------------------------------------------------
  201.  
       ; MMX and SSE2 builds of ff_ac3_rshift_int32: psrad is an arithmetic
       ; (sign-preserving) right shift of each 32-bit dword.
  202. INIT_MMX mmx
  203. AC3_SHIFT r, 32, psrad
  204. INIT_XMM sse2
  205. AC3_SHIFT r, 32, psrad
  206.  
  207. ;-----------------------------------------------------------------------------
  208. ; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
  209. ;-----------------------------------------------------------------------------
  210.  
  211. ; The 3DNow! version is not bit-identical because pf2id uses truncation rather
  212. ; than round-to-nearest.
       ; Multiplies each float in src by 2^24, converts it to int32 and stores it
       ; in dst.  Eight floats (32 bytes) are handled per iteration, two per MMX
       ; register -- presumably len is a multiple of 8; confirm against callers.
  213. INIT_MMX 3dnow
  214. cglobal float_to_fixed24, 3, 3, 0, dst, src, len
  215.     movq   m0, [pf_1_24]            ; two copies of 16777216.0f
  216. .loop:
  217.     movq   m1, [srcq   ]
  218.     movq   m2, [srcq+8 ]
  219.     movq   m3, [srcq+16]
  220.     movq   m4, [srcq+24]
  221.     pfmul  m1, m0
  222.     pfmul  m2, m0
  223.     pfmul  m3, m0
  224.     pfmul  m4, m0
  225.     pf2id  m1, m1
  226.     pf2id  m2, m2
  227.     pf2id  m3, m3
  228.     pf2id  m4, m4
  229.     movq  [dstq   ], m1
  230.     movq  [dstq+8 ], m2
  231.     movq  [dstq+16], m3
  232.     movq  [dstq+24], m4
  233.     add  srcq, 32
  234.     add  dstq, 32
  235.     sub  lend, 8                    ; eight floats done
  236.     ja .loop                        ; unsigned: stop at zero or on borrow
  237.     femms                           ; leave MMX/3DNow! state before returning
  238.     RET
  239.  
  240. INIT_XMM sse
       ; SSE build of ff_float_to_fixed24: scales each float in src by 2^24 and
       ; stores the converted int32 in dst, eight floats (32 bytes) per
       ; iteration.  The multiply runs in XMM registers (m0-m2); the conversion
       ; uses cvtps2pi, which writes two results at a time into the MMX registers
       ; mm0-mm3, hence the emms before returning.
       ; Loads use movaps -- presumably src is 16-byte aligned and len is a
       ; multiple of 8; confirm against the callers.
  241. cglobal float_to_fixed24, 3, 3, 3, dst, src, len
  242.     movaps     m0, [pf_1_24]        ; four copies of 16777216.0f
  243. .loop:
  244.     movaps     m1, [srcq   ]
  245.     movaps     m2, [srcq+16]
  246.     mulps      m1, m0
  247.     mulps      m2, m0
  248.     cvtps2pi  mm0, m1               ; floats 0-1
  249.     movhlps    m1, m1               ; move the high pair down
  250.     cvtps2pi  mm1, m1               ; floats 2-3
  251.     cvtps2pi  mm2, m2               ; floats 4-5
  252.     movhlps    m2, m2               ; move the high pair down
  253.     cvtps2pi  mm3, m2               ; floats 6-7
  254.     movq  [dstq   ], mm0
  255.     movq  [dstq+ 8], mm1
  256.     movq  [dstq+16], mm2
  257.     movq  [dstq+24], mm3
  258.     add      srcq, 32
  259.     add      dstq, 32
  260.     sub      lend, 8                ; eight floats done
  261.     ja .loop                        ; unsigned: stop at zero or on borrow
  262.     emms                            ; MMX registers were used for the results
  263.     RET
  264.  
  265. INIT_XMM sse2
       ; SSE2 build of ff_float_to_fixed24: scales each float in src by 2^24,
       ; converts it with cvtps2dq and stores the int32 result in dst.
       ; When m8 is defined the loop is unrolled over eight vectors (32 floats,
       ; 128 bytes) per iteration, otherwise over four (16 floats, 64 bytes).
       ; All accesses are aligned (movaps/movdqa) -- presumably src and dst are
       ; 16-byte aligned and len is a multiple of the unroll step; confirm
       ; against the callers.
       ; len is an unsigned int, so the counter is updated through its 32-bit
       ; view (lend), exactly as the 3dnow and sse versions do; the upper half of
       ; the 64-bit argument register is not guaranteed to be zero.
  266. cglobal float_to_fixed24, 3, 3, 9, dst, src, len
  267.     movaps     m0, [pf_1_24]        ; four copies of 16777216.0f
  268. .loop:
  269.     movaps     m1, [srcq    ]
  270.     movaps     m2, [srcq+16 ]
  271.     movaps     m3, [srcq+32 ]
  272.     movaps     m4, [srcq+48 ]
  273. %ifdef m8
  274.     movaps     m5, [srcq+64 ]
  275.     movaps     m6, [srcq+80 ]
  276.     movaps     m7, [srcq+96 ]
  277.     movaps     m8, [srcq+112]
  278. %endif
  279.     mulps      m1, m0
  280.     mulps      m2, m0
  281.     mulps      m3, m0
  282.     mulps      m4, m0
  283. %ifdef m8
  284.     mulps      m5, m0
  285.     mulps      m6, m0
  286.     mulps      m7, m0
  287.     mulps      m8, m0
  288. %endif
  289.     cvtps2dq   m1, m1
  290.     cvtps2dq   m2, m2
  291.     cvtps2dq   m3, m3
  292.     cvtps2dq   m4, m4
  293. %ifdef m8
  294.     cvtps2dq   m5, m5
  295.     cvtps2dq   m6, m6
  296.     cvtps2dq   m7, m7
  297.     cvtps2dq   m8, m8
  298. %endif
  299.     movdqa  [dstq    ], m1
  300.     movdqa  [dstq+16 ], m2
  301.     movdqa  [dstq+32 ], m3
  302.     movdqa  [dstq+48 ], m4
  303. %ifdef m8
  304.     movdqa  [dstq+64 ], m5
  305.     movdqa  [dstq+80 ], m6
  306.     movdqa  [dstq+96 ], m7
  307.     movdqa  [dstq+112], m8
  308.     add      srcq, 128
  309.     add      dstq, 128
  310.     sub      lend, 32               ; 32 floats done (8 vectors)
  311. %else
  312.     add      srcq, 64
  313.     add      dstq, 64
  314.     sub      lend, 16               ; 16 floats done (4 vectors)
  315. %endif
  316.     ja .loop                        ; unsigned: stop at zero or on borrow
  317.     REP_RET
  318.  
  319. ;------------------------------------------------------------------------------
  320. ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
  321. ;------------------------------------------------------------------------------
  322.  
  323. %macro PHADDD4 2 ; xmm src, xmm tmp
       ; Horizontal add: leaves the sum of the four dwords of the first operand in
       ; its low dword.  The other dwords of %1 and all of %2 are left holding
       ; intermediate values.
  324.     movhlps  %2, %1                 ; tmp dwords 1:0 = src dwords 3:2
  325.     paddd    %1, %2                 ; low two dwords = pairwise sums
  326.     pshufd   %2, %1, 0x1            ; tmp dword 0 = src dword 1
  327.     paddd    %1, %2                 ; dword 0 = total
  328. %endmacro
  329.  
  330. INIT_XMM sse2
       ; Returns in eax the sum of two partial results computed from the 6 rows of
       ; 16 uint16 counters in mant_cnt (each row is 32 bytes):
       ;   1) every column summed over the six rows, multiplied word-by-word with
       ;      the external ac3_bap_bits table and added up;
       ;   2) counters 1..4 of every row, scaled per row by pw_bap_mul1
       ;      (pmulhuw), summed over the rows, weighted by pw_bap_mul2 and added up.
       ; The row loads use movdqa/paddw memory operands, so mant_cnt is presumably
       ; 16-byte aligned -- confirm against the callers.
  331. cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
       ; m0 accumulates the first 16 bytes of each row, m1 the second 16 bytes
  332.     movdqa      m0, [mant_cntq      ]
  333.     movdqa      m1, [mant_cntq+ 1*16]
  334.     paddw       m0, [mant_cntq+ 2*16]
  335.     paddw       m1, [mant_cntq+ 3*16]
  336.     paddw       m0, [mant_cntq+ 4*16]
  337.     paddw       m1, [mant_cntq+ 5*16]
  338.     paddw       m0, [mant_cntq+ 6*16]
  339.     paddw       m1, [mant_cntq+ 7*16]
  340.     paddw       m0, [mant_cntq+ 8*16]
  341.     paddw       m1, [mant_cntq+ 9*16]
  342.     paddw       m0, [mant_cntq+10*16]
  343.     paddw       m1, [mant_cntq+11*16]
  344.     pmaddwd     m0, [ac3_bap_bits   ]
  345.     pmaddwd     m1, [ac3_bap_bits+16]
  346.     paddd       m0, m1
  347.     PHADDD4     m0, m1
  348.     movd      sumd, m0              ; first partial result
  349.     movdqa      m3, [pw_bap_mul1]
       ; +2 skips counter 0 of a row; each 8-byte load picks up counters 1..4.
       ; Two rows share one register (even row in the high half, odd in the low).
  350.     movhpd      m0, [mant_cntq     +2]
  351.     movlpd      m0, [mant_cntq+1*32+2]
  352.     movhpd      m1, [mant_cntq+2*32+2]
  353.     movlpd      m1, [mant_cntq+3*32+2]
  354.     movhpd      m2, [mant_cntq+4*32+2]
  355.     movlpd      m2, [mant_cntq+5*32+2]
  356.     pmulhuw     m0, m3              ; per-row scaling, before the rows are summed
  357.     pmulhuw     m1, m3
  358.     pmulhuw     m2, m3
  359.     paddusw     m0, m1              ; saturating unsigned word adds
  360.     paddusw     m0, m2
  361.     pmaddwd     m0, [pw_bap_mul2]
  362.     PHADDD4     m0, m1
  363.     movd       eax, m0              ; second partial result
  364.     add        eax, sumd
  365.     RET
  366.  
  367. ;------------------------------------------------------------------------------
  368. ; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
  369. ;------------------------------------------------------------------------------
  370.  
  371. %macro PABSD 1-2 ; src/dst, unused
       ; Packed absolute value of the signed dwords in %1, in place.  With SSSE3
       ; the second argument is ignored; otherwise it is overwritten as scratch.
  372. %if cpuflag(ssse3)
  373.     pabsd    %1, %1
  374. %else ; src/dst, tmp
  375.     pxor     %2, %2
  376.     pcmpgtd  %2, %1                 ; tmp = all ones in lanes where src < 0
  377.     pxor     %1, %2                 ; invert the negative lanes ...
  378.     psubd    %1, %2                 ; ... and add 1 to them (subtracting -1)
  379. %endif
  380. %endmacro
  381.  
  382. %macro AC3_EXTRACT_EXPONENTS 0
       ; Defines ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef,
       ; int nb_coefs): for each coefficient c, stores
       ;     151 - (biased float exponent of (|c| << 1 | 1))
       ; clamped to 0..255, as one byte in exp.  Four coefficients are handled per
       ; iteration with an aligned mova load -- presumably coef is 16-byte aligned
       ; and nb_coefs is a multiple of 4; confirm against the callers.
  383. cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
       ; nb_coefs is a C int but is used below as a full-width index, so it is
       ; sign-extended from its 32-bit view first; the upper half of the argument
       ; register is not guaranteed to be clean.  The macro emits nothing when
       ; lenq and lend name the same register (32-bit builds).
           movsxdifnidn lenq, lend
       ; point both arrays at their ends and count a negative index up to zero
  384.     add     expq, lenq
  385.     lea    coefq, [coefq+4*lenq]
  386.     neg     lenq
  387.     mova      m2, [pd_1]
  388.     mova      m3, [pd_151]
  389. .loop:
  390.     ; move 4 32-bit coefs to xmm0
  391.     mova      m0, [coefq+4*lenq]
  392.     ; absolute value
  393.     PABSD     m0, m1
  394.     ; convert to float and extract exponents
  395.     pslld     m0, 1
  396.     por       m0, m2                ; force bit 0 so a zero coefficient becomes 1
  397.     cvtdq2ps  m1, m0
  398.     psrld     m1, 23                ; drop the mantissa bits, leaving the exponent field
  399.     mova      m0, m3
  400.     psubd     m0, m1                ; 151 - exponent field
  401.     ; move the lowest byte in each of 4 dwords to the low dword
  402.     ; NOTE: We cannot just extract the low bytes with pshufb because the dword
  403.     ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
  404.     ;       clips this to 0, which is the correct exponent.
  405.     packssdw  m0, m0
  406.     packuswb  m0, m0
  407.     movd  [expq+lenq], m0           ; four result bytes
  408.  
  409.     add     lenq, 4
  410.     jl .loop                        ; index still negative: more coefficients
  411.     REP_RET
  412. %endmacro
  413.  
  414. %if HAVE_SSE2_EXTERNAL
       ; SSE2 build: PABSD takes its pxor/pcmpgtd/psubd fallback path
  415. INIT_XMM sse2
  416. AC3_EXTRACT_EXPONENTS
  417. %endif
  418. %if HAVE_SSSE3_EXTERNAL
       ; SSSE3 build: PABSD becomes a single pabsd
  419. INIT_XMM ssse3
  420. AC3_EXTRACT_EXPONENTS
  421. %endif
  422.