;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

%macro CVTPS2PI 2
%if cpuflag(sse)
    cvtps2pi %1, %2
%elif cpuflag(3dnow)
    pf2id %1, %2
%endif
%endmacro
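
; CVTPS2PI converts two packed single-precision floats into two packed 32-bit
; integers: cvtps2pi on SSE, pf2id on 3DNow!. Note the two are not guaranteed
; to round identically: pf2id truncates toward zero, while cvtps2pi rounds
; according to MXCSR (round-to-nearest by default). Roughly, per element in C
; (a sketch, not part of the original source):
;
;     /* SSE:    */ int32_t y = (int32_t)lrintf(x);  /* MXCSR rounding */
;     /* 3DNow!: */ int32_t y = (int32_t)x;          /* truncation     */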

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul, int len);
;------------------------------------------------------------------------------
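; Scalar reference for the routine below, as a C sketch (inferred from the
; code, not part of the original source):
;
;     void int32_to_float_fmul_scalar(float *dst, const int32_t *src,
;                                     float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }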
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    SWAP 0, 2                     ; mul arrives in xmm2 on WIN64; rename it to m0
%elif ARCH_X86_32
    movss   m0, mulm              ; on x86_32 the float argument is on the stack
%endif
    SPLATD  m0                    ; broadcast mul to all lanes
    shl     lenq, 2               ; len in bytes (len * sizeof(int32_t))
    add     srcq, lenq            ; point past the end, index via negative lenq
    add     dstq, lenq
    neg     lenq
.loop:
%if cpuflag(sse2)
    cvtdq2ps  m1, [srcq+lenq   ]
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    mulps     m1, m0
    mulps     m2, m0
    mova  [dstq+lenq   ], m1
    mova  [dstq+lenq+16], m2
    add     lenq, 32              ; 8 samples per iteration
    jl .loop
    REP_RET
%endmacro

; the numeric argument is the number of XMM registers the function uses
; (x86inc needs it for the WIN64 prologue/epilogue)
INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3


;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
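; Scalar reference, as a C sketch (inferred from the code, not part of the
; original source); conversion rounds per the active rounding mode, and
; packssdw saturates results to the int16 range:
;
;     void float_to_int16(int16_t *dst, const float *src, long len)
;     {
;         for (long i = 0; i < len; i++) {
;             long v = lrintf(src[i]);
;             dst[i] = v < -32768 ? -32768 : v > 32767 ? 32767 : v;
;         }
;     }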
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
    add       lenq, lenq          ; len in output bytes (len * sizeof(int16_t))
    lea       srcq, [srcq+2*lenq] ; point past the end of src/dst,
    add       dstq, lenq          ; then index backwards via negative lenq
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq    m0, [srcq+2*lenq   ]
    cvtps2dq    m1, [srcq+2*lenq+16]
    packssdw    m0, m1            ; saturating pack: 8x int32 -> 8x int16
    mova  [dstq+lenq], m0
%else
    CVTPS2PI    m0, [srcq+2*lenq   ]
    CVTPS2PI    m1, [srcq+2*lenq+ 8]
    CVTPS2PI    m2, [srcq+2*lenq+16]
    CVTPS2PI    m3, [srcq+2*lenq+24]
    packssdw    m0, m1
    packssdw    m2, m3
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m2
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms                          ; clear MMX state so the x87 FPU is usable
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0

;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
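; Same conversion as ff_float_to_int16, but consecutive samples land 'step'
; int16 slots apart in dst. A scalar C sketch (inferred from the code, not
; part of the original source):
;
;     void float_to_int16_step(int16_t *dst, const float *src,
;                              long len, long step)
;     {
;         for (long i = 0; i < len; i++) {
;             long v = lrintf(src[i]);
;             dst[i * step] = v < -32768 ? -32768 : v > 32767 ? 32767 : v;
;         }
;     }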
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    lea     step3q, [stepq*3]     ; step*3, needed for the strided stores
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq    m0, [srcq+2*lenq   ]
    cvtps2dq    m1, [srcq+2*lenq+16]
    packssdw    m0, m1
    ; scatter the packed words to dst with a stride of 'step' samples,
    ; extracting them pairwise through GPRs v1/v2
    movd       v1d, m0
    psrldq      m0, 4
    movd       v2d, m0
    psrldq      m0, 4
    mov     [dstq], v1w
    mov  [dstq+stepq*4], v2w
    shr        v1d, 16
    shr        v2d, 16
    mov  [dstq+stepq*2], v1w
    mov  [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd       v1d, m0
    psrldq      m0, 4
    movd       v2d, m0
    mov     [dstq], v1w
    mov  [dstq+stepq*4], v2w
    shr        v1d, 16
    shr        v2d, 16
    mov  [dstq+stepq*2], v1w
    mov  [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%else
    CVTPS2PI    m0, [srcq+2*lenq   ]
    CVTPS2PI    m1, [srcq+2*lenq+ 8]
    CVTPS2PI    m2, [srcq+2*lenq+16]
    CVTPS2PI    m3, [srcq+2*lenq+24]
    packssdw    m0, m1
    packssdw    m2, m3
    movd       v1d, m0
    psrlq       m0, 32
    movd       v2d, m0
    mov     [dstq], v1w
    mov  [dstq+stepq*4], v2w
    shr        v1d, 16
    shr        v2d, 16
    mov  [dstq+stepq*2], v1w
    mov  [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd       v1d, m2
    psrlq       m2, 32
    movd       v2d, m2
    mov     [dstq], v1w
    mov  [dstq+stepq*4], v2w
    shr        v1d, 16
    shr        v2d, 16
    mov  [dstq+stepq*2], v1w
    mov  [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0

;------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;------------------------------------------------------------------------------
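; Interleaves two float channels into one int16 stream. A scalar C sketch
; (inferred from the code, not part of the original source):
;
;     void float_to_int16_interleave2(int16_t *dst, const float **src, long len)
;     {
;         for (long i = 0; i < len; i++) {
;             long l = lrintf(src[0][i]), r = lrintf(src[1][i]);
;             dst[2*i]     = l < -32768 ? -32768 : l > 32767 ? 32767 : l;
;             dst[2*i + 1] = r < -32768 ? -32768 : r > 32767 ? 32767 : r;
;         }
;     }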
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
    lea      lenq, [4*r2q]        ; the third C arg (len) arrives in r2
    mov     src1q, [src0q+gprsize]
    mov     src0q, [src0q]
    add      dstq, lenq           ; point past the end, index backwards
    add     src0q, lenq
    add     src1q, lenq
    neg      lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq   m0, [src0q+lenq]
    cvtps2dq   m1, [src1q+lenq]
    packssdw   m0, m1             ; low half: channel 0, high half: channel 1
    movhlps    m1, m0
    punpcklwd  m0, m1             ; interleave the two channels word by word
    mova  [dstq+lenq], m0
%else
    CVTPS2PI   m0, [src0q+lenq  ]
    CVTPS2PI   m1, [src0q+lenq+8]
    CVTPS2PI   m2, [src1q+lenq  ]
    CVTPS2PI   m3, [src1q+lenq+8]
    packssdw   m0, m1
    packssdw   m2, m3
    mova       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m1
%endif
    add      lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2

;------------------------------------------------------------------------------
; void ff_float_to_int16_interleave6(int16_t *dst, const float **src, int len);
;------------------------------------------------------------------------------
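; A scalar C sketch of the behavior (inferred from the code, not part of the
; original source; saturate_int16() is a hypothetical clamp to [-32768,32767]):
;
;     void float_to_int16_interleave6(int16_t *dst, const float **src, int len)
;     {
;         for (int i = 0; i < len; i++)
;             for (int c = 0; c < 6; c++)
;                 dst[6*i + c] = saturate_int16(lrintf(src[c][i]));
;     }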
%macro FLOAT_TO_INT16_INTERLEAVE6 0
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    ; load the six channel pointers, then turn them into offsets from src[0]
    ; so a single incrementing pointer walks all channels
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    CVTPS2PI   mm0, [srcq]
    CVTPS2PI   mm1, [srcq+src1q]
    CVTPS2PI   mm2, [srcq+src2q]
    CVTPS2PI   mm3, [srcq+src3q]
    CVTPS2PI   mm4, [srcq+src4q]
    CVTPS2PI   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    PSWAPD     mm3, mm0           ; PSWAPD (x86util) reverses the two dwords
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    PSWAPD     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8                   ; 2 samples per channel per iteration
    add dstq, 24                  ; 2 * 6 output words
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6

;------------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;------------------------------------------------------------------------------
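; A scalar C sketch of the behavior (inferred from the code, not part of the
; original source):
;
;     void float_interleave6(float *dst, const float **src, unsigned int len)
;     {
;         for (unsigned int i = 0; i < len; i++)
;             for (int c = 0; c < 6; c++)
;                 dst[6*i + c] = src[c][i];
;     }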
%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    sub    src1q, srcq            ; channel pointers as offsets from src[0]
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
.loop:
%if cpuflag(sse)
    movaps    m0, [srcq]
    movaps    m1, [srcq+src1q]
    movaps    m2, [srcq+src2q]
    movaps    m3, [srcq+src3q]
    movaps    m4, [srcq+src4q]
    movaps    m5, [srcq+src5q]

    SBUTTERFLYPS 0, 1, 6          ; interleave float pairs of each register pair
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    movaps    m6, m4
    shufps    m4, m0, 0xe4
    movlhps   m0, m2
    movhlps   m6, m2
    movaps [dstq   ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    movaps    m6, m5
    shufps    m5, m1, 0xe4
    movlhps   m1, m3
    movhlps   m6, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq       m0, [srcq]
    movq       m1, [srcq+src1q]
    movq       m2, [srcq+src2q]
    movq       m3, [srcq+src3q]
    movq       m4, [srcq+src4q]
    movq       m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq [dstq   ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4       ; samples per channel per iteration
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7

;------------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;------------------------------------------------------------------------------
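; A scalar C sketch of the behavior (inferred from the code, not part of the
; original source):
;
;     void float_interleave2(float *dst, const float **src, unsigned int len)
;     {
;         for (unsigned int i = 0; i < len; i++) {
;             dst[2*i]     = src[0][i];
;             dst[2*i + 1] = src[1][i];
;         }
;     }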

%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
    mov     src1q, [srcq+gprsize]
    mov      srcq, [srcq        ]
    sub     src1q, srcq           ; second channel as an offset from the first
.loop:
    mova       m0, [srcq             ]
    mova       m1, [srcq+src1q       ]
    mova       m3, [srcq      +mmsize]
    mova       m4, [srcq+src1q+mmsize]

    mova       m2, m0
    PUNPCKLDQ  m0, m1             ; interleave low halves
    PUNPCKHDQ  m2, m1             ; interleave high halves

    mova       m1, m3
    PUNPCKLDQ  m3, m4
    PUNPCKHDQ  m1, m4

    mova  [dstq         ], m0
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m3
    mova  [dstq+3*mmsize], m1

    add      srcq, mmsize*2
    add      dstq, mmsize*4
    sub      lend, mmsize/2       ; samples per channel per iteration
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5