;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
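; Scaling constants: 4.6566129e-10 is 2^-31 (int -> float), 2147483648.0 is
; 2^31 (float -> int32) and 32768.0 is 2^15 (float -> int16).
; word_unpack_shuf is a pshufb mask that gathers the even-indexed words into
; the low half of a register and the odd-indexed words into the high half
; (used by the ssse3 2-channel unpack to split interleaved 16-bit stereo).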
flt2pm31: times 8 dd 4.6566129e-10
flt2p31 : times 8 dd 2147483648.0
flt2p15 : times 8 dd 32768.0

word_unpack_shuf : db  0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15

SECTION .text


; Macro parameters (shared by the PACK_*/UNPACK_*/CONV macros below):
; %1 = to (dst sample format), %2 = from (src sample format), %3 = a/u
; (aligned/unaligned), %4 = log2 of dst sample size, %5 = log2 of src sample
; size, %6 = per-block conversion macro, %7 = init macro loading the constant
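; PACK_2CH: interleave two planar input channels into one interleaved output
; buffer, converting the sample format on the fly.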
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
    mov src2q   , [srcq+gprsize]
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    lea     srcq , [srcq  + (1<<%5)*lenq]
    lea     src2q, [src2q + (1<<%5)*lenq]
    lea     dstq , [dstq  + (2<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mova      m1, m0
    mov%3     m2, [         src2q+(1<<%5)*lenq]
%if %5 == 1
    punpcklwd m0, m2
    punpckhwd m1, m2
%else
    punpckldq m0, m2
    punpckhdq m1, m2
%endif
    %6 m0,m1,m2,m3,m4,m5
%else
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mov%3     m1, [mmsize + srcq +(1<<%5)*lenq]
    mov%3     m2, [         src2q+(1<<%5)*lenq]
    mov%3     m3, [mmsize + src2q+(1<<%5)*lenq]
    %6 m0,m1,m2,m3,m4,m5
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    SWAP 1,2
%endif
    mov%3 [           dstq+(2<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(2<<%4)*lenq], m1
%if %4 > %5
    mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
    add lenq, 4*mmsize/(2<<%4)
%else
    add lenq, 2*mmsize/(2<<%4)
%endif
        jl .next
    REP_RET
%endmacro

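; UNPACK_2CH: split one interleaved 2-channel input buffer into two planar
; output buffers, converting the sample format on the fly.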
%macro UNPACK_2CH 5-7
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
    mov dst2q   , [dstq+gprsize]
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test dst2q, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    lea     srcq , [srcq  + (2<<%5)*lenq]
    lea     dstq , [dstq  + (1<<%4)*lenq]
    lea     dst2q, [dst2q + (1<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
    mova      m6, [word_unpack_shuf]
.next:
    mov%3     m0, [           srcq +(2<<%5)*lenq]
    mov%3     m2, [  mmsize + srcq +(2<<%5)*lenq]
%if %5 == 1
%ifidn SUFFIX, _ssse3
    pshufb    m0, m6
    mova      m1, m0
    pshufb    m2, m6
    punpcklqdq m0,m2
    punpckhqdq m1,m2
%else
    mova      m1, m0
    punpcklwd m0,m2
    punpckhwd m1,m2

    mova      m2, m0
    punpcklwd m0,m1
    punpckhwd m2,m1

    mova      m1, m0
    punpcklwd m0,m2
    punpckhwd m1,m2
%endif
%else
    mova      m1, m0
    shufps    m0, m2, 10001000b
    shufps    m1, m2, 11011101b
%endif
%if %4 < %5
    mov%3     m2, [2*mmsize + srcq +(2<<%5)*lenq]
    mova      m3, m2
    mov%3     m4, [3*mmsize + srcq +(2<<%5)*lenq]
    shufps    m2, m4, 10001000b
    shufps    m3, m4, 11011101b
    SWAP 1,2
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3 [           dstq+(1<<%4)*lenq], m0
%if %4 > %5
    mov%3 [          dst2q+(1<<%4)*lenq], m2
    mov%3 [ mmsize +  dstq+(1<<%4)*lenq], m1
    mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3
    add lenq, 2*mmsize/(1<<%4)
%else
    mov%3 [          dst2q+(1<<%4)*lenq], m1
    add lenq, mmsize/(1<<%4)
%endif
        jl .next
    REP_RET
%endmacro

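; CONV: straight sample-format conversion of a single contiguous buffer,
; no channel packing or unpacking.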
%macro CONV 5-7
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne %2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne %2_to_%1_u_int %+ SUFFIX
%else
%2_to_%1_u_int %+ SUFFIX:
%endif
    lea     srcq , [srcq  + (1<<%5)*lenq]
    lea     dstq , [dstq  + (1<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
    mov%3     m0, [           srcq +(1<<%5)*lenq]
    mov%3     m1, [  mmsize + srcq +(1<<%5)*lenq]
%if %4 < %5
    mov%3     m2, [2*mmsize + srcq +(1<<%5)*lenq]
    mov%3     m3, [3*mmsize + srcq +(1<<%5)*lenq]
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3 [           dstq+(1<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(1<<%4)*lenq], m1
%if %4 > %5
    mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3
    add lenq, 4*mmsize/(1<<%4)
%else
    add lenq, 2*mmsize/(1<<%4)
%endif
        jl .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

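; PACK_6CH: interleave six planar input channels into one interleaved output
; buffer (mmsize/4 samples per channel per iteration), with optional format
; conversion.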
%macro PACK_6CH 5-7
cglobal pack_6ch_%2_to_%1_%3, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    mov     dstq, [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src1q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src3q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src4q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src5q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
    %7 x,x,x,x,m7,x
.loop:
    mov%3     m0, [srcq      ]
    mov%3     m1, [srcq+src1q]
    mov%3     m2, [srcq+src2q]
    mov%3     m3, [srcq+src3q]
    mov%3     m4, [srcq+src4q]
    mov%3     m5, [srcq+src5q]
%if cpuflag(sse)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

%if cpuflag(avx)
    blendps   m6, m4, m0, 1100b
%else
    movaps    m6, m4
    shufps    m4, m0, q3210
    SWAP 4,6
%endif
    movlhps   m0, m2
    movhlps   m4, m2
%if cpuflag(avx)
    blendps   m2, m5, m1, 1100b
%else
    movaps    m2, m5
    shufps    m5, m1, q3210
    SWAP 2,5
%endif
    movlhps   m1, m3
    movhlps   m5, m3

    %6 m0,m6,x,x,m7,m3
    %6 m4,m1,x,x,m7,m3
    %6 m2,m5,x,x,m7,m3

    mov %+ %3 %+ ps [dstq   ], m0
    mov %+ %3 %+ ps [dstq+16], m6
    mov %+ %3 %+ ps [dstq+32], m4
    mov %+ %3 %+ ps [dstq+48], m1
    mov %+ %3 %+ ps [dstq+64], m2
    mov %+ %3 %+ ps [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq   [dstq   ], m0
    movq   [dstq+ 8], m2
    movq   [dstq+16], m4
    movq   [dstq+24], m1
    movq   [dstq+32], m3
    movq   [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

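; UNPACK_6CH: split one interleaved 6-channel input buffer into six planar
; output buffers, with optional format conversion.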
%macro UNPACK_6CH 5-7
cglobal unpack_6ch_%2_to_%1_%3, 2, 8, 8, dst, src, dst1, dst2, dst3, dst4, dst5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    dst1q, [dstq+1*gprsize]
    mov    dst2q, [dstq+2*gprsize]
    mov    dst3q, [dstq+3*gprsize]
    mov    dst4q, [dstq+4*gprsize]
    mov    dst5q, [dstq+5*gprsize]
    mov     dstq, [dstq]
    mov     srcq, [srcq]
%ifidn %3, a
    test dstq, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst1q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst2q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst3q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst4q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst5q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    sub    dst1q, dstq
    sub    dst2q, dstq
    sub    dst3q, dstq
    sub    dst4q, dstq
    sub    dst5q, dstq
    %7 x,x,x,x,m7,x
.loop:
    mov%3     m0, [srcq   ]
    mov%3     m1, [srcq+16]
    mov%3     m2, [srcq+32]
    mov%3     m3, [srcq+48]
    mov%3     m4, [srcq+64]
    mov%3     m5, [srcq+80]

    SBUTTERFLYPS 0, 3, 6
    SBUTTERFLYPS 1, 4, 6
    SBUTTERFLYPS 2, 5, 6
    SBUTTERFLYPS 0, 4, 6
    SBUTTERFLYPS 3, 2, 6
    SBUTTERFLYPS 1, 5, 6
    SWAP 1, 4
    SWAP 2, 3

    %6 m0,m1,x,x,m7,m6
    %6 m2,m3,x,x,m7,m6
    %6 m4,m5,x,x,m7,m6

    mov %+ %3 %+ ps [dstq      ], m0
    mov %+ %3 %+ ps [dstq+dst1q], m1
    mov %+ %3 %+ ps [dstq+dst2q], m2
    mov %+ %3 %+ ps [dstq+dst3q], m3
    mov %+ %3 %+ ps [dstq+dst4q], m4
    mov %+ %3 %+ ps [dstq+dst5q], m5

    add      srcq, mmsize*6
    add      dstq, mmsize
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

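; PACK_8CH: interleave eight planar input channels into one interleaved
; output buffer, with optional format conversion.  PACK_8CH_GPRS is the
; number of general purpose registers passed to cglobal; on x86-32 some of
; the source pointers are kept in stack slots instead of registers.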
%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)

%macro PACK_8CH 5-7
cglobal pack_8ch_%2_to_%1_%3, 2,PACK_8CH_GPRS,10, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
    mov     dstq, [dstq]
%if ARCH_X86_32
    DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
    %define lend dword r2m
    %define src1q r0q
    %define src1m dword [rsp+32]
%if HAVE_ALIGNED_STACK == 0
    DEFINE_ARGS dst, src, src2, src3, src5, src6
    %define src4q r0q
    %define src4m dword [rsp+36]
%endif
    %define src7q r0q
    %define src7m dword [rsp+40]
    mov     dstm, dstq
%endif
    mov    src7q, [srcq+7*gprsize]
    mov    src6q, [srcq+6*gprsize]
%if ARCH_X86_32
    mov    src7m, src7q
%endif
    mov    src5q, [srcq+5*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src3q, [srcq+3*gprsize]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov    src4m, src4q
%endif
    mov    src2q, [srcq+2*gprsize]
    mov    src1q, [srcq+1*gprsize]
    mov     srcq, [srcq]
%ifidn %3, a
%if ARCH_X86_32
    test dstmp, mmsize-1
%else
    test dstq, mmsize-1
%endif
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src1q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src3q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    test src4m, mmsize-1
%else
    test src4q, mmsize-1
%endif
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src5q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src6q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32
    test src7m, mmsize-1
%else
    test src7q, mmsize-1
%endif
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_8ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
%if ARCH_X86_64 || HAVE_ALIGNED_STACK
    sub    src4q, srcq
%else
    sub    src4m, srcq
%endif
    sub    src5q, srcq
    sub    src6q, srcq
%if ARCH_X86_64
    sub    src7q, srcq
%else
    mov src1m, src1q
    sub src7m, srcq
%endif

%if ARCH_X86_64
    %7 x,x,x,x,m9,x
%elifidn %1, int32
    %define m9 [flt2p31]
%else
    %define m9 [flt2pm31]
%endif

.loop:
    mov%3     m0, [srcq      ]
    mov%3     m1, [srcq+src1q]
    mov%3     m2, [srcq+src2q]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov    src4q, src4m
%endif
    mov%3     m3, [srcq+src3q]
    mov%3     m4, [srcq+src4q]
    mov%3     m5, [srcq+src5q]
%if ARCH_X86_32
    mov    src7q, src7m
%endif
    mov%3     m6, [srcq+src6q]
    mov%3     m7, [srcq+src7q]

%if ARCH_X86_64
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8

    %6 m0,m1,x,x,m9,m8
    %6 m2,m3,x,x,m9,m8
    %6 m4,m5,x,x,m9,m8
    %6 m6,m7,x,x,m9,m8

    mov%3 [dstq], m0
%else
    mov     dstq, dstm

    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1

    %6 m0,m1,x,x,m9,m2
    mova     m2, [rsp]
    mov%3   [dstq], m0
    %6 m2,m3,x,x,m9,m0
    %6 m4,m5,x,x,m9,m0
    %6 m6,m7,x,x,m9,m0

%endif

    mov%3 [dstq+16],  m1
    mov%3 [dstq+32],  m2
    mov%3 [dstq+48],  m3
    mov%3 [dstq+64],  m4
    mov%3 [dstq+80],  m5
    mov%3 [dstq+96],  m6
    mov%3 [dstq+112], m7

    add      srcq, mmsize
    add      dstq, mmsize*8
%if ARCH_X86_32
    mov      dstm, dstq
    mov      src1q, src1m
%endif
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

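; Per-block conversion kernels and their *_INIT companions.  The *_N macros
; convert samples already loaded into mmx/xmm/ymm registers; the *_INIT macros
; load the scaling constant used by the corresponding kernel.  FLOAT_TO_INT32_N
; uses cmpps/paddd so that inputs >= 1.0 (which cvtps2dq turns into 0x80000000)
; end up at INT32_MAX instead of wrapping.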
%macro INT16_TO_INT32_N 6
    pxor      m2, m2
    pxor      m3, m3
    punpcklwd m2, m1
    punpckhwd m3, m1
    SWAP 4,0
    pxor      m0, m0
    pxor      m1, m1
    punpcklwd m0, m4
    punpckhwd m1, m4
%endmacro

%macro INT32_TO_INT16_N 6
    psrad     m0, 16
    psrad     m1, 16
    psrad     m2, 16
    psrad     m3, 16
    packssdw  m0, m1
    packssdw  m2, m3
    SWAP 1,2
%endmacro

%macro INT32_TO_FLOAT_INIT 6
    mova      %5, [flt2pm31]
%endmacro
%macro INT32_TO_FLOAT_N 6
    cvtdq2ps  %1, %1
    cvtdq2ps  %2, %2
    mulps %1, %1, %5
    mulps %2, %2, %5
%endmacro

%macro FLOAT_TO_INT32_INIT 6
    mova      %5, [flt2p31]
%endmacro
%macro FLOAT_TO_INT32_N 6
    mulps %1, %5
    mulps %2, %5
    cvtps2dq  %6, %1
    cmpps %1, %1, %5, 5
    paddd %1, %6
    cvtps2dq  %6, %2
    cmpps %2, %2, %5, 5
    paddd %2, %6
%endmacro

%macro INT16_TO_FLOAT_INIT 6
    mova      m5, [flt2pm31]
%endmacro
%macro INT16_TO_FLOAT_N 6
    INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    mulps m0, m0, m5
    mulps m1, m1, m5
    mulps m2, m2, m5
    mulps m3, m3, m5
%endmacro

%macro FLOAT_TO_INT16_INIT 6
    mova      m5, [flt2p15]
%endmacro
%macro FLOAT_TO_INT16_N 6
    mulps m0, m5
    mulps m1, m5
    mulps m2, m5
    mulps m3, m5
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    packssdw  m0, m1
    cvtps2dq  m1, m2
    cvtps2dq  m3, m3
    packssdw  m1, m3
%endmacro

%macro NOP_N 0-6
%endmacro

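; Instantiate the macros above for each supported instruction set, in both
; unaligned (u) and aligned (a) variants.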
INIT_MMX mmx
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

INIT_XMM sse
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

INIT_XMM sse2
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

PACK_8CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_8CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

PACK_8CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_8CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

INIT_YMM avx
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
%endif