;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000
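; max_19bit_*: clip limit for 19-bit output (2^19 - 1), kept as packed int for
;              pminsd (sse4) and as packed float for minps (sse2/ssse3)
; minshort:    0x8000 bias that maps unsigned 16-bit samples into pmaddwd's
;              signed word range
; unicoeff:    0x8000 * 0x4000, i.e. the bias above times a coefficient sum of
;              1.0 in 14-bit fixed point, added back after the horizontal add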

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8 or 16 bits wide ($source_width
; can be 8, 9, 10, 12, 14 or 16; the difference is whether we have to downscale
; before multiplying). Filter coefficients are 14-bit. Output is either 15 bits
; (in int16_t) or 19 bits (in int32_t), as given by $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels; the position of the
; first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
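; As a rough guide to what each variant below computes, here is an illustrative
; scalar C sketch (hscale_ref and pix_t are made-up names for illustration and
; do not match the real swscale prototypes; the clipping mirrors the
; packssdw / pminsd paths at the end of the macro):
;
;   #include <stdint.h>
;   typedef uint8_t pix_t; /* uint16_t for the 9-16 bit input variants */
;
;   static void hscale_ref(int32_t *dst, int dstW, const pix_t *src,
;                          const int16_t *filter, const int32_t *filterPos,
;                          int filterSize, int srcBits, int dstBits)
;   {
;       for (int i = 0; i < dstW; i++) {
;           int64_t val = 0;
;           for (int j = 0; j < filterSize; j++)
;               val += (int64_t)src[filterPos[i] + j] * filter[i * filterSize + j];
;           val >>= 14 + srcBits - dstBits;     /* 14-bit filter coefficients */
;           if (dstBits == 19)                  /* int32_t output */
;               dst[i] = val > 0x7ffff ? 0x7ffff : val;
;           else                                /* int16_t output, saturated */
;               dst[i] = val > 0x7fff ? 0x7fff : val < -0x8000 ? -0x8000 : val;
;       }
;   }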

; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
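; (for example, "SCALE_FUNC 8, 15, 4, 4, 6, 6" emits hscale8to15_4 for the
;  instruction set selected by the preceding INIT_* directive)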
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd        wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elif cpuflag(sse4)
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl           wq, 1                         ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea      filterq, [filterq+wq*8]
%if %2 == 15
    lea         dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea         dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea      fltposq, [fltposq+wq*(4>>wshr)]
    neg           wq

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32      pos0q, dword [fltposq+wq*4+ 0]   ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+ 4]   ; filterPos[1]
    movlh         m0, [srcq+pos0q*srcmul]       ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32      pos0q, dword [fltposq+wq*4+ 8]   ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*4+12]   ; filterPos[3]
    movlh         m1, [srcq+pos0q*srcmul]       ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                        ; byte -> word
    punpcklbw     m1, m3                        ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
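    ; (for the 16-bit input path the identity used is
    ;  sum((x - 0x8000) * c) + 0x8000 * sum(c) == sum(x * c); unicoeff holds
    ;  0x8000 * 0x4000, i.e. the bias times a coefficient sum of 1.0 in
    ;  14-bit fixed point)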
    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elif notcpuflag(ssse3) ; sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m0, m1                        ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                                ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                                ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                                ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32      pos0q, dword [fltposq+wq*2+0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*2+4]    ; filterPos[1]
    movbh         m0, [srcq+ pos0q   *srcmul]   ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [srcq+(pos0q+4)*srcmul]   ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [srcq+(pos1q+4)*srcmul]   ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32      pos0q, dword [fltposq+wq*2+8]    ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*2+12]   ; filterPos[3]
    movbh         m4, [srcq+ pos0q   *srcmul]   ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [srcq+ pos1q   *srcmul]   ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                        ; byte -> word
    punpcklbw     m1, m3                        ; byte -> word
    punpcklbw     m4, m3                        ; byte -> word
    punpcklbw     m5, m3                        ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [filterq+wq*8+mmsize*2]   ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [filterq+wq*8+mmsize*3]   ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
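    ; (after these two interleave/add rounds each dword of m0 holds the full
    ;  8-tap sum for one of the four output pixels, in order)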
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                        ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                                ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                                ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                                ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd  fltsizeq, fltsized                  ; filterSize
    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
    mov      srcendq, pos0q
%endif ; x86-32/64
    lea      fltposq, [fltposq+wq*4]
%if %2 == 15
    lea         dstq, [dstq+wq*2]
%else ; %2 == 19
    lea         dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn  dstmp, dstq
    neg           wq

.loop:
    mov32      pos0q, dword [fltposq+wq*4+0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov         srcq, srcmemmp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add      filterq, mmsize
    add         srcq, srcmul*mmsize/2
    cmp         srcq, srcendq                   ; while (src += 4) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub        pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd          m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]
%endif ; %4 == X4

    lea      filterq, [filterq+(fltsizeq+dlt)*2]

%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn   dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
    PMINSD_MMX    m0, m2, m4
%elif cpuflag(sse4)
    pminsd        m0, m2
%else ; sse2/ssse3
    cvtdq2ps      m0, m0
    minps         m0, m2
    cvtps2dq      m0, m0
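    ; (sse2/ssse3 have no pminsd, so the clip is done on floats; 19-bit values
    ;  are exactly representable in single precision, so the cvtdq2ps/cvtps2dq
    ;  round trip is lossless)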
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
    mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           wq, (mmsize<<wshr)/4          ; both 8-tap and 4-tap really only do 4 pixels (or for mmx: 2 pixels)
                                                ; per iteration; see the "shl wq, 1" above for why we do this
%else ; %3 == X
    add           wq, 2
%endif ; %3 ==/!= X
    jl .loop
    REP_RET
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4,  6, %3
SCALE_FUNC %1, %2, 8, 8,  6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X,  7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro

; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 6, 7, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8
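; Each INIT_* / SCALE_FUNCS2 pair above emits one hscale<src>to<dst>_<size>
; entry point per combination (4-tap, 8-tap and the generic X/X4/X8 variants),
; with the instruction-set suffix appended by cglobal.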