Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* MMX optimized discrete wavelet transform
  3. ;* Copyright (c) 2010 David Conrad
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21.  
  22. %include "libavutil/x86/x86util.asm"
  23.  
  24. SECTION_RODATA
  25. pw_1991: times 4 dw 9,-1        ; 8 words alternating 9,-1: pmaddwd weights yielding 9*a - b for each interleaved (a,b) word pair
  26.  
  27. cextern pw_1                    ; pw_N are external symbols; presumably vectors of words all equal to N - confirm at their definition
  28. cextern pw_2
  29. cextern pw_8
  30. cextern pw_16
  31.  
  32. section .text
  33.  
  34. ; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2 (the rounding term); %2 is clobbered, %3 and %4 are only read
  35. %macro COMPOSE_53iL0 4
  36.     paddw   %2, %3              ; %2 = %2 + %3 on packed 16-bit words
  37.     paddw   %2, %4              ; add the rounding constant passed in %4
  38.     psraw   %2, 2               ; arithmetic shift right by 2 on every word
  39.     psubw   %1, %2              ; subtract the shifted sum from %1; result stays in %1
  40. %endm
  41.  
  42. ; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4      (result is returned in m1)
  43. ; if %4 is supplied, %1 is loaded unaligned from there
  44. ; m2: clobbered  m3: pw_8  m4: pw_1991          (m0 is modified as well)
  45. %macro COMPOSE_DD97iH0 3-4
  46.     paddw   m0, %3              ; m0 = m0 + %3: sum of the two terms weighted -1
  47.     paddw   m1, %2              ; m1 = m1 + %2: sum of the two terms weighted 9
  48.     psubw   m0, m3              ; m0 -= m3; once multiplied by -1 below this supplies the positive rounding term
  49.     mova    m2, m1              ; copy m1 so the low and high halves can be unpacked separately
  50.     punpcklwd m1, m0            ; low half: interleave words into (m1[i], m0[i]) pairs
  51.     punpckhwd m2, m0            ; high half: same interleave
  52.     pmaddwd m1, m4              ; per 32-bit lane: 9*m1[i] - m0[i], given m4 holds alternating 9,-1
  53.     pmaddwd m2, m4
  54. %if %0 > 3
  55.     movu    %1, %4              ; optional 4th argument: load %1 from %4 with an unaligned move
  56. %endif
  57.     psrad   m1, 4               ; shift right by 4 while still in 32-bit lanes
  58.     psrad   m2, 4
  59.     packssdw m1, m2             ; narrow both halves back to words with signed saturation
  60.     paddw   m1, %1              ; add the base term %1
  61. %endm
  62.  
  63. %macro COMPOSE_VERTICAL 1
  64. ; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
  65. ;                                  int width)      b1[i] -= (b0[i] + b2[i] + 2) >> 2;  %1 = name suffix of the emitted symbols
  66. cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
  67.     mova    m2, [pw_2]                  ; rounding constant handed to COMPOSE_53iL0
  68. %if ARCH_X86_64
  69.     mov     widthd, widthd              ; 32-bit self-move zero-extends the int width into widthq
  70. %endif
  71. .loop:
  72.     sub     widthq, mmsize/2            ; step back one vector (mmsize/2 words); rows are walked from the end towards index 0
  73.     mova    m1, [b0q+2*widthq]          ; aligned loads; elements are 2 bytes wide (index scaled by 2)
  74.     mova    m0, [b1q+2*widthq]
  75.     COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
  76.     mova    [b1q+2*widthq], m0          ; result is written back into b1
  77.     jg      .loop                       ; tests the flags of the sub above; none of the instructions in between is an integer ALU op
  78.     REP_RET
  79.  
  80. ; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
  81. ;                                  int width)      b1[i] += (b0[i] + b2[i] + 1) >> 1
  82. cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
  83.     mova    m1, [pw_1]                  ; rounding constant
  84. %if ARCH_X86_64
  85.     mov     widthd, widthd              ; zero-extend width to 64 bits
  86. %endif
  87. .loop:
  88.     sub     widthq, mmsize/2            ; step back one vector; sets the flags consumed by jg
  89.     mova    m0, [b0q+2*widthq]
  90.     paddw   m0, [b2q+2*widthq]          ; b0 + b2
  91.     paddw   m0, m1                      ; + 1
  92.     psraw   m0, 1                       ; >> 1
  93.     paddw   m0, [b1q+2*widthq]          ; + b1
  94.     mova    [b1q+2*widthq], m0
  95.     jg      .loop                       ; continue while widthq is still positive after the sub
  96.     REP_RET
  97.  
  98. ; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
  99. ;                               IDWTELEM *b3, IDWTELEM *b4, int width)      b2[i] += (9*(b1[i]+b3[i]) - (b0[i]+b4[i]) + 8) >> 4
  100. cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
  101.     mova    m3, [pw_8]                  ; m3/m4 are the constants COMPOSE_DD97iH0 expects
  102.     mova    m4, [pw_1991]
  103. %if ARCH_X86_64
  104.     mov     widthd, widthd              ; zero-extend width to 64 bits
  105. %endif
  106. .loop:
  107.     sub     widthq, mmsize/2            ; step back one vector; sets the flags consumed by jg
  108.     mova    m0, [b0q+2*widthq]          ; m0/m1 are implicit inputs of the macro
  109.     mova    m1, [b1q+2*widthq]
  110.     COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
  111.     mova    [b2q+2*widthq], m1          ; the macro leaves its result in m1
  112.     jg      .loop
  113.     REP_RET
  114.  
  115. ; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
  116. ;                                IDWTELEM *b3, IDWTELEM *b4, int width)      b2[i] -= (9*(b1[i]+b3[i]) - (b0[i]+b4[i]) + 16) >> 5
  117. cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
  118.     mova    m3, [pw_16]                 ; rounding constant for the >>5 below
  119.     mova    m4, [pw_1991]               ; alternating 9,-1 weights for pmaddwd
  120. %if ARCH_X86_64
  121.     mov     widthd, widthd              ; zero-extend width to 64 bits
  122. %endif
  123. .loop:
  124.     sub     widthq, mmsize/2            ; step back one vector; sets the flags consumed by jg
  125.     mova    m0, [b0q+2*widthq]
  126.     mova    m1, [b1q+2*widthq]
  127.     mova    m5, [b2q+2*widthq]          ; value being updated
  128.     paddw   m0, [b4q+2*widthq]          ; b0 + b4 (weighted -1)
  129.     paddw   m1, [b3q+2*widthq]          ; b1 + b3 (weighted 9)
  130.     psubw   m0, m3                      ; -16 here becomes +16 after the -1 weight
  131.     mova    m2, m1                      ; copy for the high half
  132.     punpcklwd m1, m0                    ; interleave into (m1[i], m0[i]) word pairs, low half
  133.     punpckhwd m2, m0                    ; same, high half
  134.     pmaddwd m1, m4                      ; 9*m1[i] - m0[i] in 32-bit lanes
  135.     pmaddwd m2, m4
  136.     psrad   m1, 5
  137.     psrad   m2, 5
  138.     packssdw m1, m2                     ; back to words with signed saturation
  139.     psubw   m5, m1                      ; b2 - filtered term
  140.     mova    [b2q+2*widthq], m5
  141.     jg      .loop
  142.     REP_RET
  143.  
  144. ; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)      b0[i] -= (b1[i] + 1) >> 1;  b1[i] += b0[i] (updated b0)
  145. cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
  146.     mova    m3, [pw_1]                  ; rounding constant
  147. %if ARCH_X86_64
  148.     mov     widthd, widthd              ; zero-extend width to 64 bits
  149. %endif
  150. .loop:
  151.     sub     widthq, mmsize/2            ; step back one vector; sets the flags consumed by jg
  152.     mova    m1, [b1q+2*widthq]
  153.     mova    m0, [b0q+2*widthq]
  154.     mova    m2, m1                      ; keep the unmodified b1 for the second output
  155.     paddw   m1, m3
  156.     psraw   m1, 1                       ; (b1 + 1) >> 1
  157.     psubw   m0, m1                      ; new b0
  158.     mova    [b0q+2*widthq], m0
  159.     paddw   m2, m0                      ; new b1 = old b1 + new b0
  160.     mova    [b1q+2*widthq], m2
  161.     jg      .loop
  162.     REP_RET
  163. %endmacro
  164.  
  165. ; extend the left and right edges of the tmp array by %1 and %2 respectively; %3 is a scratch register, tmpq and w2q come from the invoking function
  166. %macro EDGE_EXTENSION 3
  167.     mov     %3, [tmpq]                  ; first element of tmp
  168. %assign %%i 1
  169. %rep %1
  170.     mov     [tmpq-2*%%i], %3            ; replicate it into tmp[-1] .. tmp[-%1]
  171.     %assign %%i %%i+1
  172. %endrep
  173.     mov     %3, [tmpq+2*w2q-2]          ; last element, tmp[w2-1]
  174. %assign %%i 0
  175. %rep %2
  176.     mov     [tmpq+2*w2q+2*%%i], %3      ; replicate it into tmp[w2] .. tmp[w2+%2-1]
  177.     %assign %%i %%i+1
  178. %endrep
  179. %endmacro
  180.  
  181.  
  182. %macro HAAR_HORIZONTAL 2            ; %1 = name suffix of the emitted symbol, %2 = 1 to round-and-halve both outputs, 0 to leave them unshifted
  183. ; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
  184. cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
  185.     mov    w2d, wd                      ; 32-bit move: w2q now holds width zero-extended to 64 bits
  186.     xor     xq, xq                      ; x = 0
  187.     lea  b_w2q, [bq+w2q]                ; b_w2 = b + width bytes; uses the zero-extended copy because the upper half of wq is not guaranteed clear for an int argument on x86-64
  188.     shr    w2d, 1                       ; w2 = width / 2
  189.     mova    m3, [pw_1]                  ; rounding constant
  190. .lowpass_loop:                          ; tmp[x] = b[x] - ((b_w2[x] + 1) >> 1)
  191.     movu    m1, [b_w2q + 2*xq]          ; unaligned load from the second half of b
  192.     mova    m0, [bq    + 2*xq]
  193.     paddw   m1, m3
  194.     psraw   m1, 1
  195.     psubw   m0, m1
  196.     mova    [tmpq + 2*xq], m0
  197.     add     xq, mmsize/2                ; advance by one vector of 16-bit elements
  198.     cmp     xq, w2q
  199.     jl      .lowpass_loop
  200.  
  201.     xor     xq, xq                      ; restart at x = 0 for the second pass
  202.     and    w2q, ~(mmsize/2 - 1)         ; round w2 down to whole vectors; leftover elements are not handled by the loop below
  203.     cmp    w2q, mmsize/2
  204.     jl      .end                        ; fewer than one full vector: nothing to do here
  205.  
  206. .highpass_loop:                         ; m0 = tmp[x], m1 = b_w2[x] + tmp[x]; written interleaved to b[2x], b[2x+1]
  207.     movu    m1, [b_w2q + 2*xq]
  208.     mova    m0, [tmpq  + 2*xq]
  209.     paddw   m1, m0
  210.  
  211.     ; shift and interleave
  212. %if %2 == 1
  213.     paddw   m0, m3                      ; (v + 1) >> 1 applied to both outputs
  214.     paddw   m1, m3
  215.     psraw   m0, 1
  216.     psraw   m1, 1
  217. %endif
  218.     mova    m2, m0                      ; copy for the high-half interleave
  219.     punpcklwd m0, m1                    ; low half:  m0[0], m1[0], m0[1], m1[1], ...
  220.     punpckhwd m2, m1                    ; high half of the same interleave
  221.     mova    [bq+4*xq], m0               ; each input vector yields two output vectors, hence the 4*x byte offset
  222.     mova    [bq+4*xq+mmsize], m2
  223.  
  224.     add     xq, mmsize/2
  225.     cmp     xq, w2q
  226.     jl      .highpass_loop
  227. .end:
  228.     REP_RET
  229. %endmacro
  230.  
  231.  
  232. INIT_XMM
  233. ; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
  234. cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
  235.     mov    w2d, wd                      ; 32-bit move: w2q now holds width zero-extended to 64 bits
  236.     xor     xd, xd                      ; x = 0
  237.     lea  b_w2q, [bq+w2q]                ; b_w2 = b + width bytes; uses the zero-extended copy because the upper half of wq is not guaranteed clear for an int argument on x86-64
  238.     shr    w2d, 1                       ; w2 = width / 2
  239.     movu    m4, [b_w2q]                 ; first vector of the second half (same address the lea just formed)
  240.     mova    m7, [pw_2]                  ; rounding constant for COMPOSE_53iL0
  241.     pslldq  m4, 14                      ; move b_w2[0] into the top word: it acts as the left neighbour of the first element
  242. .lowpass_loop:                          ; tmp[x] = b[x] - ((b_w2[x-1] + b_w2[x] + 2) >> 2)
  243.     movu    m1, [b_w2q + 2*xq]
  244.     mova    m0, [bq    + 2*xq]
  245.     mova    m2, m1                      ; m2 = b_w2[x .. x+7]
  246.     palignr m1, m4, 14                  ; m1 = b_w2[x-1 .. x+6]: top word of the previous vector followed by the current one
  247.     mova    m4, m2                      ; remember the current vector for the next iteration
  248.     COMPOSE_53iL0 m0, m1, m2, m7
  249.     mova    [tmpq + 2*xq], m0
  250.     add     xd, mmsize/2
  251.     cmp     xd, w2d
  252.     jl      .lowpass_loop
  253.  
  254.     EDGE_EXTENSION 1, 2, xw             ; replicate tmp[0] one element to the left and tmp[w2-1] two to the right; x is scratch here
  255.     ; leave the last up to 7 (sse) or 3 (mmx) values for C
  256.     xor     xd, xd
  257.     and    w2d, ~(mmsize/2 - 1)         ; round w2 down to whole vectors
  258.     cmp    w2d, mmsize/2
  259.     jl      .end
  260.  
  261.     mova    m7, [tmpq-mmsize]           ; vector preceding tmp; only its top word (tmp[-1]) is consumed, via the palignr below
  262.     mova    m0, [tmpq]
  263.     mova    m5, [pw_1]                  ; rounding constant for the final >>1
  264.     mova    m3, [pw_8]                  ; m3/m4: constants COMPOSE_DD97iH0 expects
  265.     mova    m4, [pw_1991]
  266. .highpass_loop:                         ; on entry: m0 = tmp[x .. x+7], m7 = the vector before it
  267.     mova    m6, m0                      ; m6 = tmp[x .. x+7]
  268.     palignr m0, m7, 14                  ; m0 = tmp[x-1 .. x+6]
  269.     mova    m7, [tmpq + 2*xq + 16]      ; next vector, tmp[x+8 .. x+15]
  270.     mova    m1, m7
  271.     mova    m2, m7
  272.     palignr m1, m6, 2                   ; m1 = tmp[x+1 .. x+8]
  273.     palignr m2, m6, 4                   ; m2 = tmp[x+2 .. x+9]
  274.     COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]  ; m1 = b_w2[x] + (9*(tmp[x]+tmp[x+1]) - (tmp[x-1]+tmp[x+2]) + 8) >> 4
  275.     mova    m0, m7                      ; rotate the window: the next vector becomes current,
  276.     mova    m7, m6                      ; and the current one becomes previous (copied before m6 is modified below)
  277.  
  278.     ; shift and interleave
  279.     paddw   m6, m5                      ; (v + 1) >> 1 applied to both outputs
  280.     paddw   m1, m5
  281.     psraw   m6, 1
  282.     psraw   m1, 1
  283.     mova    m2, m6                      ; copy for the high-half interleave
  284.     punpcklwd m6, m1                    ; low half:  m6[0], m1[0], m6[1], m1[1], ...
  285.     punpckhwd m2, m1                    ; high half of the same interleave
  286.     mova    [bq+4*xq], m6               ; each input vector yields two output vectors, hence the 4*x byte offset
  287.     mova    [bq+4*xq+mmsize], m2
  288.  
  289.     add     xd, mmsize/2
  290.     cmp     xd, w2d
  291.     jl      .highpass_loop
  292. .end:
  293.     REP_RET
  294.  
  295.  
  296. %if ARCH_X86_64 == 0
  297. INIT_MMX                            ; the MMX variants are emitted only when ARCH_X86_64 is 0
  298. COMPOSE_VERTICAL mmx
  299. HAAR_HORIZONTAL mmx, 0              ; second argument 0: outputs left unshifted
  300. HAAR_HORIZONTAL mmx, 1              ; second argument 1: outputs rounded and halved
  301. %endif
  302.  
  303. ;;INIT_XMM
  304. INIT_XMM                            ; the SSE2 variants are emitted unconditionally
  305. COMPOSE_VERTICAL sse2
  306. HAAR_HORIZONTAL sse2, 0
  307. HAAR_HORIZONTAL sse2, 1
  308.