;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

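; SIMD (MMX/SSE2) kernels for the Dirac decoder: half-pel interpolation
; filters, rectangle put/add helpers, and OBMC accumulation, built on the
; x264asm macro layer pulled in below.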
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_3: times 8 dw 3
pw_7: times 8 dw 7
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pb_128: times 16 db 128

section .text

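; UNPACK_ADD dstlo, dsthi, src1, src2, mov1, mov2
; Load two vectors of packed bytes (aligned or unaligned per the a/u
; arguments), zero-extend them to words against m7 (which must be zero),
; and add them pairwise: %1 receives the sums of the low halves, %2 the
; sums of the high halves. Clobbers m4 and m5.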
%macro UNPACK_ADD 6
    mov%5   %1, %3
    mov%6   m5, %4
    mova    m4, %1
    mova    %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw   %1, m5
    paddw   %2, m4
%endmacro

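; The half-pel filters below evaluate Dirac's 8-tap kernel
; (-1, 3, -7, 21, 21, -7, 3, -1)/32 using only the pw_7 and pw_3
; constants, by factoring 21 = 3*7:
;   t   = 3*(7*(s[0]+s[1]) + (s[-2]+s[3])) - 7*(s[-1]+s[2]) - (s[-3]+s[4])
;   dst = clip_uint8((t + 16) >> 5)    ; packuswb provides the clip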
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov     src0q, srcq
    lea     stridex3q, [3*strideq]
    sub     src0q, stridex3q
    pxor    m7, m7
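    ; src0q = srcq - 3*strideq, the topmost of the eight rows the kernel
    ; reads; m7 stays zero for the byte->word unpacks in UNPACK_ADD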
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq], m0
    add     dstq, mmsize
    add     srcq, mmsize
    add     src0q, mmsize
    sub     widthd, mmsize
    jg      .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec     widthd
    pxor    m7, m7
    and     widthd, ~(mmsize-1)
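    ; walk the row back to front in mmsize-pixel blocks; widthq now holds
    ; the offset of the last full block and counts down to zero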
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq + widthq], m0
    sub     widthd, mmsize
    jge     .loop
    RET
%endmacro

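; PUT_RECT: store a rectangle of signed 16-bit coefficients as unsigned
; bytes. packsswb clamps each word to [-128,127] and adding pb_128 then
; maps the result into [0,255]. Two rows are written per pass of .loopy.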
%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova    m0, [pb_128]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

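; keep the loop-invariant width and the height counter in spare GPRs on
; x86_64; on x86_32, reuse the caller's stack argument slots instead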
%if ARCH_X86_64
    movsxd   dst_strideq, dst_strided
    movsxd   src_strideq, src_strided
    mov   r7d, r5m
    mov   r8d, wd
    %define wspill r8d
    %define hd r7d
%else
    mov    r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    lea     src2q, [srcq+src_strideq*2]
    lea     dst2q, [dstq+dst_strideq]
.loopx:
    sub      wd, mmsize
    mova     m1, [srcq +2*wq]
    mova     m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb    m1, m0
    paddb    m2, m0
    mova    [dstq +wq], m1
    mova    [dst2q+wq], m2
    jg      .loopx

    lea   srcq, [srcq+src_strideq*4]
    lea   dstq, [dstq+dst_strideq*2]
    sub     hd, 2
    mov     wd, wspill
    jg      .loopy
    RET
%endmacro

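; ADD_RECT: add the rounded 16-bit prediction ((src + 32) >> 6) to the
; IDWT coefficients and pack the sums to bytes with unsigned saturation,
; one row per pass over .loop.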
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova    m0, [pw_32]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   strideq, strided
    movsxd   idwt_strideq, idwt_strided
    mov   r8d, wd
    %define wspill r8d
%else
    mov    r5m, wd
    %define wspill r5m
%endif

.loop:
    sub     wd, mmsize
    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw   m1, m0
    psraw   m1, 6
    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw   m2, m0
    psraw   m2, 6
    paddw   m1, [idwtq+2*wq]
    paddw   m2, [idwtq+2*wq+mmsize]
    packuswb m1, m2
    mova    [dstq +wq], m1
    jg      .loop

    lea   srcq, [srcq + 2*strideq]
    add   dstq, strideq
    lea  idwtq, [idwtq+ 2*idwt_strideq]
    sub     hd, 1
    mov     wd, wspill
    jg      .loop
    RET
%endmacro

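; ADD_OBMC: multiply each source pixel by its 8-bit OBMC weight and
; accumulate into the 16-bit dst plane. %1 is the block width in pixels,
; fully unrolled by the %rep below; the weight rows are packed 32 bytes
; apart (add obmcq, 32).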
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor        m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova        m0, [srcq+i]
    mova        m1, m0
    punpcklbw   m0, m4
    punpckhbw   m1, m4
    mova        m2, [obmcq+i]
    mova        m3, m2
    punpcklbw   m2, m4
    punpckhbw   m3, m4
    pmullw      m0, m2
    pmullw      m1, m3
    movu        m2, [dstq+2*i]
    movu        m3, [dstq+2*i+mmsize]
    paddw       m0, m2
    paddw       m1, m3
    movu        [dstq+2*i], m0
    movu        [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea         srcq, [srcq+strideq]
    lea         dstq, [dstq+2*strideq]
    add         obmcq, 32
    sub         yblend, 1
    jg          .loop
    RET
%endmacro

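; Instantiate the kernels: MMX versions are built only for x86_32, except
; the 8-pixel-wide OBMC, which has no SSE2 counterpart (8/mmsize rounds to
; zero repetitions with 16-byte vectors); SSE2 versions are built everywhere.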
INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx

HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2