;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_7: times 8 dw 7

cextern pw_3
cextern pw_16
cextern pw_32
cextern pb_80

section .text

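; UNPACK_ADD lo, hi, src1, src2, mov1, mov2
; Loads two vectors of packed bytes (%5/%6 select aligned 'a' or unaligned 'u'
; moves), zero-extends them to words against m7 (assumed to hold zero) and
; adds them pairwise: the low-half sums end up in %1, the high-half sums in %2.
; Clobbers m4 and m5.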
%macro UNPACK_ADD 6
    mov%5   %1, %3
    mov%6   m5, %4
    mova    m4, %1
    mova    %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw   %1, m5
    paddw   %2, m4
%endmacro

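; HPEL_FILTER: half-pel interpolation.  Each output pixel applies the 8-tap
; filter (-1, 3, -7, 21, 21, -7, 3, -1) to src[-3..4] (rows for the vertical
; variant, columns for the horizontal one), adds 16 for rounding, shifts right
; by 5 and saturates to an unsigned byte.  m7 is kept at zero for the
; byte-to-word unpacking done by UNPACK_ADD.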
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov     src0q, srcq
    lea     stridex3q, [3*strideq]
    sub     src0q, stridex3q
    pxor    m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq], m0
    add     dstq, mmsize
    add     srcq, mmsize
    add     src0q, mmsize
    sub     widthd, mmsize
    jg      .loop
    RET

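; The horizontal variant walks the row backwards in mmsize-pixel blocks,
; starting at offset (width-1) & ~(mmsize-1); its loads are unaligned because
; the taps span src[-3]..src[4] around each position.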
; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec     widthd
    pxor    m7, m7
    and     widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq + widthq], m0
    sub     widthd, mmsize
    jge     .loop
    RET
%endmacro

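; PUT_RECT (put_signed_rect_clamped): packs the 16-bit coefficients to bytes
; with signed saturation, then adds 0x80 to each byte, i.e. dst = clip(src,
; -128, 127) + 128.  Two output rows are written per iteration of the outer
; loop, each walked backwards in mmsize-byte blocks; the width is first
; rounded up to a multiple of mmsize.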
%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova    m0, [pb_80]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   dst_strideq, dst_strided
    movsxd   src_strideq, src_strided
    mov   r7d, r5m
    mov   r8d, wd
    %define wspill r8d
    %define hd r7d
%else
    mov    r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    lea     src2q, [srcq+src_strideq*2]
    lea     dst2q, [dstq+dst_strideq]
.loopx:
    sub      wd, mmsize
    mova     m1, [srcq +2*wq]
    mova     m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb    m1, m0
    paddb    m2, m0
    mova    [dstq +wq], m1
    mova    [dst2q+wq], m2
    jg      .loopx

    lea   srcq, [srcq+src_strideq*4]
    lea   dstq, [dstq+dst_strideq*2]
    sub     hd, 2
    mov     wd, wspill
    jg      .loopy
    RET
%endm

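; ADD_RECT (add_rect_clamped): dst = clip_uint8(((src + 32) >> 6) + idwt), one
; row per iteration, again walking each row backwards in mmsize-byte blocks
; with the width rounded up to a multiple of mmsize.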
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova    m0, [pw_32]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   strideq, strided
    movsxd   idwt_strideq, idwt_strided
    mov   r8d, wd
    %define wspill r8d
%else
    mov    r5m, wd
    %define wspill r5m
%endif

.loop:
    sub     wd, mmsize
    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw   m1, m0
    psraw   m1, 6
    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw   m2, m0
    psraw   m2, 6
    paddw   m1, [idwtq+2*wq]
    paddw   m2, [idwtq+2*wq+mmsize]
    packuswb m1, m2
    mova    [dstq +wq], m1
    jg      .loop

    lea   srcq, [srcq + 2*strideq]
    add   dstq, strideq
    lea  idwtq, [idwtq+ 2*idwt_strideq]
    sub     hd, 1
    mov     wd, wspill
    jg      .loop
    RET
%endm

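; ADD_OBMC (add_dirac_obmc): for each of yblen rows, widens the source pixels
; and their OBMC weights to words and accumulates src*weight into the 16-bit
; destination.  %1 is the block width in pixels; the weight rows are 32 bytes
; apart regardless of that width.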
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor        m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova        m0, [srcq+i]
    mova        m1, m0
    punpcklbw   m0, m4
    punpckhbw   m1, m4
    mova        m2, [obmcq+i]
    mova        m3, m2
    punpcklbw   m2, m4
    punpckhbw   m3, m4
    pmullw      m0, m2
    pmullw      m1, m3
    movu        m2, [dstq+2*i]
    movu        m3, [dstq+2*i+mmsize]
    paddw       m0, m2
    paddw       m1, m3
    movu        [dstq+2*i], m0
    movu        [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea         srcq, [srcq+strideq]
    lea         dstq, [dstq+2*strideq]
    add         obmcq, 32
    sub         yblend, 1
    jg          .loop
    RET
%endm

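; Instantiation: the MMX variants (apart from the 8-pixel OBMC block) are
; assembled only for 32-bit builds, since x86-64 always has SSE2; the SSE2
; variants are assembled unconditionally.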
INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx

HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2