Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* MMX-optimized H.263 loop filter
  3. ;* Copyright (c) 2003-2013 Michael Niedermayer
  4. ;* Copyright (c) 2013 Daniel Kang
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22.  
  23. %include "libavutil/x86/x86util.asm"
  24.  
  25. SECTION_RODATA
  26. cextern pb_FC
  27. cextern h263_loop_filter_strength
  28.  
  29. SECTION_TEXT
  30.  
  31. %macro H263_LOOP_FILTER 5 ; in: %1..%4 = addresses of four pixel rows (p0,p1,p2,p3 below), %5 = dword operand whose low byte is the limit S. out: m3 = new p1, m4 = new p2, m5 = new p0, m6 = new p3. Clobbers m0-m7; writes no memory.
  32.     pxor         m7, m7           ; m7 = 0, used to widen bytes to words
  33.     mova         m0, [%1]         ; p0 (loaded twice: low and high half)
  34.     mova         m1, [%1]
  35.     mova         m2, [%4]         ; p3
  36.     mova         m3, [%4]
  37.     punpcklbw    m0, m7           ; m0 = low half of p0 as words
  38.     punpckhbw    m1, m7           ; m1 = high half of p0 as words
  39.     punpcklbw    m2, m7           ; m2 = low half of p3 as words
  40.     punpckhbw    m3, m7           ; m3 = high half of p3 as words
  41.     psubw        m0, m2           ; m0/m1 = p0 - p3 (kept until the packsswb further down)
  42.     psubw        m1, m3
  43.     mova         m2, [%2]         ; p1
  44.     mova         m3, [%2]
  45.     mova         m4, [%3]         ; p2
  46.     mova         m5, [%3]
  47.     punpcklbw    m2, m7
  48.     punpckhbw    m3, m7
  49.     punpcklbw    m4, m7
  50.     punpckhbw    m5, m7
  51.     psubw        m4, m2           ; m4/m5 = p2 - p1
  52.     psubw        m5, m3
  53.     psllw        m4, 2            ; * 4
  54.     psllw        m5, 2
  55.     paddw        m4, m0           ; m4/m5 = d = 4*(p2 - p1) + (p0 - p3)
  56.     paddw        m5, m1
  57.     pxor         m6, m6
  58.     pcmpgtw      m6, m4           ; m6 = 0xFFFF in every word where d < 0
  59.     pcmpgtw      m7, m5           ; same for the high half (m7 is still zero here)
  60.     pxor         m4, m6           ; (d ^ mask) - mask = |d|
  61.     pxor         m5, m7
  62.     psubw        m4, m6
  63.     psubw        m5, m7
  64.     psrlw        m4, 3            ; |d| >> 3
  65.     psrlw        m5, 3
  66.     packuswb     m4, m5           ; m4 = a = (|d| >> 3) as eight unsigned bytes
  67.     packsswb     m6, m7           ; m6 = sign of d as a per-byte mask (0x00 / 0xFF)
  68.     pxor         m7, m7           ; m7 = 0 again, reused for the second sign mask below
  69.     movd         m2, %5
  70.     punpcklbw    m2, m2           ; three self-interleaves replicate the low byte of %5
  71.     punpcklbw    m2, m2           ; into all eight bytes:
  72.     punpcklbw    m2, m2           ; m2 = S
  73.     psubusb      m2, m4           ; m2 = max(S - a, 0)
  74.     mova         m3, m2
  75.     psubusb      m3, m4           ; m3 = max(m2 - a, 0)
  76.     psubb        m2, m3           ; m2 = c: a if 2a <= S, S - a if S/2 < a <= S, else 0
  77.     mova         m3, [%2]         ; reload p1 and p2 as bytes
  78.     mova         m4, [%3]
  79.     pxor         m3, m6           ; complementing before and after the saturating op
  80.     pxor         m4, m6           ; swaps add and subtract in the lanes where d < 0
  81.     paddusb      m3, m2
  82.     psubusb      m4, m2
  83.     pxor         m3, m6           ; m3 = p1 + c (p1 - c where d < 0), clamped to 0..255
  84.     pxor         m4, m6           ; m4 = p2 - c (p2 + c where d < 0), clamped to 0..255
  85.     paddusb      m2, m2           ; m2 = 2*c (saturating), bound for the p0/p3 correction
  86.     packsswb     m0, m1           ; m0 = p0 - p3 saturated to signed bytes
  87.     pcmpgtb      m7, m0           ; m7 = per-byte sign mask of (p0 - p3)
  88.     pxor         m0, m7           ; m0 = |p0 - p3|
  89.     psubb        m0, m7
  90.     mova         m1, m0
  91.     psubusb      m0, m2           ; m0 = max(|p0 - p3| - 2c, 0)
  92.     psubb        m1, m0           ; m1 = min(|p0 - p3|, 2c)
  93.     pand         m1, [pb_FC]      ; pb_FC is external; presumably 0xFC in every byte so the word shift below cannot leak bits between bytes - TODO confirm
  94.     psrlw        m1, 2            ; >> 2
  95.     pxor         m1, m7           ; reapply the sign of (p0 - p3): m1 = e
  96.     psubb        m1, m7
  97.     mova         m5, [%1]
  98.     mova         m6, [%4]
  99.     psubb        m5, m1           ; m5 = p0 - e
  100.     paddb        m6, m1           ; m6 = p3 + e
  101. %endmacro
  102.  
  103. INIT_MMX mmx
  104. ; void h263_v_loop_filter(uint8_t *src, int stride, int qscale): filters the rows at src-2*stride, src-stride, src and src+stride in place (one m register per row)
  105. cglobal h263_v_loop_filter, 3,5   ; x86inc prologue (defined outside this file); presumably 3 args in r0..r2 and 5 GPRs r0..r4 - confirm against x86inc
  106.     movsxdifnidn r1, r1d          ; x86inc helper; presumably widens the int stride to pointer width where needed
  107.     movsxdifnidn r2, r2d          ; same for qscale, which is used as a table index below
  108.  
  109.     lea          r4, [h263_loop_filter_strength] ; base of the external strength table
  110.     movzx       r3d, BYTE [r4+r2] ; r3 = table[qscale], one byte per entry
  111.     movsx        r2, r3b
  112.     shl          r2, 1            ; r2 = 2 * strength, the limit handed to the macro
  113.  
  114.     mov          r3, r0
  115.     sub          r3, r1           ; r3 = src - stride
  116.     mov          r4, r3
  117.     sub          r4, r1           ; r4 = src - 2*stride
  118.     H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d ; rows passed in increasing address order (for positive stride)
  119.  
  120.     mova       [r3], m3           ; write back src - stride
  121.     mova       [r0], m4           ; write back src
  122.     mova       [r4], m5           ; write back src - 2*stride
  123.     mova    [r0+r1], m6           ; write back src + stride
  124.     RET
  125.  
  126. %macro TRANSPOSE4X4 2 ; transposes a 4x4 byte block: %1 = source address (rows at %1, %1+r1, %1+r1*2, %1+r3), %2 = destination address (column k stored as 4 bytes at %2 + 8*k). Relies on the caller's r1 and r3; clobbers m0-m3.
  127.     movd      m0, [%1]            ; row 0, 4 bytes
  128.     movd      m1, [%1+r1]         ; row 1
  129.     movd      m2, [%1+r1*2]       ; row 2
  130.     movd      m3, [%1+r3]         ; row 3, given r3 = 3*r1 as h263_h_loop_filter sets it
  131.     punpcklbw m0, m1              ; m0 = bytes of rows 0 and 1 interleaved
  132.     punpcklbw m2, m3              ; m2 = bytes of rows 2 and 3 interleaved
  133.     mova      m1, m0
  134.     punpcklwd m0, m2              ; m0 = column 0 (low dword), column 1 (high dword)
  135.     punpckhwd m1, m2              ; m1 = column 2 (low dword), column 3 (high dword)
  136.     movd [%2+ 0], m0              ; column 0
  137.     punpckhdq m0, m0              ; bring the high dword down
  138.     movd [%2+ 8], m0              ; column 1
  139.     movd [%2+16], m1              ; column 2
  140.     punpckhdq m1, m1
  141.     movd [%2+24], m1              ; column 3
  142. %endmacro
  143.  
  144.  
  145. ; void h263_h_loop_filter(uint8_t *src, int stride, int qscale): filters the four bytes src-2 .. src+1 of eight consecutive rows in place, by transposing them into scratch rows on the stack, running the same filter macro, and transposing back
  146. INIT_MMX mmx
  147. cglobal h263_h_loop_filter, 3,5,0,32 ; x86inc prologue (defined outside this file); the trailing 32 is presumably the stack size in bytes, used below as four 8-byte scratch rows at rsp - confirm against x86inc
  148.     movsxdifnidn r1, r1d          ; x86inc helper; presumably widens the int stride to pointer width where needed
  149.     movsxdifnidn r2, r2d          ; same for qscale, which is used as a table index below
  150.  
  151.     lea          r4, [h263_loop_filter_strength] ; base of the external strength table
  152.     movzx       r3d, BYTE [r4+r2] ; r3 = table[qscale], one byte per entry
  153.     movsx        r2, r3b
  154.     shl          r2, 1            ; r2 = 2 * strength, the limit handed to the macro
  155.  
  156.     sub          r0, 2            ; r0 = src - 2: first of the four bytes touched in each row
  157.     lea          r3, [r1*3]       ; r3 = 3*stride, used by TRANSPOSE4X4 and by the final stores
  158.  
  159.     TRANSPOSE4X4 r0, rsp          ; rows 0-3 fill bytes 0-3 of each scratch row
  160.     lea          r4, [r0+r1*4]    ; r4 = address of row 4
  161.     TRANSPOSE4X4 r4, rsp+4        ; rows 4-7 fill bytes 4-7 of each scratch row
  162.  
  163.     H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d ; scratch row k holds byte k of all eight image rows; results come back as m5, m3, m4, m6 for k = 0..3
  164.  
  165.     mova         m1, m5           ; keep copies for the high halves (rows 4-7)
  166.     mova         m0, m4
  167.     punpcklbw    m5, m3           ; m5 = (byte 0, byte 1) pairs for rows 0-3
  168.     punpcklbw    m4, m6           ; m4 = (byte 2, byte 3) pairs for rows 0-3
  169.     punpckhbw    m1, m3           ; m1 = (byte 0, byte 1) pairs for rows 4-7
  170.     punpckhbw    m0, m6           ; m0 = (byte 2, byte 3) pairs for rows 4-7
  171.     mova         m3, m5
  172.     mova         m6, m1
  173.     punpcklwd    m5, m4           ; m5 = row 0 (low dword), row 1 (high dword)
  174.     punpcklwd    m1, m0           ; m1 = row 4, row 5
  175.     punpckhwd    m3, m4           ; m3 = row 2, row 3
  176.     punpckhwd    m6, m0           ; m6 = row 6, row 7
  177.     movd       [r0], m5           ; row 0
  178.     punpckhdq    m5, m5           ; bring the high dword down
  179.     movd  [r0+r1*1], m5           ; row 1
  180.     movd  [r0+r1*2], m3           ; row 2
  181.     punpckhdq    m3, m3
  182.     movd    [r0+r3], m3           ; row 3
  183.     movd       [r4], m1           ; row 4
  184.     punpckhdq    m1, m1
  185.     movd  [r4+r1*1], m1           ; row 5
  186.     movd  [r4+r1*2], m6           ; row 6
  187.     punpckhdq    m6, m6
  188.     movd    [r4+r3], m6           ; row 7
  189.     RET
  190.