;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
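; Returns the dot product of v1 and v2 in eax and, in the same pass, updates v1
; in place. A scalar C sketch of the equivalent behaviour (the packed ops below
; keep 16-bit wraparound on the madd and 32-bit accumulation on the sum):
;     int32_t sum = 0;
;     for (i = 0; i < order; i++) {
;         sum   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];
;     }
;     return sum;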
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
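    ; load mul and broadcast it into every 16-bit lane of m7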
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
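    ; m6 accumulates the dot product as packed dwords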
    pxor    m6, m6
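    ; point the three vectors at their ends and walk orderq up from minus the
    ; byte length to zero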
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
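    ; each iteration handles 2*mmsize bytes per vector; v1 uses aligned loads,
    ; v2/v3 are loaded with movu and may be unaligned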
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
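    ; dot product: pmaddwd multiplies signed words and sums adjacent pairs into
    ; dwords; madd: pmullw keeps the low 16 bits of mul*v3, added to the old v1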
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
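    ; horizontally add the dword partial sums in m6 and return the result in eax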
    HADDD   m6, m0
    movd   eax, m6
    RET
%endmacro

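; instantiate the generic loop for MMXEXT (8-byte vectors) and SSE2 (16-byte vectors)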
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

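; SCALARPRODUCT_LOOP %1
; Loop body for the SSSE3 version below, specialized for a v2/v3 misalignment of
; %1 bytes: both pointers have already been rounded down to 16-byte boundaries,
; so every load is aligned and palignr reassembles the original unaligned data.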
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
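    ; when misaligned (%1 != 0), m4/m5 carry the aligned blocks loaded by the
    ; previous iteration (or the preload), so palignr can splice across blocks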
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
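    ; t0/t1 are this iteration's v1 blocks; on x86_64 they are copied into the
    ; spare registers m8/m9 because each is needed by both pmaddwd and paddw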
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
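    ; r4d = v2's offset within its 16-byte block; round v2/v3 down so the loop
    ; uses only aligned loads (v3 is assumed to share v2's misalignment)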
    mov    r4d, v2d
    and    r4d, 15
    and    v2q, ~15
    and    v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
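    ; the two loads above seed m4/m5 with the aligned blocks covering the tails
    ; of v2/v3, which the misaligned loop variants need for their first palignr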
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
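    ; only even offsets are dispatched (the data is int16); the one remaining
    ; case, 14, falls through to the first loop variant emitted below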
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0
    movd   eax, m6
    RET
  158.