Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* SIMD-optimized JPEG2000 DSP functions
  3. ;* Copyright (c) 2014 Nicolas Bertrand
  4. ;* Copyright (c) 2015 James Almer
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22.  
  23. %include "libavutil/x86/x86util.asm"
  24.  
  25. SECTION_RODATA 32
  26.  
      ; Multipliers used by the ict_float kernel below. Each one is replicated
      ; into 8 dwords (32 bytes) so that one full-width load fills the vector
      ; register in both the XMM and the YMM build of the macro.
      ; NOTE(review): the "32" argument presumably requests 32-byte alignment
      ; (SECTION_RODATA is defined in the included headers) -- the movaps loads
      ; of these tables depend on that; confirm.
  27. pf_ict0: times 8 dd 1.402       ; ICT0: scales src2 in the value written to src0
  28. pf_ict1: times 8 dd 0.34413     ; ICT1: scales src1 in the value written to src1
  29. pf_ict2: times 8 dd 0.71414     ; ICT2: scales src2 in the value written to src1
  30. pf_ict3: times 8 dd 1.772       ; ICT3: scales src1 in the value written to src2
  31.  
  32. SECTION .text
  33.  
  34. ;***********************************************************************
  35. ; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
      ;
      ; In-place transform of three float planes. For every index i, with
      ; a = src0[i], b = src1[i], c = src2[i] read before any store:
      ;     src0[i] = a + 1.402   * c
      ;     src1[i] = a - 0.34413 * b - 0.71414 * c
      ;     src2[i] = a + 1.772   * b
      ;
      ; Macro argument %1 is forwarded to cglobal as the vector-register count.
      ;
      ; Caller contract visible from the code:
      ;   * All accesses use movaps, so src0/src1/src2 must be aligned to the
      ;     vector width (mmsize) of the instantiated version.
      ;   * The loop is bottom-tested and steps by mmsize bytes: it always
      ;     runs at least once (even for csize == 0) and always touches whole
      ;     vectors, so csize is expected to be a non-zero multiple of the
      ;     vector length -- NOTE(review): confirm callers guarantee this or
      ;     pad their buffers.
  36. ;***********************************************************************
  37. %macro ICT_FLOAT 1
      ; NOTE(review): cglobal comes from the included x86 headers; by the usual
      ; x86inc convention the numeric arguments are (#args, #GPRs, #vector regs)
      ; followed by the argument names -- confirm against x86inc.asm.
  38. cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
  39.     shl  csized, 2                  ; element count -> byte count (4-byte floats)
  40.     add   src0q, csizeq             ; move all three pointers to the end of
  41.     add   src1q, csizeq             ; their planes ...
  42.     add   src2q, csizeq
  43.     neg  csizeq                     ; ... and index with a negative offset that counts up to 0
  44.     movaps   m6, [pf_ict0]          ; keep ICT0/ICT1 in registers for the whole loop
  45.     movaps   m7, [pf_ict1]
  46.     %define ICT0 m6
  47.     %define ICT1 m7
  48.  
          ; ICT2/ICT3 placement depends on how many vector registers exist:
          ;   x86-64: both live in registers (m8, plus m3 for AVX or m9 for SSE).
          ;   x86-32: ICT2 is used as a memory operand; ICT3 is in m3 for AVX
          ;           and a memory operand for SSE.
          ; m3 is free in the AVX build because the three-operand multiplies
          ; below need no scratch copy of src1; the SSE build uses m3 for that
          ; copy.
  49. %if ARCH_X86_64
  50.     movaps   m8, [pf_ict2]
  51.     %define ICT2 m8
  52. %if cpuflag(avx)
  53.     movaps   m3, [pf_ict3]
  54.     %define ICT3 m3
  55. %else
  56.     movaps   m9, [pf_ict3]
  57.     %define ICT3 m9
  58. %endif
  59.  
  60. %else ; ARCH_X86_32
  61.     %define ICT2 [pf_ict2]
  62. %if cpuflag(avx)
  63.     movaps   m3, [pf_ict3]
  64.     %define ICT3 m3
  65. %else
  66.     %define ICT3 [pf_ict3]
  67. %endif
  68.  
  69. %endif ; ARCH
  70.  
  71. align 16
  72. .loop:
  73.     movaps   m0, [src0q+csizeq]     ; a = src0[i..]
  74.     movaps   m1, [src1q+csizeq]     ; b = src1[i..]
  75.     movaps   m2, [src2q+csizeq]     ; c = src2[i..]
  76.  
  77. %if cpuflag(avx)
  78.     mulps    m5, m1, ICT1           ; m5 = b * ICT1
  79.     mulps    m4, m2, ICT0           ; m4 = c * ICT0
  80.     mulps    m1, m1, ICT3           ; m1 = b * ICT3
  81.     mulps    m2, m2, ICT2           ; m2 = c * ICT2
  82.     subps    m5, m0, m5             ; m5 = a - b * ICT1
  83. %else ; sse
  84.     movaps   m3, m1                 ; two-operand forms overwrite their destination,
  85.     movaps   m4, m2                 ; so work on copies of b, c and a
  86.     movaps   m5, m0
  87.     mulps    m3, ICT1               ; m3 = b * ICT1
  88.     mulps    m4, ICT0               ; m4 = c * ICT0
  89.     mulps    m1, ICT3               ; m1 = b * ICT3
  90.     mulps    m2, ICT2               ; m2 = c * ICT2
  91.     subps    m5, m3                 ; m5 = a - b * ICT1
  92. %endif
          ; Shared tail. NOTE(review): the three-operand spelling is also used
          ; for the SSE build; presumably the included x86 headers lower it to
          ; the two-operand encoding (dst == src1 here) -- confirm.
  93.     addps    m4, m4, m0             ; m4 = a + c * ICT0
  94.     addps    m0, m0, m1             ; m0 = a + b * ICT3
  95.     subps    m5, m5, m2             ; m5 = a - b * ICT1 - c * ICT2
  96.  
  97.     movaps   [src0q+csizeq], m4     ; new src0
  98.     movaps   [src2q+csizeq], m0     ; new src2
  99.     movaps   [src1q+csizeq], m5     ; new src1
  100.     add  csizeq, mmsize            ; advance one vector; sets the flags tested by jl
  101.     jl .loop                       ; repeat while the offset is still negative
  102.     REP_RET                        ; return macro supplied by the included x86 headers
  103. %endmacro
  104.  
       ; Instantiations. The argument is the vector-register count handed to
       ; cglobal: the SSE build touches m0-m9 on x86-64, the AVX build m0-m8.
  105. INIT_XMM sse
  106. ICT_FLOAT 10
  107. INIT_YMM avx
  108. ICT_FLOAT 9
  109.  
  110. ;***************************************************************************
  111. ; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
       ;
       ; In-place integer transform of three int32 planes. For every index i,
       ; with a = src0[i], b = src1[i], c = src2[i] read before any store:
       ;     t       = a - ((b + c) >> 2)      ; arithmetic (sign-preserving) shift
       ;     src0[i] = t + c
       ;     src1[i] = t
       ;     src2[i] = t + b
       ;
       ; Caller contract visible from the code:
       ;   * Loads and stores use mova. NOTE(review): presumably the aligned
       ;     move of the active instruction set, which would require pointers
       ;     aligned to mmsize -- mova is defined in the included headers;
       ;     confirm.
       ;   * The loop is bottom-tested and steps by mmsize bytes: it always
       ;     runs at least once (even for csize == 0) and always touches whole
       ;     vectors -- NOTE(review): confirm callers pass a non-zero multiple
       ;     of the vector length or pad their buffers.
  112. ;***************************************************************************
  113. %macro RCT_INT 0
       ; NOTE(review): cglobal comes from the included x86 headers; by the usual
       ; x86inc convention the numeric arguments are (#args, #GPRs, #vector regs)
       ; -- four vector registers (m0-m3) are used below.
  114. cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
  115.     shl  csized, 2                 ; element count -> byte count (4-byte int32)
  116.     add   src0q, csizeq            ; move all three pointers to the end of
  117.     add   src1q, csizeq            ; their planes ...
  118.     add   src2q, csizeq
  119.     neg  csizeq                    ; ... and index with a negative offset that counts up to 0
  120.  
  121. align 16
  122. .loop:
  123.     mova   m1, [src1q+csizeq]      ; b = src1[i..]
  124.     mova   m2, [src2q+csizeq]      ; c = src2[i..]
  125.     mova   m0, [src0q+csizeq]      ; a = src0[i..]
  126.     paddd  m3, m1, m2              ; m3 = b + c
  127.     psrad  m3, 2                   ; m3 = (b + c) >> 2, arithmetic shift
  128.     psubd  m0, m3                  ; m0 = t = a - ((b + c) >> 2)
  129.     paddd  m1, m0                  ; m1 = t + b
  130.     paddd  m2, m0                  ; m2 = t + c
  131.     mova   [src1q+csizeq], m0      ; new src1 = t
  132.     mova   [src2q+csizeq], m1      ; new src2 = t + b
  133.     mova   [src0q+csizeq], m2      ; new src0 = t + c
  134.     add  csizeq, mmsize            ; advance one vector; sets the flags tested by jl
  135.     jl .loop                       ; repeat while the offset is still negative
  136.     REP_RET                        ; return macro supplied by the included x86 headers
  137. %endmacro
  138.  
       ; Instantiations: an SSE2 (XMM) version is always assembled; the AVX2
       ; (YMM) version only when the build defines HAVE_AVX2_EXTERNAL as non-zero
       ; (that symbol is set outside this file).
  139. INIT_XMM sse2
  140. RCT_INT
  141. %if HAVE_AVX2_EXTERNAL
  142. INIT_YMM avx2
  143. RCT_INT
  144. %endif
  145.