Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ; /*
  2. ; * SIMD optimized idct functions for HEVC decoding
  3. ; * Copyright (c) 2014 Pierre-Edouard LEPERE
  4. ; * Copyright (c) 2014 James Almer
  5. ; *
  6. ; * This file is part of FFmpeg.
  7. ; *
  8. ; * FFmpeg is free software; you can redistribute it and/or
  9. ; * modify it under the terms of the GNU Lesser General Public
  10. ; * License as published by the Free Software Foundation; either
  11. ; * version 2.1 of the License, or (at your option) any later version.
  12. ; *
  13. ; * FFmpeg is distributed in the hope that it will be useful,
  14. ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16. ; * Lesser General Public License for more details.
  17. ; *
  18. ; * You should have received a copy of the GNU Lesser General Public
  19. ; * License along with FFmpeg; if not, write to the Free Software
  20. ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ; */
  22. %include "libavutil/x86/x86util.asm"
  23.  
  24. SECTION .text
  25.  
  26. ; void ff_hevc_idctHxW_dc_{8,10}_<opt>(int16_t *coeffs)
  27. ; %1 = HxW
  28. ; %2 = number of loops
  29. ; %3 = bitdepth
  30. %macro IDCT_DC 3
  31. cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
  32.     movsx             tmpq, word [coeffq]
  33.     add               tmpw, ((1 << 14-%3) + 1)
  34.     sar               tmpw, (15-%3)
  35.     movd               xm0, tmpd
  36.     SPLATW              m0, xm0
  37.     DEFINE_ARGS coeff, cnt
  38.     mov               cntd, %2
  39. .loop:
  40.     mova [coeffq+mmsize*0], m0
  41.     mova [coeffq+mmsize*1], m0
  42.     mova [coeffq+mmsize*2], m0
  43.     mova [coeffq+mmsize*3], m0
  44.     mova [coeffq+mmsize*4], m0
  45.     mova [coeffq+mmsize*5], m0
  46.     mova [coeffq+mmsize*6], m0
  47.     mova [coeffq+mmsize*7], m0
  48.     add  coeffq, mmsize*8
  49.     dec  cntd
  50.     jg  .loop
  51.     RET
  52. %endmacro
  53.  
  54. ; %1 = HxW
  55. ; %2 = bitdepth
  56. %macro IDCT_DC_NL 2 ; No loop
  57. cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
  58.     movsx             tmpq, word [coeffq]
  59.     add               tmpw, ((1 << 14-%2) + 1)
  60.     sar               tmpw, (15-%2)
  61.     movd                m0, tmpd
  62.     SPLATW              m0, xm0
  63.     mova [coeffq+mmsize*0], m0
  64.     mova [coeffq+mmsize*1], m0
  65.     mova [coeffq+mmsize*2], m0
  66.     mova [coeffq+mmsize*3], m0
  67. %if mmsize == 16
  68.     mova [coeffq+mmsize*4], m0
  69.     mova [coeffq+mmsize*5], m0
  70.     mova [coeffq+mmsize*6], m0
  71.     mova [coeffq+mmsize*7], m0
  72. %endif
  73.     RET
  74. %endmacro
  75.  
  76. ; 8-bit
  77. INIT_MMX mmxext
  78. IDCT_DC_NL  4,      8
  79. IDCT_DC     8,  2,  8
  80.  
  81. INIT_XMM sse2
  82. IDCT_DC_NL  8,      8
  83. IDCT_DC    16,  4,  8
  84. IDCT_DC    32, 16,  8
  85.  
  86. %if HAVE_AVX2_EXTERNAL
  87. INIT_YMM avx2
  88. IDCT_DC    16,  2,  8
  89. IDCT_DC    32,  8,  8
  90. %endif ;HAVE_AVX2_EXTERNAL
  91.  
  92. ; 10-bit
  93. INIT_MMX mmxext
  94. IDCT_DC_NL  4,     10
  95. IDCT_DC     8,  2, 10
  96.  
  97. INIT_XMM sse2
  98. IDCT_DC_NL  8,     10
  99. IDCT_DC    16,  4, 10
  100. IDCT_DC    32, 16, 10
  101.  
  102. %if HAVE_AVX2_EXTERNAL
  103. INIT_YMM avx2
  104. IDCT_DC    16,  2, 10
  105. IDCT_DC    32,  8, 10
  106. %endif ;HAVE_AVX2_EXTERNAL
  107.  
  108. ; 12-bit
  109. INIT_MMX mmxext
  110. IDCT_DC_NL  4,     12
  111. IDCT_DC     8,  2, 12
  112.  
  113. INIT_XMM sse2
  114. IDCT_DC_NL  8,     12
  115. IDCT_DC    16,  4, 12
  116. IDCT_DC    32, 16, 12
  117.  
  118. %if HAVE_AVX2_EXTERNAL
  119. INIT_YMM avx2
  120. IDCT_DC    16,  2, 12
  121. IDCT_DC    32,  8, 12
  122. %endif ;HAVE_AVX2_EXTERNAL
  123.