Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* FLAC DSP functions
  3. ;*
  4. ;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or modify
  9. ;* it under the terms of the GNU General Public License as published by
  10. ;* the Free Software Foundation; either version 2 of the License, or
  11. ;* (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16. ;* GNU General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU General Public License along
  19. ;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  20. ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21. ;******************************************************************************
  22.  
  23. %include "libavutil/x86/x86util.asm"
  24.  
  25. SECTION .text
  26.  
  27. INIT_XMM sse4
      ;-----------------------------------------------------------------------------
      ; flac_enc_lpc_16(res, smp, len, order, coefs, <shift, read from r5m>)
      ;
      ; Computes the LPC residual of a block of 4-byte samples:
      ;     res[i] = smp[i] - ((sum over j of coefs[j] * smp[i-j-1]) >> shift)
      ; The leading entries of res are a plain copy of smp (see the unrolled
      ; copy below); prediction starts at index `order`.
      ;
      ; Multiplies and sums are done in 32-bit lanes (pmulld/paddd), so only the
      ; low 32 bits of every product and accumulator are kept.
      ;
      ; NOTE(review): the shift count is not a named cglobal argument; it is
      ; read from r5m, presumably the 6th C argument -- confirm against the
      ; C prototype.
      ; NOTE(review): order is assumed to be >= 1.  The inner loop is
      ; bottom-tested and only exits when posj, counting up from -order,
      ; reaches zero.
      ; NOTE(review): memory is accessed in whole vectors: the initial copy
      ; always moves 32 samples and every outer pass handles 3*mmsize/4
      ; samples, so loads and stores can run past `len`.  Presumably the caller
      ; pads both buffers -- confirm.
      ;-----------------------------------------------------------------------------
  28. %if ARCH_X86_64
  29.     cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
  30.     DECLARE_REG_TMP 5, 6
          ; Remaining-sample counter lives in the 32-bit view of r2.
  31.     %define length r2d
  32.  
          ; Sign-extend the 32-bit order so orderq can be used as a 64-bit index
          ; in the address calculations below.
  33.     movsxd orderq, orderd
  34. %else
  35.     cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
  36.     DECLARE_REG_TMP 2, 5
          ; Here t0 is r2, so the counter is kept in r2's memory operand instead
          ; (presumably the stack slot of `len`, freeing the register for posj --
          ; confirm against x86inc).
  37.     %define length r2mp
  38. %endif
  39.  
  40. ; Here we assume that the maximum order value is 32.  This means that we only
  41. ; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
  42. ; unroll this loop and copy all 32.
      ; Each repetition moves one full vector (mmsize bytes, i.e. mmsize/4
      ; samples); the copy is unconditional and does not depend on `order`.
  43. %assign iter 0
  44. %rep 32/(mmsize/4)
  45.     movu  m0,         [smpq+iter]
  46.     movu [resq+iter],  m0
  47.     %assign iter iter+mmsize
  48. %endrep
  49.  
      ; Advance all three pointers by `order` elements.  coefsq then points one
      ; past the last coefficient, so coefs[j] is reached with a negative index
      ; (posj) that counts up to zero.
  50. lea  resq,   [resq+orderq*4]
  51. lea  smpq,   [smpq+orderq*4]
  52. lea  coefsq, [coefsq+orderq*4]
  53. sub  length,  orderd             ; length = len - order samples left to predict
  54. movd m3,      r5m                ; m3 = shift count used by psrad below
  55. neg  orderq                      ; orderq = -order, the start value of posj
  56.  
      ; posj runs from -order up to 0 and indexes coefs; negj runs from 0
      ; downwards and indexes the history samples behind smpq.
  57. %define posj t0q
  58. %define negj t1q
  59.  
      ; Outer loop: each pass produces three vectors of residuals.  m0, m4 and
      ; m6 are the three independent prediction accumulators.
  60. .looplen:
  61.     pxor m0,   m0
  62.     pxor m4,   m4
  63.     pxor m6,   m6
  64.     mov  posj, orderq
  65.     xor  negj, negj
  66.  
          ; Inner loop: one coefficient per pass, applied to all three vectors.
          ; SPLATD comes from x86util.asm; presumably it broadcasts the low
          ; dword of m2 to every lane -- confirm.
  67.     .looporder:
  68.         movd   m2, [coefsq+posj*4] ; c = coefs[j]
  69.         SPLATD m2
  70.         movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
  71.         movu   m5, [smpq+negj*4-4+mmsize]
  72.         movu   m7, [smpq+negj*4-4+mmsize*2]
  73.         pmulld m1,  m2
  74.         pmulld m5,  m2
  75.         pmulld m7,  m2
  76.         paddd  m0,  m1             ; p += c * s
  77.         paddd  m4,  m5
  78.         paddd  m6,  m7
  79.  
              ; inc must stay last: its zero flag drives the jnz below, which
              ; leaves the loop once posj has counted up to 0.
  80.         dec    negj
  81.         inc    posj
  82.     jnz .looporder
  83.  
  84.     psrad  m0,     m3              ; p >>= shift
  85.     psrad  m4,     m3
  86.     psrad  m6,     m3
  87.     movu   m1,    [smpq]
  88.     movu   m5,    [smpq+mmsize]
  89.     movu   m7,    [smpq+mmsize*2]
  90.     psubd  m1,     m0              ; smp[i] - p
  91.     psubd  m5,     m4
  92.     psubd  m7,     m6
  93.     movu  [resq],  m1              ; res[i] = smp[i] - (p >> shift)
  94.     movu  [resq+mmsize], m5
  95.     movu  [resq+mmsize*2], m7
  96.  
          ; Step past the three vectors just written.  The sub must directly
          ; precede jg: the loop repeats while the signed remaining count is
          ; still greater than zero.
  97.     add resq,    3*mmsize
  98.     add smpq,    3*mmsize
  99.     sub length, (3*mmsize)/4
  100. jg .looplen
  101. RET
  102.