Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. ;******************************************************************************
  2. ;* TTA DSP SIMD optimizations
  3. ;*
  4. ;* Copyright (C) 2014 James Almer
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22.  
  23. %include "libavutil/x86/x86util.asm"
  24.  
  25. SECTION_RODATA
  26.  
  ; Per-lane masks used when the upper half of dx (dx[4..7]) is regenerated from
  ; dl[4..7]: the shifted dl values are first OR-ed with pd_1224 and then AND-ed
  ; with pd_n0113, lane by lane. Each table is exactly four dwords (16 bytes)
  ; because it is consumed as a full XMM memory operand by por/pand.
  ; NOTE(review): those non-VEX memory operands need 16-byte alignment; this is
  ; assumed to be provided by SECTION_RODATA - confirm in x86inc.asm.
  27. pd_n0113: dd ~0, ~1, ~1, ~3      ; AND masks: clear the low 0, 1, 1 and 2 bits of lanes 0..3
  28. pd_1224:  dd 1, 2, 2, 4          ; OR masks: force bit 0, 1, 1 and 2 of lanes 0..3
  29.  
  30. SECTION .text
  31.  
  ;------------------------------------------------------------------------------
  ; TTA_FILTER cpu, xmm_count
  ;   %1 = instruction-set name handed to INIT_XMM
  ;   %2 = number of XMM registers declared on the cglobal line
  ;
  ; Emits ttafilter_process_dec(qm, dx, dl, error, in, shift, round): one update
  ; step of the TTA filter. The C equivalent of every stage is spelled out in the
  ; right-hand comments of the body.
  ;
  ;   qm, dx, dl  pointers to 8 x 32-bit integers each (two XMM vectors are
  ;               loaded from / stored to every one of them)
  ;   error, in   pointers to a single 32-bit integer, read and written via movd
  ;   shift,round scalar values, read through the shiftm / roundm operands
  ;
  ; Uses m0-m6; the non-SSE4 multiply path additionally uses m7.
  ; NOTE(review): all vector accesses use mova, x86inc's aligned move, so qm, dx
  ; and dl are presumably required to be 16-byte aligned - confirm with callers.
  ;------------------------------------------------------------------------------
  32. %macro TTA_FILTER 2
  33. INIT_XMM %1
          ; 5,5,%2: five arguments in GPRs, five GPRs used, %2 XMM registers
          ; (x86inc cglobal convention - confirm against x86inc.asm). shift and
          ; round are named but only ever accessed as shiftm / roundm.
  34. cglobal ttafilter_process_dec, 5,5,%2, qm, dx, dl, error, in, shift, round
          ; m2:m3 = qm[0..7], m4:m5 = dx[0..7]
  35.     mova       m2, [qmq       ]
  36.     mova       m3, [qmq + 0x10]
  37.     mova       m4, [dxq       ]
  38.     mova       m5, [dxq + 0x10]
  39.  
          ; Branch-free form of the three-way test on error: psignd yields dx
          ; negated where error < 0, zero where error == 0 and unchanged where
          ; error > 0, so a single paddd per vector covers every case.
          ; SPLATD is an x86util helper; it is used here to replicate the error
          ; dword into all four lanes. m4/m5 are left untouched for later use.
  40.     movd       m6, [errorq]         ; if (filter->error < 0) {
  41.     SPLATD     m6                   ;     for (int i = 0; i < 8; i++)
  42.     psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
  43.     psignd     m1, m5, m6           ; } else if (filter->error > 0) {
  44.     paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
  45.     paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
  46.     mova       [qmq       ], m2     ; }
  47.     mova       [qmq + 0x10], m3     ;
  48.  
          ; m0:m1 = dl[0..7]; m2:m3 still hold the updated qm[0..7]
  49.     mova       m0, [dlq       ]
  50.     mova       m1, [dlq + 0x10]
  51.  
          ; Per-lane 32-bit products p[i] = qm[i] * dl[i], left in m2:m3.
  52. %if cpuflag(sse4)
  53.     pmulld     m2, m0
  54.     pmulld     m3, m1
  55. %else
          ; Without pmulld the products are built from pmuludq, which multiplies
          ; dword lanes 0 and 2 into two 64-bit results:
          ;   0xb1 swaps the dwords of each pair, bringing lanes 1/3 to 0/2
          ;   0xd8 gathers the low halves of both 64-bit products into lanes 0/1
          ;   punpckldq interleaves even and odd products back into lane order
          ; Only the low 32 bits are kept, and those are identical for signed
          ; and unsigned multiplication. m6 and m7 are scratch here.
  56.     pshufd     m6, m0, 0xb1
  57.     pshufd     m7, m2, 0xb1
  58.     pmuludq    m6, m7
  59.     pshufd     m6, m6, 0xd8
  60.     pmuludq    m2, m0
  61.     pshufd     m2, m2, 0xd8
  62.     punpckldq  m2, m6
  63.  
          ; Same sequence for the upper half (qm[4..7] * dl[4..7]).
  64.     pshufd     m6, m1, 0xb1
  65.     pshufd     m7, m3, 0xb1
  66.     pmuludq    m6, m7
  67.     pshufd     m6, m6, 0xd8
  68.     pmuludq    m3, m1
  69.     pshufd     m3, m3, 0xd8
  70.     punpckldq  m3, m6
  71. %endif
          ; Horizontal reduction into lane 0 of m6:
          ;   paddd m2, m3     -> lane i = p[i] + p[i+4]
          ;   0xe + paddd      -> lanes 0/1 = sums of even / odd products
          ;   movd roundm      -> m6 = { round, 0, 0, 0 }
          ;   0x1 + paddd      -> lane 0 of m6 = round + all eight products
          ; The other lanes of m6 hold leftovers and are never stored.
  72.     ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
  73.     paddd      m2, m3               ; int sum = filter->round +
  74.                                     ;           filter->dl[0] * filter->qm[0] +
  75.     pshufd     m3, m2, 0xe          ;           filter->dl[1] * filter->qm[1] +
  76.     paddd      m2, m3               ;           filter->dl[2] * filter->qm[2] +
  77.                                     ;           filter->dl[3] * filter->qm[3] +
  78.     movd       m6, roundm           ;           filter->dl[4] * filter->qm[4] +
  79.     paddd      m6, m2               ;           filter->dl[5] * filter->qm[5] +
  80.     pshufd     m2, m2, 0x1          ;           filter->dl[6] * filter->qm[6] +
  81.     paddd      m6, m2               ;           filter->dl[7] * filter->qm[7];
  82.  
          ; Slide the lower halves down by one element: palignr by 4 bytes over
          ; the high:low pair yields elements 1..4. m4/m5 are the dx values as
          ; loaded at the top; m0/m1 are the dl values as loaded above.
  83.     palignr    m5, m4, 4            ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
  84.                                     ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];
  85.  
  86.     palignr    m2, m1, m0, 4        ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
  87.                                     ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];
  88.  
          ; New dx[4..7] from the old dl[4..7] (m1 is only read): arithmetic
          ; shift by 30, then the per-lane OR and AND masks from the tables.
  89.     psrad      m4, m1, 30           ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
  90.     por        m4, [pd_1224 ]       ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
  91.     pand       m4, [pd_n0113]       ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
  92.                                     ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;
  93.  
          ; Write back dl[0..3] and all of dx; dl[4..7] is stored at the very end.
  94.     mova       [dlq       ], m2
  95.     mova       [dxq       ], m5
  96.     mova       [dxq + 0x10], m4
          ; error takes the value of *in from before the update below.
  97.     movd       m0, [inq]            ; filter->error = *in;
  98.     movd       [errorq], m0         ;
  99.  
          ; movd leaves the shift count in the low bits of m2 with the rest
          ; zeroed, which is the form psrad expects for a register count. Only
          ; lane 0 of m0/m6 is meaningful; movd stores just that lane.
  100.     movd       m2, shiftm           ; *in += (sum >> filter->shift);
  101.     psrad      m6, m2               ;
  102.     paddd      m0, m6               ;
  103.     movd       [inq], m0            ;
  104.  
          ; Rebuild dl[4..7]. m1 still holds the old dl[4..7] and lane 0 of m0
          ; is the new *in (written as "in" below). Lanes listed low to high:
          ;   psrldq m1, 4     -> m1 = { dl5, dl6, dl7, 0 }
          ;   pslldq + 0xf0    -> m0 = { 0, 0, in, in }   (stale upper lanes shifted out)
          ;   psubd            -> m0 = { -dl5, -dl6, in-dl7, in }
          ;   psrldq + 0xf4    -> m1 = { -dl6, in-dl7, 0, 0 }
          ;   paddd            -> m0 = { -dl5-dl6, in-dl6-dl7, in-dl7, in }
          ;   psrldq m1, 4     -> m1 = { in-dl7, 0, 0, 0 }
          ;   paddd            -> m0 = { in-dl5-dl6-dl7, in-dl6-dl7, in-dl7, in }
          ; which equals the sequential C statements on the right.
  105.     psrldq     m1, 4                ;
  106.     pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
  107.     pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
  108.     psubd      m0, m1               ; filter->dl[6] = *in - filter->dl[7];
  109.     psrldq     m1, m0, 4            ; filter->dl[7] = *in;
  110.     pshufd     m1, m1, 0xf4         ; filter->dl[5] += filter->dl[6];
  111.     paddd      m0, m1               ; filter->dl[4] += filter->dl[5];
  112.     psrldq     m1, 4                ;
  113.     paddd      m0, m1               ;
  114.     mova       [dlq + 0x10], m0     ;
  115.     RET
  116. %endmacro
  117.  
  ; One copy of ttafilter_process_dec per instruction set. The second argument is
  ; the XMM register count handed to cglobal inside the macro.
  118. TTA_FILTER ssse3, 8              ; pmuludq-based multiply path uses m0-m7
  119. TTA_FILTER sse4,  7              ; pmulld path only needs m0-m6
  120.