Subversion Repositories Kolibri OS

Rev

Rev 1905 | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. /*
  2.         decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
  3.  
  4.         copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
  5.         see COPYING and AUTHORS files in distribution or http://mpg123.org
  6.         initially written by the mysterious higway for MMX (apparently)
  7.         then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
  8.         Both have agreed to distribution under LGPL 2.1 .
  9.  
  10.         Transformed back into standalone asm, with help of
  11.         gcc -S -DHAVE_CONFIG_H -I.  -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
  12.  
  13.         The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
  14.         This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
  15.         That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
  16.         Maybe I'll change it in future, but now I need something that works.
  17.  
  18.         Original comment from MPlayer source follows:
  19. */
  20.  
  21. /*
  22.  * this code comes under GPL
  23.  * This code was taken from http://www.mpg123.org
  24.  * See ChangeLog of mpg123-0.59s-pre.1 for detail
  25.  * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
  26.  *
  27.  * Local ChangeLog:
  28.  * - Partial loops unrolling and removing MOVW insn from loops
  29. */
  30.  
  31. #include "mangle.h"
  32.  
        .data
        ALIGN8
/* Mask 0xFFFF0000 in each 32-bit lane: keeps the HIGH word of every dword.
   Used on the existing output qword to preserve the OTHER channel's samples
   in the interleaved stereo stream. (Name and value look swapped relative to
   each other; kept as-is since both users rely on these exact values.) */
one_null:
        .long   -65536
        .long   -65536
        ALIGN8
/* Mask 0x0000FFFF in each 32-bit lane: keeps the LOW word of every dword.
   Used on the freshly computed samples before OR-merging them into the
   interleaved output. */
null_one:
        .long   65535
        .long   65535
  42.  
        .text
        ALIGN16
        /* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins) */
/*
 * MMX synth window: converts one block of 32 subband values (bandPtr)
 * into 32 signed 16-bit PCM samples for one channel, written interleaved
 * into samples[]. buffs is a 16-entry ring of past DCT-64 outputs indexed
 * by *bo; decwins is the window table.
 * NOTE(review): decwins is declared float* above, but it is consumed here
 * with pmaddwd, i.e. as 16-bit fixed-point words — presumably the table is
 * generated in that format for the MMX path; confirm against the table
 * generator before touching the scaling (psrad $13 below).
 * cdecl, i386. Clobbers eax/ecx/edx, flags, and the whole MMX state
 * (released with emms before returning). ebx/esi/edi/ebp are saved.
 */
.globl SYNTH_NAME
SYNTH_NAME:
        pushl   %ebp
/* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
        movl    %esp, %ebp
/* Now the old stack addresses are preserved via %ebp. */
        subl  $4,%esp /* What has been called temp before. */
        pushl   %edi
        pushl   %esi
        pushl   %ebx
/* After the three pushes the temp slot (ebp-4) sits at esp+12. */
#define TEMP 12(%esp)
/* APP */
        movl 12(%ebp),%ecx              /* ecx = channel (0 or 1) */
        movl 16(%ebp),%edi              /* edi = samples */
        movl $15,%ebx                   /* ebx = ring index mask (16 slots) */
        movl 24(%ebp),%edx              /* edx = bo (ring offset pointer) */
        leal (%edi,%ecx,2),%edi         /* edi = &samples[channel]: this channel's interleaved slot */
        decl %ecx
        movl 20(%ebp),%esi              /* esi = buffs */
        movl (%edx),%eax                /* eax = *bo */
        jecxz .L01                      /* channel 1: reuse current offset and first buffer half */
        decl %eax                       /* channel 0: step the ring offset ... */
        andl %ebx,%eax                  /* ... modulo 16 */
        leal 1088(%esi),%esi            /* ... and use the second half of buffs */
        movl %eax,(%edx)                /* publish updated *bo */
        .L01:
        leal (%esi,%eax,2),%edx         /* edx = ring slot at offset bo (shorts) */
        movl %eax,TEMP                  /* remember the offset for the window base below */
        incl %eax
        andl %ebx,%eax                  /* (bo+1) & 15 */
        leal 544(%esi,%eax,2),%ecx      /* ecx = companion slot 544 bytes further */
        incl %ebx                       /* ebx = 16 */
        testl $1, %eax
        jnz .L02
        xchgl %edx,%ecx                 /* even parity: swap the two DCT output targets */
        incl TEMP
        leal 544(%esi),%esi
        .L02:
/* cdecl call: MPL_DCT64(ecx, edx, bandPtr) fills both ring slots from bandPtr. */
        pushl 8(%ebp)
        pushl %edx
        pushl %ecx
        call MPL_DCT64
        addl $12, %esp
        leal 1(%ebx), %ecx              /* ecx = 17 = samples in the forward half (8*2 + 1) */
        subl TEMP,%ebx                  /* ebx = 16 - saved offset */
        pushl %ecx                      /* keep the count; bit 0 decides the odd extra sample */
        /* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
        movl 28(%ebp),%ecx
        leal (%ecx,%ebx,2), %edx        /* edx = decwins + (16 - offset) 16-bit words */
        movl (%esp),%ecx /* restore, but leave value on stack */
        shrl $1, %ecx                   /* 8 iterations, two output samples each */
        ALIGN16
/* Forward loop: per iteration, two samples. Each sample is four pmaddwd
   dot products (16 window words x 16 buffer words), summed, horizontally
   added, rescaled by >>13, saturate-packed to 16 bit, then OR-merged into
   the interleaved stereo output using the one_null/null_one masks so the
   other channel's words are preserved. */
        .L03:
        movq  (%edx),%mm0
        movq  64(%edx),%mm4
        pmaddwd (%esi),%mm0
        pmaddwd 32(%esi),%mm4
        movq  8(%edx),%mm1
        movq  72(%edx),%mm5
        pmaddwd 8(%esi),%mm1
        pmaddwd 40(%esi),%mm5
        movq  16(%edx),%mm2
        movq  80(%edx),%mm6
        pmaddwd 16(%esi),%mm2
        pmaddwd 48(%esi),%mm6
        movq  24(%edx),%mm3
        movq  88(%edx),%mm7
        pmaddwd 24(%esi),%mm3
        pmaddwd 56(%esi),%mm7
        paddd %mm1,%mm0                 /* mm0 = sample A partial sums */
        paddd %mm5,%mm4                 /* mm4 = sample B partial sums */
        paddd %mm2,%mm0
        paddd %mm6,%mm4
        paddd %mm3,%mm0
        paddd %mm7,%mm4
        movq  %mm0,%mm1
        movq  %mm4,%mm5
        psrlq $32,%mm1                  /* horizontal add: high dword + low dword */
        psrlq $32,%mm5
        paddd %mm1,%mm0
        paddd %mm5,%mm4
        psrad $13,%mm0                  /* rescale fixed-point result */
        psrad $13,%mm4
        packssdw %mm0,%mm0              /* saturate to signed 16 bit */
        packssdw %mm4,%mm4
        movq    (%edi), %mm1            /* existing output qword (both channels) */
        punpckldq %mm4, %mm0            /* mm0 words: [B B A A] -> low word of each dword is ours */
        pand   one_null, %mm1           /* keep the other channel's words */
        pand   null_one, %mm0           /* keep our two new samples */
        por    %mm0, %mm1
        movq   %mm1,(%edi)
        leal 64(%esi),%esi
        leal 128(%edx),%edx
        leal 8(%edi),%edi               /* advance two interleaved stereo frames */
        decl %ecx
        jnz  .L03
        popl %ecx
        andl $1, %ecx                   /* count was 17: one leftover forward sample */
        jecxz .next_loop
/* Single leftover sample: same dot product; movw stores only this
   channel's word, leaving the neighbour untouched. */
        movq  (%edx),%mm0
        pmaddwd (%esi),%mm0
        movq  8(%edx),%mm1
        pmaddwd 8(%esi),%mm1
        movq  16(%edx),%mm2
        pmaddwd 16(%esi),%mm2
        movq  24(%edx),%mm3
        pmaddwd 24(%esi),%mm3
        paddd %mm1,%mm0
        paddd %mm2,%mm0
        paddd %mm3,%mm0
        movq  %mm0,%mm1
        psrlq $32,%mm1
        paddd %mm1,%mm0
        psrad $13,%mm0
        packssdw %mm0,%mm0
        movd %mm0,%eax
        movw %ax, (%edi)
        leal 32(%esi),%esi
        leal 64(%edx),%edx
        leal 4(%edi),%edi
        .next_loop:
        subl $64,%esi                   /* second half: walk the ring buffer backwards */
        movl $7,%ecx                    /* 7 iterations, two samples each (+1 below = 15) */
        ALIGN16
/* Backward loop: as .L03, but the buffer is read at descending offsets and
   the packed result is negated (psubd zeroes, psubsw = saturating negate)
   before the masked merge. */
        .L04:
        movq  (%edx),%mm0
        movq  64(%edx),%mm4
        pmaddwd (%esi),%mm0
        pmaddwd -32(%esi),%mm4
        movq  8(%edx),%mm1
        movq  72(%edx),%mm5
        pmaddwd 8(%esi),%mm1
        pmaddwd -24(%esi),%mm5
        movq  16(%edx),%mm2
        movq  80(%edx),%mm6
        pmaddwd 16(%esi),%mm2
        pmaddwd -16(%esi),%mm6
        movq  24(%edx),%mm3
        movq  88(%edx),%mm7
        pmaddwd 24(%esi),%mm3
        pmaddwd -8(%esi),%mm7
        paddd %mm1,%mm0
        paddd %mm5,%mm4
        paddd %mm2,%mm0
        paddd %mm6,%mm4
        paddd %mm3,%mm0
        paddd %mm7,%mm4
        movq  %mm0,%mm1
        movq  %mm4,%mm5
        psrlq $32,%mm1
        psrlq $32,%mm5
        paddd %mm0,%mm1
        paddd %mm4,%mm5
        psrad $13,%mm1
        psrad $13,%mm5
        packssdw %mm1,%mm1
        packssdw %mm5,%mm5
        psubd %mm0,%mm0                 /* zero idiom */
        psubd %mm4,%mm4
        psubsw %mm1,%mm0                /* mm0 = -mm1 with saturation */
        psubsw %mm5,%mm4
        movq    (%edi), %mm1
        punpckldq %mm4, %mm0
        pand   one_null, %mm1           /* preserve the other channel */
        pand   null_one, %mm0           /* insert our negated samples */
        por    %mm0, %mm1
        movq   %mm1,(%edi)
        subl $64,%esi
        addl $128,%edx
        leal 8(%edi),%edi
        decl %ecx
        jnz  .L04
/* Final single backward sample, negated like the loop above. */
        movq  (%edx),%mm0
        pmaddwd (%esi),%mm0
        movq  8(%edx),%mm1
        pmaddwd 8(%esi),%mm1
        movq  16(%edx),%mm2
        pmaddwd 16(%esi),%mm2
        movq  24(%edx),%mm3
        pmaddwd 24(%esi),%mm3
        paddd %mm1,%mm0
        paddd %mm2,%mm0
        paddd %mm3,%mm0
        movq  %mm0,%mm1
        psrlq $32,%mm1
        paddd %mm0,%mm1
        psrad $13,%mm1
        packssdw %mm1,%mm1
        psubd %mm0,%mm0                 /* zero idiom */
        psubsw %mm1,%mm0                /* negate with saturation */
        movd %mm0,%eax
        movw %ax,(%edi)
        emms                            /* release MMX state so the x87 FPU is usable again */

/* NO_APP */
        popl    %ebx
        popl    %esi
        popl    %edi
        addl $4,%esp                    /* drop the temp slot */
        popl    %ebp
        ret
  247.