Rev 1905 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
1905 | serge | 1 | /* |
2 | decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic) |
||
3 | |||
4 | copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1 |
||
5 | see COPYING and AUTHORS files in distribution or http://mpg123.org |
||
6 | initially written by the mysterious higway for MMX (apparently) |
||
7 | then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec |
||
8 | Both have agreed to distribution under LGPL 2.1 . |
||
9 | |||
10 | Transformed back into standalone asm, with help of |
||
11 | gcc -S -DHAVE_CONFIG_H -I. -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c} |
||
12 | |||
13 | The difference between SSE and 3DNowExt is the dct64 function and the synth function name. |
||
14 | This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S... |
||
15 | That's not memory efficient since there's doubled code, but it's easier than giving another function pointer. |
||
16 | Maybe I'll change it in future, but now I need something that works. |
||
17 | |||
18 | Original comment from MPlayer source follows: |
||
19 | */ |
||
20 | |||
21 | /* |
||
22 | * this code comes under GPL |
||
23 | * This code was taken from http://www.mpg123.org |
||
24 | * See ChangeLog of mpg123-0.59s-pre.1 for detail |
||
25 | * Applied to mplayer by Nick Kurshev |
||
26 | * |
||
27 | * Local ChangeLog: |
||
28 | * - Partial loops unrolling and removing MOVW insn from loops |
||
29 | */ |
||
30 | |||
31 | #include "mangle.h" |
||
32 | |||
/*
 * Two 8-byte bit masks, each built from a pair of identical 32-bit words,
 * used by the synth routine when it merges new 16-bit samples into memory:
 *   one_null  0xffff0000 in each dword -> keeps the upper 16 bits of a dword
 *   null_one  0x0000ffff in each dword -> keeps the lower 16 bits of a dword
 * ALIGN8 is not defined in this file (presumably it comes from mangle.h).
 */
	.data
	ALIGN8
one_null:
	.long	0xffff0000
	.long	0xffff0000
	ALIGN8
null_one:
	.long	0x0000ffff
	.long	0x0000ffff
||
42 | |||
/*
 * void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins)
 *
 * Target: 32-bit x86, AT&T syntax, all arguments passed on the stack.
 * SYNTH_NAME, MPL_DCT64, ALIGN16 are macros that are not defined in this file.
 *
 * What the routine does, as far as this file shows:
 *   1. Adjusts *bo and picks positions inside buffs depending on channel.
 *   2. Calls MPL_DCT64(ptr_a, ptr_b, bandPtr) with two pointers into buffs.
 *   3. Produces 32 output samples.  Each one is a 16-term dot product of
 *      16-bit values (pmaddwd) from the window at %edx and the buffer at
 *      %esi, shifted right by 13 and saturated to 16 bits.
 *        - 8 iterations x 2 samples, buffer pointer moving forward
 *        - 1 single sample
 *        - 7 iterations x 2 samples, buffer pointer moving backward,
 *          results negated with saturation
 *        - 1 single negated sample
 *      Samples are stored every 4 bytes starting at samples + 2*channel.
 *      The 16-bit slots in between are preserved by the paired stores and
 *      not written by the single stores (presumably the other channel of
 *      an interleaved stereo stream - confirm with the caller).
 *
 * Register roles after setup:
 *   %edi  output pointer          %esi  buffer pointer
 *   %edx  window pointer          %ecx  loop counter
 *   %ebx  15 (mask), then 16, then 16 - TEMP
 *
 * The code relies on MPL_DCT64 preserving %ebx, %esi, %edi and %ebp.
 * %ebx, %esi, %edi and %ebp are saved and restored here; MMX state is
 * released with emms before returning.
 */
	.text
	ALIGN16
.globl SYNTH_NAME
SYNTH_NAME:
	pushl	%ebp
/* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
	movl	%esp, %ebp
/* From here on the arguments are addressed via %ebp. */
	subl	$4, %esp		/* one 32-bit local, accessed as TEMP */
	pushl	%edi
	pushl	%esi
	pushl	%ebx
/*
 * The local sits directly below the saved %ebp.  It is addressed relative
 * to the frame pointer so that it stays valid no matter what is currently
 * pushed on the stack (the same slot as 12(%esp) at this stack depth).
 */
#define TEMP -4(%ebp)

	movl	12(%ebp), %ecx		/* channel */
	movl	16(%ebp), %edi		/* samples */
	movl	$15, %ebx		/* mask: index modulo 16 */
	movl	24(%ebp), %edx		/* bo (pointer) */
	leal	(%edi,%ecx,2), %edi	/* output = samples + channel (in shorts) */
	decl	%ecx
	movl	20(%ebp), %esi		/* buffs */
	movl	(%edx), %eax		/* eax = *bo */
	jecxz	.Lbo_ready		/* channel == 1: keep *bo and buffs as they are */
	decl	%eax
	andl	%ebx, %eax		/* eax = (*bo - 1) & 15 */
	leal	1088(%esi), %esi	/* buffs += 1088 bytes */
	movl	%eax, (%edx)		/* *bo = eax */
.Lbo_ready:
	leal	(%esi,%eax,2), %edx	/* edx = buffs + 2*bo */
	movl	%eax, TEMP		/* TEMP = bo */
	incl	%eax
	andl	%ebx, %eax		/* eax = (bo + 1) & 15 */
	leal	544(%esi,%eax,2), %ecx	/* ecx = buffs + 544 + 2*((bo + 1) & 15) */
	incl	%ebx			/* ebx = 16 */
	testl	$1, %eax
	jnz	.Lcall_dct		/* (bo + 1) odd: use the pointers as computed */
	xchgl	%edx, %ecx		/* otherwise swap the two DCT pointers, */
	incl	TEMP			/* use bo + 1 as window offset, */
	leal	544(%esi), %esi		/* and read from 544 bytes further on */
.Lcall_dct:
	pushl	8(%ebp)			/* 3rd argument: bandPtr */
	pushl	%edx			/* 2nd argument */
	pushl	%ecx			/* 1st argument */
	call	MPL_DCT64
	addl	$12, %esp
	subl	TEMP, %ebx		/* ebx = 16 - TEMP */
	/* decwins used to be a global: leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
	movl	28(%ebp), %ecx		/* decwins */
	leal	(%ecx,%ebx,2), %edx	/* window = decwins + 2*(16 - TEMP) bytes */
	/*
	 * First half: 17 samples = 8 pairs + 1 single.  The count is a
	 * constant (%ebx is always 16 here), so it is loaded directly instead
	 * of being derived from %ebx + 1 and parked on the stack.
	 */
	movl	$8, %ecx
	ALIGN16
.Lfwd_pairs:
	/* mm0..mm3: sample A (window +0,  buffer +0);
	   mm4..mm7: sample B (window +64, buffer +32), interleaved. */
	movq	(%edx), %mm0
	movq	64(%edx), %mm4
	pmaddwd	(%esi), %mm0
	pmaddwd	32(%esi), %mm4
	movq	8(%edx), %mm1
	movq	72(%edx), %mm5
	pmaddwd	8(%esi), %mm1
	pmaddwd	40(%esi), %mm5
	movq	16(%edx), %mm2
	movq	80(%edx), %mm6
	pmaddwd	16(%esi), %mm2
	pmaddwd	48(%esi), %mm6
	movq	24(%edx), %mm3
	movq	88(%edx), %mm7
	pmaddwd	24(%esi), %mm3
	pmaddwd	56(%esi), %mm7
	paddd	%mm1, %mm0		/* accumulate partial sums */
	paddd	%mm5, %mm4
	paddd	%mm2, %mm0
	paddd	%mm6, %mm4
	paddd	%mm3, %mm0
	paddd	%mm7, %mm4
	movq	%mm0, %mm1
	movq	%mm4, %mm5
	psrlq	$32, %mm1		/* fold the upper dword onto the lower one */
	psrlq	$32, %mm5
	paddd	%mm1, %mm0
	paddd	%mm5, %mm4
	psrad	$13, %mm0		/* scale down */
	psrad	$13, %mm4
	packssdw %mm0, %mm0		/* saturate to signed 16 bit */
	packssdw %mm4, %mm4
	movq	(%edi), %mm1		/* current output memory */
	punpckldq %mm4, %mm0		/* mm0 = { A dword, B dword } */
	pand	one_null, %mm1		/* keep upper word of each dword in memory */
	pand	null_one, %mm0		/* keep lower word of each new dword */
	por	%mm0, %mm1
	movq	%mm1, (%edi)
	leal	64(%esi), %esi		/* buffer: two rows of 32 bytes forward */
	leal	128(%edx), %edx		/* window: two rows of 64 bytes forward */
	leal	8(%edi), %edi		/* output: two slots of 4 bytes forward */
	decl	%ecx
	jnz	.Lfwd_pairs

	/* 17th sample of the first half, computed alone. */
	movq	(%edx), %mm0
	pmaddwd	(%esi), %mm0
	movq	8(%edx), %mm1
	pmaddwd	8(%esi), %mm1
	movq	16(%edx), %mm2
	pmaddwd	16(%esi), %mm2
	movq	24(%edx), %mm3
	pmaddwd	24(%esi), %mm3
	paddd	%mm1, %mm0
	paddd	%mm2, %mm0
	paddd	%mm3, %mm0
	movq	%mm0, %mm1
	psrlq	$32, %mm1
	paddd	%mm1, %mm0
	psrad	$13, %mm0
	packssdw %mm0, %mm0
	movd	%mm0, %eax
	movw	%ax, (%edi)		/* 16-bit store, the neighbouring slot is not touched */
	leal	32(%esi), %esi
	leal	64(%edx), %edx
	leal	4(%edi), %edi

	/*
	 * Second half: 7 pairs + 1 single.  The buffer pointer now walks
	 * backwards in 32-byte rows while the window pointer keeps advancing,
	 * and every result is negated (0 - x with saturation).
	 */
	subl	$64, %esi
	movl	$7, %ecx
	ALIGN16
.Lrev_pairs:
	/* mm0..mm3: sample A (window +0,  buffer +0);
	   mm4..mm7: sample B (window +64, buffer -32), interleaved. */
	movq	(%edx), %mm0
	movq	64(%edx), %mm4
	pmaddwd	(%esi), %mm0
	pmaddwd	-32(%esi), %mm4
	movq	8(%edx), %mm1
	movq	72(%edx), %mm5
	pmaddwd	8(%esi), %mm1
	pmaddwd	-24(%esi), %mm5
	movq	16(%edx), %mm2
	movq	80(%edx), %mm6
	pmaddwd	16(%esi), %mm2
	pmaddwd	-16(%esi), %mm6
	movq	24(%edx), %mm3
	movq	88(%edx), %mm7
	pmaddwd	24(%esi), %mm3
	pmaddwd	-8(%esi), %mm7
	paddd	%mm1, %mm0
	paddd	%mm5, %mm4
	paddd	%mm2, %mm0
	paddd	%mm6, %mm4
	paddd	%mm3, %mm0
	paddd	%mm7, %mm4
	movq	%mm0, %mm1
	movq	%mm4, %mm5
	psrlq	$32, %mm1
	psrlq	$32, %mm5
	paddd	%mm0, %mm1		/* sums end up in mm1 / mm5 this time */
	paddd	%mm4, %mm5
	psrad	$13, %mm1
	psrad	$13, %mm5
	packssdw %mm1, %mm1
	packssdw %mm5, %mm5
	pxor	%mm0, %mm0		/* mm0 = 0 */
	pxor	%mm4, %mm4		/* mm4 = 0 */
	psubsw	%mm1, %mm0		/* mm0 = -A (saturating) */
	psubsw	%mm5, %mm4		/* mm4 = -B (saturating) */
	movq	(%edi), %mm1
	punpckldq %mm4, %mm0
	pand	one_null, %mm1
	pand	null_one, %mm0
	por	%mm0, %mm1
	movq	%mm1, (%edi)
	subl	$64, %esi		/* buffer: two rows backward */
	addl	$128, %edx		/* window: two rows forward */
	leal	8(%edi), %edi
	decl	%ecx
	jnz	.Lrev_pairs

	/* Last sample, computed alone and negated. */
	movq	(%edx), %mm0
	pmaddwd	(%esi), %mm0
	movq	8(%edx), %mm1
	pmaddwd	8(%esi), %mm1
	movq	16(%edx), %mm2
	pmaddwd	16(%esi), %mm2
	movq	24(%edx), %mm3
	pmaddwd	24(%esi), %mm3
	paddd	%mm1, %mm0
	paddd	%mm2, %mm0
	paddd	%mm3, %mm0
	movq	%mm0, %mm1
	psrlq	$32, %mm1
	paddd	%mm0, %mm1
	psrad	$13, %mm1
	packssdw %mm1, %mm1
	pxor	%mm0, %mm0
	psubsw	%mm1, %mm0
	movd	%mm0, %eax
	movw	%ax, (%edi)
	emms				/* leave MMX state so x87 code can run again */

	popl	%ebx
	popl	%esi
	popl	%edi
	addl	$4, %esp		/* drop TEMP */
	popl	%ebp
	ret