Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
3960 | Serge | 1 | /* |
2 | decode_3dnow.s - 3DNow! optimized synth_1to1() |
||
3 | |||
4 | copyright ?-2007 by the mpg123 project - free software under the terms of the LGPL 2.1 |
||
5 | see COPYING and AUTHORS files in distribution or http://mpg123.org |
||
6 | initially written by Syuuhei Kashiyama |
||
7 | |||
8 | This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama |
||
9 |
|
||
10 | |||
11 | - remove PREFETCH instruction for speedup |
||
12 | - changed function name to support automatic 3DNow! detection |
||
13 | - femms moved to before 'call dct64_3dnow' |
||
14 | |||
15 | You can find Kashiyama's original 3dnow! support patch |
||
16 | (for mpg123-0.59o) at |
||
17 | http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). |
||
18 | |||
19 | by KIMURA Takuhiro |
||
20 |
|
||
21 | |||
22 | |||
23 | |||
24 | Replacement of synth_1to1() with AMD's 3DNow! SIMD operations support |
||
25 | |||
26 | Syuuhei Kashiyama |
||
27 | |||
28 | The author of this program disclaim whole expressed or implied |
||
29 | warranties with regard to this program, and in no event shall the |
||
30 | author of this program liable to whatever resulted from the use of |
||
31 | this program. Use it at your own risk. |
||
32 | */ |
||
33 | |||
34 | #include "mangle.h" |
||
35 | |||
36 | #ifdef ACCURATE_ROUNDING |
||
37 | #ifndef __APPLE__ |
||
38 | .section .rodata |
||
39 | #else |
||
40 | .data |
||
41 | #endif |
||
42 | ALIGN8 /* the three constants below are read as 64-bit memory operands by pfmin, pfmax and pfadd in the ACCURATE_ROUNDING paths */ |
||
43 | max_s16: /* upper clamp applied with pfmin */ |
||
44 | .long 1191181824 /* 32767.0 */ |
||
45 | .long 1191181824 /* same value again for the second 32-bit lane */ |
||
46 | min_s16: /* lower clamp applied with pfmax */ |
||
47 | .long -956301312 /* -32768.0 */ |
||
48 | .long -956301312 |
||
49 | ftoi_magic: /* added with pfadd after clamping; the low 16 bits of the resulting bit pattern are then stored as the sample */ |
||
50 | .long 1262485504 /* 2^23 + 2^22 */ |
||
51 | .long 1262485504 |
||
52 | #endif |
||
53 | .text |
||
54 | ALIGN16 |
||
55 | .globl ASM_NAME(synth_1to1_3dnow_asm) |
||
56 | /* int synth_1to1_3dnow_asm(real *bandPtr, int channel, unsigned char *out, unsigned char *buffs, int *bo, real *decwin); */ |
||
57 | ASM_NAME(synth_1to1_3dnow_asm): /* calls dct64_3dnow on bandPtr, then writes 16 + 1 + 15 = 32 windowed 16-bit samples to out at a 4-byte stride; all arguments are read from the stack */ |
||
58 | subl $24,%esp /* reserve the six 4-byte local slots listed in the stack map below */ |
||
59 | pushl %ebp |
||
60 | pushl %edi |
||
61 | xorl %ebp,%ebp /* ebp = 0; ebp is the counter copied to eax before returning */ |
||
62 | pushl %esi |
||
63 | pushl %ebx |
||
64 | /* stack old: 0=ebx 4=esi 8=edi 12=ebp 16,20,24,28,32,36=local 40=back 44=bandptr 48=channel 52=out 56=pnt */ |
||
65 | /* stack new: 0=ebx 4=esi 8=edi 12=ebp 16,20,24,28,32,36=local 40=back 44=bandptr 48=channel 52=out 56=buffs 60=bo 64=decwin */ |
||
66 | #define OUT 52(%esp) |
||
67 | #define CHANNEL 48(%esp) |
||
68 | #define BANDPTR 44(%esp) |
||
69 | #define BUFFS 56(%esp) |
||
70 | #define BO 60(%esp) |
||
71 | #define DECWIN 64(%esp) |
||
72 | #define LOCAL0 16(%esp) |
||
73 | #define LOCAL1 20(%esp) |
||
74 | #define LOCAL5 36(%esp) |
||
75 | movl OUT,%esi |
||
76 | movl %esi,LOCAL0 /* save buffer start (samples pointer) to another local var */ |
||
77 | movl CHANNEL,%ebx |
||
78 | movl BO,%esi /* bo address */ |
||
79 | movl (%esi),%edx /* bo value */ |
||
80 | |||
81 | femms /* issued ahead of the dct64_3dnow call, as noted in the file header */ |
||
82 | testl %ebx,%ebx |
||
83 | jne .L26 |
||
84 | /* if(!channel) */ |
||
85 | decl %edx /* --bo */ |
||
86 | andl $15,%edx /* wrap bo into the range 0..15 */ |
||
87 | movl %edx,(%esi) /* save bo */ |
||
88 | movl BUFFS,%ecx /* ecx = buffs, the buffer base used when channel is 0 */ |
||
89 | jmp .L27 |
||
90 | .L26: /* if(channel) */ |
||
91 | addl $2,LOCAL0 /* samples++ */ |
||
92 | movl BUFFS,%ecx |
||
93 | addl $2176,%ecx /* ecx = buffs + 2176 bytes, the buffer base used when channel is nonzero */ |
||
94 | .L27: |
||
95 | /* edx (and its lower end) still holds bo value */ |
||
96 | testb $1,%dl /* bo & 0x1 */ |
||
97 | je .L28 |
||
98 | movl %edx,LOCAL5 /* odd bo: LOCAL5 = bo */ |
||
99 | movl %ecx,%ebx /* ebx = buffer base; it is the buffer read pointer in the window loops, so dct64_3dnow must preserve it */ |
||
100 | movl BANDPTR,%esi |
||
101 | movl %edx,%edi |
||
102 | pushl %esi /* pushed first (last argument in cdecl order): bandPtr */ |
||
103 | sall $2,%edi /* edi = bo * 4, a byte offset */ |
||
104 | movl %ebx,%eax |
||
105 | movl %edi,24(%esp) /* LOCAL1, actually: the slot sits 4 bytes further from esp because of the push above */ |
||
106 | addl %edi,%eax |
||
107 | pushl %eax /* pushed second: base + bo * 4 */ |
||
108 | movl %edx,%eax |
||
109 | incl %eax |
||
110 | andl $15,%eax |
||
111 | leal 1088(,%eax,4),%eax |
||
112 | addl %ebx,%eax |
||
113 | pushl %eax /* pushed last (first argument in cdecl order): base + 1088 + ((bo + 1) & 15) * 4 */ |
||
114 | call ASM_NAME(dct64_3dnow) |
||
115 | addl $12,%esp /* drop the three pushed arguments */ |
||
116 | jmp .L29 |
||
117 | .L28: |
||
118 | leal 1(%edx),%esi /* even bo: esi = bo + 1 */ |
||
119 | movl BANDPTR,%edi |
||
120 | movl %esi,LOCAL5 /* LOCAL5 = bo + 1 */ |
||
121 | leal 1092(%ecx,%edx,4),%eax /* eax = base + 1088 + (bo + 1) * 4 */ |
||
122 | pushl %edi /* pushed first (last argument in cdecl order): bandPtr */ |
||
123 | leal 1088(%ecx),%ebx /* ebx = base + 1088, the buffer read pointer in the window loops */ |
||
124 | pushl %eax /* pushed second */ |
||
125 | sall $2,%esi /* esi = (bo + 1) * 4, a byte offset */ |
||
126 | leal (%ecx,%edx,4),%eax |
||
127 | pushl %eax /* pushed last (first argument in cdecl order): base + bo * 4 */ |
||
128 | call ASM_NAME(dct64_3dnow) |
||
129 | addl $12,%esp /* drop the three pushed arguments */ |
||
130 | movl %esi,LOCAL1 /* stored after the call, so this relies on dct64_3dnow preserving esi */ |
||
131 | .L29: |
||
132 | movl DECWIN,%edx |
||
133 | addl $64,%edx |
||
134 | movl $16,%ecx /* .L33 runs 16 times */ |
||
135 | subl LOCAL1,%edx /* edx = decwin + 64 - LOCAL1, the window read pointer */ |
||
136 | movl LOCAL0,%edi /* edi = output write pointer */ |
||
137 | |||
138 | pcmpeqb %mm7,%mm7 /* mm7 = all ones */ |
||
139 | pslld $31,%mm7 /* mm7 = only the top bit set in each 32-bit lane; used by pxor in .L46 */ |
||
140 | movq (%edx),%mm0 /* preload the first window pair and buffer pair for .L33 */ |
||
141 | movq (%ebx),%mm1 |
||
142 | ALIGN32 |
||
143 | .L33: /* each pass multiplies 16 window values by 16 buffer values, two per instruction, and accumulates the products lane-wise */ |
||
144 | movq 8(%edx),%mm3 |
||
145 | pfmul %mm1,%mm0 |
||
146 | movq 8(%ebx),%mm4 |
||
147 | movq 16(%edx),%mm5 |
||
148 | pfmul %mm4,%mm3 |
||
149 | movq 16(%ebx),%mm6 |
||
150 | pfadd %mm3,%mm0 |
||
151 | movq 24(%edx),%mm1 |
||
152 | pfmul %mm6,%mm5 |
||
153 | movq 24(%ebx),%mm2 |
||
154 | pfadd %mm5,%mm0 |
||
155 | movq 32(%edx),%mm3 |
||
156 | pfmul %mm2,%mm1 |
||
157 | movq 32(%ebx),%mm4 |
||
158 | pfadd %mm1,%mm0 |
||
159 | movq 40(%edx),%mm5 |
||
160 | pfmul %mm4,%mm3 |
||
161 | movq 40(%ebx),%mm6 |
||
162 | pfadd %mm3,%mm0 |
||
163 | movq 48(%edx),%mm1 |
||
164 | pfmul %mm6,%mm5 |
||
165 | movq 48(%ebx),%mm2 |
||
166 | pfadd %mm0,%mm5 |
||
167 | movq 56(%edx),%mm3 |
||
168 | pfmul %mm1,%mm2 |
||
169 | movq 56(%ebx),%mm4 |
||
170 | pfadd %mm5,%mm2 |
||
171 | addl $64,%ebx /* advance the buffer pointer by 64 bytes */ |
||
172 | subl $-128,%edx /* advance the window pointer by 128 bytes */ |
||
173 | movq (%edx),%mm0 /* already load the first operands of the next pass */ |
||
174 | pfmul %mm4,%mm3 |
||
175 | movq (%ebx),%mm1 |
||
176 | pfadd %mm3,%mm2 |
||
177 | movq %mm2,%mm3 |
||
178 | psrlq $32,%mm3 /* move the high-lane sum down into the low lane of mm3 */ |
||
179 | pfsub %mm3,%mm2 /* low lane of mm2 = low-lane sum minus high-lane sum */ |
||
180 | incl %ebp |
||
181 | #ifdef ACCURATE_ROUNDING |
||
182 | pfmin (max_s16),%mm2 /* clamp from above */ |
||
183 | pfmax (min_s16),%mm2 /* clamp from below */ |
||
184 | pfadd (ftoi_magic),%mm2 /* add the magic constant; the low 16 bits stored below are taken from this bit pattern */ |
||
185 | #else |
||
186 | pf2id %mm2,%mm2 /* convert both lanes to 32-bit integers */ |
||
187 | packssdw %mm2,%mm2 /* pack to 16 bits with signed saturation */ |
||
188 | #endif |
||
189 | movd %mm2,%eax |
||
190 | movw %ax,0(%edi) /* store one 16-bit sample */ |
||
191 | addl $4,%edi /* next output slot is 4 bytes further on */ |
||
192 | decl %ecx |
||
193 | jnz .L33 |
||
194 | |||
195 | movd (%ebx),%mm0 /* 17th sample: only every other 32-bit element, at byte offsets 0, 8, 16 up to 56, is read from buffer and window */ |
||
196 | movd (%edx),%mm1 |
||
197 | punpckldq 8(%ebx),%mm0 |
||
198 | punpckldq 8(%edx),%mm1 |
||
199 | movd 16(%ebx),%mm3 |
||
200 | movd 16(%edx),%mm4 |
||
201 | pfmul %mm1,%mm0 |
||
202 | punpckldq 24(%ebx),%mm3 |
||
203 | punpckldq 24(%edx),%mm4 |
||
204 | movd 32(%ebx),%mm5 |
||
205 | movd 32(%edx),%mm6 |
||
206 | pfmul %mm4,%mm3 |
||
207 | punpckldq 40(%ebx),%mm5 |
||
208 | punpckldq 40(%edx),%mm6 |
||
209 | pfadd %mm3,%mm0 |
||
210 | movd 48(%ebx),%mm1 |
||
211 | movd 48(%edx),%mm2 |
||
212 | pfmul %mm6,%mm5 |
||
213 | punpckldq 56(%ebx),%mm1 |
||
214 | punpckldq 56(%edx),%mm2 |
||
215 | pfadd %mm5,%mm0 |
||
216 | pfmul %mm2,%mm1 |
||
217 | pfadd %mm1,%mm0 |
||
218 | pfacc %mm1,%mm0 /* low lane of mm0 = sum of both lanes of mm0 */ |
||
219 | #ifdef ACCURATE_ROUNDING |
||
220 | pfmin (max_s16),%mm0 |
||
221 | pfmax (min_s16),%mm0 |
||
222 | pfadd (ftoi_magic),%mm0 |
||
223 | #else |
||
224 | pf2id %mm0,%mm0 |
||
225 | packssdw %mm0,%mm0 |
||
226 | #endif |
||
227 | movd %mm0,%eax |
||
228 | movw %ax,0(%edi) /* store the 17th sample */ |
||
229 | incl %ebp |
||
230 | movl LOCAL5,%esi |
||
231 | addl $-64,%ebx /* step the buffer pointer back by 64 bytes */ |
||
232 | movl $15,%ebp /* NOTE(review): this overwrites the count built by the incl instructions above, so the returned value is 15 plus the 15 increments in .L46 - confirm this is intended */ |
||
233 | addl $4,%edi |
||
234 | leal -128(%edx,%esi,8),%edx /* edx = edx - 128 + LOCAL5 * 8 */ |
||
235 | |||
236 | movl $15,%ecx /* .L46 runs 15 times */ |
||
237 | movd (%ebx),%mm0 /* preload the first operands for .L46 */ |
||
238 | movd -4(%edx),%mm1 |
||
239 | punpckldq 4(%ebx),%mm0 |
||
240 | punpckldq -8(%edx),%mm1 |
||
241 | ALIGN32 |
||
242 | .L46: /* buffer elements are read at ascending offsets 0..60, window elements at descending offsets -4..-60 and finally offset 0 */ |
||
243 | movd 8(%ebx),%mm3 |
||
244 | movd -12(%edx),%mm4 |
||
245 | pfmul %mm1,%mm0 |
||
246 | punpckldq 12(%ebx),%mm3 |
||
247 | punpckldq -16(%edx),%mm4 |
||
248 | movd 16(%ebx),%mm5 |
||
249 | movd -20(%edx),%mm6 |
||
250 | pfmul %mm4,%mm3 |
||
251 | punpckldq 20(%ebx),%mm5 |
||
252 | punpckldq -24(%edx),%mm6 |
||
253 | pfadd %mm3,%mm0 |
||
254 | movd 24(%ebx),%mm1 |
||
255 | movd -28(%edx),%mm2 |
||
256 | pfmul %mm6,%mm5 |
||
257 | punpckldq 28(%ebx),%mm1 |
||
258 | punpckldq -32(%edx),%mm2 |
||
259 | pfadd %mm5,%mm0 |
||
260 | movd 32(%ebx),%mm3 |
||
261 | movd -36(%edx),%mm4 |
||
262 | pfmul %mm2,%mm1 |
||
263 | punpckldq 36(%ebx),%mm3 |
||
264 | punpckldq -40(%edx),%mm4 |
||
265 | pfadd %mm1,%mm0 |
||
266 | movd 40(%ebx),%mm5 |
||
267 | movd -44(%edx),%mm6 |
||
268 | pfmul %mm4,%mm3 |
||
269 | punpckldq 44(%ebx),%mm5 |
||
270 | punpckldq -48(%edx),%mm6 |
||
271 | pfadd %mm3,%mm0 |
||
272 | movd 48(%ebx),%mm1 |
||
273 | movd -52(%edx),%mm2 |
||
274 | pfmul %mm6,%mm5 |
||
275 | punpckldq 52(%ebx),%mm1 |
||
276 | punpckldq -56(%edx),%mm2 |
||
277 | pfadd %mm0,%mm5 |
||
278 | movd 56(%ebx),%mm3 |
||
279 | movd -60(%edx),%mm4 |
||
280 | pfmul %mm2,%mm1 |
||
281 | punpckldq 60(%ebx),%mm3 |
||
282 | punpckldq (%edx),%mm4 |
||
283 | pfadd %mm1,%mm5 |
||
284 | addl $-128,%edx /* window pointer moves back by 128 bytes per pass */ |
||
285 | addl $-64,%ebx /* buffer pointer moves back by 64 bytes per pass */ |
||
286 | movd (%ebx),%mm0 /* already load the first operands of the next pass */ |
||
287 | movd -4(%edx),%mm1 |
||
288 | pfmul %mm4,%mm3 |
||
289 | punpckldq 4(%ebx),%mm0 |
||
290 | punpckldq -8(%edx),%mm1 |
||
291 | pfadd %mm5,%mm3 |
||
292 | pfacc %mm3,%mm3 /* add the two lanes together */ |
||
293 | incl %ebp |
||
294 | pxor %mm7,%mm3 /* flip the sign bit of each lane, negating the sum */ |
||
295 | #ifdef ACCURATE_ROUNDING |
||
296 | pfmin (max_s16),%mm3 |
||
297 | pfmax (min_s16),%mm3 |
||
298 | pfadd (ftoi_magic),%mm3 |
||
299 | #else |
||
300 | pf2id %mm3,%mm3 |
||
301 | packssdw %mm3,%mm3 |
||
302 | #endif |
||
303 | movd %mm3,%eax |
||
304 | movw %ax,(%edi) /* store one 16-bit sample */ |
||
305 | addl $4,%edi |
||
306 | decl %ecx |
||
307 | jnz .L46 |
||
308 | |||
309 | femms /* issued again before returning to the caller */ |
||
310 | movl %ebp,%eax /* return value = the ebp counter */ |
||
311 | popl %ebx |
||
312 | popl %esi |
||
313 | popl %edi |
||
314 | popl %ebp |
||
315 | addl $24,%esp /* release the local slots reserved on entry */ |
||
316 | ret |
||
317 | |||
318 | NONEXEC_STACK /* macro not defined in this file, presumably supplied by mangle.h - TODO confirm */ |