/*
	decode_3dnow.s - 3DNow! optimized synth_1to1()

	copyright ?-2007 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Syuuhei Kashiyama

	This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama;
	only the following changes have been made:

	- remove the PREFETCH instruction for speedup
	- change the function name to support automatic 3DNow! detection
	- move femms to before 'call dct64_3dnow'

	You can find Kashiyama's original 3DNow! support patch
	(for mpg123-0.59o) at
	http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese).

	by KIMURA Takuhiro  - until 31.Mar.1999
	                    - after  1.Apr.1999

	Replacement of synth_1to1() with AMD's 3DNow! SIMD operations support

	Syuuhei Kashiyama

	The author of this program disclaims all expressed or implied
	warranties with regard to this program, and in no event shall the
	author of this program be liable for anything that results from the
	use of this program. Use it at your own risk.
*/
 
#include "mangle.h"
 
#ifdef ACCURATE_ROUNDING
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN8
max_s16:
	.long   1191181824 /* 32767.0 */
	.long   1191181824
min_s16:
	.long   -956301312 /* -32768.0 */
	.long   -956301312
ftoi_magic:
	.long	1262485504 /* 2^23 + 2^22 */
	.long	1262485504
#endif
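/*
	About the constants above: 1191181824 and -956301312 are the IEEE-754
	bit patterns of 32767.0 and -32768.0, and 1262485504 is 2^23 + 2^22
	(= 12582912.0).  In the ACCURATE_ROUNDING paths below, a sample is
	first clamped to [-32768.0, 32767.0] with pfmin/pfmax; pfadd of
	12582912.0 then pins the exponent so that the rounded sample lands in
	the low mantissa bits, and the low 16 bits of the register are exactly
	the signed 16-bit PCM value stored by movd/movw.  The non-accurate
	path uses pf2id (truncating) plus packssdw (saturating) instead.
*/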
	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_3dnow_asm)
/* int synth_1to1_3dnow_asm(real *bandPtr, int channel, unsigned char *out, unsigned char *buffs, int *bo, real *decwin); */
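/*
	Argument roles as used below (inferred from this code, not from a C header):
	  bandPtr - 32 new subband samples, passed on to dct64_3dnow
	  channel - 0 selects the first ring-buffer pair and decrements *bo;
	            nonzero selects the pair 2176 bytes further on and shifts
	            the output pointer by one 16-bit sample (stereo interleave)
	  out     - interleaved 16-bit PCM output; every store advances 4 bytes
	  buffs   - ring buffers holding the dct64 results of recent calls
	  bo      - ring-buffer offset, kept in 0..15
	  decwin  - synthesis window coefficients
*/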
ASM_NAME(synth_1to1_3dnow_asm):
	subl $24,%esp
	pushl %ebp
	pushl %edi
	xorl %ebp,%ebp
	pushl %esi
	pushl %ebx
/* stack old: 0=ebx 4=esi 8=edi 12=ebp 16,20,24,28,32,36=local 40=back 44=bandptr 48=channel 52=out 56=pnt */
/* stack new: 0=ebx 4=esi 8=edi 12=ebp 16,20,24,28,32,36=local 40=back 44=bandptr 48=channel 52=out 56=buffs 60=bo 64=decwin */
#define OUT     52(%esp)
#define CHANNEL 48(%esp)
#define BANDPTR 44(%esp)
#define BUFFS   56(%esp)
#define BO      60(%esp)
#define DECWIN  64(%esp)
#define LOCAL0  16(%esp)
#define LOCAL1  20(%esp)
#define LOCAL5  36(%esp)
	movl OUT,%esi
	movl %esi,LOCAL0 /* save buffer start (samples pointer) to another local var */
	movl CHANNEL,%ebx
	movl BO,%esi     /* bo address */
	movl (%esi),%edx /* bo value */
 
	femms
	testl %ebx,%ebx
	jne .L26
/* if(!channel) */
	decl %edx   /* --bo */
	andl $15,%edx
	movl %edx,(%esi) /* save bo */
	movl BUFFS,%ecx
	jmp .L27
.L26: /* if(channel) */
	addl $2,LOCAL0   /* samples++ */
	movl BUFFS,%ecx
	addl $2176,%ecx
.L27:
/* edx (and its lower end) still holds the bo value */
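/*
	The parity of bo decides how the fresh dct64_3dnow output is laid out
	across the two 1088-byte buffer halves (offset 0 and offset 1088) and
	which half the windowing loops below read from; the two branches
	differ only in the pointers passed to dct64_3dnow and in the offsets
	saved in LOCAL1/LOCAL5.
*/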
	testb $1,%dl  /* bo & 0x1 */
	je .L28
	movl %edx,LOCAL5
	movl %ecx,%ebx
	movl BANDPTR,%esi
	movl %edx,%edi
	pushl %esi
	sall $2,%edi
	movl %ebx,%eax
	movl %edi,24(%esp) /* LOCAL1, actually */
	addl %edi,%eax
	pushl %eax
	movl %edx,%eax
	incl %eax
	andl $15,%eax
	leal 1088(,%eax,4),%eax
	addl %ebx,%eax
	pushl %eax
	call ASM_NAME(dct64_3dnow)
	addl $12,%esp
	jmp .L29
.L28:
	leal 1(%edx),%esi
	movl BANDPTR,%edi
	movl %esi,LOCAL5
	leal 1092(%ecx,%edx,4),%eax
	pushl %edi
	leal 1088(%ecx),%ebx
	pushl %eax
	sall $2,%esi
	leal (%ecx,%edx,4),%eax
	pushl %eax
	call ASM_NAME(dct64_3dnow)
	addl $12,%esp
	movl %esi,LOCAL1
.L29:
	movl DECWIN,%edx
	addl $64,%edx
	movl $16,%ecx
	subl LOCAL1,%edx
	movl LOCAL0,%edi
 
	pcmpeqb %mm7,%mm7
	pslld $31,%mm7
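/*
	mm7 now holds 0x80000000 in both lanes and is used only as a sign mask
	by the .L46 loop further down.  The .L33 loop below produces the first
	16 output samples: each pass is a 16-tap dot product of window
	coefficients against the buffered dct64 outputs, with even- and
	odd-numbered taps accumulated in the two lanes and the odd lane
	subtracted by the final pfsub.  The window pointer advances 128 bytes
	and the buffer pointer 64 bytes per sample; each result is converted
	to 16 bits and stored every 4 bytes so the two channels stay
	interleaved.
*/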
	movq (%edx),%mm0
	movq (%ebx),%mm1
	ALIGN32
.L33:
	movq 8(%edx),%mm3
	pfmul %mm1,%mm0
	movq 8(%ebx),%mm4
	movq 16(%edx),%mm5
	pfmul %mm4,%mm3
	movq 16(%ebx),%mm6
	pfadd %mm3,%mm0
	movq 24(%edx),%mm1
	pfmul %mm6,%mm5
	movq 24(%ebx),%mm2
	pfadd %mm5,%mm0
	movq 32(%edx),%mm3
	pfmul %mm2,%mm1
	movq 32(%ebx),%mm4
	pfadd %mm1,%mm0
	movq 40(%edx),%mm5
	pfmul %mm4,%mm3
	movq 40(%ebx),%mm6
	pfadd %mm3,%mm0
	movq 48(%edx),%mm1
	pfmul %mm6,%mm5
	movq 48(%ebx),%mm2
	pfadd %mm0,%mm5
	movq 56(%edx),%mm3
	pfmul %mm1,%mm2
	movq 56(%ebx),%mm4
	pfadd %mm5,%mm2
	addl $64,%ebx
	subl $-128,%edx
	movq (%edx),%mm0
	pfmul %mm4,%mm3
	movq (%ebx),%mm1
	pfadd %mm3,%mm2
	movq %mm2,%mm3
	psrlq $32,%mm3
	pfsub %mm3,%mm2
	incl %ebp
#ifdef ACCURATE_ROUNDING
	pfmin (max_s16),%mm2
	pfmax (min_s16),%mm2
	pfadd (ftoi_magic),%mm2
#else
	pf2id %mm2,%mm2
	packssdw %mm2,%mm2
#endif
	movd %mm2,%eax
	movw %ax,0(%edi)
	addl $4,%edi
	decl %ecx
	jnz .L33
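/*
	The single middle sample: only every second window/buffer entry
	(offsets 0, 8, ..., 56) contributes; the taps are gathered with
	movd/punpckldq, the two lanes are summed with pfacc, and the result
	goes through the same clamp/convert/store sequence as above.
*/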
 
	movd (%ebx),%mm0
	movd (%edx),%mm1
	punpckldq 8(%ebx),%mm0
	punpckldq 8(%edx),%mm1
	movd 16(%ebx),%mm3
	movd 16(%edx),%mm4
	pfmul %mm1,%mm0
	punpckldq 24(%ebx),%mm3
	punpckldq 24(%edx),%mm4
	movd 32(%ebx),%mm5
	movd 32(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 40(%ebx),%mm5
	punpckldq 40(%edx),%mm6
	pfadd %mm3,%mm0
	movd 48(%ebx),%mm1
	movd 48(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 56(%ebx),%mm1
	punpckldq 56(%edx),%mm2
	pfadd %mm5,%mm0
	pfmul %mm2,%mm1
	pfadd %mm1,%mm0
	pfacc %mm1,%mm0
#ifdef ACCURATE_ROUNDING
	pfmin (max_s16),%mm0
	pfmax (min_s16),%mm0
	pfadd (ftoi_magic),%mm0
#else
	pf2id %mm0,%mm0
	packssdw %mm0,%mm0
#endif
	movd %mm0,%eax
	movw %ax,0(%edi)
	incl %ebp
	movl LOCAL5,%esi
	addl $-64,%ebx
	movl $15,%ebp
	addl $4,%edi
	leal -128(%edx,%esi,8),%edx
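/*
	Last 15 samples: the window pointer has just been repositioned using
	the bo value saved in LOCAL5, and from here both pointers step
	backwards (the buffer by 64 bytes, the window by 128 bytes per pass)
	while the window is read at negative offsets.  Each lane-summed result
	is negated by the pxor with the mm7 sign mask before the usual
	conversion and store.
*/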
 
	movl $15,%ecx
	movd (%ebx),%mm0
	movd -4(%edx),%mm1
	punpckldq 4(%ebx),%mm0
	punpckldq -8(%edx),%mm1
	ALIGN32
.L46:
	movd 8(%ebx),%mm3
	movd -12(%edx),%mm4
	pfmul %mm1,%mm0
	punpckldq 12(%ebx),%mm3
	punpckldq -16(%edx),%mm4
	movd 16(%ebx),%mm5
	movd -20(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 20(%ebx),%mm5
	punpckldq -24(%edx),%mm6
	pfadd %mm3,%mm0
	movd 24(%ebx),%mm1
	movd -28(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 28(%ebx),%mm1
	punpckldq -32(%edx),%mm2
	pfadd %mm5,%mm0
	movd 32(%ebx),%mm3
	movd -36(%edx),%mm4
	pfmul %mm2,%mm1
	punpckldq 36(%ebx),%mm3
	punpckldq -40(%edx),%mm4
	pfadd %mm1,%mm0
	movd 40(%ebx),%mm5
	movd -44(%edx),%mm6
	pfmul %mm4,%mm3
	punpckldq 44(%ebx),%mm5
	punpckldq -48(%edx),%mm6
	pfadd %mm3,%mm0
	movd 48(%ebx),%mm1
	movd -52(%edx),%mm2
	pfmul %mm6,%mm5
	punpckldq 52(%ebx),%mm1
	punpckldq -56(%edx),%mm2
	pfadd %mm0,%mm5
	movd 56(%ebx),%mm3
	movd -60(%edx),%mm4
	pfmul %mm2,%mm1
	punpckldq 60(%ebx),%mm3
	punpckldq (%edx),%mm4
	pfadd %mm1,%mm5
	addl $-128,%edx
	addl $-64,%ebx
	movd (%ebx),%mm0
	movd -4(%edx),%mm1
	pfmul %mm4,%mm3
	punpckldq 4(%ebx),%mm0
	punpckldq -8(%edx),%mm1
	pfadd %mm5,%mm3
	pfacc %mm3,%mm3
	incl %ebp
	pxor %mm7,%mm3
#ifdef ACCURATE_ROUNDING
	pfmin (max_s16),%mm3
	pfmax (min_s16),%mm3
	pfadd (ftoi_magic),%mm3
#else
	pf2id %mm3,%mm3
	packssdw %mm3,%mm3
#endif
	movd %mm3,%eax
	movw %ax,(%edi)
	addl $4,%edi
	decl %ecx
	jnz .L46
 
	femms
	movl %ebp,%eax
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	addl $24,%esp
	ret
 
NONEXEC_STACK