/* Source: Kolibri OS Subversion repository (web blame view; revision 3960, author Serge). */
/*
	synth_sse_float: SSE optimized synth (stereo specific, float output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Register aliases used by synth_1to1_real_s_sse_asm.
	Each alias names the register that holds the corresponding C argument
	after the function has loaded it from its stack frame.
*/
/* real *window; */
#define WINDOW %ebx
/* real *b0l; */
#define B0L %edx
/* real *b0r; */
#define B0R %esi
/* real *samples; */
#define SAMPLES %edi

/*
	TEMP(n): the n-th 16-byte scratch slot on the stack.
	The function aligns %esp to 16 bytes, reserves 128 bytes and only then
	pushes three 4-byte registers, so the scratch area begins 12 bytes above
	%esp and every slot is 16-byte aligned (the slots are accessed with movaps).
	Pass a plain integer: n is not parenthesised in the expansion.
	Slots 0-2 and 4-7 are used; slot 3 is never referenced.
*/
#define TEMP(n) (12+16*n)(%esp)

/*
	int synth_1to1_real_s_sse_asm(real *window, real *b0l, real *b0r, real *samples, int bo1);
	return value: number of clipped samples (always 0)
*/

#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN32
/*
	scale_sse: four identical 32-bit words used as a packed-float multiplier.
	939524096 = 0x38000000, the IEEE-754 single-precision encoding of 2^-15
	(1/32768). Every left and right output vector is multiplied by this
	constant just before it is stored.
*/
ASM_NAME(scale_sse):
	.long   939524096
	.long   939524096
	.long   939524096
	.long   939524096
/*
	synth_1to1_real_s_sse_asm(window, b0l, b0r, samples, bo1)

	Stack arguments (relative to %ebp after the prologue):
	   8(%ebp) window, 12(%ebp) b0l, 16(%ebp) b0r, 20(%ebp) samples, 24(%ebp) bo1

	Output: 32 interleaved left/right float pairs (256 bytes) written to
	samples. Two loops run four iterations each, and every iteration emits
	four pairs.

	One "row" is a 16-element product of window[] with b0l[] (left) and with
	b0r[] (right), reduced to one vector of four partial sums per channel.
	Four rows are gathered, transposed, and their four partial sums combined:
	   first loop : p0 - p1 + p2 - p3, b0l/b0r advance by 64 bytes per row
	   second loop: p0 + p1 + p2 + p3, b0l/b0r step back by 64 bytes per row
	The window pointer advances by 128 bytes per row in both loops.

	Alignment: b0l and b0r are used directly as mulps memory operands, which
	SSE requires to be 16-byte aligned. window and samples are accessed with
	movups, so they may be unaligned.

	Returns 0 in %eax.
	Preserves %ebx, %esi, %edi, %ebp.
	Overwrites %eax, %ecx, %edx, %xmm0-%xmm7 and the flags.
*/
	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_real_s_sse_asm)
ASM_NAME(synth_1to1_real_s_sse_asm):
	/* Frame setup: align %esp to 16 and reserve 128 bytes of scratch. */
	/* The three pushes that follow are why TEMP() adds 12 to %esp. */
	pushl		%ebp
	movl		%esp, %ebp
	andl		$-16, %esp
	subl		$128, %esp
	pushl		%ebx
	pushl		%esi
	pushl		%edi

	/* Load the arguments; %eax = bo1 * 4 (byte offset of bo1 floats). */
	movl		8(%ebp), WINDOW
	movl		12(%ebp), B0L
	movl		16(%ebp), B0R
	movl		20(%ebp), SAMPLES
	movl		24(%ebp), %eax
	shll		$2, %eax

	/* WINDOW = window + 64 - 4*bo1 (in bytes). */
	leal		64(WINDOW), WINDOW
	subl		%eax, WINDOW

	/* First loop: four iterations. */
	movl		$4, %ecx

	ALIGN16
Loop_start_1:
	/* Row 0: left partial sums -> TEMP(0), right -> TEMP(4). */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm1, %xmm5
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	mulps		0(B0L), %xmm0
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		0(B0R), %xmm4
	mulps		16(B0R), %xmm5
	mulps		32(B0R), %xmm6
	mulps		48(B0R), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	movaps		%xmm0, TEMP(0)
	movaps		%xmm4, TEMP(4)

	leal		128(WINDOW), WINDOW
	leal		64(B0L), B0L
	leal		64(B0R), B0R

	/* Row 1: left partial sums -> TEMP(1), right -> TEMP(5). */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm1, %xmm5
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	mulps		0(B0L), %xmm0
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		0(B0R), %xmm4
	mulps		16(B0R), %xmm5
	mulps		32(B0R), %xmm6
	mulps		48(B0R), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	movaps		%xmm0, TEMP(1)
	movaps		%xmm4, TEMP(5)

	leal		128(WINDOW), WINDOW
	leal		64(B0L), B0L
	leal		64(B0R), B0R

	/* Row 2: left partial sums -> TEMP(2), right -> TEMP(6). */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm1, %xmm5
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	mulps		0(B0L), %xmm0
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		0(B0R), %xmm4
	mulps		16(B0R), %xmm5
	mulps		32(B0R), %xmm6
	mulps		48(B0R), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	movaps		%xmm0, TEMP(2)
	movaps		%xmm4, TEMP(6)

	leal		128(WINDOW), WINDOW
	leal		64(B0L), B0L
	leal		64(B0R), B0R

	/* Row 3: left partial sums stay in %xmm7, right -> TEMP(7). */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm1, %xmm5
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	mulps		0(B0L), %xmm0
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		0(B0R), %xmm4
	mulps		16(B0R), %xmm5
	mulps		32(B0R), %xmm6
	mulps		48(B0R), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	movaps		%xmm0, %xmm7
	movaps		%xmm4, TEMP(7)

	leal		128(WINDOW), WINDOW
	leal		64(B0L), B0L
	leal		64(B0R), B0R

	/* Left channel: transpose the four row vectors so that lane k of */
	/* every row lands in one register (%xmm4, %xmm6, %xmm0, %xmm1 for */
	/* k = 0..3), then form p0 - p1 + p2 - p3. Result kept in %xmm2. */
	movaps		TEMP(0), %xmm4
	movaps		TEMP(1), %xmm5
	movaps		TEMP(2), %xmm6
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps		%xmm4, %xmm2
	movaps		%xmm0, %xmm3
	movlhps		%xmm6, %xmm4
	movhlps		%xmm2, %xmm6
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	subps		%xmm6, %xmm4
	subps		%xmm1, %xmm0
	addps		%xmm4, %xmm0
	movaps		%xmm0, %xmm2

	/* Right channel: same transpose and signed sum; result in %xmm0. */
	/* The left result is moved from %xmm2 to %xmm5 along the way. */
	movaps		TEMP(4), %xmm4
	movaps		TEMP(5), %xmm5
	movaps		TEMP(6), %xmm6
	movaps		TEMP(7), %xmm7
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps		%xmm2, %xmm5
	movaps		%xmm4, %xmm2
	movaps		%xmm0, %xmm3
	movlhps		%xmm6, %xmm4
	movhlps		%xmm2, %xmm6
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	subps		%xmm6, %xmm4
	subps		%xmm1, %xmm0
	addps		%xmm4, %xmm0

	/* Scale both channels, interleave as L0 R0 L1 R1 / L2 R2 L3 R3 */
	/* and store 32 bytes to the output. */
	mulps		ASM_NAME(scale_sse), %xmm5
	mulps		ASM_NAME(scale_sse), %xmm0
	movaps		%xmm5, %xmm1
	unpcklps	%xmm0, %xmm5
	unpckhps	%xmm0, %xmm1
	movups		%xmm5, (SAMPLES)
	movups		%xmm1, 16(SAMPLES)

	leal		32(SAMPLES), SAMPLES
	decl		%ecx
	jnz			Loop_start_1

	/* Second loop: four iterations; B0L/B0R now step backwards. */
	movl		$4, %ecx

	ALIGN16
Loop_start_2:
	/* Row 0: left partial sums -> TEMP(0), right -> TEMP(4). */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm1, %xmm5
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	mulps		0(B0L), %xmm0
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		0(B0R), %xmm4
	mulps		16(B0R), %xmm5
	mulps		32(B0R), %xmm6
	mulps		48(B0R), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	movaps		%xmm0, TEMP(0)
	movaps		%xmm4, TEMP(4)

	leal		128(WINDOW), WINDOW
	leal		-64(B0L), B0L
	leal		-64(B0R), B0R

	/* Row 1: left partial sums -> TEMP(1), right -> TEMP(5). */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm1, %xmm5
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	mulps		0(B0L), %xmm0
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		0(B0R), %xmm4
	mulps		16(B0R), %xmm5
	mulps		32(B0R), %xmm6
	mulps		48(B0R), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	movaps		%xmm0, TEMP(1)
	movaps		%xmm4, TEMP(5)

	leal		128(WINDOW), WINDOW
	leal		-64(B0L), B0L
	leal		-64(B0R), B0R

	/* Row 2: left partial sums -> TEMP(2), right -> TEMP(6). */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm1, %xmm5
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	mulps		0(B0L), %xmm0
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		0(B0R), %xmm4
	mulps		16(B0R), %xmm5
	mulps		32(B0R), %xmm6
	mulps		48(B0R), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	movaps		%xmm0, TEMP(2)
	movaps		%xmm4, TEMP(6)

	leal		128(WINDOW), WINDOW
	leal		-64(B0L), B0L
	leal		-64(B0R), B0R

	/* Row 3: left partial sums stay in %xmm7, right -> TEMP(7). */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm1, %xmm5
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	mulps		0(B0L), %xmm0
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		0(B0R), %xmm4
	mulps		16(B0R), %xmm5
	mulps		32(B0R), %xmm6
	mulps		48(B0R), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	movaps		%xmm0, %xmm7
	movaps		%xmm4, TEMP(7)

	leal		128(WINDOW), WINDOW
	leal		-64(B0L), B0L
	leal		-64(B0R), B0R

	/* Left channel: same transpose as in the first loop, but the four */
	/* partial sums are all added (p0 + p1 + p2 + p3). Result in %xmm2. */
	movaps		TEMP(0), %xmm4
	movaps		TEMP(1), %xmm5
	movaps		TEMP(2), %xmm6
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps		%xmm4, %xmm2
	movaps		%xmm0, %xmm3
	movlhps		%xmm6, %xmm4
	movhlps		%xmm2, %xmm6
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	addps		%xmm6, %xmm4
	addps		%xmm1, %xmm0
	addps		%xmm4, %xmm0
	movaps		%xmm0, %xmm2

	/* Right channel: transpose and add; result in %xmm0. */
	/* The left result is moved from %xmm2 to %xmm5 along the way. */
	movaps		TEMP(4), %xmm4
	movaps		TEMP(5), %xmm5
	movaps		TEMP(6), %xmm6
	movaps		TEMP(7), %xmm7
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps		%xmm2, %xmm5
	movaps		%xmm4, %xmm2
	movaps		%xmm0, %xmm3
	movlhps		%xmm6, %xmm4
	movhlps		%xmm2, %xmm6
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	addps		%xmm6, %xmm4
	addps		%xmm1, %xmm0
	addps		%xmm4, %xmm0

	/* Scale, interleave left/right and store 32 bytes to the output. */
	mulps		ASM_NAME(scale_sse), %xmm5
	mulps		ASM_NAME(scale_sse), %xmm0
	movaps		%xmm5, %xmm1
	unpcklps	%xmm0, %xmm5
	unpckhps	%xmm0, %xmm1
	movups		%xmm5, (SAMPLES)
	movups		%xmm1, 16(SAMPLES)

	leal		32(SAMPLES), SAMPLES
	decl		%ecx
	jnz			Loop_start_2

	/* Return value: 0. */
	xorl		%eax, %eax

	/* Restore the saved registers; %ebp still holds the entry %esp, */
	/* which undoes both the alignment and the 128-byte reservation. */
	popl		%edi
	popl		%esi
	popl		%ebx
	movl		%ebp, %esp
	popl		%ebp

	ret

NONEXEC_STACK