Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
3960 Serge 1
/*
2
	synth_stereo_sse_s32: SSE optimized synth (stereo specific, s32 output version)
3
 
4
	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
5
	see COPYING and AUTHORS files in distribution or http://mpg123.org
6
	initially written by Taihei Monma
7
*/
8
 
9
#include "mangle.h"
10
 
11
/* real *window; */
12
#define WINDOW %ebx
13
/* real *b0l; */
14
#define B0L %edx
15
/* real *b0r; */
16
#define B0R %esi
17
/* real *samples; */
18
#define SAMPLES %edi
19
 
20
/*
	TEMP(n): n-th 16-byte scratch slot in the stack frame, addressed from %esp.
	The 12-byte bias matches the three 4-byte register pushes the function
	performs after aligning %esp to 16 and reserving 128 bytes, so slots
	0..7 are 16-byte aligned (as the movaps accesses to them require).
*/
#define TEMP(n) (12+16*n)(%esp)
21
/* MMX register that accumulates the per-word clip counts summed up for the return value. */
#define MMREG_CLIP %mm7
22
 
23
/*
24
	int synth_1to1_s32_s_sse_asm(real *window, real *b0l, real *b0r, int32_t *samples, int bo1);
25
	return value: number of clipped samples
26
*/
27
 
28
#ifndef __APPLE__
29
	.section	.rodata
30
#else
31
	.data
32
#endif
33
	ALIGN32
34
/*
	Four packed single-precision copies of 65536.0 (bit pattern 0x47800000).
	The synth loops multiply their float results by this vector (mulps with
	this table as memory operand) before converting them to 32-bit integers.
*/
ASM_NAME(scale_s32):
35
	.long   1199570944 /* 65536.0 */
36
	.long   1199570944
37
	.long   1199570944
38
	.long   1199570944
39
	ALIGN16
40
/*
	Clipping thresholds, two packed single-precision vectors:
	  +0  : upper bound (0x46ffffff), compared with cmpnleps
	  +16 : lower bound (0xc7000000), compared with cmpltps
	The comparisons are done on the values before they are multiplied by
	scale_s32.
*/
ASM_NAME(maxmin_s32):
41
	.long   1191182335 /* 32767.999 */
42
	.long   1191182335
43
	.long   1191182335
44
	.long   1191182335
45
	.long   -956301312 /* -32768.0 */
46
	.long   -956301312
47
	.long   -956301312
48
	.long   -956301312
49
	.text
50
	ALIGN16
51
/*
	Entry point: int synth_1to1_s32_s_sse_asm(window, b0l, b0r, samples, bo1).
	Arguments are read from the stack at 8..24(%ebp).

	Register roles after this setup:
	  WINDOW  (%ebx) = window + 64 - 4*bo1 bytes
	  B0L     (%edx) = b0l
	  B0R     (%esi) = b0r
	  SAMPLES (%edi) = samples (output pointer)
	  MMREG_CLIP     = 0 (clip counter)

	Stack frame: %esp is aligned down to 16 and 128 bytes are reserved for
	the TEMP(0..7) slots; %ebx/%esi/%edi are pushed afterwards and popped
	again before returning.

	NOTE(review): b0l/b0r are used as mulps memory operands in the loops,
	so they presumably must be 16-byte aligned; window is only read with
	movups. Confirm against the callers.
*/
.globl ASM_NAME(synth_1to1_s32_s_sse_asm)
52
ASM_NAME(synth_1to1_s32_s_sse_asm):
53
	pushl		%ebp
54
	movl		%esp, %ebp
55
	/* Align the frame so the TEMP() slots can be accessed with movaps. */
	andl		$-16, %esp
56
	subl		$128, %esp
57
	pushl		%ebx
58
	pushl		%esi
59
	pushl		%edi
60
 
61
	/* Clear the clip counter. */
	pxor		MMREG_CLIP, MMREG_CLIP
62
 
63
	movl		8(%ebp), WINDOW
64
	movl		12(%ebp), B0L
65
	movl		16(%ebp), B0R
66
	movl		20(%ebp), SAMPLES
67
	/* %eax = bo1 * 4 (byte offset, assuming 4-byte elements). */
	movl		24(%ebp), %eax
68
	shll		$2, %eax
69
 
70
	/* WINDOW = window + 64 - 4*bo1, i.e. element index 16 - bo1 for 4-byte elements. */
	leal		64(WINDOW), WINDOW
71
	subl		%eax, WINDOW
72
 
73
	/*
		First loop: 4 passes. Each pass consumes four 16-element rows of
		window/b0l/b0r and writes 4 interleaved left/right 32-bit samples
		(32 bytes) to SAMPLES. B0L/B0R step forward by 64 bytes per row,
		WINDOW by 128 bytes per row (only the first 64 bytes of each step
		are read).
	*/
	movl		$4, %ecx
74
 
75
	ALIGN16
76
Loop_start_1:
77
	/*
		Row 0: xmm0-xmm3 = window[0..15] * b0l[0..15],
		       xmm4-xmm7 = window[0..15] * b0r[0..15];
		the adds fold each channel into one 4-lane partial sum
		(left -> TEMP(0), right -> TEMP(4)).
	*/
	movups		(WINDOW), %xmm0
78
	movups		16(WINDOW), %xmm1
79
	movups		32(WINDOW), %xmm2
80
	movups		48(WINDOW), %xmm3
81
	movaps		%xmm0, %xmm4
82
	movaps		%xmm1, %xmm5
83
	movaps		%xmm2, %xmm6
84
	movaps		%xmm3, %xmm7
85
	mulps		0(B0L), %xmm0
86
	mulps		16(B0L), %xmm1
87
	mulps		32(B0L), %xmm2
88
	mulps		48(B0L), %xmm3
89
	mulps		0(B0R), %xmm4
90
	mulps		16(B0R), %xmm5
91
	mulps		32(B0R), %xmm6
92
	mulps		48(B0R), %xmm7
93
	addps		%xmm1, %xmm0
94
	addps		%xmm3, %xmm2
95
	addps		%xmm5, %xmm4
96
	addps		%xmm7, %xmm6
97
	addps		%xmm2, %xmm0
98
	addps		%xmm6, %xmm4
99
	movaps		%xmm0, TEMP(0)
100
	movaps		%xmm4, TEMP(4)
101
 
102
	leal		128(WINDOW), WINDOW
103
	leal		64(B0L), B0L
104
	leal		64(B0R), B0R
105
 
106
	/* Row 1: same computation; left -> TEMP(1), right -> TEMP(5). */
	movups		(WINDOW), %xmm0
107
	movups		16(WINDOW), %xmm1
108
	movups		32(WINDOW), %xmm2
109
	movups		48(WINDOW), %xmm3
110
	movaps		%xmm0, %xmm4
111
	movaps		%xmm1, %xmm5
112
	movaps		%xmm2, %xmm6
113
	movaps		%xmm3, %xmm7
114
	mulps		0(B0L), %xmm0
115
	mulps		16(B0L), %xmm1
116
	mulps		32(B0L), %xmm2
117
	mulps		48(B0L), %xmm3
118
	mulps		0(B0R), %xmm4
119
	mulps		16(B0R), %xmm5
120
	mulps		32(B0R), %xmm6
121
	mulps		48(B0R), %xmm7
122
	addps		%xmm1, %xmm0
123
	addps		%xmm3, %xmm2
124
	addps		%xmm5, %xmm4
125
	addps		%xmm7, %xmm6
126
	addps		%xmm2, %xmm0
127
	addps		%xmm6, %xmm4
128
	movaps		%xmm0, TEMP(1)
129
	movaps		%xmm4, TEMP(5)
130
 
131
	leal		128(WINDOW), WINDOW
132
	leal		64(B0L), B0L
133
	leal		64(B0R), B0R
134
 
135
	/* Row 2: same computation; left -> TEMP(2), right -> TEMP(6). */
	movups		(WINDOW), %xmm0
136
	movups		16(WINDOW), %xmm1
137
	movups		32(WINDOW), %xmm2
138
	movups		48(WINDOW), %xmm3
139
	movaps		%xmm0, %xmm4
140
	movaps		%xmm1, %xmm5
141
	movaps		%xmm2, %xmm6
142
	movaps		%xmm3, %xmm7
143
	mulps		0(B0L), %xmm0
144
	mulps		16(B0L), %xmm1
145
	mulps		32(B0L), %xmm2
146
	mulps		48(B0L), %xmm3
147
	mulps		0(B0R), %xmm4
148
	mulps		16(B0R), %xmm5
149
	mulps		32(B0R), %xmm6
150
	mulps		48(B0R), %xmm7
151
	addps		%xmm1, %xmm0
152
	addps		%xmm3, %xmm2
153
	addps		%xmm5, %xmm4
154
	addps		%xmm7, %xmm6
155
	addps		%xmm2, %xmm0
156
	addps		%xmm6, %xmm4
157
	movaps		%xmm0, TEMP(2)
158
	movaps		%xmm4, TEMP(6)
159
 
160
	leal		128(WINDOW), WINDOW
161
	leal		64(B0L), B0L
162
	leal		64(B0R), B0R
163
 
164
	/* Row 3: same computation; left sum is kept in xmm7, right -> TEMP(7). */
	movups		(WINDOW), %xmm0
165
	movups		16(WINDOW), %xmm1
166
	movups		32(WINDOW), %xmm2
167
	movups		48(WINDOW), %xmm3
168
	movaps		%xmm0, %xmm4
169
	movaps		%xmm1, %xmm5
170
	movaps		%xmm2, %xmm6
171
	movaps		%xmm3, %xmm7
172
	mulps		0(B0L), %xmm0
173
	mulps		16(B0L), %xmm1
174
	mulps		32(B0L), %xmm2
175
	mulps		48(B0L), %xmm3
176
	mulps		0(B0R), %xmm4
177
	mulps		16(B0R), %xmm5
178
	mulps		32(B0R), %xmm6
179
	mulps		48(B0R), %xmm7
180
	addps		%xmm1, %xmm0
181
	addps		%xmm3, %xmm2
182
	addps		%xmm5, %xmm4
183
	addps		%xmm7, %xmm6
184
	addps		%xmm2, %xmm0
185
	addps		%xmm6, %xmm4
186
	movaps		%xmm0, %xmm7
187
	movaps		%xmm4, TEMP(7)
188
 
189
	leal		128(WINDOW), WINDOW
190
	leal		64(B0L), B0L
191
	leal		64(B0R), B0R
192
 
193
	/*
		Left channel: 4x4 transpose of the row sums (TEMP(0), TEMP(1),
		TEMP(2), xmm7). After the unpck/movlhps/movhlps sequence
		xmm4, xmm6, xmm0, xmm1 hold lane 0, 1, 2, 3 of all four rows.
		They are combined as lane0 - lane1 + lane2 - lane3, so lane k of
		the result is the left output for row k. Result is parked in xmm2.
	*/
	movaps		TEMP(0), %xmm4
194
	movaps		TEMP(1), %xmm5
195
	movaps		TEMP(2), %xmm6
196
	movaps		%xmm4, %xmm0
197
	movaps		%xmm6, %xmm1
198
	unpcklps	%xmm5, %xmm4
199
	unpcklps	%xmm7, %xmm6
200
	unpckhps	%xmm5, %xmm0
201
	unpckhps	%xmm7, %xmm1
202
	movaps		%xmm4, %xmm2
203
	movaps		%xmm0, %xmm3
204
	movlhps		%xmm6, %xmm4
205
	movhlps		%xmm2, %xmm6
206
	movlhps		%xmm1, %xmm0
207
	movhlps		%xmm3, %xmm1
208
	subps		%xmm6, %xmm4
209
	subps		%xmm1, %xmm0
210
	addps		%xmm4, %xmm0
211
	movaps		%xmm0, %xmm2
212
 
213
	/*
		Right channel: same transpose and combine on TEMP(4)..TEMP(7).
		The left result moves from xmm2 to xmm5 part-way through because
		xmm2 is reused as scratch. Ends with left in xmm5, right in xmm0.
	*/
	movaps		TEMP(4), %xmm4
214
	movaps		TEMP(5), %xmm5
215
	movaps		TEMP(6), %xmm6
216
	movaps		TEMP(7), %xmm7
217
	movaps		%xmm4, %xmm0
218
	movaps		%xmm6, %xmm1
219
	unpcklps	%xmm5, %xmm4
220
	unpcklps	%xmm7, %xmm6
221
	unpckhps	%xmm5, %xmm0
222
	unpckhps	%xmm7, %xmm1
223
	movaps		%xmm2, %xmm5
224
	movaps		%xmm4, %xmm2
225
	movaps		%xmm0, %xmm3
226
	movlhps		%xmm6, %xmm4
227
	movhlps		%xmm2, %xmm6
228
	movlhps		%xmm1, %xmm0
229
	movhlps		%xmm3, %xmm1
230
	subps		%xmm6, %xmm4
231
	subps		%xmm1, %xmm0
232
	addps		%xmm4, %xmm0
233
 
234
	/*
		Clip detection and scaling:
		  xmm1 / xmm3 = all-ones mask where left / right exceeds the upper
		                bound (cmpnleps is also true for NaN input)
		  xmm2 / xmm4 = all-ones mask where left / right is below the
		                lower bound
		  xmm5 / xmm0 = left / right multiplied by scale_s32
	*/
	movaps		%xmm5, %xmm1
235
	movaps		%xmm5, %xmm2
236
	movaps		%xmm0, %xmm3
237
	movaps		%xmm0, %xmm4
238
	mulps		ASM_NAME(scale_s32), %xmm5
239
	mulps		ASM_NAME(scale_s32), %xmm0
240
	cmpnleps	ASM_NAME(maxmin_s32), %xmm1
241
	cmpltps		ASM_NAME(maxmin_s32)+16, %xmm2
242
	cmpnleps	ASM_NAME(maxmin_s32), %xmm3
243
	cmpltps		ASM_NAME(maxmin_s32)+16, %xmm4
244
	/*
		Convert the low two lanes (rows 0 and 1) to int32. An out-of-range
		or NaN input converts to 0x80000000 (integer indefinite, with the
		invalid exception masked); an all-ones compare mask is a NaN bit
		pattern, so the converted upper-bound masks become 0x80000000 / 0
		and psrad $31 widens them to 0xffffffff / 0. XORing that into the
		sample turns a too-large sample's 0x80000000 into 0x7fffffff.
	*/
	cvtps2pi	%xmm5, %mm0
245
	cvtps2pi	%xmm0, %mm1
246
	cvtps2pi	%xmm1, %mm2
247
	cvtps2pi	%xmm3, %mm3
248
	psrad		$31, %mm2
249
	psrad		$31, %mm3
250
	pxor		%mm2, %mm0
251
	pxor		%mm3, %mm1
252
	/* Interleave: mm0 = (L0, R0), mm4 = (L1, R1); store 16 bytes. */
	movq		%mm0, %mm4
253
	punpckldq	%mm1, %mm0
254
	punpckhdq	%mm1, %mm4
255
	movq		%mm0, (SAMPLES)
256
	movq		%mm4, 8(SAMPLES)
257
	/* Bring the high two lanes (rows 2 and 3) down and repeat the conversion. */
	movhlps		%xmm5, %xmm5
258
	movhlps		%xmm0, %xmm0
259
	movhlps		%xmm1, %xmm1
260
	movhlps		%xmm3, %xmm3
261
	cvtps2pi	%xmm5, %mm0
262
	cvtps2pi	%xmm0, %mm1
263
	cvtps2pi	%xmm1, %mm4
264
	cvtps2pi	%xmm3, %mm5
265
	psrad		$31, %mm4
266
	psrad		$31, %mm5
267
	pxor		%mm4, %mm0
268
	pxor		%mm5, %mm1
269
	movq		%mm0, %mm6
270
	punpckldq	%mm1, %mm0
271
	punpckhdq	%mm1, %mm6
272
	movq		%mm0, 16(SAMPLES)
273
	movq		%mm6, 24(SAMPLES)
274
 
275
	/*
		Clip counting. Upper-bound masks (mm2/mm4 left, mm3/mm5 right) are
		0 / -1 dwords: pack to words and shift right by 15 to get 0 / 1.
		Lower-bound masks (xmm2 left, xmm4 right) are converted to
		0x80000000 / 0, saturate to 0x8000 / 0 when packed, and also become
		0 / 1 after the shift. All four word vectors are added into
		MMREG_CLIP.
	*/
	packssdw	%mm4, %mm2
276
	packssdw	%mm5, %mm3
277
	psrlw		$15, %mm2
278
	psrlw		$15, %mm3
279
	cvtps2pi	%xmm2, %mm0
280
	cvtps2pi	%xmm4, %mm1
281
	movhlps		%xmm2, %xmm2
282
	movhlps		%xmm4, %xmm4
283
	cvtps2pi	%xmm2, %mm4
284
	cvtps2pi	%xmm4, %mm5
285
	packssdw	%mm4, %mm0
286
	packssdw	%mm5, %mm1
287
	psrlw		$15, %mm0
288
	psrlw		$15, %mm1
289
	paddw		%mm3, %mm2
290
	paddw		%mm1, %mm0
291
	paddw		%mm2, %mm0
292
	paddw		%mm0, MMREG_CLIP
293
 
294
	/* Advance the output by the 4 stereo samples just written. */
	leal		32(SAMPLES), SAMPLES
295
	decl		%ecx
296
	jnz			Loop_start_1
297
 
298
	/*
		Second loop: 4 more passes with the same structure as the first
		loop, with two differences: B0L/B0R step backward by 64 bytes per
		row, and the four transposed lane vectors are combined with adds
		only (lane0 + lane1 + lane2 + lane3).
	*/
	movl		$4, %ecx
299
 
300
	ALIGN16
301
Loop_start_2:
302
	/*
		Row 0: xmm0-xmm3 = window[0..15] * b0l[0..15],
		       xmm4-xmm7 = window[0..15] * b0r[0..15];
		the adds fold each channel into one 4-lane partial sum
		(left -> TEMP(0), right -> TEMP(4)).
	*/
	movups		(WINDOW), %xmm0
303
	movups		16(WINDOW), %xmm1
304
	movups		32(WINDOW), %xmm2
305
	movups		48(WINDOW), %xmm3
306
	movaps		%xmm0, %xmm4
307
	movaps		%xmm1, %xmm5
308
	movaps		%xmm2, %xmm6
309
	movaps		%xmm3, %xmm7
310
	mulps		0(B0L), %xmm0
311
	mulps		16(B0L), %xmm1
312
	mulps		32(B0L), %xmm2
313
	mulps		48(B0L), %xmm3
314
	mulps		0(B0R), %xmm4
315
	mulps		16(B0R), %xmm5
316
	mulps		32(B0R), %xmm6
317
	mulps		48(B0R), %xmm7
318
	addps		%xmm1, %xmm0
319
	addps		%xmm3, %xmm2
320
	addps		%xmm5, %xmm4
321
	addps		%xmm7, %xmm6
322
	addps		%xmm2, %xmm0
323
	addps		%xmm6, %xmm4
324
	movaps		%xmm0, TEMP(0)
325
	movaps		%xmm4, TEMP(4)
326
 
327
	/* WINDOW moves forward, the B0 pointers move backward in this loop. */
	leal		128(WINDOW), WINDOW
328
	leal		-64(B0L), B0L
329
	leal		-64(B0R), B0R
330
 
331
	/* Row 1: same computation; left -> TEMP(1), right -> TEMP(5). */
	movups		(WINDOW), %xmm0
332
	movups		16(WINDOW), %xmm1
333
	movups		32(WINDOW), %xmm2
334
	movups		48(WINDOW), %xmm3
335
	movaps		%xmm0, %xmm4
336
	movaps		%xmm1, %xmm5
337
	movaps		%xmm2, %xmm6
338
	movaps		%xmm3, %xmm7
339
	mulps		0(B0L), %xmm0
340
	mulps		16(B0L), %xmm1
341
	mulps		32(B0L), %xmm2
342
	mulps		48(B0L), %xmm3
343
	mulps		0(B0R), %xmm4
344
	mulps		16(B0R), %xmm5
345
	mulps		32(B0R), %xmm6
346
	mulps		48(B0R), %xmm7
347
	addps		%xmm1, %xmm0
348
	addps		%xmm3, %xmm2
349
	addps		%xmm5, %xmm4
350
	addps		%xmm7, %xmm6
351
	addps		%xmm2, %xmm0
352
	addps		%xmm6, %xmm4
353
	movaps		%xmm0, TEMP(1)
354
	movaps		%xmm4, TEMP(5)
355
 
356
	leal		128(WINDOW), WINDOW
357
	leal		-64(B0L), B0L
358
	leal		-64(B0R), B0R
359
 
360
	/* Row 2: same computation; left -> TEMP(2), right -> TEMP(6). */
	movups		(WINDOW), %xmm0
361
	movups		16(WINDOW), %xmm1
362
	movups		32(WINDOW), %xmm2
363
	movups		48(WINDOW), %xmm3
364
	movaps		%xmm0, %xmm4
365
	movaps		%xmm1, %xmm5
366
	movaps		%xmm2, %xmm6
367
	movaps		%xmm3, %xmm7
368
	mulps		0(B0L), %xmm0
369
	mulps		16(B0L), %xmm1
370
	mulps		32(B0L), %xmm2
371
	mulps		48(B0L), %xmm3
372
	mulps		0(B0R), %xmm4
373
	mulps		16(B0R), %xmm5
374
	mulps		32(B0R), %xmm6
375
	mulps		48(B0R), %xmm7
376
	addps		%xmm1, %xmm0
377
	addps		%xmm3, %xmm2
378
	addps		%xmm5, %xmm4
379
	addps		%xmm7, %xmm6
380
	addps		%xmm2, %xmm0
381
	addps		%xmm6, %xmm4
382
	movaps		%xmm0, TEMP(2)
383
	movaps		%xmm4, TEMP(6)
384
 
385
	leal		128(WINDOW), WINDOW
386
	leal		-64(B0L), B0L
387
	leal		-64(B0R), B0R
388
 
389
	/* Row 3: same computation; left sum is kept in xmm7, right -> TEMP(7). */
	movups		(WINDOW), %xmm0
390
	movups		16(WINDOW), %xmm1
391
	movups		32(WINDOW), %xmm2
392
	movups		48(WINDOW), %xmm3
393
	movaps		%xmm0, %xmm4
394
	movaps		%xmm1, %xmm5
395
	movaps		%xmm2, %xmm6
396
	movaps		%xmm3, %xmm7
397
	mulps		0(B0L), %xmm0
398
	mulps		16(B0L), %xmm1
399
	mulps		32(B0L), %xmm2
400
	mulps		48(B0L), %xmm3
401
	mulps		0(B0R), %xmm4
402
	mulps		16(B0R), %xmm5
403
	mulps		32(B0R), %xmm6
404
	mulps		48(B0R), %xmm7
405
	addps		%xmm1, %xmm0
406
	addps		%xmm3, %xmm2
407
	addps		%xmm5, %xmm4
408
	addps		%xmm7, %xmm6
409
	addps		%xmm2, %xmm0
410
	addps		%xmm6, %xmm4
411
	movaps		%xmm0, %xmm7
412
	movaps		%xmm4, TEMP(7)
413
 
414
	leal		128(WINDOW), WINDOW
415
	leal		-64(B0L), B0L
416
	leal		-64(B0R), B0R
417
 
418
	/*
		Left channel: 4x4 transpose of the row sums (TEMP(0), TEMP(1),
		TEMP(2), xmm7). After the unpck/movlhps/movhlps sequence
		xmm4, xmm6, xmm0, xmm1 hold lane 0, 1, 2, 3 of all four rows.
		They are summed (lane0 + lane1 + lane2 + lane3), so lane k of the
		result is the left output for row k. Result is parked in xmm2.
	*/
	movaps		TEMP(0), %xmm4
419
	movaps		TEMP(1), %xmm5
420
	movaps		TEMP(2), %xmm6
421
	movaps		%xmm4, %xmm0
422
	movaps		%xmm6, %xmm1
423
	unpcklps	%xmm5, %xmm4
424
	unpcklps	%xmm7, %xmm6
425
	unpckhps	%xmm5, %xmm0
426
	unpckhps	%xmm7, %xmm1
427
	movaps		%xmm4, %xmm2
428
	movaps		%xmm0, %xmm3
429
	movlhps		%xmm6, %xmm4
430
	movhlps		%xmm2, %xmm6
431
	movlhps		%xmm1, %xmm0
432
	movhlps		%xmm3, %xmm1
433
	addps		%xmm6, %xmm4
434
	addps		%xmm1, %xmm0
435
	addps		%xmm4, %xmm0
436
	movaps		%xmm0, %xmm2
437
 
438
	/*
		Right channel: same transpose and sum on TEMP(4)..TEMP(7).
		The left result moves from xmm2 to xmm5 part-way through because
		xmm2 is reused as scratch. Ends with left in xmm5, right in xmm0.
	*/
	movaps		TEMP(4), %xmm4
439
	movaps		TEMP(5), %xmm5
440
	movaps		TEMP(6), %xmm6
441
	movaps		TEMP(7), %xmm7
442
	movaps		%xmm4, %xmm0
443
	movaps		%xmm6, %xmm1
444
	unpcklps	%xmm5, %xmm4
445
	unpcklps	%xmm7, %xmm6
446
	unpckhps	%xmm5, %xmm0
447
	unpckhps	%xmm7, %xmm1
448
	movaps		%xmm2, %xmm5
449
	movaps		%xmm4, %xmm2
450
	movaps		%xmm0, %xmm3
451
	movlhps		%xmm6, %xmm4
452
	movhlps		%xmm2, %xmm6
453
	movlhps		%xmm1, %xmm0
454
	movhlps		%xmm3, %xmm1
455
	addps		%xmm6, %xmm4
456
	addps		%xmm1, %xmm0
457
	addps		%xmm4, %xmm0
458
 
459
	/*
		Clip detection and scaling:
		  xmm1 / xmm3 = all-ones mask where left / right exceeds the upper
		                bound (cmpnleps is also true for NaN input)
		  xmm2 / xmm4 = all-ones mask where left / right is below the
		                lower bound
		  xmm5 / xmm0 = left / right multiplied by scale_s32
	*/
	movaps		%xmm5, %xmm1
460
	movaps		%xmm5, %xmm2
461
	movaps		%xmm0, %xmm3
462
	movaps		%xmm0, %xmm4
463
	mulps		ASM_NAME(scale_s32), %xmm5
464
	mulps		ASM_NAME(scale_s32), %xmm0
465
	cmpnleps	ASM_NAME(maxmin_s32), %xmm1
466
	cmpltps		ASM_NAME(maxmin_s32)+16, %xmm2
467
	cmpnleps	ASM_NAME(maxmin_s32), %xmm3
468
	cmpltps		ASM_NAME(maxmin_s32)+16, %xmm4
469
	/*
		Convert the low two lanes (rows 0 and 1) to int32. An out-of-range
		or NaN input converts to 0x80000000 (integer indefinite, with the
		invalid exception masked); an all-ones compare mask is a NaN bit
		pattern, so the converted upper-bound masks become 0x80000000 / 0
		and psrad $31 widens them to 0xffffffff / 0. XORing that into the
		sample turns a too-large sample's 0x80000000 into 0x7fffffff.
	*/
	cvtps2pi	%xmm5, %mm0
470
	cvtps2pi	%xmm0, %mm1
471
	cvtps2pi	%xmm1, %mm2
472
	cvtps2pi	%xmm3, %mm3
473
	psrad		$31, %mm2
474
	psrad		$31, %mm3
475
	pxor		%mm2, %mm0
476
	pxor		%mm3, %mm1
477
	/* Interleave: mm0 = (L0, R0), mm4 = (L1, R1); store 16 bytes. */
	movq		%mm0, %mm4
478
	punpckldq	%mm1, %mm0
479
	punpckhdq	%mm1, %mm4
480
	movq		%mm0, (SAMPLES)
481
	movq		%mm4, 8(SAMPLES)
482
	/* Bring the high two lanes (rows 2 and 3) down and repeat the conversion. */
	movhlps		%xmm5, %xmm5
483
	movhlps		%xmm0, %xmm0
484
	movhlps		%xmm1, %xmm1
485
	movhlps		%xmm3, %xmm3
486
	cvtps2pi	%xmm5, %mm0
487
	cvtps2pi	%xmm0, %mm1
488
	cvtps2pi	%xmm1, %mm4
489
	cvtps2pi	%xmm3, %mm5
490
	psrad		$31, %mm4
491
	psrad		$31, %mm5
492
	pxor		%mm4, %mm0
493
	pxor		%mm5, %mm1
494
	movq		%mm0, %mm6
495
	punpckldq	%mm1, %mm0
496
	punpckhdq	%mm1, %mm6
497
	movq		%mm0, 16(SAMPLES)
498
	movq		%mm6, 24(SAMPLES)
499
 
500
	/*
		Clip counting. Upper-bound masks (mm2/mm4 left, mm3/mm5 right) are
		0 / -1 dwords: pack to words and shift right by 15 to get 0 / 1.
		Lower-bound masks (xmm2 left, xmm4 right) are converted to
		0x80000000 / 0, saturate to 0x8000 / 0 when packed, and also become
		0 / 1 after the shift. All four word vectors are added into
		MMREG_CLIP.
	*/
	packssdw	%mm4, %mm2
501
	packssdw	%mm5, %mm3
502
	psrlw		$15, %mm2
503
	psrlw		$15, %mm3
504
	cvtps2pi	%xmm2, %mm0
505
	cvtps2pi	%xmm4, %mm1
506
	movhlps		%xmm2, %xmm2
507
	movhlps		%xmm4, %xmm4
508
	cvtps2pi	%xmm2, %mm4
509
	cvtps2pi	%xmm4, %mm5
510
	packssdw	%mm4, %mm0
511
	packssdw	%mm5, %mm1
512
	psrlw		$15, %mm0
513
	psrlw		$15, %mm1
514
	paddw		%mm3, %mm2
515
	paddw		%mm1, %mm0
516
	paddw		%mm2, %mm0
517
	paddw		%mm0, MMREG_CLIP
518
 
519
	/* Advance the output by the 4 stereo samples just written. */
	leal		32(SAMPLES), SAMPLES
520
	decl		%ecx
521
	jnz			Loop_start_2
522
 
523
	/*
		Horizontal sum of the four 16-bit clip counters in MMREG_CLIP:
		  pshufw $0xee -> mm0 = (w2, w3, w2, w3); adding MMREG_CLIP gives
		                  word0 = w0+w2, word1 = w1+w3
		  pshufw $0x55 -> mm1 = word1 broadcast; adding gives
		                  word0 = w0+w1+w2+w3
		The low word is the return value (number of clipped samples).
	*/
	pshufw		$0xee, MMREG_CLIP, %mm0
524
	paddw		MMREG_CLIP, %mm0
525
	pshufw		$0x55, %mm0, %mm1
526
	paddw		%mm1, %mm0
527
	movd		%mm0, %eax
528
	/* Keep only word 0; the upper word of the movd result holds a partial sum. */
	andl		$0xffff, %eax
529
 
530
	/* Restore the registers pushed on entry, in reverse order. */
	popl		%edi
531
	popl		%esi
532
	popl		%ebx
533
	/* Drop the aligned scratch frame by restoring the caller's %esp. */
	movl		%ebp, %esp
534
	popl		%ebp
535
 
536
	/* Leave MMX state so the caller can use the x87 FPU again. */
	emms
537
 
538
	ret
539
 
540
NONEXEC_STACK