Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
3960 Serge 1
/*
	synth_sse_s32: SSE optimized synth (s32 output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"
10
 
11
/*
	Register aliases used by synth_1to1_s32_sse_asm.
	The first three hold the pointer arguments of the routine; %ebx and %esi
	are pushed on entry and popped on exit by the routine itself.
*/

/* real *window; */
#define WINDOW %ebx
/* real *b0; */
#define B0 %edx
/* int32_t *samples; (declared as int32_t in the prototype, not real) */
#define SAMPLES %esi

/* MMX register in which the routine accumulates its 16-bit clip counters */
#define MMREG_CLIP %mm7

/*
	int synth_1to1_s32_sse_asm(real *window, real *b0, int32_t *samples, int bo1);
	return value: number of clipped samples
*/

/*
	Constant data for synth_1to1_s32_sse_asm.
	Placed in .rodata, except when __APPLE__ is defined, where .data is used.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
/*
	scale_s32: four identical 32-bit words, each the IEEE-754 single precision
	bit pattern 0x47800000 (65536.0). The synth routine multiplies all four
	lanes of its result by this vector (mulps) before converting to integers.
	ALIGN32 is a macro defined outside this file (presumably in mangle.h);
	the table is used as an SSE memory operand, which needs 16-byte alignment.
*/
	ALIGN32
ASM_NAME(scale_s32):
	.long   1199570944 /* 65536.0 */
	.long   1199570944
	.long   1199570944
	.long   1199570944
	ALIGN16
37
ASM_NAME(maxmin_s32):
38
	.long   1191182335 /* 32767.999 */
39
	.long   1191182335
40
	.long   1191182335
41
	.long   1191182335
42
	.long   -956301312 /* -32768.0 */
43
	.long   -956301312
44
	.long   -956301312
45
	.long   -956301312
46
	.text
	ALIGN16
/*
	int synth_1to1_s32_sse_asm(real *window, real *b0, int32_t *samples, int bo1);

	Writes 32 output values: two loops of 4 iterations, 4 values per iteration.
	Each value is a sum of 16 products window[i] * b0[i], multiplied by
	scale_s32 and converted to a 32-bit integer. The four values of one
	iteration are stored 8 bytes apart (every other 32-bit slot) and SAMPLES
	advances by 32 bytes per iteration.

	Arguments, read from the stack relative to %ebp:
	   8(%ebp)  window  -> WINDOW  (loaded with movups, no alignment needed)
	  12(%ebp)  b0      -> B0      (used as mulps memory operand: must be
	                                16-byte aligned)
	  16(%ebp)  samples -> SAMPLES
	  20(%ebp)  bo1     -> %eax    (times 4, subtracted from window + 64 bytes)

	Return value (%eax): low 16 bits of the number of values that failed the
	range checks against maxmin_s32.

	Registers: %ebp, %ebx and %esi are saved and restored. %eax, %ecx, %edx,
	%xmm0-%xmm7, %mm0-%mm3 and MMREG_CLIP are overwritten. emms is executed
	before returning.
*/
.globl ASM_NAME(synth_1to1_s32_sse_asm)
ASM_NAME(synth_1to1_s32_sse_asm):
	pushl		%ebp
	movl		%esp, %ebp
	pushl		%ebx
	pushl		%esi

	/* clip counters start at zero */
	pxor		MMREG_CLIP, MMREG_CLIP

	movl		8(%ebp), WINDOW
	movl		12(%ebp), B0
	movl		16(%ebp), SAMPLES
	movl		20(%ebp), %eax
	/* bo1 counts 4-byte elements: turn it into a byte offset */
	shll		$2, %eax

	/* WINDOW = window + 64 bytes - 4*bo1 bytes */
	leal		64(WINDOW), WINDOW
	subl		%eax, WINDOW

	/* first loop: 4 iterations */
	movl		$4, %ecx

	ALIGN16
Loop_start_1:
	/*
		Rows 0 and 1. A row is 16 window values times 16 b0 values.
		Row 0: WINDOW+0..63   times B0+0..63
		Row 1: WINDOW+128..191 times B0+64..127
		Each row is reduced to one register of four partial sums.
	*/
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm4
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	mulps		0(B0), %xmm0
	mulps		16(B0), %xmm1
	mulps		32(B0), %xmm2
	mulps		48(B0), %xmm3
	mulps		64(B0), %xmm4
	mulps		80(B0), %xmm5
	mulps		96(B0), %xmm6
	mulps		112(B0), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	/* park the results: xmm4 = row 0, xmm5 = row 1 */
	movaps		%xmm4, %xmm5
	movaps		%xmm0, %xmm4

	leal		256(WINDOW), WINDOW
	leal		128(B0), B0

	/* Rows 2 and 3, same pattern; results end up in xmm6 (row 2) and xmm7 (row 3) */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm6
	movups		144(WINDOW), %xmm7
	mulps		(B0), %xmm0
	mulps		16(B0), %xmm1
	mulps		32(B0), %xmm2
	mulps		48(B0), %xmm3
	mulps		64(B0), %xmm6
	mulps		80(B0), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm7, %xmm6
	/* xmm1 and xmm3 are free again: reuse them for the rest of row 3 */
	movups		160(WINDOW), %xmm1
	movups		176(WINDOW), %xmm3
	mulps		96(B0), %xmm1
	mulps		112(B0), %xmm3
	addps		%xmm2, %xmm0
	addps		%xmm3, %xmm1
	addps		%xmm1, %xmm6
	movaps		%xmm6, %xmm7
	movaps		%xmm0, %xmm6

	leal		256(WINDOW), WINDOW
	leal		128(B0), B0

	/*
		4x4 transpose of the partial sums in xmm4..xmm7 (rows 0..3).
		After the shuffles:
		  xmm4 = lane 0 of rows 0..3      xmm6 = lane 1 of rows 0..3
		  xmm0 = lane 2 of rows 0..3      xmm1 = lane 3 of rows 0..3
		First loop combines them with alternating signs:
		  result = lane0 - lane1 + lane2 - lane3
	*/
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps		%xmm4, %xmm2
	movaps		%xmm0, %xmm3
	movlhps		%xmm6, %xmm4
	movhlps		%xmm2, %xmm6
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	subps		%xmm6, %xmm4
	subps		%xmm1, %xmm0
	addps		%xmm4, %xmm0

	/*
		xmm0 = results * scale_s32
		xmm1 = all-ones lanes where result is not <= upper limit (maxmin_s32+0)
		xmm2 = all-ones lanes where result is <  lower limit (maxmin_s32+16)
		Both comparisons use the unscaled results.
	*/
	movaps		%xmm0, %xmm1
	movaps		%xmm0, %xmm2
	mulps		ASM_NAME(scale_s32), %xmm0
	cmpnleps	ASM_NAME(maxmin_s32), %xmm1
	cmpltps		ASM_NAME(maxmin_s32)+16, %xmm2
	/* cvtps2pi converts the two low lanes; movhlps moves the two high lanes down */
	cvtps2pi	%xmm0, %mm0
	movhlps		%xmm0, %xmm0
	cvtps2pi	%xmm0, %mm1
	/*
		Bring the upper-limit mask into MMX registers. An all-ones lane is not
		a convertible float, so cvtps2pi yields 0x80000000 for it (0 for a
		clear lane); psrad $31 expands the sign bit to a full 32-bit mask.
	*/
	cvtps2pi	%xmm1, %mm2
	movhlps		%xmm1, %xmm1
	cvtps2pi	%xmm1, %mm3
	psrad		$31, %mm2
	psrad		$31, %mm3
	/*
		Invert every bit of the results flagged as above the upper limit.
		cvtps2pi returns 0x80000000 for a value that does not fit in 32 bits,
		so the stored value becomes 0x7fffffff.
	*/
	pxor		%mm2, %mm0
	pxor		%mm3, %mm1
	/* store the four values into every other 32-bit slot */
	movd		%mm0, (SAMPLES)
	psrlq		$32, %mm0
	movd		%mm0, 8(SAMPLES)
	movd		%mm1, 16(SAMPLES)
	psrlq		$32, %mm1
	movd		%mm1, 24(SAMPLES)

	/*
		Count clipped values. mm2/mm3 hold the upper-limit masks, mm0/mm1
		receive the converted lower-limit masks. packssdw narrows each flag to
		a 16-bit word with its top bit set, psrlw $15 turns that into 0 or 1,
		and the per-word counts are added to MMREG_CLIP.
	*/
	cvtps2pi	%xmm2, %mm0
	movhlps		%xmm2, %xmm2
	cvtps2pi	%xmm2, %mm1
	packssdw	%mm3, %mm2
	packssdw	%mm1, %mm0
	psrlw		$15, %mm2
	psrlw		$15, %mm0
	paddw		%mm2, %mm0
	paddw		%mm0, MMREG_CLIP

	leal		32(SAMPLES), SAMPLES
	decl		%ecx
	jnz		Loop_start_1

	/* second loop: 4 iterations */
	movl		$4, %ecx

	ALIGN16
Loop_start_2:
	/*
		Same structure as the first loop with two differences:
		  - B0 moves backwards: the second row of each pair is read from
		    B0-64..B0-1 and B0 is decreased by 128 bytes after each pair;
		  - the four lanes are combined with additions only.
		WINDOW keeps moving forward by 256 bytes per pair of rows.
	*/
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm4
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	mulps		0(B0), %xmm0
	mulps		16(B0), %xmm1
	mulps		32(B0), %xmm2
	mulps		48(B0), %xmm3
	mulps		-64(B0), %xmm4
	mulps		-48(B0), %xmm5
	mulps		-32(B0), %xmm6
	mulps		-16(B0), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm5, %xmm4
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm0
	addps		%xmm6, %xmm4
	/* xmm4 = row 0, xmm5 = row 1 */
	movaps		%xmm4, %xmm5
	movaps		%xmm0, %xmm4

	leal		256(WINDOW), WINDOW
	leal		-128(B0), B0

	/* rows 2 and 3 -> xmm6, xmm7 */
	movups		(WINDOW), %xmm0
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm6
	movups		144(WINDOW), %xmm7
	mulps		(B0), %xmm0
	mulps		16(B0), %xmm1
	mulps		32(B0), %xmm2
	mulps		48(B0), %xmm3
	mulps		-64(B0), %xmm6
	mulps		-48(B0), %xmm7
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	addps		%xmm7, %xmm6
	movups		160(WINDOW), %xmm1
	movups		176(WINDOW), %xmm3
	mulps		-32(B0), %xmm1
	mulps		-16(B0), %xmm3
	addps		%xmm2, %xmm0
	addps		%xmm3, %xmm1
	addps		%xmm1, %xmm6
	movaps		%xmm6, %xmm7
	movaps		%xmm0, %xmm6

	leal		256(WINDOW), WINDOW
	leal		-128(B0), B0

	/* 4x4 transpose as in the first loop; result = lane0 + lane1 + lane2 + lane3 */
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	unpcklps	%xmm5, %xmm4
	unpcklps	%xmm7, %xmm6
	unpckhps	%xmm5, %xmm0
	unpckhps	%xmm7, %xmm1
	movaps		%xmm4, %xmm2
	movaps		%xmm0, %xmm3
	movlhps		%xmm6, %xmm4
	movhlps		%xmm2, %xmm6
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	addps		%xmm6, %xmm4
	addps		%xmm1, %xmm0
	addps		%xmm4, %xmm0

	/* scale, range-check, convert and store: identical to the first loop */
	movaps		%xmm0, %xmm1
	movaps		%xmm0, %xmm2
	mulps		ASM_NAME(scale_s32), %xmm0
	cmpnleps	ASM_NAME(maxmin_s32), %xmm1
	cmpltps		ASM_NAME(maxmin_s32)+16, %xmm2
	cvtps2pi	%xmm0, %mm0
	movhlps		%xmm0, %xmm0
	cvtps2pi	%xmm0, %mm1
	cvtps2pi	%xmm1, %mm2
	movhlps		%xmm1, %xmm1
	cvtps2pi	%xmm1, %mm3
	psrad		$31, %mm2
	psrad		$31, %mm3
	pxor		%mm2, %mm0
	pxor		%mm3, %mm1
	movd		%mm0, (SAMPLES)
	psrlq		$32, %mm0
	movd		%mm0, 8(SAMPLES)
	movd		%mm1, 16(SAMPLES)
	psrlq		$32, %mm1
	movd		%mm1, 24(SAMPLES)

	/* clip counting: identical to the first loop */
	cvtps2pi	%xmm2, %mm0
	movhlps		%xmm2, %xmm2
	cvtps2pi	%xmm2, %mm1
	packssdw	%mm3, %mm2
	packssdw	%mm1, %mm0
	psrlw		$15, %mm2
	psrlw		$15, %mm0
	paddw		%mm2, %mm0
	paddw		%mm0, MMREG_CLIP

	leal		32(SAMPLES), SAMPLES
	decl		%ecx
	jnz		Loop_start_2

	/*
		Horizontal sum of the four 16-bit counters in MMREG_CLIP:
		  $0xee copies words 2,3 over words 0,1  -> word0 = w0+w2, word1 = w1+w3
		  $0x55 broadcasts word 1                -> word0 = w0+w1+w2+w3
		Only word 0 is meaningful, hence the 16-bit mask on %eax.
	*/
	pshufw		$0xee, MMREG_CLIP, %mm0
	paddw		MMREG_CLIP, %mm0
	pshufw		$0x55, %mm0, %mm1
	paddw		%mm1, %mm0
	movd		%mm0, %eax
	andl		$0xffff, %eax

	popl		%esi
	popl		%ebx
	movl		%ebp, %esp
	popl		%ebp

	/* leave MMX state so that x87 code can run after the return */
	emms

	ret

/*
	NONEXEC_STACK is a macro defined outside this file (presumably in mangle.h).
	Judging by its name it marks the object as not needing an executable
	stack - confirm against its definition.
*/
NONEXEC_STACK