/*
	dct64_sse: MMX/SSE optimized dct64

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

#define ARG(n) (8+n*4)(%ebp)
#define TEMP(n) (4+n*16)(%esp)
#define TEMP_BYTE(n) (4+n)(%esp)

/*
	void dct64_sse(short *out0, short *out1, real *samples);
*/
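/*
	Note: the code below reads 32 float samples from the third argument and
	stores the results as 16-bit words, spaced 32 bytes apart, into the out0
	and out1 buffers; the 128 bytes reserved on the stack in the prologue are
	scratch space addressed via TEMP()/TEMP_BYTE().
*/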

#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
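/* sign pattern +,-,+,- : xorps with this constant flips the sign of lanes 1 and 3 */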
pnpn:
	.long	0
	.long	-2147483648
	.long	0
	.long	-2147483648
	ALIGN16
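/* lane mask: keeps the three low elements of a vector and clears the top one */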
mask:
	.long	-1
	.long	-1
	.long	-1
	.long	0

	.text
	ALIGN16
.globl ASM_NAME(dct64_sse)
ASM_NAME(dct64_sse):
	pushl		%ebp
	movl		%esp, %ebp

	andl		$-16, %esp /* align the stack at 16 bytes */
	subl		$128, %esp /* reserve space for temporary storage */
	pushl		%ebx

	movl		ARG(0), %ecx
	movl		ARG(1), %ebx
	movl		ARG(2), %eax

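	/*
		first butterfly pass: samples[0..15] are combined with the mirrored
		samples[31..16]; the sums are kept, the differences are scaled by
		the leading entries of costab_mmxsse
	*/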
	MOVUAPS 	(%eax), %xmm7
	MOVUAPS 	16(%eax), %xmm6
	MOVUAPS 	112(%eax), %xmm0
	MOVUAPS 	96(%eax), %xmm1
	shufps 		$0x1b, %xmm0, %xmm0
	shufps 		$0x1b, %xmm1, %xmm1
	movaps 		%xmm7, %xmm4
	movaps		%xmm6, %xmm5
	addps 		%xmm0, %xmm4
	addps 		%xmm1, %xmm5
	subps 		%xmm0, %xmm7
	subps 		%xmm1, %xmm6
	movaps		%xmm4, TEMP(0)
	movaps		%xmm5, TEMP(1)

	MOVUAPS 	32(%eax), %xmm2
	MOVUAPS 	48(%eax), %xmm3
	MOVUAPS 	80(%eax), %xmm0
	MOVUAPS 	64(%eax), %xmm1
	shufps 		$0x1b, %xmm0, %xmm0
	shufps 		$0x1b, %xmm1, %xmm1
	movaps 		%xmm2, %xmm5
	movaps		%xmm3, %xmm4
	addps 		%xmm0, %xmm2
	addps 		%xmm1, %xmm3
	subps 		%xmm0, %xmm5
	subps 		%xmm1, %xmm4

	mulps		ASM_NAME(costab_mmxsse), %xmm7
	mulps		ASM_NAME(costab_mmxsse)+16, %xmm6
	mulps		ASM_NAME(costab_mmxsse)+32, %xmm5
	mulps		ASM_NAME(costab_mmxsse)+48, %xmm4

	shufps		$0x1b, %xmm2, %xmm2
	shufps		$0x1b, %xmm3, %xmm3
	shufps		$0x1b, %xmm4, %xmm4
	shufps		$0x1b, %xmm5, %xmm5
	movaps		TEMP(0), %xmm0
	movaps		TEMP(1), %xmm1
	subps		%xmm3, %xmm0
	subps		%xmm2, %xmm1
	addps		TEMP(0), %xmm3
	addps		TEMP(1), %xmm2
	movaps		%xmm3, TEMP(0)
	movaps		%xmm2, TEMP(1)
	movaps		%xmm6, %xmm2
	movaps		%xmm7, %xmm3
	subps		%xmm5, %xmm6
	subps		%xmm4, %xmm7
	addps		%xmm3, %xmm4
	addps		%xmm2, %xmm5
	mulps		ASM_NAME(costab_mmxsse)+64, %xmm0
	mulps		ASM_NAME(costab_mmxsse)+80, %xmm1
	mulps		ASM_NAME(costab_mmxsse)+80, %xmm6
	mulps		ASM_NAME(costab_mmxsse)+64, %xmm7

	movaps		TEMP(0), %xmm2
	movaps		TEMP(1), %xmm3
	shufps		$0x1b, %xmm3, %xmm3
	shufps		$0x1b, %xmm5, %xmm5
	shufps		$0x1b, %xmm1, %xmm1
	shufps		$0x1b, %xmm6, %xmm6
	movaps		%xmm0, TEMP(1)
	subps		%xmm3, %xmm2
	subps		%xmm1, %xmm0
	addps		TEMP(0), %xmm3
	addps		TEMP(1), %xmm1
	movaps		%xmm3, TEMP(0)
	movaps		%xmm1, TEMP(2)
	movaps		%xmm5, %xmm1
	movaps		%xmm4, %xmm5
	movaps		%xmm7, %xmm3
	subps		%xmm1, %xmm5
	subps		%xmm6, %xmm7
	addps		%xmm1, %xmm4
	addps		%xmm3, %xmm6
	mulps		ASM_NAME(costab_mmxsse)+96, %xmm2
	mulps		ASM_NAME(costab_mmxsse)+96, %xmm0
	mulps		ASM_NAME(costab_mmxsse)+96, %xmm5
	mulps		ASM_NAME(costab_mmxsse)+96, %xmm7
	movaps		%xmm2, TEMP(1)
	movaps		%xmm0, TEMP(3)

	movaps		%xmm4, %xmm2
	movaps		%xmm5, %xmm3
	shufps		$0x44, %xmm6, %xmm2
	shufps		$0xbb, %xmm7, %xmm5
	shufps		$0xbb, %xmm6, %xmm4
	shufps		$0x44, %xmm7, %xmm3
	movaps		%xmm2, %xmm6
	movaps		%xmm3, %xmm7
	subps		%xmm4, %xmm2
	subps		%xmm5, %xmm3
	addps		%xmm6, %xmm4
	addps		%xmm7, %xmm5
	movaps		ASM_NAME(costab_mmxsse)+112, %xmm0
	movlhps		%xmm0, %xmm0
	mulps		%xmm0, %xmm2
	mulps		%xmm0, %xmm3
	movaps		%xmm0, TEMP(4)
	movaps		%xmm4, %xmm6
	movaps		%xmm5, %xmm7
	shufps		$0x14, %xmm2, %xmm4
	shufps		$0xbe, %xmm2, %xmm6
	shufps		$0x14, %xmm3, %xmm5
	shufps		$0xbe, %xmm3, %xmm7
	movaps		%xmm5, TEMP(5)
	movaps		%xmm7, TEMP(7)

	movaps		TEMP(0), %xmm0
	movaps		TEMP(1), %xmm1
	movaps		%xmm0, %xmm2
	movaps		%xmm1, %xmm3
	shufps		$0x44, TEMP(2), %xmm2
	shufps		$0xbb, TEMP(3), %xmm1
	shufps		$0xbb, TEMP(2), %xmm0
	shufps		$0x44, TEMP(3), %xmm3
	movaps		%xmm2, %xmm5
	movaps		%xmm3, %xmm7
	subps		%xmm0, %xmm2
	subps		%xmm1, %xmm3
	addps		%xmm5, %xmm0
	addps		%xmm7, %xmm1
	mulps		TEMP(4), %xmm2
	mulps		TEMP(4), %xmm3
	movaps		%xmm0, %xmm5
	movaps		%xmm1, %xmm7
	shufps		$0x14, %xmm2, %xmm0
	shufps		$0xbe, %xmm2, %xmm5
	shufps		$0x14, %xmm3, %xmm1
	shufps		$0xbe, %xmm3, %xmm7

	movaps		%xmm0, TEMP(0)
	movaps		%xmm1, TEMP(1)
	movaps		%xmm5, TEMP(2)
	movaps		%xmm7, TEMP(3)

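	/*
		broadcast the last cosine coefficient and give it the alternating
		+,-,+,- sign via pnpn; this factor scales the differences in the
		remaining butterfly passes
	*/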
	movss		ASM_NAME(costab_mmxsse)+120, %xmm5
	shufps		$0x00, %xmm5, %xmm5
	xorps		pnpn, %xmm5

	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	unpcklps	TEMP(5), %xmm4
	unpckhps	TEMP(5), %xmm0
	unpcklps	TEMP(7), %xmm6
	unpckhps	TEMP(7), %xmm1
	movaps		%xmm4, %xmm2
	movaps		%xmm6, %xmm3
	unpcklps	%xmm0, %xmm4
	unpckhps	%xmm0, %xmm2
	unpcklps	%xmm1, %xmm6
	unpckhps	%xmm1, %xmm3
	movaps		%xmm4, %xmm0
	movaps		%xmm6, %xmm1
	subps		%xmm2, %xmm0
	subps		%xmm3, %xmm1
	addps		%xmm2, %xmm4
	addps		%xmm3, %xmm6
	mulps		%xmm5, %xmm0
	mulps		%xmm5, %xmm1
	movaps		%xmm5, TEMP(5)
	movaps		%xmm4, %xmm5
	movaps		%xmm6, %xmm7
	unpcklps	%xmm0, %xmm4
	unpckhps	%xmm0, %xmm5
	unpcklps	%xmm1, %xmm6
	unpckhps	%xmm1, %xmm7

	movaps		TEMP(0), %xmm0
	movaps		TEMP(2), %xmm2
	movaps		%xmm4, TEMP(4)
	movaps		%xmm6, TEMP(6)

	movaps		%xmm0, %xmm4
	movaps		%xmm2, %xmm6
	unpcklps	TEMP(1), %xmm0
	unpckhps	TEMP(1), %xmm4
	unpcklps	TEMP(3), %xmm2
	unpckhps	TEMP(3), %xmm6
	movaps		%xmm0, %xmm1
	movaps		%xmm2, %xmm3
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	unpcklps	%xmm6, %xmm2
	unpckhps	%xmm6, %xmm3
	movaps		%xmm0, %xmm4
	movaps		%xmm2, %xmm6
	subps		%xmm1, %xmm4
	subps		%xmm3, %xmm6
	addps		%xmm1, %xmm0
	addps		%xmm3, %xmm2
	mulps		TEMP(5), %xmm4
	mulps		TEMP(5), %xmm6
	movaps		%xmm0, %xmm1
	movaps		%xmm2, %xmm3
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	unpcklps	%xmm6, %xmm2
	unpckhps	%xmm6, %xmm3

	movaps		%xmm0, TEMP(0)
	movaps		%xmm1, TEMP(1)
	movaps		%xmm2, TEMP(2)
	movaps		%xmm3, TEMP(3)
	movaps		%xmm5, TEMP(5)
	movaps		%xmm7, TEMP(7)

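	/*
		in-place pairwise additions in the scratch buffer: selected elements
		get their following neighbour added before the final recombination
	*/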
	movss		TEMP_BYTE(12), %xmm0
	movss		TEMP_BYTE(28), %xmm1
	movss		TEMP_BYTE(44), %xmm2
	movss		TEMP_BYTE(60), %xmm3
	addss		TEMP_BYTE(8), %xmm0
	addss		TEMP_BYTE(24), %xmm1
	addss		TEMP_BYTE(40), %xmm2
	addss		TEMP_BYTE(56), %xmm3
	movss		%xmm0, TEMP_BYTE(8)
	movss		%xmm1, TEMP_BYTE(24)
	movss		%xmm2, TEMP_BYTE(40)
	movss		%xmm3, TEMP_BYTE(56)
	movss		TEMP_BYTE(76), %xmm0
	movss		TEMP_BYTE(92), %xmm1
	movss		TEMP_BYTE(108), %xmm2
	movss		TEMP_BYTE(124), %xmm3
	addss		TEMP_BYTE(72), %xmm0
	addss		TEMP_BYTE(88), %xmm1
	addss		TEMP_BYTE(104), %xmm2
	addss		TEMP_BYTE(120), %xmm3
	movss		%xmm0, TEMP_BYTE(72)
	movss		%xmm1, TEMP_BYTE(88)
	movss		%xmm2, TEMP_BYTE(104)
	movss		%xmm3, TEMP_BYTE(120)

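	/*
		add to each vector a shuffled copy of itself with the top lane cleared
		by mask (part of the final output recombination)
	*/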
	movaps		TEMP_BYTE(16), %xmm1
	movaps		TEMP_BYTE(48), %xmm3
	movaps		TEMP_BYTE(80), %xmm5
	movaps		TEMP_BYTE(112), %xmm7
	movaps		%xmm1, %xmm0
	movaps		%xmm3, %xmm2
	movaps		%xmm5, %xmm4
	movaps		%xmm7, %xmm6
	shufps		$0x1e, %xmm0, %xmm0
	shufps		$0x1e, %xmm2, %xmm2
	shufps		$0x1e, %xmm4, %xmm4
	shufps		$0x1e, %xmm6, %xmm6
	andps		mask, %xmm0
	andps		mask, %xmm2
	andps		mask, %xmm4
	andps		mask, %xmm6
	addps		%xmm0, %xmm1
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm5
	addps		%xmm6, %xmm7

	movaps		TEMP_BYTE(32), %xmm2
	movaps		TEMP_BYTE(96), %xmm6
	movaps		%xmm2, %xmm0
	movaps		%xmm6, %xmm4
	shufps		$0x1e, %xmm0, %xmm0
	shufps		$0x1e, %xmm4, %xmm4
	andps		mask, %xmm0
	andps		mask, %xmm4
	addps		%xmm3, %xmm2
	addps		%xmm0, %xmm3
	addps		%xmm7, %xmm6
	addps		%xmm4, %xmm7

	movaps		TEMP_BYTE(0), %xmm0
	movaps		TEMP_BYTE(64), %xmm4

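	/*
		convert the floats to 32-bit integers (cvtps2pi handles two lanes at a
		time) and pack them into 16-bit words with signed saturation
	*/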
	cvtps2pi	%xmm0, %mm0
	cvtps2pi	%xmm1, %mm1
	movhlps		%xmm0, %xmm0
	movhlps		%xmm1, %xmm1
	cvtps2pi	%xmm0, %mm2
	cvtps2pi	%xmm1, %mm3
	packssdw	%mm2, %mm0
	packssdw	%mm3, %mm1

	cvtps2pi	%xmm2, %mm2
	cvtps2pi	%xmm3, %mm3
	movhlps		%xmm2, %xmm2
	movhlps		%xmm3, %xmm3
	cvtps2pi	%xmm2, %mm4
	cvtps2pi	%xmm3, %mm5
	packssdw	%mm4, %mm2
	packssdw	%mm5, %mm3

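	/*
		store the packed 16-bit samples into out0 (%ecx) and out1 (%ebx);
		the slots written within each buffer are 32 bytes apart
	*/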
	movd		%mm0, %eax
	movd		%mm1, %edx
	movw		%ax, 512(%ecx)
	movw		%dx, 384(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, (%ecx)
	movw		%ax, (%ebx)
	movw		%dx, 128(%ebx)

	movd		%mm2, %eax
	movd		%mm3, %edx
	movw		%ax, 448(%ecx)
	movw		%dx, 320(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 64(%ebx)
	movw		%dx, 192(%ebx)

	psrlq		$32, %mm0
	psrlq		$32, %mm1
	movd		%mm0, %eax
	movd		%mm1, %edx
	movw		%ax, 256(%ecx)
	movw		%dx, 128(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 256(%ebx)
	movw		%dx, 384(%ebx)

	psrlq		$32, %mm2
	psrlq		$32, %mm3
	movd		%mm2, %eax
	movd		%mm3, %edx
	movw		%ax, 192(%ecx)
	movw		%dx, 64(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 320(%ebx)
	movw		%dx, 448(%ebx)

	movaps		%xmm4, %xmm0
	shufps		$0x1e, %xmm0, %xmm0
	movaps		%xmm5, %xmm1
	andps		mask, %xmm0

	addps		%xmm6, %xmm4
	addps		%xmm7, %xmm5
	addps		%xmm1, %xmm6
	addps		%xmm0, %xmm7

	cvtps2pi	%xmm4, %mm0
	cvtps2pi	%xmm5, %mm1
	movhlps		%xmm4, %xmm4
	movhlps		%xmm5, %xmm5
	cvtps2pi	%xmm4, %mm2
	cvtps2pi	%xmm5, %mm3
	packssdw	%mm2, %mm0
	packssdw	%mm3, %mm1

	cvtps2pi	%xmm6, %mm2
	cvtps2pi	%xmm7, %mm3
	movhlps		%xmm6, %xmm6
	movhlps		%xmm7, %xmm7
	cvtps2pi	%xmm6, %mm4
	cvtps2pi	%xmm7, %mm5
	packssdw	%mm4, %mm2
	packssdw	%mm5, %mm3

	movd		%mm0, %eax
	movd		%mm2, %edx
	movw		%ax, 480(%ecx)
	movw		%dx, 416(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 32(%ebx)
	movw		%dx, 96(%ebx)

	psrlq		$32, %mm0
	psrlq		$32, %mm2
	movd		%mm0, %eax
	movd		%mm2, %edx
	movw		%ax, 224(%ecx)
	movw		%dx, 160(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 288(%ebx)
	movw		%dx, 352(%ebx)

	movd		%mm1, %eax
	movd		%mm3, %edx
	movw		%ax, 352(%ecx)
	movw		%dx, 288(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 160(%ebx)
	movw		%dx, 224(%ebx)

	psrlq		$32, %mm1
	psrlq		$32, %mm3
	movd		%mm1, %eax
	movd		%mm3, %edx
	movw		%ax, 96(%ecx)
	movw		%dx, 32(%ecx)
	shrl		$16, %eax
	shrl		$16, %edx
	movw		%ax, 416(%ebx)
	movw		%dx, 480(%ebx)

	popl		%ebx
	movl		%ebp, %esp
	popl		%ebp
	ret

NONEXEC_STACK