/* |
dct64_sse: MMX/SSE optimized dct64 |
|
copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1 |
see COPYING and AUTHORS files in distribution or http://mpg123.org |
initially written by Taihei Monma |
*/ |
|
#include "mangle.h" |
|
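/*
 ARG(n): n-th 32-bit argument on the caller's stack (cdecl).
 TEMP(n) / TEMP_BYTE(n): 16-byte slot n / byte offset n in the 128-byte
 aligned scratch area; the +4 skips the saved %ebx that sits at (%esp).
*/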
#define ARG(n) (8+n*4)(%ebp) |
#define TEMP(n) (4+n*16)(%esp) |
#define TEMP_BYTE(n) (4+n)(%esp) |
|
/* |
void dct64_sse(short *out0, short *out1, real *samples); |
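out0, out1: 16-bit output buffers; 17 values go to out0 and 16 to out1,
            written 32 bytes (16 shorts) apart and saturated to 16 bits.
samples:    32 single-precision input values.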
*/ |
|
#ifndef __APPLE__ |
.section .rodata |
#else |
.data |
#endif |
ALIGN16 |
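/* XOR mask: 0x80000000 in lanes 1 and 3 flips the sign of every other float */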
pnpn: |
.long 0 |
.long -2147483648 |
.long 0 |
.long -2147483648 |
ALIGN16 |
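/* AND mask: keeps lanes 0-2, clears lane 3 */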
mask: |
.long -1 |
.long -1 |
.long -1 |
.long 0 |
|
.text |
ALIGN16 |
.globl ASM_NAME(dct64_sse) |
ASM_NAME(dct64_sse): |
pushl %ebp |
movl %esp, %ebp |
|
andl $-16, %esp /* align the stack to 16 bytes */
subl $128, %esp /* reserve 128 bytes of temporary storage */
pushl %ebx /* callee-saved; will hold out1 */
|
movl ARG(0), %ecx /* out0 */
movl ARG(1), %ebx /* out1 */
movl ARG(2), %eax /* samples */
|
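/*
 Pass 1: butterflies over all 32 samples. Sums samples[i] + samples[31-i]
 are kept in order; differences samples[i] - samples[31-i] are scaled by
 the 16-entry cosine table at costab_mmxsse.
*/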
MOVUAPS (%eax), %xmm7 |
MOVUAPS 16(%eax), %xmm6 |
MOVUAPS 112(%eax), %xmm0 |
MOVUAPS 96(%eax), %xmm1 |
shufps $0x1b, %xmm0, %xmm0 |
shufps $0x1b, %xmm1, %xmm1 |
movaps %xmm7, %xmm4 |
movaps %xmm6, %xmm5 |
addps %xmm0, %xmm4 |
addps %xmm1, %xmm5 |
subps %xmm0, %xmm7 |
subps %xmm1, %xmm6 |
movaps %xmm4, TEMP(0) |
movaps %xmm5, TEMP(1) |
|
MOVUAPS 32(%eax), %xmm2 |
MOVUAPS 48(%eax), %xmm3 |
MOVUAPS 80(%eax), %xmm0 |
MOVUAPS 64(%eax), %xmm1 |
shufps $0x1b, %xmm0, %xmm0 |
shufps $0x1b, %xmm1, %xmm1 |
movaps %xmm2, %xmm5 |
movaps %xmm3, %xmm4 |
addps %xmm0, %xmm2 |
addps %xmm1, %xmm3 |
subps %xmm0, %xmm5 |
subps %xmm1, %xmm4 |
|
mulps ASM_NAME(costab_mmxsse), %xmm7 |
mulps ASM_NAME(costab_mmxsse)+16, %xmm6 |
mulps ASM_NAME(costab_mmxsse)+32, %xmm5 |
mulps ASM_NAME(costab_mmxsse)+48, %xmm4 |
|
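/*
 Pass 2: the same butterfly applied to each 16-value half; differences
 are scaled by the 8-entry table at costab_mmxsse+64.
*/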
shufps $0x1b, %xmm2, %xmm2 |
shufps $0x1b, %xmm3, %xmm3 |
shufps $0x1b, %xmm4, %xmm4 |
shufps $0x1b, %xmm5, %xmm5 |
movaps TEMP(0), %xmm0 |
movaps TEMP(1), %xmm1 |
subps %xmm3, %xmm0 |
subps %xmm2, %xmm1 |
addps TEMP(0), %xmm3 |
addps TEMP(1), %xmm2 |
movaps %xmm3, TEMP(0) |
movaps %xmm2, TEMP(1) |
movaps %xmm6, %xmm2 |
movaps %xmm7, %xmm3 |
subps %xmm5, %xmm6 |
subps %xmm4, %xmm7 |
addps %xmm3, %xmm4 |
addps %xmm2, %xmm5 |
mulps ASM_NAME(costab_mmxsse)+64, %xmm0 |
mulps ASM_NAME(costab_mmxsse)+80, %xmm1 |
mulps ASM_NAME(costab_mmxsse)+80, %xmm6 |
mulps ASM_NAME(costab_mmxsse)+64, %xmm7 |
|
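/*
 Pass 3: butterflies over groups of 8; differences are scaled by the
 4-entry table at costab_mmxsse+96.
*/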
movaps TEMP(0), %xmm2 |
movaps TEMP(1), %xmm3 |
shufps $0x1b, %xmm3, %xmm3 |
shufps $0x1b, %xmm5, %xmm5 |
shufps $0x1b, %xmm1, %xmm1 |
shufps $0x1b, %xmm6, %xmm6 |
movaps %xmm0, TEMP(1) |
subps %xmm3, %xmm2 |
subps %xmm1, %xmm0 |
addps TEMP(0), %xmm3 |
addps TEMP(1), %xmm1 |
movaps %xmm3, TEMP(0) |
movaps %xmm1, TEMP(2) |
movaps %xmm5, %xmm1 |
movaps %xmm4, %xmm5 |
movaps %xmm7, %xmm3 |
subps %xmm1, %xmm5 |
subps %xmm6, %xmm7 |
addps %xmm1, %xmm4 |
addps %xmm3, %xmm6 |
mulps ASM_NAME(costab_mmxsse)+96, %xmm2 |
mulps ASM_NAME(costab_mmxsse)+96, %xmm0 |
mulps ASM_NAME(costab_mmxsse)+96, %xmm5 |
mulps ASM_NAME(costab_mmxsse)+96, %xmm7 |
movaps %xmm2, TEMP(1) |
movaps %xmm0, TEMP(3) |
|
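/*
 Pass 4: butterflies over groups of 4, done inside the registers with
 shufps; the 2-entry table at costab_mmxsse+112 is duplicated into both
 register halves via movlhps and kept in TEMP(4) for the second half.
*/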
movaps %xmm4, %xmm2 |
movaps %xmm5, %xmm3 |
shufps $0x44, %xmm6, %xmm2 |
shufps $0xbb, %xmm7, %xmm5 |
shufps $0xbb, %xmm6, %xmm4 |
shufps $0x44, %xmm7, %xmm3 |
movaps %xmm2, %xmm6 |
movaps %xmm3, %xmm7 |
subps %xmm4, %xmm2 |
subps %xmm5, %xmm3 |
addps %xmm6, %xmm4 |
addps %xmm7, %xmm5 |
movaps ASM_NAME(costab_mmxsse)+112, %xmm0 |
movlhps %xmm0, %xmm0 |
mulps %xmm0, %xmm2 |
mulps %xmm0, %xmm3 |
movaps %xmm0, TEMP(4) |
movaps %xmm4, %xmm6 |
movaps %xmm5, %xmm7 |
shufps $0x14, %xmm2, %xmm4 |
shufps $0xbe, %xmm2, %xmm6 |
shufps $0x14, %xmm3, %xmm5 |
shufps $0xbe, %xmm3, %xmm7 |
movaps %xmm5, TEMP(5) |
movaps %xmm7, TEMP(7) |
|
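/* Pass 4 again for the half of the data parked in TEMP(0)..TEMP(3). */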
movaps TEMP(0), %xmm0 |
movaps TEMP(1), %xmm1 |
movaps %xmm0, %xmm2 |
movaps %xmm1, %xmm3 |
shufps $0x44, TEMP(2), %xmm2 |
shufps $0xbb, TEMP(3), %xmm1 |
shufps $0xbb, TEMP(2), %xmm0 |
shufps $0x44, TEMP(3), %xmm3 |
movaps %xmm2, %xmm5 |
movaps %xmm3, %xmm7 |
subps %xmm0, %xmm2 |
subps %xmm1, %xmm3 |
addps %xmm5, %xmm0 |
addps %xmm7, %xmm1 |
mulps TEMP(4), %xmm2 |
mulps TEMP(4), %xmm3 |
movaps %xmm0, %xmm5 |
movaps %xmm1, %xmm7 |
shufps $0x14, %xmm2, %xmm0 |
shufps $0xbe, %xmm2, %xmm5 |
shufps $0x14, %xmm3, %xmm1 |
shufps $0xbe, %xmm3, %xmm7 |
|
movaps %xmm0, TEMP(0) |
movaps %xmm1, TEMP(1) |
movaps %xmm5, TEMP(2) |
movaps %xmm7, TEMP(3) |
|
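/*
 Pass 5: pairwise butterflies. The single coefficient at costab_mmxsse+120
 is broadcast to all lanes and sign-flipped in lanes 1 and 3 by pnpn;
 unpcklps/unpckhps interleave the pairs.
*/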
movss ASM_NAME(costab_mmxsse)+120, %xmm5 |
shufps $0x00, %xmm5, %xmm5 |
xorps pnpn, %xmm5 |
|
movaps %xmm4, %xmm0 |
movaps %xmm6, %xmm1 |
unpcklps TEMP(5), %xmm4 |
unpckhps TEMP(5), %xmm0 |
unpcklps TEMP(7), %xmm6 |
unpckhps TEMP(7), %xmm1 |
movaps %xmm4, %xmm2 |
movaps %xmm6, %xmm3 |
unpcklps %xmm0, %xmm4 |
unpckhps %xmm0, %xmm2 |
unpcklps %xmm1, %xmm6 |
unpckhps %xmm1, %xmm3 |
movaps %xmm4, %xmm0 |
movaps %xmm6, %xmm1 |
subps %xmm2, %xmm0 |
subps %xmm3, %xmm1 |
addps %xmm2, %xmm4 |
addps %xmm3, %xmm6 |
mulps %xmm5, %xmm0 |
mulps %xmm5, %xmm1 |
movaps %xmm5, TEMP(5) |
movaps %xmm4, %xmm5 |
movaps %xmm6, %xmm7 |
unpcklps %xmm0, %xmm4 |
unpckhps %xmm0, %xmm5 |
unpcklps %xmm1, %xmm6 |
unpckhps %xmm1, %xmm7 |
|
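/*
 Pass 5 for the data stored in TEMP(0)..TEMP(3), reusing the sign-flipped
 scale factor saved in TEMP(5).
*/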
movaps TEMP(0), %xmm0 |
movaps TEMP(2), %xmm2 |
movaps %xmm4, TEMP(4) |
movaps %xmm6, TEMP(6) |
|
movaps %xmm0, %xmm4 |
movaps %xmm2, %xmm6 |
unpcklps TEMP(1), %xmm0 |
unpckhps TEMP(1), %xmm4 |
unpcklps TEMP(3), %xmm2 |
unpckhps TEMP(3), %xmm6 |
movaps %xmm0, %xmm1 |
movaps %xmm2, %xmm3 |
unpcklps %xmm4, %xmm0 |
unpckhps %xmm4, %xmm1 |
unpcklps %xmm6, %xmm2 |
unpckhps %xmm6, %xmm3 |
movaps %xmm0, %xmm4 |
movaps %xmm2, %xmm6 |
subps %xmm1, %xmm4 |
subps %xmm3, %xmm6 |
addps %xmm1, %xmm0 |
addps %xmm3, %xmm2 |
mulps TEMP(5), %xmm4 |
mulps TEMP(5), %xmm6 |
movaps %xmm0, %xmm1 |
movaps %xmm2, %xmm3 |
unpcklps %xmm4, %xmm0 |
unpckhps %xmm4, %xmm1 |
unpcklps %xmm6, %xmm2 |
unpckhps %xmm6, %xmm3 |
|
movaps %xmm0, TEMP(0) |
movaps %xmm1, TEMP(1) |
movaps %xmm2, TEMP(2) |
movaps %xmm3, TEMP(3) |
movaps %xmm5, TEMP(5) |
movaps %xmm7, TEMP(7) |
|
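/*
 Recombination, scalar part: within each group of four floats in the
 scratch area, add the fourth element into the third (b[4n+2] += b[4n+3]).
*/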
movss TEMP_BYTE(12), %xmm0 |
movss TEMP_BYTE(28), %xmm1 |
movss TEMP_BYTE(44), %xmm2 |
movss TEMP_BYTE(60), %xmm3 |
addss TEMP_BYTE(8), %xmm0 |
addss TEMP_BYTE(24), %xmm1 |
addss TEMP_BYTE(40), %xmm2 |
addss TEMP_BYTE(56), %xmm3 |
movss %xmm0, TEMP_BYTE(8) |
movss %xmm1, TEMP_BYTE(24) |
movss %xmm2, TEMP_BYTE(40) |
movss %xmm3, TEMP_BYTE(56) |
movss TEMP_BYTE(76), %xmm0 |
movss TEMP_BYTE(92), %xmm1 |
movss TEMP_BYTE(108), %xmm2 |
movss TEMP_BYTE(124), %xmm3 |
addss TEMP_BYTE(72), %xmm0 |
addss TEMP_BYTE(88), %xmm1 |
addss TEMP_BYTE(104), %xmm2 |
addss TEMP_BYTE(120), %xmm3 |
movss %xmm0, TEMP_BYTE(72) |
movss %xmm1, TEMP_BYTE(88) |
movss %xmm2, TEMP_BYTE(104) |
movss %xmm3, TEMP_BYTE(120) |
|
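/*
 Recombination, vector part: rotated (shufps $0x1e) and masked copies are
 added so that neighbouring partial sums within and across the groups are
 folded together before output.
*/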
movaps TEMP_BYTE(16), %xmm1 |
movaps TEMP_BYTE(48), %xmm3 |
movaps TEMP_BYTE(80), %xmm5 |
movaps TEMP_BYTE(112), %xmm7 |
movaps %xmm1, %xmm0 |
movaps %xmm3, %xmm2 |
movaps %xmm5, %xmm4 |
movaps %xmm7, %xmm6 |
shufps $0x1e, %xmm0, %xmm0 |
shufps $0x1e, %xmm2, %xmm2 |
shufps $0x1e, %xmm4, %xmm4 |
shufps $0x1e, %xmm6, %xmm6 |
andps mask, %xmm0 |
andps mask, %xmm2 |
andps mask, %xmm4 |
andps mask, %xmm6 |
addps %xmm0, %xmm1 |
addps %xmm2, %xmm3 |
addps %xmm4, %xmm5 |
addps %xmm6, %xmm7 |
|
movaps TEMP_BYTE(32), %xmm2 |
movaps TEMP_BYTE(96), %xmm6 |
movaps %xmm2, %xmm0 |
movaps %xmm6, %xmm4 |
shufps $0x1e, %xmm0, %xmm0 |
shufps $0x1e, %xmm4, %xmm4 |
andps mask, %xmm0 |
andps mask, %xmm4 |
addps %xmm3, %xmm2 |
addps %xmm0, %xmm3 |
addps %xmm7, %xmm6 |
addps %xmm4, %xmm7 |
|
movaps TEMP_BYTE(0), %xmm0 |
movaps TEMP_BYTE(64), %xmm4 |
|
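/*
 Convert to 32-bit integers (cvtps2pi rounds according to MXCSR) and pack
 to 16-bit with signed saturation (packssdw).
*/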
cvtps2pi %xmm0, %mm0 |
cvtps2pi %xmm1, %mm1 |
movhlps %xmm0, %xmm0 |
movhlps %xmm1, %xmm1 |
cvtps2pi %xmm0, %mm2 |
cvtps2pi %xmm1, %mm3 |
packssdw %mm2, %mm0 |
packssdw %mm3, %mm1 |
|
cvtps2pi %xmm2, %mm2 |
cvtps2pi %xmm3, %mm3 |
movhlps %xmm2, %xmm2 |
movhlps %xmm3, %xmm3 |
cvtps2pi %xmm2, %mm4 |
cvtps2pi %xmm3, %mm5 |
packssdw %mm4, %mm2 |
packssdw %mm5, %mm3 |
|
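/*
 Scatter the 16-bit results; the offsets are byte offsets, so out0 and
 out1 are written every 16 shorts. One value is stored to both out0[0]
 and out1[0].
*/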
movd %mm0, %eax |
movd %mm1, %edx |
movw %ax, 512(%ecx) |
movw %dx, 384(%ecx) |
shrl $16, %eax |
shrl $16, %edx |
movw %ax, (%ecx) |
movw %ax, (%ebx) |
movw %dx, 128(%ebx) |
|
movd %mm2, %eax |
movd %mm3, %edx |
movw %ax, 448(%ecx) |
movw %dx, 320(%ecx) |
shrl $16, %eax |
shrl $16, %edx |
movw %ax, 64(%ebx) |
movw %dx, 192(%ebx) |
|
psrlq $32, %mm0 |
psrlq $32, %mm1 |
movd %mm0, %eax |
movd %mm1, %edx |
movw %ax, 256(%ecx) |
movw %dx, 128(%ecx) |
shrl $16, %eax |
shrl $16, %edx |
movw %ax, 256(%ebx) |
movw %dx, 384(%ebx) |
|
psrlq $32, %mm2 |
psrlq $32, %mm3 |
movd %mm2, %eax |
movd %mm3, %edx |
movw %ax, 192(%ecx) |
movw %dx, 64(%ecx) |
shrl $16, %eax |
shrl $16, %edx |
movw %ax, 320(%ebx) |
movw %dx, 448(%ebx) |
|
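/*
 Second half: apply the remaining masked cross-additions, then convert
 and scatter the results the same way as above.
*/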
movaps %xmm4, %xmm0 |
shufps $0x1e, %xmm0, %xmm0 |
movaps %xmm5, %xmm1 |
andps mask, %xmm0 |
|
addps %xmm6, %xmm4 |
addps %xmm7, %xmm5 |
addps %xmm1, %xmm6 |
addps %xmm0, %xmm7 |
|
cvtps2pi %xmm4, %mm0 |
cvtps2pi %xmm5, %mm1 |
movhlps %xmm4, %xmm4 |
movhlps %xmm5, %xmm5 |
cvtps2pi %xmm4, %mm2 |
cvtps2pi %xmm5, %mm3 |
packssdw %mm2, %mm0 |
packssdw %mm3, %mm1 |
|
cvtps2pi %xmm6, %mm2 |
cvtps2pi %xmm7, %mm3 |
movhlps %xmm6, %xmm6 |
movhlps %xmm7, %xmm7 |
cvtps2pi %xmm6, %mm4 |
cvtps2pi %xmm7, %mm5 |
packssdw %mm4, %mm2 |
packssdw %mm5, %mm3 |
|
movd %mm0, %eax |
movd %mm2, %edx |
movw %ax, 480(%ecx) |
movw %dx, 416(%ecx) |
shrl $16, %eax |
shrl $16, %edx |
movw %ax, 32(%ebx) |
movw %dx, 96(%ebx) |
|
psrlq $32, %mm0 |
psrlq $32, %mm2 |
movd %mm0, %eax |
movd %mm2, %edx |
movw %ax, 224(%ecx) |
movw %dx, 160(%ecx) |
shrl $16, %eax |
shrl $16, %edx |
movw %ax, 288(%ebx) |
movw %dx, 352(%ebx) |
|
movd %mm1, %eax |
movd %mm3, %edx |
movw %ax, 352(%ecx) |
movw %dx, 288(%ecx) |
shrl $16, %eax |
shrl $16, %edx |
movw %ax, 160(%ebx) |
movw %dx, 224(%ebx) |
|
psrlq $32, %mm1 |
psrlq $32, %mm3 |
movd %mm1, %eax |
movd %mm3, %edx |
movw %ax, 96(%ecx) |
movw %dx, 32(%ecx) |
shrl $16, %eax |
shrl $16, %edx |
movw %ax, 416(%ebx) |
movw %dx, 480(%ebx) |
|
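/* restore callee-saved %ebx and the caller's stack frame */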
popl %ebx |
movl %ebp, %esp |
popl %ebp |
ret |
|
NONEXEC_STACK |