0,0 → 1,792 |
; Fast Hartley Transform routine |
; Copyright (C) 1999, 2004, 2010 |
; Artem Jerdev artem@jerdev.co.uk |
; |
; free KolibriOS version - not to be ported to other OSes |
; ========================================================== |
|
|
; global constants |
align 8 |
fht_r dq 1.41421356237309504880169 ; = sqrt(2) |
fht_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2 |
fht_c1 dq 0.92387953251128675612818 ; = cos(pi/8) |
fht_s1 dq 0.38268343236508977172846 ; = sin(pi/8) |
|
|
;================================================================= |
; parameter1: |
; -- reg dl (bits[3:0]) = Power_of_4 |
; -- reg edx && (-16) = 4k-aligned data array address |
; returns: |
; -- edx = Power_of_4 |
; -- ecx = N |
; destroys: |
; -- eax, ebx, ecx, edx, esi |
;; ========================== |
align 4 |
BitInvert: |
mov esi, edx |
and esi, 0xFFFFFFF0 |
and edx, 0x0F |
push edx |
mov cl, dl |
xor eax, eax |
inc eax |
shl eax, cl |
shl eax, cl |
push eax |
xor ecx, ecx ; index term |
align 4 |
.newterm: |
inc ecx |
cmp ecx, [esp] ; N |
jge .done |
|
xor eax, eax |
mov edx, ecx |
xor bl, bl |
align 4 |
.do_invert: |
inc bl |
cmp bl, byte[esp+4] ; Power_of_4 |
jg .switch |
|
mov bh, dl |
and bh, 3 |
shl eax, 2 |
or al, bh |
shr edx, 2 |
jmp .do_invert |
align 8 |
|
.switch: |
cmp eax, ecx |
jle .newterm |
|
fld qword [esi+eax*8] |
fld qword [esi+ecx*8] |
fstp qword [esi+eax*8] |
fstp qword [esi+ecx*8] |
jmp .newterm |
|
align 4 |
.done: |
pop ecx |
pop edx |
ret |
|
;================================================================= |
|
|
;================================================================= |
; stdcall parameters: |
; -- [esp+4] = N |
; -- [esp+8] = 4k-aligned data array address |
; returns: |
; -- nothing |
; destroys: |
; -- ebx, esi |
;; ========================== |
align 4 |
step1: |
mov ebx, [esp+8] |
mov esi, [esp+4] |
shl esi, 3 |
add esi, ebx |
|
align 4 |
.loop: |
fld qword[ebx] |
fld qword[ebx+8] |
fld st1 |
fsub st0, st1 ; st : t2, f[i+1], f[i] |
fxch st1 ; st : f[i+1], t2, f[i] |
faddp st2, st0 ; st : t2, t1 |
fld qword[ebx+16] |
fld qword[ebx+24] |
fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 |
fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 |
fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 |
fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 |
fstp st1 ; st : t4, t3, t2, t1 |
fld st2 ; st : t2, t4, t3, t2, t1 |
fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 |
fstp qword[ebx+16] ; st : t4, t3, t2, t1 |
fsubp st2, st0 ; st : t3, t2-t4, t1 |
fld st2 ; st : t1, t3, t2-t4, t1 |
fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 |
fstp qword[ebx] ; st : t3, t2-t4, t1 |
fsubp st2, st0 ; st : t2-t4, t1-t3 |
fstp qword[ebx+24] ; st : t1-t3 |
fstp qword[ebx+8] ; st : <empty> |
|
add ebx, 32 |
cmp ebx, esi |
jnz .loop |
ret |
|
; local stack definitions |
;=========================================================================== |
_t0 equ dword [esp] |
_t1 equ dword[esp+4] |
_t2 equ dword[esp+8] |
_t3 equ dword[esp+12] |
_t4 equ dword[esp+16] |
_t5 equ dword[esp+20] |
_t6 equ dword[esp+24] |
_t7 equ dword[esp+28] |
_t8 equ dword[esp+32] |
_t9 equ dword[esp+36] |
|
_l1 equ dword[esp+40] |
_l2 equ dword[esp+44] |
_l3 equ dword[esp+48] |
_l4 equ dword[esp+52] |
_l5 equ dword[esp+56] |
_l6 equ dword[esp+60] |
_l7 equ dword[esp+64] |
_l8 equ dword[esp+68] |
_l9 equ dword[esp+72] |
_l0 equ dword[esp+76] |
_d1 equ dword[esp+80] |
_d2 equ dword[esp+84] |
_d3 equ dword[esp+88] |
_d4 equ dword[esp+92] |
_d5 equ dword[esp+96] |
_d6 equ dword[esp+100] |
_j5 equ dword[esp+104] |
_jj equ dword[esp+108] |
_end_of_array equ dword[esp+112] |
_step equ word [esp+116] |
|
|
;================================================================= |
; cdecl parameters: |
; -- [ebp+8] = N |
; -- [ebp+12] = 4k-aligned data array address |
; returns: |
; -- nothing |
; destroys: |
; -- eax, ebx |
; locals: |
; -- 10 stack-located dwords (_t0 ... _t9) |
;; ========================== |
align 4 |
step2: |
push ebp |
mov ebp, esp |
sub esp, 40 |
mov ebx, [ebp+12] |
mov eax, [ebp+ 8] |
shl eax, 3 |
add eax, ebx |
|
align 4 |
.loop_i: |
|
; -- quad subelements +0, +4, +8 and +12 (simpliest operations) |
fld qword[ebx] |
fld qword[ebx+8*4] |
fld st0 |
fadd st0, st2 ; st : t1, f_4, f_0 |
fxch st1 |
fsubp st2, st0 ; st : t1, t2 |
fld qword[ebx+8*8] |
fld qword[ebx+8*12] |
fld st0 |
fadd st0, st2 ; st : t3, f_12, t1, t2 |
fxch st1 |
fsubp st2, st0 ; st : t3, t4, t1, t2 |
; ------ |
fld st2 ; st : t1, t3, t4, t1, t2 |
fadd st0, st1 |
fstp qword[ebx] ; st : t3, t4, t1, t2 |
fsub st0, st2 ; st : t3-t1, t4, t1, t2 |
fchs ; st : t1-t3, t4, t1, t2 |
fstp qword[ebx+8*4] ; st : t4, t1, t2 |
fst st1 ; st : t4, t4, t2 |
fadd st0, st2 ; st : t2+t4, t4, t2 |
fstp qword[ebx+8*8] ; st : t4, t2 |
fsubp st1, st0 ; st : t2-t4 |
fstp qword[ebx+8*12] ; st : <empty> |
|
; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) |
fld qword[ebx+8*2] |
fld qword[ebx+8*6] |
fld [fht_r] |
fmul st1, st0 ; st : r, t2, t1 |
fld qword[ebx+8*10] |
fxch st1 ; st : r, t3, t2, t1 |
fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 |
; ------ |
fld st3 ; st : t1, t4, t3, t2, t1 |
fadd st0, st3 ; |
fadd st0, st2 ; |
fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 |
fsub st0, st3 ; |
fsub st0, st3 ; |
fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 |
fld st3 ; st : t1, t4, t3, t2, t1 |
fsub st0, st2 ; |
fsub st0, st1 ; |
fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 |
fadd st0, st1 ; |
faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 |
fstp qword[ebx+8*6] ; store f[i+6] |
fstp st0 ; st : t2, t1 |
fstp st0 ; st : t1 |
fstp st0 ; st : <empty> |
|
; -- odd subelements |
fld qword[ebx+8*9] |
fld qword[ebx+8*11] |
fld st1 |
fsub st0, st1 |
fxch st1 |
faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) |
fld [fht_r2] |
fmul st2, st0 |
fmulp st1, st0 ; st : t9, t6 |
fld qword[ebx+8*3] |
fld st0 |
fadd st0, st2 ; st : t1, f[l5], t9, t6 |
fstp _t1 |
fsub st0, st1 |
fstp _t2 |
fstp _t9 ; (t9 never used) |
fstp _t6 ; st : <empty> |
|
fld [fht_c1] |
fld [fht_s1] |
fld qword[ebx+8*5] |
fld qword[ebx+8*7] |
fld st3 ; st: c1, f[l6], f[l2], s1, c1 |
fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c |
fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c |
fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c |
faddp st1, st0 ; st: t5, f_6, f_2, s, c |
fstp _t5 ; st: f_6, f_2, s, c |
fld st3 ; st: c, f_6, f_2, s, c |
fmul st0, st1 |
fld st3 |
fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c |
fsubp st1, st0 ; st: t8, f_6, f_2, s, c |
fstp _t8 ; st: f_6, f_2, s, c |
fstp st0 ; st: f_2, s, c |
fstp st0 ; st: s, c |
|
fld qword[ebx+8*13] |
fld qword[ebx+8*15] |
fld st3 ; st: c1, f[l8], f[l4], s1, c1 |
fmul st0, st1 |
fld st3 |
fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c |
faddp st1, st0 ; st: t7, f_8, f_4, s, c |
fld _t5 ; st: t5, t7, f_8, f_4, s, c |
fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c |
fstp _t4 |
fstp _t7 ; st: f_8, f_4, s, c |
fld st3 ; st: c, f_8, f_4, s, c |
fmul st0, st2 |
fld st3 |
fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c |
fsubp st1, st0 ; st:-t0, f_8, f_4, s, c |
fchs |
fld _t8 |
fchs ; st:-t8, t0, f_8, f_4, s, c |
fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c |
fstp _t3 |
fstp _t0 ; st: f_8, f_4, s, c |
fstp st0 ; st: f_4, s, c |
fstp st0 ; st: s, c |
fstp st0 ; st: c |
fstp st0 ; st: <empty> |
|
fld _t1 |
fld _t4 |
fld st1 |
fsub st0, st1 |
fstp qword[ebx+8*11] ; f[l7] = t1-t4 |
faddp st1, st0 |
fstp qword[ebx+8*3] ; f[l5] = t1+t4 |
fld _t2 |
fld _t3 |
fld st1 |
fsub st0, st1 |
fstp qword[ebx+8*15] ; f[l8] |
faddp st1, st0 |
fstp qword[ebx+8*7] ; f[l6] |
|
fld _t6 |
fld qword[ebx+8] |
fld st1 |
fsub st0, st1 |
fxch st1 |
faddp st2, st0 ; st : t2, t1 |
fld _t8 |
fsub _t0 |
fld _t5 |
fadd _t7 ; st : t4, t3, t2, t1 |
|
fld st3 |
fsub st0, st1 |
fstp qword[ebx+8*9] ; f[l3] = t1-t4 |
fadd st0, st3 |
fstp qword[ebx+8] ; f[l1] = t1+t4 |
fld st1 ; st : t2, t3, t2, t1 |
fsub st0, st1 ; f[l4] = t2-t3 |
fstp qword[ebx+8*13] ; st : t3, t2, t1 |
faddp st1, st0 ; st : t2+t3, t1 |
fstp qword[ebx+8*5] ; f[l2] = t2+t3 |
fstp st0 ; st : <empty> |
|
add ebx, 16*8 |
cmp ebx, eax |
jb .loop_i |
|
mov esp, ebp |
pop ebp |
ret |
|
|
|
|
;================================================================= |
; cdecl parameters: |
; -- [ebp+8] = N |
; -- [ebp+12] = p |
; -- [ebp+16] = 4k-aligned data array address |
; -- [ebp+20] = 4k-aligned SinCosTable address |
; returns: |
; -- nothing |
; destroys: |
; -- all GPRegs |
; locals: |
; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step) |
;; ========================== |
align 4 |
step3: |
push ebp |
mov ebp, esp |
sub esp, 120 |
; 283 : { |
|
|
; 293 : for (l=3; l<=p; l++) |
mov cx, 0x0200 |
align 4 |
.newstep: |
inc ch |
cmp ch, byte[ebp+12] |
jg .done |
mov _step, cx |
|
; 294 : { |
; 295 : d1 = 1 << (l + l - 3); |
|
mov cl, ch |
add cl, cl |
sub cl, 3 |
mov edx, 1 |
shl edx, cl |
mov _d1, edx |
|
; 296 : d2 = d1 << 1; |
shl edx, 1 |
mov _d2, edx |
mov eax, edx |
|
; 297 : d3 = d2 << 1; |
shl edx, 1 |
mov _d3, edx |
|
; 298 : d4 = d2 + d3; |
add eax, edx |
mov _d4, eax |
|
; 299 : d5 = d3 << 1; |
shl edx, 1 |
mov _d5, edx |
shl edx, 3 |
mov _d6, edx ; d6 = d5*8 to simplify index operations |
|
; 339 : j5 = N / d5; ; moved out of internal loop |
mov cl, [ebp+12] |
sub cl, ch |
add cl, cl |
mov edx, 1 |
shl edx, cl |
mov _j5, edx |
|
; 300 : |
; 301 : for (j=0; j<N; j+=d5) |
mov ebx, [ebp+16] |
mov esi, [ebp+8] |
shl esi, 3 |
add esi, ebx |
mov _end_of_array, esi |
|
align 4 |
.next_j: |
|
; { |
; t1 = f[j] + f[j+d2]; |
mov eax, _d2 |
fld qword[ebx] |
fld qword[ebx+eax*8] |
fld st1 |
fadd st0, st1 |
fstp _t1 |
|
; t2 = f[j] - f[j+d2]; |
fsubp st1, st0 |
fstp _t2 |
|
; t3 = f[j+d3] + f[j+d4]; |
mov edi, _d3 |
fld qword[ebx+edi*8] |
mov edx, _d4 |
fld qword[ebx+edx*8] |
fld st1 |
fsub st0, st1 ; st : t4, f4, f3 |
fxch st1 ; st : f4, t4, f3 |
|
; t4 = f[j+d3] - f[j+d4]; |
faddp st2, st0 ; st : t4, t3 |
|
; f[j+d4] = t2 - t4; |
; f[j+d3] = t2 + t4; |
fld _t2 |
fld st0 |
fsub st0, st2 ; st : f4, t2, t4, t3 |
fstp qword[ebx+edx*8] ; st : t2, t4, t3 |
fadd st0, st1 ; st : f3, t4, t3 |
fstp qword[ebx+edi*8] ; st : t4, t3 |
|
; f[j+d2] = t1 - t3; |
; f[j] = t1 + t3; |
fld _t1 |
fst st1 |
fsub st0, st2 ; st : f2, t1, t3 |
fstp qword[ebx+eax*8] ; st : t1, t3 |
fadd st0, st1 ; st : f0, t3 |
fstp qword[ebx] ; st : t3 |
fstp st0 |
|
; jj = j + d1; / ?? |
mov edi, _d1 |
shl edi, 3 ; = d1*8 |
mov edx, edi |
mov eax, edi |
add eax, eax ; eax = d2*8 |
shl edx, 2 ; = d3*8 |
add edi, ebx ; now [edi] points to f[jj] |
add edx, edi ; and [edx] points to f[jj+d3] |
|
; t1 = f[jj]; |
fld qword [edi] ; st : t1 |
; t3 = f[jj+d3]; |
fld qword [edx] ; st : t3, t1 |
|
; t2 = f[jj+d2] * r; |
fld qword [edi+eax] |
fld [fht_r] |
fmul st1, st0 ; st : r, t2, t3, t1 |
; t4 = f[jj+d4] * r |
fmul qword [edx+eax] ; st : t4, t2, t3, t1 |
|
; f[jj] = t1 + t2 + t3; |
fld st3 ; st : t1, t4, t2, t3, t1 |
fadd st0, st3 |
fadd st0, st2 |
fstp qword [edi] |
|
; f[jj+d2] = t1 - t3 + t4; |
fld st3 |
fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1 |
fld st0 |
fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1 |
fstp qword [edi+eax] |
; f[jj+d4] = t1 - t3 - t4; |
fsub st0, st1 ; st : f4, t4, t2, t3, t1 |
fstp qword [edx+eax] |
|
; f[jj+d3] = t1 - t2 + t3; |
fstp st0 ; st : t2, t3, t1 |
fsubp st1, st0 ; st : (t3-t2), t1 |
faddp st1, st0 ; st : f3 |
fstp qword [edx] |
|
; for (k=1; k<d1; k++) |
xor ecx, ecx ; ecx = k |
mov _jj, ecx |
align 4 |
.next_k: |
inc ecx |
cmp ecx, _d1 |
jge .done_k |
; { |
mov eax, _d2 ; the sector increment |
; l1 = j + k; |
mov edx, ecx |
mov _l1, edx ; [ebx+edx*8] --> f[j+k] |
; l2 = l1 + d2; |
add edx, eax |
mov _l2, edx |
; l3 = l1 + d3; |
add edx, eax |
mov _l3, edx |
; l4 = l1 + d4; |
add edx, eax |
mov _l4, edx |
|
; l5 = j + d2 - k; |
mov edx, eax |
sub edx, ecx |
mov _l5, edx |
; l6 = l5 + d2; |
add edx, eax |
mov _l6, edx |
; l7 = l5 + d3; |
add edx, eax |
mov _l7, edx |
; l8 = l5 + d4; |
add edx, eax |
mov _l8, edx |
|
|
; 340 : j5 *= k; // add-substituted multiplication |
mov eax, _jj |
add eax, _j5 |
mov _jj, eax |
|
; c1 = C[jj]; |
; s1 = S[jj]; |
mov edi, [ebp+20] |
fld qword[edi+eax*8] |
mov esi, [ebp+8] |
shl esi, 2 |
add esi, edi |
fld qword[esi+eax*8] ; st : s1, c1 |
|
; t5 = f[l2] * c1 + f[l6] * s1; |
; t8 = f[l6] * c1 - f[l2] * s1; |
mov edx, _l6 |
fld qword[ebx+edx*8] |
mov edx, _l2 |
fld st0 |
fmul st0, st2 |
fxch st1 |
fmul st0, st3 |
fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c |
fmul st4, st0 |
fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c |
fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c |
fstp _t8 |
faddp st2, st0 ; st : f[l2]*s, t5 |
fstp st0 ; st : t5 |
fstp _t5 ; st : <empty> |
|
; c2 = C[2*jj]; |
; s2 = S[2*jj]; |
shl eax, 1 |
fld qword[edi+eax*8] |
fld qword[esi+eax*8] ; st : s2, c2 |
|
; t6 = f[l3] * c2 + f[l7] * s2; |
; t9 = f[l7] * c2 - f[l3] * s2; |
mov edx, _l7 |
fld qword[ebx+edx*8] |
mov edx, _l3 |
fld st0 |
fmul st0, st2 |
fxch st1 |
fmul st0, st3 |
fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c |
fmul st4, st0 |
fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c |
fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c |
fstp _t9 |
faddp st2, st0 ; st : f[l2]*s, t6 |
fstp st0 ; st : t6 |
fstp _t6 ; st : <empty> |
|
; c3 = C[3*jj]; |
; s3 = S[3*jj]; |
add eax, _jj |
fld qword[edi+eax*8] |
fld qword[esi+eax*8] ; st : s3, c3 |
|
; t7 = f[l4] * c3 + f[l8] * s3; |
; t0 = f[l8] * c3 - f[l4] * s3; |
mov edx, _l8 |
fld qword[ebx+edx*8] |
mov edx, _l4 |
fld st0 |
fmul st0, st2 |
fxch st1 |
fmul st0, st3 |
fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c |
fmul st4, st0 |
fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c |
fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c |
fstp _t0 |
faddp st2, st0 ; st : f[l2]*s, t7 |
fstp st0 ; st : t7 |
fstp _t7 ; st : <empty> |
|
; t1 = f[l5] - t9; |
; t2 = f[l5] + t9; |
mov eax, _l5 |
fld qword [ebx+eax*8] |
fld _t9 |
fld st0 |
fadd st0, st2 |
fstp _t2 |
fsubp st1, st0 |
fstp _t1 |
|
; t3 = - t8 - t0; |
fld _t8 |
fadd _t0 |
fchs |
fstp _t3 |
; t4 = t5 - t7; |
fld _t5 |
fsub _t7 |
fstp _t4 |
|
; f[l5] = t1 + t4; |
fld _t1 |
fld _t4 |
fld st0 |
fadd st0, st2 |
fstp qword [ebx+eax*8] |
; f[l7] = t1 - t4; |
mov eax, _l7 |
fsubp st1, st0 |
fstp qword [ebx+eax*8] |
|
; f[l6] = t2 + t3; |
mov eax, _l6 |
fld _t2 |
fld _t3 |
fld st0 |
fadd st0, st2 |
fstp qword [ebx+eax*8] |
; f[l8] = t2 - t3; |
mov eax, _l8 |
fsubp st1, st0 |
fstp qword [ebx+eax*8] |
|
; t1 = f[l1] + t6; |
mov eax, _l1 |
fld qword [ebx+eax*8] |
fld _t6 |
fld st0 |
fadd st0, st2 |
fstp _t1 |
; t2 = f[l1] - t6; |
fsubp st1, st0 |
fstp _t2 |
|
; t3 = t8 - t0; |
fld _t8 |
fsub _t0 |
fstp _t3 |
; t4 = t5 + t7; |
fld _t5 |
fadd _t7 |
fstp _t4 |
|
; f[l1] = t1 + t4; |
mov eax, _l1 |
fld _t1 |
fld _t4 |
fld st0 |
fadd st0, st2 |
fstp qword [ebx+eax*8] |
; f[l3] = t1 - t4; |
mov eax, _l3 |
fsubp st1, st0 |
fstp qword [ebx+eax*8] |
|
; f[l2] = t2 + t3; |
mov eax, _l2 |
fld _t2 |
fld _t3 |
fld st0 |
fadd st0, st2 |
fstp qword [ebx+eax*8] |
; f[l4] = t2 - t3; |
mov eax, _l4 |
fsubp st1, st0 |
fstp qword [ebx+eax*8] |
|
; 374 : } |
jmp .next_k |
|
align 4 |
.done_k: |
; 375 : } |
add ebx, _d6 ; d6 = d5*8 |
cmp ebx, _end_of_array |
jb .next_j |
|
; 376 : } |
mov cx, _step |
jmp .newstep |
.done: |
mov esp, ebp |
pop ebp |
; 377 : } |
ret |
|
|
;=========== Step3 ends here =========== |
|
|
; ================================================================= |
|
;================================================================= |
; parameters: |
; -- [ebp+12] = N |
; -- [ebp+16] = p |
; -- [ebp+20] = 4k-aligned data array address |
; -- [ebp+24] = 4k-aligned SinCosTable address |
; returns: |
; -- nothing |
; destroys: |
; -- all GPRegs |
;; ========================== |
|
align 4 |
|
FHT_4: |
push ebp |
mov ebp, esp |
|
mov edx, [ebp+20] ; a |
mov dl, byte[ebp+16] |
call BitInvert |
push dword[ebp+20] ; a |
push ecx ; N |
call step1 ; 4-point transform |
cmp cl, 1 |
jz .done |
call step2 ; 16-point transform |
cmp byte[ebp+16],1 ; p = 2 ? |
jz .done |
pop edx ; N |
pop ecx ; a |
push dword[ebp+24] ; t |
push ecx |
push dword[ebp+16] ; p |
push edx ; N |
call step3 |
.done: |
mov esp, ebp |
pop ebp |
|
ret |