; Fast Hartley Transform routine ; Copyright (C) 1999, 2004, 2010 ; Artem Jerdev artem@jerdev.co.uk ; ; free KolibriOS version - not to be ported to other OSes ; ========================================================== ; global constants align 8 _r dq 1.41421356237309504880169 ; = sqrt(2) _r2 dq 0.70710678118654752440084 ; = sqrt(2)/2 _c1 dq 0.92387953251128675612818 ; = cos(pi/8) _s1 dq 0.38268343236508977172846 ; = sin(pi/8) ;================================================================= ; parameter1: ; -- reg dl (bits[3:0]) = Power_of_4 ; returns: ; -- reg edx = _CosTable address (4k-aligned) ; assumes: _SinTable = _CosTable + (N/2)*8 ; user heap has to be initialized ; destroys: ; -- eax, ebx, ecx ;; ========================== align 4 CreateSinCosTable: xor eax, eax inc eax mov cl, dl and cl, 15 shl eax, cl shl eax, cl mov ecx, eax ; now ecx = N shl ecx, 3 mov ebx, 12 mov eax, 68 int 0x40 ; getmem(N*sizeof(double)) mov edx, eax ; edx = _CosTable shr ecx, 1 mov ebx, eax add ebx, ecx ; ebx = _SinTable shr ecx, 3 push ecx ; [esp] = ecx = N/2 xor eax, eax fldpi fidiv dword[esp] ; st : dx = 2*pi/N pop ecx fldz ; st : 0, dx .loop: fld st0 ; st : x, x, dx FSINCOS ; st : cos, sin, x, dx fstp qword [edx+eax*8] ; st : sin, x, dx fstp qword [ebx+eax*8] ; st : x, dx fadd st0, st1 ; st : x+dx, dx inc eax cmp eax, ecx jne .loop fstp st0 ; st : dx fstp st0 ; st : ret ;================================================================= ; parameter1: ; -- reg edx = _CosTable address ; destroys: ; -- eax, ebx, ecx ;; ========================== align 4 DestroySinCosTable: mov ecx, edx mov ebx, 13 mov eax, 68 int 0x40 ; free(SinCosTable) ret ;================================================================= ; parameter1: ; -- reg dl (bits[3:0]) = Power_of_4 ; -- reg edx && (-16) = 4k-aligned data array address ; returns: ; -- edx = Power_of_4 ; -- ecx = N ; destroys: ; -- eax, ebx, ecx, edx, esi ;; ========================== align 4 BitInvert: mov esi, edx and esi, 0xFFFFFFF0 and edx, 0x0F push edx mov cl, dl xor eax, eax inc eax shl eax, cl shl eax, cl push eax xor ecx, ecx ; index term .newterm: inc ecx cmp ecx, [esp] ; N jge .done xor eax, eax mov edx, ecx xor bl, bl .do_invert: inc bl cmp bl, byte[esp+4] ; Power_of_4 jg .switch mov bh, dl and bh, 3 shl eax, 2 or al, bh shr edx, 2 jmp .do_invert .switch: cmp eax, ecx jle .newterm fld qword [esi+eax*8] fld qword [esi+ecx*8] fstp qword [esi+eax*8] fstp qword [esi+ecx*8] jmp .newterm .done: pop ecx pop edx ret ;================================================================= ;================================================================= ; stdcall parameters: ; -- [esp+4] = N ; -- [esp+8] = 4k-aligned data array address ; returns: ; -- nothing ; destroys: ; -- ebx, esi ;; ========================== align 4 step1: mov ebx, [esp+8] mov esi, [esp+4] shl esi, 3 add esi, ebx .loop: fld qword[ebx] fld qword[ebx+8] fld st1 fsub st0, st1 ; st : t2, f[i+1], f[i] fxch st1 ; st : f[i+1], t2, f[i] faddp st2, st0 ; st : t2, t1 fld qword[ebx+16] fld qword[ebx+24] fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 fstp st1 ; st : t4, t3, t2, t1 fld st2 ; st : t2, t4, t3, t2, t1 fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 fstp qword[ebx+16] ; st : t4, t3, t2, t1 fsubp st2, st0 ; st : t3, t2-t4, t1 fld st2 ; st : t1, t3, t2-t4, t1 fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 fstp qword[ebx] ; st : t3, t2-t4, t1 fsubp st2, st0 ; st : t2-t4, t1-t3 fstp qword[ebx+24] ; st : t1-t3 fstp qword[ebx+8] ; st : add ebx, 32 cmp ebx, esi jnz .loop ret ; local stack definitions ;=========================================================================== _t0 equ dword [esp] _t1 equ dword[esp+4] _t2 equ dword[esp+8] _t3 equ dword[esp+12] _t4 equ dword[esp+16] _t5 equ dword[esp+20] _t6 equ dword[esp+24] _t7 equ dword[esp+28] _t8 equ dword[esp+32] _t9 equ dword[esp+36] _l1 equ dword[esp+40] _l2 equ dword[esp+44] _l3 equ dword[esp+48] _l4 equ dword[esp+52] _l5 equ dword[esp+56] _l6 equ dword[esp+60] _l7 equ dword[esp+64] _l8 equ dword[esp+68] _l9 equ dword[esp+72] _l0 equ dword[esp+76] _d1 equ dword[esp+80] _d2 equ dword[esp+84] _d3 equ dword[esp+88] _d4 equ dword[esp+92] _d5 equ dword[esp+96] _d6 equ dword[esp+100] _j5 equ dword[esp+104] _jj equ dword[esp+108] _end_of_array equ dword[esp+112] _step equ word [esp+116] ;================================================================= ; cdecl parameters: ; -- [ebp+8] = N ; -- [ebp+12] = 4k-aligned data array address ; returns: ; -- nothing ; destroys: ; -- eax, ebx ; locals: ; -- 10 stack-located dwords (_t0 ... _t9) ;; ========================== align 4 step2: push ebp mov ebp, esp sub esp, 40 mov ebx, [ebp+12] mov eax, [ebp+ 8] shl eax, 3 add eax, ebx .loop_i: ; -- quad subelements +0, +4, +8 and +12 (simpliest operations) fld qword[ebx] fld qword[ebx+8*4] fld st0 fadd st0, st2 ; st : t1, f_4, f_0 fxch st1 fsubp st2, st0 ; st : t1, t2 fld qword[ebx+8*8] fld qword[ebx+8*12] fld st0 fadd st0, st2 ; st : t3, f_12, t1, t2 fxch st1 fsubp st2, st0 ; st : t3, t4, t1, t2 ; ------ fld st2 ; st : t1, t3, t4, t1, t2 fadd st0, st1 fstp qword[ebx] ; st : t3, t4, t1, t2 fsub st0, st2 ; st : t3-t1, t4, t1, t2 fchs ; st : t1-t3, t4, t1, t2 fstp qword[ebx+8*4] ; st : t4, t1, t2 fst st1 ; st : t4, t4, t2 fadd st0, st2 ; st : t2+t4, t4, t2 fstp qword[ebx+8*8] ; st : t4, t2 fsubp st1, st0 ; st : t2-t4 fstp qword[ebx+8*12] ; st : ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) fld qword[ebx+8*2] fld qword[ebx+8*6] fld [_r] fmul st1, st0 ; st : r, t2, t1 fld qword[ebx+8*10] fxch st1 ; st : r, t3, t2, t1 fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 ; ------ fld st3 ; st : t1, t4, t3, t2, t1 fadd st0, st3 ; fadd st0, st2 ; fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 fsub st0, st3 ; fsub st0, st3 ; fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 fld st3 ; st : t1, t4, t3, t2, t1 fsub st0, st2 ; fsub st0, st1 ; fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 fadd st0, st1 ; faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 fstp qword[ebx+8*6] ; store f[i+6] fstp st0 ; st : t2, t1 fstp st0 ; st : t1 fstp st0 ; st : ; -- odd subelements fld qword[ebx+8*9] fld qword[ebx+8*11] fld st1 fsub st0, st1 fxch st1 faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) fld [_r2] fmul st2, st0 fmulp st1, st0 ; st : t9, t6 fld qword[ebx+8*3] fld st0 fadd st0, st2 ; st : t1, f[l5], t9, t6 fstp _t1 fsub st0, st1 fstp _t2 fstp _t9 ; (t9 never used) fstp _t6 ; st : fld [_c1] fld [_s1] fld qword[ebx+8*5] fld qword[ebx+8*7] fld st3 ; st: c1, f[l6], f[l2], s1, c1 fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c faddp st1, st0 ; st: t5, f_6, f_2, s, c fstp _t5 ; st: f_6, f_2, s, c fld st3 ; st: c, f_6, f_2, s, c fmul st0, st1 fld st3 fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c fsubp st1, st0 ; st: t8, f_6, f_2, s, c fstp _t8 ; st: f_6, f_2, s, c fstp st0 ; st: f_2, s, c fstp st0 ; st: s, c fld qword[ebx+8*13] fld qword[ebx+8*15] fld st3 ; st: c1, f[l8], f[l4], s1, c1 fmul st0, st1 fld st3 fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c faddp st1, st0 ; st: t7, f_8, f_4, s, c fld _t5 ; st: t5, t7, f_8, f_4, s, c fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c fstp _t4 fstp _t7 ; st: f_8, f_4, s, c fld st3 ; st: c, f_8, f_4, s, c fmul st0, st2 fld st3 fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c fsubp st1, st0 ; st:-t0, f_8, f_4, s, c fchs fld _t8 fchs ; st:-t8, t0, f_8, f_4, s, c fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c fstp _t3 fstp _t0 ; st: f_8, f_4, s, c fstp st0 ; st: f_4, s, c fstp st0 ; st: s, c fstp st0 ; st: c fstp st0 ; st: fld _t1 fld _t4 fld st1 fsub st0, st1 fstp qword[ebx+8*11] ; f[l7] = t1-t4 faddp st1, st0 fstp qword[ebx+8*3] ; f[l5] = t1+t4 fld _t2 fld _t3 fld st1 fsub st0, st1 fstp qword[ebx+8*15] ; f[l8] faddp st1, st0 fstp qword[ebx+8*7] ; f[l6] fld _t6 fld qword[ebx+8] fld st1 fsub st0, st1 fxch st1 faddp st2, st0 ; st : t2, t1 fld _t8 fsub _t0 fld _t5 fadd _t7 ; st : t4, t3, t2, t1 fld st3 fsub st0, st1 fstp qword[ebx+8*9] ; f[l3] = t1-t4 fadd st0, st3 fstp qword[ebx+8] ; f[l1] = t1+t4 fld st1 ; st : t2, t3, t2, t1 fsub st0, st1 ; f[l4] = t2-t3 fstp qword[ebx+8*13] ; st : t3, t2, t1 faddp st1, st0 ; st : t2+t3, t1 fstp qword[ebx+8*5] ; f[l2] = t2+t3 fstp st0 ; st : add ebx, 16*8 cmp ebx, eax jb .loop_i mov esp, ebp pop ebp ret ;================================================================= ; cdecl parameters: ; -- [ebp+8] = N ; -- [ebp+12] = p ; -- [ebp+16] = 4k-aligned data array address ; -- [ebp+20] = 4k-aligned SinCosTable address ; returns: ; -- nothing ; destroys: ; -- all GPRegs ; locals: ; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step) ;; ========================== align 4 step3: push ebp mov ebp, esp sub esp, 120 ; 283 : { ; 293 : for (l=3; l<=p; l++) mov cx, 0x0200 .newstep: inc ch cmp ch, byte[ebp+12] jg .done mov _step, cx ; 294 : { ; 295 : d1 = 1 << (l + l - 3); mov cl, ch add cl, cl sub cl, 3 mov edx, 1 shl edx, cl mov _d1, edx ; 296 : d2 = d1 << 1; shl edx, 1 mov _d2, edx mov eax, edx ; 297 : d3 = d2 << 1; shl edx, 1 mov _d3, edx ; 298 : d4 = d2 + d3; add eax, edx mov _d4, eax ; 299 : d5 = d3 << 1; shl edx, 1 mov _d5, edx shl edx, 3 mov _d6, edx ; d6 = d5*8 to simplify index operations ; 339 : j5 = N / d5; ; moved out of internal loop mov cl, [ebp+12] sub cl, ch add cl, cl mov edx, 1 shl edx, cl mov _j5, edx ; 300 : ; 301 : for (j=0; j f[j+k] ; l2 = l1 + d2; add edx, eax mov _l2, edx ; l3 = l1 + d3; add edx, eax mov _l3, edx ; l4 = l1 + d4; add edx, eax mov _l4, edx ; l5 = j + d2 - k; mov edx, eax sub edx, ecx mov _l5, edx ; l6 = l5 + d2; add edx, eax mov _l6, edx ; l7 = l5 + d3; add edx, eax mov _l7, edx ; l8 = l5 + d4; add edx, eax mov _l8, edx ; 340 : j5 *= k; // add-substituted multiplication mov eax, _jj add eax, _j5 mov _jj, eax ; c1 = C[jj]; ; s1 = S[jj]; mov edi, [ebp+20] fld qword[edi+eax*8] mov esi, [ebp+8] shl esi, 2 add esi, edi fld qword[esi+eax*8] ; st : s1, c1 ; t5 = f[l2] * c1 + f[l6] * s1; ; t8 = f[l6] * c1 - f[l2] * s1; mov edx, _l6 fld qword[ebx+edx*8] mov edx, _l2 fld st0 fmul st0, st2 fxch st1 fmul st0, st3 fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c fmul st4, st0 fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c fstp _t8 faddp st2, st0 ; st : f[l2]*s, t5 fstp st0 ; st : t5 fstp _t5 ; st : ; c2 = C[2*jj]; ; s2 = S[2*jj]; shl eax, 1 fld qword[edi+eax*8] fld qword[esi+eax*8] ; st : s2, c2 ; t6 = f[l3] * c2 + f[l7] * s2; ; t9 = f[l7] * c2 - f[l3] * s2; mov edx, _l7 fld qword[ebx+edx*8] mov edx, _l3 fld st0 fmul st0, st2 fxch st1 fmul st0, st3 fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c fmul st4, st0 fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c fstp _t9 faddp st2, st0 ; st : f[l2]*s, t6 fstp st0 ; st : t6 fstp _t6 ; st : ; c3 = C[3*jj]; ; s3 = S[3*jj]; add eax, _jj fld qword[edi+eax*8] fld qword[esi+eax*8] ; st : s3, c3 ; t7 = f[l4] * c3 + f[l8] * s3; ; t0 = f[l8] * c3 - f[l4] * s3; mov edx, _l8 fld qword[ebx+edx*8] mov edx, _l4 fld st0 fmul st0, st2 fxch st1 fmul st0, st3 fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c fmul st4, st0 fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c fstp _t0 faddp st2, st0 ; st : f[l2]*s, t7 fstp st0 ; st : t7 fstp _t7 ; st : ; t1 = f[l5] - t9; ; t2 = f[l5] + t9; mov eax, _l5 fld qword [ebx+eax*8] fld _t9 fld st0 fadd st0, st2 fstp _t2 fsubp st1, st0 fstp _t1 ; t3 = - t8 - t0; fld _t8 fadd _t0 fchs fstp _t3 ; t4 = t5 - t7; fld _t5 fsub _t7 fstp _t4 ; f[l5] = t1 + t4; fld _t1 fld _t4 fld st0 fadd st0, st2 fstp qword [ebx+eax*8] ; f[l7] = t1 - t4; mov eax, _l7 fsubp st1, st0 fstp qword [ebx+eax*8] ; f[l6] = t2 + t3; mov eax, _l6 fld _t2 fld _t3 fld st0 fadd st0, st2 fstp qword [ebx+eax*8] ; f[l8] = t2 - t3; mov eax, _l8 fsubp st1, st0 fstp qword [ebx+eax*8] ; t1 = f[l1] + t6; mov eax, _l1 fld qword [ebx+eax*8] fld _t6 fld st0 fadd st0, st2 fstp _t1 ; t2 = f[l1] - t6; fsubp st1, st0 fstp _t2 ; t3 = t8 - t0; fld _t8 fsub _t0 fstp _t3 ; t4 = t5 + t7; fld _t5 fadd _t7 fstp _t4 ; f[l1] = t1 + t4; mov eax, _l1 fld _t1 fld _t4 fld st0 fadd st0, st2 fstp qword [ebx+eax*8] ; f[l3] = t1 - t4; mov eax, _l3 fsubp st1, st0 fstp qword [ebx+eax*8] ; f[l2] = t2 + t3; mov eax, _l2 fld _t2 fld _t3 fld st0 fadd st0, st2 fstp qword [ebx+eax*8] ; f[l4] = t2 - t3; mov eax, _l4 fsubp st1, st0 fstp qword [ebx+eax*8] ; 374 : } jmp .next_k .done_k: ; 375 : } add ebx, _d6 ; d6 = d5*8 cmp ebx, _end_of_array jb .next_j ; 376 : } mov cx, _step jmp .newstep .done: mov esp, ebp pop ebp ; 377 : } ret ;=========== Step3 ends here =========== ; ================================================================= ;================================================================= ; parameters: ; -- [ebp+8] = N ; -- [ebp+12] = p ; -- [ebp+16] = 4k-aligned data array address ; -- [ebp+20] = 4k-aligned SinCosTable address ; returns: ; -- nothing ; destroys: ; -- all GPRegs ;; ========================== align 4 FHT_4: push ebp mov ebp, esp mov edx, [ebp+16] add edx, [ebp+12] call BitInvert push dword[ebp+16] push dword[ebp+8] call step1 call step2 pop edx ; N pop ecx ; a push dword[ebp+20] ; t push ecx push dword[ebp+12] ; p push edx ; N call step3 mov esp, ebp pop ebp ret