WebSVN – Kolibri OS – Path Comparison – / – /kernel/branches/Kolibri-A/trunk/sound/ Rev 1640 and /kernel/branches/Kolibri-A/trunk/sound/ Rev 1641

Regard whitespace Rev 1640 → Rev 1641

/kernel/branches/Kolibri-A/trunk/sound/FFT.inc
File deleted

 /kernel/branches/Kolibri-A/trunk/sound/FHT.INC
 ,0 → 1,792
+;           Fast Hartley Transform routine
+;           Copyright (C) 1999, 2004, 2010
+;          Artem Jerdev  artem@jerdev.co.uk
+;
+; free KolibriOS version - not to be ported to other OSes
+; ==========================================================
+; global constants
+align 8
+fht_r      dq      1.41421356237309504880169       ; = sqrt(2)
+fht_r2     dq      0.70710678118654752440084       ; = sqrt(2)/2
+fht_c1     dq      0.92387953251128675612818       ; = cos(pi/8)
+fht_s1     dq      0.38268343236508977172846       ; = sin(pi/8)
+;=================================================================
+; parameter1:
+; -- reg  dl (bits[3:0])   = Power_of_4
+; -- reg edx && (-16) = 4k-aligned data array address
+; returns:
+; -- edx = Power_of_4
+; -- ecx = N
+; destroys:
+; -- eax, ebx, ecx, edx, esi
+;; ==========================
+align 4
+BitInvert:
+        mov     esi, edx
+        and     esi, 0xFFFFFFF0
+        and     edx, 0x0F
+        push    edx
+        mov     cl, dl
+        xor     eax, eax
+        inc     eax
+        shl     eax, cl
+        shl     eax, cl
+        push    eax
+        xor     ecx, ecx                ; index term
+align 4
+.newterm:
+        inc     ecx
+        cmp     ecx, [esp]              ; N
+        jge     .done
+        xor     eax, eax
+        mov     edx, ecx
+        xor     bl, bl
+align 4
+.do_invert:
+        inc     bl
+        cmp     bl, byte[esp+4] ; Power_of_4
+        jg      .switch
+        mov     bh, dl
+        and     bh,  3
+        shl     eax, 2
+        or      al, bh
+        shr     edx, 2
+        jmp     .do_invert
+align 8
+.switch:
+        cmp     eax, ecx
+        jle     .newterm
+        fld     qword [esi+eax*8]
+        fld     qword [esi+ecx*8]
+        fstp    qword [esi+eax*8]
+        fstp    qword [esi+ecx*8]
+        jmp     .newterm
+align 4
+.done:
+        pop     ecx
+        pop     edx
+        ret
+;=================================================================
+;=================================================================
+; stdcall parameters:
+; -- [esp+4]  = N
+; -- [esp+8]  = 4k-aligned data array  address
+; returns:
+; -- nothing
+; destroys:
+; -- ebx, esi
+;; ==========================
+align 4
+step1:
+        mov     ebx, [esp+8]
+        mov     esi, [esp+4]
+        shl     esi, 3
+        add     esi, ebx
+align 4
+.loop:
+        fld     qword[ebx]
+        fld     qword[ebx+8]
+        fld     st1
+        fsub    st0, st1        ; st : t2, f[i+1], f[i]
+        fxch    st1             ; st : f[i+1], t2, f[i]
+        faddp   st2, st0        ; st : t2, t1
+        fld     qword[ebx+16]
+        fld     qword[ebx+24]
+        fld     st1             ; st : f[i+2], f[i+3], f[i+2], t2, t1
+        fadd    st0, st1        ; st : t3, f[i+3], f[i+2], t2, t1
+        fxch    st2             ; st : f[i+2], f[i+3], t3, t2, t1
+        fsub    st0, st1        ; st : t4, f[i+3], t3, t2, t1
+        fstp    st1             ; st : t4, t3, t2, t1
+        fld     st2             ; st : t2, t4, t3, t2, t1
+        fadd    st0, st1        ; st : t2+t4, t4, t3, t2, t1
+        fstp    qword[ebx+16]   ; st : t4, t3, t2, t1
+        fsubp   st2, st0        ; st : t3, t2-t4, t1
+        fld     st2             ; st : t1, t3, t2-t4, t1
+        fadd    st0, st1        ; st : t1+t3, t3, t2-t4, t1
+        fstp    qword[ebx]      ; st : t3, t2-t4, t1
+        fsubp   st2, st0        ; st : t2-t4, t1-t3
+        fstp    qword[ebx+24]   ; st : t1-t3
+        fstp    qword[ebx+8]    ; st : <empty>
+        add     ebx, 32
+        cmp     ebx, esi
+        jnz     .loop
+ret
+;       local stack definitions
+;===========================================================================
+_t0     equ     dword [esp]
+_t1     equ     dword[esp+4]
+_t2     equ     dword[esp+8]
+_t3     equ     dword[esp+12]
+_t4     equ     dword[esp+16]
+_t5     equ     dword[esp+20]
+_t6     equ     dword[esp+24]
+_t7     equ     dword[esp+28]
+_t8     equ     dword[esp+32]
+_t9     equ     dword[esp+36]
+_l1   equ       dword[esp+40]
+_l2   equ       dword[esp+44]
+_l3   equ       dword[esp+48]
+_l4   equ       dword[esp+52]
+_l5   equ       dword[esp+56]
+_l6   equ       dword[esp+60]
+_l7   equ       dword[esp+64]
+_l8   equ       dword[esp+68]
+_l9   equ       dword[esp+72]
+_l0   equ       dword[esp+76]
+_d1   equ       dword[esp+80]
+_d2   equ       dword[esp+84]
+_d3   equ       dword[esp+88]
+_d4   equ       dword[esp+92]
+_d5   equ       dword[esp+96]
+_d6   equ       dword[esp+100]
+_j5   equ       dword[esp+104]
+_jj   equ       dword[esp+108]
+_end_of_array   equ     dword[esp+112]
+_step           equ     word [esp+116]
+;=================================================================
+; cdecl parameters:
+; -- [ebp+8]   = N
+; -- [ebp+12]  = 4k-aligned data array  address
+; returns:
+; -- nothing
+; destroys:
+; -- eax, ebx
+; locals:
+; -- 10 stack-located dwords (_t0 ... _t9)
+;; ==========================
+align 4
+step2:
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 40
+        mov     ebx, [ebp+12]
+        mov     eax, [ebp+ 8]
+        shl     eax, 3
+        add     eax, ebx
+align 4
+.loop_i:
+; -- quad subelements  +0, +4, +8 and +12 (simpliest operations)
+        fld     qword[ebx]
+        fld     qword[ebx+8*4]
+        fld     st0
+        fadd    st0, st2        ; st : t1, f_4, f_0
+        fxch    st1
+        fsubp   st2, st0        ; st : t1, t2
+        fld     qword[ebx+8*8]
+        fld     qword[ebx+8*12]
+        fld     st0
+        fadd    st0, st2        ; st : t3, f_12, t1, t2
+        fxch    st1
+        fsubp   st2, st0        ; st : t3, t4, t1, t2
+        ; ------
+        fld     st2             ; st : t1, t3, t4, t1, t2
+        fadd    st0, st1
+        fstp    qword[ebx]      ; st : t3, t4, t1, t2
+        fsub    st0, st2        ; st : t3-t1, t4, t1, t2
+        fchs                    ; st : t1-t3, t4, t1, t2
+        fstp    qword[ebx+8*4]  ; st : t4, t1, t2
+        fst     st1             ; st : t4, t4, t2
+        fadd    st0, st2        ; st : t2+t4, t4, t2
+        fstp    qword[ebx+8*8]  ; st : t4, t2
+        fsubp   st1, st0        ; st : t2-t4
+        fstp    qword[ebx+8*12] ; st : <empty>
+; -- even subelements  +2, +6, +10 and +14 (2 multiplications needed)
+        fld     qword[ebx+8*2]
+        fld     qword[ebx+8*6]
+        fld     [fht_r]
+        fmul    st1, st0        ; st : r, t2, t1
+        fld     qword[ebx+8*10]
+        fxch    st1             ; st : r, t3, t2, t1
+        fmul    qword[ebx+8*14] ; st : t4, t3, t2, t1
+        ; ------
+        fld     st3             ; st : t1, t4, t3, t2, t1
+        fadd    st0, st3        ;
+        fadd    st0, st2        ;
+        fst     qword[ebx+8*2]  ; store f[i+8] = t1+t2+t3
+        fsub    st0, st3        ;
+        fsub    st0, st3        ;
+        fstp    qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
+        fld     st3             ; st : t1, t4, t3, t2, t1
+        fsub    st0, st2        ;
+        fsub    st0, st1        ;
+        fst     qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
+        fadd    st0, st1        ;
+        faddp   st1, st0        ; st : t1-t3+t4, t3, t2, t1
+        fstp    qword[ebx+8*6]  ; store f[i+6]
+        fstp    st0             ; st : t2, t1
+        fstp    st0             ; st : t1
+        fstp    st0             ; st : <empty>
+; -- odd subelements
+        fld     qword[ebx+8*9]
+        fld     qword[ebx+8*11]
+        fld     st1
+        fsub    st0, st1
+        fxch    st1
+        faddp   st2, st0        ; st : (f[l3]-f[l7]), (f[l3]+f[l7])
+        fld     [fht_r2]
+        fmul    st2, st0
+        fmulp   st1, st0        ; st : t9, t6
+        fld     qword[ebx+8*3]
+        fld     st0
+        fadd    st0, st2        ; st : t1, f[l5], t9, t6
+        fstp    _t1
+        fsub    st0, st1
+        fstp    _t2
+        fstp    _t9     ; (t9 never used)
+        fstp    _t6             ; st : <empty>
+        fld     [fht_c1]
+        fld     [fht_s1]
+        fld     qword[ebx+8*5]
+        fld     qword[ebx+8*7]
+        fld     st3             ; st: c1, f[l6], f[l2], s1, c1
+        fmul    st0, st2        ; st: f_2*c, f_6, f_2, s, c
+        fld     st1             ; st: f_6, f_2*c, f_6, f_2, s, c
+        fmul    st0, st4        ; st: f_6*s, f_2*c, f_6, f_2, s, c
+        faddp   st1, st0        ; st: t5, f_6, f_2, s, c
+        fstp    _t5             ; st: f_6, f_2, s, c
+        fld     st3             ; st: c, f_6, f_2, s, c
+        fmul    st0, st1
+        fld     st3
+        fmul    st0, st3        ; st: f_2*s, f_6*c, f_6, f_2, s, c
+        fsubp   st1, st0        ; st: t8, f_6, f_2, s, c
+        fstp    _t8             ; st: f_6, f_2, s, c
+        fstp    st0             ; st: f_2, s, c
+        fstp    st0             ; st: s, c
+        fld     qword[ebx+8*13]
+        fld     qword[ebx+8*15]
+        fld     st3             ; st: c1, f[l8], f[l4], s1, c1
+        fmul    st0, st1
+        fld     st3
+        fmul    st0, st3        ; st: f_4*s, f_8*c, f_8, f_4, s, c
+        faddp   st1, st0        ; st: t7, f_8, f_4, s, c
+        fld     _t5             ; st: t5, t7, f_8, f_4, s, c
+        fsub    st0, st1        ; st: t4, t7, f_8, f_4, s, c
+        fstp    _t4
+        fstp    _t7             ; st: f_8, f_4, s, c
+        fld     st3             ; st: c, f_8, f_4, s, c
+        fmul    st0, st2
+        fld     st3
+        fmul    st0, st2        ; st: f_8*s, f_4*c, f_8, f_4, s, c
+        fsubp   st1, st0        ; st:-t0, f_8, f_4, s, c
+        fchs
+        fld     _t8
+        fchs                    ; st:-t8, t0, f_8, f_4, s, c
+        fsub    st0, st1        ; st: t3, t0, f_8, f_4, s, c
+        fstp    _t3
+        fstp    _t0             ; st: f_8, f_4, s, c
+        fstp    st0             ; st: f_4, s, c
+        fstp    st0             ; st: s, c
+        fstp    st0             ; st: c
+        fstp    st0             ; st: <empty>
+        fld     _t1
+        fld     _t4
+        fld     st1
+        fsub    st0, st1
+        fstp    qword[ebx+8*11] ; f[l7] = t1-t4
+        faddp   st1, st0
+        fstp    qword[ebx+8*3]  ; f[l5] = t1+t4
+        fld     _t2
+        fld     _t3
+        fld     st1
+        fsub    st0, st1
+        fstp    qword[ebx+8*15] ; f[l8]
+        faddp   st1, st0
+        fstp    qword[ebx+8*7]  ; f[l6]
+        fld     _t6
+        fld     qword[ebx+8]
+        fld     st1
+        fsub    st0, st1
+        fxch    st1
+        faddp   st2, st0        ; st : t2, t1
+        fld     _t8
+        fsub    _t0
+        fld     _t5
+        fadd    _t7             ; st : t4, t3, t2, t1
+        fld     st3
+        fsub    st0, st1
+        fstp    qword[ebx+8*9]  ; f[l3] = t1-t4
+        fadd    st0, st3
+        fstp    qword[ebx+8]    ; f[l1] = t1+t4
+        fld     st1             ; st : t2, t3, t2, t1
+        fsub    st0, st1        ; f[l4] = t2-t3
+        fstp    qword[ebx+8*13] ; st : t3, t2, t1
+        faddp   st1, st0        ; st : t2+t3, t1
+        fstp    qword[ebx+8*5]  ; f[l2] = t2+t3
+        fstp    st0             ; st : <empty>
+        add     ebx, 16*8
+        cmp     ebx, eax
+        jb      .loop_i
+        mov     esp, ebp
+        pop     ebp
+ret
+;=================================================================
+; cdecl parameters:
+; -- [ebp+8]   = N
+; -- [ebp+12]  = p
+; -- [ebp+16]  = 4k-aligned data array  address
+; -- [ebp+20]  = 4k-aligned SinCosTable address
+; returns:
+; -- nothing
+; destroys:
+; -- all GPRegs
+; locals:
+; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
+;; ==========================
+align 4
+step3:
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 120
+; 283  : {
+; 293  :   for (l=3; l<=p; l++)
+        mov     cx, 0x0200
+align 4
+.newstep:
+        inc     ch
+        cmp     ch, byte[ebp+12]
+        jg      .done
+        mov     _step, cx
+; 294  :   {
+; 295  :     d1 = 1 << (l + l - 3);
+        mov     cl, ch
+        add     cl, cl
+        sub     cl, 3
+        mov     edx, 1
+        shl     edx, cl
+        mov     _d1, edx
+; 296  :     d2 = d1 << 1;
+        shl     edx, 1
+        mov     _d2, edx
+        mov     eax, edx
+; 297  :     d3 = d2 << 1;
+        shl     edx, 1
+        mov     _d3, edx
+; 298  :     d4 = d2 + d3;
+        add     eax, edx
+        mov     _d4, eax
+; 299  :     d5 = d3 << 1;
+        shl     edx, 1
+        mov     _d5, edx
+        shl     edx, 3
+        mov     _d6, edx        ; d6 = d5*8 to simplify index operations
+; 339  :         j5 = N / d5;   ; moved out of internal loop
+        mov     cl, [ebp+12]
+        sub     cl, ch
+        add     cl, cl
+        mov     edx, 1
+        shl     edx, cl
+        mov     _j5, edx
+; 300  :
+; 301  :     for (j=0; j<N; j+=d5)
+        mov     ebx, [ebp+16]
+        mov     esi, [ebp+8]
+        shl     esi, 3
+        add     esi, ebx
+        mov     _end_of_array, esi
+align 4
+.next_j:
+; {
+; t1 = f[j] + f[j+d2];
+        mov     eax, _d2
+        fld     qword[ebx]
+        fld     qword[ebx+eax*8]
+        fld     st1
+        fadd    st0, st1
+        fstp    _t1
+; t2 = f[j] - f[j+d2];
+        fsubp   st1, st0
+        fstp    _t2
+; t3 = f[j+d3] + f[j+d4];
+        mov     edi, _d3
+        fld     qword[ebx+edi*8]
+        mov     edx, _d4
+        fld     qword[ebx+edx*8]
+        fld     st1
+        fsub    st0, st1                ; st : t4, f4, f3
+        fxch    st1                     ; st : f4, t4, f3
+; t4 = f[j+d3] - f[j+d4];
+        faddp   st2, st0                ; st : t4, t3
+; f[j+d4] = t2 - t4;
+; f[j+d3] = t2 + t4;
+        fld     _t2
+        fld     st0
+        fsub    st0, st2                ; st : f4, t2, t4, t3
+        fstp    qword[ebx+edx*8]        ; st : t2, t4, t3
+        fadd    st0, st1                ; st : f3, t4, t3
+        fstp    qword[ebx+edi*8]        ; st : t4, t3
+; f[j+d2] = t1 - t3;
+; f[j]    = t1 + t3;
+        fld     _t1
+        fst     st1
+        fsub    st0, st2                ; st : f2, t1, t3
+        fstp    qword[ebx+eax*8]        ; st : t1, t3
+        fadd    st0, st1                ; st : f0, t3
+        fstp    qword[ebx]              ; st : t3
+        fstp    st0
+; jj = j + d1;     / ??
+        mov     edi, _d1
+        shl     edi, 3          ; = d1*8
+        mov     edx, edi
+        mov     eax, edi
+        add     eax, eax        ; eax = d2*8
+        shl     edx, 2          ; = d3*8
+        add     edi, ebx        ; now [edi] points to f[jj]
+        add     edx, edi        ; and [edx] points to f[jj+d3]
+; t1 = f[jj];
+        fld     qword [edi]     ; st : t1
+; t3 = f[jj+d3];
+        fld     qword [edx]     ; st : t3, t1
+; t2 = f[jj+d2] * r;
+        fld     qword [edi+eax]
+        fld     [fht_r]
+        fmul    st1, st0        ; st : r,  t2, t3, t1
+; t4 = f[jj+d4] * r
+        fmul    qword [edx+eax] ; st : t4, t2, t3, t1
+; f[jj]    = t1 + t2 + t3;
+        fld     st3             ; st : t1, t4, t2, t3, t1
+        fadd    st0, st3
+        fadd    st0, st2
+        fstp    qword [edi]
+; f[jj+d2] = t1 - t3 + t4;
+        fld     st3
+        fsub    st0, st3        ; st : (t1-t3), t4, t2, t3, t1
+        fld     st0
+        fadd    st0, st2        ; st : f2, (t1-t3), t4, t2, t3, t1
+        fstp    qword [edi+eax]
+; f[jj+d4] = t1 - t3 - t4;
+        fsub    st0, st1        ; st : f4, t4, t2, t3, t1
+        fstp    qword [edx+eax]
+; f[jj+d3] = t1 - t2 + t3;
+        fstp    st0             ; st : t2, t3,  t1
+        fsubp   st1, st0        ; st : (t3-t2), t1
+        faddp   st1, st0        ; st : f3
+        fstp    qword [edx]
+; for (k=1; k<d1; k++)
+        xor     ecx, ecx        ; ecx = k
+        mov     _jj, ecx
+align 4
+.next_k:
+        inc     ecx
+        cmp     ecx, _d1
+        jge     .done_k
+; {
+        mov     eax, _d2        ; the sector increment
+; l1 = j  + k;
+        mov     edx, ecx
+        mov     _l1, edx        ; [ebx+edx*8] --> f[j+k]
+; l2 = l1 + d2;
+        add     edx, eax
+        mov     _l2, edx
+; l3 = l1 + d3;
+        add     edx, eax
+        mov     _l3, edx
+; l4 = l1 + d4;
+        add     edx, eax
+        mov     _l4, edx
+; l5 = j  + d2 - k;
+        mov     edx, eax
+        sub     edx, ecx
+        mov     _l5, edx
+; l6 = l5 + d2;
+        add     edx, eax
+        mov     _l6, edx
+; l7 = l5 + d3;
+        add     edx, eax
+        mov     _l7, edx
+; l8 = l5 + d4;
+        add     edx, eax
+        mov     _l8, edx
+; 340  :         j5 *= k;       // add-substituted multiplication
+        mov     eax, _jj
+        add     eax, _j5
+        mov     _jj, eax
+; c1 = C[jj];
+; s1 = S[jj];
+        mov     edi, [ebp+20]
+        fld     qword[edi+eax*8]
+        mov     esi, [ebp+8]
+        shl     esi, 2
+        add     esi, edi
+        fld     qword[esi+eax*8]        ; st : s1, c1
+; t5 = f[l2] * c1 + f[l6] * s1;
+; t8 = f[l6] * c1 - f[l2] * s1;
+        mov     edx, _l6
+        fld     qword[ebx+edx*8]
+        mov     edx, _l2
+        fld     st0
+        fmul    st0, st2
+        fxch    st1
+        fmul    st0, st3
+        fld     qword[ebx+edx*8]        ; st : f[l2], f[l6]*c, f[l6]*s, s, c
+        fmul    st4, st0
+        fmulp   st3, st0                ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
+        fsub    st0, st2                ; st :   t8,    f[l6]*s, f[l2]*s, f[l2]*c
+        fstp    _t8
+        faddp   st2, st0                ; st :  f[l2]*s, t5
+        fstp    st0                     ; st :  t5
+        fstp    _t5                     ; st :  <empty>
+; c2 = C[2*jj];
+; s2 = S[2*jj];
+        shl     eax, 1
+        fld     qword[edi+eax*8]
+        fld     qword[esi+eax*8]        ; st : s2, c2
+; t6 = f[l3] * c2 + f[l7] * s2;
+; t9 = f[l7] * c2 - f[l3] * s2;
+        mov     edx, _l7
+        fld     qword[ebx+edx*8]
+        mov     edx, _l3
+        fld     st0
+        fmul    st0, st2
+        fxch    st1
+        fmul    st0, st3
+        fld     qword[ebx+edx*8]        ; st : f[l3], f[l7]*c, f[l7]*s, s, c
+        fmul    st4, st0
+        fmulp   st3, st0                ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
+        fsub    st0, st2                ; st :   t9,    f[l7]*s, f[l3]*s, f[l3]*c
+        fstp    _t9
+        faddp   st2, st0                ; st :  f[l2]*s, t6
+        fstp    st0                     ; st :  t6
+        fstp    _t6                     ; st :  <empty>
+; c3 = C[3*jj];
+; s3 = S[3*jj];
+        add     eax, _jj
+        fld     qword[edi+eax*8]
+        fld     qword[esi+eax*8]        ; st : s3, c3
+; t7 = f[l4] * c3 + f[l8] * s3;
+; t0 = f[l8] * c3 - f[l4] * s3;
+        mov     edx, _l8
+        fld     qword[ebx+edx*8]
+        mov     edx, _l4
+        fld     st0
+        fmul    st0, st2
+        fxch    st1
+        fmul    st0, st3
+        fld     qword[ebx+edx*8]        ; st : f[l4], f[l8]*c, f[l8]*s, s, c
+        fmul    st4, st0
+        fmulp   st3, st0                ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
+        fsub    st0, st2                ; st :   t9,    f[l8]*s, f[l4]*s, f[l4]*c
+        fstp    _t0
+        faddp   st2, st0                ; st : f[l2]*s, t7
+        fstp    st0                     ; st :  t7
+        fstp    _t7                     ; st :  <empty>
+; t1 = f[l5] - t9;
+; t2 = f[l5] + t9;
+        mov     eax, _l5
+        fld     qword [ebx+eax*8]
+        fld     _t9
+        fld     st0
+        fadd    st0, st2
+        fstp    _t2
+        fsubp   st1, st0
+        fstp    _t1
+; t3 = - t8  - t0;
+        fld     _t8
+        fadd    _t0
+        fchs
+        fstp    _t3
+; t4 =   t5  - t7;
+        fld     _t5
+        fsub    _t7
+        fstp    _t4
+; f[l5] = t1 + t4;
+        fld     _t1
+        fld     _t4
+        fld     st0
+        fadd    st0, st2
+        fstp    qword [ebx+eax*8]
+; f[l7] = t1 - t4;
+        mov     eax, _l7
+        fsubp   st1, st0
+        fstp    qword [ebx+eax*8]
+; f[l6] = t2 + t3;
+        mov     eax, _l6
+        fld     _t2
+        fld     _t3
+        fld     st0
+        fadd    st0, st2
+        fstp    qword [ebx+eax*8]
+; f[l8] = t2 - t3;
+        mov     eax, _l8
+        fsubp   st1, st0
+        fstp    qword [ebx+eax*8]
+; t1 = f[l1] + t6;
+        mov     eax, _l1
+        fld     qword [ebx+eax*8]
+        fld     _t6
+        fld     st0
+        fadd    st0, st2
+        fstp    _t1
+; t2 = f[l1] - t6;
+        fsubp   st1, st0
+        fstp    _t2
+; t3 =    t8 - t0;
+        fld     _t8
+        fsub    _t0
+        fstp    _t3
+; t4 =    t5 + t7;
+        fld     _t5
+        fadd    _t7
+        fstp    _t4
+; f[l1] = t1 + t4;
+        mov     eax, _l1
+        fld     _t1
+        fld     _t4
+      fld     st0
+        fadd    st0, st2
+        fstp    qword [ebx+eax*8]
+; f[l3] = t1 - t4;
+        mov     eax, _l3
+        fsubp   st1, st0
+        fstp    qword [ebx+eax*8]
+; f[l2] = t2 + t3;
+        mov     eax, _l2
+        fld     _t2
+        fld     _t3
+        fld     st0
+        fadd    st0, st2
+        fstp    qword [ebx+eax*8]
+; f[l4] = t2 - t3;
+        mov     eax, _l4
+        fsubp   st1, st0
+        fstp    qword [ebx+eax*8]
+; 374  :       }
+        jmp     .next_k
+align 4
+.done_k:
+; 375  :     }
+        add     ebx, _d6        ; d6 = d5*8
+        cmp     ebx, _end_of_array
+        jb      .next_j
+; 376  :   }
+        mov     cx, _step
+        jmp     .newstep
+.done:
+        mov     esp, ebp
+        pop     ebp
+; 377  : }
+        ret
+                ;=========== Step3 ends here ===========
+; =================================================================
+;=================================================================
+; parameters:
+; -- [ebp+12]   = N
+; -- [ebp+16]  = p
+; -- [ebp+20]  = 4k-aligned data array  address
+; -- [ebp+24]  = 4k-aligned SinCosTable address
+; returns:
+; -- nothing
+; destroys:
+; -- all GPRegs
+;; ==========================
+align 4
+FHT_4:
+        push    ebp
+        mov     ebp, esp
+        mov     edx, [ebp+20]   ; a
+        mov     dl, byte[ebp+16]
+        call BitInvert
+        push    dword[ebp+20]   ; a
+        push    ecx             ; N
+        call    step1           ; 4-point transform
+        cmp     cl, 1
+        jz      .done
+        call    step2           ; 16-point transform
+        cmp     byte[ebp+16],1  ; p = 2 ?
+        jz      .done
+        pop     edx             ; N
+        pop     ecx             ; a
+        push    dword[ebp+24]   ; t
+        push    ecx
+        push    dword[ebp+16]   ; p
+        push    edx             ; N
+        call    step3
+.done:
+        mov     esp, ebp
+        pop     ebp
+ret

Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 1640 → Rev 1641