Rev 2215 | Go to most recent revision | Only display areas with differences | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 2215 | Rev 3474 | ||
---|---|---|---|
1 | ; Fast Hartley Transform routine |
1 | ; Fast Hartley Transform routine |
2 | ; Copyright (C) 1999, 2004, 2010 |
2 | ; Copyright (C) 1999, 2004, 2010 |
3 | ; Artem Jerdev artem@jerdev.co.uk |
3 | ; Artem Jerdev artem@jerdev.co.uk |
4 | ; |
4 | ; |
5 | ; free KolibriOS version - not to be ported to other OSes |
5 | ; free KolibriOS version - not to be ported to other OSes |
6 | ; ========================================================== |
6 | ; ========================================================== |
7 | 7 | ||
8 | 8 | ||
9 | ; global constants |
9 | ; global constants |
10 | align 8 |
10 | align 8 |
11 | _r dq 1.41421356237309504880169 ; = sqrt(2) |
11 | _r dq 1.41421356237309504880169 ; = sqrt(2) |
12 | _r2 dq 0.70710678118654752440084 ; = sqrt(2)/2 |
12 | _r2 dq 0.70710678118654752440084 ; = sqrt(2)/2 |
13 | _c1 dq 0.92387953251128675612818 ; = cos(pi/8) |
13 | _c1 dq 0.92387953251128675612818 ; = cos(pi/8) |
14 | _s1 dq 0.38268343236508977172846 ; = sin(pi/8) |
14 | _s1 dq 0.38268343236508977172846 ; = sin(pi/8) |
15 | 15 | ||
16 | ;================================================================= |
16 | ;================================================================= |
17 | ; parameter1: |
17 | ; parameter1: |
18 | ; -- reg dl (bits[3:0]) = Power_of_4 |
18 | ; -- reg dl (bits[3:0]) = Power_of_4 |
19 | ; returns: |
19 | ; returns: |
20 | ; -- reg edx = _CosTable address (4k-aligned) |
20 | ; -- reg edx = _CosTable address (4k-aligned) |
21 | ; assumes: _SinTable = _CosTable + (N/2)*8 |
21 | ; assumes: _SinTable = _CosTable + (N/2)*8 |
22 | ; user heap has to be initialized |
22 | ; user heap has to be initialized |
23 | ; destroys: |
23 | ; destroys: |
24 | ; -- eax, ebx, ecx |
24 | ; -- eax, ebx, ecx |
25 | ;; ========================== |
25 | ;; ========================== |
26 | align 4 |
26 | align 4 |
27 | CreateSinCosTable: |
27 | CreateSinCosTable: |
28 | xor eax, eax |
28 | xor eax, eax |
29 | inc eax |
29 | inc eax |
30 | mov cl, dl |
30 | mov cl, dl |
31 | and cl, 15 |
31 | and cl, 15 |
32 | shl eax, cl |
32 | shl eax, cl |
33 | shl eax, cl |
33 | shl eax, cl |
34 | mov ecx, eax ; now ecx = N |
34 | mov ecx, eax ; now ecx = N |
35 | shl ecx, 3 |
35 | shl ecx, 3 |
36 | mov ebx, 12 |
36 | mov ebx, 12 |
37 | mov eax, 68 |
37 | mov eax, 68 |
38 | int 0x40 ; getmem(N*sizeof(double)) |
38 | int 0x40 ; getmem(N*sizeof(double)) |
39 | 39 | ||
40 | mov edx, eax ; edx = _CosTable |
40 | mov edx, eax ; edx = _CosTable |
41 | shr ecx, 1 |
41 | shr ecx, 1 |
42 | mov ebx, eax |
42 | mov ebx, eax |
43 | add ebx, ecx ; ebx = _SinTable |
43 | add ebx, ecx ; ebx = _SinTable |
44 | shr ecx, 3 |
44 | shr ecx, 3 |
45 | push ecx ; [esp] = ecx = N/2 |
45 | push ecx ; [esp] = ecx = N/2 |
46 | 46 | ||
47 | xor eax, eax |
47 | xor eax, eax |
48 | fldpi |
48 | fldpi |
49 | fidiv dword[esp] ; st : dx = 2*pi/N |
49 | fidiv dword[esp] ; st : dx = 2*pi/N |
50 | pop ecx |
50 | pop ecx |
51 | fldz ; st : 0, dx |
51 | fldz ; st : 0, dx |
52 | .loop: |
52 | .loop: |
53 | fld st0 ; st : x, x, dx |
53 | fld st0 ; st : x, x, dx |
54 | FSINCOS ; st : cos, sin, x, dx |
54 | FSINCOS ; st : cos, sin, x, dx |
55 | fstp qword [edx+eax*8] ; st : sin, x, dx |
55 | fstp qword [edx+eax*8] ; st : sin, x, dx |
56 | fstp qword [ebx+eax*8] ; st : x, dx |
56 | fstp qword [ebx+eax*8] ; st : x, dx |
57 | fadd st0, st1 ; st : x+dx, dx |
57 | fadd st0, st1 ; st : x+dx, dx |
58 | 58 | ||
59 | inc eax |
59 | inc eax |
60 | cmp eax, ecx |
60 | cmp eax, ecx |
61 | jne .loop |
61 | jne .loop |
62 | fstp st0 ; st : dx |
62 | fstp st0 ; st : dx |
63 | fstp st0 ; st : |
63 | fstp st0 ; st : |
64 | ret |
64 | ret |
65 | 65 | ||
66 | ;================================================================= |
66 | ;================================================================= |
67 | ; parameter1: |
67 | ; parameter1: |
68 | ; -- reg edx = _CosTable address |
68 | ; -- reg edx = _CosTable address |
69 | ; destroys: |
69 | ; destroys: |
70 | ; -- eax, ebx, ecx |
70 | ; -- eax, ebx, ecx |
71 | ;; ========================== |
71 | ;; ========================== |
72 | align 4 |
72 | align 4 |
73 | DestroySinCosTable: |
73 | DestroySinCosTable: |
74 | mov ecx, edx |
74 | mov ecx, edx |
75 | mov ebx, 13 |
75 | mov ebx, 13 |
76 | mov eax, 68 |
76 | mov eax, 68 |
77 | int 0x40 ; free(SinCosTable) |
77 | int 0x40 ; free(SinCosTable) |
78 | ret |
78 | ret |
79 | 79 | ||
80 | ;================================================================= |
80 | ;================================================================= |
81 | ; parameter1: |
81 | ; parameter1: |
82 | ; -- reg dl (bits[3:0]) = Power_of_4 |
82 | ; -- reg dl (bits[3:0]) = Power_of_4 |
83 | ; -- reg edx & (-16) = 4k-aligned data array address |
83 | ; -- reg edx & (-16) = 4k-aligned data array address |
84 | ; returns: |
84 | ; returns: |
85 | ; -- edx = Power_of_4 |
85 | ; -- edx = Power_of_4 |
86 | ; -- ecx = N |
86 | ; -- ecx = N |
87 | ; destroys: |
87 | ; destroys: |
88 | ; -- eax, ebx, ecx, edx, esi |
88 | ; -- eax, ebx, ecx, edx, esi |
89 | ;; ========================== |
89 | ;; ========================== |
90 | align 4 |
90 | align 4 |
91 | BitInvert: |
91 | BitInvert: |
92 | mov esi, edx |
92 | mov esi, edx |
93 | and esi, 0xFFFFFFF0 |
93 | and esi, 0xFFFFFFF0 |
94 | and edx, 0x0F |
94 | and edx, 0x0F |
95 | push edx |
95 | push edx |
96 | mov cl, dl |
96 | mov cl, dl |
97 | xor eax, eax |
97 | xor eax, eax |
98 | inc eax |
98 | inc eax |
99 | shl eax, cl |
99 | shl eax, cl |
100 | shl eax, cl |
100 | shl eax, cl |
101 | push eax |
101 | push eax |
102 | xor ecx, ecx ; index term |
102 | xor ecx, ecx ; index term |
103 | .newterm: |
103 | .newterm: |
104 | inc ecx |
104 | inc ecx |
105 | cmp ecx, [esp] ; N |
105 | cmp ecx, [esp] ; N |
106 | jge .done |
106 | jge .done |
107 | 107 | ||
108 | xor eax, eax |
108 | xor eax, eax |
109 | mov edx, ecx |
109 | mov edx, ecx |
110 | xor bl, bl |
110 | xor bl, bl |
111 | 111 | ||
112 | .do_invert: |
112 | .do_invert: |
113 | inc bl |
113 | inc bl |
114 | cmp bl, byte[esp+4] ; Power_of_4 |
114 | cmp bl, byte[esp+4] ; Power_of_4 |
115 | jg .switch |
115 | jg .switch |
116 | 116 | ||
117 | mov bh, dl |
117 | mov bh, dl |
118 | and bh, 3 |
118 | and bh, 3 |
119 | shl eax, 2 |
119 | shl eax, 2 |
120 | or al, bh |
120 | or al, bh |
121 | shr edx, 2 |
121 | shr edx, 2 |
122 | jmp .do_invert |
122 | jmp .do_invert |
123 | 123 | ||
124 | .switch: |
124 | .switch: |
125 | cmp eax, ecx |
125 | cmp eax, ecx |
126 | jle .newterm |
126 | jle .newterm |
127 | 127 | ||
128 | fld qword [esi+eax*8] |
128 | fld qword [esi+eax*8] |
129 | fld qword [esi+ecx*8] |
129 | fld qword [esi+ecx*8] |
130 | fstp qword [esi+eax*8] |
130 | fstp qword [esi+eax*8] |
131 | fstp qword [esi+ecx*8] |
131 | fstp qword [esi+ecx*8] |
132 | jmp .newterm |
132 | jmp .newterm |
133 | 133 | ||
134 | .done: |
134 | .done: |
135 | pop ecx |
135 | pop ecx |
136 | pop edx |
136 | pop edx |
137 | ret |
137 | ret |
138 | 138 | ||
139 | ;================================================================= |
139 | ;================================================================= |
140 | 140 | ||
141 | 141 | ||
142 | ;================================================================= |
142 | ;================================================================= |
143 | ; stdcall parameters: |
143 | ; stdcall parameters: |
144 | ; -- [esp+4] = N |
144 | ; -- [esp+4] = N |
145 | ; -- [esp+8] = 4k-aligned data array address |
145 | ; -- [esp+8] = 4k-aligned data array address |
146 | ; returns: |
146 | ; returns: |
147 | ; -- nothing |
147 | ; -- nothing |
148 | ; destroys: |
148 | ; destroys: |
149 | ; -- ebx, esi |
149 | ; -- ebx, esi |
150 | ;; ========================== |
150 | ;; ========================== |
151 | align 4 |
151 | align 4 |
152 | step1: |
152 | step1: |
153 | mov ebx, [esp+8] |
153 | mov ebx, [esp+8] |
154 | mov esi, [esp+4] |
154 | mov esi, [esp+4] |
155 | shl esi, 3 |
155 | shl esi, 3 |
156 | add esi, ebx |
156 | add esi, ebx |
157 | 157 | ||
158 | .loop: |
158 | .loop: |
159 | fld qword[ebx] |
159 | fld qword[ebx] |
160 | fld qword[ebx+8] |
160 | fld qword[ebx+8] |
161 | fld st1 |
161 | fld st1 |
162 | fsub st0, st1 ; st : t2, f[i+1], f[i] |
162 | fsub st0, st1 ; st : t2, f[i+1], f[i] |
163 | fxch st1 ; st : f[i+1], t2, f[i] |
163 | fxch st1 ; st : f[i+1], t2, f[i] |
164 | faddp st2, st0 ; st : t2, t1 |
164 | faddp st2, st0 ; st : t2, t1 |
165 | fld qword[ebx+16] |
165 | fld qword[ebx+16] |
166 | fld qword[ebx+24] |
166 | fld qword[ebx+24] |
167 | fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 |
167 | fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 |
168 | fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 |
168 | fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 |
169 | fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 |
169 | fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 |
170 | fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 |
170 | fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 |
171 | fstp st1 ; st : t4, t3, t2, t1 |
171 | fstp st1 ; st : t4, t3, t2, t1 |
172 | fld st2 ; st : t2, t4, t3, t2, t1 |
172 | fld st2 ; st : t2, t4, t3, t2, t1 |
173 | fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 |
173 | fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 |
174 | fstp qword[ebx+16] ; st : t4, t3, t2, t1 |
174 | fstp qword[ebx+16] ; st : t4, t3, t2, t1 |
175 | fsubp st2, st0 ; st : t3, t2-t4, t1 |
175 | fsubp st2, st0 ; st : t3, t2-t4, t1 |
176 | fld st2 ; st : t1, t3, t2-t4, t1 |
176 | fld st2 ; st : t1, t3, t2-t4, t1 |
177 | fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 |
177 | fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 |
178 | fstp qword[ebx] ; st : t3, t2-t4, t1 |
178 | fstp qword[ebx] ; st : t3, t2-t4, t1 |
179 | fsubp st2, st0 ; st : t2-t4, t1-t3 |
179 | fsubp st2, st0 ; st : t2-t4, t1-t3 |
180 | fstp qword[ebx+24] ; st : t1-t3 |
180 | fstp qword[ebx+24] ; st : t1-t3 |
181 | fstp qword[ebx+8] ; st : |
181 | fstp qword[ebx+8] ; st : |
182 | 182 | ||
183 | add ebx, 32 |
183 | add ebx, 32 |
184 | cmp ebx, esi |
184 | cmp ebx, esi |
185 | jnz .loop |
185 | jnz .loop |
186 | ret |
186 | ret |
187 | 187 | ||
188 | ;================================================================= |
188 | ;================================================================= |
189 | ; SSE3 version: Step1 |
189 | ; SSE3 version: Step1 |
190 | ; |
190 | ; |
191 | ;========================== |
191 | ;========================== |
192 | 192 | ||
193 | align 4 |
193 | align 4 |
194 | step1_sse: |
194 | step1_sse: |
195 | mov ebx, [esp+8] |
195 | mov ebx, [esp+8] |
196 | mov esi, [esp+4] |
196 | mov esi, [esp+4] |
197 | shl esi, 3 |
197 | shl esi, 3 |
198 | add esi, ebx |
198 | add esi, ebx |
199 | 199 | ||
200 | .loop: |
200 | .loop: |
201 | movddup xmm0, [ebx] ; xmm0: f0 ; f0 |
201 | movddup xmm0, [ebx] ; xmm0: f0 ; f0 |
202 | movddup xmm1, [ebx+8] ; xmm1: f1 ; f1 |
202 | movddup xmm1, [ebx+8] ; xmm1: f1 ; f1 |
203 | addsubpd xmm0, xmm1 ; xmm0: t1 ; t2 ( + - ) |
203 | addsubpd xmm0, xmm1 ; xmm0: t1 ; t2 ( + - ) |
204 | movddup xmm1, [ebx+16] ; xmm1: f2 ; f2 |
204 | movddup xmm1, [ebx+16] ; xmm1: f2 ; f2 |
205 | movddup xmm2, [ebx+24] ; xmm2: f3 ; f3 |
205 | movddup xmm2, [ebx+24] ; xmm2: f3 ; f3 |
206 | addsubpd xmm1, xmm2 ; xmm1: t3 ; t4 ( + - ) |
206 | addsubpd xmm1, xmm2 ; xmm1: t3 ; t4 ( + - ) |
207 | 207 | ||
208 | movddup xmm2, xmm0 ; xmm2: t2 ; t2 |
208 | movddup xmm2, xmm0 ; xmm2: t2 ; t2 |
209 | movddup xmm3, xmm1 ; xmm3: t4 ; t4 |
209 | movddup xmm3, xmm1 ; xmm3: t4 ; t4 |
210 | addsubpd xmm2, xmm3 ; xmm2: 2+4; 2-4 |
210 | addsubpd xmm2, xmm3 ; xmm2: 2+4; 2-4 |
211 | shufpd xmm2, xmm2, 1 ; xmm2: 2-4; 2+4 |
211 | shufpd xmm2, xmm2, 1 ; xmm2: 2-4; 2+4 |
212 | movapd [ebx+16], xmm2 |
212 | movapd [ebx+16], xmm2 |
213 | 213 | ||
214 | shufpd xmm0, xmm0, 1 ; xmm0: t2 ; t1 |
214 | shufpd xmm0, xmm0, 1 ; xmm0: t2 ; t1 |
215 | shufpd xmm1, xmm1, 1 ; xmm1: t4 ; t3 |
215 | shufpd xmm1, xmm1, 1 ; xmm1: t4 ; t3 |
216 | movddup xmm2, xmm0 ; xmm2: t1 ; t1 |
216 | movddup xmm2, xmm0 ; xmm2: t1 ; t1 |
217 | movddup xmm3, xmm1 ; xmm3: t3 ; t3 |
217 | movddup xmm3, xmm1 ; xmm3: t3 ; t3 |
218 | addsubpd xmm2, xmm3 ; xmm2: 1+3; 1-3 |
218 | addsubpd xmm2, xmm3 ; xmm2: 1+3; 1-3 |
219 | shufpd xmm2, xmm2, 1 ; xmm2: 1-3; 1+3 |
219 | shufpd xmm2, xmm2, 1 ; xmm2: 1-3; 1+3 |
220 | movapd [ebx], xmm2 |
220 | movapd [ebx], xmm2 |
221 | 221 | ||
222 | add ebx, 32 |
222 | add ebx, 32 |
223 | cmp ebx, esi |
223 | cmp ebx, esi |
224 | jnz .loop |
224 | jnz .loop |
225 | ret |
225 | ret |
226 | 226 | ||
227 | ; local stack definitions |
227 | ; local stack definitions |
228 | ;=========================================================================== |
228 | ;=========================================================================== |
229 | _t0 equ dword [esp] |
229 | _t0 equ dword [esp] |
230 | _t1 equ dword[esp+4] |
230 | _t1 equ dword[esp+4] |
231 | _t2 equ dword[esp+8] |
231 | _t2 equ dword[esp+8] |
232 | _t3 equ dword[esp+12] |
232 | _t3 equ dword[esp+12] |
233 | _t4 equ dword[esp+16] |
233 | _t4 equ dword[esp+16] |
234 | _t5 equ dword[esp+20] |
234 | _t5 equ dword[esp+20] |
235 | _t6 equ dword[esp+24] |
235 | _t6 equ dword[esp+24] |
236 | _t7 equ dword[esp+28] |
236 | _t7 equ dword[esp+28] |
237 | _t8 equ dword[esp+32] |
237 | _t8 equ dword[esp+32] |
238 | _t9 equ dword[esp+36] |
238 | _t9 equ dword[esp+36] |
239 | 239 | ||
240 | _l1 equ dword[esp+40] |
240 | _l1 equ dword[esp+40] |
241 | _l2 equ dword[esp+44] |
241 | _l2 equ dword[esp+44] |
242 | _l3 equ dword[esp+48] |
242 | _l3 equ dword[esp+48] |
243 | _l4 equ dword[esp+52] |
243 | _l4 equ dword[esp+52] |
244 | _l5 equ dword[esp+56] |
244 | _l5 equ dword[esp+56] |
245 | _l6 equ dword[esp+60] |
245 | _l6 equ dword[esp+60] |
246 | _l7 equ dword[esp+64] |
246 | _l7 equ dword[esp+64] |
247 | _l8 equ dword[esp+68] |
247 | _l8 equ dword[esp+68] |
248 | _l9 equ dword[esp+72] |
248 | _l9 equ dword[esp+72] |
249 | _l0 equ dword[esp+76] |
249 | _l0 equ dword[esp+76] |
250 | _d1 equ dword[esp+80] |
250 | _d1 equ dword[esp+80] |
251 | _d2 equ dword[esp+84] |
251 | _d2 equ dword[esp+84] |
252 | _d3 equ dword[esp+88] |
252 | _d3 equ dword[esp+88] |
253 | _d4 equ dword[esp+92] |
253 | _d4 equ dword[esp+92] |
254 | _d5 equ dword[esp+96] |
254 | _d5 equ dword[esp+96] |
255 | _d6 equ dword[esp+100] |
255 | _d6 equ dword[esp+100] |
256 | _j5 equ dword[esp+104] |
256 | _j5 equ dword[esp+104] |
257 | _jj equ dword[esp+108] |
257 | _jj equ dword[esp+108] |
258 | _end_of_array equ dword[esp+112] |
258 | _end_of_array equ dword[esp+112] |
259 | _step equ word [esp+116] |
259 | _step equ word [esp+116] |
260 | 260 | ||
261 | 261 | ||
262 | ;================================================================= |
262 | ;================================================================= |
263 | ; cdecl parameters: |
263 | ; cdecl parameters: |
264 | ; -- [ebp+8] = N |
264 | ; -- [ebp+8] = N |
265 | ; -- [ebp+12] = 4k-aligned data array address |
265 | ; -- [ebp+12] = 4k-aligned data array address |
266 | ; returns: |
266 | ; returns: |
267 | ; -- nothing |
267 | ; -- nothing |
268 | ; destroys: |
268 | ; destroys: |
269 | ; -- eax, ebx |
269 | ; -- eax, ebx |
270 | ; locals: |
270 | ; locals: |
271 | ; -- 10 stack-located dwords (_t0 ... _t9) |
271 | ; -- 10 stack-located dwords (_t0 ... _t9) |
272 | ;; ========================== |
272 | ;; ========================== |
273 | align 4 |
273 | align 4 |
274 | step2: |
274 | step2: |
275 | push ebp |
275 | push ebp |
276 | mov ebp, esp |
276 | mov ebp, esp |
277 | sub esp, 40 |
277 | sub esp, 40 |
278 | mov ebx, [ebp+12] |
278 | mov ebx, [ebp+12] |
279 | mov eax, [ebp+ 8] |
279 | mov eax, [ebp+ 8] |
280 | shl eax, 3 |
280 | shl eax, 3 |
281 | add eax, ebx |
281 | add eax, ebx |
282 | 282 | ||
283 | .loop_i: |
283 | .loop_i: |
284 | 284 | ||
285 | ; -- quad subelements +0, +4, +8 and +12 (simplest operations) |
285 | ; -- quad subelements +0, +4, +8 and +12 (simplest operations) |
286 | fld qword[ebx] |
286 | fld qword[ebx] |
287 | fld qword[ebx+8*4] |
287 | fld qword[ebx+8*4] |
288 | fld st0 |
288 | fld st0 |
289 | fadd st0, st2 ; st : t1, f_4, f_0 |
289 | fadd st0, st2 ; st : t1, f_4, f_0 |
290 | fxch st1 |
290 | fxch st1 |
291 | fsubp st2, st0 ; st : t1, t2 |
291 | fsubp st2, st0 ; st : t1, t2 |
292 | fld qword[ebx+8*8] |
292 | fld qword[ebx+8*8] |
293 | fld qword[ebx+8*12] |
293 | fld qword[ebx+8*12] |
294 | fld st0 |
294 | fld st0 |
295 | fadd st0, st2 ; st : t3, f_12, f_8, t1, t2 |
295 | fadd st0, st2 ; st : t3, f_12, f_8, t1, t2 |
296 | fxch st1 |
296 | fxch st1 |
297 | fsubp st2, st0 ; st : t3, t4, t1, t2 |
297 | fsubp st2, st0 ; st : t3, t4, t1, t2 |
298 | ; ------ |
298 | ; ------ |
299 | fld st2 ; st : t1, t3, t4, t1, t2 |
299 | fld st2 ; st : t1, t3, t4, t1, t2 |
300 | fadd st0, st1 |
300 | fadd st0, st1 |
301 | fstp qword[ebx] ; st : t3, t4, t1, t2 |
301 | fstp qword[ebx] ; st : t3, t4, t1, t2 |
302 | fsub st0, st2 ; st : t3-t1, t4, t1, t2 |
302 | fsub st0, st2 ; st : t3-t1, t4, t1, t2 |
303 | fchs ; st : t1-t3, t4, t1, t2 |
303 | fchs ; st : t1-t3, t4, t1, t2 |
304 | fstp qword[ebx+8*4] ; st : t4, t1, t2 |
304 | fstp qword[ebx+8*4] ; st : t4, t1, t2 |
305 | fst st1 ; st : t4, t4, t2 |
305 | fst st1 ; st : t4, t4, t2 |
306 | fadd st0, st2 ; st : t2+t4, t4, t2 |
306 | fadd st0, st2 ; st : t2+t4, t4, t2 |
307 | fstp qword[ebx+8*8] ; st : t4, t2 |
307 | fstp qword[ebx+8*8] ; st : t4, t2 |
308 | fsubp st1, st0 ; st : t2-t4 |
308 | fsubp st1, st0 ; st : t2-t4 |
309 | fstp qword[ebx+8*12] ; st : |
309 | fstp qword[ebx+8*12] ; st : |
310 | 310 | ||
311 | ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) |
311 | ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) |
312 | fld qword[ebx+8*2] |
312 | fld qword[ebx+8*2] |
313 | fld qword[ebx+8*6] |
313 | fld qword[ebx+8*6] |
314 | fld [_r] |
314 | fld [_r] |
315 | fmul st1, st0 ; st : r, t2, t1 |
315 | fmul st1, st0 ; st : r, t2, t1 |
316 | fld qword[ebx+8*10] |
316 | fld qword[ebx+8*10] |
317 | fxch st1 ; st : r, t3, t2, t1 |
317 | fxch st1 ; st : r, t3, t2, t1 |
318 | fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 |
318 | fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 |
319 | ; ------ |
319 | ; ------ |
320 | fld st3 ; st : t1, t4, t3, t2, t1 |
320 | fld st3 ; st : t1, t4, t3, t2, t1 |
321 | fadd st0, st3 ; |
321 | fadd st0, st3 ; |
322 | fadd st0, st2 ; |
322 | fadd st0, st2 ; |
323 | fst qword[ebx+8*2] ; store f[i+2] = t1+t2+t3 |
323 | fst qword[ebx+8*2] ; store f[i+2] = t1+t2+t3 |
324 | fsub st0, st3 ; |
324 | fsub st0, st3 ; |
325 | fsub st0, st3 ; |
325 | fsub st0, st3 ; |
326 | fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 |
326 | fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 |
327 | fld st3 ; st : t1, t4, t3, t2, t1 |
327 | fld st3 ; st : t1, t4, t3, t2, t1 |
328 | fsub st0, st2 ; |
328 | fsub st0, st2 ; |
329 | fsub st0, st1 ; |
329 | fsub st0, st1 ; |
330 | fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 |
330 | fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 |
331 | fadd st0, st1 ; |
331 | fadd st0, st1 ; |
332 | faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 |
332 | faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 |
333 | fstp qword[ebx+8*6] ; store f[i+6] |
333 | fstp qword[ebx+8*6] ; store f[i+6] |
334 | fstp st0 ; st : t2, t1 |
334 | fstp st0 ; st : t2, t1 |
335 | fstp st0 ; st : t1 |
335 | fstp st0 ; st : t1 |
336 | fstp st0 ; st : |
336 | fstp st0 ; st : |
337 | 337 | ||
338 | ; -- odd subelements |
338 | ; -- odd subelements |
339 | fld qword[ebx+8*9] |
339 | fld qword[ebx+8*9] |
340 | fld qword[ebx+8*11] |
340 | fld qword[ebx+8*11] |
341 | fld st1 |
341 | fld st1 |
342 | fsub st0, st1 |
342 | fsub st0, st1 |
343 | fxch st1 |
343 | fxch st1 |
344 | faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) |
344 | faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) |
345 | fld [_r2] |
345 | fld [_r2] |
346 | fmul st2, st0 |
346 | fmul st2, st0 |
347 | fmulp st1, st0 ; st : t9, t6 |
347 | fmulp st1, st0 ; st : t9, t6 |
348 | fld qword[ebx+8*3] |
348 | fld qword[ebx+8*3] |
349 | fld st0 |
349 | fld st0 |
350 | fadd st0, st2 ; st : t1, f[l5], t9, t6 |
350 | fadd st0, st2 ; st : t1, f[l5], t9, t6 |
351 | fstp _t1 |
351 | fstp _t1 |
352 | fsub st0, st1 |
352 | fsub st0, st1 |
353 | fstp _t2 |
353 | fstp _t2 |
354 | fstp _t9 ; (t9 never used) |
354 | fstp _t9 ; (t9 never used) |
355 | fstp _t6 ; st : |
355 | fstp _t6 ; st : |
356 | 356 | ||
357 | fld [_c1] |
357 | fld [_c1] |
358 | fld [_s1] |
358 | fld [_s1] |
359 | fld qword[ebx+8*5] |
359 | fld qword[ebx+8*5] |
360 | fld qword[ebx+8*7] |
360 | fld qword[ebx+8*7] |
361 | fld st3 ; st: c1, f[l6], f[l2], s1, c1 |
361 | fld st3 ; st: c1, f[l6], f[l2], s1, c1 |
362 | fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c |
362 | fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c |
363 | fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c |
363 | fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c |
364 | fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c |
364 | fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c |
365 | faddp st1, st0 ; st: t5, f_6, f_2, s, c |
365 | faddp st1, st0 ; st: t5, f_6, f_2, s, c |
366 | fstp _t5 ; st: f_6, f_2, s, c |
366 | fstp _t5 ; st: f_6, f_2, s, c |
367 | fld st3 ; st: c, f_6, f_2, s, c |
367 | fld st3 ; st: c, f_6, f_2, s, c |
368 | fmul st0, st1 |
368 | fmul st0, st1 |
369 | fld st3 |
369 | fld st3 |
370 | fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c |
370 | fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c |
371 | fsubp st1, st0 ; st: t8, f_6, f_2, s, c |
371 | fsubp st1, st0 ; st: t8, f_6, f_2, s, c |
372 | fstp _t8 ; st: f_6, f_2, s, c |
372 | fstp _t8 ; st: f_6, f_2, s, c |
373 | fstp st0 ; st: f_2, s, c |
373 | fstp st0 ; st: f_2, s, c |
374 | fstp st0 ; st: s, c |
374 | fstp st0 ; st: s, c |
375 | 375 | ||
376 | fld qword[ebx+8*13] |
376 | fld qword[ebx+8*13] |
377 | fld qword[ebx+8*15] |
377 | fld qword[ebx+8*15] |
378 | fld st3 ; st: c1, f[l8], f[l4], s1, c1 |
378 | fld st3 ; st: c1, f[l8], f[l4], s1, c1 |
379 | fmul st0, st1 |
379 | fmul st0, st1 |
380 | fld st3 |
380 | fld st3 |
381 | fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c |
381 | fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c |
382 | faddp st1, st0 ; st: t7, f_8, f_4, s, c |
382 | faddp st1, st0 ; st: t7, f_8, f_4, s, c |
383 | fld _t5 ; st: t5, t7, f_8, f_4, s, c |
383 | fld _t5 ; st: t5, t7, f_8, f_4, s, c |
384 | fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c |
384 | fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c |
385 | fstp _t4 |
385 | fstp _t4 |
386 | fstp _t7 ; st: f_8, f_4, s, c |
386 | fstp _t7 ; st: f_8, f_4, s, c |
387 | fld st3 ; st: c, f_8, f_4, s, c |
387 | fld st3 ; st: c, f_8, f_4, s, c |
388 | fmul st0, st2 |
388 | fmul st0, st2 |
389 | fld st3 |
389 | fld st3 |
390 | fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c |
390 | fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c |
391 | fsubp st1, st0 ; st:-t0, f_8, f_4, s, c |
391 | fsubp st1, st0 ; st:-t0, f_8, f_4, s, c |
392 | fchs |
392 | fchs |
393 | fld _t8 |
393 | fld _t8 |
394 | fchs ; st:-t8, t0, f_8, f_4, s, c |
394 | fchs ; st:-t8, t0, f_8, f_4, s, c |
395 | fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c |
395 | fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c |
396 | fstp _t3 |
396 | fstp _t3 |
397 | fstp _t0 ; st: f_8, f_4, s, c |
397 | fstp _t0 ; st: f_8, f_4, s, c |
398 | fstp st0 ; st: f_4, s, c |
398 | fstp st0 ; st: f_4, s, c |
399 | fstp st0 ; st: s, c |
399 | fstp st0 ; st: s, c |
400 | fstp st0 ; st: c |
400 | fstp st0 ; st: c |
401 | fstp st0 ; st: |
401 | fstp st0 ; st: |
402 | 402 | ||
403 | fld _t1 |
403 | fld _t1 |
404 | fld _t4 |
404 | fld _t4 |
405 | fld st1 |
405 | fld st1 |
406 | fsub st0, st1 |
406 | fsub st0, st1 |
407 | fstp qword[ebx+8*11] ; f[l7] = t1-t4 |
407 | fstp qword[ebx+8*11] ; f[l7] = t1-t4 |
408 | faddp st1, st0 |
408 | faddp st1, st0 |
409 | fstp qword[ebx+8*3] ; f[l5] = t1+t4 |
409 | fstp qword[ebx+8*3] ; f[l5] = t1+t4 |
410 | fld _t2 |
410 | fld _t2 |
411 | fld _t3 |
411 | fld _t3 |
412 | fld st1 |
412 | fld st1 |
413 | fsub st0, st1 |
413 | fsub st0, st1 |
414 | fstp qword[ebx+8*15] ; f[l8] |
414 | fstp qword[ebx+8*15] ; f[l8] |
415 | faddp st1, st0 |
415 | faddp st1, st0 |
416 | fstp qword[ebx+8*7] ; f[l6] |
416 | fstp qword[ebx+8*7] ; f[l6] |
417 | 417 | ||
418 | fld _t6 |
418 | fld _t6 |
419 | fld qword[ebx+8] |
419 | fld qword[ebx+8] |
420 | fld st1 |
420 | fld st1 |
421 | fsub st0, st1 |
421 | fld st1 |
422 | fxch st1 |
422 | faddp st3, st0 |
423 | faddp st2, st0 ; st : t2, t1 |
423 | fsubp st1, st0 ; st : t2, t1 |
- | 424 | ||
424 | fld _t8 |
425 | fld _t8 |
425 | fsub _t0 |
426 | fsub _t0 |
426 | fld _t5 |
427 | fld _t5 |
427 | fadd _t7 ; st : t4, t3, t2, t1 |
428 | fadd _t7 ; st : t4, t3, t2, t1 |
428 | 429 | ||
429 | fld st3 |
430 | fld st3 |
430 | fsub st0, st1 |
431 | fsub st0, st1 |
431 | fstp qword[ebx+8*9] ; f[l3] = t1-t4 |
432 | fstp qword[ebx+8*9] ; f[l3] = t1-t4 |
432 | fadd st0, st3 |
433 | fadd st0, st3 |
433 | fstp qword[ebx+8] ; f[l1] = t1+t4 |
434 | fstp qword[ebx+8] ; f[l1] = t1+t4 |
434 | fld st1 ; st : t2, t3, t2, t1 |
435 | fld st1 ; st : t2, t3, t2, t1 |
435 | fsub st0, st1 ; f[l4] = t2-t3 |
436 | fsub st0, st1 ; f[l4] = t2-t3 |
436 | fstp qword[ebx+8*13] ; st : t3, t2, t1 |
437 | fstp qword[ebx+8*13] ; st : t3, t2, t1 |
437 | faddp st1, st0 ; st : t2+t3, t1 |
438 | faddp st1, st0 ; st : t2+t3, t1 |
438 | fstp qword[ebx+8*5] ; f[l2] = t2+t3 |
439 | fstp qword[ebx+8*5] ; f[l2] = t2+t3 |
439 | fstp st0 ; st : |
440 | fstp st0 ; st : |
440 | 441 | ||
441 | add ebx, 16*8 |
442 | add ebx, 16*8 |
442 | cmp ebx, eax |
443 | cmp ebx, eax |
443 | jb .loop_i |
444 | jb .loop_i |
444 | 445 | ||
445 | mov esp, ebp |
446 | mov esp, ebp |
446 | pop ebp |
447 | pop ebp |
447 | ret |
448 | ret |
448 | 449 | ||
449 | 450 | ||
450 | 451 | ||
451 | 452 | ||
452 | ;================================================================= |
453 | ;================================================================= |
453 | ; cdecl parameters: |
454 | ; cdecl parameters: |
454 | ; -- [ebp+8] = N |
455 | ; -- [ebp+8] = N |
455 | ; -- [ebp+12] = p |
456 | ; -- [ebp+12] = p |
456 | ; -- [ebp+16] = 4k-aligned data array address |
457 | ; -- [ebp+16] = 4k-aligned data array address |
457 | ; -- [ebp+20] = 4k-aligned SinCosTable address |
458 | ; -- [ebp+20] = 4k-aligned SinCosTable address |
458 | ; returns: |
459 | ; returns: |
459 | ; -- nothing |
460 | ; -- nothing |
460 | ; destroys: |
461 | ; destroys: |
461 | ; -- all GPRegs |
462 | ; -- all GPRegs |
462 | ; locals: |
463 | ; locals: |
463 | ; -- 120 bytes of stack-located locals (_t0 ... _t9, _l0..._step) |
464 | ; -- 120 bytes of stack-located locals (_t0 ... _t9, _l0..._step) |
464 | ;; ========================== |
465 | ;; ========================== |
465 | align 4 |
466 | align 4 |
466 | step3: |
467 | step3: |
467 | push ebp |
468 | push ebp |
468 | mov ebp, esp |
469 | mov ebp, esp |
469 | sub esp, 120 |
470 | sub esp, 120 |
470 | ; 283 : { |
471 | ; 283 : { |
471 | 472 | ||
472 | 473 | ||
473 | ; 293 : for (l=3; l<=p; l++) |
474 | ; 293 : for (l=3; l<=p; l++) |
474 | mov cx, 0x0200 |
475 | mov cx, 0x0200 |
475 | .newstep: |
476 | .newstep: |
476 | inc ch |
477 | inc ch |
477 | cmp ch, byte[ebp+12] |
478 | cmp ch, byte[ebp+12] |
478 | jg .done |
479 | jg .done |
479 | mov _step, cx |
480 | mov _step, cx |
480 | 481 | ||
481 | ; 294 : { |
482 | ; 294 : { |
482 | ; 295 : d1 = 1 << (l + l - 3); |
483 | ; 295 : d1 = 1 << (l + l - 3); |
483 | 484 | ||
484 | mov cl, ch |
485 | mov cl, ch |
485 | add cl, cl |
486 | add cl, cl |
486 | sub cl, 3 |
487 | sub cl, 3 |
487 | mov edx, 1 |
488 | mov edx, 1 |
488 | shl edx, cl |
489 | shl edx, cl |
489 | mov _d1, edx |
490 | mov _d1, edx |
490 | 491 | ||
491 | ; 296 : d2 = d1 << 1; |
492 | ; 296 : d2 = d1 << 1; |
492 | shl edx, 1 |
493 | shl edx, 1 |
493 | mov _d2, edx |
494 | mov _d2, edx |
494 | mov eax, edx |
495 | mov eax, edx |
495 | 496 | ||
496 | ; 297 : d3 = d2 << 1; |
497 | ; 297 : d3 = d2 << 1; |
497 | shl edx, 1 |
498 | shl edx, 1 |
498 | mov _d3, edx |
499 | mov _d3, edx |
499 | 500 | ||
500 | ; 298 : d4 = d2 + d3; |
501 | ; 298 : d4 = d2 + d3; |
501 | add eax, edx |
502 | add eax, edx |
502 | mov _d4, eax |
503 | mov _d4, eax |
503 | 504 | ||
504 | ; 299 : d5 = d3 << 1; |
505 | ; 299 : d5 = d3 << 1; |
505 | shl edx, 1 |
506 | shl edx, 1 |
506 | mov _d5, edx |
507 | mov _d5, edx |
507 | shl edx, 3 |
508 | shl edx, 3 |
508 | mov _d6, edx ; d6 = d5*8 to simplify index operations |
509 | mov _d6, edx ; d6 = d5*8 to simplify index operations |
509 | 510 | ||
510 | ; 339 : j5 = N / d5; ; moved out of internal loop |
511 | ; 339 : j5 = N / d5; ; moved out of internal loop |
511 | mov cl, [ebp+12] |
512 | mov cl, [ebp+12] |
512 | sub cl, ch |
513 | sub cl, ch |
513 | add cl, cl |
514 | add cl, cl |
514 | mov edx, 1 |
515 | mov edx, 1 |
515 | shl edx, cl |
516 | shl edx, cl |
516 | mov _j5, edx |
517 | mov _j5, edx |
517 | 518 | ||
518 | ; 300 : |
519 | ; 300 : |
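519 | ; 301 : for (j=0; j<N; j+=d5) -- header was truncated in this listing; bound and stride reconstructed, verify against the C original |
520 | ; 301 : for (j=0; j<N; j+=d5) -- header was truncated in this listing; bound and stride reconstructed, verify against the C original |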
520 | mov ebx, [ebp+16] |
521 | mov ebx, [ebp+16] |
521 | mov esi, [ebp+8] |
522 | mov esi, [ebp+8] |
522 | shl esi, 3 |
523 | shl esi, 3 |
523 | add esi, ebx |
524 | add esi, ebx |
524 | mov _end_of_array, esi |
525 | mov _end_of_array, esi |
525 | 526 | ||
526 | .next_j: |
527 | .next_j: |
527 | 528 | ||
528 | ; { |
529 | ; { |
529 | ; t1 = f[j] + f[j+d2]; |
530 | ; t1 = f[j] + f[j+d2]; |
530 | mov eax, _d2 |
531 | mov eax, _d2 |
531 | fld qword[ebx] |
532 | fld qword[ebx] |
532 | fld qword[ebx+eax*8] |
533 | fld qword[ebx+eax*8] |
533 | fld st1 |
534 | fld st1 |
534 | fadd st0, st1 |
535 | fadd st0, st1 |
535 | fstp _t1 |
536 | fstp _t1 |
536 | 537 | ||
537 | ; t2 = f[j] - f[j+d2]; |
538 | ; t2 = f[j] - f[j+d2]; |
538 | fsubp st1, st0 |
539 | fsubp st1, st0 |
539 | fstp _t2 |
540 | fstp _t2 |
540 | 541 | ||
541 | ; t3 = f[j+d3] + f[j+d4]; |
542 | ; t3 = f[j+d3] + f[j+d4]; |
542 | mov edi, _d3 |
543 | mov edi, _d3 |
543 | fld qword[ebx+edi*8] |
544 | fld qword[ebx+edi*8] |
544 | mov edx, _d4 |
545 | mov edx, _d4 |
545 | fld qword[ebx+edx*8] |
546 | fld qword[ebx+edx*8] |
546 | fld st1 |
547 | fld st1 |
547 | fsub st0, st1 ; st : t4, f4, f3 |
548 | fsub st0, st1 ; st : t4, f4, f3 |
548 | fxch st1 ; st : f4, t4, f3 |
549 | fxch st1 ; st : f4, t4, f3 |
549 | 550 | ||
550 | ; t4 = f[j+d3] - f[j+d4]; |
551 | ; t4 = f[j+d3] - f[j+d4]; |
551 | faddp st2, st0 ; st : t4, t3 |
552 | faddp st2, st0 ; st : t4, t3 |
552 | 553 | ||
553 | ; f[j+d4] = t2 - t4; |
554 | ; f[j+d4] = t2 - t4; |
554 | ; f[j+d3] = t2 + t4; |
555 | ; f[j+d3] = t2 + t4; |
555 | fld _t2 |
556 | fld _t2 |
556 | fld st0 |
557 | fld st0 |
557 | fsub st0, st2 ; st : f4, t2, t4, t3 |
558 | fsub st0, st2 ; st : f4, t2, t4, t3 |
558 | fstp qword[ebx+edx*8] ; st : t2, t4, t3 |
559 | fstp qword[ebx+edx*8] ; st : t2, t4, t3 |
559 | fadd st0, st1 ; st : f3, t4, t3 |
560 | fadd st0, st1 ; st : f3, t4, t3 |
560 | fstp qword[ebx+edi*8] ; st : t4, t3 |
561 | fstp qword[ebx+edi*8] ; st : t4, t3 |
561 | 562 | ||
562 | ; f[j+d2] = t1 - t3; |
563 | ; f[j+d2] = t1 - t3; |
563 | ; f[j] = t1 + t3; |
564 | ; f[j] = t1 + t3; |
564 | fld _t1 |
565 | fld _t1 |
565 | fst st1 |
566 | fst st1 |
566 | fsub st0, st2 ; st : f2, t1, t3 |
567 | fsub st0, st2 ; st : f2, t1, t3 |
567 | fstp qword[ebx+eax*8] ; st : t1, t3 |
568 | fstp qword[ebx+eax*8] ; st : t1, t3 |
568 | fadd st0, st1 ; st : f0, t3 |
569 | fadd st0, st1 ; st : f0, t3 |
569 | fstp qword[ebx] ; st : t3 |
570 | fstp qword[ebx] ; st : t3 |
570 | fstp st0 |
571 | fstp st0 |
571 | 572 | ||
572 | ; jj = j + d1; / ?? |
573 | ; jj = j + d1; / ?? |
573 | mov edi, _d1 |
574 | mov edi, _d1 |
574 | shl edi, 3 ; = d1*8 |
575 | shl edi, 3 ; = d1*8 |
575 | mov edx, edi |
576 | mov edx, edi |
576 | mov eax, edi |
577 | mov eax, edi |
577 | add eax, eax ; eax = d2*8 |
578 | add eax, eax ; eax = d2*8 |
578 | shl edx, 2 ; = d3*8 |
579 | shl edx, 2 ; = d3*8 |
579 | add edi, ebx ; now [edi] points to f[jj] |
580 | add edi, ebx ; now [edi] points to f[jj] |
580 | add edx, edi ; and [edx] points to f[jj+d3] |
581 | add edx, edi ; and [edx] points to f[jj+d3] |
581 | 582 | ||
582 | ; t1 = f[jj]; |
583 | ; t1 = f[jj]; |
583 | fld qword [edi] ; st : t1 |
584 | fld qword [edi] ; st : t1 |
584 | ; t3 = f[jj+d3]; |
585 | ; t3 = f[jj+d3]; |
585 | fld qword [edx] ; st : t3, t1 |
586 | fld qword [edx] ; st : t3, t1 |
586 | 587 | ||
587 | ; t2 = f[jj+d2] * r; |
588 | ; t2 = f[jj+d2] * r; |
588 | fld qword [edi+eax] |
589 | fld qword [edi+eax] |
589 | fld [_r] |
590 | fld [_r] |
590 | fmul st1, st0 ; st : r, t2, t3, t1 |
591 | fmul st1, st0 ; st : r, t2, t3, t1 |
591 | ; t4 = f[jj+d4] * r |
592 | ; t4 = f[jj+d4] * r |
592 | fmul qword [edx+eax] ; st : t4, t2, t3, t1 |
593 | fmul qword [edx+eax] ; st : t4, t2, t3, t1 |
593 | 594 | ||
594 | ; f[jj] = t1 + t2 + t3; |
595 | ; f[jj] = t1 + t2 + t3; |
595 | fld st3 ; st : t1, t4, t2, t3, t1 |
596 | fld st3 ; st : t1, t4, t2, t3, t1 |
596 | fadd st0, st3 |
597 | fadd st0, st3 |
597 | fadd st0, st2 |
598 | fadd st0, st2 |
598 | fstp qword [edi] |
599 | fstp qword [edi] |
599 | 600 | ||
600 | ; f[jj+d2] = t1 - t3 + t4; |
601 | ; f[jj+d2] = t1 - t3 + t4; |
601 | fld st3 |
602 | fld st3 |
602 | fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1 |
603 | fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1 |
603 | fld st0 |
604 | fld st0 |
604 | fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1 |
605 | fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1 |
605 | fstp qword [edi+eax] |
606 | fstp qword [edi+eax] |
606 | ; f[jj+d4] = t1 - t3 - t4; |
607 | ; f[jj+d4] = t1 - t3 - t4; |
607 | fsub st0, st1 ; st : f4, t4, t2, t3, t1 |
608 | fsub st0, st1 ; st : f4, t4, t2, t3, t1 |
608 | fstp qword [edx+eax] |
609 | fstp qword [edx+eax] |
609 | 610 | ||
610 | ; f[jj+d3] = t1 - t2 + t3; |
611 | ; f[jj+d3] = t1 - t2 + t3; |
611 | fstp st0 ; st : t2, t3, t1 |
612 | fstp st0 ; st : t2, t3, t1 |
612 | fsubp st1, st0 ; st : (t3-t2), t1 |
613 | fsubp st1, st0 ; st : (t3-t2), t1 |
613 | faddp st1, st0 ; st : f3 |
614 | faddp st1, st0 ; st : f3 |
614 | fstp qword [edx] |
615 | fstp qword [edx] |
615 | 616 | ||
616 | ; for (k=1; k<d1; k++) |
617 | ; for (k=1; k<d1; k++) |
617 | xor ecx, ecx ; ecx = k |
618 | xor ecx, ecx ; ecx = k |
618 | mov _jj, ecx |
619 | mov _jj, ecx |
619 | .next_k: |
620 | .next_k: |
620 | inc ecx |
621 | inc ecx |
621 | cmp ecx, _d1 |
622 | cmp ecx, _d1 |
622 | jge .done_k |
623 | jge .done_k |
623 | ; { |
624 | ; { |
624 | mov eax, _d2 ; the sector increment |
625 | mov eax, _d2 ; the sector increment |
625 | ; l1 = j + k; |
626 | ; l1 = j + k; |
626 | mov edx, ecx |
627 | mov edx, ecx |
627 | mov _l1, edx ; [ebx+edx*8] --> f[j+k] |
628 | mov _l1, edx ; [ebx+edx*8] --> f[j+k] |
628 | ; l2 = l1 + d2; |
629 | ; l2 = l1 + d2; |
629 | add edx, eax |
630 | add edx, eax |
630 | mov _l2, edx |
631 | mov _l2, edx |
631 | ; l3 = l1 + d3; |
632 | ; l3 = l1 + d3; |
632 | add edx, eax |
633 | add edx, eax |
633 | mov _l3, edx |
634 | mov _l3, edx |
634 | ; l4 = l1 + d4; |
635 | ; l4 = l1 + d4; |
635 | add edx, eax |
636 | add edx, eax |
636 | mov _l4, edx |
637 | mov _l4, edx |
637 | 638 | ||
638 | ; l5 = j + d2 - k; |
639 | ; l5 = j + d2 - k; |
639 | mov edx, eax |
640 | mov edx, eax |
640 | sub edx, ecx |
641 | sub edx, ecx |
641 | mov _l5, edx |
642 | mov _l5, edx |
642 | ; l6 = l5 + d2; |
643 | ; l6 = l5 + d2; |
643 | add edx, eax |
644 | add edx, eax |
644 | mov _l6, edx |
645 | mov _l6, edx |
645 | ; l7 = l5 + d3; |
646 | ; l7 = l5 + d3; |
646 | add edx, eax |
647 | add edx, eax |
647 | mov _l7, edx |
648 | mov _l7, edx |
648 | ; l8 = l5 + d4; |
649 | ; l8 = l5 + d4; |
649 | add edx, eax |
650 | add edx, eax |
650 | mov _l8, edx |
651 | mov _l8, edx |
651 | 652 | ||
652 | 653 | ||
653 | ; 340 : j5 *= k; // add-substituted multiplication |
654 | ; 340 : j5 *= k; // add-substituted multiplication |
654 | mov eax, _jj |
655 | mov eax, _jj |
655 | add eax, _j5 |
656 | add eax, _j5 |
656 | mov _jj, eax |
657 | mov _jj, eax |
657 | 658 | ||
658 | ; c1 = C[jj]; |
659 | ; c1 = C[jj]; |
659 | ; s1 = S[jj]; |
660 | ; s1 = S[jj]; |
660 | mov edi, [ebp+20] |
661 | mov edi, [ebp+20] |
661 | fld qword[edi+eax*8] |
662 | fld qword[edi+eax*8] |
662 | mov esi, [ebp+8] |
663 | mov esi, [ebp+8] |
663 | shl esi, 2 |
664 | shl esi, 2 |
664 | add esi, edi |
665 | add esi, edi |
665 | fld qword[esi+eax*8] ; st : s1, c1 |
666 | fld qword[esi+eax*8] ; st : s1, c1 |
666 | 667 | ||
667 | ; t5 = f[l2] * c1 + f[l6] * s1; |
668 | ; t5 = f[l2] * c1 + f[l6] * s1; |
668 | ; t8 = f[l6] * c1 - f[l2] * s1; |
669 | ; t8 = f[l6] * c1 - f[l2] * s1; |
669 | mov edx, _l6 |
670 | mov edx, _l6 |
670 | fld qword[ebx+edx*8] |
671 | fld qword[ebx+edx*8] |
671 | mov edx, _l2 |
672 | mov edx, _l2 |
672 | fld st0 |
673 | fld st0 |
673 | fmul st0, st2 |
674 | fmul st0, st2 |
674 | fxch st1 |
675 | fxch st1 |
675 | fmul st0, st3 |
676 | fmul st0, st3 |
676 | fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c |
677 | fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c |
677 | fmul st4, st0 |
678 | fmul st4, st0 |
678 | fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c |
679 | fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c |
679 | fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c |
680 | fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c |
680 | fstp _t8 |
681 | fstp _t8 |
681 | faddp st2, st0 ; st : f[l2]*s, t5 |
682 | faddp st2, st0 ; st : f[l2]*s, t5 |
682 | fstp st0 ; st : t5 |
683 | fstp st0 ; st : t5 |
683 | fstp _t5 ; st : |
684 | fstp _t5 ; st : |
684 | 685 | ||
685 | ; c2 = C[2*jj]; |
686 | ; c2 = C[2*jj]; |
686 | ; s2 = S[2*jj]; |
687 | ; s2 = S[2*jj]; |
687 | shl eax, 1 |
688 | shl eax, 1 |
688 | fld qword[edi+eax*8] |
689 | fld qword[edi+eax*8] |
689 | fld qword[esi+eax*8] ; st : s2, c2 |
690 | fld qword[esi+eax*8] ; st : s2, c2 |
690 | 691 | ||
691 | ; t6 = f[l3] * c2 + f[l7] * s2; |
692 | ; t6 = f[l3] * c2 + f[l7] * s2; |
692 | ; t9 = f[l7] * c2 - f[l3] * s2; |
693 | ; t9 = f[l7] * c2 - f[l3] * s2; |
693 | mov edx, _l7 |
694 | mov edx, _l7 |
694 | fld qword[ebx+edx*8] |
695 | fld qword[ebx+edx*8] |
695 | mov edx, _l3 |
696 | mov edx, _l3 |
696 | fld st0 |
697 | fld st0 |
697 | fmul st0, st2 |
698 | fmul st0, st2 |
698 | fxch st1 |
699 | fxch st1 |
699 | fmul st0, st3 |
700 | fmul st0, st3 |
700 | fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c |
701 | fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c |
701 | fmul st4, st0 |
702 | fmul st4, st0 |
702 | fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c |
703 | fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c |
703 | fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c |
704 | fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c |
704 | fstp _t9 |
705 | fstp _t9 |
705 | faddp st2, st0 ; st : f[l2]*s, t6 |
706 | faddp st2, st0 ; st : f[l2]*s, t6 |
706 | fstp st0 ; st : t6 |
707 | fstp st0 ; st : t6 |
707 | fstp _t6 ; st : |
708 | fstp _t6 ; st : |
708 | 709 | ||
709 | ; c3 = C[3*jj]; |
710 | ; c3 = C[3*jj]; |
710 | ; s3 = S[3*jj]; |
711 | ; s3 = S[3*jj]; |
711 | add eax, _jj |
712 | add eax, _jj |
712 | fld qword[edi+eax*8] |
713 | fld qword[edi+eax*8] |
713 | fld qword[esi+eax*8] ; st : s3, c3 |
714 | fld qword[esi+eax*8] ; st : s3, c3 |
714 | 715 | ||
715 | ; t7 = f[l4] * c3 + f[l8] * s3; |
716 | ; t7 = f[l4] * c3 + f[l8] * s3; |
716 | ; t0 = f[l8] * c3 - f[l4] * s3; |
717 | ; t0 = f[l8] * c3 - f[l4] * s3; |
717 | mov edx, _l8 |
718 | mov edx, _l8 |
718 | fld qword[ebx+edx*8] |
719 | fld qword[ebx+edx*8] |
719 | mov edx, _l4 |
720 | mov edx, _l4 |
720 | fld st0 |
721 | fld st0 |
721 | fmul st0, st2 |
722 | fmul st0, st2 |
722 | fxch st1 |
723 | fxch st1 |
723 | fmul st0, st3 |
724 | fmul st0, st3 |
724 | fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c |
725 | fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c |
725 | fmul st4, st0 |
726 | fmul st4, st0 |
726 | fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c |
727 | fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c |
727 | fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c |
728 | fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c |
728 | fstp _t0 |
729 | fstp _t0 |
729 | faddp st2, st0 ; st : f[l2]*s, t7 |
730 | faddp st2, st0 ; st : f[l2]*s, t7 |
730 | fstp st0 ; st : t7 |
731 | fstp st0 ; st : t7 |
731 | fstp _t7 ; st : |
732 | fstp _t7 ; st : |
732 | 733 | ||
733 | ; t1 = f[l5] - t9; |
734 | ; t1 = f[l5] - t9; |
734 | ; t2 = f[l5] + t9; |
735 | ; t2 = f[l5] + t9; |
735 | mov eax, _l5 |
736 | mov eax, _l5 |
736 | fld qword [ebx+eax*8] |
737 | fld qword [ebx+eax*8] |
737 | fld _t9 |
738 | fld _t9 |
738 | fld st0 |
739 | fld st0 |
739 | fadd st0, st2 |
740 | fadd st0, st2 |
740 | fstp _t2 |
741 | fstp _t2 |
741 | fsubp st1, st0 |
742 | fsubp st1, st0 |
742 | fstp _t1 |
743 | fstp _t1 |
743 | 744 | ||
744 | ; t3 = - t8 - t0; |
745 | ; t3 = - t8 - t0; |
745 | fld _t8 |
746 | fld _t8 |
746 | fadd _t0 |
747 | fadd _t0 |
747 | fchs |
748 | fchs |
748 | fstp _t3 |
749 | fstp _t3 |
749 | ; t4 = t5 - t7; |
750 | ; t4 = t5 - t7; |
750 | fld _t5 |
751 | fld _t5 |
751 | fsub _t7 |
752 | fsub _t7 |
752 | fstp _t4 |
753 | fstp _t4 |
753 | 754 | ||
754 | ; f[l5] = t1 + t4; |
755 | ; f[l5] = t1 + t4; |
755 | fld _t1 |
756 | fld _t1 |
756 | fld _t4 |
757 | fld _t4 |
757 | fld st0 |
758 | fld st0 |
758 | fadd st0, st2 |
759 | fadd st0, st2 |
759 | fstp qword [ebx+eax*8] |
760 | fstp qword [ebx+eax*8] |
760 | ; f[l7] = t1 - t4; |
761 | ; f[l7] = t1 - t4; |
761 | mov eax, _l7 |
762 | mov eax, _l7 |
762 | fsubp st1, st0 |
763 | fsubp st1, st0 |
763 | fstp qword [ebx+eax*8] |
764 | fstp qword [ebx+eax*8] |
764 | 765 | ||
765 | ; f[l6] = t2 + t3; |
766 | ; f[l6] = t2 + t3; |
766 | mov eax, _l6 |
767 | mov eax, _l6 |
767 | fld _t2 |
768 | fld _t2 |
768 | fld _t3 |
769 | fld _t3 |
769 | fld st0 |
770 | fld st0 |
770 | fadd st0, st2 |
771 | fadd st0, st2 |
771 | fstp qword [ebx+eax*8] |
772 | fstp qword [ebx+eax*8] |
772 | ; f[l8] = t2 - t3; |
773 | ; f[l8] = t2 - t3; |
773 | mov eax, _l8 |
774 | mov eax, _l8 |
774 | fsubp st1, st0 |
775 | fsubp st1, st0 |
775 | fstp qword [ebx+eax*8] |
776 | fstp qword [ebx+eax*8] |
776 | 777 | ||
777 | ; t1 = f[l1] + t6; |
778 | ; t1 = f[l1] + t6; |
778 | mov eax, _l1 |
779 | mov eax, _l1 |
779 | fld qword [ebx+eax*8] |
780 | fld qword [ebx+eax*8] |
780 | fld _t6 |
781 | fld _t6 |
781 | fld st0 |
782 | fld st0 |
782 | fadd st0, st2 |
783 | fadd st0, st2 |
783 | fstp _t1 |
784 | fstp _t1 |
784 | ; t2 = f[l1] - t6; |
785 | ; t2 = f[l1] - t6; |
785 | fsubp st1, st0 |
786 | fsubp st1, st0 |
786 | fstp _t2 |
787 | fstp _t2 |
787 | 788 | ||
788 | ; t3 = t8 - t0; |
789 | ; t3 = t8 - t0; |
789 | fld _t8 |
790 | fld _t8 |
790 | fsub _t0 |
791 | fsub _t0 |
791 | fstp _t3 |
792 | fstp _t3 |
792 | ; t4 = t5 + t7; |
793 | ; t4 = t5 + t7; |
793 | fld _t5 |
794 | fld _t5 |
794 | fadd _t7 |
795 | fadd _t7 |
795 | fstp _t4 |
796 | fstp _t4 |
796 | 797 | ||
797 | ; f[l1] = t1 + t4; |
798 | ; f[l1] = t1 + t4; |
798 | mov eax, _l1 |
799 | mov eax, _l1 |
799 | fld _t1 |
800 | fld _t1 |
800 | fld _t4 |
801 | fld _t4 |
801 | fld st0 |
802 | fld st0 |
802 | fadd st0, st2 |
803 | fadd st0, st2 |
803 | fstp qword [ebx+eax*8] |
804 | fstp qword [ebx+eax*8] |
804 | ; f[l3] = t1 - t4; |
805 | ; f[l3] = t1 - t4; |
805 | mov eax, _l3 |
806 | mov eax, _l3 |
806 | fsubp st1, st0 |
807 | fsubp st1, st0 |
807 | fstp qword [ebx+eax*8] |
808 | fstp qword [ebx+eax*8] |
808 | 809 | ||
809 | ; f[l2] = t2 + t3; |
810 | ; f[l2] = t2 + t3; |
810 | mov eax, _l2 |
811 | mov eax, _l2 |
811 | fld _t2 |
812 | fld _t2 |
812 | fld _t3 |
813 | fld _t3 |
813 | fld st0 |
814 | fld st0 |
814 | fadd st0, st2 |
815 | fadd st0, st2 |
815 | fstp qword [ebx+eax*8] |
816 | fstp qword [ebx+eax*8] |
816 | ; f[l4] = t2 - t3; |
817 | ; f[l4] = t2 - t3; |
817 | mov eax, _l4 |
818 | mov eax, _l4 |
818 | fsubp st1, st0 |
819 | fsubp st1, st0 |
819 | fstp qword [ebx+eax*8] |
820 | fstp qword [ebx+eax*8] |
820 | 821 | ||
821 | ; 374 : } |
822 | ; 374 : } |
822 | jmp .next_k |
823 | jmp .next_k |
823 | 824 | ||
824 | .done_k: |
825 | .done_k: |
825 | ; 375 : } |
826 | ; 375 : } |
826 | add ebx, _d6 ; d6 = d5*8 |
827 | add ebx, _d6 ; d6 = d5*8 |
827 | cmp ebx, _end_of_array |
828 | cmp ebx, _end_of_array |
828 | jb .next_j |
829 | jb .next_j |
829 | 830 | ||
830 | ; 376 : } |
831 | ; 376 : } |
831 | mov cx, _step |
832 | mov cx, _step |
832 | jmp .newstep |
833 | jmp .newstep |
833 | .done: |
834 | .done: |
834 | mov esp, ebp |
835 | mov esp, ebp |
835 | pop ebp |
836 | pop ebp |
836 | ; 377 : } |
837 | ; 377 : } |
837 | ret |
838 | ret |
838 | 839 | ||
839 | 840 | ||
840 | ;=========== Step3 ends here =========== |
841 | ;=========== Step3 ends here =========== |
841 | 842 | ||
842 | 843 | ||
843 | ; ================================================================= |
844 | ; ================================================================= |
844 | 845 | ||
845 | ;================================================================= |
846 | ;================================================================= |
846 | ; parameters: |
847 | ; parameters: |
847 | ; -- [ebp+8] = N |
848 | ; -- [ebp+8] = N |
848 | ; -- [ebp+12] = p |
849 | ; -- [ebp+12] = p |
849 | ; -- [ebp+16] = 4k-aligned data array address |
850 | ; -- [ebp+16] = 4k-aligned data array address |
850 | ; -- [ebp+20] = 4k-aligned SinCosTable address |
851 | ; -- [ebp+20] = 4k-aligned SinCosTable address |
851 | ; returns: |
852 | ; returns: |
852 | ; -- nothing |
853 | ; -- nothing |
853 | ; destroys: |
854 | ; destroys: |
854 | ; -- all GPRegs |
855 | ; -- all GPRegs |
855 | ;; ========================== |
856 | ;; ========================== |
856 | 857 | ||
857 | align 4 |
858 | align 4 |
858 | 859 | ||
; FHT_4 -- top-level driver: calls BitInvert, step1, step2 and step3, in that
; order, on the caller's data array.
;
; Stack arguments (caller pushes, caller cleans -- this routine ends in a
; plain `ret`):
;   [ebp+8]  = N
;   [ebp+12] = p
;   [ebp+16] = data array address (a)
;   [ebp+20] = SinCosTable address (t)
;
; NOTE(review): BitInvert, step1 and step2 are not visible from here; the
; register/stack conventions noted below are inferred from how this routine
; calls them -- confirm against their definitions.
FHT_4:
        push    ebp
        mov     ebp, esp

        ; edx = a + p. Presumably BitInvert expects the array address with p
        ; carried in the low bits of edx (the array is documented as
        ; 4k-aligned, so the add cannot carry into the address) -- TODO confirm.
        mov     edx, [ebp+16]
        add     edx, [ebp+12]
        call    BitInvert

        ; One (N, a) argument pair is pushed once and shared by step1 and
        ; step2; the pops below assume neither callee removes it.
        push    dword[ebp+16]           ; a
        push    dword[ebp+8]            ; N
        call    step1
        call    step2
        pop     edx                     ; N
        pop     ecx                     ; a

        ; Build step3's frame: [ebp+8]=N, [ebp+12]=p, [ebp+16]=a, [ebp+20]=t.
        push    dword[ebp+20]           ; t
        push    ecx                     ; a
        push    dword[ebp+12]           ; p
        push    edx                     ; N
        call    step3

        ; step3 returns with a plain `ret`, so its four arguments are still on
        ; the stack; restoring esp from ebp discards them.
        mov     esp, ebp
        pop     ebp

        ret