Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
1626 | art_zh | 1 | ; Fast Hartley Transform routine |
2 | ; Copyright (C) 1999, 2004, 2010 |
||
3 | ; Artem Jerdev artem@jerdev.co.uk |
||
4 | ; |
||
5 | ; free KolibriOS version - not to be ported to other OSes |
||
6 | ; ========================================================== |
||
7 | |||
8 | Power_of_4 equ 5 |
||
9 | NumPoints equ 1024 |
||
10 | N_2 equ NumPoints / 2 |
||
11 | N_4 equ NumPoints / 4 |
||
12 | |||
13 | ;================================================================= |
||
14 | ; global constants |
||
15 | align 8 |
||
16 | _root dq 1.41421356237309504880169 ; = sqrt(2) |
||
17 | _root2 dq 0.70710678118654752440084 ; = sqrt(2)/2 |
||
18 | _c1 dq 0.92387953251128675612818 ; = cos(pi/8) |
||
19 | _s1 dq 0.38268343236508977172846 ; = sin(pi/8) |
||
20 | _dx dq 0.00613592315154296875 ; pi/512 |
||
21 | |||
22 | ;[_CosTable] dd 0 ; N_2 elements |
||
23 | ;[_SinTable] dd 0 ; N_2 elements |
||
24 | |||
25 | ;; ========================== |
||
26 | align 4 |
||
27 | MakeSinCosTable: |
||
28 | mov ebx, [_Sines] |
||
29 | mov ecx, [_Cosins] |
||
30 | xor eax, eax |
||
31 | fld [_dx] ; st : dx |
||
32 | fldz ; st : 0, dx |
||
33 | .loop: |
||
34 | fld st0 ; st : x, x, dx |
||
35 | FSINCOS ; st : cos, sin, x, dx |
||
36 | fstp qword [ecx+eax*8] ; st : sin, x, dx |
||
37 | fstp qword [ebx+eax*8] ; st : x, dx |
||
38 | fadd st0, st1 ; st : x+dx, dx |
||
39 | |||
40 | inc eax |
||
41 | cmp eax, N_2 |
||
42 | jne .loop |
||
43 | fstp st0 ; st : dx |
||
44 | fstp st0 ; st : |
||
45 | ret |
||
46 | |||
47 | ; ================================================================ |
||
48 | align 4 |
||
49 | BitInvert: |
||
50 | mov esi, [x] ; array of qwords |
||
51 | xor ecx, ecx ; index term |
||
52 | .newterm: |
||
53 | inc ecx |
||
54 | cmp ecx, NumPoints |
||
55 | jge .done |
||
56 | |||
57 | xor eax, eax |
||
58 | mov edx, ecx |
||
59 | xor bl, bl |
||
60 | |||
61 | .do_invert: |
||
62 | inc bl |
||
63 | cmp bl, Power_of_4 |
||
64 | jg .switch |
||
65 | |||
66 | mov bh, dl |
||
67 | and bh, 3 |
||
68 | shl eax, 2 |
||
69 | or al, bh |
||
70 | shr edx, 2 |
||
71 | jmp .do_invert |
||
72 | |||
73 | .switch: |
||
74 | cmp eax, ecx |
||
75 | jle .newterm |
||
76 | |||
77 | fld qword [esi+eax*8] |
||
78 | fld qword [esi+ecx*8] |
||
79 | fstp qword [esi+eax*8] |
||
80 | fstp qword [esi+ecx*8] |
||
81 | jmp .newterm |
||
82 | |||
83 | .done: |
||
84 | ret |
||
85 | |||
86 | ;================================================================= |
||
87 | align 4 |
||
88 | |||
89 | step1: |
||
90 | mov esi, [x] |
||
91 | mov ebx, esi |
||
92 | add esi, NumPoints*8 |
||
93 | |||
94 | .loop: |
||
95 | fld qword[ebx] |
||
96 | fld qword[ebx+8] |
||
97 | fld st1 |
||
98 | fsub st0, st1 ; st : t2, f[i+1], f[i] |
||
99 | fxch st1 ; st : f[i+1], t2, f[i] |
||
100 | faddp st2, st0 ; st : t2, t1 |
||
101 | fld qword[ebx+16] |
||
102 | fld qword[ebx+24] |
||
103 | fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 |
||
104 | fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 |
||
105 | fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 |
||
106 | fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 |
||
107 | fstp st1 ; st : t4, t3, t2, t1 |
||
108 | fld st2 ; st : t2, t4, t3, t2, t1 |
||
109 | fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 |
||
110 | fstp qword[ebx+16] ; st : t4, t3, t2, t1 |
||
111 | fsubp st2, st0 ; st : t3, t2-t4, t1 |
||
112 | fld st2 ; st : t1, t3, t2-t4, t1 |
||
113 | fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 |
||
114 | fstp qword[ebx] ; st : t3, t2-t4, t1 |
||
115 | fsubp st2, st0 ; st : t2-t4, t1-t3 |
||
116 | fstp qword[ebx+24] ; st : t1-t3 |
||
117 | fstp qword[ebx+8] ; st : |
||
118 | |||
119 | add ebx, 32 |
||
120 | cmp ebx, esi |
||
121 | jnz .loop |
||
122 | |||
123 | ret |
||
124 | |||
125 | |||
126 | ; |
||
127 | ;=========================================================================== |
||
128 | step2: ; Step2 |
||
129 | ;=========================================================================== |
||
130 | |||
131 | mov eax, [_f] |
||
132 | mov ebx, eax |
||
133 | add eax, NumPoints*8 |
||
134 | |||
135 | .loop_i: |
||
136 | |||
137 | ; -- quad subelements +0, +4, +8 and +12 (simpliest operations) |
||
138 | fld qword[ebx] |
||
139 | fld qword[ebx+8*4] |
||
140 | fld st0 |
||
141 | fadd st0, st2 ; st : t1, f_4, f_0 |
||
142 | fxch st1 |
||
143 | fsubp st2, st0 ; st : t1, t2 |
||
144 | fld qword[ebx+8*8] |
||
145 | fld qword[ebx+8*12] |
||
146 | fld st0 |
||
147 | fadd st0, st2 ; st : t3, f_12, t1, t2 |
||
148 | fxch st1 |
||
149 | fsubp st2, st0 ; st : t3, t4, t1, t2 |
||
150 | ; ------ |
||
151 | fld st2 ; st : t1, t3, t4, t1, t2 |
||
152 | fadd st0, st1 |
||
153 | fstp qword[ebx] ; st : t3, t4, t1, t2 |
||
154 | fsub st0, st2 ; st : t3-t1, t4, t1, t2 |
||
155 | fchs ; st : t1-t3, t4, t1, t2 |
||
156 | fstp qword[ebx+8*4] ; st : t4, t1, t2 |
||
157 | fst st1 ; st : t4, t4, t2 |
||
158 | fadd st0, st2 ; st : t2+t4, t4, t2 |
||
159 | fstp qword[ebx+8*8] ; st : t4, t2 |
||
160 | fsubp st1, st0 ; st : t2-t4 |
||
161 | fstp qword[ebx+8*12] ; st : |
||
162 | |||
163 | ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) |
||
164 | fld qword[ebx+8*2] |
||
165 | fld qword[ebx+8*6] |
||
166 | fld [_root] |
||
167 | fmul st1, st0 ; st : r, t2, t1 |
||
168 | fld qword[ebx+8*10] |
||
169 | fxch st1 ; st : r, t3, t2, t1 |
||
170 | fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 |
||
171 | ; ------ |
||
172 | fld st3 ; st : t1, t4, t3, t2, t1 |
||
173 | fadd st0, st3 ; |
||
174 | fadd st0, st2 ; |
||
175 | fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 |
||
176 | fsub st0, st3 ; |
||
177 | fsub st0, st3 ; |
||
178 | fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 |
||
179 | fld st3 ; st : t1, t4, t3, t2, t1 |
||
180 | fsub st0, st2 ; |
||
181 | fsub st0, st1 ; |
||
182 | fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 |
||
183 | fadd st0, st1 ; |
||
184 | faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 |
||
185 | fstp qword[ebx+8*6] ; store f[i+6] |
||
186 | fstp st0 ; st : t2, t1 |
||
187 | fstp st0 ; st : t1 |
||
188 | fstp st0 ; st : |
||
189 | |||
190 | ; -- odd subelements |
||
191 | fld qword[ebx+8*9] |
||
192 | fld qword[ebx+8*11] |
||
193 | fld st1 |
||
194 | fsub st0, st1 |
||
195 | fxch st1 |
||
196 | faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) |
||
197 | fld [_root2] |
||
198 | fmul st2, st0 |
||
199 | fmulp st1, st0 ; st : t9, t6 |
||
200 | fld qword[ebx+8*3] |
||
201 | fld st0 |
||
202 | fadd st0, st2 ; st : t1, f[l5], t9, t6 |
||
203 | fstp [_t1] |
||
204 | fsub st0, st1 |
||
205 | fstp [_t2] |
||
206 | fstp [_t9] ; (t9 never used) |
||
207 | fstp [_t6] ; st : |
||
208 | |||
209 | fld [_c1] |
||
210 | fld [_s1] |
||
211 | fld qword[ebx+8*5] |
||
212 | fld qword[ebx+8*7] |
||
213 | fld st3 ; st: c1, f[l6], f[l2], s1, c1 |
||
214 | fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c |
||
215 | fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c |
||
216 | fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c |
||
217 | faddp st1, st0 ; st: t5, f_6, f_2, s, c |
||
218 | fstp [_t5] ; st: f_6, f_2, s, c |
||
219 | fld st3 ; st: c, f_6, f_2, s, c |
||
220 | fmul st0, st1 |
||
221 | fld st3 |
||
222 | fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c |
||
223 | fsubp st1, st0 ; st: t8, f_6, f_2, s, c |
||
224 | fstp [_t8] ; st: f_6, f_2, s, c |
||
225 | fstp st0 ; st: f_2, s, c |
||
226 | fstp st0 ; st: s, c |
||
227 | |||
228 | fld qword[ebx+8*13] |
||
229 | fld qword[ebx+8*15] |
||
230 | fld st3 ; st: c1, f[l8], f[l4], s1, c1 |
||
231 | fmul st0, st1 |
||
232 | fld st3 |
||
233 | fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c |
||
234 | faddp st1, st0 ; st: t7, f_8, f_4, s, c |
||
235 | fld [_t5] ; st: t5, t7, f_8, f_4, s, c |
||
236 | fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c |
||
237 | fstp [_t4] |
||
238 | fstp [_t7] ; st: f_8, f_4, s, c |
||
239 | fld st3 ; st: c, f_8, f_4, s, c |
||
240 | fmul st0, st2 |
||
241 | fld st3 |
||
242 | fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c |
||
243 | fsubp st1, st0 ; st:-t0, f_8, f_4, s, c |
||
244 | fchs |
||
245 | fld [_t8] |
||
246 | fchs ; st:-t8, t0, f_8, f_4, s, c |
||
247 | fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c |
||
248 | fstp [_t3] |
||
249 | fstp [_t0] ; st: f_8, f_4, s, c |
||
250 | fstp st0 ; st: f_4, s, c |
||
251 | fstp st0 ; st: s, c |
||
252 | fstp st0 ; st: c |
||
253 | fstp st0 ; st: |
||
254 | |||
255 | fld [_t1] |
||
256 | fld [_t4] |
||
257 | fld st1 |
||
258 | fsub st0, st1 |
||
259 | fstp qword[ebx+8*11] ; f[l7] = t1-t4 |
||
260 | faddp st1, st0 |
||
261 | fstp qword[ebx+8*3] ; f[l5] = t1+t4 |
||
262 | fld [_t2] |
||
263 | fld [_t3] |
||
264 | fld st1 |
||
265 | fsub st0, st1 |
||
266 | fstp qword[ebx+8*15] ; f[l8] |
||
267 | faddp st1, st0 |
||
268 | fstp qword[ebx+8*7] ; f[l6] |
||
269 | |||
270 | fld [_t6] |
||
271 | fld qword[ebx+8] |
||
272 | fld st1 |
||
273 | fsub st0, st1 |
||
274 | fxch st1 |
||
275 | faddp st2, st0 ; st : t2, t1 |
||
276 | fld [_t8] |
||
277 | fsub [_t0] |
||
278 | fld [_t5] |
||
279 | fadd [_t7] ; st : t4, t3, t2, t1 |
||
280 | |||
281 | fld st3 |
||
282 | fsub st0, st1 |
||
283 | fstp qword[ebx+8*9] ; f[l3] = t1-t4 |
||
284 | fadd st0, st3 |
||
285 | fstp qword[ebx+8] ; f[l1] = t1+t4 |
||
286 | fld st1 ; st : t2, t3, t2, t1 |
||
287 | fsub st0, st1 ; f[l4] = t2-t3 |
||
288 | fstp qword[ebx+8*13] ; st : t3, t2, t1 |
||
289 | faddp st1, st0 ; st : t2+t3, t1 |
||
290 | fstp qword[ebx+8*5] ; f[l2] = t2+t3 |
||
291 | fstp st0 ; st : |
||
292 | |||
293 | add ebx, 16*8 |
||
294 | cmp ebx, eax |
||
295 | jb .loop_i |
||
296 | |||
297 | ret |
||
298 | |||
299 | align 8 ; shared local vars |
||
300 | _t0 dq 0 |
||
301 | _t1 dq 0 |
||
302 | _t2 dq 0 |
||
303 | _t3 dq 0 |
||
304 | _t4 dq 0 |
||
305 | _t5 dq 0 |
||
306 | _t6 dq 0 |
||
307 | _t7 dq 0 |
||
308 | _t8 dq 0 |
||
309 | _t9 dq 0 |
||
310 | |||
311 | |||
312 | |||
313 | ;=================================================================== |
||
314 | step3: |
||
315 | ;=================================================================== |
||
316 | |||
317 | ; 283 : { |
||
318 | |||
319 | |||
320 | ; 293 : for (l=3; l<=p; l++) |
||
321 | mov cx, 0x0200 |
||
322 | .newstep: |
||
323 | inc ch |
||
324 | cmp ch, Power_of_4 |
||
325 | jg .done |
||
326 | mov [.step], cx |
||
327 | |||
328 | ; 294 : { |
||
329 | ; 295 : d1 = 1 << (l + l - 3); |
||
330 | |||
331 | mov cl, ch |
||
332 | add cl, cl |
||
333 | sub cl, 3 |
||
334 | mov edx, 1 |
||
335 | shl edx, cl |
||
336 | mov [.d1], edx |
||
337 | |||
338 | ; 296 : d2 = d1 << 1; |
||
339 | shl edx, 1 |
||
340 | mov [.d2], edx |
||
341 | mov eax, edx |
||
342 | |||
343 | ; 297 : d3 = d2 << 1; |
||
344 | shl edx, 1 |
||
345 | mov [.d3], edx |
||
346 | |||
347 | ; 298 : d4 = d2 + d3; |
||
348 | add eax, edx |
||
349 | mov [.d4], eax |
||
350 | |||
351 | ; 299 : d5 = d3 << 1; |
||
352 | shl edx, 1 |
||
353 | mov [.d5], edx |
||
354 | shl edx, 3 |
||
355 | mov [.d6], edx ; d6 = d5*8 to simplify index operations |
||
356 | |||
357 | ; 339 : j5 = N / d5; ; moved out of internal loop |
||
358 | mov cl, Power_of_4 |
||
359 | sub cl, ch |
||
360 | add cl, cl |
||
361 | mov edx, 1 |
||
362 | shl edx, cl |
||
363 | mov [.j5], edx |
||
364 | |||
365 | ; 300 : |
||
366 | ; 301 : for (j=0; j |
||
367 | mov esi, [_f] |
||
368 | mov ebx, esi |
||
369 | add esi, NumPoints*8 |
||
370 | mov [.end_of_array], esi |
||
371 | |||
372 | .next_j: |
||
373 | |||
374 | ; { |
||
375 | ; t1 = f[j] + f[j+d2]; |
||
376 | mov eax, [.d2] |
||
377 | fld qword[ebx] |
||
378 | fld qword[ebx+eax*8] |
||
379 | fld st1 |
||
380 | fadd st0, st1 |
||
381 | fstp [_t1] |
||
382 | |||
383 | ; t2 = f[j] - f[j+d2]; |
||
384 | fsubp st1, st0 |
||
385 | fstp [_t2] |
||
386 | |||
387 | ; t3 = f[j+d3] + f[j+d4]; |
||
388 | mov edi, [.d3] |
||
389 | fld qword[ebx+edi*8] |
||
390 | mov edx, [.d4] |
||
391 | fld qword[ebx+edx*8] |
||
392 | fld st1 |
||
393 | fsub st0, st1 ; st : t4, f4, f3 |
||
394 | fxch st1 ; st : f4, t4, f3 |
||
395 | |||
396 | ; t4 = f[j+d3] - f[j+d4]; |
||
397 | faddp st2, st0 ; st : t4, t3 |
||
398 | |||
399 | ; f[j+d4] = t2 - t4; |
||
400 | ; f[j+d3] = t2 + t4; |
||
401 | fld [_t2] |
||
402 | fld st0 |
||
403 | fsub st0, st2 ; st : f4, t2, t4, t3 |
||
404 | fstp qword[ebx+edx*8] ; st : t2, t4, t3 |
||
405 | fadd st0, st1 ; st : f3, t4, t3 |
||
406 | fstp qword[ebx+edi*8] ; st : t4, t3 |
||
407 | |||
408 | ; f[j+d2] = t1 - t3; |
||
409 | ; f[j] = t1 + t3; |
||
410 | fld [_t1] |
||
411 | fst st1 |
||
412 | fsub st0, st2 ; st : f2, t1, t3 |
||
413 | fstp qword[ebx+eax*8] ; st : t1, t3 |
||
414 | fadd st0, st1 ; st : f0, t3 |
||
415 | fstp qword[ebx] ; st : t3 |
||
416 | fstp st0 |
||
417 | |||
418 | ; jj = j + d1; / ?? |
||
419 | mov edi, [.d1] |
||
420 | shl edi, 3 ; = d1*8 |
||
421 | mov edx, edi |
||
422 | mov eax, edi |
||
423 | add eax, eax ; eax = d2*8 |
||
424 | shl edx, 2 ; = d3*8 |
||
425 | add edi, ebx ; now [edi] points to f[jj] |
||
426 | add edx, edi ; and [edx] points to f[jj+d3] |
||
427 | |||
428 | ; t1 = f[jj]; |
||
429 | fld qword [edi] ; st : t1 |
||
430 | ; t3 = f[jj+d3]; |
||
431 | fld qword [edx] ; st : t3, t1 |
||
432 | |||
433 | ; t2 = f[jj+d2] * r; |
||
434 | fld qword [edi+eax] |
||
435 | fld [_root] |
||
436 | fmul st1, st0 ; st : r, t2, t3, t1 |
||
437 | ; t4 = f[jj+d4] * r |
||
438 | fmul qword [edx+eax] ; st : t4, t2, t3, t1 |
||
439 | |||
440 | ; f[jj] = t1 + t2 + t3; |
||
441 | fld st3 ; st : t1, t4, t2, t3, t1 |
||
442 | fadd st0, st3 |
||
443 | fadd st0, st2 |
||
444 | fstp qword [edi] |
||
445 | |||
446 | ; f[jj+d2] = t1 - t3 + t4; |
||
447 | fld st3 |
||
448 | fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1 |
||
449 | fld st0 |
||
450 | fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1 |
||
451 | fstp qword [edi+eax] |
||
452 | ; f[jj+d4] = t1 - t3 - t4; |
||
453 | fsub st0, st1 ; st : f4, t4, t2, t3, t1 |
||
454 | fstp qword [edx+eax] |
||
455 | |||
456 | ; f[jj+d3] = t1 - t2 + t3; |
||
457 | fstp st0 ; st : t2, t3, t1 |
||
458 | fsubp st1, st0 ; st : (t3-t2), t1 |
||
459 | faddp st1, st0 ; st : f3 |
||
460 | fstp qword [edx] |
||
461 | |||
462 | ; for (k=1; k |
||
463 | xor ecx, ecx ; ecx = k |
||
464 | mov [.jj], ecx |
||
465 | .next_k: |
||
466 | inc ecx |
||
467 | cmp ecx, [.d1] |
||
468 | jge .done_k |
||
469 | ; { |
||
470 | mov eax, [.d2] ; the sector increment |
||
471 | ; l1 = j + k; |
||
472 | mov edx, ecx |
||
473 | mov [.l1], edx ; [ebx+edx*8] --> f[j+k] |
||
474 | ; l2 = l1 + d2; |
||
475 | add edx, eax |
||
476 | mov [.l2], edx |
||
477 | ; l3 = l1 + d3; |
||
478 | add edx, eax |
||
479 | mov [.l3], edx |
||
480 | ; l4 = l1 + d4; |
||
481 | add edx, eax |
||
482 | mov [.l4], edx |
||
483 | |||
484 | ; l5 = j + d2 - k; |
||
485 | mov edx, eax |
||
486 | sub edx, ecx |
||
487 | mov [.l5], edx |
||
488 | ; l6 = l5 + d2; |
||
489 | add edx, eax |
||
490 | mov [.l6], edx |
||
491 | ; l7 = l5 + d3; |
||
492 | add edx, eax |
||
493 | mov [.l7], edx |
||
494 | ; l8 = l5 + d4; |
||
495 | add edx, eax |
||
496 | mov [.l8], edx |
||
497 | |||
498 | |||
499 | ; 340 : j5 *= k; // add-substituted multiplication |
||
500 | mov eax, [.jj] |
||
501 | add eax, [.j5] |
||
502 | mov [.jj], eax |
||
503 | |||
504 | ; c1 = C[jj]; |
||
505 | ; s1 = S[jj]; |
||
506 | mov edi, [_Cosins] |
||
507 | fld qword[edi+eax*8] |
||
508 | mov esi, [_Sines] |
||
509 | fld qword[esi+eax*8] ; st : s1, c1 |
||
510 | |||
511 | ; t5 = f[l2] * c1 + f[l6] * s1; |
||
512 | ; t8 = f[l6] * c1 - f[l2] * s1; |
||
513 | mov edx, [.l6] |
||
514 | fld qword[ebx+edx*8] |
||
515 | mov edx, [.l2] |
||
516 | fld st0 |
||
517 | fmul st0, st2 |
||
518 | fxch st1 |
||
519 | fmul st0, st3 |
||
520 | fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c |
||
521 | fmul st4, st0 |
||
522 | fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c |
||
523 | fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c |
||
524 | fstp [_t8] |
||
525 | faddp st2, st0 ; st : f[l2]*s, t5 |
||
526 | fstp st0 ; st : t5 |
||
527 | fstp [_t5] ; st : |
||
528 | |||
529 | ; c2 = C[2*jj]; |
||
530 | ; s2 = S[2*jj]; |
||
531 | shl eax, 1 |
||
532 | fld qword[edi+eax*8] |
||
533 | fld qword[esi+eax*8] ; st : s2, c2 |
||
534 | |||
535 | ; t6 = f[l3] * c2 + f[l7] * s2; |
||
536 | ; t9 = f[l7] * c2 - f[l3] * s2; |
||
537 | mov edx, [.l7] |
||
538 | fld qword[ebx+edx*8] |
||
539 | mov edx, [.l3] |
||
540 | fld st0 |
||
541 | fmul st0, st2 |
||
542 | fxch st1 |
||
543 | fmul st0, st3 |
||
544 | fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c |
||
545 | fmul st4, st0 |
||
546 | fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c |
||
547 | fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c |
||
548 | fstp [_t9] |
||
549 | faddp st2, st0 ; st : f[l2]*s, t6 |
||
550 | fstp st0 ; st : t6 |
||
551 | fstp [_t6] ; st : |
||
552 | |||
553 | ; c3 = C[3*jj]; |
||
554 | ; s3 = S[3*jj]; |
||
555 | add eax, [.jj] |
||
556 | fld qword[edi+eax*8] |
||
557 | fld qword[esi+eax*8] ; st : s3, c3 |
||
558 | |||
559 | ; t7 = f[l4] * c3 + f[l8] * s3; |
||
560 | ; t0 = f[l8] * c3 - f[l4] * s3; |
||
561 | mov edx, [.l8] |
||
562 | fld qword[ebx+edx*8] |
||
563 | mov edx, [.l4] |
||
564 | fld st0 |
||
565 | fmul st0, st2 |
||
566 | fxch st1 |
||
567 | fmul st0, st3 |
||
568 | fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c |
||
569 | fmul st4, st0 |
||
570 | fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c |
||
571 | fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c |
||
572 | fstp [_t0] |
||
573 | faddp st2, st0 ; st : f[l2]*s, t7 |
||
574 | fstp st0 ; st : t7 |
||
575 | fstp [_t7] ; st : |
||
576 | |||
577 | ; t1 = f[l5] - t9; |
||
578 | ; t2 = f[l5] + t9; |
||
579 | mov eax, [.l5] |
||
580 | fld qword [ebx+eax*8] |
||
581 | fld [_t9] |
||
582 | fld st0 |
||
583 | fadd st0, st2 |
||
584 | fstp [_t2] |
||
585 | fsubp st1, st0 |
||
586 | fstp [_t1] |
||
587 | |||
588 | ; t3 = - t8 - t0; |
||
589 | fld [_t8] |
||
590 | fadd [_t0] |
||
591 | fchs |
||
592 | fstp [_t3] |
||
593 | ; t4 = t5 - t7; |
||
594 | fld [_t5] |
||
595 | fsub [_t7] |
||
596 | fstp [_t4] |
||
597 | |||
598 | ; f[l5] = t1 + t4; |
||
599 | fld [_t1] |
||
600 | fld [_t4] |
||
601 | fld st0 |
||
602 | fadd st0, st2 |
||
603 | fstp qword [ebx+eax*8] |
||
604 | ; f[l7] = t1 - t4; |
||
605 | mov eax, [.l7] |
||
606 | fsubp st1, st0 |
||
607 | fstp qword [ebx+eax*8] |
||
608 | |||
609 | ; f[l6] = t2 + t3; |
||
610 | mov eax, [.l6] |
||
611 | fld [_t2] |
||
612 | fld [_t3] |
||
613 | fld st0 |
||
614 | fadd st0, st2 |
||
615 | fstp qword [ebx+eax*8] |
||
616 | ; f[l8] = t2 - t3; |
||
617 | mov eax, [.l8] |
||
618 | fsubp st1, st0 |
||
619 | fstp qword [ebx+eax*8] |
||
620 | |||
621 | ; t1 = f[l1] + t6; |
||
622 | mov eax, [.l1] |
||
623 | fld qword [ebx+eax*8] |
||
624 | fld [_t6] |
||
625 | fld st0 |
||
626 | fadd st0, st2 |
||
627 | fstp [_t1] |
||
628 | ; t2 = f[l1] - t6; |
||
629 | fsubp st1, st0 |
||
630 | fstp [_t2] |
||
631 | |||
632 | ; t3 = t8 - t0; |
||
633 | fld [_t8] |
||
634 | fsub [_t0] |
||
635 | fstp [_t3] |
||
636 | ; t4 = t5 + t7; |
||
637 | fld [_t5] |
||
638 | fadd [_t7] |
||
639 | fstp [_t4] |
||
640 | |||
641 | ; f[l1] = t1 + t4; |
||
642 | mov eax, [.l1] |
||
643 | fld [_t1] |
||
644 | fld [_t4] |
||
645 | fld st0 |
||
646 | fadd st0, st2 |
||
647 | fstp qword [ebx+eax*8] |
||
648 | ; f[l3] = t1 - t4; |
||
649 | mov eax, [.l3] |
||
650 | fsubp st1, st0 |
||
651 | fstp qword [ebx+eax*8] |
||
652 | |||
653 | ; f[l2] = t2 + t3; |
||
654 | mov eax, [.l2] |
||
655 | fld [_t2] |
||
656 | fld [_t3] |
||
657 | fld st0 |
||
658 | fadd st0, st2 |
||
659 | fstp qword [ebx+eax*8] |
||
660 | ; f[l4] = t2 - t3; |
||
661 | mov eax, [.l4] |
||
662 | fsubp st1, st0 |
||
663 | fstp qword [ebx+eax*8] |
||
664 | |||
665 | ; 374 : } |
||
666 | jmp .next_k |
||
667 | |||
668 | .done_k: |
||
669 | ; 375 : } |
||
670 | add ebx, [.d6] ; d6 = d5*8 |
||
671 | cmp ebx, [.end_of_array] |
||
672 | jb .next_j |
||
673 | |||
674 | ; 376 : } |
||
675 | mov cx, [.step] |
||
676 | jmp .newstep |
||
677 | .done: |
||
678 | |||
679 | ; 377 : } |
||
680 | ret |
||
681 | |||
682 | align 4 |
||
683 | .l1 dd 0 |
||
684 | .l2 dd 0 |
||
685 | .l3 dd 0 |
||
686 | .l4 dd 0 |
||
687 | .l5 dd 0 |
||
688 | .l6 dd 0 |
||
689 | .l7 dd 0 |
||
690 | .l8 dd 0 |
||
691 | .l9 dd 0 |
||
692 | .l0 dd 0 |
||
693 | .d1 dd 0 |
||
694 | .d2 dd 0 |
||
695 | .d3 dd 0 |
||
696 | .d4 dd 0 |
||
697 | .d5 dd 0 |
||
698 | .d6 dd 0 |
||
699 | .j5 dd 0 |
||
700 | .jj dd 0 |
||
701 | .end_of_array dd 0 |
||
702 | .step dw 0 |
||
703 | |||
704 | align 8 |
||
705 | |||
706 | ;=========== Step3 ends here =========== |
||
707 | |||
708 | |||
709 | |||
710 | |||
711 | |||
712 | ; ================================================================= |
||
713 | ; syscall entry |
||
714 | ; |
||
715 | _f dd ? |
||
716 | _N dd 1024 ; number of points |
||
717 | |||
718 | _a dd ? ; initial data array |
||
719 | x dd 0 ; tranformed (float) data array |
||
720 | _Cosins dd 0 |
||
721 | _Sines dd 0 |
||
722 | |||
723 | FFT4: |
||
724 | or al, al |
||
725 | jnz .trans |
||
726 | mov cl, Power_of_4 |
||
727 | mov eax, 1 |
||
728 | shl eax, cl |
||
729 | shl eax, cl |
||
730 | mov [_N], eax |
||
731 | shl eax, 2 ; size of Sine table in bytes |
||
732 | add eax, ebx |
||
733 | mov [_Sines], ebx |
||
734 | mov [_Cosins], eax |
||
735 | cpuid |
||
736 | rdtsc |
||
737 | mov [.time], eax |
||
738 | call MakeSinCosTable |
||
739 | cpuid |
||
740 | rdtsc |
||
741 | sub eax, [.time] |
||
742 | ret |
||
743 | .trans: |
||
744 | mov [x], ebx |
||
745 | mov [_f], ebx |
||
746 | cli ;----- |
||
747 | cpuid |
||
748 | rdtsc |
||
749 | mov [.time], eax |
||
750 | call BitInvert |
||
751 | call step1 |
||
752 | call step2 |
||
753 | call step3 |
||
754 | cpuid |
||
755 | rdtsc |
||
756 | sti ;---- |
||
757 | sub eax, [.time] |
||
758 | ret |
||
759 | |||
760 | .time dd 0><>><>><>><>=p;> |