Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
2210 | art_zh | 1 | ; Fast Hartley Transform routine |
2 | ; Copyright (C) 1999, 2004, 2010 |
||
3 | ; Artem Jerdev artem@jerdev.co.uk |
||
4 | ; |
||
5 | ; free KolibriOS version - not to be ported to other OSes |
||
6 | ; ========================================================== |
||
7 | |||
8 | |||
9 | ; global constants |
||
10 | align 8 |
||
11 | _r dq 1.41421356237309504880169 ; = sqrt(2) |
||
12 | _r2 dq 0.70710678118654752440084 ; = sqrt(2)/2 |
||
13 | _c1 dq 0.92387953251128675612818 ; = cos(pi/8) |
||
14 | _s1 dq 0.38268343236508977172846 ; = sin(pi/8) |
||
15 | |||
16 | ;================================================================= |
||
17 | ; parameter1: |
||
18 | ; -- reg dl (bits[3:0]) = Power_of_4 |
||
19 | ; returns: |
||
20 | ; -- reg edx = _CosTable address (4k-aligned) |
||
21 | ; assumes: _SinTable = _CosTable + (N/2)*8 |
||
22 | ; user heap has to be initialized |
||
23 | ; destroys: |
||
24 | ; -- eax, ebx, ecx |
||
25 | ;; ========================== |
||
26 | align 4 |
||
27 | CreateSinCosTable: |
||
28 | xor eax, eax |
||
29 | inc eax |
||
30 | mov cl, dl |
||
31 | and cl, 15 |
||
32 | shl eax, cl |
||
33 | shl eax, cl |
||
34 | mov ecx, eax ; now ecx = N |
||
35 | shl ecx, 3 |
||
36 | mov ebx, 12 |
||
37 | mov eax, 68 |
||
38 | int 0x40 ; getmem(N*sizeof(double)) |
||
39 | |||
40 | mov edx, eax ; edx = _CosTable |
||
41 | shr ecx, 1 |
||
42 | mov ebx, eax |
||
43 | add ebx, ecx ; ebx = _SinTable |
||
44 | shr ecx, 3 |
||
45 | push ecx ; [esp] = ecx = N/2 |
||
46 | |||
47 | xor eax, eax |
||
48 | fldpi |
||
49 | fidiv dword[esp] ; st : dx = 2*pi/N |
||
50 | pop ecx |
||
51 | fldz ; st : 0, dx |
||
52 | .loop: |
||
53 | fld st0 ; st : x, x, dx |
||
54 | FSINCOS ; st : cos, sin, x, dx |
||
55 | fstp qword [edx+eax*8] ; st : sin, x, dx |
||
56 | fstp qword [ebx+eax*8] ; st : x, dx |
||
57 | fadd st0, st1 ; st : x+dx, dx |
||
58 | |||
59 | inc eax |
||
60 | cmp eax, ecx |
||
61 | jne .loop |
||
62 | fstp st0 ; st : dx |
||
63 | fstp st0 ; st : |
||
64 | ret |
||
65 | |||
66 | ;================================================================= |
||
67 | ; parameter1: |
||
68 | ; -- reg edx = _CosTable address |
||
69 | ; destroys: |
||
70 | ; -- eax, ebx, ecx |
||
71 | ;; ========================== |
||
72 | align 4 |
||
73 | DestroySinCosTable: |
||
74 | mov ecx, edx |
||
75 | mov ebx, 13 |
||
76 | mov eax, 68 |
||
77 | int 0x40 ; free(SinCosTable) |
||
78 | ret |
||
79 | |||
80 | ;================================================================= |
||
81 | ; parameter1: |
||
82 | ; -- reg dl (bits[3:0]) = Power_of_4 |
||
83 | ; -- reg edx && (-16) = 4k-aligned data array address |
||
84 | ; returns: |
||
85 | ; -- edx = Power_of_4 |
||
86 | ; -- ecx = N |
||
87 | ; destroys: |
||
88 | ; -- eax, ebx, ecx, edx, esi |
||
89 | ;; ========================== |
||
90 | align 4 |
||
91 | BitInvert: |
||
92 | mov esi, edx |
||
93 | and esi, 0xFFFFFFF0 |
||
94 | and edx, 0x0F |
||
95 | push edx |
||
96 | mov cl, dl |
||
97 | xor eax, eax |
||
98 | inc eax |
||
99 | shl eax, cl |
||
100 | shl eax, cl |
||
101 | push eax |
||
102 | xor ecx, ecx ; index term |
||
103 | .newterm: |
||
104 | inc ecx |
||
105 | cmp ecx, [esp] ; N |
||
106 | jge .done |
||
107 | |||
108 | xor eax, eax |
||
109 | mov edx, ecx |
||
110 | xor bl, bl |
||
111 | |||
112 | .do_invert: |
||
113 | inc bl |
||
114 | cmp bl, byte[esp+4] ; Power_of_4 |
||
115 | jg .switch |
||
116 | |||
117 | mov bh, dl |
||
118 | and bh, 3 |
||
119 | shl eax, 2 |
||
120 | or al, bh |
||
121 | shr edx, 2 |
||
122 | jmp .do_invert |
||
123 | |||
124 | .switch: |
||
125 | cmp eax, ecx |
||
126 | jle .newterm |
||
127 | |||
128 | fld qword [esi+eax*8] |
||
129 | fld qword [esi+ecx*8] |
||
130 | fstp qword [esi+eax*8] |
||
131 | fstp qword [esi+ecx*8] |
||
132 | jmp .newterm |
||
133 | |||
134 | .done: |
||
135 | pop ecx |
||
136 | pop edx |
||
137 | ret |
||
138 | |||
139 | ;================================================================= |
||
140 | |||
141 | |||
142 | ;================================================================= |
||
143 | ; stdcall parameters: |
||
144 | ; -- [esp+4] = N |
||
145 | ; -- [esp+8] = 4k-aligned data array address |
||
146 | ; returns: |
||
147 | ; -- nothing |
||
148 | ; destroys: |
||
149 | ; -- ebx, esi |
||
150 | ;; ========================== |
||
151 | align 4 |
||
152 | step1: |
||
153 | mov ebx, [esp+8] |
||
154 | mov esi, [esp+4] |
||
155 | shl esi, 3 |
||
156 | add esi, ebx |
||
157 | |||
158 | .loop: |
||
159 | fld qword[ebx] |
||
160 | fld qword[ebx+8] |
||
161 | fld st1 |
||
162 | fsub st0, st1 ; st : t2, f[i+1], f[i] |
||
163 | fxch st1 ; st : f[i+1], t2, f[i] |
||
164 | faddp st2, st0 ; st : t2, t1 |
||
165 | fld qword[ebx+16] |
||
166 | fld qword[ebx+24] |
||
167 | fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 |
||
168 | fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 |
||
169 | fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 |
||
170 | fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 |
||
171 | fstp st1 ; st : t4, t3, t2, t1 |
||
172 | fld st2 ; st : t2, t4, t3, t2, t1 |
||
173 | fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 |
||
174 | fstp qword[ebx+16] ; st : t4, t3, t2, t1 |
||
175 | fsubp st2, st0 ; st : t3, t2-t4, t1 |
||
176 | fld st2 ; st : t1, t3, t2-t4, t1 |
||
177 | fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 |
||
178 | fstp qword[ebx] ; st : t3, t2-t4, t1 |
||
179 | fsubp st2, st0 ; st : t2-t4, t1-t3 |
||
180 | fstp qword[ebx+24] ; st : t1-t3 |
||
181 | fstp qword[ebx+8] ; st : |
||
182 | |||
183 | add ebx, 32 |
||
184 | cmp ebx, esi |
||
185 | jnz .loop |
||
186 | ret |
||
187 | |||
188 | ; local stack definitions |
||
189 | ;=========================================================================== |
||
190 | _t0 equ dword [esp] |
||
191 | _t1 equ dword[esp+4] |
||
192 | _t2 equ dword[esp+8] |
||
193 | _t3 equ dword[esp+12] |
||
194 | _t4 equ dword[esp+16] |
||
195 | _t5 equ dword[esp+20] |
||
196 | _t6 equ dword[esp+24] |
||
197 | _t7 equ dword[esp+28] |
||
198 | _t8 equ dword[esp+32] |
||
199 | _t9 equ dword[esp+36] |
||
200 | |||
201 | _l1 equ dword[esp+40] |
||
202 | _l2 equ dword[esp+44] |
||
203 | _l3 equ dword[esp+48] |
||
204 | _l4 equ dword[esp+52] |
||
205 | _l5 equ dword[esp+56] |
||
206 | _l6 equ dword[esp+60] |
||
207 | _l7 equ dword[esp+64] |
||
208 | _l8 equ dword[esp+68] |
||
209 | _l9 equ dword[esp+72] |
||
210 | _l0 equ dword[esp+76] |
||
211 | _d1 equ dword[esp+80] |
||
212 | _d2 equ dword[esp+84] |
||
213 | _d3 equ dword[esp+88] |
||
214 | _d4 equ dword[esp+92] |
||
215 | _d5 equ dword[esp+96] |
||
216 | _d6 equ dword[esp+100] |
||
217 | _j5 equ dword[esp+104] |
||
218 | _jj equ dword[esp+108] |
||
219 | _end_of_array equ dword[esp+112] |
||
220 | _step equ word [esp+116] |
||
221 | |||
222 | |||
223 | ;================================================================= |
||
224 | ; cdecl parameters: |
||
225 | ; -- [ebp+8] = N |
||
226 | ; -- [ebp+12] = 4k-aligned data array address |
||
227 | ; returns: |
||
228 | ; -- nothing |
||
229 | ; destroys: |
||
230 | ; -- eax, ebx |
||
231 | ; locals: |
||
232 | ; -- 10 stack-located dwords (_t0 ... _t9) |
||
233 | ;; ========================== |
||
234 | align 4 |
||
235 | step2: |
||
236 | push ebp |
||
237 | mov ebp, esp |
||
238 | sub esp, 40 |
||
239 | mov ebx, [ebp+12] |
||
240 | mov eax, [ebp+ 8] |
||
241 | shl eax, 3 |
||
242 | add eax, ebx |
||
243 | |||
244 | .loop_i: |
||
245 | |||
246 | ; -- quad subelements +0, +4, +8 and +12 (simpliest operations) |
||
247 | fld qword[ebx] |
||
248 | fld qword[ebx+8*4] |
||
249 | fld st0 |
||
250 | fadd st0, st2 ; st : t1, f_4, f_0 |
||
251 | fxch st1 |
||
252 | fsubp st2, st0 ; st : t1, t2 |
||
253 | fld qword[ebx+8*8] |
||
254 | fld qword[ebx+8*12] |
||
255 | fld st0 |
||
256 | fadd st0, st2 ; st : t3, f_12, t1, t2 |
||
257 | fxch st1 |
||
258 | fsubp st2, st0 ; st : t3, t4, t1, t2 |
||
259 | ; ------ |
||
260 | fld st2 ; st : t1, t3, t4, t1, t2 |
||
261 | fadd st0, st1 |
||
262 | fstp qword[ebx] ; st : t3, t4, t1, t2 |
||
263 | fsub st0, st2 ; st : t3-t1, t4, t1, t2 |
||
264 | fchs ; st : t1-t3, t4, t1, t2 |
||
265 | fstp qword[ebx+8*4] ; st : t4, t1, t2 |
||
266 | fst st1 ; st : t4, t4, t2 |
||
267 | fadd st0, st2 ; st : t2+t4, t4, t2 |
||
268 | fstp qword[ebx+8*8] ; st : t4, t2 |
||
269 | fsubp st1, st0 ; st : t2-t4 |
||
270 | fstp qword[ebx+8*12] ; st : |
||
271 | |||
272 | ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) |
||
273 | fld qword[ebx+8*2] |
||
274 | fld qword[ebx+8*6] |
||
275 | fld [_r] |
||
276 | fmul st1, st0 ; st : r, t2, t1 |
||
277 | fld qword[ebx+8*10] |
||
278 | fxch st1 ; st : r, t3, t2, t1 |
||
279 | fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 |
||
280 | ; ------ |
||
281 | fld st3 ; st : t1, t4, t3, t2, t1 |
||
282 | fadd st0, st3 ; |
||
283 | fadd st0, st2 ; |
||
284 | fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 |
||
285 | fsub st0, st3 ; |
||
286 | fsub st0, st3 ; |
||
287 | fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 |
||
288 | fld st3 ; st : t1, t4, t3, t2, t1 |
||
289 | fsub st0, st2 ; |
||
290 | fsub st0, st1 ; |
||
291 | fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 |
||
292 | fadd st0, st1 ; |
||
293 | faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 |
||
294 | fstp qword[ebx+8*6] ; store f[i+6] |
||
295 | fstp st0 ; st : t2, t1 |
||
296 | fstp st0 ; st : t1 |
||
297 | fstp st0 ; st : |
||
298 | |||
299 | ; -- odd subelements |
||
300 | fld qword[ebx+8*9] |
||
301 | fld qword[ebx+8*11] |
||
302 | fld st1 |
||
303 | fsub st0, st1 |
||
304 | fxch st1 |
||
305 | faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) |
||
306 | fld [_r2] |
||
307 | fmul st2, st0 |
||
308 | fmulp st1, st0 ; st : t9, t6 |
||
309 | fld qword[ebx+8*3] |
||
310 | fld st0 |
||
311 | fadd st0, st2 ; st : t1, f[l5], t9, t6 |
||
312 | fstp _t1 |
||
313 | fsub st0, st1 |
||
314 | fstp _t2 |
||
315 | fstp _t9 ; (t9 never used) |
||
316 | fstp _t6 ; st : |
||
317 | |||
318 | fld [_c1] |
||
319 | fld [_s1] |
||
320 | fld qword[ebx+8*5] |
||
321 | fld qword[ebx+8*7] |
||
322 | fld st3 ; st: c1, f[l6], f[l2], s1, c1 |
||
323 | fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c |
||
324 | fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c |
||
325 | fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c |
||
326 | faddp st1, st0 ; st: t5, f_6, f_2, s, c |
||
327 | fstp _t5 ; st: f_6, f_2, s, c |
||
328 | fld st3 ; st: c, f_6, f_2, s, c |
||
329 | fmul st0, st1 |
||
330 | fld st3 |
||
331 | fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c |
||
332 | fsubp st1, st0 ; st: t8, f_6, f_2, s, c |
||
333 | fstp _t8 ; st: f_6, f_2, s, c |
||
334 | fstp st0 ; st: f_2, s, c |
||
335 | fstp st0 ; st: s, c |
||
336 | |||
337 | fld qword[ebx+8*13] |
||
338 | fld qword[ebx+8*15] |
||
339 | fld st3 ; st: c1, f[l8], f[l4], s1, c1 |
||
340 | fmul st0, st1 |
||
341 | fld st3 |
||
342 | fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c |
||
343 | faddp st1, st0 ; st: t7, f_8, f_4, s, c |
||
344 | fld _t5 ; st: t5, t7, f_8, f_4, s, c |
||
345 | fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c |
||
346 | fstp _t4 |
||
347 | fstp _t7 ; st: f_8, f_4, s, c |
||
348 | fld st3 ; st: c, f_8, f_4, s, c |
||
349 | fmul st0, st2 |
||
350 | fld st3 |
||
351 | fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c |
||
352 | fsubp st1, st0 ; st:-t0, f_8, f_4, s, c |
||
353 | fchs |
||
354 | fld _t8 |
||
355 | fchs ; st:-t8, t0, f_8, f_4, s, c |
||
356 | fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c |
||
357 | fstp _t3 |
||
358 | fstp _t0 ; st: f_8, f_4, s, c |
||
359 | fstp st0 ; st: f_4, s, c |
||
360 | fstp st0 ; st: s, c |
||
361 | fstp st0 ; st: c |
||
362 | fstp st0 ; st: |
||
363 | |||
364 | fld _t1 |
||
365 | fld _t4 |
||
366 | fld st1 |
||
367 | fsub st0, st1 |
||
368 | fstp qword[ebx+8*11] ; f[l7] = t1-t4 |
||
369 | faddp st1, st0 |
||
370 | fstp qword[ebx+8*3] ; f[l5] = t1+t4 |
||
371 | fld _t2 |
||
372 | fld _t3 |
||
373 | fld st1 |
||
374 | fsub st0, st1 |
||
375 | fstp qword[ebx+8*15] ; f[l8] |
||
376 | faddp st1, st0 |
||
377 | fstp qword[ebx+8*7] ; f[l6] |
||
378 | |||
379 | fld _t6 |
||
380 | fld qword[ebx+8] |
||
381 | fld st1 |
||
382 | fsub st0, st1 |
||
383 | fxch st1 |
||
384 | faddp st2, st0 ; st : t2, t1 |
||
385 | fld _t8 |
||
386 | fsub _t0 |
||
387 | fld _t5 |
||
388 | fadd _t7 ; st : t4, t3, t2, t1 |
||
389 | |||
390 | fld st3 |
||
391 | fsub st0, st1 |
||
392 | fstp qword[ebx+8*9] ; f[l3] = t1-t4 |
||
393 | fadd st0, st3 |
||
394 | fstp qword[ebx+8] ; f[l1] = t1+t4 |
||
395 | fld st1 ; st : t2, t3, t2, t1 |
||
396 | fsub st0, st1 ; f[l4] = t2-t3 |
||
397 | fstp qword[ebx+8*13] ; st : t3, t2, t1 |
||
398 | faddp st1, st0 ; st : t2+t3, t1 |
||
399 | fstp qword[ebx+8*5] ; f[l2] = t2+t3 |
||
400 | fstp st0 ; st : |
||
401 | |||
402 | add ebx, 16*8 |
||
403 | cmp ebx, eax |
||
404 | jb .loop_i |
||
405 | |||
406 | mov esp, ebp |
||
407 | pop ebp |
||
408 | ret |
||
409 | |||
410 | |||
411 | |||
412 | |||
413 | ;================================================================= |
||
414 | ; cdecl parameters: |
||
415 | ; -- [ebp+8] = N |
||
416 | ; -- [ebp+12] = p |
||
417 | ; -- [ebp+16] = 4k-aligned data array address |
||
418 | ; -- [ebp+20] = 4k-aligned SinCosTable address |
||
419 | ; returns: |
||
420 | ; -- nothing |
||
421 | ; destroys: |
||
422 | ; -- all GPRegs |
||
423 | ; locals: |
||
424 | ; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step) |
||
425 | ;; ========================== |
||
426 | align 4 |
||
427 | step3: |
||
428 | push ebp |
||
429 | mov ebp, esp |
||
430 | sub esp, 120 |
||
431 | ; 283 : { |
||
432 | |||
433 | |||
434 | ; 293 : for (l=3; l<=p; l++) |
||
435 | mov cx, 0x0200 |
||
436 | .newstep: |
||
437 | inc ch |
||
438 | cmp ch, byte[ebp+12] |
||
439 | jg .done |
||
440 | mov _step, cx |
||
441 | |||
442 | ; 294 : { |
||
443 | ; 295 : d1 = 1 << (l + l - 3); |
||
444 | |||
445 | mov cl, ch |
||
446 | add cl, cl |
||
447 | sub cl, 3 |
||
448 | mov edx, 1 |
||
449 | shl edx, cl |
||
450 | mov _d1, edx |
||
451 | |||
452 | ; 296 : d2 = d1 << 1; |
||
453 | shl edx, 1 |
||
454 | mov _d2, edx |
||
455 | mov eax, edx |
||
456 | |||
457 | ; 297 : d3 = d2 << 1; |
||
458 | shl edx, 1 |
||
459 | mov _d3, edx |
||
460 | |||
461 | ; 298 : d4 = d2 + d3; |
||
462 | add eax, edx |
||
463 | mov _d4, eax |
||
464 | |||
465 | ; 299 : d5 = d3 << 1; |
||
466 | shl edx, 1 |
||
467 | mov _d5, edx |
||
468 | shl edx, 3 |
||
469 | mov _d6, edx ; d6 = d5*8 to simplify index operations |
||
470 | |||
471 | ; 339 : j5 = N / d5; ; moved out of internal loop |
||
472 | mov cl, [ebp+12] |
||
473 | sub cl, ch |
||
474 | add cl, cl |
||
475 | mov edx, 1 |
||
476 | shl edx, cl |
||
477 | mov _j5, edx |
||
478 | |||
479 | ; 300 : |
||
480 | ; 301 : for (j=0; j |
||
481 | mov ebx, [ebp+16] |
||
482 | mov esi, [ebp+8] |
||
483 | shl esi, 3 |
||
484 | add esi, ebx |
||
485 | mov _end_of_array, esi |
||
486 | |||
487 | .next_j: |
||
488 | |||
489 | ; { |
||
490 | ; t1 = f[j] + f[j+d2]; |
||
491 | mov eax, _d2 |
||
492 | fld qword[ebx] |
||
493 | fld qword[ebx+eax*8] |
||
494 | fld st1 |
||
495 | fadd st0, st1 |
||
496 | fstp _t1 |
||
497 | |||
498 | ; t2 = f[j] - f[j+d2]; |
||
499 | fsubp st1, st0 |
||
500 | fstp _t2 |
||
501 | |||
502 | ; t3 = f[j+d3] + f[j+d4]; |
||
503 | mov edi, _d3 |
||
504 | fld qword[ebx+edi*8] |
||
505 | mov edx, _d4 |
||
506 | fld qword[ebx+edx*8] |
||
507 | fld st1 |
||
508 | fsub st0, st1 ; st : t4, f4, f3 |
||
509 | fxch st1 ; st : f4, t4, f3 |
||
510 | |||
511 | ; t4 = f[j+d3] - f[j+d4]; |
||
512 | faddp st2, st0 ; st : t4, t3 |
||
513 | |||
514 | ; f[j+d4] = t2 - t4; |
||
515 | ; f[j+d3] = t2 + t4; |
||
516 | fld _t2 |
||
517 | fld st0 |
||
518 | fsub st0, st2 ; st : f4, t2, t4, t3 |
||
519 | fstp qword[ebx+edx*8] ; st : t2, t4, t3 |
||
520 | fadd st0, st1 ; st : f3, t4, t3 |
||
521 | fstp qword[ebx+edi*8] ; st : t4, t3 |
||
522 | |||
523 | ; f[j+d2] = t1 - t3; |
||
524 | ; f[j] = t1 + t3; |
||
525 | fld _t1 |
||
526 | fst st1 |
||
527 | fsub st0, st2 ; st : f2, t1, t3 |
||
528 | fstp qword[ebx+eax*8] ; st : t1, t3 |
||
529 | fadd st0, st1 ; st : f0, t3 |
||
530 | fstp qword[ebx] ; st : t3 |
||
531 | fstp st0 |
||
532 | |||
533 | ; jj = j + d1; / ?? |
||
534 | mov edi, _d1 |
||
535 | shl edi, 3 ; = d1*8 |
||
536 | mov edx, edi |
||
537 | mov eax, edi |
||
538 | add eax, eax ; eax = d2*8 |
||
539 | shl edx, 2 ; = d3*8 |
||
540 | add edi, ebx ; now [edi] points to f[jj] |
||
541 | add edx, edi ; and [edx] points to f[jj+d3] |
||
542 | |||
543 | ; t1 = f[jj]; |
||
544 | fld qword [edi] ; st : t1 |
||
545 | ; t3 = f[jj+d3]; |
||
546 | fld qword [edx] ; st : t3, t1 |
||
547 | |||
548 | ; t2 = f[jj+d2] * r; |
||
549 | fld qword [edi+eax] |
||
550 | fld [_r] |
||
551 | fmul st1, st0 ; st : r, t2, t3, t1 |
||
552 | ; t4 = f[jj+d4] * r |
||
553 | fmul qword [edx+eax] ; st : t4, t2, t3, t1 |
||
554 | |||
555 | ; f[jj] = t1 + t2 + t3; |
||
556 | fld st3 ; st : t1, t4, t2, t3, t1 |
||
557 | fadd st0, st3 |
||
558 | fadd st0, st2 |
||
559 | fstp qword [edi] |
||
560 | |||
561 | ; f[jj+d2] = t1 - t3 + t4; |
||
562 | fld st3 |
||
563 | fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1 |
||
564 | fld st0 |
||
565 | fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1 |
||
566 | fstp qword [edi+eax] |
||
567 | ; f[jj+d4] = t1 - t3 - t4; |
||
568 | fsub st0, st1 ; st : f4, t4, t2, t3, t1 |
||
569 | fstp qword [edx+eax] |
||
570 | |||
571 | ; f[jj+d3] = t1 - t2 + t3; |
||
572 | fstp st0 ; st : t2, t3, t1 |
||
573 | fsubp st1, st0 ; st : (t3-t2), t1 |
||
574 | faddp st1, st0 ; st : f3 |
||
575 | fstp qword [edx] |
||
576 | |||
577 | ; for (k=1; k |
||
578 | xor ecx, ecx ; ecx = k |
||
579 | mov _jj, ecx |
||
580 | .next_k: |
||
581 | inc ecx |
||
582 | cmp ecx, _d1 |
||
583 | jge .done_k |
||
584 | ; { |
||
585 | mov eax, _d2 ; the sector increment |
||
586 | ; l1 = j + k; |
||
587 | mov edx, ecx |
||
588 | mov _l1, edx ; [ebx+edx*8] --> f[j+k] |
||
589 | ; l2 = l1 + d2; |
||
590 | add edx, eax |
||
591 | mov _l2, edx |
||
592 | ; l3 = l1 + d3; |
||
593 | add edx, eax |
||
594 | mov _l3, edx |
||
595 | ; l4 = l1 + d4; |
||
596 | add edx, eax |
||
597 | mov _l4, edx |
||
598 | |||
599 | ; l5 = j + d2 - k; |
||
600 | mov edx, eax |
||
601 | sub edx, ecx |
||
602 | mov _l5, edx |
||
603 | ; l6 = l5 + d2; |
||
604 | add edx, eax |
||
605 | mov _l6, edx |
||
606 | ; l7 = l5 + d3; |
||
607 | add edx, eax |
||
608 | mov _l7, edx |
||
609 | ; l8 = l5 + d4; |
||
610 | add edx, eax |
||
611 | mov _l8, edx |
||
612 | |||
613 | |||
614 | ; 340 : j5 *= k; // add-substituted multiplication |
||
615 | mov eax, _jj |
||
616 | add eax, _j5 |
||
617 | mov _jj, eax |
||
618 | |||
619 | ; c1 = C[jj]; |
||
620 | ; s1 = S[jj]; |
||
621 | mov edi, [ebp+20] |
||
622 | fld qword[edi+eax*8] |
||
623 | mov esi, [ebp+8] |
||
624 | shl esi, 2 |
||
625 | add esi, edi |
||
626 | fld qword[esi+eax*8] ; st : s1, c1 |
||
627 | |||
628 | ; t5 = f[l2] * c1 + f[l6] * s1; |
||
629 | ; t8 = f[l6] * c1 - f[l2] * s1; |
||
630 | mov edx, _l6 |
||
631 | fld qword[ebx+edx*8] |
||
632 | mov edx, _l2 |
||
633 | fld st0 |
||
634 | fmul st0, st2 |
||
635 | fxch st1 |
||
636 | fmul st0, st3 |
||
637 | fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c |
||
638 | fmul st4, st0 |
||
639 | fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c |
||
640 | fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c |
||
641 | fstp _t8 |
||
642 | faddp st2, st0 ; st : f[l2]*s, t5 |
||
643 | fstp st0 ; st : t5 |
||
644 | fstp _t5 ; st : |
||
645 | |||
646 | ; c2 = C[2*jj]; |
||
647 | ; s2 = S[2*jj]; |
||
648 | shl eax, 1 |
||
649 | fld qword[edi+eax*8] |
||
650 | fld qword[esi+eax*8] ; st : s2, c2 |
||
651 | |||
652 | ; t6 = f[l3] * c2 + f[l7] * s2; |
||
653 | ; t9 = f[l7] * c2 - f[l3] * s2; |
||
654 | mov edx, _l7 |
||
655 | fld qword[ebx+edx*8] |
||
656 | mov edx, _l3 |
||
657 | fld st0 |
||
658 | fmul st0, st2 |
||
659 | fxch st1 |
||
660 | fmul st0, st3 |
||
661 | fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c |
||
662 | fmul st4, st0 |
||
663 | fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c |
||
664 | fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c |
||
665 | fstp _t9 |
||
666 | faddp st2, st0 ; st : f[l2]*s, t6 |
||
667 | fstp st0 ; st : t6 |
||
668 | fstp _t6 ; st : |
||
669 | |||
670 | ; c3 = C[3*jj]; |
||
671 | ; s3 = S[3*jj]; |
||
672 | add eax, _jj |
||
673 | fld qword[edi+eax*8] |
||
674 | fld qword[esi+eax*8] ; st : s3, c3 |
||
675 | |||
676 | ; t7 = f[l4] * c3 + f[l8] * s3; |
||
677 | ; t0 = f[l8] * c3 - f[l4] * s3; |
||
678 | mov edx, _l8 |
||
679 | fld qword[ebx+edx*8] |
||
680 | mov edx, _l4 |
||
681 | fld st0 |
||
682 | fmul st0, st2 |
||
683 | fxch st1 |
||
684 | fmul st0, st3 |
||
685 | fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c |
||
686 | fmul st4, st0 |
||
687 | fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c |
||
688 | fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c |
||
689 | fstp _t0 |
||
690 | faddp st2, st0 ; st : f[l2]*s, t7 |
||
691 | fstp st0 ; st : t7 |
||
692 | fstp _t7 ; st : |
||
693 | |||
694 | ; t1 = f[l5] - t9; |
||
695 | ; t2 = f[l5] + t9; |
||
696 | mov eax, _l5 |
||
697 | fld qword [ebx+eax*8] |
||
698 | fld _t9 |
||
699 | fld st0 |
||
700 | fadd st0, st2 |
||
701 | fstp _t2 |
||
702 | fsubp st1, st0 |
||
703 | fstp _t1 |
||
704 | |||
705 | ; t3 = - t8 - t0; |
||
706 | fld _t8 |
||
707 | fadd _t0 |
||
708 | fchs |
||
709 | fstp _t3 |
||
710 | ; t4 = t5 - t7; |
||
711 | fld _t5 |
||
712 | fsub _t7 |
||
713 | fstp _t4 |
||
714 | |||
715 | ; f[l5] = t1 + t4; |
||
716 | fld _t1 |
||
717 | fld _t4 |
||
718 | fld st0 |
||
719 | fadd st0, st2 |
||
720 | fstp qword [ebx+eax*8] |
||
721 | ; f[l7] = t1 - t4; |
||
722 | mov eax, _l7 |
||
723 | fsubp st1, st0 |
||
724 | fstp qword [ebx+eax*8] |
||
725 | |||
726 | ; f[l6] = t2 + t3; |
||
727 | mov eax, _l6 |
||
728 | fld _t2 |
||
729 | fld _t3 |
||
730 | fld st0 |
||
731 | fadd st0, st2 |
||
732 | fstp qword [ebx+eax*8] |
||
733 | ; f[l8] = t2 - t3; |
||
734 | mov eax, _l8 |
||
735 | fsubp st1, st0 |
||
736 | fstp qword [ebx+eax*8] |
||
737 | |||
738 | ; t1 = f[l1] + t6; |
||
739 | mov eax, _l1 |
||
740 | fld qword [ebx+eax*8] |
||
741 | fld _t6 |
||
742 | fld st0 |
||
743 | fadd st0, st2 |
||
744 | fstp _t1 |
||
745 | ; t2 = f[l1] - t6; |
||
746 | fsubp st1, st0 |
||
747 | fstp _t2 |
||
748 | |||
749 | ; t3 = t8 - t0; |
||
750 | fld _t8 |
||
751 | fsub _t0 |
||
752 | fstp _t3 |
||
753 | ; t4 = t5 + t7; |
||
754 | fld _t5 |
||
755 | fadd _t7 |
||
756 | fstp _t4 |
||
757 | |||
758 | ; f[l1] = t1 + t4; |
||
759 | mov eax, _l1 |
||
760 | fld _t1 |
||
761 | fld _t4 |
||
762 | fld st0 |
||
763 | fadd st0, st2 |
||
764 | fstp qword [ebx+eax*8] |
||
765 | ; f[l3] = t1 - t4; |
||
766 | mov eax, _l3 |
||
767 | fsubp st1, st0 |
||
768 | fstp qword [ebx+eax*8] |
||
769 | |||
770 | ; f[l2] = t2 + t3; |
||
771 | mov eax, _l2 |
||
772 | fld _t2 |
||
773 | fld _t3 |
||
774 | fld st0 |
||
775 | fadd st0, st2 |
||
776 | fstp qword [ebx+eax*8] |
||
777 | ; f[l4] = t2 - t3; |
||
778 | mov eax, _l4 |
||
779 | fsubp st1, st0 |
||
780 | fstp qword [ebx+eax*8] |
||
781 | |||
782 | ; 374 : } |
||
783 | jmp .next_k |
||
784 | |||
785 | .done_k: |
||
786 | ; 375 : } |
||
787 | add ebx, _d6 ; d6 = d5*8 |
||
788 | cmp ebx, _end_of_array |
||
789 | jb .next_j |
||
790 | |||
791 | ; 376 : } |
||
792 | mov cx, _step |
||
793 | jmp .newstep |
||
794 | .done: |
||
795 | mov esp, ebp |
||
796 | pop ebp |
||
797 | ; 377 : } |
||
798 | ret |
||
799 | |||
800 | |||
801 | ;=========== Step3 ends here =========== |
||
802 | |||
803 | |||
804 | ; ================================================================= |
||
805 | |||
806 | ;================================================================= |
||
807 | ; parameters: |
||
808 | ; -- [ebp+8] = N |
||
809 | ; -- [ebp+12] = p |
||
810 | ; -- [ebp+16] = 4k-aligned data array address |
||
811 | ; -- [ebp+20] = 4k-aligned SinCosTable address |
||
812 | ; returns: |
||
813 | ; -- nothing |
||
814 | ; destroys: |
||
815 | ; -- all GPRegs |
||
816 | ;; ========================== |
||
817 | |||
818 | align 4 |
||
819 | |||
820 | FHT_4: |
||
821 | |||
822 | push ebp |
||
823 | mov ebp, esp |
||
824 | mov edx, [ebp+16] |
||
825 | add edx, [ebp+12] |
||
826 | call BitInvert |
||
827 | push dword[ebp+16] |
||
828 | push dword[ebp+8] |
||
829 | call step1 |
||
830 | call step2 |
||
831 | pop edx ; N |
||
832 | pop ecx ; a |
||
833 | push dword[ebp+20] ; t |
||
834 | push ecx |
||
835 | push dword[ebp+12] ; p |
||
836 | push edx ; N |
||
837 | call step3 |
||
838 | mov esp, ebp |
||
839 | pop ebp |
||
840 | |||
841 | ret><>><>><>><>=p;> |