Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
1641 | art_zh | 1 | ; Fast Hartley Transform routine |
2 | ; Copyright (C) 1999, 2004, 2010 |
||
3 | ; Artem Jerdev artem@jerdev.co.uk |
||
4 | ; |
||
5 | ; free KolibriOS version - not to be ported to other OSes |
||
6 | ; ========================================================== |
||
7 | |||
8 | |||
; ---------------------------------------------------------------
; Read-only floating-point constants shared by the transform steps
; (double precision, 8-byte aligned)
; ---------------------------------------------------------------
align 8
fht_r           dq      1.41421356237309504880169       ; sqrt(2)
fht_r2          dq      0.70710678118654752440084       ; sqrt(2)/2
fht_c1          dq      0.92387953251128675612818       ; cos(pi/8)
fht_s1          dq      0.38268343236508977172846       ; sin(pi/8)
||
15 | |||
16 | |||
;=================================================================
; BitInvert -- in-place reordering of an array of qwords: element i is
;              exchanged with element rev(i), where rev() reverses the
;              order of the Power_of_4 base-4 digits of the index.
; register parameter (packed into edx):
;   -- dl bits[3:0]        = Power_of_4 (p)
;   -- edx and 0xFFFFFFF0  = data array address (low 4 bits must be free)
; returns:
;   -- edx = Power_of_4
;   -- ecx = N = 1 shl (2*Power_of_4)
; destroys:
;   -- eax, ebx, ecx, edx, esi
; x87:
;   -- two stack slots are used for each exchange and released again
;; ==========================
align 4
BitInvert:
        mov     esi, edx
        and     esi, 0xFFFFFFF0         ; esi = array base
        and     edx, 0x0F               ; edx = p
        push    edx                     ; will sit at [esp+4] : p
        mov     cl, dl
        mov     eax, 1
        shl     eax, cl
        shl     eax, cl                 ; eax = 1 shl 2p = N
        push    eax                     ; [esp] : N
        xor     ecx, ecx                ; ecx = current index (index 0 maps to itself)
align 4
.next_index:
        inc     ecx
        cmp     ecx, [esp]              ; index reached N ?
        jge     .finish

        xor     eax, eax                ; eax collects the reversed index
        mov     edx, ecx                ; edx = digits still to be consumed
        xor     bl, bl                  ; bl  = digits moved so far
align 4
.next_digit:
        inc     bl
        cmp     bl, byte [esp+4]        ; all p digits moved ?
        jg      .maybe_swap

        mov     bh, dl
        and     bh, 3                   ; bh = lowest base-4 digit of edx
        shl     eax, 2
        or      al, bh                  ; append it to the reversed index
        shr     edx, 2                  ; drop it from the source
        jmp     .next_digit
align 8

.maybe_swap:
        cmp     eax, ecx
        jle     .next_index             ; exchange each pair once, only when rev(i) > i

        fld     qword [esi+eax*8]
        fld     qword [esi+ecx*8]
        fstp    qword [esi+eax*8]       ; f[rev(i)] <- old f[i]
        fstp    qword [esi+ecx*8]       ; f[i]      <- old f[rev(i)]
        jmp     .next_index

align 4
.finish:
        pop     ecx                     ; ecx = N
        pop     edx                     ; edx = p
        ret
||
78 | |||
79 | ;================================================================= |
||
80 | |||
81 | |||
;=================================================================
; step1 -- first pass: in-place 4-point butterflies over the array.
;          For every group f[i..i+3]:
;              t1 = f[i] + f[i+1]      t2 = f[i] - f[i+1]
;              t3 = f[i+2] + f[i+3]    t4 = f[i+2] - f[i+3]
;              f[i]   = t1 + t3        f[i+1] = t1 - t3
;              f[i+2] = t2 + t4        f[i+3] = t2 - t4
; stack parameters:
;   -- [esp+4] = N  (element count; processed in groups of 4)
;   -- [esp+8] = data array address
;   The routine ends with a plain `ret`, so the CALLER removes the
;   arguments (FHT_4 leaves them on the stack and reuses them for step2).
; returns:
;   -- nothing
; destroys:
;   -- ebx, esi
; x87:
;   -- up to 5 stack slots, all released before returning
;; ==========================
align 4
step1:
        mov     ebx, [esp+8]            ; ebx -> current group
        mov     esi, [esp+4]
        shl     esi, 3
        add     esi, ebx                ; esi -> one past the last element
        cmp     ebx, esi
        jae     .exit                   ; nothing to do for an empty range

align 4
.loop:
        fld     qword [ebx]
        fld     qword [ebx+8]
        fld     st1
        fsub    st0, st1                ; st : t2, f[i+1], f[i]
        fxch    st1                     ; st : f[i+1], t2, f[i]
        faddp   st2, st0                ; st : t2, t1
        fld     qword [ebx+16]
        fld     qword [ebx+24]
        fld     st1                     ; st : f[i+2], f[i+3], f[i+2], t2, t1
        fadd    st0, st1                ; st : t3, f[i+3], f[i+2], t2, t1
        fxch    st2                     ; st : f[i+2], f[i+3], t3, t2, t1
        fsub    st0, st1                ; st : t4, f[i+3], t3, t2, t1
        fstp    st1                     ; st : t4, t3, t2, t1
        fld     st2                     ; st : t2, t4, t3, t2, t1
        fadd    st0, st1                ; st : t2+t4, t4, t3, t2, t1
        fstp    qword [ebx+16]          ; f[i+2] = t2+t4     st : t4, t3, t2, t1
        fsubp   st2, st0                ; st : t3, t2-t4, t1
        fld     st2                     ; st : t1, t3, t2-t4, t1
        fadd    st0, st1                ; st : t1+t3, t3, t2-t4, t1
        fstp    qword [ebx]             ; f[i]   = t1+t3     st : t3, t2-t4, t1
        fsubp   st2, st0                ; st : t2-t4, t1-t3
        fstp    qword [ebx+24]          ; f[i+3] = t2-t4     st : t1-t3
        fstp    qword [ebx+8]           ; f[i+1] = t1-t3     st :

        add     ebx, 32                 ; next group of four qwords
        cmp     ebx, esi
        jb      .loop                   ; unsigned "below", as in step2/step3: also stops
                                        ; if the pointer steps over the end (N not a multiple of 4)
.exit:
        ret
||
128 | |||
;===========================================================================
; Names for the esp-relative local slots used by step2 and step3.
; They are only valid after the routine has reserved its frame with
; `sub esp, ...` and while esp is not moved again.
;===========================================================================

; -- butterfly temporaries; dword slots, so a value stored with fstp is
;    kept as a 32-bit float
_t0             equ dword [esp]
_t1             equ dword [esp+4]
_t2             equ dword [esp+8]
_t3             equ dword [esp+12]
_t4             equ dword [esp+16]
_t5             equ dword [esp+20]
_t6             equ dword [esp+24]
_t7             equ dword [esp+28]
_t8             equ dword [esp+32]
_t9             equ dword [esp+36]

; -- element indices l1..l9, l0
_l1             equ dword [esp+40]
_l2             equ dword [esp+44]
_l3             equ dword [esp+48]
_l4             equ dword [esp+52]
_l5             equ dword [esp+56]
_l6             equ dword [esp+60]
_l7             equ dword [esp+64]
_l8             equ dword [esp+68]
_l9             equ dword [esp+72]
_l0             equ dword [esp+76]

; -- per-level distances d1..d5 (in elements) and d6 = d5*8 (in bytes)
_d1             equ dword [esp+80]
_d2             equ dword [esp+84]
_d3             equ dword [esp+88]
_d4             equ dword [esp+92]
_d5             equ dword [esp+96]
_d6             equ dword [esp+100]

; -- sin/cos table stride, running table index, end pointer, saved cx
_j5             equ dword [esp+104]
_jj             equ dword [esp+108]
_end_of_array   equ dword [esp+112]
_step           equ word  [esp+116]
||
162 | |||
163 | |||
;=================================================================
; step2 -- second pass: in-place 16-point butterflies. Every group of 16
;          elements holds four 4-point sub-transforms (at +0, +4, +8, +12)
;          which are combined into one 16-point transform.
;          Below, fK means f[i+K] as it was when the group was entered.
; stack parameters (read through ebp; the caller removes them):
;   -- [ebp+8]  = N  (element count; processed in groups of 16)
;   -- [ebp+12] = data array address
; returns:
;   -- nothing
; destroys:
;   -- eax, ebx
; locals:
;   -- 40 bytes at [esp]: _t0 ... _t9. These are dword slots, so values
;      parked in them are rounded to single precision.
; x87:
;   -- up to 6 stack slots, all released before returning
;; ==========================
align 4
step2:
        push    ebp
        mov     ebp, esp
        sub     esp, 40
        mov     ebx, [ebp+12]           ; ebx -> current group
        mov     eax, [ebp+8]
        shl     eax, 3
        add     eax, ebx                ; eax -> one past the last element

align 4
.loop_i:

; -- quad subelements +0, +4, +8 and +12 (additions only)
        fld     qword [ebx]
        fld     qword [ebx+8*4]
        fld     st0
        fadd    st0, st2                ; st : t1=f0+f4, f4, f0
        fxch    st1
        fsubp   st2, st0                ; st : t1, t2=f0-f4
        fld     qword [ebx+8*8]
        fld     qword [ebx+8*12]
        fld     st0
        fadd    st0, st2                ; st : t3=f8+f12, f12, f8, t1, t2
        fxch    st1
        fsubp   st2, st0                ; st : t3, t4=f8-f12, t1, t2
        ; ------
        fld     st2                     ; st : t1, t3, t4, t1, t2
        fadd    st0, st1
        fstp    qword [ebx]             ; f[i]    = t1+t3    st : t3, t4, t1, t2
        fsub    st0, st2                ; st : t3-t1, t4, t1, t2
        fchs                            ; st : t1-t3, t4, t1, t2
        fstp    qword [ebx+8*4]         ; f[i+4]  = t1-t3    st : t4, t1, t2
        fst     st1                     ; st : t4, t4, t2
        fadd    st0, st2                ; st : t2+t4, t4, t2
        fstp    qword [ebx+8*8]         ; f[i+8]  = t2+t4    st : t4, t2
        fsubp   st1, st0                ; st : t2-t4
        fstp    qword [ebx+8*12]        ; f[i+12] = t2-t4    st :

; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)
        fld     qword [ebx+8*2]
        fld     qword [ebx+8*6]
        fld     [fht_r]
        fmul    st1, st0                ; st : r, t2=f6*r, t1=f2
        fld     qword [ebx+8*10]
        fxch    st1                     ; st : r, t3=f10, t2, t1
        fmul    qword [ebx+8*14]        ; st : t4=f14*r, t3, t2, t1
        ; ------
        fld     st3                     ; st : t1, t4, t3, t2, t1
        fadd    st0, st3                ; + t2
        fadd    st0, st2                ; + t3
        fst     qword [ebx+8*2]         ; f[i+2]  = t1+t2+t3
        fsub    st0, st3                ; - t2
        fsub    st0, st3                ; - t2
        fstp    qword [ebx+8*10]        ; f[i+10] = t1-t2+t3
        fld     st3                     ; st : t1, t4, t3, t2, t1
        fsub    st0, st2                ; - t3
        fsub    st0, st1                ; - t4
        fst     qword [ebx+8*14]        ; f[i+14] = t1-t3-t4
        fadd    st0, st1                ; back to t1-t3
        faddp   st1, st0                ; st : t1-t3+t4, t3, t2, t1
        fstp    qword [ebx+8*6]         ; f[i+6]  = t1-t3+t4
        fstp    st0                     ; st : t2, t1
        fstp    st0                     ; st : t1
        fstp    st0                     ; st :

; -- odd subelements +1, +3, +5, ... +15
        fld     qword [ebx+8*9]
        fld     qword [ebx+8*11]
        fld     st1
        fsub    st0, st1
        fxch    st1
        faddp   st2, st0                ; st : (f9-f11), (f9+f11)
        fld     [fht_r2]
        fmul    st2, st0
        fmulp   st1, st0                ; st : t9=(f9-f11)*r2, t6=(f9+f11)*r2
        fld     qword [ebx+8*3]
        fld     st0
        fadd    st0, st2                ; st : f3+t9, f3, t9, t6
        fstp    _t1                     ; t1 = f3 + t9
        fsub    st0, st1
        fstp    _t2                     ; t2 = f3 - t9
        fstp    _t9                     ; (t9 is not read again)
        fstp    _t6                     ; st :

        fld     [fht_c1]
        fld     [fht_s1]
        fld     qword [ebx+8*5]
        fld     qword [ebx+8*7]
        fld     st3                     ; st : c, f7, f5, s, c
        fmul    st0, st2                ; st : f5*c, f7, f5, s, c
        fld     st1                     ; st : f7, f5*c, f7, f5, s, c
        fmul    st0, st4                ; st : f7*s, f5*c, f7, f5, s, c
        faddp   st1, st0                ; st : t5, f7, f5, s, c
        fstp    _t5                     ; t5 = f5*c + f7*s
        fld     st3                     ; st : c, f7, f5, s, c
        fmul    st0, st1                ; st : f7*c, f7, f5, s, c
        fld     st3                     ; st : s, f7*c, f7, f5, s, c
        fmul    st0, st3                ; st : f5*s, f7*c, f7, f5, s, c
        fsubp   st1, st0                ; st : t8, f7, f5, s, c
        fstp    _t8                     ; t8 = f7*c - f5*s
        fstp    st0                     ; st : f5, s, c
        fstp    st0                     ; st : s, c

        fld     qword [ebx+8*13]
        fld     qword [ebx+8*15]
        fld     st3                     ; st : c, f15, f13, s, c
        fmul    st0, st1                ; st : f15*c, f15, f13, s, c
        fld     st3                     ; st : s, f15*c, f15, f13, s, c
        fmul    st0, st3                ; st : f13*s, f15*c, f15, f13, s, c
        faddp   st1, st0                ; st : t7=f15*c+f13*s, f15, f13, s, c
        fld     _t5                     ; st : t5, t7, f15, f13, s, c
        fsub    st0, st1                ; st : t5-t7, t7, f15, f13, s, c
        fstp    _t4                     ; t4 = t5 - t7
        fstp    _t7                     ; st : f15, f13, s, c
        fld     st3                     ; st : c, f15, f13, s, c
        fmul    st0, st2                ; st : f13*c, f15, f13, s, c
        fld     st3                     ; st : s, f13*c, f15, f13, s, c
        fmul    st0, st2                ; st : f15*s, f13*c, f15, f13, s, c
        fsubp   st1, st0                ; st : f13*c-f15*s (= -t0), f15, f13, s, c
        fchs                            ; st : t0=f15*s-f13*c, f15, f13, s, c
        fld     _t8
        fchs                            ; st : -t8, t0, f15, f13, s, c
        fsub    st0, st1                ; st : -t8-t0, t0, f15, f13, s, c
        fstp    _t3                     ; t3 = -t8 - t0
        fstp    _t0                     ; st : f15, f13, s, c
        fstp    st0                     ; st : f13, s, c
        fstp    st0                     ; st : s, c
        fstp    st0                     ; st : c
        fstp    st0                     ; st :

        fld     _t1
        fld     _t4
        fld     st1
        fsub    st0, st1
        fstp    qword [ebx+8*11]        ; f[i+11] = t1-t4
        faddp   st1, st0
        fstp    qword [ebx+8*3]         ; f[i+3]  = t1+t4
        fld     _t2
        fld     _t3
        fld     st1
        fsub    st0, st1
        fstp    qword [ebx+8*15]        ; f[i+15] = t2-t3
        faddp   st1, st0
        fstp    qword [ebx+8*7]         ; f[i+7]  = t2+t3

        fld     _t6
        fld     qword [ebx+8]
        fld     st1                     ; st : t6, f1, t6
        fsubr   st0, st1                ; st0 = st1 - st0 = f1 - t6. The term must be
                                        ; f1 - t6 (cf. step3: t2 = f[l1] - t6); a plain
                                        ; fsub would give t6 - f1 and flip its sign in
                                        ; f[i+5] and f[i+13].
        fxch    st1                     ; st : f1, t2, t6
        faddp   st2, st0                ; st : t2=f1-t6, t1=f1+t6
        fld     _t8
        fsub    _t0                     ; st : t3=t8-t0, t2, t1
        fld     _t5
        fadd    _t7                     ; st : t4=t5+t7, t3, t2, t1

        fld     st3
        fsub    st0, st1
        fstp    qword [ebx+8*9]         ; f[i+9]  = t1-t4    st : t4, t3, t2, t1
        fadd    st0, st3
        fstp    qword [ebx+8]           ; f[i+1]  = t1+t4    st : t3, t2, t1
        fld     st1                     ; st : t2, t3, t2, t1
        fsub    st0, st1
        fstp    qword [ebx+8*13]        ; f[i+13] = t2-t3    st : t3, t2, t1
        faddp   st1, st0                ; st : t2+t3, t1
        fstp    qword [ebx+8*5]         ; f[i+5]  = t2+t3    st : t1
        fstp    st0                     ; st :

        add     ebx, 16*8               ; next group of 16 qwords
        cmp     ebx, eax
        jb      .loop_i

        mov     esp, ebp
        pop     ebp
        ret
||
351 | |||
352 | |||
353 | |||
354 | |||
;=================================================================
; step3 -- remaining passes: for every level l = 3 .. p, combine groups of
;          four sub-transforms of length d2 = 2^(2l-2) into transforms of
;          length d5 = 2^(2l), in place.
; stack parameters (read through ebp; the caller removes them):
;   -- [ebp+8]  = N
;   -- [ebp+12] = p (only the low byte is read)
;   -- [ebp+16] = data array address
;   -- [ebp+20] = SinCosTable address; entries are qwords, the value called
;                 C[i] below is read from table + i*8 and the value called
;                 S[i] from table + N*4 + i*8
; returns:
;   -- nothing
; destroys:
;   -- all general-purpose registers
; locals:
;   -- 120 bytes at [esp] (_t0 ... _t9, _l1 ... _step). The _t slots are
;      dwords, so values parked in them are rounded to single precision.
;; ==========================
align 4
step3:
        push    ebp
        mov     ebp, esp
        sub     esp, 120

; for (l = 3; l <= p; l++)          ch = l, cl = scratch
        mov     cx, 0x0200              ; ch = 2, incremented before first use
align 4
.level_loop:
        inc     ch
        cmp     ch, byte [ebp+12]
        jg      .exit
        mov     _step, cx               ; cx is reused inside the level

;   d1 = 1 << (l + l - 3)
        mov     cl, ch
        add     cl, cl
        sub     cl, 3
        mov     edx, 1
        shl     edx, cl
        mov     _d1, edx

;   d2 = d1 << 1
        shl     edx, 1
        mov     _d2, edx
        mov     eax, edx

;   d3 = d2 << 1
        shl     edx, 1
        mov     _d3, edx

;   d4 = d2 + d3
        add     eax, edx
        mov     _d4, eax

;   d5 = d3 << 1
        shl     edx, 1
        mov     _d5, edx
        shl     edx, 3
        mov     _d6, edx                ; d6 = d5*8 : byte size of one group

;   j5 = N / d5 = 1 << 2*(p - l)    (table stride, computed once per level)
        mov     cl, [ebp+12]
        sub     cl, ch
        add     cl, cl
        mov     edx, 1
        shl     edx, cl
        mov     _j5, edx

;   for (j = 0; j < N; j += d5)     ebx -> f[j]
        mov     ebx, [ebp+16]
        mov     esi, [ebp+8]
        shl     esi, 3
        add     esi, ebx
        mov     _end_of_array, esi

align 4
.group_loop:

;     ---- elements j, j+d2, j+d3, j+d4 (no multiplications) ----
;     t1 = f[j] + f[j+d2]
        mov     eax, _d2
        fld     qword [ebx]
        fld     qword [ebx+eax*8]
        fld     st1
        fadd    st0, st1
        fstp    _t1

;     t2 = f[j] - f[j+d2]
        fsubp   st1, st0
        fstp    _t2

;     t3 = f[j+d3] + f[j+d4]
;     t4 = f[j+d3] - f[j+d4]
        mov     edi, _d3
        fld     qword [ebx+edi*8]
        mov     edx, _d4
        fld     qword [ebx+edx*8]
        fld     st1
        fsub    st0, st1                ; st : t4, f[j+d4], f[j+d3]
        fxch    st1                     ; st : f[j+d4], t4, f[j+d3]
        faddp   st2, st0                ; st : t4, t3

;     f[j+d4] = t2 - t4
;     f[j+d3] = t2 + t4
        fld     _t2
        fld     st0
        fsub    st0, st2                ; st : t2-t4, t2, t4, t3
        fstp    qword [ebx+edx*8]       ; st : t2, t4, t3
        fadd    st0, st1                ; st : t2+t4, t4, t3
        fstp    qword [ebx+edi*8]       ; st : t4, t3

;     f[j+d2] = t1 - t3
;     f[j]    = t1 + t3
        fld     _t1
        fst     st1                     ; st : t1, t1, t3
        fsub    st0, st2                ; st : t1-t3, t1, t3
        fstp    qword [ebx+eax*8]       ; st : t1, t3
        fadd    st0, st1                ; st : t1+t3, t3
        fstp    qword [ebx]             ; st : t3
        fstp    st0                     ; st :

;     ---- elements jj, jj+d2, jj+d3, jj+d4 with jj = j + d1 ----
        mov     edi, _d1
        shl     edi, 3                  ; d1*8
        mov     edx, edi
        mov     eax, edi
        add     eax, eax                ; eax = d2*8
        shl     edx, 2                  ; d3*8
        add     edi, ebx                ; edi -> f[jj]
        add     edx, edi                ; edx -> f[jj+d3]

;     t1 = f[jj]
        fld     qword [edi]             ; st : t1
;     t3 = f[jj+d3]
        fld     qword [edx]             ; st : t3, t1

;     t2 = f[jj+d2] * r
        fld     qword [edi+eax]
        fld     [fht_r]
        fmul    st1, st0                ; st : r, t2, t3, t1
;     t4 = f[jj+d4] * r
        fmul    qword [edx+eax]         ; st : t4, t2, t3, t1

;     f[jj] = t1 + t2 + t3
        fld     st3                     ; st : t1, t4, t2, t3, t1
        fadd    st0, st3
        fadd    st0, st2
        fstp    qword [edi]

;     f[jj+d2] = t1 - t3 + t4
        fld     st3
        fsub    st0, st3                ; st : (t1-t3), t4, t2, t3, t1
        fld     st0
        fadd    st0, st2                ; st : t1-t3+t4, (t1-t3), t4, t2, t3, t1
        fstp    qword [edi+eax]
;     f[jj+d4] = t1 - t3 - t4
        fsub    st0, st1                ; st : t1-t3-t4, t4, t2, t3, t1
        fstp    qword [edx+eax]

;     f[jj+d3] = t1 - t2 + t3
        fstp    st0                     ; st : t2, t3, t1
        fsubp   st1, st0                ; st : (t3-t2), t1
        faddp   st1, st0                ; st : t1-t2+t3
        fstp    qword [edx]

;     for (k = 1; k < d1; k++)      ecx = k
        xor     ecx, ecx
        mov     _jj, ecx                ; running table index, = k*j5
align 4
.k_loop:
        inc     ecx
        cmp     ecx, _d1
        jge     .k_done

        mov     eax, _d2                ; distance between sub-transforms
;       l1 = k          (indices are relative to f[j], i.e. to ebx)
        mov     edx, ecx
        mov     _l1, edx
;       l2 = l1 + d2
        add     edx, eax
        mov     _l2, edx
;       l3 = l1 + d3
        add     edx, eax
        mov     _l3, edx
;       l4 = l1 + d4
        add     edx, eax
        mov     _l4, edx

;       l5 = d2 - k
        mov     edx, eax
        sub     edx, ecx
        mov     _l5, edx
;       l6 = l5 + d2
        add     edx, eax
        mov     _l6, edx
;       l7 = l5 + d3
        add     edx, eax
        mov     _l7, edx
;       l8 = l5 + d4
        add     edx, eax
        mov     _l8, edx

;       jj = k * j5     (kept up to date by repeated addition)
        mov     eax, _jj
        add     eax, _j5
        mov     _jj, eax

;       c1 = C[jj]
;       s1 = S[jj]
        mov     edi, [ebp+20]
        fld     qword [edi+eax*8]
        mov     esi, [ebp+8]
        shl     esi, 2
        add     esi, edi                ; esi = table + N*4
        fld     qword [esi+eax*8]       ; st : s1, c1

;       t5 = f[l2] * c1 + f[l6] * s1
;       t8 = f[l6] * c1 - f[l2] * s1
        mov     edx, _l6
        fld     qword [ebx+edx*8]
        mov     edx, _l2
        fld     st0
        fmul    st0, st2
        fxch    st1
        fmul    st0, st3
        fld     qword [ebx+edx*8]       ; st : f[l2], f[l6]*c, f[l6]*s, s, c
        fmul    st4, st0
        fmulp   st3, st0                ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
        fsub    st0, st2                ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c
        fstp    _t8
        faddp   st2, st0                ; st : f[l2]*s, t5
        fstp    st0                     ; st : t5
        fstp    _t5                     ; st :

;       c2 = C[2*jj]
;       s2 = S[2*jj]
        shl     eax, 1
        fld     qword [edi+eax*8]
        fld     qword [esi+eax*8]       ; st : s2, c2

;       t6 = f[l3] * c2 + f[l7] * s2
;       t9 = f[l7] * c2 - f[l3] * s2
        mov     edx, _l7
        fld     qword [ebx+edx*8]
        mov     edx, _l3
        fld     st0
        fmul    st0, st2
        fxch    st1
        fmul    st0, st3
        fld     qword [ebx+edx*8]       ; st : f[l3], f[l7]*c, f[l7]*s, s, c
        fmul    st4, st0
        fmulp   st3, st0                ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
        fsub    st0, st2                ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c
        fstp    _t9
        faddp   st2, st0                ; st : f[l3]*s, t6
        fstp    st0                     ; st : t6
        fstp    _t6                     ; st :

;       c3 = C[3*jj]
;       s3 = S[3*jj]
        add     eax, _jj
        fld     qword [edi+eax*8]
        fld     qword [esi+eax*8]       ; st : s3, c3

;       t7 = f[l4] * c3 + f[l8] * s3
;       t0 = f[l8] * c3 - f[l4] * s3
        mov     edx, _l8
        fld     qword [ebx+edx*8]
        mov     edx, _l4
        fld     st0
        fmul    st0, st2
        fxch    st1
        fmul    st0, st3
        fld     qword [ebx+edx*8]       ; st : f[l4], f[l8]*c, f[l8]*s, s, c
        fmul    st4, st0
        fmulp   st3, st0                ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
        fsub    st0, st2                ; st : t0, f[l8]*s, f[l4]*s, f[l4]*c
        fstp    _t0
        faddp   st2, st0                ; st : f[l4]*s, t7
        fstp    st0                     ; st : t7
        fstp    _t7                     ; st :

;       t1 = f[l5] - t9
;       t2 = f[l5] + t9
        mov     eax, _l5
        fld     qword [ebx+eax*8]
        fld     _t9
        fld     st0
        fadd    st0, st2
        fstp    _t2
        fsubp   st1, st0
        fstp    _t1

;       t3 = - t8 - t0
        fld     _t8
        fadd    _t0
        fchs
        fstp    _t3
;       t4 = t5 - t7
        fld     _t5
        fsub    _t7
        fstp    _t4

;       f[l5] = t1 + t4
        fld     _t1
        fld     _t4
        fld     st0
        fadd    st0, st2
        fstp    qword [ebx+eax*8]
;       f[l7] = t1 - t4
        mov     eax, _l7
        fsubp   st1, st0
        fstp    qword [ebx+eax*8]

;       f[l6] = t2 + t3
        mov     eax, _l6
        fld     _t2
        fld     _t3
        fld     st0
        fadd    st0, st2
        fstp    qword [ebx+eax*8]
;       f[l8] = t2 - t3
        mov     eax, _l8
        fsubp   st1, st0
        fstp    qword [ebx+eax*8]

;       t1 = f[l1] + t6
        mov     eax, _l1
        fld     qword [ebx+eax*8]
        fld     _t6
        fld     st0
        fadd    st0, st2
        fstp    _t1
;       t2 = f[l1] - t6
        fsubp   st1, st0
        fstp    _t2

;       t3 = t8 - t0
        fld     _t8
        fsub    _t0
        fstp    _t3
;       t4 = t5 + t7
        fld     _t5
        fadd    _t7
        fstp    _t4

;       f[l1] = t1 + t4
        mov     eax, _l1
        fld     _t1
        fld     _t4
        fld     st0
        fadd    st0, st2
        fstp    qword [ebx+eax*8]
;       f[l3] = t1 - t4
        mov     eax, _l3
        fsubp   st1, st0
        fstp    qword [ebx+eax*8]

;       f[l2] = t2 + t3
        mov     eax, _l2
        fld     _t2
        fld     _t3
        fld     st0
        fadd    st0, st2
        fstp    qword [ebx+eax*8]
;       f[l4] = t2 - t3
        mov     eax, _l4
        fsubp   st1, st0
        fstp    qword [ebx+eax*8]

        jmp     .k_loop                 ; end of k loop body

align 4
.k_done:
        add     ebx, _d6                ; j += d5  (d6 = d5*8 bytes)
        cmp     ebx, _end_of_array
        jb      .group_loop

        mov     cx, _step               ; restore ch = l
        jmp     .level_loop
.exit:
        mov     esp, ebp
        pop     ebp
        ret
||
745 | |||
746 | |||
747 | ;=========== Step3 ends here =========== |
||
748 | |||
749 | |||
750 | ; ================================================================= |
||
751 | |||
;=================================================================
; FHT_4 -- in-place radix-4 Fast Hartley Transform of N = 4^p qwords:
;          BitInvert, then step1 (4-point), step2 (16-point, p >= 2)
;          and step3 (levels 3..p, p >= 3).
; parameters:
;   -- [ebp+12] = N  (not read here; BitInvert recomputes N from p)
;   -- [ebp+16] = p  (only the low byte is read)
;   -- [ebp+20] = 4k-aligned data array address
;   -- [ebp+24] = 4k-aligned SinCosTable address
; returns:
;   -- nothing
; destroys:
;   -- all GPRegs
;; ==========================

align 4

FHT_4:
        push    ebp
        mov     ebp, esp

        cmp     byte [ebp+16], 0
        jz      .done                   ; p = 0: a single element is its own transform,
                                        ; and step1 always touches at least 4 elements

        mov     edx, [ebp+20]           ; a
        mov     dl, byte [ebp+16]       ; low bits of edx carry p for BitInvert
        call    BitInvert               ; returns ecx = N, edx = p
        push    dword [ebp+20]          ; a
        push    ecx                     ; N
        call    step1                   ; 4-point transform (arguments stay on the stack)
        cmp     byte [ebp+16], 1        ; p = 1 ? (N = 4: step2 would work on 16 elements)
        jz      .done
        call    step2                   ; 16-point transform, reuses the (N, a) pair
        cmp     byte [ebp+16], 2        ; p = 2 ?
        jz      .done
        pop     edx                     ; N
        pop     ecx                     ; a
        push    dword [ebp+24]          ; t
        push    ecx                     ; a
        push    dword [ebp+16]          ; p
        push    edx                     ; N
        call    step3
.done:
        mov     esp, ebp                ; drops whatever arguments are still pushed
        pop     ebp

        ret