Rev 2215 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
3474 | art_zh | 1 | ; Fast Hartley Transform routine |
2 | ; Copyright (C) 1999, 2004, 2010 |
||
3 | ; Artem Jerdev artem@jerdev.co.uk |
||
4 | ; |
||
5 | ; free KolibriOS version - not to be ported to other OSes |
||
6 | ; ========================================================== |
||
7 | |||
8 | |||
9 | ; global constants |
||
10 | align 8 |
||
11 | _r dq 1.41421356237309504880169 ; = sqrt(2) |
||
12 | _r2 dq 0.70710678118654752440084 ; = sqrt(2)/2 |
||
13 | _c1 dq 0.92387953251128675612818 ; = cos(pi/8) |
||
14 | _s1 dq 0.38268343236508977172846 ; = sin(pi/8) |
||
15 | |||
16 | ;================================================================= |
||
17 | ; parameter1: |
||
18 | ; -- reg dl (bits[3:0]) = Power_of_4 |
||
19 | ; returns: |
||
20 | ; -- reg edx = _CosTable address (4k-aligned) |
||
21 | ; assumes: _SinTable = _CosTable + (N/2)*8 |
||
22 | ; user heap has to be initialized |
||
23 | ; destroys: |
||
24 | ; -- eax, ebx, ecx |
||
25 | ;; ========================== |
||
26 | align 4 |
||
27 | CreateSinCosTable: |
||
28 | xor eax, eax |
||
29 | inc eax |
||
30 | mov cl, dl |
||
31 | and cl, 15 |
||
32 | shl eax, cl |
||
33 | shl eax, cl |
||
34 | mov ecx, eax ; now ecx = N |
||
35 | shl ecx, 3 |
||
36 | mov ebx, 12 |
||
37 | mov eax, 68 |
||
38 | int 0x40 ; getmem(N*sizeof(double)) |
||
39 | |||
40 | mov edx, eax ; edx = _CosTable |
||
41 | shr ecx, 1 |
||
42 | mov ebx, eax |
||
43 | add ebx, ecx ; ebx = _SinTable |
||
44 | shr ecx, 3 |
||
45 | push ecx ; [esp] = ecx = N/2 |
||
46 | |||
47 | xor eax, eax |
||
48 | fldpi |
||
49 | fidiv dword[esp] ; st : dx = 2*pi/N |
||
50 | pop ecx |
||
51 | fldz ; st : 0, dx |
||
52 | .loop: |
||
53 | fld st0 ; st : x, x, dx |
||
54 | FSINCOS ; st : cos, sin, x, dx |
||
55 | fstp qword [edx+eax*8] ; st : sin, x, dx |
||
56 | fstp qword [ebx+eax*8] ; st : x, dx |
||
57 | fadd st0, st1 ; st : x+dx, dx |
||
58 | |||
59 | inc eax |
||
60 | cmp eax, ecx |
||
61 | jne .loop |
||
62 | fstp st0 ; st : dx |
||
63 | fstp st0 ; st : |
||
2215 | art_zh | 64 | ret |
3474 | art_zh | 65 | |
2215 | art_zh | 66 | ;================================================================= |
3474 | art_zh | 67 | ; parameter1: |
68 | ; -- reg edx = _CosTable address |
||
69 | ; destroys: |
||
70 | ; -- eax, ebx, ecx |
||
71 | ;; ========================== |
||
72 | align 4 |
||
73 | DestroySinCosTable: |
||
74 | mov ecx, edx |
||
75 | mov ebx, 13 |
||
76 | mov eax, 68 |
||
77 | int 0x40 ; free(SinCosTable) |
||
78 | ret |
||
79 | |||
80 | ;================================================================= |
||
81 | ; parameter1: |
||
82 | ; -- reg dl (bits[3:0]) = Power_of_4 |
||
83 | ; -- reg edx && (-16) = 4k-aligned data array address |
||
84 | ; returns: |
||
85 | ; -- edx = Power_of_4 |
||
86 | ; -- ecx = N |
||
87 | ; destroys: |
||
88 | ; -- eax, ebx, ecx, edx, esi |
||
89 | ;; ========================== |
||
90 | align 4 |
||
91 | BitInvert: |
||
92 | mov esi, edx |
||
93 | and esi, 0xFFFFFFF0 |
||
94 | and edx, 0x0F |
||
95 | push edx |
||
96 | mov cl, dl |
||
97 | xor eax, eax |
||
98 | inc eax |
||
99 | shl eax, cl |
||
100 | shl eax, cl |
||
101 | push eax |
||
102 | xor ecx, ecx ; index term |
||
103 | .newterm: |
||
104 | inc ecx |
||
105 | cmp ecx, [esp] ; N |
||
106 | jge .done |
||
107 | |||
108 | xor eax, eax |
||
109 | mov edx, ecx |
||
110 | xor bl, bl |
||
111 | |||
112 | .do_invert: |
||
113 | inc bl |
||
114 | cmp bl, byte[esp+4] ; Power_of_4 |
||
115 | jg .switch |
||
116 | |||
117 | mov bh, dl |
||
118 | and bh, 3 |
||
119 | shl eax, 2 |
||
120 | or al, bh |
||
121 | shr edx, 2 |
||
122 | jmp .do_invert |
||
123 | |||
124 | .switch: |
||
125 | cmp eax, ecx |
||
126 | jle .newterm |
||
127 | |||
128 | fld qword [esi+eax*8] |
||
129 | fld qword [esi+ecx*8] |
||
130 | fstp qword [esi+eax*8] |
||
131 | fstp qword [esi+ecx*8] |
||
132 | jmp .newterm |
||
133 | |||
134 | .done: |
||
135 | pop ecx |
||
136 | pop edx |
||
137 | ret |
||
138 | |||
139 | ;================================================================= |
||
140 | |||
141 | |||
142 | ;================================================================= |
||
143 | ; stdcall parameters: |
||
144 | ; -- [esp+4] = N |
||
145 | ; -- [esp+8] = 4k-aligned data array address |
||
146 | ; returns: |
||
147 | ; -- nothing |
||
148 | ; destroys: |
||
149 | ; -- ebx, esi |
||
150 | ;; ========================== |
||
151 | align 4 |
||
152 | step1: |
||
153 | mov ebx, [esp+8] |
||
154 | mov esi, [esp+4] |
||
155 | shl esi, 3 |
||
156 | add esi, ebx |
||
157 | |||
158 | .loop: |
||
159 | fld qword[ebx] |
||
160 | fld qword[ebx+8] |
||
161 | fld st1 |
||
162 | fsub st0, st1 ; st : t2, f[i+1], f[i] |
||
163 | fxch st1 ; st : f[i+1], t2, f[i] |
||
164 | faddp st2, st0 ; st : t2, t1 |
||
165 | fld qword[ebx+16] |
||
166 | fld qword[ebx+24] |
||
167 | fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 |
||
168 | fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 |
||
169 | fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 |
||
170 | fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 |
||
171 | fstp st1 ; st : t4, t3, t2, t1 |
||
172 | fld st2 ; st : t2, t4, t3, t2, t1 |
||
173 | fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 |
||
174 | fstp qword[ebx+16] ; st : t4, t3, t2, t1 |
||
175 | fsubp st2, st0 ; st : t3, t2-t4, t1 |
||
176 | fld st2 ; st : t1, t3, t2-t4, t1 |
||
177 | fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 |
||
178 | fstp qword[ebx] ; st : t3, t2-t4, t1 |
||
179 | fsubp st2, st0 ; st : t2-t4, t1-t3 |
||
180 | fstp qword[ebx+24] ; st : t1-t3 |
||
181 | fstp qword[ebx+8] ; st : |
||
182 | |||
183 | add ebx, 32 |
||
184 | cmp ebx, esi |
||
185 | jnz .loop |
||
186 | ret |
||
187 | |||
188 | ;================================================================= |
||
2215 | art_zh | 189 | ; SSE3 version: Step1 |
190 | ; |
||
191 | ;========================== |
||
192 | |||
193 | align 4 |
||
194 | step1_sse: |
||
195 | mov ebx, [esp+8] |
||
196 | mov esi, [esp+4] |
||
197 | shl esi, 3 |
||
198 | add esi, ebx |
||
199 | |||
200 | .loop: |
||
201 | movddup xmm0, [ebx] ; xmm0: f0 ; f0 |
||
202 | movddup xmm1, [ebx+8] ; xmm1: f1 ; f1 |
||
203 | addsubpd xmm0, xmm1 ; xmm0: t1 ; t2 ( + - ) |
||
204 | movddup xmm1, [ebx+16] ; xmm1: f2 ; f2 |
||
205 | movddup xmm2, [ebx+24] ; xmm2: f3 ; f3 |
||
206 | addsubpd xmm1, xmm2 ; xmm1: t3 ; t4 ( + - ) |
||
207 | |||
208 | movddup xmm2, xmm0 ; xmm2: t2 ; t2 |
||
209 | movddup xmm3, xmm1 ; xmm3: t4 ; t4 |
||
210 | addsubpd xmm2, xmm3 ; xmm2: 2+4; 2-4 |
||
211 | shufpd xmm2, xmm2, 1 ; xmm2: 2-4; 2+4 |
||
212 | movapd [ebx+16], xmm2 |
||
213 | |||
214 | shufpd xmm0, xmm0, 1 ; xmm0: t2 ; t1 |
||
215 | shufpd xmm1, xmm1, 1 ; xmm1: t4 ; t3 |
||
216 | movddup xmm2, xmm0 ; xmm2: t1 ; t1 |
||
217 | movddup xmm3, xmm1 ; xmm3: t3 ; t3 |
||
218 | addsubpd xmm2, xmm3 ; xmm2: 1+3; 1-3 |
||
219 | shufpd xmm2, xmm2, 1 ; xmm2: 1-3; 1+3 |
||
220 | movapd [ebx], xmm2 |
||
221 | |||
222 | add ebx, 32 |
||
223 | cmp ebx, esi |
||
224 | jnz .loop |
||
225 | ret |
||
3474 | art_zh | 226 | |
227 | ; local stack definitions |
||
228 | ;=========================================================================== |
||
229 | _t0 equ dword [esp] |
||
230 | _t1 equ dword[esp+4] |
||
231 | _t2 equ dword[esp+8] |
||
232 | _t3 equ dword[esp+12] |
||
233 | _t4 equ dword[esp+16] |
||
234 | _t5 equ dword[esp+20] |
||
235 | _t6 equ dword[esp+24] |
||
236 | _t7 equ dword[esp+28] |
||
237 | _t8 equ dword[esp+32] |
||
238 | _t9 equ dword[esp+36] |
||
239 | |||
240 | _l1 equ dword[esp+40] |
||
241 | _l2 equ dword[esp+44] |
||
242 | _l3 equ dword[esp+48] |
||
243 | _l4 equ dword[esp+52] |
||
244 | _l5 equ dword[esp+56] |
||
245 | _l6 equ dword[esp+60] |
||
246 | _l7 equ dword[esp+64] |
||
247 | _l8 equ dword[esp+68] |
||
248 | _l9 equ dword[esp+72] |
||
249 | _l0 equ dword[esp+76] |
||
250 | _d1 equ dword[esp+80] |
||
251 | _d2 equ dword[esp+84] |
||
252 | _d3 equ dword[esp+88] |
||
253 | _d4 equ dword[esp+92] |
||
254 | _d5 equ dword[esp+96] |
||
255 | _d6 equ dword[esp+100] |
||
256 | _j5 equ dword[esp+104] |
||
257 | _jj equ dword[esp+108] |
||
258 | _end_of_array equ dword[esp+112] |
||
259 | _step equ word [esp+116] |
||
260 | |||
261 | |||
262 | ;================================================================= |
||
263 | ; cdecl parameters: |
||
264 | ; -- [ebp+8] = N |
||
265 | ; -- [ebp+12] = 4k-aligned data array address |
||
266 | ; returns: |
||
267 | ; -- nothing |
||
268 | ; destroys: |
||
269 | ; -- eax, ebx |
||
270 | ; locals: |
||
271 | ; -- 10 stack-located dwords (_t0 ... _t9) |
||
272 | ;; ========================== |
||
273 | align 4 |
||
274 | step2: |
||
275 | push ebp |
||
276 | mov ebp, esp |
||
277 | sub esp, 40 |
||
278 | mov ebx, [ebp+12] |
||
279 | mov eax, [ebp+ 8] |
||
280 | shl eax, 3 |
||
281 | add eax, ebx |
||
282 | |||
283 | .loop_i: |
||
284 | |||
285 | ; -- quad subelements +0, +4, +8 and +12 (simpliest operations) |
||
286 | fld qword[ebx] |
||
287 | fld qword[ebx+8*4] |
||
288 | fld st0 |
||
289 | fadd st0, st2 ; st : t1, f_4, f_0 |
||
290 | fxch st1 |
||
291 | fsubp st2, st0 ; st : t1, t2 |
||
292 | fld qword[ebx+8*8] |
||
293 | fld qword[ebx+8*12] |
||
294 | fld st0 |
||
295 | fadd st0, st2 ; st : t3, f_12, t1, t2 |
||
296 | fxch st1 |
||
297 | fsubp st2, st0 ; st : t3, t4, t1, t2 |
||
298 | ; ------ |
||
299 | fld st2 ; st : t1, t3, t4, t1, t2 |
||
300 | fadd st0, st1 |
||
301 | fstp qword[ebx] ; st : t3, t4, t1, t2 |
||
302 | fsub st0, st2 ; st : t3-t1, t4, t1, t2 |
||
303 | fchs ; st : t1-t3, t4, t1, t2 |
||
304 | fstp qword[ebx+8*4] ; st : t4, t1, t2 |
||
305 | fst st1 ; st : t4, t4, t2 |
||
306 | fadd st0, st2 ; st : t2+t4, t4, t2 |
||
307 | fstp qword[ebx+8*8] ; st : t4, t2 |
||
308 | fsubp st1, st0 ; st : t2-t4 |
||
309 | fstp qword[ebx+8*12] ; st : |
||
310 | |||
311 | ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) |
||
312 | fld qword[ebx+8*2] |
||
313 | fld qword[ebx+8*6] |
||
314 | fld [_r] |
||
315 | fmul st1, st0 ; st : r, t2, t1 |
||
316 | fld qword[ebx+8*10] |
||
317 | fxch st1 ; st : r, t3, t2, t1 |
||
318 | fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 |
||
319 | ; ------ |
||
320 | fld st3 ; st : t1, t4, t3, t2, t1 |
||
321 | fadd st0, st3 ; |
||
322 | fadd st0, st2 ; |
||
323 | fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 |
||
324 | fsub st0, st3 ; |
||
325 | fsub st0, st3 ; |
||
326 | fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 |
||
327 | fld st3 ; st : t1, t4, t3, t2, t1 |
||
328 | fsub st0, st2 ; |
||
329 | fsub st0, st1 ; |
||
330 | fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 |
||
331 | fadd st0, st1 ; |
||
332 | faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 |
||
333 | fstp qword[ebx+8*6] ; store f[i+6] |
||
334 | fstp st0 ; st : t2, t1 |
||
335 | fstp st0 ; st : t1 |
||
336 | fstp st0 ; st : |
||
337 | |||
338 | ; -- odd subelements |
||
339 | fld qword[ebx+8*9] |
||
340 | fld qword[ebx+8*11] |
||
341 | fld st1 |
||
342 | fsub st0, st1 |
||
343 | fxch st1 |
||
344 | faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) |
||
345 | fld [_r2] |
||
346 | fmul st2, st0 |
||
347 | fmulp st1, st0 ; st : t9, t6 |
||
348 | fld qword[ebx+8*3] |
||
349 | fld st0 |
||
350 | fadd st0, st2 ; st : t1, f[l5], t9, t6 |
||
351 | fstp _t1 |
||
352 | fsub st0, st1 |
||
353 | fstp _t2 |
||
354 | fstp _t9 ; (t9 never used) |
||
355 | fstp _t6 ; st : |
||
356 | |||
357 | fld [_c1] |
||
358 | fld [_s1] |
||
359 | fld qword[ebx+8*5] |
||
360 | fld qword[ebx+8*7] |
||
361 | fld st3 ; st: c1, f[l6], f[l2], s1, c1 |
||
362 | fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c |
||
363 | fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c |
||
364 | fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c |
||
365 | faddp st1, st0 ; st: t5, f_6, f_2, s, c |
||
366 | fstp _t5 ; st: f_6, f_2, s, c |
||
367 | fld st3 ; st: c, f_6, f_2, s, c |
||
368 | fmul st0, st1 |
||
369 | fld st3 |
||
370 | fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c |
||
371 | fsubp st1, st0 ; st: t8, f_6, f_2, s, c |
||
372 | fstp _t8 ; st: f_6, f_2, s, c |
||
373 | fstp st0 ; st: f_2, s, c |
||
374 | fstp st0 ; st: s, c |
||
375 | |||
376 | fld qword[ebx+8*13] |
||
377 | fld qword[ebx+8*15] |
||
378 | fld st3 ; st: c1, f[l8], f[l4], s1, c1 |
||
379 | fmul st0, st1 |
||
380 | fld st3 |
||
381 | fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c |
||
382 | faddp st1, st0 ; st: t7, f_8, f_4, s, c |
||
383 | fld _t5 ; st: t5, t7, f_8, f_4, s, c |
||
384 | fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c |
||
385 | fstp _t4 |
||
386 | fstp _t7 ; st: f_8, f_4, s, c |
||
387 | fld st3 ; st: c, f_8, f_4, s, c |
||
388 | fmul st0, st2 |
||
389 | fld st3 |
||
390 | fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c |
||
391 | fsubp st1, st0 ; st:-t0, f_8, f_4, s, c |
||
392 | fchs |
||
393 | fld _t8 |
||
394 | fchs ; st:-t8, t0, f_8, f_4, s, c |
||
395 | fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c |
||
396 | fstp _t3 |
||
397 | fstp _t0 ; st: f_8, f_4, s, c |
||
398 | fstp st0 ; st: f_4, s, c |
||
399 | fstp st0 ; st: s, c |
||
400 | fstp st0 ; st: c |
||
401 | fstp st0 ; st: |
||
402 | |||
403 | fld _t1 |
||
404 | fld _t4 |
||
405 | fld st1 |
||
406 | fsub st0, st1 |
||
407 | fstp qword[ebx+8*11] ; f[l7] = t1-t4 |
||
408 | faddp st1, st0 |
||
409 | fstp qword[ebx+8*3] ; f[l5] = t1+t4 |
||
410 | fld _t2 |
||
411 | fld _t3 |
||
412 | fld st1 |
||
413 | fsub st0, st1 |
||
414 | fstp qword[ebx+8*15] ; f[l8] |
||
415 | faddp st1, st0 |
||
416 | fstp qword[ebx+8*7] ; f[l6] |
||
417 | |||
418 | fld _t6 |
||
419 | fld qword[ebx+8] |
||
420 | fld st1 |
||
421 | fld st1 |
||
422 | faddp st3, st0 |
||
423 | fsubp st1, st0 ; st : t2, t1 |
||
424 | |||
425 | fld _t8 |
||
426 | fsub _t0 |
||
427 | fld _t5 |
||
428 | fadd _t7 ; st : t4, t3, t2, t1 |
||
429 | |||
430 | fld st3 |
||
431 | fsub st0, st1 |
||
432 | fstp qword[ebx+8*9] ; f[l3] = t1-t4 |
||
433 | fadd st0, st3 |
||
434 | fstp qword[ebx+8] ; f[l1] = t1+t4 |
||
435 | fld st1 ; st : t2, t3, t2, t1 |
||
436 | fsub st0, st1 ; f[l4] = t2-t3 |
||
437 | fstp qword[ebx+8*13] ; st : t3, t2, t1 |
||
438 | faddp st1, st0 ; st : t2+t3, t1 |
||
439 | fstp qword[ebx+8*5] ; f[l2] = t2+t3 |
||
440 | fstp st0 ; st : |
||
441 | |||
442 | add ebx, 16*8 |
||
443 | cmp ebx, eax |
||
444 | jb .loop_i |
||
445 | |||
446 | mov esp, ebp |
||
447 | pop ebp |
||
448 | ret |
||
449 | |||
450 | |||
451 | |||
452 | |||
453 | ;================================================================= |
||
454 | ; cdecl parameters: |
||
455 | ; -- [ebp+8] = N |
||
456 | ; -- [ebp+12] = p |
||
457 | ; -- [ebp+16] = 4k-aligned data array address |
||
458 | ; -- [ebp+20] = 4k-aligned SinCosTable address |
||
459 | ; returns: |
||
460 | ; -- nothing |
||
461 | ; destroys: |
||
462 | ; -- all GPRegs |
||
463 | ; locals: |
||
464 | ; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step) |
||
465 | ;; ========================== |
||
466 | align 4 |
||
467 | step3: |
||
468 | push ebp |
||
469 | mov ebp, esp |
||
470 | sub esp, 120 |
||
471 | ; 283 : { |
||
472 | |||
473 | |||
474 | ; 293 : for (l=3; l<=p; l++) |
||
475 | mov cx, 0x0200 |
||
476 | .newstep: |
||
477 | inc ch |
||
478 | cmp ch, byte[ebp+12] |
||
479 | jg .done |
||
480 | mov _step, cx |
||
481 | |||
482 | ; 294 : { |
||
483 | ; 295 : d1 = 1 << (l + l - 3); |
||
484 | |||
485 | mov cl, ch |
||
486 | add cl, cl |
||
487 | sub cl, 3 |
||
488 | mov edx, 1 |
||
489 | shl edx, cl |
||
490 | mov _d1, edx |
||
491 | |||
492 | ; 296 : d2 = d1 << 1; |
||
493 | shl edx, 1 |
||
494 | mov _d2, edx |
||
495 | mov eax, edx |
||
496 | |||
497 | ; 297 : d3 = d2 << 1; |
||
498 | shl edx, 1 |
||
499 | mov _d3, edx |
||
500 | |||
501 | ; 298 : d4 = d2 + d3; |
||
502 | add eax, edx |
||
503 | mov _d4, eax |
||
504 | |||
505 | ; 299 : d5 = d3 << 1; |
||
506 | shl edx, 1 |
||
507 | mov _d5, edx |
||
508 | shl edx, 3 |
||
509 | mov _d6, edx ; d6 = d5*8 to simplify index operations |
||
510 | |||
511 | ; 339 : j5 = N / d5; ; moved out of internal loop |
||
512 | mov cl, [ebp+12] |
||
513 | sub cl, ch |
||
514 | add cl, cl |
||
515 | mov edx, 1 |
||
516 | shl edx, cl |
||
517 | mov _j5, edx |
||
518 | |||
519 | ; 300 : |
||
520 | ; 301 : for (j=0; j |
||
521 | mov ebx, [ebp+16] |
||
522 | mov esi, [ebp+8] |
||
523 | shl esi, 3 |
||
524 | add esi, ebx |
||
525 | mov _end_of_array, esi |
||
526 | |||
527 | .next_j: |
||
528 | |||
529 | ; { |
||
530 | ; t1 = f[j] + f[j+d2]; |
||
531 | mov eax, _d2 |
||
532 | fld qword[ebx] |
||
533 | fld qword[ebx+eax*8] |
||
534 | fld st1 |
||
535 | fadd st0, st1 |
||
536 | fstp _t1 |
||
537 | |||
538 | ; t2 = f[j] - f[j+d2]; |
||
539 | fsubp st1, st0 |
||
540 | fstp _t2 |
||
541 | |||
542 | ; t3 = f[j+d3] + f[j+d4]; |
||
543 | mov edi, _d3 |
||
544 | fld qword[ebx+edi*8] |
||
545 | mov edx, _d4 |
||
546 | fld qword[ebx+edx*8] |
||
547 | fld st1 |
||
548 | fsub st0, st1 ; st : t4, f4, f3 |
||
549 | fxch st1 ; st : f4, t4, f3 |
||
550 | |||
551 | ; t4 = f[j+d3] - f[j+d4]; |
||
552 | faddp st2, st0 ; st : t4, t3 |
||
553 | |||
554 | ; f[j+d4] = t2 - t4; |
||
555 | ; f[j+d3] = t2 + t4; |
||
556 | fld _t2 |
||
557 | fld st0 |
||
558 | fsub st0, st2 ; st : f4, t2, t4, t3 |
||
559 | fstp qword[ebx+edx*8] ; st : t2, t4, t3 |
||
560 | fadd st0, st1 ; st : f3, t4, t3 |
||
561 | fstp qword[ebx+edi*8] ; st : t4, t3 |
||
562 | |||
563 | ; f[j+d2] = t1 - t3; |
||
564 | ; f[j] = t1 + t3; |
||
565 | fld _t1 |
||
566 | fst st1 |
||
567 | fsub st0, st2 ; st : f2, t1, t3 |
||
568 | fstp qword[ebx+eax*8] ; st : t1, t3 |
||
569 | fadd st0, st1 ; st : f0, t3 |
||
570 | fstp qword[ebx] ; st : t3 |
||
571 | fstp st0 |
||
572 | |||
573 | ; jj = j + d1; / ?? |
||
574 | mov edi, _d1 |
||
575 | shl edi, 3 ; = d1*8 |
||
576 | mov edx, edi |
||
577 | mov eax, edi |
||
578 | add eax, eax ; eax = d2*8 |
||
579 | shl edx, 2 ; = d3*8 |
||
580 | add edi, ebx ; now [edi] points to f[jj] |
||
581 | add edx, edi ; and [edx] points to f[jj+d3] |
||
582 | |||
583 | ; t1 = f[jj]; |
||
584 | fld qword [edi] ; st : t1 |
||
585 | ; t3 = f[jj+d3]; |
||
586 | fld qword [edx] ; st : t3, t1 |
||
587 | |||
588 | ; t2 = f[jj+d2] * r; |
||
589 | fld qword [edi+eax] |
||
590 | fld [_r] |
||
591 | fmul st1, st0 ; st : r, t2, t3, t1 |
||
592 | ; t4 = f[jj+d4] * r |
||
593 | fmul qword [edx+eax] ; st : t4, t2, t3, t1 |
||
594 | |||
595 | ; f[jj] = t1 + t2 + t3; |
||
596 | fld st3 ; st : t1, t4, t2, t3, t1 |
||
597 | fadd st0, st3 |
||
598 | fadd st0, st2 |
||
599 | fstp qword [edi] |
||
600 | |||
601 | ; f[jj+d2] = t1 - t3 + t4; |
||
602 | fld st3 |
||
603 | fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1 |
||
604 | fld st0 |
||
605 | fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1 |
||
606 | fstp qword [edi+eax] |
||
607 | ; f[jj+d4] = t1 - t3 - t4; |
||
608 | fsub st0, st1 ; st : f4, t4, t2, t3, t1 |
||
609 | fstp qword [edx+eax] |
||
610 | |||
611 | ; f[jj+d3] = t1 - t2 + t3; |
||
612 | fstp st0 ; st : t2, t3, t1 |
||
613 | fsubp st1, st0 ; st : (t3-t2), t1 |
||
614 | faddp st1, st0 ; st : f3 |
||
615 | fstp qword [edx] |
||
616 | |||
617 | ; for (k=1; k |
||
618 | xor ecx, ecx ; ecx = k |
||
619 | mov _jj, ecx |
||
620 | .next_k: |
||
621 | inc ecx |
||
622 | cmp ecx, _d1 |
||
623 | jge .done_k |
||
624 | ; { |
||
625 | mov eax, _d2 ; the sector increment |
||
626 | ; l1 = j + k; |
||
627 | mov edx, ecx |
||
628 | mov _l1, edx ; [ebx+edx*8] --> f[j+k] |
||
629 | ; l2 = l1 + d2; |
||
630 | add edx, eax |
||
631 | mov _l2, edx |
||
632 | ; l3 = l1 + d3; |
||
633 | add edx, eax |
||
634 | mov _l3, edx |
||
635 | ; l4 = l1 + d4; |
||
636 | add edx, eax |
||
637 | mov _l4, edx |
||
638 | |||
639 | ; l5 = j + d2 - k; |
||
640 | mov edx, eax |
||
641 | sub edx, ecx |
||
642 | mov _l5, edx |
||
643 | ; l6 = l5 + d2; |
||
644 | add edx, eax |
||
645 | mov _l6, edx |
||
646 | ; l7 = l5 + d3; |
||
647 | add edx, eax |
||
648 | mov _l7, edx |
||
649 | ; l8 = l5 + d4; |
||
650 | add edx, eax |
||
651 | mov _l8, edx |
||
652 | |||
653 | |||
654 | ; 340 : j5 *= k; // add-substituted multiplication |
||
655 | mov eax, _jj |
||
656 | add eax, _j5 |
||
657 | mov _jj, eax |
||
658 | |||
659 | ; c1 = C[jj]; |
||
660 | ; s1 = S[jj]; |
||
661 | mov edi, [ebp+20] |
||
662 | fld qword[edi+eax*8] |
||
663 | mov esi, [ebp+8] |
||
664 | shl esi, 2 |
||
665 | add esi, edi |
||
666 | fld qword[esi+eax*8] ; st : s1, c1 |
||
667 | |||
668 | ; t5 = f[l2] * c1 + f[l6] * s1; |
||
669 | ; t8 = f[l6] * c1 - f[l2] * s1; |
||
670 | mov edx, _l6 |
||
671 | fld qword[ebx+edx*8] |
||
672 | mov edx, _l2 |
||
673 | fld st0 |
||
674 | fmul st0, st2 |
||
675 | fxch st1 |
||
676 | fmul st0, st3 |
||
677 | fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c |
||
678 | fmul st4, st0 |
||
679 | fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c |
||
680 | fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c |
||
681 | fstp _t8 |
||
682 | faddp st2, st0 ; st : f[l2]*s, t5 |
||
683 | fstp st0 ; st : t5 |
||
684 | fstp _t5 ; st : |
||
685 | |||
686 | ; c2 = C[2*jj]; |
||
687 | ; s2 = S[2*jj]; |
||
688 | shl eax, 1 |
||
689 | fld qword[edi+eax*8] |
||
690 | fld qword[esi+eax*8] ; st : s2, c2 |
||
691 | |||
692 | ; t6 = f[l3] * c2 + f[l7] * s2; |
||
693 | ; t9 = f[l7] * c2 - f[l3] * s2; |
||
694 | mov edx, _l7 |
||
695 | fld qword[ebx+edx*8] |
||
696 | mov edx, _l3 |
||
697 | fld st0 |
||
698 | fmul st0, st2 |
||
699 | fxch st1 |
||
700 | fmul st0, st3 |
||
701 | fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c |
||
702 | fmul st4, st0 |
||
703 | fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c |
||
704 | fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c |
||
705 | fstp _t9 |
||
706 | faddp st2, st0 ; st : f[l2]*s, t6 |
||
707 | fstp st0 ; st : t6 |
||
708 | fstp _t6 ; st : |
||
709 | |||
710 | ; c3 = C[3*jj]; |
||
711 | ; s3 = S[3*jj]; |
||
712 | add eax, _jj |
||
713 | fld qword[edi+eax*8] |
||
714 | fld qword[esi+eax*8] ; st : s3, c3 |
||
715 | |||
716 | ; t7 = f[l4] * c3 + f[l8] * s3; |
||
717 | ; t0 = f[l8] * c3 - f[l4] * s3; |
||
718 | mov edx, _l8 |
||
719 | fld qword[ebx+edx*8] |
||
720 | mov edx, _l4 |
||
721 | fld st0 |
||
722 | fmul st0, st2 |
||
723 | fxch st1 |
||
724 | fmul st0, st3 |
||
725 | fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c |
||
726 | fmul st4, st0 |
||
727 | fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c |
||
728 | fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c |
||
729 | fstp _t0 |
||
730 | faddp st2, st0 ; st : f[l2]*s, t7 |
||
731 | fstp st0 ; st : t7 |
||
732 | fstp _t7 ; st : |
||
733 | |||
734 | ; t1 = f[l5] - t9; |
||
735 | ; t2 = f[l5] + t9; |
||
736 | mov eax, _l5 |
||
737 | fld qword [ebx+eax*8] |
||
738 | fld _t9 |
||
739 | fld st0 |
||
740 | fadd st0, st2 |
||
741 | fstp _t2 |
||
742 | fsubp st1, st0 |
||
743 | fstp _t1 |
||
744 | |||
745 | ; t3 = - t8 - t0; |
||
746 | fld _t8 |
||
747 | fadd _t0 |
||
748 | fchs |
||
749 | fstp _t3 |
||
750 | ; t4 = t5 - t7; |
||
751 | fld _t5 |
||
752 | fsub _t7 |
||
753 | fstp _t4 |
||
754 | |||
755 | ; f[l5] = t1 + t4; |
||
756 | fld _t1 |
||
757 | fld _t4 |
||
758 | fld st0 |
||
759 | fadd st0, st2 |
||
760 | fstp qword [ebx+eax*8] |
||
761 | ; f[l7] = t1 - t4; |
||
762 | mov eax, _l7 |
||
763 | fsubp st1, st0 |
||
764 | fstp qword [ebx+eax*8] |
||
765 | |||
766 | ; f[l6] = t2 + t3; |
||
767 | mov eax, _l6 |
||
768 | fld _t2 |
||
769 | fld _t3 |
||
770 | fld st0 |
||
771 | fadd st0, st2 |
||
772 | fstp qword [ebx+eax*8] |
||
773 | ; f[l8] = t2 - t3; |
||
774 | mov eax, _l8 |
||
775 | fsubp st1, st0 |
||
776 | fstp qword [ebx+eax*8] |
||
777 | |||
778 | ; t1 = f[l1] + t6; |
||
779 | mov eax, _l1 |
||
780 | fld qword [ebx+eax*8] |
||
781 | fld _t6 |
||
782 | fld st0 |
||
783 | fadd st0, st2 |
||
784 | fstp _t1 |
||
785 | ; t2 = f[l1] - t6; |
||
786 | fsubp st1, st0 |
||
787 | fstp _t2 |
||
788 | |||
789 | ; t3 = t8 - t0; |
||
790 | fld _t8 |
||
791 | fsub _t0 |
||
792 | fstp _t3 |
||
793 | ; t4 = t5 + t7; |
||
794 | fld _t5 |
||
795 | fadd _t7 |
||
796 | fstp _t4 |
||
797 | |||
798 | ; f[l1] = t1 + t4; |
||
799 | mov eax, _l1 |
||
800 | fld _t1 |
||
801 | fld _t4 |
||
802 | fld st0 |
||
803 | fadd st0, st2 |
||
804 | fstp qword [ebx+eax*8] |
||
805 | ; f[l3] = t1 - t4; |
||
806 | mov eax, _l3 |
||
807 | fsubp st1, st0 |
||
808 | fstp qword [ebx+eax*8] |
||
809 | |||
810 | ; f[l2] = t2 + t3; |
||
811 | mov eax, _l2 |
||
812 | fld _t2 |
||
813 | fld _t3 |
||
814 | fld st0 |
||
815 | fadd st0, st2 |
||
816 | fstp qword [ebx+eax*8] |
||
817 | ; f[l4] = t2 - t3; |
||
818 | mov eax, _l4 |
||
819 | fsubp st1, st0 |
||
820 | fstp qword [ebx+eax*8] |
||
821 | |||
822 | ; 374 : } |
||
823 | jmp .next_k |
||
824 | |||
825 | .done_k: |
||
826 | ; 375 : } |
||
827 | add ebx, _d6 ; d6 = d5*8 |
||
828 | cmp ebx, _end_of_array |
||
829 | jb .next_j |
||
830 | |||
831 | ; 376 : } |
||
832 | mov cx, _step |
||
833 | jmp .newstep |
||
834 | .done: |
||
835 | mov esp, ebp |
||
836 | pop ebp |
||
837 | ; 377 : } |
||
838 | ret |
||
839 | |||
840 | |||
841 | ;=========== Step3 ends here =========== |
||
842 | |||
843 | |||
844 | ; ================================================================= |
||
845 | |||
846 | ;================================================================= |
||
847 | ; parameters: |
||
848 | ; -- [ebp+8] = N |
||
849 | ; -- [ebp+12] = p |
||
850 | ; -- [ebp+16] = 4k-aligned data array address |
||
851 | ; -- [ebp+20] = 4k-aligned SinCosTable address |
||
852 | ; returns: |
||
853 | ; -- nothing |
||
854 | ; destroys: |
||
855 | ; -- all GPRegs |
||
856 | ;; ========================== |
||
857 | |||
858 | align 4 |
||
859 | |||
860 | FHT_4: |
||
861 | |||
862 | push ebp |
||
863 | mov ebp, esp |
||
864 | mov edx, [ebp+16] |
||
865 | add edx, [ebp+12] |
||
866 | call BitInvert |
||
867 | push dword[ebp+16] |
||
868 | push dword[ebp+8] |
||
869 | call step1 |
||
870 | call step2 |
||
871 | pop edx ; N |
||
872 | pop ecx ; a |
||
873 | push dword[ebp+20] ; t |
||
874 | push ecx |
||
875 | push dword[ebp+12] ; p |
||
876 | push edx ; N |
||
877 | call step3 |
||
878 | mov esp, ebp |
||
879 | pop ebp |
||
880 | |||
881 | ret><>><>><>><>=p;> |