; Source: KolibriOS Subversion repository, revision 1641, author art_zh
;           Fast Hartley Transform routine
;           Copyright (C) 1999, 2004, 2010
;          Artem Jerdev  artem@jerdev.co.uk
;
; free KolibriOS version - not to be ported to other OSes
; ==========================================================


; global constants
; (read-only double-precision values; fht_r is used by step2 and step3,
;  fht_r2, fht_c1 and fht_s1 by step2 only)
align 8
fht_r	   dq	   1.41421356237309504880169	   ; = sqrt(2)
fht_r2	   dq	   0.70710678118654752440084	   ; = sqrt(2)/2
fht_c1	   dq	   0.92387953251128675612818	   ; = cos(pi/8)
fht_s1	   dq	   0.38268343236508977172846	   ; = sin(pi/8)


;=================================================================
; BitInvert -- permute the data array by reversing the base-4 digits
;	       of every element index (elements are qwords)
; parameter (both packed into edx):
; -- dl (bits[3:0])	    = Power_of_4 (p);  N = 4^p elements
; -- edx and 0xFFFFFFF0     = data array address (the low 4 bits carry p,
;			      so the array must be at least 16-byte aligned)
; returns:
; -- edx = Power_of_4
; -- ecx = N
; destroys:
; -- eax, ebx, ecx, edx, esi
; -- uses two x87 stack slots for the exchange (x87 stack is left balanced)
;; ==========================
align 4
BitInvert:
	mov	esi, edx
	and	esi, 0xFFFFFFF0 	; esi = array base
	and	edx, 0x0F		; edx = p
	push	edx			; p is at [esp+4] once N is pushed below
	mov	cl, dl
	add	cl, cl			; cl = 2*p  (at most 30)
	mov	eax, 1
	shl	eax, cl 		; eax = N = 4^p
	push	eax			; [esp] = N
	xor	ecx, ecx		; ecx = i, the element index
align 4
.newterm:
	inc	ecx			; index 0 maps onto itself, so start at 1
	cmp	ecx, [esp]		; N
	jae	.done

	xor	eax, eax		; eax = r, the digit-reversed index
	mov	edx, ecx		; edx = digits of i not consumed yet
	mov	bl, byte [esp+4]	; bl = digits left = p (>= 1 here, as N > 1)
align 4
.do_invert:
	mov	bh, dl
	and	bh, 3			; lowest base-4 digit of what is left of i
	shl	eax, 2
	or	al, bh			; ... becomes the next digit of r
	shr	edx, 2
	dec	bl
	jnz	.do_invert

	cmp	eax, ecx
	jbe	.newterm		; exchange every pair once only (when r > i)

	fld	qword [esi+eax*8]	; st : f[r]
	fld	qword [esi+ecx*8]	; st : f[i], f[r]
	fstp	qword [esi+eax*8]	; f[r] = old f[i]
	fstp	qword [esi+ecx*8]	; f[i] = old f[r]
	jmp	.newterm

align 4
.done:
	pop	ecx			; N
	pop	edx			; p
	ret

;=================================================================


;=================================================================
; step1 -- 4-point transform of every consecutive group of 4 elements:
;	t1 = f[i]   + f[i+1]	     t2 = f[i]	 - f[i+1]
;	t3 = f[i+2] + f[i+3]	     t4 = f[i+2] - f[i+3]
;	f[i] = t1+t3   f[i+1] = t1-t3	f[i+2] = t2+t4	 f[i+3] = t2-t4
; stack parameters (the routine returns with a plain ret, so the CALLER
; removes them; FHT_4 relies on this and reuses them for step2):
; -- [esp+4]  = N  (number of qword elements; expected to be a multiple of 4)
; -- [esp+8]  = data array  address
; returns:
; -- nothing
; destroys:
; -- ebx, esi
; -- uses up to 5 x87 stack slots (x87 stack is left balanced)
;; ==========================
align 4
step1:
	mov	ebx, [esp+8]	; ebx -> f[0]
	mov	esi, [esp+4]	; N
	shl	esi, 3
	add	esi, ebx	; esi -> f[N], the end of the array
	cmp	ebx, esi
	jae	.done		; N = 0 : nothing to transform

align 4
.loop:
	fld	qword[ebx]
	fld	qword[ebx+8]
	fld	st1
	fsub	st0, st1	; st : t2, f[i+1], f[i]
	fxch	st1		; st : f[i+1], t2, f[i]
	faddp	st2, st0	; st : t2, t1
	fld	qword[ebx+16]
	fld	qword[ebx+24]
	fld	st1		; st : f[i+2], f[i+3], f[i+2], t2, t1
	fadd	st0, st1	; st : t3, f[i+3], f[i+2], t2, t1
	fxch	st2		; st : f[i+2], f[i+3], t3, t2, t1
	fsub	st0, st1	; st : t4, f[i+3], t3, t2, t1
	fstp	st1		; st : t4, t3, t2, t1
	fld	st2		; st : t2, t4, t3, t2, t1
	fadd	st0, st1	; st : t2+t4, t4, t3, t2, t1
	fstp	qword[ebx+16]	; st : t4, t3, t2, t1
	fsubp	st2, st0	; st : t3, t2-t4, t1
	fld	st2		; st : t1, t3, t2-t4, t1
	fadd	st0, st1	; st : t1+t3, t3, t2-t4, t1
	fstp	qword[ebx]	; st : t3, t2-t4, t1
	fsubp	st2, st0	; st : t2-t4, t1-t3
	fstp	qword[ebx+24]	; st : t1-t3
	fstp	qword[ebx+8]	; st :

	add	ebx, 32 	; next group of 4 elements
	cmp	ebx, esi
	jb	.loop		; unsigned "below" (as in step2/step3): the loop
				; also stops when N is not a multiple of 4
.done:
	ret

;	local stack definitions
;===========================================================================
; Scratch slots addressed relative to esp. They are usable only after a
; routine has reserved its frame below esp:
;   step2 reserves  40 bytes -> _t0 ... _t9 only
;   step3 reserves 120 bytes -> everything up to and including _step
; Note: the _tN slots are dword-sized, so "fstp _tN" rounds the stored
; intermediate value to single precision.

; floating-point temporaries
_t0	equ	dword [esp]
_t1	equ	dword[esp+4]
_t2	equ	dword[esp+8]
_t3	equ	dword[esp+12]
_t4	equ	dword[esp+16]
_t5	equ	dword[esp+20]
_t6	equ	dword[esp+24]
_t7	equ	dword[esp+28]
_t8	equ	dword[esp+32]
_t9	equ	dword[esp+36]

; integer element indices and strides (step3)
_l1   equ	dword[esp+40]
_l2   equ	dword[esp+44]
_l3   equ	dword[esp+48]
_l4   equ	dword[esp+52]
_l5   equ	dword[esp+56]
_l6   equ	dword[esp+60]
_l7   equ	dword[esp+64]
_l8   equ	dword[esp+68]
_l9   equ	dword[esp+72]
_l0   equ	dword[esp+76]
_d1   equ	dword[esp+80]
_d2   equ	dword[esp+84]
_d3   equ	dword[esp+88]
_d4   equ	dword[esp+92]
_d5   equ	dword[esp+96]
_d6   equ	dword[esp+100]
_j5   equ	dword[esp+104]
_jj   equ	dword[esp+108]
_end_of_array	equ	dword[esp+112]
_step		equ	word [esp+116]


;=================================================================
; step2 -- 16-point stage: combines four 4-point sub-transforms (at element
;	   offsets +0, +4, +8, +12) inside every group of 16 elements.
; cdecl parameters:
; -- [ebp+8]   = N  (number of qword elements; 16 are handled per pass,
;		     so N is expected to be a multiple of 16)
; -- [ebp+12]  = data array  address
; returns:
; -- nothing
; destroys:
; -- eax, ebx
; -- x87 stack (left balanced)
; locals:
; -- 10 stack-located dwords (_t0 ... _t9); being dwords, they hold the
;    intermediate values in single precision
; index names used in the comments of the "odd" part (offsets within a group):
;    l1=1  l2=5  l3=9  l4=13	l5=3  l6=7  l7=11  l8=15
;; ==========================
align 4
step2:
	push	ebp
	mov	ebp, esp
	sub	esp, 40 	; room for _t0 ... _t9
	mov	ebx, [ebp+12]	; ebx -> current group of 16
	mov	eax, [ebp+ 8]
	shl	eax, 3
	add	eax, ebx	; eax -> end of array

align 4
.loop_i:

; -- quad subelements  +0, +4, +8 and +12 (simplest operations)
	fld	qword[ebx]
	fld	qword[ebx+8*4]
	fld	st0
	fadd	st0, st2	; st : t1, f_4, f_0
	fxch	st1
	fsubp	st2, st0	; st : t1, t2
	fld	qword[ebx+8*8]
	fld	qword[ebx+8*12]
	fld	st0
	fadd	st0, st2	; st : t3, f_12, f_8, t1, t2
	fxch	st1
	fsubp	st2, st0	; st : t3, t4, t1, t2
	; ------
	fld	st2		; st : t1, t3, t4, t1, t2
	fadd	st0, st1
	fstp	qword[ebx]	; st : t3, t4, t1, t2
	fsub	st0, st2	; st : t3-t1, t4, t1, t2
	fchs			; st : t1-t3, t4, t1, t2
	fstp	qword[ebx+8*4]	; st : t4, t1, t2
	fst	st1		; st : t4, t4, t2
	fadd	st0, st2	; st : t2+t4, t4, t2
	fstp	qword[ebx+8*8]	; st : t4, t2
	fsubp	st1, st0	; st : t2-t4
	fstp	qword[ebx+8*12] ; st :

; -- even subelements  +2, +6, +10 and +14 (2 multiplications needed)
	fld	qword[ebx+8*2]
	fld	qword[ebx+8*6]
	fld	[fht_r]
	fmul	st1, st0	; st : r, t2, t1
	fld	qword[ebx+8*10]
	fxch	st1		; st : r, t3, t2, t1
	fmul	qword[ebx+8*14] ; st : t4, t3, t2, t1
	; ------
	fld	st3		; st : t1, t4, t3, t2, t1
	fadd	st0, st3	;
	fadd	st0, st2	;
	fst	qword[ebx+8*2]	; store f[i+2] = t1+t2+t3
	fsub	st0, st3	;
	fsub	st0, st3	;
	fstp	qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
	fld	st3		; st : t1, t4, t3, t2, t1
	fsub	st0, st2	;
	fsub	st0, st1	;
	fst	qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
	fadd	st0, st1	;
	faddp	st1, st0	; st : t1-t3+t4, t3, t2, t1
	fstp	qword[ebx+8*6]	; store f[i+6]
	fstp	st0		; st : t2, t1
	fstp	st0		; st : t1
	fstp	st0		; st :

; -- odd subelements
	fld	qword[ebx+8*9]
	fld	qword[ebx+8*11]
	fld	st1
	fsub	st0, st1
	fxch	st1
	faddp	st2, st0	; st : (f[l3]-f[l7]), (f[l3]+f[l7])
	fld	[fht_r2]
	fmul	st2, st0
	fmulp	st1, st0	; st : -t9, t6
	fld	qword[ebx+8*3]
	fld	st0
	fadd	st0, st2	; st : t1, f[l5], -t9, t6   (t1 = f[l5] - t9)
	fstp	_t1
	fsub	st0, st1	; st : t2, -t9, t6	    (t2 = f[l5] + t9)
	fstp	_t2
	fstp	_t9	; (t9 never used)
	fstp	_t6		; st :

	fld	[fht_c1]
	fld	[fht_s1]
	fld	qword[ebx+8*5]
	fld	qword[ebx+8*7]
	fld	st3		; st: c1, f[l6], f[l2], s1, c1
	fmul	st0, st2	; st: f_2*c, f_6, f_2, s, c
	fld	st1		; st: f_6, f_2*c, f_6, f_2, s, c
	fmul	st0, st4	; st: f_6*s, f_2*c, f_6, f_2, s, c
	faddp	st1, st0	; st: t5, f_6, f_2, s, c
	fstp	_t5		; st: f_6, f_2, s, c
	fld	st3		; st: c, f_6, f_2, s, c
	fmul	st0, st1
	fld	st3
	fmul	st0, st3	; st: f_2*s, f_6*c, f_6, f_2, s, c
	fsubp	st1, st0	; st: t8, f_6, f_2, s, c
	fstp	_t8		; st: f_6, f_2, s, c
	fstp	st0		; st: f_2, s, c
	fstp	st0		; st: s, c

	fld	qword[ebx+8*13]
	fld	qword[ebx+8*15]
	fld	st3		; st: c1, f[l8], f[l4], s1, c1
	fmul	st0, st1
	fld	st3
	fmul	st0, st3	; st: f_4*s, f_8*c, f_8, f_4, s, c
	faddp	st1, st0	; st: t7, f_8, f_4, s, c
	fld	_t5		; st: t5, t7, f_8, f_4, s, c
	fsub	st0, st1	; st: t4, t7, f_8, f_4, s, c
	fstp	_t4
	fstp	_t7		; st: f_8, f_4, s, c
	fld	st3		; st: c, f_8, f_4, s, c
	fmul	st0, st2
	fld	st3
	fmul	st0, st2	; st: f_8*s, f_4*c, f_8, f_4, s, c
	fsubp	st1, st0	; st:-t0, f_8, f_4, s, c
	fchs
	fld	_t8
	fchs			; st:-t8, t0, f_8, f_4, s, c
	fsub	st0, st1	; st: t3, t0, f_8, f_4, s, c
	fstp	_t3
	fstp	_t0		; st: f_8, f_4, s, c
	fstp	st0		; st: f_4, s, c
	fstp	st0		; st: s, c
	fstp	st0		; st: c
	fstp	st0		; st:

	fld	_t1
	fld	_t4
	fld	st1
	fsub	st0, st1
	fstp	qword[ebx+8*11] ; f[l7] = t1-t4
	faddp	st1, st0
	fstp	qword[ebx+8*3]	; f[l5] = t1+t4
	fld	_t2
	fld	_t3
	fld	st1
	fsub	st0, st1
	fstp	qword[ebx+8*15] ; f[l8] = t2-t3
	faddp	st1, st0
	fstp	qword[ebx+8*7]	; f[l6] = t2+t3

	; t1 = f[l1] + t6,  t2 = f[l1] - t6  (same formulas as in step3).
	; f[l1] has to be loaded FIRST so that it is the minuend of the fsub
	; below; with the opposite load order t2 gets the wrong sign and
	; f[l2] / f[l4] come out exchanged and negated.
	fld	qword[ebx+8]	; st : f[l1]
	fld	_t6		; st : t6, f[l1]
	fld	st1		; st : f[l1], t6, f[l1]
	fsub	st0, st1	; st : t2, t6, f[l1]
	fxch	st1		; st : t6, t2, f[l1]
	faddp	st2, st0	; st : t2, t1
	fld	_t8
	fsub	_t0		; st : t3, t2, t1	(t3 = t8 - t0)
	fld	_t5
	fadd	_t7		; st : t4, t3, t2, t1	(t4 = t5 + t7)

	fld	st3
	fsub	st0, st1
	fstp	qword[ebx+8*9]	; f[l3] = t1-t4
	fadd	st0, st3
	fstp	qword[ebx+8]	; f[l1] = t1+t4
	fld	st1		; st : t2, t3, t2, t1
	fsub	st0, st1	; f[l4] = t2-t3
	fstp	qword[ebx+8*13] ; st : t3, t2, t1
	faddp	st1, st0	; st : t2+t3, t1
	fstp	qword[ebx+8*5]	; f[l2] = t2+t3
	fstp	st0		; st :

	add	ebx, 16*8	; next group of 16 elements
	cmp	ebx, eax
	jb	.loop_i

	mov	esp, ebp
	pop	ebp
	ret



;=================================================================
; step3 -- all remaining radix-4 stages l = 3 ... p (block size d5 = 4^l).
;	   Returns at once when p < 3.
; cdecl parameters:
; -- [ebp+8]   = N  (number of qword elements)
; -- [ebp+12]  = p  (only the low byte is read)
; -- [ebp+16]  = data array  address
; -- [ebp+20]  = SinCosTable address; the code reads C[i] as the qword at
;		 table + i*8 and S[i] as the qword at table + N*4 + i*8
; returns:
; -- nothing
; destroys:
; -- all GPRegs
; -- x87 stack (left balanced)
; locals:
; -- 120 bytes of stack (_t0 ... _t9, _l1 ... _step); the _tN temporaries
;    are dwords, i.e. they hold intermediate values in single precision
; The numbered comments ("; 295  : ...") quote the C code this routine
; was derived from.
;; ==========================
align 4
step3:
	push	ebp
	mov	ebp, esp
	sub	esp, 120
; 283  : {


; 293  :   for (l=3; l<=p; l++)
	mov	cx, 0x0200	; ch = l-1 = 2, cl = 0
align 4
.newstep:
	inc	ch
	cmp	ch, byte[ebp+12]
	jg	.done
	mov	_step, cx	; cl/ecx are reused below, keep l in memory

; 294  :   {
; 295  :     d1 = 1 << (l + l - 3);

	mov	cl, ch
	add	cl, cl
	sub	cl, 3
	mov	edx, 1
	shl	edx, cl
	mov	_d1, edx

; 296  :     d2 = d1 << 1;
	shl	edx, 1
	mov	_d2, edx
	mov	eax, edx

; 297  :     d3 = d2 << 1;
	shl	edx, 1
	mov	_d3, edx

; 298  :     d4 = d2 + d3;
	add	eax, edx
	mov	_d4, eax

; 299  :     d5 = d3 << 1;
	shl	edx, 1
	mov	_d5, edx
	shl	edx, 3
	mov	_d6, edx	; d6 = d5*8 to simplify index operations

; 339  :         j5 = N / d5;   ; moved out of internal loop
;		 (N = 4^p and d5 = 4^l, so j5 = 1 << 2*(p-l))
	mov	cl, [ebp+12]
	sub	cl, ch
	add	cl, cl
	mov	edx, 1
	shl	edx, cl
	mov	_j5, edx

; 300  :
; 301  :     for (j=0; j<N; j+=d5)
;	     (ebx -> f[j]; it advances by d6 bytes until it reaches f[N])
	mov	ebx, [ebp+16]
	mov	esi, [ebp+8]
	shl	esi, 3
	add	esi, ebx
	mov	_end_of_array, esi

align 4
.next_j:

; {
; t1 = f[j] + f[j+d2];
	mov	eax, _d2
	fld	qword[ebx]
	fld	qword[ebx+eax*8]
	fld	st1
	fadd	st0, st1
	fstp	_t1

; t2 = f[j] - f[j+d2];
	fsubp	st1, st0
	fstp	_t2

; t3 = f[j+d3] + f[j+d4];
	mov	edi, _d3
	fld	qword[ebx+edi*8]
	mov	edx, _d4
	fld	qword[ebx+edx*8]
	fld	st1
	fsub	st0, st1		; st : t4, f4, f3
	fxch	st1			; st : f4, t4, f3

; t4 = f[j+d3] - f[j+d4];
	faddp	st2, st0		; st : t4, t3

; f[j+d4] = t2 - t4;
; f[j+d3] = t2 + t4;
	fld	_t2
	fld	st0
	fsub	st0, st2		; st : f4, t2, t4, t3
	fstp	qword[ebx+edx*8]	; st : t2, t4, t3
	fadd	st0, st1		; st : f3, t4, t3
	fstp	qword[ebx+edi*8]	; st : t4, t3

; f[j+d2] = t1 - t3;
; f[j]    = t1 + t3;
	fld	_t1			; st : t1, t4, t3
	fst	st1			; st : t1, t1, t3  (t4 is not needed any more)
	fsub	st0, st2		; st : f2, t1, t3
	fstp	qword[ebx+eax*8]	; st : t1, t3
	fadd	st0, st1		; st : f0, t3
	fstp	qword[ebx]		; st : t3
	fstp	st0

; jj = j + d1;
	mov	edi, _d1
	shl	edi, 3		; = d1*8
	mov	edx, edi
	mov	eax, edi
	add	eax, eax	; eax = d2*8
	shl	edx, 2		; = d3*8
	add	edi, ebx	; now [edi] points to f[jj]
	add	edx, edi	; and [edx] points to f[jj+d3]

; t1 = f[jj];
	fld	qword [edi]	; st : t1
; t3 = f[jj+d3];
	fld	qword [edx]	; st : t3, t1

; t2 = f[jj+d2] * r;
	fld	qword [edi+eax]
	fld	[fht_r]
	fmul	st1, st0	; st : r,  t2, t3, t1
; t4 = f[jj+d4] * r
	fmul	qword [edx+eax] ; st : t4, t2, t3, t1

; f[jj]    = t1 + t2 + t3;
	fld	st3		; st : t1, t4, t2, t3, t1
	fadd	st0, st3
	fadd	st0, st2
	fstp	qword [edi]

; f[jj+d2] = t1 - t3 + t4;
	fld	st3
	fsub	st0, st3	; st : (t1-t3), t4, t2, t3, t1
	fld	st0
	fadd	st0, st2	; st : f2, (t1-t3), t4, t2, t3, t1
	fstp	qword [edi+eax]
; f[jj+d4] = t1 - t3 - t4;
	fsub	st0, st1	; st : f4, t4, t2, t3, t1
	fstp	qword [edx+eax]

; f[jj+d3] = t1 - t2 + t3;
	fstp	st0		; st : t2, t3,	t1
	fsubp	st1, st0	; st : (t3-t2), t1
	faddp	st1, st0	; st : f3
	fstp	qword [edx]

; for (k=1; k<d1; k++)
	xor	ecx, ecx	; ecx = k
	mov	_jj, ecx	; jj = 0, running SinCosTable index (k*j5)
align 4
.next_k:
	inc	ecx
	cmp	ecx, _d1
	jge	.done_k
; {
;   (all l-indices below are kept relative to j, i.e. to ebx)
	mov	eax, _d2	; the sector increment
; l1 = j  + k;
	mov	edx, ecx
	mov	_l1, edx	; [ebx+edx*8] --> f[j+k]
; l2 = l1 + d2;
	add	edx, eax
	mov	_l2, edx
; l3 = l1 + d3;
	add	edx, eax
	mov	_l3, edx
; l4 = l1 + d4;
	add	edx, eax
	mov	_l4, edx

; l5 = j  + d2 - k;
	mov	edx, eax
	sub	edx, ecx
	mov	_l5, edx
; l6 = l5 + d2;
	add	edx, eax
	mov	_l6, edx
; l7 = l5 + d3;
	add	edx, eax
	mov	_l7, edx
; l8 = l5 + d4;
	add	edx, eax
	mov	_l8, edx


; 340  :         jj = j5 * k;	// multiplication replaced by repeated addition
	mov	eax, _jj
	add	eax, _j5
	mov	_jj, eax

; c1 = C[jj];
; s1 = S[jj];
	mov	edi, [ebp+20]		; edi -> C[0]
	fld	qword[edi+eax*8]
	mov	esi, [ebp+8]
	shl	esi, 2
	add	esi, edi		; esi -> S[0] = table + N*4 bytes
	fld	qword[esi+eax*8]	; st : s1, c1

; t5 = f[l2] * c1 + f[l6] * s1;
; t8 = f[l6] * c1 - f[l2] * s1;
	mov	edx, _l6
	fld	qword[ebx+edx*8]
	mov	edx, _l2
	fld	st0
	fmul	st0, st2
	fxch	st1
	fmul	st0, st3
	fld	qword[ebx+edx*8]	; st : f[l2], f[l6]*c, f[l6]*s, s, c
	fmul	st4, st0
	fmulp	st3, st0		; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
	fsub	st0, st2		; st :	 t8,	f[l6]*s, f[l2]*s, f[l2]*c
	fstp	_t8
	faddp	st2, st0		; st :	f[l2]*s, t5
	fstp	st0			; st :	t5
	fstp	_t5			; st :

; c2 = C[2*jj];
; s2 = S[2*jj];
	shl	eax, 1
	fld	qword[edi+eax*8]
	fld	qword[esi+eax*8]	; st : s2, c2

; t6 = f[l3] * c2 + f[l7] * s2;
; t9 = f[l7] * c2 - f[l3] * s2;
	mov	edx, _l7
	fld	qword[ebx+edx*8]
	mov	edx, _l3
	fld	st0
	fmul	st0, st2
	fxch	st1
	fmul	st0, st3
	fld	qword[ebx+edx*8]	; st : f[l3], f[l7]*c, f[l7]*s, s, c
	fmul	st4, st0
	fmulp	st3, st0		; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
	fsub	st0, st2		; st :	 t9,	f[l7]*s, f[l3]*s, f[l3]*c
	fstp	_t9
	faddp	st2, st0		; st :	f[l3]*s, t6
	fstp	st0			; st :	t6
	fstp	_t6			; st :

; c3 = C[3*jj];
; s3 = S[3*jj];
	add	eax, _jj
	fld	qword[edi+eax*8]
	fld	qword[esi+eax*8]	; st : s3, c3

; t7 = f[l4] * c3 + f[l8] * s3;
; t0 = f[l8] * c3 - f[l4] * s3;
	mov	edx, _l8
	fld	qword[ebx+edx*8]
	mov	edx, _l4
	fld	st0
	fmul	st0, st2
	fxch	st1
	fmul	st0, st3
	fld	qword[ebx+edx*8]	; st : f[l4], f[l8]*c, f[l8]*s, s, c
	fmul	st4, st0
	fmulp	st3, st0		; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
	fsub	st0, st2		; st :	 t0,	f[l8]*s, f[l4]*s, f[l4]*c
	fstp	_t0
	faddp	st2, st0		; st :	f[l4]*s, t7
	fstp	st0			; st :	t7
	fstp	_t7			; st :

; t1 = f[l5] - t9;
; t2 = f[l5] + t9;
	mov	eax, _l5
	fld	qword [ebx+eax*8]
	fld	_t9
	fld	st0
	fadd	st0, st2
	fstp	_t2
	fsubp	st1, st0
	fstp	_t1

; t3 = - t8  - t0;
	fld	_t8
	fadd	_t0
	fchs
	fstp	_t3
; t4 =   t5  - t7;
	fld	_t5
	fsub	_t7
	fstp	_t4

; f[l5] = t1 + t4;
	fld	_t1
	fld	_t4
	fld	st0
	fadd	st0, st2
	fstp	qword [ebx+eax*8]
; f[l7] = t1 - t4;
	mov	eax, _l7
	fsubp	st1, st0
	fstp	qword [ebx+eax*8]

; f[l6] = t2 + t3;
	mov	eax, _l6
	fld	_t2
	fld	_t3
	fld	st0
	fadd	st0, st2
	fstp	qword [ebx+eax*8]
; f[l8] = t2 - t3;
	mov	eax, _l8
	fsubp	st1, st0
	fstp	qword [ebx+eax*8]

; t1 = f[l1] + t6;
	mov	eax, _l1
	fld	qword [ebx+eax*8]
	fld	_t6
	fld	st0
	fadd	st0, st2
	fstp	_t1
; t2 = f[l1] - t6;
	fsubp	st1, st0
	fstp	_t2

; t3 =    t8 - t0;
	fld	_t8
	fsub	_t0
	fstp	_t3
; t4 =    t5 + t7;
	fld	_t5
	fadd	_t7
	fstp	_t4

; f[l1] = t1 + t4;
	mov	eax, _l1
	fld	_t1
	fld	_t4
	fld	st0
	fadd	st0, st2
	fstp	qword [ebx+eax*8]
; f[l3] = t1 - t4;
	mov	eax, _l3
	fsubp	st1, st0
	fstp	qword [ebx+eax*8]

; f[l2] = t2 + t3;
	mov	eax, _l2
	fld	_t2
	fld	_t3
	fld	st0
	fadd	st0, st2
	fstp	qword [ebx+eax*8]
; f[l4] = t2 - t3;
	mov	eax, _l4
	fsubp	st1, st0
	fstp	qword [ebx+eax*8]

; 374  :       }
	jmp	.next_k

align 4
.done_k:
; 375  :     }
	add	ebx, _d6	; d6 = d5*8
	cmp	ebx, _end_of_array
	jb	.next_j

; 376  :   }
	mov	cx, _step	; restore ch = l
	jmp	.newstep
.done:
	mov	esp, ebp
	pop	ebp
; 377  : }
	ret


		;=========== Step3 ends here ===========


; =================================================================

;=================================================================
; FHT_4 -- in-place radix-4 Fast Hartley Transform of N = 4^p qword elements:
;	   BitInvert, then step1, then step2 (p >= 2), then step3 (p >= 3).
; parameters (stack, relative to ebp after the prologue; the routine
; returns with a plain ret, so the caller removes them):
; -- [ebp+12]  = N  (not read here: N is recomputed from p by BitInvert)
; -- [ebp+16]  = p  (Power_of_4; BitInvert uses only its low 4 bits)
; -- [ebp+20]  = data array  address (BitInvert drops its low 4 address bits)
; -- [ebp+24]  = SinCosTable address (used by step3 only)
; NOTE(review): [ebp+8] is not read by this routine -- confirm with the caller
; what is passed there.
; returns:
; -- nothing
; destroys:
; -- all GPRegs
;; ==========================

align 4

FHT_4:
	push	ebp
	mov	ebp, esp

	cmp	byte[ebp+16], 0 ; p = 0 ? a single element needs no transform,
	jz	.done		; while step1 always works on groups of 4

	mov	edx, [ebp+20]	; a
	mov	dl, byte[ebp+16]
	call BitInvert		; returns ecx = N, edx = p
	push	dword[ebp+20]	; a
	push	ecx		; N
	call	step1		; 4-point transform (arguments stay on the stack)
	cmp	byte[ebp+16], 1 ; p = 1 ? (N = 4: done; step2 would process 16
	jz	.done		;	   elements and run past the array)
	call	step2		; 16-point transform, same two arguments
	cmp	byte[ebp+16], 2 ; p = 2 ?
	jz	.done
	pop	edx		; N
	pop	ecx		; a
	push	dword[ebp+24]	; t
	push	ecx
	push	dword[ebp+16]	; p
	push	edx		; N
	call	step3
.done:
	mov	esp, ebp	; also discards whatever arguments are still pushed
	pop	ebp

	ret