Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2210 art_zh 1
;           Fast Hartley Transform routine
2
;           Copyright (C) 1999, 2004, 2010
3
;          Artem Jerdev  artem@jerdev.co.uk
4
;
5
; free KolibriOS version - not to be ported to other OSes
6
; ==========================================================
7
 
8
 
9
; global constants
10
align 8
11
_r	dq	1.41421356237309504880169	; = sqrt(2)
12
_r2	dq	0.70710678118654752440084	; = sqrt(2)/2
13
_c1	dq	0.92387953251128675612818	; = cos(pi/8)
14
_s1	dq	0.38268343236508977172846	; = sin(pi/8)
15
 
16
;=================================================================
17
; parameter1:
18
; -- reg dl (bits[3:0])   = Power_of_4
19
; returns:
20
; -- reg edx = _CosTable address (4k-aligned)
21
; assumes:     _SinTable  = _CosTable + (N/2)*8
22
; user heap has to be initialized
23
; destroys:
24
; -- eax, ebx, ecx
25
;; ==========================
26
align 4
27
CreateSinCosTable:
28
	xor	eax,  eax
29
	inc	eax
30
	mov	cl,  dl
31
	and	cl,  15
32
	shl	eax, cl
33
	shl	eax, cl
34
	mov	ecx, eax		; now ecx = N
35
	shl	ecx, 3
36
	mov	ebx, 12
37
	mov	eax, 68
38
	int	0x40			; getmem(N*sizeof(double))
39
 
40
	mov	edx, eax		; edx = _CosTable
41
	shr	ecx, 1
42
	mov	ebx, eax
43
	add	ebx, ecx		; ebx = _SinTable
44
	shr	ecx, 3
45
	push	ecx			; [esp] = ecx = N/2
46
 
47
	xor	eax,  eax
48
	fldpi
49
	fidiv	dword[esp]		; st : dx = 2*pi/N
50
	pop	ecx
51
	fldz				; st : 0, dx
52
.loop:
53
	fld	st0			; st : x, x, dx
54
	FSINCOS 			; st : cos, sin, x, dx
55
	fstp	qword [edx+eax*8]	; st : sin, x, dx
56
	fstp	qword [ebx+eax*8]	; st : x, dx
57
	fadd	st0, st1		; st : x+dx, dx
58
 
59
	inc	eax
60
	cmp	eax, ecx
61
	jne	.loop
62
	fstp	st0			; st : dx
63
	fstp	st0			; st : 
64
ret
65
 
66
;=================================================================
67
; parameter1:
68
; -- reg edx = _CosTable address
69
; destroys:
70
; -- eax, ebx, ecx
71
;; ==========================
72
align 4
73
DestroySinCosTable:
74
	mov	ecx, edx
75
	mov	ebx, 13
76
	mov	eax, 68
77
	int	0x40			; free(SinCosTable)
78
ret
79
 
80
;=================================================================
81
; parameter1:
82
; -- reg  dl (bits[3:0])   = Power_of_4
83
; -- reg edx && (-16) = 4k-aligned data array address
84
; returns:
85
; -- edx = Power_of_4
86
; -- ecx = N
87
; destroys:
88
; -- eax, ebx, ecx, edx, esi
89
;; ==========================
90
align 4
91
BitInvert:
92
	mov	esi, edx
93
	and	esi, 0xFFFFFFF0
94
	and	edx, 0x0F
95
	push	edx
96
	mov	cl, dl
97
	xor	eax, eax
98
	inc	eax
99
	shl	eax, cl
100
	shl	eax, cl
101
	push	eax
102
	xor	ecx, ecx		; index term
103
.newterm:
104
	inc	ecx
105
	cmp	ecx, [esp]		; N
106
	jge	.done
107
 
108
	xor	eax, eax
109
	mov	edx, ecx
110
	xor	bl, bl
111
 
112
.do_invert:
113
	inc	bl
114
	cmp	bl, byte[esp+4] ; Power_of_4
115
	jg	.switch
116
 
117
	mov	bh, dl
118
	and	bh,  3
119
	shl	eax, 2
120
	or	al, bh
121
	shr	edx, 2
122
	jmp	.do_invert
123
 
124
.switch:
125
	cmp	eax, ecx
126
	jle	.newterm
127
 
128
	fld	qword [esi+eax*8]
129
	fld	qword [esi+ecx*8]
130
	fstp	qword [esi+eax*8]
131
	fstp	qword [esi+ecx*8]
132
	jmp	.newterm
133
 
134
.done:
135
	pop	ecx
136
	pop	edx
137
	ret
138
 
139
;=================================================================
140
 
141
 
142
;=================================================================
143
; stdcall parameters:
144
; -- [esp+4]  = N
145
; -- [esp+8]  = 4k-aligned data array  address
146
; returns:
147
; -- nothing
148
; destroys:
149
; -- ebx, esi
150
;; ==========================
151
align 4
152
step1:
153
	mov	ebx, [esp+8]
154
	mov	esi, [esp+4]
155
	shl	esi, 3
156
	add	esi, ebx
157
 
158
.loop:
159
	fld	qword[ebx]
160
	fld	qword[ebx+8]
161
	fld	st1
162
	fsub	st0, st1	; st : t2, f[i+1], f[i]
163
	fxch	st1		; st : f[i+1], t2, f[i]
164
	faddp	st2, st0	; st : t2, t1
165
	fld	qword[ebx+16]
166
	fld	qword[ebx+24]
167
	fld	st1		; st : f[i+2], f[i+3], f[i+2], t2, t1
168
	fadd	st0, st1	; st : t3, f[i+3], f[i+2], t2, t1
169
	fxch	st2		; st : f[i+2], f[i+3], t3, t2, t1
170
	fsub	st0, st1	; st : t4, f[i+3], t3, t2, t1
171
	fstp	st1		; st : t4, t3, t2, t1
172
	fld	st2		; st : t2, t4, t3, t2, t1
173
	fadd	st0, st1	; st : t2+t4, t4, t3, t2, t1
174
	fstp	qword[ebx+16]	; st : t4, t3, t2, t1
175
	fsubp	st2, st0	; st : t3, t2-t4, t1
176
	fld	st2		; st : t1, t3, t2-t4, t1
177
	fadd	st0, st1	; st : t1+t3, t3, t2-t4, t1
178
	fstp	qword[ebx]	; st : t3, t2-t4, t1
179
	fsubp	st2, st0	; st : t2-t4, t1-t3
180
	fstp	qword[ebx+24]	; st : t1-t3
181
	fstp	qword[ebx+8]	; st : 
182
 
183
	add	ebx, 32
184
	cmp	ebx, esi
185
	jnz	.loop
186
ret
187
 
188
;       local stack definitions
189
;===========================================================================
190
_t0	equ	dword [esp]
191
_t1	equ	dword[esp+4]
192
_t2	equ	dword[esp+8]
193
_t3	equ	dword[esp+12]
194
_t4	equ	dword[esp+16]
195
_t5	equ	dword[esp+20]
196
_t6	equ	dword[esp+24]
197
_t7	equ	dword[esp+28]
198
_t8	equ	dword[esp+32]
199
_t9	equ	dword[esp+36]
200
 
201
_l1   equ	dword[esp+40]
202
_l2   equ	dword[esp+44]
203
_l3   equ	dword[esp+48]
204
_l4   equ	dword[esp+52]
205
_l5   equ	dword[esp+56]
206
_l6   equ	dword[esp+60]
207
_l7   equ	dword[esp+64]
208
_l8   equ	dword[esp+68]
209
_l9   equ	dword[esp+72]
210
_l0   equ	dword[esp+76]
211
_d1   equ	dword[esp+80]
212
_d2   equ	dword[esp+84]
213
_d3   equ	dword[esp+88]
214
_d4   equ	dword[esp+92]
215
_d5   equ	dword[esp+96]
216
_d6   equ	dword[esp+100]
217
_j5   equ	dword[esp+104]
218
_jj   equ	dword[esp+108]
219
_end_of_array	equ	dword[esp+112]
220
_step		equ	word [esp+116]
221
 
222
 
223
;=================================================================
224
; cdecl parameters:
225
; -- [ebp+8]   = N
226
; -- [ebp+12]  = 4k-aligned data array  address
227
; returns:
228
; -- nothing
229
; destroys:
230
; -- eax, ebx
231
; locals:
232
; -- 10 stack-located dwords (_t0 ... _t9)
233
;; ==========================
234
align 4
235
step2:
236
	push	ebp
237
	mov	ebp, esp
238
	sub	esp, 40
239
	mov	ebx, [ebp+12]
240
	mov	eax, [ebp+ 8]
241
	shl	eax, 3
242
	add	eax, ebx
243
 
244
.loop_i:
245
 
246
; -- quad subelements  +0, +4, +8 and +12 (simpliest operations)
247
	fld	qword[ebx]
248
	fld	qword[ebx+8*4]
249
	fld	st0
250
	fadd	st0, st2	; st : t1, f_4, f_0
251
	fxch	st1
252
	fsubp	st2, st0	; st : t1, t2
253
	fld	qword[ebx+8*8]
254
	fld	qword[ebx+8*12]
255
	fld	st0
256
	fadd	st0, st2	; st : t3, f_12, t1, t2
257
	fxch	st1
258
	fsubp	st2, st0	; st : t3, t4, t1, t2
259
	; ------
260
	fld	st2		; st : t1, t3, t4, t1, t2
261
	fadd	st0, st1
262
	fstp	qword[ebx]	; st : t3, t4, t1, t2
263
	fsub	st0, st2	; st : t3-t1, t4, t1, t2
264
	fchs			; st : t1-t3, t4, t1, t2
265
	fstp	qword[ebx+8*4]	; st : t4, t1, t2
266
	fst	st1		; st : t4, t4, t2
267
	fadd	st0, st2	; st : t2+t4, t4, t2
268
	fstp	qword[ebx+8*8]	; st : t4, t2
269
	fsubp	st1, st0	; st : t2-t4
270
	fstp	qword[ebx+8*12] ; st : 
271
 
272
; -- even subelements  +2, +6, +10 and +14 (2 multiplications needed)
273
	fld	qword[ebx+8*2]
274
	fld	qword[ebx+8*6]
275
	fld	[_r]
276
	fmul	st1, st0	; st : r, t2, t1
277
	fld	qword[ebx+8*10]
278
	fxch	st1		; st : r, t3, t2, t1
279
	fmul	qword[ebx+8*14] ; st : t4, t3, t2, t1
280
	; ------
281
	fld	st3		; st : t1, t4, t3, t2, t1
282
	fadd	st0, st3	;
283
	fadd	st0, st2	;
284
	fst	qword[ebx+8*2]	; store f[i+8] = t1+t2+t3
285
	fsub	st0, st3	;
286
	fsub	st0, st3	;
287
	fstp	qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
288
	fld	st3		; st : t1, t4, t3, t2, t1
289
	fsub	st0, st2	;
290
	fsub	st0, st1	;
291
	fst	qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
292
	fadd	st0, st1	;
293
	faddp	st1, st0	; st : t1-t3+t4, t3, t2, t1
294
	fstp	qword[ebx+8*6]	; store f[i+6]
295
	fstp	st0		; st : t2, t1
296
	fstp	st0		; st : t1
297
	fstp	st0		; st : 
298
 
299
; -- odd subelements
300
	fld	qword[ebx+8*9]
301
	fld	qword[ebx+8*11]
302
	fld	st1
303
	fsub	st0, st1
304
	fxch	st1
305
	faddp	st2, st0	; st : (f[l3]-f[l7]), (f[l3]+f[l7])
306
	fld	[_r2]
307
	fmul	st2, st0
308
	fmulp	st1, st0	; st : t9, t6
309
	fld	qword[ebx+8*3]
310
	fld	st0
311
	fadd	st0, st2	; st : t1, f[l5], t9, t6
312
	fstp	_t1
313
	fsub	st0, st1
314
	fstp	_t2
315
	fstp	_t9	; (t9 never used)
316
	fstp	_t6		; st : 
317
 
318
	fld	[_c1]
319
	fld	[_s1]
320
	fld	qword[ebx+8*5]
321
	fld	qword[ebx+8*7]
322
	fld	st3		; st: c1, f[l6], f[l2], s1, c1
323
	fmul	st0, st2	; st: f_2*c, f_6, f_2, s, c
324
	fld	st1		; st: f_6, f_2*c, f_6, f_2, s, c
325
	fmul	st0, st4	; st: f_6*s, f_2*c, f_6, f_2, s, c
326
	faddp	st1, st0	; st: t5, f_6, f_2, s, c
327
	fstp	_t5		; st: f_6, f_2, s, c
328
	fld	st3		; st: c, f_6, f_2, s, c
329
	fmul	st0, st1
330
	fld	st3
331
	fmul	st0, st3	; st: f_2*s, f_6*c, f_6, f_2, s, c
332
	fsubp	st1, st0	; st: t8, f_6, f_2, s, c
333
	fstp	_t8		; st: f_6, f_2, s, c
334
	fstp	st0		; st: f_2, s, c
335
	fstp	st0		; st: s, c
336
 
337
	fld	qword[ebx+8*13]
338
	fld	qword[ebx+8*15]
339
	fld	st3		; st: c1, f[l8], f[l4], s1, c1
340
	fmul	st0, st1
341
	fld	st3
342
	fmul	st0, st3	; st: f_4*s, f_8*c, f_8, f_4, s, c
343
	faddp	st1, st0	; st: t7, f_8, f_4, s, c
344
	fld	_t5		; st: t5, t7, f_8, f_4, s, c
345
	fsub	st0, st1	; st: t4, t7, f_8, f_4, s, c
346
	fstp	_t4
347
	fstp	_t7		; st: f_8, f_4, s, c
348
	fld	st3		; st: c, f_8, f_4, s, c
349
	fmul	st0, st2
350
	fld	st3
351
	fmul	st0, st2	; st: f_8*s, f_4*c, f_8, f_4, s, c
352
	fsubp	st1, st0	; st:-t0, f_8, f_4, s, c
353
	fchs
354
	fld	_t8
355
	fchs			; st:-t8, t0, f_8, f_4, s, c
356
	fsub	st0, st1	; st: t3, t0, f_8, f_4, s, c
357
	fstp	_t3
358
	fstp	_t0		; st: f_8, f_4, s, c
359
	fstp	st0		; st: f_4, s, c
360
	fstp	st0		; st: s, c
361
	fstp	st0		; st: c
362
	fstp	st0		; st: 
363
 
364
	fld	_t1
365
	fld	_t4
366
	fld	st1
367
	fsub	st0, st1
368
	fstp	qword[ebx+8*11] ; f[l7] = t1-t4
369
	faddp	st1, st0
370
	fstp	qword[ebx+8*3]	; f[l5] = t1+t4
371
	fld	_t2
372
	fld	_t3
373
	fld	st1
374
	fsub	st0, st1
375
	fstp	qword[ebx+8*15] ; f[l8]
376
	faddp	st1, st0
377
	fstp	qword[ebx+8*7]	; f[l6]
378
 
379
	fld	_t6
380
	fld	qword[ebx+8]
381
	fld	st1
382
	fsub	st0, st1
383
	fxch	st1
384
	faddp	st2, st0	; st : t2, t1
385
	fld	_t8
386
	fsub	_t0
387
	fld	_t5
388
	fadd	_t7		; st : t4, t3, t2, t1
389
 
390
	fld	st3
391
	fsub	st0, st1
392
	fstp	qword[ebx+8*9]	; f[l3] = t1-t4
393
	fadd	st0, st3
394
	fstp	qword[ebx+8]	; f[l1] = t1+t4
395
	fld	st1		; st : t2, t3, t2, t1
396
	fsub	st0, st1	; f[l4] = t2-t3
397
	fstp	qword[ebx+8*13] ; st : t3, t2, t1
398
	faddp	st1, st0	; st : t2+t3, t1
399
	fstp	qword[ebx+8*5]	; f[l2] = t2+t3
400
	fstp	st0		; st : 
401
 
402
	add	ebx, 16*8
403
	cmp	ebx, eax
404
	jb	.loop_i
405
 
406
	mov	esp, ebp
407
	pop	ebp
408
ret
409
 
410
 
411
 
412
 
413
;=================================================================
414
; cdecl parameters:
415
; -- [ebp+8]   = N
416
; -- [ebp+12]  = p
417
; -- [ebp+16]  = 4k-aligned data array  address
418
; -- [ebp+20]  = 4k-aligned SinCosTable address
419
; returns:
420
; -- nothing
421
; destroys:
422
; -- all GPRegs
423
; locals:
424
; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
425
;; ==========================
426
align 4
427
step3:
428
	push	ebp
429
	mov	ebp, esp
430
	sub	esp, 120
431
; 283  : {
432
 
433
 
434
; 293  :   for (l=3; l<=p; l++)
435
	mov	cx, 0x0200
436
.newstep:
437
	inc	ch
438
	cmp	ch, byte[ebp+12]
439
	jg	.done
440
	mov	_step, cx
441
 
442
; 294  :   {
443
; 295  :     d1 = 1 << (l + l - 3);
444
 
445
	mov	cl, ch
446
	add	cl, cl
447
	sub	cl, 3
448
	mov	edx, 1
449
	shl	edx, cl
450
	mov	_d1, edx
451
 
452
; 296  :     d2 = d1 << 1;
453
	shl	edx, 1
454
	mov	_d2, edx
455
	mov	eax, edx
456
 
457
; 297  :     d3 = d2 << 1;
458
	shl	edx, 1
459
	mov	_d3, edx
460
 
461
; 298  :     d4 = d2 + d3;
462
	add	eax, edx
463
	mov	_d4, eax
464
 
465
; 299  :     d5 = d3 << 1;
466
	shl	edx, 1
467
	mov	_d5, edx
468
	shl	edx, 3
469
	mov	_d6, edx	; d6 = d5*8 to simplify index operations
470
 
471
; 339  :         j5 = N / d5;   ; moved out of internal loop
472
	mov	cl, [ebp+12]
473
	sub	cl, ch
474
	add	cl, cl
475
	mov	edx, 1
476
	shl	edx, cl
477
	mov	_j5, edx
478
 
479
; 300  :
480
; 301  :     for (j=0; j
481
	mov	ebx, [ebp+16]
482
	mov	esi, [ebp+8]
483
	shl	esi, 3
484
	add	esi, ebx
485
	mov	_end_of_array, esi
486
 
487
.next_j:
488
 
489
; {
490
; t1 = f[j] + f[j+d2];
491
	mov	eax, _d2
492
	fld	qword[ebx]
493
	fld	qword[ebx+eax*8]
494
	fld	st1
495
	fadd	st0, st1
496
	fstp	_t1
497
 
498
; t2 = f[j] - f[j+d2];
499
	fsubp	st1, st0
500
	fstp	_t2
501
 
502
; t3 = f[j+d3] + f[j+d4];
503
	mov	edi, _d3
504
	fld	qword[ebx+edi*8]
505
	mov	edx, _d4
506
	fld	qword[ebx+edx*8]
507
	fld	st1
508
	fsub	st0, st1		; st : t4, f4, f3
509
	fxch	st1			; st : f4, t4, f3
510
 
511
; t4 = f[j+d3] - f[j+d4];
512
	faddp	st2, st0		; st : t4, t3
513
 
514
; f[j+d4] = t2 - t4;
515
; f[j+d3] = t2 + t4;
516
	fld	_t2
517
	fld	st0
518
	fsub	st0, st2		; st : f4, t2, t4, t3
519
	fstp	qword[ebx+edx*8]	; st : t2, t4, t3
520
	fadd	st0, st1		; st : f3, t4, t3
521
	fstp	qword[ebx+edi*8]	; st : t4, t3
522
 
523
; f[j+d2] = t1 - t3;
524
; f[j]    = t1 + t3;
525
	fld	_t1
526
	fst	st1
527
	fsub	st0, st2		; st : f2, t1, t3
528
	fstp	qword[ebx+eax*8]	; st : t1, t3
529
	fadd	st0, st1		; st : f0, t3
530
	fstp	qword[ebx]		; st : t3
531
	fstp	st0
532
 
533
; jj = j + d1;     / ??
534
	mov	edi, _d1
535
	shl	edi, 3		; = d1*8
536
	mov	edx, edi
537
	mov	eax, edi
538
	add	eax, eax	; eax = d2*8
539
	shl	edx, 2		; = d3*8
540
	add	edi, ebx	; now [edi] points to f[jj]
541
	add	edx, edi	; and [edx] points to f[jj+d3]
542
 
543
; t1 = f[jj];
544
	fld	qword [edi]	; st : t1
545
; t3 = f[jj+d3];
546
	fld	qword [edx]	; st : t3, t1
547
 
548
; t2 = f[jj+d2] * r;
549
	fld	qword [edi+eax]
550
	fld	[_r]
551
	fmul	st1, st0	; st : r,  t2, t3, t1
552
; t4 = f[jj+d4] * r
553
	fmul	qword [edx+eax] ; st : t4, t2, t3, t1
554
 
555
; f[jj]    = t1 + t2 + t3;
556
	fld	st3		; st : t1, t4, t2, t3, t1
557
	fadd	st0, st3
558
	fadd	st0, st2
559
	fstp	qword [edi]
560
 
561
; f[jj+d2] = t1 - t3 + t4;
562
	fld	st3
563
	fsub	st0, st3	; st : (t1-t3), t4, t2, t3, t1
564
	fld	st0
565
	fadd	st0, st2	; st : f2, (t1-t3), t4, t2, t3, t1
566
	fstp	qword [edi+eax]
567
; f[jj+d4] = t1 - t3 - t4;
568
	fsub	st0, st1	; st : f4, t4, t2, t3, t1
569
	fstp	qword [edx+eax]
570
 
571
; f[jj+d3] = t1 - t2 + t3;
572
	fstp	st0		; st : t2, t3,  t1
573
	fsubp	st1, st0	; st : (t3-t2), t1
574
	faddp	st1, st0	; st : f3
575
	fstp	qword [edx]
576
 
577
; for (k=1; k
578
	xor	ecx, ecx	; ecx = k
579
	mov	_jj, ecx
580
.next_k:
581
	inc	ecx
582
	cmp	ecx, _d1
583
	jge	.done_k
584
; {
585
	mov	eax, _d2	; the sector increment
586
; l1 = j  + k;
587
	mov	edx, ecx
588
	mov	_l1, edx	; [ebx+edx*8] --> f[j+k]
589
; l2 = l1 + d2;
590
	add	edx, eax
591
	mov	_l2, edx
592
; l3 = l1 + d3;
593
	add	edx, eax
594
	mov	_l3, edx
595
; l4 = l1 + d4;
596
	add	edx, eax
597
	mov	_l4, edx
598
 
599
; l5 = j  + d2 - k;
600
	mov	edx, eax
601
	sub	edx, ecx
602
	mov	_l5, edx
603
; l6 = l5 + d2;
604
	add	edx, eax
605
	mov	_l6, edx
606
; l7 = l5 + d3;
607
	add	edx, eax
608
	mov	_l7, edx
609
; l8 = l5 + d4;
610
	add	edx, eax
611
	mov	_l8, edx
612
 
613
 
614
; 340  :         j5 *= k;       // add-substituted multiplication
615
	mov	eax, _jj
616
	add	eax, _j5
617
	mov	_jj, eax
618
 
619
; c1 = C[jj];
620
; s1 = S[jj];
621
	mov	edi, [ebp+20]
622
	fld	qword[edi+eax*8]
623
	mov	esi, [ebp+8]
624
	shl	esi, 2
625
	add	esi, edi
626
	fld	qword[esi+eax*8]	; st : s1, c1
627
 
628
; t5 = f[l2] * c1 + f[l6] * s1;
629
; t8 = f[l6] * c1 - f[l2] * s1;
630
	mov	edx, _l6
631
	fld	qword[ebx+edx*8]
632
	mov	edx, _l2
633
	fld	st0
634
	fmul	st0, st2
635
	fxch	st1
636
	fmul	st0, st3
637
	fld	qword[ebx+edx*8]	; st : f[l2], f[l6]*c, f[l6]*s, s, c
638
	fmul	st4, st0
639
	fmulp	st3, st0		; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
640
	fsub	st0, st2		; st :   t8,    f[l6]*s, f[l2]*s, f[l2]*c
641
	fstp	_t8
642
	faddp	st2, st0		; st :  f[l2]*s, t5
643
	fstp	st0			; st :  t5
644
	fstp	_t5			; st :  
645
 
646
; c2 = C[2*jj];
647
; s2 = S[2*jj];
648
	shl	eax, 1
649
	fld	qword[edi+eax*8]
650
	fld	qword[esi+eax*8]	; st : s2, c2
651
 
652
; t6 = f[l3] * c2 + f[l7] * s2;
653
; t9 = f[l7] * c2 - f[l3] * s2;
654
	mov	edx, _l7
655
	fld	qword[ebx+edx*8]
656
	mov	edx, _l3
657
	fld	st0
658
	fmul	st0, st2
659
	fxch	st1
660
	fmul	st0, st3
661
	fld	qword[ebx+edx*8]	; st : f[l3], f[l7]*c, f[l7]*s, s, c
662
	fmul	st4, st0
663
	fmulp	st3, st0		; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
664
	fsub	st0, st2		; st :   t9,    f[l7]*s, f[l3]*s, f[l3]*c
665
	fstp	_t9
666
	faddp	st2, st0		; st :  f[l2]*s, t6
667
	fstp	st0			; st :  t6
668
	fstp	_t6			; st :  
669
 
670
; c3 = C[3*jj];
671
; s3 = S[3*jj];
672
	add	eax, _jj
673
	fld	qword[edi+eax*8]
674
	fld	qword[esi+eax*8]	; st : s3, c3
675
 
676
; t7 = f[l4] * c3 + f[l8] * s3;
677
; t0 = f[l8] * c3 - f[l4] * s3;
678
	mov	edx, _l8
679
	fld	qword[ebx+edx*8]
680
	mov	edx, _l4
681
	fld	st0
682
	fmul	st0, st2
683
	fxch	st1
684
	fmul	st0, st3
685
	fld	qword[ebx+edx*8]	; st : f[l4], f[l8]*c, f[l8]*s, s, c
686
	fmul	st4, st0
687
	fmulp	st3, st0		; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
688
	fsub	st0, st2		; st :   t9,    f[l8]*s, f[l4]*s, f[l4]*c
689
	fstp	_t0
690
	faddp	st2, st0		; st : f[l2]*s, t7
691
	fstp	st0			; st :  t7
692
	fstp	_t7			; st :  
693
 
694
; t1 = f[l5] - t9;
695
; t2 = f[l5] + t9;
696
	mov	eax, _l5
697
	fld	qword [ebx+eax*8]
698
	fld	_t9
699
	fld	st0
700
	fadd	st0, st2
701
	fstp	_t2
702
	fsubp	st1, st0
703
	fstp	_t1
704
 
705
; t3 = - t8  - t0;
706
	fld	_t8
707
	fadd	_t0
708
	fchs
709
	fstp	_t3
710
; t4 =   t5  - t7;
711
	fld	_t5
712
	fsub	_t7
713
	fstp	_t4
714
 
715
; f[l5] = t1 + t4;
716
	fld	_t1
717
	fld	_t4
718
	fld	st0
719
	fadd	st0, st2
720
	fstp	qword [ebx+eax*8]
721
; f[l7] = t1 - t4;
722
	mov	eax, _l7
723
	fsubp	st1, st0
724
	fstp	qword [ebx+eax*8]
725
 
726
; f[l6] = t2 + t3;
727
	mov	eax, _l6
728
	fld	_t2
729
	fld	_t3
730
	fld	st0
731
	fadd	st0, st2
732
	fstp	qword [ebx+eax*8]
733
; f[l8] = t2 - t3;
734
	mov	eax, _l8
735
	fsubp	st1, st0
736
	fstp	qword [ebx+eax*8]
737
 
738
; t1 = f[l1] + t6;
739
	mov	eax, _l1
740
	fld	qword [ebx+eax*8]
741
	fld	_t6
742
	fld	st0
743
	fadd	st0, st2
744
	fstp	_t1
745
; t2 = f[l1] - t6;
746
	fsubp	st1, st0
747
	fstp	_t2
748
 
749
; t3 =    t8 - t0;
750
	fld	_t8
751
	fsub	_t0
752
	fstp	_t3
753
; t4 =    t5 + t7;
754
	fld	_t5
755
	fadd	_t7
756
	fstp	_t4
757
 
758
; f[l1] = t1 + t4;
759
	mov	eax, _l1
760
	fld	_t1
761
	fld	_t4
762
      fld     st0
763
	fadd	st0, st2
764
	fstp	qword [ebx+eax*8]
765
; f[l3] = t1 - t4;
766
	mov	eax, _l3
767
	fsubp	st1, st0
768
	fstp	qword [ebx+eax*8]
769
 
770
; f[l2] = t2 + t3;
771
	mov	eax, _l2
772
	fld	_t2
773
	fld	_t3
774
	fld	st0
775
	fadd	st0, st2
776
	fstp	qword [ebx+eax*8]
777
; f[l4] = t2 - t3;
778
	mov	eax, _l4
779
	fsubp	st1, st0
780
	fstp	qword [ebx+eax*8]
781
 
782
; 374  :       }
783
	jmp	.next_k
784
 
785
.done_k:
786
; 375  :     }
787
	add	ebx, _d6	; d6 = d5*8
788
	cmp	ebx, _end_of_array
789
	jb	.next_j
790
 
791
; 376  :   }
792
	mov	cx, _step
793
	jmp	.newstep
794
.done:
795
	mov	esp, ebp
796
	pop	ebp
797
; 377  : }
798
	ret
799
 
800
 
801
		;=========== Step3 ends here ===========
802
 
803
 
804
; =================================================================
805
 
806
;=================================================================
807
; parameters:
808
; -- [ebp+8]   = N
809
; -- [ebp+12]  = p
810
; -- [ebp+16]  = 4k-aligned data array  address
811
; -- [ebp+20]  = 4k-aligned SinCosTable address
812
; returns:
813
; -- nothing
814
; destroys:
815
; -- all GPRegs
816
;; ==========================
817
 
818
align 4
819
 
820
FHT_4:
821
 
822
	push	ebp
823
	mov	ebp, esp
824
	mov	edx, [ebp+16]
825
	add	edx, [ebp+12]
826
	call BitInvert
827
	push	dword[ebp+16]
828
	push	dword[ebp+8]
829
	call	step1
830
	call	step2
831
	pop	edx		; N
832
	pop	ecx		; a
833
	push	dword[ebp+20]	; t
834
	push	ecx
835
	push	dword[ebp+12]	; p
836
	push	edx		; N
837
	call	step3
838
	mov	esp, ebp
839
	pop	ebp
840
 
841
ret