Subversion Repositories Kolibri OS

Rev

Rev 3474 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
7559 art_zh 1
;	    Fast Hartley Transform routine
2
;	 Copyright (C) 1999, 2004, 2010, 2018
3
;	   Artem Jerdev  artem@jerdev.co.uk
3474 art_zh 4
;
5
; free KolibriOS version - not to be ported to other OSes
6
; ==========================================================
7
 
8
 
9
; global constants
10
align 8
11
_r	dq	1.41421356237309504880169	; = sqrt(2)
12
_r2	dq	0.70710678118654752440084	; = sqrt(2)/2
13
_c1	dq	0.92387953251128675612818	; = cos(pi/8)
14
_s1	dq	0.38268343236508977172846	; = sin(pi/8)
15
 
16
;=================================================================
17
; parameter1:
18
; -- reg dl (bits[3:0])   = Power_of_4
19
; returns:
20
; -- reg edx = _CosTable address (4k-aligned)
21
; assumes:     _SinTable  = _CosTable + (N/2)*8
22
; user heap has to be initialized
23
; destroys:
24
; -- eax, ebx, ecx
25
;; ==========================
26
align 4
27
CreateSinCosTable:
28
	xor	eax,  eax
29
	inc	eax
30
	mov	cl,  dl
31
	and	cl,  15
32
	shl	eax, cl
33
	shl	eax, cl
34
	mov	ecx, eax		; now ecx = N
35
	shl	ecx, 3
36
	mov	ebx, 12
37
	mov	eax, 68
38
	int	0x40			; getmem(N*sizeof(double))
39
 
40
	mov	edx, eax		; edx = _CosTable
41
	shr	ecx, 1
42
	mov	ebx, eax
43
	add	ebx, ecx		; ebx = _SinTable
44
	shr	ecx, 3
45
	push	ecx			; [esp] = ecx = N/2
46
 
47
	xor	eax,  eax
48
	fldpi
49
	fidiv	dword[esp]		; st : dx = 2*pi/N
50
	pop	ecx
51
	fldz				; st : 0, dx
52
.loop:
53
	fld	st0			; st : x, x, dx
54
	FSINCOS 			; st : cos, sin, x, dx
55
	fstp	qword [edx+eax*8]	; st : sin, x, dx
56
	fstp	qword [ebx+eax*8]	; st : x, dx
57
	fadd	st0, st1		; st : x+dx, dx
58
 
59
	inc	eax
60
	cmp	eax, ecx
61
	jne	.loop
62
	fstp	st0			; st : dx
63
	fstp	st0			; st : 
2215 art_zh 64
ret
3474 art_zh 65
 
2215 art_zh 66
;=================================================================
3474 art_zh 67
; parameter1:
68
; -- reg edx = _CosTable address
69
; destroys:
70
; -- eax, ebx, ecx
71
;; ==========================
72
align 4
73
DestroySinCosTable:
74
	mov	ecx, edx
75
	mov	ebx, 13
76
	mov	eax, 68
77
	int	0x40			; free(SinCosTable)
78
ret
79
 
80
;=================================================================
81
; parameter1:
82
; -- reg  dl (bits[3:0])   = Power_of_4
83
; -- reg edx && (-16) = 4k-aligned data array address
84
; returns:
85
; -- edx = Power_of_4
86
; -- ecx = N
87
; destroys:
88
; -- eax, ebx, ecx, edx, esi
89
;; ==========================
90
align 4
91
BitInvert:
92
	mov	esi, edx
93
	and	esi, 0xFFFFFFF0
94
	and	edx, 0x0F
95
	push	edx
96
	mov	cl, dl
97
	xor	eax, eax
98
	inc	eax
99
	shl	eax, cl
100
	shl	eax, cl
101
	push	eax
102
	xor	ecx, ecx		; index term
103
.newterm:
104
	inc	ecx
105
	cmp	ecx, [esp]		; N
106
	jge	.done
107
 
108
	xor	eax, eax
109
	mov	edx, ecx
110
	xor	bl, bl
111
 
112
.do_invert:
113
	inc	bl
114
	cmp	bl, byte[esp+4] ; Power_of_4
115
	jg	.switch
116
 
117
	mov	bh, dl
118
	and	bh,  3
119
	shl	eax, 2
120
	or	al, bh
121
	shr	edx, 2
122
	jmp	.do_invert
123
 
124
.switch:
125
	cmp	eax, ecx
126
	jle	.newterm
127
 
128
	fld	qword [esi+eax*8]
129
	fld	qword [esi+ecx*8]
130
	fstp	qword [esi+eax*8]
131
	fstp	qword [esi+ecx*8]
132
	jmp	.newterm
133
 
134
.done:
135
	pop	ecx
136
	pop	edx
137
	ret
138
 
139
;=================================================================
140
 
141
 
142
;=================================================================
143
; stdcall parameters:
144
; -- [esp+4]  = N
145
; -- [esp+8]  = 4k-aligned data array  address
146
; returns:
147
; -- nothing
148
; destroys:
149
; -- ebx, esi
150
;; ==========================
151
align 4
152
step1:
153
	mov	ebx, [esp+8]
154
	mov	esi, [esp+4]
155
	shl	esi, 3
156
	add	esi, ebx
157
 
158
.loop:
159
	fld	qword[ebx]
160
	fld	qword[ebx+8]
161
	fld	st1
162
	fsub	st0, st1	; st : t2, f[i+1], f[i]
163
	fxch	st1		; st : f[i+1], t2, f[i]
164
	faddp	st2, st0	; st : t2, t1
165
	fld	qword[ebx+16]
166
	fld	qword[ebx+24]
167
	fld	st1		; st : f[i+2], f[i+3], f[i+2], t2, t1
168
	fadd	st0, st1	; st : t3, f[i+3], f[i+2], t2, t1
169
	fxch	st2		; st : f[i+2], f[i+3], t3, t2, t1
170
	fsub	st0, st1	; st : t4, f[i+3], t3, t2, t1
171
	fstp	st1		; st : t4, t3, t2, t1
172
	fld	st2		; st : t2, t4, t3, t2, t1
173
	fadd	st0, st1	; st : t2+t4, t4, t3, t2, t1
174
	fstp	qword[ebx+16]	; st : t4, t3, t2, t1
175
	fsubp	st2, st0	; st : t3, t2-t4, t1
176
	fld	st2		; st : t1, t3, t2-t4, t1
177
	fadd	st0, st1	; st : t1+t3, t3, t2-t4, t1
178
	fstp	qword[ebx]	; st : t3, t2-t4, t1
179
	fsubp	st2, st0	; st : t2-t4, t1-t3
180
	fstp	qword[ebx+24]	; st : t1-t3
181
	fstp	qword[ebx+8]	; st : 
182
 
183
	add	ebx, 32
184
	cmp	ebx, esi
185
	jnz	.loop
186
ret
187
 
188
;=================================================================
7559 art_zh 189
; SSE3 version: Step1
2215 art_zh 190
;
191
;==========================
192
 
193
align 4
194
step1_sse:
195
	mov	ebx, [esp+8]
196
	mov	esi, [esp+4]
197
	shl	esi, 3
198
	add	esi, ebx
199
 
200
.loop:
7559 art_zh 201
	movddup     xmm0, [ebx]     ;	xmm0: f0 ; f0
202
	movddup     xmm1, [ebx+8]   ;	xmm1: f1 ; f1
203
	addsubpd    xmm0, xmm1	    ;	xmm0: t1 ; t2	( + - )
204
    movddup	xmm1, [ebx+16]	;   xmm1: f2 ; f2
205
    movddup	xmm2, [ebx+24]	;   xmm2: f3 ; f3
206
	addsubpd    xmm1, xmm2	    ;	xmm1: t3 ; t4	( + - )
2215 art_zh 207
 
7559 art_zh 208
    movddup	xmm2, xmm0	;   xmm2: t2 ; t2
209
    movddup	xmm3, xmm1	;   xmm3: t4 ; t4
210
	addsubpd    xmm2, xmm3	    ;	xmm2: 2+4; 2-4
211
    shufpd	xmm2, xmm2, 1	;   xmm2: 2-4; 2+4
212
    movapd	[ebx+16], xmm2
2215 art_zh 213
 
7559 art_zh 214
    shufpd	xmm0, xmm0, 1	;   xmm0: t2 ; t1
215
    shufpd	xmm1, xmm1, 1	;   xmm1: t4 ; t3
216
    movddup	xmm2, xmm0	;   xmm2: t1 ; t1
217
    movddup	xmm3, xmm1	;   xmm3: t3 ; t3
218
	addsubpd    xmm2, xmm3	    ;	xmm2: 1+3; 1-3
219
    shufpd	xmm2, xmm2, 1	;   xmm2: 1-3; 1+3
220
    movapd	[ebx], xmm2
2215 art_zh 221
 
222
	add	ebx, 32
223
	cmp	ebx, esi
224
	jnz	.loop
225
ret
3474 art_zh 226
 
7559 art_zh 227
;	local stack definitions
3474 art_zh 228
;===========================================================================
229
_t0	equ	dword [esp]
230
_t1	equ	dword[esp+4]
231
_t2	equ	dword[esp+8]
232
_t3	equ	dword[esp+12]
233
_t4	equ	dword[esp+16]
234
_t5	equ	dword[esp+20]
235
_t6	equ	dword[esp+24]
236
_t7	equ	dword[esp+28]
237
_t8	equ	dword[esp+32]
238
_t9	equ	dword[esp+36]
239
 
240
_l1   equ	dword[esp+40]
241
_l2   equ	dword[esp+44]
242
_l3   equ	dword[esp+48]
243
_l4   equ	dword[esp+52]
244
_l5   equ	dword[esp+56]
245
_l6   equ	dword[esp+60]
246
_l7   equ	dword[esp+64]
247
_l8   equ	dword[esp+68]
248
_l9   equ	dword[esp+72]
249
_l0   equ	dword[esp+76]
250
_d1   equ	dword[esp+80]
251
_d2   equ	dword[esp+84]
252
_d3   equ	dword[esp+88]
253
_d4   equ	dword[esp+92]
254
_d5   equ	dword[esp+96]
255
_d6   equ	dword[esp+100]
256
_j5   equ	dword[esp+104]
257
_jj   equ	dword[esp+108]
258
_end_of_array	equ	dword[esp+112]
259
_step		equ	word [esp+116]
260
 
261
 
262
;=================================================================
263
; cdecl parameters:
264
; -- [ebp+8]   = N
7559 art_zh 265
; -- [ebp+12]  = 4k-aligned data array	address
3474 art_zh 266
; returns:
267
; -- nothing
268
; destroys:
269
; -- eax, ebx
270
; locals:
271
; -- 10 stack-located dwords (_t0 ... _t9)
272
;; ==========================
273
align 4
274
step2:
275
	push	ebp
276
	mov	ebp, esp
277
	sub	esp, 40
278
	mov	ebx, [ebp+12]
279
	mov	eax, [ebp+ 8]
280
	shl	eax, 3
281
	add	eax, ebx
282
 
283
.loop_i:
284
 
285
; -- quad subelements  +0, +4, +8 and +12 (simpliest operations)
286
	fld	qword[ebx]
287
	fld	qword[ebx+8*4]
288
	fld	st0
289
	fadd	st0, st2	; st : t1, f_4, f_0
290
	fxch	st1
291
	fsubp	st2, st0	; st : t1, t2
292
	fld	qword[ebx+8*8]
293
	fld	qword[ebx+8*12]
294
	fld	st0
295
	fadd	st0, st2	; st : t3, f_12, t1, t2
296
	fxch	st1
297
	fsubp	st2, st0	; st : t3, t4, t1, t2
298
	; ------
299
	fld	st2		; st : t1, t3, t4, t1, t2
300
	fadd	st0, st1
301
	fstp	qword[ebx]	; st : t3, t4, t1, t2
302
	fsub	st0, st2	; st : t3-t1, t4, t1, t2
303
	fchs			; st : t1-t3, t4, t1, t2
304
	fstp	qword[ebx+8*4]	; st : t4, t1, t2
305
	fst	st1		; st : t4, t4, t2
306
	fadd	st0, st2	; st : t2+t4, t4, t2
307
	fstp	qword[ebx+8*8]	; st : t4, t2
308
	fsubp	st1, st0	; st : t2-t4
309
	fstp	qword[ebx+8*12] ; st : 
310
 
311
; -- even subelements  +2, +6, +10 and +14 (2 multiplications needed)
312
	fld	qword[ebx+8*2]
313
	fld	qword[ebx+8*6]
314
	fld	[_r]
315
	fmul	st1, st0	; st : r, t2, t1
316
	fld	qword[ebx+8*10]
317
	fxch	st1		; st : r, t3, t2, t1
318
	fmul	qword[ebx+8*14] ; st : t4, t3, t2, t1
319
	; ------
320
	fld	st3		; st : t1, t4, t3, t2, t1
321
	fadd	st0, st3	;
322
	fadd	st0, st2	;
323
	fst	qword[ebx+8*2]	; store f[i+8] = t1+t2+t3
324
	fsub	st0, st3	;
325
	fsub	st0, st3	;
326
	fstp	qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
327
	fld	st3		; st : t1, t4, t3, t2, t1
328
	fsub	st0, st2	;
329
	fsub	st0, st1	;
330
	fst	qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
331
	fadd	st0, st1	;
332
	faddp	st1, st0	; st : t1-t3+t4, t3, t2, t1
333
	fstp	qword[ebx+8*6]	; store f[i+6]
334
	fstp	st0		; st : t2, t1
335
	fstp	st0		; st : t1
336
	fstp	st0		; st : 
337
 
338
; -- odd subelements
339
	fld	qword[ebx+8*9]
340
	fld	qword[ebx+8*11]
341
	fld	st1
342
	fsub	st0, st1
343
	fxch	st1
344
	faddp	st2, st0	; st : (f[l3]-f[l7]), (f[l3]+f[l7])
345
	fld	[_r2]
346
	fmul	st2, st0
347
	fmulp	st1, st0	; st : t9, t6
348
	fld	qword[ebx+8*3]
349
	fld	st0
350
	fadd	st0, st2	; st : t1, f[l5], t9, t6
351
	fstp	_t1
352
	fsub	st0, st1
353
	fstp	_t2
354
	fstp	_t9	; (t9 never used)
355
	fstp	_t6		; st : 
356
 
357
	fld	[_c1]
358
	fld	[_s1]
359
	fld	qword[ebx+8*5]
360
	fld	qword[ebx+8*7]
361
	fld	st3		; st: c1, f[l6], f[l2], s1, c1
362
	fmul	st0, st2	; st: f_2*c, f_6, f_2, s, c
363
	fld	st1		; st: f_6, f_2*c, f_6, f_2, s, c
364
	fmul	st0, st4	; st: f_6*s, f_2*c, f_6, f_2, s, c
365
	faddp	st1, st0	; st: t5, f_6, f_2, s, c
366
	fstp	_t5		; st: f_6, f_2, s, c
367
	fld	st3		; st: c, f_6, f_2, s, c
368
	fmul	st0, st1
369
	fld	st3
370
	fmul	st0, st3	; st: f_2*s, f_6*c, f_6, f_2, s, c
371
	fsubp	st1, st0	; st: t8, f_6, f_2, s, c
372
	fstp	_t8		; st: f_6, f_2, s, c
373
	fstp	st0		; st: f_2, s, c
374
	fstp	st0		; st: s, c
375
 
376
	fld	qword[ebx+8*13]
377
	fld	qword[ebx+8*15]
378
	fld	st3		; st: c1, f[l8], f[l4], s1, c1
379
	fmul	st0, st1
380
	fld	st3
381
	fmul	st0, st3	; st: f_4*s, f_8*c, f_8, f_4, s, c
382
	faddp	st1, st0	; st: t7, f_8, f_4, s, c
383
	fld	_t5		; st: t5, t7, f_8, f_4, s, c
384
	fsub	st0, st1	; st: t4, t7, f_8, f_4, s, c
385
	fstp	_t4
386
	fstp	_t7		; st: f_8, f_4, s, c
387
	fld	st3		; st: c, f_8, f_4, s, c
388
	fmul	st0, st2
389
	fld	st3
390
	fmul	st0, st2	; st: f_8*s, f_4*c, f_8, f_4, s, c
391
	fsubp	st1, st0	; st:-t0, f_8, f_4, s, c
392
	fchs
393
	fld	_t8
394
	fchs			; st:-t8, t0, f_8, f_4, s, c
395
	fsub	st0, st1	; st: t3, t0, f_8, f_4, s, c
396
	fstp	_t3
397
	fstp	_t0		; st: f_8, f_4, s, c
398
	fstp	st0		; st: f_4, s, c
399
	fstp	st0		; st: s, c
400
	fstp	st0		; st: c
401
	fstp	st0		; st: 
402
 
403
	fld	_t1
404
	fld	_t4
405
	fld	st1
406
	fsub	st0, st1
407
	fstp	qword[ebx+8*11] ; f[l7] = t1-t4
408
	faddp	st1, st0
409
	fstp	qword[ebx+8*3]	; f[l5] = t1+t4
410
	fld	_t2
411
	fld	_t3
412
	fld	st1
413
	fsub	st0, st1
414
	fstp	qword[ebx+8*15] ; f[l8]
415
	faddp	st1, st0
416
	fstp	qword[ebx+8*7]	; f[l6]
417
 
418
	fld	_t6
419
	fld	qword[ebx+8]
7559 art_zh 420
	fld	st1		; st : t6, f[l1], t6
421
	fld	st1		; st : f[l1], t6, f[l1], t6
422
	faddp	st3, st0	; st : t6, f[l1], t1
3474 art_zh 423
	fsubp	st1, st0	; st : t2, t1
424
 
425
	fld	_t8
426
	fsub	_t0
427
	fld	_t5
428
	fadd	_t7		; st : t4, t3, t2, t1
429
 
430
	fld	st3
431
	fsub	st0, st1
432
	fstp	qword[ebx+8*9]	; f[l3] = t1-t4
433
	fadd	st0, st3
7559 art_zh 434
	fstp	qword[ebx+8]	  ; f[l1] = t1+t4
3474 art_zh 435
	fld	st1		; st : t2, t3, t2, t1
436
	fsub	st0, st1	; f[l4] = t2-t3
437
	fstp	qword[ebx+8*13] ; st : t3, t2, t1
438
	faddp	st1, st0	; st : t2+t3, t1
439
	fstp	qword[ebx+8*5]	; f[l2] = t2+t3
440
	fstp	st0		; st : 
441
 
442
	add	ebx, 16*8
443
	cmp	ebx, eax
444
	jb	.loop_i
445
 
446
	mov	esp, ebp
447
	pop	ebp
448
ret
449
 
450
 
451
 
452
 
453
;=================================================================
454
; cdecl parameters:
455
; -- [ebp+8]   = N
456
; -- [ebp+12]  = p
7559 art_zh 457
; -- [ebp+16]  = 4k-aligned data array	address
3474 art_zh 458
; -- [ebp+20]  = 4k-aligned SinCosTable address
459
; returns:
460
; -- nothing
461
; destroys:
462
; -- all GPRegs
463
; locals:
464
; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
465
;; ==========================
466
align 4
467
step3:
468
	push	ebp
469
	mov	ebp, esp
470
	sub	esp, 120
471
; 283  : {
472
 
473
 
474
; 293  :   for (l=3; l<=p; l++)
475
	mov	cx, 0x0200
476
.newstep:
477
	inc	ch
478
	cmp	ch, byte[ebp+12]
479
	jg	.done
480
	mov	_step, cx
481
 
482
; 294  :   {
483
; 295  :     d1 = 1 << (l + l - 3);
484
 
485
	mov	cl, ch
486
	add	cl, cl
487
	sub	cl, 3
488
	mov	edx, 1
489
	shl	edx, cl
490
	mov	_d1, edx
491
 
492
; 296  :     d2 = d1 << 1;
493
	shl	edx, 1
494
	mov	_d2, edx
495
	mov	eax, edx
496
 
497
; 297  :     d3 = d2 << 1;
498
	shl	edx, 1
499
	mov	_d3, edx
500
 
501
; 298  :     d4 = d2 + d3;
502
	add	eax, edx
503
	mov	_d4, eax
504
 
505
; 299  :     d5 = d3 << 1;
506
	shl	edx, 1
507
	mov	_d5, edx
508
	shl	edx, 3
509
	mov	_d6, edx	; d6 = d5*8 to simplify index operations
510
 
7559 art_zh 511
; 339  :	 j5 = N / d5;	; moved out of internal loop
3474 art_zh 512
	mov	cl, [ebp+12]
513
	sub	cl, ch
514
	add	cl, cl
515
	mov	edx, 1
516
	shl	edx, cl
517
	mov	_j5, edx
518
 
519
; 300  :
520
; 301  :     for (j=0; j
521
	mov	ebx, [ebp+16]
522
	mov	esi, [ebp+8]
523
	shl	esi, 3
524
	add	esi, ebx
525
	mov	_end_of_array, esi
526
 
527
.next_j:
528
 
529
; {
530
; t1 = f[j] + f[j+d2];
531
	mov	eax, _d2
532
	fld	qword[ebx]
533
	fld	qword[ebx+eax*8]
534
	fld	st1
535
	fadd	st0, st1
536
	fstp	_t1
537
 
538
; t2 = f[j] - f[j+d2];
539
	fsubp	st1, st0
540
	fstp	_t2
541
 
542
; t3 = f[j+d3] + f[j+d4];
543
	mov	edi, _d3
544
	fld	qword[ebx+edi*8]
545
	mov	edx, _d4
546
	fld	qword[ebx+edx*8]
547
	fld	st1
548
	fsub	st0, st1		; st : t4, f4, f3
549
	fxch	st1			; st : f4, t4, f3
550
 
551
; t4 = f[j+d3] - f[j+d4];
552
	faddp	st2, st0		; st : t4, t3
553
 
554
; f[j+d4] = t2 - t4;
555
; f[j+d3] = t2 + t4;
556
	fld	_t2
557
	fld	st0
558
	fsub	st0, st2		; st : f4, t2, t4, t3
559
	fstp	qword[ebx+edx*8]	; st : t2, t4, t3
560
	fadd	st0, st1		; st : f3, t4, t3
561
	fstp	qword[ebx+edi*8]	; st : t4, t3
562
 
563
; f[j+d2] = t1 - t3;
7559 art_zh 564
; f[j]	  = t1 + t3;
3474 art_zh 565
	fld	_t1
566
	fst	st1
567
	fsub	st0, st2		; st : f2, t1, t3
568
	fstp	qword[ebx+eax*8]	; st : t1, t3
569
	fadd	st0, st1		; st : f0, t3
570
	fstp	qword[ebx]		; st : t3
571
	fstp	st0
572
 
7559 art_zh 573
; jj = j + d1;	   / ??
3474 art_zh 574
	mov	edi, _d1
575
	shl	edi, 3		; = d1*8
576
	mov	edx, edi
577
	mov	eax, edi
578
	add	eax, eax	; eax = d2*8
579
	shl	edx, 2		; = d3*8
580
	add	edi, ebx	; now [edi] points to f[jj]
581
	add	edx, edi	; and [edx] points to f[jj+d3]
582
 
583
; t1 = f[jj];
584
	fld	qword [edi]	; st : t1
585
; t3 = f[jj+d3];
586
	fld	qword [edx]	; st : t3, t1
587
 
588
; t2 = f[jj+d2] * r;
589
	fld	qword [edi+eax]
590
	fld	[_r]
591
	fmul	st1, st0	; st : r,  t2, t3, t1
592
; t4 = f[jj+d4] * r
593
	fmul	qword [edx+eax] ; st : t4, t2, t3, t1
594
 
595
; f[jj]    = t1 + t2 + t3;
596
	fld	st3		; st : t1, t4, t2, t3, t1
597
	fadd	st0, st3
598
	fadd	st0, st2
599
	fstp	qword [edi]
600
 
601
; f[jj+d2] = t1 - t3 + t4;
602
	fld	st3
603
	fsub	st0, st3	; st : (t1-t3), t4, t2, t3, t1
604
	fld	st0
605
	fadd	st0, st2	; st : f2, (t1-t3), t4, t2, t3, t1
606
	fstp	qword [edi+eax]
607
; f[jj+d4] = t1 - t3 - t4;
608
	fsub	st0, st1	; st : f4, t4, t2, t3, t1
609
	fstp	qword [edx+eax]
610
 
611
; f[jj+d3] = t1 - t2 + t3;
7559 art_zh 612
	fstp	st0		; st : t2, t3,	t1
3474 art_zh 613
	fsubp	st1, st0	; st : (t3-t2), t1
614
	faddp	st1, st0	; st : f3
615
	fstp	qword [edx]
616
 
617
; for (k=1; k
618
	xor	ecx, ecx	; ecx = k
619
	mov	_jj, ecx
620
.next_k:
621
	inc	ecx
622
	cmp	ecx, _d1
623
	jge	.done_k
624
; {
625
	mov	eax, _d2	; the sector increment
626
; l1 = j  + k;
627
	mov	edx, ecx
628
	mov	_l1, edx	; [ebx+edx*8] --> f[j+k]
629
; l2 = l1 + d2;
630
	add	edx, eax
631
	mov	_l2, edx
632
; l3 = l1 + d3;
633
	add	edx, eax
634
	mov	_l3, edx
635
; l4 = l1 + d4;
636
	add	edx, eax
637
	mov	_l4, edx
638
 
639
; l5 = j  + d2 - k;
640
	mov	edx, eax
641
	sub	edx, ecx
642
	mov	_l5, edx
643
; l6 = l5 + d2;
644
	add	edx, eax
645
	mov	_l6, edx
646
; l7 = l5 + d3;
647
	add	edx, eax
648
	mov	_l7, edx
649
; l8 = l5 + d4;
650
	add	edx, eax
651
	mov	_l8, edx
652
 
653
 
7559 art_zh 654
; 340  :	 j5 *= k;	// add-substituted multiplication
3474 art_zh 655
	mov	eax, _jj
656
	add	eax, _j5
657
	mov	_jj, eax
658
 
659
; c1 = C[jj];
660
; s1 = S[jj];
661
	mov	edi, [ebp+20]
662
	fld	qword[edi+eax*8]
663
	mov	esi, [ebp+8]
664
	shl	esi, 2
665
	add	esi, edi
666
	fld	qword[esi+eax*8]	; st : s1, c1
667
 
668
; t5 = f[l2] * c1 + f[l6] * s1;
669
; t8 = f[l6] * c1 - f[l2] * s1;
670
	mov	edx, _l6
671
	fld	qword[ebx+edx*8]
672
	mov	edx, _l2
673
	fld	st0
674
	fmul	st0, st2
675
	fxch	st1
676
	fmul	st0, st3
677
	fld	qword[ebx+edx*8]	; st : f[l2], f[l6]*c, f[l6]*s, s, c
678
	fmul	st4, st0
679
	fmulp	st3, st0		; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
7559 art_zh 680
	fsub	st0, st2		; st :	 t8,	f[l6]*s, f[l2]*s, f[l2]*c
3474 art_zh 681
	fstp	_t8
7559 art_zh 682
	faddp	st2, st0		; st :	f[l2]*s, t5
683
	fstp	st0			; st :	t5
684
	fstp	_t5			; st :	
3474 art_zh 685
 
686
; c2 = C[2*jj];
687
; s2 = S[2*jj];
688
	shl	eax, 1
689
	fld	qword[edi+eax*8]
690
	fld	qword[esi+eax*8]	; st : s2, c2
691
 
692
; t6 = f[l3] * c2 + f[l7] * s2;
693
; t9 = f[l7] * c2 - f[l3] * s2;
694
	mov	edx, _l7
695
	fld	qword[ebx+edx*8]
696
	mov	edx, _l3
697
	fld	st0
698
	fmul	st0, st2
699
	fxch	st1
700
	fmul	st0, st3
701
	fld	qword[ebx+edx*8]	; st : f[l3], f[l7]*c, f[l7]*s, s, c
702
	fmul	st4, st0
703
	fmulp	st3, st0		; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
7559 art_zh 704
	fsub	st0, st2		; st :	 t9,	f[l7]*s, f[l3]*s, f[l3]*c
3474 art_zh 705
	fstp	_t9
7559 art_zh 706
	faddp	st2, st0		; st :	f[l2]*s, t6
707
	fstp	st0			; st :	t6
708
	fstp	_t6			; st :	
3474 art_zh 709
 
710
; c3 = C[3*jj];
711
; s3 = S[3*jj];
712
	add	eax, _jj
713
	fld	qword[edi+eax*8]
714
	fld	qword[esi+eax*8]	; st : s3, c3
715
 
716
; t7 = f[l4] * c3 + f[l8] * s3;
717
; t0 = f[l8] * c3 - f[l4] * s3;
718
	mov	edx, _l8
719
	fld	qword[ebx+edx*8]
720
	mov	edx, _l4
721
	fld	st0
722
	fmul	st0, st2
723
	fxch	st1
724
	fmul	st0, st3
725
	fld	qword[ebx+edx*8]	; st : f[l4], f[l8]*c, f[l8]*s, s, c
726
	fmul	st4, st0
727
	fmulp	st3, st0		; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
7559 art_zh 728
	fsub	st0, st2		; st :	 t9,	f[l8]*s, f[l4]*s, f[l4]*c
3474 art_zh 729
	fstp	_t0
730
	faddp	st2, st0		; st : f[l2]*s, t7
7559 art_zh 731
	fstp	st0			; st :	t7
732
	fstp	_t7			; st :	
3474 art_zh 733
 
734
; t1 = f[l5] - t9;
735
; t2 = f[l5] + t9;
736
	mov	eax, _l5
737
	fld	qword [ebx+eax*8]
738
	fld	_t9
739
	fld	st0
740
	fadd	st0, st2
741
	fstp	_t2
742
	fsubp	st1, st0
743
	fstp	_t1
744
 
745
; t3 = - t8  - t0;
746
	fld	_t8
747
	fadd	_t0
748
	fchs
749
	fstp	_t3
7559 art_zh 750
; t4 =	 t5  - t7;
3474 art_zh 751
	fld	_t5
752
	fsub	_t7
753
	fstp	_t4
754
 
755
; f[l5] = t1 + t4;
756
	fld	_t1
757
	fld	_t4
758
	fld	st0
759
	fadd	st0, st2
760
	fstp	qword [ebx+eax*8]
761
; f[l7] = t1 - t4;
762
	mov	eax, _l7
763
	fsubp	st1, st0
764
	fstp	qword [ebx+eax*8]
765
 
766
; f[l6] = t2 + t3;
767
	mov	eax, _l6
768
	fld	_t2
769
	fld	_t3
770
	fld	st0
771
	fadd	st0, st2
772
	fstp	qword [ebx+eax*8]
773
; f[l8] = t2 - t3;
774
	mov	eax, _l8
775
	fsubp	st1, st0
776
	fstp	qword [ebx+eax*8]
777
 
778
; t1 = f[l1] + t6;
779
	mov	eax, _l1
780
	fld	qword [ebx+eax*8]
781
	fld	_t6
782
	fld	st0
783
	fadd	st0, st2
784
	fstp	_t1
785
; t2 = f[l1] - t6;
786
	fsubp	st1, st0
787
	fstp	_t2
788
 
7559 art_zh 789
; t3 =	  t8 - t0;
3474 art_zh 790
	fld	_t8
791
	fsub	_t0
792
	fstp	_t3
7559 art_zh 793
; t4 =	  t5 + t7;
3474 art_zh 794
	fld	_t5
795
	fadd	_t7
796
	fstp	_t4
797
 
798
; f[l1] = t1 + t4;
799
	mov	eax, _l1
800
	fld	_t1
801
	fld	_t4
802
      fld     st0
803
	fadd	st0, st2
804
	fstp	qword [ebx+eax*8]
805
; f[l3] = t1 - t4;
806
	mov	eax, _l3
807
	fsubp	st1, st0
808
	fstp	qword [ebx+eax*8]
809
 
810
; f[l2] = t2 + t3;
811
	mov	eax, _l2
812
	fld	_t2
813
	fld	_t3
814
	fld	st0
815
	fadd	st0, st2
816
	fstp	qword [ebx+eax*8]
817
; f[l4] = t2 - t3;
818
	mov	eax, _l4
819
	fsubp	st1, st0
820
	fstp	qword [ebx+eax*8]
821
 
822
; 374  :       }
823
	jmp	.next_k
824
 
825
.done_k:
826
; 375  :     }
827
	add	ebx, _d6	; d6 = d5*8
828
	cmp	ebx, _end_of_array
829
	jb	.next_j
830
 
831
; 376  :   }
832
	mov	cx, _step
833
	jmp	.newstep
834
.done:
835
	mov	esp, ebp
836
	pop	ebp
837
; 377  : }
838
	ret
839
 
840
 
841
		;=========== Step3 ends here ===========
842
 
843
 
844
; =================================================================
845
 
846
;=================================================================
847
; parameters:
848
; -- [ebp+8]   = N
849
; -- [ebp+12]  = p
7559 art_zh 850
; -- [ebp+16]  = 4k-aligned data array	address
3474 art_zh 851
; -- [ebp+20]  = 4k-aligned SinCosTable address
852
; returns:
853
; -- nothing
854
; destroys:
855
; -- all GPRegs
856
;; ==========================
857
 
858
align 4
859
 
860
FHT_4:
861
 
862
	push	ebp
863
	mov	ebp, esp
864
	mov	edx, [ebp+16]
865
	add	edx, [ebp+12]
866
	call BitInvert
867
	push	dword[ebp+16]
868
	push	dword[ebp+8]
869
	call	step1
870
	call	step2
871
	pop	edx		; N
872
	pop	ecx		; a
873
	push	dword[ebp+20]	; t
874
	push	ecx
875
	push	dword[ebp+12]	; p
876
	push	edx		; N
877
	call	step3
878
	mov	esp, ebp
879
	pop	ebp
880
 
881
ret