WebSVN – Kolibri OS – Blame – /programs/other/fft/FHT4i.inc

Rev	Author	Line No.	Line
2210	art_zh	1	; Fast Hartley Transform routine
		2	; Copyright (C) 1999, 2004, 2010
		3	; Artem Jerdev artem@jerdev.co.uk
		4	;
		5	; free KolibriOS version - not to be ported to other OSes
		6	; ==========================================================
		7
		8
		9	; global constants
		10	align 8
		11	_r dq 1.41421356237309504880169 ; = sqrt(2)
		12	_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2
		13	_c1 dq 0.92387953251128675612818 ; = cos(pi/8)
		14	_s1 dq 0.38268343236508977172846 ; = sin(pi/8)
		15
		16	;=================================================================
		17	; parameter1:
		18	; -- reg dl (bits[3:0]) = Power_of_4
		19	; returns:
		20	; -- reg edx = _CosTable address (4k-aligned)
		21	; assumes: _SinTable = _CosTable + (N/2)*8
		22	; user heap has to be initialized
		23	; destroys:
		24	; -- eax, ebx, ecx
		25	;; ==========================
		26	align 4
		27	CreateSinCosTable:
		28	xor eax, eax
		29	inc eax
		30	mov cl, dl
		31	and cl, 15
		32	shl eax, cl
		33	shl eax, cl
		34	mov ecx, eax ; now ecx = N
		35	shl ecx, 3
		36	mov ebx, 12
		37	mov eax, 68
		38	int 0x40 ; getmem(N*sizeof(double))
		39
		40	mov edx, eax ; edx = _CosTable
		41	shr ecx, 1
		42	mov ebx, eax
		43	add ebx, ecx ; ebx = _SinTable
		44	shr ecx, 3
		45	push ecx ; [esp] = ecx = N/2
		46
		47	xor eax, eax
		48	fldpi
		49	fidiv dword[esp] ; st : dx = 2*pi/N
		50	pop ecx
		51	fldz ; st : 0, dx
		52	.loop:
		53	fld st0 ; st : x, x, dx
		54	FSINCOS ; st : cos, sin, x, dx
		55	fstp qword [edx+eax*8] ; st : sin, x, dx
		56	fstp qword [ebx+eax*8] ; st : x, dx
		57	fadd st0, st1 ; st : x+dx, dx
		58
		59	inc eax
		60	cmp eax, ecx
		61	jne .loop
		62	fstp st0 ; st : dx
		63	fstp st0 ; st :
		64	ret
		65
		66	;=================================================================
		67	; parameter1:
		68	; -- reg edx = _CosTable address
		69	; destroys:
		70	; -- eax, ebx, ecx
		71	;; ==========================
		72	align 4
		73	DestroySinCosTable:
		74	mov ecx, edx
		75	mov ebx, 13
		76	mov eax, 68
		77	int 0x40 ; free(SinCosTable)
		78	ret
		79
		80	;=================================================================
		81	; parameter1:
		82	; -- reg dl (bits[3:0]) = Power_of_4
		83	; -- reg edx && (-16) = 4k-aligned data array address
		84	; returns:
		85	; -- edx = Power_of_4
		86	; -- ecx = N
		87	; destroys:
		88	; -- eax, ebx, ecx, edx, esi
		89	;; ==========================
		90	align 4
		91	BitInvert:
		92	mov esi, edx
		93	and esi, 0xFFFFFFF0
		94	and edx, 0x0F
		95	push edx
		96	mov cl, dl
		97	xor eax, eax
		98	inc eax
		99	shl eax, cl
		100	shl eax, cl
		101	push eax
		102	xor ecx, ecx ; index term
		103	.newterm:
		104	inc ecx
		105	cmp ecx, [esp] ; N
		106	jge .done
		107
		108	xor eax, eax
		109	mov edx, ecx
		110	xor bl, bl
		111
		112	.do_invert:
		113	inc bl
		114	cmp bl, byte[esp+4] ; Power_of_4
		115	jg .switch
		116
		117	mov bh, dl
		118	and bh, 3
		119	shl eax, 2
		120	or al, bh
		121	shr edx, 2
		122	jmp .do_invert
		123
		124	.switch:
		125	cmp eax, ecx
		126	jle .newterm
		127
		128	fld qword [esi+eax*8]
		129	fld qword [esi+ecx*8]
		130	fstp qword [esi+eax*8]
		131	fstp qword [esi+ecx*8]
		132	jmp .newterm
		133
		134	.done:
		135	pop ecx
		136	pop edx
		137	ret
		138
		139	;=================================================================
		140
		141
		142	;=================================================================
		143	; stdcall parameters:
		144	; -- [esp+4] = N
		145	; -- [esp+8] = 4k-aligned data array address
		146	; returns:
		147	; -- nothing
		148	; destroys:
		149	; -- ebx, esi
		150	;; ==========================
		151	align 4
		152	step1:
		153	mov ebx, [esp+8]
		154	mov esi, [esp+4]
		155	shl esi, 3
		156	add esi, ebx
		157
		158	.loop:
		159	fld qword[ebx]
		160	fld qword[ebx+8]
		161	fld st1
		162	fsub st0, st1 ; st : t2, f[i+1], f[i]
		163	fxch st1 ; st : f[i+1], t2, f[i]
		164	faddp st2, st0 ; st : t2, t1
		165	fld qword[ebx+16]
		166	fld qword[ebx+24]
		167	fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1
		168	fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1
		169	fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1
		170	fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1
		171	fstp st1 ; st : t4, t3, t2, t1
		172	fld st2 ; st : t2, t4, t3, t2, t1
		173	fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1
		174	fstp qword[ebx+16] ; st : t4, t3, t2, t1
		175	fsubp st2, st0 ; st : t3, t2-t4, t1
		176	fld st2 ; st : t1, t3, t2-t4, t1
		177	fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1
		178	fstp qword[ebx] ; st : t3, t2-t4, t1
		179	fsubp st2, st0 ; st : t2-t4, t1-t3
		180	fstp qword[ebx+24] ; st : t1-t3
		181	fstp qword[ebx+8] ; st :
		182
		183	add ebx, 32
		184	cmp ebx, esi
		185	jnz .loop
		186	ret
		187
		188	; local stack definitions
		189	;===========================================================================
		190	_t0 equ dword [esp]
		191	_t1 equ dword[esp+4]
		192	_t2 equ dword[esp+8]
		193	_t3 equ dword[esp+12]
		194	_t4 equ dword[esp+16]
		195	_t5 equ dword[esp+20]
		196	_t6 equ dword[esp+24]
		197	_t7 equ dword[esp+28]
		198	_t8 equ dword[esp+32]
		199	_t9 equ dword[esp+36]
		200
		201	_l1 equ dword[esp+40]
		202	_l2 equ dword[esp+44]
		203	_l3 equ dword[esp+48]
		204	_l4 equ dword[esp+52]
		205	_l5 equ dword[esp+56]
		206	_l6 equ dword[esp+60]
		207	_l7 equ dword[esp+64]
		208	_l8 equ dword[esp+68]
		209	_l9 equ dword[esp+72]
		210	_l0 equ dword[esp+76]
		211	_d1 equ dword[esp+80]
		212	_d2 equ dword[esp+84]
		213	_d3 equ dword[esp+88]
		214	_d4 equ dword[esp+92]
		215	_d5 equ dword[esp+96]
		216	_d6 equ dword[esp+100]
		217	_j5 equ dword[esp+104]
		218	_jj equ dword[esp+108]
		219	_end_of_array equ dword[esp+112]
		220	_step equ word [esp+116]
		221
		222
		223	;=================================================================
		224	; cdecl parameters:
		225	; -- [ebp+8] = N
		226	; -- [ebp+12] = 4k-aligned data array address
		227	; returns:
		228	; -- nothing
		229	; destroys:
		230	; -- eax, ebx
		231	; locals:
		232	; -- 10 stack-located dwords (_t0 ... _t9)
		233	;; ==========================
		234	align 4
		235	step2:
		236	push ebp
		237	mov ebp, esp
		238	sub esp, 40
		239	mov ebx, [ebp+12]
		240	mov eax, [ebp+ 8]
		241	shl eax, 3
		242	add eax, ebx
		243
		244	.loop_i:
		245
		246	; -- quad subelements +0, +4, +8 and +12 (simpliest operations)
		247	fld qword[ebx]
		248	fld qword[ebx+8*4]
		249	fld st0
		250	fadd st0, st2 ; st : t1, f_4, f_0
		251	fxch st1
		252	fsubp st2, st0 ; st : t1, t2
		253	fld qword[ebx+8*8]
		254	fld qword[ebx+8*12]
		255	fld st0
		256	fadd st0, st2 ; st : t3, f_12, t1, t2
		257	fxch st1
		258	fsubp st2, st0 ; st : t3, t4, t1, t2
		259	; ------
		260	fld st2 ; st : t1, t3, t4, t1, t2
		261	fadd st0, st1
		262	fstp qword[ebx] ; st : t3, t4, t1, t2
		263	fsub st0, st2 ; st : t3-t1, t4, t1, t2
		264	fchs ; st : t1-t3, t4, t1, t2
		265	fstp qword[ebx+8*4] ; st : t4, t1, t2
		266	fst st1 ; st : t4, t4, t2
		267	fadd st0, st2 ; st : t2+t4, t4, t2
		268	fstp qword[ebx+8*8] ; st : t4, t2
		269	fsubp st1, st0 ; st : t2-t4
		270	fstp qword[ebx+8*12] ; st :
		271
		272	; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)
		273	fld qword[ebx+8*2]
		274	fld qword[ebx+8*6]
		275	fld [_r]
		276	fmul st1, st0 ; st : r, t2, t1
		277	fld qword[ebx+8*10]
		278	fxch st1 ; st : r, t3, t2, t1
		279	fmul qword[ebx+8*14] ; st : t4, t3, t2, t1
		280	; ------
		281	fld st3 ; st : t1, t4, t3, t2, t1
		282	fadd st0, st3 ;
		283	fadd st0, st2 ;
		284	fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3
		285	fsub st0, st3 ;
		286	fsub st0, st3 ;
		287	fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
		288	fld st3 ; st : t1, t4, t3, t2, t1
		289	fsub st0, st2 ;
		290	fsub st0, st1 ;
		291	fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
		292	fadd st0, st1 ;
		293	faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1
		294	fstp qword[ebx+8*6] ; store f[i+6]
		295	fstp st0 ; st : t2, t1
		296	fstp st0 ; st : t1
		297	fstp st0 ; st :
		298
		299	; -- odd subelements
		300	fld qword[ebx+8*9]
		301	fld qword[ebx+8*11]
		302	fld st1
		303	fsub st0, st1
		304	fxch st1
		305	faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7])
		306	fld [_r2]
		307	fmul st2, st0
		308	fmulp st1, st0 ; st : t9, t6
		309	fld qword[ebx+8*3]
		310	fld st0
		311	fadd st0, st2 ; st : t1, f[l5], t9, t6
		312	fstp _t1
		313	fsub st0, st1
		314	fstp _t2
		315	fstp _t9 ; (t9 never used)
		316	fstp _t6 ; st :
		317
		318	fld [_c1]
		319	fld [_s1]
		320	fld qword[ebx+8*5]
		321	fld qword[ebx+8*7]
		322	fld st3 ; st: c1, f[l6], f[l2], s1, c1
		323	fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c
		324	fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c
		325	fmul st0, st4 ; st: f_6s, f_2c, f_6, f_2, s, c
		326	faddp st1, st0 ; st: t5, f_6, f_2, s, c
		327	fstp _t5 ; st: f_6, f_2, s, c
		328	fld st3 ; st: c, f_6, f_2, s, c
		329	fmul st0, st1
		330	fld st3
		331	fmul st0, st3 ; st: f_2s, f_6c, f_6, f_2, s, c
		332	fsubp st1, st0 ; st: t8, f_6, f_2, s, c
		333	fstp _t8 ; st: f_6, f_2, s, c
		334	fstp st0 ; st: f_2, s, c
		335	fstp st0 ; st: s, c
		336
		337	fld qword[ebx+8*13]
		338	fld qword[ebx+8*15]
		339	fld st3 ; st: c1, f[l8], f[l4], s1, c1
		340	fmul st0, st1
		341	fld st3
		342	fmul st0, st3 ; st: f_4s, f_8c, f_8, f_4, s, c
		343	faddp st1, st0 ; st: t7, f_8, f_4, s, c
		344	fld _t5 ; st: t5, t7, f_8, f_4, s, c
		345	fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c
		346	fstp _t4
		347	fstp _t7 ; st: f_8, f_4, s, c
		348	fld st3 ; st: c, f_8, f_4, s, c
		349	fmul st0, st2
		350	fld st3
		351	fmul st0, st2 ; st: f_8s, f_4c, f_8, f_4, s, c
		352	fsubp st1, st0 ; st:-t0, f_8, f_4, s, c
		353	fchs
		354	fld _t8
		355	fchs ; st:-t8, t0, f_8, f_4, s, c
		356	fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c
		357	fstp _t3
		358	fstp _t0 ; st: f_8, f_4, s, c
		359	fstp st0 ; st: f_4, s, c
		360	fstp st0 ; st: s, c
		361	fstp st0 ; st: c
		362	fstp st0 ; st:
		363
		364	fld _t1
		365	fld _t4
		366	fld st1
		367	fsub st0, st1
		368	fstp qword[ebx+8*11] ; f[l7] = t1-t4
		369	faddp st1, st0
		370	fstp qword[ebx+8*3] ; f[l5] = t1+t4
		371	fld _t2
		372	fld _t3
		373	fld st1
		374	fsub st0, st1
		375	fstp qword[ebx+8*15] ; f[l8]
		376	faddp st1, st0
		377	fstp qword[ebx+8*7] ; f[l6]
		378
		379	fld _t6
		380	fld qword[ebx+8]
		381	fld st1
		382	fsub st0, st1
		383	fxch st1
		384	faddp st2, st0 ; st : t2, t1
		385	fld _t8
		386	fsub _t0
		387	fld _t5
		388	fadd _t7 ; st : t4, t3, t2, t1
		389
		390	fld st3
		391	fsub st0, st1
		392	fstp qword[ebx+8*9] ; f[l3] = t1-t4
		393	fadd st0, st3
		394	fstp qword[ebx+8] ; f[l1] = t1+t4
		395	fld st1 ; st : t2, t3, t2, t1
		396	fsub st0, st1 ; f[l4] = t2-t3
		397	fstp qword[ebx+8*13] ; st : t3, t2, t1
		398	faddp st1, st0 ; st : t2+t3, t1
		399	fstp qword[ebx+8*5] ; f[l2] = t2+t3
		400	fstp st0 ; st :
		401
		402	add ebx, 16*8
		403	cmp ebx, eax
		404	jb .loop_i
		405
		406	mov esp, ebp
		407	pop ebp
		408	ret
		409
		410
		411
		412
		413	;=================================================================
		414	; cdecl parameters:
		415	; -- [ebp+8] = N
		416	; -- [ebp+12] = p
		417	; -- [ebp+16] = 4k-aligned data array address
		418	; -- [ebp+20] = 4k-aligned SinCosTable address
		419	; returns:
		420	; -- nothing
		421	; destroys:
		422	; -- all GPRegs
		423	; locals:
		424	; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
		425	;; ==========================
		426	align 4
		427	step3:
		428	push ebp
		429	mov ebp, esp
		430	sub esp, 120
		431	; 283 : {
		432
		433
		434	; 293 : for (l=3; l<=p; l++)
		435	mov cx, 0x0200
		436	.newstep:
		437	inc ch
		438	cmp ch, byte[ebp+12]
		439	jg .done
		440	mov _step, cx
		441
		442	; 294 : {
		443	; 295 : d1 = 1 << (l + l - 3);
		444
		445	mov cl, ch
		446	add cl, cl
		447	sub cl, 3
		448	mov edx, 1
		449	shl edx, cl
		450	mov _d1, edx
		451
		452	; 296 : d2 = d1 << 1;
		453	shl edx, 1
		454	mov _d2, edx
		455	mov eax, edx
		456
		457	; 297 : d3 = d2 << 1;
		458	shl edx, 1
		459	mov _d3, edx
		460
		461	; 298 : d4 = d2 + d3;
		462	add eax, edx
		463	mov _d4, eax
		464
		465	; 299 : d5 = d3 << 1;
		466	shl edx, 1
		467	mov _d5, edx
		468	shl edx, 3
		469	mov _d6, edx ; d6 = d5*8 to simplify index operations
		470
		471	; 339 : j5 = N / d5; ; moved out of internal loop
		472	mov cl, [ebp+12]
		473	sub cl, ch
		474	add cl, cl
		475	mov edx, 1
		476	shl edx, cl
		477	mov _j5, edx
		478
		479	; 300 :
		480	; 301 : for (j=0; j
		481	mov ebx, [ebp+16]
		482	mov esi, [ebp+8]
		483	shl esi, 3
		484	add esi, ebx
		485	mov _end_of_array, esi
		486
		487	.next_j:
		488
		489	; {
		490	; t1 = f[j] + f[j+d2];
		491	mov eax, _d2
		492	fld qword[ebx]
		493	fld qword[ebx+eax*8]
		494	fld st1
		495	fadd st0, st1
		496	fstp _t1
		497
		498	; t2 = f[j] - f[j+d2];
		499	fsubp st1, st0
		500	fstp _t2
		501
		502	; t3 = f[j+d3] + f[j+d4];
		503	mov edi, _d3
		504	fld qword[ebx+edi*8]
		505	mov edx, _d4
		506	fld qword[ebx+edx*8]
		507	fld st1
		508	fsub st0, st1 ; st : t4, f4, f3
		509	fxch st1 ; st : f4, t4, f3
		510
		511	; t4 = f[j+d3] - f[j+d4];
		512	faddp st2, st0 ; st : t4, t3
		513
		514	; f[j+d4] = t2 - t4;
		515	; f[j+d3] = t2 + t4;
		516	fld _t2
		517	fld st0
		518	fsub st0, st2 ; st : f4, t2, t4, t3
		519	fstp qword[ebx+edx*8] ; st : t2, t4, t3
		520	fadd st0, st1 ; st : f3, t4, t3
		521	fstp qword[ebx+edi*8] ; st : t4, t3
		522
		523	; f[j+d2] = t1 - t3;
		524	; f[j] = t1 + t3;
		525	fld _t1
		526	fst st1
		527	fsub st0, st2 ; st : f2, t1, t3
		528	fstp qword[ebx+eax*8] ; st : t1, t3
		529	fadd st0, st1 ; st : f0, t3
		530	fstp qword[ebx] ; st : t3
		531	fstp st0
		532
		533	; jj = j + d1; / ??
		534	mov edi, _d1
		535	shl edi, 3 ; = d1*8
		536	mov edx, edi
		537	mov eax, edi
		538	add eax, eax ; eax = d2*8
		539	shl edx, 2 ; = d3*8
		540	add edi, ebx ; now [edi] points to f[jj]
		541	add edx, edi ; and [edx] points to f[jj+d3]
		542
		543	; t1 = f[jj];
		544	fld qword [edi] ; st : t1
		545	; t3 = f[jj+d3];
		546	fld qword [edx] ; st : t3, t1
		547
		548	; t2 = f[jj+d2] * r;
		549	fld qword [edi+eax]
		550	fld [_r]
		551	fmul st1, st0 ; st : r, t2, t3, t1
		552	; t4 = f[jj+d4] * r
		553	fmul qword [edx+eax] ; st : t4, t2, t3, t1
		554
		555	; f[jj] = t1 + t2 + t3;
		556	fld st3 ; st : t1, t4, t2, t3, t1
		557	fadd st0, st3
		558	fadd st0, st2
		559	fstp qword [edi]
		560
		561	; f[jj+d2] = t1 - t3 + t4;
		562	fld st3
		563	fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1
		564	fld st0
		565	fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1
		566	fstp qword [edi+eax]
		567	; f[jj+d4] = t1 - t3 - t4;
		568	fsub st0, st1 ; st : f4, t4, t2, t3, t1
		569	fstp qword [edx+eax]
		570
		571	; f[jj+d3] = t1 - t2 + t3;
		572	fstp st0 ; st : t2, t3, t1
		573	fsubp st1, st0 ; st : (t3-t2), t1
		574	faddp st1, st0 ; st : f3
		575	fstp qword [edx]
		576
		577	; for (k=1; k
		578	xor ecx, ecx ; ecx = k
		579	mov _jj, ecx
		580	.next_k:
		581	inc ecx
		582	cmp ecx, _d1
		583	jge .done_k
		584	; {
		585	mov eax, _d2 ; the sector increment
		586	; l1 = j + k;
		587	mov edx, ecx
		588	mov _l1, edx ; [ebx+edx*8] --> f[j+k]
		589	; l2 = l1 + d2;
		590	add edx, eax
		591	mov _l2, edx
		592	; l3 = l1 + d3;
		593	add edx, eax
		594	mov _l3, edx
		595	; l4 = l1 + d4;
		596	add edx, eax
		597	mov _l4, edx
		598
		599	; l5 = j + d2 - k;
		600	mov edx, eax
		601	sub edx, ecx
		602	mov _l5, edx
		603	; l6 = l5 + d2;
		604	add edx, eax
		605	mov _l6, edx
		606	; l7 = l5 + d3;
		607	add edx, eax
		608	mov _l7, edx
		609	; l8 = l5 + d4;
		610	add edx, eax
		611	mov _l8, edx
		612
		613
		614	; 340 : j5 *= k; // add-substituted multiplication
		615	mov eax, _jj
		616	add eax, _j5
		617	mov _jj, eax
		618
		619	; c1 = C[jj];
		620	; s1 = S[jj];
		621	mov edi, [ebp+20]
		622	fld qword[edi+eax*8]
		623	mov esi, [ebp+8]
		624	shl esi, 2
		625	add esi, edi
		626	fld qword[esi+eax*8] ; st : s1, c1
		627
		628	; t5 = f[l2] * c1 + f[l6] * s1;
		629	; t8 = f[l6] * c1 - f[l2] * s1;
		630	mov edx, _l6
		631	fld qword[ebx+edx*8]
		632	mov edx, _l2
		633	fld st0
		634	fmul st0, st2
		635	fxch st1
		636	fmul st0, st3
		637	fld qword[ebx+edx8] ; st : f[l2], f[l6]c, f[l6]*s, s, c
		638	fmul st4, st0
		639	fmulp st3, st0 ; st : f[l6]c, f[l6]s, f[l2]s, f[l2]c
		640	fsub st0, st2 ; st : t8, f[l6]s, f[l2]s, f[l2]*c
		641	fstp _t8
		642	faddp st2, st0 ; st : f[l2]*s, t5
		643	fstp st0 ; st : t5
		644	fstp _t5 ; st :
		645
		646	; c2 = C[2*jj];
		647	; s2 = S[2*jj];
		648	shl eax, 1
		649	fld qword[edi+eax*8]
		650	fld qword[esi+eax*8] ; st : s2, c2
		651
		652	; t6 = f[l3] * c2 + f[l7] * s2;
		653	; t9 = f[l7] * c2 - f[l3] * s2;
		654	mov edx, _l7
		655	fld qword[ebx+edx*8]
		656	mov edx, _l3
		657	fld st0
		658	fmul st0, st2
		659	fxch st1
		660	fmul st0, st3
		661	fld qword[ebx+edx8] ; st : f[l3], f[l7]c, f[l7]*s, s, c
		662	fmul st4, st0
		663	fmulp st3, st0 ; st : f[l7]c, f[l7]s, f[l3]s, f[l3]c
		664	fsub st0, st2 ; st : t9, f[l7]s, f[l3]s, f[l3]*c
		665	fstp _t9
		666	faddp st2, st0 ; st : f[l2]*s, t6
		667	fstp st0 ; st : t6
		668	fstp _t6 ; st :
		669
		670	; c3 = C[3*jj];
		671	; s3 = S[3*jj];
		672	add eax, _jj
		673	fld qword[edi+eax*8]
		674	fld qword[esi+eax*8] ; st : s3, c3
		675
		676	; t7 = f[l4] * c3 + f[l8] * s3;
		677	; t0 = f[l8] * c3 - f[l4] * s3;
		678	mov edx, _l8
		679	fld qword[ebx+edx*8]
		680	mov edx, _l4
		681	fld st0
		682	fmul st0, st2
		683	fxch st1
		684	fmul st0, st3
		685	fld qword[ebx+edx8] ; st : f[l4], f[l8]c, f[l8]*s, s, c
		686	fmul st4, st0
		687	fmulp st3, st0 ; st : f[l8]c, f[l8]s, f[l4]s, f[l4]c
		688	fsub st0, st2 ; st : t9, f[l8]s, f[l4]s, f[l4]*c
		689	fstp _t0
		690	faddp st2, st0 ; st : f[l2]*s, t7
		691	fstp st0 ; st : t7
		692	fstp _t7 ; st :
		693
		694	; t1 = f[l5] - t9;
		695	; t2 = f[l5] + t9;
		696	mov eax, _l5
		697	fld qword [ebx+eax*8]
		698	fld _t9
		699	fld st0
		700	fadd st0, st2
		701	fstp _t2
		702	fsubp st1, st0
		703	fstp _t1
		704
		705	; t3 = - t8 - t0;
		706	fld _t8
		707	fadd _t0
		708	fchs
		709	fstp _t3
		710	; t4 = t5 - t7;
		711	fld _t5
		712	fsub _t7
		713	fstp _t4
		714
		715	; f[l5] = t1 + t4;
		716	fld _t1
		717	fld _t4
		718	fld st0
		719	fadd st0, st2
		720	fstp qword [ebx+eax*8]
		721	; f[l7] = t1 - t4;
		722	mov eax, _l7
		723	fsubp st1, st0
		724	fstp qword [ebx+eax*8]
		725
		726	; f[l6] = t2 + t3;
		727	mov eax, _l6
		728	fld _t2
		729	fld _t3
		730	fld st0
		731	fadd st0, st2
		732	fstp qword [ebx+eax*8]
		733	; f[l8] = t2 - t3;
		734	mov eax, _l8
		735	fsubp st1, st0
		736	fstp qword [ebx+eax*8]
		737
		738	; t1 = f[l1] + t6;
		739	mov eax, _l1
		740	fld qword [ebx+eax*8]
		741	fld _t6
		742	fld st0
		743	fadd st0, st2
		744	fstp _t1
		745	; t2 = f[l1] - t6;
		746	fsubp st1, st0
		747	fstp _t2
		748
		749	; t3 = t8 - t0;
		750	fld _t8
		751	fsub _t0
		752	fstp _t3
		753	; t4 = t5 + t7;
		754	fld _t5
		755	fadd _t7
		756	fstp _t4
		757
		758	; f[l1] = t1 + t4;
		759	mov eax, _l1
		760	fld _t1
		761	fld _t4
		762	fld st0
		763	fadd st0, st2
		764	fstp qword [ebx+eax*8]
		765	; f[l3] = t1 - t4;
		766	mov eax, _l3
		767	fsubp st1, st0
		768	fstp qword [ebx+eax*8]
		769
		770	; f[l2] = t2 + t3;
		771	mov eax, _l2
		772	fld _t2
		773	fld _t3
		774	fld st0
		775	fadd st0, st2
		776	fstp qword [ebx+eax*8]
		777	; f[l4] = t2 - t3;
		778	mov eax, _l4
		779	fsubp st1, st0
		780	fstp qword [ebx+eax*8]
		781
		782	; 374 : }
		783	jmp .next_k
		784
		785	.done_k:
		786	; 375 : }
		787	add ebx, _d6 ; d6 = d5*8
		788	cmp ebx, _end_of_array
		789	jb .next_j
		790
		791	; 376 : }
		792	mov cx, _step
		793	jmp .newstep
		794	.done:
		795	mov esp, ebp
		796	pop ebp
		797	; 377 : }
		798	ret
		799
		800
		801	;=========== Step3 ends here ===========
		802
		803
		804	; =================================================================
		805
		806	;=================================================================
		807	; parameters:
		808	; -- [ebp+8] = N
		809	; -- [ebp+12] = p
		810	; -- [ebp+16] = 4k-aligned data array address
		811	; -- [ebp+20] = 4k-aligned SinCosTable address
		812	; returns:
		813	; -- nothing
		814	; destroys:
		815	; -- all GPRegs
		816	;; ==========================
		817
		818	align 4
		819
		820	FHT_4:
		821
		822	push ebp
		823	mov ebp, esp
		824	mov edx, [ebp+16]
		825	add edx, [ebp+12]
		826	call BitInvert
		827	push dword[ebp+16]
		828	push dword[ebp+8]
		829	call step1
		830	call step2
		831	pop edx ; N
		832	pop ecx ; a
		833	push dword[ebp+20] ; t
		834	push ecx
		835	push dword[ebp+12] ; p
		836	push edx ; N
		837	call step3
		838	mov esp, ebp
		839	pop ebp
		840
		841	ret

Subversion Repositories Kolibri OS

(root)/programs/other/fft/FHT4i.inc – Rev 2214