WebSVN – Kolibri OS – Blame – /programs/other/fft/fht4code.asm

Rev	Author	Line No.	Line
2210	art_zh	1	; Fast Hartley Transform routine
		2	; Copyright (C) 1999, 2004, 2010
		3	; Artem Jerdev artem@jerdev.co.uk
		4	;
		5	; free KolibriOS version - not to be ported to other OSes
		6	; ==========================================================
		7
		8
		9	; global constants
		10	align 8
		11	_r dq 1.41421356237309504880169 ; = sqrt(2)
		12	_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2
		13	_c1 dq 0.92387953251128675612818 ; = cos(pi/8)
		14	_s1 dq 0.38268343236508977172846 ; = sin(pi/8)
		15
		16	;=================================================================
		17	; parameter1:
		18	; -- reg dl (bits[3:0]) = Power_of_4
		19	; returns:
		20	; -- reg edx = _CosTable address (4k-aligned)
		21	; assumes: _SinTable = _CosTable + (N/2)*8
		22	; user heap has to be initialized
		23	; destroys:
		24	; -- eax, ebx, ecx
		25	;; ==========================
		26	align 4
		27	CreateSinCosTable:
		28	xor eax, eax
		29	inc eax
		30	mov cl, dl
		31	and cl, 15
		32	shl eax, cl
		33	shl eax, cl
		34	mov ecx, eax ; now ecx = N
		35	shl ecx, 3
		36	mov ebx, 12
		37	mov eax, 68
		38	int 0x40 ; getmem(N*sizeof(double))
		39
		40	mov edx, eax ; edx = _CosTable
		41	shr ecx, 1
		42	mov ebx, eax
		43	add ebx, ecx ; ebx = _SinTable
		44	shr ecx, 3
		45	push ecx ; [esp] = ecx = N/2
		46
		47	xor eax, eax
		48	fldpi
		49	fidiv dword[esp] ; st : dx = 2*pi/N
		50	pop ecx
		51	fldz ; st : 0, dx
		52	.loop:
		53	fld st0 ; st : x, x, dx
		54	FSINCOS ; st : cos, sin, x, dx
		55	fstp qword [edx+eax*8] ; st : sin, x, dx
		56	fstp qword [ebx+eax*8] ; st : x, dx
		57	fadd st0, st1 ; st : x+dx, dx
		58
		59	inc eax
		60	cmp eax, ecx
		61	jne .loop
		62	fstp st0 ; st : dx
		63	fstp st0 ; st :
		64	ret
		65
		66	;=================================================================
		67	; parameter1:
		68	; -- reg edx = _CosTable address
		69	; destroys:
		70	; -- eax, ebx, ecx
		71	;; ==========================
		72	align 4
		73	DestroySinCosTable:
		74	mov ecx, edx
		75	mov ebx, 13
		76	mov eax, 68
		77	int 0x40 ; free(SinCosTable)
		78	ret
		79
		80	;=================================================================
		81	; parameter1:
		82	; -- reg dl (bits[3:0]) = Power_of_4
		83	; -- reg edx && (-16) = 4k-aligned data array address
		84	; returns:
		85	; -- edx = Power_of_4
		86	; -- ecx = N
		87	; destroys:
		88	; -- eax, ebx, ecx, edx, esi
		89	;; ==========================
		90	align 4
		91	BitInvert:
		92	mov esi, edx
		93	and esi, 0xFFFFFFF0
		94	and edx, 0x0F
		95	push edx
		96	mov cl, dl
		97	xor eax, eax
		98	inc eax
		99	shl eax, cl
		100	shl eax, cl
		101	push eax
		102	xor ecx, ecx ; index term
		103	.newterm:
		104	inc ecx
		105	cmp ecx, [esp] ; N
		106	jge .done
		107
		108	xor eax, eax
		109	mov edx, ecx
		110	xor bl, bl
		111
		112	.do_invert:
		113	inc bl
		114	cmp bl, byte[esp+4] ; Power_of_4
		115	jg .switch
		116
		117	mov bh, dl
		118	and bh, 3
		119	shl eax, 2
		120	or al, bh
		121	shr edx, 2
		122	jmp .do_invert
		123
		124	.switch:
		125	cmp eax, ecx
		126	jle .newterm
		127
		128	fld qword [esi+eax*8]
		129	fld qword [esi+ecx*8]
		130	fstp qword [esi+eax*8]
		131	fstp qword [esi+ecx*8]
		132	jmp .newterm
		133
		134	.done:
		135	pop ecx
		136	pop edx
		137	ret
		138
		139	;=================================================================
		140
		141
		142	;=================================================================
		143	; stdcall parameters:
		144	; -- [esp+4] = N
		145	; -- [esp+8] = 4k-aligned data array address
		146	; returns:
		147	; -- nothing
		148	; destroys:
		149	; -- ebx, esi
		150	;; ==========================
		151	align 4
		152	step1:
		153	mov ebx, [esp+8]
		154	mov esi, [esp+4]
		155	shl esi, 3
		156	add esi, ebx
		157
		158	.loop:
		159	fld qword[ebx]
		160	fld qword[ebx+8]
		161	fld st1
		162	fsub st0, st1 ; st : t2, f[i+1], f[i]
		163	fxch st1 ; st : f[i+1], t2, f[i]
		164	faddp st2, st0 ; st : t2, t1
		165	fld qword[ebx+16]
		166	fld qword[ebx+24]
		167	fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1
		168	fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1
		169	fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1
		170	fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1
		171	fstp st1 ; st : t4, t3, t2, t1
		172	fld st2 ; st : t2, t4, t3, t2, t1
		173	fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1
		174	fstp qword[ebx+16] ; st : t4, t3, t2, t1
		175	fsubp st2, st0 ; st : t3, t2-t4, t1
		176	fld st2 ; st : t1, t3, t2-t4, t1
		177	fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1
		178	fstp qword[ebx] ; st : t3, t2-t4, t1
		179	fsubp st2, st0 ; st : t2-t4, t1-t3
		180	fstp qword[ebx+24] ; st : t1-t3
		181	fstp qword[ebx+8] ; st :
		182
		183	add ebx, 32
		184	cmp ebx, esi
		185	jnz .loop
2215	art_zh	186	ret
2210	art_zh	187
2215	art_zh	188	;=================================================================
		189	; SSE3 version: Step1
		190	;
		191	;==========================
		192
		193	align 4
		194	step1_sse:
		195	mov ebx, [esp+8]
		196	mov esi, [esp+4]
		197	shl esi, 3
		198	add esi, ebx
		199
		200	.loop:
		201	movddup xmm0, [ebx] ; xmm0: f0 ; f0
		202	movddup xmm1, [ebx+8] ; xmm1: f1 ; f1
		203	addsubpd xmm0, xmm1 ; xmm0: t1 ; t2 ( + - )
		204	movddup xmm1, [ebx+16] ; xmm1: f2 ; f2
		205	movddup xmm2, [ebx+24] ; xmm2: f3 ; f3
		206	addsubpd xmm1, xmm2 ; xmm1: t3 ; t4 ( + - )
		207
		208	movddup xmm2, xmm0 ; xmm2: t2 ; t2
		209	movddup xmm3, xmm1 ; xmm3: t4 ; t4
		210	addsubpd xmm2, xmm3 ; xmm2: 2+4; 2-4
		211	shufpd xmm2, xmm2, 1 ; xmm2: 2-4; 2+4
		212	movapd [ebx+16], xmm2
		213
		214	shufpd xmm0, xmm0, 1 ; xmm0: t2 ; t1
		215	shufpd xmm1, xmm1, 1 ; xmm1: t4 ; t3
		216	movddup xmm2, xmm0 ; xmm2: t1 ; t1
		217	movddup xmm3, xmm1 ; xmm3: t3 ; t3
		218	addsubpd xmm2, xmm3 ; xmm2: 1+3; 1-3
		219	shufpd xmm2, xmm2, 1 ; xmm2: 1-3; 1+3
		220	movapd [ebx], xmm2
		221
		222	add ebx, 32
		223	cmp ebx, esi
		224	jnz .loop
		225	ret
		226
2210	art_zh	227	; local stack definitions
		228	;===========================================================================
		229	_t0 equ dword [esp]
		230	_t1 equ dword[esp+4]
		231	_t2 equ dword[esp+8]
		232	_t3 equ dword[esp+12]
		233	_t4 equ dword[esp+16]
		234	_t5 equ dword[esp+20]
		235	_t6 equ dword[esp+24]
		236	_t7 equ dword[esp+28]
		237	_t8 equ dword[esp+32]
		238	_t9 equ dword[esp+36]
		239
		240	_l1 equ dword[esp+40]
		241	_l2 equ dword[esp+44]
		242	_l3 equ dword[esp+48]
		243	_l4 equ dword[esp+52]
		244	_l5 equ dword[esp+56]
		245	_l6 equ dword[esp+60]
		246	_l7 equ dword[esp+64]
		247	_l8 equ dword[esp+68]
		248	_l9 equ dword[esp+72]
		249	_l0 equ dword[esp+76]
		250	_d1 equ dword[esp+80]
		251	_d2 equ dword[esp+84]
		252	_d3 equ dword[esp+88]
		253	_d4 equ dword[esp+92]
		254	_d5 equ dword[esp+96]
		255	_d6 equ dword[esp+100]
		256	_j5 equ dword[esp+104]
		257	_jj equ dword[esp+108]
		258	_end_of_array equ dword[esp+112]
		259	_step equ word [esp+116]
		260
		261
		262	;=================================================================
		263	; cdecl parameters:
		264	; -- [ebp+8] = N
		265	; -- [ebp+12] = 4k-aligned data array address
		266	; returns:
		267	; -- nothing
		268	; destroys:
		269	; -- eax, ebx
		270	; locals:
		271	; -- 10 stack-located dwords (_t0 ... _t9)
		272	;; ==========================
		273	align 4
		274	step2:
		275	push ebp
		276	mov ebp, esp
		277	sub esp, 40
		278	mov ebx, [ebp+12]
		279	mov eax, [ebp+ 8]
		280	shl eax, 3
		281	add eax, ebx
		282
		283	.loop_i:
		284
		285	; -- quad subelements +0, +4, +8 and +12 (simpliest operations)
		286	fld qword[ebx]
		287	fld qword[ebx+8*4]
		288	fld st0
		289	fadd st0, st2 ; st : t1, f_4, f_0
		290	fxch st1
		291	fsubp st2, st0 ; st : t1, t2
		292	fld qword[ebx+8*8]
		293	fld qword[ebx+8*12]
		294	fld st0
		295	fadd st0, st2 ; st : t3, f_12, t1, t2
		296	fxch st1
		297	fsubp st2, st0 ; st : t3, t4, t1, t2
		298	; ------
		299	fld st2 ; st : t1, t3, t4, t1, t2
		300	fadd st0, st1
		301	fstp qword[ebx] ; st : t3, t4, t1, t2
		302	fsub st0, st2 ; st : t3-t1, t4, t1, t2
		303	fchs ; st : t1-t3, t4, t1, t2
		304	fstp qword[ebx+8*4] ; st : t4, t1, t2
		305	fst st1 ; st : t4, t4, t2
		306	fadd st0, st2 ; st : t2+t4, t4, t2
		307	fstp qword[ebx+8*8] ; st : t4, t2
		308	fsubp st1, st0 ; st : t2-t4
		309	fstp qword[ebx+8*12] ; st :
		310
		311	; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)
		312	fld qword[ebx+8*2]
		313	fld qword[ebx+8*6]
		314	fld [_r]
		315	fmul st1, st0 ; st : r, t2, t1
		316	fld qword[ebx+8*10]
		317	fxch st1 ; st : r, t3, t2, t1
		318	fmul qword[ebx+8*14] ; st : t4, t3, t2, t1
		319	; ------
		320	fld st3 ; st : t1, t4, t3, t2, t1
		321	fadd st0, st3 ;
		322	fadd st0, st2 ;
		323	fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3
		324	fsub st0, st3 ;
		325	fsub st0, st3 ;
		326	fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
		327	fld st3 ; st : t1, t4, t3, t2, t1
		328	fsub st0, st2 ;
		329	fsub st0, st1 ;
		330	fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
		331	fadd st0, st1 ;
		332	faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1
		333	fstp qword[ebx+8*6] ; store f[i+6]
		334	fstp st0 ; st : t2, t1
		335	fstp st0 ; st : t1
		336	fstp st0 ; st :
		337
		338	; -- odd subelements
		339	fld qword[ebx+8*9]
		340	fld qword[ebx+8*11]
		341	fld st1
		342	fsub st0, st1
		343	fxch st1
		344	faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7])
		345	fld [_r2]
		346	fmul st2, st0
		347	fmulp st1, st0 ; st : t9, t6
		348	fld qword[ebx+8*3]
		349	fld st0
		350	fadd st0, st2 ; st : t1, f[l5], t9, t6
		351	fstp _t1
		352	fsub st0, st1
		353	fstp _t2
		354	fstp _t9 ; (t9 never used)
		355	fstp _t6 ; st :
		356
		357	fld [_c1]
		358	fld [_s1]
		359	fld qword[ebx+8*5]
		360	fld qword[ebx+8*7]
		361	fld st3 ; st: c1, f[l6], f[l2], s1, c1
		362	fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c
		363	fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c
		364	fmul st0, st4 ; st: f_6s, f_2c, f_6, f_2, s, c
		365	faddp st1, st0 ; st: t5, f_6, f_2, s, c
		366	fstp _t5 ; st: f_6, f_2, s, c
		367	fld st3 ; st: c, f_6, f_2, s, c
		368	fmul st0, st1
		369	fld st3
		370	fmul st0, st3 ; st: f_2s, f_6c, f_6, f_2, s, c
		371	fsubp st1, st0 ; st: t8, f_6, f_2, s, c
		372	fstp _t8 ; st: f_6, f_2, s, c
		373	fstp st0 ; st: f_2, s, c
		374	fstp st0 ; st: s, c
		375
		376	fld qword[ebx+8*13]
		377	fld qword[ebx+8*15]
		378	fld st3 ; st: c1, f[l8], f[l4], s1, c1
		379	fmul st0, st1
		380	fld st3
		381	fmul st0, st3 ; st: f_4s, f_8c, f_8, f_4, s, c
		382	faddp st1, st0 ; st: t7, f_8, f_4, s, c
		383	fld _t5 ; st: t5, t7, f_8, f_4, s, c
		384	fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c
		385	fstp _t4
		386	fstp _t7 ; st: f_8, f_4, s, c
		387	fld st3 ; st: c, f_8, f_4, s, c
		388	fmul st0, st2
		389	fld st3
		390	fmul st0, st2 ; st: f_8s, f_4c, f_8, f_4, s, c
		391	fsubp st1, st0 ; st:-t0, f_8, f_4, s, c
		392	fchs
		393	fld _t8
		394	fchs ; st:-t8, t0, f_8, f_4, s, c
		395	fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c
		396	fstp _t3
		397	fstp _t0 ; st: f_8, f_4, s, c
		398	fstp st0 ; st: f_4, s, c
		399	fstp st0 ; st: s, c
		400	fstp st0 ; st: c
		401	fstp st0 ; st:
		402
		403	fld _t1
		404	fld _t4
		405	fld st1
		406	fsub st0, st1
		407	fstp qword[ebx+8*11] ; f[l7] = t1-t4
		408	faddp st1, st0
		409	fstp qword[ebx+8*3] ; f[l5] = t1+t4
		410	fld _t2
		411	fld _t3
		412	fld st1
		413	fsub st0, st1
		414	fstp qword[ebx+8*15] ; f[l8]
		415	faddp st1, st0
		416	fstp qword[ebx+8*7] ; f[l6]
		417
		418	fld _t6
		419	fld qword[ebx+8]
		420	fld st1
		421	fsub st0, st1
		422	fxch st1
		423	faddp st2, st0 ; st : t2, t1
		424	fld _t8
		425	fsub _t0
		426	fld _t5
		427	fadd _t7 ; st : t4, t3, t2, t1
		428
		429	fld st3
		430	fsub st0, st1
		431	fstp qword[ebx+8*9] ; f[l3] = t1-t4
		432	fadd st0, st3
		433	fstp qword[ebx+8] ; f[l1] = t1+t4
		434	fld st1 ; st : t2, t3, t2, t1
		435	fsub st0, st1 ; f[l4] = t2-t3
		436	fstp qword[ebx+8*13] ; st : t3, t2, t1
		437	faddp st1, st0 ; st : t2+t3, t1
		438	fstp qword[ebx+8*5] ; f[l2] = t2+t3
		439	fstp st0 ; st :
		440
		441	add ebx, 16*8
		442	cmp ebx, eax
		443	jb .loop_i
		444
		445	mov esp, ebp
		446	pop ebp
		447	ret
		448
		449
		450
		451
		452	;=================================================================
		453	; cdecl parameters:
		454	; -- [ebp+8] = N
		455	; -- [ebp+12] = p
		456	; -- [ebp+16] = 4k-aligned data array address
		457	; -- [ebp+20] = 4k-aligned SinCosTable address
		458	; returns:
		459	; -- nothing
		460	; destroys:
		461	; -- all GPRegs
		462	; locals:
		463	; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
		464	;; ==========================
		465	align 4
		466	step3:
		467	push ebp
		468	mov ebp, esp
		469	sub esp, 120
		470	; 283 : {
		471
		472
		473	; 293 : for (l=3; l<=p; l++)
		474	mov cx, 0x0200
		475	.newstep:
		476	inc ch
		477	cmp ch, byte[ebp+12]
		478	jg .done
		479	mov _step, cx
		480
		481	; 294 : {
		482	; 295 : d1 = 1 << (l + l - 3);
		483
		484	mov cl, ch
		485	add cl, cl
		486	sub cl, 3
		487	mov edx, 1
		488	shl edx, cl
		489	mov _d1, edx
		490
		491	; 296 : d2 = d1 << 1;
		492	shl edx, 1
		493	mov _d2, edx
		494	mov eax, edx
		495
		496	; 297 : d3 = d2 << 1;
		497	shl edx, 1
		498	mov _d3, edx
		499
		500	; 298 : d4 = d2 + d3;
		501	add eax, edx
		502	mov _d4, eax
		503
		504	; 299 : d5 = d3 << 1;
		505	shl edx, 1
		506	mov _d5, edx
		507	shl edx, 3
		508	mov _d6, edx ; d6 = d5*8 to simplify index operations
		509
		510	; 339 : j5 = N / d5; ; moved out of internal loop
		511	mov cl, [ebp+12]
		512	sub cl, ch
		513	add cl, cl
		514	mov edx, 1
		515	shl edx, cl
		516	mov _j5, edx
		517
		518	; 300 :
		519	; 301 : for (j=0; j
		520	mov ebx, [ebp+16]
		521	mov esi, [ebp+8]
		522	shl esi, 3
		523	add esi, ebx
		524	mov _end_of_array, esi
		525
		526	.next_j:
		527
		528	; {
		529	; t1 = f[j] + f[j+d2];
		530	mov eax, _d2
		531	fld qword[ebx]
		532	fld qword[ebx+eax*8]
		533	fld st1
		534	fadd st0, st1
		535	fstp _t1
		536
		537	; t2 = f[j] - f[j+d2];
		538	fsubp st1, st0
		539	fstp _t2
		540
		541	; t3 = f[j+d3] + f[j+d4];
		542	mov edi, _d3
		543	fld qword[ebx+edi*8]
		544	mov edx, _d4
		545	fld qword[ebx+edx*8]
		546	fld st1
		547	fsub st0, st1 ; st : t4, f4, f3
		548	fxch st1 ; st : f4, t4, f3
		549
		550	; t4 = f[j+d3] - f[j+d4];
		551	faddp st2, st0 ; st : t4, t3
		552
		553	; f[j+d4] = t2 - t4;
		554	; f[j+d3] = t2 + t4;
		555	fld _t2
		556	fld st0
		557	fsub st0, st2 ; st : f4, t2, t4, t3
		558	fstp qword[ebx+edx*8] ; st : t2, t4, t3
		559	fadd st0, st1 ; st : f3, t4, t3
		560	fstp qword[ebx+edi*8] ; st : t4, t3
		561
		562	; f[j+d2] = t1 - t3;
		563	; f[j] = t1 + t3;
		564	fld _t1
		565	fst st1
		566	fsub st0, st2 ; st : f2, t1, t3
		567	fstp qword[ebx+eax*8] ; st : t1, t3
		568	fadd st0, st1 ; st : f0, t3
		569	fstp qword[ebx] ; st : t3
		570	fstp st0
		571
		572	; jj = j + d1; / ??
		573	mov edi, _d1
		574	shl edi, 3 ; = d1*8
		575	mov edx, edi
		576	mov eax, edi
		577	add eax, eax ; eax = d2*8
		578	shl edx, 2 ; = d3*8
		579	add edi, ebx ; now [edi] points to f[jj]
		580	add edx, edi ; and [edx] points to f[jj+d3]
		581
		582	; t1 = f[jj];
		583	fld qword [edi] ; st : t1
		584	; t3 = f[jj+d3];
		585	fld qword [edx] ; st : t3, t1
		586
		587	; t2 = f[jj+d2] * r;
		588	fld qword [edi+eax]
		589	fld [_r]
		590	fmul st1, st0 ; st : r, t2, t3, t1
		591	; t4 = f[jj+d4] * r
		592	fmul qword [edx+eax] ; st : t4, t2, t3, t1
		593
		594	; f[jj] = t1 + t2 + t3;
		595	fld st3 ; st : t1, t4, t2, t3, t1
		596	fadd st0, st3
		597	fadd st0, st2
		598	fstp qword [edi]
		599
		600	; f[jj+d2] = t1 - t3 + t4;
		601	fld st3
		602	fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1
		603	fld st0
		604	fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1
		605	fstp qword [edi+eax]
		606	; f[jj+d4] = t1 - t3 - t4;
		607	fsub st0, st1 ; st : f4, t4, t2, t3, t1
		608	fstp qword [edx+eax]
		609
		610	; f[jj+d3] = t1 - t2 + t3;
		611	fstp st0 ; st : t2, t3, t1
		612	fsubp st1, st0 ; st : (t3-t2), t1
		613	faddp st1, st0 ; st : f3
		614	fstp qword [edx]
		615
		616	; for (k=1; k
		617	xor ecx, ecx ; ecx = k
		618	mov _jj, ecx
		619	.next_k:
		620	inc ecx
		621	cmp ecx, _d1
		622	jge .done_k
		623	; {
		624	mov eax, _d2 ; the sector increment
		625	; l1 = j + k;
		626	mov edx, ecx
		627	mov _l1, edx ; [ebx+edx*8] --> f[j+k]
		628	; l2 = l1 + d2;
		629	add edx, eax
		630	mov _l2, edx
		631	; l3 = l1 + d3;
		632	add edx, eax
		633	mov _l3, edx
		634	; l4 = l1 + d4;
		635	add edx, eax
		636	mov _l4, edx
		637
		638	; l5 = j + d2 - k;
		639	mov edx, eax
		640	sub edx, ecx
		641	mov _l5, edx
		642	; l6 = l5 + d2;
		643	add edx, eax
		644	mov _l6, edx
		645	; l7 = l5 + d3;
		646	add edx, eax
		647	mov _l7, edx
		648	; l8 = l5 + d4;
		649	add edx, eax
		650	mov _l8, edx
		651
		652
		653	; 340 : j5 *= k; // add-substituted multiplication
		654	mov eax, _jj
		655	add eax, _j5
		656	mov _jj, eax
		657
		658	; c1 = C[jj];
		659	; s1 = S[jj];
		660	mov edi, [ebp+20]
		661	fld qword[edi+eax*8]
		662	mov esi, [ebp+8]
		663	shl esi, 2
		664	add esi, edi
		665	fld qword[esi+eax*8] ; st : s1, c1
		666
		667	; t5 = f[l2] * c1 + f[l6] * s1;
		668	; t8 = f[l6] * c1 - f[l2] * s1;
		669	mov edx, _l6
		670	fld qword[ebx+edx*8]
		671	mov edx, _l2
		672	fld st0
		673	fmul st0, st2
		674	fxch st1
		675	fmul st0, st3
		676	fld qword[ebx+edx8] ; st : f[l2], f[l6]c, f[l6]*s, s, c
		677	fmul st4, st0
		678	fmulp st3, st0 ; st : f[l6]c, f[l6]s, f[l2]s, f[l2]c
		679	fsub st0, st2 ; st : t8, f[l6]s, f[l2]s, f[l2]*c
		680	fstp _t8
		681	faddp st2, st0 ; st : f[l2]*s, t5
		682	fstp st0 ; st : t5
		683	fstp _t5 ; st :
		684
		685	; c2 = C[2*jj];
		686	; s2 = S[2*jj];
		687	shl eax, 1
		688	fld qword[edi+eax*8]
		689	fld qword[esi+eax*8] ; st : s2, c2
		690
		691	; t6 = f[l3] * c2 + f[l7] * s2;
		692	; t9 = f[l7] * c2 - f[l3] * s2;
		693	mov edx, _l7
		694	fld qword[ebx+edx*8]
		695	mov edx, _l3
		696	fld st0
		697	fmul st0, st2
		698	fxch st1
		699	fmul st0, st3
		700	fld qword[ebx+edx8] ; st : f[l3], f[l7]c, f[l7]*s, s, c
		701	fmul st4, st0
		702	fmulp st3, st0 ; st : f[l7]c, f[l7]s, f[l3]s, f[l3]c
		703	fsub st0, st2 ; st : t9, f[l7]s, f[l3]s, f[l3]*c
		704	fstp _t9
		705	faddp st2, st0 ; st : f[l2]*s, t6
		706	fstp st0 ; st : t6
		707	fstp _t6 ; st :
		708
		709	; c3 = C[3*jj];
		710	; s3 = S[3*jj];
		711	add eax, _jj
		712	fld qword[edi+eax*8]
		713	fld qword[esi+eax*8] ; st : s3, c3
		714
		715	; t7 = f[l4] * c3 + f[l8] * s3;
		716	; t0 = f[l8] * c3 - f[l4] * s3;
		717	mov edx, _l8
		718	fld qword[ebx+edx*8]
		719	mov edx, _l4
		720	fld st0
		721	fmul st0, st2
		722	fxch st1
		723	fmul st0, st3
		724	fld qword[ebx+edx8] ; st : f[l4], f[l8]c, f[l8]*s, s, c
		725	fmul st4, st0
		726	fmulp st3, st0 ; st : f[l8]c, f[l8]s, f[l4]s, f[l4]c
		727	fsub st0, st2 ; st : t9, f[l8]s, f[l4]s, f[l4]*c
		728	fstp _t0
		729	faddp st2, st0 ; st : f[l2]*s, t7
		730	fstp st0 ; st : t7
		731	fstp _t7 ; st :
		732
		733	; t1 = f[l5] - t9;
		734	; t2 = f[l5] + t9;
		735	mov eax, _l5
		736	fld qword [ebx+eax*8]
		737	fld _t9
		738	fld st0
		739	fadd st0, st2
		740	fstp _t2
		741	fsubp st1, st0
		742	fstp _t1
		743
		744	; t3 = - t8 - t0;
		745	fld _t8
		746	fadd _t0
		747	fchs
		748	fstp _t3
		749	; t4 = t5 - t7;
		750	fld _t5
		751	fsub _t7
		752	fstp _t4
		753
		754	; f[l5] = t1 + t4;
		755	fld _t1
		756	fld _t4
		757	fld st0
		758	fadd st0, st2
		759	fstp qword [ebx+eax*8]
		760	; f[l7] = t1 - t4;
		761	mov eax, _l7
		762	fsubp st1, st0
		763	fstp qword [ebx+eax*8]
		764
		765	; f[l6] = t2 + t3;
		766	mov eax, _l6
		767	fld _t2
		768	fld _t3
		769	fld st0
		770	fadd st0, st2
		771	fstp qword [ebx+eax*8]
		772	; f[l8] = t2 - t3;
		773	mov eax, _l8
		774	fsubp st1, st0
		775	fstp qword [ebx+eax*8]
		776
		777	; t1 = f[l1] + t6;
		778	mov eax, _l1
		779	fld qword [ebx+eax*8]
		780	fld _t6
		781	fld st0
		782	fadd st0, st2
		783	fstp _t1
		784	; t2 = f[l1] - t6;
		785	fsubp st1, st0
		786	fstp _t2
		787
		788	; t3 = t8 - t0;
		789	fld _t8
		790	fsub _t0
		791	fstp _t3
		792	; t4 = t5 + t7;
		793	fld _t5
		794	fadd _t7
		795	fstp _t4
		796
		797	; f[l1] = t1 + t4;
		798	mov eax, _l1
		799	fld _t1
		800	fld _t4
		801	fld st0
		802	fadd st0, st2
		803	fstp qword [ebx+eax*8]
		804	; f[l3] = t1 - t4;
		805	mov eax, _l3
		806	fsubp st1, st0
		807	fstp qword [ebx+eax*8]
		808
		809	; f[l2] = t2 + t3;
		810	mov eax, _l2
		811	fld _t2
		812	fld _t3
		813	fld st0
		814	fadd st0, st2
		815	fstp qword [ebx+eax*8]
		816	; f[l4] = t2 - t3;
		817	mov eax, _l4
		818	fsubp st1, st0
		819	fstp qword [ebx+eax*8]
		820
		821	; 374 : }
		822	jmp .next_k
		823
		824	.done_k:
		825	; 375 : }
		826	add ebx, _d6 ; d6 = d5*8
		827	cmp ebx, _end_of_array
		828	jb .next_j
		829
		830	; 376 : }
		831	mov cx, _step
		832	jmp .newstep
		833	.done:
		834	mov esp, ebp
		835	pop ebp
		836	; 377 : }
		837	ret
		838
		839
		840	;=========== Step3 ends here ===========
		841
		842
		843	; =================================================================
		844
		845	;=================================================================
		846	; parameters:
		847	; -- [ebp+8] = N
		848	; -- [ebp+12] = p
		849	; -- [ebp+16] = 4k-aligned data array address
		850	; -- [ebp+20] = 4k-aligned SinCosTable address
		851	; returns:
		852	; -- nothing
		853	; destroys:
		854	; -- all GPRegs
		855	;; ==========================
		856
		857	align 4
		858
		859	FHT_4:
		860
		861	push ebp
		862	mov ebp, esp
		863	mov edx, [ebp+16]
		864	add edx, [ebp+12]
		865	call BitInvert
		866	push dword[ebp+16]
		867	push dword[ebp+8]
		868	call step1
		869	call step2
		870	pop edx ; N
		871	pop ecx ; a
		872	push dword[ebp+20] ; t
		873	push ecx
		874	push dword[ebp+12] ; p
		875	push edx ; N
		876	call step3
		877	mov esp, ebp
		878	pop ebp
		879
		880	ret

Subversion Repositories Kolibri OS

(root)/programs/other/fft/fht4code.asm – Rev 2215