WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/SDL-1.2.2_newlib/src/hermes/mmxp2_32.asm

Rev	Author	Line No.	Line
8210	maxcodehac	1	;
		2	; pII-optimised MMX format converters for HERMES
		3	; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
		4	; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
		5	; This source code is licensed under the GNU LGPL
		6	;
		7	; Please refer to the file COPYING.LIB contained in the distribution for
		8	; licensing conditions
		9	;
		10	; COPYRIGHT NOTICE
		11	;
		12	; This file partly contains code that is (c) Intel Corporation, specifically
		13	; the mode detection routine, and the converter to 15 bit (8 pixel
		14	; conversion routine from the mmx programming tutorial pages).
		15	;
		16	;
		17	; These routines aren't exactly pII optimised - it's just that as they
		18	; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
		19	; optimise them for p5 MMXs..
		20
		21	BITS 32
		22	; SDL_FUNC is not defined in this file; presumably common.inc supplies it to declare entry points as global symbols - TODO confirm
9172	turbocat	23	%include "common.inc"
8210	maxcodehac	24	; Entry points, underscore-prefixed spelling
9172	turbocat	25	SDL_FUNC _ConvertMMXpII32_24RGB888
		26	SDL_FUNC _ConvertMMXpII32_16RGB565
		27	SDL_FUNC _ConvertMMXpII32_16BGR565
		28	SDL_FUNC _ConvertMMXpII32_16RGB555
		29	SDL_FUNC _ConvertMMXpII32_16BGR555
8210	maxcodehac	30	; The same entry points without the underscore; each routine below carries both labels
9202	turbocat	31	SDL_FUNC ConvertMMXpII32_24RGB888
		32	SDL_FUNC ConvertMMXpII32_16RGB565
		33	SDL_FUNC ConvertMMXpII32_16BGR565
		34	SDL_FUNC ConvertMMXpII32_16RGB555
		35	SDL_FUNC ConvertMMXpII32_16BGR555
		36
		37
9172	turbocat	38	;; Macros for conversion routines
8210	maxcodehac	39	; _push_immq_mask imm32: pushes the dword twice, leaving an 8-byte value (imm32 in both halves) at [esp]
9172	turbocat	40	%macro _push_immq_mask 1
		41	push dword %1
		42	push dword %1
		43	%endmacro
8210	maxcodehac	44	; load_immq mmreg, imm32: builds that qword on the stack and loads it into mmreg; the 8 bytes stay pushed until CLEANUP_IMMQ_LOADS
9172	turbocat	45	%macro load_immq 2
		46	_push_immq_mask %2
		47	movq %1, [esp]
		48	%endmacro
8210	maxcodehac	49	; pand_immq mmreg, imm32: same, but ANDs the stack qword into mmreg instead of loading it
9172	turbocat	50	%macro pand_immq 2
		51	_push_immq_mask %2
		52	pand %1, [esp]
		53	%endmacro
8210	maxcodehac	54	; CLEANUP_IMMQ_LOADS(n): releases the qwords left on the stack by n load_immq/pand_immq uses (add esp, 8*n - this rewrites the flags)
9172	turbocat	55	%define CLEANUP_IMMQ_LOADS(num) \
		56	add esp, byte 8 * num
8210	maxcodehac	57	; Per-pixel dword masks; the macros above replicate each one into both halves of a qword (two pixels)
9172	turbocat	58	%define mmx32_rgb888_mask 00ffffffh
		59	%define mmx32_rgb565_b 000000f8h
		60	%define mmx32_rgb565_g 0000fc00h
		61	%define mmx32_rgb565_r 00f80000h
8210	maxcodehac	62	; 555 constants: rb keeps the top 5 bits of bytes 0 and 2, g the top 5 bits of byte 1; the *_mul values are word pairs used as pmaddwd multipliers
9172	turbocat	63	%define mmx32_rgb555_rb 00f800f8h
		64	%define mmx32_rgb555_g 0000f800h
		65	%define mmx32_rgb555_mul 20000008h
		66	%define mmx32_bgr555_mul 00082000h
		67
8210	maxcodehac	68	SECTION .text
		69	; ConvertMMXpII32_24RGB888: copy ecx 32-bit pixels from [esi] to [edi] as packed 3-byte pixels (the top byte of each source pixel is dropped)
9202	turbocat	70	ConvertMMXpII32_24RGB888:
8210	maxcodehac	71	_ConvertMMXpII32_24RGB888:
		72	; Writes al, bl, ecx, edx, esi, edi, mm0-mm4, mm6 and mm7 without saving them; no emms is executed in this routine
		73	; set up mm6 as the mask, mm7 as zero
9172	turbocat	74	load_immq mm6, mmx32_rgb888_mask
		75	CLEANUP_IMMQ_LOADS(1)
8210	maxcodehac	76	pxor mm7, mm7
		77
		78	mov edx, ecx ; save ecx
		79	and ecx, 0fffffffch ; clear lower two bits
		80	jnz .L1 ; at least one full group of 4 pixels
		81	jmp .L2 ; fewer than 4 pixels: byte-copy tail only
		82	; Main loop: 4 pixels (16 bytes) in, 12 bytes out per pass; byte diagrams list the most significant byte first
		83	.L1:
		84
		85	movq mm0, [esi] ; A R G B a r g b
		86	pand mm0, mm6 ; 0 R G B 0 r g b
		87	movq mm1, [esi+8] ; A R G B a r g b
		88	pand mm1, mm6 ; 0 R G B 0 r g b
		89
		90	movq mm2, mm0 ; 0 R G B 0 r g b
		91	punpckhdq mm2, mm7 ; 0 0 0 0 0 R G B
		92	punpckldq mm0, mm7 ; 0 0 0 0 0 r g b
		93	psllq mm2, 24 ; 0 0 R G B 0 0 0
		94	por mm0, mm2 ; 0 0 R G B r g b
		95
		96	movq mm3, mm1 ; 0 R G B 0 r g b
		97	psllq mm3, 48 ; g b 0 0 0 0 0 0
		98	por mm0, mm3 ; g b R G B r g b
		99
		100	movq mm4, mm1 ; 0 R G B 0 r g b
		101	punpckhdq mm4, mm7 ; 0 0 0 0 0 R G B
		102	punpckldq mm1, mm7 ; 0 0 0 0 0 r g b
		103	psrlq mm1, 16 ; 0 0 0 0 0 0 0 r
		104	psllq mm4, 8 ; 0 0 0 0 R G B 0
		105	por mm1, mm4 ; 0 0 0 0 R G B r
		106
		107	movq [edi], mm0 ; output bytes 0-7
		108	add esi, BYTE 16
		109	movd [edi+8], mm1 ; output bytes 8-11
		110	add edi, BYTE 12
		111	sub ecx, BYTE 4 ; ecx is a multiple of 4 here, so it reaches exactly zero
		112	jnz .L1
		113
		114	.L2: ; tail: the remaining 0-3 pixels
		115	mov ecx, edx
		116	and ecx, BYTE 3
		117	jz .L4
		118	.L3: ; copy the three low bytes of one pixel, skip the fourth
		119	mov al, [esi]
		120	mov bl, [esi+1]
		121	mov dl, [esi+2] ; edx is free again: the count was already copied to ecx
		122	mov [edi], al
		123	mov [edi+1], bl
		124	mov [edi+2], dl
		125	add esi, BYTE 4
		126	add edi, BYTE 3
		127	dec ecx
		128	jnz .L3
		129	.L4:
9172	turbocat	130	retn
8210	maxcodehac	131
		132
9202	turbocat	133	ConvertMMXpII32_16RGB565:
8210	maxcodehac	134	_ConvertMMXpII32_16RGB565:
		135	; Convert ecx 32-bit pixels at [esi] to 16-bit pixels at [edi]: source byte 2 (red) -> bits 11-15, byte 1 (green) -> bits 5-10, byte 0 (blue) -> bits 0-4
		136	; set up masks: mm5 = blue, mm6 = green, mm7 = red
9172	turbocat	137	load_immq mm5, mmx32_rgb565_b
		138	load_immq mm6, mmx32_rgb565_g
		139	load_immq mm7, mmx32_rgb565_r
		140	CLEANUP_IMMQ_LOADS(3)
8210	maxcodehac	141	; Writes eax, ebx, ecx, edx, esi, edi and mm0-mm7 without saving them; no emms is executed in this routine
		142	mov edx, ecx ; keep the full pixel count for the tail
		143	shr ecx, 2 ; ecx = number of 4-pixel groups
		144	jnz .L1
		145	jmp .L2 ; not necessary at the moment, but doesn't hurt (much)
		146	; Main loop: 4 pixels (16 bytes) in, 8 bytes out per pass
		147	.L1:
		148	movq mm0, [esi] ; argb
		149	movq mm1, mm0 ; argb
		150	pand mm0, mm6 ; 00g0
		151	movq mm3, mm1 ; argb
		152	pand mm1, mm5 ; 000b
		153	pand mm3, mm7 ; 0r00
		154	pslld mm1, 2 ; 0 0 000000bb bbb00000
		155	por mm0, mm1 ; 0 0 ggggggbb bbb00000
		156	psrld mm0, 5 ; 0 0 00000ggg gggbbbbb
		157	; same steps for pixels 2 and 3 (mm4 = green|blue, mm1 = red)
		158	movq mm4, [esi+8] ; argb
		159	movq mm2, mm4 ; argb
		160	pand mm4, mm6 ; 00g0
		161	movq mm1, mm2 ; argb
		162	pand mm2, mm5 ; 000b
		163	pand mm1, mm7 ; 0r00
		164	pslld mm2, 2 ; 0 0 000000bb bbb00000
		165	por mm4, mm2 ; 0 0 ggggggbb bbb00000
		166	psrld mm4, 5 ; 0 0 00000ggg gggbbbbb
		167
		168	packuswb mm3, mm1 ; R 0 r 0 - each masked red dword shrinks to the word rrrrr000 00000000
		169	packssdw mm0, mm4 ; four 00000ggg gggbbbbb words (values are below 0800h, so nothing saturates)
		170	por mm0, mm3 ; done.
		171	movq [edi], mm0
		172
		173	add esi, 16
		174	add edi, 8
		175	dec ecx
		176	jnz .L1
		177
		178	.L2: ; tail: the remaining 0-3 pixels
		179	mov ecx, edx
		180	and ecx, BYTE 3
		181	jz .L4
		182	.L3:
		183	mov al, [esi] ; blue
		184	mov bh, [esi+1] ; green, placed in bits 8-15 of ebx
		185	mov ah, [esi+2] ; red
		186	shr al, 3
		187	and eax, 0F81Fh ; eax = rrrrr000 000bbbbb
		188	shr ebx, 5 ; green now occupies bits 3-10; the other bits of ebx are stale
		189	and ebx, 07E0h ; keep the top 6 green bits in bits 5-10
		190	add eax, ebx
		191	mov [edi], al
		192	mov [edi+1], ah
		193	add esi, BYTE 4
		194	add edi, BYTE 2
		195	dec ecx
		196	jnz .L3
		197
		198	.L4:
9172	turbocat	199	retn
8210	maxcodehac	200
9202	turbocat	201	ConvertMMXpII32_16BGR565:
8210	maxcodehac	202	_ConvertMMXpII32_16BGR565:
		203	; Convert ecx 32-bit pixels at [esi] to 16-bit pixels at [edi]: source byte 0 (blue) -> bits 11-15, byte 1 (green) -> bits 5-10, byte 2 (red) -> bits 0-4. Masks: mm5 = red, mm6 = green, mm7 = blue
9172	turbocat	204	load_immq mm5, mmx32_rgb565_r
		205	load_immq mm6, mmx32_rgb565_g
		206	load_immq mm7, mmx32_rgb565_b
		207	CLEANUP_IMMQ_LOADS(3)
8210	maxcodehac	208	; Writes eax, ebx, ecx, edx, esi, edi and mm0-mm7 without saving them; no emms is executed in this routine
		209	mov edx, ecx ; keep the full pixel count for the tail
		210	shr ecx, 2 ; ecx = number of 4-pixel groups
		211	jnz .L1
		212	jmp .L2 ; fewer than 4 pixels: tail only
		213	; Main loop: 4 pixels (16 bytes) in, 8 bytes out per pass
		214	.L1:
		215	movq mm0, [esi] ; a r g b
		216	movq mm1, mm0 ; a r g b
		217	pand mm0, mm6 ; 0 0 g 0
		218	movq mm3, mm1 ; a r g b
		219	pand mm1, mm5 ; 0 r 0 0
		220	pand mm3, mm7 ; 0 0 0 b
		221
		222	psllq mm3, 16 ; 0 b 0 0
		223	psrld mm1, 14 ; 0 0 000000rr rrr00000
		224	por mm0, mm1 ; 0 0 ggggggrr rrr00000
		225	psrld mm0, 5 ; 0 0 00000ggg gggrrrrr
		226	; same steps for pixels 2 and 3 (mm4 = green|red, mm1 = blue)
		227	movq mm4, [esi+8] ; a r g b
		228	movq mm2, mm4 ; a r g b
		229	pand mm4, mm6 ; 0 0 g 0
		230	movq mm1, mm2 ; a r g b
		231	pand mm2, mm5 ; 0 r 0 0
		232	pand mm1, mm7 ; 0 0 0 b
		233
		234	psllq mm1, 16 ; 0 b 0 0
		235	psrld mm2, 14 ; 0 0 000000rr rrr00000
		236	por mm4, mm2 ; 0 0 ggggggrr rrr00000
		237	psrld mm4, 5 ; 0 0 00000ggg gggrrrrr
		238
		239	packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000
		240	packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
		241	por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
		242	movq [edi], mm0
		243
		244	add esi, BYTE 16
		245	add edi, BYTE 8
		246	dec ecx
		247	jnz .L1
		248
		249	.L2: ; tail: the remaining 0-3 pixels, counted down directly in edx
		250	and edx, BYTE 3
		251	jz .L4
		252	.L3:
		253	mov al, [esi+2] ; red
		254	mov bh, [esi+1] ; green, placed in bits 8-15 of ebx
		255	mov ah, [esi] ; blue
		256	shr al, 3
		257	and eax, 0F81Fh ; eax = bbbbb000 000rrrrr
		258	shr ebx, 5 ; green now occupies bits 3-10; the other bits of ebx are stale
		259	and ebx, 07E0h ; keep the top 6 green bits in bits 5-10
		260	add eax, ebx
		261	mov [edi], al
		262	mov [edi+1], ah
		263	add esi, BYTE 4
		264	add edi, BYTE 2
		265	dec edx
		266	jnz .L3
		267
		268	.L4:
9172	turbocat	269	retn
8210	maxcodehac	270
9202	turbocat	271	ConvertMMXpII32_16BGR555:
8210	maxcodehac	272	_ConvertMMXpII32_16BGR555:
		273	; Convert ecx 32-bit pixels at [esi] to 16-bit 5:5:5 pixels at [edi]: source byte 0 (blue) -> bits 10-14, byte 1 (green) -> bits 5-9, byte 2 (red) -> bits 0-4
		274	; the 16BGR555 converter is identical to the RGB555 one,
		275	; except it uses a different multiplier for the pmaddwd
		276	; instruction. cool huh.
		277	; The qword pushed by the load below stays on the stack until the CLEANUP_IMMQ_LOADS(2) in the shared code
9172	turbocat	278	load_immq mm7, mmx32_bgr555_mul
8210	maxcodehac	279	jmp _convert_bgr555_cheat
		280
		281	; This is the same as the Intel version.. they obviously went to
		282	; much more trouble to expand/coil the loop than I did, so theirs
		283	; would almost certainly be faster, even if only a little.
		284	; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
		285	; (I think) a more accurate name..
9202	turbocat	286	; ConvertMMXpII32_16RGB555: as above, but red -> bits 10-14 and blue -> bits 0-4
		287	ConvertMMXpII32_16RGB555:
8210	maxcodehac	288	_ConvertMMXpII32_16RGB555:
		289
9172	turbocat	290	load_immq mm7, mmx32_rgb555_mul
8210	maxcodehac	291	_convert_bgr555_cheat:
9172	turbocat	292	load_immq mm6, mmx32_rgb555_g
		293	CLEANUP_IMMQ_LOADS(2)
8210	maxcodehac	294	; From here on mm7 = pmaddwd multiplier chosen by the entry point, mm6 = green mask; both are only read below
		295	mov edx,ecx ; Save ecx
		296	; Writes eax, ecx, edx, esi, edi and mm0-mm7 without saving them; no emms is executed in this routine
9172	turbocat	297	and ecx,DWORD 0fffffff8h ; clear lower three bits
8210	maxcodehac	298	jnz .L_OK
9172	turbocat	299	jmp near .L2
8210	maxcodehac	300	; Prologue of the pipelined loop: start converting pixels 0-3 of the first group
		301	.L_OK:
		302
		303	movq mm2,[esi+8]
		304
		305	movq mm0,[esi]
		306	movq mm3,mm2
		307
9172	turbocat	308	pand_immq mm3, mmx32_rgb555_rb
8210	maxcodehac	309	movq mm1,mm0
		310
9172	turbocat	311	pand_immq mm1, mmx32_rgb555_rb
8210	maxcodehac	312	pmaddwd mm3,mm7 ; red and blue are scaled into place and summed within each dword
		313
9172	turbocat	314	CLEANUP_IMMQ_LOADS(2)
		315
8210	maxcodehac	316	pmaddwd mm1,mm7
		317	pand mm2,mm6
		318	; Main loop: 8 pixels (32 bytes) in, 16 bytes out per pass.
		319	.L1:
		320	movq mm4,[esi+24]
		321	pand mm0,mm6
		322
		323	movq mm5,[esi+16]
		324	por mm3,mm2
		325
		326	psrld mm3,6 ; all three fields were built 6 bits too high; shift them down
		327	por mm1,mm0
		328
		329	movq mm0,mm4
		330	psrld mm1,6
		331
9172	turbocat	332	pand_immq mm0, mmx32_rgb555_rb
8210	maxcodehac	333	packssdw mm1,mm3 ; pixels 0-3 as four 16-bit words
		334
		335	movq mm3,mm5
		336	pmaddwd mm0,mm7
		337
9172	turbocat	338	pand_immq mm3, mmx32_rgb555_rb
8210	maxcodehac	339	pand mm4,mm6
		340
		341	movq [edi],mm1
		342	pmaddwd mm3,mm7
		343
		344	add esi,BYTE 32
		345	por mm4,mm0
		346
		347	pand mm5,mm6
		348	psrld mm4,6
		349	; NOTE(review): read-ahead for the next pass happens before the count is tested, so the final pass still loads 16 bytes beyond the last full 8-pixel group
		350	movq mm2,[esi+8]
		351	por mm5,mm3
		352
		353	movq mm0,[esi]
		354	psrld mm5,6
		355
		356	movq mm3,mm2
		357	movq mm1,mm0
		358
9172	turbocat	359	pand_immq mm3, mmx32_rgb555_rb
8210	maxcodehac	360	packssdw mm5,mm4 ; pixels 4-7 as four 16-bit words
		361
9172	turbocat	362	pand_immq mm1, mmx32_rgb555_rb
8210	maxcodehac	363	pand mm2,mm6
		364
9172	turbocat	365	CLEANUP_IMMQ_LOADS(4)
		366
8210	maxcodehac	367	movq [edi+8],mm5
		368	pmaddwd mm3,mm7
		369
		370	pmaddwd mm1,mm7
		371	add edi,BYTE 16
		372
		373	sub ecx,BYTE 8
		374	jz .L2
		375	jmp .L1
		376
		377
		378	.L2: ; tail: the remaining 0-7 pixels
		379	mov ecx,edx
		380
		381	and ecx,BYTE 7
		382	jz .L4
		383
		384	; The tail runs the same mask/pmaddwd/shift steps as the main loop, one pixel at a time,
		385	; so it honours the multiplier in mm7 (a fixed RGB555 tail would swap red and blue for the BGR555 entry point)
		386	load_immq mm5, mmx32_rgb555_rb
		387	CLEANUP_IMMQ_LOADS(1)
		388
		389	.L3:
		390	movd mm0,[esi] ; one pixel in the low dword, high dword zero
		391	add esi,BYTE 4
		392
		393	movq mm1,mm0
		394	pand mm0,mm6 ; green, bits 11-15
		395
		396	pand mm1,mm5 ; top 5 bits of red and of blue
		397	pmaddwd mm1,mm7 ; one lands in bits 6-10, the other in bits 16-20, as selected by mm7
		398
		399	por mm1,mm0
		400	psrld mm1,6 ; final 15-bit pixel in bits 0-14, bit 15 clear
		401
		402	movd eax,mm1
		403
		404	mov [edi],ax
		405	add edi,BYTE 2
		406
		407	dec ecx
		408	jnz .L3
		409
		410
		411	.L4:
9172	turbocat	412	retn
8210	maxcodehac	413	; For elf32 output only: emit an empty, non-allocated, non-executable .note.GNU-stack section (conventionally the marker that this object does not need an executable stack)
9172	turbocat	414	%ifidn __OUTPUT_FORMAT__,elf32
		415	section .note.GNU-stack noalloc noexec nowrite progbits
		416	%endif

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/SDL-1.2.2_newlib/src/hermes/mmxp2_32.asm – Rev 9202