WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/SDL-1.2.2_newlib/src/hermes/mmxp2_32.asm

Rev	Author	Line No.	Line
8210	maxcodehac	1	;
		2	; pII-optimised MMX format converters for HERMES
		3	; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
		4	; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
		5	; This source code is licensed under the GNU LGPL
		6	;
		7	; Please refer to the file COPYING.LIB contained in the distribution for
		8	; licensing conditions
		9	;
		10	; COPYRIGHT NOTICE
		11	;
		12	; This file partly contains code that is (c) Intel Corporation, specifically
		13	; the mode detection routine, and the converter to 15 bit (8 pixel
		14	; conversion routine from the mmx programming tutorial pages).
		15	;
		16	;
		17	; These routines aren't exactly pII optimised - it's just that as they
		18	; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
		19	; optimise them for p5 MMXs..
		20
		21	BITS 32
		22
9172	turbocat	23	%include "common.inc"
8210	maxcodehac	24
9172	turbocat	25	SDL_FUNC _ConvertMMXpII32_24RGB888
		26	SDL_FUNC _ConvertMMXpII32_16RGB565
		27	SDL_FUNC _ConvertMMXpII32_16BGR565
		28	SDL_FUNC _ConvertMMXpII32_16RGB555
		29	SDL_FUNC _ConvertMMXpII32_16BGR555
8210	maxcodehac	30
9172	turbocat	31	;; Macros for conversion routines
8210	maxcodehac	32
9172	turbocat	33	%macro _push_immq_mask 1 ; push the dword %1 twice, so [esp] holds an 8-byte value with %1 in both halves
		34	push dword %1
		35	push dword %1
		36	%endmacro
8210	maxcodehac	37
9172	turbocat	38	%macro load_immq 2 ; %1 = destination MMX register, %2 = dword constant replicated into both halves
		39	_push_immq_mask %2
		40	movq %1, [esp] ; the 8 pushed bytes stay on the stack; CLEANUP_IMMQ_LOADS releases them
		41	%endmacro
8210	maxcodehac	42
9172	turbocat	43	%macro pand_immq 2 ; %1 = MMX register to mask, %2 = dword mask applied to both halves
		44	_push_immq_mask %2
		45	pand %1, [esp] ; the 8 pushed bytes stay on the stack; CLEANUP_IMMQ_LOADS releases them
		46	%endmacro
8210	maxcodehac	47
9172	turbocat	48	%define CLEANUP_IMMQ_LOADS(num) \
		49	add esp, byte 8 * num ; release num 8-byte stack slots, one per load_immq / pand_immq expansion
8210	maxcodehac	50
9172	turbocat	51	%define mmx32_rgb888_mask 00ffffffh ; keeps the low 24 bits of a dword (clears the top byte)
		52	%define mmx32_rgb565_b 000000f8h ; top 5 bits of byte 0
		53	%define mmx32_rgb565_g 0000fc00h ; top 6 bits of byte 1
		54	%define mmx32_rgb565_r 00f80000h ; top 5 bits of byte 2
8210	maxcodehac	55
9172	turbocat	56	%define mmx32_rgb555_rb 00f800f8h ; top 5 bits of byte 0 and of byte 2
		57	%define mmx32_rgb555_g 0000f800h ; top 5 bits of byte 1
		58	%define mmx32_rgb555_mul 20000008h ; pmaddwd factors: low word 0008h, high word 2000h
		59	%define mmx32_bgr555_mul 00082000h ; pmaddwd factors: low word 2000h, high word 0008h
		60
8210	maxcodehac	61	SECTION .text
		62
		63	_ConvertMMXpII32_24RGB888: ; reads 4-byte pixels at [esi], writes their low 3 bytes at [edi]; ecx = pixel count
		64
		65	; set up mm6 as the mask, mm7 as zero
9172	turbocat	66	load_immq mm6, mmx32_rgb888_mask
		67	CLEANUP_IMMQ_LOADS(1)
8210	maxcodehac	68	pxor mm7, mm7
		69
		70	mov edx, ecx ; save ecx
		71	and ecx, 0fffffffch ; clear lower two bits
		72	jnz .L1
		73	jmp .L2 ; fewer than 4 pixels: only the byte-wise tail loop runs
		74
		75	.L1: ; main loop: 4 source pixels (16 bytes) -> 12 destination bytes per pass
		76
		77	movq mm0, [esi] ; A R G B a r g b
		78	pand mm0, mm6 ; 0 R G B 0 r g b
		79	movq mm1, [esi+8] ; A R G B a r g b
		80	pand mm1, mm6 ; 0 R G B 0 r g b
		81
		82	movq mm2, mm0 ; 0 R G B 0 r g b
		83	punpckhdq mm2, mm7 ; 0 0 0 0 0 R G B
		84	punpckldq mm0, mm7 ; 0 0 0 0 0 r g b
		85	psllq mm2, 24 ; 0 0 R G B 0 0 0
		86	por mm0, mm2 ; 0 0 R G B r g b
		87
		88	movq mm3, mm1 ; 0 R G B 0 r g b
		89	psllq mm3, 48 ; g b 0 0 0 0 0 0
		90	por mm0, mm3 ; g b R G B r g b
		91
		92	movq mm4, mm1 ; 0 R G B 0 r g b
		93	punpckhdq mm4, mm7 ; 0 0 0 0 0 R G B
		94	punpckldq mm1, mm7 ; 0 0 0 0 0 r g b
		95	psrlq mm1, 16 ; 0 0 0 0 0 0 0 r
		96	psllq mm4, 8 ; 0 0 0 0 R G B 0
		97	por mm1, mm4 ; 0 0 0 0 R G B r
		98
		99	movq [edi], mm0 ; first 8 output bytes
		100	add esi, BYTE 16
		101	movd [edi+8], mm1 ; remaining 4 output bytes
		102	add edi, BYTE 12
		103	sub ecx, BYTE 4
		104	jnz .L1
		105
		106	.L2:
		107	mov ecx, edx
		108	and ecx, BYTE 3 ; ecx = leftover pixel count (0..3)
		109	jz .L4
		110	.L3: ; tail loop: one pixel per pass, copied byte by byte (uses al, bl, dl)
		111	mov al, [esi]
		112	mov bl, [esi+1]
		113	mov dl, [esi+2]
		114	mov [edi], al
		115	mov [edi+1], bl
		116	mov [edi+2], dl
		117	add esi, BYTE 4 ; the fourth source byte is skipped
		118	add edi, BYTE 3
		119	dec ecx
		120	jnz .L3
		121	.L4:
9172	turbocat	122	retn
8210	maxcodehac	123
		124
		125
		126	_ConvertMMXpII32_16RGB565: ; 4-byte pixels at [esi] -> 16-bit words at [edi]; ecx = pixel count
		127
		128	; set up masks
9172	turbocat	129	load_immq mm5, mmx32_rgb565_b
		130	load_immq mm6, mmx32_rgb565_g
		131	load_immq mm7, mmx32_rgb565_r
		132	CLEANUP_IMMQ_LOADS(3)
8210	maxcodehac	133
		134	mov edx, ecx ; keep the full count for the tail loop
		135	shr ecx, 2 ; ecx = number of 4-pixel groups
		136	jnz .L1
		137	jmp .L2 ; not necessary at the moment, but doesn't hurt (much)
		138
		139	.L1: ; main loop: 4 source pixels (16 bytes) -> 4 words (8 bytes) per pass
		140	movq mm0, [esi] ; argb
		141	movq mm1, mm0 ; argb
		142	pand mm0, mm6 ; 00g0
		143	movq mm3, mm1 ; argb
		144	pand mm1, mm5 ; 000b
		145	pand mm3, mm7 ; 0r00
		146	pslld mm1, 2 ; 0 0 000000bb bbb00000
		147	por mm0, mm1 ; 0 0 ggggggbb bbb00000
		148	psrld mm0, 5 ; 0 0 00000ggg gggbbbbb
		149
		150	movq mm4, [esi+8] ; argb
		151	movq mm2, mm4 ; argb
		152	pand mm4, mm6 ; 00g0
		153	movq mm1, mm2 ; argb
		154	pand mm2, mm5 ; 000b
		155	pand mm1, mm7 ; 0r00
		156	pslld mm2, 2 ; 0 0 000000bb bbb00000
		157	por mm4, mm2 ; 0 0 ggggggbb bbb00000
		158	psrld mm4, 5 ; 0 0 00000ggg gggbbbbb
		159
		160	packuswb mm3, mm1 ; R 0 r 0 - each pixel's masked r byte becomes the high byte of its 16-bit word
		161	packssdw mm0, mm4 ; four g/b dwords (each <= 07FFh, so no saturation) -> four words
		162	por mm0, mm3 ; done.
		163	movq [edi], mm0
		164
		165	add esi, 16
		166	add edi, 8
		167	dec ecx
		168	jnz .L1
		169
		170	.L2:
		171	mov ecx, edx
		172	and ecx, BYTE 3 ; ecx = leftover pixel count (0..3)
		173	jz .L4
		174	.L3: ; tail loop: one pixel per pass using eax/ebx
		175	mov al, [esi] ; al = byte 0 (b)
		176	mov bh, [esi+1] ; bh = byte 1 (g)
		177	mov ah, [esi+2] ; ah = byte 2 (r)
		178	shr al, 3 ; b down to bits 0-4
		179	and eax, 0F81Fh ; keep r in bits 11-15 and b in bits 0-4
		180	shr ebx, 5 ; byte in bh moves to bits 3-10
		181	and ebx, 07E0h ; keep the top 6 bits of g in bits 5-10
		182	add eax, ebx ; fields do not overlap, so add acts as or
		183	mov [edi], al
		184	mov [edi+1], ah
		185	add esi, BYTE 4
		186	add edi, BYTE 2
		187	dec ecx
		188	jnz .L3
		189
		190	.L4:
9172	turbocat	191	retn
8210	maxcodehac	192
		193
		194	_ConvertMMXpII32_16BGR565: ; 4-byte pixels at [esi] -> 16-bit words at [edi]; ecx = pixel count
		195
9172	turbocat	196	load_immq mm5, mmx32_rgb565_r
		197	load_immq mm6, mmx32_rgb565_g
		198	load_immq mm7, mmx32_rgb565_b
		199	CLEANUP_IMMQ_LOADS(3)
8210	maxcodehac	200
		201	mov edx, ecx ; keep the full count for the tail loop
		202	shr ecx, 2 ; ecx = number of 4-pixel groups
		203	jnz .L1
		204	jmp .L2
		205
		206	.L1: ; main loop: 4 source pixels (16 bytes) -> 4 words (8 bytes) per pass
		207	movq mm0, [esi] ; a r g b
		208	movq mm1, mm0 ; a r g b
		209	pand mm0, mm6 ; 0 0 g 0
		210	movq mm3, mm1 ; a r g b
		211	pand mm1, mm5 ; 0 r 0 0
		212	pand mm3, mm7 ; 0 0 0 b
		213
		214	psllq mm3, 16 ; 0 b 0 0
		215	psrld mm1, 14 ; 0 0 000000rr rrr00000
		216	por mm0, mm1 ; 0 0 ggggggrr rrr00000
		217	psrld mm0, 5 ; 0 0 00000ggg gggrrrrr
		218
		219	movq mm4, [esi+8] ; a r g b
		220	movq mm2, mm4 ; a r g b
		221	pand mm4, mm6 ; 0 0 g 0
		222	movq mm1, mm2 ; a r g b
		223	pand mm2, mm5 ; 0 r 0 0
		224	pand mm1, mm7 ; 0 0 0 b
		225
		226	psllq mm1, 16 ; 0 b 0 0
		227	psrld mm2, 14 ; 0 0 000000rr rrr00000
		228	por mm4, mm2 ; 0 0 ggggggrr rrr00000
		229	psrld mm4, 5 ; 0 0 00000ggg gggrrrrr
		230
		231	packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000
		232	packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
		233	por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
		234	movq [edi], mm0
		235
		236	add esi, BYTE 16
		237	add edi, BYTE 8
		238	dec ecx
		239	jnz .L1
		240
		241	.L2:
		242	and edx, BYTE 3 ; edx = leftover pixel count (0..3), used directly as the tail counter
		243	jz .L4
		244	.L3: ; tail loop: one pixel per pass using eax/ebx
		245	mov al, [esi+2] ; al = byte 2 (r)
		246	mov bh, [esi+1] ; bh = byte 1 (g)
		247	mov ah, [esi] ; ah = byte 0 (b)
		248	shr al, 3 ; r down to bits 0-4
		249	and eax, 0F81Fh ; keep b in bits 11-15 and r in bits 0-4
		250	shr ebx, 5 ; byte in bh moves to bits 3-10
		251	and ebx, 07E0h ; keep the top 6 bits of g in bits 5-10
		252	add eax, ebx ; fields do not overlap, so add acts as or
		253	mov [edi], al
		254	mov [edi+1], ah
		255	add esi, BYTE 4
		256	add edi, BYTE 2
		257	dec edx
		258	jnz .L3
		259
		260	.L4:
9172	turbocat	261	retn
8210	maxcodehac	262
		263	_ConvertMMXpII32_16BGR555: ; 4-byte pixels at [esi] -> 15-bit words at [edi], byte 0 in bits 10-14; ecx = pixel count
		264
		265	; the 16BGR555 converter is identical to the RGB555 one,
		266	; except it uses a different multiplier for the pmaddwd
		267	; instruction. cool huh.
		268
9172	turbocat	269	load_immq mm7, mmx32_bgr555_mul
8210	maxcodehac	270	jmp _convert_bgr555_cheat ; the 8 bytes pushed above are released by CLEANUP_IMMQ_LOADS(2) in the shared body
		271
		272	; This is the same as the Intel version.. they obviously went to
		273	; much more trouble to expand/coil the loop than I did, so theirs
		274	; would almost certainly be faster, even if only a little.
		275	; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
		276	; (I think) a more accurate name..
		277	_ConvertMMXpII32_16RGB555: ; 4-byte pixels at [esi] -> 15-bit words at [edi], byte 2 in bits 10-14; ecx = pixel count
		278
9172	turbocat	279	load_immq mm7, mmx32_rgb555_mul
8210	maxcodehac	280	_convert_bgr555_cheat: ; shared body: mm7 = pmaddwd multiplier chosen by the entry point
9172	turbocat	281	load_immq mm6, mmx32_rgb555_g
		282	CLEANUP_IMMQ_LOADS(2) ; releases the mm7 and mm6 stack slots
8210	maxcodehac	283
		284	mov edx,ecx ; Save ecx
		285
9172	turbocat	286	and ecx,DWORD 0fffffff8h ; clear lower three bits
8210	maxcodehac	287	jnz .L_OK
9172	turbocat	288	jmp near .L2 ; fewer than 8 pixels: only the tail loop runs
8210	maxcodehac	289
		290	.L_OK: ; prime the pipeline with the first 4 pixels before entering the loop
		291
		292	movq mm2,[esi+8]
		293
		294	movq mm0,[esi]
		295	movq mm3,mm2
		296
9172	turbocat	297	pand_immq mm3, mmx32_rgb555_rb
8210	maxcodehac	298	movq mm1,mm0
		299
9172	turbocat	300	pand_immq mm1, mmx32_rgb555_rb
8210	maxcodehac	301	pmaddwd mm3,mm7
		302
9172	turbocat	303	CLEANUP_IMMQ_LOADS(2)
		304
8210	maxcodehac	305	pmaddwd mm1,mm7
		306	pand mm2,mm6
		307
		308	.L1: ; main loop: 8 source pixels (32 bytes) -> 8 words (16 bytes) per pass
		309	movq mm4,[esi+24]
		310	pand mm0,mm6
		311
		312	movq mm5,[esi+16]
		313	por mm3,mm2
		314
		315	psrld mm3,6
		316	por mm1,mm0
		317
		318	movq mm0,mm4
		319	psrld mm1,6
		320
9172	turbocat	321	pand_immq mm0, mmx32_rgb555_rb
8210	maxcodehac	322	packssdw mm1,mm3
		323
		324	movq mm3,mm5
		325	pmaddwd mm0,mm7
		326
9172	turbocat	327	pand_immq mm3, mmx32_rgb555_rb
8210	maxcodehac	328	pand mm4,mm6
		329
		330	movq [edi],mm1 ; words for pixels 0-3 of this group
		331	pmaddwd mm3,mm7
		332
		333	add esi,BYTE 32
		334	por mm4,mm0
		335
		336	pand mm5,mm6
		337	psrld mm4,6
		338
		339	movq mm2,[esi+8] ; NOTE(review): this preload and the one below also run on the final pass,
		340	por mm5,mm3
		341
		342	movq mm0,[esi] ; so 16 bytes at the advanced esi are read even when fewer than 4 pixels remain
		343	psrld mm5,6
		344
		345	movq mm3,mm2
		346	movq mm1,mm0
		347
9172	turbocat	348	pand_immq mm3, mmx32_rgb555_rb
8210	maxcodehac	349	packssdw mm5,mm4
		350
9172	turbocat	351	pand_immq mm1, mmx32_rgb555_rb
8210	maxcodehac	352	pand mm2,mm6
		353
9172	turbocat	354	CLEANUP_IMMQ_LOADS(4) ; four pand_immq expansions since the top of .L1
		355
8210	maxcodehac	356	movq [edi+8],mm5 ; words for pixels 4-7 of this group
		357	pmaddwd mm3,mm7
		358
		359	pmaddwd mm1,mm7
		360	add edi,BYTE 16
		361
		362	sub ecx,BYTE 8
		363	jz .L2
		364	jmp .L1
		365
		366
		367	.L2:
		368	mov ecx,edx
		369
		370	and ecx,BYTE 7 ; ecx = leftover pixel count (0..7)
		371	jz .L4
		372
		373	; Tail: convert one pixel per pass with the same mask + pmaddwd recipe as the
		374	; main loop, so the multiplier in mm7 selects RGB555 or BGR555 here as well.
		375	load_immq mm5, mmx32_rgb555_rb ; mm5 is free here; mm6 (g mask) and mm7 (multiplier) are still live
		376	CLEANUP_IMMQ_LOADS(1)
		377
		378	.L3:
		379	movd mm0,[esi] ; loads exactly one 4-byte pixel; upper half of mm0 is zero
		380	add esi,BYTE 4
		381
		382	movq mm1,mm0
		383	pand mm0,mm6 ; top 5 bits of byte 1, already in bits 11-15
		384
		385	pand mm1,mm5 ; top 5 bits of byte 0 and of byte 2
		386	pmaddwd mm1,mm7 ; one of them lands in bits 6-10, the other in bits 16-20
		387
		388	por mm0,mm1
		389	psrld mm0,6 ; fields now at bits 0-4, 5-9 and 10-14
		390
		391	movd eax,mm0
		392	; with mmx32_rgb555_mul this equals the scalar (p>>3)&1Fh | (p>>6)&3E0h | (p>>9)&7C00h;
		393	; with mmx32_bgr555_mul bytes 0 and 2 swap places, matching the main loop
		394	mov [edi],ax
		395	add edi,BYTE 2
		396
		397	dec ecx
		398	jnz .L3
		399
		400	.L4:
9172	turbocat	401	retn
8210	maxcodehac	402
9172	turbocat	403	%ifidn __OUTPUT_FORMAT__,elf32
		404	section .note.GNU-stack noalloc noexec nowrite progbits
		405	%endif

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/SDL-1.2.2_newlib/src/hermes/mmxp2_32.asm – Rev 9172