/* read_rgba_span_x86.S — x86 (32-bit, AT&T syntax) framebuffer span readers. */
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick
 */
32 | |||
	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
/* Build the two byte-select masks on the stack and load them into the
 * given registers using the supplied move instruction (movq for MMX
 * targets, movdqu for SSE2/xmm targets):
 *   m1 <- 0xff00ff00 replicated (bytes of the pixel that stay in place)
 *   m2 <- 0x00ff0000 replicated (byte lane exchanged by the 16-bit/2-byte
 *                                shifts in the conversion kernels)
 * 16 bytes are pushed per mask so a full 128-bit load from (%esp) is
 * valid; the trailing addl releases all 32 bytes and clobbers EFLAGS.
 */
#define LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2 ;\
	addl	$32, %esp
50 | |||
/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

/* Convert one BGRA8888_REV pixel: load 4 bytes from the source (%ebx),
 * reorder the bytes with bswap + rorl, store to the destination (%ecx),
 * and advance both pointers by 4.  Clobbers %eax and EFLAGS.
 */
#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx
63 | |||
/* Same conversion as DO_ONE_PIXEL(), but for the final pixel of a span:
 * neither the source (%ebx) nor destination (%ecx) pointer is advanced
 * afterwards.  Clobbers %eax and EFLAGS.
 */
#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */ ; \

70 | |||
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack args (cdecl): source pointer, destination pointer, pixel count
 * (roles taken from the annotated loads below).  The count is treated as
 * signed (jle).  Clobbers %eax, %ecx, %edx, %mm0-%mm4, EFLAGS; %ebx is
 * saved/restored.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx			/* callee-saved; shifts arg offsets by 4 */

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx		/* source pointer */
	movl	16(%esp), %edx		/* number of pixels to copy */
	movl	12(%esp), %ecx		/* destination pointer */

	testl	%edx, %edx
	jle	.L20			/* Bail if there's nothing to do. */

	movl	%ebx, %eax

	/* %eax = ((-src) >> 2) & 1: 1 iff the (4-byte aligned) source is
	 * not 8-byte aligned; convert one pixel first so the movq loads
	 * below hit 8-byte boundaries.
	 */
	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax			/* %eax = pairs of pixels; sets ZF for .L18 */
	jmp	.L18
.L19:
	movq	(%ebx), %mm0		/* two BGRA8888_REV pixels */
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3		/* isolate the swap lane ... */
	psllq	$16, %mm4
	psrlq	$16, %mm3		/* ... and exchange it both directions */
	pand	%mm2, %mm4

	pand	%mm1, %mm0		/* bytes that keep their position */
	por	%mm4, %mm3
	por	%mm3, %mm0		/* recombine */

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19			/* flags from shrl (entry) or subl (loop) */

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
||
158 | |||
159 | |||
/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * Stack args (cdecl): source pointer, destination pointer, pixel count
 * (roles taken from the annotated loads below).  Clobbers %eax, %ecx,
 * %edx, %mm0-%mm7, %xmm0, EFLAGS; %esi/%ebx/%ebp are saved/restored.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx		/* source pointer */
	movl	24(%esp), %edx		/* number of pixels to copy */
	movl	20(%esp), %ecx		/* destination pointer */

	testl	%edx, %edx
	jle	.L35			/* Bail if there's nothing to do. */

	/* Carve out a 16-byte-aligned scratch slot for the movaps spill in
	 * the main loop; %ebp holds the original %esp for restoration.
	 */
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	/* %eax = pixels (0..3) needed to reach 16-byte source alignment;
	 * %esi = min(%eax, count) is handled by the scalar/MMX paths below.
	 */
	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx		/* %edx = pixels left for the aligned loop */

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	/* Two-pixel MMX conversion — same kernel as the MMX routine. */
	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax		/* %eax = groups of 4 pixels; sets ZF for .L33 */
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0		/* aligned 16-byte (4 pixel) fetch */
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)		/* spill to the aligned scratch slot */
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34			/* flags from shrl (entry) or subl (loop) */

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp		/* drop the aligned scratch area */

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
||
330 | |||
331 | |||
/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack args (cdecl): source pointer, destination pointer, pixel count
 * (roles taken from the annotated loads below).  Pure SSE2 — no MMX state
 * is touched, so no EMMS is needed.  Clobbers %eax, %ecx, %edx,
 * %xmm0-%xmm4, EFLAGS; %esi/%ebx are saved/restored.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx		/* source pointer */
	movl	20(%esp), %edx		/* number of pixels to copy */
	movl	16(%esp), %ecx		/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46			/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch intsructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax		/* %eax = pixels (0..3) to alignment */

	cmpl	%edx, %eax
	cmovbe	%eax, %esi		/* %esi = min(%eax, count) */
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	/* Two pixels: movq fills only the low 8 bytes of %xmm0.  The
	 * pslldq/psrldq byte-shifts by 2 match the 16-bit psllq/psrlq
	 * shifts of the MMX kernels.
	 */
	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0		/* bytes that keep their position */

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3		/* exchange the swap lane both ways */
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0		/* recombine */

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movedqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax		/* %eax = groups of 4 pixels; sets ZF for .L42 */
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0		/* aligned 4-pixel fetch */
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)		/* destination alignment is unknown */
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43			/* flags from shrl (entry) or subl (loop) */


	/* There may be upto 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
||
465 | |||
466 | |||
467 | |||
/* Constants for the RGB565 -> RGBA conversion below.  Each _L/_H pair is
 * the low/high dword of one 64-bit MMX constant (the function pushes _H
 * then _L, so _L lands at the lower address, and movq-loads the pair).
 * MASK_565 isolates one 565 component per 16-bit lane of a pshufw-
 * replicated pixel: lane 0 = red bits, lane 1 = green, lane 2 = blue,
 * lane 3 = 0 (alpha slot).
 */
#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L	0x00100001
#define PRESCALE_H	0x00000200
#define SCALE_L		0x40C620E8
#define SCALE_H		0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L	0x00200001
#define PRESCALE_H	0x00000800
#define SCALE_L		0x01040108
#define SCALE_H		0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
/* Alpha constant: only the word lane for the alpha slot is 0x00ff. */
#define ALPHA_L		0x00000000
#define ALPHA_H		0x00ff0000
||
490 | |||
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Stack args (cdecl): source pointer, destination pointer, pixel count
 * (roles taken from the annotated loads below).  Uses no callee-saved
 * registers, so there is no prologue.  Clobbers %eax, %ecx, %edx,
 * %mm0, %mm2-%mm7, EFLAGS.
 *
 * NOTE(review): for a count that is not a multiple of 4, the 2- and
 * 1-pixel tails still store 8 / 4 bytes — the destination buffer is
 * presumably sized for that; confirm against the callers.
 */

	.text
.globl _generic_read_RGBA_span_RGB565_MMX
.hidden _generic_read_RGBA_span_RGB565_MMX
	.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax		/* source pointer */
	movl	8(%esp), %edx		/* destination pointer */
	movl	12(%esp), %ecx		/* number of pixels to copy */

	/* Materialize the four 64-bit constants (see the #defines above):
	 * %mm5 = 565 component mask, %mm6 = prescale multipliers,
	 * %mm7 = scale multipliers, %mm3 = alpha lane.
	 */
	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	sarl	$2, %ecx		/* %ecx = groups of 4 pixels */
	jl	.L01			/* Bail early if the count is negative. */
	jmp	.L02			/* jne at .L02 uses the flags from sarl */

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0	/* pixel 0 in all four word lanes */
	pshufw	$0x55, %mm4, %mm2	/* pixel 1 in all four word lanes */


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This it the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	/* Repeat the identical conversion for pixels 2 and 3 of the group. */
	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03			/* flags from sarl (entry) or subl (loop) */


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx		/* reload count (no pushes outstanding) */
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4		/* two 565 pixels */
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx		/* load exactly one 16-bit pixel */
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
||
675 | |||
676 | #if defined (__ELF__) && defined (__linux__) |
||
677 | .section .note.GNU-stack,"",%progbits |
||
678 | #endif |