/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */

#include <stdio.h>

#include "main/glheader.h"
#include "main/context.h"
#include "util/simple_list.h"
#include "main/enums.h"
#include "swrast/s_chan.h"
#include "t_context.h"
#include "t_vertex.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"


/**
 * Number of bytes to allocate for generated SSE functions
 */
#define MAX_SSE_CODE_SIZE 1024


#define X    0
#define Y    1
#define Z    2
#define W    3
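
/* X/Y/Z/W above are component indices for the SHUF() helper from
 * x86sse.h, which as far as I can tell packs four 2-bit lane selectors
 * into the 8-bit immediate that sse_shufps() emits: SHUF(X,X,X,X)
 * broadcasts the low float, and SHUF(Z,Y,X,W) swaps the first and third
 * components (used below for the BGR<->RGB swizzles).
 */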


struct x86_program {
   struct x86_function func;

   struct gl_context *ctx;
   GLboolean inputs_safe;
   GLboolean outputs_safe;
   GLboolean have_sse2;

   struct x86_reg identity;
   struct x86_reg chan0;
};
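
/* Note: identity and chan0 are XMM registers (XMM6/XMM7, assigned in
 * _tnl_generate_sse_emit below) that build_vertex_emit() preloads with
 * the attribute identity vector and the channel scale (255.0 per
 * component), so both stay live as constants across the whole generated
 * vertex loop.
 */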


static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}

static void emit_load4f_4( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

static void emit_load4f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
   sse_movlps(&p->func, dest, arg0);
}
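
/* The trick above relies on two SSE details: movss from memory clears
 * the upper three lanes (giving "c 0 0 0"), and movlps only replaces the
 * low two lanes, so the "c 1" shuffled into the high half survives while
 * "a b" is loaded underneath it.
 */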

static void emit_load4f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Initialize from identity, then pull in low two words:
    */
   sse_movups(&p->func, dest, get_identity(p));
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load4f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Pull in low word, then swizzle in identity */
   sse_movss(&p->func, dest, arg0);
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
}



static void emit_load3f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Over-reads by 1 dword - potential SEGV if input is a vertex
    * array.
    */
   if (p->inputs_safe) {
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* c 0 0 0
       * c c c c
       * a b c c
       */
      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
      sse_movlps(&p->func, dest, arg0);
   }
}

static void emit_load3f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}

static void emit_load3f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Loading from memory erases the upper bits. */
   sse_movss(&p->func, dest, arg0);
}

static void emit_load2f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load2f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Loading from memory erases the upper bits. */
   sse_movss(&p->func, dest, arg0);
}

static void emit_load1f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}

static void (*load[4][4])( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 ) = {
   { emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1 },

   { emit_load2f_1,
     emit_load2f_2,
     emit_load2f_2,
     emit_load2f_2 },

   { emit_load3f_1,
     emit_load3f_2,
     emit_load3f_3,
     emit_load3f_3 },

   { emit_load4f_1,
     emit_load4f_2,
     emit_load4f_3,
     emit_load4f_4 }
};
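
/* Dispatch table: load[dest_size-1][src_size-1] picks the loader for a
 * given output width and input width.  Rows with repeated entries cover
 * the cases where surplus source components can simply be ignored, e.g.
 * load[0][2] == emit_load1f_1 reads just the first float of a 3-float
 * input.
 */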

static void emit_load( struct x86_program *p,
		       struct x86_reg dest,
		       GLuint sz,
		       struct x86_reg src,
		       GLuint src_sz)
{
   load[sz-1][src_sz-1](p, dest, src);
}

static void emit_store4f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

static void emit_store3f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   if (p->outputs_safe) {
      /* Emit the extra dword anyway.  This may hurt writecombining,
       * may cause other problems.
       */
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* Alternate strategy - emit two, shuffle, emit one.
       */
      sse_movlps(&p->func, dest, arg0);
      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
   }
}
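
/* The unsafe path writes exactly 12 bytes: movlps stores the low 8,
 * shufps broadcasts the Z component into the low lane (clobbering arg0,
 * hence the NOTE), and movss stores the remaining 4 bytes at offset 8.
 */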

static void emit_store2f( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

static void emit_store1f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}


static void (*store[4])( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0 ) =
{
   emit_store1f,
   emit_store2f,
   emit_store3f,
   emit_store4f
};

static void emit_store( struct x86_program *p,
			struct x86_reg dest,
			GLuint sz,
			struct x86_reg temp )

{
   store[sz-1](p, dest, temp);
}

static void emit_pack_store_4ub( struct x86_program *p,
				 struct x86_reg dest,
				 struct x86_reg temp )
{
   /* Scale by 255.0
    */
   sse_mulps(&p->func, temp, p->chan0);

   if (p->have_sse2) {
      sse2_cvtps2dq(&p->func, temp, temp);
      sse2_packssdw(&p->func, temp, temp);
      sse2_packuswb(&p->func, temp, temp);
      sse_movss(&p->func, dest, temp);
   }
   else {
      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
      sse_cvtps2pi(&p->func, mmx0, temp);
      sse_movhlps(&p->func, temp, temp);
      sse_cvtps2pi(&p->func, mmx1, temp);
      mmx_packssdw(&p->func, mmx0, mmx1);
      mmx_packuswb(&p->func, mmx0, mmx0);
      mmx_movd(&p->func, dest, mmx0);
   }
}
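
/* Both paths implement float -> unsigned byte with saturation: convert
 * the four scaled floats to dwords, pack dwords to signed words, then
 * words to unsigned bytes, and store the resulting 4 bytes.  The MMX
 * fallback converts two floats at a time via cvtps2pi, which is why the
 * generated code has to finish with emms (see need_emms below).
 */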

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
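
/* Byte offset of b relative to a - in effect a relative offsetof(), so
 * the generated code can address a struct member off whatever register
 * holds the base pointer, e.g.:
 *
 *    x86_make_disp(vtxESI, get_offset(vtx, &a->inputptr))
 *
 * addresses a->inputptr relative to the register holding vtx.
 */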

/* Not much happens here.  Eventually use this function to try and
 * avoid saving/reloading the source pointers each vertex (if some of
 * them can fit in registers).
 */
static void get_src_ptr( struct x86_program *p,
			 struct x86_reg srcREG,
			 struct x86_reg vtxREG,
			 struct tnl_clipspace_attr *a )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
   struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));

   /* Load current a[j].inputptr
    */
   x86_mov(&p->func, srcREG, ptr_to_src);
}

static void update_src_ptr( struct x86_program *p,
			 struct x86_reg srcREG,
			 struct x86_reg vtxREG,
			 struct tnl_clipspace_attr *a )
{
   if (a->inputstride) {
      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
      struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));

      /* add a[j].inputstride (hardcoded value - could just as easily
       * pull the stride value from memory each time).
       */
      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));

      /* save new value of a[j].inputptr
       */
      x86_mov(&p->func, ptr_to_src, srcREG);
   }
}
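
/* lea adds the immediate displacement without touching the flags
 * register, which likely motivated its use as the pointer increment.
 * Since the stride is baked into the generated code, a change of stride
 * requires regenerating the function.
 */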


/* Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static GLboolean build_vertex_emit( struct x86_program *p )
{
   struct gl_context *ctx = p->ctx;
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   GLuint j = 0;

   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
   struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
   struct x86_reg temp = x86_make_reg(file_XMM, 0);
   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
   struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
   GLubyte *fixup, *label;
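
   /* The generated code is called as a tnl_emit_func (see the cast near
    * the end of this function), so x86_fn_arg(.., 1) should be ctx,
    * arg 2 the vertex count, and arg 3 the output vertex buffer.
    */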

   /* Push a few regs?
    */
   x86_push(&p->func, countEBP);
   x86_push(&p->func, vtxESI);


   /* Get vertex count, compare to zero
    */
   x86_xor(&p->func, srcECX, srcECX);
   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
   x86_cmp(&p->func, countEBP, srcECX);
   fixup = x86_jcc_forward(&p->func, cc_E);

   /* Initialize destination register.
    */
   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));

   /* Dereference ctx to get tnl, then vtx:
    */
   x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
   x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
   vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));


   /* Possibly load vp0, vp1 for viewport calcs:
    */
   if (vtx->need_viewport) {
      sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
      sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
   }

   /* always load, needed or not:
    */
   sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
   sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));

   /* Note address for loop jump */
   label = x86_get_label(&p->func);

   /* Emit code for each of the attributes.  Currently routes
    * everything through SSE registers, even when it might be more
    * efficient to stick with regular old x86.  No optimization or
    * other tricks - enough new ground to cover here just getting
    * things working.
    */
   while (j < vtx->attr_count) {
      struct tnl_clipspace_attr *a = &vtx->attr[j];
      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);

      /* Now, load an XMM reg from src, perhaps transform, then save.
       * Could be shortcircuited in specific cases:
       */
      switch (a->format) {
      case EMIT_1F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 1, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F:
	 /* Potentially the worst case - hardcode 2+1 copying:
	  */
	 if (0) {
	    /* Disabled: the hardcoded 2+1 path below is used instead. */
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 3, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 2, temp);
	    if (a->inputsize > 2) {
	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
	       emit_store(p, x86_make_disp(dest,8), 1, temp);
	    }
	    else {
	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
	    }
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 break;
      case EMIT_4F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_XYW:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;

      case EMIT_1UB_1F:
	 /* Test for PAD3 + 1UB:
	  */
	 if (j > 0 &&
	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
	 {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
	    return GL_FALSE;
	 }
	 break;
      case EMIT_3UB_3F_RGB:
      case EMIT_3UB_3F_BGR:
	 /* Test for 3UB + PAD1:
	  */
	 if (j == vtx->attr_count - 1 ||
	     a[1].vertoffset >= a->vertoffset + 4) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 /* Test for 3UB + 1UB:
	  */
	 else if (j < vtx->attr_count - 1 &&
		  a[1].format == EMIT_1UB_1F &&
		  a[1].vertoffset == a->vertoffset + 3) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    update_src_ptr(p, srcECX, vtxESI, a);

	    /* Make room for incoming value:
	     */
	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));

	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
	    emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
	    sse_movss(&p->func, temp, temp2);
	    update_src_ptr(p, srcECX, vtxESI, &a[1]);

	    /* Rearrange and possibly do BGR conversion:
	     */
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
	    else
	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));

	    emit_pack_store_4ub(p, dest, temp);
	    j++;		/* NOTE: two attrs consumed */
	 }
	 else {
	    printf("Can't emit 3ub\n");
	    return GL_FALSE;	/* add this later */
	 }
	 break;

      case EMIT_4UB_4F_RGBA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_BGRA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ARGB:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ABGR:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4CHAN_4F_RGBA:
	 switch (CHAN_TYPE) {
	 case GL_UNSIGNED_BYTE:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_FLOAT:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 4, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_UNSIGNED_SHORT:
	 default:
	    printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
	    return GL_FALSE;
	 }
	 break;
      default:
	 printf("unknown a[%d].format %d\n", j, a->format);
	 return GL_FALSE;	/* catch any new opcodes */
      }

      /* Increment j by at least 1 - may have been incremented above also:
       */
      j++;
   }

   /* Next vertex:
    */
   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));

   /* decr count, loop if not zero
    */
   x86_dec(&p->func, countEBP);
   x86_test(&p->func, countEBP, countEBP);
   x86_jcc(&p->func, cc_NZ, label);
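
   /* Note: this is a do-while loop.  The count == 0 case was filtered
    * out by the forward jump emitted before the loop body, so at least
    * one iteration is guaranteed here.
    */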

   /* Exit mmx state?
    */
   if (p->func.need_emms)
      mmx_emms(&p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(&p->func, fixup);

   /* Pop regs and return
    */
   x86_pop(&p->func, x86_get_base_reg(vtxESI));
   x86_pop(&p->func, countEBP);
   x86_ret(&p->func);

   assert(!vtx->emit);
   vtx->emit = (tnl_emit_func)x86_get_func(&p->func);

   assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
   return GL_TRUE;
}



void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   struct x86_program p;

   if (!cpu_has_xmm) {
      vtx->codegen_emit = NULL;
      return;
   }
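
   /* cpu_has_xmm / cpu_has_xmm2 come from the runtime CPUID detection
    * exposed by common_x86_asm.h: plain SSE is required up front, while
    * SSE2 merely selects the faster packing path in
    * emit_pack_store_4ub().
    */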

   memset(&p, 0, sizeof(p));

   p.ctx = ctx;
   p.inputs_safe = 0;		/* for now */
   p.outputs_safe = 0;		/* for now */
   p.have_sse2 = cpu_has_xmm2;
   p.identity = x86_make_reg(file_XMM, 6);
   p.chan0 = x86_make_reg(file_XMM, 7);
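
   /* XMM6/XMM7 are deliberately outside the temp/vp0/vp1/temp2 set
    * (XMM0-XMM3) used by build_vertex_emit(), so the two constants
    * survive the whole generated loop.
    */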

   if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
      vtx->emit = NULL;
      return;
   }

   if (build_vertex_emit(&p)) {
      _tnl_register_fastpath( vtx, GL_TRUE );
   }
   else {
      /* Note the failure so that we don't keep trying to codegen an
       * impossible state:
       */
      _tnl_register_fastpath( vtx, GL_FALSE );
      x86_release_func(&p.func);
   }
}

#else

void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   /* Dummy version for when USE_SSE_ASM not defined */
}

#endif