/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Marek Olšák
 */

/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
 * live in memory on SI.
 *
 * This file is responsible for managing lists of resources and sampler states
 * in memory and binding them, which means updating those structures in memory.
 *
 * There is also code for updating shader pointers to resources and sampler
 * states. CP DMA functions are here too.
 */

#include "radeon/r600_cs.h"
#include "si_pipe.h"
#include "si_shader.h"
#include "sid.h"

#include "util/u_memory.h"
#include "util/u_upload_mgr.h"

#define SI_NUM_CONTEXTS 16

/* NULL image and buffer descriptor.
 *
 * For images, all fields must be zero except for the swizzle, which
 * supports arbitrary combinations of 0s and 1s. The texture type must be
 * any valid type (e.g. 1D). If the texture type isn't set, the hw hangs.
 *
 * For buffers, all fields must be zero. If they are not, the hw hangs.
 *
 * This is the only reason why the buffer descriptor must be in words [4:7].
 */
static uint32_t null_descriptor[8] = {
	0,
	0,
	0,
	S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
	/* the rest must contain zeros, which is also used by the buffer
	 * descriptor */
};
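
/* Words [0:3] above form the (1D image) part and words [4:7] stay zero, so
 * the same 8-dword slot doubles as a NULL buffer descriptor. This is why
 * buffer resources are stored in words [4:7] of a sampler view descriptor
 * (see si_invalidate_buffer, which patches &view->state[4]). */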

/* Set this if you want the 3D engine to wait until CP DMA is done.
 * It should be set on the last CP DMA packet. */
#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */

/* Set this if the source data was used as a destination in a previous CP DMA
 * packet. It's for preventing a read-after-write (RAW) hazard between two
 * CP DMA packets. */
#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
#define CIK_CP_DMA_USE_L2	(1 << 2)

/* Emit a CP DMA packet to do a copy from one buffer to another.
 * The size must fit in bits [20:0].
 */
static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
				       uint64_t dst_va, uint64_t src_va,
				       unsigned size, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
			PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	}
}

/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
					uint64_t dst_va, unsigned size,
					uint32_t clear_value, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL [30:29] */
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, 0);
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL [30:29] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	}
}

static void si_init_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				unsigned shader_userdata_reg,
				unsigned element_dw_size,
				unsigned num_elements,
				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
{
	assert(num_elements <= sizeof(desc->enabled_mask)*8);
	assert(num_elements <= sizeof(desc->dirty_mask)*8);

	desc->atom.emit = (void*)emit_func;
	desc->shader_userdata_reg = shader_userdata_reg;
	desc->element_dw_size = element_dw_size;
	desc->num_elements = num_elements;
	desc->context_size = num_elements * element_dw_size * 4;

	desc->buffer = (struct r600_resource*)
		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
				   PIPE_USAGE_DEFAULT,
				   SI_NUM_CONTEXTS * desc->context_size);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	/* We don't check for CS space here, because this should be called
	 * only once at context initialization. */
	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
				    desc->buffer->b.b.width0, 0,
				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
}

static void si_release_descriptors(struct si_descriptors *desc)
{
	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
}

static void si_update_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc)
{
	if (desc->dirty_mask) {
		desc->atom.num_dw =
			7 + /* copy */
			(4 + desc->element_dw_size) * util_bitcount64(desc->dirty_mask) + /* update */
			4; /* pointer update */

		if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
		    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0)
			desc->atom.num_dw += 4; /* second pointer update */
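		/* Example: three dirty 8-dword sampler views bound through the
		 * VS user data SGPRs cost 7 + 3*(4+8) + 4 + 4 = 51 dwords. */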

		desc->atom.dirty = true;

		/* TODO: Investigate if these flushes can be removed after
		 * adding CE support. */

		/* The descriptors are read with the K cache. */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;

		/* Since SI uses uncached CP DMA to update descriptors,
		 * we have to flush TC L2, which is used to fetch constants
		 * along with KCACHE. */
		if (sctx->b.chip_class == SI)
			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
	} else {
		desc->atom.dirty = false;
	}
}

static void si_emit_shader_pointer(struct si_context *sctx,
				   struct r600_atom *atom)
{
	struct si_descriptors *desc = (struct si_descriptors*)atom;
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va = desc->buffer->gpu_address +
		      desc->current_context_id * desc->context_size +
		      desc->buffer_offset;

	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);

	if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
	    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) {
		radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
		radeon_emit(cs, (desc->shader_userdata_reg +
				 (R_00B330_SPI_SHADER_USER_DATA_ES_0 -
				  R_00B130_SPI_SHADER_USER_DATA_VS_0) -
				 SI_SH_REG_OFFSET) >> 2);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
	}
}

static void si_emit_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				uint32_t **descriptors)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va_base;
	int packet_start = 0;
	int packet_size = 0;
	int last_index = desc->num_elements; /* point to a non-existing element */
	uint64_t dirty_mask = desc->dirty_mask;
	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;

	assert(dirty_mask);

	va_base = desc->buffer->gpu_address;

	/* Copy the descriptors to a new context slot. */
	si_emit_cp_dma_copy_buffer(sctx,
				   va_base + new_context_id * desc->context_size,
				   va_base + desc->current_context_id * desc->context_size,
				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);

	va_base += new_context_id * desc->context_size;

	/* Update the descriptors.
	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
	 *
	 * XXX When unbinding lots of resources, consider clearing the memory
	 * with CP DMA instead of emitting zeros.
	 */
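	/* Example: with element_dw_size = 8 and dirty slots {1,2,3,6},
	 * slots 1-3 are merged into one WRITE_DATA packet carrying 24 data
	 * dwords, and slot 6 starts a second packet. */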
	while (dirty_mask) {
		int i = u_bit_scan64(&dirty_mask);

		assert(i < desc->num_elements);

		if (last_index+1 == i && packet_size) {
			/* Append new data at the end of the last packet. */
			packet_size += desc->element_dw_size;
			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
		} else {
			/* Start a new packet. */
			uint64_t va = va_base + i * desc->element_dw_size * 4;

			packet_start = cs->cdw;
			packet_size = 2 + desc->element_dw_size;

			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
						PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
						PKT3_WRITE_DATA_DST_SEL_TC_L2) |
					PKT3_WRITE_DATA_WR_CONFIRM |
					PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
			radeon_emit(cs, va & 0xFFFFFFFFUL);
			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
		}

		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);

		last_index = i;
	}

	desc->dirty_mask = 0;
	desc->current_context_id = new_context_id;

	/* Now update the shader userdata pointer. */
	si_emit_shader_pointer(sctx, &desc->atom);
}

static unsigned si_get_shader_user_data_base(unsigned shader)
{
	switch (shader) {
	case PIPE_SHADER_VERTEX:
		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
	case PIPE_SHADER_GEOMETRY:
		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
	case PIPE_SHADER_FRAGMENT:
		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
	default:
		assert(0);
		return 0;
	}
}

/* SAMPLER VIEWS */

static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_views *views = (struct si_sampler_views*)atom;

	si_emit_descriptors(sctx, &views->desc, views->desc_data);
}

static void si_init_sampler_views(struct si_context *sctx,
				  struct si_sampler_views *views,
				  unsigned shader)
{
	int i;

	si_init_descriptors(sctx, &views->desc,
			    si_get_shader_user_data_base(shader) +
			    SI_SGPR_RESOURCE * 4,
			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);

	for (i = 0; i < views->desc.num_elements; i++) {
		views->desc_data[i] = null_descriptor;
		views->desc.dirty_mask |= 1llu << i;
	}
	si_update_descriptors(sctx, &views->desc);
}

static void si_release_sampler_views(struct si_sampler_views *views)
{
	int i;

	for (i = 0; i < Elements(views->views); i++) {
		pipe_sampler_view_reference(&views->views[i], NULL);
	}
	si_release_descriptors(&views->desc);
}

static enum radeon_bo_priority si_get_resource_ro_priority(struct r600_resource *res)
{
	if (res->b.b.target == PIPE_BUFFER)
		return RADEON_PRIO_SHADER_BUFFER_RO;

	if (res->b.b.nr_samples > 1)
		return RADEON_PRIO_SHADER_TEXTURE_MSAA;

	return RADEON_PRIO_SHADER_TEXTURE_RO;
}

static void si_sampler_views_begin_new_cs(struct si_context *sctx,
					  struct si_sampler_views *views)
{
	uint64_t mask = views->desc.enabled_mask;

	/* Add relocations to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);
		struct si_sampler_view *rview =
			(struct si_sampler_view*)views->views[i];

		if (!rview->resource)
			continue;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      rview->resource, RADEON_USAGE_READ,
				      si_get_resource_ro_priority(rview->resource));
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &views->desc.atom);
}

static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
				unsigned slot, struct pipe_sampler_view *view,
				unsigned *view_desc)
{
	struct si_sampler_views *views = &sctx->samplers[shader].views;

	if (views->views[slot] == view)
		return;

	if (view) {
		struct si_sampler_view *rview =
			(struct si_sampler_view*)view;

		if (rview->resource)
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      rview->resource, RADEON_USAGE_READ,
					      si_get_resource_ro_priority(rview->resource));

		pipe_sampler_view_reference(&views->views[slot], view);
		views->desc_data[slot] = view_desc;
		views->desc.enabled_mask |= 1llu << slot;
	} else {
		pipe_sampler_view_reference(&views->views[slot], NULL);
		views->desc_data[slot] = null_descriptor;
		views->desc.enabled_mask &= ~(1llu << slot);
	}

	views->desc.dirty_mask |= 1llu << slot;
}

static void si_set_sampler_views(struct pipe_context *ctx,
				 unsigned shader, unsigned start,
				 unsigned count,
				 struct pipe_sampler_view **views)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_textures_info *samplers = &sctx->samplers[shader];
	struct si_sampler_view **rviews = (struct si_sampler_view **)views;
	int i;

	if (!count || shader >= SI_NUM_SHADERS)
		return;

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!views || !views[i]) {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, shader, slot, NULL, NULL);
			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
					    NULL, NULL);
			continue;
		}

		si_set_sampler_view(sctx, shader, slot, views[i], rviews[i]->state);

		if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
			struct r600_texture *rtex =
				(struct r600_texture*)views[i]->texture;

			if (rtex->is_depth && !rtex->is_flushing_texture) {
				samplers->depth_texture_mask |= 1 << slot;
			} else {
				samplers->depth_texture_mask &= ~(1 << slot);
			}
			if (rtex->cmask.size || rtex->fmask.size) {
				samplers->compressed_colortex_mask |= 1 << slot;
			} else {
				samplers->compressed_colortex_mask &= ~(1 << slot);
			}

			if (rtex->fmask.size) {
				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
						    views[i], rviews[i]->fmask_state);
			} else {
				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
						    NULL, NULL);
			}
		} else {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
					    NULL, NULL);
		}
	}

	si_update_descriptors(sctx, &samplers->views.desc);
}

/* SAMPLER STATES */

static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_states *states = (struct si_sampler_states*)atom;

	si_emit_descriptors(sctx, &states->desc, states->desc_data);
}

static void si_sampler_states_begin_new_cs(struct si_context *sctx,
					   struct si_sampler_states *states)
{
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
	si_emit_shader_pointer(sctx, &states->desc.atom);
}

void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
				unsigned start, unsigned count, void **states)
{
	struct si_sampler_states *samplers = &sctx->samplers[shader].states;
	struct si_sampler_state **sstates = (struct si_sampler_state**)states;
	int i;

	if (start == 0)
		samplers->saved_states[0] = states[0];
	if (start == 1)
		samplers->saved_states[1] = states[0];
	else if (start == 0 && count >= 2)
		samplers->saved_states[1] = states[1];

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!sstates[i]) {
			samplers->desc.dirty_mask &= ~(1llu << slot);
			continue;
		}

		samplers->desc_data[slot] = sstates[i]->val;
		samplers->desc.dirty_mask |= 1llu << slot;
	}

	si_update_descriptors(sctx, &samplers->desc);
}

/* BUFFER RESOURCES */

static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;

	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
}

static void si_init_buffer_resources(struct si_context *sctx,
				     struct si_buffer_resources *buffers,
				     unsigned num_buffers, unsigned shader,
				     unsigned shader_userdata_index,
				     enum radeon_bo_usage shader_usage,
				     enum radeon_bo_priority priority)
{
	int i;

	buffers->num_buffers = num_buffers;
	buffers->shader_usage = shader_usage;
	buffers->priority = priority;
	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);

	/* si_emit_descriptors only accepts an array of arrays.
	 * This adds such an array. */
	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
	for (i = 0; i < num_buffers; i++) {
		buffers->desc_data[i] = &buffers->desc_storage[i*4];
	}

	si_init_descriptors(sctx, &buffers->desc,
			    si_get_shader_user_data_base(shader) +
			    shader_userdata_index*4, 4, num_buffers,
			    si_emit_buffer_resources);
}

static void si_release_buffer_resources(struct si_buffer_resources *buffers)
{
	int i;

	for (i = 0; i < buffers->num_buffers; i++) {
		pipe_resource_reference(&buffers->buffers[i], NULL);
	}

	FREE(buffers->buffers);
	FREE(buffers->desc_storage);
	FREE(buffers->desc_data);
	si_release_descriptors(&buffers->desc);
}

static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
					     struct si_buffer_resources *buffers)
{
	uint64_t mask = buffers->desc.enabled_mask;

	/* Add relocations to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffers->buffers[i],
				      buffers->shader_usage, buffers->priority);
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
			      RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &buffers->desc.atom);
}

/* VERTEX BUFFERS */

static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
	int i;

	for (i = 0; i < count; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
				      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
	}
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &desc->atom);
}

void si_update_vertex_buffers(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
	unsigned i, count;
	uint64_t va;
	uint32_t *ptr;

	if (!sctx->vertex_elements)
		return;

	count = sctx->vertex_elements->count;
	if (!count)
		return;

	/* Vertex buffer descriptors are the only ones which are uploaded
	 * directly through a staging buffer and don't go through
	 * the fine-grained upload path.
	 */
	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	assert(count <= SI_NUM_VERTEX_BUFFERS);
	assert(desc->current_context_id == 0);

	for (i = 0; i < count; i++) {
		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
		struct pipe_vertex_buffer *vb;
		struct r600_resource *rbuffer;
		unsigned offset;
		uint32_t *desc = &ptr[i*4];

		if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
			memset(desc, 0, 16);
			continue;
		}

		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
		rbuffer = (struct r600_resource*)vb->buffer;
		if (rbuffer == NULL) {
			memset(desc, 0, 16);
			continue;
		}

		offset = vb->buffer_offset + ve->src_offset;
		va = rbuffer->gpu_address + offset;

		/* Fill in T# buffer resource description */
		desc[0] = va & 0xFFFFFFFF;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(vb->stride);
		if (vb->stride)
			/* Round up by rounding down and adding 1 */
			desc[2] = (vb->buffer->width0 - offset -
				   sctx->vertex_elements->format_size[i]) /
				  vb->stride + 1;
		else
			desc[2] = vb->buffer->width0 - offset;
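		/* Example: width0 = 100, offset = 0, stride = 16,
		 * format_size = 8 -> (100 - 8) / 16 + 1 = 6 fetchable
		 * elements (the last one starts at byte 80). */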

		desc[3] = sctx->vertex_elements->rsrc_word3[i];

		if (!bound[ve->vertex_buffer_index]) {
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)vb->buffer,
					      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
			bound[ve->vertex_buffer_index] = true;
		}
	}

	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
	desc->atom.dirty = true;

	/* Don't flush the const cache. It would have a very negative effect
	 * on performance (confirmed by testing). New descriptors are always
	 * uploaded to a fresh new buffer, so I don't think flushing the const
	 * cache is needed. */
}


/* CONSTANT BUFFERS */

void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
			    const uint8_t *ptr, unsigned size, uint32_t *const_offset)
{
	void *tmp;

	u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
		       (struct pipe_resource**)rbuffer, &tmp);
	util_memcpy_cpu_to_le32(tmp, ptr, size);
}

static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
				   struct pipe_constant_buffer *input)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->const_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	assert(slot < buffers->num_buffers);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
	 * with a NULL buffer). We need to use a dummy buffer instead. */
	if (sctx->b.chip_class == CIK &&
	    (!input || (!input->buffer && !input->user_buffer)))
		input = &sctx->null_const_buf;

	if (input && (input->buffer || input->user_buffer)) {
		struct pipe_resource *buffer = NULL;
		uint64_t va;

		/* Upload the user buffer if needed. */
		if (input->user_buffer) {
			unsigned buffer_offset;

			si_upload_const_buffer(sctx,
					       (struct r600_resource**)&buffer, input->user_buffer,
					       input->buffer_size, &buffer_offset);
			va = r600_resource(buffer)->gpu_address + buffer_offset;
		} else {
			pipe_resource_reference(&buffer, input->buffer);
			va = r600_resource(buffer)->gpu_address + input->buffer_offset;
		}

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc_data[slot];
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(0);
		desc[2] = input->buffer_size;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

		buffers->buffers[slot] = buffer;
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.dirty_mask |= 1llu << slot;
	si_update_descriptors(sctx, &buffers->desc);
}

/* RING BUFFERS */

void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
			struct pipe_resource *buffer,
			unsigned stride, unsigned num_records,
			bool add_tid, bool swizzle,
			unsigned element_size, unsigned index_stride)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	/* The stride field in the resource descriptor has 14 bits */
	assert(stride < (1 << 14));

	assert(slot < buffers->num_buffers);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	if (buffer) {
		uint64_t va;

		va = r600_resource(buffer)->gpu_address;

		switch (element_size) {
		default:
			assert(!"Unsupported ring buffer element size");
		case 0:
		case 2:
			element_size = 0;
			break;
		case 4:
			element_size = 1;
			break;
		case 8:
			element_size = 2;
			break;
		case 16:
			element_size = 3;
			break;
		}

		switch (index_stride) {
		default:
			assert(!"Unsupported ring buffer index stride");
		case 0:
		case 8:
			index_stride = 0;
			break;
		case 16:
			index_stride = 1;
			break;
		case 32:
			index_stride = 2;
			break;
		case 64:
			index_stride = 3;
			break;
		}
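		/* Both switches map byte sizes to the log2-style hw field
		 * encodings: ELEMENT_SIZE 0..3 selects 2/4/8/16 bytes and
		 * INDEX_STRIDE 0..3 selects 8/16/32/64 bytes, with an input
		 * of 0 falling back to the smallest encoding. */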

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc_data[slot];
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(stride) |
			  S_008F04_SWIZZLE_ENABLE(swizzle);
		desc[2] = num_records;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
			  S_008F0C_ELEMENT_SIZE(element_size) |
			  S_008F0C_INDEX_STRIDE(index_stride) |
			  S_008F0C_ADD_TID_ENABLE(add_tid);

		pipe_resource_reference(&buffers->buffers[slot], buffer);
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.dirty_mask |= 1llu << slot;
	si_update_descriptors(sctx, &buffers->desc);
}

/* STREAMOUT BUFFERS */

static void si_set_streamout_targets(struct pipe_context *ctx,
				     unsigned num_targets,
				     struct pipe_stream_output_target **targets,
				     const unsigned *offsets)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
	unsigned old_num_targets = sctx->b.streamout.num_targets;
	unsigned i, bufidx;

	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
	if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
		/* Since streamout uses vector writes which go through TC L2
		 * and most other clients can use TC L2 as well, we don't need
		 * to flush it.
		 *
		 * The only case which requires flushing it is VGT DMA index
		 * fetching, which is a rare case. Thus, flag the TC L2
		 * dirtiness in the resource and handle it when index fetching
		 * is used.
		 */
		for (i = 0; i < sctx->b.streamout.num_targets; i++)
			if (sctx->b.streamout.targets[i])
				r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

		/* Invalidate the scalar cache in case a streamout buffer is
		 * going to be used as a constant buffer.
		 *
		 * Invalidate TC L1, because streamout bypasses it (done by
		 * setting GLC=1 in the store instruction), but it can contain
		 * outdated data of streamout buffers.
		 *
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
				 SI_CONTEXT_INV_TC_L1 |
				 SI_CONTEXT_VS_PARTIAL_FLUSH;
	}

	/* Streamout buffers must be bound in 2 places:
	 * 1) in VGT by setting the VGT_STRMOUT registers
	 * 2) as shader resources
	 */

	/* Set the VGT regs. */
	r600_set_streamout_targets(ctx, num_targets, targets, offsets);

	/* Set the shader resources. */
	for (i = 0; i < num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;

		if (targets[i]) {
			struct pipe_resource *buffer = targets[i]->buffer;
			uint64_t va = r600_resource(buffer)->gpu_address;

			/* Set the descriptor. */
			uint32_t *desc = buffers->desc_data[bufidx];
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			desc[2] = 0xffffffff;
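			/* NUM_RECORDS is presumably left at the maximum here
			 * because the real streamout bounds are enforced by
			 * the VGT_STRMOUT registers programmed above. */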
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

			/* Set the resource. */
			pipe_resource_reference(&buffers->buffers[bufidx],
						buffer);
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)buffer,
					      buffers->shader_usage, buffers->priority);
			buffers->desc.enabled_mask |= 1llu << bufidx;
		} else {
			/* Clear the descriptor and unset the resource. */
			memset(buffers->desc_data[bufidx], 0,
			       sizeof(uint32_t) * 4);
			pipe_resource_reference(&buffers->buffers[bufidx],
						NULL);
			buffers->desc.enabled_mask &= ~(1llu << bufidx);
		}
		buffers->desc.dirty_mask |= 1llu << bufidx;
	}
	for (; i < old_num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;
		/* Clear the descriptor and unset the resource. */
		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
		buffers->desc.enabled_mask &= ~(1llu << bufidx);
		buffers->desc.dirty_mask |= 1llu << bufidx;
	}

	si_update_descriptors(sctx, &buffers->desc);
}

static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
					uint32_t *desc, uint64_t old_buf_va,
					struct pipe_resource *new_buf)
{
	/* Retrieve the buffer offset from the descriptor. */
	uint64_t old_desc_va =
		desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);

	assert(old_buf_va <= old_desc_va);
	uint64_t offset_within_buffer = old_desc_va - old_buf_va;

	/* Update the descriptor. */
	uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;

	desc[0] = va;
	desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
		  S_008F04_BASE_ADDRESS_HI(va >> 32);
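
	/* Example: a descriptor that pointed 256 bytes into the old buffer
	 * now points 256 bytes into new_buf; only the address words [0:1]
	 * change, the size and format words stay untouched. */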
}

/* BUFFER DISCARD/INVALIDATION */

/* Reallocate a buffer and update all resource bindings where the buffer is
 * bound.
 *
 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
 * idle by discarding its contents. Apps usually tell us when to do this using
 * map_buffer flags, for example.
 */
static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
{
	struct si_context *sctx = (struct si_context*)ctx;
	struct r600_resource *rbuffer = r600_resource(buf);
	unsigned i, shader, alignment = rbuffer->buf->alignment;
	uint64_t old_va = rbuffer->gpu_address;
	unsigned num_elems = sctx->vertex_elements ?
			     sctx->vertex_elements->count : 0;
	struct si_sampler_view *view;

	/* Reallocate the buffer in the same pipe_resource. */
	r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
			   alignment, TRUE);

	/* We changed the buffer, now we need to bind it where the old one
	 * was bound. This consists of 2 things:
	 * 1) Updating the resource descriptor and dirtying it.
	 * 2) Adding a relocation to the CS, so that it's usable.
	 */

	/* Vertex buffers. */
	for (i = 0; i < num_elems; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		if (sctx->vertex_buffer[vb].buffer == buf) {
			sctx->vertex_buffers_dirty = true;
			break;
		}
	}

	/* Read/Write buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
		bool found = false;
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
							    old_va, buf);

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1llu << i;
				found = true;

				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
					/* Update the streamout state. */
					if (sctx->b.streamout.begin_emitted) {
						r600_emit_streamout_end(&sctx->b);
					}
					sctx->b.streamout.append_bitmask =
						sctx->b.streamout.enabled_mask;
					r600_streamout_buffers_dirty(&sctx->b);
				}
			}
		}
		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Constant buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
		bool found = false;
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
							    old_va, buf);

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1llu << i;
				found = true;
			}
		}
		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Texture buffers - update virtual addresses in sampler view descriptors. */
	LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
		if (view->base.texture == buf) {
			si_desc_reset_buffer_offset(ctx, &view->state[4], old_va, buf);
		}
	}
	/* Texture buffers - update bindings. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_sampler_views *views = &sctx->samplers[shader].views;
		bool found = false;
		uint64_t mask = views->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (views->views[i]->texture == buf) {
				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, RADEON_USAGE_READ,
						      RADEON_PRIO_SHADER_BUFFER_RO);

				views->desc.dirty_mask |= 1llu << i;
				found = true;
			}
		}
		if (found) {
			si_update_descriptors(sctx, &views->desc);
		}
	}
}

/* CP DMA */

/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
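
/* BYTE_COUNT [20:0] tops out at (1 << 21) - 1; the limit is presumably
 * rounded down by 8 so that every full-size chunk keeps the remaining
 * size 8-byte aligned. Example: clearing 5 MiB (5242880 bytes) takes
 * three packets of 2097144, 2097144 and 1048592 bytes. */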

static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
			    unsigned offset, unsigned size, unsigned value,
			    bool is_framebuffer)
{
	struct si_context *sctx = (struct si_context*)ctx;
	unsigned flush_flags, tc_l2_flag;

	if (!size)
		return;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, offset,
		       offset + size);

	/* Fallback for unaligned clears. */
	if (offset % 4 != 0 || size % 4 != 0) {
		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
						      sctx->b.rings.gfx.cs,
						      PIPE_TRANSFER_WRITE);

		/* The mapping starts at the buffer base, so skip to "offset"
		 * and expand the 32-bit clear pattern byte by byte; this
		 * also handles a size that isn't dword-aligned. */
		map += offset;
		for (unsigned i = 0; i < size; i++)
			map[i] = (value >> (i % 4 * 8)) & 0xff;
		return;
	}

	uint64_t va = r600_resource(dst)->gpu_address + offset;

	/* Flush the caches where the resource is bound. */
	if (is_framebuffer) {
		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
		tc_l2_flag = 0;
	} else {
		flush_flags = SI_CONTEXT_INV_TC_L1 |
			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
			      SI_CONTEXT_INV_KCACHE;
		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
	}

	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			 flush_flags;

	while (size) {
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned dma_flags = tc_l2_flag;

		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
				 FALSE);

		/* This must be done after need_cs_space. */
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
				      RADEON_PRIO_MIN);

		/* Flush the caches for the first copy only.
		 * Also wait for the previous CP DMA operations. */
		if (sctx->b.flags) {
			si_emit_cache_flush(&sctx->b, NULL);
			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count)
			dma_flags |= R600_CP_DMA_SYNC;

		/* Emit the clear packet. */
		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);

		size -= byte_count;
		va += byte_count;
	}

	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
	sctx->b.flags |= flush_flags;

	if (tc_l2_flag)
		r600_resource(dst)->TC_L2_dirty = true;
}

void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
		    bool is_framebuffer)
{
	unsigned flush_flags, tc_l2_flag;

	if (!size)
		return;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	/* Flush the caches where the resource is bound. */
	if (is_framebuffer) {
		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
		tc_l2_flag = 0;
	} else {
		flush_flags = SI_CONTEXT_INV_TC_L1 |
			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
			      SI_CONTEXT_INV_KCACHE;
		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
	}

	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			 flush_flags;

	while (size) {
		unsigned sync_flags = tc_l2_flag;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);

		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);

		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
		if (sctx->b.flags) {
			si_emit_cache_flush(&sctx->b, NULL);
			sync_flags |= SI_CP_DMA_RAW_WAIT;
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync_flags |= R600_CP_DMA_SYNC;
		}

		/* This must be done after si_need_cs_space. */
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
				      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);

		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);

		size -= byte_count;
		src_offset += byte_count;
		dst_offset += byte_count;
	}

	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
	sctx->b.flags |= flush_flags;

	if (tc_l2_flag)
		r600_resource(dst)->TC_L2_dirty = true;
}

/* INIT/DEINIT */

void si_init_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
					 SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
					 i == PIPE_SHADER_VERTEX ?
					 SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS,
					 i, SI_SGPR_RW_BUFFERS,
					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);

		si_init_sampler_views(sctx, &sctx->samplers[i].views, i);

		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
				    si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4,
				    4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states);

		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
	}

	si_init_descriptors(sctx, &sctx->vertex_buffers,
			    si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
			    SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
			    si_emit_shader_pointer);
	sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;

	/* Set pipe_context functions. */
	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
	sctx->b.b.set_sampler_views = si_set_sampler_views;
	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
	sctx->b.clear_buffer = si_clear_buffer;
	sctx->b.invalidate_buffer = si_invalidate_buffer;
}

void si_release_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_release_buffer_resources(&sctx->const_buffers[i]);
		si_release_buffer_resources(&sctx->rw_buffers[i]);
		si_release_sampler_views(&sctx->samplers[i].views);
		si_release_descriptors(&sctx->samplers[i].states.desc);
	}
	si_release_descriptors(&sctx->vertex_buffers);
}

void si_all_descriptors_begin_new_cs(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
		si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
	}
	si_vertex_buffers_begin_new_cs(sctx);
}