/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Marek Olšák
 */

/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
 * live in memory on SI.
 *
 * This file is responsible for managing lists of resources and sampler states
 * in memory and binding them, which means updating those structures in memory.
 *
 * There is also code for updating shader pointers to resources and sampler
 * states. CP DMA functions are here too.
 */

#include "radeon/r600_cs.h"
#include "si_pipe.h"
#include "si_shader.h"
#include "sid.h"

#include "util/u_memory.h"
#include "util/u_upload_mgr.h"

#define SI_NUM_CONTEXTS 16

/* NULL image and buffer descriptor.
 *
 * For images, all fields must be zero except for the swizzle, which
 * supports arbitrary combinations of 0s and 1s. The texture type must be
 * any valid type (e.g. 1D). If the texture type isn't set, the hw hangs.
 *
 * For buffers, all fields must be zero. If they are not, the hw hangs.
 *
 * This is the only reason why the buffer descriptor must be in words [4:7].
 */
static uint32_t null_descriptor[8] = {
	0,
	0,
	0,
	S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
	/* the rest must contain zeros, which is also used by the buffer
	 * descriptor */
};

/* Set this if you want the 3D engine to wait until CP DMA is done.
 * It should be set on the last CP DMA packet. */
#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */

/* Set this if the source data was used as a destination in a previous CP DMA
 * packet. It's for preventing a read-after-write (RAW) hazard between two
 * CP DMA packets. */
#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
#define CIK_CP_DMA_USE_L2	(1 << 2)

/* Emit a CP DMA packet to do a copy from one buffer to another.
 * The size must fit in bits [20:0].
 */
static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
				       uint64_t dst_va, uint64_t src_va,
				       unsigned size, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
			   PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

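	/* CIK+ has the DMA_DATA packet, which takes full 32-bit high address
	 * dwords and can route the transfer through L2 (SRC_SEL/DST_SEL);
	 * SI only has the older CP_DMA packet, where the upper address bits
	 * are squeezed into 16-bit fields that share a dword with the
	 * CP_SYNC flag. */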
	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	}
}

/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
					uint64_t dst_va, unsigned size,
					uint32_t clear_value, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, 0);
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL[30:29] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	}
}

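/* Each descriptor list lives in a single GPU buffer holding SI_NUM_CONTEXTS
 * copies ("context slots") of the list. si_emit_descriptors() always writes
 * the updated list into the next slot, so draws already in flight keep
 * reading an unmodified copy. The buffer is cleared with CP DMA up front so
 * that every slot starts out zeroed. */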
static void si_init_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				unsigned shader_userdata_reg,
				unsigned element_dw_size,
				unsigned num_elements,
				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
{
	assert(num_elements <= sizeof(desc->enabled_mask)*8);
	assert(num_elements <= sizeof(desc->dirty_mask)*8);

	desc->atom.emit = (void*)emit_func;
	desc->shader_userdata_reg = shader_userdata_reg;
	desc->element_dw_size = element_dw_size;
	desc->num_elements = num_elements;
	desc->context_size = num_elements * element_dw_size * 4;

	desc->buffer = (struct r600_resource*)
		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
				   PIPE_USAGE_DEFAULT,
				   SI_NUM_CONTEXTS * desc->context_size);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	/* We don't check for CS space here, because this should be called
	 * only once at context initialization. */
	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
				    desc->buffer->b.b.width0, 0,
				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
}

static void si_release_descriptors(struct si_descriptors *desc)
{
	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
}

static void si_update_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc)
{
	if (desc->dirty_mask) {
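		/* Worst-case CS space for the emit callback:
		 * 7 dwords for the CP DMA copy into the next context slot,
		 * 4 + element_dw_size dwords per dirty element (WRITE_DATA
		 * header, control word and 64-bit address, plus the payload),
		 * assuming no two dirty elements get merged, and 4 dwords for
		 * the SET_SH_REG pointer update (doubled below when the VS
		 * pointer is mirrored to ES). */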
		desc->atom.num_dw =
			7 + /* copy */
			(4 + desc->element_dw_size) * util_bitcount64(desc->dirty_mask) + /* update */
			4; /* pointer update */

		if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
		    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0)
			desc->atom.num_dw += 4; /* second pointer update */

		desc->atom.dirty = true;

		/* TODO: Investigate if these flushes can be removed after
		 * adding CE support. */

		/* The descriptors are read with the K cache. */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;

		/* Since SI uses uncached CP DMA to update descriptors,
		 * we have to flush TC L2, which is used to fetch constants
		 * along with KCACHE. */
		if (sctx->b.chip_class == SI)
			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
	} else {
		desc->atom.dirty = false;
	}
}

static void si_emit_shader_pointer(struct si_context *sctx,
				   struct r600_atom *atom)
{
	struct si_descriptors *desc = (struct si_descriptors*)atom;
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va = desc->buffer->gpu_address +
		      desc->current_context_id * desc->context_size +
		      desc->buffer_offset;

	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);

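	/* Descriptors bound through the VS user data registers are mirrored
	 * into the ES user data registers, because the API vertex shader runs
	 * on the hardware ES stage whenever a geometry shader is active. */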
	if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
	    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) {
		radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
		radeon_emit(cs, (desc->shader_userdata_reg +
				 (R_00B330_SPI_SHADER_USER_DATA_ES_0 -
				  R_00B130_SPI_SHADER_USER_DATA_VS_0) -
				 SI_SH_REG_OFFSET) >> 2);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
	}
}

static void si_emit_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				uint32_t **descriptors)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va_base;
	int packet_start = 0;
	int packet_size = 0;
	int last_index = desc->num_elements; /* point to a non-existing element */
	uint64_t dirty_mask = desc->dirty_mask;
	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;

	assert(dirty_mask);

	va_base = desc->buffer->gpu_address;

	/* Copy the descriptors to a new context slot. */
	si_emit_cp_dma_copy_buffer(sctx,
				   va_base + new_context_id * desc->context_size,
				   va_base + desc->current_context_id * desc->context_size,
				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);

	va_base += new_context_id * desc->context_size;

	/* Update the descriptors.
	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
	 *
	 * XXX When unbinding lots of resources, consider clearing the memory
	 *     with CP DMA instead of emitting zeros.
	 */
	while (dirty_mask) {
		int i = u_bit_scan64(&dirty_mask);

		assert(i < desc->num_elements);

		if (last_index+1 == i && packet_size) {
			/* Append new data at the end of the last packet. */
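			/* Growing the packet only requires rewriting its
			 * header with the larger dword count; the payload
			 * emitted below lands right after the previous
			 * element in the CS. */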
			packet_size += desc->element_dw_size;
			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
		} else {
			/* Start a new packet. */
			uint64_t va = va_base + i * desc->element_dw_size * 4;

			packet_start = cs->cdw;
			packet_size = 2 + desc->element_dw_size;

			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
						PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
						PKT3_WRITE_DATA_DST_SEL_TC_L2) |
					     PKT3_WRITE_DATA_WR_CONFIRM |
					     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
			radeon_emit(cs, va & 0xFFFFFFFFUL);
			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
		}

		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);

		last_index = i;
	}

	desc->dirty_mask = 0;
	desc->current_context_id = new_context_id;

	/* Now update the shader userdata pointer. */
	si_emit_shader_pointer(sctx, &desc->atom);
}

static unsigned si_get_shader_user_data_base(unsigned shader)
{
	switch (shader) {
	case PIPE_SHADER_VERTEX:
		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
	case PIPE_SHADER_GEOMETRY:
		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
	case PIPE_SHADER_FRAGMENT:
		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
	default:
		assert(0);
		return 0;
	}
}

/* SAMPLER VIEWS */

static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_views *views = (struct si_sampler_views*)atom;

	si_emit_descriptors(sctx, &views->desc, views->desc_data);
}

static void si_init_sampler_views(struct si_context *sctx,
				  struct si_sampler_views *views,
				  unsigned shader)
{
	int i;

	si_init_descriptors(sctx, &views->desc,
			    si_get_shader_user_data_base(shader) +
			    SI_SGPR_RESOURCE * 4,
			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);

	for (i = 0; i < views->desc.num_elements; i++) {
		views->desc_data[i] = null_descriptor;
		views->desc.dirty_mask |= 1llu << i;
	}
	si_update_descriptors(sctx, &views->desc);
}

static void si_release_sampler_views(struct si_sampler_views *views)
{
	int i;

	for (i = 0; i < Elements(views->views); i++) {
		pipe_sampler_view_reference(&views->views[i], NULL);
	}
	si_release_descriptors(&views->desc);
}

static enum radeon_bo_priority si_get_resource_ro_priority(struct r600_resource *res)
{
	if (res->b.b.target == PIPE_BUFFER)
		return RADEON_PRIO_SHADER_BUFFER_RO;

	if (res->b.b.nr_samples > 1)
		return RADEON_PRIO_SHADER_TEXTURE_MSAA;

	return RADEON_PRIO_SHADER_TEXTURE_RO;
}

static void si_sampler_views_begin_new_cs(struct si_context *sctx,
					  struct si_sampler_views *views)
{
	uint64_t mask = views->desc.enabled_mask;

	/* Add relocations to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);
		struct si_sampler_view *rview =
			(struct si_sampler_view*)views->views[i];

		if (!rview->resource)
			continue;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      rview->resource, RADEON_USAGE_READ,
				      si_get_resource_ro_priority(rview->resource));
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &views->desc.atom);
}

static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
				unsigned slot, struct pipe_sampler_view *view,
				unsigned *view_desc)
{
	struct si_sampler_views *views = &sctx->samplers[shader].views;

	if (views->views[slot] == view)
		return;

	if (view) {
		struct si_sampler_view *rview =
			(struct si_sampler_view*)view;

		if (rview->resource)
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				rview->resource, RADEON_USAGE_READ,
				si_get_resource_ro_priority(rview->resource));


		pipe_sampler_view_reference(&views->views[slot], view);
		views->desc_data[slot] = view_desc;
		views->desc.enabled_mask |= 1llu << slot;
	} else {
		pipe_sampler_view_reference(&views->views[slot], NULL);
		views->desc_data[slot] = null_descriptor;
		views->desc.enabled_mask &= ~(1llu << slot);
	}

	views->desc.dirty_mask |= 1llu << slot;
}

static void si_set_sampler_views(struct pipe_context *ctx,
				 unsigned shader, unsigned start,
				 unsigned count,
				 struct pipe_sampler_view **views)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_textures_info *samplers = &sctx->samplers[shader];
	struct si_sampler_view **rviews = (struct si_sampler_view **)views;
	int i;

	if (!count || shader >= SI_NUM_SHADERS)
		return;

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!views || !views[i]) {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, shader, slot, NULL, NULL);
			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
					    NULL, NULL);
			continue;
		}

		si_set_sampler_view(sctx, shader, slot, views[i], rviews[i]->state);

		if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
			struct r600_texture *rtex =
				(struct r600_texture*)views[i]->texture;

			if (rtex->is_depth && !rtex->is_flushing_texture) {
				samplers->depth_texture_mask |= 1 << slot;
			} else {
				samplers->depth_texture_mask &= ~(1 << slot);
			}
			if (rtex->cmask.size || rtex->fmask.size) {
				samplers->compressed_colortex_mask |= 1 << slot;
			} else {
				samplers->compressed_colortex_mask &= ~(1 << slot);
			}

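			/* MSAA color textures need a second descriptor for
			 * their FMASK metadata; it lives in a parallel range
			 * of slots at SI_FMASK_TEX_OFFSET + slot. */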
			if (rtex->fmask.size) {
				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
						    views[i], rviews[i]->fmask_state);
			} else {
				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
						    NULL, NULL);
			}
		} else {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
					    NULL, NULL);
		}
	}

	si_update_descriptors(sctx, &samplers->views.desc);
}

/* SAMPLER STATES */

static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_states *states = (struct si_sampler_states*)atom;

	si_emit_descriptors(sctx, &states->desc, states->desc_data);
}

static void si_sampler_states_begin_new_cs(struct si_context *sctx,
					   struct si_sampler_states *states)
{
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
	si_emit_shader_pointer(sctx, &states->desc.atom);
}

void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
				unsigned start, unsigned count, void **states)
{
	struct si_sampler_states *samplers = &sctx->samplers[shader].states;
	struct si_sampler_state **sstates = (struct si_sampler_state**)states;
	int i;

	if (start == 0)
		samplers->saved_states[0] = states[0];
	if (start == 1)
		samplers->saved_states[1] = states[0];
	else if (start == 0 && count >= 2)
		samplers->saved_states[1] = states[1];

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!sstates[i]) {
			samplers->desc.dirty_mask &= ~(1llu << slot);
			continue;
		}

		samplers->desc_data[slot] = sstates[i]->val;
		samplers->desc.dirty_mask |= 1llu << slot;
	}

	si_update_descriptors(sctx, &samplers->desc);
}

/* BUFFER RESOURCES */

static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;

	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
}

static void si_init_buffer_resources(struct si_context *sctx,
				     struct si_buffer_resources *buffers,
				     unsigned num_buffers, unsigned shader,
				     unsigned shader_userdata_index,
				     enum radeon_bo_usage shader_usage,
				     enum radeon_bo_priority priority)
{
	int i;

	buffers->num_buffers = num_buffers;
	buffers->shader_usage = shader_usage;
	buffers->priority = priority;
	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);

	/* si_emit_descriptors only accepts an array of arrays.
	 * This adds such an array. */
	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
	for (i = 0; i < num_buffers; i++) {
		buffers->desc_data[i] = &buffers->desc_storage[i*4];
	}

	si_init_descriptors(sctx, &buffers->desc,
			    si_get_shader_user_data_base(shader) +
			    shader_userdata_index*4, 4, num_buffers,
			    si_emit_buffer_resources);
}

static void si_release_buffer_resources(struct si_buffer_resources *buffers)
{
	int i;

	for (i = 0; i < buffers->num_buffers; i++) {
		pipe_resource_reference(&buffers->buffers[i], NULL);
	}

	FREE(buffers->buffers);
	FREE(buffers->desc_storage);
	FREE(buffers->desc_data);
	si_release_descriptors(&buffers->desc);
}

static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
					     struct si_buffer_resources *buffers)
{
	uint64_t mask = buffers->desc.enabled_mask;

	/* Add relocations to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffers->buffers[i],
				      buffers->shader_usage, buffers->priority);
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
			      RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &buffers->desc.atom);
}

/* VERTEX BUFFERS */

static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
	int i;

	for (i = 0; i < count; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
				      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
	}
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &desc->atom);
}

void si_update_vertex_buffers(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
	unsigned i, count;
	uint64_t va;
	uint32_t *ptr;

	if (!sctx->vertex_elements)
		return;

	count = sctx->vertex_elements->count;
	if (!count)
		return;
 
	/* Vertex buffer descriptors are the only ones which are uploaded
	 * directly through a staging buffer and don't go through
	 * the fine-grained upload path.
	 */
	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	assert(count <= SI_NUM_VERTEX_BUFFERS);
	assert(desc->current_context_id == 0);

	for (i = 0; i < count; i++) {
		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
		struct pipe_vertex_buffer *vb;
		struct r600_resource *rbuffer;
		unsigned offset;
		uint32_t *desc = &ptr[i*4];

		if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
			memset(desc, 0, 16);
			continue;
		}

		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
		rbuffer = (struct r600_resource*)vb->buffer;
		if (rbuffer == NULL) {
			memset(desc, 0, 16);
			continue;
		}

		offset = vb->buffer_offset + ve->src_offset;
		va = rbuffer->gpu_address + offset;

		/* Fill in T# buffer resource description */
		desc[0] = va & 0xFFFFFFFF;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(vb->stride);
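		/* NUM_RECORDS counts whole vertices reachable from the offset.
		 * With illustrative numbers width0 = 1000, offset = 0,
		 * stride = 16 and format_size = 12, this yields
		 * (1000 - 0 - 12) / 16 + 1 = 62 records: vertex 61 starts at
		 * byte 976 and still fits, vertex 62 would not. */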
		if (vb->stride)
			/* Round up by rounding down and adding 1 */
			desc[2] = (vb->buffer->width0 - offset -
				   sctx->vertex_elements->format_size[i]) /
				  vb->stride + 1;
		else
			desc[2] = vb->buffer->width0 - offset;

		desc[3] = sctx->vertex_elements->rsrc_word3[i];

		if (!bound[ve->vertex_buffer_index]) {
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)vb->buffer,
					      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
			bound[ve->vertex_buffer_index] = true;
		}
	}

	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
	desc->atom.dirty = true;

	/* Don't flush the const cache. It would have a very negative effect
	 * on performance (confirmed by testing). New descriptors are always
	 * uploaded to a fresh new buffer, so I don't think flushing the const
	 * cache is needed. */
}


/* CONSTANT BUFFERS */

void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
			    const uint8_t *ptr, unsigned size, uint32_t *const_offset)
{
	void *tmp;

	u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
		       (struct pipe_resource**)rbuffer, &tmp);
	util_memcpy_cpu_to_le32(tmp, ptr, size);
}

static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
				   struct pipe_constant_buffer *input)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->const_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	assert(slot < buffers->num_buffers);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
	 * with a NULL buffer). We need to use a dummy buffer instead. */
	if (sctx->b.chip_class == CIK &&
	    (!input || (!input->buffer && !input->user_buffer)))
		input = &sctx->null_const_buf;

	if (input && (input->buffer || input->user_buffer)) {
		struct pipe_resource *buffer = NULL;
		uint64_t va;

		/* Upload the user buffer if needed. */
		if (input->user_buffer) {
			unsigned buffer_offset;

			si_upload_const_buffer(sctx,
					       (struct r600_resource**)&buffer, input->user_buffer,
					       input->buffer_size, &buffer_offset);
			va = r600_resource(buffer)->gpu_address + buffer_offset;
		} else {
			pipe_resource_reference(&buffer, input->buffer);
			va = r600_resource(buffer)->gpu_address + input->buffer_offset;
		}

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc_data[slot];
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(0);
		desc[2] = input->buffer_size;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

		buffers->buffers[slot] = buffer;
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.dirty_mask |= 1llu << slot;
	si_update_descriptors(sctx, &buffers->desc);
}

/* RING BUFFERS */

void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
			struct pipe_resource *buffer,
			unsigned stride, unsigned num_records,
			bool add_tid, bool swizzle,
			unsigned element_size, unsigned index_stride)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	/* The stride field in the resource descriptor has 14 bits */
	assert(stride < (1 << 14));

	assert(slot < buffers->num_buffers);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	if (buffer) {
		uint64_t va;

		va = r600_resource(buffer)->gpu_address;

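		/* The caller passes raw sizes; they are converted below to the
		 * 2-bit hardware encodings stored in dword 3 of the descriptor
		 * (ELEMENT_SIZE, INDEX_STRIDE). */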
		switch (element_size) {
		default:
			assert(!"Unsupported ring buffer element size");
		case 0:
		case 2:
			element_size = 0;
			break;
		case 4:
			element_size = 1;
			break;
		case 8:
			element_size = 2;
			break;
		case 16:
			element_size = 3;
			break;
		}

		switch (index_stride) {
		default:
			assert(!"Unsupported ring buffer index stride");
		case 0:
		case 8:
			index_stride = 0;
			break;
		case 16:
			index_stride = 1;
			break;
		case 32:
			index_stride = 2;
			break;
		case 64:
			index_stride = 3;
			break;
		}

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc_data[slot];
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(stride) |
			  S_008F04_SWIZZLE_ENABLE(swizzle);
		desc[2] = num_records;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
			  S_008F0C_ELEMENT_SIZE(element_size) |
			  S_008F0C_INDEX_STRIDE(index_stride) |
			  S_008F0C_ADD_TID_ENABLE(add_tid);

		pipe_resource_reference(&buffers->buffers[slot], buffer);
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.dirty_mask |= 1llu << slot;
	si_update_descriptors(sctx, &buffers->desc);
}

/* STREAMOUT BUFFERS */

static void si_set_streamout_targets(struct pipe_context *ctx,
				     unsigned num_targets,
				     struct pipe_stream_output_target **targets,
				     const unsigned *offsets)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
	unsigned old_num_targets = sctx->b.streamout.num_targets;
	unsigned i, bufidx;

	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
	if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
		/* Since streamout uses vector writes which go through TC L2
		 * and most other clients can use TC L2 as well, we don't need
		 * to flush it.
		 *
		 * The only case which requires flushing it is VGT DMA index
		 * fetching, which is a rare case. Thus, flag the TC L2
		 * dirtiness in the resource and handle it when index fetching
		 * is used.
		 */
		for (i = 0; i < sctx->b.streamout.num_targets; i++)
			if (sctx->b.streamout.targets[i])
				r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

		/* Invalidate the scalar cache in case a streamout buffer is
		 * going to be used as a constant buffer.
		 *
		 * Invalidate TC L1, because streamout bypasses it (done by
		 * setting GLC=1 in the store instruction), but it can contain
		 * outdated data of streamout buffers.
		 *
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
				 SI_CONTEXT_INV_TC_L1 |
				 SI_CONTEXT_VS_PARTIAL_FLUSH;
	}

	/* Streamout buffers must be bound in 2 places:
	 * 1) in VGT by setting the VGT_STRMOUT registers
	 * 2) as shader resources
	 */

	/* Set the VGT regs. */
	r600_set_streamout_targets(ctx, num_targets, targets, offsets);

	/* Set the shader resources. */
	for (i = 0; i < num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;

		if (targets[i]) {
			struct pipe_resource *buffer = targets[i]->buffer;
			uint64_t va = r600_resource(buffer)->gpu_address;

			/* Set the descriptor. */
			uint32_t *desc = buffers->desc_data[bufidx];
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
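			/* Unlike the other buffer descriptors set up in this
			 * file, NUM_RECORDS is left at the maximum here; the
			 * amount actually written appears to be bounded by the
			 * streamout state programmed through the VGT registers
			 * above rather than by the descriptor. */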
			desc[2] = 0xffffffff;
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

			/* Set the resource. */
			pipe_resource_reference(&buffers->buffers[bufidx],
						buffer);
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)buffer,
					      buffers->shader_usage, buffers->priority);
			buffers->desc.enabled_mask |= 1llu << bufidx;
		} else {
			/* Clear the descriptor and unset the resource. */
			memset(buffers->desc_data[bufidx], 0,
			       sizeof(uint32_t) * 4);
			pipe_resource_reference(&buffers->buffers[bufidx],
						NULL);
			buffers->desc.enabled_mask &= ~(1llu << bufidx);
		}
		buffers->desc.dirty_mask |= 1llu << bufidx;
	}
	for (; i < old_num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;
		/* Clear the descriptor and unset the resource. */
		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
		buffers->desc.enabled_mask &= ~(1llu << bufidx);
		buffers->desc.dirty_mask |= 1llu << bufidx;
	}

	si_update_descriptors(sctx, &buffers->desc);
}

static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
					uint32_t *desc, uint64_t old_buf_va,
					struct pipe_resource *new_buf)
{
	/* Retrieve the buffer offset from the descriptor. */
	uint64_t old_desc_va =
		desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);

	assert(old_buf_va <= old_desc_va);
	uint64_t offset_within_buffer = old_desc_va - old_buf_va;

	/* Update the descriptor. */
	uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;

	desc[0] = va;
	desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
		  S_008F04_BASE_ADDRESS_HI(va >> 32);
}

/* BUFFER DISCARD/INVALIDATION */

/* Reallocate a buffer and update all resource bindings where the buffer is
 * bound.
 *
 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
 * idle by discarding its contents. Apps usually tell us when to do this using
 * map_buffer flags, for example.
 */
static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
{
	struct si_context *sctx = (struct si_context*)ctx;
	struct r600_resource *rbuffer = r600_resource(buf);
	unsigned i, shader, alignment = rbuffer->buf->alignment;
	uint64_t old_va = rbuffer->gpu_address;
	unsigned num_elems = sctx->vertex_elements ?
				       sctx->vertex_elements->count : 0;
	struct si_sampler_view *view;

	/* Reallocate the buffer in the same pipe_resource. */
	r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
			   alignment, TRUE);

	/* We changed the buffer, now we need to bind it where the old one
	 * was bound. This consists of 2 things:
	 *   1) Updating the resource descriptor and dirtying it.
	 *   2) Adding a relocation to the CS, so that it's usable.
	 */

	/* Vertex buffers. */
	for (i = 0; i < num_elems; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		if (sctx->vertex_buffer[vb].buffer == buf) {
			sctx->vertex_buffers_dirty = true;
			break;
		}
	}

	/* Read/Write buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
		bool found = false;
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
							    old_va, buf);

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1llu << i;
				found = true;

				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
					/* Update the streamout state. */
					if (sctx->b.streamout.begin_emitted) {
						r600_emit_streamout_end(&sctx->b);
					}
					sctx->b.streamout.append_bitmask =
						sctx->b.streamout.enabled_mask;
					r600_streamout_buffers_dirty(&sctx->b);
				}
			}
		}
		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Constant buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
		bool found = false;
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
							    old_va, buf);

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1llu << i;
				found = true;
			}
		}
		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Texture buffers - update virtual addresses in sampler view descriptors. */
	LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
		if (view->base.texture == buf) {
			si_desc_reset_buffer_offset(ctx, &view->state[4], old_va, buf);
		}
	}
	/* Texture buffers - update bindings. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_sampler_views *views = &sctx->samplers[shader].views;
		bool found = false;
		uint64_t mask = views->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (views->views[i]->texture == buf) {
				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, RADEON_USAGE_READ,
						      RADEON_PRIO_SHADER_BUFFER_RO);

				views->desc.dirty_mask |= 1llu << i;
				found = true;
			}
		}
		if (found) {
			si_update_descriptors(sctx, &views->desc);
		}
	}
}

/* CP DMA */

/* The max number of bytes to copy per packet. */
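/* The BYTE_COUNT field is 21 bits wide, so the hard limit is (1 << 21) - 1;
 * staying 8 bytes below it presumably keeps every full-size chunk 8-byte
 * aligned (the original code does not state the reason). */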
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)

static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
			    unsigned offset, unsigned size, unsigned value,
			    bool is_framebuffer)
{
	struct si_context *sctx = (struct si_context*)ctx;
	unsigned flush_flags, tc_l2_flag;

	if (!size)
		return;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, offset,
		       offset + size);

	/* Fallback for unaligned clears. */
	if (offset % 4 != 0 || size % 4 != 0) {
		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
						       sctx->b.rings.gfx.cs,
						       PIPE_TRANSFER_WRITE);
		size /= 4;
		for (unsigned i = 0; i < size; i++)
			*map++ = value;
		return;
	}

	uint64_t va = r600_resource(dst)->gpu_address + offset;

	/* Flush the caches where the resource is bound. */
	if (is_framebuffer) {
		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
		tc_l2_flag = 0;
	} else {
		flush_flags = SI_CONTEXT_INV_TC_L1 |
			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
			      SI_CONTEXT_INV_KCACHE;
		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
	}

	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			 flush_flags;

	while (size) {
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned dma_flags = tc_l2_flag;

		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
				 FALSE);

		/* This must be done after need_cs_space. */
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
				      RADEON_PRIO_MIN);

		/* Flush the caches for the first copy only.
		 * Also wait for the previous CP DMA operations. */
		if (sctx->b.flags) {
			si_emit_cache_flush(&sctx->b, NULL);
			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count)
			dma_flags |= R600_CP_DMA_SYNC;

		/* Emit the clear packet. */
		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);

		size -= byte_count;
		va += byte_count;
	}

	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
	sctx->b.flags |= flush_flags;

	if (tc_l2_flag)
		r600_resource(dst)->TC_L2_dirty = true;
}

void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
		    bool is_framebuffer)
{
	unsigned flush_flags, tc_l2_flag;

	if (!size)
		return;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	/* Flush the caches where the resource is bound. */
	if (is_framebuffer) {
		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
		tc_l2_flag = 0;
	} else {
		flush_flags = SI_CONTEXT_INV_TC_L1 |
			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
			      SI_CONTEXT_INV_KCACHE;
		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
	}

	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			 flush_flags;

	while (size) {
		unsigned sync_flags = tc_l2_flag;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);

		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);

		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
		if (sctx->b.flags) {
			si_emit_cache_flush(&sctx->b, NULL);
			sync_flags |= SI_CP_DMA_RAW_WAIT;
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync_flags |= R600_CP_DMA_SYNC;
		}

		/* This must be done after r600_need_cs_space. */
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
				      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);

		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);

		size -= byte_count;
		src_offset += byte_count;
		dst_offset += byte_count;
	}

	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
	sctx->b.flags |= flush_flags;

	if (tc_l2_flag)
		r600_resource(dst)->TC_L2_dirty = true;
}

/* INIT/DEINIT */

void si_init_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
					 SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
					 i == PIPE_SHADER_VERTEX ?
					 SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS,
					 i, SI_SGPR_RW_BUFFERS,
					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);

		si_init_sampler_views(sctx, &sctx->samplers[i].views, i);

		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
				    si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4,
				    4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states);

		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
	}

	si_init_descriptors(sctx, &sctx->vertex_buffers,
			    si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
			    SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
			    si_emit_shader_pointer);
	sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;

	/* Set pipe_context functions. */
	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
	sctx->b.b.set_sampler_views = si_set_sampler_views;
	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
	sctx->b.clear_buffer = si_clear_buffer;
	sctx->b.invalidate_buffer = si_invalidate_buffer;
}

void si_release_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_release_buffer_resources(&sctx->const_buffers[i]);
		si_release_buffer_resources(&sctx->rw_buffers[i]);
		si_release_sampler_views(&sctx->samplers[i].views);
		si_release_descriptors(&sctx->samplers[i].states.desc);
	}
	si_release_descriptors(&sctx->vertex_buffers);
}

void si_all_descriptors_begin_new_cs(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
		si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
	}
	si_vertex_buffers_begin_new_cs(sctx);
}