/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Marek Olšák
 */

/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
 * live in memory on SI.
 *
 * This file is responsible for managing lists of resources and sampler states
 * in memory and binding them, which means updating those structures in memory.
 *
 * There is also code for updating shader pointers to resources and sampler
 * states. CP DMA functions are here too.
 */

#include "radeon/r600_cs.h"
#include "si_pipe.h"
#include "si_shader.h"
#include "sid.h"

#include "util/u_memory.h"
#include "util/u_upload_mgr.h"

#define SI_NUM_CONTEXTS 16

/* NULL image and buffer descriptor.
 *
 * For images, all fields must be zero except for the swizzle, which
 * supports arbitrary combinations of 0s and 1s. The texture type must be
 * any valid type (e.g. 1D). If the texture type isn't set, the hw hangs.
 *
 * For buffers, all fields must be zero. If they are not, the hw hangs.
 *
 * This is the only reason why the buffer descriptor must be in words [4:7].
 */
static uint32_t null_descriptor[8] = {
	0,
	0,
	0,
	S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_1) |
	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
	/* the rest must contain zeros, which is also used by the buffer
	 * descriptor */
};
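
/* Words [0:3] above form the (1D image) part and words [4:7] stay zero, so
 * the same 8-dword slot doubles as a NULL buffer descriptor. This is why
 * buffer resources are stored in words [4:7] of a sampler view descriptor
 * (see si_invalidate_buffer, which patches &view->state[4]). */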

/* Set this if you want the 3D engine to wait until CP DMA is done.
 * It should be set on the last CP DMA packet. */
#define R600_CP_DMA_SYNC	(1 << 0) /* R600+ */

/* Set this if the source data was used as a destination in a previous CP DMA
 * packet. It's for preventing a read-after-write (RAW) hazard between two
 * CP DMA packets. */
#define SI_CP_DMA_RAW_WAIT	(1 << 1) /* SI+ */
#define CIK_CP_DMA_USE_L2	(1 << 2)

/* Emit a CP DMA packet to do a copy from one buffer to another.
 * The size must fit in bits [20:0].
 */
static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
				       uint64_t dst_va, uint64_t src_va,
				       unsigned size, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
			PKT3_CP_DMA_SRC_SEL(3) | PKT3_CP_DMA_DST_SEL(3) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	}
}

/* Emit a CP DMA packet to clear a buffer. The size must fit in bits [20:0]. */
static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
					uint64_t dst_va, unsigned size,
					uint32_t clear_value, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;
	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? PKT3_CP_DMA_DST_SEL(3) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | dst_sel | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL [30:29] */
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, 0);
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, sync_flag | PKT3_CP_DMA_SRC_SEL(2)); /* CP_SYNC [31] | SRC_SEL [30:29] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	}
}

static void si_init_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				unsigned shader_userdata_reg,
				unsigned element_dw_size,
				unsigned num_elements,
				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
{
	assert(num_elements <= sizeof(desc->enabled_mask)*8);
	assert(num_elements <= sizeof(desc->dirty_mask)*8);

	desc->atom.emit = (void*)emit_func;
	desc->shader_userdata_reg = shader_userdata_reg;
	desc->element_dw_size = element_dw_size;
	desc->num_elements = num_elements;
	desc->context_size = num_elements * element_dw_size * 4;

	desc->buffer = (struct r600_resource*)
		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
				   PIPE_USAGE_DEFAULT,
				   SI_NUM_CONTEXTS * desc->context_size);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	/* We don't check for CS space here, because this should be called
	 * only once at context initialization. */
	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
				    desc->buffer->b.b.width0, 0,
				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
}

static void si_release_descriptors(struct si_descriptors *desc)
{
	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
}

static void si_update_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc)
{
	if (desc->dirty_mask) {
		desc->atom.num_dw =
			7 + /* copy */
			(4 + desc->element_dw_size) * util_bitcount64(desc->dirty_mask) + /* update */
			4; /* pointer update */

		if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
		    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0)
			desc->atom.num_dw += 4; /* second pointer update */
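		/* Example: three dirty 8-dword sampler views bound through the
		 * VS user data SGPRs cost 7 + 3*(4+8) + 4 + 4 = 51 dwords. */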

		desc->atom.dirty = true;

		/* TODO: Investigate if these flushes can be removed after
		 * adding CE support. */

		/* The descriptors are read with the K cache. */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;

		/* Since SI uses uncached CP DMA to update descriptors,
		 * we have to flush TC L2, which is used to fetch constants
		 * along with KCACHE. */
		if (sctx->b.chip_class == SI)
			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
	} else {
		desc->atom.dirty = false;
	}
}

static void si_emit_shader_pointer(struct si_context *sctx,
				   struct r600_atom *atom)
{
	struct si_descriptors *desc = (struct si_descriptors*)atom;
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va = desc->buffer->gpu_address +
		      desc->current_context_id * desc->context_size +
		      desc->buffer_offset;

	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
	radeon_emit(cs, va);
	radeon_emit(cs, va >> 32);

	if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
	    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) {
		radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
		radeon_emit(cs, (desc->shader_userdata_reg +
				 (R_00B330_SPI_SHADER_USER_DATA_ES_0 -
				  R_00B130_SPI_SHADER_USER_DATA_VS_0) -
				 SI_SH_REG_OFFSET) >> 2);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
	}
}

static void si_emit_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				uint32_t **descriptors)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va_base;
	int packet_start = 0;
	int packet_size = 0;
	int last_index = desc->num_elements; /* point to a non-existing element */
	uint64_t dirty_mask = desc->dirty_mask;
	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;

	assert(dirty_mask);

	va_base = desc->buffer->gpu_address;

	/* Copy the descriptors to a new context slot. */
	si_emit_cp_dma_copy_buffer(sctx,
				   va_base + new_context_id * desc->context_size,
				   va_base + desc->current_context_id * desc->context_size,
				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);

	va_base += new_context_id * desc->context_size;

	/* Update the descriptors.
	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
	 *
	 * XXX When unbinding lots of resources, consider clearing the memory
	 * with CP DMA instead of emitting zeros.
	 */
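	/* Example: with element_dw_size = 8 and dirty slots {1,2,3,6},
	 * slots 1-3 are merged into one WRITE_DATA packet carrying 24 data
	 * dwords, and slot 6 starts a second packet. */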
	while (dirty_mask) {
		int i = u_bit_scan64(&dirty_mask);

		assert(i < desc->num_elements);

		if (last_index+1 == i && packet_size) {
			/* Append new data at the end of the last packet. */
			packet_size += desc->element_dw_size;
			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
		} else {
			/* Start a new packet. */
			uint64_t va = va_base + i * desc->element_dw_size * 4;

			packet_start = cs->cdw;
			packet_size = 2 + desc->element_dw_size;

			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
						PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
						PKT3_WRITE_DATA_DST_SEL_TC_L2) |
					PKT3_WRITE_DATA_WR_CONFIRM |
					PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
			radeon_emit(cs, va & 0xFFFFFFFFUL);
			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
		}

		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);

		last_index = i;
	}

	desc->dirty_mask = 0;
	desc->current_context_id = new_context_id;

	/* Now update the shader userdata pointer. */
	si_emit_shader_pointer(sctx, &desc->atom);
}

static unsigned si_get_shader_user_data_base(unsigned shader)
{
	switch (shader) {
	case PIPE_SHADER_VERTEX:
		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
	case PIPE_SHADER_GEOMETRY:
		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
	case PIPE_SHADER_FRAGMENT:
		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
	default:
		assert(0);
		return 0;
	}
}

/* SAMPLER VIEWS */

static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_views *views = (struct si_sampler_views*)atom;

	si_emit_descriptors(sctx, &views->desc, views->desc_data);
}

static void si_init_sampler_views(struct si_context *sctx,
				  struct si_sampler_views *views,
				  unsigned shader)
{
	int i;

	si_init_descriptors(sctx, &views->desc,
			    si_get_shader_user_data_base(shader) +
			    SI_SGPR_RESOURCE * 4,
			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);

	for (i = 0; i < views->desc.num_elements; i++) {
		views->desc_data[i] = null_descriptor;
		views->desc.dirty_mask |= 1llu << i;
	}
	si_update_descriptors(sctx, &views->desc);
}

static void si_release_sampler_views(struct si_sampler_views *views)
{
	int i;

	for (i = 0; i < Elements(views->views); i++) {
		pipe_sampler_view_reference(&views->views[i], NULL);
	}
	si_release_descriptors(&views->desc);
}

static enum radeon_bo_priority si_get_resource_ro_priority(struct r600_resource *res)
{
	if (res->b.b.target == PIPE_BUFFER)
		return RADEON_PRIO_SHADER_BUFFER_RO;

	if (res->b.b.nr_samples > 1)
		return RADEON_PRIO_SHADER_TEXTURE_MSAA;

	return RADEON_PRIO_SHADER_TEXTURE_RO;
}

static void si_sampler_views_begin_new_cs(struct si_context *sctx,
					  struct si_sampler_views *views)
{
	uint64_t mask = views->desc.enabled_mask;

	/* Add relocations to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);
		struct si_sampler_view *rview =
			(struct si_sampler_view*)views->views[i];

		if (!rview->resource)
			continue;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      rview->resource, RADEON_USAGE_READ,
				      si_get_resource_ro_priority(rview->resource));
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &views->desc.atom);
}

static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
				unsigned slot, struct pipe_sampler_view *view,
				unsigned *view_desc)
{
	struct si_sampler_views *views = &sctx->samplers[shader].views;

	if (views->views[slot] == view)
		return;

	if (view) {
		struct si_sampler_view *rview =
			(struct si_sampler_view*)view;

		if (rview->resource)
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      rview->resource, RADEON_USAGE_READ,
					      si_get_resource_ro_priority(rview->resource));

		pipe_sampler_view_reference(&views->views[slot], view);
		views->desc_data[slot] = view_desc;
		views->desc.enabled_mask |= 1llu << slot;
	} else {
		pipe_sampler_view_reference(&views->views[slot], NULL);
		views->desc_data[slot] = null_descriptor;
		views->desc.enabled_mask &= ~(1llu << slot);
	}

	views->desc.dirty_mask |= 1llu << slot;
}

static void si_set_sampler_views(struct pipe_context *ctx,
				 unsigned shader, unsigned start,
				 unsigned count,
				 struct pipe_sampler_view **views)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_textures_info *samplers = &sctx->samplers[shader];
	struct si_sampler_view **rviews = (struct si_sampler_view **)views;
	int i;

	if (!count || shader >= SI_NUM_SHADERS)
		return;

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!views || !views[i]) {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, shader, slot, NULL, NULL);
			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
					    NULL, NULL);
			continue;
		}

		si_set_sampler_view(sctx, shader, slot, views[i], rviews[i]->state);

		if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
			struct r600_texture *rtex =
				(struct r600_texture*)views[i]->texture;

			if (rtex->is_depth && !rtex->is_flushing_texture) {
				samplers->depth_texture_mask |= 1 << slot;
			} else {
				samplers->depth_texture_mask &= ~(1 << slot);
			}
			if (rtex->cmask.size || rtex->fmask.size) {
				samplers->compressed_colortex_mask |= 1 << slot;
			} else {
				samplers->compressed_colortex_mask &= ~(1 << slot);
			}

			if (rtex->fmask.size) {
				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
						    views[i], rviews[i]->fmask_state);
			} else {
				si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
						    NULL, NULL);
			}
		} else {
			samplers->depth_texture_mask &= ~(1 << slot);
			samplers->compressed_colortex_mask &= ~(1 << slot);
			si_set_sampler_view(sctx, shader, SI_FMASK_TEX_OFFSET + slot,
					    NULL, NULL);
		}
	}

	si_update_descriptors(sctx, &samplers->views.desc);
}

/* SAMPLER STATES */

static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_states *states = (struct si_sampler_states*)atom;

	si_emit_descriptors(sctx, &states->desc, states->desc_data);
}

static void si_sampler_states_begin_new_cs(struct si_context *sctx,
					   struct si_sampler_states *states)
{
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
	si_emit_shader_pointer(sctx, &states->desc.atom);
}

void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
				unsigned start, unsigned count, void **states)
{
	struct si_sampler_states *samplers = &sctx->samplers[shader].states;
	struct si_sampler_state **sstates = (struct si_sampler_state**)states;
	int i;

	if (start == 0)
		samplers->saved_states[0] = states[0];
	if (start == 1)
		samplers->saved_states[1] = states[0];
	else if (start == 0 && count >= 2)
		samplers->saved_states[1] = states[1];

	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!sstates[i]) {
			samplers->desc.dirty_mask &= ~(1llu << slot);
			continue;
		}

		samplers->desc_data[slot] = sstates[i]->val;
		samplers->desc.dirty_mask |= 1llu << slot;
	}

	si_update_descriptors(sctx, &samplers->desc);
}

/* BUFFER RESOURCES */

static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;

	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
}

static void si_init_buffer_resources(struct si_context *sctx,
				     struct si_buffer_resources *buffers,
				     unsigned num_buffers, unsigned shader,
				     unsigned shader_userdata_index,
				     enum radeon_bo_usage shader_usage,
				     enum radeon_bo_priority priority)
{
	int i;

	buffers->num_buffers = num_buffers;
	buffers->shader_usage = shader_usage;
	buffers->priority = priority;
	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);

	/* si_emit_descriptors only accepts an array of arrays.
	 * This adds such an array. */
	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
	for (i = 0; i < num_buffers; i++) {
		buffers->desc_data[i] = &buffers->desc_storage[i*4];
	}

	si_init_descriptors(sctx, &buffers->desc,
			    si_get_shader_user_data_base(shader) +
			    shader_userdata_index*4, 4, num_buffers,
			    si_emit_buffer_resources);
}

static void si_release_buffer_resources(struct si_buffer_resources *buffers)
{
	int i;

	for (i = 0; i < buffers->num_buffers; i++) {
		pipe_resource_reference(&buffers->buffers[i], NULL);
	}

	FREE(buffers->buffers);
	FREE(buffers->desc_storage);
	FREE(buffers->desc_data);
	si_release_descriptors(&buffers->desc);
}

static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
					     struct si_buffer_resources *buffers)
{
	uint64_t mask = buffers->desc.enabled_mask;

	/* Add relocations to the CS. */
	while (mask) {
		int i = u_bit_scan64(&mask);

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffers->buffers[i],
				      buffers->shader_usage, buffers->priority);
	}

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
			      RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &buffers->desc.atom);
}

/* VERTEX BUFFERS */

static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
	int i;

	for (i = 0; i < count; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
				      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
	}
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	si_emit_shader_pointer(sctx, &desc->atom);
}

void si_update_vertex_buffers(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
	unsigned i, count;
	uint64_t va;
	uint32_t *ptr;

	if (!sctx->vertex_elements)
		return;

	count = sctx->vertex_elements->count;
	if (!count)
		return;

	/* Vertex buffer descriptors are the only ones which are uploaded
	 * directly through a staging buffer and don't go through
	 * the fine-grained upload path.
	 */
	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	assert(count <= SI_NUM_VERTEX_BUFFERS);
	assert(desc->current_context_id == 0);

	for (i = 0; i < count; i++) {
		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
		struct pipe_vertex_buffer *vb;
		struct r600_resource *rbuffer;
		unsigned offset;
		uint32_t *desc = &ptr[i*4];

		if (ve->vertex_buffer_index >= Elements(sctx->vertex_buffer)) {
			memset(desc, 0, 16);
			continue;
		}

		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
		rbuffer = (struct r600_resource*)vb->buffer;
		if (rbuffer == NULL) {
			memset(desc, 0, 16);
			continue;
		}

		offset = vb->buffer_offset + ve->src_offset;
		va = rbuffer->gpu_address + offset;

		/* Fill in T# buffer resource description */
		desc[0] = va & 0xFFFFFFFF;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(vb->stride);
		if (vb->stride)
			/* Round up by rounding down and adding 1 */
			desc[2] = (vb->buffer->width0 - offset -
				   sctx->vertex_elements->format_size[i]) /
				  vb->stride + 1;
		else
			desc[2] = vb->buffer->width0 - offset;
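		/* Example: width0 = 100, offset = 0, stride = 16,
		 * format_size = 8 -> (100 - 8) / 16 + 1 = 6 fetchable
		 * elements (the last one starts at byte 80). */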

		desc[3] = sctx->vertex_elements->rsrc_word3[i];

		if (!bound[ve->vertex_buffer_index]) {
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)vb->buffer,
					      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
			bound[ve->vertex_buffer_index] = true;
		}
	}

	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
	desc->atom.dirty = true;

	/* Don't flush the const cache. It would have a very negative effect
	 * on performance (confirmed by testing). New descriptors are always
	 * uploaded to a fresh new buffer, so I don't think flushing the const
	 * cache is needed. */
}


/* CONSTANT BUFFERS */

void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
			    const uint8_t *ptr, unsigned size, uint32_t *const_offset)
{
	void *tmp;

	u_upload_alloc(sctx->b.uploader, 0, size, const_offset,
		       (struct pipe_resource**)rbuffer, &tmp);
	util_memcpy_cpu_to_le32(tmp, ptr, size);
}

static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint slot,
				   struct pipe_constant_buffer *input)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->const_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	assert(slot < buffers->num_buffers);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
	 * with a NULL buffer). We need to use a dummy buffer instead. */
	if (sctx->b.chip_class == CIK &&
	    (!input || (!input->buffer && !input->user_buffer)))
		input = &sctx->null_const_buf;

	if (input && (input->buffer || input->user_buffer)) {
		struct pipe_resource *buffer = NULL;
		uint64_t va;

		/* Upload the user buffer if needed. */
		if (input->user_buffer) {
			unsigned buffer_offset;

			si_upload_const_buffer(sctx,
					       (struct r600_resource**)&buffer, input->user_buffer,
					       input->buffer_size, &buffer_offset);
			va = r600_resource(buffer)->gpu_address + buffer_offset;
		} else {
			pipe_resource_reference(&buffer, input->buffer);
			va = r600_resource(buffer)->gpu_address + input->buffer_offset;
		}

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc_data[slot];
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(0);
		desc[2] = input->buffer_size;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

		buffers->buffers[slot] = buffer;
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.dirty_mask |= 1llu << slot;
	si_update_descriptors(sctx, &buffers->desc);
}

/* RING BUFFERS */

void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
			struct pipe_resource *buffer,
			unsigned stride, unsigned num_records,
			bool add_tid, bool swizzle,
			unsigned element_size, unsigned index_stride)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];

	if (shader >= SI_NUM_SHADERS)
		return;

	/* The stride field in the resource descriptor has 14 bits */
	assert(stride < (1 << 14));

	assert(slot < buffers->num_buffers);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	if (buffer) {
		uint64_t va;

		va = r600_resource(buffer)->gpu_address;

		switch (element_size) {
		default:
			assert(!"Unsupported ring buffer element size");
		case 0:
		case 2:
			element_size = 0;
			break;
		case 4:
			element_size = 1;
			break;
		case 8:
			element_size = 2;
			break;
		case 16:
			element_size = 3;
			break;
		}

		switch (index_stride) {
		default:
			assert(!"Unsupported ring buffer index stride");
		case 0:
		case 8:
			index_stride = 0;
			break;
		case 16:
			index_stride = 1;
			break;
		case 32:
			index_stride = 2;
			break;
		case 64:
			index_stride = 3;
			break;
		}
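		/* Both switches map byte sizes to the log2-style hw field
		 * encodings: ELEMENT_SIZE 0..3 selects 2/4/8/16 bytes and
		 * INDEX_STRIDE 0..3 selects 8/16/32/64 bytes, with an input
		 * of 0 falling back to the smallest encoding. */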

		/* Set the descriptor. */
		uint32_t *desc = buffers->desc_data[slot];
		desc[0] = va;
		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			  S_008F04_STRIDE(stride) |
			  S_008F04_SWIZZLE_ENABLE(swizzle);
		desc[2] = num_records;
		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
			  S_008F0C_ELEMENT_SIZE(element_size) |
			  S_008F0C_INDEX_STRIDE(index_stride) |
			  S_008F0C_ADD_TID_ENABLE(add_tid);

		pipe_resource_reference(&buffers->buffers[slot], buffer);
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.dirty_mask |= 1llu << slot;
	si_update_descriptors(sctx, &buffers->desc);
}

/* STREAMOUT BUFFERS */

static void si_set_streamout_targets(struct pipe_context *ctx,
				     unsigned num_targets,
				     struct pipe_stream_output_target **targets,
				     const unsigned *offsets)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_buffer_resources *buffers = &sctx->rw_buffers[PIPE_SHADER_VERTEX];
	unsigned old_num_targets = sctx->b.streamout.num_targets;
	unsigned i, bufidx;

	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
	if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) {
		/* Since streamout uses vector writes which go through TC L2
		 * and most other clients can use TC L2 as well, we don't need
		 * to flush it.
		 *
		 * The only case which requires flushing it is VGT DMA index
		 * fetching, which is a rare case. Thus, flag the TC L2
		 * dirtiness in the resource and handle it when index fetching
		 * is used.
		 */
		for (i = 0; i < sctx->b.streamout.num_targets; i++)
			if (sctx->b.streamout.targets[i])
				r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

		/* Invalidate the scalar cache in case a streamout buffer is
		 * going to be used as a constant buffer.
		 *
		 * Invalidate TC L1, because streamout bypasses it (done by
		 * setting GLC=1 in the store instruction), but it can contain
		 * outdated data of streamout buffers.
		 *
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
				 SI_CONTEXT_INV_TC_L1 |
				 SI_CONTEXT_VS_PARTIAL_FLUSH;
	}

	/* Streamout buffers must be bound in 2 places:
	 * 1) in VGT by setting the VGT_STRMOUT registers
	 * 2) as shader resources
	 */

	/* Set the VGT regs. */
	r600_set_streamout_targets(ctx, num_targets, targets, offsets);

	/* Set the shader resources. */
	for (i = 0; i < num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;

		if (targets[i]) {
			struct pipe_resource *buffer = targets[i]->buffer;
			uint64_t va = r600_resource(buffer)->gpu_address;

			/* Set the descriptor. */
			uint32_t *desc = buffers->desc_data[bufidx];
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			desc[2] = 0xffffffff;
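			/* NUM_RECORDS is presumably left at the maximum here
			 * because the real streamout bounds are enforced by
			 * the VGT_STRMOUT registers programmed above. */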
			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

			/* Set the resource. */
			pipe_resource_reference(&buffers->buffers[bufidx],
						buffer);
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)buffer,
					      buffers->shader_usage, buffers->priority);
			buffers->desc.enabled_mask |= 1llu << bufidx;
		} else {
			/* Clear the descriptor and unset the resource. */
			memset(buffers->desc_data[bufidx], 0,
			       sizeof(uint32_t) * 4);
			pipe_resource_reference(&buffers->buffers[bufidx],
						NULL);
			buffers->desc.enabled_mask &= ~(1llu << bufidx);
		}
		buffers->desc.dirty_mask |= 1llu << bufidx;
	}
	for (; i < old_num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;
		/* Clear the descriptor and unset the resource. */
		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
		buffers->desc.enabled_mask &= ~(1llu << bufidx);
		buffers->desc.dirty_mask |= 1llu << bufidx;
	}

	si_update_descriptors(sctx, &buffers->desc);
}

static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
					uint32_t *desc, uint64_t old_buf_va,
					struct pipe_resource *new_buf)
{
	/* Retrieve the buffer offset from the descriptor. */
	uint64_t old_desc_va =
		desc[0] | ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);

	assert(old_buf_va <= old_desc_va);
	uint64_t offset_within_buffer = old_desc_va - old_buf_va;

	/* Update the descriptor. */
	uint64_t va = r600_resource(new_buf)->gpu_address + offset_within_buffer;

	desc[0] = va;
	desc[1] = (desc[1] & C_008F04_BASE_ADDRESS_HI) |
		  S_008F04_BASE_ADDRESS_HI(va >> 32);
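
	/* Example: a descriptor that pointed 256 bytes into the old buffer
	 * now points 256 bytes into new_buf; only the address words [0:1]
	 * change, the size and format words stay untouched. */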
}

/* BUFFER DISCARD/INVALIDATION */

/* Reallocate a buffer and update all resource bindings where the buffer is
 * bound.
 *
 * This is used to avoid CPU-GPU synchronizations, because it makes the buffer
 * idle by discarding its contents. Apps usually tell us when to do this using
 * map_buffer flags, for example.
 */
static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource *buf)
{
	struct si_context *sctx = (struct si_context*)ctx;
	struct r600_resource *rbuffer = r600_resource(buf);
	unsigned i, shader, alignment = rbuffer->buf->alignment;
	uint64_t old_va = rbuffer->gpu_address;
	unsigned num_elems = sctx->vertex_elements ?
			     sctx->vertex_elements->count : 0;
	struct si_sampler_view *view;

	/* Reallocate the buffer in the same pipe_resource. */
	r600_init_resource(&sctx->screen->b, rbuffer, rbuffer->b.b.width0,
			   alignment, TRUE);

	/* We changed the buffer, now we need to bind it where the old one
	 * was bound. This consists of 2 things:
	 * 1) Updating the resource descriptor and dirtying it.
	 * 2) Adding a relocation to the CS, so that it's usable.
	 */

	/* Vertex buffers. */
	for (i = 0; i < num_elems; i++) {
		int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;

		if (vb >= Elements(sctx->vertex_buffer))
			continue;
		if (!sctx->vertex_buffer[vb].buffer)
			continue;

		if (sctx->vertex_buffer[vb].buffer == buf) {
			sctx->vertex_buffers_dirty = true;
			break;
		}
	}

	/* Read/Write buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
		bool found = false;
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
							    old_va, buf);

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1llu << i;
				found = true;

				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
					/* Update the streamout state. */
					if (sctx->b.streamout.begin_emitted) {
						r600_emit_streamout_end(&sctx->b);
					}
					sctx->b.streamout.append_bitmask =
						sctx->b.streamout.enabled_mask;
					r600_streamout_buffers_dirty(&sctx->b);
				}
			}
		}
		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Constant buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
		bool found = false;
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
							    old_va, buf);

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1llu << i;
				found = true;
			}
		}
		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Texture buffers - update virtual addresses in sampler view descriptors. */
	LIST_FOR_EACH_ENTRY(view, &sctx->b.texture_buffers, list) {
		if (view->base.texture == buf) {
			si_desc_reset_buffer_offset(ctx, &view->state[4], old_va, buf);
		}
	}
	/* Texture buffers - update bindings. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_sampler_views *views = &sctx->samplers[shader].views;
		bool found = false;
		uint64_t mask = views->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (views->views[i]->texture == buf) {
				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, RADEON_USAGE_READ,
						      RADEON_PRIO_SHADER_BUFFER_RO);

				views->desc.dirty_mask |= 1llu << i;
				found = true;
			}
		}
		if (found) {
			si_update_descriptors(sctx, &views->desc);
		}
	}
}

/* CP DMA */

/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
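
/* BYTE_COUNT [20:0] tops out at (1 << 21) - 1; the limit is presumably
 * rounded down by 8 so that every full-size chunk keeps the remaining
 * size 8-byte aligned. Example: clearing 5 MiB (5242880 bytes) takes
 * three packets of 2097144, 2097144 and 1048592 bytes. */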

static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
			    unsigned offset, unsigned size, unsigned value,
			    bool is_framebuffer)
{
	struct si_context *sctx = (struct si_context*)ctx;
	unsigned flush_flags, tc_l2_flag;

	if (!size)
		return;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, offset,
		       offset + size);

	/* Fallback for unaligned clears. */
	if (offset % 4 != 0 || size % 4 != 0) {
		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
						      sctx->b.rings.gfx.cs,
						      PIPE_TRANSFER_WRITE);

		/* The mapping starts at the buffer base, so skip to "offset"
		 * and expand the 32-bit clear pattern byte by byte; this
		 * also handles a size that isn't dword-aligned. */
		map += offset;
		for (unsigned i = 0; i < size; i++)
			map[i] = (value >> (i % 4 * 8)) & 0xff;
		return;
	}

	uint64_t va = r600_resource(dst)->gpu_address + offset;

	/* Flush the caches where the resource is bound. */
	if (is_framebuffer) {
		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
		tc_l2_flag = 0;
	} else {
		flush_flags = SI_CONTEXT_INV_TC_L1 |
			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
			      SI_CONTEXT_INV_KCACHE;
		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
	}

	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			 flush_flags;

	while (size) {
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned dma_flags = tc_l2_flag;

		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
				 FALSE);

		/* This must be done after need_cs_space. */
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
				      RADEON_PRIO_MIN);

		/* Flush the caches for the first copy only.
		 * Also wait for the previous CP DMA operations. */
		if (sctx->b.flags) {
			si_emit_cache_flush(&sctx->b, NULL);
			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count)
			dma_flags |= R600_CP_DMA_SYNC;

		/* Emit the clear packet. */
		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);

		size -= byte_count;
		va += byte_count;
	}

	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
	sctx->b.flags |= flush_flags;

	if (tc_l2_flag)
		r600_resource(dst)->TC_L2_dirty = true;
}

void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
		    bool is_framebuffer)
{
	unsigned flush_flags, tc_l2_flag;

	if (!size)
		return;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	/* Flush the caches where the resource is bound. */
	if (is_framebuffer) {
		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
		tc_l2_flag = 0;
	} else {
		flush_flags = SI_CONTEXT_INV_TC_L1 |
			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
			      SI_CONTEXT_INV_KCACHE;
		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
	}

	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			 flush_flags;

	while (size) {
		unsigned sync_flags = tc_l2_flag;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);

		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);

		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
		if (sctx->b.flags) {
			si_emit_cache_flush(&sctx->b, NULL);
			sync_flags |= SI_CP_DMA_RAW_WAIT;
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync_flags |= R600_CP_DMA_SYNC;
		}

		/* This must be done after si_need_cs_space. */
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
				      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);

		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);

		size -= byte_count;
		src_offset += byte_count;
		dst_offset += byte_count;
	}

	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
	sctx->b.flags |= flush_flags;

	if (tc_l2_flag)
		r600_resource(dst)->TC_L2_dirty = true;
}

/* INIT/DEINIT */

void si_init_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
					 SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
					 i == PIPE_SHADER_VERTEX ?
					 SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS,
					 i, SI_SGPR_RW_BUFFERS,
					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);

		si_init_sampler_views(sctx, &sctx->samplers[i].views, i);

		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
				    si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4,
				    4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states);

		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
	}

	si_init_descriptors(sctx, &sctx->vertex_buffers,
			    si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
			    SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
			    si_emit_shader_pointer);
	sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;

	/* Set pipe_context functions. */
	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
	sctx->b.b.set_sampler_views = si_set_sampler_views;
	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
	sctx->b.clear_buffer = si_clear_buffer;
	sctx->b.invalidate_buffer = si_invalidate_buffer;
}

void si_release_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_release_buffer_resources(&sctx->const_buffers[i]);
		si_release_buffer_resources(&sctx->rw_buffers[i]);
		si_release_sampler_views(&sctx->samplers[i].views);
		si_release_descriptors(&sctx->samplers[i].states.desc);
	}
	si_release_descriptors(&sctx->vertex_buffers);
}

void si_all_descriptors_begin_new_cs(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
		si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
	}
	si_vertex_buffers_begin_new_cs(sctx);
}