Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Copyright 2012 Advanced Micro Devices, Inc. |
||
3 | * |
||
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
5 | * copy of this software and associated documentation files (the "Software"), |
||
6 | * to deal in the Software without restriction, including without limitation |
||
7 | * on the rights to use, copy, modify, merge, publish, distribute, sub |
||
8 | * license, and/or sell copies of the Software, and to permit persons to whom |
||
9 | * the Software is furnished to do so, subject to the following conditions: |
||
10 | * |
||
11 | * The above copyright notice and this permission notice (including the next |
||
12 | * paragraph) shall be included in all copies or substantial portions of the |
||
13 | * Software. |
||
14 | * |
||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
||
18 | * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
||
19 | * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
||
20 | * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
||
21 | * USE OR OTHER DEALINGS IN THE SOFTWARE. |
||
22 | * |
||
23 | * Authors: |
||
24 | * Christian König |
||
25 | */ |
||
26 | |||
27 | #include "si_pipe.h" |
||
28 | #include "si_shader.h" |
||
29 | #include "radeon/r600_cs.h" |
||
30 | #include "sid.h" |
||
31 | |||
32 | #include "util/u_index_modify.h" |
||
33 | #include "util/u_upload_mgr.h" |
||
34 | |||
35 | static void si_decompress_textures(struct si_context *sctx) |
||
36 | { |
||
37 | if (!sctx->blitter->running) { |
||
38 | /* Flush depth textures which need to be flushed. */ |
||
39 | for (int i = 0; i < SI_NUM_SHADERS; i++) { |
||
40 | if (sctx->samplers[i].depth_texture_mask) { |
||
41 | si_flush_depth_textures(sctx, &sctx->samplers[i]); |
||
42 | } |
||
43 | if (sctx->samplers[i].compressed_colortex_mask) { |
||
44 | si_decompress_color_textures(sctx, &sctx->samplers[i]); |
||
45 | } |
||
46 | } |
||
47 | } |
||
48 | } |
||
49 | |||
/**
 * Translate a gallium PIPE_PRIM_* primitive type into the hardware
 * VGT_PRIMITIVE_TYPE (DI_PT_*) encoding used by the draw packets.
 *
 * \param mode  a PIPE_PRIM_* value (or R600_PRIM_RECTANGLE_LIST)
 * \return      the matching V_008958_DI_PT_* register value
 */
static unsigned si_conv_pipe_prim(unsigned mode)
{
	static const unsigned prim_conv[] = {
		[PIPE_PRIM_POINTS]			= V_008958_DI_PT_POINTLIST,
		[PIPE_PRIM_LINES]			= V_008958_DI_PT_LINELIST,
		[PIPE_PRIM_LINE_LOOP]			= V_008958_DI_PT_LINELOOP,
		[PIPE_PRIM_LINE_STRIP]			= V_008958_DI_PT_LINESTRIP,
		[PIPE_PRIM_TRIANGLES]			= V_008958_DI_PT_TRILIST,
		[PIPE_PRIM_TRIANGLE_STRIP]		= V_008958_DI_PT_TRISTRIP,
		[PIPE_PRIM_TRIANGLE_FAN]		= V_008958_DI_PT_TRIFAN,
		[PIPE_PRIM_QUADS]			= V_008958_DI_PT_QUADLIST,
		[PIPE_PRIM_QUAD_STRIP]			= V_008958_DI_PT_QUADSTRIP,
		[PIPE_PRIM_POLYGON]			= V_008958_DI_PT_POLYGON,
		[PIPE_PRIM_LINES_ADJACENCY]		= V_008958_DI_PT_LINELIST_ADJ,
		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_008958_DI_PT_LINESTRIP_ADJ,
		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_008958_DI_PT_TRILIST_ADJ,
		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_008958_DI_PT_TRISTRIP_ADJ,
		[R600_PRIM_RECTANGLE_LIST]		= V_008958_DI_PT_RECTLIST
	};
	assert(mode < Elements(prim_conv));
	return prim_conv[mode];
}
||
72 | |||
73 | static unsigned si_conv_prim_to_gs_out(unsigned mode) |
||
74 | { |
||
75 | static const int prim_conv[] = { |
||
76 | [PIPE_PRIM_POINTS] = V_028A6C_OUTPRIM_TYPE_POINTLIST, |
||
77 | [PIPE_PRIM_LINES] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, |
||
78 | [PIPE_PRIM_LINE_LOOP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, |
||
79 | [PIPE_PRIM_LINE_STRIP] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, |
||
80 | [PIPE_PRIM_TRIANGLES] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, |
||
81 | [PIPE_PRIM_TRIANGLE_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, |
||
82 | [PIPE_PRIM_TRIANGLE_FAN] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, |
||
83 | [PIPE_PRIM_QUADS] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, |
||
84 | [PIPE_PRIM_QUAD_STRIP] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, |
||
85 | [PIPE_PRIM_POLYGON] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, |
||
86 | [PIPE_PRIM_LINES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, |
||
87 | [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, |
||
88 | [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, |
||
89 | [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, |
||
90 | [R600_PRIM_RECTANGLE_LIST] = V_028A6C_OUTPRIM_TYPE_TRISTRIP |
||
91 | }; |
||
92 | assert(mode < Elements(prim_conv)); |
||
93 | |||
94 | return prim_conv[mode]; |
||
95 | } |
||
96 | |||
/**
 * Compute IA_MULTI_VGT_PARAM for this draw call.
 *
 * Encodes SWITCH_ON_EOP, PARTIAL_VS_WAVE_ON, PRIMGROUP_SIZE and, on CIK+,
 * WD_SWITCH_ON_EOP, honoring hardware requirements and known errata
 * (e.g. the Hawaii instancing hang).
 */
static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
					  const struct pipe_draw_info *info)
{
	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
	unsigned prim = info->mode;
	unsigned primgroup_size = 128; /* recommended without a GS */

	/* SWITCH_ON_EOP(0) is always preferable. */
	bool wd_switch_on_eop = false;
	bool ia_switch_on_eop = false;
	bool partial_vs_wave = false;

	if (sctx->gs_shader)
		primgroup_size = 64; /* recommended with a GS */

	/* This is a hardware requirement. */
	if ((rs && rs->line_stipple_enable) ||
	    (sctx->b.screen->debug_flags & DBG_SWITCH_ON_EOP)) {
		ia_switch_on_eop = true;
		wd_switch_on_eop = true;
	}

	/* Streamout / generated-prim queries need the VS wave to finish
	 * before the next one starts, so counters stay in order. */
	if (sctx->b.streamout.streamout_enabled ||
	    sctx->b.streamout.prims_gen_query_enabled)
		partial_vs_wave = true;

	if (sctx->b.chip_class >= CIK) {
		/* WD_SWITCH_ON_EOP has no effect on GPUs with less than
		 * 4 shader engines. Set 1 to pass the assertion below.
		 * The other cases are hardware requirements. */
		if (sctx->b.screen->info.max_se < 4 ||
		    prim == PIPE_PRIM_POLYGON ||
		    prim == PIPE_PRIM_LINE_LOOP ||
		    prim == PIPE_PRIM_TRIANGLE_FAN ||
		    prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
		    info->primitive_restart)
			wd_switch_on_eop = true;

		/* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
		 * We don't know that for indirect drawing, so treat it as
		 * always problematic. */
		if (sctx->b.family == CHIP_HAWAII &&
		    (info->indirect || info->instance_count > 1))
			wd_switch_on_eop = true;

		/* If the WD switch is false, the IA switch must be false too. */
		assert(wd_switch_on_eop || !ia_switch_on_eop);
	}

	return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
	       S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
	       S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
	       S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0);
}
||
151 | |||
152 | static void si_emit_scratch_reloc(struct si_context *sctx) |
||
153 | { |
||
154 | struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; |
||
155 | |||
156 | if (!sctx->emit_scratch_reloc) |
||
157 | return; |
||
158 | |||
159 | r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, |
||
160 | sctx->spi_tmpring_size); |
||
161 | |||
162 | if (sctx->scratch_buffer) { |
||
163 | r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, |
||
164 | sctx->scratch_buffer, RADEON_USAGE_READWRITE, |
||
165 | RADEON_PRIO_SHADER_RESOURCE_RW); |
||
166 | |||
167 | } |
||
168 | sctx->emit_scratch_reloc = false; |
||
169 | } |
||
170 | |||
171 | /* rast_prim is the primitive type after GS. */ |
||
172 | static void si_emit_rasterizer_prim_state(struct si_context *sctx) |
||
173 | { |
||
174 | struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; |
||
175 | unsigned rast_prim = sctx->current_rast_prim; |
||
176 | struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer; |
||
177 | |||
178 | /* Skip this if not rendering lines. */ |
||
179 | if (rast_prim != PIPE_PRIM_LINES && |
||
180 | rast_prim != PIPE_PRIM_LINE_LOOP && |
||
181 | rast_prim != PIPE_PRIM_LINE_STRIP && |
||
182 | rast_prim != PIPE_PRIM_LINES_ADJACENCY && |
||
183 | rast_prim != PIPE_PRIM_LINE_STRIP_ADJACENCY) |
||
184 | return; |
||
185 | |||
186 | if (rast_prim == sctx->last_rast_prim && |
||
187 | rs->pa_sc_line_stipple == sctx->last_sc_line_stipple) |
||
188 | return; |
||
189 | |||
190 | r600_write_context_reg(cs, R_028A0C_PA_SC_LINE_STIPPLE, |
||
191 | rs->pa_sc_line_stipple | |
||
192 | S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : |
||
193 | rast_prim == PIPE_PRIM_LINE_STRIP ? 2 : 0)); |
||
194 | |||
195 | sctx->last_rast_prim = rast_prim; |
||
196 | sctx->last_sc_line_stipple = rs->pa_sc_line_stipple; |
||
197 | } |
||
198 | |||
/**
 * Emit per-draw VGT state: primitive type, IA_MULTI_VGT_PARAM,
 * GS output primitive type, and primitive-restart enable/index.
 * All writes are skipped when the cached ("last_*") value already matches.
 */
static void si_emit_draw_registers(struct si_context *sctx,
				   const struct pipe_draw_info *info)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	unsigned prim = si_conv_pipe_prim(info->mode);
	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
	unsigned ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info);

	/* Draw state. */
	if (prim != sctx->last_prim ||
	    ia_multi_vgt_param != sctx->last_multi_vgt_param) {
		if (sctx->b.chip_class >= CIK) {
			/* CIK+ bundles all three registers in one packet. */
			radeon_emit(cs, PKT3(PKT3_DRAW_PREAMBLE, 2, 0));
			radeon_emit(cs, prim); /* VGT_PRIMITIVE_TYPE */
			radeon_emit(cs, ia_multi_vgt_param); /* IA_MULTI_VGT_PARAM */
			radeon_emit(cs, 0); /* VGT_LS_HS_CONFIG */
		} else {
			/* SI: VGT_PRIMITIVE_TYPE is a config reg, not a context reg. */
			r600_write_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim);
			r600_write_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
		}
		sctx->last_prim = prim;
		sctx->last_multi_vgt_param = ia_multi_vgt_param;
	}

	if (gs_out_prim != sctx->last_gs_out_prim) {
		r600_write_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
		sctx->last_gs_out_prim = gs_out_prim;
	}

	/* Primitive restart. */
	if (info->primitive_restart != sctx->last_primitive_restart_en) {
		r600_write_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, info->primitive_restart);
		sctx->last_primitive_restart_en = info->primitive_restart;

		/* Re-emit the restart index if it changed, or if it has never
		 * been emitted (SI_RESTART_INDEX_UNKNOWN after a CS flush). */
		if (info->primitive_restart &&
		    (info->restart_index != sctx->last_restart_index ||
		     sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN)) {
			r600_write_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
					       info->restart_index);
			sctx->last_restart_index = info->restart_index;
		}
	}
}
||
242 | |||
/**
 * Emit the actual PM4 draw packets: index type, instance count, base
 * vertex / start instance SGPRs, then one of DRAW_INDEX_2,
 * DRAW_INDEX_INDIRECT, DRAW_INDIRECT or DRAW_INDEX_AUTO depending on
 * indexed/indirect/stream-output state.  The exact dword order of each
 * packet is mandated by the CP packet format.
 */
static void si_emit_draw_packets(struct si_context *sctx,
				 const struct pipe_draw_info *info,
				 const struct pipe_index_buffer *ib)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	/* With a GS the VS runs as an ES, so its user SGPRs live at a
	 * different register base. */
	unsigned sh_base_reg = (sctx->gs_shader ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
						  R_00B130_SPI_SHADER_USER_DATA_VS_0);

	if (info->count_from_stream_output) {
		/* DrawTransformFeedback: copy the buffer-filled size written by
		 * streamout into the register the VGT derives the count from. */
		struct r600_so_target *t =
			(struct r600_so_target*)info->count_from_stream_output;
		uint64_t va = t->buf_filled_size->gpu_address +
			      t->buf_filled_size_offset;

		r600_write_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
				       t->stride_in_dw);

		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
			    COPY_DATA_DST_SEL(COPY_DATA_REG) |
			    COPY_DATA_WR_CONFIRM);
		radeon_emit(cs, va);     /* src address lo */
		radeon_emit(cs, va >> 32); /* src address hi */
		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
		radeon_emit(cs, 0); /* unused */

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      t->buf_filled_size, RADEON_USAGE_READ,
				      RADEON_PRIO_MIN);
	}

	/* draw packet */
	if (info->indexed) {
		radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));

		/* On big-endian hosts the CP must byte-swap the index fetch. */
		if (ib->index_size == 4) {
			radeon_emit(cs, V_028A7C_VGT_INDEX_32 | (SI_BIG_ENDIAN ?
					V_028A7C_VGT_DMA_SWAP_32_BIT : 0));
		} else {
			radeon_emit(cs, V_028A7C_VGT_INDEX_16 | (SI_BIG_ENDIAN ?
					V_028A7C_VGT_DMA_SWAP_16_BIT : 0));
		}
	}

	if (!info->indirect) {
		int base_vertex;

		radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
		radeon_emit(cs, info->instance_count);

		/* Base vertex and start instance. */
		base_vertex = info->indexed ? info->index_bias : info->start;

		if (base_vertex != sctx->last_base_vertex ||
		    sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
		    info->start_instance != sctx->last_start_instance ||
		    sh_base_reg != sctx->last_sh_base_reg) {
			si_write_sh_reg_seq(cs, sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
			radeon_emit(cs, base_vertex);
			radeon_emit(cs, info->start_instance);

			sctx->last_base_vertex = base_vertex;
			sctx->last_start_instance = info->start_instance;
			sctx->last_sh_base_reg = sh_base_reg;
		}
	} else {
		/* Indirect draws write these SGPRs from the GPU side, so the
		 * cached CPU-side values are no longer trustworthy. */
		si_invalidate_draw_sh_constants(sctx);

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource *)info->indirect,
				      RADEON_USAGE_READ, RADEON_PRIO_MIN);
	}

	if (info->indexed) {
		uint32_t index_max_size = (ib->buffer->width0 - ib->offset) /
					  ib->index_size;
		uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;

		r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
				      (struct r600_resource *)ib->buffer,
				      RADEON_USAGE_READ, RADEON_PRIO_MIN);

		if (info->indirect) {
			uint64_t indirect_va = r600_resource(info->indirect)->gpu_address;

			/* Hardware alignment requirements for the addresses. */
			assert(indirect_va % 8 == 0);
			assert(index_va % 2 == 0);
			assert(info->indirect_offset % 4 == 0);

			radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
			radeon_emit(cs, 1);
			radeon_emit(cs, indirect_va);
			radeon_emit(cs, indirect_va >> 32);

			radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
			radeon_emit(cs, index_va);
			radeon_emit(cs, index_va >> 32);

			radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
			radeon_emit(cs, index_max_size);

			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing));
			radeon_emit(cs, info->indirect_offset);
			/* SGPR slots the CP patches with the indirect args: */
			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
			radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
		} else {
			index_va += info->start * ib->index_size;

			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing));
			radeon_emit(cs, index_max_size);
			radeon_emit(cs, index_va);
			radeon_emit(cs, (index_va >> 32UL) & 0xFF);
			radeon_emit(cs, info->count);
			radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
		}
	} else {
		if (info->indirect) {
			uint64_t indirect_va = r600_resource(info->indirect)->gpu_address;

			assert(indirect_va % 8 == 0);
			assert(info->indirect_offset % 4 == 0);

			radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
			radeon_emit(cs, 1);
			radeon_emit(cs, indirect_va);
			radeon_emit(cs, indirect_va >> 32);

			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing));
			radeon_emit(cs, info->indirect_offset);
			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
		} else {
			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing));
			radeon_emit(cs, info->count);
			/* USE_OPAQUE makes the VGT take the count from the
			 * register filled by the COPY_DATA above. */
			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
				    S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
		}
	}
}
||
384 | |||
385 | #define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE) |
||
386 | |||
387 | void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *atom) |
||
388 | { |
||
389 | struct radeon_winsys_cs *cs = sctx->rings.gfx.cs; |
||
390 | uint32_t cp_coher_cntl = 0; |
||
391 | uint32_t compute = |
||
392 | PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE)); |
||
393 | |||
394 | /* SI has a bug that it always flushes ICACHE and KCACHE if either |
||
395 | * bit is set. An alternative way is to write SQC_CACHES, but that |
||
396 | * doesn't seem to work reliably. Since the bug doesn't affect |
||
397 | * correctness (it only does more work than necessary) and |
||
398 | * the performance impact is likely negligible, there is no plan |
||
399 | * to fix it. |
||
400 | */ |
||
401 | |||
402 | if (sctx->flags & SI_CONTEXT_INV_ICACHE) |
||
403 | cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); |
||
404 | if (sctx->flags & SI_CONTEXT_INV_KCACHE) |
||
405 | cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); |
||
406 | |||
407 | if (sctx->flags & SI_CONTEXT_INV_TC_L1) |
||
408 | cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); |
||
409 | if (sctx->flags & SI_CONTEXT_INV_TC_L2) |
||
410 | cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); |
||
411 | |||
412 | if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) { |
||
413 | cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | |
||
414 | S_0085F0_CB0_DEST_BASE_ENA(1) | |
||
415 | S_0085F0_CB1_DEST_BASE_ENA(1) | |
||
416 | S_0085F0_CB2_DEST_BASE_ENA(1) | |
||
417 | S_0085F0_CB3_DEST_BASE_ENA(1) | |
||
418 | S_0085F0_CB4_DEST_BASE_ENA(1) | |
||
419 | S_0085F0_CB5_DEST_BASE_ENA(1) | |
||
420 | S_0085F0_CB6_DEST_BASE_ENA(1) | |
||
421 | S_0085F0_CB7_DEST_BASE_ENA(1); |
||
422 | } |
||
423 | if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB) { |
||
424 | cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | |
||
425 | S_0085F0_DB_DEST_BASE_ENA(1); |
||
426 | } |
||
427 | |||
428 | if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB_META) { |
||
429 | radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); |
||
430 | radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); |
||
431 | } |
||
432 | if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB_META) { |
||
433 | radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); |
||
434 | radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); |
||
435 | } |
||
436 | if (sctx->flags & SI_CONTEXT_FLUSH_WITH_INV_L2) { |
||
437 | radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); |
||
438 | radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH) | EVENT_INDEX(7) | |
||
439 | EVENT_WRITE_INV_L2); |
||
440 | } |
||
441 | |||
442 | /* FLUSH_AND_INV events must be emitted before PS_PARTIAL_FLUSH. |
||
443 | * Otherwise, clearing CMASK (CB meta) with CP DMA isn't reliable. |
||
444 | * |
||
445 | * I think the reason is that FLUSH_AND_INV is only added to a queue |
||
446 | * and it is PS_PARTIAL_FLUSH that waits for it to complete. |
||
447 | */ |
||
448 | if (sctx->flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { |
||
449 | radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); |
||
450 | radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); |
||
451 | } else if (sctx->flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { |
||
452 | radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); |
||
453 | radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); |
||
454 | } |
||
455 | if (sctx->flags & SI_CONTEXT_CS_PARTIAL_FLUSH) { |
||
456 | radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); |
||
457 | radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); |
||
458 | } |
||
459 | if (sctx->flags & SI_CONTEXT_VGT_FLUSH) { |
||
460 | radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); |
||
461 | radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); |
||
462 | } |
||
463 | if (sctx->flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) { |
||
464 | radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); |
||
465 | radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); |
||
466 | } |
||
467 | |||
468 | /* SURFACE_SYNC must be emitted after partial flushes. |
||
469 | * It looks like SURFACE_SYNC flushes caches immediately and doesn't |
||
470 | * wait for any engines. This should be last. |
||
471 | */ |
||
472 | if (cp_coher_cntl) { |
||
473 | if (sctx->chip_class >= CIK) { |
||
474 | radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) | compute); |
||
475 | radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ |
||
476 | radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ |
||
477 | radeon_emit(cs, 0xff); /* CP_COHER_SIZE_HI */ |
||
478 | radeon_emit(cs, 0); /* CP_COHER_BASE */ |
||
479 | radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ |
||
480 | radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ |
||
481 | } else { |
||
482 | radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0) | compute); |
||
483 | radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ |
||
484 | radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ |
||
485 | radeon_emit(cs, 0); /* CP_COHER_BASE */ |
||
486 | radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ |
||
487 | } |
||
488 | } |
||
489 | |||
490 | sctx->flags = 0; |
||
491 | } |
||
492 | |||
/* State atom that emits all queued cache flushes before a draw. */
const struct r600_atom si_atom_cache_flush = { si_emit_cache_flush, 24 }; /* number of CS dwords */
494 | |||
495 | static void si_get_draw_start_count(struct si_context *sctx, |
||
496 | const struct pipe_draw_info *info, |
||
497 | unsigned *start, unsigned *count) |
||
498 | { |
||
499 | if (info->indirect) { |
||
500 | struct r600_resource *indirect = |
||
501 | (struct r600_resource*)info->indirect; |
||
502 | int *data = r600_buffer_map_sync_with_rings(&sctx->b, |
||
503 | indirect, PIPE_TRANSFER_READ); |
||
504 | data += info->indirect_offset/sizeof(int); |
||
505 | *start = data[2]; |
||
506 | *count = data[0]; |
||
507 | } else { |
||
508 | *start = info->start; |
||
509 | *count = info->count; |
||
510 | } |
||
511 | } |
||
512 | |||
/**
 * The gallium draw_vbo entry point for radeonsi.
 *
 * Validates state, decompresses sampled textures, updates shaders and
 * vertex buffers, translates/uploads the index buffer when necessary,
 * emits all dirty state atoms, and finally emits the draw packets.
 */
void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct pipe_index_buffer ib = {};
	unsigned i;

	/* Reject empty draws (a zero count is still valid when the count
	 * comes from stream output on a non-indexed draw). */
	if (!info->count && !info->indirect &&
	    (info->indexed || !info->count_from_stream_output))
		return;

	/* Can't draw without a vertex and a pixel shader bound. */
	if (!sctx->ps_shader || !sctx->vs_shader)
		return;

	si_decompress_textures(sctx);

	/* Set the rasterization primitive type.
	 *
	 * This must be done after si_decompress_textures, which can call
	 * draw_vbo recursively, and before si_update_shaders, which uses
	 * current_rast_prim for this draw_vbo call. */
	if (sctx->gs_shader)
		sctx->current_rast_prim = sctx->gs_shader->gs_output_prim;
	else
		sctx->current_rast_prim = info->mode;

	si_update_shaders(sctx);

	if (sctx->vertex_buffers_dirty) {
		si_update_vertex_buffers(sctx);
		sctx->vertex_buffers_dirty = false;
	}

	if (info->indexed) {
		/* Initialize the index buffer struct. */
		pipe_resource_reference(&ib.buffer, sctx->index_buffer.buffer);
		ib.user_buffer = sctx->index_buffer.user_buffer;
		ib.index_size = sctx->index_buffer.index_size;
		ib.offset = sctx->index_buffer.offset;

		/* Translate or upload, if needed. */
		if (ib.index_size == 1) {
			/* The hardware has no 8-bit index support: widen the
			 * indices to 16 bits in an upload buffer. */
			struct pipe_resource *out_buffer = NULL;
			unsigned out_offset, start, count, start_offset;
			void *ptr;

			si_get_draw_start_count(sctx, info, &start, &count);
			start_offset = start * ib.index_size;

			u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
				       &out_offset, &out_buffer, &ptr);

			util_shorten_ubyte_elts_to_userptr(&sctx->b.b, &ib, 0,
							   ib.offset + start_offset,
							   count, ptr);

			pipe_resource_reference(&ib.buffer, NULL);
			ib.user_buffer = NULL;
			ib.buffer = out_buffer;
			/* info->start will be added by the drawing code */
			ib.offset = out_offset - start_offset;
			ib.index_size = 2;
		} else if (ib.user_buffer && !ib.buffer) {
			/* Client-memory indices: upload just the used range. */
			unsigned start, count, start_offset;

			si_get_draw_start_count(sctx, info, &start, &count);
			start_offset = start * ib.index_size;

			u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
				      (char*)ib.user_buffer + start_offset,
				      &ib.offset, &ib.buffer);
			/* info->start will be added by the drawing code */
			ib.offset -= start_offset;
		}
	}

	/* The index buffer is fetched through TC L2; invalidate it if the
	 * buffer was written through a path that left L2 stale. */
	if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) {
		sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
		r600_resource(ib.buffer)->TC_L2_dirty = false;
	}

	/* Check flush flags. */
	if (sctx->b.flags)
		sctx->atoms.s.cache_flush->dirty = true;

	si_need_cs_space(sctx, 0, TRUE);

	/* Emit states. */
	for (i = 0; i < SI_NUM_ATOMS(sctx); i++) {
		if (sctx->atoms.array[i]->dirty) {
			sctx->atoms.array[i]->emit(&sctx->b, sctx->atoms.array[i]);
			sctx->atoms.array[i]->dirty = false;
		}
	}

	si_pm4_emit_dirty(sctx);
	si_emit_scratch_reloc(sctx);
	si_emit_rasterizer_prim_state(sctx);
	si_emit_draw_registers(sctx, info);
	si_emit_draw_packets(sctx, info, &ib);

#if SI_TRACE_CS
	if (sctx->screen->b.trace_bo) {
		si_trace_emit(sctx);
	}
#endif

	/* Workaround for a VGT hang when streamout is enabled.
	 * It must be done after drawing. */
	if (sctx->b.family == CHIP_HAWAII &&
	    (sctx->b.streamout.streamout_enabled ||
	     sctx->b.streamout.prims_gen_query_enabled)) {
		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
	}

	/* Set the depth buffer as dirty. */
	if (sctx->framebuffer.state.zsbuf) {
		struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
		struct r600_texture *rtex = (struct r600_texture *)surf->texture;

		rtex->dirty_level_mask |= 1 << surf->u.tex.level;
	}
	/* Mark every compressed color buffer level we rendered to as dirty,
	 * so it gets decompressed before it is sampled later. */
	if (sctx->framebuffer.compressed_cb_mask) {
		struct pipe_surface *surf;
		struct r600_texture *rtex;
		unsigned mask = sctx->framebuffer.compressed_cb_mask;

		do {
			unsigned i = u_bit_scan(&mask);
			surf = sctx->framebuffer.state.cbufs[i];
			rtex = (struct r600_texture*)surf->texture;

			rtex->dirty_level_mask |= 1 << surf->u.tex.level;
		} while (mask);
	}

	pipe_resource_reference(&ib.buffer, NULL);
	sctx->b.num_draw_calls++;
}
||
651 | |||
#if SI_TRACE_CS
/**
 * Debug aid: write the current CS dword offset and a per-screen CS counter
 * into the trace buffer, so a GPU hang can be narrowed down to the last
 * packet the CP actually executed.
 */
void si_trace_emit(struct si_context *sctx)
{
	struct si_screen *sscreen = sctx->screen;
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va;

	va = sscreen->b.trace_bo->gpu_address;
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, sscreen->b.trace_bo,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_MIN);
	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, 0));
	radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
		    PKT3_WRITE_DATA_WR_CONFIRM |
		    PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
	radeon_emit(cs, va & 0xFFFFFFFFUL);
	radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
	radeon_emit(cs, cs->cdw);
	radeon_emit(cs, sscreen->b.cs_count);
}
#endif