Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Copyright © 2011 Intel Corporation |
||
3 | * |
||
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
5 | * copy of this software and associated documentation files (the "Software"), |
||
6 | * to deal in the Software without restriction, including without limitation |
||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||
8 | * and/or sell copies of the Software, and to permit persons to whom the |
||
9 | * Software is furnished to do so, subject to the following conditions: |
||
10 | * |
||
11 | * The above copyright notice and this permission notice (including the next |
||
12 | * paragraph) shall be included in all copies or substantial portions of the |
||
13 | * Software. |
||
14 | * |
||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
||
21 | * IN THE SOFTWARE. |
||
22 | */ |
||
23 | |||
24 | #include "main/macros.h" |
||
25 | #include "intel_batchbuffer.h" |
||
26 | #include "brw_context.h" |
||
27 | #include "brw_state.h" |
||
28 | #include "brw_defines.h" |
||
29 | |||
30 | /** |
||
31 | * The following diagram shows how we partition the URB: |
||
32 | * |
||
33 | * 16kB or 32kB Rest of the URB space |
||
34 | * __________-__________ _________________-_________________ |
||
35 | * / \ / \ |
||
36 | * +-------------------------------------------------------------+ |
||
37 | * | VS/FS/GS Push | VS/GS URB | |
||
38 | * | Constants | Entries | |
||
39 | * +-------------------------------------------------------------+ |
||
40 | * |
||
41 | * Notably, push constants must be stored at the beginning of the URB |
||
42 | * space, while entries can be stored anywhere. Ivybridge and Haswell |
||
43 | * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3 |
||
44 | * doubles this (32kB). |
||
45 | * |
||
46 | * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and |
||
47 | * sized) in increments of 1kB. Haswell GT3 requires them to be located and |
||
48 | * sized in increments of 2kB. |
||
49 | * |
||
50 | * Currently we split the constant buffer space evenly among whatever stages |
||
51 | * are active. This is probably not ideal, but simple. |
||
52 | * |
||
53 | * Ivybridge GT1 and Haswell GT1 have 128kB of URB space. |
||
54 | * Ivybridge GT2 and Haswell GT2 have 256kB of URB space. |
||
55 | * Haswell GT3 has 512kB of URB space. |
||
56 | * |
||
57 | * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations", |
||
58 | * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS. |
||
59 | */ |
||
60 | static void |
||
61 | gen7_allocate_push_constants(struct brw_context *brw) |
||
62 | { |
||
63 | unsigned avail_size = 16; |
||
64 | unsigned multiplier = |
||
65 | (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1; |
||
66 | |||
67 | /* BRW_NEW_GEOMETRY_PROGRAM */ |
||
68 | bool gs_present = brw->geometry_program; |
||
69 | |||
70 | unsigned vs_size, gs_size; |
||
71 | if (gs_present) { |
||
72 | vs_size = avail_size / 3; |
||
73 | avail_size -= vs_size; |
||
74 | gs_size = avail_size / 2; |
||
75 | avail_size -= gs_size; |
||
76 | } else { |
||
77 | vs_size = avail_size / 2; |
||
78 | avail_size -= vs_size; |
||
79 | gs_size = 0; |
||
80 | } |
||
81 | unsigned fs_size = avail_size; |
||
82 | |||
83 | gen7_emit_push_constant_state(brw, multiplier * vs_size, |
||
84 | multiplier * gs_size, multiplier * fs_size); |
||
85 | |||
86 | /* From p115 of the Ivy Bridge PRM (3.2.1.4 3DSTATE_PUSH_CONSTANT_ALLOC_VS): |
||
87 | * |
||
88 | * Programming Restriction: |
||
89 | * |
||
90 | * The 3DSTATE_CONSTANT_VS must be reprogrammed prior to the next |
||
91 | * 3DPRIMITIVE command after programming the |
||
92 | * 3DSTATE_PUSH_CONSTANT_ALLOC_VS. |
||
93 | * |
||
94 | * Similar text exists for the other 3DSTATE_PUSH_CONSTANT_ALLOC_* |
||
95 | * commands. |
||
96 | */ |
||
97 | brw->ctx.NewDriverState |= BRW_NEW_PUSH_CONSTANT_ALLOCATION; |
||
98 | } |
||
99 | |||
100 | void |
||
101 | gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, |
||
102 | unsigned gs_size, unsigned fs_size) |
||
103 | { |
||
104 | unsigned offset = 0; |
||
105 | |||
106 | BEGIN_BATCH(6); |
||
107 | OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2)); |
||
108 | OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); |
||
109 | offset += vs_size; |
||
110 | |||
111 | OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2)); |
||
112 | OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); |
||
113 | offset += gs_size; |
||
114 | |||
115 | OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_PS << 16 | (2 - 2)); |
||
116 | OUT_BATCH(fs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); |
||
117 | ADVANCE_BATCH(); |
||
118 | |||
119 | /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS): |
||
120 | * |
||
121 | * A PIPE_CONTOL command with the CS Stall bit set must be programmed |
||
122 | * in the ring after this instruction. |
||
123 | * |
||
124 | * No such restriction exists for Haswell or Baytrail. |
||
125 | */ |
||
126 | if (brw->gen < 8 && !brw->is_haswell && !brw->is_baytrail) |
||
127 | gen7_emit_cs_stall_flush(brw); |
||
128 | } |
||
129 | |||
130 | const struct brw_tracked_state gen7_push_constant_space = { |
||
131 | .dirty = { |
||
132 | .mesa = 0, |
||
133 | .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM, |
||
134 | }, |
||
135 | .emit = gen7_allocate_push_constants, |
||
136 | }; |
||
137 | |||
138 | static void |
||
139 | gen7_upload_urb(struct brw_context *brw) |
||
140 | { |
||
141 | const int push_size_kB = |
||
142 | (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16; |
||
143 | |||
144 | /* BRW_NEW_VS_PROG_DATA */ |
||
145 | unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1); |
||
146 | unsigned vs_entry_size_bytes = vs_size * 64; |
||
147 | /* BRW_NEW_GEOMETRY_PROGRAM, BRW_NEW_GS_PROG_DATA */ |
||
148 | bool gs_present = brw->geometry_program; |
||
149 | unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1; |
||
150 | unsigned gs_entry_size_bytes = gs_size * 64; |
||
151 | |||
152 | /* If we're just switching between programs with the same URB requirements, |
||
153 | * skip the rest of the logic. |
||
154 | */ |
||
155 | if (!(brw->ctx.NewDriverState & BRW_NEW_CONTEXT) && |
||
156 | brw->urb.vsize == vs_size && |
||
157 | brw->urb.gs_present == gs_present && |
||
158 | brw->urb.gsize == gs_size) { |
||
159 | return; |
||
160 | } |
||
161 | brw->urb.vsize = vs_size; |
||
162 | brw->urb.gs_present = gs_present; |
||
163 | brw->urb.gsize = gs_size; |
||
164 | |||
165 | /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): |
||
166 | * |
||
167 | * VS Number of URB Entries must be divisible by 8 if the VS URB Entry |
||
168 | * Allocation Size is less than 9 512-bit URB entries. |
||
169 | * |
||
170 | * Similar text exists for GS. |
||
171 | */ |
||
172 | unsigned vs_granularity = (vs_size < 9) ? 8 : 1; |
||
173 | unsigned gs_granularity = (gs_size < 9) ? 8 : 1; |
||
174 | |||
175 | /* URB allocations must be done in 8k chunks. */ |
||
176 | unsigned chunk_size_bytes = 8192; |
||
177 | |||
178 | /* Determine the size of the URB in chunks. |
||
179 | */ |
||
180 | unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes; |
||
181 | |||
182 | /* Reserve space for push constants */ |
||
183 | unsigned push_constant_bytes = 1024 * push_size_kB; |
||
184 | unsigned push_constant_chunks = |
||
185 | push_constant_bytes / chunk_size_bytes; |
||
186 | |||
187 | /* Initially, assign each stage the minimum amount of URB space it needs, |
||
188 | * and make a note of how much additional space it "wants" (the amount of |
||
189 | * additional space it could actually make use of). |
||
190 | */ |
||
191 | |||
192 | /* VS has a lower limit on the number of URB entries */ |
||
193 | unsigned vs_chunks = |
||
194 | ALIGN(brw->urb.min_vs_entries * vs_entry_size_bytes, chunk_size_bytes) / |
||
195 | chunk_size_bytes; |
||
196 | unsigned vs_wants = |
||
197 | ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes, |
||
198 | chunk_size_bytes) / chunk_size_bytes - vs_chunks; |
||
199 | |||
200 | unsigned gs_chunks = 0; |
||
201 | unsigned gs_wants = 0; |
||
202 | if (gs_present) { |
||
203 | /* There are two constraints on the minimum amount of URB space we can |
||
204 | * allocate: |
||
205 | * |
||
206 | * (1) We need room for at least 2 URB entries, since we always operate |
||
207 | * the GS in DUAL_OBJECT mode. |
||
208 | * |
||
209 | * (2) We can't allocate less than nr_gs_entries_granularity. |
||
210 | */ |
||
211 | gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes, |
||
212 | chunk_size_bytes) / chunk_size_bytes; |
||
213 | gs_wants = |
||
214 | ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes, |
||
215 | chunk_size_bytes) / chunk_size_bytes - gs_chunks; |
||
216 | } |
||
217 | |||
218 | /* There should always be enough URB space to satisfy the minimum |
||
219 | * requirements of each stage. |
||
220 | */ |
||
221 | unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks; |
||
222 | assert(total_needs <= urb_chunks); |
||
223 | |||
224 | /* Mete out remaining space (if any) in proportion to "wants". */ |
||
225 | unsigned total_wants = vs_wants + gs_wants; |
||
226 | unsigned remaining_space = urb_chunks - total_needs; |
||
227 | if (remaining_space > total_wants) |
||
228 | remaining_space = total_wants; |
||
229 | if (remaining_space > 0) { |
||
230 | unsigned vs_additional = (unsigned) |
||
231 | round(vs_wants * (((double) remaining_space) / total_wants)); |
||
232 | vs_chunks += vs_additional; |
||
233 | remaining_space -= vs_additional; |
||
234 | gs_chunks += remaining_space; |
||
235 | } |
||
236 | |||
237 | /* Sanity check that we haven't over-allocated. */ |
||
238 | assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks); |
||
239 | |||
240 | /* Finally, compute the number of entries that can fit in the space |
||
241 | * allocated to each stage. |
||
242 | */ |
||
243 | unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes; |
||
244 | unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes; |
||
245 | |||
246 | /* Since we rounded up when computing *_wants, this may be slightly more |
||
247 | * than the maximum allowed amount, so correct for that. |
||
248 | */ |
||
249 | nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries); |
||
250 | nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries); |
||
251 | |||
252 | /* Ensure that we program a multiple of the granularity. */ |
||
253 | nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity); |
||
254 | nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity); |
||
255 | |||
256 | /* Finally, sanity check to make sure we have at least the minimum number |
||
257 | * of entries needed for each stage. |
||
258 | */ |
||
259 | assert(nr_vs_entries >= brw->urb.min_vs_entries); |
||
260 | if (gs_present) |
||
261 | assert(nr_gs_entries >= 2); |
||
262 | |||
263 | /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems |
||
264 | * better to put reasonable data in there rather than leave them |
||
265 | * uninitialized. |
||
266 | */ |
||
267 | brw->urb.nr_vs_entries = nr_vs_entries; |
||
268 | brw->urb.nr_gs_entries = nr_gs_entries; |
||
269 | |||
270 | /* Lay out the URB in the following order: |
||
271 | * - push constants |
||
272 | * - VS |
||
273 | * - GS |
||
274 | */ |
||
275 | brw->urb.vs_start = push_constant_chunks; |
||
276 | brw->urb.gs_start = push_constant_chunks + vs_chunks; |
||
277 | |||
278 | if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail) |
||
279 | gen7_emit_vs_workaround_flush(brw); |
||
280 | gen7_emit_urb_state(brw, |
||
281 | brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start, |
||
282 | brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start); |
||
283 | } |
||
284 | |||
285 | void |
||
286 | gen7_emit_urb_state(struct brw_context *brw, |
||
287 | unsigned nr_vs_entries, unsigned vs_size, |
||
288 | unsigned vs_start, unsigned nr_gs_entries, |
||
289 | unsigned gs_size, unsigned gs_start) |
||
290 | { |
||
291 | BEGIN_BATCH(8); |
||
292 | OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2)); |
||
293 | OUT_BATCH(nr_vs_entries | |
||
294 | ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | |
||
295 | (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); |
||
296 | |||
297 | OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2)); |
||
298 | OUT_BATCH(nr_gs_entries | |
||
299 | ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | |
||
300 | (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); |
||
301 | |||
302 | /* Allocate the HS and DS zero space - we don't use them. */ |
||
303 | OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2)); |
||
304 | OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | |
||
305 | (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); |
||
306 | |||
307 | OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2)); |
||
308 | OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | |
||
309 | (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); |
||
310 | ADVANCE_BATCH(); |
||
311 | } |
||
312 | |||
313 | const struct brw_tracked_state gen7_urb = { |
||
314 | .dirty = { |
||
315 | .mesa = 0, |
||
316 | .brw = BRW_NEW_CONTEXT | |
||
317 | BRW_NEW_GEOMETRY_PROGRAM | |
||
318 | BRW_NEW_GS_PROG_DATA | |
||
319 | BRW_NEW_VS_PROG_DATA, |
||
320 | }, |
||
321 | .emit = gen7_upload_urb, |
||
322 | };><>><>><>><>><>><>><>><>><>><>><>><>=>=>>>>><>><>><>><>><>><> |