Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4358 | Serge | 1 | /* |
2 | * Copyright © 2008 Intel Corporation |
||
3 | * |
||
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
5 | * copy of this software and associated documentation files (the "Software"), |
||
6 | * to deal in the Software without restriction, including without limitation |
||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||
8 | * and/or sell copies of the Software, and to permit persons to whom the |
||
9 | * Software is furnished to do so, subject to the following conditions: |
||
10 | * |
||
11 | * The above copyright notice and this permission notice (including the next |
||
12 | * paragraph) shall be included in all copies or substantial portions of the |
||
13 | * Software. |
||
14 | * |
||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
||
21 | * IN THE SOFTWARE. |
||
22 | * |
||
23 | * Authors: |
||
24 | * Eric Anholt |
||
25 | * Kenneth Graunke |
||
26 | */ |
||
27 | |||
28 | /** @file gen6_queryobj.c |
||
29 | * |
||
30 | * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query, |
||
31 | * GL_EXT_transform_feedback, and friends) on platforms that support |
||
32 | * hardware contexts (Gen6+). |
||
33 | */ |
||
34 | #include "main/imports.h" |
||
35 | |||
36 | #include "brw_context.h" |
||
37 | #include "brw_defines.h" |
||
38 | #include "brw_state.h" |
||
39 | #include "intel_batchbuffer.h" |
||
40 | #include "intel_reg.h" |
||
41 | |||
42 | /** |
||
43 | * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. |
||
44 | */ |
||
45 | static void |
||
46 | write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx) |
||
47 | { |
||
48 | /* Emit workaround flushes: */ |
||
49 | if (brw->gen == 6) { |
||
50 | /* The timestamp write below is a non-zero post-sync op, which on |
||
51 | * Gen6 necessitates a CS stall. CS stalls need stall at scoreboard |
||
52 | * set. See the comments for intel_emit_post_sync_nonzero_flush(). |
||
53 | */ |
||
54 | BEGIN_BATCH(4); |
||
55 | OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2)); |
||
56 | OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD); |
||
57 | OUT_BATCH(0); |
||
58 | OUT_BATCH(0); |
||
59 | ADVANCE_BATCH(); |
||
60 | } |
||
61 | |||
62 | BEGIN_BATCH(5); |
||
63 | OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); |
||
64 | OUT_BATCH(PIPE_CONTROL_WRITE_TIMESTAMP); |
||
65 | OUT_RELOC(query_bo, |
||
66 | I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, |
||
67 | PIPE_CONTROL_GLOBAL_GTT_WRITE | |
||
68 | idx * sizeof(uint64_t)); |
||
69 | OUT_BATCH(0); |
||
70 | OUT_BATCH(0); |
||
71 | ADVANCE_BATCH(); |
||
72 | } |
||
73 | |||
74 | /** |
||
75 | * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer. |
||
76 | */ |
||
77 | static void |
||
78 | write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx) |
||
79 | { |
||
80 | /* Emit Sandybridge workaround flush: */ |
||
81 | if (brw->gen == 6) |
||
82 | intel_emit_post_sync_nonzero_flush(brw); |
||
83 | |||
84 | BEGIN_BATCH(5); |
||
85 | OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); |
||
86 | OUT_BATCH(PIPE_CONTROL_DEPTH_STALL | |
||
87 | PIPE_CONTROL_WRITE_DEPTH_COUNT); |
||
88 | OUT_RELOC(query_bo, |
||
89 | I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, |
||
90 | PIPE_CONTROL_GLOBAL_GTT_WRITE | |
||
91 | (idx * sizeof(uint64_t))); |
||
92 | OUT_BATCH(0); |
||
93 | OUT_BATCH(0); |
||
94 | ADVANCE_BATCH(); |
||
95 | } |
||
96 | |||
97 | /* |
||
98 | * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM. |
||
99 | * |
||
100 | * Only TIMESTAMP and PS_DEPTH_COUNT have special PIPE_CONTROL support; other |
||
101 | * counters have to be read via the generic MI_STORE_REGISTER_MEM. This |
||
102 | * function also performs a pipeline flush for proper synchronization. |
||
103 | */ |
||
104 | static void |
||
105 | write_reg(struct brw_context *brw, |
||
106 | drm_intel_bo *query_bo, uint32_t reg, int idx) |
||
107 | { |
||
108 | assert(brw->gen >= 6); |
||
109 | |||
110 | intel_batchbuffer_emit_mi_flush(brw); |
||
111 | |||
112 | /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to |
||
113 | * read a full 64-bit register, we need to do two of them. |
||
114 | */ |
||
115 | BEGIN_BATCH(3); |
||
116 | OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); |
||
117 | OUT_BATCH(reg); |
||
118 | OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, |
||
119 | idx * sizeof(uint64_t)); |
||
120 | ADVANCE_BATCH(); |
||
121 | |||
122 | BEGIN_BATCH(3); |
||
123 | OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2)); |
||
124 | OUT_BATCH(reg + sizeof(uint32_t)); |
||
125 | OUT_RELOC(query_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, |
||
126 | sizeof(uint32_t) + idx * sizeof(uint64_t)); |
||
127 | ADVANCE_BATCH(); |
||
128 | } |
||
129 | |||
130 | static void |
||
131 | write_primitives_generated(struct brw_context *brw, |
||
132 | drm_intel_bo *query_bo, int idx) |
||
133 | { |
||
134 | write_reg(brw, query_bo, CL_INVOCATION_COUNT, idx); |
||
135 | } |
||
136 | |||
137 | static void |
||
138 | write_xfb_primitives_written(struct brw_context *brw, |
||
139 | drm_intel_bo *query_bo, int idx) |
||
140 | { |
||
141 | if (brw->gen >= 7) { |
||
142 | write_reg(brw, query_bo, SO_NUM_PRIMS_WRITTEN0_IVB, idx); |
||
143 | } else { |
||
144 | write_reg(brw, query_bo, SO_NUM_PRIMS_WRITTEN, idx); |
||
145 | } |
||
146 | } |
||
147 | |||
148 | /** |
||
149 | * Wait on the query object's BO and calculate the final result. |
||
150 | */ |
||
151 | static void |
||
152 | gen6_queryobj_get_results(struct gl_context *ctx, |
||
153 | struct brw_query_object *query) |
||
154 | { |
||
155 | struct brw_context *brw = brw_context(ctx); |
||
156 | |||
157 | if (query->bo == NULL) |
||
158 | return; |
||
159 | |||
160 | /* If the application has requested the query result, but this batch is |
||
161 | * still contributing to it, flush it now so the results will be present |
||
162 | * when mapped. |
||
163 | */ |
||
164 | if (drm_intel_bo_references(brw->batch.bo, query->bo)) |
||
165 | intel_batchbuffer_flush(brw); |
||
166 | |||
167 | if (unlikely(brw->perf_debug)) { |
||
168 | if (drm_intel_bo_busy(query->bo)) { |
||
169 | perf_debug("Stalling on the GPU waiting for a query object.\n"); |
||
170 | } |
||
171 | } |
||
172 | |||
173 | drm_intel_bo_map(query->bo, false); |
||
174 | uint64_t *results = query->bo->virtual; |
||
175 | switch (query->Base.Target) { |
||
176 | case GL_TIME_ELAPSED: |
||
177 | /* The query BO contains the starting and ending timestamps. |
||
178 | * Subtract the two and convert to nanoseconds. |
||
179 | */ |
||
180 | query->Base.Result += 80 * (results[1] - results[0]); |
||
181 | break; |
||
182 | |||
183 | case GL_TIMESTAMP: |
||
184 | /* Our timer is a clock that increments every 80ns (regardless of |
||
185 | * other clock scaling in the system). The timestamp register we can |
||
186 | * read for glGetTimestamp() masks out the top 32 bits, so we do that |
||
187 | * here too to let the two counters be compared against each other. |
||
188 | * |
||
189 | * If we just multiplied that 32 bits of data by 80, it would roll |
||
190 | * over at a non-power-of-two, so an application couldn't use |
||
191 | * GL_QUERY_COUNTER_BITS to handle rollover correctly. Instead, we |
||
192 | * report 36 bits and truncate at that (rolling over 5 times as often |
||
193 | * as the HW counter), and when the 32-bit counter rolls over, it |
||
194 | * happens to also be at a rollover in the reported value from near |
||
195 | * (1<<36) to 0. |
||
196 | * |
||
197 | * The low 32 bits rolls over in ~343 seconds. Our 36-bit result |
||
198 | * rolls over every ~69 seconds. |
||
199 | * |
||
200 | * The query BO contains a single timestamp value in results[0]. |
||
201 | */ |
||
202 | query->Base.Result = 80 * (results[0] & 0xffffffff); |
||
203 | query->Base.Result &= (1ull << 36) - 1; |
||
204 | break; |
||
205 | |||
206 | case GL_SAMPLES_PASSED_ARB: |
||
207 | /* We need to use += rather than = here since some BLT-based operations |
||
208 | * may have added additional samples to our occlusion query value. |
||
209 | */ |
||
210 | query->Base.Result += results[1] - results[0]; |
||
211 | break; |
||
212 | |||
213 | case GL_ANY_SAMPLES_PASSED: |
||
214 | case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: |
||
215 | if (results[0] != results[1]) |
||
216 | query->Base.Result = true; |
||
217 | break; |
||
218 | |||
219 | case GL_PRIMITIVES_GENERATED: |
||
220 | case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: |
||
221 | query->Base.Result = results[1] - results[0]; |
||
222 | break; |
||
223 | |||
224 | default: |
||
225 | assert(!"Unrecognized query target in brw_queryobj_get_results()"); |
||
226 | break; |
||
227 | } |
||
228 | drm_intel_bo_unmap(query->bo); |
||
229 | |||
230 | /* Now that we've processed the data stored in the query's buffer object, |
||
231 | * we can release it. |
||
232 | */ |
||
233 | drm_intel_bo_unreference(query->bo); |
||
234 | query->bo = NULL; |
||
235 | } |
||
236 | |||
237 | /** |
||
238 | * Driver hook for glBeginQuery(). |
||
239 | * |
||
240 | * Initializes driver structures and emits any GPU commands required to begin |
||
241 | * recording data for the query. |
||
242 | */ |
||
243 | static void |
||
244 | gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q) |
||
245 | { |
||
246 | struct brw_context *brw = brw_context(ctx); |
||
247 | struct brw_query_object *query = (struct brw_query_object *)q; |
||
248 | |||
249 | /* Since we're starting a new query, we need to throw away old results. */ |
||
250 | drm_intel_bo_unreference(query->bo); |
||
251 | query->bo = drm_intel_bo_alloc(brw->bufmgr, "query results", 4096, 4096); |
||
252 | |||
253 | switch (query->Base.Target) { |
||
254 | case GL_TIME_ELAPSED: |
||
255 | /* For timestamp queries, we record the starting time right away so that |
||
256 | * we measure the full time between BeginQuery and EndQuery. There's |
||
257 | * some debate about whether this is the right thing to do. Our decision |
||
258 | * is based on the following text from the ARB_timer_query extension: |
||
259 | * |
||
260 | * "(5) Should the extension measure total time elapsed between the full |
||
261 | * completion of the BeginQuery and EndQuery commands, or just time |
||
262 | * spent in the graphics library? |
||
263 | * |
||
264 | * RESOLVED: This extension will measure the total time elapsed |
||
265 | * between the full completion of these commands. Future extensions |
||
266 | * may implement a query to determine time elapsed at different stages |
||
267 | * of the graphics pipeline." |
||
268 | * |
||
269 | * We write a starting timestamp now (at index 0). At EndQuery() time, |
||
270 | * we'll write a second timestamp (at index 1), and subtract the two to |
||
271 | * obtain the time elapsed. Notably, this includes time elapsed while |
||
272 | * the system was doing other work, such as running other applications. |
||
273 | */ |
||
274 | write_timestamp(brw, query->bo, 0); |
||
275 | break; |
||
276 | |||
277 | case GL_ANY_SAMPLES_PASSED: |
||
278 | case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: |
||
279 | case GL_SAMPLES_PASSED_ARB: |
||
280 | write_depth_count(brw, query->bo, 0); |
||
281 | break; |
||
282 | |||
283 | case GL_PRIMITIVES_GENERATED: |
||
284 | write_primitives_generated(brw, query->bo, 0); |
||
285 | break; |
||
286 | |||
287 | case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: |
||
288 | write_xfb_primitives_written(brw, query->bo, 0); |
||
289 | break; |
||
290 | |||
291 | default: |
||
292 | assert(!"Unrecognized query target in brw_begin_query()"); |
||
293 | break; |
||
294 | } |
||
295 | } |
||
296 | |||
297 | /** |
||
298 | * Driver hook for glEndQuery(). |
||
299 | * |
||
300 | * Emits GPU commands to record a final query value, ending any data capturing. |
||
301 | * However, the final result isn't necessarily available until the GPU processes |
||
302 | * those commands. brw_queryobj_get_results() processes the captured data to |
||
303 | * produce the final result. |
||
304 | */ |
||
305 | static void |
||
306 | gen6_end_query(struct gl_context *ctx, struct gl_query_object *q) |
||
307 | { |
||
308 | struct brw_context *brw = brw_context(ctx); |
||
309 | struct brw_query_object *query = (struct brw_query_object *)q; |
||
310 | |||
311 | switch (query->Base.Target) { |
||
312 | case GL_TIME_ELAPSED: |
||
313 | write_timestamp(brw, query->bo, 1); |
||
314 | break; |
||
315 | |||
316 | case GL_ANY_SAMPLES_PASSED: |
||
317 | case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: |
||
318 | case GL_SAMPLES_PASSED_ARB: |
||
319 | write_depth_count(brw, query->bo, 1); |
||
320 | break; |
||
321 | |||
322 | case GL_PRIMITIVES_GENERATED: |
||
323 | write_primitives_generated(brw, query->bo, 1); |
||
324 | break; |
||
325 | |||
326 | case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: |
||
327 | write_xfb_primitives_written(brw, query->bo, 1); |
||
328 | break; |
||
329 | |||
330 | default: |
||
331 | assert(!"Unrecognized query target in brw_end_query()"); |
||
332 | break; |
||
333 | } |
||
334 | } |
||
335 | |||
336 | /** |
||
337 | * The WaitQuery() driver hook. |
||
338 | * |
||
339 | * Wait for a query result to become available and return it. This is the |
||
340 | * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname. |
||
341 | */ |
||
342 | static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q) |
||
343 | { |
||
344 | struct brw_query_object *query = (struct brw_query_object *)q; |
||
345 | |||
346 | gen6_queryobj_get_results(ctx, query); |
||
347 | query->Base.Ready = true; |
||
348 | } |
||
349 | |||
350 | /** |
||
351 | * The CheckQuery() driver hook. |
||
352 | * |
||
353 | * Checks whether a query result is ready yet. If not, flushes. |
||
354 | * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname. |
||
355 | */ |
||
356 | static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q) |
||
357 | { |
||
358 | struct brw_context *brw = brw_context(ctx); |
||
359 | struct brw_query_object *query = (struct brw_query_object *)q; |
||
360 | |||
361 | /* From the GL_ARB_occlusion_query spec: |
||
362 | * |
||
363 | * "Instead of allowing for an infinite loop, performing a |
||
364 | * QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is |
||
365 | * not ready yet on the first time it is queried. This ensures that |
||
366 | * the async query will return true in finite time. |
||
367 | */ |
||
368 | if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo)) |
||
369 | intel_batchbuffer_flush(brw); |
||
370 | |||
371 | if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) { |
||
372 | gen6_queryobj_get_results(ctx, query); |
||
373 | query->Base.Ready = true; |
||
374 | } |
||
375 | } |
||
376 | |||
377 | /* Initialize Gen6+-specific query object functions. */ |
||
378 | void gen6_init_queryobj_functions(struct dd_function_table *functions) |
||
379 | { |
||
380 | functions->BeginQuery = gen6_begin_query; |
||
381 | functions->EndQuery = gen6_end_query; |
||
382 | functions->CheckQuery = gen6_check_query; |
||
383 | functions->WaitQuery = gen6_wait_query; |
||
384 | }><>36)><36)> |