#include "drmP.h"
#include "drm.h"
#include "i915_drm.h"
#include "i915_drv.h"
#include "intel_drv.h"
//#include

#undef mb
#undef rmb
#undef wmb
#define mb()    asm volatile("mfence")
#define rmb()   asm volatile("lfence")
#define wmb()   asm volatile("sfence")


typedef struct
{
	struct drm_i915_gem_object *batch;
	struct list_head objects;
	u32 exec_start;
	u32 exec_len;

} batchbuffer_t;

struct change_domains {
	uint32_t invalidate_domains;
	uint32_t flush_domains;
	uint32_t flush_rings;
	uint32_t flips;
};

/*
 * Set the next domain for the specified object. This
 * may not actually perform the necessary flushing/invalidating though,
 * as that may want to be batched with other set_domain operations
 *
 * This is (we hope) the only really tricky part of gem. The goal
 * is fairly simple -- track which caches hold bits of the object
 * and make sure they remain coherent. A few concrete examples may
 * help to explain how it works. For shorthand, we use the notation
 * (read_domains, write_domain), e.g. (CPU, CPU) to indicate
 * a pair of read and write domain masks.
 *
 * Case 1: the batch buffer
 *
 *	1. Allocated
 *	2. Written by CPU
 *	3. Mapped to GTT
 *	4. Read by GPU
 *	5. Unmapped from GTT
 *	6. Freed
 *
 *	Let's take these a step at a time
 *
 *	1. Allocated
 *		Pages allocated from the kernel may still have
 *		cache contents, so we set them to (CPU, CPU) always.
 *	2. Written by CPU (using pwrite)
 *		The pwrite function calls set_domain (CPU, CPU) and
 *		this function does nothing (as nothing changes)
 *	3. Mapped to GTT
 *		This function asserts that the object is not
 *		currently in any GPU-based read or write domains
 *	4. Read by GPU
 *		i915_gem_execbuffer calls set_domain (COMMAND, 0).
 *		As write_domain is zero, this function adds in the
 *		current read domains (CPU+COMMAND, 0).
 *		flush_domains is set to CPU.
 *		invalidate_domains is set to COMMAND
 *		clflush is run to get data out of the CPU caches
 *		then i915_dev_set_domain calls i915_gem_flush to
 *		emit an MI_FLUSH and drm_agp_chipset_flush
 *	5. Unmapped from GTT
 *		i915_gem_object_unbind calls set_domain (CPU, CPU)
 *		flush_domains and invalidate_domains end up both zero
 *		so no flushing/invalidating happens
 *	6. Freed
 *		yay, done
 *
 * Case 2: The shared render buffer
 *
 *	1. Allocated
 *	2. Mapped to GTT
 *	3. Read/written by GPU
 *	4. set_domain to (CPU,CPU)
 *	5. Read/written by CPU
 *	6. Read/written by GPU
 *
 *	1. Allocated
 *		Same as last example, (CPU, CPU)
 *	2. Mapped to GTT
 *		Nothing changes (assertions find that it is not in the GPU)
 *	3. Read/written by GPU
 *		execbuffer calls set_domain (RENDER, RENDER)
 *		flush_domains gets CPU
 *		invalidate_domains gets GPU
 *		clflush (obj)
 *		MI_FLUSH and drm_agp_chipset_flush
 *	4. set_domain (CPU, CPU)
 *		flush_domains gets GPU
 *		invalidate_domains gets CPU
 *		wait_rendering (obj) to make sure all drawing is complete.
 *		This will include an MI_FLUSH to get the data from GPU
 *		to memory
 *		clflush (obj) to invalidate the CPU cache
 *		Another MI_FLUSH in i915_gem_flush (eliminate this somehow?)
 *	5. Read/written by CPU
 *		cache lines are loaded and dirtied
 *	6. Read/written by GPU
 *		Same as last GPU access
 *
 * Case 3: The constant buffer
 *
 *	1. Allocated
 *	2. Written by CPU
 *	3. Read by GPU
 *	4. Updated (written) by CPU again
 *	5. Read by GPU
 *
 *	1. Allocated
 *		(CPU, CPU)
 *	2. Written by CPU
 *		(CPU, CPU)
 *	3. Read by GPU
 *		(CPU+RENDER, 0)
 *		flush_domains = CPU
 *		invalidate_domains = RENDER
 *		clflush (obj)
 *		MI_FLUSH
 *		drm_agp_chipset_flush
 *	4. Updated (written) by CPU again
 *		(CPU, CPU)
 *		flush_domains = 0 (no previous write domain)
 *		invalidate_domains = 0 (no new read domains)
 *	5. Read by GPU
 *		(CPU+RENDER, 0)
 *		flush_domains = CPU
 *		invalidate_domains = RENDER
 *		clflush (obj)
 *		MI_FLUSH
 *		drm_agp_chipset_flush
 */
static void
i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
				  struct intel_ring_buffer *ring,
				  struct change_domains *cd)
{
	uint32_t invalidate_domains = 0, flush_domains = 0;

	/*
	 * If the object isn't moving to a new write domain,
	 * let the object stay in multiple read domains
	 */
	if (obj->base.pending_write_domain == 0)
		obj->base.pending_read_domains |= obj->base.read_domains;

	/*
	 * Flush the current write domain if
	 * the new read domains don't match. Invalidate
	 * any read domains which differ from the old
	 * write domain
	 */
	if (obj->base.write_domain &&
	    (((obj->base.write_domain != obj->base.pending_read_domains ||
	       obj->ring != ring)) ||
	     (obj->fenced_gpu_access && !obj->pending_fenced_gpu_access))) {
		flush_domains |= obj->base.write_domain;
		invalidate_domains |=
			obj->base.pending_read_domains & ~obj->base.write_domain;
	}
	/*
	 * Invalidate any read caches which may have
	 * stale data. That is, any new read domains.
	 */
	invalidate_domains |= obj->base.pending_read_domains & ~obj->base.read_domains;
	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU)
		i915_gem_clflush_object(obj);

	if (obj->base.pending_write_domain)
		cd->flips |= atomic_read(&obj->pending_flip);

	/* The actual obj->write_domain will be updated with
	 * pending_write_domain after we emit the accumulated flush for all
	 * of our domain changes in execbuffers (which clears objects'
	 * write_domains). So if we have a current write domain that we
	 * aren't changing, set pending_write_domain to that.
	 */
	if (flush_domains == 0 && obj->base.pending_write_domain == 0)
		obj->base.pending_write_domain = obj->base.write_domain;

	cd->invalidate_domains |= invalidate_domains;
	cd->flush_domains |= flush_domains;
	if (flush_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= obj->ring->id;
	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= ring->id;
}
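
#if 0
/* Illustrative sketch only, not part of the driver: walks "Case 3: The
 * constant buffer", step 3, through the same bitmask arithmetic used by
 * i915_gem_object_set_to_gpu_domain() above (the fence and ring checks
 * are left out). The domain values are assumptions taken from the
 * comment block, not live object state. */
static void domain_mask_example(void)
{
	uint32_t read_domains = I915_GEM_DOMAIN_CPU;	/* current read domains */
	uint32_t write_domain = I915_GEM_DOMAIN_CPU;	/* current write domain */
	uint32_t pending_read = I915_GEM_DOMAIN_CPU |
				I915_GEM_DOMAIN_RENDER; /* requested by execbuffer */
	uint32_t invalidate_domains = 0, flush_domains = 0;

	/* The old write domain differs from the new read domains, so it
	 * must be flushed and the differing read domains invalidated. */
	if (write_domain && write_domain != pending_read) {
		flush_domains      |= write_domain;			/* = CPU    */
		invalidate_domains |= pending_read & ~write_domain;	/* = RENDER */
	}

	/* Any brand-new read domain also needs invalidating. */
	invalidate_domains |= pending_read & ~read_domains;		/* = RENDER */

	/* Result matches the comment: flush_domains = CPU and
	 * invalidate_domains = RENDER, so clflush + MI_FLUSH follow. */
	(void)flush_domains;
	(void)invalidate_domains;
}
#endif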

static int
i915_gem_execbuffer_flush(struct drm_device *dev,
			  uint32_t invalidate_domains,
			  uint32_t flush_domains,
			  uint32_t flush_rings)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	int i, ret;

	if (flush_domains & I915_GEM_DOMAIN_CPU)
		intel_gtt_chipset_flush();

	if (flush_domains & I915_GEM_DOMAIN_GTT)
		wmb();

	if ((flush_domains | invalidate_domains) & I915_GEM_GPU_DOMAINS) {
		for (i = 0; i < I915_NUM_RINGS; i++)
			if (flush_rings & (1 << i)) {
				ret = i915_gem_flush_ring(&dev_priv->ring[i],
							  invalidate_domains,
							  flush_domains);
				if (ret)
					return ret;
			}
	}

	return 0;
}

static int
i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
				struct list_head *objects)
{
	struct drm_i915_gem_object *obj;
	struct change_domains cd;
	int ret;

	memset(&cd, 0, sizeof(cd));
	list_for_each_entry(obj, objects, exec_list)
		i915_gem_object_set_to_gpu_domain(obj, ring, &cd);

	if (cd.invalidate_domains | cd.flush_domains) {
		ret = i915_gem_execbuffer_flush(ring->dev,
						cd.invalidate_domains,
						cd.flush_domains,
						cd.flush_rings);
		if (ret)
			return ret;
	}

//	if (cd.flips) {
//		ret = i915_gem_execbuffer_wait_for_flips(ring, cd.flips);
//		if (ret)
//			return ret;
//	}

//	list_for_each_entry(obj, objects, exec_list) {
//		ret = i915_gem_execbuffer_sync_rings(obj, ring);
//		if (ret)
//			return ret;
//	}

	return 0;
}

static void
i915_gem_execbuffer_move_to_active(struct list_head *objects,
				   struct intel_ring_buffer *ring,
				   u32 seqno)
{
	struct drm_i915_gem_object *obj;

	list_for_each_entry(obj, objects, exec_list) {
		u32 old_read = obj->base.read_domains;
		u32 old_write = obj->base.write_domain;

		obj->base.read_domains = obj->base.pending_read_domains;
		obj->base.write_domain = obj->base.pending_write_domain;
		obj->fenced_gpu_access = obj->pending_fenced_gpu_access;

		i915_gem_object_move_to_active(obj, ring, seqno);
		if (obj->base.write_domain) {
			obj->dirty = 1;
			obj->pending_gpu_write = true;
			list_move_tail(&obj->gpu_write_list,
				       &ring->gpu_write_list);
//			intel_mark_busy(ring->dev, obj);
		}

//		trace_i915_gem_object_change_domain(obj, old_read, old_write);
	}
}

static void
i915_gem_execbuffer_retire_commands(struct drm_device *dev,
				    struct intel_ring_buffer *ring)
{
	struct drm_i915_gem_request *request;
	u32 invalidate;

	/*
	 * Ensure that the commands in the batch buffer are
	 * finished before the interrupt fires.
	 *
	 * The sampler always gets flushed on i965 (sigh).
	 */
	invalidate = I915_GEM_DOMAIN_COMMAND;
	if (INTEL_INFO(dev)->gen >= 4)
		invalidate |= I915_GEM_DOMAIN_SAMPLER;
	if (ring->flush(ring, invalidate, 0)) {
		i915_gem_next_request_seqno(ring);
		return;
	}

	/* Add a breadcrumb for the completion of the batch buffer */
	request = kzalloc(sizeof(*request), GFP_KERNEL);
	if (request == NULL || i915_add_request(ring, NULL, request)) {
		i915_gem_next_request_seqno(ring);
		kfree(request);
	}
}


int exec_batch(struct drm_device *dev, struct intel_ring_buffer *ring,
	       batchbuffer_t *exec)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	struct drm_i915_gem_object *obj;

	u32 seqno;
	int i;
	int ret;

	ring = &dev_priv->ring[RCS];

	mutex_lock(&dev->struct_mutex);

	list_for_each_entry(obj, &exec->objects, exec_list)
	{
		obj->base.pending_read_domains = 0;
		obj->base.pending_write_domain = 0;
	}

	exec->batch->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;

	ret = i915_gem_execbuffer_move_to_gpu(ring, &exec->objects);
	if (ret)
		goto err;

	seqno = i915_gem_next_request_seqno(ring);
//	for (i = 0; i < ARRAY_SIZE(ring->sync_seqno); i++) {
//		if (seqno < ring->sync_seqno[i]) {
			/* The GPU can not handle its semaphore value wrapping,
			 * so every billion or so execbuffers, we need to stall
			 * the GPU in order to reset the counters.
			 */
//			ret = i915_gpu_idle(dev);
//			if (ret)
//				goto err;

//			BUG_ON(ring->sync_seqno[i]);
//		}
//	}

	ret = ring->dispatch_execbuffer(ring, exec->exec_start, exec->exec_len);
	if (ret)
		goto err;

	i915_gem_execbuffer_move_to_active(&exec->objects, ring, seqno);
	i915_gem_execbuffer_retire_commands(dev, ring);

err:
	mutex_unlock(&dev->struct_mutex);

	return ret;
}
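
#if 0
/* Minimal usage sketch, not part of the driver: shows how a caller might
 * describe a batch with batchbuffer_t and submit it through exec_batch().
 * The names submit_example, batch_obj, gtt_offset and batch_len are
 * assumptions for illustration; a real caller must pin its objects and
 * take exec_start/exec_len from the batch object's actual GTT mapping. */
static int submit_example(struct drm_device *dev,
			  struct drm_i915_gem_object *batch_obj,
			  u32 gtt_offset, u32 batch_len)
{
	batchbuffer_t bb;

	bb.batch      = batch_obj;
	bb.exec_start = gtt_offset;	/* start of the command stream in the GTT */
	bb.exec_len   = batch_len;	/* length of the command stream in bytes  */

	INIT_LIST_HEAD(&bb.objects);
	list_add_tail(&batch_obj->exec_list, &bb.objects);

	/* exec_batch() selects the render ring internally, so the ring
	 * argument passed here is effectively ignored. */
	return exec_batch(dev, NULL, &bb);
}
#endif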