#include "drmP.h" |
#include "drm.h" |
#include "i915_drm.h" |
#include "i915_drv.h" |
#include "intel_drv.h" |
//#include |
|
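
/*
 * Local overrides for the memory-barrier macros, using raw x86 fences:
 * mfence orders all loads and stores, lfence orders loads, sfence
 * orders stores.  i915_gem_execbuffer_flush() below relies on wmb()
 * to post writes through the GTT before the GPU reads them.
 */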
#undef mb
#undef rmb
#undef wmb
#define mb()	asm volatile("mfence" ::: "memory")
#define rmb()	asm volatile("lfence" ::: "memory")
#define wmb()	asm volatile("sfence" ::: "memory")

typedef struct {
	struct drm_i915_gem_object *batch;	/* the batch buffer object itself */
	struct list_head objects;		/* every object the batch references */
	u32 exec_start;				/* GTT offset of the first command */
	u32 exec_len;				/* batch length in bytes */
} batchbuffer_t;

struct change_domains {
	uint32_t invalidate_domains;	/* read caches that must be invalidated */
	uint32_t flush_domains;		/* write caches that must be flushed */
	uint32_t flush_rings;		/* mask of rings that need a ring flush */
	uint32_t flips;			/* pending pageflips to wait for */
};

/*
 * Set the next domain for the specified object. This
 * may not actually perform the necessary flushing/invalidating though,
 * as that may want to be batched with other set_domain operations.
 *
 * This is (we hope) the only really tricky part of gem. The goal
 * is fairly simple -- track which caches hold bits of the object
 * and make sure they remain coherent. A few concrete examples may
 * help to explain how it works. For shorthand, we use the notation
 * (read_domains, write_domain), e.g. (CPU, CPU) to indicate
 * a pair of read and write domain masks.
 *
 * Case 1: the batch buffer
 *
 *	1. Allocated
 *	2. Written by CPU
 *	3. Mapped to GTT
 *	4. Read by GPU
 *	5. Unmapped from GTT
 *	6. Freed
 *
 *	Let's take these a step at a time
 *
 *	1. Allocated
 *		Pages allocated from the kernel may still have
 *		cache contents, so we set them to (CPU, CPU) always.
 *	2. Written by CPU (using pwrite)
 *		The pwrite function calls set_domain (CPU, CPU) and
 *		this function does nothing (as nothing changes)
 *	3. Mapped to GTT
 *		This function asserts that the object is not
 *		currently in any GPU-based read or write domains
 *	4. Read by GPU
 *		i915_gem_execbuffer calls set_domain (COMMAND, 0).
 *		As write_domain is zero, this function adds in the
 *		current read domains (CPU+COMMAND, 0).
 *		flush_domains is set to CPU.
 *		invalidate_domains is set to COMMAND
 *		clflush is run to get data out of the CPU caches
 *		then i915_dev_set_domain calls i915_gem_flush to
 *		emit an MI_FLUSH and drm_agp_chipset_flush
 *	5. Unmapped from GTT
 *		i915_gem_object_unbind calls set_domain (CPU, CPU)
 *		flush_domains and invalidate_domains end up both zero
 *		so no flushing/invalidating happens
 *	6. Freed
 *		yay, done
 *
 * Case 2: The shared render buffer
 *
 *	1. Allocated
 *	2. Mapped to GTT
 *	3. Read/written by GPU
 *	4. set_domain to (CPU,CPU)
 *	5. Read/written by CPU
 *	6. Read/written by GPU
 *
 *	1. Allocated
 *		Same as last example, (CPU, CPU)
 *	2. Mapped to GTT
 *		Nothing changes (assertions find that it is not in the GPU)
 *	3. Read/written by GPU
 *		execbuffer calls set_domain (RENDER, RENDER)
 *		flush_domains gets CPU
 *		invalidate_domains gets GPU
 *		clflush (obj)
 *		MI_FLUSH and drm_agp_chipset_flush
 *	4. set_domain (CPU, CPU)
 *		flush_domains gets GPU
 *		invalidate_domains gets CPU
 *		wait_rendering (obj) to make sure all drawing is complete.
 *		This will include an MI_FLUSH to get the data from GPU
 *		to memory
 *		clflush (obj) to invalidate the CPU cache
 *		Another MI_FLUSH in i915_gem_flush (eliminate this somehow?)
 *	5. Read/written by CPU
 *		cache lines are loaded and dirtied
 *	6. Read/written by GPU
 *		Same as last GPU access
 *
 * Case 3: The constant buffer
 *
 *	1. Allocated
 *	2. Written by CPU
 *	3. Read by GPU
 *	4. Updated (written) by CPU again
 *	5. Read by GPU
 *
 *	1. Allocated
 *		(CPU, CPU)
 *	2. Written by CPU
 *		(CPU, CPU)
 *	3. Read by GPU
 *		(CPU+RENDER, 0)
 *		flush_domains = CPU
 *		invalidate_domains = RENDER
 *		clflush (obj)
 *		MI_FLUSH
 *		drm_agp_chipset_flush
 *	4. Updated (written) by CPU again
 *		(CPU, CPU)
 *		flush_domains = 0 (no previous write domain)
 *		invalidate_domains = 0 (no new read domains)
 *	5. Read by GPU
 *		(CPU+RENDER, 0)
 *		flush_domains = CPU
 *		invalidate_domains = RENDER
 *		clflush (obj)
 *		MI_FLUSH
 *		drm_agp_chipset_flush
 */
static void
i915_gem_object_set_to_gpu_domain(struct drm_i915_gem_object *obj,
				  struct intel_ring_buffer *ring,
				  struct change_domains *cd)
{
	uint32_t invalidate_domains = 0, flush_domains = 0;

	/*
	 * If the object isn't moving to a new write domain,
	 * let the object stay in multiple read domains
	 */
	if (obj->base.pending_write_domain == 0)
		obj->base.pending_read_domains |= obj->base.read_domains;

	/*
	 * Flush the current write domain if
	 * the new read domains don't match. Invalidate
	 * any read domains which differ from the old
	 * write domain
	 */
	if (obj->base.write_domain &&
	    (obj->base.write_domain != obj->base.pending_read_domains ||
	     obj->ring != ring ||
	     (obj->fenced_gpu_access && !obj->pending_fenced_gpu_access))) {
		flush_domains |= obj->base.write_domain;
		invalidate_domains |=
			obj->base.pending_read_domains & ~obj->base.write_domain;
	}
	/*
	 * Invalidate any read caches which may have
	 * stale data. That is, any new read domains.
	 */
	invalidate_domains |= obj->base.pending_read_domains & ~obj->base.read_domains;
	if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU)
		i915_gem_clflush_object(obj);

	if (obj->base.pending_write_domain)
		cd->flips |= atomic_read(&obj->pending_flip);

	/* The actual obj->write_domain will be updated with
	 * pending_write_domain after we emit the accumulated flush for all
	 * of our domain changes in execbuffers (which clears objects'
	 * write_domains). So if we have a current write domain that we
	 * aren't changing, set pending_write_domain to that.
	 */
	if (flush_domains == 0 && obj->base.pending_write_domain == 0)
		obj->base.pending_write_domain = obj->base.write_domain;

	cd->invalidate_domains |= invalidate_domains;
	cd->flush_domains |= flush_domains;
	if (flush_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= obj->ring->id;
	if (invalidate_domains & I915_GEM_GPU_DOMAINS)
		cd->flush_rings |= ring->id;
}
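
/*
 * Worked example, matching "Case 3, step 3" above: a constant buffer
 * last written by the CPU (read_domains = write_domain = CPU) is about
 * to be read by the render ring (pending_read_domains = RENDER,
 * pending_write_domain = 0).  Since there is no new write domain, the
 * old read domains are kept, giving pending_read_domains = CPU+RENDER.
 * The write domain no longer matches the read domains, so flush_domains
 * becomes CPU and invalidate_domains becomes RENDER; the CPU bit then
 * triggers i915_gem_clflush_object() and the RENDER bit marks the
 * target ring in cd->flush_rings.
 */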

static int
i915_gem_execbuffer_flush(struct drm_device *dev,
			  uint32_t invalidate_domains,
			  uint32_t flush_domains,
			  uint32_t flush_rings)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	int i, ret;

	if (flush_domains & I915_GEM_DOMAIN_CPU)
		intel_gtt_chipset_flush();

	if (flush_domains & I915_GEM_DOMAIN_GTT)
		wmb();

	if ((flush_domains | invalidate_domains) & I915_GEM_GPU_DOMAINS) {
		for (i = 0; i < I915_NUM_RINGS; i++)
			if (flush_rings & (1 << i)) {
				ret = i915_gem_flush_ring(&dev_priv->ring[i],
							  invalidate_domains,
							  flush_domains);
				if (ret)
					return ret;
			}
	}

	return 0;
}

static int
i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
				struct list_head *objects)
{
	struct drm_i915_gem_object *obj;
	struct change_domains cd;
	int ret;

	memset(&cd, 0, sizeof(cd));
	list_for_each_entry(obj, objects, exec_list)
		i915_gem_object_set_to_gpu_domain(obj, ring, &cd);

	if (cd.invalidate_domains | cd.flush_domains) {
		ret = i915_gem_execbuffer_flush(ring->dev,
						cd.invalidate_domains,
						cd.flush_domains,
						cd.flush_rings);
		if (ret)
			return ret;
	}

	/* XXX: flip waits and inter-ring synchronisation are disabled
	 * in this port. */
//	if (cd.flips) {
//		ret = i915_gem_execbuffer_wait_for_flips(ring, cd.flips);
//		if (ret)
//			return ret;
//	}

//	list_for_each_entry(obj, objects, exec_list) {
//		ret = i915_gem_execbuffer_sync_rings(obj, ring);
//		if (ret)
//			return ret;
//	}

	return 0;
}

static void
i915_gem_execbuffer_move_to_active(struct list_head *objects,
				   struct intel_ring_buffer *ring,
				   u32 seqno)
{
	struct drm_i915_gem_object *obj;

	list_for_each_entry(obj, objects, exec_list) {
		u32 old_read = obj->base.read_domains;
		u32 old_write = obj->base.write_domain;

		obj->base.read_domains = obj->base.pending_read_domains;
		obj->base.write_domain = obj->base.pending_write_domain;
		obj->fenced_gpu_access = obj->pending_fenced_gpu_access;

		i915_gem_object_move_to_active(obj, ring, seqno);
		if (obj->base.write_domain) {
			obj->dirty = 1;
			obj->pending_gpu_write = true;
			list_move_tail(&obj->gpu_write_list,
				       &ring->gpu_write_list);
//			intel_mark_busy(ring->dev, obj);
		}

//		trace_i915_gem_object_change_domain(obj, old_read, old_write);
	}
}

static void
i915_gem_execbuffer_retire_commands(struct drm_device *dev,
				    struct intel_ring_buffer *ring)
{
	struct drm_i915_gem_request *request;
	u32 invalidate;

	/*
	 * Ensure that the commands in the batch buffer are
	 * finished before the interrupt fires.
	 *
	 * The sampler always gets flushed on i965 (sigh).
	 */
	invalidate = I915_GEM_DOMAIN_COMMAND;
	if (INTEL_INFO(dev)->gen >= 4)
		invalidate |= I915_GEM_DOMAIN_SAMPLER;
	if (ring->flush(ring, invalidate, 0)) {
		i915_gem_next_request_seqno(ring);
		return;
	}

	/* Add a breadcrumb for the completion of the batch buffer */
	request = kzalloc(sizeof(*request), GFP_KERNEL);
	if (request == NULL || i915_add_request(ring, NULL, request)) {
		i915_gem_next_request_seqno(ring);
		kfree(request);
	}
}

int exec_batch(struct drm_device *dev, struct intel_ring_buffer *ring,
	       batchbuffer_t *exec)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	struct drm_i915_gem_object *obj;
	u32 seqno;
	int ret;

	/* This port always submits on the render ring. */
	ring = &dev_priv->ring[RCS];

	mutex_lock(&dev->struct_mutex);

	list_for_each_entry(obj, &exec->objects, exec_list) {
		obj->base.pending_read_domains = 0;
		obj->base.pending_write_domain = 0;
	}

	exec->batch->base.pending_read_domains |= I915_GEM_DOMAIN_COMMAND;

	ret = i915_gem_execbuffer_move_to_gpu(ring, &exec->objects);
	if (ret)
		goto err;

	seqno = i915_gem_next_request_seqno(ring);
//	for (i = 0; i < ARRAY_SIZE(ring->sync_seqno); i++) {
//		if (seqno < ring->sync_seqno[i]) {
			/* The GPU can not handle its semaphore value wrapping,
			 * so every billion or so execbuffers, we need to stall
			 * the GPU in order to reset the counters.
			 */
//			ret = i915_gpu_idle(dev);
//			if (ret)
//				goto err;

//			BUG_ON(ring->sync_seqno[i]);
//		}
//	}

	ret = ring->dispatch_execbuffer(ring, exec->exec_start, exec->exec_len);
	if (ret)
		goto err;

	i915_gem_execbuffer_move_to_active(&exec->objects, ring, seqno);
	i915_gem_execbuffer_retire_commands(dev, ring);

err:
	mutex_unlock(&dev->struct_mutex);

	return ret;
}
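
/*
 * Example (illustrative only, not compiled): submitting a minimal
 * two-dword batch through exec_batch().  The helpers used below
 * (i915_gem_alloc_object, i915_gem_object_pin, obj->pages,
 * obj->gtt_offset) follow the kernel this file was derived from; the
 * exact setup is an assumption and depends on the surrounding port.
 */
#if 0
static int submit_noop_batch(struct drm_device *dev)
{
	drm_i915_private_t *dev_priv = dev->dev_private;
	struct drm_i915_gem_object *batch;
	batchbuffer_t bb;
	uint32_t *cmd;
	int ret;

	batch = i915_gem_alloc_object(dev, 4096);
	if (batch == NULL)
		return -ENOMEM;

	/* Bind the batch into the GTT so the render ring can fetch it. */
	ret = i915_gem_object_pin(batch, 4096, true);
	if (ret)
		return ret;

	/* One MI_NOOP, then MI_BATCH_BUFFER_END to terminate the batch. */
	cmd = kmap(batch->pages[0]);
	cmd[0] = MI_NOOP;
	cmd[1] = MI_BATCH_BUFFER_END;
	kunmap(batch->pages[0]);

	/* The batch is the only object this submission references. */
	INIT_LIST_HEAD(&bb.objects);
	list_add_tail(&batch->exec_list, &bb.objects);
	bb.batch = batch;
	bb.exec_start = batch->gtt_offset;
	bb.exec_len = 2 * sizeof(uint32_t);

	return exec_batch(dev, &dev_priv->ring[RCS], &bb);
}
#endif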