author    | Dave Airlie <airlied@redhat.com> | 2020-06-08 11:59:56 +1000
committer | Dave Airlie <airlied@redhat.com> | 2020-06-08 11:59:57 +1000
commit    | 8d286e2ff4400d313955b4203fc640ca6fd9228b (patch)
tree      | 5fe44d3527474d78534cf94830a028c169b41ab3 /drivers/gpu/drm
parent    | fa3fa2228c886435b3eea22cea4767f3fb07671f (diff)
parent    | f8665d797b1ce9bd81f7ed7744ef3a18d6b186ea (diff)
Merge tag 'drm-intel-next-fixes-2020-06-04' of git://anongit.freedesktop.org/drm/drm-intel into drm-next
- Includes gvt-next-fixes-2020-05-28
- Use-after-free fix for display global state (sketched below).
- Whitelisting context-local timestamp on Gen9
and two scheduler fixes with deps (Cc: stable)
- Removal of write flag from sysfs files where
ineffective
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200604150454.GA59322@jlahtine-desk.ger.corp.intel.com
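
As a rough illustration of the use-after-free fix for display global state (first bullet above): the patch gives each global state object a struct kref so that the old/new state pointers recorded in an atomic commit hold real references, and the object is only destroyed on the final put. The following is a minimal sketch of that pattern in kernel C; the demo_* names are invented for the example and are not the intel_global_state API.

    /* demo_* names are illustrative only; this is not the intel_global_state API */
    #include <linux/kernel.h>
    #include <linux/kref.h>
    #include <linux/slab.h>

    struct demo_state {
    	struct kref ref;
    	int payload;
    };

    static void demo_state_release(struct kref *kref)
    {
    	struct demo_state *state = container_of(kref, struct demo_state, ref);

    	kfree(state);
    }

    /* take a reference before publishing the pointer anywhere long-lived */
    static struct demo_state *demo_state_get(struct demo_state *state)
    {
    	kref_get(&state->ref);
    	return state;
    }

    /* drop a reference; the last put frees the object via demo_state_release() */
    static void demo_state_put(struct demo_state *state)
    {
    	kref_put(&state->ref, demo_state_release);
    }

    static struct demo_state *demo_state_create(void)
    {
    	struct demo_state *state = kzalloc(sizeof(*state), GFP_KERNEL);

    	if (!state)
    		return NULL;

    	kref_init(&state->ref);	/* refcount starts at 1 for the creator */
    	return state;
    }

In the actual patch, intel_atomic_get_global_obj_state() takes a reference on obj->state when recording old_state, intel_atomic_swap_global_state() exchanges references when it swaps obj->state, and intel_atomic_clear_global_state() simply puts both the old and new state.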
Diffstat (limited to 'drivers/gpu/drm')
-rw-r--r-- | drivers/gpu/drm/i915/display/intel_global_state.c | 45
-rw-r--r-- | drivers/gpu/drm/i915/display/intel_global_state.h |  3
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_context.c       |  4
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_shmem.c         | 15
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_context.c           |  2
-rw-r--r-- | drivers/gpu/drm/i915/gvt/vgpu.c                   |  2
-rw-r--r-- | drivers/gpu/drm/i915/i915_cmd_parser.c            |  4
-rw-r--r-- | drivers/gpu/drm/i915/i915_params.c                |  2
-rw-r--r-- | drivers/gpu/drm/i915/i915_params.h                |  2
-rw-r--r-- | drivers/gpu/drm/i915/i915_request.c               | 359
-rw-r--r-- | drivers/gpu/drm/i915/i915_scheduler.c             | 16
11 files changed, 295 insertions, 159 deletions
diff --git a/drivers/gpu/drm/i915/display/intel_global_state.c b/drivers/gpu/drm/i915/display/intel_global_state.c index 212d4ee68205..7a19215ad844 100644 --- a/drivers/gpu/drm/i915/display/intel_global_state.c +++ b/drivers/gpu/drm/i915/display/intel_global_state.c @@ -10,6 +10,28 @@ #include "intel_display_types.h" #include "intel_global_state.h" +static void __intel_atomic_global_state_free(struct kref *kref) +{ + struct intel_global_state *obj_state = + container_of(kref, struct intel_global_state, ref); + struct intel_global_obj *obj = obj_state->obj; + + obj->funcs->atomic_destroy_state(obj, obj_state); +} + +static void intel_atomic_global_state_put(struct intel_global_state *obj_state) +{ + kref_put(&obj_state->ref, __intel_atomic_global_state_free); +} + +static struct intel_global_state * +intel_atomic_global_state_get(struct intel_global_state *obj_state) +{ + kref_get(&obj_state->ref); + + return obj_state; +} + void intel_atomic_global_obj_init(struct drm_i915_private *dev_priv, struct intel_global_obj *obj, struct intel_global_state *state, @@ -17,6 +39,10 @@ void intel_atomic_global_obj_init(struct drm_i915_private *dev_priv, { memset(obj, 0, sizeof(*obj)); + state->obj = obj; + + kref_init(&state->ref); + obj->state = state; obj->funcs = funcs; list_add_tail(&obj->head, &dev_priv->global_obj_list); @@ -28,7 +54,9 @@ void intel_atomic_global_obj_cleanup(struct drm_i915_private *dev_priv) list_for_each_entry_safe(obj, next, &dev_priv->global_obj_list, head) { list_del(&obj->head); - obj->funcs->atomic_destroy_state(obj, obj->state); + + drm_WARN_ON(&dev_priv->drm, kref_read(&obj->state->ref) != 1); + intel_atomic_global_state_put(obj->state); } } @@ -97,10 +125,14 @@ intel_atomic_get_global_obj_state(struct intel_atomic_state *state, if (!obj_state) return ERR_PTR(-ENOMEM); + obj_state->obj = obj; obj_state->changed = false; + kref_init(&obj_state->ref); + state->global_objs[index].state = obj_state; - state->global_objs[index].old_state = obj->state; + state->global_objs[index].old_state = + intel_atomic_global_state_get(obj->state); state->global_objs[index].new_state = obj_state; state->global_objs[index].ptr = obj; obj_state->state = state; @@ -163,7 +195,9 @@ void intel_atomic_swap_global_state(struct intel_atomic_state *state) new_obj_state->state = NULL; state->global_objs[i].state = old_obj_state; - obj->state = new_obj_state; + + intel_atomic_global_state_put(obj->state); + obj->state = intel_atomic_global_state_get(new_obj_state); } } @@ -172,10 +206,9 @@ void intel_atomic_clear_global_state(struct intel_atomic_state *state) int i; for (i = 0; i < state->num_global_objs; i++) { - struct intel_global_obj *obj = state->global_objs[i].ptr; + intel_atomic_global_state_put(state->global_objs[i].old_state); + intel_atomic_global_state_put(state->global_objs[i].new_state); - obj->funcs->atomic_destroy_state(obj, - state->global_objs[i].state); state->global_objs[i].ptr = NULL; state->global_objs[i].state = NULL; state->global_objs[i].old_state = NULL; diff --git a/drivers/gpu/drm/i915/display/intel_global_state.h b/drivers/gpu/drm/i915/display/intel_global_state.h index e6163a469029..1f16fa3073c9 100644 --- a/drivers/gpu/drm/i915/display/intel_global_state.h +++ b/drivers/gpu/drm/i915/display/intel_global_state.h @@ -6,6 +6,7 @@ #ifndef __INTEL_GLOBAL_STATE_H__ #define __INTEL_GLOBAL_STATE_H__ +#include <linux/kref.h> #include <linux/list.h> struct drm_i915_private; @@ -54,7 +55,9 @@ struct intel_global_obj { for_each_if(obj) struct intel_global_state { + struct 
intel_global_obj *obj; struct intel_atomic_state *state; + struct kref ref; bool changed; }; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index 900ea8b7fc8f..f5d59d18cd5b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -230,7 +230,7 @@ static void intel_context_set_gem(struct intel_context *ce, ce->timeline = intel_timeline_get(ctx->timeline); if (ctx->sched.priority >= I915_PRIORITY_NORMAL && - intel_engine_has_semaphores(ce->engine)) + intel_engine_has_timeslices(ce->engine)) __set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags); } @@ -1969,7 +1969,7 @@ static int __apply_priority(struct intel_context *ce, void *arg) { struct i915_gem_context *ctx = arg; - if (!intel_engine_has_semaphores(ce->engine)) + if (!intel_engine_has_timeslices(ce->engine)) return 0; if (ctx->sched.priority >= I915_PRIORITY_NORMAL) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 5d5d7eef3f43..7aff3514d97a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -39,7 +39,6 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj) unsigned long last_pfn = 0; /* suppress gcc warning */ unsigned int max_segment = i915_sg_segment_size(); unsigned int sg_page_sizes; - struct pagevec pvec; gfp_t noreclaim; int ret; @@ -192,13 +191,17 @@ err_sg: sg_mark_end(sg); err_pages: mapping_clear_unevictable(mapping); - pagevec_init(&pvec); - for_each_sgt_page(page, sgt_iter, st) { - if (!pagevec_add(&pvec, page)) + if (sg != st->sgl) { + struct pagevec pvec; + + pagevec_init(&pvec); + for_each_sgt_page(page, sgt_iter, st) { + if (!pagevec_add(&pvec, page)) + check_release_pagevec(&pvec); + } + if (pagevec_count(&pvec)) check_release_pagevec(&pvec); } - if (pagevec_count(&pvec)) - check_release_pagevec(&pvec); sg_free_table(st); kfree(st); diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 74ddb49b2941..e4aece20bc80 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -97,8 +97,6 @@ int __intel_context_do_pin(struct intel_context *ce) { int err; - GEM_BUG_ON(intel_context_is_closed(ce)); - if (unlikely(!test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) { err = intel_context_alloc_state(ce); if (err) diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 1d5ff88078bd..7d361623ff67 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -124,7 +124,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) */ low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE; high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; - num_types = sizeof(vgpu_types) / sizeof(vgpu_types[0]); + num_types = ARRAY_SIZE(vgpu_types); gvt->types = kcalloc(num_types, sizeof(struct intel_vgpu_type), GFP_KERNEL); diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 189b573d02be..372354d33f55 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -572,6 +572,9 @@ struct drm_i915_reg_descriptor { #define REG32(_reg, ...) \ { .addr = (_reg), __VA_ARGS__ } +#define REG32_IDX(_reg, idx) \ + { .addr = _reg(idx) } + /* * Convenience macro for adding 64-bit registers. 
* @@ -669,6 +672,7 @@ static const struct drm_i915_reg_descriptor gen9_blt_regs[] = { REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE), REG32(BCS_SWCTRL), REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE), + REG32_IDX(RING_CTX_TIMESTAMP, BLT_RING_BASE), REG64_IDX(BCS_GPR, 0), REG64_IDX(BCS_GPR, 1), REG64_IDX(BCS_GPR, 2), diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c index add00ec1f787..a3dde770226d 100644 --- a/drivers/gpu/drm/i915/i915_params.c +++ b/drivers/gpu/drm/i915/i915_params.c @@ -173,7 +173,7 @@ i915_param_named(enable_gvt, bool, 0400, #endif #if IS_ENABLED(CONFIG_DRM_I915_UNSTABLE_FAKE_LMEM) -i915_param_named_unsafe(fake_lmem_start, ulong, 0600, +i915_param_named_unsafe(fake_lmem_start, ulong, 0400, "Fake LMEM start offset (default: 0)"); #endif diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h index 45323732f099..4f21bfffbf0e 100644 --- a/drivers/gpu/drm/i915/i915_params.h +++ b/drivers/gpu/drm/i915/i915_params.h @@ -64,7 +64,7 @@ struct drm_printer; param(int, mmio_debug, -IS_ENABLED(CONFIG_DRM_I915_DEBUG_MMIO), 0600) \ param(int, edp_vswing, 0, 0400) \ param(unsigned int, reset, 3, 0600) \ - param(unsigned int, inject_probe_failure, 0, 0600) \ + param(unsigned int, inject_probe_failure, 0, 0) \ param(int, fastboot, -1, 0600) \ param(int, enable_dpcd_backlight, -1, 0600) \ param(char *, force_probe, CONFIG_DRM_I915_FORCE_PROBE, 0400) \ diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 526c1e9acbd5..def62100e666 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -121,8 +121,39 @@ static void i915_fence_release(struct dma_fence *fence) i915_sw_fence_fini(&rq->submit); i915_sw_fence_fini(&rq->semaphore); - /* Keep one request on each engine for reserved use under mempressure */ - if (!cmpxchg(&rq->engine->request_pool, NULL, rq)) + /* + * Keep one request on each engine for reserved use under mempressure + * + * We do not hold a reference to the engine here and so have to be + * very careful in what rq->engine we poke. The virtual engine is + * referenced via the rq->context and we released that ref during + * i915_request_retire(), ergo we must not dereference a virtual + * engine here. Not that we would want to, as the only consumer of + * the reserved engine->request_pool is the power management parking, + * which must-not-fail, and that is only run on the physical engines. + * + * Since the request must have been executed to be have completed, + * we know that it will have been processed by the HW and will + * not be unsubmitted again, so rq->engine and rq->execution_mask + * at this point is stable. rq->execution_mask will be a single + * bit if the last and _only_ engine it could execution on was a + * physical engine, if it's multiple bits then it started on and + * could still be on a virtual engine. Thus if the mask is not a + * power-of-two we assume that rq->engine may still be a virtual + * engine and so a dangling invalid pointer that we cannot dereference + * + * For example, consider the flow of a bonded request through a virtual + * engine. The request is created with a wide engine mask (all engines + * that we might execute on). On processing the bond, the request mask + * is reduced to one or more engines. 
If the request is subsequently + * bound to a single engine, it will then be constrained to only + * execute on that engine and never returned to the virtual engine + * after timeslicing away, see __unwind_incomplete_requests(). Thus we + * know that if the rq->execution_mask is a single bit, rq->engine + * can be a physical engine with the exact corresponding mask. + */ + if (is_power_of_2(rq->execution_mask) && + !cmpxchg(&rq->engine->request_pool, NULL, rq)) return; kmem_cache_free(global.slab_requests, rq); @@ -326,6 +357,53 @@ void i915_request_retire_upto(struct i915_request *rq) } while (i915_request_retire(tmp) && tmp != rq); } +static struct i915_request * const * +__engine_active(struct intel_engine_cs *engine) +{ + return READ_ONCE(engine->execlists.active); +} + +static bool __request_in_flight(const struct i915_request *signal) +{ + struct i915_request * const *port, *rq; + bool inflight = false; + + if (!i915_request_is_ready(signal)) + return false; + + /* + * Even if we have unwound the request, it may still be on + * the GPU (preempt-to-busy). If that request is inside an + * unpreemptible critical section, it will not be removed. Some + * GPU functions may even be stuck waiting for the paired request + * (__await_execution) to be submitted and cannot be preempted + * until the bond is executing. + * + * As we know that there are always preemption points between + * requests, we know that only the currently executing request + * may be still active even though we have cleared the flag. + * However, we can't rely on our tracking of ELSP[0] to known + * which request is currently active and so maybe stuck, as + * the tracking maybe an event behind. Instead assume that + * if the context is still inflight, then it is still active + * even if the active flag has been cleared. 
+ */ + if (!intel_context_inflight(signal->context)) + return false; + + rcu_read_lock(); + for (port = __engine_active(signal->engine); (rq = *port); port++) { + if (rq->context == signal->context) { + inflight = i915_seqno_passed(rq->fence.seqno, + signal->fence.seqno); + break; + } + } + rcu_read_unlock(); + + return inflight; +} + static int __await_execution(struct i915_request *rq, struct i915_request *signal, @@ -356,7 +434,7 @@ __await_execution(struct i915_request *rq, } spin_lock_irq(&signal->lock); - if (i915_request_is_active(signal)) { + if (i915_request_is_active(signal) || __request_in_flight(signal)) { if (hook) { hook(rq, &signal->fence); i915_request_put(signal); @@ -1022,37 +1100,91 @@ await_fence: I915_FENCE_GFP); } +static bool intel_timeline_sync_has_start(struct intel_timeline *tl, + struct dma_fence *fence) +{ + return __intel_timeline_sync_is_later(tl, + fence->context, + fence->seqno - 1); +} + +static int intel_timeline_sync_set_start(struct intel_timeline *tl, + const struct dma_fence *fence) +{ + return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1); +} + static int -i915_request_await_request(struct i915_request *to, struct i915_request *from) +__i915_request_await_execution(struct i915_request *to, + struct i915_request *from, + void (*hook)(struct i915_request *rq, + struct dma_fence *signal)) { - int ret; + int err; - GEM_BUG_ON(to == from); - GEM_BUG_ON(to->timeline == from->timeline); + GEM_BUG_ON(intel_context_is_barrier(from->context)); - if (i915_request_completed(from)) { - i915_sw_fence_set_error_once(&to->submit, from->fence.error); + /* Submit both requests at the same time */ + err = __await_execution(to, from, hook, I915_FENCE_GFP); + if (err) + return err; + + /* Squash repeated depenendices to the same timelines */ + if (intel_timeline_sync_has_start(i915_request_timeline(to), + &from->fence)) return 0; + + /* + * Wait until the start of this request. + * + * The execution cb fires when we submit the request to HW. But in + * many cases this may be long before the request itself is ready to + * run (consider that we submit 2 requests for the same context, where + * the request of interest is behind an indefinite spinner). So we hook + * up to both to reduce our queues and keep the execution lag minimised + * in the worst case, though we hope that the await_start is elided. + */ + err = i915_request_await_start(to, from); + if (err < 0) + return err; + + /* + * Ensure both start together [after all semaphores in signal] + * + * Now that we are queued to the HW at roughly the same time (thanks + * to the execute cb) and are ready to run at roughly the same time + * (thanks to the await start), our signaler may still be indefinitely + * delayed by waiting on a semaphore from a remote engine. If our + * signaler depends on a semaphore, so indirectly do we, and we do not + * want to start our payload until our signaler also starts theirs. + * So we wait. + * + * However, there is also a second condition for which we need to wait + * for the precise start of the signaler. Consider that the signaler + * was submitted in a chain of requests following another context + * (with just an ordinary intra-engine fence dependency between the + * two). In this case the signaler is queued to HW, but not for + * immediate execution, and so we must wait until it reaches the + * active slot. 
+ */ + if (intel_engine_has_semaphores(to->engine) && + !i915_request_has_initial_breadcrumb(to)) { + err = __emit_semaphore_wait(to, from, from->fence.seqno - 1); + if (err < 0) + return err; } + /* Couple the dependency tree for PI on this exposed to->fence */ if (to->engine->schedule) { - ret = i915_sched_node_add_dependency(&to->sched, + err = i915_sched_node_add_dependency(&to->sched, &from->sched, - I915_DEPENDENCY_EXTERNAL); - if (ret < 0) - return ret; + I915_DEPENDENCY_WEAK); + if (err < 0) + return err; } - if (to->engine == from->engine) - ret = i915_sw_fence_await_sw_fence_gfp(&to->submit, - &from->submit, - I915_FENCE_GFP); - else - ret = emit_semaphore_wait(to, from, I915_FENCE_GFP); - if (ret < 0) - return ret; - - return 0; + return intel_timeline_sync_set_start(i915_request_timeline(to), + &from->fence); } static void mark_external(struct i915_request *rq) @@ -1105,23 +1237,20 @@ i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) } int -i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) +i915_request_await_execution(struct i915_request *rq, + struct dma_fence *fence, + void (*hook)(struct i915_request *rq, + struct dma_fence *signal)) { struct dma_fence **child = &fence; unsigned int nchild = 1; int ret; - /* - * Note that if the fence-array was created in signal-on-any mode, - * we should *not* decompose it into its individual fences. However, - * we don't currently store which mode the fence-array is operating - * in. Fortunately, the only user of signal-on-any is private to - * amdgpu and we should not see any incoming fence-array from - * sync-file being in signal-on-any mode. - */ if (dma_fence_is_array(fence)) { struct dma_fence_array *array = to_dma_fence_array(fence); + /* XXX Error for signal-on-any fence arrays */ + child = array->fences; nchild = array->num_fences; GEM_BUG_ON(!nchild); @@ -1134,138 +1263,95 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) continue; } - /* - * Requests on the same timeline are explicitly ordered, along - * with their dependencies, by i915_request_add() which ensures - * that requests are submitted in-order through each ring. - */ if (fence->context == rq->fence.context) continue; - /* Squash repeated waits to the same timelines */ - if (fence->context && - intel_timeline_sync_is_later(i915_request_timeline(rq), - fence)) - continue; + /* + * We don't squash repeated fence dependencies here as we + * want to run our callback in all cases. + */ if (dma_fence_is_i915(fence)) - ret = i915_request_await_request(rq, to_request(fence)); + ret = __i915_request_await_execution(rq, + to_request(fence), + hook); else ret = i915_request_await_external(rq, fence); if (ret < 0) return ret; - - /* Record the latest fence used against each timeline */ - if (fence->context) - intel_timeline_sync_set(i915_request_timeline(rq), - fence); } while (--nchild); return 0; } -static bool intel_timeline_sync_has_start(struct intel_timeline *tl, - struct dma_fence *fence) -{ - return __intel_timeline_sync_is_later(tl, - fence->context, - fence->seqno - 1); -} - -static int intel_timeline_sync_set_start(struct intel_timeline *tl, - const struct dma_fence *fence) +static int +await_request_submit(struct i915_request *to, struct i915_request *from) { - return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1); + /* + * If we are waiting on a virtual engine, then it may be + * constrained to execute on a single engine *prior* to submission. 
+ * When it is submitted, it will be first submitted to the virtual + * engine and then passed to the physical engine. We cannot allow + * the waiter to be submitted immediately to the physical engine + * as it may then bypass the virtual request. + */ + if (to->engine == READ_ONCE(from->engine)) + return i915_sw_fence_await_sw_fence_gfp(&to->submit, + &from->submit, + I915_FENCE_GFP); + else + return __i915_request_await_execution(to, from, NULL); } static int -__i915_request_await_execution(struct i915_request *to, - struct i915_request *from, - void (*hook)(struct i915_request *rq, - struct dma_fence *signal)) +i915_request_await_request(struct i915_request *to, struct i915_request *from) { - int err; - - GEM_BUG_ON(intel_context_is_barrier(from->context)); + int ret; - /* Submit both requests at the same time */ - err = __await_execution(to, from, hook, I915_FENCE_GFP); - if (err) - return err; + GEM_BUG_ON(to == from); + GEM_BUG_ON(to->timeline == from->timeline); - /* Squash repeated depenendices to the same timelines */ - if (intel_timeline_sync_has_start(i915_request_timeline(to), - &from->fence)) + if (i915_request_completed(from)) { + i915_sw_fence_set_error_once(&to->submit, from->fence.error); return 0; - - /* - * Wait until the start of this request. - * - * The execution cb fires when we submit the request to HW. But in - * many cases this may be long before the request itself is ready to - * run (consider that we submit 2 requests for the same context, where - * the request of interest is behind an indefinite spinner). So we hook - * up to both to reduce our queues and keep the execution lag minimised - * in the worst case, though we hope that the await_start is elided. - */ - err = i915_request_await_start(to, from); - if (err < 0) - return err; - - /* - * Ensure both start together [after all semaphores in signal] - * - * Now that we are queued to the HW at roughly the same time (thanks - * to the execute cb) and are ready to run at roughly the same time - * (thanks to the await start), our signaler may still be indefinitely - * delayed by waiting on a semaphore from a remote engine. If our - * signaler depends on a semaphore, so indirectly do we, and we do not - * want to start our payload until our signaler also starts theirs. - * So we wait. - * - * However, there is also a second condition for which we need to wait - * for the precise start of the signaler. Consider that the signaler - * was submitted in a chain of requests following another context - * (with just an ordinary intra-engine fence dependency between the - * two). In this case the signaler is queued to HW, but not for - * immediate execution, and so we must wait until it reaches the - * active slot. 
- */ - if (intel_engine_has_semaphores(to->engine) && - !i915_request_has_initial_breadcrumb(to)) { - err = __emit_semaphore_wait(to, from, from->fence.seqno - 1); - if (err < 0) - return err; } - /* Couple the dependency tree for PI on this exposed to->fence */ if (to->engine->schedule) { - err = i915_sched_node_add_dependency(&to->sched, + ret = i915_sched_node_add_dependency(&to->sched, &from->sched, - I915_DEPENDENCY_WEAK); - if (err < 0) - return err; + I915_DEPENDENCY_EXTERNAL); + if (ret < 0) + return ret; } - return intel_timeline_sync_set_start(i915_request_timeline(to), - &from->fence); + if (is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask))) + ret = await_request_submit(to, from); + else + ret = emit_semaphore_wait(to, from, I915_FENCE_GFP); + if (ret < 0) + return ret; + + return 0; } int -i915_request_await_execution(struct i915_request *rq, - struct dma_fence *fence, - void (*hook)(struct i915_request *rq, - struct dma_fence *signal)) +i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) { struct dma_fence **child = &fence; unsigned int nchild = 1; int ret; + /* + * Note that if the fence-array was created in signal-on-any mode, + * we should *not* decompose it into its individual fences. However, + * we don't currently store which mode the fence-array is operating + * in. Fortunately, the only user of signal-on-any is private to + * amdgpu and we should not see any incoming fence-array from + * sync-file being in signal-on-any mode. + */ if (dma_fence_is_array(fence)) { struct dma_fence_array *array = to_dma_fence_array(fence); - /* XXX Error for signal-on-any fence arrays */ - child = array->fences; nchild = array->num_fences; GEM_BUG_ON(!nchild); @@ -1278,22 +1364,31 @@ i915_request_await_execution(struct i915_request *rq, continue; } + /* + * Requests on the same timeline are explicitly ordered, along + * with their dependencies, by i915_request_add() which ensures + * that requests are submitted in-order through each ring. + */ if (fence->context == rq->fence.context) continue; - /* - * We don't squash repeated fence dependencies here as we - * want to run our callback in all cases. - */ + /* Squash repeated waits to the same timelines */ + if (fence->context && + intel_timeline_sync_is_later(i915_request_timeline(rq), + fence)) + continue; if (dma_fence_is_i915(fence)) - ret = __i915_request_await_execution(rq, - to_request(fence), - hook); + ret = i915_request_await_request(rq, to_request(fence)); else ret = i915_request_await_external(rq, fence); if (ret < 0) return ret; + + /* Record the latest fence used against each timeline */ + if (fence->context) + intel_timeline_sync_set(i915_request_timeline(rq), + fence); } while (--nchild); return 0; diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index f4ea318781f0..cbb880b10c65 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -209,14 +209,6 @@ static void kick_submission(struct intel_engine_cs *engine, if (!inflight) goto unlock; - ENGINE_TRACE(engine, - "bumping queue-priority-hint:%d for rq:%llx:%lld, inflight:%llx:%lld prio %d\n", - prio, - rq->fence.context, rq->fence.seqno, - inflight->fence.context, inflight->fence.seqno, - inflight->sched.attr.priority); - engine->execlists.queue_priority_hint = prio; - /* * If we are already the currently executing context, don't * bother evaluating if we should preempt ourselves. 
@@ -224,6 +216,14 @@ static void kick_submission(struct intel_engine_cs *engine,
 	if (inflight->context == rq->context)
 		goto unlock;
 
+	ENGINE_TRACE(engine,
+		     "bumping queue-priority-hint:%d for rq:%llx:%lld, inflight:%llx:%lld prio %d\n",
+		     prio,
+		     rq->fence.context, rq->fence.seqno,
+		     inflight->fence.context, inflight->fence.seqno,
+		     inflight->sched.attr.priority);
+
+	engine->execlists.queue_priority_hint = prio;
 	if (need_preempt(prio, rq_prio(inflight)))
 		tasklet_hi_schedule(&engine->execlists.tasklet);
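
The gvt/vgpu.c hunk replaces an open-coded sizeof(vgpu_types) / sizeof(vgpu_types[0]) with ARRAY_SIZE(), which reads better and fails to compile if its argument is a pointer rather than an array. A self-contained sketch with a made-up table:

    #include <linux/kernel.h>	/* ARRAY_SIZE() */

    static const unsigned int demo_types[] = { 64, 128, 256, 512 };

    static unsigned int demo_num_types(void)
    {
    	/* preferred over sizeof(demo_types) / sizeof(demo_types[0]) */
    	return ARRAY_SIZE(demo_types);
    }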
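The i915_params changes clear the write bit on parameters that are only sampled at probe time (fake_lmem_start goes from 0600 to 0400, inject_probe_failure to 0), since writing them through sysfs afterwards has no effect. A minimal sketch of declaring such a read-only module parameter; demo_start is a hypothetical name, not an i915 parameter:

    #include <linux/moduleparam.h>

    static unsigned long demo_start;

    /* 0400: visible in sysfs but not writable, as changes after probe are ignored */
    module_param_named(demo_start, demo_start, ulong, 0400);
    MODULE_PARM_DESC(demo_start, "Demo start offset (default: 0)");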
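Several of the i915_request.c comments above revolve around rq->execution_mask: a mask with exactly one bit set means the request could only ever run on a single physical engine, so rq->engine is safe to dereference after retirement, whereas a multi-bit mask means the request may still be bound to a virtual engine. A sketch of just that test, using a hypothetical demo_request rather than the real struct i915_request:

    #include <linux/log2.h>
    #include <linux/types.h>

    struct demo_request {
    	unsigned long execution_mask;	/* one bit per physical engine the request may run on */
    };

    /* true only if the request is pinned to exactly one physical engine */
    static bool demo_request_on_physical_engine(const struct demo_request *rq)
    {
    	return is_power_of_2(rq->execution_mask);
    }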
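The new __request_in_flight() helper walks the engine's array of currently executing requests under rcu_read_lock() to decide whether a request from the same context has actually reached the hardware. The sketch below shows only that lockless walk over a NULL-terminated array; the port-array layout and demo_request type are assumptions for illustration, not the i915 execlists structures:

    #include <linux/rcupdate.h>
    #include <linux/types.h>

    struct demo_request {
    	u64 context;
    };

    /* scan a NULL-terminated array of in-flight requests for a matching context */
    static bool demo_context_in_flight(struct demo_request * const *ports, u64 ctx)
    {
    	struct demo_request * const *port;
    	const struct demo_request *rq;
    	bool inflight = false;

    	rcu_read_lock();	/* the array may be rewritten as requests retire */
    	for (port = ports; (rq = *port); port++) {
    		if (rq->context == ctx) {
    			inflight = true;
    			break;
    		}
    	}
    	rcu_read_unlock();

    	return inflight;
    }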