mirror of https://github.com/torvalds/linux.git
drm/sched: Avoid memory leaks with cancel_job() callback
Since its inception, the GPU scheduler can leak memory if the driver calls drm_sched_fini() while there are still jobs in flight.

The simplest way to solve this in a backwards compatible manner is by adding a new callback, drm_sched_backend_ops.cancel_job(), which instructs the driver to signal the hardware fence associated with the job. Afterwards, the scheduler can safely use the established free_job() callback for freeing the job.

Implement the new backend_ops callback cancel_job().

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Link: https://lore.kernel.org/dri-devel/20250418113211.69956-1-tvrtko.ursulin@igalia.com/
Reviewed-by: Maíra Canal <mcanal@igalia.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Signed-off-by: Philipp Stanner <phasta@kernel.org>
Link: https://lore.kernel.org/r/20250710125412.128476-4-phasta@kernel.org
This commit is contained in:
parent
fe69a39180
commit
bf8bbaefaa
|
|
@ -1352,6 +1352,18 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
|
|||
}
|
||||
EXPORT_SYMBOL(drm_sched_init);
|
||||
|
||||
static void drm_sched_cancel_remaining_jobs(struct drm_gpu_scheduler *sched)
|
||||
{
|
||||
struct drm_sched_job *job, *tmp;
|
||||
|
||||
/* All other accessors are stopped. No locking necessary. */
|
||||
list_for_each_entry_safe_reverse(job, tmp, &sched->pending_list, list) {
|
||||
sched->ops->cancel_job(job);
|
||||
list_del(&job->list);
|
||||
sched->ops->free_job(job);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* drm_sched_fini - Destroy a gpu scheduler
|
||||
*
|
||||
|
|
@ -1359,19 +1371,11 @@ EXPORT_SYMBOL(drm_sched_init);
|
|||
*
|
||||
* Tears down and cleans up the scheduler.
|
||||
*
|
||||
* This stops submission of new jobs to the hardware through
|
||||
* drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job()
|
||||
* will not be called for all jobs still in drm_gpu_scheduler.pending_list.
|
||||
* There is no solution for this currently. Thus, it is up to the driver to make
|
||||
* sure that:
|
||||
*
|
||||
* a) drm_sched_fini() is only called after for all submitted jobs
|
||||
* drm_sched_backend_ops.free_job() has been called or that
|
||||
* b) the jobs for which drm_sched_backend_ops.free_job() has not been called
|
||||
* after drm_sched_fini() ran are freed manually.
|
||||
*
|
||||
* FIXME: Take care of the above problem and prevent this function from leaking
|
||||
* the jobs in drm_gpu_scheduler.pending_list under any circumstances.
|
||||
* This stops submission of new jobs to the hardware through &struct
|
||||
* drm_sched_backend_ops.run_job. If &struct drm_sched_backend_ops.cancel_job
|
||||
* is implemented, all jobs will be canceled through it and afterwards cleaned
|
||||
* up through &struct drm_sched_backend_ops.free_job. If cancel_job is not
|
||||
* implemented, memory could leak.
|
||||
*/
|
||||
void drm_sched_fini(struct drm_gpu_scheduler *sched)
|
||||
{
|
||||
|
|
@ -1401,6 +1405,10 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
|
|||
/* Confirm no work left behind accessing device structures */
|
||||
cancel_delayed_work_sync(&sched->work_tdr);
|
||||
|
||||
/* Avoid memory leaks if supported by the driver. */
|
||||
if (sched->ops->cancel_job)
|
||||
drm_sched_cancel_remaining_jobs(sched);
|
||||
|
||||
if (sched->own_submit_wq)
|
||||
destroy_workqueue(sched->submit_wq);
|
||||
sched->ready = false;
|
||||
|
|
|
|||
|
|
@ -512,6 +512,24 @@ struct drm_sched_backend_ops {
|
|||
* and it's time to clean it up.
|
||||
*/
|
||||
void (*free_job)(struct drm_sched_job *sched_job);
|
||||
|
||||
/**
|
||||
* @cancel_job: Used by the scheduler to guarantee remaining jobs' fences
|
||||
* get signaled in drm_sched_fini().
|
||||
*
|
||||
* Used by the scheduler to cancel all jobs that have not been executed
|
||||
* with &struct drm_sched_backend_ops.run_job by the time
|
||||
* drm_sched_fini() gets invoked.
|
||||
*
|
||||
* Drivers need to signal the passed job's hardware fence with an
|
||||
* appropriate error code (e.g., -ECANCELED) in this callback. They
|
||||
* must not free the job.
|
||||
*
|
||||
* The scheduler will only call this callback once it stopped calling
|
||||
* all other callbacks forever, with the exception of &struct
|
||||
* drm_sched_backend_ops.free_job.
|
||||
*/
|
||||
void (*cancel_job)(struct drm_sched_job *sched_job);
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in New Issue