drm/sched: Add boolean to mark if sched is ready to work v5 Problem: A particular scheduler may become unusable (underlying HW) after some event (e.g. GPU reset). If it's later chosen by the get free sched. policy a command will fail to be submitted. Fix: Add a driver specific callback to report the sched status so rq with bad sched can be avoided in favor of working one or none in which case job init will fail. v2: Switch from driver callback to flag in scheduler. v3: rebase v4: Remove ready parameter from drm_sched_init, set unconditionally to true once init done. v5: fix missed change in v3d in v4 (Alex) Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

commit: faf6e1a87e07423a729e04fb2e8188742e89ea4c [log] [tgz]
author: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Thu Oct 18 12:32:46 2018 -0400
committer: Alex Deucher <alexander.deucher@amd.com> Mon Nov 05 14:21:22 2018 -0500
tree: 51c205851d3e99371bbf271b8f7e3e59b32ceb04
parent: 2bb42410b1bd324912389c6ac748df1c1befd69f [diff]
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index 3e22a54..ba54c30 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c

@@ -130,7 +130,14 @@ drm_sched_entity_get_free_sched(struct drm_sched_entity *entity)
 	int i;
 
 	for (i = 0; i < entity->num_rq_list; ++i) {
-		num_jobs = atomic_read(&entity->rq_list[i]->sched->num_jobs);
+		struct drm_gpu_scheduler *sched = entity->rq_list[i]->sched;
+
+		if (!entity->rq_list[i]->sched->ready) {
+			DRM_WARN("sched%s is not ready, skipping", sched->name);
+			continue;
+		}
+
+		num_jobs = atomic_read(&sched->num_jobs);
 		if (num_jobs < min_jobs) {
 			min_jobs = num_jobs;
 			rq = entity->rq_list[i];

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 63b997d..6b2fd49 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c

@@ -420,6 +420,9 @@ int drm_sched_job_init(struct drm_sched_job *job,
 	struct drm_gpu_scheduler *sched;
 
 	drm_sched_entity_select_rq(entity);
+	if (!entity->rq)
+		return -ENOENT;
+
 	sched = entity->rq->sched;
 
 	job->sched = sched;
@@ -633,6 +636,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
 		return PTR_ERR(sched->thread);
 	}
 
+	sched->ready = true;
 	return 0;
 }
 EXPORT_SYMBOL(drm_sched_init);
@@ -648,5 +652,7 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
 {
 	if (sched->thread)
 		kthread_stop(sched->thread);
+
+	sched->ready = false;
 }
 EXPORT_SYMBOL(drm_sched_fini);

diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 0684dcd..4ae192a 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h

@@ -264,6 +264,7 @@ struct drm_sched_backend_ops {
  * @hang_limit: once the hangs by a job crosses this limit then it is marked
  *              guilty and it will be considered for scheduling further.
  * @num_jobs: the number of jobs in queue in the scheduler
+ * @ready: marks if the underlying HW is ready to work
  *
  * One scheduler is implemented for each hardware ring.
  */
@@ -283,12 +284,14 @@ struct drm_gpu_scheduler {
 	spinlock_t			job_list_lock;
 	int				hang_limit;
 	atomic_t                        num_jobs;
+	bool			ready;
 };
 
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
 		   uint32_t hw_submission, unsigned hang_limit, long timeout,
 		   const char *name);
+
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
 		       struct drm_sched_entity *entity,
commit	faf6e1a87e07423a729e04fb2e8188742e89ea4c	[log] [tgz]
author	Andrey Grodzovsky <andrey.grodzovsky@amd.com>	Thu Oct 18 12:32:46 2018 -0400
committer	Alex Deucher <alexander.deucher@amd.com>	Mon Nov 05 14:21:22 2018 -0500
tree	51c205851d3e99371bbf271b8f7e3e59b32ceb04
parent	2bb42410b1bd324912389c6ac748df1c1befd69f [diff]