Re: [PATCH v7 04/10] accel/rocket: Add job submission IOCTL - Linaro-mm-sig

24 Jun 2025

On 2025-06-06 7:28 am, Tomeu Vizoso wrote:
[...]
...

diff --git a/drivers/accel/rocket/rocket_device.h b/drivers/accel/rocket/rocket_device.h
index 10acfe8534f00a7985d40a93f4b2f7f69d43caee..50e46f0516bd1615b5f826c5002a6c0ecbf9aed4 100644
--- a/drivers/accel/rocket/rocket_device.h
+++ b/drivers/accel/rocket/rocket_device.h
@@ -13,6 +13,8 @@
  struct rocket_device {
   struct drm_device ddev;

struct mutex sched_lock;
struct mutex iommu_lock;

Just realised I missed this in the last patch, but iommu_lock appears to 
be completely unnecessary now.
...
struct rocket_core *cores;
[...]
...
+static void rocket_job_hw_submit(struct rocket_core *core, struct rocket_job *job)
+{

struct rocket_task *task;
bool task_pp_en = 1;
bool task_count = 1;

/* GO ! */

/* Don't queue the job if a reset is in progress */
if (atomic_read(&core->reset.pending))
return;



task = &job->tasks[job->next_task_idx];
job->next_task_idx++;

rocket_pc_writel(core, BASE_ADDRESS, 0x1);

rocket_cna_writel(core, S_POINTER, 0xe + 0x10000000 * core->index);
rocket_core_writel(core, S_POINTER, 0xe + 0x10000000 * core->index);

Those really look like bitfield operations rather than actual arithmetic 
to me.
...


rocket_pc_writel(core, BASE_ADDRESS, task->regcmd);

I don't see how regcmd is created (I guess that's in userspace?), but 
given that it's explicitly u64 all the way through - and especially 
since you claim to support 40-bit DMA addresses - it definitely seems 
suspicious that the upper 32 bits never seem to be consumed anywhere :/
...

rocket_pc_writel(core, REGISTER_AMOUNTS, (task->regcmd_count + 1) / 2 - 1);

rocket_pc_writel(core, INTERRUPT_MASK, PC_INTERRUPT_MASK_DPU_0 | PC_INTERRUPT_MASK_DPU_1);
rocket_pc_writel(core, INTERRUPT_CLEAR, PC_INTERRUPT_CLEAR_DPU_0 | PC_INTERRUPT_CLEAR_DPU_1);

rocket_pc_writel(core, TASK_CON, ((0x6 | task_pp_en) << 12) | task_count);

rocket_pc_writel(core, TASK_DMA_BASE_ADDR, 0x0);

rocket_pc_writel(core, OPERATION_ENABLE, 0x1);

dev_dbg(core->dev, "Submitted regcmd at 0x%llx to core %d", task->regcmd, core->index);

+}
[...]
...
+static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job)
+{

struct rocket_job *job = to_rocket_job(sched_job);
struct rocket_device *rdev = job->rdev;
struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
struct dma_fence *fence = NULL;
int ret;

if (unlikely(job->base.s_fence->finished.error))
return NULL;



/*
* Nothing to execute: can happen if the job has finished while


* we were resetting the GPU.



GPU? (Similarly in various other comments/prints)
...

*/


if (job->next_task_idx == job->task_count)
return NULL;



fence = rocket_fence_create(core);
if (IS_ERR(fence))
return fence;



if (job->done_fence)
dma_fence_put(job->done_fence);


job->done_fence = dma_fence_get(fence);

ret = pm_runtime_get_sync(core->dev);
if (ret < 0)
return fence;



ret = iommu_attach_group(job->domain, iommu_group_get(core->dev));

I don't see iommu_group_put() anywhere, so you're leaking refcounts all 
over.
...

if (ret < 0)
return fence;



scoped_guard(spinlock, &core->job_lock) {
core->in_flight_job = job;


rocket_job_hw_submit(core, job);


}

return fence;

+}
[...]
...
+static void rocket_job_handle_irq(struct rocket_core *core)
+{

u32 status, raw_status;

pm_runtime_mark_last_busy(core->dev);

status = rocket_pc_readl(core, INTERRUPT_STATUS);
raw_status = rocket_pc_readl(core, INTERRUPT_RAW_STATUS);

rocket_pc_writel(core, OPERATION_ENABLE, 0x0);
rocket_pc_writel(core, INTERRUPT_CLEAR, 0x1ffff);

What was the point of reading the status registers if we're just going 
to blindly clear every possible condition anyway?
...

scoped_guard(spinlock, &core->job_lock)
if (core->in_flight_job)


	rocket_job_handle_done(core, core->in_flight_job);



But then is it really OK to just start the next task regardless of 
whether the current task was reporting successful completion or an error?
...
+}



+static void
+rocket_reset(struct rocket_core *core, struct drm_sched_job *bad)
+{

bool cookie;

if (!atomic_read(&core->reset.pending))
return;



/*
* Stop the scheduler.


*


* FIXME: We temporarily get out of the dma_fence_signalling section


* because the cleanup path generate lockdep splats when taking locks


* to release job resources. We should rework the code to follow this


* pattern:


*


*	try_lock


*	if (locked)


*		release


*	else


*		schedule_work_to_release_later


*/


drm_sched_stop(&core->sched, bad);

cookie = dma_fence_begin_signalling();

if (bad)
drm_sched_increase_karma(bad);



/*
* Mask job interrupts and synchronize to make sure we won't be


* interrupted during our reset.


*/


rocket_pc_writel(core, INTERRUPT_MASK, 0x0);
synchronize_irq(core->irq);

...except it's a shared IRQ, so it can still merrily fire at any time.
...


/* Handle the remaining interrupts before we reset. */
rocket_job_handle_irq(core);

/*
* Remaining interrupts have been handled, but we might still have


* stuck jobs. Let's make sure the PM counters stay balanced by


* manually calling pm_runtime_put_noidle() and


* rocket_devfreq_record_idle() for each stuck job.


* Let's also make sure the cycle counting register's refcnt is


* kept balanced to prevent it from running forever



Comments that don't match the code are more confusing than helpful :/
...

*/


scoped_guard(spinlock, &core->job_lock) {
if (core->in_flight_job)


	pm_runtime_put_noidle(core->dev);



core->in_flight_job = NULL;


}

/* Proceed with reset now. */
pm_runtime_force_suspend(core->dev);
pm_runtime_force_resume(core->dev);

Can you guarantee that actually resets the hardware if something else is 
holding the power domain open or RPM is disabled? I'm not familiar with 
the details of drm_sched, but if there are other jobs queued behind the 
stuck one would it even pass the rocket_job_is_idle() check for suspend 
to succeed anyway?
Not to mention that you have an actual reset control in the DT binding, 
which isn't even optional... :/
...

/* GPU has been reset, we can clear the reset pending bit. */
atomic_set(&core->reset.pending, 0);

/*
* Now resubmit jobs that were previously queued but didn't have a


* chance to finish.


* FIXME: We temporarily get out of the DMA fence signalling section


* while resubmitting jobs because the job submission logic will


* allocate memory with the GFP_KERNEL flag which can trigger memory


* reclaim and exposes a lock ordering issue.


*/


dma_fence_end_signalling(cookie);
drm_sched_resubmit_jobs(&core->sched);

Since I happened to look, this says it's deprecated?
...

cookie = dma_fence_begin_signalling();

/* Restart the scheduler */
drm_sched_start(&core->sched, 0);

dma_fence_end_signalling(cookie);

+}



+static enum drm_gpu_sched_stat rocket_job_timedout(struct drm_sched_job *sched_job)
+{

struct rocket_job *job = to_rocket_job(sched_job);
struct rocket_device *rdev = job->rdev;
struct rocket_core *core = sched_to_core(rdev, sched_job->sched);

/*
* If the GPU managed to complete this jobs fence, the timeout is


* spurious. Bail out.


*/


if (dma_fence_is_signaled(job->done_fence))
return DRM_GPU_SCHED_STAT_NOMINAL;



Do we really need the same return condition twice? What if the IRQ fires 
immediately after we've made this check, and is handled without delay 
such that sychronize_irq() effectively still does nothing? Either way 
we've taken longer than the timeout value to observe the job completing 
successfully, and either that's significant and worth warning about or 
it's not - I don't see any point in trying to (inaccurately) nitpick 
*why* it might have happened.
...

/*
* Rocket IRQ handler may take a long time to process an interrupt


* if there is another IRQ handler hogging the processing.


* For example, the HDMI encoder driver might be stuck in the IRQ


* handler for a significant time in a case of bad cable connection.



What have HDMI cables got to do with anything here? Yes, in general IRQ 
latency can be high, since CPUs can have IRQs masked and/or be taking 
higher-priority interrupts for any number of reasons. I don't see how an 
oddly-specific example (of apparently poor driver design, to boot) is 
useful.
...

* In order to catch such cases and not report spurious rocket


* job timeouts, synchronize the IRQ handler and re-check the fence


* status.


*/


synchronize_irq(core->irq);

if (dma_fence_is_signaled(job->done_fence)) {
dev_warn(core->dev, "unexpectedly high interrupt latency\n");


return DRM_GPU_SCHED_STAT_NOMINAL;


}

dev_err(core->dev, "gpu sched timeout");

atomic_set(&core->reset.pending, 1);
rocket_reset(core, sched_job);
iommu_detach_group(NULL, iommu_group_get(core->dev));

return DRM_GPU_SCHED_STAT_NOMINAL;

+}



+static void rocket_reset_work(struct work_struct *work)
+{

struct rocket_core *core;

core = container_of(work, struct rocket_core, reset.work);
rocket_reset(core, NULL);

+}



+static const struct drm_sched_backend_ops rocket_sched_ops = {

.run_job = rocket_job_run,
.timedout_job = rocket_job_timedout,
.free_job = rocket_job_free

+};



+static irqreturn_t rocket_job_irq_handler_thread(int irq, void *data)
+{

struct rocket_core *core = data;

rocket_job_handle_irq(core);

return IRQ_HANDLED;

+}



+static irqreturn_t rocket_job_irq_handler(int irq, void *data)
+{

struct rocket_core *core = data;
u32 raw_status = rocket_pc_readl(core, INTERRUPT_RAW_STATUS);

Given that this can be a shared IRQ as above, it would be a good idea to 
take care to avoid register accesses while suspended. Especially if 
you're trying to utilise suspend to reset a failing job that may well be 
throwing IOMMU faults.
...


WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_READ_ERROR);
WARN_ON(raw_status & PC_INTERRUPT_RAW_STATUS_DMA_READ_ERROR);

if (!(raw_status & PC_INTERRUPT_RAW_STATUS_DPU_0 ||
     raw_status & PC_INTERRUPT_RAW_STATUS_DPU_1))


return IRQ_NONE;



rocket_pc_writel(core, INTERRUPT_MASK, 0x0);

return IRQ_WAKE_THREAD;

+}



+int rocket_job_init(struct rocket_core *core)
+{

struct drm_sched_init_args args = {
.ops = &rocket_sched_ops,


.num_rqs = DRM_SCHED_PRIORITY_COUNT,


.credit_limit = 1,



Ah, does this mean that all the stuff about queued jobs was in fact all 
nonsense anyway?
...

.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS),


.name = dev_name(core->dev),


.dev = core->dev,


};
int ret;

INIT_WORK(&core->reset.work, rocket_reset_work);
spin_lock_init(&core->job_lock);

core->irq = platform_get_irq(to_platform_device(core->dev), 0);
if (core->irq < 0)
return core->irq;



ret = devm_request_threaded_irq(core->dev, core->irq,
			rocket_job_irq_handler,


			rocket_job_irq_handler_thread,


			IRQF_SHARED, KBUILD_MODNAME "-job",



Is it really a "job" interrupt though? The binding and the register 
definitions suggest it's just a general status interrupt for the core. 
Furthermore since we expect to have multiple cores, being able to more 
easily identify and attribute per-core IRQ activity seems more useful 
for debugging than copy-pasting from something really rather different 
which also expects to be the only one of its kind on the system.
Thanks,
Robin.
...

			core);


if (ret) {
dev_err(core->dev, "failed to request job irq");


return ret;


}