On 6/12/2024 7:53 AM, Tomeu Vizoso wrote:
Using the DRM GPU scheduler infrastructure, with a scheduler for each core.
Userspace can decide for a series of tasks to be executed sequentially in the same core, so SRAM locality can be taken advantage of.
The job submission code was intially based on Panfrost.
intially -> initially
Signed-off-by: Tomeu Vizoso <tomeu@tomeuvizoso.net>
 drivers/accel/rocket/Makefile        |   3 +-
 drivers/accel/rocket/rocket_core.c   |   6 +
 drivers/accel/rocket/rocket_core.h   |  16 +
 drivers/accel/rocket/rocket_device.c |   2 +
 drivers/accel/rocket/rocket_device.h |   2 +
 drivers/accel/rocket/rocket_drv.c    |  15 +
 drivers/accel/rocket/rocket_drv.h    |   3 +
 drivers/accel/rocket/rocket_job.c    | 708 +++++++++++++++++++++++++++++++++++
 drivers/accel/rocket/rocket_job.h    |  49 +++
 include/uapi/drm/rocket_accel.h      |  55 +++
 10 files changed, 858 insertions(+), 1 deletion(-)
diff --git a/drivers/accel/rocket/Makefile b/drivers/accel/rocket/Makefile
index 875cac2243d9..4d59036af8d9 100644
--- a/drivers/accel/rocket/Makefile
+++ b/drivers/accel/rocket/Makefile
@@ -6,4 +6,5 @@ rocket-y := \
         rocket_core.o \
         rocket_device.o \
         rocket_drv.o \
-        rocket_gem.o
+        rocket_gem.o \
+        rocket_job.o
diff --git a/drivers/accel/rocket/rocket_core.c b/drivers/accel/rocket/rocket_core.c
index d6680b00fb2f..2b2d8be38f0a 100644
--- a/drivers/accel/rocket/rocket_core.c
+++ b/drivers/accel/rocket/rocket_core.c
@@ -11,6 +11,7 @@
 #include "rocket_core.h"
 #include "rocket_device.h"
+#include "rocket_job.h"
 #include "rocket_registers.h"
 
 static int rocket_clk_init(struct rocket_core *core)
@@ -122,6 +123,10 @@ int rocket_core_init(struct rocket_core *core)
                 goto out_pm_domain;
         }
 
+        err = rocket_job_init(core);
+        if (err)
+                goto out_pm_domain;
+
         version = rocket_read(core, REG_PC_VERSION) +
                   (rocket_read(core, REG_PC_VERSION_NUM) & 0xffff);
         dev_info(rdev->dev, "Rockchip NPU core %d version: %d\n", core->index, version);
@@ -134,6 +139,7 @@ int rocket_core_init(struct rocket_core *core)
 
 void rocket_core_fini(struct rocket_core *core)
 {
+        rocket_job_fini(core);
         rocket_pmdomain_fini(core);
 }
diff --git a/drivers/accel/rocket/rocket_core.h b/drivers/accel/rocket/rocket_core.h
index e5d4c848c9f4..e6401960a9b2 100644
--- a/drivers/accel/rocket/rocket_core.h
+++ b/drivers/accel/rocket/rocket_core.h
@@ -8,6 +8,8 @@
 #include <asm/io.h>
 #include <asm-generic/io.h>
 
+#include <drm/gpu_scheduler.h>
What about includes for workqueue or atomic?
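Judging from the core->reset.pending and core->job_lock accesses further down in rocket_job.c, struct rocket_core presumably gains atomic_t and work-related members here, so (a guess, since the struct changes themselves aren't quoted) the header should pull those in explicitly rather than relying on indirect includes, e.g.:

+#include <linux/atomic.h>
+#include <linux/workqueue.h>
+
+#include <drm/gpu_scheduler.h>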
+static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job)
+{
+        struct rocket_job *job = to_rocket_job(sched_job);
+        struct rocket_device *rdev = job->rdev;
+        struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
+        struct dma_fence *fence = NULL;
+        int ret;
+
+        if (unlikely(job->base.s_fence->finished.error))
+                return NULL;
+
+        /* Nothing to execute: can happen if the job has finished while
+         * we were resetting the GPU.
+         */
Not the correct comment style
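For multi-line comments outside of net/, that should be:

        /*
         * Nothing to execute: can happen if the job has finished while
         * we were resetting the GPU.
         */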
+        if (job->next_task_idx == job->task_count)
+                return NULL;
+
+        fence = rocket_fence_create(core);
+        if (IS_ERR(fence))
+                return fence;
+
+        if (job->done_fence)
+                dma_fence_put(job->done_fence);
+        job->done_fence = dma_fence_get(fence);
+
+        ret = pm_runtime_get_sync(rdev->dev);
+        if (ret < 0)
+                return fence;
+
+        spin_lock(&core->job_lock);
+
+        core->in_flight_job = job;
+        rocket_job_hw_submit(core, job);
+
+        spin_unlock(&core->job_lock);
+
+        return fence;
+}
+static void rocket_job_handle_done(struct rocket_core *core,
+                                   struct rocket_job *job)
+{
+        if (job->next_task_idx < job->task_count) {
+                rocket_job_hw_submit(core, job);
+                return;
+        }
+
+        core->in_flight_job = NULL;
+        dma_fence_signal_locked(job->done_fence);
+        pm_runtime_put_autosuspend(core->dev->dev);
+}
+
+static void rocket_job_handle_irq(struct rocket_core *core)
+{
+        uint32_t status, raw_status;
+
+        pm_runtime_mark_last_busy(core->dev->dev);
+
+        status = rocket_read(core, REG_PC_INTERRUPT_STATUS);
+        raw_status = rocket_read(core, REG_PC_INTERRUPT_RAW_STATUS);
+
+        rocket_write(core, REG_PC_OPERATION_ENABLE, 0x0);
+        rocket_write(core, REG_PC_INTERRUPT_CLEAR, 0x1ffff);
+
+        spin_lock(&core->job_lock);
+
+        if (core->in_flight_job)
+                rocket_job_handle_done(core, core->in_flight_job);
+
+        spin_unlock(&core->job_lock);
+}
+static void
+rocket_reset(struct rocket_core *core, struct drm_sched_job *bad)
+{
+        struct rocket_device *rdev = core->dev;
+        bool cookie;
+
+        if (!atomic_read(&core->reset.pending))
+                return;
+
+        /* Stop the scheduler.
Not the correct comment style
+         *
+         * FIXME: We temporarily get out of the dma_fence_signalling section
+         * because the cleanup path generate lockdep splats when taking locks
+         * to release job resources. We should rework the code to follow this
+         * pattern:
+         *
+         *      try_lock
+         *      if (locked)
+         *              release
+         *      else
+         *              schedule_work_to_release_later
+         */
+        drm_sched_stop(&core->sched, bad);
+
+        cookie = dma_fence_begin_signalling();
+
+        if (bad)
+                drm_sched_increase_karma(bad);
+
+        /* Mask job interrupts and synchronize to make sure we won't be
+         * interrupted during our reset.
+         */
Not the correct comment style, again. This is the last time I'm going to mention it.
diff --git a/drivers/accel/rocket/rocket_job.h b/drivers/accel/rocket/rocket_job.h
new file mode 100644
index 000000000000..0c3c90e47d39
--- /dev/null
+++ b/drivers/accel/rocket/rocket_job.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2024 Tomeu Vizoso <tomeu@tomeuvizoso.net> */
+
+#ifndef __ROCKET_JOB_H__
+#define __ROCKET_JOB_H__
+
+#include <drm/gpu_scheduler.h>
+#include <drm/drm_drv.h>
Alphabetical order
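i.e.:

+#include <drm/drm_drv.h>
+#include <drm/gpu_scheduler.h>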
diff --git a/include/uapi/drm/rocket_accel.h b/include/uapi/drm/rocket_accel.h
index 8338726a83c3..888c9413e4cd 100644
--- a/include/uapi/drm/rocket_accel.h
+++ b/include/uapi/drm/rocket_accel.h
@@ -12,8 +12,10 @@ extern "C" {
 #endif
 
 #define DRM_ROCKET_CREATE_BO            0x00
+#define DRM_ROCKET_SUBMIT               0x01
 
 #define DRM_IOCTL_ROCKET_CREATE_BO      DRM_IOWR(DRM_COMMAND_BASE + DRM_ROCKET_CREATE_BO, struct drm_rocket_create_bo)
+#define DRM_IOCTL_ROCKET_SUBMIT         DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_SUBMIT, struct drm_rocket_submit)
 
 /**
  * struct drm_rocket_create_bo - ioctl argument for creating Rocket BOs.
@@ -36,6 +38,59 @@ struct drm_rocket_create_bo {
         __u64 offset;
 };
 
+/**
+ * struct drm_rocket_task - A task to be run on the NPU
+ *
+ * A task is the smallest unit of work that can be run on the NPU.
+ */
+struct drm_rocket_task {
+        /** DMA address to NPU mapping of register command buffer */
+        __u64 regcmd;
+
+        /** Number of commands in the register command buffer */
+        __u32 regcmd_count;
+};
+
+/**
+ * struct drm_rocket_job - A job to be run on the NPU
+ *
+ * The kernel will schedule the execution of this job taking into account its
+ * dependencies with other jobs. All tasks in the same job will be executed
+ * sequentially on the same core, to benefit from memory residency in SRAM.
+ */
+struct drm_rocket_job {
+        /** Pointer to an array of struct drm_rocket_task. */
+        __u64 tasks;
+
+        /** Number of tasks passed in. */
+        __u32 task_count;
+
+        /** Pointer to a u32 array of the BOs that are read by the job. */
+        __u64 in_bo_handles;
+
+        /** Number of input BO handles passed in (size is that times 4). */
+        __u32 in_bo_handle_count;
+
+        /** Pointer to a u32 array of the BOs that are written to by the job. */
+        __u64 out_bo_handles;
+
+        /** Number of output BO handles passed in (size is that times 4). */
+        __u32 out_bo_handle_count;
+};
It feels like the mixing of 32-bit and 64-bit fields violates the guidelines on defining ioctls, due to implicit padding that might or might not be present.
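One way around that (just a sketch, the exact grouping is up to you and the trailing pad field is my addition) would be to put the __u64 members first and pad the struct out to a multiple of 64 bits:

struct drm_rocket_job {
        /** Pointer to an array of struct drm_rocket_task. */
        __u64 tasks;

        /** Pointer to a u32 array of the BOs that are read by the job. */
        __u64 in_bo_handles;

        /** Pointer to a u32 array of the BOs that are written to by the job. */
        __u64 out_bo_handles;

        /** Number of tasks passed in. */
        __u32 task_count;

        /** Number of input BO handles passed in (size is that times 4). */
        __u32 in_bo_handle_count;

        /** Number of output BO handles passed in (size is that times 4). */
        __u32 out_bo_handle_count;

        /** Reserved, must be zero (keeps the struct a multiple of 64 bits). */
        __u32 pad;
};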
+/**
+ * struct drm_rocket_submit - ioctl argument for submitting commands to the NPU.
+ *
+ * The kernel will schedule the execution of these jobs in dependency order.
+ */
+struct drm_rocket_submit {
+        /** Pointer to an array of struct drm_rocket_job. */
+        __u64 jobs;
+
+        /** Number of jobs passed in. */
+        __u32 job_count;
+};
+
 #if defined(__cplusplus)
 }
 #endif
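For reference, this is how I read the submission flow from userspace with the uapi as posted (hypothetical sketch; the fd, the regcmd DMA address and the BO handles would come from the CREATE_BO/mapping path that isn't quoted here):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/rocket_accel.h>   /* the installed copy of this uapi header */

static int rocket_submit_one(int fd, uint64_t regcmd_dma, uint32_t regcmd_count,
                             uint32_t in_bo, uint32_t out_bo)
{
        struct drm_rocket_task task = {
                .regcmd = regcmd_dma,
                .regcmd_count = regcmd_count,
        };
        struct drm_rocket_job job = {
                .tasks = (uint64_t)(uintptr_t)&task,
                .task_count = 1,
                .in_bo_handles = (uint64_t)(uintptr_t)&in_bo,
                .in_bo_handle_count = 1,
                .out_bo_handles = (uint64_t)(uintptr_t)&out_bo,
                .out_bo_handle_count = 1,
        };
        struct drm_rocket_submit submit = {
                .jobs = (uint64_t)(uintptr_t)&job,
                .job_count = 1,
        };

        /* One job with one task: the kernel picks a core and runs it there. */
        return ioctl(fd, DRM_IOCTL_ROCKET_SUBMIT, &submit);
}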