On Tue, Apr 26, 2022 at 02:07:59PM +0800, Cai Huoqing wrote: [...]
diff --git a/drivers/gpu/drm/nvdla/nvdla_drv.c b/drivers/gpu/drm/nvdla/nvdla_drv.c
I'll look at this from an architectural level and leave it to other experts to review the more technical things.
[...]
+static struct nvdla_config nvdla_config_os_initial = {
+	.atom_size = 32,
+	.bdma_enable = true,
+	.rubik_enable = true,
+	.weight_compress_support = true,
+};
+
+static struct nvdla_config nvdla_config_small = {
+	//.atom_size = 8,
+	.atom_size = 32, // nv_large config
+	.bdma_enable = false,
+	.rubik_enable = false,
+	.weight_compress_support = false,
+};
[...]
+static union nvdla_operation_container operation_desc[NVDLA_OP_NUM][NVDLA_NUM_GROUPS];
+static union nvdla_surface_container surface_desc[NVDLA_OP_NUM][NVDLA_NUM_GROUPS];
+static struct nvdla_task_desc global_task;
+static struct nvdla_engine engine = {
+	.processors[NVDLA_OP_BDMA] = {
+		.name = "BDMA",
+		.op_type = NVDLA_OP_BDMA,
+		.program = nvdla_bdma_program,
+		.enable = nvdla_bdma_enable,
+		.set_producer = nvdla_bdma_set_producer,
+		.is_ready = nvdla_bdma_is_ready,
+		.dump_config = nvdla_bdma_dump_config,
+		.rdma_check = nvdla_bdma_rdma_check,
+		.consumer_ptr = 0,
+		.roi_index = 0,
+		.group_status = 0,
+		.rdma_status = 0,
+		.last_group = 1,
+		.groups[0] = {
+			.id = 0,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_BDMA][0],
+			.surface_desc = &surface_desc[NVDLA_OP_BDMA][0],
+		},
+		.groups[1] = {
+			.id = 1,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_BDMA][1],
+			.surface_desc = &surface_desc[NVDLA_OP_BDMA][1],
+		},
+	},
+	.processors[NVDLA_OP_CONV] = {
+		.name = "Convolution",
+		.op_type = NVDLA_OP_CONV,
+		.program = nvdla_conv_program,
+		.enable = nvdla_conv_enable,
+		.set_producer = nvdla_conv_set_producer,
+		.is_ready = nvdla_conv_is_ready,
+		.dump_config = nvdla_conv_dump_config,
+		.rdma_check = nvdla_conv_rdma_check,
+		.consumer_ptr = 0,
+		.roi_index = 0,
+		.group_status = 0,
+		.rdma_status = 0,
+		.last_group = 1,
+		.groups[0] = {
+			.id = 0,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_CONV][0],
+			.surface_desc = &surface_desc[NVDLA_OP_CONV][0],
+		},
+		.groups[1] = {
+			.id = 1,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_CONV][1],
+			.surface_desc = &surface_desc[NVDLA_OP_CONV][1],
+		},
+	},
+	.processors[NVDLA_OP_SDP] = {
+		.name = "SDP",
+		.op_type = NVDLA_OP_SDP,
+		.program = nvdla_sdp_program,
+		.enable = nvdla_sdp_enable,
+		.set_producer = nvdla_sdp_set_producer,
+		.is_ready = nvdla_sdp_is_ready,
+		.dump_config = nvdla_sdp_dump_config,
+		.rdma_check = nvdla_sdp_rdma_check,
+		.consumer_ptr = 0,
+		.roi_index = 0,
+		.group_status = 0,
+		.rdma_status = 0,
+		.last_group = 1,
+		.groups[0] = {
+			.id = 0,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_SDP][0],
+			.surface_desc = &surface_desc[NVDLA_OP_SDP][0],
+		},
+		.groups[1] = {
+			.id = 1,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_SDP][1],
+			.surface_desc = &surface_desc[NVDLA_OP_SDP][1],
+		},
+	},
+	.processors[NVDLA_OP_PDP] = {
+		.name = "PDP",
+		.op_type = NVDLA_OP_PDP,
+		.program = nvdla_pdp_program,
+		.enable = nvdla_pdp_enable,
+		.set_producer = nvdla_pdp_set_producer,
+		.is_ready = nvdla_pdp_is_ready,
+		.dump_config = nvdla_pdp_dump_config,
+		.rdma_check = nvdla_pdp_rdma_check,
+		.consumer_ptr = 0,
+		.roi_index = 0,
+		.group_status = 0,
+		.rdma_status = 0,
+		.last_group = 1,
+		.groups[0] = {
+			.id = 0,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_PDP][0],
+			.surface_desc = &surface_desc[NVDLA_OP_PDP][0],
+		},
+		.groups[1] = {
+			.id = 1,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_PDP][1],
+			.surface_desc = &surface_desc[NVDLA_OP_PDP][1],
+		},
+	},
+	.processors[NVDLA_OP_CDP] = {
+		.name = "CDP",
+		.op_type = NVDLA_OP_CDP,
+		.program = nvdla_cdp_program,
+		.enable = nvdla_cdp_enable,
+		.set_producer = nvdla_cdp_set_producer,
+		.is_ready = nvdla_cdp_is_ready,
+		.dump_config = nvdla_cdp_dump_config,
+		.rdma_check = nvdla_cdp_rdma_check,
+		.consumer_ptr = 0,
+		.roi_index = 0,
+		.group_status = 0,
+		.rdma_status = 0,
+		.last_group = 1,
+		.groups[0] = {
+			.id = 0,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_CDP][0],
+			.surface_desc = &surface_desc[NVDLA_OP_CDP][0],
+		},
+		.groups[1] = {
+			.id = 1,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_CDP][1],
+			.surface_desc = &surface_desc[NVDLA_OP_CDP][1],
+		},
+	},
+	.processors[NVDLA_OP_RUBIK] = {
+		.name = "RUBIK",
+		.op_type = NVDLA_OP_RUBIK,
+		.program = nvdla_rubik_program,
+		.enable = nvdla_rubik_enable,
+		.set_producer = nvdla_rubik_set_producer,
+		.is_ready = nvdla_rubik_is_ready,
+		.dump_config = nvdla_rubik_dump_config,
+		.rdma_check = nvdla_rubik_rdma_check,
+		.consumer_ptr = 0,
+		.roi_index = 0,
+		.group_status = 0,
+		.rdma_status = 0,
+		.last_group = 1,
+		.groups[0] = {
+			.id = 0,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_RUBIK][0],
+			.surface_desc = &surface_desc[NVDLA_OP_RUBIK][0],
+		},
+		.groups[1] = {
+			.id = 1,
+			.rdma_id = 0,
+			.active = 0,
+			.events = 0,
+			.roi_index = 0,
+			.is_rdma_needed = 0,
+			.lut_index = -1,
+			.operation_desc = &operation_desc[NVDLA_OP_RUBIK][1],
+			.surface_desc = &surface_desc[NVDLA_OP_RUBIK][1],
+		},
+	},
+};
These global variables aren't going to work because Tegra234 (Tegra194's successor) has two instances of NVDLA. So this data needs to be allocated dynamically, per instance. It also seems to me like some of this data could be separated into read-only (i.e. const) and variable data. For instance none of the function pointers ever seem to change, so those could be refactored into an "ops" structure and then passed by pointer into the engines. That might also be useful if we need to make some of these implementations different on subsequent generations of the IP.
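A rough sketch of what that split could look like (the type names and prototypes below are illustrative, I haven't matched them against the actual declarations in this patch):

	/* read-only, shared by all instances of the IP */
	struct nvdla_processor_ops {
		int (*program)(struct nvdla_processor *processor);
		int (*enable)(struct nvdla_processor *processor);
		void (*set_producer)(struct nvdla_processor *processor, int group, int rdma_group);
		int (*is_ready)(struct nvdla_processor *processor);
		void (*dump_config)(struct nvdla_processor *processor);
		int (*rdma_check)(struct nvdla_processor *processor);
	};

	static const struct nvdla_processor_ops nvdla_bdma_ops = {
		.program = nvdla_bdma_program,
		.enable = nvdla_bdma_enable,
		.set_producer = nvdla_bdma_set_producer,
		.is_ready = nvdla_bdma_is_ready,
		.dump_config = nvdla_bdma_dump_config,
		.rdma_check = nvdla_bdma_rdma_check,
	};

The mutable state would then live in a per-instance structure that ->probe() allocates with devm_kzalloc() and wires up to the const ops, e.g.:

	engine = devm_kzalloc(dev, sizeof(*engine), GFP_KERNEL);
	if (!engine)
		return -ENOMEM;

	engine->processors[NVDLA_OP_BDMA].ops = &nvdla_bdma_ops;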
+/* driver probe and init */
+static const struct of_device_id nvdla_of_match[] = {
+	{
+		.compatible = "nvidia,nvdla_os_initial",
+		.data = &nvdla_config_os_initial,
+	},
+	{
+		.compatible = "nvidia,nvdla_2",
+		.data = &nvdla_config_small,
+	},
+	{ },
+};
Looking at the definitions of these configuration structures, they seem to be software configuration variants rather than actually different hardware blocks. In fact, none of the above seems to be specific to any particular hardware version.
I would expect these compatible strings to reflect the exact chip that they can be found in. I.e. for Tegra194 I would expect this to be just:
"nvidia,tegra194-dla"
And then on Tegra234 it would become:
"nvidia,tegra234-dla"
at which point the parameterization can happen based on that compatible string, if we need to.
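I.e. something like this (the per-SoC data pointers are hypothetical and only needed if the generations actually end up differing):

	static const struct of_device_id nvdla_of_match[] = {
		{ .compatible = "nvidia,tegra194-dla", .data = &nvdla_tegra194_config },
		{ .compatible = "nvidia,tegra234-dla", .data = &nvdla_tegra234_config },
		{ /* sentinel */ },
	};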
Also, as I think Peter Robinson already mentioned, we'll need the device tree bindings for this, as well as device tree changes that create this device on one of the platforms supported upstream, so that we can test this.
I think in your first version of these patches you had also referenced the userspace driver component that you used to test this. Can you include links to that in the cover letter of subsequent versions so that people know where to look?
+static int32_t nvdla_probe(struct platform_device *pdev)
We generally don't use the sized types unless absolutely necessary. So this (and potentially other occurrences) should just switch to simple "int".
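I.e.:

	static int nvdla_probe(struct platform_device *pdev)

The fixed-width types are really only needed for data that is shared with hardware or crosses the user/kernel ABI boundary.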
+{
+	int32_t err = 0;
+	struct resource *res;
+	struct nvdla_device *nvdla_dev;
+	struct device *dev = &pdev->dev;
+	const struct of_device_id *match;
+
+	if (!pdev->dev.of_node)
+		return -EINVAL;
+
+	match = of_match_device(nvdla_of_match, &pdev->dev);
+	if (!match) {
+		pr_err("Missing DT entry!\n");
+		return -EINVAL;
+	}
There's usually no need for this check. If there was no device tree entry then the nvdla_probe() function wouldn't have been called.
+	nvdla_dev = devm_kzalloc(dev, sizeof(*nvdla_dev), GFP_KERNEL);
+	if (!nvdla_dev)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, nvdla_dev);
+	nvdla_dev->pdev = pdev;
Do we really need the platform device later on? Typically access to the struct device (i.e. &pdev->dev) is enough.
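I.e. a struct device * member (hypothetical name):

	nvdla_dev->dev = &pdev->dev;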
+	nvdla_dev->config_data = (struct nvdla_config *)match->data;
You can get this in a single call to of_device_get_match_data(), so you don't need the extra match local variable.
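Something like this (note that of_device_get_match_data() returns a const pointer, so ideally config_data would become const as well):

	nvdla_dev->config_data = of_device_get_match_data(dev);
	if (!nvdla_dev->config_data)
		return -EINVAL;

That also makes the of_match_device() check above redundant.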
+	init_completion(&nvdla_dev->event_notifier);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	nvdla_dev->base = devm_ioremap_resource(&pdev->dev, res);
I think people prefer devm_platform_get_and_ioremap_resource() these days...
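I.e.:

	nvdla_dev->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);

(and you can pass NULL instead of &res if the resource isn't needed afterwards).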
+	if (IS_ERR(nvdla_dev->base))
+		return PTR_ERR(nvdla_dev->base);
+	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "no irq resource\n");
+		return -EINVAL;
+	}
+	nvdla_dev->irq = res->start;
You can use platform_get_irq() here, which makes this a bit simpler.
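E.g.:

	err = platform_get_irq(pdev, 0);
	if (err < 0)
		return err;

	nvdla_dev->irq = err;

platform_get_irq() already prints an error message on failure, so the dev_err() goes away too.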
+	err = devm_request_irq(&pdev->dev, nvdla_dev->irq,
+			       nvdla_engine_isr, 0,
+			       dev_name(&pdev->dev), nvdla_dev);
+	if (err)
+		return err;
+	nvdla_dev->engine_context = &engine;
+	engine.task = &global_task;
+	engine.driver_context = (void *)nvdla_dev;
Why the generic void *? Can't we just store a struct nvdla_device * as the driver context directly? Do we expect this type to vary?
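I.e. simply:

	struct nvdla_device *driver_context;

in struct nvdla_engine, which would also get rid of the cast.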
What I'm missing in this ->probe() function is references to the various clocks, resets and power gates that need to be controlled in order for this IP to work. The power gates are typically implemented using generic power domains, so you can safely ignore those, provided you at least have them in the device tree, but I think you'd need to at the very least grab the clocks and resets that this needs. This might work without them for a while because Tegra194 and later do some hidden magic with clocks and resets, but for any finer-grained control we'll need them (for instance, we will want to switch into a low-power state when no tasks are being processed).
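As a minimal sketch (assuming a single clock and reset here, with hypothetical clk/rst fields in struct nvdla_device; the exact names have to come from the DT bindings):

	nvdla_dev->clk = devm_clk_get(dev, NULL);
	if (IS_ERR(nvdla_dev->clk))
		return PTR_ERR(nvdla_dev->clk);

	nvdla_dev->rst = devm_reset_control_get_exclusive(dev, NULL);
	if (IS_ERR(nvdla_dev->rst))
		return PTR_ERR(nvdla_dev->rst);

	err = clk_prepare_enable(nvdla_dev->clk);
	if (err < 0)
		return err;

	err = reset_control_deassert(nvdla_dev->rst);
	if (err < 0) {
		clk_disable_unprepare(nvdla_dev->clk);
		return err;
	}

Longer term this probably wants to move into runtime PM callbacks so that the engine can be powered down while idle.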
+	engine.task->task_data = NULL;
+
+	nvdla_init_op_cache(&engine);
+	nvdla_clear_task(nvdla_dev->engine_context);
+
+	err = nvdla_drm_probe(nvdla_dev);
+	if (err)
+		dev_err(&pdev->dev, "failed to register drm device\n");
+
+	return err;
+}
+static int32_t __exit nvdla_remove(struct platform_device *pdev)
+{
+	struct nvdla_device *nvdla_dev = dev_get_drvdata(&pdev->dev);
+
+	nvdla_drm_remove(nvdla_dev);
+
+	return 0;
+}
+static struct platform_driver nvdla_driver = {
+	.probe = nvdla_probe,
+	.remove = __exit_p(nvdla_remove),
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = "NVDLA",
+		.of_match_table = of_match_ptr(nvdla_of_match),
+	},
+};
+
+module_platform_driver(nvdla_driver);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Nvidia Deep Learning Accelerator driver");
+MODULE_IMPORT_NS(DMA_BUF);

Thierry