HDP flush is used early in the init sequence as part of memory controller block initialization. Hence remapping of HDP registers needed for flush needs to happen earlier.
This also fixes the Unsupported Request error reported through AER during driver load. The error happens as a write happens to the remap offset before real remapping is done.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216373
The error was unnoticed before and got visible because of the commit referenced below. This doesn't fix anything in the commit below, rather fixes the issue in amdgpu exposed by the commit. The reference is only to associate this commit with below one so that both go together.
Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()")
Reported-by: Tom Seewald tseewald@gmail.com Signed-off-by: Lijo Lazar lijo.lazar@amd.com Cc: stable@vger.kernel.org --- v2: Take care of IP resume cases (Alex Deucher) Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) Add more details in commit message and associate with AER patch (Bjorn Helgaas)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ 4 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce7d117efdb5..e420118769a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) return 0; }
+/** + * amdgpu_device_prepare_ip - prepare IPs for hardware initialization + * + * @adev: amdgpu_device pointer + * + * Any common hardware initialization sequence that needs to be done before + * hw init of individual IPs is performed here. This is different from the + * 'common block' which initializes a set of IPs. + */ +static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) +{ + /* Remap HDP registers to a hole in mmio space, for the purpose + * of exposing those registers to process space. This needs to be + * done before hw init of ip blocks to take care of HDP flush + * operations through registers during hw_init. + */ + if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers && + !amdgpu_sriov_vf(adev)) + adev->nbio.funcs->remap_hdp_registers(adev); +}
/** * amdgpu_device_ip_init - run init for hardware IPs @@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); goto init_failed; } + + amdgpu_device_prepare_ip(adev); r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); if (r) { DRM_ERROR("hw_init %d failed %d\n", i, r); @@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, };
+ amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { int j; struct amdgpu_ip_block *block; @@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) { int i, r;
+ amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue; diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index b3fba8dea63c..3ac7fef74277 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) nv_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev); - /* remap HDP registers to a hole in mmio space, - * for the purpose of expose those registers - * to process space - */ - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) - adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ nv_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index fde6154f2009..a0481e37d7cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) soc15_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev); - /* remap HDP registers to a hole in mmio space, - * for the purpose of expose those registers - * to process space - */ - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) - adev->nbio.funcs->remap_hdp_registers(adev);
/* enable the doorbell aperture */ soc15_enable_doorbell_aperture(adev, true); diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 55284b24f113..16b447055102 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) soc21_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev); - /* remap HDP registers to a hole in mmio space, - * for the purpose of expose those registers - * to process space - */ - if (adev->nbio.funcs->remap_hdp_registers) - adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc21_enable_doorbell_aperture(adev, true);
Make sure the register offsets used for HDP flush in VF is initialized early so that it works fine during any early HDP flush sequence. For that, move the offset initialization to *_remap_hdp.
Signed-off-by: Lijo Lazar lijo.lazar@amd.com Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-- drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c | 23 +++++++++++++-------- drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c | 12 +++++++---- drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c | 23 +++++++++++++-------- drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 21 ++++++++++++------- drivers/gpu/drm/amd/amdgpu/nbio_v7_2.c | 24 ++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 23 +++++++++++++-------- 7 files changed, 84 insertions(+), 45 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e420118769a5..9d698b9f4e54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2350,8 +2350,7 @@ static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) * done before hw init of ip blocks to take care of HDP flush * operations through registers during hw_init. */ - if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers && - !amdgpu_sriov_vf(adev)) + if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers) adev->nbio.funcs->remap_hdp_registers(adev); }
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c index b465baa26762..20fa2c5ad510 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c @@ -65,10 +65,21 @@
static void nbio_v2_3_remap_hdp_registers(struct amdgpu_device *adev) { - WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); - WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + if (amdgpu_sriov_vf(adev)) + adev->rmmio_remap.reg_offset = + SOC15_REG_OFFSET( + NBIO, 0, + mmBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) + << 2; + + if (!amdgpu_sriov_vf(adev)) { + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + } }
static u32 nbio_v2_3_get_rev_id(struct amdgpu_device *adev) @@ -338,10 +349,6 @@ static void nbio_v2_3_init_registers(struct amdgpu_device *adev)
if (def != data) WREG32_PCIE(smnPCIE_CONFIG_CNTL, data); - - if (amdgpu_sriov_vf(adev)) - adev->rmmio_remap.reg_offset = SOC15_REG_OFFSET(NBIO, 0, - mmBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) << 2; }
#define NAVI10_PCIE__LC_L0S_INACTIVITY_DEFAULT 0x00000000 // off by default, no gains over L1 diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c index 982a89f841d5..e011d9856794 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c @@ -30,10 +30,14 @@
static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev) { - WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_MEM_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); - WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_REG_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + if (!amdgpu_sriov_vf(adev)) { + WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_MEM_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); + WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_REG_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + } }
static u32 nbio_v4_3_get_rev_id(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c b/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c index f7f6ddebd3e4..7536ca3fcd69 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c @@ -55,10 +55,21 @@
static void nbio_v6_1_remap_hdp_registers(struct amdgpu_device *adev) { - WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); - WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + if (amdgpu_sriov_vf(adev)) + adev->rmmio_remap.reg_offset = + SOC15_REG_OFFSET( + NBIO, 0, + mmBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) + << 2; + + if (!amdgpu_sriov_vf(adev)) { + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + } }
static u32 nbio_v6_1_get_rev_id(struct amdgpu_device *adev) @@ -276,10 +287,6 @@ static void nbio_v6_1_init_registers(struct amdgpu_device *adev)
if (def != data) WREG32_PCIE(smnPCIE_CI_CNTL, data); - - if (amdgpu_sriov_vf(adev)) - adev->rmmio_remap.reg_offset = SOC15_REG_OFFSET(NBIO, 0, - mmBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) << 2; }
static void nbio_v6_1_program_ltr(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c index aa0326d00c72..6b4ac16a8466 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c @@ -35,10 +35,20 @@
static void nbio_v7_0_remap_hdp_registers(struct amdgpu_device *adev) { - WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); - WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + if (amdgpu_sriov_vf(adev)) + adev->rmmio_remap.reg_offset = + SOC15_REG_OFFSET(NBIO, 0, + mmHDP_MEM_COHERENCY_FLUSH_CNTL) + << 2; + + if (!amdgpu_sriov_vf(adev)) { + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + } }
static u32 nbio_v7_0_get_rev_id(struct amdgpu_device *adev) @@ -273,9 +283,6 @@ const struct nbio_hdp_flush_reg nbio_v7_0_hdp_flush_reg = {
static void nbio_v7_0_init_registers(struct amdgpu_device *adev) { - if (amdgpu_sriov_vf(adev)) - adev->rmmio_remap.reg_offset = - SOC15_REG_OFFSET(NBIO, 0, mmHDP_MEM_COHERENCY_FLUSH_CNTL) << 2; }
const struct amdgpu_nbio_funcs nbio_v7_0_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_2.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_2.c index 31776b12e4c4..fb4be72eade7 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_2.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_2.c @@ -49,10 +49,21 @@
static void nbio_v7_2_remap_hdp_registers(struct amdgpu_device *adev) { - WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_MEM_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); - WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_REG_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + if (amdgpu_sriov_vf(adev)) + adev->rmmio_remap.reg_offset = + SOC15_REG_OFFSET( + NBIO, 0, + regBIF_BX_PF0_HDP_MEM_COHERENCY_FLUSH_CNTL) + << 2; + + if (!amdgpu_sriov_vf(adev)) { + WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_MEM_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); + WREG32_SOC15(NBIO, 0, regBIF_BX0_REMAP_HDP_REG_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + } }
static u32 nbio_v7_2_get_rev_id(struct amdgpu_device *adev) @@ -369,6 +380,7 @@ const struct nbio_hdp_flush_reg nbio_v7_2_hdp_flush_reg = { static void nbio_v7_2_init_registers(struct amdgpu_device *adev) { uint32_t def, data; + switch (adev->ip_versions[NBIO_HWIP][0]) { case IP_VERSION(7, 2, 1): case IP_VERSION(7, 3, 0): @@ -393,10 +405,6 @@ static void nbio_v7_2_init_registers(struct amdgpu_device *adev) WREG32_PCIE_PORT(SOC15_REG_OFFSET(NBIO, 0, regPCIE_CONFIG_CNTL), data); break; } - - if (amdgpu_sriov_vf(adev)) - adev->rmmio_remap.reg_offset = SOC15_REG_OFFSET(NBIO, 0, - regBIF_BX_PF0_HDP_MEM_COHERENCY_FLUSH_CNTL) << 2; }
const struct amdgpu_nbio_funcs nbio_v7_2_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 11848d1e238b..3c11af99582f 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -101,10 +101,21 @@ static void nbio_v7_4_query_ras_error_count(struct amdgpu_device *adev,
static void nbio_v7_4_remap_hdp_registers(struct amdgpu_device *adev) { - WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); - WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, - adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + if (amdgpu_sriov_vf(adev)) + adev->rmmio_remap.reg_offset = + SOC15_REG_OFFSET( + NBIO, 0, + mmBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) + << 2; + + if (!amdgpu_sriov_vf(adev)) { + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL); + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL, + adev->rmmio_remap.reg_offset + + KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL); + } }
static u32 nbio_v7_4_get_rev_id(struct amdgpu_device *adev) @@ -343,10 +354,6 @@ static void nbio_v7_4_init_registers(struct amdgpu_device *adev) { uint32_t baco_cntl;
- if (amdgpu_sriov_vf(adev)) - adev->rmmio_remap.reg_offset = SOC15_REG_OFFSET(NBIO, 0, - mmBIF_BX_DEV0_EPF0_VF0_HDP_MEM_COHERENCY_FLUSH_CNTL) << 2; - if (adev->ip_versions[NBIO_HWIP][0] == IP_VERSION(7, 4, 4) && !amdgpu_sriov_vf(adev)) { baco_cntl = RREG32_SOC15(NBIO, 0, mmBACO_CNTL);
On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote:
HDP flush is used early in the init sequence as part of memory controller block initialization. Hence remapping of HDP registers needed for flush needs to happen earlier.
This also fixes the Unsupported Request error reported through AER during driver load. The error happens as a write happens to the remap offset before real remapping is done.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216373
The error was unnoticed before and got visible because of the commit referenced below. This doesn't fix anything in the commit below, rather fixes the issue in amdgpu exposed by the commit. The reference is only to associate this commit with below one so that both go together.
Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()")
Reported-by: Tom Seewald tseewald@gmail.com Signed-off-by: Lijo Lazar lijo.lazar@amd.com Cc: stable@vger.kernel.org
How about something like the attached patch rather than these two patches? It's a bit bigger but seems cleaner and more defensive in my opinion.
Alex
v2: Take care of IP resume cases (Alex Deucher) Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) Add more details in commit message and associate with AER patch (Bjorn Helgaas)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ 4 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce7d117efdb5..e420118769a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) return 0; }
+/**
- amdgpu_device_prepare_ip - prepare IPs for hardware initialization
- @adev: amdgpu_device pointer
- Any common hardware initialization sequence that needs to be done before
- hw init of individual IPs is performed here. This is different from the
- 'common block' which initializes a set of IPs.
- */
+static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) +{
/* Remap HDP registers to a hole in mmio space, for the purpose
* of exposing those registers to process space. This needs to be
* done before hw init of ip blocks to take care of HDP flush
* operations through registers during hw_init.
*/
if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers &&
!amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev);
+}
/**
- amdgpu_device_ip_init - run init for hardware IPs
@@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); goto init_failed; }
amdgpu_device_prepare_ip(adev); r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); if (r) { DRM_ERROR("hw_init %d failed %d\n", i, r);
@@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, };
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { int j; struct amdgpu_ip_block *block;
@@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) { int i, r;
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index b3fba8dea63c..3ac7fef74277 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) nv_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ nv_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index fde6154f2009..a0481e37d7cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) soc15_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc15_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 55284b24f113..16b447055102 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) soc21_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers)
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc21_enable_doorbell_aperture(adev, true);
-- 2.25.1
On 8/29/2022 10:20 PM, Alex Deucher wrote:
On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote:
HDP flush is used early in the init sequence as part of memory controller block initialization. Hence remapping of HDP registers needed for flush needs to happen earlier.
This also fixes the Unsupported Request error reported through AER during driver load. The error happens as a write happens to the remap offset before real remapping is done.
Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k...
The error was unnoticed before and got visible because of the commit referenced below. This doesn't fix anything in the commit below, rather fixes the issue in amdgpu exposed by the commit. The reference is only to associate this commit with below one so that both go together.
Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()")
Reported-by: Tom Seewald tseewald@gmail.com Signed-off-by: Lijo Lazar lijo.lazar@amd.com Cc: stable@vger.kernel.org
How about something like the attached patch rather than these two patches? It's a bit bigger but seems cleaner and more defensive in my opinion.
Whenever device goes to suspend/reset and then comes back, remap offset has to be set back to 0 to make sure it doesn't use the wrong offset when the register assumes default values again.
To avoid the if-check in hdp_flush (which is more frequent), another way is to initialize the remap offset to default offset during early init and hw fini/suspend sequences. It won't be obvious (even with this patch) as to when remap offset vs default offset is used though.
Thanks, Lijo
Alex
v2: Take care of IP resume cases (Alex Deucher) Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) Add more details in commit message and associate with AER patch (Bjorn Helgaas)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ 4 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce7d117efdb5..e420118769a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) return 0; }
+/**
- amdgpu_device_prepare_ip - prepare IPs for hardware initialization
- @adev: amdgpu_device pointer
- Any common hardware initialization sequence that needs to be done before
- hw init of individual IPs is performed here. This is different from the
- 'common block' which initializes a set of IPs.
- */
+static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) +{
/* Remap HDP registers to a hole in mmio space, for the purpose
* of exposing those registers to process space. This needs to be
* done before hw init of ip blocks to take care of HDP flush
* operations through registers during hw_init.
*/
if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers &&
!amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev);
+}
/**
- amdgpu_device_ip_init - run init for hardware IPs
@@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); goto init_failed; }
amdgpu_device_prepare_ip(adev); r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); if (r) { DRM_ERROR("hw_init %d failed %d\n", i, r);
@@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, };
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { int j; struct amdgpu_ip_block *block;
@@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) { int i, r;
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index b3fba8dea63c..3ac7fef74277 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) nv_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ nv_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index fde6154f2009..a0481e37d7cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) soc15_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc15_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 55284b24f113..16b447055102 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) soc21_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers)
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc21_enable_doorbell_aperture(adev, true);
-- 2.25.1
On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/29/2022 10:20 PM, Alex Deucher wrote:
On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote:
HDP flush is used early in the init sequence as part of memory controller block initialization. Hence remapping of HDP registers needed for flush needs to happen earlier.
This also fixes the Unsupported Request error reported through AER during driver load. The error happens as a write happens to the remap offset before real remapping is done.
Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k...
The error was unnoticed before and got visible because of the commit referenced below. This doesn't fix anything in the commit below, rather fixes the issue in amdgpu exposed by the commit. The reference is only to associate this commit with below one so that both go together.
Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()")
Reported-by: Tom Seewald tseewald@gmail.com Signed-off-by: Lijo Lazar lijo.lazar@amd.com Cc: stable@vger.kernel.org
How about something like the attached patch rather than these two patches? It's a bit bigger but seems cleaner and more defensive in my opinion.
Whenever device goes to suspend/reset and then comes back, remap offset has to be set back to 0 to make sure it doesn't use the wrong offset when the register assumes default values again.
To avoid the if-check in hdp_flush (which is more frequent), another way is to initialize the remap offset to default offset during early init and hw fini/suspend sequences. It won't be obvious (even with this patch) as to when remap offset vs default offset is used though.
On resume, the common IP is resumed first so it will always be set. The only case that is a problem is init because we init GMC out of order. We could init common before GMC in amdgpu_device_ip_init(). I think that should be fine, but I wasn't sure if there might be some fallout from that on certain cards.
Alex
Thanks, Lijo
Alex
v2: Take care of IP resume cases (Alex Deucher) Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) Add more details in commit message and associate with AER patch (Bjorn Helgaas)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ 4 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce7d117efdb5..e420118769a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) return 0; }
+/**
- amdgpu_device_prepare_ip - prepare IPs for hardware initialization
- @adev: amdgpu_device pointer
- Any common hardware initialization sequence that needs to be done before
- hw init of individual IPs is performed here. This is different from the
- 'common block' which initializes a set of IPs.
- */
+static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) +{
/* Remap HDP registers to a hole in mmio space, for the purpose
* of exposing those registers to process space. This needs to be
* done before hw init of ip blocks to take care of HDP flush
* operations through registers during hw_init.
*/
if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers &&
!amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev);
+}
/**
- amdgpu_device_ip_init - run init for hardware IPs
@@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); goto init_failed; }
amdgpu_device_prepare_ip(adev); r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); if (r) { DRM_ERROR("hw_init %d failed %d\n", i, r);
@@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, };
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { int j; struct amdgpu_ip_block *block;
@@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) { int i, r;
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index b3fba8dea63c..3ac7fef74277 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) nv_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ nv_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index fde6154f2009..a0481e37d7cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) soc15_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc15_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 55284b24f113..16b447055102 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) soc21_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers)
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc21_enable_doorbell_aperture(adev, true);
-- 2.25.1
On 8/30/2022 7:18 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/29/2022 10:20 PM, Alex Deucher wrote:
On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote:
HDP flush is used early in the init sequence as part of memory controller block initialization. Hence remapping of HDP registers needed for flush needs to happen earlier.
This also fixes the Unsupported Request error reported through AER during driver load. The error happens as a write happens to the remap offset before real remapping is done.
Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k...
The error was unnoticed before and got visible because of the commit referenced below. This doesn't fix anything in the commit below, rather fixes the issue in amdgpu exposed by the commit. The reference is only to associate this commit with below one so that both go together.
Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()")
Reported-by: Tom Seewald tseewald@gmail.com Signed-off-by: Lijo Lazar lijo.lazar@amd.com Cc: stable@vger.kernel.org
How about something like the attached patch rather than these two patches? It's a bit bigger but seems cleaner and more defensive in my opinion.
Whenever device goes to suspend/reset and then comes back, remap offset has to be set back to 0 to make sure it doesn't use the wrong offset when the register assumes default values again.
To avoid the if-check in hdp_flush (which is more frequent), another way is to initialize the remap offset to default offset during early init and hw fini/suspend sequences. It won't be obvious (even with this patch) as to when remap offset vs default offset is used though.
On resume, the common IP is resumed first so it will always be set. The only case that is a problem is init because we init GMC out of order. We could init common before GMC in amdgpu_device_ip_init(). I think that should be fine, but I wasn't sure if there might be some fallout from that on certain cards.
There are other places where an IP order is forced like in amdgpu_device_ip_reinit_early_sriov(). This also may not affect this case as remapping is not done for VF.
Agree that a better way is to have the common IP to be inited first.
Thanks, Lijo
Alex
Thanks, Lijo
Alex
v2: Take care of IP resume cases (Alex Deucher) Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) Add more details in commit message and associate with AER patch (Bjorn Helgaas)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ 4 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce7d117efdb5..e420118769a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) return 0; }
+/**
- amdgpu_device_prepare_ip - prepare IPs for hardware initialization
- @adev: amdgpu_device pointer
- Any common hardware initialization sequence that needs to be done before
- hw init of individual IPs is performed here. This is different from the
- 'common block' which initializes a set of IPs.
- */
+static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) +{
/* Remap HDP registers to a hole in mmio space, for the purpose
* of exposing those registers to process space. This needs to be
* done before hw init of ip blocks to take care of HDP flush
* operations through registers during hw_init.
*/
if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers &&
!amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev);
+}
/** * amdgpu_device_ip_init - run init for hardware IPs @@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); goto init_failed; }
amdgpu_device_prepare_ip(adev); r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); if (r) { DRM_ERROR("hw_init %d failed %d\n", i, r);
@@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, };
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { int j; struct amdgpu_ip_block *block;
@@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) { int i, r;
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index b3fba8dea63c..3ac7fef74277 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) nv_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ nv_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index fde6154f2009..a0481e37d7cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) soc15_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc15_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 55284b24f113..16b447055102 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) soc21_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers)
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc21_enable_doorbell_aperture(adev, true);
-- 2.25.1
On Tue, Aug 30, 2022 at 10:45 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 7:18 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/29/2022 10:20 PM, Alex Deucher wrote:
On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote:
HDP flush is used early in the init sequence as part of memory controller block initialization. Hence remapping of HDP registers needed for flush needs to happen earlier.
This also fixes the Unsupported Request error reported through AER during driver load. The error happens as a write happens to the remap offset before real remapping is done.
Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k...
The error was unnoticed before and got visible because of the commit referenced below. This doesn't fix anything in the commit below, rather fixes the issue in amdgpu exposed by the commit. The reference is only to associate this commit with below one so that both go together.
Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()")
Reported-by: Tom Seewald tseewald@gmail.com Signed-off-by: Lijo Lazar lijo.lazar@amd.com Cc: stable@vger.kernel.org
How about something like the attached patch rather than these two patches? It's a bit bigger but seems cleaner and more defensive in my opinion.
Whenever device goes to suspend/reset and then comes back, remap offset has to be set back to 0 to make sure it doesn't use the wrong offset when the register assumes default values again.
To avoid the if-check in hdp_flush (which is more frequent), another way is to initialize the remap offset to default offset during early init and hw fini/suspend sequences. It won't be obvious (even with this patch) as to when remap offset vs default offset is used though.
On resume, the common IP is resumed first so it will always be set. The only case that is a problem is init because we init GMC out of order. We could init common before GMC in amdgpu_device_ip_init(). I think that should be fine, but I wasn't sure if there might be some fallout from that on certain cards.
There are other places where an IP order is forced like in amdgpu_device_ip_reinit_early_sriov(). This also may not affect this case as remapping is not done for VF.
Agree that a better way is to have the common IP to be inited first.
How about these patches?
Alex
Thanks, Lijo
Alex
Thanks, Lijo
Alex
v2: Take care of IP resume cases (Alex Deucher) Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) Add more details in commit message and associate with AER patch (Bjorn Helgaas)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ 4 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce7d117efdb5..e420118769a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) return 0; }
+/**
- amdgpu_device_prepare_ip - prepare IPs for hardware initialization
- @adev: amdgpu_device pointer
- Any common hardware initialization sequence that needs to be done before
- hw init of individual IPs is performed here. This is different from the
- 'common block' which initializes a set of IPs.
- */
+static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) +{
/* Remap HDP registers to a hole in mmio space, for the purpose
* of exposing those registers to process space. This needs to be
* done before hw init of ip blocks to take care of HDP flush
* operations through registers during hw_init.
*/
if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers &&
!amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev);
+}
/** * amdgpu_device_ip_init - run init for hardware IPs @@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); goto init_failed; }
amdgpu_device_prepare_ip(adev); r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); if (r) { DRM_ERROR("hw_init %d failed %d\n", i, r);
@@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, };
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { int j; struct amdgpu_ip_block *block;
@@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) { int i, r;
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index b3fba8dea63c..3ac7fef74277 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) nv_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ nv_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index fde6154f2009..a0481e37d7cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) soc15_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc15_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 55284b24f113..16b447055102 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) soc21_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers)
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc21_enable_doorbell_aperture(adev, true);
-- 2.25.1
On 8/30/2022 8:39 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 10:45 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 7:18 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/29/2022 10:20 PM, Alex Deucher wrote:
On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote:
HDP flush is used early in the init sequence as part of memory controller block initialization. Hence remapping of HDP registers needed for flush needs to happen earlier.
This also fixes the Unsupported Request error reported through AER during driver load. The error happens as a write happens to the remap offset before real remapping is done.
Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k...
The error was unnoticed before and got visible because of the commit referenced below. This doesn't fix anything in the commit below, rather fixes the issue in amdgpu exposed by the commit. The reference is only to associate this commit with below one so that both go together.
Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()")
Reported-by: Tom Seewald tseewald@gmail.com Signed-off-by: Lijo Lazar lijo.lazar@amd.com Cc: stable@vger.kernel.org
How about something like the attached patch rather than these two patches? It's a bit bigger but seems cleaner and more defensive in my opinion.
Whenever device goes to suspend/reset and then comes back, remap offset has to be set back to 0 to make sure it doesn't use the wrong offset when the register assumes default values again.
To avoid the if-check in hdp_flush (which is more frequent), another way is to initialize the remap offset to default offset during early init and hw fini/suspend sequences. It won't be obvious (even with this patch) as to when remap offset vs default offset is used though.
On resume, the common IP is resumed first so it will always be set. The only case that is a problem is init because we init GMC out of order. We could init common before GMC in amdgpu_device_ip_init(). I think that should be fine, but I wasn't sure if there might be some fallout from that on certain cards.
There are other places where an IP order is forced like in amdgpu_device_ip_reinit_early_sriov(). This also may not affect this case as remapping is not done for VF.
Agree that a better way is to have the common IP to be inited first.
How about these patches?
Looks good to me. BTW, is nbio 7.7 for an APU (in which case hdp flush is not expected to be used)?
Thanks, Lijo
Alex
Thanks, Lijo
Alex
Thanks, Lijo
Alex
v2: Take care of IP resume cases (Alex Deucher) Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) Add more details in commit message and associate with AER patch (Bjorn Helgaas)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ 4 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ce7d117efdb5..e420118769a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) return 0; }
+/**
- amdgpu_device_prepare_ip - prepare IPs for hardware initialization
- @adev: amdgpu_device pointer
- Any common hardware initialization sequence that needs to be done before
- hw init of individual IPs is performed here. This is different from the
- 'common block' which initializes a set of IPs.
- */
+static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) +{
/* Remap HDP registers to a hole in mmio space, for the purpose
* of exposing those registers to process space. This needs to be
* done before hw init of ip blocks to take care of HDP flush
* operations through registers during hw_init.
*/
if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers &&
!amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev);
+}
/** * amdgpu_device_ip_init - run init for hardware IPs
@@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); goto init_failed; }
amdgpu_device_prepare_ip(adev); r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); if (r) { DRM_ERROR("hw_init %d failed %d\n", i, r);
@@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, };
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { int j; struct amdgpu_ip_block *block;
@@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) { int i, r;
amdgpu_device_prepare_ip(adev); for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index b3fba8dea63c..3ac7fef74277 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) nv_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ nv_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index fde6154f2009..a0481e37d7cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) soc15_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev))
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc15_enable_doorbell_aperture(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 55284b24f113..16b447055102 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) soc21_program_aspm(adev); /* setup nbio registers */ adev->nbio.funcs->init_registers(adev);
/* remap HDP registers to a hole in mmio space,
* for the purpose of expose those registers
* to process space
*/
if (adev->nbio.funcs->remap_hdp_registers)
adev->nbio.funcs->remap_hdp_registers(adev); /* enable the doorbell aperture */ soc21_enable_doorbell_aperture(adev, true);
-- 2.25.1
On Tue, Aug 30, 2022 at 12:06 PM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 8:39 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 10:45 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 7:18 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/29/2022 10:20 PM, Alex Deucher wrote:
On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote: > > HDP flush is used early in the init sequence as part of memory controller > block initialization. Hence remapping of HDP registers needed for flush > needs to happen earlier. > > This also fixes the Unsupported Request error reported through AER during > driver load. The error happens as a write happens to the remap offset > before real remapping is done. > > Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k... > > The error was unnoticed before and got visible because of the commit > referenced below. This doesn't fix anything in the commit below, rather > fixes the issue in amdgpu exposed by the commit. The reference is only > to associate this commit with below one so that both go together. > > Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()") > > Reported-by: Tom Seewald tseewald@gmail.com > Signed-off-by: Lijo Lazar lijo.lazar@amd.com > Cc: stable@vger.kernel.org
How about something like the attached patch rather than these two patches? It's a bit bigger but seems cleaner and more defensive in my opinion.
Whenever device goes to suspend/reset and then comes back, remap offset has to be set back to 0 to make sure it doesn't use the wrong offset when the register assumes default values again.
To avoid the if-check in hdp_flush (which is more frequent), another way is to initialize the remap offset to default offset during early init and hw fini/suspend sequences. It won't be obvious (even with this patch) as to when remap offset vs default offset is used though.
On resume, the common IP is resumed first so it will always be set. The only case that is a problem is init because we init GMC out of order. We could init common before GMC in amdgpu_device_ip_init(). I think that should be fine, but I wasn't sure if there might be some fallout from that on certain cards.
There are other places where an IP order is forced like in amdgpu_device_ip_reinit_early_sriov(). This also may not affect this case as remapping is not done for VF.
Agree that a better way is to have the common IP to be inited first.
How about these patches?
Looks good to me. BTW, is nbio 7.7 for an APU (in which case hdp flush is not expected to be used)?
It would be used in some cases, e.g., GPU VM passthrough where we use the BAR rather than the carve out.
Alex
Thanks, Lijo
Alex
Thanks, Lijo
Alex
Thanks, Lijo
Alex
> --- > v2: > Take care of IP resume cases (Alex Deucher) > Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) > Add more details in commit message and associate with AER patch (Bjorn > Helgaas) > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ > drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ > drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ > 4 files changed, 24 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index ce7d117efdb5..e420118769a5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) > return 0; > } > > +/** > + * amdgpu_device_prepare_ip - prepare IPs for hardware initialization > + * > + * @adev: amdgpu_device pointer > + * > + * Any common hardware initialization sequence that needs to be done before > + * hw init of individual IPs is performed here. This is different from the > + * 'common block' which initializes a set of IPs. > + */ > +static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) > +{ > + /* Remap HDP registers to a hole in mmio space, for the purpose > + * of exposing those registers to process space. This needs to be > + * done before hw init of ip blocks to take care of HDP flush > + * operations through registers during hw_init. > + */ > + if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers && > + !amdgpu_sriov_vf(adev)) > + adev->nbio.funcs->remap_hdp_registers(adev); > +} > > /** > * amdgpu_device_ip_init - run init for hardware IPs > @@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) > DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); > goto init_failed; > } > + > + amdgpu_device_prepare_ip(adev); > r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); > if (r) { > DRM_ERROR("hw_init %d failed %d\n", i, r); > @@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) > AMD_IP_BLOCK_TYPE_IH, > }; > > + amdgpu_device_prepare_ip(adev); > for (i = 0; i < adev->num_ip_blocks; i++) { > int j; > struct amdgpu_ip_block *block; > @@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) > { > int i, r; > > + amdgpu_device_prepare_ip(adev); > for (i = 0; i < adev->num_ip_blocks; i++) { > if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) > continue; > diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c > index b3fba8dea63c..3ac7fef74277 100644 > --- a/drivers/gpu/drm/amd/amdgpu/nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/nv.c > @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) > nv_program_aspm(adev); > /* setup nbio registers */ > adev->nbio.funcs->init_registers(adev); > - /* remap HDP registers to a hole in mmio space, > - * for the purpose of expose those registers > - * to process space > - */ > - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) > - adev->nbio.funcs->remap_hdp_registers(adev); > /* enable the doorbell aperture */ > nv_enable_doorbell_aperture(adev, true); > > diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c > index fde6154f2009..a0481e37d7cf 100644 > --- a/drivers/gpu/drm/amd/amdgpu/soc15.c > +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c > @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) > soc15_program_aspm(adev); > /* setup nbio registers */ > adev->nbio.funcs->init_registers(adev); > - /* remap HDP registers to a hole in mmio space, > - * for the purpose of expose those registers > - * to process space > - */ > - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) > - adev->nbio.funcs->remap_hdp_registers(adev); > > /* enable the doorbell aperture */ > soc15_enable_doorbell_aperture(adev, true); > diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c > index 55284b24f113..16b447055102 100644 > --- a/drivers/gpu/drm/amd/amdgpu/soc21.c > +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c > @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) > soc21_program_aspm(adev); > /* setup nbio registers */ > adev->nbio.funcs->init_registers(adev); > - /* remap HDP registers to a hole in mmio space, > - * for the purpose of expose those registers > - * to process space > - */ > - if (adev->nbio.funcs->remap_hdp_registers) > - adev->nbio.funcs->remap_hdp_registers(adev); > /* enable the doorbell aperture */ > soc21_enable_doorbell_aperture(adev, true); > > -- > 2.25.1 >
How about this? We should just move HDP remap to gmc hw_init since that is mainly what uses it anyway.
Alex
On Tue, Aug 30, 2022 at 2:05 PM Alex Deucher alexdeucher@gmail.com wrote:
On Tue, Aug 30, 2022 at 12:06 PM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 8:39 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 10:45 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 7:18 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/29/2022 10:20 PM, Alex Deucher wrote: > On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote: >> >> HDP flush is used early in the init sequence as part of memory controller >> block initialization. Hence remapping of HDP registers needed for flush >> needs to happen earlier. >> >> This also fixes the Unsupported Request error reported through AER during >> driver load. The error happens as a write happens to the remap offset >> before real remapping is done. >> >> Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k... >> >> The error was unnoticed before and got visible because of the commit >> referenced below. This doesn't fix anything in the commit below, rather >> fixes the issue in amdgpu exposed by the commit. The reference is only >> to associate this commit with below one so that both go together. >> >> Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()") >> >> Reported-by: Tom Seewald tseewald@gmail.com >> Signed-off-by: Lijo Lazar lijo.lazar@amd.com >> Cc: stable@vger.kernel.org > > How about something like the attached patch rather than these two > patches? It's a bit bigger but seems cleaner and more defensive in my > opinion. >
Whenever device goes to suspend/reset and then comes back, remap offset has to be set back to 0 to make sure it doesn't use the wrong offset when the register assumes default values again.
To avoid the if-check in hdp_flush (which is more frequent), another way is to initialize the remap offset to default offset during early init and hw fini/suspend sequences. It won't be obvious (even with this patch) as to when remap offset vs default offset is used though.
On resume, the common IP is resumed first so it will always be set. The only case that is a problem is init because we init GMC out of order. We could init common before GMC in amdgpu_device_ip_init(). I think that should be fine, but I wasn't sure if there might be some fallout from that on certain cards.
There are other places where an IP order is forced like in amdgpu_device_ip_reinit_early_sriov(). This also may not affect this case as remapping is not done for VF.
Agree that a better way is to have the common IP to be inited first.
How about these patches?
Looks good to me. BTW, is nbio 7.7 for an APU (in which case hdp flush is not expected to be used)?
It would be used in some cases, e.g., GPU VM passthrough where we use the BAR rather than the carve out.
Alex
Thanks, Lijo
Alex
Thanks, Lijo
Alex
Thanks, Lijo
> Alex > >> --- >> v2: >> Take care of IP resume cases (Alex Deucher) >> Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) >> Add more details in commit message and associate with AER patch (Bjorn >> Helgaas) >> >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ >> drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ >> drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ >> drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ >> 4 files changed, 24 insertions(+), 18 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index ce7d117efdb5..e420118769a5 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) >> return 0; >> } >> >> +/** >> + * amdgpu_device_prepare_ip - prepare IPs for hardware initialization >> + * >> + * @adev: amdgpu_device pointer >> + * >> + * Any common hardware initialization sequence that needs to be done before >> + * hw init of individual IPs is performed here. This is different from the >> + * 'common block' which initializes a set of IPs. >> + */ >> +static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) >> +{ >> + /* Remap HDP registers to a hole in mmio space, for the purpose >> + * of exposing those registers to process space. This needs to be >> + * done before hw init of ip blocks to take care of HDP flush >> + * operations through registers during hw_init. >> + */ >> + if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers && >> + !amdgpu_sriov_vf(adev)) >> + adev->nbio.funcs->remap_hdp_registers(adev); >> +} >> >> /** >> * amdgpu_device_ip_init - run init for hardware IPs >> @@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) >> DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); >> goto init_failed; >> } >> + >> + amdgpu_device_prepare_ip(adev); >> r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); >> if (r) { >> DRM_ERROR("hw_init %d failed %d\n", i, r); >> @@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) >> AMD_IP_BLOCK_TYPE_IH, >> }; >> >> + amdgpu_device_prepare_ip(adev); >> for (i = 0; i < adev->num_ip_blocks; i++) { >> int j; >> struct amdgpu_ip_block *block; >> @@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) >> { >> int i, r; >> >> + amdgpu_device_prepare_ip(adev); >> for (i = 0; i < adev->num_ip_blocks; i++) { >> if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) >> continue; >> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c >> index b3fba8dea63c..3ac7fef74277 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/nv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c >> @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) >> nv_program_aspm(adev); >> /* setup nbio registers */ >> adev->nbio.funcs->init_registers(adev); >> - /* remap HDP registers to a hole in mmio space, >> - * for the purpose of expose those registers >> - * to process space >> - */ >> - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) >> - adev->nbio.funcs->remap_hdp_registers(adev); >> /* enable the doorbell aperture */ >> nv_enable_doorbell_aperture(adev, true); >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c >> index fde6154f2009..a0481e37d7cf 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c >> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c >> @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) >> soc15_program_aspm(adev); >> /* setup nbio registers */ >> adev->nbio.funcs->init_registers(adev); >> - /* remap HDP registers to a hole in mmio space, >> - * for the purpose of expose those registers >> - * to process space >> - */ >> - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) >> - adev->nbio.funcs->remap_hdp_registers(adev); >> >> /* enable the doorbell aperture */ >> soc15_enable_doorbell_aperture(adev, true); >> diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c >> index 55284b24f113..16b447055102 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/soc21.c >> +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c >> @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) >> soc21_program_aspm(adev); >> /* setup nbio registers */ >> adev->nbio.funcs->init_registers(adev); >> - /* remap HDP registers to a hole in mmio space, >> - * for the purpose of expose those registers >> - * to process space >> - */ >> - if (adev->nbio.funcs->remap_hdp_registers) >> - adev->nbio.funcs->remap_hdp_registers(adev); >> /* enable the doorbell aperture */ >> soc21_enable_doorbell_aperture(adev, true); >> >> -- >> 2.25.1 >>
On 9/2/2022 1:09 AM, Alex Deucher wrote:
How about this? We should just move HDP remap to gmc hw_init since that is mainly what uses it anyway.
Sorry, I missed to R-B the previous version. Did you find any problem when common block is initialized first?
I think that patch provides a consistent IP init sequence during cold init and resume.
Thanks, Lijo
Alex
On Tue, Aug 30, 2022 at 2:05 PM Alex Deucher alexdeucher@gmail.com wrote:
On Tue, Aug 30, 2022 at 12:06 PM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 8:39 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 10:45 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 7:18 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote: > > > > On 8/29/2022 10:20 PM, Alex Deucher wrote: >> On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote: >>> >>> HDP flush is used early in the init sequence as part of memory controller >>> block initialization. Hence remapping of HDP registers needed for flush >>> needs to happen earlier. >>> >>> This also fixes the Unsupported Request error reported through AER during >>> driver load. The error happens as a write happens to the remap offset >>> before real remapping is done. >>> >>> Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k... >>> >>> The error was unnoticed before and got visible because of the commit >>> referenced below. This doesn't fix anything in the commit below, rather >>> fixes the issue in amdgpu exposed by the commit. The reference is only >>> to associate this commit with below one so that both go together. >>> >>> Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()") >>> >>> Reported-by: Tom Seewald tseewald@gmail.com >>> Signed-off-by: Lijo Lazar lijo.lazar@amd.com >>> Cc: stable@vger.kernel.org >> >> How about something like the attached patch rather than these two >> patches? It's a bit bigger but seems cleaner and more defensive in my >> opinion. >> > > Whenever device goes to suspend/reset and then comes back, remap offset > has to be set back to 0 to make sure it doesn't use the wrong offset > when the register assumes default values again. > > To avoid the if-check in hdp_flush (which is more frequent), another way > is to initialize the remap offset to default offset during early init > and hw fini/suspend sequences. It won't be obvious (even with this > patch) as to when remap offset vs default offset is used though.
On resume, the common IP is resumed first so it will always be set. The only case that is a problem is init because we init GMC out of order. We could init common before GMC in amdgpu_device_ip_init(). I think that should be fine, but I wasn't sure if there might be some fallout from that on certain cards.
There are other places where an IP order is forced like in amdgpu_device_ip_reinit_early_sriov(). This also may not affect this case as remapping is not done for VF.
Agree that a better way is to have the common IP to be inited first.
How about these patches?
Looks good to me. BTW, is nbio 7.7 for an APU (in which case hdp flush is not expected to be used)?
It would be used in some cases, e.g., GPU VM passthrough where we use the BAR rather than the carve out.
Alex
Thanks, Lijo
Alex
Thanks, Lijo
Alex
> > Thanks, > Lijo > >> Alex >> >>> --- >>> v2: >>> Take care of IP resume cases (Alex Deucher) >>> Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) >>> Add more details in commit message and associate with AER patch (Bjorn >>> Helgaas) >>> >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ >>> drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ >>> drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ >>> drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ >>> 4 files changed, 24 insertions(+), 18 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index ce7d117efdb5..e420118769a5 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) >>> return 0; >>> } >>> >>> +/** >>> + * amdgpu_device_prepare_ip - prepare IPs for hardware initialization >>> + * >>> + * @adev: amdgpu_device pointer >>> + * >>> + * Any common hardware initialization sequence that needs to be done before >>> + * hw init of individual IPs is performed here. This is different from the >>> + * 'common block' which initializes a set of IPs. >>> + */ >>> +static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) >>> +{ >>> + /* Remap HDP registers to a hole in mmio space, for the purpose >>> + * of exposing those registers to process space. This needs to be >>> + * done before hw init of ip blocks to take care of HDP flush >>> + * operations through registers during hw_init. >>> + */ >>> + if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers && >>> + !amdgpu_sriov_vf(adev)) >>> + adev->nbio.funcs->remap_hdp_registers(adev); >>> +} >>> >>> /** >>> * amdgpu_device_ip_init - run init for hardware IPs >>> @@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) >>> DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); >>> goto init_failed; >>> } >>> + >>> + amdgpu_device_prepare_ip(adev); >>> r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); >>> if (r) { >>> DRM_ERROR("hw_init %d failed %d\n", i, r); >>> @@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) >>> AMD_IP_BLOCK_TYPE_IH, >>> }; >>> >>> + amdgpu_device_prepare_ip(adev); >>> for (i = 0; i < adev->num_ip_blocks; i++) { >>> int j; >>> struct amdgpu_ip_block *block; >>> @@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) >>> { >>> int i, r; >>> >>> + amdgpu_device_prepare_ip(adev); >>> for (i = 0; i < adev->num_ip_blocks; i++) { >>> if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) >>> continue; >>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c >>> index b3fba8dea63c..3ac7fef74277 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c >>> @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) >>> nv_program_aspm(adev); >>> /* setup nbio registers */ >>> adev->nbio.funcs->init_registers(adev); >>> - /* remap HDP registers to a hole in mmio space, >>> - * for the purpose of expose those registers >>> - * to process space >>> - */ >>> - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) >>> - adev->nbio.funcs->remap_hdp_registers(adev); >>> /* enable the doorbell aperture */ >>> nv_enable_doorbell_aperture(adev, true); >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c >>> index fde6154f2009..a0481e37d7cf 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c >>> @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) >>> soc15_program_aspm(adev); >>> /* setup nbio registers */ >>> adev->nbio.funcs->init_registers(adev); >>> - /* remap HDP registers to a hole in mmio space, >>> - * for the purpose of expose those registers >>> - * to process space >>> - */ >>> - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) >>> - adev->nbio.funcs->remap_hdp_registers(adev); >>> >>> /* enable the doorbell aperture */ >>> soc15_enable_doorbell_aperture(adev, true); >>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c >>> index 55284b24f113..16b447055102 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/soc21.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c >>> @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) >>> soc21_program_aspm(adev); >>> /* setup nbio registers */ >>> adev->nbio.funcs->init_registers(adev); >>> - /* remap HDP registers to a hole in mmio space, >>> - * for the purpose of expose those registers >>> - * to process space >>> - */ >>> - if (adev->nbio.funcs->remap_hdp_registers) >>> - adev->nbio.funcs->remap_hdp_registers(adev); >>> /* enable the doorbell aperture */ >>> soc21_enable_doorbell_aperture(adev, true); >>> >>> -- >>> 2.25.1 >>>
On Mon, Sep 5, 2022 at 1:27 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 9/2/2022 1:09 AM, Alex Deucher wrote:
How about this? We should just move HDP remap to gmc hw_init since that is mainly what uses it anyway.
Sorry, I missed to R-B the previous version. Did you find any problem when common block is initialized first?
One of the users on the bug report reported an issue with that patch, that said, that user seems to be seeing a slightly different issue since he is on a gfx9 card and the original reporter was on gfx10. https://bugzilla.kernel.org/show_bug.cgi?id=216373
Alex
I think that patch provides a consistent IP init sequence during cold init and resume.
Thanks, Lijo
Alex
On Tue, Aug 30, 2022 at 2:05 PM Alex Deucher alexdeucher@gmail.com wrote:
On Tue, Aug 30, 2022 at 12:06 PM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 8:39 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 10:45 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 7:18 PM, Alex Deucher wrote: > On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote: >> >> >> >> On 8/29/2022 10:20 PM, Alex Deucher wrote: >>> On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote: >>>> >>>> HDP flush is used early in the init sequence as part of memory controller >>>> block initialization. Hence remapping of HDP registers needed for flush >>>> needs to happen earlier. >>>> >>>> This also fixes the Unsupported Request error reported through AER during >>>> driver load. The error happens as a write happens to the remap offset >>>> before real remapping is done. >>>> >>>> Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k... >>>> >>>> The error was unnoticed before and got visible because of the commit >>>> referenced below. This doesn't fix anything in the commit below, rather >>>> fixes the issue in amdgpu exposed by the commit. The reference is only >>>> to associate this commit with below one so that both go together. >>>> >>>> Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()") >>>> >>>> Reported-by: Tom Seewald tseewald@gmail.com >>>> Signed-off-by: Lijo Lazar lijo.lazar@amd.com >>>> Cc: stable@vger.kernel.org >>> >>> How about something like the attached patch rather than these two >>> patches? It's a bit bigger but seems cleaner and more defensive in my >>> opinion. >>> >> >> Whenever device goes to suspend/reset and then comes back, remap offset >> has to be set back to 0 to make sure it doesn't use the wrong offset >> when the register assumes default values again. >> >> To avoid the if-check in hdp_flush (which is more frequent), another way >> is to initialize the remap offset to default offset during early init >> and hw fini/suspend sequences. It won't be obvious (even with this >> patch) as to when remap offset vs default offset is used though. > > On resume, the common IP is resumed first so it will always be set. > The only case that is a problem is init because we init GMC out of > order. We could init common before GMC in amdgpu_device_ip_init(). I > think that should be fine, but I wasn't sure if there might be some > fallout from that on certain cards. >
There are other places where an IP order is forced like in amdgpu_device_ip_reinit_early_sriov(). This also may not affect this case as remapping is not done for VF.
Agree that a better way is to have the common IP to be inited first.
How about these patches?
Looks good to me. BTW, is nbio 7.7 for an APU (in which case hdp flush is not expected to be used)?
It would be used in some cases, e.g., GPU VM passthrough where we use the BAR rather than the carve out.
Alex
Thanks, Lijo
Alex
Thanks, Lijo
> Alex > >> >> Thanks, >> Lijo >> >>> Alex >>> >>>> --- >>>> v2: >>>> Take care of IP resume cases (Alex Deucher) >>>> Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) >>>> Add more details in commit message and associate with AER patch (Bjorn >>>> Helgaas) >>>> >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ >>>> drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ >>>> drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ >>>> drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ >>>> 4 files changed, 24 insertions(+), 18 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> index ce7d117efdb5..e420118769a5 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) >>>> return 0; >>>> } >>>> >>>> +/** >>>> + * amdgpu_device_prepare_ip - prepare IPs for hardware initialization >>>> + * >>>> + * @adev: amdgpu_device pointer >>>> + * >>>> + * Any common hardware initialization sequence that needs to be done before >>>> + * hw init of individual IPs is performed here. This is different from the >>>> + * 'common block' which initializes a set of IPs. >>>> + */ >>>> +static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) >>>> +{ >>>> + /* Remap HDP registers to a hole in mmio space, for the purpose >>>> + * of exposing those registers to process space. This needs to be >>>> + * done before hw init of ip blocks to take care of HDP flush >>>> + * operations through registers during hw_init. >>>> + */ >>>> + if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers && >>>> + !amdgpu_sriov_vf(adev)) >>>> + adev->nbio.funcs->remap_hdp_registers(adev); >>>> +} >>>> >>>> /** >>>> * amdgpu_device_ip_init - run init for hardware IPs >>>> @@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) >>>> DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); >>>> goto init_failed; >>>> } >>>> + >>>> + amdgpu_device_prepare_ip(adev); >>>> r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); >>>> if (r) { >>>> DRM_ERROR("hw_init %d failed %d\n", i, r); >>>> @@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) >>>> AMD_IP_BLOCK_TYPE_IH, >>>> }; >>>> >>>> + amdgpu_device_prepare_ip(adev); >>>> for (i = 0; i < adev->num_ip_blocks; i++) { >>>> int j; >>>> struct amdgpu_ip_block *block; >>>> @@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) >>>> { >>>> int i, r; >>>> >>>> + amdgpu_device_prepare_ip(adev); >>>> for (i = 0; i < adev->num_ip_blocks; i++) { >>>> if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) >>>> continue; >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c >>>> index b3fba8dea63c..3ac7fef74277 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c >>>> @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) >>>> nv_program_aspm(adev); >>>> /* setup nbio registers */ >>>> adev->nbio.funcs->init_registers(adev); >>>> - /* remap HDP registers to a hole in mmio space, >>>> - * for the purpose of expose those registers >>>> - * to process space >>>> - */ >>>> - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) >>>> - adev->nbio.funcs->remap_hdp_registers(adev); >>>> /* enable the doorbell aperture */ >>>> nv_enable_doorbell_aperture(adev, true); >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c >>>> index fde6154f2009..a0481e37d7cf 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c >>>> @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) >>>> soc15_program_aspm(adev); >>>> /* setup nbio registers */ >>>> adev->nbio.funcs->init_registers(adev); >>>> - /* remap HDP registers to a hole in mmio space, >>>> - * for the purpose of expose those registers >>>> - * to process space >>>> - */ >>>> - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) >>>> - adev->nbio.funcs->remap_hdp_registers(adev); >>>> >>>> /* enable the doorbell aperture */ >>>> soc15_enable_doorbell_aperture(adev, true); >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c >>>> index 55284b24f113..16b447055102 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/soc21.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c >>>> @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) >>>> soc21_program_aspm(adev); >>>> /* setup nbio registers */ >>>> adev->nbio.funcs->init_registers(adev); >>>> - /* remap HDP registers to a hole in mmio space, >>>> - * for the purpose of expose those registers >>>> - * to process space >>>> - */ >>>> - if (adev->nbio.funcs->remap_hdp_registers) >>>> - adev->nbio.funcs->remap_hdp_registers(adev); >>>> /* enable the doorbell aperture */ >>>> soc21_enable_doorbell_aperture(adev, true); >>>> >>>> -- >>>> 2.25.1 >>>>
On 9/6/2022 8:55 PM, Alex Deucher wrote:
On Mon, Sep 5, 2022 at 1:27 AM Lazar, Lijo lijo.lazar@amd.com wrote:
On 9/2/2022 1:09 AM, Alex Deucher wrote:
How about this? We should just move HDP remap to gmc hw_init since that is mainly what uses it anyway.
Sorry, I missed to R-B the previous version. Did you find any problem when common block is initialized first?
One of the users on the bug report reported an issue with that patch, that said, that user seems to be seeing a slightly different issue since he is on a gfx9 card and the original reporter was on gfx10. https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k...
Yes, I see Bjorn already pointed to another potential issue in LTR enablement. So regardless of init-common-first patch, the new error will be reported and that is unrelated to the original reported error through AER. I still think init-common-first patch is good.
Thanks, Lijo
Alex
I think that patch provides a consistent IP init sequence during cold init and resume.
Thanks, Lijo
Alex
On Tue, Aug 30, 2022 at 2:05 PM Alex Deucher alexdeucher@gmail.com wrote:
On Tue, Aug 30, 2022 at 12:06 PM Lazar, Lijo lijo.lazar@amd.com wrote:
On 8/30/2022 8:39 PM, Alex Deucher wrote:
On Tue, Aug 30, 2022 at 10:45 AM Lazar, Lijo lijo.lazar@amd.com wrote: > > > > On 8/30/2022 7:18 PM, Alex Deucher wrote: >> On Tue, Aug 30, 2022 at 12:05 AM Lazar, Lijo lijo.lazar@amd.com wrote: >>> >>> >>> >>> On 8/29/2022 10:20 PM, Alex Deucher wrote: >>>> On Mon, Aug 29, 2022 at 4:18 AM Lijo Lazar lijo.lazar@amd.com wrote: >>>>> >>>>> HDP flush is used early in the init sequence as part of memory controller >>>>> block initialization. Hence remapping of HDP registers needed for flush >>>>> needs to happen earlier. >>>>> >>>>> This also fixes the Unsupported Request error reported through AER during >>>>> driver load. The error happens as a write happens to the remap offset >>>>> before real remapping is done. >>>>> >>>>> Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugzilla.k... >>>>> >>>>> The error was unnoticed before and got visible because of the commit >>>>> referenced below. This doesn't fix anything in the commit below, rather >>>>> fixes the issue in amdgpu exposed by the commit. The reference is only >>>>> to associate this commit with below one so that both go together. >>>>> >>>>> Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in get_port_device_capability()") >>>>> >>>>> Reported-by: Tom Seewald tseewald@gmail.com >>>>> Signed-off-by: Lijo Lazar lijo.lazar@amd.com >>>>> Cc: stable@vger.kernel.org >>>> >>>> How about something like the attached patch rather than these two >>>> patches? It's a bit bigger but seems cleaner and more defensive in my >>>> opinion. >>>> >>> >>> Whenever device goes to suspend/reset and then comes back, remap offset >>> has to be set back to 0 to make sure it doesn't use the wrong offset >>> when the register assumes default values again. >>> >>> To avoid the if-check in hdp_flush (which is more frequent), another way >>> is to initialize the remap offset to default offset during early init >>> and hw fini/suspend sequences. It won't be obvious (even with this >>> patch) as to when remap offset vs default offset is used though. >> >> On resume, the common IP is resumed first so it will always be set. >> The only case that is a problem is init because we init GMC out of >> order. We could init common before GMC in amdgpu_device_ip_init(). I >> think that should be fine, but I wasn't sure if there might be some >> fallout from that on certain cards. >> > > There are other places where an IP order is forced like in > amdgpu_device_ip_reinit_early_sriov(). This also may not affect this > case as remapping is not done for VF. > > Agree that a better way is to have the common IP to be inited first.
How about these patches?
Looks good to me. BTW, is nbio 7.7 for an APU (in which case hdp flush is not expected to be used)?
It would be used in some cases, e.g., GPU VM passthrough where we use the BAR rather than the carve out.
Alex
Thanks, Lijo
Alex
> > Thanks, > Lijo > >> Alex >> >>> >>> Thanks, >>> Lijo >>> >>>> Alex >>>> >>>>> --- >>>>> v2: >>>>> Take care of IP resume cases (Alex Deucher) >>>>> Add NULL check to nbio.funcs to cover older (GFXv8) ASICs (Felix Kuehling) >>>>> Add more details in commit message and associate with AER patch (Bjorn >>>>> Helgaas) >>>>> >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ >>>>> drivers/gpu/drm/amd/amdgpu/nv.c | 6 ------ >>>>> drivers/gpu/drm/amd/amdgpu/soc15.c | 6 ------ >>>>> drivers/gpu/drm/amd/amdgpu/soc21.c | 6 ------ >>>>> 4 files changed, 24 insertions(+), 18 deletions(-) >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> index ce7d117efdb5..e420118769a5 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> @@ -2334,6 +2334,26 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) >>>>> return 0; >>>>> } >>>>> >>>>> +/** >>>>> + * amdgpu_device_prepare_ip - prepare IPs for hardware initialization >>>>> + * >>>>> + * @adev: amdgpu_device pointer >>>>> + * >>>>> + * Any common hardware initialization sequence that needs to be done before >>>>> + * hw init of individual IPs is performed here. This is different from the >>>>> + * 'common block' which initializes a set of IPs. >>>>> + */ >>>>> +static void amdgpu_device_prepare_ip(struct amdgpu_device *adev) >>>>> +{ >>>>> + /* Remap HDP registers to a hole in mmio space, for the purpose >>>>> + * of exposing those registers to process space. This needs to be >>>>> + * done before hw init of ip blocks to take care of HDP flush >>>>> + * operations through registers during hw_init. >>>>> + */ >>>>> + if (adev->nbio.funcs && adev->nbio.funcs->remap_hdp_registers && >>>>> + !amdgpu_sriov_vf(adev)) >>>>> + adev->nbio.funcs->remap_hdp_registers(adev); >>>>> +} >>>>> >>>>> /** >>>>> * amdgpu_device_ip_init - run init for hardware IPs >>>>> @@ -2376,6 +2396,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) >>>>> DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); >>>>> goto init_failed; >>>>> } >>>>> + >>>>> + amdgpu_device_prepare_ip(adev); >>>>> r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); >>>>> if (r) { >>>>> DRM_ERROR("hw_init %d failed %d\n", i, r); >>>>> @@ -3058,6 +3080,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) >>>>> AMD_IP_BLOCK_TYPE_IH, >>>>> }; >>>>> >>>>> + amdgpu_device_prepare_ip(adev); >>>>> for (i = 0; i < adev->num_ip_blocks; i++) { >>>>> int j; >>>>> struct amdgpu_ip_block *block; >>>>> @@ -3139,6 +3162,7 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) >>>>> { >>>>> int i, r; >>>>> >>>>> + amdgpu_device_prepare_ip(adev); >>>>> for (i = 0; i < adev->num_ip_blocks; i++) { >>>>> if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) >>>>> continue; >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c >>>>> index b3fba8dea63c..3ac7fef74277 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c >>>>> @@ -1032,12 +1032,6 @@ static int nv_common_hw_init(void *handle) >>>>> nv_program_aspm(adev); >>>>> /* setup nbio registers */ >>>>> adev->nbio.funcs->init_registers(adev); >>>>> - /* remap HDP registers to a hole in mmio space, >>>>> - * for the purpose of expose those registers >>>>> - * to process space >>>>> - */ >>>>> - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) >>>>> - adev->nbio.funcs->remap_hdp_registers(adev); >>>>> /* enable the doorbell aperture */ >>>>> nv_enable_doorbell_aperture(adev, true); >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c >>>>> index fde6154f2009..a0481e37d7cf 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c >>>>> @@ -1240,12 +1240,6 @@ static int soc15_common_hw_init(void *handle) >>>>> soc15_program_aspm(adev); >>>>> /* setup nbio registers */ >>>>> adev->nbio.funcs->init_registers(adev); >>>>> - /* remap HDP registers to a hole in mmio space, >>>>> - * for the purpose of expose those registers >>>>> - * to process space >>>>> - */ >>>>> - if (adev->nbio.funcs->remap_hdp_registers && !amdgpu_sriov_vf(adev)) >>>>> - adev->nbio.funcs->remap_hdp_registers(adev); >>>>> >>>>> /* enable the doorbell aperture */ >>>>> soc15_enable_doorbell_aperture(adev, true); >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c >>>>> index 55284b24f113..16b447055102 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/soc21.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c >>>>> @@ -660,12 +660,6 @@ static int soc21_common_hw_init(void *handle) >>>>> soc21_program_aspm(adev); >>>>> /* setup nbio registers */ >>>>> adev->nbio.funcs->init_registers(adev); >>>>> - /* remap HDP registers to a hole in mmio space, >>>>> - * for the purpose of expose those registers >>>>> - * to process space >>>>> - */ >>>>> - if (adev->nbio.funcs->remap_hdp_registers) >>>>> - adev->nbio.funcs->remap_hdp_registers(adev); >>>>> /* enable the doorbell aperture */ >>>>> soc21_enable_doorbell_aperture(adev, true); >>>>> >>>>> -- >>>>> 2.25.1 >>>>>
linux-stable-mirror@lists.linaro.org