Allows us to detect subsequent IH ring buffer overflows as well.
Cc: Joshua Ashton joshua@froggi.es Cc: Alex Deucher alexander.deucher@amd.com Cc: stable@vger.kernel.org
Signed-off-by: Friedrich Vock friedrich.vock@gmx.de --- drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 2 ++ drivers/gpu/drm/amd/amdgpu/cik_ih.c | 13 +++++++++++++ drivers/gpu/drm/amd/amdgpu/cz_ih.c | 14 +++++++++++++- drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 14 +++++++++++++- drivers/gpu/drm/amd/amdgpu/ih_v6_0.c | 13 +++++++++++++ drivers/gpu/drm/amd/amdgpu/ih_v6_1.c | 13 +++++++++++++ drivers/gpu/drm/amd/amdgpu/navi10_ih.c | 12 ++++++++++++ drivers/gpu/drm/amd/amdgpu/si_ih.c | 12 ++++++++++++ drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 13 +++++++++++++ drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 12 ++++++++++++ drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 12 ++++++++++++ 11 files changed, 128 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h index 508f02eb0cf8..6041ec727f06 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h @@ -69,6 +69,8 @@ struct amdgpu_ih_ring { unsigned rptr; struct amdgpu_ih_regs ih_regs;
+ bool overflow; + /* For waiting on IH processing at checkpoint. */ wait_queue_head_t wait_process; uint64_t processed_timestamp; diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c index 6f7c031dd197..807cc30c9e33 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c @@ -204,6 +204,7 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32(mmIH_RB_CNTL); tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = true; } return (wptr & ih->ptr_mask); } @@ -274,7 +275,19 @@ static void cik_ih_decode_iv(struct amdgpu_device *adev, static void cik_ih_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; + WREG32(mmIH_RB_RPTR, ih->rptr); + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32(mmIH_RB_CNTL); + tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; + WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = false; + } }
static int cik_ih_early_init(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c index b8c47e0cf37a..076559668573 100644 --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c @@ -215,7 +215,7 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32(mmIH_RB_CNTL); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32(mmIH_RB_CNTL, tmp); - + ih->overflow = true;
out: return (wptr & ih->ptr_mask); @@ -266,7 +266,19 @@ static void cz_ih_decode_iv(struct amdgpu_device *adev, static void cz_ih_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; + WREG32(mmIH_RB_RPTR, ih->rptr); + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32(mmIH_RB_CNTL); + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = false; + } }
static int cz_ih_early_init(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c index aecad530b10a..1a5e668643d1 100644 --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c @@ -214,7 +214,7 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32(mmIH_RB_CNTL); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32(mmIH_RB_CNTL, tmp); - + ih->overflow = true;
out: return (wptr & ih->ptr_mask); @@ -265,7 +265,19 @@ static void iceland_ih_decode_iv(struct amdgpu_device *adev, static void iceland_ih_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; + WREG32(mmIH_RB_RPTR, ih->rptr); + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32(mmIH_RB_CNTL); + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = false; + } }
static int iceland_ih_early_init(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c index d9ed7332d805..ce8f7feec713 100644 --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c @@ -418,6 +418,8 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device *adev, tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true; + out: return (wptr & ih->ptr_mask); } @@ -459,6 +461,7 @@ static void ih_v6_0_irq_rearm(struct amdgpu_device *adev, static void ih_v6_0_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; struct amdgpu_ih_regs *ih_regs;
if (ih->use_doorbell) { @@ -472,6 +475,16 @@ static void ih_v6_0_set_rptr(struct amdgpu_device *adev, ih_regs = &ih->ih_regs; WREG32(ih_regs->ih_rb_rptr, ih->rptr); } + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl); + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp); + ih->overflow = false; + } }
/** diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c index 8fb05eae340a..668788ad34d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c @@ -418,6 +418,8 @@ static u32 ih_v6_1_get_wptr(struct amdgpu_device *adev, tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true; + out: return (wptr & ih->ptr_mask); } @@ -459,6 +461,7 @@ static void ih_v6_1_irq_rearm(struct amdgpu_device *adev, static void ih_v6_1_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; struct amdgpu_ih_regs *ih_regs;
if (ih->use_doorbell) { @@ -472,6 +475,16 @@ static void ih_v6_1_set_rptr(struct amdgpu_device *adev, ih_regs = &ih->ih_regs; WREG32(ih_regs->ih_rb_rptr, ih->rptr); } + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl); + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp); + ih->overflow = false; + } }
/** diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c index e64b33115848..0bdac923cb4d 100644 --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c @@ -442,6 +442,7 @@ static u32 navi10_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true; out: return (wptr & ih->ptr_mask); } @@ -483,6 +484,7 @@ static void navi10_ih_irq_rearm(struct amdgpu_device *adev, static void navi10_ih_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; struct amdgpu_ih_regs *ih_regs;
if (ih == &adev->irq.ih_soft) @@ -499,6 +501,16 @@ static void navi10_ih_set_rptr(struct amdgpu_device *adev, ih_regs = &ih->ih_regs; WREG32(ih_regs->ih_rb_rptr, ih->rptr); } + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl); + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp); + ih->overflow = false; + } }
/** diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c index 9a24f17a5750..ff35056d2b54 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c @@ -119,6 +119,7 @@ static u32 si_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32(IH_RB_CNTL); tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; WREG32(IH_RB_CNTL, tmp); + ih->overflow = true; } return (wptr & ih->ptr_mask); } @@ -147,7 +148,18 @@ static void si_ih_decode_iv(struct amdgpu_device *adev, static void si_ih_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; + WREG32(IH_RB_RPTR, ih->rptr); + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32(IH_RB_CNTL); + tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; + WREG32(IH_RB_CNTL, tmp); + } }
static int si_ih_early_init(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c index 917707bba7f3..6f5090d3db48 100644 --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c @@ -218,6 +218,7 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32(mmIH_RB_CNTL); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = true;
out: return (wptr & ih->ptr_mask); @@ -268,6 +269,8 @@ static void tonga_ih_decode_iv(struct amdgpu_device *adev, static void tonga_ih_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; + if (ih->use_doorbell) { /* XXX check if swapping is necessary on BE */ *ih->rptr_cpu = ih->rptr; @@ -275,6 +278,16 @@ static void tonga_ih_set_rptr(struct amdgpu_device *adev, } else { WREG32(mmIH_RB_RPTR, ih->rptr); } + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32(mmIH_RB_CNTL); + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = false; + } }
static int tonga_ih_early_init(void *handle) diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c index d364c6dd152c..bb005924f194 100644 --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c @@ -372,6 +372,7 @@ static u32 vega10_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true;
out: return (wptr & ih->ptr_mask); @@ -413,6 +414,7 @@ static void vega10_ih_irq_rearm(struct amdgpu_device *adev, static void vega10_ih_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; struct amdgpu_ih_regs *ih_regs;
if (ih == &adev->irq.ih_soft) @@ -429,6 +431,16 @@ static void vega10_ih_set_rptr(struct amdgpu_device *adev, ih_regs = &ih->ih_regs; WREG32(ih_regs->ih_rb_rptr, ih->rptr); } + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl); + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp); + ih->overflow = false; + } }
/** diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c index ddfc6941f9d5..bb725a970697 100644 --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c @@ -420,6 +420,7 @@ static u32 vega20_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true;
out: return (wptr & ih->ptr_mask); @@ -462,6 +463,7 @@ static void vega20_ih_irq_rearm(struct amdgpu_device *adev, static void vega20_ih_set_rptr(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + u32 tmp; struct amdgpu_ih_regs *ih_regs;
if (ih == &adev->irq.ih_soft) @@ -478,6 +480,16 @@ static void vega20_ih_set_rptr(struct amdgpu_device *adev, ih_regs = &ih->ih_regs; WREG32(ih_regs->ih_rb_rptr, ih->rptr); } + + /* If we overflowed previously (and thus set the OVERFLOW_CLEAR bit), + * reset it here to detect more overflows if they occur. + */ + if (ih->overflow) { + tmp = RREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl); + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih->ih_regs.ih_rb_cntl, tmp); + ih->overflow = false; + } }
/** -- 2.43.0
If the IH ring buffer overflows, it's possible that fence signal events were lost. Check each ring for progress to prevent job timeouts/GPU hangs due to the fences staying unsignaled despite the work being done.
Cc: Joshua Ashton joshua@froggi.es Cc: Alex Deucher alexander.deucher@amd.com Cc: stable@vger.kernel.org
Signed-off-by: Friedrich Vock friedrich.vock@gmx.de --- drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c index f3b0aaf3ebc6..2a246db1d3a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { unsigned int count; u32 wptr; + int i;
if (!ih->enabled || adev->shutdown) return IRQ_NONE; @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) ih->rptr &= ih->ptr_mask; }
+ /* If the ring buffer overflowed, we might have lost some fence + * signal interrupts. Check if there was any activity so the signal + * doesn't get lost. + */ + if (ih->overflow) { + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->fence_drv.initialized) + continue; + amdgpu_fence_process(ring); + } + } + amdgpu_ih_set_rptr(adev, ih); wake_up_all(&ih->wait_process);
-- 2.43.0
Am 14.01.24 um 14:00 schrieb Friedrich Vock:
If the IH ring buffer overflows, it's possible that fence signal events were lost. Check each ring for progress to prevent job timeouts/GPU hangs due to the fences staying unsignaled despite the work being done.
That's completely unnecessary and in some cases even harmful.
We already have a timeout handler for that and overflows point to severe system problem so they should never occur in a production system.
Regards, Christian.
Cc: Joshua Ashton joshua@froggi.es Cc: Alex Deucher alexander.deucher@amd.com Cc: stable@vger.kernel.org
Signed-off-by: Friedrich Vock friedrich.vock@gmx.de
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c index f3b0aaf3ebc6..2a246db1d3a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { unsigned int count; u32 wptr;
int i;
if (!ih->enabled || adev->shutdown) return IRQ_NONE;
@@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) ih->rptr &= ih->ptr_mask; }
- /* If the ring buffer overflowed, we might have lost some fence
* signal interrupts. Check if there was any activity so the signal
* doesn't get lost.
*/
- if (ih->overflow) {
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
if (!ring || !ring->fence_drv.initialized)
continue;
amdgpu_fence_process(ring);
}
- }
- amdgpu_ih_set_rptr(adev, ih); wake_up_all(&ih->wait_process);
-- 2.43.0
On 15.01.24 11:26, Christian König wrote:
Am 14.01.24 um 14:00 schrieb Friedrich Vock:
If the IH ring buffer overflows, it's possible that fence signal events were lost. Check each ring for progress to prevent job timeouts/GPU hangs due to the fences staying unsignaled despite the work being done.
That's completely unnecessary and in some cases even harmful.
How is it harmful? The only effect it can have is prevent unnecessary GPU hangs, no? It's not like it hides any legitimate errors that you'd otherwise see.
We already have a timeout handler for that and overflows point to severe system problem so they should never occur in a production system.
IH ring buffer overflows are pretty reliably reproducible if you trigger a lot of page faults, at least on Deck. Why shouldn't enough page faults in quick succession be able to overflow the IH ring buffer?
The fence fallback timer as it is now is useless for this because it only gets triggered once after 0.5s. I guess an alternative approach would be to make a timer trigger for each work item in flight every 0.5s, but why should that be better than just handling overflow errors as they occur?
Regards, Friedrich
Regards, Christian.
Cc: Joshua Ashton joshua@froggi.es Cc: Alex Deucher alexander.deucher@amd.com Cc: stable@vger.kernel.org
Signed-off-by: Friedrich Vock friedrich.vock@gmx.de
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c index f3b0aaf3ebc6..2a246db1d3a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { unsigned int count; u32 wptr; + int i;
if (!ih->enabled || adev->shutdown) return IRQ_NONE; @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) ih->rptr &= ih->ptr_mask; }
+ /* If the ring buffer overflowed, we might have lost some fence + * signal interrupts. Check if there was any activity so the signal + * doesn't get lost. + */ + if (ih->overflow) { + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i];
+ if (!ring || !ring->fence_drv.initialized) + continue; + amdgpu_fence_process(ring); + } + }
amdgpu_ih_set_rptr(adev, ih); wake_up_all(&ih->wait_process);
-- 2.43.0
Am 15.01.24 um 12:19 schrieb Friedrich Vock:
On 15.01.24 11:26, Christian König wrote:
Am 14.01.24 um 14:00 schrieb Friedrich Vock:
If the IH ring buffer overflows, it's possible that fence signal events were lost. Check each ring for progress to prevent job timeouts/GPU hangs due to the fences staying unsignaled despite the work being done.
That's completely unnecessary and in some cases even harmful.
How is it harmful? The only effect it can have is prevent unnecessary GPU hangs, no? It's not like it hides any legitimate errors that you'd otherwise see.
We have no guarantee that all ring buffers are actually fully initialized to allow fence processing.
Apart from that fence processing is the least of your problems when an IV overflow occurs. Other interrupt source which are not repeated are usually for more worse.
We already have a timeout handler for that and overflows point to severe system problem so they should never occur in a production system.
IH ring buffer overflows are pretty reliably reproducible if you trigger a lot of page faults, at least on Deck. Why shouldn't enough page faults in quick succession be able to overflow the IH ring buffer?
At least not on recent hw generations. Since gfx9 we have a rate limit on the number of page faults generated.
What could maybe do as well is to change the default of vm_fault_stop, but for your case that would be even worse in production.
The fence fallback timer as it is now is useless for this because it only gets triggered once after 0.5s. I guess an alternative approach would be to make a timer trigger for each work item in flight every 0.5s, but why should that be better than just handling overflow errors as they occur?
That is intentional. As I said an IH overflow just points out that there is something massively wrong in the HW programming.
After gfx9 the IH should never produce overflow any more, otherwise either the ratelimit doesn't work or isn't enabled for some reason or the IH ring buffer is just to small.
Regards, Christian.
Regards, Friedrich
Regards, Christian.
Cc: Joshua Ashton joshua@froggi.es Cc: Alex Deucher alexander.deucher@amd.com Cc: stable@vger.kernel.org
Signed-off-by: Friedrich Vock friedrich.vock@gmx.de
drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c index f3b0aaf3ebc6..2a246db1d3a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c @@ -209,6 +209,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { unsigned int count; u32 wptr; + int i;
if (!ih->enabled || adev->shutdown) return IRQ_NONE; @@ -227,6 +228,20 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) ih->rptr &= ih->ptr_mask; }
+ /* If the ring buffer overflowed, we might have lost some fence + * signal interrupts. Check if there was any activity so the signal + * doesn't get lost. + */ + if (ih->overflow) { + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i];
+ if (!ring || !ring->fence_drv.initialized) + continue; + amdgpu_fence_process(ring); + } + }
amdgpu_ih_set_rptr(adev, ih); wake_up_all(&ih->wait_process);
-- 2.43.0
linux-stable-mirror@lists.linaro.org