On QUERY2 IOCTL, don't query the counts of correctable and uncorrectable errors: when RAS is enabled and supported on Vega20 server boards, this takes an inordinately long time, O(n^3), which slows the system down to the point of being unusable when a GUI is running.
Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2") Cc: Alexander Deucher Alexander.Deucher@amd.com Cc: stable@vger.kernel.org Signed-off-by: Luben Tuikov luben.tuikov@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 01fe60fedcbe..d481a33f4eaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -363,19 +363,19 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
/*query ue count*/ - ras_counter = amdgpu_ras_query_error_count(adev, false); - /*ras counter is monotonic increasing*/ - if (ras_counter != ctx->ras_counter_ue) { - out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; - ctx->ras_counter_ue = ras_counter; - } - - /*query ce count*/ - ras_counter = amdgpu_ras_query_error_count(adev, true); - if (ras_counter != ctx->ras_counter_ce) { - out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; - ctx->ras_counter_ce = ras_counter; - } + /* ras_counter = amdgpu_ras_query_error_count(adev, false); */ + /* /*ras counter is monotonic increasing*/ */ + /* if (ras_counter != ctx->ras_counter_ue) { */ + /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */ + /* ctx->ras_counter_ue = ras_counter; */ + /* } */ + + /* /*query ce count*/ */ + /* ras_counter = amdgpu_ras_query_error_count(adev, true); */ + /* if (ras_counter != ctx->ras_counter_ce) { */ + /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */ + /* ctx->ras_counter_ce = ras_counter; */ + /* } */
mutex_unlock(&mgr->lock); return 0;
[AMD Public Use]
-----Original Message----- From: Tuikov, Luben Luben.Tuikov@amd.com Sent: Wednesday, May 12, 2021 1:03 PM To: amd-gfx@lists.freedesktop.org Cc: Tuikov, Luben Luben.Tuikov@amd.com; Deucher, Alexander Alexander.Deucher@amd.com; stable@vger.kernel.org Subject: [PATCH 1/2] drm/amdgpu: Don't query CE and UE errors
On QUERY2 IOCTL, don't query the counts of correctable and uncorrectable errors: when RAS is enabled and supported on Vega20 server boards, this takes an inordinately long time, O(n^3), which slows the system down to the point of being unusable when a GUI is running.
Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2") Cc: Alexander Deucher Alexander.Deucher@amd.com Cc: stable@vger.kernel.org Signed-off-by: Luben Tuikov luben.tuikov@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 26 ++++++++++++-----------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 01fe60fedcbe..d481a33f4eaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -363,19 +363,19 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
/*query ue count*/
- ras_counter = amdgpu_ras_query_error_count(adev, false);
- /*ras counter is monotonic increasing*/
- if (ras_counter != ctx->ras_counter_ue) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
- ctx->ras_counter_ue = ras_counter;
- }
- /*query ce count*/
- ras_counter = amdgpu_ras_query_error_count(adev, true);
- if (ras_counter != ctx->ras_counter_ce) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
- ctx->ras_counter_ce = ras_counter;
- }
+ /* ras_counter = amdgpu_ras_query_error_count(adev, false); */
+ /* /*ras counter is monotonic increasing*/ */
+ /* if (ras_counter != ctx->ras_counter_ue) { */
+ /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
+ /* ctx->ras_counter_ue = ras_counter; */
+ /* } */
+ /* /*query ce count*/ */
+ /* ras_counter = amdgpu_ras_query_error_count(adev, true); */
+ /* if (ras_counter != ctx->ras_counter_ce) { */
+ /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
+ /* ctx->ras_counter_ce = ras_counter; */
+ /* } */
Rather than commenting this out, just drop it in patch 1, and then re-add this in patch 2.
Alex
mutex_unlock(&mgr->lock); return 0; -- 2.31.1.527.g2d677e5b15
On QUERY2 IOCTL, don't query the counts of correctable and uncorrectable errors: when RAS is enabled and supported on Vega20 server boards, this takes an inordinately long time, O(n^3), which slows the system down to the point of being unusable when a GUI is running.
Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2") Cc: Alexander Deucher Alexander.Deucher@amd.com Cc: stable@vger.kernel.org Signed-off-by: Luben Tuikov luben.tuikov@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 16 ---------------- 1 file changed, 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 01fe60fedcbe..e1557020c49d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -337,7 +337,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, { struct amdgpu_ctx *ctx; struct amdgpu_ctx_mgr *mgr; - unsigned long ras_counter;
if (!fpriv) return -EINVAL; @@ -362,21 +361,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (atomic_read(&ctx->guilty)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
- /*query ue count*/ - ras_counter = amdgpu_ras_query_error_count(adev, false); - /*ras counter is monotonic increasing*/ - if (ras_counter != ctx->ras_counter_ue) { - out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; - ctx->ras_counter_ue = ras_counter; - } - - /*query ce count*/ - ras_counter = amdgpu_ras_query_error_count(adev, true); - if (ras_counter != ctx->ras_counter_ce) { - out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; - ctx->ras_counter_ce = ras_counter; - } - mutex_unlock(&mgr->lock); return 0; }
On Thu, May 13, 2021 at 1:32 AM Luben Tuikov luben.tuikov@amd.com wrote:
On QUERY2 IOCTL, don't query the counts of correctable and uncorrectable errors: when RAS is enabled and supported on Vega20 server boards, this takes an inordinately long time, O(n^3), which slows the system down to the point of being unusable when a GUI is running.
Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2") Cc: Alexander Deucher Alexander.Deucher@amd.com Cc: stable@vger.kernel.org Signed-off-by: Luben Tuikov luben.tuikov@amd.com
Reviewed-by: Alex Deucher alexander.deucher@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 16 ---------------- 1 file changed, 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 01fe60fedcbe..e1557020c49d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -337,7 +337,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, { struct amdgpu_ctx *ctx; struct amdgpu_ctx_mgr *mgr;
- unsigned long ras_counter;
if (!fpriv) return -EINVAL;
@@ -362,21 +361,6 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (atomic_read(&ctx->guilty)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
- /*query ue count*/
- ras_counter = amdgpu_ras_query_error_count(adev, false);
- /*ras counter is monotonic increasing*/
- if (ras_counter != ctx->ras_counter_ue) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
- ctx->ras_counter_ue = ras_counter;
- }
- /*query ce count*/
- ras_counter = amdgpu_ras_query_error_count(adev, true);
- if (ras_counter != ctx->ras_counter_ce) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
- ctx->ras_counter_ce = ras_counter;
- }
mutex_unlock(&mgr->lock); return 0;
}
2.31.1.527.g2d677e5b15
amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Am 12.05.21 um 19:03 schrieb Luben Tuikov:
On QUERY2 IOCTL, don't query the counts of correctable and uncorrectable errors: when RAS is enabled and supported on Vega20 server boards, this takes an inordinately long time, O(n^3), which slows the system down to the point of being unusable when a GUI is running.
Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2") Cc: Alexander Deucher Alexander.Deucher@amd.com Cc: stable@vger.kernel.org Signed-off-by: Luben Tuikov luben.tuikov@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 01fe60fedcbe..d481a33f4eaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -363,19 +363,19 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; /*query ue count*/
- ras_counter = amdgpu_ras_query_error_count(adev, false);
- /*ras counter is monotonic increasing*/
- if (ras_counter != ctx->ras_counter_ue) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
- ctx->ras_counter_ue = ras_counter;
- }
- /*query ce count*/
- ras_counter = amdgpu_ras_query_error_count(adev, true);
- if (ras_counter != ctx->ras_counter_ce) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
- ctx->ras_counter_ce = ras_counter;
- }
+ /* ras_counter = amdgpu_ras_query_error_count(adev, false); */
+ /* /*ras counter is monotonic increasing*/ */
+ /* if (ras_counter != ctx->ras_counter_ue) { */
+ /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
+ /* ctx->ras_counter_ue = ras_counter; */
+ /* } */
+ /* /*query ce count*/ */
+ /* ras_counter = amdgpu_ras_query_error_count(adev, true); */
+ /* if (ras_counter != ctx->ras_counter_ce) { */
+ /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
+ /* ctx->ras_counter_ce = ras_counter; */
+ /* } */
Please completely drop the code. We usually don't keep commented out code in the driver.
With that done the patch is Reviewed-by: Christian König christian.koenig@amd.com
Christian.
mutex_unlock(&mgr->lock); return 0;
On 2021-05-13 3:56 a.m., Christian König wrote:
Am 12.05.21 um 19:03 schrieb Luben Tuikov:
On QUERY2 IOCTL, don't query the counts of correctable and uncorrectable errors: when RAS is enabled and supported on Vega20 server boards, this takes an inordinately long time, O(n^3), which slows the system down to the point of being unusable when a GUI is running.
Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2") Cc: Alexander Deucher Alexander.Deucher@amd.com Cc: stable@vger.kernel.org Signed-off-by: Luben Tuikov luben.tuikov@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 01fe60fedcbe..d481a33f4eaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -363,19 +363,19 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; /*query ue count*/
- ras_counter = amdgpu_ras_query_error_count(adev, false);
- /*ras counter is monotonic increasing*/
- if (ras_counter != ctx->ras_counter_ue) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
- ctx->ras_counter_ue = ras_counter;
- }
- /*query ce count*/
- ras_counter = amdgpu_ras_query_error_count(adev, true);
- if (ras_counter != ctx->ras_counter_ce) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
- ctx->ras_counter_ce = ras_counter;
- }
+ /* ras_counter = amdgpu_ras_query_error_count(adev, false); */
+ /* /*ras counter is monotonic increasing*/ */
+ /* if (ras_counter != ctx->ras_counter_ue) { */
+ /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
+ /* ctx->ras_counter_ue = ras_counter; */
+ /* } */
+ /* /*query ce count*/ */
+ /* ras_counter = amdgpu_ras_query_error_count(adev, true); */
+ /* if (ras_counter != ctx->ras_counter_ce) { */
+ /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
+ /* ctx->ras_counter_ce = ras_counter; */
+ /* } */
Please completely drop the code. We usually don't keep commented out code in the driver.
1. Alex suggested this when we chatted--this is why it is commented. 2. He suggested the same thing last night and 2.5 hours before your email, I posted a patch in which the code is commented out--did you not see it? It's threaded, it appears above, 2.5 hours before your email.
Regards, Luben
With that done the patch is Reviewed-by: Christian König christian.koenig@amd.com
Christian.
mutex_unlock(&mgr->lock); return 0;
Am 13.05.21 um 21:37 schrieb Luben Tuikov:
On 2021-05-13 3:56 a.m., Christian König wrote:
Am 12.05.21 um 19:03 schrieb Luben Tuikov:
On QUERY2 IOCTL, don't query the counts of correctable and uncorrectable errors: when RAS is enabled and supported on Vega20 server boards, this takes an inordinately long time, O(n^3), which slows the system down to the point of being unusable when a GUI is running.
Fixes: ae363a212b14 ("drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2") Cc: Alexander Deucher Alexander.Deucher@amd.com Cc: stable@vger.kernel.org Signed-off-by: Luben Tuikov luben.tuikov@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 01fe60fedcbe..d481a33f4eaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -363,19 +363,19 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; /*query ue count*/
- ras_counter = amdgpu_ras_query_error_count(adev, false);
- /*ras counter is monotonic increasing*/
- if (ras_counter != ctx->ras_counter_ue) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
- ctx->ras_counter_ue = ras_counter;
- }
- /*query ce count*/
- ras_counter = amdgpu_ras_query_error_count(adev, true);
- if (ras_counter != ctx->ras_counter_ce) {
- out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
- ctx->ras_counter_ce = ras_counter;
- }
+ /* ras_counter = amdgpu_ras_query_error_count(adev, false); */
+ /* /*ras counter is monotonic increasing*/ */
+ /* if (ras_counter != ctx->ras_counter_ue) { */
+ /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
+ /* ctx->ras_counter_ue = ras_counter; */
+ /* } */
+ /* /*query ce count*/ */
+ /* ras_counter = amdgpu_ras_query_error_count(adev, true); */
+ /* if (ras_counter != ctx->ras_counter_ce) { */
+ /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
+ /* ctx->ras_counter_ce = ras_counter; */
+ /* } */
Please completely drop the code. We usually don't keep commented out code in the driver.
- Alex suggested this when we chatted--this is why it is commented.
Sounds like a misunderstanding to me; usually it is Alex who insists on dropping the code.
- He suggested the same thing last night and 2.5 hours before your email,
I posted a patch in which the code is commented out--did you not see it? It's threaded, it appears above, 2.5 hours before your email.
Sorry for the redundancy; I hadn't seen that in my inbox yet when I wrote the reply.
Regards, Christian.
Regards, Luben
With that done the patch is Reviewed-by: Christian König christian.koenig@amd.com
Christian.
mutex_unlock(&mgr->lock); return 0;
linux-stable-mirror@lists.linaro.org