From 338f7412c7ea2ce007e83c5ad7c5e01d8cfce1e1 Mon Sep 17 00:00:00 2001 From: Xiang Liu <xiang.liu@amd.com> Date: Wed, 19 Mar 2025 17:02:49 +0800 Subject: [PATCH] drm/amdgpu: Decode deferred error type in gfx aca bank parser When injecting an uncorrected error with a background workload, the deferred errors among the uncorrected errors need to be identified by checking the deferred and poison bits of the status register. v2: refine checking for deferred error v2: log possible DEs among CEs v2: generate CPER records for DEs among UEs Signed-off-by: Xiang Liu <xiang.liu@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 25 +++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 16 +++++++++++----- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 5 ++--- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index ffd4c64e123c7..dc47f5fd4ea15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -391,6 +391,7 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev, { struct aca_bank_node *node; struct aca_bank *bank; + int r; if (!adev->cper.enabled) return; @@ -402,11 +403,27 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev, /* UEs must be encoded into separate CPER entries */ if (type == ACA_SMU_TYPE_UE) { + struct aca_banks de_banks; + + aca_banks_init(&de_banks); list_for_each_entry(node, &banks->list, node) { bank = &node->bank; - if (amdgpu_cper_generate_ue_record(adev, bank)) - dev_warn(adev->dev, "fail to generate ue cper records\n"); + if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) { + r = aca_banks_add_bank(&de_banks, bank); + if (r) + dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r); + } else { + if (amdgpu_cper_generate_ue_record(adev, bank)) + dev_warn(adev->dev, "fail 
to generate ue cper records\n"); + } + } + + if (!list_empty(&de_banks.list)) { + if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks)) + dev_warn(adev->dev, "fail to generate de cper records\n"); } + + aca_banks_release(&de_banks); } else { /* * SMU_TYPE_CE banks are combined into 1 CPER entries, @@ -541,6 +558,10 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h if (ret) return ret; + /* DEs may contain in CEs or UEs */ + if (type != ACA_ERROR_TYPE_DEFERRED) + aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data); + return aca_log_aca_error(handle, type, err_data); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 6f62e5d80ed6b..6b180f1b33fda 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -76,11 +76,17 @@ struct ras_query_context; #define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */ #define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */ -#define ACA_BANK_ERR_CE_DE_DECODE(bank) \ - ((ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \ - ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) ? \ - ACA_ERROR_TYPE_DEFERRED : \ - ACA_ERROR_TYPE_CE) +#define ACA_BANK_ERR_IS_DEFFERED(bank) \ + (ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \ + ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) + +#define ACA_BANK_ERR_CE_DE_DECODE(bank) \ + (ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \ + ACA_ERROR_TYPE_CE) + +#define ACA_BANK_ERR_UE_DE_DECODE(bank) \ + (ACA_BANK_ERR_IS_DEFFERED(bank) ? 
ACA_ERROR_TYPE_DEFERRED : \ + ACA_ERROR_TYPE_UE) enum aca_reg_idx { ACA_REG_IDX_CTL = 0, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index efe45e4edfd70..736398b0d16d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -867,9 +867,8 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle, switch (type) { case ACA_SMU_TYPE_UE: - bank->aca_err_type = ACA_ERROR_TYPE_UE; - ret = aca_error_cache_log_bank_error(handle, &info, - ACA_ERROR_TYPE_UE, 1ULL); + bank->aca_err_type = ACA_BANK_ERR_UE_DE_DECODE(bank); + ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, 1ULL); break; case ACA_SMU_TYPE_CE: bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank); -- GitLab