diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index a611e78dd4bac536d4bfae5ca6f9feb88c490baa..258498cbf1ebaaff027b8e49ce74e255d09edfc0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -217,6 +217,7 @@ struct amdgpu_gfx_funcs { int (*query_ras_error_count) (struct amdgpu_device *adev, void *ras_error_status); void (*reset_ras_error_count) (struct amdgpu_device *adev); void (*init_spm_golden)(struct amdgpu_device *adev); + void (*query_ras_error_status) (struct amdgpu_device *adev); }; struct sq_work { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h index 0c43d7fe893cce32896204fa701272b073e4b08a..1ae9bdae7311412a920cc01c033c61ff9b52c1ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -40,6 +40,7 @@ struct amdgpu_mmhub_funcs { uint64_t page_table_base); void (*update_power_gating)(struct amdgpu_device *adev, bool enable); + void (*query_ras_error_status)(struct amdgpu_device *adev); }; struct amdgpu_mmhub { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9e5ab7a8240fd7e88cff00ab06bf1125ea145a9b..49d10330bf645ced43a32d8e268c21cc0ac4f610 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1473,6 +1473,45 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) } } +/* Parse RdRspStatus and WrRspStatus */ +void amdgpu_ras_error_status_query(struct amdgpu_device *adev, + struct ras_query_if *info) +{ + /* + * Only two block need to query read/write + * RspStatus at current state + */ + switch (info->head.block) { + case AMDGPU_RAS_BLOCK__GFX: + if (adev->gfx.funcs->query_ras_error_status) + adev->gfx.funcs->query_ras_error_status(adev); + break; + case AMDGPU_RAS_BLOCK__MMHUB: + if (adev->mmhub.funcs->query_ras_error_status) + adev->mmhub.funcs->query_ras_error_status(adev); + break; + default: + break; + } +} + +static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj; + + if (!con) + return; + + list_for_each_entry(obj, &con->head, node) { + struct ras_query_if info = { + .head = obj->head, + }; + + amdgpu_ras_error_status_query(adev, &info); + } +} + /* recovery begin */ /* return 0 on success. @@ -1543,8 +1582,10 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) } list_for_each_entry(remote_adev, - device_list_handle, gmc.xgmi.head) + device_list_handle, gmc.xgmi.head) { + amdgpu_ras_query_err_status(remote_adev); amdgpu_ras_log_on_err_counter(remote_adev); + } amdgpu_put_xgmi_hive(hive); } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 8c6f1f5cde655df13b64e0cb1b53707c2b34140b..f2d3ec78d50c123a4350858f47f981863af3ecc9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2075,6 +2075,7 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_gfx_funcs = { .ras_error_inject = &gfx_v9_4_ras_error_inject, .query_ras_error_count = &gfx_v9_4_query_ras_error_count, .reset_ras_error_count = &gfx_v9_4_reset_ras_error_count, + .query_ras_error_status = &gfx_v9_4_query_ras_error_status, }; static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c index bd85aed3523a2526c496e450fedf2bf7c8d90c2c..bc699d680ce8bde40d892d3a4d0263100834ec29 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c @@ -992,3 +992,32 @@ int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, void *inject_if) return ret; } + +static const struct soc15_reg_entry gfx_v9_4_rdrsp_status_regs = + { SOC15_REG_ENTRY(GC, 0, mmGCEA_ERR_STATUS), 0, 1, 32 }; + +void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev) +{ + uint32_t i, j; + uint32_t reg_value; + + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) + return; + + mutex_lock(&adev->grbm_idx_mutex); + + for (i = 0; i < gfx_v9_4_rdrsp_status_regs.se_num; i++) { + for (j = 0; j < gfx_v9_4_rdrsp_status_regs.instance; + j++) { + gfx_v9_4_select_se_sh(adev, i, 0, j); + reg_value = RREG32(SOC15_REG_ENTRY_OFFSET( + gfx_v9_4_rdrsp_status_regs)); + if (reg_value) + dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n", + j, reg_value); + } + } + + gfx_v9_4_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); + mutex_unlock(&adev->grbm_idx_mutex); +} diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h index 1ffecc5c0f0a9c7818a8f78f65851f5417dfb834..875f18473a9816dc6062916e5fd0272e9beb624b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h @@ -34,4 +34,6 @@ int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev, void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev); +void gfx_v9_4_query_ras_error_status(struct amdgpu_device *adev); + #endif /* __GFX_V9_4_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c index 6c6ad529c65c65fd89811e19fd054305969ae749..c2ef8142136e960adfa6d119094604eaa690c076 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c @@ -1624,6 +1624,34 @@ static void mmhub_v9_4_reset_ras_error_count(struct amdgpu_device *adev) } } +static const struct soc15_reg_entry mmhub_v9_4_err_status_regs[] = { + { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA0_ERR_STATUS), 0, 0, 0 }, + { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA1_ERR_STATUS), 0, 0, 0 }, + { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA2_ERR_STATUS), 0, 0, 0 }, + { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA3_ERR_STATUS), 0, 0, 0 }, + { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA4_ERR_STATUS), 0, 0, 0 }, + { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA5_ERR_STATUS), 0, 0, 0 }, + { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA6_ERR_STATUS), 0, 0, 0 }, + { SOC15_REG_ENTRY(MMHUB, 0, mmMMEA7_ERR_STATUS), 0, 0, 0 }, +}; + +static void mmhub_v9_4_query_ras_error_status(struct amdgpu_device *adev) +{ + int i; + uint32_t reg_value; + + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) + return; + + for (i = 0; i < ARRAY_SIZE(mmhub_v9_4_err_status_regs); i++) { + reg_value = + RREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v9_4_err_status_regs[i])); + if (reg_value) + dev_warn(adev->dev, "MMHUB EA err detected at instance: %d, status: 0x%x!\n", + i, reg_value); + } +} + const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = { .ras_late_init = amdgpu_mmhub_ras_late_init, .query_ras_error_count = mmhub_v9_4_query_ras_error_count, @@ -1636,4 +1664,5 @@ const struct amdgpu_mmhub_funcs mmhub_v9_4_funcs = { .set_clockgating = mmhub_v9_4_set_clockgating, .get_clockgating = mmhub_v9_4_get_clockgating, .setup_vm_pt_regs = mmhub_v9_4_setup_vm_pt_regs, + .query_ras_error_status = mmhub_v9_4_query_ras_error_status, }; diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h index f41556abfbbc8c43a76fc0246248c913f0446a23..629a8a3b55e92d231dc83ecc2c72447426f0b9d1 100644 --- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h +++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_9_4_1_offset.h @@ -205,6 +205,8 @@ #define mmGCEA_EDC_CNT2_BASE_IDX 0 #define mmGCEA_EDC_CNT3 0x071b #define mmGCEA_EDC_CNT3_BASE_IDX 0 +#define mmGCEA_ERR_STATUS 0x0712 +#define mmGCEA_ERR_STATUS_BASE_IDX 0 // addressBlock: gc_gfxudec // base address: 0x30000 @@ -261,4 +263,4 @@ #define mmRLC_EDC_CNT2 0x4d41 #define mmRLC_EDC_CNT2_BASE_IDX 1 -#endif \ No newline at end of file +#endif