diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 8e0a6c62322ec9d62d569d717982580abd58901f..689addb1520d26bbac50f74ab560c848b9670174 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -601,6 +601,7 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev) struct ras_ih_if ih_info = { .cb = amdgpu_gfx_process_ras_data_cb, }; + struct ras_query_if info = { 0 }; if (!adev->gfx.ras_if) { adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); @@ -612,13 +613,19 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev) strcpy(adev->gfx.ras_if->name, "gfx"); } fs_info.head = ih_info.head = *adev->gfx.ras_if; - r = amdgpu_ras_late_init(adev, adev->gfx.ras_if, &fs_info, &ih_info); if (r) goto free; if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) { + if (adev->gmc.xgmi.connected_to_cpu) { + info.head = *adev->gfx.ras_if; + amdgpu_ras_query_error_status(adev, &info); + } else { + amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX); + } + r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0); if (r) goto late_fini; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index d92f0f14cbebc5bf488101b25169f72d1c9bc160..38af93f501e1e030c16defe2244847d265fb5680 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -225,9 +225,9 @@ struct amdgpu_gfx_funcs { void (*reset_ras_error_count) (struct amdgpu_device *adev); void (*init_spm_golden)(struct amdgpu_device *adev); void (*query_ras_error_status) (struct amdgpu_device *adev); + void (*reset_ras_error_status) (struct amdgpu_device *adev); void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable); void (*enable_watchdog_timer)(struct amdgpu_device *adev); - void (*query_sq_timeout_status)(struct amdgpu_device *adev); }; struct sq_work { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5805c78c356b05f1fb26ba2ee78f5fdb3a390dbe..517e19fae34f5e8e94daa3e581dc5353fb39c402 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -109,7 +109,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, ssize_t s; char val[128]; - if (amdgpu_ras_error_query(obj->adev, &info)) + if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", @@ -434,7 +434,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, return snprintf(buf, PAGE_SIZE, "Query currently inaccessible\n"); - if (amdgpu_ras_error_query(obj->adev, &info)) + if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", @@ -757,8 +757,8 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, /* feature ctl end */ /* query/inject/cure begin */ -int amdgpu_ras_error_query(struct amdgpu_device *adev, - struct ras_query_if *info) +int amdgpu_ras_query_error_status(struct amdgpu_device *adev, + struct ras_query_if *info) { struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data = {0, 0, 0, NULL}; @@ -787,10 +787,16 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_count) adev->gfx.funcs->query_ras_error_count(adev, &err_data); + + if (adev->gfx.funcs->query_ras_error_status) + adev->gfx.funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.funcs->query_ras_error_count) adev->mmhub.funcs->query_ras_error_count(adev, &err_data); + + if (adev->mmhub.funcs->query_ras_error_status) + adev->mmhub.funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__PCIE_BIF: if (adev->nbio.funcs->query_ras_error_count) @@ -826,6 +832,35 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, return 0; } +int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, + enum amdgpu_ras_block block) +{ + if (!amdgpu_ras_is_supported(adev, block)) + return -EINVAL; + + switch (block) { + case AMDGPU_RAS_BLOCK__GFX: + if (adev->gfx.funcs->reset_ras_error_count) + adev->gfx.funcs->reset_ras_error_count(adev); + + if (adev->gfx.funcs->reset_ras_error_status) + adev->gfx.funcs->reset_ras_error_status(adev); + break; + case AMDGPU_RAS_BLOCK__MMHUB: + if (adev->mmhub.funcs->reset_ras_error_count) + adev->mmhub.funcs->reset_ras_error_count(adev); + break; + case AMDGPU_RAS_BLOCK__SDMA: + if (adev->sdma.funcs->reset_ras_error_count) + adev->sdma.funcs->reset_ras_error_count(adev); + break; + default: + break; + } + + return 0; +} + /* Trigger XGMI/WAFL error */ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, struct ta_ras_trigger_error_input *block_info) @@ -921,7 +956,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, .head = obj->head, }; - if (amdgpu_ras_error_query(adev, &info)) + if (amdgpu_ras_query_error_status(adev, &info)) return 0; data.ce_count += info.ce_count; @@ -1451,7 +1486,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF) continue; - amdgpu_ras_error_query(adev, &info); + amdgpu_ras_query_error_status(adev, &info); } } @@ -1467,9 +1502,6 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_status) adev->gfx.funcs->query_ras_error_status(adev); - - if (adev->gfx.funcs->query_sq_timeout_status) - adev->gfx.funcs->query_sq_timeout_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.funcs->query_ras_error_status) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index aed0716efa5a2c422b89cf85ac980c17d6c9502c..a9fd655ed5ee4ae2013b17b8748696c941d04cca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -590,9 +590,12 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev); -int amdgpu_ras_error_query(struct amdgpu_device *adev, +int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info); +int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, + enum amdgpu_ras_block block); + int amdgpu_ras_error_inject(struct amdgpu_device *adev, struct ras_inject_if *info); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index c37139d2c48759d49fb04f286a4fe3d336460e1f..015bd6c1c9cc9bf1b07677cc6384cd879ce598ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2121,8 +2121,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = { .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count, .reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count, .query_ras_error_status = &gfx_v9_4_2_query_ras_error_status, + .reset_ras_error_status = &gfx_v9_4_2_reset_ras_error_status, .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer, - .query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status, }; static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) @@ -3967,9 +3967,6 @@ static int gfx_v9_0_hw_init(void *handle) if (adev->asic_type == CHIP_ALDEBARAN) gfx_v9_4_2_set_power_brake_sequence(adev); - if (adev->gfx.funcs->enable_watchdog_timer) - adev->gfx.funcs->enable_watchdog_timer(adev); - return r; } @@ -4733,14 +4730,13 @@ static int gfx_v9_0_ecc_late_init(void *handle) if (r) return r; - if (adev->gfx.funcs && - adev->gfx.funcs->reset_ras_error_count) - adev->gfx.funcs->reset_ras_error_count(adev); - r = amdgpu_gfx_ras_late_init(adev); if (r) return r; + if (adev->gfx.funcs->enable_watchdog_timer) + adev->gfx.funcs->enable_watchdog_timer(adev); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index 44024ab9357716431e745c8f3e1bb05a2d87167c..2e94998c98120904b1646b35f82dd44a6b84b429 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -79,6 +79,9 @@ static const struct soc15_reg_golden golden_settings_gc_9_4_2_alde[] = { SOC15_REG_GOLDEN_VALUE(GC, 0, regTCI_CNTL_3, 0xff, 0x20), }; +static void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev); +static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev); + void gfx_v9_4_2_init_golden_registers(struct amdgpu_device *adev, uint32_t die_id) { @@ -1055,8 +1058,6 @@ void gfx_v9_4_2_reset_ras_error_count(struct amdgpu_device *adev) gfx_v9_4_2_query_sram_edc_count(adev, NULL, NULL); gfx_v9_4_2_query_utc_edc_count(adev, NULL, NULL); - gfx_v9_4_2_reset_utc_err_status(adev); - gfx_v9_4_2_reset_ea_err_status(adev); } int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if) @@ -1097,6 +1098,8 @@ static void gfx_v9_4_2_query_ea_err_status(struct amdgpu_device *adev) if (reg_value) dev_warn(adev->dev, "GCEA err detected at instance: %d, status: 0x%x!\n", j, reg_value); + /* clear after read */ + WREG32(SOC15_REG_ENTRY_OFFSET(gfx_v9_4_2_rdrsp_status_regs), 0x10); } } @@ -1109,16 +1112,22 @@ static void gfx_v9_4_2_query_utc_err_status(struct amdgpu_device *adev) uint32_t data; data = RREG32_SOC15(GC, 0, regUTCL2_MEM_ECC_STATUS); - if (!data) + if (!data) { dev_warn(adev->dev, "GFX UTCL2 Mem Ecc Status: 0x%x!\n", data); + WREG32_SOC15(GC, 0, regUTCL2_MEM_ECC_STATUS, 0x3); + } data = RREG32_SOC15(GC, 0, regVML2_MEM_ECC_STATUS); - if (!data) + if (!data) { dev_warn(adev->dev, "GFX VML2 Mem Ecc Status: 0x%x!\n", data); + WREG32_SOC15(GC, 0, regVML2_MEM_ECC_STATUS, 0x3); + } data = RREG32_SOC15(GC, 0, regVML2_WALKER_MEM_ECC_STATUS); - if (!data) + if (!data) { dev_warn(adev->dev, "GFX VML2 Walker Mem Ecc Status: 0x%x!\n", data); + WREG32_SOC15(GC, 0, regVML2_WALKER_MEM_ECC_STATUS, 0x3); + } } void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev) @@ -1128,6 +1137,17 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev) gfx_v9_4_2_query_ea_err_status(adev); gfx_v9_4_2_query_utc_err_status(adev); + gfx_v9_4_2_query_sq_timeout_status(adev); +} + +void gfx_v9_4_2_reset_ras_error_status(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) + return; + + gfx_v9_4_2_reset_utc_err_status(adev); + gfx_v9_4_2_reset_ea_err_status(adev); + gfx_v9_4_2_reset_sq_timeout_status(adev); } void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev) @@ -1209,7 +1229,7 @@ static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev, } } -void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev) +static void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev) { uint32_t se_idx, sh_idx, cu_idx; uint32_t status; @@ -1241,4 +1261,26 @@ void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev) } gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); mutex_unlock(&adev->grbm_idx_mutex); +} + +static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev) +{ + uint32_t se_idx, sh_idx, cu_idx; + + mutex_lock(&adev->grbm_idx_mutex); + for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; + se_idx++) { + for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; + sh_idx++) { + for (cu_idx = 0; + cu_idx < adev->gfx.config.max_cu_per_sh; + cu_idx++) { + gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx, + cu_idx); + WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0); + } + } + } + gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); + mutex_unlock(&adev->grbm_idx_mutex); } \ No newline at end of file diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h index e01fa6afa8e4f70a6a57ea2ed2cd4870a95dfa55..c143d178ef9803c98ec41b9d08f4e64d8a640443 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h @@ -35,7 +35,6 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if); void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev); int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status); - +void gfx_v9_4_2_reset_ras_error_status(struct amdgpu_device *adev); void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev); -void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev); #endif /* __GFX_V9_4_2_H__ */