diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7b60646c6ad27a09dacddbec7a18da4a436efc56..a755cc545c916de4793da888ea5a66bc9e9cceb4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3099,7 +3099,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) amdgpu_fru_get_product_info(adev); - r = amdgpu_cper_init(adev); + if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) + r = amdgpu_cper_init(adev); init_failed: @@ -4333,10 +4334,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, * for throttling interrupt) = 60 seconds. */ ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); - ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); - ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); /* Registers mapping */ /* TODO: block userspace mapping of io register */ @@ -4368,7 +4367,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, return -ENOMEM; /* detect hw virtualization here */ - amdgpu_detect_virtualization(adev); + amdgpu_virt_init(adev); amdgpu_device_get_pcie_info(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 81a7d4faac9c882fbe355f5fd3b10cfc823bef69..d55c8b7fdb596138686d8b4a4e888fef2561de2b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -578,12 +578,32 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, return result; } +static ssize_t amdgpu_debugfs_virt_ring_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_ring *ring = file_inode(f)->i_private; + + if (*pos & 3 || size & 3) + return -EINVAL; + + if (ring->funcs->type == AMDGPU_RING_TYPE_CPER) + amdgpu_virt_req_ras_cper_dump(ring->adev, false); + + return amdgpu_debugfs_ring_read(f, buf, size, pos); +} + static const struct file_operations amdgpu_debugfs_ring_fops = { .owner = THIS_MODULE, .read = amdgpu_debugfs_ring_read, .llseek = default_llseek }; +static const struct file_operations amdgpu_debugfs_virt_ring_fops = { + .owner = THIS_MODULE, + .read = amdgpu_debugfs_virt_ring_read, + .llseek = default_llseek +}; + static ssize_t amdgpu_debugfs_mqd_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -671,9 +691,14 @@ void amdgpu_debugfs_ring_init(struct amdgpu_device *adev, char name[32]; sprintf(name, "amdgpu_ring_%s", ring->name); - debugfs_create_file_size(name, S_IFREG | 0444, root, ring, - &amdgpu_debugfs_ring_fops, - ring->ring_size + 12); + if (amdgpu_sriov_vf(adev)) + debugfs_create_file_size(name, S_IFREG | 0444, root, ring, + &amdgpu_debugfs_virt_ring_fops, + ring->ring_size + 12); + else + debugfs_create_file_size(name, S_IFREG | 0444, root, ring, + &amdgpu_debugfs_ring_fops, + ring->ring_size + 12); if (ring->mqd_obj) { sprintf(name, "amdgpu_mqd_%s", ring->name); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index e6f0152e5b08720e4775d82ffd5b694370ee74b2..ab7e73d0e7b11b488a70f858e2f2ad26631ab51a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -739,7 +739,7 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev) } } -void amdgpu_detect_virtualization(struct amdgpu_device *adev) +static u32 amdgpu_virt_init_detect_asic(struct amdgpu_device *adev) { uint32_t reg; @@ -775,8 +775,17 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev) adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; } + return reg; +} + +static bool amdgpu_virt_init_req_data(struct amdgpu_device *adev, u32 reg) +{ + bool is_sriov = false; + /* we have the ability to check now */ if (amdgpu_sriov_vf(adev)) { + is_sriov = true; + switch (adev->asic_type) { case CHIP_TONGA: case CHIP_FIJI: @@ -805,10 +814,39 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev) amdgpu_virt_request_init_data(adev); break; default: /* other chip doesn't support SRIOV */ + is_sriov = false; DRM_ERROR("Unknown asic type: %d!\n", adev->asic_type); break; } } + + return is_sriov; +} + +static void amdgpu_virt_init_ras(struct amdgpu_device *adev) +{ + ratelimit_state_init(&adev->virt.ras.ras_error_cnt_rs, 5 * HZ, 1); + ratelimit_state_init(&adev->virt.ras.ras_cper_dump_rs, 5 * HZ, 1); + + ratelimit_set_flags(&adev->virt.ras.ras_error_cnt_rs, + RATELIMIT_MSG_ON_RELEASE); + ratelimit_set_flags(&adev->virt.ras.ras_cper_dump_rs, + RATELIMIT_MSG_ON_RELEASE); + + mutex_init(&adev->virt.ras.ras_telemetry_mutex); + + adev->virt.ras.cper_rptr = 0; +} + +void amdgpu_virt_init(struct amdgpu_device *adev) +{ + bool is_sriov = false; + uint32_t reg = amdgpu_virt_init_detect_asic(adev); + + is_sriov = amdgpu_virt_init_req_data(adev, reg); + + if (is_sriov) + amdgpu_virt_init_ras(adev); } static bool amdgpu_virt_access_debugfs_is_mmio(struct amdgpu_device *adev) @@ -1288,10 +1326,12 @@ static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bo * will ignore incoming guest messages. Ratelimit the guest messages to * prevent guest self DOS. */ - if (__ratelimit(&adev->virt.ras_telemetry_rs) || force_update) { + if (__ratelimit(&virt->ras.ras_error_cnt_rs) || force_update) { + mutex_lock(&virt->ras.ras_telemetry_mutex); if (!virt->ops->req_ras_err_count(adev)) amdgpu_virt_cache_host_error_counts(adev, - adev->virt.fw_reserve.ras_telemetry); + virt->fw_reserve.ras_telemetry); + mutex_unlock(&virt->ras.ras_telemetry_mutex); } return 0; @@ -1322,6 +1362,98 @@ int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_bl return 0; } +static int +amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev, + struct amdsriov_ras_telemetry *host_telemetry, + u32 *more) +{ + struct amd_sriov_ras_cper_dump *cper_dump = NULL; + struct cper_hdr *entry = NULL; + struct amdgpu_ring *ring = &adev->cper.ring_buf; + uint32_t checksum, used_size, i; + int ret = 0; + + checksum = host_telemetry->header.checksum; + used_size = host_telemetry->header.used_size; + + if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10)) + return 0; + + cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL); + if (!cper_dump) + return -ENOMEM; + + if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0)) + goto out; + + *more = cper_dump->more; + + if (cper_dump->wptr < adev->virt.ras.cper_rptr) { + dev_warn( + adev->dev, + "guest specified rptr that was too high! guest rptr: 0x%llx, host rptr: 0x%llx\n", + adev->virt.ras.cper_rptr, cper_dump->wptr); + + adev->virt.ras.cper_rptr = cper_dump->wptr; + goto out; + } + + entry = (struct cper_hdr *)&cper_dump->buf[0]; + + for (i = 0; i < cper_dump->count; i++) { + amdgpu_cper_ring_write(ring, entry, entry->record_length); + entry = (struct cper_hdr *)((char *)entry + + entry->record_length); + } + + if (cper_dump->overflow_count) + dev_warn(adev->dev, + "host reported CPER overflow of 0x%llx entries!\n", + cper_dump->overflow_count); + + adev->virt.ras.cper_rptr = cper_dump->wptr; +out: + kfree(cper_dump); + + return ret; +} + +static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev) +{ + struct amdgpu_virt *virt = &adev->virt; + int ret = 0; + uint32_t more = 0; + + if (!amdgpu_sriov_ras_cper_en(adev)) + return -EOPNOTSUPP; + + do { + if (!virt->ops->req_ras_cper_dump(adev, virt->ras.cper_rptr)) + ret = amdgpu_virt_write_cpers_to_ring( + adev, virt->fw_reserve.ras_telemetry, &more); + else + ret = 0; + } while (more); + + return ret; +} + +int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update) +{ + struct amdgpu_virt *virt = &adev->virt; + int ret = 0; + + if ((__ratelimit(&virt->ras.ras_cper_dump_rs) || force_update) && + down_read_trylock(&adev->reset_domain->sem)) { + mutex_lock(&virt->ras.ras_telemetry_mutex); + ret = amdgpu_virt_req_ras_cper_dump_internal(adev); + mutex_unlock(&virt->ras.ras_telemetry_mutex); + up_read(&adev->reset_domain->sem); + } + + return ret; +} + int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev) { unsigned long ue_count, ce_count; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 0f3ccae5c1ab3bc7777f8e0480abbfe23e8f4644..9f65487e60f57a4d08b8c8efebd97b41b6c5cd76 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -96,6 +96,7 @@ struct amdgpu_virt_ops { enum amdgpu_ras_block block); bool (*rcvd_ras_intr)(struct amdgpu_device *adev); int (*req_ras_err_count)(struct amdgpu_device *adev); + int (*req_ras_cper_dump)(struct amdgpu_device *adev, u64 vf_rptr); }; /* @@ -140,6 +141,7 @@ enum AMDGIM_FEATURE_FLAG { AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8), AMDGIM_FEATURE_RAS_CAPS = (1 << 9), AMDGIM_FEATURE_RAS_TELEMETRY = (1 << 10), + AMDGIM_FEATURE_RAS_CPER = (1 << 11), }; enum AMDGIM_REG_ACCESS_FLAG { @@ -242,6 +244,13 @@ struct amdgpu_virt_ras_err_handler_data { int last_reserved; }; +struct amdgpu_virt_ras { + struct ratelimit_state ras_error_cnt_rs; + struct ratelimit_state ras_cper_dump_rs; + struct mutex ras_telemetry_mutex; + uint64_t cper_rptr; +}; + /* GPU virtualization */ struct amdgpu_virt { uint32_t caps; @@ -284,8 +293,7 @@ struct amdgpu_virt { union amd_sriov_ras_caps ras_en_caps; union amd_sriov_ras_caps ras_telemetry_en_caps; - - struct ratelimit_state ras_telemetry_rs; + struct amdgpu_virt_ras ras; struct amd_sriov_ras_telemetry_error_count count_cache; }; @@ -340,6 +348,9 @@ struct amdgpu_video_codec_info; #define amdgpu_sriov_ras_telemetry_block_en(adev, sriov_blk) \ (amdgpu_sriov_ras_telemetry_en((adev)) && (adev)->virt.ras_telemetry_en_caps.all & BIT(sriov_blk)) +#define amdgpu_sriov_ras_cper_en(adev) \ +((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_CPER) + static inline bool is_virtual_machine(void) { #if defined(CONFIG_X86) @@ -378,7 +389,7 @@ void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev); void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev); void amdgpu_virt_exchange_data(struct amdgpu_device *adev); void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev); -void amdgpu_detect_virtualization(struct amdgpu_device *adev); +void amdgpu_virt_init(struct amdgpu_device *adev); bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev); int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev); @@ -406,6 +417,7 @@ u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 f bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev); int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, struct ras_err_data *err_data); +int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update); int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev); bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, enum amdgpu_ras_block block); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 4dcb72d1bdda21685e3b1488fccb203b9794f8ba..5aadf24cb2022a18707844e9496ad496c1e574f3 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -184,6 +184,9 @@ static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev, case IDH_REQ_RAS_ERROR_COUNT: event = IDH_RAS_ERROR_COUNT_READY; break; + case IDH_REQ_RAS_CPER_DUMP: + event = IDH_RAS_CPER_DUMP_READY; + break; default: break; } @@ -467,6 +470,16 @@ static int xgpu_nv_req_ras_err_count(struct amdgpu_device *adev) return xgpu_nv_send_access_requests(adev, IDH_REQ_RAS_ERROR_COUNT); } +static int xgpu_nv_req_ras_cper_dump(struct amdgpu_device *adev, u64 vf_rptr) +{ + uint32_t vf_rptr_hi, vf_rptr_lo; + + vf_rptr_hi = (uint32_t)(vf_rptr >> 32); + vf_rptr_lo = (uint32_t)(vf_rptr & 0xFFFFFFFF); + return xgpu_nv_send_access_requests_with_param( + adev, IDH_REQ_RAS_CPER_DUMP, vf_rptr_hi, vf_rptr_lo, 0); +} + const struct amdgpu_virt_ops xgpu_nv_virt_ops = { .req_full_gpu = xgpu_nv_request_full_gpu_access, .rel_full_gpu = xgpu_nv_release_full_gpu_access, @@ -478,4 +491,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = { .ras_poison_handler = xgpu_nv_ras_poison_handler, .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr, .req_ras_err_count = xgpu_nv_req_ras_err_count, + .req_ras_cper_dump = xgpu_nv_req_ras_cper_dump, };