diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index fb9399a999aef30aa8e75543362fe9047792e79d..2871a3e3801f8a68ca616c96110e241e83fcc73b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1253,9 +1253,8 @@ int emu_soc_asic_init(struct amdgpu_device *adev); bool amdgpu_device_has_job_running(struct amdgpu_device *adev); bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev); int amdgpu_device_gpu_recover(struct amdgpu_device *adev, - struct amdgpu_job* job); -int amdgpu_device_gpu_recover(struct amdgpu_device *adev, - struct amdgpu_job *job); + struct amdgpu_job *job, + struct amdgpu_reset_context *reset_context); void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); int amdgpu_device_pci_reset(struct amdgpu_device *adev); bool amdgpu_device_need_post(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 567597469a8a30adc41a477832cfc15c3673079f..5e53a5293935622049ba44844b08e0fe7fffd434 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -129,7 +129,14 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work) struct amdgpu_device *adev = container_of(work, struct amdgpu_device, kfd.reset_work); - amdgpu_device_gpu_recover(adev, NULL); + struct amdgpu_reset_context reset_context; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + + amdgpu_device_gpu_recover(adev, NULL, &reset_context); } void amdgpu_amdkfd_device_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 64f37713b270c62aa583877f1deea70b002687e2..e1c9587f659b5f74f70aca7084d2f20728534a63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5109,7 +5109,8 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) */ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, - struct amdgpu_job *job) + struct amdgpu_job *job, + struct amdgpu_reset_context *reset_context) { struct list_head device_list, *device_list_handle = NULL; bool job_signaled = false; @@ -5119,9 +5120,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, bool need_emergency_restart = false; bool audio_suspended = false; int tmp_vram_lost_counter; - struct amdgpu_reset_context reset_context; - - memset(&reset_context, 0, sizeof(reset_context)); /* * Special case: RAS triggered and full reset isn't supported @@ -5147,12 +5145,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (hive) mutex_lock(&hive->hive_lock); - reset_context.method = AMD_RESET_METHOD_NONE; - reset_context.reset_req_dev = adev; - reset_context.job = job; - reset_context.hive = hive; - clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - + reset_context->job = job; + reset_context->hive = hive; /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list @@ -5245,7 +5239,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); + r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", @@ -5272,7 +5266,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) amdgpu_ras_resume(adev); } else { - r = amdgpu_do_asic_reset(device_list_handle, &reset_context); + r = amdgpu_do_asic_reset(device_list_handle, reset_context); if (r && r == -EAGAIN) goto retry; } @@ -5292,7 +5286,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (amdgpu_gpu_recovery == 2 && !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) amdgpu_device_recheck_guilty_jobs( - tmp_adev, device_list_handle, &reset_context); + tmp_adev, device_list_handle, reset_context); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 39597ab807d105bdff0dd1d0e9aa181ea10108f0..ff659d4f772b32aa39f43cd5eee5cc8282f05195 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -844,7 +844,14 @@ static void amdgpu_debugfs_reset_work(struct work_struct *work) struct amdgpu_device *adev = container_of(work, struct amdgpu_device, reset_work); - amdgpu_device_gpu_recover(adev, NULL); + struct amdgpu_reset_context reset_context; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + + amdgpu_device_gpu_recover(adev, NULL, &reset_context); } #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 22735790fe5020bf3792620c7433e0f4a2ed821e..36c1be77bf8fa6b397d79ba17fcb6da7ff32b85c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -29,6 +29,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" +#include "amdgpu_reset.h" static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) { @@ -64,7 +65,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) ti.process_name, ti.tgid, ti.task_name, ti.pid); if (amdgpu_device_should_recover_gpu(ring->adev)) { - r = amdgpu_device_gpu_recover(ring->adev, job); + struct amdgpu_reset_context reset_context; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + + r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context); if (r) DRM_ERROR("GPU Recovery Failed: %d\n", r); } else { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a5b6db5ab53c98baa96664606394e5f6456661ae..ff5361f5c2d4f2746a3db6fc00877733434d6cc9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1942,8 +1942,16 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_put_xgmi_hive(hive); } - if (amdgpu_device_should_recover_gpu(ras->adev)) - amdgpu_device_gpu_recover(ras->adev, NULL); + if (amdgpu_device_should_recover_gpu(ras->adev)) { + struct amdgpu_reset_context reset_context; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + + amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); + } atomic_set(&ras->in_recovery, 0); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 7ec5b5cf4bb94395476d69de12b60db5dcd333b8..12906ba74462fb65669392bc826663e8fbb60d09 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -283,8 +283,16 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) && (!amdgpu_device_has_job_running(adev) || - adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) - amdgpu_device_gpu_recover(adev, NULL); + adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) { + struct amdgpu_reset_context reset_context; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + + amdgpu_device_gpu_recover(adev, NULL, &reset_context); + } } static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index e18b75c8fde66cfdebb5e952486b2a8bfeb05e55..e07757eea7adf95bb43b1a330166b8e84a75468b 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -310,8 +310,16 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT || adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || - adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) - amdgpu_device_gpu_recover(adev, NULL); + adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) { + struct amdgpu_reset_context reset_context; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + + amdgpu_device_gpu_recover(adev, NULL, &reset_context); + } } static int xgpu_nv_set_mailbox_rcv_irq(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index c5016a9263317b9137bc49e3c77d0a777dec52b6..288c414babdfa740b598ab49142666b2586beca1 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -522,8 +522,16 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) } /* Trigger recovery due to world switch failure */ - if (amdgpu_device_should_recover_gpu(adev)) - amdgpu_device_gpu_recover(adev, NULL); + if (amdgpu_device_should_recover_gpu(adev)) { + struct amdgpu_reset_context reset_context; + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + + amdgpu_device_gpu_recover(adev, NULL, &reset_context); + } } static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,