NULL pointer dereference during GPU reset
Hardware: Navi10 GPU; kernel: Linux 5.8.12 (5.8.12-arch1-1). Kernel log follows:
[77908.283234] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx_0.0.0 timeout, signaled seq=9368500, emitted seq=9368502
[77908.283332] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process LariLauncher.ex pid 336283 thread LariLauncher.ex pid 336283
[77908.283343] amdgpu 0000:44:00.0: amdgpu: GPU reset begin!
[77909.553238] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0 timeout, signaled seq=3509252, emitted seq=3509254
[77909.553346] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: process pid 0 thread pid 0
[77909.553357] amdgpu 0000:44:00.0: amdgpu: GPU reset begin!
[77909.553360] [drm] Bailing on TDR for s_job:255db2, as another already in progress
[77909.553370] BUG: kernel NULL pointer dereference, address: 0000000000000020
[77909.553374] #PF: supervisor write access in kernel mode
[77909.553377] #PF: error_code(0x0002) - not-present page
[77909.553380] PGD b12fd2067 P4D b12fd2067 PUD 0
[77909.553386] Oops: 0002 [#1] PREEMPT SMP NOPTI
[77909.553390] CPU: 18 PID: 290829 Comm: kworker/18:0 Tainted: G W OE 5.8.12-arch1-1 #1
[77909.553393] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X399 Taichi, BIOS P3.30 08/14/2018
[77909.553400] Workqueue: events drm_sched_job_timedout [gpu_sched]
[77909.553408] RIP: 0010:mutex_unlock+0x13/0x30
[77909.553412] Code: 48 8b 43 10 48 89 e7 48 8b 70 10 e8 37 e6 6b ff eb a1 e8 a0 48 ff ff 0f 1f 44 00 00 65 48 8b 14 25 c0 7b 01 00 31 c9 48 89 d0 <f0> 48 0f b1 0f 48 39 c2 74 05 e9 ce fe ff ff c3 66 66 2e 0f 1f 84
[77909.553415] RSP: 0018:ffffab341bcffd98 EFLAGS: 00010246
[77909.553419] RAX: ffff8aead5ed8000 RBX: ffff8aedfb08cc00 RCX: 0000000000000000
[77909.553421] RDX: ffff8aead5ed8000 RSI: ffffffff86735947 RDI: 0000000000000020
[77909.553424] RBP: 0000000000000000 R08: 000046dbbb40ae0d R09: 0000000000000045
[77909.553426] R10: 000000000004700d R11: ffffffff86735927 R12: ffff8af280de0000
[77909.553429] R13: ffff8af280de0000 R14: 00000000d5ed8000 R15: ffff8af2d75d7ec0
[77909.553432] FS: 0000000000000000(0000) GS:ffff8af2de080000(0000) knlGS:0000000000000000
[77909.553435] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[77909.553437] CR2: 0000000000000020 CR3: 0000000362e5a000 CR4: 00000000003406e0
[77909.553440] Call Trace:
[77909.553553] amdgpu_device_gpu_recover.cold+0x3fa/0xfd4 [amdgpu]
[77909.553661] amdgpu_job_timedout+0x121/0x140 [amdgpu]
[77909.553669] drm_sched_job_timedout+0x64/0xe0 [gpu_sched]
[77909.553675] process_one_work+0x1da/0x3d0
[77909.553680] worker_thread+0x4d/0x3d0
[77909.553683] ? rescuer_thread+0x410/0x410
[77909.553687] kthread+0x142/0x160
[77909.553691] ? __kthread_bind_mask+0x60/0x60
[77909.553697] ret_from_fork+0x22/0x30
[77909.553701] Modules linked in: fuse nct6775 hwmon_vid raid1 md_mod authenc dm_integrity async_xor async_tx dm_bufio xor amdgpu iwlmvm nls_iso8859_1 snd_hda_codec_realtek edac_mce_amd nls_cp437 mac80211 snd_hda_codec_generic wmi_bmof mxm_wmi ledtrig_audio snd_hda_codec_hdmi libarc4 vfat btusb gpu_sched fat snd_hda_intel btrtl btbcm snd_intel_dspcfg kvm_amd btintel ttm iwlwifi snd_usb_audio uvcvideo snd_hda_codec bluetooth videobuf2_vmalloc drm_kms_helper kvm snd_usbmidi_lib videobuf2_memops snd_hda_core snd_rawmidi videobuf2_v4l2 videobuf2_common snd_hwdep snd_seq_device snd_pcm cec videodev irqbypass cfg80211 rc_core mousedev joydev mc input_leds snd_timer ecdh_generic rapl igb syscopyarea sysfillrect ecc sysimgblt snd i2c_algo_bit pcspkr fb_sys_fops sp5100_tco rfkill dca soundcore k10temp i2c_piix4 wmi pinctrl_amd gpio_amdpt evdev mac_hid acpi_cpufreq vboxnetflt(OE) vboxnetadp(OE) vboxdrv(OE) drm agpgart ip_tables x_tables ext4 crc32c_generic crc16 mbcache jbd2 dm_crypt cbc encrypted_keys
[77909.553786] trusted tpm hid_generic usbhid hid serio_raw atkbd libps2 crct10dif_pclmul crc32_pclmul crc32c_intel dm_mod ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper xhci_pci ccp xhci_pci_renesas rng_core xhci_hcd i8042 serio
[77909.553813] CR2: 0000000000000020
[77909.553817] ---[ end trace 5f257f4237e9e18e ]---
[77909.553821] RIP: 0010:mutex_unlock+0x13/0x30
[77909.553825] Code: 48 8b 43 10 48 89 e7 48 8b 70 10 e8 37 e6 6b ff eb a1 e8 a0 48 ff ff 0f 1f 44 00 00 65 48 8b 14 25 c0 7b 01 00 31 c9 48 89 d0 <f0> 48 0f b1 0f 48 39 c2 74 05 e9 ce fe ff ff c3 66 66 2e 0f 1f 84
[77909.553828] RSP: 0018:ffffab341bcffd98 EFLAGS: 00010246
[77909.553831] RAX: ffff8aead5ed8000 RBX: ffff8aedfb08cc00 RCX: 0000000000000000
[77909.553834] RDX: ffff8aead5ed8000 RSI: ffffffff86735947 RDI: 0000000000000020
[77909.553836] RBP: 0000000000000000 R08: 000046dbbb40ae0d R09: 0000000000000045
[77909.553839] R10: 000000000004700d R11: ffffffff86735927 R12: ffff8af280de0000
[77909.553841] R13: ffff8af280de0000 R14: 00000000d5ed8000 R15: ffff8af2d75d7ec0
[77909.553844] FS: 0000000000000000(0000) GS:ffff8af2de080000(0000) knlGS:0000000000000000
[77909.553847] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[77909.553850] CR2: 0000000000000020 CR3: 0000000362e5a000 CR4: 00000000003406e0
[77912.283395] amdgpu 0000:44:00.0: amdgpu: failed to suspend display audio
[77912.284119] BUG: unable to handle page fault for address: ffffab341bd14788
[77912.284121] #PF: supervisor read access in kernel mode
[77912.284122] #PF: error_code(0x0000) - not-present page
[77912.284124] PGD 85ed4f067 P4D 85ed4f067 PUD 85ed50067 PMD bbaac2067 PTE 0
[77912.284127] Oops: 0000 [#2] PREEMPT SMP NOPTI
[77912.284130] CPU: 22 PID: 333998 Comm: kworker/22:1 Tainted: G D W OE 5.8.12-arch1-1 #1
[77912.284131] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X399 Taichi, BIOS P3.30 08/14/2018
[77912.284137] Workqueue: events drm_sched_job_timedout [gpu_sched]
[77912.284143] RIP: 0010:mutex_trylock+0x5/0x60
[77912.284145] Code: f0 48 0f b1 0f 48 39 c2 74 0d e9 f6 fd ff ff 83 ea 01 89 50 10 eb d3 c3 66 66 2e 0f 1f 84 00 00 00 00 00 66 90 0f 1f 44 00 00 <48> 8b 17 65 48 8b 34 25 c0 7b 01 00 48 89 d0 48 83 e0 f8 74 08 48
[77912.284145] RSP: 0018:ffffab3406edfd98 EFLAGS: 00010246
[77912.284147] RAX: ffffab341bcffdc8 RBX: ffff8af2d2c07000 RCX: 0000000000000001
[77912.284147] RDX: 00000000000000e0 RSI: 0000000000000000 RDI: ffffab341bd14788
[77912.284148] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[77912.284149] R10: 0000000000000022 R11: 0000000000000000 R12: ffff8af280de0000
[77912.284149] R13: ffffab341bcfdbf0 R14: 00000000dac73d00 R15: ffff8af280df2458
[77912.284150] FS: 0000000000000000(0000) GS:ffff8af2de180000(0000) knlGS:0000000000000000
[77912.284151] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[77912.284152] CR2: ffffab341bd14788 CR3: 0000000362e5a000 CR4: 00000000003406e0
[77912.284152] Call Trace:
[77912.284227] amdgpu_device_gpu_recover.cold+0x3c8/0xfd4 [amdgpu]
[77912.284283] amdgpu_job_timedout+0x121/0x140 [amdgpu]
[77912.284287] drm_sched_job_timedout+0x64/0xe0 [gpu_sched]
[77912.284291] process_one_work+0x1da/0x3d0
[77912.284295] worker_thread+0x4d/0x3d0
[77912.284296] ? rescuer_thread+0x410/0x410
[77912.284298] kthread+0x142/0x160
[77912.284300] ? __kthread_bind_mask+0x60/0x60
[77912.284303] ret_from_fork+0x22/0x30
[77912.284305] Modules linked in: fuse nct6775 hwmon_vid raid1 md_mod authenc dm_integrity async_xor async_tx dm_bufio xor amdgpu iwlmvm nls_iso8859_1 snd_hda_codec_realtek edac_mce_amd nls_cp437 mac80211 snd_hda_codec_generic wmi_bmof mxm_wmi ledtrig_audio snd_hda_codec_hdmi libarc4 vfat btusb gpu_sched fat snd_hda_intel btrtl btbcm snd_intel_dspcfg kvm_amd btintel ttm iwlwifi snd_usb_audio uvcvideo snd_hda_codec bluetooth videobuf2_vmalloc drm_kms_helper kvm snd_usbmidi_lib videobuf2_memops snd_hda_core snd_rawmidi videobuf2_v4l2 videobuf2_common snd_hwdep snd_seq_device snd_pcm cec videodev irqbypass cfg80211 rc_core mousedev joydev mc input_leds snd_timer ecdh_generic rapl igb syscopyarea sysfillrect ecc sysimgblt snd i2c_algo_bit pcspkr fb_sys_fops sp5100_tco rfkill dca soundcore k10temp i2c_piix4 wmi pinctrl_amd gpio_amdpt evdev mac_hid acpi_cpufreq vboxnetflt(OE) vboxnetadp(OE) vboxdrv(OE) drm agpgart ip_tables x_tables ext4 crc32c_generic crc16 mbcache jbd2 dm_crypt cbc encrypted_keys
[77912.284350] trusted tpm hid_generic usbhid hid serio_raw atkbd libps2 crct10dif_pclmul crc32_pclmul crc32c_intel dm_mod ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper xhci_pci ccp xhci_pci_renesas rng_core xhci_hcd i8042 serio
[77912.284362] CR2: ffffab341bd14788
[77912.284364] ---[ end trace 5f257f4237e9e18f ]---
[77912.284366] RIP: 0010:mutex_unlock+0x13/0x30
[77912.284367] Code: 48 8b 43 10 48 89 e7 48 8b 70 10 e8 37 e6 6b ff eb a1 e8 a0 48 ff ff 0f 1f 44 00 00 65 48 8b 14 25 c0 7b 01 00 31 c9 48 89 d0 <f0> 48 0f b1 0f 48 39 c2 74 05 e9 ce fe ff ff c3 66 66 2e 0f 1f 84
[77912.284368] RSP: 0018:ffffab341bcffd98 EFLAGS: 00010246
[77912.284369] RAX: ffff8aead5ed8000 RBX: ffff8aedfb08cc00 RCX: 0000000000000000
[77912.284370] RDX: ffff8aead5ed8000 RSI: ffffffff86735947 RDI: 0000000000000020
[77912.284370] RBP: 0000000000000000 R08: 000046dbbb40ae0d R09: 0000000000000045
[77912.284371] R10: 000000000004700d R11: ffffffff86735927 R12: ffff8af280de0000
[77912.284372] R13: ffff8af280de0000 R14: 00000000d5ed8000 R15: ffff8af2d75d7ec0
[77912.284372] FS: 0000000000000000(0000) GS:ffff8af2de180000(0000) knlGS:0000000000000000
[77912.284373] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[77912.284374] CR2: ffffab341bd14788 CR3: 0000000362e5a000 CR4: 00000000003406e0