DG2: refcount_t: underflow; use-after-free at xe_sync_entry_cleanup()
Got this on DG2 while trying to test the differences in the GPUVA expectations for vm_bind. I was not using Mesa; I was doing very simple tests using plain libdrm. I'll post a reproducer as soon as I figure out a way to reproduce the issue (I ran several things before I realized I had this in dmesg).
How to reproduce the issue:
- download https://people.freedesktop.org/~pzanoni/495/
- make
- ./vmbind
- the first time you run it, you'll get the use-after-free message.
This is with today's (Jul 20, 2023) drm-xe-next (top-most commit: "drm/xe/mmio: update gt_count when probing multi-tile").
[ 48.605909] ------------[ cut here ]------------
[ 48.605912] refcount_t: underflow; use-after-free.
[ 48.605920] WARNING: CPU: 0 PID: 1820 at lib/refcount.c:28 refcount_warn_saturate+0xba/0x110
[ 48.605926] Modules linked in: snd_seq_dummy(E) snd_hrtimer(E) snd_seq(E) snd_seq_device(E) rfkill(E) qrtr(E) overlay(E) sunrpc(E) binfmt_misc(E) intel_rapl_msr(E) intel_rapl_common(E) x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) kvm_intel(E) kvm(E) irqbypass(E) crc32_pclmul(E) ghash_clmulni_intel(E) sha512_ssse3(E) snd_sof_pci_intel_cnl(E) snd_sof_intel_hda_common(E) soundwire_intel(E) soundwire_generic_allocation(E) soundwire_cadence(E) snd_sof_intel_hda_mlink(E) snd_sof_intel_hda(E) snd_sof_pci(E) snd_sof_xtensa_dsp(E) snd_sof(E) snd_sof_utils(E) soundwire_bus(E) snd_soc_skl(E) snd_soc_hdac_hda(E) snd_hda_ext_core(E) snd_soc_sst_ipc(E) snd_soc_sst_dsp(E) snd_soc_acpi_intel_match(E) snd_soc_acpi(E) snd_soc_core(E) snd_compress(E) snd_intel_dspcfg(E) aesni_intel(E) snd_intel_sdw_acpi(E) crypto_simd(E) cryptd(E) rapl(E) snd_hda_codec(E) intel_cstate(E) snd_hda_core(E) snd_hwdep(E) iTCO_wdt(E) snd_pcm(E) intel_pmc_bxt(E) intel_uncore(E) intel_wmi_thunderbolt(E) snd_timer(E) iTCO_vendor_support(E) wmi_bmof(E)
[ 48.605979] snd(E) ee1004(E) pcspkr(E) watchdog(E) nls_ascii(E) soundcore(E) nls_cp437(E) vfat(E) fat(E) joydev(E) intel_pmc_core(E) acpi_tad(E) acpi_pad(E) button(E) intel_pch_thermal(E) evdev(E) serio_raw(E) sg(E) msr(E) parport_pc(E) ppdev(E) lp(E) parport(E) loop(E) fuse(E) efi_pstore(E) dm_mod(E) configfs(E) efivarfs(E) ip_tables(E) x_tables(E) autofs4(E) hid_lenovo(E) hid_generic(E) usbhid(E) hid(E) xe(E) sd_mod(E) gpu_sched(E) drm_suballoc_helper(E) drm_ttm_helper(E) i915(E) nvme(E) i2c_algo_bit(E) nvme_core(E) drm_buddy(E) t10_pi(E) drm_display_helper(E) cec(E) ttm(E) crc64_rocksoft(E) crc64(E) drm_kms_helper(E) ahci(E) crc_t10dif(E) libahci(E) crct10dif_generic(E) i2c_i801(E) e1000e(E) xhci_pci(E) libata(E) ptp(E) crct10dif_pclmul(E) xhci_hcd(E) i2c_smbus(E) crc32c_intel(E) crct10dif_common(E) pps_core(E) scsi_mod(E) scsi_common(E) intel_lpss_pci(E) intel_lpss(E) drm(E) usbcore(E) idma64(E) usb_common(E) fan(E) video(E) wmi(E)
[ 48.606052] CPU: 0 PID: 1820 Comm: vmbind Tainted: G E 6.4.0pz+ #84
[ 48.606055] Hardware name: Intel Corporation CoffeeLake Client Platform/CoffeeLake S UDIMM RVP, BIOS CNLSFWR1.R00.X221.B00.2106281933 06/28/2021
[ 48.606057] RIP: 0010:refcount_warn_saturate+0xba/0x110
[ 48.606060] Code: 01 01 e8 69 5a a8 ff 0f 0b c3 cc cc cc cc 80 3d a5 81 37 01 00 75 85 48 c7 c7 00 23 49 b6 c6 05 95 81 37 01 01 e8 46 5a a8 ff <0f> 0b c3 cc cc cc cc 80 3d 80 81 37 01 00 0f 85 5e ff ff ff 48 c7
[ 48.606063] RSP: 0018:ffffbf0b43777ce8 EFLAGS: 00010286
[ 48.606066] RAX: 0000000000000000 RBX: ffff9aa059022280 RCX: 0000000000000000
[ 48.606068] RDX: 0000000000000002 RSI: ffffffffb640d968 RDI: 00000000ffffffff
[ 48.606070] RBP: 00000000ffffffc3 R08: 0000000000000000 R09: ffffffffb6668ec0
[ 48.606072] R10: ffffbf0b43777b80 R11: ffffffffb69e8f08 R12: ffff9aa059022280
[ 48.606074] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9aa0618d4000
[ 48.606076] FS: 00007ffb61f1c740(0000) GS:ffff9aa39dc00000(0000) knlGS:0000000000000000
[ 48.606078] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 48.606080] CR2: 00007ffb61d39110 CR3: 00000001312ee003 CR4: 00000000003706f0
[ 48.606082] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 48.606084] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 48.606086] Call Trace:
[ 48.606088] <TASK>
[ 48.606089] ? refcount_warn_saturate+0xba/0x110
[ 48.606093] ? __warn+0x81/0x170
[ 48.606097] ? refcount_warn_saturate+0xba/0x110
[ 48.606101] ? report_bug+0x18d/0x1c0
[ 48.606106] ? handle_bug+0x41/0x70
[ 48.606109] ? exc_invalid_op+0x13/0x60
[ 48.606113] ? asm_exc_invalid_op+0x16/0x20
[ 48.606121] ? refcount_warn_saturate+0xba/0x110
[ 48.606125] ? refcount_warn_saturate+0xba/0x110
[ 48.606128] xe_sync_entry_cleanup+0xc5/0xf0 [xe]
[ 48.606194] xe_vm_bind_ioctl+0xe5b/0x1d00 [xe]
[ 48.606258] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
[ 48.606330] drm_ioctl_kernel+0xc5/0x170 [drm]
[ 48.606372] drm_ioctl+0x252/0x4c0 [drm]
[ 48.606409] ? __pfx_xe_vm_bind_ioctl+0x10/0x10 [xe]
[ 48.606475] __x64_sys_ioctl+0x8d/0xd0
[ 48.606480] do_syscall_64+0x37/0x90
[ 48.606483] entry_SYSCALL_64_after_hwframe+0x72/0xdc
[ 48.606487] RIP: 0033:0x7ffb61b1cafb
[ 48.606490] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1c 48 8b 44 24 18 64 48 2b 04 25 28 00 00
[ 48.606492] RSP: 002b:00007ffdde774080 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 48.606496] RAX: ffffffffffffffda RBX: 00007ffdde7743f8 RCX: 00007ffb61b1cafb
[ 48.606498] RDX: 00007ffdde774170 RSI: 0000000040786445 RDI: 0000000000000003
[ 48.606500] RBP: 00007ffdde774170 R08: 0000000000000001 R09: 00007ffdde7740b0
[ 48.606502] R10: 00007ffb61e52f6a R11: 0000000000000246 R12: 0000000040786445
[ 48.606504] R13: 0000000000000003 R14: 000055881d5cdd98 R15: 00007ffb61fb5020
[ 48.606511] </TASK>
[ 48.606513] irq event stamp: 9653
[ 48.606515] hardirqs last enabled at (9661): [<ffffffffb534898e>] __up_console_sem+0x5e/0x70
[ 48.606518] hardirqs last disabled at (9668): [<ffffffffb5348973>] __up_console_sem+0x43/0x70
[ 48.606520] softirqs last enabled at (9520): [<ffffffffb52aad87>] __irq_exit_rcu+0xb7/0x130
[ 48.606523] softirqs last disabled at (9513): [<ffffffffb52aad87>] __irq_exit_rcu+0xb7/0x130
[ 48.606525] ---[ end trace 0000000000000000 ]---
And:
(gdb) list *(xe_sync_entry_cleanup+0xc5)
0x31115 is in xe_sync_entry_cleanup (../include/linux/refcount.h:283).
278 smp_acquire__after_ctrl_dep();
279 return true;
280 }
281
282 if (unlikely(old < 0 || old - i < 0))
283 refcount_warn_saturate(r, REFCOUNT_SUB_UAF);
284
285 return false;
286 }
287
(gdb) list *(xe_vm_bind_ioctl+0xe5b)
0x3c6db is in xe_vm_bind_ioctl (../drivers/gpu/drm/xe/xe_vm.c:3319).
3314 return err;
3315
3316 unwind_ops:
3317 vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3318 free_syncs:
3319 while (num_syncs--)
3320 xe_sync_entry_cleanup(&syncs[num_syncs]);
3321
3322 kfree(syncs);
3323 put_obj:
(gdb)
Edited by Paulo Zanoni