diff --git a/drivers/gpu/drm/panthor/Kconfig b/drivers/gpu/drm/panthor/Kconfig
index 55b40ad07f3b0779e0c434469ddc874ff74fde27..eeb80d8e80644ce217c380e0f34305ac2fe4d8ef 100644
--- a/drivers/gpu/drm/panthor/Kconfig
+++ b/drivers/gpu/drm/panthor/Kconfig
@@ -14,6 +14,7 @@ config DRM_PANTHOR
 	select IOMMU_IO_PGTABLE_LPAE
 	select IOMMU_SUPPORT
 	select PM_DEVFREQ
+	select WANT_DEV_COREDUMP
 	help
 	  DRM driver for ARM Mali CSF-based GPUs.
 
diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
index 15294719b09cbe8fd801cdbcf0e6a5c8515b6a6d..1b5dcf148553ea7cb831e646e4668873323e31c9 100644
--- a/drivers/gpu/drm/panthor/Makefile
+++ b/drivers/gpu/drm/panthor/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0 or MIT
 
 panthor-y := \
+	panthor_devcoredump.o \
 	panthor_devfreq.o \
 	panthor_device.o \
 	panthor_drv.o \
diff --git a/drivers/gpu/drm/panthor/panthor_devcoredump.c b/drivers/gpu/drm/panthor/panthor_devcoredump.c
new file mode 100644
index 0000000000000000000000000000000000000000..5514feb3ac3214cee275b9b51388d6c33a406c7e
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_devcoredump.c
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* SPDX-FileCopyrightText: Copyright Collabora 2024 */
+
+#include <drm/drm_managed.h>
+#include <drm/drm_gem.h>
+#include <linux/gfp_types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/iosys-map.h>
+#include <linux/devcoredump.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <uapi/drm/panthor_drm.h>
+
+#include "panthor_devcoredump.h"
+#include "panthor_device.h"
+#include "panthor_fw.h"
+#include "panthor_mmu.h"
+#include "panthor_sched.h"
+
+/* A magic value used when starting a new section in the dump */
+#define PANT_DUMP_MAGIC 0x544e4150 /* PANT */
+#define PANT_DUMP_MAJOR 1
+#define PANT_DUMP_MINOR 0
+
+struct panthor_devcoredump_snapshot {
+	u8 *start;
+	u8 *curr;
+	size_t pos;
+	size_t capacity;
+};
+
+struct panthor_devcoredump {
+	/**
+	 * @capture_started: Whether the first crash has been captured.
+	 *
+	 * If multiple crashes occur, only the first one is captured and a
+	 * warning is emitted.
+	 *
+	 * This also provides the synchronization needed. If a dump has not
+	 * been captured, no data is read. If a dump has been captured, no
+	 * data is added. This ensures that all threads see a consistent
+	 * state without using locks.
+	 */
+	atomic_t capture_started;
+
+	/** @group: The group that caused the crash. */
+	struct panthor_group *group;
+
+	/**
+	 * @metadata_size: The size of the metadata sections of the dump.
+	 *
+	 * This is computed when a crash is captured and is used to size the
+	 * metadata buffer.
+	 */
+	int metadata_size;
+
+	/** @metadata_cursor: Keeps track of the dump sections. */
+	struct panthor_devcoredump_snapshot metadata_cursor;
+
+	/** @section_table: One entry per section type in the dump. */
+	struct drm_panthor_dump_section_table_entry
+		*section_table[DRM_PANTHOR_DUMP_SECTION_TYPE_COUNT];
+
+	/* Deferred snapshot buffers for the group and VM sections. */
+	struct {
+		size_t size;
+		u8 *mem;
+	} group_snapshot, vm_snapshot;
+};
+
+struct dump_group_args {
+	struct panthor_device *ptdev;
+	struct panthor_devcoredump_snapshot *alloc;
+	struct panthor_group *group;
+};
+
+struct dump_va_args {
+	struct panthor_device *ptdev;
+	struct panthor_devcoredump_snapshot *alloc;
+};
+
+/* Bump allocator over a snapshot buffer: hand out @size bytes and advance. */
+static void *alloc_bytes(struct panthor_devcoredump_snapshot *alloc,
+			 size_t size)
+{
+	void *ret;
+
+	if (alloc->pos + size > alloc->capacity)
+		return ERR_PTR(-ENOMEM);
+
+	ret = alloc->curr;
+	alloc->curr += size;
+	alloc->pos += size;
+	return ret;
+}
+
+static struct drm_panthor_dump_section_table_entry *
+alloc_section_tbl_entry(struct panthor_devcoredump_snapshot *alloc, u32 type)
+{
+	struct drm_panthor_dump_section_table_entry *entry;
+	int entry_size = sizeof(*entry);
+
+	entry = alloc_bytes(alloc, entry_size);
+	if (IS_ERR(entry))
+		return entry;
+
+	entry->type = type;
+	return entry;
+}
+
+static int compute_metadata_size(void)
+{
+	int size = 0;
+
+	size += sizeof(struct drm_panthor_dump_header);
+	size += DRM_PANTHOR_DUMP_SECTION_TYPE_COUNT *
+		sizeof(struct drm_panthor_dump_section_table_entry);
+
+	size += sizeof(struct drm_panthor_gpu_info);
+	size += sizeof(struct drm_panthor_csif_info);
+	size += sizeof(struct drm_panthor_fw_info);
+
+	return size;
+}
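+/*
+ * Overall layout of a dump as streamed out by coredump_read() below
+ * (illustrative; the authoritative offsets live in the section table):
+ *
+ *   +-------------------------------------------+
+ *   | drm_panthor_dump_header                   |
+ *   | section table (one entry per type)        |
+ *   | drm_panthor_gpu_info                      |
+ *   | drm_panthor_csif_info                     |
+ *   | drm_panthor_fw_info                       |
+ *   | group section (group info + queue infos)  |
+ *   | VM section (gpuvm header + gpuva entries) |
+ *   | raw VA data, as described by the gpuvas   |
+ *   +-------------------------------------------+
+ */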
+static ssize_t coredump_read(char *buffer, loff_t offset, size_t count,
+			     void *data, size_t datalen)
+{
+	struct panthor_devcoredump *coredump = data;
+	struct panthor_vm *vm = panthor_group_vm(coredump->group);
+	/* Offsets of the regions that make up the dump, in stream order. */
+	size_t group_start = coredump->metadata_size;
+	size_t vm_start = group_start + coredump->group_snapshot.size;
+	size_t data_start = vm_start + coredump->vm_snapshot.size;
+	size_t dumped = 0;
+	size_t chunk_size;
+
+	if (offset < group_start) {
+		chunk_size = min_t(size_t, count, group_start - offset);
+		memcpy(buffer + dumped,
+		       coredump->metadata_cursor.start + offset, chunk_size);
+		offset += chunk_size;
+		dumped += chunk_size;
+		count -= chunk_size;
+	}
+
+	if (count && offset < vm_start) {
+		chunk_size = min_t(size_t, count, vm_start - offset);
+		memcpy(buffer + dumped,
+		       coredump->group_snapshot.mem + (offset - group_start),
+		       chunk_size);
+		offset += chunk_size;
+		dumped += chunk_size;
+		count -= chunk_size;
+	}
+
+	if (count && offset < data_start) {
+		chunk_size = min_t(size_t, count, data_start - offset);
+		memcpy(buffer + dumped,
+		       coredump->vm_snapshot.mem + (offset - vm_start),
+		       chunk_size);
+		offset += chunk_size;
+		dumped += chunk_size;
+		count -= chunk_size;
+	}
+
+	if (!count)
+		return dumped;
+
+	/* Whatever remains is raw BO data from the VM. */
+	dumped += panthor_vm_dump_section(vm, buffer + dumped,
+					  offset - data_start, count);
+
+	return dumped;
+}
+
+static void coredump_init(struct panthor_devcoredump *coredump)
+{
+	memset(coredump, 0, sizeof(*coredump));
+	atomic_set(&coredump->capture_started, 0);
+}
+
+static void coredump_free(void *data)
+{
+	struct panthor_devcoredump *coredump = data;
+	struct panthor_vm *vm;
+
+	/* Nothing was captured, so there is nothing to free. */
+	if (!data)
+		return;
+
+	vm = panthor_group_vm(coredump->group);
+	panthor_vm_snapshot_destroy(vm);
+	kvfree(coredump->group_snapshot.mem);
+	kvfree(coredump->vm_snapshot.mem);
+	kvfree(coredump->metadata_cursor.start);
+	coredump_init(coredump);
+}
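+/*
+ * Fill in the metadata part of the dump: the header, the section table and
+ * the GPU/CSIF/FW info sections, in the exact order accounted for by
+ * compute_metadata_size(). Every write is carved out of the preallocated
+ * metadata buffer, so -ENOMEM here means the two functions went out of
+ * sync.
+ */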
+static int metadata_snapshot(struct panthor_devcoredump *coredump,
+			     struct panthor_device *ptdev)
+{
+	struct drm_panthor_dump_header *hdr;
+	struct drm_panthor_dump_section_table_entry *current_section;
+	struct drm_panthor_gpu_info *gpu_info;
+	struct drm_panthor_csif_info *csif_info;
+	struct drm_panthor_fw_info *fw_info;
+	int i;
+
+	hdr = alloc_bytes(&coredump->metadata_cursor,
+			  sizeof(struct drm_panthor_dump_header));
+	if (IS_ERR(hdr))
+		return PTR_ERR(hdr);
+
+	*hdr = (struct drm_panthor_dump_header){
+		.magic = PANT_DUMP_MAGIC,
+		.version = { PANT_DUMP_MAJOR, PANT_DUMP_MINOR },
+		.section_count = DRM_PANTHOR_DUMP_SECTION_TYPE_COUNT,
+	};
+
+	for (i = 0; i < DRM_PANTHOR_DUMP_SECTION_TYPE_COUNT; i++) {
+		coredump->section_table[i] =
+			alloc_section_tbl_entry(&coredump->metadata_cursor, i);
+
+		if (IS_ERR(coredump->section_table[i]))
+			return PTR_ERR(coredump->section_table[i]);
+	}
+
+	current_section =
+		coredump->section_table[DRM_PANTHOR_DUMP_SECTION_TYPE_GPU_INFO];
+	current_section->offset = coredump->metadata_cursor.pos;
+	current_section->size = sizeof(*gpu_info);
+
+	gpu_info =
+		alloc_bytes(&coredump->metadata_cursor, current_section->size);
+	if (IS_ERR(gpu_info))
+		return PTR_ERR(gpu_info);
+
+	*gpu_info = ptdev->gpu_info;
+
+	current_section =
+		coredump->section_table[DRM_PANTHOR_DUMP_SECTION_TYPE_CSIF_INFO];
+	current_section->offset = coredump->metadata_cursor.pos;
+	current_section->size = sizeof(*csif_info);
+
+	csif_info =
+		alloc_bytes(&coredump->metadata_cursor, current_section->size);
+	if (IS_ERR(csif_info))
+		return PTR_ERR(csif_info);
+
+	*csif_info = ptdev->csif_info;
+
+	current_section =
+		coredump->section_table[DRM_PANTHOR_DUMP_SECTION_TYPE_FW_INFO];
+	current_section->offset = coredump->metadata_cursor.pos;
+	current_section->size = sizeof(*fw_info);
+
+	fw_info =
+		alloc_bytes(&coredump->metadata_cursor, current_section->size);
+	if (IS_ERR(fw_info))
+		return PTR_ERR(fw_info);
+
+	*fw_info = ptdev->fw_info;
+
+	return 0;
+}
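+/**
+ * panthor_devcoredump_capture() - Capture a crash dump for a group
+ * @ptdev: Device.
+ * @group: The group that caused the crash.
+ *
+ * Called from the group termination path, so only GFP_NOWAIT allocations
+ * are used: failing the dump beats stalling reclaim here. Handing the data
+ * to userspace is deferred to coredump_read().
+ */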
+int panthor_devcoredump_capture(struct panthor_device *ptdev,
+				struct panthor_group *group)
+{
+	u8 *mem;
+	struct panthor_devcoredump *coredump = ptdev->coredump;
+	int ret = 0;
+	struct panthor_vm *vm;
+
+	if (atomic_cmpxchg(&coredump->capture_started, 0, 1)) {
+		drm_dbg(&ptdev->base,
+			"Multiple hangs occurred, but only the first snapshot was taken\n");
+		return ret;
+	}
+
+	coredump->group = group;
+	coredump->metadata_size = compute_metadata_size();
+	vm = panthor_group_vm(group);
+
+	/* This will not be used for the BOs, so it's a reasonably sized allocation. */
+	mem = kvzalloc(coredump->metadata_size, GFP_NOWAIT);
+	if (!mem) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	coredump->metadata_cursor = (struct panthor_devcoredump_snapshot){
+		.start = mem,
+		.curr = mem,
+		.pos = 0,
+		.capacity = coredump->metadata_size,
+	};
+
+	ret = metadata_snapshot(coredump, ptdev);
+	if (ret)
+		goto free_mem;
+
+	ret = panthor_vm_snapshot(vm, &coredump->vm_snapshot.mem,
+				  &coredump->vm_snapshot.size);
+	if (ret)
+		goto free_mem;
+
+	ret = panthor_devcoredump_group_snapshot(coredump, group);
+	if (ret)
+		goto free_vm_snapshot;
+
+	/* Now that the snapshot sizes are known, finish the section table. */
+	coredump->section_table[DRM_PANTHOR_DUMP_SECTION_TYPE_GROUP_INFO]->offset =
+		coredump->metadata_size;
+	coredump->section_table[DRM_PANTHOR_DUMP_SECTION_TYPE_GROUP_INFO]->size =
+		coredump->group_snapshot.size;
+	coredump->section_table[DRM_PANTHOR_DUMP_SECTION_TYPE_VM]->offset =
+		coredump->metadata_size + coredump->group_snapshot.size;
+	coredump->section_table[DRM_PANTHOR_DUMP_SECTION_TYPE_VM]->size =
+		coredump->vm_snapshot.size +
+		((struct drm_panthor_dump_gpuvm *)coredump->vm_snapshot.mem)->data_size;
+
+	dev_coredumpm(ptdev->base.dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
+		      coredump_read, coredump_free);
+	return ret;
+
+free_vm_snapshot:
+	panthor_vm_snapshot_destroy(vm);
+	kvfree(coredump->vm_snapshot.mem);
+	coredump->vm_snapshot.mem = NULL;
+free_mem:
+	kvfree(mem);
+fail:
+	drm_WARN(&ptdev->base, 1, "coredump failed (%d)\n", ret);
+	atomic_set(&coredump->capture_started, 0);
+	return ret;
+}
+
+static void coredump_destroy(struct drm_device *drm, void *arg)
+{
+	/* ptdev->coredump itself is drmm-managed; just drop any pending dump. */
+	dev_coredump_put(drm->dev);
+}
+
+int panthor_devcoredump_init(struct panthor_device *ptdev)
+{
+	struct panthor_devcoredump *coredump =
+		drmm_kzalloc(&ptdev->base, sizeof(*coredump), GFP_KERNEL);
+
+	if (!coredump)
+		return -ENOMEM;
+
+	coredump_init(coredump);
+	ptdev->coredump = coredump;
+
+	return drmm_add_action_or_reset(&ptdev->base, coredump_destroy, ptdev);
+}
+
+int panthor_devcoredump_group_snapshot(struct panthor_devcoredump *coredump,
+				       struct panthor_group *group)
+{
+	/* Replace any snapshot taken by an earlier status update. */
+	kvfree(coredump->group_snapshot.mem);
+	return panthor_group_snapshot(group, &coredump->group_snapshot.mem,
+				      &coredump->group_snapshot.size);
+}
\ No newline at end of file
diff --git a/drivers/gpu/drm/panthor/panthor_devcoredump.h b/drivers/gpu/drm/panthor/panthor_devcoredump.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2ee717ceaebc0883127bec6add3eee48c3b55fa
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_devcoredump.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* SPDX-FileCopyrightText: Copyright Collabora 2024 */
+
+#ifndef __PANTHOR_DEVCOREDUMP_H__
+#define __PANTHOR_DEVCOREDUMP_H__
+
+#include "panthor_device.h"
+#include "panthor_sched.h"
+
+int panthor_devcoredump_init(struct panthor_device *ptdev);
+int panthor_devcoredump_capture(struct panthor_device *ptdev,
+				struct panthor_group *group);
+
+int panthor_devcoredump_group_snapshot(struct panthor_devcoredump *coredump,
+				       struct panthor_group *group);
+
+#endif /* __PANTHOR_DEVCOREDUMP_H__ */
diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c
index 4082c8f2951dfdace7f73a24d6fe34e9e7f920eb..0c6fd39e91d4416e9b16b06c559ddd746b09eda1 100644
--- a/drivers/gpu/drm/panthor/panthor_device.c
+++ b/drivers/gpu/drm/panthor/panthor_device.c
@@ -14,6 +14,7 @@
 #include <drm/drm_drv.h>
 #include <drm/drm_managed.h>
 
+#include "panthor_devcoredump.h"
 #include "panthor_devfreq.h"
 #include "panthor_device.h"
 #include "panthor_fw.h"
@@ -242,6 +243,10 @@ int panthor_device_init(struct panthor_device *ptdev)
 	if (ret)
 		goto err_unplug_fw;
 
+	ret = panthor_devcoredump_init(ptdev);
+	if (ret)
+		goto err_unplug_sched;
+
 	/* ~3 frames */
 	pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50);
 	pm_runtime_use_autosuspend(ptdev->base.dev);
@@ -255,8 +260,8 @@ int panthor_device_init(struct panthor_device *ptdev)
 
 err_disable_autosuspend:
 	pm_runtime_dont_use_autosuspend(ptdev->base.dev);
+err_unplug_sched:
 	panthor_sched_unplug(ptdev);
-
 err_unplug_fw:
 	panthor_fw_unplug(ptdev);
diff --git
a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 224c53dcfe6d5bca35ab68d3f5dd3f5c3016a3e3..ee3f131ba3142dd1b319cc5cea3c16672c031059 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -19,6 +19,7 @@
 
 struct panthor_csf;
 struct panthor_csf_ctx;
+struct panthor_devcoredump;
 struct panthor_device;
 struct panthor_gpu;
 struct panthor_group_pool;
@@ -165,6 +166,9 @@ struct panthor_device {
 		 */
 		struct page *dummy_latest_flush;
 	} pm;
+
+	/** @coredump: Data used by devcoredump on crashes. */
+	struct panthor_devcoredump *coredump;
 };
 
 /**
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 37f1885c54c7b9d6fda1b93fa03dd313302bb89b..d662a1b02c5bbb4e29972324b802258584ec1869 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -38,6 +38,13 @@
 
 struct panthor_vm;
 
+struct panthor_va_snapshot {
+	struct panthor_gem_object *bo;
+	u64 addr;
+	u64 offset;
+	u64 range;
+};
+
 /**
  * struct panthor_as_slot - Address space slot
  */
@@ -386,6 +393,17 @@ struct panthor_vm {
 	 * flagged as faulty as a result.
	 */
 	bool unhandled_fault;
+
+	struct {
+		/**
+		 * @va_snapshots: An array of VA ranges (and their backing
+		 * BOs) referenced by the coredump code.
+		 *
+		 * This is only valid if a snapshot has been captured. See
+		 * panthor_vm_snapshot().
+		 */
+		struct panthor_va_snapshot *va_snapshots;
+
+		/** @num_vas: Number of entries in @va_snapshots. */
+		size_t num_vas;
+	} coredump;
 };
 
 /**
@@ -2634,6 +2652,141 @@ int panthor_vm_prepare_mapped_bos_resvs(struct drm_exec *exec, struct panthor_vm
 	return drm_gpuvm_prepare_objects(&vm->base, exec, slot_count);
 }
 
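+/*
+ * Layout of the VM section, as produced by panthor_vm_snapshot() and
+ * streamed out by panthor_vm_dump_section() (illustrative):
+ *
+ *   drm_panthor_dump_gpuvm            <- header, va_count entries follow
+ *   drm_panthor_dump_gpuva[va_count]  <- addr/range/data_offset per VA
+ *   raw BO data for each VA, in the same order, at its data_offset
+ *
+ * The BO contents are not copied at snapshot time; the BOs are pinned by
+ * taking a reference and only vmap()-ed when the dump is actually read.
+ */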
+/**
+ * panthor_vm_snapshot() - Dump the VM state to a buffer
+ * @vm: The VM to snapshot.
+ * @snapshot: Returns the VM section header and VA descriptors.
+ * @snapshot_size: Returns the size of @snapshot.
+ *
+ * The BOs backing each VA range are referenced so their contents can be
+ * dumped later by panthor_vm_dump_section().
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int panthor_vm_snapshot(struct panthor_vm *vm, u8 **snapshot,
+			size_t *snapshot_size)
+{
+	int ret = 0;
+	int va_count = 0;
+	u64 va_offset = 0;
+	u64 data_size = 0;
+	int i = 0;
+	struct drm_panthor_dump_gpuvm *gpuvm;
+	void *cursor;
+	size_t size = sizeof(struct drm_panthor_dump_gpuvm);
+	struct drm_gpuva *va;
+
+	mutex_lock(&vm->op_lock);
+	drm_gpuvm_for_each_va(va, &vm->base) {
+		va_count++;
+		size += sizeof(struct drm_panthor_dump_gpuva);
+		data_size += va->va.range;
+	}
+
+	*snapshot = cursor = kvzalloc(size, GFP_NOWAIT);
+	if (!cursor) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	*snapshot_size = size;
+
+	vm->coredump.va_snapshots = kvcalloc(va_count,
+					     sizeof(*vm->coredump.va_snapshots),
+					     GFP_NOWAIT);
+	if (!vm->coredump.va_snapshots) {
+		ret = -ENOMEM;
+		goto free_snapshot;
+	}
+
+	gpuvm = (struct drm_panthor_dump_gpuvm *)cursor;
+
+	*gpuvm = (struct drm_panthor_dump_gpuvm) {
+		/* TODO: data_offset is currently unused; we should probably
+		 * enforce 8-byte alignment when computing the section table.
+		 */
+		.data_size = data_size,
+		.va_count = va_count,
+	};
+
+	cursor += sizeof(struct drm_panthor_dump_gpuvm);
+
+	drm_gpuvm_for_each_va(va, &vm->base) {
+		struct drm_panthor_dump_gpuva *dump_va = cursor;
+
+		*dump_va = (struct drm_panthor_dump_gpuva) {
+			.addr = va->va.addr,
+			.range = va->va.range,
+			.data_offset = va_offset,
+		};
+
+		va_offset += va->va.range;
+		cursor += sizeof(struct drm_panthor_dump_gpuva);
+
+		drm_gem_object_get(va->gem.obj);
+		vm->coredump.va_snapshots[i++] = (struct panthor_va_snapshot) {
+			.bo = to_panthor_bo(va->gem.obj),
+			.addr = va->va.addr,
+			.offset = va->gem.offset,
+			.range = va->va.range,
+		};
+	}
+
+	vm->coredump.num_vas = va_count;
+
+out:
+	mutex_unlock(&vm->op_lock);
+	return ret;
+
+free_snapshot:
+	kvfree(*snapshot);
+	*snapshot = NULL;
+	mutex_unlock(&vm->op_lock);
+	return ret;
+}
+
+void panthor_vm_snapshot_destroy(struct panthor_vm *vm)
+{
+	int i;
+
+	mutex_lock(&vm->op_lock);
+	for (i = 0; i < vm->coredump.num_vas; i++)
+		drm_gem_object_put(&vm->coredump.va_snapshots[i].bo->base.base);
+
+	kvfree(vm->coredump.va_snapshots);
+	vm->coredump.va_snapshots = NULL;
+	vm->coredump.num_vas = 0;
+	mutex_unlock(&vm->op_lock);
+}
+
+ssize_t panthor_vm_dump_section(struct panthor_vm *vm, char *buffer,
+				size_t vm_start_offset, size_t buffer_size)
+{
+	size_t vm_cur_offset = 0;
+	size_t total_read = 0;
+	int ret;
+	int i;
+
+	for (i = 0; i < vm->coredump.num_vas && total_read < buffer_size; i++) {
+		struct panthor_va_snapshot *va_entry =
+			&vm->coredump.va_snapshots[i];
+		struct iosys_map map = {};
+		size_t skip, bo_chunk_size;
+
+		/* Skip VA ranges that end before the requested offset. */
+		if (vm_cur_offset + va_entry->range <= vm_start_offset) {
+			vm_cur_offset += va_entry->range;
+			continue;
+		}
+
+		skip = vm_start_offset > vm_cur_offset ?
+			       vm_start_offset - vm_cur_offset :
+			       0;
+
+		ret = drm_gem_vmap_unlocked(&va_entry->bo->base.base, &map);
+		if (ret)
+			return ret;
+
+		bo_chunk_size = min(buffer_size - total_read,
+				    va_entry->range - skip);
+
+		memcpy(buffer + total_read,
+		       map.vaddr + va_entry->offset + skip, bo_chunk_size);
+		drm_gem_vunmap_unlocked(&va_entry->bo->base.base, &map);
+
+		total_read += bo_chunk_size;
+		vm_cur_offset += va_entry->range;
+	}
+
+	return total_read;
+}
+
 /**
  * panthor_mmu_unplug() - Unplug the MMU logic
  * @ptdev: Device.
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h
index 6788771071e35557ccde471301bf4aa2ef32ec8f..65dc2d647afc63063ff9b63cd15005e4a5f553cb 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.h
+++ b/drivers/gpu/drm/panthor/panthor_mmu.h
@@ -8,6 +8,7 @@
 #include <linux/dma-resv.h>
 
 struct drm_exec;
+struct drm_gpuva;
 struct drm_sched_job;
 struct panthor_gem_object;
 struct panthor_heap_pool;
@@ -52,6 +53,13 @@ void panthor_vm_add_job_fence_to_bos_resvs(struct panthor_vm *vm,
 					   struct drm_sched_job *job);
 
 struct dma_resv *panthor_vm_resv(struct panthor_vm *vm);
+
+int panthor_vm_snapshot(struct panthor_vm *vm, u8 **snapshot,
+			size_t *snapshot_size);
+void panthor_vm_snapshot_destroy(struct panthor_vm *vm);
+ssize_t panthor_vm_dump_section(struct panthor_vm *vm, char *buffer,
+				size_t vm_start_offset, size_t buffer_size);
+
 struct drm_gem_object *panthor_vm_root_gem(struct panthor_vm *vm);
 
 void panthor_vm_pool_destroy(struct panthor_file *pfile);
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 3d0cedcea8556fd94f594d1f33a41c6e3be22711..549548f9c633b6806e0f083b1a7d15599b52c0c8 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -22,6 +22,7 @@
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 
+#include "panthor_devcoredump.h"
 #include "panthor_devfreq.h"
 #include "panthor_device.h"
 #include "panthor_fw.h"
@@ -476,6 +477,8 @@ struct panthor_queue {
 		 */
 		struct list_head in_flight_jobs;
 	} fence_ctx;
+
+	/** @dump_info: Queue state captured on fault for the coredump. */
+	struct drm_panthor_dump_queue_info dump_info;
 };
 
 /**
@@ -659,6 +662,8 @@ struct panthor_group {
 	 * panthor_group::groups::waiting list.
	 */
 	struct list_head wait_node;
+
+	/** @dump_info: Group state captured on fault for the coredump. */
+	struct drm_panthor_dump_group_info dump_info;
 };
 
 /**
@@ -776,6 +781,11 @@ struct panthor_job {
 	struct dma_fence *done_fence;
 };
 
+struct panthor_vm *panthor_group_vm(struct panthor_group *group)
+{
+	return group->vm;
+}
+
 static void
 panthor_queue_put_syncwait_obj(struct panthor_queue *queue)
 {
@@ -1073,6 +1083,56 @@ csg_slot_sync_priority_locked(struct panthor_device *ptdev, u32 csg_id)
 	csg_slot->priority = (csg_iface->input->endpoint_req &
 			      CSG_EP_REQ_PRIORITY_MASK) >> 28;
 }
 
+static bool
+group_is_faulty(struct panthor_group *group)
+{
+	return group->fatal_queues != 0 || group->timedout;
+}
+
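+/*
+ * Read the FW-visible state of one queue into @info. This has to run while
+ * the group still owns a CSG slot: panthor_fw_get_cs_iface() needs a live
+ * CS interface, which is gone once the group is evicted.
+ */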
+static int fill_queue_dump_info(struct panthor_group *group, u32 cs_id,
+				struct drm_panthor_dump_queue_info *info)
+{
+	struct panthor_queue *queue;
+	struct panthor_fw_cs_iface *cs_iface;
+
+	if (cs_id >= group->queue_count)
+		return -EINVAL;
+
+	queue = group->queues[cs_id];
+	if (!queue)
+		return -EINVAL;
+
+	cs_iface = panthor_fw_get_cs_iface(group->ptdev, group->csg_id, cs_id);
+	if (!cs_iface)
+		return -EINVAL;
+
+	info->cs_id = cs_id;
+	info->ringbuf_insert = queue->iface.input->insert;
+	info->ringbuf_extract = queue->iface.output->extract;
+	info->ringbuf_gpuva = panthor_kernel_bo_gpuva(queue->ringbuf);
+	info->ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
+
+	info->last_state.status_cmd_ptr = cs_iface->output->status_cmd_ptr;
+	info->last_state.status_wait = cs_iface->output->status_wait;
+	info->last_state.status_req_resource =
+		cs_iface->output->status_req_resource;
+	info->last_state.status_wait_sync_ptr =
+		cs_iface->output->status_wait_sync_ptr;
+	info->last_state.status_wait_sync_value =
+		((u64)cs_iface->output->status_wait_sync_value_hi << 32) |
+		cs_iface->output->status_wait_sync_value;
+	info->last_state.status_scoreboards =
+		cs_iface->output->status_scoreboards;
+	info->last_state.status_blocked_reason =
+		cs_iface->output->status_blocked_reason;
+	info->last_state.fault = cs_iface->output->fault;
+	info->last_state.fatal = cs_iface->output->fatal;
+	info->last_state.fault_info = cs_iface->output->fault_info;
+	info->last_state.fatal_info = cs_iface->output->fatal_info;
+
+	return 0;
+}
+
 /**
  * cs_slot_sync_queue_state_locked() - Synchronize the queue slot priority
  * @ptdev: Device.
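@@ -1124,6 +1184,9 @@ cs_slot_sync_queue_state_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs
 		 */
 		break;
 	}
+
+	/*
+	 * The group is on its way out: stash the FW-visible queue state
+	 * while the CS interface is still live, since the coredump code
+	 * reads this copy after the slot has been torn down.
+	 */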
+	if (group_is_faulty(group))
+		fill_queue_dump_info(group, cs_id, &queue->dump_info);
 }
 
 static void
@@ -1131,13 +1194,20 @@ csg_slot_sync_queues_state_locked(struct panthor_device *ptdev, u32 csg_id)
 {
 	struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id];
 	struct panthor_group *group = csg_slot->group;
+	struct panthor_devcoredump *coredump = ptdev->coredump;
 	u32 i;
+	int ret;
 
 	lockdep_assert_held(&ptdev->scheduler->lock);
 
 	group->idle_queues = 0;
 	group->blocked_queues = 0;
 
 	for (i = 0; i < group->queue_count; i++) {
 		if (group->queues[i])
 			cs_slot_sync_queue_state_locked(ptdev, csg_id, i);
 	}
+
+	/*
+	 * Take the group snapshot after the queue loop, so that it sees the
+	 * dump_info that cs_slot_sync_queue_state_locked() just filled in.
+	 */
+	if (group_is_faulty(group)) {
+		ret = panthor_devcoredump_group_snapshot(coredump, group);
+		drm_WARN_ON(&group->ptdev->base, ret);
+	}
 }
@@ -1293,14 +1363,15 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
 	if (group)
 		group->fatal_queues |= BIT(cs_id);
 
+	/* We need a status update for the coredump. */
+	sched_queue_delayed_work(sched, tick, 0);
+
 	if (CS_EXCEPTION_TYPE(fatal) == DRM_PANTHOR_EXCEPTION_CS_UNRECOVERABLE) {
 		/* If this exception is unrecoverable, queue a reset, and make
 		 * sure we stop scheduling groups until the reset has happened.
 		 */
 		panthor_device_schedule_reset(ptdev);
 		cancel_delayed_work(&sched->tick_work);
-	} else {
-		sched_queue_delayed_work(sched, tick, 0);
 	}
 
 	drm_warn(&ptdev->base,
@@ -2031,6 +2102,10 @@ group_term_post_processing(struct panthor_group *group)
 		list_del_init(&job->node);
 		panthor_job_put(&job->base);
 	}
+
+	if (group_is_faulty(group))
+		panthor_devcoredump_capture(group->ptdev, group);
 }
 
 static void group_term_work(struct work_struct *work)
@@ -2815,6 +2890,91 @@ static void group_sync_upd_work(struct work_struct *work)
 	group_put(group);
 }
 
+static int queue_dump_info(struct panthor_group *group, u32 cs_id,
+			   struct drm_panthor_dump_queue_info *info)
+{
+	struct panthor_queue *queue;
+
+	if (drm_WARN_ON(&group->ptdev->base, cs_id >= group->queue_count))
+		return -EINVAL;
+
+	queue = group->queues[cs_id];
+	if (!queue) {
+		drm_warn(&group->ptdev->base, "No queue found for cs_id %u\n",
+			 cs_id);
+		return -EINVAL;
+	}
+
+	/* The FW-visible state was stashed at fault time; copy it wholesale. */
+	*info = queue->dump_info;
+	info->cs_id = cs_id;
+
+	return 0;
+}
+
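+/*
+ * Serialize the group section of the dump: a drm_panthor_dump_group_info
+ * header followed by queue_count drm_panthor_dump_queue_info records, in
+ * queue order. On success the returned buffer is owned by the caller and
+ * must be kvfree()d.
+ */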
+int panthor_group_snapshot(struct panthor_group *group, u8 **snapshot,
+			   size_t *snapshot_size)
+{
+	int i;
+	size_t size;
+	void *cursor;
+	int ret = 0;
+	struct drm_panthor_dump_group_info *group_info;
+
+	size = sizeof(struct drm_panthor_dump_group_info);
+	size += group->queue_count *
+		sizeof(struct drm_panthor_dump_queue_info);
+
+	*snapshot = cursor = kvzalloc(size, GFP_KERNEL);
+	if (!cursor)
+		return -ENOMEM;
+
+	*snapshot_size = size;
+
+	group_info = cursor;
+
+	*group_info = (struct drm_panthor_dump_group_info) {
+		.queue_count = group->queue_count,
+		.faulty_bitmask = group->fatal_queues,
+	};
+
+	cursor += sizeof(*group_info);
+
+	for (i = 0; i < group->queue_count; i++) {
+		struct drm_panthor_dump_queue_info *queue_info = cursor;
+
+		ret = queue_dump_info(group, i, queue_info);
+		if (ret)
+			break;
+
+		cursor += sizeof(*queue_info);
+	}
+
+	if (ret) {
+		kvfree(*snapshot);
+		*snapshot = NULL;
+	}
+
+	return ret;
+}
+
 static struct dma_fence *
 queue_run_job(struct drm_sched_job *sched_job)
 {
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index 3a30d2328b308df6eb5bf6204b07fa2ec33d8539..1c08722b9dd816fc929c4f22b0a8ed8f781e90fe 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -17,6 +17,9 @@ struct panthor_device;
 struct panthor_file;
 struct panthor_group_pool;
 struct panthor_job;
+struct panthor_group;
+struct drm_panthor_dump_group_info;
+struct drm_panthor_dump_queue_info;
 
 int panthor_group_create(struct panthor_file *pfile,
 			 const struct drm_panthor_group_create *group_args,
@@ -41,6 +44,12 @@ int panthor_sched_init(struct panthor_device *ptdev);
 void panthor_sched_unplug(struct panthor_device *ptdev);
 void panthor_sched_pre_reset(struct panthor_device *ptdev);
 void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed);
+
+int panthor_group_snapshot(struct panthor_group *group, u8 **snapshot,
+			   size_t *snapshot_size);
+
+struct panthor_vm *panthor_group_vm(struct panthor_group *group);
+
 void panthor_sched_suspend(struct panthor_device *ptdev);
 void panthor_sched_resume(struct panthor_device *ptdev);
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 4cca9b05d7ee6f8f15225d3b3c48c5a9e66b04d4..3d39edbeeaaff3a2b37da7acb4574cf123f771e2 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -998,6 +998,178 @@ struct drm_panthor_tiler_heap_destroy {
 	__u32 pad;
 };
 
+/**
+ * enum drm_panthor_dump_section_type - Identifies the type of data that follows
+ * in a panthor core dump.
+ *
+ * New sections should only be added at the end, before
+ * DRM_PANTHOR_DUMP_SECTION_TYPE_COUNT.
+ */
+enum drm_panthor_dump_section_type {
+	/**
+	 * @DRM_PANTHOR_DUMP_SECTION_TYPE_GPU_INFO: GPU information.
+	 */
+	DRM_PANTHOR_DUMP_SECTION_TYPE_GPU_INFO = 0,
+
+	/**
+	 * @DRM_PANTHOR_DUMP_SECTION_TYPE_CSIF_INFO: Command stream interface
+	 * information.
+	 */
+	DRM_PANTHOR_DUMP_SECTION_TYPE_CSIF_INFO,
+
+	/**
+	 * @DRM_PANTHOR_DUMP_SECTION_TYPE_FW_INFO: Information about the
+	 * firmware.
+	 */
+	DRM_PANTHOR_DUMP_SECTION_TYPE_FW_INFO,
+
+	/**
+	 * @DRM_PANTHOR_DUMP_SECTION_TYPE_VM: A dump of the VM for the context.
+	 */
+	DRM_PANTHOR_DUMP_SECTION_TYPE_VM,
+
+	/**
+	 * @DRM_PANTHOR_DUMP_SECTION_TYPE_GROUP_INFO: Describes a faulty group.
+	 */
+	DRM_PANTHOR_DUMP_SECTION_TYPE_GROUP_INFO,
+
+	/**
+	 * @DRM_PANTHOR_DUMP_SECTION_TYPE_COUNT: The number of section types
+	 * possible.
+	 */
+	DRM_PANTHOR_DUMP_SECTION_TYPE_COUNT,
+};
+
+/**
+ * struct drm_panthor_dump_section_table_entry - Entry in the dump's section
+ * table.
+ */
+struct drm_panthor_dump_section_table_entry {
+	/** @type: One of enum drm_panthor_dump_section_type. */
+	__u32 type;
+
+	/** @flags: Section flags. Currently unused and set to zero. */
+	__u32 flags;
+
+	/** @offset: Offset of the section from the start of the dump. */
+	__u64 offset;
+
+	/** @size: Size of the section in bytes. */
+	__u64 size;
+};
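+/*
+ * Section table entries are written in enum order, so a consumer can index
+ * the table directly. An illustrative userspace lookup (assuming @dump
+ * holds the whole blob and the header has been validated):
+ *
+ *	const struct drm_panthor_dump_section_table_entry *table =
+ *		(const void *)(dump + sizeof(struct drm_panthor_dump_header));
+ *	const struct drm_panthor_dump_section_table_entry *vm =
+ *		&table[DRM_PANTHOR_DUMP_SECTION_TYPE_VM];
+ *
+ * vm->offset and vm->size then bound the VM section within the dump.
+ */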
+/**
+ * struct drm_panthor_dump_header - Header of a panthor core dump.
+ */
+struct drm_panthor_dump_header {
+	/** @magic: Always set to PANT (0x544e4150). */
+	__u32 magic;
+
+	/** @version: Version of the dump format. */
+	struct {
+		/** @major: Major version number. */
+		__u32 major;
+
+		/** @minor: Minor version number. */
+		__u32 minor;
+	} version;
+
+	/** @section_count: The number of entries in the section table. */
+	__u32 section_count;
+};
+
+#define DRM_PANTHOR_DUMP_QUEUE_INFO_FLAGS_FAULTY (1 << 0)
+
+/**
+ * struct drm_panthor_dump_queue_info - Queue information for a Panthor GPU
+ * dump.
+ *
+ * This structure is used to hold information about a queue when performing a
+ * dump of the state of a Panthor GPU.
+ */
+struct drm_panthor_dump_queue_info {
+	/** @flags: Combination of DRM_PANTHOR_DUMP_QUEUE_INFO_FLAGS_* values. */
+	__u32 flags;
+
+	/** @cs_id: The ID of the command stream. */
+	__s32 cs_id;
+
+	/** @ringbuf_gpuva: The GPU virtual address of the ring buffer. */
+	__u64 ringbuf_gpuva;
+
+	/**
+	 * @ringbuf_insert: The insert point (i.e.: offset) in the ring buffer.
+	 * This is where an instruction would be inserted next by the CPU.
+	 */
+	__u64 ringbuf_insert;
+
+	/**
+	 * @ringbuf_extract: The extract point (i.e.: offset) in the ring
+	 * buffer. This is where the GPU would read the next instruction.
+	 */
+	__u64 ringbuf_extract;
+
+	/** @ringbuf_size: The size of the ring buffer. */
+	__u64 ringbuf_size;
+
+	/** @last_state: The last CS interface state seen by the kernel. */
+	struct {
+		/** @status_cmd_ptr: Program pointer current value. */
+		__u64 status_cmd_ptr;
+
+		/** @status_wait: Wait condition status register. */
+		__u32 status_wait;
+
+		/**
+		 * @status_req_resource: Indicates the resources requested by
+		 * the command stream.
+		 */
+		__u32 status_req_resource;
+
+		/** @status_wait_sync_ptr: Sync object pointer. */
+		__u64 status_wait_sync_ptr;
+
+		/** @status_wait_sync_value: Sync object value. */
+		__u64 status_wait_sync_value;
+
+		/** @status_scoreboards: Scoreboard slot usage. */
+		__u32 status_scoreboards;
+
+		/** @status_blocked_reason: Reason the stream is blocked. */
+		__u32 status_blocked_reason;
+
+		/** @fault: Recoverable fault information. */
+		__u32 fault;
+
+		/** @fatal: Unrecoverable fault information. */
+		__u32 fatal;
+
+		/** @fault_info: Recoverable fault additional information. */
+		__u64 fault_info;
+
+		/** @fatal_info: Unrecoverable fault additional information. */
+		__u64 fatal_info;
+	} last_state;
+};
+
+/**
+ * struct drm_panthor_dump_group_info - Group information for a Panthor GPU
+ * dump.
+ *
+ * This structure is used to hold information about a group when performing a
+ * dump of the state of a Panthor GPU. @queue_count struct
+ * drm_panthor_dump_queue_info entries follow this header in the dump.
+ */
+struct drm_panthor_dump_group_info {
+	/** @queue_count: The number of queues in the group. */
+	__u32 queue_count;
+
+	/** @faulty_bitmask: A bitmask denoting the faulty queues. */
+	__u32 faulty_bitmask;
+};
+
+/**
+ * struct drm_panthor_dump_gpuvm - Header of the VM section of the dump.
+ */
+struct drm_panthor_dump_gpuvm {
+	/**
+	 * @data_offset: Offset of the VA data, relative to the start of the
+	 * VM section.
+	 *
+	 * Needed if we want to enforce some alignment so things can be
+	 * directly mmap()-ed from the file.
+	 */
+	__u64 data_offset;
+
+	/** @data_size: Size of the data referenced by the VA regions. */
+	__u64 data_size;
+
+	/** @va_count: Number of VA regions in this section. */
+	__u64 va_count;
+};
+
+/**
+ * struct drm_panthor_dump_gpuva - Describes a GPU VA range in the dump.
+ */
+struct drm_panthor_dump_gpuva {
+	/** @addr: The start address for the mapping. */
+	__u64 addr;
+
+	/** @range: The range covered by the VA mapping. */
+	__u64 range;
+
+	/**
+	 * @data_offset: The offset at which this VA range can be found,
+	 * relative to the start of the VM section.
+	 */
+	__u64 data_offset;
+};
+
 #if defined(__cplusplus)
 }
 #endif
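
---

For reviewers: a minimal userspace sketch of how a consumer might walk the
dump produced above. This is illustrative only; it assumes the blob has
already been read from the devcoredump sysfs node into memory, and that the
uapi additions in this patch are visible as <drm/panthor_drm.h>.

	#include <stdint.h>
	#include <stdio.h>
	#include <stddef.h>

	#include <drm/panthor_drm.h>

	#define PANT_DUMP_MAGIC 0x544e4150u /* "PANT" */

	static int parse_dump(const uint8_t *dump, size_t len)
	{
		const struct drm_panthor_dump_header *hdr = (const void *)dump;
		const struct drm_panthor_dump_section_table_entry *table;
		uint32_t i;

		if (len < sizeof(*hdr) || hdr->magic != PANT_DUMP_MAGIC)
			return -1;

		/* Only major version 1 is understood by this sketch. */
		if (hdr->version.major != 1)
			return -1;

		table = (const void *)(dump + sizeof(*hdr));
		if (len < sizeof(*hdr) + hdr->section_count * sizeof(*table))
			return -1;

		for (i = 0; i < hdr->section_count; i++)
			printf("section %u: type=%u offset=%llu size=%llu\n", i,
			       table[i].type,
			       (unsigned long long)table[i].offset,
			       (unsigned long long)table[i].size);

		return 0;
	}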