Commit 96d7356d authored by Daniel Almeida

Introduce Tyr


Rust driver for ARM Mali CSF-based GPUs

The skeleton is largely taken from Nova and from rust_platform_driver.rs.

So far, this is just a very early-stage experiment, but it looks promising:

- We use the same uAPI as Panthor, although this needs a bit of work, since
  bindgen cannot translate #defines into Rust.

- The DRM registration and a few IOCTLs are implemented. There is an igt
  branch with tests.

- Basic iomem and register set implementation, so it's possible to program
  the device.

- IRQ handling, so we can receive notifications from the device.

- We can boot the firmware.

- We can communicate with CSF using the global interface. We can submit
  requests and the MCU will appropriately respond in the ack field.

- There is GEM_CREATE and VM_BIND support.

- We can send a PING request to CSF, and it will acknowledge it
  successfully.

Notably missing (apart from literally everything else):
- Job submission logic through drm_scheduler and completion through dma_fences
- Devfreq, pm_idle, etc.

The name "Tyr" is inspired by Norse mythology, reflecting ARM's tradition of
naming their GPUs after Nordic mythological figures and places.

Co-developed-by: Alice Ryhl <alice.ryhl@google.com>
Signed-off-by: Alice Ryhl <alice.ryhl@google.com>
Signed-off-by: Daniel Almeida <daniel.almeida@collabora.com>
parent 4fc4b8d4
@@ -511,6 +511,8 @@ source "drivers/gpu/drm/sprd/Kconfig"
source "drivers/gpu/drm/imagination/Kconfig"
source "drivers/gpu/drm/tyr/Kconfig"
config DRM_HYPERV
tristate "DRM Support for Hyper-V synthetic video device"
depends on DRM && PCI && MMU && HYPERV
@@ -229,3 +229,4 @@ obj-y += solomon/
obj-$(CONFIG_DRM_SPRD) += sprd/
obj-$(CONFIG_DRM_LOONGSON) += loongson/
obj-$(CONFIG_DRM_POWERVR) += imagination/
obj-$(CONFIG_DRM_TYR) += tyr/
# SPDX-License-Identifier: GPL-2.0 or MIT
config TYR_DRM_GEM_SHMEM_HELPER
bool
select DRM_GEM_SHMEM_HELPER
config TYR_DRM_GPUVM
bool
select DRM_GPUVM
config DRM_TYR
tristate "Tyr (Rust DRM support for ARM Mali CSF-based GPUs)"
depends on DRM=y
depends on RUST
depends on RUST_FW_LOADER_ABSTRACTIONS
depends on ARM || ARM64 || COMPILE_TEST
depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE
depends on MMU
select TYR_DRM_GPUVM
select TYR_DRM_GEM_SHMEM_HELPER
select IOMMU_IO_PGTABLE_LPAE
depends on IOMMU_SUPPORT
help
Rust DRM driver for ARM Mali CSF-based GPUs.
This driver is for Mali (or Immortalis) Valhall Gxxx GPUs.
Note that the Mali-G68 and Mali-G78, while Valhall architecture, will
be supported with the panfrost driver as they are not CSF GPUs.
If M is selected, the module will be called tyr.
# SPDX-License-Identifier: GPL-2.0 or MIT
obj-$(CONFIG_DRM_TYR) += tyr.o
// SPDX-License-Identifier: GPL-2.0 or MIT
use kernel::bits::bit_u32;
use kernel::c_str;
use kernel::clk::Clk;
use kernel::devres::Devres;
use kernel::drm;
use kernel::drm::drv;
use kernel::drm::ioctl;
use kernel::io;
use kernel::io::mem::IoMem;
use kernel::irq::request::Registration as IrqRegistration;
use kernel::new_mutex;
use kernel::of;
use kernel::platform;
use kernel::prelude::*;
use kernel::regulator::Regulator;
use kernel::sync::Arc;
use kernel::sync::Mutex;
use kernel::time;
use kernel::types::ARef;
use crate::file::File;
use crate::fw;
use crate::fw::irq::JobIrqHandler;
use crate::fw::Firmware;
use crate::gpu;
use crate::gpu::irq::GpuIrqHandler;
use crate::gpu::wait::PowerOnWait;
use crate::gpu::GpuInfo;
use crate::mmu;
use crate::mmu::irq::MmuIrqHandler;
use crate::mmu::Mmu;
use crate::regs::*;
use core::pin::Pin;
/// Convenience type alias for the DRM device type for this driver
pub(crate) type TyrDevice = drm::device::Device<TyrDriver>;
#[pin_data(PinnedDrop)]
pub(crate) struct TyrDriver {
device: ARef<TyrDevice>,
#[pin]
gpu_irq: IrqRegistration<GpuIrqHandler>,
#[pin]
mmu_irq: IrqRegistration<MmuIrqHandler>,
#[pin]
job_irq: IrqRegistration<JobIrqHandler>,
}
impl TyrDriver {
// We have switched to polling for now until SpinLockIrq advances a bit.
fn wait_for_mcu_to_boot(&self) -> Result {
pr_info!("Polling on JOB_INT_GLOBAL_IF\n");
let iomem = &self.device.data().iomem;
let op = || JOB_INT_RAWSTAT.read(iomem);
let cond = |raw_stat: &u32| -> bool { *raw_stat & JOB_INT_GLOBAL_IF != 0 };
io::poll::read_poll_timeout(
op,
cond,
time::Delta::from_millis(0),
Some(time::Delta::from_millis(100)),
)?;
JOB_INT_CLEAR.write(iomem, JOB_INT_GLOBAL_IF)?;
pr_info!("MCU booted\n");
Ok(())
}
// We have switched to polling for now until SpinLockIrq advances a bit.
fn wait_for_poweron(&self, poweron_wait: Arc<PowerOnWait>) -> Result {
pr_info!("Polling on RESET_COMPLETED\n");
let iomem = &self.device.data().iomem;
let op = || GPU_INT_RAWSTAT.read(iomem);
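// 0x700 covers GPU_INT_RAWSTAT bits 8..=10, which going by the Mali register
// layout should be RESET_COMPLETED plus the two POWER_CHANGED bits (assumption).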
let cond = |raw_stat: &u32| -> bool { *raw_stat == 0x700 };
io::poll::read_poll_timeout(
op,
cond,
time::Delta::from_millis(0),
Some(time::Delta::from_millis(100)),
)?;
GPU_INT_CLEAR.write(iomem, 0x700)?;
pr_info!("GPU has powered on\n");
Ok(())
}
// fn wait_for_mcu_to_boot(&self) -> Result {
// let data = self.device.data();
// let op = || Ok(*data.fw.booted.lock());
// let cond = |booted: &bool| *booted;
// data.fw
// .req_wait
// .wait_interruptible_timeout(1000, op, &cond, true)
// }
// fn wait_for_poweron(&self, poweron_wait: Arc<PowerOnWait>) -> Result {
// let mut powered_on = poweron_wait.powered_on.lock();
// while !*powered_on {
// match poweron_wait
// .wait
// .wait_interruptible_timeout(&mut powered_on, msecs_to_jiffies(100))
// {
// CondVarTimeoutResult::Timeout => return Err(ETIMEDOUT),
// _ => {}
// }
// }
// Ok(())
// }
}
#[pin_data]
pub(crate) struct TyrData {
pub(crate) pdev: platform::Device,
#[pin]
clks: Mutex<Clocks>,
#[pin]
regulators: Mutex<Regulators>,
/// Some information about the GPU. This is mainly queried by userspace (Mesa).
pub(crate) gpu_info: GpuInfo,
/// The firmware running on the MCU.
#[pin]
pub(crate) fw: Firmware,
/// True if the CPU/GPU are memory coherent.
pub(crate) coherent: bool,
/// MMU management.
pub(crate) mmu: Pin<KBox<Mutex<Mmu>>>,
/// The MMIO region.
pub(crate) iomem: Arc<Devres<IoMem>>,
}
unsafe impl Send for TyrData {}
unsafe impl Sync for TyrData {}
fn issue_soft_reset(iomem: &Devres<IoMem<0>>) -> Result<()> {
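// Assumption: this mirrors Panthor's GPU_CMD encoding, where the low byte is
// the command type (1 == reset) and bit 8 carries a soft-reset payload.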
let soft_reset_cmd = 1 | bit_u32(8);
GPU_CMD.write(iomem, soft_reset_cmd)?;
let op = || GPU_INT_RAWSTAT.read(iomem);
let cond = |raw_stat: &u32| -> bool { (*raw_stat >> 8) & 1 == 1 };
let res = io::poll::read_poll_timeout(
op,
cond,
time::Delta::from_millis(100),
Some(time::Delta::from_micros(20000)),
);
if let Err(e) = res {
pr_err!("GPU reset failed with errno {}\n", e.to_errno());
pr_err!("GPU_INT_RAWSTAT is {}\n", GPU_INT_RAWSTAT.read(iomem)?);
}
Ok(())
}
kernel::of_device_table!(
OF_TABLE,
MODULE_OF_TABLE,
<TyrDriver as platform::Driver>::IdInfo,
[
(of::DeviceId::new(c_str!("rockchip,rk3588-mali")), ()),
(of::DeviceId::new(c_str!("arm,mali-valhall-csf")), ())
]
);
impl platform::Driver for TyrDriver {
type IdInfo = ();
const OF_ID_TABLE: Option<of::IdTable<Self::IdInfo>> = Some(&OF_TABLE);
fn probe(pdev: &mut platform::Device, _info: Option<&Self::IdInfo>) -> Result<Pin<KBox<Self>>> {
dev_dbg!(pdev.as_ref(), "Probed Tyr\n");
let core_clk = Clk::new(pdev.as_ref(), Some(c_str!("core")))?;
let stacks_clk = Clk::new(pdev.as_ref(), Some(c_str!("stacks")))?;
let coregroup_clk = Clk::new(pdev.as_ref(), Some(c_str!("coregroup")))?;
core_clk.prepare_enable()?;
stacks_clk.prepare_enable()?;
coregroup_clk.prepare_enable()?;
let mali_regulator = Regulator::get(pdev.as_ref(), c_str!("mali"))?;
let sram_regulator = Regulator::get(pdev.as_ref(), c_str!("sram"))?;
mali_regulator.enable()?;
sram_regulator.enable()?;
let resource = pdev.resource(0).ok_or(EINVAL)?;
let iomem = Arc::new(pdev.ioremap_resource(resource)?, GFP_KERNEL)?;
// Disable all interrupts for now.
JOB_INT_MASK.write(&iomem, 0)?;
MMU_INT_MASK.write(&iomem, 0)?;
GPU_INT_MASK.write(&iomem, 0)?;
issue_soft_reset(&iomem)?;
gpu::l2_power_on(&iomem)?;
let gpu_info = GpuInfo::new(&iomem)?;
gpu_info.log(pdev);
pdev.as_ref().dma_set_max_seg_size(u32::MAX);
pdev.as_ref()
.dma_set_mask_and_coherent(u64::from(gpu_info.pa_bits()))?;
let platform = pdev.clone();
let tdev: ARef<TyrDevice> = drm::device::Device::new_no_data(pdev.as_ref())?;
let mmu = KBox::pin_init(new_mutex!(Mmu::new()?), GFP_KERNEL)?;
let fw = Firmware::init(&tdev, pdev.clone(), &gpu_info, mmu.as_ref(), iomem.clone())?;
// Ideally we'd find a way around this useless clone too...
let t = tdev.clone();
let p = platform.clone();
let i = iomem.clone();
let data = Arc::pin_init(
try_pin_init!(TyrData {
pdev: p.clone(),
clks <- new_mutex!(Clocks {
core: core_clk,
stacks: stacks_clk,
coregroup: coregroup_clk,
}),
regulators <- new_mutex!(Regulators {
mali: mali_regulator,
sram: sram_regulator,
}),
gpu_info,
fw <- fw,
coherent: false, // TODO. The GPU is not IO coherent on rk3588, which is what I am testing on.
mmu,
iomem: i.clone(),
}),
GFP_KERNEL,
)?;
// We must find a way around this. It's being discussed on Zulip already.
//
// Note that this is a problem, because if we fail at probe, then the
// drop code expects the data to be set, which leads to a crash.
unsafe { tdev.clone().init_data(data) };
drm::drv::Registration::new_foreign_owned(tdev.clone(), 0)?;
let poweron_wait = PowerOnWait::new()?;
let pow = poweron_wait.clone();
let i = iomem.clone();
let driver = KBox::pin_init(
try_pin_init!(TyrDriver {
device: t.clone(),
gpu_irq <- gpu::irq::gpu_irq_init(t.clone(), platform.clone(), i.clone(), pow)?,
mmu_irq <- mmu::irq::mmu_irq_init(t.clone(), platform.clone(), i.clone())?,
job_irq <- fw::irq::job_irq_init(t.clone(), platform.clone(), i.clone())?,
}),
GFP_KERNEL,
)?;
driver.wait_for_poweron(poweron_wait.clone())?;
pr_info!("Booting the MCU");
MCU_CONTROL.write(&iomem, MCU_CONTROL_AUTO)?;
driver.wait_for_mcu_to_boot()?;
let data = tdev.data();
let gpu_info = &data.gpu_info;
let core_clk = &data.clks.lock().core;
data.fw.enable(iomem.clone(), gpu_info, core_clk)?;
dev_info!(pdev.as_ref(), "Tyr initialized correctly.\n");
Ok(driver)
}
}
#[pinned_drop]
impl PinnedDrop for TyrDriver {
fn drop(self: Pin<&mut Self>) {
// XXX: we will not have the `data` field here if we failed the
// initialization, i.e.: if probe failed.
//
// We need to figure out with the community how to properly split the
// creation of a DRM device from the place where the data is set and
// from the place where it is registered to overcome this.
//
// The current solution, i.e.: `new_from_closure` is just a hack, and it
// shows its shortcomings here, for example.
//
// dev_dbg!(self.device.data().pdev.as_ref(), "Removed Tyr.\n");
}
}
const INFO: drm::drv::DriverInfo = drm::drv::DriverInfo {
major: 0,
minor: 0,
patchlevel: 0,
name: c_str!("tyr"),
desc: c_str!("ARM Mali CSF-based GPU driver"),
date: c_str!("20250125"),
};
#[vtable]
impl drm::drv::Driver for TyrDriver {
type Data = Arc<TyrData>;
type File = File;
type Object = crate::gem::Object;
const INFO: drm::drv::DriverInfo = INFO;
const FEATURES: u32 = drv::FEAT_GEM | drv::FEAT_GEM_GPUVA;
kernel::declare_drm_ioctls! {
(TYR_DEV_QUERY, drm_panthor_dev_query, ioctl::RENDER_ALLOW, File::dev_query),
(TYR_VM_CREATE, drm_panthor_vm_create, ioctl::RENDER_ALLOW, File::vm_create),
(TYR_VM_DESTROY, drm_panthor_vm_destroy, ioctl::RENDER_ALLOW, File::vm_destroy),
(TYR_VM_BIND, drm_panthor_vm_bind, ioctl::RENDER_ALLOW, File::vm_bind),
(TYR_VM_GET_STATE, drm_panthor_vm_get_state, ioctl::RENDER_ALLOW, File::vm_get_state),
(TYR_BO_CREATE, drm_panthor_bo_create, ioctl::RENDER_ALLOW, File::bo_create),
(TYR_BO_MMAP_OFFSET, drm_panthor_bo_mmap_offset, ioctl::RENDER_ALLOW, File::bo_mmap_offset),
}
}
#[pin_data]
struct Clocks {
core: Clk,
stacks: Clk,
coregroup: Clk,
}
#[pin_data]
struct Regulators {
mali: Regulator,
sram: Regulator,
}
// SPDX-License-Identifier: GPL-2.0 or MIT
use kernel::alloc::flags::*;
use kernel::drm;
use kernel::drm::device::Device as DrmDevice;
use kernel::drm::gem::BaseObject;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::transmute::FromBytes;
use kernel::uaccess::UserSlice;
use kernel::uapi;
use crate::driver::TyrDevice;
use crate::driver::TyrDriver;
use crate::gem;
use crate::mmu::vm;
use crate::mmu::vm::pool::Pool;
use crate::mmu::vm::VmLayout;
use crate::mmu::vm::VmUserSize;
pub(crate) struct File {
/// A pool storing our VMs for this particular context.
vm_pool: Pool,
}
/// Convenience type alias for our DRM `File` type
pub(crate) type DrmFile = drm::file::File<File>;
impl drm::file::DriverFile for File {
type Driver = TyrDriver;
fn open(dev: &DrmDevice<Self::Driver>) -> Result<Pin<KBox<Self>>> {
dev_dbg!(dev.as_ref(), "drm::device::Device::open\n");
let file = Self {
vm_pool: Pool::create(),
};
Ok(KBox::new(file, GFP_KERNEL)?.into())
}
}
impl File {
pub(crate) fn dev_query(
tdev: &TyrDevice,
devquery: &mut uapi::drm_panthor_dev_query,
_file: &DrmFile,
) -> Result<u32> {
if devquery.pointer == 0 {
match devquery.type_ {
uapi::drm_panthor_dev_query_type_DRM_PANTHOR_DEV_QUERY_GPU_INFO => {
devquery.size = core::mem::size_of_val(&tdev.data().gpu_info) as u32;
Ok(0)
}
_ => Err(EINVAL),
}
} else {
match devquery.type_ {
uapi::drm_panthor_dev_query_type_DRM_PANTHOR_DEV_QUERY_GPU_INFO => {
let mut writer =
UserSlice::new(devquery.pointer as usize, devquery.size as usize).writer();
writer.write(&tdev.data().gpu_info)?;
Ok(0)
}
_ => Err(EINVAL),
}
}
}
pub(crate) fn vm_create(
tdev: &TyrDevice,
vmcreate: &mut uapi::drm_panthor_vm_create,
file: &DrmFile,
) -> Result<u32> {
let id = file.inner().vm_pool().create_vm(
tdev,
VmLayout::from_user_sz(tdev, VmUserSize::Custom(vmcreate.user_va_range)),
)?;
vmcreate.id = id as u32;
Ok(0)
}
pub(crate) fn vm_destroy(
tdev: &TyrDevice,
vmdestroy: &mut uapi::drm_panthor_vm_destroy,
file: &DrmFile,
) -> Result<u32> {
let iomem = tdev.data().iomem.clone();
file.inner()
.vm_pool()
.destroy_vm(vmdestroy.id as usize, iomem)?;
Ok(0)
}
pub(crate) fn vm_bind(
tdev: &TyrDevice,
vmbind: &mut uapi::drm_panthor_vm_bind,
file: &DrmFile,
) -> Result<u32> {
if vmbind.flags & uapi::drm_panthor_vm_bind_flags_DRM_PANTHOR_VM_BIND_ASYNC != 0 {
dev_info!(tdev.as_ref(), "We do not support async VM_BIND yet");
return Err(ENOTSUPP);
}
if vmbind.ops.stride as usize != core::mem::size_of::<uapi::drm_panthor_vm_bind_op>() {
dev_info!(
tdev.as_ref(),
"We cannot graciously handle stride mismatches yet"
);
return Err(ENOTSUPP);
}
let stride = vmbind.ops.stride as usize;
let count = vmbind.ops.count as usize;
let mut reader = UserSlice::new(vmbind.ops.array as usize, stride * count).reader();
let iomem = tdev.data().iomem.clone();
for i in 0..count {
let res = {
let op: VmBindOp = reader.read()?;
let mask = uapi::drm_panthor_vm_bind_op_flags_DRM_PANTHOR_VM_BIND_OP_TYPE_MASK;
match op.0.flags as i32 & mask {
uapi::drm_panthor_vm_bind_op_flags_DRM_PANTHOR_VM_BIND_OP_TYPE_MAP => {
let bo = gem::lookup_handle(file, op.0.bo_handle)?;
let range = op.0.va..op.0.va + op.0.size;
let vm = file
.inner()
.vm_pool()
.get_vm(vmbind.vm_id as usize)
.ok_or(EINVAL)?;
vm.lock().bind_gem(
iomem.clone(),
&bo.gem,
op.0.bo_offset,
range,
vm::map_flags::Flags::try_from(op.0.flags & 0b111)?,
)?;
}
uapi::drm_panthor_vm_bind_op_flags_DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP => {
if op.0.bo_handle != 0 || op.0.bo_offset != 0 {
return Err(EINVAL);
}
let range = op.0.va..op.0.va + op.0.size;
let vm = file
.inner()
.vm_pool()
.get_vm(vmbind.vm_id as usize)
.ok_or(EINVAL)?;
vm.lock().unmap_range(iomem.clone(), range)?;
}
_ => return Err(ENOTSUPP),
}
Ok(0)
};
if let Err(e) = res {
vmbind.ops.count = i as u32;
return Err(e);
}
}
Ok(0)
}
pub(crate) fn vm_get_state(
_tdev: &TyrDevice,
_vmgetstate: &mut uapi::drm_panthor_vm_get_state,
_file: &DrmFile,
) -> Result<u32> {
Err(ENOTSUPP)
}
pub(crate) fn bo_create(
tdev: &TyrDevice,
bocreate: &mut uapi::drm_panthor_bo_create,
file: &DrmFile,
) -> Result<u32> {
if bocreate.flags & !uapi::drm_panthor_bo_flags_DRM_PANTHOR_BO_NO_MMAP != 0 {
dev_err!(
tdev.as_ref(),
"bo_create: invalid flags {}\n",
bocreate.flags
);
return Err(EINVAL);
}
let bo = gem::new_object(tdev, bocreate.size as usize, bocreate.flags)?;
let handle = bo.gem.create_handle(file)?;
bocreate.handle = handle;
bocreate.size = bo.gem.size() as u64;
Ok(0)
}
pub(crate) fn bo_mmap_offset(
_tdev: &TyrDevice,
bommap: &mut uapi::drm_panthor_bo_mmap_offset,
file: &DrmFile,
) -> Result<u32> {
let bo = gem::lookup_handle(file, bommap.handle)?;
bommap.offset = bo.gem.create_mmap_offset()?;
Ok(0)
}
fn vm_pool(self: Pin<&Self>) -> Pin<&Pool> {
// SAFETY: Field projection, we never move out of this field.
unsafe { self.map_unchecked(|f| &f.vm_pool) }
}
}
#[repr(transparent)]
struct VmBindOp(uapi::drm_panthor_vm_bind_op);
// XXX: we cannot implement this trait for the uapi type directly, hence the
// wrapper.
// SAFETY: this struct is safe to be transmuted from a byte slice.
unsafe impl FromBytes for VmBindOp {}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! A general flags type adapted from the WIP work by Felipe Xavier.
//!
//! This will be replaced by his patch once it's ready.
#[macro_export]
/// Creates a new flags type.
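///
/// A minimal usage sketch (illustrative, not part of this patch):
///
/// ```ignore
/// impl_flags!(MyFlags, MyFlag, u32);
/// const READ: MyFlag = MyFlag(1 << 0);
/// const WRITE: MyFlag = MyFlag(1 << 1);
///
/// let flags = MyFlags::from(READ) | WRITE;
/// assert!(flags.contains(READ) && flags.contains(WRITE));
/// ```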
macro_rules! impl_flags {
($flags:ident, $flag:ident, $ty:ty) => {
#[allow(missing_docs)]
#[repr(transparent)]
#[derive(Copy, Clone, Default, Debug, PartialEq, Eq)]
pub struct $flags($ty);
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct $flag($ty);
impl From<$flag> for $flags {
#[inline]
fn from(value: $flag) -> Self {
Self(value.0)
}
}
impl From<$flags> for $ty {
#[inline]
fn from(value: $flags) -> Self {
value.0
}
}
impl core::ops::BitOr for $flags {
type Output = Self;
#[inline]
fn bitor(self, rhs: Self) -> Self::Output {
Self(self.0 | rhs.0)
}
}
impl core::ops::BitOrAssign for $flags {
#[inline]
fn bitor_assign(&mut self, rhs: Self) {
*self = *self | rhs;
}
}
impl core::ops::BitAnd for $flags {
type Output = Self;
#[inline]
fn bitand(self, rhs: Self) -> Self::Output {
Self(self.0 & rhs.0)
}
}
impl core::ops::BitAndAssign for $flags {
#[inline]
fn bitand_assign(&mut self, rhs: Self) {
*self = *self & rhs;
}
}
impl core::ops::BitOr<$flag> for $flags {
type Output = Self;
#[inline]
fn bitor(self, rhs: $flag) -> Self::Output {
self | Self::from(rhs)
}
}
impl core::ops::BitOrAssign<$flag> for $flags {
#[inline]
fn bitor_assign(&mut self, rhs: $flag) {
*self = *self | rhs;
}
}
impl core::ops::BitAnd<$flag> for $flags {
type Output = Self;
#[inline]
fn bitand(self, rhs: $flag) -> Self::Output {
self & Self::from(rhs)
}
}
impl core::ops::BitAndAssign<$flag> for $flags {
#[inline]
fn bitand_assign(&mut self, rhs: $flag) {
*self = *self & rhs;
}
}
impl core::ops::BitXor for $flags {
type Output = Self;
#[inline]
fn bitxor(self, rhs: Self) -> Self::Output {
Self(self.0 ^ rhs.0)
}
}
impl core::ops::BitXorAssign for $flags {
#[inline]
fn bitxor_assign(&mut self, rhs: Self) {
*self = *self ^ rhs;
}
}
impl core::ops::Not for $flags {
type Output = Self;
#[inline]
fn not(self) -> Self::Output {
Self(!self.0)
}
}
impl $flags {
/// Returns an empty instance with no flags set.
#[inline]
pub const fn empty() -> Self {
Self(0)
}
/// Checks if a specific flag is set.
#[inline]
pub fn contains(self, flag: $flag) -> bool {
(self.0 & flag.0) == flag.0
}
}
};
}
// SPDX-License-Identifier: GPL-2.0 or MIT
use global::GlobalInterface;
use kernel::bindings::SZ_1G;
use kernel::clk::Clk;
use kernel::devres::Devres;
use kernel::io::mem::IoMem;
use kernel::new_mutex;
use kernel::new_spinlock_irq;
use kernel::platform;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::sync::Mutex;
use kernel::sync::SpinLockIrq;
use parse::Section;
use wait::ReqWait;
use crate::driver::TyrDevice;
use crate::gpu::GpuInfo;
use crate::mmu::vm::Vm;
use crate::mmu::vm::VmLayout;
use crate::mmu::Mmu;
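// Going by the values below, the FW <-> host shared region is a 64 MiB window
// at MCU VA 0x04000000 (0x04000000 bytes == 64 MiB).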
const CSF_MCU_SHARED_REGION_START: u32 = 0x04000000;
const CSF_MCU_SHARED_REGION_SIZE: u32 = 0x04000000;
mod global;
pub(crate) mod irq;
mod parse;
pub(crate) mod wait;
#[repr(transparent)]
#[derive(Debug, Clone, Copy)]
/// An offset into the shared section that is known to be valid.
///
/// This can be obtained via a call to [`Firmware::kmap_offset`].
///
/// # Invariants
///
/// `self.0` is a valid offset into the shared section. This means that it can
/// safely be dereferenced by the CPU.
struct CheckedKmapOffset(usize);
impl CheckedKmapOffset {
fn as_mut_ptr(&self, shared_section: &mut Section) -> Result<*mut core::ffi::c_void> {
let vmap = shared_section.mem.vmap()?;
let vmap = vmap.as_mut_ptr();
// SAFETY: safe by the type invariant.
let offset = unsafe { vmap.add(self.0) };
Ok(offset)
}
fn read<T>(&self, shared_section: &mut Section) -> Result<T> {
let ptr = self.as_mut_ptr(shared_section)?;
// SAFETY: we know that this pointer is aligned and valid for reads for
// at least size_of::<T>() bytes.
Ok(unsafe { core::ptr::read_volatile(ptr as *const T) })
}
fn write<T>(&self, shared_section: &mut Section, value: T) -> Result {
let ptr = self.as_mut_ptr(shared_section)?;
// SAFETY: we know that this pointer is aligned and valid for writes for
// at least size_of::<T>() bytes.
unsafe {
core::ptr::write_volatile(ptr as *mut T, value);
}
Ok(())
}
}
#[repr(transparent)]
/// An offset into the shared section that is known to point to the request field.
///
/// It is more convenient to use this type than reading or writing the memory
/// areas directly if all you want is to change the request field.
struct ReqKmapOffset(CheckedKmapOffset);
impl ReqKmapOffset {
/// Toggle acknowledge bits to send an event to the FW
///
/// The Host -> FW event/message passing was designed to be lockless, with each side of
/// the channel having its own writeable section. Events are signaled as a difference between
/// the host and FW side in the req/ack registers (when a bit differs, there's an event
/// pending; when they are the same, nothing needs attention).
///
/// This helper allows one to update the req register based on the current value of the
/// ack register managed by the FW. Toggling a specific bit will flag an event. In order
/// for events to be re-evaluated, the interface doorbell needs to be rung.
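///
/// For example (illustrative), with `reqs = GLB_PING`: if the FW's ack bit is
/// currently 1, the computation below writes 0 into the matching req bit; req
/// and ack now differ, so the event stays pending until the FW toggles ack.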
fn toggle_reqs(
&self,
shared_section: &mut Section,
ack: CheckedKmapOffset,
reqs: u32,
) -> Result {
let cur_req_val = self.0.read::<u32>(shared_section)?;
let ack_val = ack.read::<u32>(shared_section)?;
let new_val = ((ack_val ^ reqs) & reqs) | (cur_req_val & !reqs);
self.0.write::<u32>(shared_section, new_val)
}
/// Update bits to reflect a configuration change
///
/// Some configuration values get passed through req registers that are also used to
/// send events to the FW.
fn update_reqs(&self, shared_section: &mut Section, val: u32, reqs: u32) -> Result {
let cur_req_val = self.0.read::<u32>(shared_section)?;
let new_val = (cur_req_val & !reqs) | (val & reqs);
self.0.write::<u32>(shared_section, new_val)
}
/// Returns whether any requests are pending.
///
/// Requests are pending when the value of the given bit in the req differs
/// from the one in ack.
fn pending_reqs(
&self,
shared_section: &mut Section,
ack: CheckedKmapOffset,
reqs: u32,
) -> Result<bool> {
let cur_req_val = self.0.read::<u32>(shared_section)? & reqs;
let cur_ack_val = ack.read::<u32>(shared_section)? & reqs;
Ok((cur_req_val ^ cur_ack_val) != 0)
}
}
/// Our interface to the MCU.
#[pin_data]
pub(crate) struct Firmware {
#[pin]
/// The sections read from the firmware binary. These sections are loaded
/// into GPU memory via BOs.
sections: Mutex<KVec<Section>>,
/// The global FW interface.
global_iface: Arc<GlobalInterface>,
/// The VM that we load the firmware into. This VM is always bound to AS0.
vm: Arc<Mutex<Vm>>,
#[pin]
/// A condvar representing a wait on an MCU request.
///
/// We notify all waiters on every interrupt.
pub(crate) req_wait: Arc<ReqWait>,
#[pin]
/// Whether the MCU has booted.
pub(crate) booted: SpinLockIrq<bool>,
}
impl Firmware {
pub(crate) fn init(
tdev: &TyrDevice,
pdev: platform::Device,
gpu_info: &GpuInfo,
mmu: Pin<&Mutex<Mmu>>,
iomem: Arc<Devres<IoMem>>,
) -> Result<impl PinInit<Self>> {
let vm = {
let auto_kernel_va = CSF_MCU_SHARED_REGION_START as u64
..CSF_MCU_SHARED_REGION_START as u64 + CSF_MCU_SHARED_REGION_SIZE as u64;
let mut mmu = mmu.lock();
// Create the FW VM. This will be used to communicate between the CPU
// and the MCU.
let vm = mmu.create_vm(
tdev,
pdev.clone(),
gpu_info,
true,
VmLayout {
user: 0..0,
kernel: 0..4 * SZ_1G as u64,
},
auto_kernel_va,
)?;
mmu.bind_vm(vm.clone(), gpu_info, &iomem)?;
vm
};
let (sections, shared_section) =
Self::read_sections(tdev, iomem.clone(), gpu_info, vm.clone())?;
let req_wait = ReqWait::new()?;
let global_iface = GlobalInterface::new(shared_section, iomem.clone(), req_wait.clone())?;
Ok(pin_init!(Self {
sections <- new_mutex!(sections),
global_iface,
vm,
req_wait,
booted <- new_spinlock_irq!(false),
}))
}
/// Enables the FW interfaces.
pub(crate) fn enable(
&self,
iomem: Arc<Devres<IoMem>>,
gpu_info: &GpuInfo,
core_clk: &Clk,
) -> Result {
self.global_iface.enable(iomem, gpu_info, core_clk)?;
self.global_iface.ping_once()?;
// self.global_iface.arm_watchdog()?;
// TODO: enable other interfaces in the future.
Ok(())
}
// Wait for acks from the MCU.
//
// `req_ptr` and `ack_ptr` are pointers to memory regions shared with the
// MCU.
//
// Plain references are not used because the underlying shared memory can
// be mutated at any time, violating Rust assumptions about its contents.
//
// # Safety
//
// - Callers must ensure that `req_ptr` and `ack_ptr` are valid pointers.
// unsafe fn wait_acks(
// req_ptr: *const u32,
// ack_ptr: *mut u32,
// wait: &CondVar,
// req_mask: u32,
// timeout_ms: u32,
// ) -> Result<u32> {
// // SAFETY: safe as per the safety requirements.
// let req = unsafe { *req_ptr } & req_mask;
// let acked = req_mask;
// // SAFETY: safe as per the safety requirements.
// let op = || unsafe { *ack_ptr };
// let cond = |acked: &u32| *acked & req_mask == req;
// let poll_res = io::poll::read_poll_timeout(
// op,
// cond,
// time::Delta::from_millis(100),
// Some(time::Delta::from_millis(200)),
// )?;
// if let Err(ETIMEDOUT) = poll_res {
// wait.wait_interruptible_timeout(guard, jiffies)
// } else {
// poll_res
// }
// }
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! Code to control the global interface of the CSF firmware.
use kernel::bits::genmask_u32;
use kernel::clk::Clk;
use kernel::devres::Devres;
use kernel::impl_has_work;
use kernel::interrupt::interrupt_disable;
use kernel::io;
use kernel::io::mem::IoMem;
use kernel::new_spinlock_irq;
use kernel::new_work;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::sync::SpinLockIrq;
use kernel::time;
use kernel::time::Delta;
use kernel::workqueue;
use kernel::workqueue::Work;
use kernel::workqueue::WorkItem;
use crate::fw::ReqKmapOffset;
use crate::gpu::GpuInfo;
use crate::regs::Doorbell;
use crate::regs::CSF_GLB_DOORBELL_ID;
use super::{CheckedKmapOffset, ReqWait, Section};
#[allow(dead_code)]
mod constants {
use kernel::bits::{bit_u32, genmask_u32};
pub(super) const GLB_TIMER_SOURCE_GPU_COUNTER: u32 = bit_u32(31);
pub(super) const PROGRESS_TIMEOUT_CYCLES: u32 = 5 * 500 * 1024 * 1024;
pub(super) const PROGRESS_TIMEOUT_SCALE_SHIFT: u32 = 10;
pub(super) const IDLE_HYSTERESIS_US: u32 = 800;
pub(super) const PWROFF_HYSTERESIS_US: u32 = 10000;
pub(super) const GLB_HALT: u32 = bit_u32(0);
pub(super) const GLB_CFG_PROGRESS_TIMER: u32 = bit_u32(1);
pub(super) const GLB_CFG_ALLOC_EN: u32 = bit_u32(2);
pub(super) const GLB_CFG_POWEROFF_TIMER: u32 = bit_u32(3);
pub(super) const GLB_PROTM_ENTER: u32 = bit_u32(4);
pub(super) const GLB_PERFCNT_EN: u32 = bit_u32(5);
pub(super) const GLB_PERFCNT_SAMPLE: u32 = bit_u32(6);
pub(super) const GLB_COUNTER_EN: u32 = bit_u32(7);
pub(super) const GLB_PING: u32 = bit_u32(8);
pub(super) const GLB_FWCFG_UPDATE: u32 = bit_u32(9);
pub(super) const GLB_IDLE_EN: u32 = bit_u32(10);
pub(super) const GLB_SLEEP: u32 = bit_u32(12);
pub(super) const GLB_INACTIVE_COMPUTE: u32 = bit_u32(20);
pub(super) const GLB_INACTIVE_FRAGMENT: u32 = bit_u32(21);
pub(super) const GLB_INACTIVE_TILER: u32 = bit_u32(22);
pub(super) const GLB_PROTM_EXIT: u32 = bit_u32(23);
pub(super) const GLB_PERFCNT_THRESHOLD: u32 = bit_u32(24);
pub(super) const GLB_PERFCNT_OVERFLOW: u32 = bit_u32(25);
pub(super) const GLB_IDLE: u32 = bit_u32(26);
pub(super) const GLB_DBG_CSF: u32 = bit_u32(30);
pub(super) const GLB_DBG_HOST: u32 = bit_u32(31);
pub(super) const GLB_REQ_MASK: u32 = genmask_u32(10, 0);
pub(super) const GLB_EVT_MASK: u32 = genmask_u32(26, 20);
pub(super) const PING_INTERVAL_MS: i64 = 12000;
}
use constants::*;
fn glb_timer_val(val: u32) -> u32 {
val & genmask_u32(30, 0)
}
#[repr(transparent)]
/// A value that is valid to pass for timeout fields in the global interface.
struct TimeoutCycles(u32);
impl TimeoutCycles {
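/// Converts a timeout in microseconds into GLB timer cycles.
///
/// Worked example (illustrative): with a 500 MHz core clock and
/// `timeout_us = 10000`, this computes
/// `ceil(10000 * 500000000 / (1000000 << 10)) = 4883` cycles; the `<< 10`
/// presumably reflects the timer's 1024-cycle granularity.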
fn from_micro(core_clk: &Clk, timeout_us: u32) -> Result<Self> {
let timer_rate = core_clk.rate() as u64;
if timer_rate == 0 {
return Err(EINVAL);
}
let mut mod_cycles = (u64::from(timeout_us) * timer_rate).div_ceil(1000000 << 10);
if mod_cycles > glb_timer_val(u32::MAX).into() {
pr_err!("Invalid timeout computed\n");
mod_cycles = glb_timer_val(u32::MAX).into();
}
let mod_cycles = u32::try_from(mod_cycles)?;
Ok(Self(
glb_timer_val(mod_cycles) | GLB_TIMER_SOURCE_GPU_COUNTER,
))
}
}
impl From<TimeoutCycles> for u32 {
fn from(value: TimeoutCycles) -> Self {
value.0
}
}
/// The global control interface.
#[repr(C)]
pub(crate) struct Control {
pub(crate) version: u32,
pub(crate) features: u32,
pub(crate) input_va: u32,
pub(crate) output_va: u32,
pub(crate) group_num: u32,
pub(crate) group_stride: u32,
pub(crate) perfcnt_size: u32,
pub(crate) instr_features: u32,
}
impl Control {
/// CSF major version.
pub(crate) fn version_major(&self) -> u32 {
self.version >> 24
}
/// CSF minor version.
pub(crate) fn version_minor(&self) -> u32 {
(self.version >> 16) & 0xff
}
/// CSF patch version.
pub(crate) fn version_patch(&self) -> u32 {
self.version & 0xffff
}
}
#[repr(C)]
#[derive(Debug)]
/// The input area for the global interface
pub(crate) struct Input {
pub(crate) req: u32,
pub(crate) ack_irq_mask: u32,
pub(crate) doorbell_req: u32,
pub(crate) reserved1: u32,
pub(crate) progress_timer: u32,
pub(crate) poweroff_timer: u32,
pub(crate) core_en_mask: u32,
pub(crate) reserved2: u32,
pub(crate) perfcnt_as: u32,
pub(crate) perfcnt_base: u64,
pub(crate) perfcnt_extract: u32,
pub(crate) reserved3: [u32; 3],
pub(crate) perfcnt_config: u32,
pub(crate) perfcnt_csg_select: u32,
pub(crate) perfcnt_fw_enable: u32,
pub(crate) perfcnt_csg_enable: u32,
pub(crate) perfcnt_csf_enable: u32,
pub(crate) perfcnt_shader_enable: u32,
pub(crate) perfcnt_tiler_enable: u32,
pub(crate) perfcnt_mmu_l2_enable: u32,
pub(crate) reserved4: [u32; 8],
pub(crate) idle_timer: u32,
}
#[repr(C)]
#[derive(Debug)]
/// The output area for the global interface
pub(crate) struct Output {
pub(crate) ack: u32,
pub(crate) reserved1: u32,
pub(crate) doorbell_ack: u32,
pub(crate) reserved2: u32,
pub(crate) halt_status: u32,
pub(crate) perfcnt_status: u32,
pub(crate) perfcnt_insert: u32,
}
macro_rules! impl_shared_section_rw {
($type:ty) => {
impl $type {
/// Reads this structure from the shared section at the given offset.
///
/// Note that the area pointed to by `ptr` is shared with the MCU, so we
/// cannot simply parse it or cast it to &Self.
///
/// Merely taking a reference to it would be UB, as the MCU can change the
/// underlying memory at any time, as it is a core running its own code.
pub(super) fn read(
shared_section: &mut Section,
offset: CheckedKmapOffset,
) -> Result<Self> {
let ptr = offset.as_mut_ptr(shared_section)?;
// SAFETY: we know that this pointer is aligned and valid for reads for
// at least size_of::<Self>() bytes.
Ok(unsafe { core::ptr::read_volatile(ptr as *mut Self) })
}
/// Writes this structure to the shared section at the given offset.
///
/// Note that the area pointed to by `ptr` is shared with the MCU, so we
/// cannot simply parse it or cast it to &mut Self.
///
/// Merely taking a reference to it would be UB, as the MCU can change the
/// underlying memory at any time, as it is a core running its own code.
pub(super) fn write(
self,
shared_section: &mut Section,
offset: CheckedKmapOffset,
) -> Result<()> {
let ptr = offset.as_mut_ptr(shared_section)?;
// SAFETY: we know that this pointer is aligned and valid for writes for
// at least size_of::<Self>() bytes.
unsafe {
core::ptr::write_volatile(ptr as *mut Self, self);
}
Ok(())
}
}
};
}
impl_shared_section_rw!(Control);
impl_shared_section_rw!(Input);
impl_shared_section_rw!(Output);
enum GlobalInterfaceState {
Disabled,
Enabled {
control_offset: CheckedKmapOffset,
input_offset: CheckedKmapOffset,
output_offset: CheckedKmapOffset,
},
}
#[pin_data]
/// The global interface.
pub(super) struct GlobalInterface {
#[pin]
state: SpinLockIrq<GlobalInterfaceState>,
#[pin]
ping_work: Work<Self>,
iomem: Arc<Devres<IoMem>>,
#[pin]
shared_section: SpinLockIrq<Section>,
req_wait: Arc<ReqWait>,
}
impl GlobalInterface {
pub(super) fn new(
shared_section: Section,
iomem: Arc<Devres<IoMem>>,
req_wait: Arc<ReqWait>,
) -> Result<Arc<Self>> {
let init = pin_init!(Self {
state <- new_spinlock_irq!(GlobalInterfaceState::Disabled),
ping_work <- new_work!("TyrFwPingWork"),
iomem,
shared_section <- new_spinlock_irq!(shared_section),
req_wait,
});
Arc::pin_init(init, GFP_KERNEL)
}
pub(super) fn enable(
self: &Arc<Self>,
iomem: Arc<Devres<IoMem>>,
gpu_info: &GpuInfo,
core_clk: &Clk,
) -> Result {
// This takes a mutex internally in clk_prepare().
let poweroff_timer = TimeoutCycles::from_micro(core_clk, PWROFF_HYSTERESIS_US)?.into();
let interrupt_disable = interrupt_disable();
let mut shared_section = self.shared_section.lock_with(&interrupt_disable);
let control_offset = CheckedKmapOffset(0);
let op = || Control::read(&mut shared_section, control_offset);
let cond = |control: &Control| -> bool { control.version != 0 };
let _ = io::poll::read_poll_timeout(
op,
cond,
time::Delta::from_millis(0),
Some(time::Delta::from_millis(200)),
);
let control = Control::read(&mut shared_section, control_offset)?;
if control.version == 0 {
pr_err!("MCU firmware version is 0. Firmware may have failed to boot\n");
return Err(EINVAL);
}
let input_offset = super::Firmware::kmap_offset(&shared_section, control.input_va.into())?;
let output_offset =
super::Firmware::kmap_offset(&shared_section, control.output_va.into())?;
pr_info!(
"CSF FW using interface v.{}.{}.{}, Features {} Instrumentation features {}\n",
control.version_major(),
control.version_minor(),
control.version_patch(),
control.features,
control.instr_features
);
let mut input = Input::read(&mut shared_section, input_offset)?;
// Enable all shader cores.
input.core_en_mask = gpu_info.shader_present as u32;
// Setup timers.
input.poweroff_timer = poweroff_timer;
input.progress_timer = PROGRESS_TIMEOUT_CYCLES >> PROGRESS_TIMEOUT_SCALE_SHIFT;
input.idle_timer = IDLE_HYSTERESIS_US;
// Enable the interrupts we care about.
input.ack_irq_mask =
GLB_CFG_ALLOC_EN | GLB_PING | GLB_CFG_POWEROFF_TIMER | GLB_IDLE_EN | GLB_IDLE;
input.write(&mut shared_section, input_offset)?;
// Req is the first field of the input area.
let req = ReqKmapOffset(input_offset);
req.update_reqs(&mut shared_section, GLB_IDLE_EN, GLB_IDLE_EN)?;
// Ack is the first field of the output area.
let ack_offset = output_offset;
let reqs = GLB_CFG_ALLOC_EN | GLB_CFG_POWEROFF_TIMER | GLB_CFG_PROGRESS_TIMER;
req.toggle_reqs(&mut shared_section, ack_offset, reqs)?;
// Make lockdep happy, and also make sure that this lock is not held
// when the interrupt fires, which can be immediately after the doorbell
// is rung.
drop(shared_section);
Doorbell::new(CSF_GLB_DOORBELL_ID).write(&iomem, 1)?;
let mut state = self.state.lock();
*state = GlobalInterfaceState::Enabled {
control_offset,
input_offset,
output_offset,
};
Ok(())
}
pub(crate) fn arm_watchdog(self: &Arc<Self>) -> Result {
workqueue::system_unbound()
.enqueue(self.clone())
.map_err(|_| EINVAL)
}
pub(crate) fn ping_once(&self) -> Result {
// Have this here until we support delayed work. Instead of a
// heartbeat, we get a one-shot ping, but that is OK for now.
kernel::time::delay::fsleep(Delta::from_millis(100));
// let interrupt_disable = interrupt_disable();
// let mut state = this.state.lock_with(&interrupt_disable);
// Unfortunately, we cannot have both interrupts disabled and CondVar at
// the same time for now.
//
// TODO: I don't think we access GlobalInterfaceState from IRQ context,
// so we can possibly switch to a regular SpinLock or even a Mutex.
let state = self.state.lock();
if let &GlobalInterfaceState::Enabled {
input_offset,
output_offset,
..
} = &*state
{
pr_info!("Pinging the CSF global interface\n");
// Req is the first field of the input area.
let req = ReqKmapOffset(input_offset);
// Ack is the first field of the output area.
let ack = output_offset;
req.toggle_reqs(&mut self.shared_section.lock(), ack, GLB_PING)?;
Doorbell::new(CSF_GLB_DOORBELL_ID).write(&self.iomem, 1)?;
if !req.pending_reqs(&mut self.shared_section.lock(), ack, GLB_PING)? {
pr_info!("CSF has responded to a ping request\n");
} else {
let op = || req.pending_reqs(&mut self.shared_section.lock(), ack, GLB_PING);
let cond = |pending_reqs: &bool| !*pending_reqs;
io::poll::read_poll_timeout(
op,
cond,
time::Delta::from_millis(0),
Some(time::Delta::from_millis(100)),
)?;
if !req.pending_reqs(&mut self.shared_section.lock(), ack, GLB_PING)? {
pr_info!("CSF has responded to a ping request\n");
} else {
pr_err!("CSF has not responded to a ping request\n");
return Err(ETIMEDOUT);
}
}
}
Ok(())
}
}
impl_has_work! {
impl HasWork<Self> for GlobalInterface {
self.ping_work
}
}
impl WorkItem for GlobalInterface {
type Pointer = Arc<Self>;
fn run(_this: Self::Pointer) {
/* TODO: we need support for delayed_work */
}
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! The IRQ handling for the Job IRQs.
//!
//! The Job IRQ controls our interactions with the MCU.
use kernel::c_str;
use kernel::devres::Devres;
use kernel::io::mem::IoMem;
use kernel::irq;
use kernel::irq::request::Handler as IrqHandler;
use kernel::irq::request::IrqReturn;
use kernel::irq::request::Registration as IrqRegistration;
use kernel::platform;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::types::ARef;
use crate::driver::TyrDevice;
use crate::regs;
pub(crate) struct JobIrqHandler {
tdev: ARef<TyrDevice>,
iomem: Arc<Devres<IoMem>>,
}
impl IrqHandler for JobIrqHandler {
fn handle_irq(&self) -> IrqReturn {
let rawstat = regs::JOB_INT_RAWSTAT.read(&self.iomem).unwrap_or_default();
dev_info!(self.tdev.as_ref(), "Acknowledging job IRQ\n");
let _ = regs::JOB_INT_CLEAR.write(&self.iomem, rawstat);
let data = self.tdev.data();
let mut booted = data.fw.booted.lock();
if !*booted && rawstat & regs::JOB_INT_GLOBAL_IF != 0 {
*booted = true;
dev_info!(self.tdev.as_ref(), "GPU is ready to accept requests\n");
}
// Notify everyone waiting on a response from CSF.
data.fw.req_wait.notify_all();
IrqReturn::Handled
}
}
pub(crate) fn job_irq_init(
tdev: ARef<TyrDevice>,
pdev: platform::Device,
iomem: Arc<Devres<IoMem>>,
) -> Result<impl PinInit<IrqRegistration<JobIrqHandler>, Error>> {
let job_irq = pdev.irq_by_name(c_str!("job"))?;
let irq_handler = JobIrqHandler {
tdev,
iomem: iomem.clone(),
};
// Let's disable IRQs in favor of explicit polling for now due to issues with
// SpinLockIrq and CondVar.
//
// JOB_INT_MASK.write(&iomem, u32::MAX)?;
regs::JOB_INT_MASK.write(&iomem, 0)?;
Ok(IrqRegistration::register(
job_irq,
irq::request::flags::SHARED,
c_str!("tyr-job"),
irq_handler,
))
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! Code to parse the firmware binary.
use core::ops::Range;
use cursor::Cursor;
use kernel::alloc::KVec;
use kernel::bits::bit_u32;
use kernel::c_str;
use kernel::devres::Devres;
use kernel::fmt;
use kernel::io::mem::IoMem;
use kernel::prelude::*;
use kernel::str::CString;
use kernel::sync::Arc;
use kernel::sync::Mutex;
use crate::driver::TyrDevice;
use crate::fw::CheckedKmapOffset;
use crate::fw::Firmware;
use crate::fw::CSF_MCU_SHARED_REGION_START;
use crate::gem;
use crate::gem::KernelVaPlacement;
use crate::gpu::GpuId;
use crate::gpu::GpuInfo;
use crate::mmu::vm;
use crate::mmu::vm::Vm;
mod cursor;
const FW_BINARY_MAGIC: u32 = 0xc3f13a6e;
const FW_BINARY_MAJOR_MAX: u8 = 0;
mod flags {
use kernel::bits::bit_u32;
use kernel::bits::genmask_u32;
use kernel::prelude::*;
use crate::impl_flags;
impl_flags!(Flags, Flag, u32);
const CACHE_MODE_MASK: Flags = Flags(genmask_u32(4, 3));
impl Flags {
pub(crate) fn cache_mode(&self) -> Flags {
*self & CACHE_MODE_MASK
}
}
impl TryFrom<u32> for Flags {
type Error = Error;
fn try_from(value: u32) -> Result<Self, Self::Error> {
if value & valid_flags().0 != value {
Err(EINVAL)
} else {
Ok(Self(value))
}
}
}
pub(crate) fn valid_flags() -> Flags {
Flags::from(READ)
| Flags::from(WRITE)
| Flags::from(EXEC)
| CACHE_MODE_MASK
| Flags::from(PROT)
| Flags::from(SHARED)
| Flags::from(ZERO)
}
pub(crate) const READ: Flag = Flag(bit_u32(0));
pub(crate) const WRITE: Flag = Flag(bit_u32(1));
pub(crate) const EXEC: Flag = Flag(bit_u32(2));
pub(crate) const CACHE_MODE_NONE: Flag = Flag(0 << 3);
pub(crate) const CACHE_MODE_CACHED: Flag = Flag(1 << 3);
pub(crate) const CACHE_MODE_UNCACHED_COHERENT: Flag = Flag(2 << 3);
pub(crate) const CACHE_MODE_CACHED_COHERENT: Flag = Flag(3 << 3);
pub(crate) const PROT: Flag = Flag(bit_u32(5));
pub(crate) const SHARED: Flag = Flag(bit_u32(30));
pub(crate) const ZERO: Flag = Flag(bit_u32(31));
}
struct BuildInfoHeader(Range<u32>);
/// A parsed section of the firmware binary.
pub(super) struct Section {
/// Flags for this section.
flags: flags::Flags,
/// The name of the section in the binary, if any.
name: Option<CString>,
/// The raw parsed data for reset purposes.
data: KVec<u8>,
/// The BO that this section was loaded into.
pub(super) mem: gem::ObjectRef,
/// The VA range for this section.
///
/// The MCU expects the firmware to be loaded at specific addresses.
va: Range<u32>,
/// The flags used to map this section.
vm_map_flags: vm::map_flags::Flags,
}
/// The firmware header.
struct BinaryHeader {
/// Magic value to check binary validity.
magic: u32,
/// Minor FW version.
minor: u8,
/// Major FW version.
major: u8,
/// Padding. Must be set to zero.
_padding1: u16,
/// FW Version hash
version_hash: u32,
/// Padding. Must be set to zero.
_padding2: u32,
/// FW binary size
size: u32,
}
impl BinaryHeader {
fn new(tdev: &TyrDevice, cursor: &mut Cursor<'_>) -> Result<Self> {
let magic = cursor.read_u32(tdev)?;
if magic != FW_BINARY_MAGIC {
dev_err!(tdev.as_ref(), "Invalid firmware magic");
return Err(EINVAL);
}
let minor = cursor.read_u8(tdev)?;
let major = cursor.read_u8(tdev)?;
let padding1 = cursor.read_u16(tdev)?;
let version_hash = cursor.read_u32(tdev)?;
let padding2 = cursor.read_u32(tdev)?;
let size = cursor.read_u32(tdev)?;
if padding1 != 0 || padding2 != 0 {
dev_err!(
tdev.as_ref(),
"Invalid firmware file: header padding is not zero"
);
return Err(EINVAL);
}
Ok(Self {
magic,
minor,
major,
_padding1: padding1,
version_hash,
_padding2: padding2,
size,
})
}
}
#[derive(Clone, Copy, Debug)]
enum BinaryEntryType {
/// Host <-> FW interface.
Iface = 0,
/// FW config.
Config = 1,
/// Unit tests.
FutfTest = 2,
/// Trace buffer interface.
TraceBuffer = 3,
/// Timeline metadata interface.
TimelineMetadata = 4,
/// Metadata about how the FW binary was built
BuildInfoMetadata = 6,
}
impl TryFrom<u8> for BinaryEntryType {
type Error = Error;
fn try_from(value: u8) -> Result<Self, Self::Error> {
match value {
0 => Ok(BinaryEntryType::Iface),
1 => Ok(BinaryEntryType::Config),
2 => Ok(BinaryEntryType::FutfTest),
3 => Ok(BinaryEntryType::TraceBuffer),
4 => Ok(BinaryEntryType::TimelineMetadata),
6 => Ok(BinaryEntryType::BuildInfoMetadata),
_ => Err(EINVAL),
}
}
}
#[derive(Debug)]
struct BinarySectionEntryHeader {
/// Section flags
flags: flags::Flags,
/// MCU virtual range to map this binary section to.
va: Range<u32>,
/// References the data in the FW binary.
data: Range<u32>,
}
impl BinarySectionEntryHeader {
fn new(tdev: &TyrDevice, cursor: &mut Cursor<'_>) -> Result<Self> {
let flags = cursor.read_u32(tdev)?;
let flags = flags::Flags::try_from(flags)?;
let va_start = cursor.read_u32(tdev)?;
let va_end = cursor.read_u32(tdev)?;
let va = va_start..va_end;
if va.is_empty() {
dev_err!(
tdev.as_ref(),
"Invalid firmware file: empty VA range at pos {}\n",
cursor.pos(),
);
return Err(EINVAL);
}
let data_start = cursor.read_u32(tdev)?;
let data_end = cursor.read_u32(tdev)?;
let data = data_start..data_end;
Ok(Self { flags, va, data })
}
}
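/// A raw entry header from the firmware binary.
///
/// Going by the accessors below: bits 0..8 hold the entry type, bits 8..16 the
/// entry size in bytes, and bit 31 marks the entry as optional.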
struct BinaryEntryHeader(u32);
impl BinaryEntryHeader {
/// The entry type.
fn entry_ty(&self) -> Result<BinaryEntryType> {
let v = (self.0 & 0xff) as u8;
BinaryEntryType::try_from(v)
}
/// Whether this entry is optional.
fn optional(&self) -> bool {
self.0 & bit_u32(31) != 0
}
/// The size of the entry.
fn size(&self) -> u32 {
self.0 >> 8 & 0xff
}
}
struct BinaryEntrySection {
hdr: BinaryEntryHeader,
inner: Option<Section>,
}
impl Firmware {
/// Parses the firmware sections from the binary.
pub(super) fn read_sections(
tdev: &TyrDevice,
iomem: Arc<Devres<IoMem>>,
gpu_info: &GpuInfo,
vm: Arc<Mutex<Vm>>,
) -> Result<(KVec<Section>, Section)> {
let gpu_id = GpuId::from(gpu_info.gpu_id);
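// E.g. "arm/mali/arch10.8/mali_csffw.bin" for the Mali-G610 found on rk3588
// (assuming the usual arch major/minor encoding in GPU_ID).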
let fw_path = CString::try_from_fmt(fmt!(
"arm/mali/arch{}.{}/mali_csffw.bin",
gpu_id.arch_major,
gpu_id.arch_minor
))?;
let fw = kernel::firmware::Firmware::request(&fw_path, tdev.as_ref())?;
let mut cursor = Cursor::new(fw.data());
dev_info!(
tdev.as_ref(),
"Requested {} bytes of firmware successfully\n",
fw.data().len()
);
let fw_bin_hdr = match BinaryHeader::new(tdev, &mut cursor) {
Ok(fw_bin_hdr) => fw_bin_hdr,
Err(e) => {
dev_err!(tdev.as_ref(), "Invalid firmware file: {}", e.to_errno());
return Err(e);
}
};
if fw_bin_hdr.magic != FW_BINARY_MAGIC {
dev_err!(tdev.as_ref(), "Invalid firmware magic");
return Err(EINVAL);
}
if fw_bin_hdr.major > FW_BINARY_MAJOR_MAX {
dev_err!(
tdev.as_ref(),
"Unsupported firmware binary version: {}.{}",
fw_bin_hdr.major,
fw_bin_hdr.minor
);
return Err(EINVAL);
}
if fw_bin_hdr.size > cursor.len() as u32 {
dev_err!(tdev.as_ref(), "Firmware image is truncated");
return Err(EINVAL);
}
let mut sections = Vec::new();
let mut shared_section = None;
while (cursor.pos() as u32) < fw_bin_hdr.size {
let entry = Self::read_entry(&mut cursor, tdev, iomem.clone(), &fw, vm.clone())?;
cursor.advance((entry.hdr.size() - 4) as usize)?;
if let Some(section) = entry.inner {
// TODO: refactor this.
if section.flags.contains(flags::SHARED) {
shared_section = Some(section);
} else {
sections.push(section, GFP_KERNEL)?;
}
}
}
let shared_section = shared_section.ok_or_else(|| {
dev_err!(tdev.as_ref(), "No shared section found in firmware");
EINVAL
})?;
Ok((sections, shared_section))
}
fn read_entry(
cursor: &mut Cursor<'_>,
tdev: &TyrDevice,
iomem: Arc<Devres<IoMem>>,
fw: &kernel::firmware::Firmware,
vm: Arc<Mutex<Vm>>,
) -> Result<BinaryEntrySection> {
let section = BinaryEntrySection {
hdr: BinaryEntryHeader(cursor.read_u32(tdev)?),
inner: None,
};
let section_size = section.hdr.size() as usize - core::mem::size_of::<BinaryEntryHeader>();
let entry_ty = match section.hdr.entry_ty() {
Ok(entry_ty) => entry_ty,
Err(e) => {
if section.hdr.optional() {
dev_info!(
tdev.as_ref(),
"Skipping unknown optional firmware entry type: {}",
e.to_errno()
);
return Ok(section);
} else {
dev_err!(
tdev.as_ref(),
"Invalid firmware entry type: {}",
e.to_errno()
);
return Err(EINVAL);
}
}
};
if cursor.pos() % core::mem::size_of::<u32>() != 0 {
dev_err!(
tdev.as_ref(),
"Invalid firmware file: entry not aligned to 4 bytes at pos {}\n",
cursor.pos()
);
return Err(EINVAL);
}
let mut entry_cursor = cursor.view(cursor.pos()..cursor.pos() + section_size)?;
match entry_ty {
BinaryEntryType::Iface => Ok(BinaryEntrySection {
hdr: section.hdr,
inner: Self::read_section(tdev, iomem, &mut entry_cursor, fw, vm.clone())?,
}),
BinaryEntryType::BuildInfoMetadata => {
// TODO: Read build metadata
Ok(section)
}
BinaryEntryType::Config
| BinaryEntryType::FutfTest
| BinaryEntryType::TraceBuffer
| BinaryEntryType::TimelineMetadata => Ok(section),
}
}
fn read_section(
tdev: &TyrDevice,
iomem: Arc<Devres<IoMem>>,
cursor: &mut Cursor<'_>,
fw: &kernel::firmware::Firmware,
vm: Arc<Mutex<Vm>>,
) -> Result<Option<Section>> {
let hdr = BinarySectionEntryHeader::new(tdev, cursor)?;
if hdr.flags.contains(flags::PROT) {
dev_warn!(
tdev.as_ref(),
"Firmware protected mode entry not supported, ignoring"
);
return Ok(None);
}
if hdr.va.start == CSF_MCU_SHARED_REGION_START && !hdr.flags.contains(flags::SHARED) {
dev_err!(
tdev.as_ref(),
"Interface at 0x{:x} must be shared",
CSF_MCU_SHARED_REGION_START
);
return Err(EINVAL);
}
let name_len = cursor.len() - cursor.pos();
let name_bytes = cursor.read(tdev, name_len)?;
let mut name = KVec::with_capacity(name_bytes.len() + 1, GFP_KERNEL)?;
name.extend_from_slice(name_bytes, GFP_KERNEL)?;
name.push(0, GFP_KERNEL)?;
let name = CStr::from_bytes_with_nul(&name)
.ok()
.and_then(|name| CString::try_from(name).ok());
let fw = fw.data();
let section_start = hdr.data.start as usize;
let section_end = hdr.data.end as usize;
let mut data = KVec::new();
data.extend_from_slice(&fw[section_start..section_end], GFP_KERNEL)?;
let bo_len = (hdr.va.end - hdr.va.start) as usize;
let cache_mode = hdr.flags.cache_mode();
let mut vm_map_flags = vm::map_flags::Flags::empty();
if !hdr.flags.contains(flags::WRITE) {
vm_map_flags |= vm::map_flags::READONLY;
}
if !hdr.flags.contains(flags::EXEC) {
vm_map_flags |= vm::map_flags::NOEXEC;
}
if cache_mode != flags::CACHE_MODE_CACHED.into() {
vm_map_flags |= vm::map_flags::UNCACHED;
}
let mut mem = gem::new_kernel_object(
tdev,
iomem,
bo_len,
vm,
KernelVaPlacement::At(hdr.va.start as u64..hdr.va.end as u64),
vm_map_flags,
)?;
let vmap = mem.vmap()?;
let vmap = vmap.as_mut_slice();
vmap[0..data.len()].copy_from_slice(&data);
if hdr.flags.contains(flags::ZERO) {
vmap[data.len()..].fill(0);
}
dev_info!(
tdev.as_ref(),
"Copied firmware data to BO {:p} of size {} with flags {}\n",
&mem.gem,
bo_len,
vm_map_flags
);
Ok(Some(Section {
flags: hdr.flags,
name,
data,
mem,
va: hdr.va,
vm_map_flags,
}))
}
fn read_build_info(cursor: &mut Cursor<'_>, tdev: &TyrDevice) -> Result<()> {
let meta_start = cursor.read_u32(tdev)? as usize;
let meta_end = cursor.read_u32(tdev)? as usize;
let expected_hdr = b"git_sha: ";
let hdr = cursor.read(tdev, expected_hdr.len())?;
if hdr != expected_hdr {
dev_warn!(tdev.as_ref(), "Firmware's git sha is missing\n");
return Ok(());
}
let sz = meta_end - meta_start - expected_hdr.len();
let sha = cursor.read(tdev, sz)?;
if sha[sha.len() - 1] != 0 {
dev_warn!(tdev.as_ref(), "Firmware's git sha is not NULL terminated\n");
return Ok(()); // Don't treat as fatal
}
let sha = CStr::from_bytes_with_nul(sha).unwrap_or(c_str!(""));
dev_info!(
tdev.as_ref(),
"Firmware git sha: {}\n",
sha.to_str().unwrap_or("")
);
Ok(())
}
/// Computes the offset into the shared section for a given VA in the shared
/// area.
///
/// The result is an offset that can be safely dereferenced by the CPU.
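///
/// For example (illustrative): with the shared section at MCU VA `0x04000000`,
/// an `mcu_va` of `0x04000010` maps to offset `0x10`.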
pub(super) fn kmap_offset(shared_section: &Section, mcu_va: u64) -> Result<CheckedKmapOffset> {
let shared_mem_start = u64::from(shared_section.va.start);
let shared_mem_end = u64::from(shared_section.va.end);
if mcu_va < shared_mem_start || mcu_va >= shared_mem_end {
Err(EINVAL)
} else {
let offset = (mcu_va - shared_mem_start) as usize;
Ok(CheckedKmapOffset(offset))
}
}
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! A bare-bones std::io::Cursor<[u8]> clone to keep track of the current
//! position in the firmware binary.
use core::ops::Range;
use kernel::prelude::*;
use crate::driver::TyrDevice;
pub(crate) struct Cursor<'a> {
data: &'a [u8],
pos: usize,
}
impl<'a> Cursor<'a> {
pub(crate) fn new(data: &'a [u8]) -> Self {
Self { data, pos: 0 }
}
pub(super) fn len(&self) -> usize {
self.data.len()
}
pub(super) fn pos(&self) -> usize {
self.pos
}
pub(super) fn advance(&mut self, nbytes: usize) -> Result {
if self.pos + nbytes > self.data.len() {
return Err(EINVAL);
}
self.pos += nbytes;
Ok(())
}
/// Returns a view into the cursor's data.
///
/// This spawns a new cursor, leaving the current cursor unchanged.
pub(super) fn view(&self, range: Range<usize>) -> Result<Cursor<'_>> {
if range.start < self.pos || range.end > self.data.len() {
pr_err!(
"Invalid cursor range {:?} for data of length {}",
range,
self.data.len()
);
Err(EINVAL)
} else {
Ok(Self {
data: &self.data[range],
pos: 0,
})
}
}
pub(super) fn read(&mut self, tdev: &TyrDevice, nbytes: usize) -> Result<&[u8]> {
let start = self.pos;
let end = start + nbytes;
if end > self.data.len() {
dev_err!(
tdev.as_ref(),
"Invalid firmware file: read of size {} at position {} is out of bounds",
nbytes,
start,
);
return Err(EINVAL);
}
self.pos += nbytes;
Ok(&self.data[start..end])
}
pub(super) fn read_u8(&mut self, tdev: &TyrDevice) -> Result<u8> {
let bytes = self.read(tdev, 1)?;
Ok(bytes[0])
}
pub(super) fn read_u16(&mut self, tdev: &TyrDevice) -> Result<u16> {
let bytes = self.read(tdev, 2)?;
Ok(u16::from_le_bytes(bytes.try_into().unwrap()))
}
pub(super) fn read_u32(&mut self, tdev: &TyrDevice) -> Result<u32> {
let bytes = self.read(tdev, 4)?;
Ok(u32::from_le_bytes(bytes.try_into().unwrap()))
}
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! Code to wait on CSF responses.
use kernel::io;
use kernel::new_condvar;
use kernel::new_spinlock_irq;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::sync::CondVar;
use kernel::sync::SpinLockIrq;
use kernel::time;
use kernel::time::msecs_to_jiffies;
#[pin_data]
/// Represents a wait on a request made to CSF.
pub(crate) struct ReqWait {
#[pin]
/// The actual wait/signal mechanism.
cond: CondVar,
#[pin]
/// The lock backing the condvar.
///
/// This deviates a bit from the general Rust pattern of having the lock
/// wrap the data. That is because the "data" is actually a shared section
/// of GPU memory whose layout has to match the firmware's expectations.
lock: SpinLockIrq<()>,
}
impl ReqWait {
/// A convenience function to initialize the `ReqWait` struct.
///
/// There is only one instance of this struct in the entire driver.
///
/// Code that needs to wait on a given CSF request takes a reference to
/// `ReqWait`. A `notify_all()` is called every time the job IRQ is
/// triggered.
pub(crate) fn new() -> Result<Arc<Self>> {
Arc::pin_init(
pin_init!(Self {
cond <- new_condvar!(),
lock <- new_spinlock_irq!(()),
}),
GFP_KERNEL,
)
}
/// Waits for the completion of a previously submitted CSF request.
///
/// It's usually useful to busy-wait for a short period of time before going
/// to sleep, as some requests can be answered extremely quickly.
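///
/// A minimal usage sketch (illustrative; `read_ack` stands in for a closure
/// that reads the FW ack word, and `GLB_PING` for the bit being waited on):
///
/// ```ignore
/// req_wait.wait_interruptible_timeout(100, read_ack, |ack| *ack & GLB_PING != 0, true)?;
/// ```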
pub(crate) fn wait_interruptible_timeout<Op, Cond, T>(
&self,
timeout_ms: u32,
mut op: Op,
mut cond: Cond,
busy_wait_first: bool,
) -> Result
where
Op: FnMut() -> Result<T>,
Cond: FnMut(&T) -> bool,
{
if busy_wait_first {
let res = io::poll::read_poll_timeout(
&mut op,
&mut cond,
time::Delta::from_millis(0),
Some(time::Delta::from_micros(10)),
);
if res.is_ok() {
return Ok(());
}
}
let mut guard = self.lock.lock();
let mut remaining_time = msecs_to_jiffies(timeout_ms);
loop {
match self
.cond
.wait_interruptible_timeout(&mut guard, remaining_time)
{
kernel::sync::CondVarTimeoutResult::Woken { jiffies } => {
let op = op()?;
if cond(&op) {
return Ok(());
} else {
remaining_time -= jiffies
}
}
kernel::sync::CondVarTimeoutResult::Timeout => return Err(ETIMEDOUT),
kernel::sync::CondVarTimeoutResult::Signal { .. } => return Err(ERESTARTSYS),
}
}
}
pub(crate) fn notify_one(&self) {
let _guard = self.lock.lock();
self.cond.notify_one();
}
pub(crate) fn notify_all(&self) {
let _guard = self.lock.lock();
self.cond.notify_all();
}
}
// SPDX-License-Identifier: GPL-2.0 or MIT
use core::ops::Range;
use kernel::devres::Devres;
use kernel::drm::gem::shmem;
use kernel::drm::gem::BaseObject;
use kernel::drm::gem::{self};
use kernel::drm::mm;
use kernel::io::mem::IoMem;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::sync::Mutex;
use crate::driver::TyrDevice;
use crate::driver::TyrDriver;
use crate::file::DrmFile;
use crate::mmu::vm;
use crate::mmu::vm::Vm;
/// GEM Object inner driver data
#[pin_data]
pub(crate) struct DriverObject {
/// Whether this is a kernel or user BO.
ty: ObjectType,
/// The flags received at BO creation time.
flags: u32,
}
enum ObjectType {
Kernel {
// Kernel objects have their VA managed by the MM allocator. This node
// represents the allocation.
node: mm::Node<(), ()>,
},
User,
}
/// Type alias for the GEM object type for this driver.
pub(crate) type Object = gem::shmem::Object<DriverObject>;
/// Type alias for the SGTable type for this driver.
pub(crate) type SGTable = shmem::SGTable<DriverObject>;
impl gem::BaseDriverObject<Object> for DriverObject {
fn new(dev: &TyrDevice, _size: usize) -> impl PinInit<Self, Error> {
dev_dbg!(dev.as_ref(), "DriverObject::new\n");
DriverObject {
ty: ObjectType::User,
flags: 0,
}
}
}
impl gem::shmem::DriverObject for DriverObject {
type Driver = TyrDriver;
}
/// A shared reference to a GEM object for this driver.
pub(crate) struct ObjectRef {
/// The underlying GEM object reference
pub(crate) gem: gem::ObjectRef<shmem::Object<DriverObject>>,
/// The kernel-side VMap of this object, if any.
vmap: Option<shmem::VMap<DriverObject>>,
}
impl ObjectRef {
/// Create a new wrapper for a raw GEM object reference.
pub(crate) fn new(gem: gem::ObjectRef<shmem::Object<DriverObject>>) -> ObjectRef {
ObjectRef { gem, vmap: None }
}
/// Return the `VMap` for this object, creating it if necessary.
pub(crate) fn vmap(&mut self) -> Result<&mut shmem::VMap<DriverObject>> {
if self.vmap.is_none() {
self.vmap = Some(self.gem.vmap()?);
}
Ok(self.vmap.as_mut().unwrap())
}
/// Returns the size of the object in bytes.
pub(crate) fn size(&self) -> usize {
self.gem.size()
}
}
/// Create a new DRM GEM object.
pub(crate) fn new_object(dev: &TyrDevice, size: usize, flags: u32) -> Result<ObjectRef> {
// Round the size up to the page size, rejecting zero-sized and
// overflowing requests.
if size == 0 {
return Err(EINVAL);
}
let aligned_size = size.checked_next_multiple_of(1 << 12).ok_or(EINVAL)?;
let mut gem = Object::new(dev, aligned_size)?;
gem.set_wc(true);
gem.flags = flags;
Ok(ObjectRef::new(gem.into_ref()))
}
/// Look up a GEM object handle for a `File` and return an `ObjectRef` for it.
pub(crate) fn lookup_handle(file: &DrmFile, handle: u32) -> Result<ObjectRef> {
Ok(ObjectRef::new(shmem::Object::lookup_handle(file, handle)?))
}
/// Create a new kernel-owned GEM object.
pub(crate) fn new_kernel_object(
tdev: &TyrDevice,
iomem: Arc<Devres<IoMem>>,
size: usize,
vm: Arc<Mutex<Vm>>,
va: KernelVaPlacement,
flags: vm::map_flags::Flags,
) -> Result<ObjectRef> {
let aligned_size = size.checked_next_multiple_of(1 << 12).ok_or(EINVAL)?;
let mut gem: gem::UniqueObjectRef<shmem::Object<DriverObject>> =
shmem::Object::<DriverObject>::new(tdev, aligned_size)?;
gem.set_wc(true);
let node = vm.lock().alloc_kernel_range(va)?;
let range = node.start()..node.start() + node.size();
gem.ty = ObjectType::Kernel { node };
let gem = ObjectRef::new(gem.into_ref());
vm.lock().bind_gem(iomem, &gem.gem, 0, range, flags)?;
Ok(gem)
}
/// Creates a dummy GEM object to serve as the root of a GPUVM.
pub(crate) fn new_dummy_object(tdev: &TyrDevice) -> Result<ObjectRef> {
let mut gem = Object::new(tdev, 4096)?;
gem.set_wc(true);
Ok(ObjectRef::new(gem.into_ref()))
}
/// Controls the VA range assigned to a kernel-owned GEM object.
pub(crate) enum KernelVaPlacement {
/// Automatically place this object in a free spot in the kernel VA range.
Auto,
/// Place this object at a given address.
At(Range<u64>),
}
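// A usage sketch for kernel BO creation, assuming `tdev`, `iomem` and `vm`
// are already available from the probe path and `flags` is a suitable
// `vm::map_flags::Flags` value:
//
//     let bo = new_kernel_object(
//         tdev,
//         iomem.clone(),
//         4096,
//         vm.clone(),
//         KernelVaPlacement::Auto,
//         flags,
//     )?;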
// SPDX-License-Identifier: GPL-2.0 or MIT
use crate::regs::*;
use kernel::bits;
use kernel::bits::genmask_u32;
use kernel::devres::Devres;
use kernel::io;
use kernel::io::mem::IoMem;
use kernel::platform;
use kernel::prelude::*;
use kernel::time;
use kernel::transmute::AsBytes;
pub(crate) mod irq;
pub(crate) mod wait;
/// Information about the GPU. This can be queried by userspace.
#[repr(C)]
pub(crate) struct GpuInfo {
pub(crate) gpu_id: u32,
pub(crate) csf_id: u32,
pub(crate) gpu_rev: u32,
pub(crate) core_features: u32,
pub(crate) l2_features: u32,
pub(crate) tiler_features: u32,
pub(crate) mem_features: u32,
pub(crate) mmu_features: u32,
pub(crate) thread_features: u32,
pub(crate) max_threads: u32,
pub(crate) thread_max_workgroup_size: u32,
pub(crate) thread_max_barrier_size: u32,
pub(crate) coherency_features: u32,
pub(crate) texture_features: [u32; 4],
pub(crate) as_present: u32,
pub(crate) shader_present: u64,
pub(crate) tiler_present: u64,
pub(crate) l2_present: u64,
}
impl GpuInfo {
pub(crate) fn new(iomem: &Devres<IoMem>) -> Result<Self> {
let gpu_id = GPU_ID.read(iomem)?;
let csf_id = GPU_CSF_ID.read(iomem)?;
let gpu_rev = GPU_REVID.read(iomem)?;
let core_features = GPU_CORE_FEATURES.read(iomem)?;
let l2_features = GPU_L2_FEATURES.read(iomem)?;
let tiler_features = GPU_TILER_FEATURES.read(iomem)?;
let mem_features = GPU_MEM_FEATURES.read(iomem)?;
let mmu_features = GPU_MMU_FEATURES.read(iomem)?;
let thread_features = GPU_THREAD_FEATURES.read(iomem)?;
let max_threads = GPU_THREAD_MAX_THREADS.read(iomem)?;
let thread_max_workgroup_size = GPU_THREAD_MAX_WORKGROUP_SIZE.read(iomem)?;
let thread_max_barrier_size = GPU_THREAD_MAX_BARRIER_SIZE.read(iomem)?;
let coherency_features = GPU_COHERENCY_FEATURES.read(iomem)?;
let texture_features = GPU_TEXTURE_FEATURES0.read(iomem)?;
let as_present = GPU_AS_PRESENT.read(iomem)?;
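// The *_PRESENT registers are 64-bit bitmasks split across LO/HI pairs,
// so assemble each one from two 32-bit reads.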
let shader_present = GPU_SHADER_PRESENT_LO.read(iomem)? as u64;
let shader_present = shader_present | (GPU_SHADER_PRESENT_HI.read(iomem)? as u64) << 32;
let tiler_present = GPU_TILER_PRESENT_LO.read(iomem)? as u64;
let tiler_present = tiler_present | (GPU_TILER_PRESENT_HI.read(iomem)? as u64) << 32;
let l2_present = GPU_L2_PRESENT_LO.read(iomem)? as u64;
let l2_present = l2_present | (GPU_L2_PRESENT_HI.read(iomem)? as u64) << 32;
Ok(Self {
gpu_id,
csf_id,
gpu_rev,
core_features,
l2_features,
tiler_features,
mem_features,
mmu_features,
thread_features,
max_threads,
thread_max_workgroup_size,
thread_max_barrier_size,
coherency_features,
texture_features: [texture_features, 0, 0, 0],
as_present,
shader_present,
tiler_present,
l2_present,
})
}
pub(crate) fn log(&self, pdev: &platform::Device) {
let major = (self.gpu_id >> 16) & 0xff;
let minor = (self.gpu_id >> 8) & 0xff;
let status = self.gpu_id & 0xff;
let model_name = if let Some(model) = GPU_MODELS
.iter()
.find(|&f| f.major == major && f.minor == minor)
{
model.name
} else {
"unknown"
};
dev_info!(
pdev.as_ref(),
"mali-{} id 0x{:x} major 0x{:x} minor 0x{:x} status 0x{:x}",
model_name,
self.gpu_id >> 16,
major,
minor,
status
);
dev_info!(
pdev.as_ref(),
"Features: L2:{:#x} Tiler:{:#x} Mem:{:#x} MMU:{:#x} AS:{:#x}",
self.l2_features,
self.tiler_features,
self.mem_features,
self.mmu_features,
self.as_present
);
dev_info!(
pdev.as_ref(),
"shader_present=0x{:016x} l2_present=0x{:016x} tiler_present=0x{:016x}",
self.shader_present,
self.l2_present,
self.tiler_present
);
}
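/// Returns the number of virtual address bits, encoded in bits [7:0] of
/// `mmu_features`. For a hypothetical `mmu_features` of 0x2830 this is
/// 0x30, i.e. 48-bit VAs.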
pub(crate) fn va_bits(&self) -> u32 {
self.mmu_features & bits::genmask_u32(7, 0)
}
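/// Returns the number of physical address bits, encoded in bits [15:8] of
/// `mmu_features`. For the same hypothetical 0x2830 this is 0x28, i.e.
/// 40-bit PAs.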
pub(crate) fn pa_bits(&self) -> u32 {
(self.mmu_features >> 8) & bits::genmask_u32(7, 0)
}
}
// SAFETY:
//
// This type is the same type exposed by Panthor's uAPI. As it's declared as
// #[repr(C)], we can be sure that the layout is the same. Therefore, it is safe
// to expose this to userspace.
unsafe impl AsBytes for GpuInfo {}
struct GpuModels {
name: &'static str,
major: u32,
minor: u32,
}
const GPU_MODELS: [GpuModels; 1] = [GpuModels {
name: "g610",
major: 10,
minor: 7,
}];
#[allow(dead_code)]
pub(crate) struct GpuId {
pub(crate) arch_major: u32,
pub(crate) arch_minor: u32,
pub(crate) arch_rev: u32,
pub(crate) prod_major: u32,
pub(crate) ver_major: u32,
pub(crate) ver_minor: u32,
pub(crate) ver_status: u32,
}
impl From<u32> for GpuId {
fn from(value: u32) -> Self {
GpuId {
arch_major: (value & genmask_u32(31, 28)) >> 28,
arch_minor: (value & genmask_u32(27, 24)) >> 24,
arch_rev: (value & genmask_u32(23, 20)) >> 20,
prod_major: (value & genmask_u32(19, 16)) >> 16,
ver_major: (value & genmask_u32(15, 12)) >> 12,
ver_minor: (value & genmask_u32(11, 4)) >> 4,
ver_status: value & genmask_u32(3, 0),
}
}
}
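// A worked decode, assuming a hypothetical raw GPU_ID value of 0xA867_0010:
//
//     let id = GpuId::from(0xA867_0010);
//     assert_eq!(id.arch_major, 0xa); // bits [31:28]
//     assert_eq!(id.prod_major, 0x7); // bits [19:16]
//     assert_eq!(id.ver_minor, 0x01); // bits [11:4]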
/// Powers on the L2 cache block.
pub(crate) fn l2_power_on(iomem: &Devres<IoMem>) -> Result<()> {
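// Wait for any in-flight L2 power transition to settle before requesting
// power-on.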
let op = || L2_PWRTRANS_LO.read(iomem);
let cond = |pwr_trans: &u32| *pwr_trans == 0;
let _ = io::poll::read_poll_timeout(
op,
cond,
time::Delta::from_millis(100),
Some(time::Delta::from_millis(200)),
)?;
L2_PWRON_LO.write(iomem, 1)?;
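// Power-on has been requested for core 0's L2; poll until the hardware
// reports it ready.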
let op = || L2_READY_LO.read(iomem);
let cond = |l2_ready: &u32| *l2_ready == 1;
let _ = io::poll::read_poll_timeout(
op,
cond,
time::Delta::from_millis(100),
Some(time::Delta::from_millis(200)),
)?;
Ok(())
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! GPU IRQ handler.
use kernel::bits::bit_u32;
use kernel::c_str;
use kernel::devres::Devres;
use kernel::io::mem::IoMem;
use kernel::irq;
use kernel::irq::request::Handler as IrqHandler;
use kernel::irq::request::IrqReturn;
use kernel::irq::request::Registration as IrqRegistration;
use kernel::platform;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::types::ARef;
use crate::driver::TyrDevice;
use crate::gpu::wait::PowerOnWait;
use crate::regs;
const RESET_COMPLETED: u32 = bit_u32(8);
const POWER_CHANGED_SINGLE: u32 = bit_u32(9);
const POWER_CHANGED_ALL: u32 = bit_u32(10);
pub(crate) struct GpuIrqHandler {
_tdev: ARef<TyrDevice>,
iomem: Arc<Devres<IoMem>>,
poweron_wait: Arc<PowerOnWait>,
}
impl IrqHandler for GpuIrqHandler {
fn handle_irq(&self) -> IrqReturn {
let int_stat = regs::GPU_INT_RAWSTAT.read(&self.iomem).unwrap_or_default();
pr_info!("Acknowledging GPU_INT_RAWSTAT: {:#x}\n", int_stat);
let _ = regs::GPU_INT_CLEAR.write(&self.iomem, int_stat);
if int_stat == RESET_COMPLETED | POWER_CHANGED_SINGLE | POWER_CHANGED_ALL {
*self.poweron_wait.powered_on.lock() = true;
self.poweron_wait.wait.notify_one();
}
IrqReturn::Handled
}
}
pub(crate) fn gpu_irq_init(
tdev: ARef<TyrDevice>,
pdevice: platform::Device,
iomem: Arc<Devres<IoMem>>,
poweron_wait: Arc<PowerOnWait>,
) -> Result<impl PinInit<IrqRegistration<GpuIrqHandler>, Error>> {
let gpu_irq = pdevice.irq_by_name(c_str!("gpu"))?;
let irq_handler = GpuIrqHandler {
_tdev: tdev,
iomem: iomem.clone(),
poweron_wait,
};
// Let's disable IRQs in favor of explicit polling for now due to issues with
// SpinLockIrq and CondVar.
//
// GPU_INT_MASK.write(&iomem, core::u32::MAX)?;
regs::GPU_INT_MASK.write(&iomem, 0)?;
Ok(IrqRegistration::register(
gpu_irq,
irq::request::flags::SHARED,
c_str!("tyr-gpu"),
irq_handler,
))
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! A wait on GPU events. Currently only used for power on.
use kernel::new_condvar;
use kernel::new_spinlock;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::sync::CondVar;
use kernel::sync::SpinLock;
#[pin_data]
pub(crate) struct PowerOnWait {
#[pin]
pub(crate) wait: CondVar,
#[pin]
pub(crate) powered_on: SpinLock<bool>,
}
impl PowerOnWait {
pub(crate) fn new() -> Result<Arc<Self>> {
Arc::pin_init(
pin_init!(PowerOnWait {
wait <- new_condvar!(),
powered_on <- new_spinlock!(false),
}),
GFP_KERNEL,
)
}
}
// SPDX-License-Identifier: GPL-2.0 or MIT
use core::ops::Range;
use as_lock::AsLockToken;
use faults::decode_faults;
use kernel::devres::Devres;
use kernel::io;
use kernel::io::mem::IoMem;
use kernel::io_pgtable;
use kernel::new_mutex;
use kernel::platform;
use kernel::prelude::*;
use kernel::sync::Arc;
use kernel::sync::Mutex;
use kernel::time::Delta;
use kernel::types::ForeignOwnable;
use vm::Vm;
use vm::VmLayout;
use crate::driver::TyrDevice;
use crate::gpu::GpuInfo;
use crate::regs::*;
mod as_lock;
mod faults;
pub(crate) mod irq;
pub(crate) mod vm;
pub(crate) struct Mmu {
/// List containing all VMs.
vms: KVec<Arc<Mutex<Vm>>>,
/// Bitmask tracking which of the 32 AS slots are free: a set bit means
/// the slot is available.
free_slots: usize,
// slot_allocator: Arc<Mutex<SlotAllocator>>,
}
impl Mmu {
pub(crate) fn new() -> Result<Self> {
Ok(Self {
vms: KVec::new(),
// slot_allocator: Arc::pin_init(
// new_mutex!(SlotAllocator {
// free_mask: u32::MAX,
// }),
// GFP_KERNEL,
// )?,
free_slots: usize::MAX,
})
}
pub(crate) fn create_vm(
&mut self,
tdev: &TyrDevice,
pdev: platform::Device,
gpu_info: &GpuInfo,
for_mcu: bool,
layout: VmLayout,
auto_kernel_va: Range<u64>,
/* coherent: bool, */
) -> Result<Arc<Mutex<Vm>>> {
let vm = Vm::create(tdev, pdev, for_mcu, gpu_info, layout, auto_kernel_va)?;
let vm = Arc::pin_init(new_mutex!(vm), GFP_KERNEL)?;
self.vms.push(vm.clone(), GFP_KERNEL)?;
Ok(vm)
}
fn flush_range(iomem: &Devres<IoMem>, as_nr: usize, range: Range<u64>) -> Result {
Self::do_as_command(iomem, as_nr, AS_COMMAND_FLUSH_PT, range)
}
fn allocate_as(&mut self) -> Result<usize> {
let slot = self.free_slots.trailing_zeros();
if slot >= 32 {
return Err(EBUSY);
}
// Mark the slot as allocated by clearing its "free" bit.
self.free_slots &= !(1 << slot);
Ok(slot as usize)
}
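/// Waits until the given AS has no command in flight, i.e. until
/// AS_STATUS_ACTIVE is clear.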
fn wait_ready(iomem: &Devres<IoMem>, as_nr: usize) -> Result {
let op = || as_status(as_nr)?.read(iomem);
let cond = |status: &u32| -> bool { *status & AS_STATUS_ACTIVE == 0 };
let _ = io::poll::read_poll_timeout(
op,
cond,
Delta::from_millis(0),
Some(Delta::from_micros(10000)),
)?;
Ok(())
}
/// TODO: proper AS slot management is still missing.
fn free_as(&mut self, as_nr: usize) {
// Mark the slot as free again by setting its bit.
self.free_slots |= 1 << as_nr;
}
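/// Issues an AS command, locking the target region first for any command
/// other than UNLOCK.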
fn do_as_command(
iomem: &Devres<IoMem>,
as_nr: usize,
command: u32,
region: Range<u64>,
) -> Result {
if command == AS_COMMAND_UNLOCK {
as_command(as_nr)?.write(iomem, command)?;
} else {
let _lock = AsLockToken::lock_region(iomem, as_nr, region)?;
Self::wait_ready(iomem, as_nr)?;
as_command(as_nr)?.write(iomem, command)?;
Self::wait_ready(iomem, as_nr)?;
}
Ok(())
}
pub(crate) fn bind_vm(
&mut self,
vm: Arc<Mutex<Vm>>,
gpu_info: &GpuInfo,
iomem: &Devres<IoMem>,
) -> Result {
let mut vm = vm.lock();
let va_bits = gpu_info.va_bits();
let transtab = vm.gpuvm.exec_lock(None)?.page_table.cfg().ttbr;
let transcfg = AS_TRANSCFG_PTW_MEMATTR_WB
| AS_TRANSCFG_PTW_RA
| AS_TRANSCFG_ADRMODE_AARCH64_4K
| as_transcfg_ina_bits((55 - va_bits).into());
let memattr = vm.memattr;
let as_nr = if vm.for_mcu { 0 } else { self.allocate_as()? };
Self::enable_as(iomem, as_nr, transtab, transcfg.into(), memattr)?;
vm.address_space = Some(as_nr);
Ok(())
}
fn enable_as(
iomem: &Devres<IoMem>,
as_nr: usize,
transtab: u64,
transcfg: u64,
memattr: u64,
) -> Result {
let active = as_status(as_nr)?.read(iomem)? & AS_STATUS_ACTIVE != 0;
if active {
return Err(EBUSY);
}
Self::do_as_command(iomem, as_nr, AS_COMMAND_FLUSH_MEM, 0..u64::MAX)?;
let transtab_lo = (transtab & 0xffffffff) as u32;
let transtab_hi = (transtab >> 32) as u32;
let transcfg_lo = (transcfg & 0xffffffff) as u32;
let transcfg_hi = (transcfg >> 32) as u32;
let memattr_lo = (memattr & 0xffffffff) as u32;
let memattr_hi = (memattr >> 32) as u32;
as_transtab_lo(as_nr)?.write(iomem, transtab_lo)?;
as_transtab_hi(as_nr)?.write(iomem, transtab_hi)?;
as_transcfg_lo(as_nr)?.write(iomem, transcfg_lo)?;
as_transcfg_hi(as_nr)?.write(iomem, transcfg_hi)?;
as_memattr_lo(as_nr)?.write(iomem, memattr_lo)?;
as_memattr_hi(as_nr)?.write(iomem, memattr_hi)?;
as_command(as_nr)?.write(iomem, AS_COMMAND_UPDATE)?;
let op = || as_status(as_nr)?.read(iomem);
let cond = |status: &u32| -> bool { *status & AS_STATUS_ACTIVE == 0 };
let _ = io::poll::read_poll_timeout(
op,
cond,
Delta::from_millis(0),
Some(Delta::from_micros(200)),
)?;
Ok(())
}
}
/* Dummy TLB ops, the real TLB flush happens in Mmu::flush_range() */
impl io_pgtable::FlushOps for Mmu {
type Data = ();
fn tlb_flush_all(_data: <Self::Data as ForeignOwnable>::Borrowed<'_>) {}
fn tlb_flush_walk(
_data: <Self::Data as ForeignOwnable>::Borrowed<'_>,
_iova: usize,
_size: usize,
_granule: usize,
) {
}
fn tlb_add_page(
_data: <Self::Data as ForeignOwnable>::Borrowed<'_>,
_iova: usize,
_granule: usize,
) {
}
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! Address space locking.
use core::ops::Range;
use kernel::bits::genmask_u64;
use kernel::devres::Devres;
use kernel::io::mem::IoMem;
use kernel::prelude::*;
use crate::mmu::Mmu;
use crate::regs::*;
/// A token type that represents a lock on a region of a given address space.
pub(super) struct AsLockToken<'a> {
iomem: &'a Devres<IoMem>,
as_nr: usize,
}
impl<'a> AsLockToken<'a> {
/// Lock a `region` of `as_nr`.
pub(super) fn lock_region(
iomem: &'a Devres<IoMem>,
as_nr: usize,
region: Range<u64>,
) -> Result<Self> {
if region.end - region.start == 0 {
return Err(EINVAL);
}
// The locked region is a naturally aligned power of 2 block encoded as
// its log2 minus 1.
//
// Calculate the desired start/end and look for the highest bit which
// differs. The smallest naturally aligned block must include this bit
// change, the desired region starts with this bit (and subsequent bits)
// zeroed and ends with the bit (and subsequent bits) set to one.
let region_width = core::cmp::max(
64 - (region.start ^ (region.end - 1)).leading_zeros(),
AS_LOCK_REGION_MIN_SIZE.trailing_zeros(),
) as u8 - 1;
// Mask off the low bits of region.start, which would be ignored by the
// hardware anyway.
let region_start = region.start & genmask_u64(63, region_width as u32);
let region = (region_width as u64) | region_start;
let region_lo = (region & 0xffffffff) as u32;
let region_hi = (region >> 32) as u32;
// Lock the region that needs to be updated.
as_lockaddr_lo(as_nr)?.write(iomem, region_lo)?;
as_lockaddr_hi(as_nr)?.write(iomem, region_hi)?;
as_command(as_nr)?.write(iomem, AS_COMMAND_LOCK)?;
Ok(Self { iomem, as_nr })
}
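// As a worked example: locking the hypothetical range 0x8000..0x18000 gives
// start ^ (end - 1) = 0x1FFFF, whose highest set bit is bit 16 (fls = 17).
// Assuming AS_LOCK_REGION_MIN_SIZE is no larger than 0x10000, region_width
// is 17 - 1 = 16, which encodes a naturally aligned 2^17-byte block.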
}
impl Drop for AsLockToken<'_> {
fn drop(&mut self) {
let as_cmd = as_command(self.as_nr);
match as_cmd {
Ok(as_cmd) => {
if let Err(err) = Mmu::wait_ready(self.iomem, self.as_nr) {
pr_err!("MMU is busy for AS{}: {:?}\n", self.as_nr, err);
return;
}
if let Err(err) = as_cmd.write(self.iomem, AS_COMMAND_FLUSH_PT) {
pr_err!(
"Failed to flush page tables for AS{}: {:?}\n",
self.as_nr,
err
);
return;
}
if let Err(err) = Mmu::wait_ready(self.iomem, self.as_nr) {
pr_err!("MMU is busy for AS{}: {:?}\n", self.as_nr, err);
}
}
Err(err) => {
pr_err!("Failed to unlock AS{}: {:?}\n", self.as_nr, err);
}
}
}
}
// SPDX-License-Identifier: GPL-2.0 or MIT
//! Fault reporting.
use crate::regs::*;
use kernel::c_str;
use kernel::devres::Devres;
use kernel::io::mem::IoMem;
use kernel::prelude::*;
use kernel::str::CStr;
pub(crate) const EXCEPTION_MAP: &[(u32, &CStr)] = &[
(0x00, c_str!("OK")),
(0x04, c_str!("TERMINATED")),
(0x05, c_str!("KABOOM")),
(0x06, c_str!("EUREKA")),
(0x08, c_str!("ACTIVE")),
(0x0f, c_str!("CS_RES_TERM")),
(0x3f, c_str!("MAX_NON_FAULT")),
(0x40, c_str!("CS_CONFIG_FAULT")),
(0x41, c_str!("CS_UNRECOVERABLE")),
(0x44, c_str!("CS_ENDPOINT_FAULT")),
(0x48, c_str!("CS_BUS_FAULT")),
(0x49, c_str!("CS_INSTR_INVALID")),
(0x4a, c_str!("CS_CALL_STACK_OVERFLOW")),
(0x4b, c_str!("CS_INHERIT_FAULT")),
(0x50, c_str!("INSTR_INVALID_PC")),
(0x51, c_str!("INSTR_INVALID_ENC")),
(0x55, c_str!("INSTR_BARRIER_FAULT")),
(0x58, c_str!("DATA_INVALID_FAULT")),
(0x59, c_str!("TILE_RANGE_FAULT")),
(0x5a, c_str!("ADDR_RANGE_FAULT")),
(0x5b, c_str!("IMPRECISE_FAULT")),
(0x60, c_str!("OOM")),
(0x68, c_str!("CSF_FW_INTERNAL_ERROR")),
(0x69, c_str!("CSF_RES_EVICTION_TIMEOUT")),
(0x80, c_str!("GPU_BUS_FAULT")),
(0x88, c_str!("GPU_SHAREABILITY_FAULT")),
(0x89, c_str!("SYS_SHAREABILITY_FAULT")),
(0x8a, c_str!("GPU_CACHEABILITY_FAULT")),
(0xc0, c_str!("TRANSLATION_FAULT_0")),
(0xc1, c_str!("TRANSLATION_FAULT_1")),
(0xc2, c_str!("TRANSLATION_FAULT_2")),
(0xc3, c_str!("TRANSLATION_FAULT_3")),
(0xc4, c_str!("TRANSLATION_FAULT_4")),
(0xc8, c_str!("PERM_FAULT_0")),
(0xc9, c_str!("PERM_FAULT_1")),
(0xca, c_str!("PERM_FAULT_2")),
(0xcb, c_str!("PERM_FAULT_3")),
(0xd9, c_str!("ACCESS_FLAG_1")),
(0xda, c_str!("ACCESS_FLAG_2")),
(0xdb, c_str!("ACCESS_FLAG_3")),
(0xe0, c_str!("ADDR_SIZE_FAULT_IN")),
(0xe4, c_str!("ADDR_SIZE_FAULT_OUT0")),
(0xe5, c_str!("ADDR_SIZE_FAULT_OUT1")),
(0xe6, c_str!("ADDR_SIZE_FAULT_OUT2")),
(0xe7, c_str!("ADDR_SIZE_FAULT_OUT3")),
(0xe8, c_str!("MEM_ATTR_FAULT_0")),
(0xe9, c_str!("MEM_ATTR_FAULT_1")),
(0xea, c_str!("MEM_ATTR_FAULT_2")),
(0xeb, c_str!("MEM_ATTR_FAULT_3")),
];
pub(crate) fn get_exception_name(code: u32) -> &'static CStr {
for &(exception_code, name) in EXCEPTION_MAP {
if exception_code == code {
return name;
}
}
c_str!("UNKNOWN")
}
pub(crate) fn access_type_name(fault_status: u32) -> &'static str {
match fault_status & AS_FAULTSTATUS_ACCESS_TYPE_MASK {
AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC => "ATOMIC",
AS_FAULTSTATUS_ACCESS_TYPE_READ => "READ",
AS_FAULTSTATUS_ACCESS_TYPE_WRITE => "WRITE",
AS_FAULTSTATUS_ACCESS_TYPE_EX => "EXECUTE",
_ => "UNKNOWN",
}
}
/// Decodes MMU faults, printing a message to the kernel log for each one.
pub(super) fn decode_faults(mut status: u32, iomem: &Devres<IoMem>) -> Result {
while status != 0 {
let as_index = (status | (status >> 16)).trailing_zeros();
// Clear both halves of this AS's status bits at the end of the
// iteration so that the loop always makes progress, even if only the
// upper half was set.
let mask = kernel::bits::bit_u32(as_index) | kernel::bits::bit_u32(as_index + 16);
let fault_status: u32 = as_faultstatus(as_index as usize)?.read(iomem)?;
let mut addr = as_faultaddress_lo(as_index as usize)?.read(iomem)? as u64;
addr |= (as_faultaddress_hi(as_index as usize)?.read(iomem)? as u64) << 32;
let exception_type: u32 = fault_status & 0xff;
let access_type: u32 = (fault_status >> 8) & 0x3;
let source_id: u32 = fault_status >> 16;
pr_err!(
"Unhandled Page fault in AS{} at VA 0x{:016X}\n\
raw fault status: 0x{:X}\n\
decoded fault status: {}\n\
exception type 0x{:X}: {}\n\
access type 0x{:X}: {}\n\
source id 0x{:X}\n",
as_index,
addr,
fault_status,
if fault_status & (1 << 10) != 0 {
"DECODER FAULT"
} else {
"SLAVE FAULT"
},
exception_type,
get_exception_name(exception_type),
access_type,
access_type_name(fault_status),
source_id
);
// Update status to process the next fault
status &= !mask;
}
Ok(())
}