diff --git a/src/asahi/vulkan/hk_cmd_meta.c b/src/asahi/vulkan/hk_cmd_meta.c
index 96e91f37117efd2fae3c0ec571146a46e4440405..c9bb25ea1f86bc92e19d3d8e772c02ae95342665 100644
--- a/src/asahi/vulkan/hk_cmd_meta.c
+++ b/src/asahi/vulkan/hk_cmd_meta.c
@@ -1382,13 +1382,23 @@ hk_meta_resolve_rendering(struct hk_cmd_buffer *cmd,
 static void
 hk_cmd_copy(struct hk_cmd_buffer *cmd, uint64_t dst, uint64_t src, size_t size)
 {
-   if (size / 16) {
-      libagx_copy_uint4(cmd, agx_1d(size / 16), AGX_BARRIER_ALL, dst, src);
+   /* Use vectorized uint4 copies for as much of the buffer as possible. This
+    * requires dst and src to be 16-byte aligned; skipping that check causes
+    * subtle, hard-to-debug issues. Unaligned copies and any remaining tail
+    * fall through to the byte copy below. */
+   if (size >= 16 && (dst & 0xf) == 0 && (src & 0xf) == 0) {
+      unsigned uint4s = size / 16;
+      unsigned bytes = uint4s * 16;
+
+      libagx_copy_uint4(cmd, agx_1d(uint4s), AGX_BARRIER_ALL, dst, src);
+
+      dst += bytes;
+      src += bytes;
+      size -= bytes;
    }
 
-   if (size % 16) {
-      libagx_copy_uchar(cmd, agx_1d(size % 16), AGX_BARRIER_ALL,
-                        dst + (size & ~15), src + (size & ~15));
+   if (size) {
+      libagx_copy_uchar(cmd, agx_1d(size), AGX_BARRIER_ALL, dst, src);
    }
 }
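
For reference, the head/tail split can be modeled on the host. The sketch below is an illustration only, not driver code: memcpy stands in for the GPU dispatches (libagx_copy_uint4 / libagx_copy_uchar), and copy_split is a hypothetical helper whose arithmetic mirrors the patched hk_cmd_copy. For example, a 100-byte aligned copy takes 96 bytes through the vector path and 4 bytes through the byte path.

    /* Illustration only: CPU-side model of the split in hk_cmd_copy.
     * memcpy stands in for the GPU copy dispatches. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void
    copy_split(uint8_t *dst, const uint8_t *src, size_t size)
    {
       /* Vector path: only when both addresses are 16-byte aligned. */
       if (size >= 16 && ((uintptr_t)dst & 0xf) == 0 &&
           ((uintptr_t)src & 0xf) == 0) {
          size_t uint4s = size / 16;   /* number of 16-byte elements */
          size_t bytes = uint4s * 16;  /* bytes covered by the vector path */

          memcpy(dst, src, bytes);     /* stands in for libagx_copy_uint4 */

          dst += bytes;
          src += bytes;
          size -= bytes;
       }

       /* Byte-wise tail, or the whole copy if the buffers were unaligned. */
       if (size)
          memcpy(dst, src, size);      /* stands in for libagx_copy_uchar */
    }

    int
    main(void)
    {
       /* 100 aligned bytes: 96 via the vector path, 4 via the byte path. */
       _Alignas(16) uint8_t src[100], dst[100];
       for (unsigned i = 0; i < sizeof(src); i++)
          src[i] = (uint8_t)i;

       copy_split(dst, src, sizeof(src));
       printf("match: %d\n", memcmp(dst, src, sizeof(dst)) == 0);
       return 0;
    }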