Commit 300d3ae8 authored by Carsten Haitzler's avatar Carsten Haitzler Committed by Eric Anholt

vc4: Declare the cpu pointers as being modified in NEON asm.

Otherwise, the compiler is free to reuse the register containing the input
for another call and assume that the value hasn't been modified.  Fixes
crashes on texture upload/download with current gcc.

We now have to have a temporary for the cpu2 value, since outputs must be
lvalues.

(commit message by anholt)

Fixes: 4d300242 ("vc4: Use NEON to speed up utile loads on Pi2.")
parent 522f6884
......@@ -49,13 +49,13 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
"vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d7, [%[cpu]]\n"
:
: [cpu] "+r"(cpu)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
} else if (gpu_stride == 16) {
void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
......@@ -73,10 +73,9 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
"vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
"vst1.8 d6, [%[cpu]]\n"
"vst1.8 d7, [%[cpu2]]\n"
:
: [cpu] "+r"(cpu),
[cpu2] "+r"(cpu2)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
......@@ -99,13 +98,13 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
"st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
"st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v3.D}[1], [%[cpu]]\n"
:
: [cpu] "+r"(cpu)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
} else if (gpu_stride == 16) {
void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
......@@ -123,10 +122,9 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
"st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
"st1 {v3.D}[0], [%[cpu]]\n"
"st1 {v3.D}[1], [%[cpu2]]\n"
:
: [cpu] "+r"(cpu),
[cpu2] "+r"(cpu2)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
......@@ -168,6 +166,7 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
: "q0", "q1", "q2", "q3");
return;
} else if (gpu_stride == 16) {
void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load each 16-byte line in 2 parts from the cpu-side
* destination. (vld1 can only store one d-register
......@@ -183,10 +182,9 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
"vld1.8 d7, [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
"vstm %[gpu], {q0, q1, q2, q3}\n"
:
: [cpu] "+r"(cpu),
[cpu2] "+r"(cpu2)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
......@@ -207,13 +205,13 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
"ld1 {v3.D}[1], [%[cpu]]\n"
/* Store to the GPU in one shot, no interleave. */
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
:
: [cpu] "+r"(cpu)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
} else if (gpu_stride == 16) {
void *cpu2 = cpu + 8;
__asm__ volatile (
/* Load each 16-byte line in 2 parts from the cpu-side
* destination. (vld1 can only store one d-register
......@@ -229,10 +227,9 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
"ld1 {v3.D}[1], [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
:
: [cpu] "+r"(cpu),
[cpu2] "+r"(cpu2)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment