Commit 8acfddaa authored by Alyssa Rosenzweig's avatar Alyssa Rosenzweig 💜

Cleanup and integrate fast path

parent 8c97bb38
......@@ -22,6 +22,7 @@
* DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include "texture-swizzle.h"
#define ALIGN(x, y) (((x) + ((y) - 1)) & ~((y) - 1))
......@@ -43,6 +44,7 @@ space_bits_4(int i)
* mapping, just with bits twiddled around. */
uint32_t space_filler[16][16];
unsigned space_filler_one[16];
void
trans_generate_space_filler_indices()
......@@ -52,51 +54,49 @@ trans_generate_space_filler_indices()
space_filler[y][x] =
space_bits_4(y ^ x) | (space_bits_4(y) << 1);
}
space_filler_one[y] = space_bits_4(y);
}
}
void
trans_texture_swizzle(int width, int height, int bytes_per_pixel, int source_stride,
const uint8_t *pixels,
uint8_t *ldest)
static void
swizzle_bpp4_align16(int width, int height, int source_stride, int block_pitch,
const uint32_t *pixels,
uint32_t *ldest)
{
int block_pitch = ALIGN(width, 16) >> 4;
/* Calculate maximum size, overestimating a bit */
uint32_t sz = bytes_per_pixel * 256 * ((height >> 4) + 1) * block_pitch;
for (int y = 0; y < height; ++y) {
int block_y = y >> 4;
int rem_y = y & 0x0F;
int block_start_s = block_y * block_pitch * 256;
int source_start = y * source_stride;
for (int x = 0; x < width; ++x) {
int block_x_s = (x >> 4) * 256;
int rem_x = x & 0x0F;
int rem_y = y & 0x0f;
uint32_t *block_start_s = ldest + (block_y * block_pitch * 256);
const uint32_t *source_start = pixels + (y * source_stride);
int index = space_filler[rem_y][rem_x];
const uint8_t *source = &pixels[source_start + bytes_per_pixel * x];
uint8_t *dest = ldest + bytes_per_pixel * (block_start_s + block_x_s + index);
for (int x = 0; x < width; x += 16) {
const uint32_t *src = source_start + x;
uint32_t *dst = block_start_s + (x << 4);
for (int b = 0; b < bytes_per_pixel; ++b)
dest[b] = source[b];
for (int j = 0; j < 16; ++j)
dst[space_filler[rem_y][j]] = src[j];
}
}
}
void
trans_texture_swizzle_bpp4(int width, int height, int source_stride,
const uint32_t *pixels,
uint32_t *ldest)
trans_texture_swizzle(int width, int height, int bytes_per_pixel, int source_stride,
const uint8_t *pixels,
uint8_t *ldest)
{
int block_pitch = ALIGN(width, 16) >> 4;
int bytes_per_pixel = 4;
source_stride /= 4;
/* Calculate maximum size, overestimating a bit */
int block_pitch = ALIGN(width, 16) >> 4;
uint32_t sz = bytes_per_pixel * 256 * ((height >> 4) + 1) * block_pitch;
/* Use fast path if available */
if (bytes_per_pixel == 4 /* && (ALIGN(width, 16) == width) */) {
swizzle_bpp4_align16(width, height, source_stride >> 2, block_pitch, (const uint32_t *) pixels, (uint32_t *) ldest);
return;
}
/* Otherwise, default back on generic path */
for (int y = 0; y < height; ++y) {
int block_y = y >> 4;
int rem_y = y & 0x0F;
......@@ -108,127 +108,16 @@ trans_texture_swizzle_bpp4(int width, int height, int source_stride,
int rem_x = x & 0x0F;
int index = space_filler[rem_y][rem_x];
const uint8_t *source = &pixels[source_start + bytes_per_pixel * x];
uint8_t *dest = ldest + bytes_per_pixel * (block_start_s + block_x_s + index);
ldest[block_start_s + block_x_s + index] = pixels[source_start + x];
}
}
}
void
trans_texture_swizzle_bpp4_align16(int width, int height, int source_stride,
const uint32_t *pixels,
uint32_t *ldest)
{
int block_pitch = width >> 4;
int bytes_per_pixel = 4;
source_stride /= 4;
/* calculate maximum size, overestimating a bit */
uint32_t sz = bytes_per_pixel * 256 * ((height >> 4) + 1) * block_pitch;
for (int y = 0; y < height; ++y) {
int block_y = y >> 4;
int rem_y = y & 0x0f;
int block_start_s = block_y * block_pitch * 256;
int source_start = y * source_stride;
for (int x = 0; x < width; x += 16) {
int block_x_s = (x >> 4) * 256;
const uint32_t *src = pixels + source_start + x;
uint32_t *dst = ldest + block_start_s + block_x_s;
uint32_t *space = &space_filler[rem_y][0];
for (int j = 0; j < 16; ++j) {
dst[space[j]] = src[j];
}
for (int b = 0; b < bytes_per_pixel; ++b)
dest[b] = source[b];
}
}
}
#include <stdbool.h>
#include <arm_neon.h>
void
trans_texture_swizzle_bpp4_align16_neon1(int width, int height, int source_stride,
const uint32_t *pixels,
uint32_t *ldest)
{
int block_pitch = width >> 4;
int bytes_per_pixel = 4;
source_stride /= 4;
/* calculate maximum size, overestimating a bit */
uint32_t sz = bytes_per_pixel * 256 * ((height >> 4) + 1) * block_pitch;
for (int y = 0; y < height; ++y) {
int block_y = y >> 4;
int rem_y = y & 0x0f;
int block_start_s = block_y * block_pitch * 256;
int source_start = y * source_stride;
uintptr_t psrc = (uintptr_t) (pixels + source_start);
uint32x4_t PSrc = vdupq_n_u32(psrc);
uintptr_t pdst = (uintptr_t) (ldest + block_start_s);
uint32x4_t PDst = vdupq_n_u32(pdst);
uint32_t *space = &space_filler[rem_y][0];
for (uint32_t x = 0; x < width; x += 64) {
uint32x4_t X = vdupq_n_u32(x);
X = vsetq_lane_u32(x + 16, X, 1);
X = vsetq_lane_u32(x + 32, X, 2);
X = vsetq_lane_u32(x + 48, X, 3);
uint32x4_t Block_x_s = vshlq_n_u32(X, 6);
uint32x4_t X4 = vshlq_n_u32(X, 2);
uint32x4_t U_Src = vaddq_u32(PSrc, X4);
uint32x4_t U_Dst = vaddq_u32(PDst, Block_x_s);
const uint32_t *src1 = (const uint32_t *) vgetq_lane_u32(U_Src, 0);
uint32_t *dst1 = (uint32_t *) vgetq_lane_u32(U_Dst, 0);
for (int j = 0; j < 16; j += 4) {
uint32x4_t s = vld1q_u32(&src1[j]);
uint32x4_t p = vld1q_u32(&space[j]);
vst1q_lane_u32(dst1 + vgetq_lane_u32(p, 0), s, 0);
vst1q_lane_u32(dst1 + vgetq_lane_u32(p, 1), s, 1);
vst1q_lane_u32(dst1 + vgetq_lane_u32(p, 2), s, 2);
vst1q_lane_u32(dst1 + vgetq_lane_u32(p, 3), s, 3);
#if 0
dst1[vgetq_lane_u32(p, 0)] = vgetq_lane_u32(s, 0);
dst1[vgetq_lane_u32(p, 1)] = vgetq_lane_u32(s, 1);
dst1[vgetq_lane_u32(p, 2)] = vgetq_lane_u32(s, 2);
dst1[vgetq_lane_u32(p, 3)] = vgetq_lane_u32(s, 3);
#endif
}
const uint32_t *src2 = (const uint32_t *) vgetq_lane_u32(U_Src, 1);
uint32_t *dst2 = (uint32_t *) vgetq_lane_u32(U_Dst, 1);
for (int j = 0; j < 16; ++j) {
dst2[space[j]] = src2[j];
}
const uint32_t *src3 = (const uint32_t *) vgetq_lane_u32(U_Src, 2);
uint32_t *dst3 = (uint32_t *) vgetq_lane_u32(U_Dst, 2);
for (int j = 0; j < 16; ++j) {
dst3[space[j]] = src3[j];
}
const uint32_t *src4 = (const uint32_t *) vgetq_lane_u32(U_Src, 3);
uint32_t *dst4 = (uint32_t *) vgetq_lane_u32(U_Dst, 3);
for (int j = 0; j < 16; ++j) {
dst4[space[j]] = src4[j];
}
}
}
}
#if 1
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
......@@ -243,8 +132,7 @@ void main() {
uint8_t *out = malloc(TW*TH*4*2);
for (int i = 0; i < 60; ++i) {
trans_texture_swizzle_bpp4_align16_neon1(TW, TH, TW*4, (uint32_t *) in, (uint32_t *) out);
//trans_texture_swizzle_bpp4_align16(TW, TH, TW*4, (uint32_t *) in, (uint32_t *) out);
trans_texture_swizzle_bpp4_align16(TW, TH, TW*4, (uint32_t *) in, (uint32_t *) out);
//trans_texture_swizzle_bpp4(TW, TH, TW*4, (uint32_t *) in, (uint32_t *) out);
//trans_texture_swizzle(TW, TH, 4, TW*4, (uint32_t *) in, (uint32_t *) out);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment