Commit 6a06d014 authored by Chris Wilson

lib: Provide an accelerated routine for readback from WC

Reading from WC is awfully slow as each access is uncached and so
performed synchronously, stalling for the memory load. x86 did introduce
some new instructions in SSE 4.1 to provide a small internal buffer to
accelerate reading back a cacheline at a time from uncached memory, for
this purpose.

v2: Don't be lazy and handle misalignment.
v3: Switch out of sse41 before emitting the generic memcpy routine
v4: Replace opencoded memcpy_from_wc
v5: Always flush the internal buffer before use (Eric)
v6: Assume bulk moves, so check for dst alignment.
v7: Use _mm_mfence instead of __builtin_ia32_mfence for consistency, remove
superfluous defines (Ville)
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Eric Anholt <eric@anholt.net>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
parent 5aed726a
......@@ -32,6 +32,7 @@
#include "drmtest.h"
#include "igt_fb.h"
#include "igt_kms.h"
#include "igt_x86.h"
#include "ioctl_wrappers.h"
#include "intel_chipset.h"
......@@ -1340,7 +1341,7 @@ static void convert_nv12_to_rgb24(struct igt_fb *fb, struct fb_convert_blit_uplo
* it's faster to copy the whole BO to a temporary buffer and convert
* from there.
*/
memcpy(buf, blit->linear.map, blit->linear.size);
igt_memcpy_from_wc(buf, blit->linear.map, blit->linear.size);
y = &buf[blit->linear.offsets[0]];
uv = &buf[blit->linear.offsets[1]];
......
......@@ -36,7 +36,11 @@
#endif
#include "igt_x86.h"
#include "igt_aux.h"
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/**
* SECTION:igt_x86
......@@ -174,3 +178,115 @@ char *igt_x86_features_to_string(unsigned features, char *line)
return ret;
}
#endif
#if defined(__x86_64__) && !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("sse4.1")
#pragma GCC diagnostic ignored "-Wpointer-arith"
#include <smmintrin.h>
/*
 * Copy @len bytes from @src (typically a write-combining/uncached mapping)
 * to ordinary write-back memory at @dst, using SSE4.1 MOVNTDQA streaming
 * loads. MOVNTDQA pulls a full cacheline into a small internal buffer, so
 * reads from WC memory proceed a cacheline at a time instead of stalling
 * on every uncached access.
 *
 * NOTE(review): min() is presumably the macro from igt_aux.h — confirm.
 * Arithmetic on void * below is a GCC extension; the file suppresses
 * -Wpointer-arith via the pragma above.
 */
static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
{
	char buf[16];

	/* Flush the internal buffer of potential stale gfx data */
	_mm_mfence();

	/* Head: src is not 16-byte aligned, as MOVNTDQA requires. Stream-load
	 * the enclosing aligned 16 bytes into buf, then copy out only the
	 * bytes belonging to the caller's range.
	 */
	if ((uintptr_t)src & 15) {
		__m128i *S = (__m128i *)((uintptr_t)src & ~15);
		unsigned long misalign = (uintptr_t)src & 15;
		unsigned long copy = min(len, 16 - misalign);

		_mm_storeu_si128((__m128i *)buf,
				 _mm_stream_load_si128(S));
		memcpy(dst, buf + misalign, copy);

		dst += copy;
		src += copy;
		len -= copy;
	}

	/* We assume we are doing bulk transfers, so prefer aligned moves */
	if (((uintptr_t)dst & 15) == 0) {
		/* 64 bytes (one cacheline) per iteration, aligned stores */
		while (len >= 64) {
			__m128i *S = (__m128i *)src;
			__m128i *D = (__m128i *)dst;
			__m128i tmp[4];

			tmp[0] = _mm_stream_load_si128(S + 0);
			tmp[1] = _mm_stream_load_si128(S + 1);
			tmp[2] = _mm_stream_load_si128(S + 2);
			tmp[3] = _mm_stream_load_si128(S + 3);

			_mm_store_si128(D + 0, tmp[0]);
			_mm_store_si128(D + 1, tmp[1]);
			_mm_store_si128(D + 2, tmp[2]);
			_mm_store_si128(D + 3, tmp[3]);

			src += 64;
			dst += 64;
			len -= 64;
		}
	} else {
		/* Same cacheline loop, but dst is unaligned: use movups */
		while (len >= 64) {
			__m128i *S = (__m128i *)src;
			__m128i *D = (__m128i *)dst;
			__m128i tmp[4];

			tmp[0] = _mm_stream_load_si128(S + 0);
			tmp[1] = _mm_stream_load_si128(S + 1);
			tmp[2] = _mm_stream_load_si128(S + 2);
			tmp[3] = _mm_stream_load_si128(S + 3);

			_mm_storeu_si128(D + 0, tmp[0]);
			_mm_storeu_si128(D + 1, tmp[1]);
			_mm_storeu_si128(D + 2, tmp[2]);
			_mm_storeu_si128(D + 3, tmp[3]);

			src += 64;
			dst += 64;
			len -= 64;
		}
	}

	/* Tail: remaining whole 16-byte chunks (src is 16-aligned by now) */
	while (len >= 16) {
		_mm_storeu_si128((__m128i *)dst,
				 _mm_stream_load_si128((__m128i *)src));

		src += 16;
		dst += 16;
		len -= 16;
	}

	/* Final sub-16-byte remainder: bounce through buf so we never
	 * store past the end of dst.
	 */
	if (len) {
		_mm_storeu_si128((__m128i *)buf,
				 _mm_stream_load_si128((__m128i *)src));
		memcpy(dst, buf, len);
	}
}
#pragma GCC pop_options
/* Scalar fallback used when the CPU lacks SSE4.1: an ordinary memcpy is
 * the best we can do for readback from a WC mapping.
 */
static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	void *out = dst;
	const void *in = src;

	memcpy(out, in, len);
}
/* ifunc resolver: runs once at load time and selects the SSE4.1 streaming
 * implementation when the CPU advertises SSE4.1, the plain memcpy fallback
 * otherwise.
 */
static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
{
	return igt_x86_features() & SSE4_1 ?
		memcpy_from_wc_sse41 : memcpy_from_wc;
}
/* Public entry point: bound at load time to the best implementation for
 * this CPU via the GNU ifunc mechanism (resolver above).
 */
void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
	__attribute__((ifunc("resolve_memcpy_from_wc")));

#else

/* Non-x86-64 (or clang) build: no ifunc/SSE4.1 path, plain memcpy only. */
void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	memcpy(dst, src, len);
}

#endif
......@@ -55,4 +55,6 @@ static inline char *igt_x86_features_to_string(unsigned features, char *line)
}
#endif
void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len);
#endif /* IGT_X86_H */
......@@ -107,75 +107,16 @@ bo_copy (void *_arg)
return NULL;
}
#if defined(__x86_64__) && !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("sse4.1")
#include <smmintrin.h>
#define MOVNT 512
/* NOTE(review): this is the old open-coded WC page copy being DELETED by
 * this commit (a removal hunk with the '-' markers stripped); it is
 * replaced by the shared igt_memcpy_from_wc() helper.
 *
 * Copies one page using SSE4.1 streaming loads when available, four
 * 16-byte registers (one cacheline) per iteration.
 */
__attribute__((noinline))
static void copy_wc_page(void *dst, void *src)
{
	if (igt_x86_features() & SSE4_1) {
		__m128i *S = (__m128i *)src;
		__m128i *D = (__m128i *)dst;

		for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) {
			__m128i tmp[4];

			tmp[0] = _mm_stream_load_si128(S++);
			tmp[1] = _mm_stream_load_si128(S++);
			tmp[2] = _mm_stream_load_si128(S++);
			tmp[3] = _mm_stream_load_si128(S++);

			_mm_store_si128(D++, tmp[0]);
			_mm_store_si128(D++, tmp[1]);
			_mm_store_si128(D++, tmp[2]);
			_mm_store_si128(D++, tmp[3]);
		}
	} else
		memcpy(dst, src, PAGE_SIZE);
}
/* NOTE(review): old open-coded cacheline copy being DELETED by this commit
 * (removal hunk, '-' markers stripped); replaced by igt_memcpy_from_wc().
 *
 * Copies one cacheline (4 x 16 bytes) with SSE4.1 streaming loads when
 * available, falling back to memcpy otherwise.
 */
static void copy_wc_cacheline(void *dst, void *src)
{
	if (igt_x86_features() & SSE4_1) {
		__m128i *S = (__m128i *)src;
		__m128i *D = (__m128i *)dst;
		__m128i tmp[4];

		tmp[0] = _mm_stream_load_si128(S++);
		tmp[1] = _mm_stream_load_si128(S++);
		tmp[2] = _mm_stream_load_si128(S++);
		tmp[3] = _mm_stream_load_si128(S++);

		_mm_store_si128(D++, tmp[0]);
		_mm_store_si128(D++, tmp[1]);
		_mm_store_si128(D++, tmp[2]);
		_mm_store_si128(D++, tmp[3]);
	} else
		memcpy(dst, src, CACHELINE);
}
#pragma GCC pop_options
#else
/* Copy one page out of a WC mapping.
 * NOTE(review): diff rendering with '+'/'-' markers stripped — the
 * memcpy() line is the removed old body and igt_memcpy_from_wc() is its
 * replacement; only the latter exists in the real post-commit file.
 */
static void copy_wc_page(void *dst, const void *src)
{
	memcpy(dst, src, PAGE_SIZE);
	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
}
/* Copy one cacheline out of a WC mapping.
 * NOTE(review): diff rendering with '+'/'-' markers stripped — the
 * memcpy() line is the removed old body and igt_memcpy_from_wc() is its
 * replacement; only the latter exists in the real post-commit file.
 */
static void copy_wc_cacheline(void *dst, const void *src)
{
	memcpy(dst, src, CACHELINE);
	igt_memcpy_from_wc(dst, src, CACHELINE);
}
#endif
static void
_bo_write_verify(struct test *t)
{
......
......@@ -529,45 +529,10 @@ test_huge_bo(int fd, int huge, int tiling)
munmap(linear_pattern, PAGE_SIZE);
}
#if defined(__x86_64__) && !defined(__clang__)
#define MOVNT 512
#pragma GCC push_options
#pragma GCC target("sse4.1")
#include <smmintrin.h>
/* NOTE(review): old open-coded WC page copy being DELETED by this commit
 * (removal hunk, '-' markers stripped); replaced by igt_memcpy_from_wc().
 * Identical in structure to the copies removed from the other tests.
 */
__attribute__((noinline))
static void copy_wc_page(void *dst, void *src)
{
	if (igt_x86_features() & SSE4_1) {
		__m128i *S = (__m128i *)src;
		__m128i *D = (__m128i *)dst;

		for (int i = 0; i < PAGE_SIZE/64; i++) {
			__m128i tmp[4];

			tmp[0] = _mm_stream_load_si128(S++);
			tmp[1] = _mm_stream_load_si128(S++);
			tmp[2] = _mm_stream_load_si128(S++);
			tmp[3] = _mm_stream_load_si128(S++);

			_mm_store_si128(D++, tmp[0]);
			_mm_store_si128(D++, tmp[1]);
			_mm_store_si128(D++, tmp[2]);
			_mm_store_si128(D++, tmp[3]);
		}
	} else
		memcpy(dst, src, PAGE_SIZE);
}
#pragma GCC pop_options
#else
/* Copy one page out of a WC mapping.
 * NOTE(review): diff rendering with '+'/'-' markers stripped — the
 * memcpy() line is the removed old body and igt_memcpy_from_wc() is its
 * replacement; only the latter exists in the real post-commit file.
 */
static void copy_wc_page(void *dst, const void *src)
{
	memcpy(dst, src, PAGE_SIZE);
	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
}
#endif
static unsigned int tile_row_size(int tiling, unsigned int stride)
{
......
......@@ -100,45 +100,10 @@ create_bo(int fd)
return handle;
}
#if defined(__x86_64__) && !defined(__clang__)
#define MOVNT 512
#pragma GCC push_options
#pragma GCC target("sse4.1")
#include <smmintrin.h>
/* NOTE(review): old open-coded WC page copy being DELETED by this commit
 * (removal hunk, '-' markers stripped); replaced by igt_memcpy_from_wc().
 * Third duplicate of the same routine, which is exactly why the commit
 * centralises it in lib/igt_x86.c.
 */
__attribute__((noinline))
static void copy_wc_page(void *dst, void *src)
{
	if (igt_x86_features() & SSE4_1) {
		__m128i *S = (__m128i *)src;
		__m128i *D = (__m128i *)dst;

		for (int i = 0; i < PAGE_SIZE/64; i++) {
			__m128i tmp[4];

			tmp[0] = _mm_stream_load_si128(S++);
			tmp[1] = _mm_stream_load_si128(S++);
			tmp[2] = _mm_stream_load_si128(S++);
			tmp[3] = _mm_stream_load_si128(S++);

			_mm_store_si128(D++, tmp[0]);
			_mm_store_si128(D++, tmp[1]);
			_mm_store_si128(D++, tmp[2]);
			_mm_store_si128(D++, tmp[3]);
		}
	} else
		memcpy(dst, src, PAGE_SIZE);
}
#pragma GCC pop_options
#else
/* Copy one page out of a WC mapping.
 * NOTE(review): diff rendering with '+'/'-' markers stripped — the
 * memcpy() line is the removed old body and igt_memcpy_from_wc() is its
 * replacement; only the latter exists in the real post-commit file.
 */
static void copy_wc_page(void *dst, const void *src)
{
	memcpy(dst, src, PAGE_SIZE);
	igt_memcpy_from_wc(dst, src, PAGE_SIZE);
}
#endif
igt_simple_main
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.