Commit 0e499632 authored by Alok Hota
Browse files

swr/rast: AVX512 support compiled in by default



- Emulation of AVX512 built into SIMDLIB
  - Remove associated macros
- Remove knobs controlling AVX512 and let emulation handle it
- Refactor variable names for SIMD16
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
parent 0bf1df2b
......@@ -265,9 +265,7 @@ typedef MEGABYTE GIGABYTE[1024];
#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
#if ENABLE_AVX512_SIMD16
#define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES)
#endif
#include "common/swr_assert.h"
......
......@@ -24,8 +24,6 @@
#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__
#if ENABLE_AVX512_SIMD16
#if KNOB_SIMD16_WIDTH == 16
typedef SIMD512 SIMD16;
#else
......@@ -167,6 +165,4 @@ typedef SIMD512 SIMD16;
#define _simd16_mask2int(mask) int(mask)
#define _simd16_vmask_ps SIMD16::vmask_ps
#endif // ENABLE_AVX512_SIMD16
#endif //__SWR_SIMD16INTRIN_H_
......@@ -341,8 +341,6 @@ static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a)
return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
}
#if ENABLE_AVX512_SIMD16
#include "simd16intrin.h"
#endif // ENABLE_AVX512_SIMD16
#endif //__SWR_SIMDINTRIN_H__
......@@ -230,7 +230,6 @@ typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC,
simdscalari const& viewportIdx,
simdscalari const& rtIdx);
#if ENABLE_AVX512_SIMD16
// function signature for pipeline stages that execute after primitive assembly
typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
PA_STATE& pa,
......@@ -241,7 +240,6 @@ typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx);
#endif
OSALIGNLINE(struct) API_STATE
{
// Vertex Buffers
......
......@@ -33,15 +33,17 @@
/// SOA RGBA32_FLOAT format.
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template <SWR_FORMAT SrcFormat>
INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst)
template <typename SIMD_T, SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4<SIMD_T>& dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
(FormatTraits<SrcFormat>::GetBPC(0) == 32))
{
auto lambda = [&](int comp) {
simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp * sizeof(simdscalar)));
auto lambda = [&](int comp)
{
Float<SIMD_T> vComp =
SIMD_T::load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(Float<SIMD_T>)));
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
};
......@@ -50,9 +52,11 @@ INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst)
return;
}
auto lambda = [&](int comp) {
auto lambda = [&](int comp)
{
// load SIMD components
simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc);
Float<SIMD_T> vComp;
FormatTraits<SrcFormat>::loadSOA(comp, pSrc, vComp);
// unpack
vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
......@@ -60,250 +64,119 @@ INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst)
// convert
if (FormatTraits<SrcFormat>::isNormalized(comp))
{
vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp));
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp));
vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
}
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
// is there a better way to get this from the SIMD traits?
const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
}
//////////////////////////////////////////////////////////////////////////
/// @brief simdvector overload of LoadSOA. Forwards to the generic
///        LoadSOA template instantiated with SIMD256.
/// @param pSrc - source data in SOA form (passed through unchanged)
/// @param dst - output data in SOA form (passed through unchanged)
template <SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst)
{
LoadSOA<SIMD256, SrcFormat>(pSrc, dst);
}
//////////////////////////////////////////////////////////////////////////
/// @brief simd16vector overload of LoadSOA. Forwards to the generic
///        LoadSOA template instantiated with SIMD512.
/// @param pSrc - source data in SOA form (passed through unchanged)
/// @param dst - output data in SOA form (passed through unchanged)
template <SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
{
LoadSOA<SIMD512, SrcFormat>(pSrc, dst);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Clamps the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <SWR_FORMAT Format>
INLINE simdscalar Clamp(simdscalar const& vC, uint32_t Component)
template <typename SIMD_T, SWR_FORMAT Format>
INLINE Float<SIMD_T> SIMDCALL Clamp(Float<SIMD_T> const& v, uint32_t Component)
{
simdscalar vComp = vC;
Float<SIMD_T> vComp = v;
if (FormatTraits<Format>::isNormalized(Component))
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
{
vComp = _simd_max_ps(vComp, _simd_setzero_ps());
vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps());
}
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
{
vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f));
vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f));
}
vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f));
vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f));
}
else if (FormatTraits<Format>::GetBPC(Component) < 32)
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
{
int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
int iMin = 0;
simdscalari vCompi = _simd_castps_si(vComp);
vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
vComp = _simd_castsi_ps(vCompi);
int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
int iMin = 0;
Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin));
vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax));
vComp = SIMD_T::castsi_ps(vCompi);
}
else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
{
int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
int iMin = -1 - iMax;
simdscalari vCompi = _simd_castps_si(vComp);
vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
vComp = _simd_castsi_ps(vCompi);
int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
int iMin = -1 - iMax;
Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin));
vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax));
vComp = SIMD_T::castsi_ps(vCompi);
}
}
return vComp;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Normalize the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <SWR_FORMAT Format>
INLINE simdscalar Normalize(simdscalar const& vC, uint32_t Component)
INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component)
{
simdscalar vComp = vC;
if (FormatTraits<Format>::isNormalized(Component))
{
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<Format>::fromFloat(Component)));
vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
}
return vComp;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Convert and store simdvector of pixels in SOA
/// RGBA32_FLOAT to SOA format
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template <SWR_FORMAT DstFormat>
INLINE void StoreSOA(const simdvector& src, uint8_t* pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
(FormatTraits<DstFormat>::GetBPC(0) == 32))
{
for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
{
if (comp < 3) // Input format is always RGBA32_FLOAT.
{
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
}
}
_simd_store_ps((float*)(pDst + comp * sizeof(simdscalar)), vComp);
}
return;
}
auto lambda = [&](int comp) {
simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
{
if (comp < 3) // Input format is always RGBA32_FLOAT.
{
vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
}
}
// clamp
vComp = Clamp<DstFormat>(vComp, comp);
// normalize
vComp = Normalize<DstFormat>(vComp, comp);
// pack
vComp = FormatTraits<DstFormat>::pack(comp, vComp);
// store
FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
return Clamp<SIMD256, Format>(v, Component);
}
#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief Load SIMD packed pixels in SOA format and converts to
/// SOA RGBA32_FLOAT format.
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template <SWR_FORMAT SrcFormat>
INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
template <SWR_FORMAT Format>
INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
(FormatTraits<SrcFormat>::GetBPC(0) == 32))
{
auto lambda = [&](int comp) {
simd16scalar vComp =
_simd16_load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(simd16scalar)));
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
return;
}
auto lambda = [&](int comp) {
// load SIMD components
simd16scalar vComp = FormatTraits<SrcFormat>::loadSOA_16(comp, pSrc);
// unpack
vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
// convert
if (FormatTraits<SrcFormat>::isNormalized(comp))
{
vComp = _simd16_cvtepi32_ps(_simd16_castps_si(vComp));
vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
}
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD16_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
return Clamp<SIMD512, Format>(v, Component);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Clamps the given component based on the requirements on the
/// @brief Normalize the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <SWR_FORMAT Format>
INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
template <typename SIMD_T, SWR_FORMAT Format>
INLINE Float<SIMD_T> SIMDCALL Normalize(Float<SIMD_T> const& vComp, uint32_t Component)
{
simd16scalar vComp = v;
Float<SIMD_T> r = vComp;
if (FormatTraits<Format>::isNormalized(Component))
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
{
vComp = _simd16_max_ps(vComp, _simd16_setzero_ps());
}
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
{
vComp = _simd16_max_ps(vComp, _simd16_set1_ps(-1.0f));
}
vComp = _simd16_min_ps(vComp, _simd16_set1_ps(1.0f));
}
else if (FormatTraits<Format>::GetBPC(Component) < 32)
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
{
int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
int iMin = 0;
simd16scalari vCompi = _simd16_castps_si(vComp);
vCompi = _simd16_max_epu32(vCompi, _simd16_set1_epi32(iMin));
vCompi = _simd16_min_epu32(vCompi, _simd16_set1_epi32(iMax));
vComp = _simd16_castsi_ps(vCompi);
}
else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
{
int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
int iMin = -1 - iMax;
simd16scalari vCompi = _simd16_castps_si(vComp);
vCompi = _simd16_max_epi32(vCompi, _simd16_set1_epi32(iMin));
vCompi = _simd16_min_epi32(vCompi, _simd16_set1_epi32(iMax));
vComp = _simd16_castsi_ps(vCompi);
}
r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits<Format>::fromFloat(Component)));
r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r));
}
return r;
}
return vComp;
//////////////////////////////////////////////////////////////////////////
/// @brief simdscalar overload of Normalize. Forwards to the generic
///        Normalize template instantiated with SIMD256 and returns its
///        result.
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <SWR_FORMAT Format>
INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component)
{
return Normalize<SIMD256, Format>(vComp, Component);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Normalize the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
template <SWR_FORMAT Format>
INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component)
{
simd16scalar r = vComp;
if (FormatTraits<Format>::isNormalized(Component))
{
r = _simd16_mul_ps(r, _simd16_set1_ps(FormatTraits<Format>::fromFloat(Component)));
r = _simd16_castsi_ps(_simd16_cvtps_epi32(r));
}
return r;
return Normalize<SIMD512, Format>(vComp, Component);
}
//////////////////////////////////////////////////////////////////////////
......@@ -311,8 +184,8 @@ INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Compo
/// RGBA32_FLOAT to SOA format
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template <SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
template <typename SIMD_T, SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const Vec4<SIMD_T>& src, uint8_t* pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
......@@ -320,7 +193,7 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
{
for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
simd16scalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
......@@ -331,13 +204,13 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
}
}
_simd16_store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp);
// NOTE(review): the per-component stride below is sizeof(simd16scalar) even
// though this function is templated on SIMD_T and is also instantiated with
// SIMD256 by the simdvector overload. The matching LoadSOA fast path strides
// by sizeof(Float<SIMD_T>). This looks like it should be
// sizeof(Float<SIMD_T>) here as well - TODO confirm and fix.
SIMD_T::store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp);
}
return;
}
// General path: per component, gamma-correct (sRGB only), clamp, normalize,
// pack and store, advancing pDst after each component.
auto lambda = [&](int comp) {
simd16scalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
......@@ -349,10 +222,10 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
}
// clamp
vComp = Clamp<DstFormat>(vComp, comp);
vComp = Clamp<SIMD_T, DstFormat>(vComp, comp);
// normalize
vComp = Normalize<DstFormat>(vComp, comp);
vComp = Normalize<SIMD_T, DstFormat>(vComp, comp);
// pack
vComp = FormatTraits<DstFormat>::pack(comp, vComp);
......@@ -360,10 +233,24 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
// store
FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD16_WIDTH) / 8;
// is there a better way to get this from the SIMD traits?
// Lane count is derived from the size of the SIMD float type so the
// pointer advance follows SIMD_T rather than a fixed knob.
const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
pDst += (FormatTraits<DstFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief simdvector overload of StoreSOA. Forwards to the generic
///        StoreSOA template instantiated with SIMD256.
/// @param src - source data in SOA form (passed through unchanged)
/// @param pDst - destination pointer (passed through unchanged)
template <SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst)
{
StoreSOA<SIMD256, DstFormat>(src, pDst);
}
//////////////////////////////////////////////////////////////////////////
/// @brief simd16vector overload of StoreSOA. Forwards to the generic
///        StoreSOA template instantiated with SIMD512.
/// @param src - source data in SOA form (passed through unchanged)
/// @param pDst - destination pointer (passed through unchanged)
template <SWR_FORMAT DstFormat>
INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
{
StoreSOA<SIMD512, DstFormat>(src, pDst);
}
......@@ -36,17 +36,17 @@
template <uint32_t NumBits, bool Signed = false>
struct PackTraits
{
static const uint32_t MyNumBits = NumBits;
static const uint32_t MyNumBits = NumBits;
static simdscalar loadSOA(const uint8_t* pSrc) = delete;
static void storeSOA(uint8_t* pDst, simdscalar const& src) = delete;
static simdscalar unpack(simdscalar& in) = delete;
static simdscalar pack(simdscalar& in) = delete;
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete;
static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete;
static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete;
static simd16scalar unpack(simd16scalar& in) = delete;
static simd16scalar pack(simd16scalar& in) = delete;
#endif
};
//////////////////////////////////////////////////////////////////////////
......@@ -61,12 +61,11 @@ struct PackTraits<0, false>
static void storeSOA(uint8_t* pDst, simdscalar const& src) { return; }
static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); }
static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); }
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); }
static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); }
static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; }
static simd16scalar unpack(simd16scalar& in) { return _simd16_setzero_ps(); }
static simd16scalar pack(simd16scalar& in) { return _simd16_setzero_ps(); }
#endif
};
//////////////////////////////////////////////////////////////////////////
......@@ -131,7 +130,6 @@ struct PackTraits<8, false>
#error Unsupported vector width
#endif
}
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc)
{
......@@ -163,40 +161,31 @@ struct PackTraits<8, false>
static simd16scalar pack(simd16scalar& in)
{
// clang-format off
simd16scalari result = _simd16_setzero_si();
simdscalari inlo =
_simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
simdscalari permlo =
_simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
simdscalari permhi =
_simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
simdscalari pack = _simd_packus_epi32(
permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
const simdscalari zero = _simd_setzero_si();
permlo = _simd_permute2f128_si(
pack,
zero,
0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
permhi = _simd_permute2f128_si(
pack,
zero,
0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
pack = _simd_packus_epi16(permlo,
permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00
// 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
result = _simd16_insert_si(result, pack, 0);
return _simd16_castsi_ps(result);
// clang-format on
}
#endif
};
//////////////////////////////////////////////////////////////////////////
......@@ -262,7 +251,6 @@ struct PackTraits<8, true>
#error Unsupported vector width
#endif
}
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc)
{
......@@ -294,40 +282,31 @@ struct PackTraits<8, true>
static simd16scalar pack(simd16scalar& in)
{
// clang-format off
simd16scalari result = _simd16_setzero_si();