Commit 0bf1df2b authored by Alok Hota's avatar Alok Hota 🤖
Browse files

swr/rast: Remove deprecated 4x2 backend code



- Use 8x2 tiling by default
  - Remove associated macros
- Use SIMDLIB emulation for SIMD16 on SIMD8 hardware
- Remove code rot in Load/StoreTile
Reviewed-by: Bruce Cherniak's avatarBruce Cherniak <bruce.cherniak@intel.com>
parent e8bf4efc
......@@ -37,29 +37,11 @@
#include <algorithm>
template <SWR_FORMAT format>
void ClearRasterTile(uint8_t* pTileBuffer, simdvector& value)
{
auto lambda = [&](int32_t comp) {
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
};
const uint32_t numIter =
(KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
for (uint32_t i = 0; i < numIter; ++i)
{
UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
}
}
#if USE_8x2_TILE_BACKEND
template <SWR_FORMAT format>
void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
{
auto lambda = [&](int32_t comp) {
auto lambda = [&](int32_t comp)
{
FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
......@@ -74,7 +56,6 @@ void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
}
}
#endif
template <SWR_FORMAT format>
INLINE void ClearMacroTile(DRAW_CONTEXT* pDC,
HANDLE hWorkerPrivateData,
......@@ -86,37 +67,22 @@ INLINE void ClearMacroTile(DRAW_CONTEXT* pDC,
{
// convert clear color to hottile format
// clear color is in RGBA float/uint32
#if USE_8x2_TILE_BACKEND
simd16vector vClear;
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
{
simd16scalar vComp;
vComp = _simd16_load1_ps((const float*)&clear[comp]);
simd16scalar vComp = _simd16_load1_ps((const float*)&clear[comp]);
if (FormatTraits<format>::isNormalized(comp))
{
vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
}
vComp = FormatTraits<format>::pack(comp, vComp);
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
}
vComp = FormatTraits<format>::pack(comp, vComp);
#else
simdvector vClear;
for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
{
simdscalar vComp;
vComp = _simd_load1_ps((const float*)&clear[comp]);
if (FormatTraits<format>::isNormalized(comp))
{
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
}
vComp = FormatTraits<format>::pack(comp, vComp);
vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
}
#endif
uint32_t tileX, tileY;
MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
......
......@@ -894,87 +894,6 @@ static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
psContext.vJ.sample);
}
// Merge Output to 4x2 SIMD Tile Format
INLINE void OutputMerger4x2(DRAW_CONTEXT* pDC,
SWR_PS_CONTEXT& psContext,
uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS],
uint32_t sample,
const SWR_BLEND_STATE* pBlendState,
const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS],
simdscalar& coverageMask,
simdscalar const& depthPassMask,
uint32_t renderTargetMask,
uint32_t workerId)
{
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc
const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
simdvector blendOut;
DWORD rt = 0;
while (_BitScanForward(&rt, renderTargetMask))
{
renderTargetMask &= ~(1 << rt);
uint8_t* pColorSample = pColorBase[rt] + rasterTileColorOffset;
const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt];
SWR_BLEND_CONTEXT blendContext = {0};
{
// pfnBlendFunc may not update all channels. Initialize with PS output.
/// TODO: move this into the blend JIT.
blendOut = psContext.shaded[rt];
blendContext.pBlendState = pBlendState;
blendContext.src = &psContext.shaded[rt];
blendContext.src1 = &psContext.shaded[1];
blendContext.src0alpha = reinterpret_cast<simdvector*>(&psContext.shaded[0].w);
blendContext.sampleNum = sample;
blendContext.pDst = (simdvector*)&pColorSample;
blendContext.result = &blendOut;
blendContext.oMask = &psContext.oMask;
blendContext.pMask = reinterpret_cast<simdscalari*>(&coverageMask);
// Blend outputs and update coverage mask for alpha test
if (pfnBlendFunc[rt] != nullptr)
{
pfnBlendFunc[rt](&blendContext);
}
}
// Track alpha events
AR_EVENT(
AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
// final write mask
simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
"Unsupported hot tile format");
const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
// store with color mask
if (!pRTBlend->writeDisableRed)
{
_simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
}
if (!pRTBlend->writeDisableGreen)
{
_simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
}
if (!pRTBlend->writeDisableBlue)
{
_simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
}
if (!pRTBlend->writeDisableAlpha)
{
_simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
}
}
}
#if USE_8x2_TILE_BACKEND
// Merge Output to 8x2 SIMD16 Tile Format
INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC,
SWR_PS_CONTEXT& psContext,
......@@ -1076,8 +995,6 @@ INLINE void OutputMerger8x2(DRAW_CONTEXT* pDC,
}
}
#endif
template <typename T>
void BackendPixelRate(DRAW_CONTEXT* pDC,
uint32_t workerId,
......@@ -1137,9 +1054,9 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
#if USE_8x2_TILE_BACKEND
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
simdscalar activeLanes;
if (!(work.anyCoveredSamples & MASK))
{
......@@ -1264,7 +1181,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
}
// broadcast the results of the PS to all passing pixels
#if USE_8x2_TILE_BACKEND
OutputMerger8x2(pDC,
psContext,
psContext.pColorBuffer,
......@@ -1276,18 +1193,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
state.psState.renderTargetMask,
useAlternateOffset,
workerId);
#else // USE_8x2_TILE_BACKEND
OutputMerger4x2(pDC,
psContext,
psContext.pColorBuffer,
sample,
&state.blendState,
state.pfnBlendFunc,
coverageMask,
depthMask,
state.psState.renderTargetMask,
workerId);
#endif // USE_8x2_TILE_BACKEND
if (!state.psState.forceEarlyZ && !T::bForcedSampleCount)
{
......@@ -1320,7 +1226,6 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
}
work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
#if USE_8x2_TILE_BACKEND
if (useAlternateOffset)
{
DWORD rt;
......@@ -1332,16 +1237,7 @@ void BackendPixelRate(DRAW_CONTEXT* pDC,
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
#else
DWORD rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
#endif
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
......
......@@ -81,9 +81,9 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
#if USE_8x2_TILE_BACKEND
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
{
const uint64_t* pCoverageMask =
......@@ -252,7 +252,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
// output merger
RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
OutputMerger8x2(pDC,
psContext,
psContext.pColorBuffer,
......@@ -264,18 +264,6 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
state.psState.renderTargetMask,
useAlternateOffset,
workerId);
#else
OutputMerger4x2(pDC,
psContext,
psContext.pColorBuffer,
sample,
&state.blendState,
state.pfnBlendFunc,
vCoverageMask,
depthPassMask,
state.psState.renderTargetMask,
workerId);
#endif
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
......@@ -305,7 +293,6 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
#if USE_8x2_TILE_BACKEND
if (useAlternateOffset)
{
DWORD rt;
......@@ -317,16 +304,7 @@ void BackendSampleRate(DRAW_CONTEXT* pDC,
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
#else
DWORD rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
#endif
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
......
......@@ -82,9 +82,9 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
#if USE_8x2_TILE_BACKEND
const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
simdmask coverageMask = work.coverageMask[0] & MASK;
if (coverageMask)
......@@ -237,7 +237,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
// output merger
RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
OutputMerger8x2(pDC,
psContext,
psContext.pColorBuffer,
......@@ -249,19 +249,6 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
state.psState.renderTargetMask,
useAlternateOffset,
workerId);
#else
OutputMerger4x2(pDC,
psContext,
psContext.pColorBuffer,
0,
&state.blendState,
state.pfnBlendFunc,
vCoverageMask,
depthPassMask,
state.psState.renderTargetMask,
workerId,
workerId);
#endif
// do final depth write after all pixel kills
if (!state.psState.forceEarlyZ)
......@@ -288,7 +275,6 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
}
#if USE_8x2_TILE_BACKEND
if (useAlternateOffset)
{
DWORD rt;
......@@ -300,16 +286,7 @@ void BackendSingleSample(DRAW_CONTEXT* pDC,
(2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
}
#else
DWORD rt;
uint32_t rtMask = state.colorHottileEnable;
while (_BitScanForward(&rt, rtMask))
{
rtMask &= ~(1 << rt);
psContext.pColorBuffer[rt] +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
}
#endif
pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
pStencilBuffer +=
(KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
......
......@@ -39,7 +39,6 @@
///////////////////////////////////////////////////////////////////////////////
#define ENABLE_AVX512_SIMD16 1
#define USE_8x2_TILE_BACKEND 1
#define USE_SIMD16_FRONTEND 1
#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND
#define USE_SIMD16_VS 1 // requires USE_SIMD16_SHADERS
......
......@@ -244,7 +244,6 @@ HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext,
return &hotTile;
}
#if USE_8x2_TILE_BACKEND
void HotTileMgr::ClearColorHotTile(
const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
{
......@@ -330,91 +329,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
}
}
#else
void HotTileMgr::ClearColorHotTile(
const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
{
// Load clear color into SIMD register...
float* pClearData = (float*)(pHotTile->clearData);
simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
float* pfBuf = (float*)pHotTile->pBuffer;
uint32_t numSamples = pHotTile->numSamples;
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
{
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
{
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
si +=
SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) // SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
{
_simd_store_ps(pfBuf, valR);
pfBuf += KNOB_SIMD_WIDTH;
_simd_store_ps(pfBuf, valG);
pfBuf += KNOB_SIMD_WIDTH;
_simd_store_ps(pfBuf, valB);
pfBuf += KNOB_SIMD_WIDTH;
_simd_store_ps(pfBuf, valA);
pfBuf += KNOB_SIMD_WIDTH;
}
}
}
}
void HotTileMgr::ClearDepthHotTile(
const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
{
// Load clear color into SIMD register...
float* pClearData = (float*)(pHotTile->clearData);
simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
float* pfBuf = (float*)pHotTile->pBuffer;
uint32_t numSamples = pHotTile->numSamples;
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
{
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
{
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
{
_simd_store_ps(pfBuf, valZ);
pfBuf += KNOB_SIMD_WIDTH;
}
}
}
}
void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
{
// convert from F32 to U8.
uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
// broadcast 32x into __m256i...
simdscalari valS = _simd_set1_epi8(clearVal);
simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
uint32_t numSamples = pHotTile->numSamples;
for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
{
for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
{
// We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
{
_simd_store_si(pBuf, valS);
pBuf += 1;
}
}
}
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief InitializeHotTiles
/// for draw calls, we initialize the active hot tiles and perform deferred
......
......@@ -67,7 +67,6 @@ struct LoadRasterTile
uint32_t x, uint32_t y,
uint8_t* pDst)
{
#if USE_8x2_TILE_BACKEND
typedef SimdTile_16<DstFormat, SrcFormat> SimdT;
SimdT* pDstSimdTiles = (SimdT*)pDst;
......@@ -81,21 +80,6 @@ struct LoadRasterTile
uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
pSimdTile->SetSwizzledColor(simdOffset, srcColor);
#else
typedef SimdTile<DstFormat, SrcFormat> SimdT;
SimdT* pDstSimdTiles = (SimdT*)pDst;
// Compute which simd tile we're accessing within 8x8 tile.
// i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
SimdT* pSimdTile = &pDstSimdTiles[simdIndex];
uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
pSimdTile->SetSwizzledColor(simdOffset, srcColor);
#endif
}
//////////////////////////////////////////////////////////////////////////
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment