Commit fe208cb9 authored by Faith Ekstrand, committed by Lionel Landwerlin

anv: Import GRL



GRL, or Graphics Library for Ray-tracing, is a library we share with the
Windows drivers for doing BVH builds on the GPU.  It consists of a few
headers shared between CL and C code, a bunch of CL kernels, and some
GRL meta-kernels in their own format.

Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
parent 363ab3da
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
#pragma once
#include "shared.h"
#include "intrinsics.h"
#ifndef __OPENCL_VERSION__
#include "stdio.h"
#endif
GRL_NAMESPACE_BEGIN(GRL)
GRL_NAMESPACE_BEGIN(RTAS)
/* ====== QUAD ENCODING config ====== */
#define QUAD_GEOMID_BITS 27 // dxr limit is 2^24 geos... we have headroom
#define QUAD_PRIMID_DIFF_BITS (32 - QUAD_GEOMID_BITS)
#define QUAD_GEOMID_MASK ((1<<QUAD_GEOMID_BITS)-1)
#define QUAD_PRIMID_BITS 29 // dxr limit is 2^29 prims total within one blas
#define QUAD_PRIMID_MASK ((1<<QUAD_PRIMID_BITS)-1)
#define INSTANCE_ID_BITS 24
#define INSTANCE_ID_MASK ((1<<INSTANCE_ID_BITS)-1)
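//
// PrimRef metadata layout (packed into the .w lanes of the two float4s,
// per the setters and accessors below):
//   Quad:       lower.w = geomID        [26:0] | primID_delta [31:27]
//               upper.w = primID0       [28:0] | geomFlags    [31:29]
//   Procedural: lower.w = geomID
//               upper.w = primID        [28:0] | geomFlags    [31:29]
//   Instance:   lower.w = instanceIndex [23:0] | instanceMask [31:24]
//               upper.w = rootNodeOffset[30:0] | is_procedural[31]
//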
// JDB TODO: Make this a separate, dedicated structure.  Aliasing a float4 AABB as a PrimRef is needlessly obfuscating.
typedef struct AABB PrimRef;
GRL_INLINE void AABB_init(struct AABB *aabb)
{
aabb->lower = (float4)(INFINITY, INFINITY, INFINITY, 0);
aabb->upper = -(float4)(INFINITY, INFINITY, INFINITY, 0);
}
GRL_INLINE uint PRIMREF_geomID( PrimRef* aabb)
{
const uint v = as_uint(aabb->lower.w);
return v & QUAD_GEOMID_MASK;
}
GRL_INLINE uint PRIMREF_primID0( PrimRef* aabb)
{
return as_uint( aabb->upper.w ) & QUAD_PRIMID_MASK;
}
GRL_INLINE uint PRIMREF_primID1( PrimRef* aabb)
{
const uint v = as_uint(aabb->lower.w);
const uint primID0 = as_uint(aabb->upper.w) & QUAD_PRIMID_MASK;
const uint deltaID = v >> QUAD_GEOMID_BITS;
const uint primID1 = primID0 + deltaID;
return primID1;
}
GRL_INLINE uint PRIMREF_geomFlags( PrimRef* aabb )
{
const uint v = as_uint( aabb->upper.w );
return (v >> QUAD_PRIMID_BITS) ;
}
GRL_INLINE uint PRIMREF_instanceIndex( PrimRef* aabb )
{
return as_uint(aabb->lower.w) & INSTANCE_ID_MASK;
}
GRL_INLINE uchar PRIMREF_instanceMask( PrimRef* aabb )
{
return as_uint(aabb->lower.w) >> INSTANCE_ID_BITS;
}
GRL_INLINE void PRIMREF_setProceduralMetaData( PrimRef* primref, uint geomID, uint primID, uint geomFlags )
{
/* encode geomID, primID */
uint flags = (geomFlags << QUAD_PRIMID_BITS);
primref->lower.w = as_float( geomID );
primref->upper.w = as_float( primID | flags );
}
GRL_INLINE void PRIMREF_setQuadMetaData( PrimRef* primref, uint primID0, uint primID1, uint geomID, uint geomFlags )
{
const uint primID_diff = primID1 - primID0;
uint flags = geomFlags << QUAD_PRIMID_BITS;
primref->lower.w = as_float( geomID | (primID_diff << QUAD_GEOMID_BITS) );
primref->upper.w = as_float( (primID0 | flags) );
}
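// Illustrative sketch, not part of the original header: a round-trip check
// pairing PRIMREF_setQuadMetaData with the accessors above.  It assumes
// in-range inputs (geomID < 2^27, primID0 < 2^29, primID1 - primID0 < 2^5,
// geomFlags < 2^3); the helper name is hypothetical.
GRL_INLINE bool PRIMREF_quadMetaDataRoundTrips( uint geomID, uint primID0, uint primID1, uint geomFlags )
{
    PrimRef ref;
    AABB_init( &ref );
    PRIMREF_setQuadMetaData( &ref, primID0, primID1, geomID, geomFlags );
    return PRIMREF_geomID( &ref ) == geomID &&
           PRIMREF_primID0( &ref ) == primID0 &&
           PRIMREF_primID1( &ref ) == primID1 &&
           PRIMREF_geomFlags( &ref ) == geomFlags;
}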
GRL_INLINE void PRIMREF_setAABB( PrimRef* primref, float3 lower, float3 upper )
{
primref->lower.xyz = lower.xyz;
primref->upper.xyz = upper.xyz;
}
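// Instance PrimRefs reuse the same two .w lanes: lower.w packs the 24-bit
// instance index with the 8-bit instance mask, and upper.w packs the BVH
// root-node offset with an is-procedural flag in the top bit.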
GRL_INLINE PrimRef PRIMREF_set_instance( float3 lower, float3 upper, uint instanceIndex, uint instanceMask, uint rootOffset, bool is_procedural )
{
PrimRef new_ref;
new_ref.lower.xyz = lower;
new_ref.lower.w = as_float(instanceIndex | (instanceMask << 24));
new_ref.upper.xyz = upper;
new_ref.upper.w = as_float(rootOffset + (is_procedural? 0x80000000 : 0));
return new_ref;
}
GRL_INLINE bool PRIMREF_isProceduralInstance( PrimRef* primref )
{
return (as_uint(primref->upper.w) & 0x80000000) != 0;
}
GRL_INLINE uint PRIMREF_instanceRootNodeOffset(PrimRef* primref)
{
return (as_uint(primref->upper.w) & 0x7fffffff);
}
GRL_INLINE float3 PRIMREF_lower( PrimRef* primref )
{
return primref->lower.xyz;
}
GRL_INLINE float3 PRIMREF_upper( PrimRef* primref )
{
return primref->upper.xyz;
}
GRL_INLINE void AABB_extend(struct AABB *aabb, struct AABB *v)
{
aabb->lower = min(aabb->lower, v->lower);
aabb->upper = max(aabb->upper, v->upper);
}
GRL_INLINE void AABB_extend_point(struct AABB *aabb, const float4 p)
{
aabb->lower = min(aabb->lower, p);
aabb->upper = max(aabb->upper, p);
}
GRL_INLINE void AABB_extendlu(struct AABB *aabb, const float4 lower, const float4 upper)
{
aabb->lower = min(aabb->lower, lower);
aabb->upper = max(aabb->upper, upper);
}
GRL_INLINE struct AABB AABB_enlarge(struct AABB *aabb, const float v)
{
struct AABB box;
box.lower = aabb->lower - (float4)v;
box.upper = aabb->upper + (float4)v;
return box;
}
GRL_INLINE void AABB_intersect(struct AABB *aabb, struct AABB *v)
{
aabb->lower = max(aabb->lower, v->lower);
aabb->upper = min(aabb->upper, v->upper);
}
GRL_INLINE float4 AABB_size(struct AABB *aabb)
{
return aabb->upper - aabb->lower;
}
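// Note: returns lower + upper, i.e. twice the centroid; callers that need
// the true centroid must scale by 0.5.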
GRL_INLINE float4 AABB_centroid2(struct AABB *aabb)
{
return aabb->lower + aabb->upper;
}
GRL_INLINE float AABB_halfArea(struct AABB *aabb)
{
const float4 d = AABB_size(aabb);
return halfarea(d.xyz);
}
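// Half-area of the overlap between two boxes; 0 when they are disjoint.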
GRL_INLINE float AABB_intersecion_size(struct AABB* aabb, struct AABB* v)
{
struct AABB temp = *aabb;
AABB_intersect(&temp, v);
float4 len = AABB_size(&temp);
float ret = 0.0f;
if (len.x >= 0.0f && len.y >= 0.0f && len.z >= 0.0f) {
float3 d = { len.x, len.y, len.z };
ret = halfarea(d);
}
return ret;
}
GRL_INLINE bool AABB_subset(struct AABB* small, struct AABB* big)
{
const int4 b0 = small->lower >= big->lower;
const int4 b1 = small->upper <= big->upper;
const int4 b = b0 & b1;
return b.x & b.y & b.z;
}
GRL_INLINE struct AABB AABBfromAABB3f(const struct AABB3f box)
{
struct AABB box4d = {
{box.lower[0], box.lower[1], box.lower[2], 0.0f},
{box.upper[0], box.upper[1], box.upper[2], 0.0f}
};
return box4d;
}
GRL_INLINE struct AABB3f AABB3fFromAABB(const struct AABB box)
{
struct AABB3f box3d = {
{box.lower[0], box.lower[1], box.lower[2]},
{box.upper[0], box.upper[1], box.upper[2]}
};
return box3d;
}
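// Returns true when the box is malformed (inverted or non-finite bounds),
// i.e. "verify" reports an error rather than success.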
GRL_INLINE bool AABB_verify(struct AABB* aabb)
{
bool error = false;
if (aabb->lower.x > aabb->upper.x)
error = true;
if (aabb->lower.y > aabb->upper.y)
error = true;
if (aabb->lower.z > aabb->upper.z)
error = true;
if (!isfinite(aabb->lower.x))
error = true;
if (!isfinite(aabb->lower.y))
error = true;
if (!isfinite(aabb->lower.z))
error = true;
if (!isfinite(aabb->upper.x))
error = true;
if (!isfinite(aabb->upper.y))
error = true;
if (!isfinite(aabb->upper.z))
error = true;
return error;
}
GRL_INLINE void AABB_print(struct AABB* aabb)
{
printf("AABB {\n area = %f\n lower = %f\n upper = %f\n geomID = %i primID0 = %i primID1 = %i\n aabb->lower.w = %x aabb->upper.w = %x }\n",
AABB_halfArea(aabb),
aabb->lower.xyz,
aabb->upper.xyz,
PRIMREF_geomID(aabb),
PRIMREF_primID0(aabb),
PRIMREF_primID1(aabb),
as_uint(aabb->lower.w),
as_uint(aabb->upper.w));
}
#ifdef __OPENCL_VERSION__
GRL_INLINE PrimRef PrimRef_sub_group_shuffle(PrimRef* primRef, const uint slotID)
{
PrimRef shuffledPrimref;
shuffledPrimref.lower.x = intel_sub_group_shuffle(primRef->lower.x, slotID);
shuffledPrimref.lower.y = intel_sub_group_shuffle(primRef->lower.y, slotID);
shuffledPrimref.lower.z = intel_sub_group_shuffle(primRef->lower.z, slotID);
shuffledPrimref.lower.w = intel_sub_group_shuffle(primRef->lower.w, slotID);
shuffledPrimref.upper.x = intel_sub_group_shuffle(primRef->upper.x, slotID);
shuffledPrimref.upper.y = intel_sub_group_shuffle(primRef->upper.y, slotID);
shuffledPrimref.upper.z = intel_sub_group_shuffle(primRef->upper.z, slotID);
shuffledPrimref.upper.w = intel_sub_group_shuffle(primRef->upper.w, slotID);
return shuffledPrimref;
}
GRL_INLINE struct AABB AABB_sub_group_broadcast(struct AABB *aabb, const uint slotID)
{
struct AABB bounds;
bounds.lower.x = sub_group_broadcast(aabb->lower.x, slotID);
bounds.lower.y = sub_group_broadcast(aabb->lower.y, slotID);
bounds.lower.z = sub_group_broadcast(aabb->lower.z, slotID);
bounds.lower.w = 0;
bounds.upper.x = sub_group_broadcast(aabb->upper.x, slotID);
bounds.upper.y = sub_group_broadcast(aabb->upper.y, slotID);
bounds.upper.z = sub_group_broadcast(aabb->upper.z, slotID);
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE struct AABB AABB_sub_group_shuffle(struct AABB* aabb, const uint slotID)
{
struct AABB bounds;
bounds.lower.x = intel_sub_group_shuffle(aabb->lower.x, slotID);
bounds.lower.y = intel_sub_group_shuffle(aabb->lower.y, slotID);
bounds.lower.z = intel_sub_group_shuffle(aabb->lower.z, slotID);
bounds.lower.w = 0;
bounds.upper.x = intel_sub_group_shuffle(aabb->upper.x, slotID);
bounds.upper.y = intel_sub_group_shuffle(aabb->upper.y, slotID);
bounds.upper.z = intel_sub_group_shuffle(aabb->upper.z, slotID);
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE uint AABB_sub_group_shuffle_coordPerLane(struct AABB* aabb, const uint slotID)
{
float coordData[8] = {
sub_group_broadcast(aabb->lower.x, slotID),
sub_group_broadcast(aabb->lower.y, slotID),
sub_group_broadcast(aabb->lower.z, slotID),
sub_group_broadcast(aabb->lower.w, slotID),
sub_group_broadcast(aabb->upper.x, slotID),
sub_group_broadcast(aabb->upper.y, slotID),
sub_group_broadcast(aabb->upper.z, slotID),
sub_group_broadcast(aabb->upper.w, slotID) };
uint coordDataFiltered = 0; // lanes >= 8 carry no coordinate and return 0
const uint lane = get_sub_group_local_id();
if (lane < 8) coordDataFiltered = as_uint(coordData[lane]);
return coordDataFiltered;
}
GRL_INLINE struct AABB AABB_sub_group_reduce(struct AABB *aabb)
{
struct AABB bounds;
bounds.lower.x = sub_group_reduce_min(aabb->lower.x);
bounds.lower.y = sub_group_reduce_min(aabb->lower.y);
bounds.lower.z = sub_group_reduce_min(aabb->lower.z);
bounds.lower.w = 0;
bounds.upper.x = sub_group_reduce_max(aabb->upper.x);
bounds.upper.y = sub_group_reduce_max(aabb->upper.y);
bounds.upper.z = sub_group_reduce_max(aabb->upper.z);
bounds.upper.w = 0;
return bounds;
}
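// Shuffle-down reduction tree (offsets 4, 2, 1) that folds the first 8
// lanes into lane 0.  Per the _N6 name only 6 lanes carry real data, so
// callers presumably keep neutral values (+INF lower / -INF upper) in the
// remaining lanes so they cannot affect the min/max.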
GRL_INLINE struct AABB AABB_sub_group_reduce_N6( struct AABB* aabb )
{
float3 l = aabb->lower.xyz;
float3 u = aabb->upper.xyz;
l = min( l, intel_sub_group_shuffle_down( l, l, 4 ) );
l = min( l, intel_sub_group_shuffle_down( l, l, 2 ) );
l = min( l, intel_sub_group_shuffle_down( l, l, 1 ) );
u = max( u, intel_sub_group_shuffle_down( u, u, 4 ) );
u = max( u, intel_sub_group_shuffle_down( u, u, 2 ) );
u = max( u, intel_sub_group_shuffle_down( u, u, 1 ) );
struct AABB bounds;
bounds.lower.x = l.x;
bounds.lower.y = l.y;
bounds.lower.z = l.z;
bounds.lower.w = 0;
bounds.upper.x = u.x;
bounds.upper.y = u.y;
bounds.upper.z = u.z;
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE struct AABB AABB_work_group_reduce(struct AABB *aabb)
{
struct AABB bounds;
bounds.lower.x = work_group_reduce_min(aabb->lower.x);
bounds.lower.y = work_group_reduce_min(aabb->lower.y);
bounds.lower.z = work_group_reduce_min(aabb->lower.z);
bounds.upper.x = work_group_reduce_max(aabb->upper.x);
bounds.upper.y = work_group_reduce_max(aabb->upper.y);
bounds.upper.z = work_group_reduce_max(aabb->upper.z);
return bounds;
}
GRL_INLINE struct AABB AABB_sub_group_scan_exclusive_min_max(struct AABB *aabb)
{
struct AABB bounds;
bounds.lower.x = sub_group_scan_exclusive_min(aabb->lower.x);
bounds.lower.y = sub_group_scan_exclusive_min(aabb->lower.y);
bounds.lower.z = sub_group_scan_exclusive_min(aabb->lower.z);
bounds.lower.w = 0;
bounds.upper.x = sub_group_scan_exclusive_max(aabb->upper.x);
bounds.upper.y = sub_group_scan_exclusive_max(aabb->upper.y);
bounds.upper.z = sub_group_scan_exclusive_max(aabb->upper.z);
bounds.upper.w = 0;
return bounds;
}
GRL_INLINE struct AABB AABB_sub_group_scan_inclusive_min_max(struct AABB *aabb)
{
struct AABB bounds;
bounds.lower.x = sub_group_scan_inclusive_min(aabb->lower.x);
bounds.lower.y = sub_group_scan_inclusive_min(aabb->lower.y);
bounds.lower.z = sub_group_scan_inclusive_min(aabb->lower.z);
bounds.lower.w = 0;
bounds.upper.x = sub_group_scan_inclusive_max(aabb->upper.x);
bounds.upper.y = sub_group_scan_inclusive_max(aabb->upper.y);
bounds.upper.z = sub_group_scan_inclusive_max(aabb->upper.z);
bounds.upper.w = 0;
return bounds;
}
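// OpenCL C has no built-in atomic_min/atomic_max on float; the float
// overloads used below are expected to come from intrinsics.h.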
GRL_INLINE void AABB_global_atomic_merge(global struct AABB *global_aabb, struct AABB *aabb)
{
atomic_min((volatile __global float *)&global_aabb->lower + 0, aabb->lower.x);
atomic_min((volatile __global float *)&global_aabb->lower + 1, aabb->lower.y);
atomic_min((volatile __global float *)&global_aabb->lower + 2, aabb->lower.z);
atomic_max((volatile __global float *)&global_aabb->upper + 0, aabb->upper.x);
atomic_max((volatile __global float *)&global_aabb->upper + 1, aabb->upper.y);
atomic_max((volatile __global float *)&global_aabb->upper + 2, aabb->upper.z);
}
GRL_INLINE void AABB_global_atomic_merge_lu(global struct AABB* global_aabb, float3 lower, float3 upper )
{
atomic_min((volatile __global float*) & global_aabb->lower + 0, lower.x);
atomic_min((volatile __global float*) & global_aabb->lower + 1, lower.y);
atomic_min((volatile __global float*) & global_aabb->lower + 2, lower.z);
atomic_max((volatile __global float*) & global_aabb->upper + 0, upper.x);
atomic_max((volatile __global float*) & global_aabb->upper + 1, upper.y);
atomic_max((volatile __global float*) & global_aabb->upper + 2, upper.z);
}
GRL_INLINE void AABB_global_atomic_merge_sub_group_lu(uniform global struct AABB* aabb, float3 lower, float3 upper)
{
uint lane = get_sub_group_local_id();
float l[3];
l[0] = sub_group_reduce_min(lower.x);
l[1] = sub_group_reduce_min(lower.y);
l[2] = sub_group_reduce_min(lower.z);
float u[3];
u[0] = sub_group_reduce_max(upper.x);
u[1] = sub_group_reduce_max(upper.y);
u[2] = sub_group_reduce_max(upper.z);
if (lane < 3)
{
atomic_min((global float*)&aabb->lower + lane, l[lane]);
atomic_max((global float*)&aabb->upper + lane, u[lane]);
}
}
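// The pre-checks below skip atomics that cannot change the result.  This is
// safe because the shared bounds only ever grow (lower decreases, upper
// increases), so a stale non-atomic read can only cause a redundant atomic,
// never a missed update.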
GRL_INLINE void AABB_local_atomic_merge(local struct AABB *aabb, const float4 lower, const float4 upper)
{
if (lower.x < aabb->lower.x)
atomic_min((local float *)&aabb->lower + 0, lower.x);
if (lower.y < aabb->lower.y)
atomic_min((local float *)&aabb->lower + 1, lower.y);
if (lower.z < aabb->lower.z)
atomic_min((local float *)&aabb->lower + 2, lower.z);
if (upper.x > aabb->upper.x)
atomic_max((local float *)&aabb->upper + 0, upper.x);
if (upper.y > aabb->upper.y)
atomic_max((local float *)&aabb->upper + 1, upper.y);
if (upper.z > aabb->upper.z)
atomic_max((local float *)&aabb->upper + 2, upper.z);
}
#endif
GRL_NAMESPACE_END(RTAS)
GRL_NAMESPACE_END(GRL)
//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//
module atomic_update;
kernel_module atomic_update ("atomic_update.cl")
{
links lsc_intrinsics;
kernel init_refit_scratch < kernelFunction = "init_refit_scratch" >;
kernel traverse_aabbs_quad < kernelFunction = "traverse_aabbs_quad" >;
kernel write_inner_nodes < kernelFunction = "write_inner_nodes" >;
kernel build_fatleaf_table < kernelFunction = "build_fatleaf_table" >;
kernel build_innernode_table < kernelFunction = "build_innernode_table" >;
kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >;
kernel build_fatleaf_table_new_update < kernelFunction = "build_fatleaf_table_new_update" >;
kernel fixup_quad_table < kernelFunction = "fixup_quad_table" >;
kernel traverse_aabbs_new_update < kernelFunction = "traverse_aabbs_new_update" >;
kernel traverse_aabbs_new_update_single_geo < kernelFunction = "traverse_aabbs_new_update_single_geo" >;
}
import struct MKBuilderState "structs.grl";
// this metakernel only initializes registers for use in a batching loop by "init_refit_scratch"
metakernel init_refit_scratch_metakernel_registers()
{
REG0.hi = 0;
REG1 = 3;
REG2 = 63;
REG3 = 4;
REG4 = 2;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
}
metakernel init_refit_scratch( qword bvh_base, qword scratch)//, dword max_inner_nodes )
{
REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
define C_3 REG1;
define C_63 REG2;
define C_4 REG3;
define C_2 REG4;
REG0 = REG0 - C_3; // nodedataCurr - fixed offset
REG0 = REG0 + C_63; // + 63
REG0 = REG0 >> C_4; // >> 4
REG0 = REG0 >> C_2; // >> 2 == >> 6 == /64
DISPATCHDIM_X = REG0.lo;
dispatch_indirect init_refit_scratch//( (max_inner_nodes+63)/64, 1, 1 )
args(bvh_base,scratch);
}
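// Worked example of the arithmetic above (illustrative; the dword at
// bvh_base + 12 is assumed to hold the current node-data offset in 64-byte
// units): with a loaded value of 131 and the fixed offset of 3,
// (131 - 3 + 63) >> 6 == 191 >> 6 == 2, i.e. the two shifts perform a
// round-up divide by 64, matching the commented-out
// ( (max_inner_nodes+63)/64, 1, 1 ) dispatch size.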
metakernel build_node_tables( qword bvh_base )
{
REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
REG1 = 2;
REG2 = 63;
REG3 = 4;
REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
REG0 = REG0 - REG4; // nodedataCurr - fixed offset
REG0 = REG0 + REG2; // + 63
REG0 = REG0 >> REG3; // >> 4
REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect build_fatleaf_table//( (max_inner_nodes+63)/64, 1, 1 )
args(bvh_base);
dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
args(bvh_base);
}
metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base )
{
REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
REG1 = 2;
REG2 = 63;
REG3 = 4;
REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
REG0 = REG0 - REG4; // nodedataCurr - fixed offset
REG0 = REG0 + REG2; // + 63
REG0 = REG0 >> REG3; // >> 4
REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
DISPATCHDIM_X = REG0.lo;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
dispatch_indirect build_fatleaf_table_new_update//( (max_inner_nodes+63)/64, 1, 1 )
args(state.build_globals, bvh_base);
dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
args(bvh_base);
}
metakernel fixup_quad_table( qword bvh_base )
{
dispatch fixup_quad_table(2,1,1)
args(bvh_base);
}
// this metakernel only initializes registers for use in a batching loop by "traverse_aabbs_quad" and "write_inner_nodes"
metakernel init_traverse_aabbs_quad_and_write_inner_nodes()
{
REG0.hi = 0;
REG1 = 1;
REG2 = 31;
REG3 = 4;
REG4 = 2;
REG5 = 7;
REG6 = 255;
DISPATCHDIM_Y = 1;
DISPATCHDIM_Z = 1;
}
metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos)//, dword max_inner_nodes )
{
REG0.lo = load_dword( bvh_base + 64 ); // TODO: DOn't hardcode!
define C_1 REG1;
define C_31 REG2;
define C_4 REG3;
REG0 = REG0 + C_31; // + 31
REG0 = REG0 >> C_4; // >> 4
REG0 = REG0 >> C_1; // >> 1 == >> 5 == /32
DISPATCHDIM_X = REG0.lo;