Initial commit.

2024-04-23 10:14:24 +02:00 · 2024-04-23 10:14:24 +02:00 · d3bb49b3f5
commit d3bb49b3f5
1073 changed files with 484757 additions and 0 deletions
--- a/Framework/external/embree/kernels/rthwif/CMakeLists.txt
+++ b/Framework/external/embree/kernels/rthwif/CMakeLists.txt
@ -0,0 +1,182 @@
+## Copyright 2009-2021 Intel Corporation
+## SPDX-License-Identifier: Apache-2.0
+
+# NOTE(review): include(FetchContent) further down in this file needs a newer
+# CMake than the minimum declared here -- confirm before relying on 3.1.
+cmake_minimum_required(VERSION 3.1.0)
+
+project(ze_raytracing)
+
+include(CTest)
+
+# Version number of this component, assembled as MAJOR.MINOR.PATCH.
+set(RTHWIF_VERSION_MAJOR 4)
+set(RTHWIF_VERSION_MINOR 1)
+set(RTHWIF_VERSION_PATCH 0)
+set(RTHWIF_VERSION "${RTHWIF_VERSION_MAJOR}.${RTHWIF_VERSION_MINOR}.${RTHWIF_VERSION_PATCH}")
+
+# C++ language level used for all targets configured below.
+set(CMAKE_CXX_STANDARD 17)
+
+if (NOT DEFINED EMBREE_VERSION_MAJOR)
+
+  # EMBREE_VERSION_MAJOR is not defined: configure this directory as a
+  # standalone project.
+  set(RTHWIF_STANDALONE ON)
+  set(RTHWIF_NAME ze_raytracing)
+  add_definitions("-DRTHWIF_STANDALONE")
+
+  set(EMBREE_CMAKEEXPORT_DIR "cmake")
+
+  option(EMBREE_SYCL_IMPLICIT_DISPATCH_GLOBALS "Using L0 allocated Dispatch Globals" ON)
+
+  # standalone builds produce a shared library and use a statically built TBB
+  set(EMBREE_RTHWIF_STATIC_LIB OFF)
+  set(EMBREE_BUILDER_TBB_STATIC ON)
+
+  # put every build artifact into the binary directory of this folder and
+  # make the cmake modules next to this file discoverable via include()
+  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+  set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+  set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+  set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" ${CMAKE_MODULE_PATH})
+
+  # generate kernels/config.h from its template
+  configure_file(
+    "${PROJECT_SOURCE_DIR}/../../kernels/config.h.in"
+    "${PROJECT_SOURCE_DIR}/../../kernels/config.h"
+    )
+
+  # generate include/embree4/rtcore_config.h from its template; the instance
+  # level count is set right before so that it is visible to this template
+  set(EMBREE_MAX_INSTANCE_LEVEL_COUNT 1)
+  configure_file(
+    "${PROJECT_SOURCE_DIR}/../../kernels/rtcore_config.h.in"
+    "${PROJECT_SOURCE_DIR}/../../include/embree4/rtcore_config.h"
+    )
+
+  if (NOT WIN32)
+    # generate position independent code suitable for shared libraries
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+  endif()
+
+  add_subdirectory(../../common/sys sys)
+  add_subdirectory(../../common/simd simd)
+
+  # Treat the include folders located next to the compiler binary as system
+  # headers to disable warnings from the SYCL headers (FIXME: why required?)
+  get_filename_component(SYCL_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isystem \"${SYCL_COMPILER_DIR}/../include/sycl\" -isystem \"${SYCL_COMPILER_DIR}/../include/\"")
+
+  find_package(TBB 2020)
+
+else()
+
+  # EMBREE_VERSION_MAJOR is defined: only provide the user configurable
+  # options and the output name used in that case.
+  set(RTHWIF_NAME embree_rthwif)
+  option(EMBREE_RTHWIF_STATIC_LIB "Build RTHWIF as a static library." ON)
+  option(EMBREE_BUILDER_TBB_STATIC "Use a staticaly compiled TBB version for the Embree builder for GPU." OFF)
+
+endif()
+
+# Define EMBREE_SYCL_ALLOC_DISPATCH_GLOBALS only when the validation API is
+# enabled and implicit dispatch globals are switched off.
+if (EMBREE_SYCL_RT_VALIDATION_API AND NOT EMBREE_SYCL_IMPLICIT_DISPATCH_GLOBALS)
+  add_definitions("-DEMBREE_SYCL_ALLOC_DISPATCH_GLOBALS")
+endif()
+
+# disables the "use of bitwise '&' with boolean operands" warning, both for
+# the regular and for the SYCL compile flags
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-bitwise-instead-of-logical")
+set(CMAKE_CXX_FLAGS_SYCL "${CMAKE_CXX_FLAGS_SYCL} -Wno-bitwise-instead-of-logical")
+
+# library type passed to add_library() for the embree_rthwif target
+set(RTHWIF_LIB_TYPE SHARED)
+if (EMBREE_RTHWIF_STATIC_LIB)
+  set(RTHWIF_LIB_TYPE STATIC)
+endif()
+
+# The "tasking" target is the default link target; it has all TBB related
+# information we need when TASKING_TBB is used.
+set(TBB_TARGET tasking)
+
+if (EMBREE_BUILDER_TBB_STATIC OR NOT TASKING_TBB)
+
+  ####################################################################
+  # fetch TBB and build static version of it
+  set(TBB_TARGET tbb)
+  add_definitions("-D_CRT_SECURE_NO_WARNINGS")
+
+  # configuration handed to the TBB build added below
+  option(TBB_STRICT "Treat compiler warnings as errors" OFF)
+  option(TBB_TEST "Enable testing" OFF)
+  option(TBBMALLOC_BUILD "Enable tbbmalloc build" OFF)
+  set(TBB_DIR OFF)
+  set(BUILD_SHARED_LIBS OFF)
+
+  include(FetchContent)
+
+  set(FETCHCONTENT_QUIET OFF)
+
+  # the repository location can be set externally, otherwise use the default
+  if (NOT EMBREE_RTHWIF_TBB_GIT_REPOSITORY)
+    set(EMBREE_RTHWIF_TBB_GIT_REPOSITORY "https://github.com/oneapi-src/oneTBB.git")
+  endif()
+
+  FetchContent_Declare(
+    tbb_static
+    GIT_REPOSITORY ${EMBREE_RTHWIF_TBB_GIT_REPOSITORY}
+    GIT_TAG v2021.6.0
+  )
+
+  FetchContent_GetProperties(tbb_static)
+  if (NOT tbb_static_POPULATED)
+    FetchContent_Populate(tbb_static)
+    # We want to build tbb_static to link it into embree_rthwif, but don't want to
+    # install it as part of the Embree install targets.
+    add_subdirectory(${tbb_static_SOURCE_DIR} ${tbb_static_BINARY_DIR} EXCLUDE_FROM_ALL)
+  endif()
+
+  # hide the FetchContent and TBB cache entries from the default cache view
+  mark_as_advanced(
+    FETCHCONTENT_BASE_DIR
+    FETCHCONTENT_FULLY_DISCONNECTED
+    FETCHCONTENT_QUIET
+    FETCHCONTENT_SOURCE_DIR_TBB_STATIC
+    FETCHCONTENT_UPDATES_DISCONNECTED
+    FETCHCONTENT_UPDATES_DISCONNECTED_TBB_STATIC
+    TBB4PY_BUILD
+    TBBMALLOC_BUILD
+    TBB_BUILD
+    TBB_CPF
+    TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH
+    TBB_ENABLE_IPO
+    TBB_EXAMPLES
+    TBB_FIND_PACKAGE
+    TBB_INSTALL_VARS
+    TBB_NO_APPCONTAINER
+    TBB_SANITIZE
+    TBB_STRICT
+    TBB_TEST
+    TBB_TEST_SPEC
+    TBB_VALGRIND_MEMCHECK
+    TBB_WINDOWS_DRIVER
+  )
+
+  add_definitions(-DTASKING_TBB)
+  ####################################################################
+endif()
+
+# Packaging is only configured for the standalone build: first the package
+# description found through CMAKE_MODULE_PATH, then CPack itself.
+if (RTHWIF_STANDALONE)
+  include(package_ze_raytracing)
+  include(CPack)
+endif()
+
+# Static helper library that is built when the validation API is enabled.
+if (EMBREE_SYCL_RT_VALIDATION_API)
+  add_library(embree_rthwif_sycl STATIC rttrace/rttrace_validation.cpp)
+
+  # Compile with the SYCL flag set. The value is quoted so that it is always
+  # passed as exactly one property value.
+  set_target_properties(embree_rthwif_sycl PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS_SYCL}")
+
+  # EMBREE_SYCL_SUPPORT is added as a compile definition rather than through
+  # COMPILE_FLAGS: appending it to COMPILE_FLAGS before the property gets
+  # assigned would drop the definition again.
+  target_compile_definitions(embree_rthwif_sycl PRIVATE EMBREE_SYCL_SUPPORT)
+
+  install(TARGETS embree_rthwif_sycl EXPORT embree_rthwif_sycl-targets ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT lib)
+  install(EXPORT embree_rthwif_sycl-targets DESTINATION "${EMBREE_CMAKEEXPORT_DIR}" COMPONENT devel)
+
+  # name of the target that the embree_rthwif library links against publicly
+  set(EMBREE_RTHWIF_SYCL embree_rthwif_sycl)
+endif()
+
+# The builder library is only built when EMBREE_SYCL_L0_RTAS_BUILDER is off.
+if (NOT EMBREE_SYCL_L0_RTAS_BUILDER)
+
+  add_library(embree_rthwif ${RTHWIF_LIB_TYPE}
+    rtbuild/rtbuild.cpp
+    rtbuild/qbvh6.cpp
+    rtbuild/statistics.cpp)
+  target_link_libraries(embree_rthwif
+    PUBLIC ${EMBREE_RTHWIF_SYCL}
+    PRIVATE ${TBB_TARGET} simd sys)
+  set_target_properties(embree_rthwif PROPERTIES OUTPUT_NAME ${RTHWIF_NAME})
+
+  # consumers of a static build get EMBREE_RTHWIF_STATIC_LIB defined as well
+  if (EMBREE_RTHWIF_STATIC_LIB)
+    target_compile_definitions(embree_rthwif PUBLIC EMBREE_RTHWIF_STATIC_LIB)
+  endif()
+
+  target_compile_definitions(embree_rthwif PUBLIC EMBREE_SYCL_SUPPORT)
+
+  # install the library unless it is a static library inside a non-static Embree
+  if (EMBREE_STATIC_LIB OR NOT EMBREE_RTHWIF_STATIC_LIB)
+    install(TARGETS embree_rthwif EXPORT ${RTHWIF_NAME}-targets
+      LIBRARY NAMELINK_SKIP DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT lib
+      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT examples
+      ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT devel)
+    install(EXPORT ${RTHWIF_NAME}-targets DESTINATION "${EMBREE_CMAKEEXPORT_DIR}" COMPONENT devel)
+  endif()
+
+endif()
+
+add_subdirectory(testing)
+
+
+
--- a/Framework/external/embree/kernels/rthwif/package_ze_raytracing.cmake
+++ b/Framework/external/embree/kernels/rthwif/package_ze_raytracing.cmake
@ -0,0 +1,60 @@
+## Copyright 2009-2021 Intel Corporation
+## SPDX-License-Identifier: Apache-2.0
+
+include(GNUInstallDirs)
+
+##############################################################
+# Install Documentation
+##############################################################
+
+# license, changelog and third party notices all go into the "doc" folder
+# of the "lib" component
+install(FILES
+  "${PROJECT_SOURCE_DIR}/../../LICENSE.txt"
+  "${PROJECT_SOURCE_DIR}/../../CHANGELOG.md"
+  "${PROJECT_SOURCE_DIR}/../../third-party-programs.txt"
+  "${PROJECT_SOURCE_DIR}/../../third-party-programs-TBB.txt"
+  "${PROJECT_SOURCE_DIR}/../../third-party-programs-OIDN.txt"
+  "${PROJECT_SOURCE_DIR}/../../third-party-programs-DPCPP.txt"
+  "${PROJECT_SOURCE_DIR}/../../third-party-programs-oneAPI-DPCPP.txt"
+  DESTINATION doc COMPONENT lib)
+
+##############################################################
+# CPack specific stuff
+##############################################################
+
+# General package description.
+set(CPACK_PACKAGE_NAME "L0 Ray Tracing Build API")
+set(CPACK_PACKAGE_FILE_NAME "ze_raytracing-${RTHWIF_VERSION}")
+set(CPACK_STRIP_FILES TRUE)
+
+# The package version is taken from the RTHWIF_VERSION* variables, the same
+# version that is already part of the package file name above. The
+# EMBREE_VERSION* variables are not usable here: the standalone build that
+# includes this file is the configuration without EMBREE_VERSION_MAJOR, which
+# would leave all version fields empty.
+set(CPACK_PACKAGE_VERSION_MAJOR "${RTHWIF_VERSION_MAJOR}")
+set(CPACK_PACKAGE_VERSION_MINOR "${RTHWIF_VERSION_MINOR}")
+set(CPACK_PACKAGE_VERSION_PATCH "${RTHWIF_VERSION_PATCH}")
+set(CPACK_PACKAGE_VERSION "${RTHWIF_VERSION}")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Implements acceleration structure build for L0 ray tracing extension.")
+set(CPACK_PACKAGE_VENDOR "Intel Corporation")
+set(CPACK_PACKAGE_CONTACT embree_support@intel.com)
+
+# create one single package instead of one package per component
+set(CPACK_MONOLITHIC_INSTALL 1)
+
+# display names and descriptions of the install components
+set(CPACK_COMPONENT_LIB_DISPLAY_NAME "Library")
+set(CPACK_COMPONENT_LIB_DESCRIPTION "Library")
+
+set(CPACK_COMPONENT_DEVEL_DISPLAY_NAME "Development")
+set(CPACK_COMPONENT_DEVEL_DESCRIPTION "Development")
+
+set(CPACK_COMPONENT_EXAMPLES_DISPLAY_NAME "Examples")
+set(CPACK_COMPONENT_EXAMPLES_DESCRIPTION "Examples")
+
+# Select the archive format and append a platform suffix to the file name.
+if (WIN32)
+
+  # Windows specific settings
+  set(CPACK_GENERATOR ZIP)
+  set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_FILE_NAME}.x64.windows")
+
+elseif (APPLE)
+
+  # MacOSX specific settings
+  set(CPACK_GENERATOR ZIP)
+  set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_FILE_NAME}.x86_64.macosx")
+
+else()
+
+  # Linux specific settings
+  set(CPACK_GENERATOR TGZ)
+  set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_FILE_NAME}.x86_64.linux")
+
+endif()
--- a/Framework/external/embree/kernels/rthwif/rtbuild/leaf.h
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/leaf.h
@ -0,0 +1,629 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(ZE_RAYTRACING)
+#include "sys/sysinfo.h"
+#include "sys/vector.h"
+#include "math/vec2.h"
+#include "math/vec3.h"
+#include "math/bbox.h"
+#include "math/affinespace.h"
+#else
+#include "../../../common/sys/sysinfo.h"
+#include "../../../common/sys/vector.h"
+#include "../../../common/math/vec2.h"
+#include "../../../common/math/vec3.h"
+#include "../../../common/math/bbox.h"
+#include "../../../common/math/lbbox.h"
+#include "../../../common/math/affinespace.h"
+#endif
+
+#include "node_type.h"
+
+#include <map>
+#include <bitset>
+
+namespace embree
+{
+  /*
+
+    Internal representation for GeometryFlags.
+
+  */
+  
+#undef OPAQUE     // Windows defines OPAQUE in gdi.h, which would clash with the enumerator below
+  enum class GeometryFlags : uint32_t
+  {
+    NONE   = 0,   // no flag set
+    OPAQUE = 1    // opaque flag
+  };
+
+  /* Tests whether the two flag values have at least one bit in common. */
+  inline bool operator& (GeometryFlags a, GeometryFlags b)
+  {
+    const int common = int(a) & int(b);
+    return common != 0;
+  }
+
+  /* Output operator for GeometryFlags. Prints "NONE" if no flag is set,
+   * otherwise the name of each set flag followed by a space. Nothing is
+   * printed when compiling for the SYCL device only. */
+  inline std::ostream& operator<<(std::ostream& cout, const GeometryFlags& gflags)
+  {
+#if !defined(__SYCL_DEVICE_ONLY__)
+    if (gflags == GeometryFlags::NONE) {
+      cout << "NONE";
+    }
+    else if (gflags & GeometryFlags::OPAQUE) {
+      cout << "OPAQUE ";
+    }
+#endif
+    return cout;
+  }
+
+  /*
+
+    This structure is a header for each leaf type. Only the
+    InstanceLeaf has a slightly different header.
+
+    All primitives inside a leaf are of the same geometry, thus have
+    the same geometry index (geomIndex), the same shader index
+    (shaderIndex), the same geometry mask (geomMask), and the same
+    geometry flags (geomFlags).
+
+    The shaderIndex is used to calculate the shader record to
+    invoke. This is an extension to DXR where the geomIndex is used
+    for that purpose. For DXR we can always set the shaderIndex to be
+    equal to the geomIndex.
+
+   */
+  
+  struct PrimLeafDesc
+  {
+    /* Largest geometry index that fits into the 29 bit geomIndex field. */
+    static const uint32_t MAX_GEOM_INDEX = 0x1FFFFFFF;
+
+    /* Largest shader index that fits into the 24 bit shaderIndex field. */
+    static const uint32_t MAX_SHADER_INDEX = 0xFFFFFF;
+
+    enum Type : uint32_t
+    {
+      TYPE_NONE = 0,
+
+      /* For a node type of NODE_TYPE_PROCEDURAL we support enabling
+       * and disabling the opaque/non_opaque culling. */
+
+      TYPE_OPACITY_CULLING_ENABLED = 0,
+      TYPE_OPACITY_CULLING_DISABLED = 1
+    };
+
+    /* Leaves all fields uninitialized. */
+    PrimLeafDesc() {}
+
+    /* Creates a leaf header from its components. Throws a
+     * std::runtime_error if the shader index or the geometry index
+     * does not fit into the corresponding bit field. */
+    PrimLeafDesc(uint32_t shaderIndex, uint32_t geomIndex, GeometryFlags gflags, uint32_t geomMask, Type type = TYPE_NONE)
+    : shaderIndex(shaderIndex), geomMask(geomMask), geomIndex(geomIndex), type(type), geomFlags((uint32_t)gflags)
+    {
+      /* the checks use the function arguments, which still hold the
+       * untruncated values */
+      if (shaderIndex > MAX_SHADER_INDEX)
+        throw std::runtime_error("too large shader ID");
+
+      if (geomIndex > MAX_GEOM_INDEX)
+        throw std::runtime_error("too large geometry ID");
+    }
+
+    /* Compares two PrimLeafDesc's for equality. Only the geometry index
+     * decides the result; for equal geometry indices all other fields
+     * are asserted to match too. */
+    friend bool operator ==(const PrimLeafDesc& a, const PrimLeafDesc& b)
+    {
+      if (a.geomIndex != b.geomIndex) return false;
+      assert(a.shaderIndex == b.shaderIndex);
+      assert(a.geomMask == b.geomMask);
+      assert(a.type == b.type);
+      assert(a.geomFlags == b.geomFlags);
+      return true;
+    }
+
+    /* negation of the equality comparison */
+    friend bool operator !=(const PrimLeafDesc& a, const PrimLeafDesc& b) {
+      return !(a == b);
+    }
+
+    /* Prints the header using the indentation tab(depth). Prints nothing
+     * when compiling for the SYCL device only. */
+    void print(std::ostream& cout, uint32_t depth) const
+    {
+#if !defined(__SYCL_DEVICE_ONLY__)
+      cout << tab(depth) << "PrimLeafDesc {" << std::endl;
+      cout << tab(depth) << "  shaderIndex = " << shaderIndex << std::endl;
+      cout << tab(depth) << "  geomMask = " << std::bitset<8>(geomMask) << std::endl;
+      cout << tab(depth) << "  geomFlags = " << getGeomFlags() << std::endl;
+      cout << tab(depth) << "  geomIndex = " << geomIndex << std::endl;
+      cout << tab(depth) << "}";
+#endif
+    }
+
+    /* output operator for PrimLeafDesc */
+    friend inline std::ostream& operator<<(std::ostream& cout, const PrimLeafDesc& desc) {
+      desc.print(cout,0); return cout;
+    }
+
+    /* Checks if opaque culling is enabled. */
+    bool opaqueCullingEnabled() const {
+      return type == TYPE_OPACITY_CULLING_ENABLED;
+    }
+
+    /* procedural instances store some valid shader index, thus any
+     * value different from MAX_SHADER_INDEX */
+    bool isProceduralInstance() const {
+      return shaderIndex != MAX_SHADER_INDEX;
+    }
+
+    /* returns geometry flags */
+    GeometryFlags getGeomFlags() const {
+      return (GeometryFlags) geomFlags;
+    }
+
+  public:
+    uint32_t shaderIndex : 24;    // shader index used for shader record calculations
+    uint32_t geomMask    : 8;     // geometry mask used for ray masking
+
+    uint32_t geomIndex      : 29; // the geometry index specifies the n'th geometry of the scene
+    /*Type*/ uint32_t type  : 1;  // enable/disable culling for procedurals and instances
+    /*GeometryFlags*/ uint32_t geomFlags : 2;  // geometry flags of this geometry
+  };
+
+  /*
+
+    The QuadLeaf structure stores a single quad. A quad is a triangle
+    pair with a shared edge. The first triangle has vertices v0,v1,v2,
+    while the second triangle has vertices v[j0],v[j1],v[j2], thus the
+    second triangle uses local vertex indices.
+
+   */
+  
+  struct QuadLeaf
+  {
+    /* Largest distance between the two primitive indices that fits
+     * into the 5 bit primIndex1Delta field. */
+    static const uint32_t MAX_PRIM_INDEX_DELTA = 0x1F;
+
+    /* Leaves all fields uninitialized. */
+    QuadLeaf() {}
+
+    /* Creates a quad leaf from four vertices, the local vertex
+     * indices (j0,j1,j2) of the second triangle, the leaf header
+     * components, and the primitive indices of both triangles. */
+    QuadLeaf (Vec3f v0, Vec3f v1, Vec3f v2, Vec3f v3,
+              uint8_t j0, uint8_t j1, uint8_t j2,
+              uint32_t shaderIndex, uint32_t geomIndex, uint32_t primIndex0, uint32_t primIndex1,
+              GeometryFlags gflags, uint32_t geomMask, bool last)
+
+      : leafDesc(shaderIndex,geomIndex,gflags,geomMask),
+        primIndex0(primIndex0),
+        primIndex1Delta(primIndex1-primIndex0), pad1(0),
+        j0(j0),j1(j1),j2(j2),last(last),pad(0),
+        v0(v0), v1(v1), v2(v2), v3(v3)
+    {
+      /* There are some constraints on the primitive indices. The
+       * second primitive index always has to be the largest and the
+       * distance between them can be at most MAX_PRIM_INDEX_DELTA as
+       * we use 5 bits to encode that difference. A larger distance
+       * would get truncated when stored in primIndex1Delta. */
+      assert(primIndex0 <= primIndex1 && primIndex1 - primIndex0 <= MAX_PRIM_INDEX_DELTA);
+    }
+
+    /* returns the i'th vertex */
+    __forceinline Vec3f vertex(size_t i) const {
+      assert(i < 4); return (&v0)[i];
+    }
+
+    /* Checks if the specified triangle is the last inside a leaf
+     * list. */
+    bool isLast(uint32_t i = 1) const
+    {
+      assert(i<2);
+      if (i == 0) return false; // the first triangle is never the last
+      else return last;         // the last bit tags the second triangle to be last
+    }
+
+    /* Checks if the second triangle exists. An all zero index triple
+     * marks a missing second triangle. */
+    bool valid2() const {
+      return !(j0 == 0 && j1 == 0 && j2 == 0);
+    }
+
+    /* Calculates the number of stored triangles. */
+    size_t size() const {
+      return 1 + valid2();
+    }
+
+    /* Calculates the effectively used bytes. If we store only one
+     * triangle we waste the storage of one vertex. */
+    size_t usedBytes() const
+    {
+      if (valid2()) return sizeof(QuadLeaf);
+      else          return sizeof(QuadLeaf)-sizeof(Vec3f);
+    }
+
+    /* Calculates the delta to add to primIndex0 to get the primitive
+     * index of the i'th triangle. */
+    uint32_t primIndexDelta(uint32_t i) const
+    {
+      assert(i<2);
+      return i*primIndex1Delta;
+    }
+
+    /* Calculates the primitive index of the i'th triangle. */
+    uint32_t primIndex(uint32_t i) const
+    {
+      assert(i<2);
+      return primIndex0 + primIndexDelta(i);
+    }
+
+    /* Quad mode is a special mode where the uv's over the quad are
+     * defined over the entire range [0,1]x[0,1]. It is active when
+     * both triangles have the same primitive index. */
+    bool quadMode() const {
+      return primIndex1Delta == 0;
+    }
+
+    /* Calculates the bounding box of this leaf. The fourth vertex is
+     * only included if the second triangle exists. */
+    BBox3f bounds() const
+    {
+      BBox3f b = empty;
+      b.extend(v0);
+      b.extend(v1);
+      b.extend(v2);
+      if (valid2())
+        b.extend(v3);
+      return b;
+    }
+
+    /* Output of quad leaf using the indentation tab(depth). Prints
+     * nothing when compiling for the SYCL device only. */
+    void print(std::ostream& cout, uint32_t depth) const
+    {
+#if !defined(__SYCL_DEVICE_ONLY__)
+      cout << tab(depth) << "QuadLeaf {" << std::endl;
+      cout << tab(depth) << "  addr = " << this << std::endl;
+      cout << tab(depth) << "  shaderIndex = " << leafDesc.shaderIndex << std::endl;
+      cout << tab(depth) << "  geomMask = " << std::bitset<8>(leafDesc.geomMask) << std::endl;
+      cout << tab(depth) << "  geomFlags = " << leafDesc.getGeomFlags() << std::endl;
+      cout << tab(depth) << "  geomIndex = " << leafDesc.geomIndex << std::endl;
+      cout << tab(depth) << "  triangle0 = { " << std::endl;
+      cout << tab(depth) << "    primIndex = " << primIndex(0) << std::endl;
+      cout << tab(depth) << "    v0 = " << v0 << std::endl;
+      cout << tab(depth) << "    v1 = " << v1 << std::endl;
+      cout << tab(depth) << "    v2 = " << v2 << std::endl;
+      cout << tab(depth) << "  }" << std::endl;
+      if (valid2()) {
+        cout << tab(depth) << "  triangle1 = { " << std::endl;
+        cout << tab(depth) << "    primIndex = " << primIndex(1) << std::endl;
+        cout << tab(depth) << "    v0 = " << vertex(j0) << std::endl;
+        cout << tab(depth) << "    v1 = " << vertex(j1) << std::endl;
+        cout << tab(depth) << "    v2 = " << vertex(j2) << std::endl;
+        cout << tab(depth) << "  }" << std::endl;
+      }
+      cout << tab(depth) << "}";
+#endif
+    }
+
+    /* output operator for QuadLeaf */
+    friend inline std::ostream& operator<<(std::ostream& cout, const QuadLeaf& leaf) {
+      leaf.print(cout,0); return cout;
+    }
+
+  public:
+    PrimLeafDesc leafDesc;  // the leaf header
+
+    uint32_t primIndex0;    // primitive index of first triangle
+    struct {
+      uint32_t primIndex1Delta : 5;  // delta encoded primitive index of second triangle
+      uint32_t pad1            : 11; // MBZ
+      uint32_t j0              : 2;   // specifies first vertex of second triangle
+      uint32_t j1              : 2;   // specifies second vertex of second triangle
+      uint32_t j2              : 2;   // specifies third vertex of second triangle
+      uint32_t last            : 1;   // true if the second triangle is the last triangle in a leaf list
+      uint32_t pad             : 9;   // unused bits
+    };
+
+    Vec3f v0;  // first vertex of first triangle
+    Vec3f v1;  // second vertex of first triangle
+    Vec3f v2;  // third vertex of first triangle
+    Vec3f v3;  // fourth vertex used for second triangle
+  };
+
+  static_assert(sizeof(QuadLeaf) == 64, "QuadLeaf must be 64 bytes large");
+
+  /* 
+
+     Internal instance flags definition.
+
+  */
+  
+  struct InstanceFlags
+  {
+    enum Flags : uint8_t
+    {
+      NONE                            = 0,
+      TRIANGLE_CULL_DISABLE           = 1 << 0,   // disables culling of front and back facing triangles through ray flags
+      TRIANGLE_FRONT_COUNTERCLOCKWISE = 1 << 1,   // for mirroring transformations the instance can switch front and backface of triangles
+      FORCE_OPAQUE                    = 1 << 2,   // forces all primitives inside this instance to be opaque
+      FORCE_NON_OPAQUE                = 1 << 3    // forces all primitives inside this instance to be non-opaque
+    };
+
+    /* leaves the flags uninitialized */
+    InstanceFlags() {}
+
+    /* construction from the enum type */
+    InstanceFlags(Flags rflags)
+      : flags(rflags) {}
+
+    /* construction from the raw 8 bit value */
+    InstanceFlags(uint8_t rflags)
+      : flags(static_cast<Flags>(rflags)) {}
+
+    /* conversion back to the enum type */
+    operator Flags () const {
+      return flags;
+    }
+
+    /* Output operator for InstanceFlags. Prints "NONE" if no flag is
+     * set, otherwise the name of each set flag followed by a space. */
+    friend inline std::ostream& operator<<(std::ostream& cout, const InstanceFlags& iflags)
+    {
+#if !defined(__SYCL_DEVICE_ONLY__)
+      if (iflags.flags == InstanceFlags::NONE) {
+        cout << "NONE";
+        return cout;
+      }
+      if (iflags.triangle_cull_disable)           { cout << "TRIANGLE_CULL_DISABLE "; }
+      if (iflags.triangle_front_counterclockwise) { cout << "TRIANGLE_FRONT_COUNTERCLOCKWISE "; }
+      if (iflags.force_opaque)                    { cout << "FORCE_OPAQUE "; }
+      if (iflags.force_non_opaque)                { cout << "FORCE_NON_OPAQUE "; }
+#endif
+      return cout;
+    }
+
+  public:
+    /* the flags can be accessed as enum value or through one bit per flag */
+    union
+    {
+      Flags flags;
+      struct
+      {
+        bool triangle_cull_disable : 1;
+        bool triangle_front_counterclockwise : 1;
+        bool force_opaque : 1;
+        bool force_non_opaque : 1;
+      };
+    };
+  };
+
+  /* Combines two instance flag values by a bitwise or. */
+  inline InstanceFlags::Flags operator| (InstanceFlags::Flags a,InstanceFlags::Flags b)
+  {
+    const int merged = int(a) | int(b);
+    return static_cast<InstanceFlags::Flags>(merged);
+  }
+  
+  /* 
+
+     The instance leaf represents an instance. It essentially stores
+     transformation matrices (local to world as well as world to
+     local) of the instance as well as a pointer to the start node
+     of some BVH.
+
+     The instance leaf consists of two parts, part0 (first 64 bytes)
+     and part1 (second 64 bytes). Part0 will only get accessed by
+     hardware and stores the world to local transformation as well as
+     the BVH node to start traversal. Part1 stores additional data
+     that is only read by the shader, e.g. it stores the local to
+     world transformation of the instance.
+
+     The layout of the first part of the InstanceLeaf is compatible
+     with a ProceduralLeaf, thus we can use the same layout for
+     software instancing if we want.
+
+  */
+  
+  struct InstanceLeaf
+  {
+    /* Leaves both parts uninitialized. */
+    InstanceLeaf() {}
+
+    /* Creates an instance leaf from the object to world transformation,
+     * the start node address, the instance index (instID), the user
+     * defined instance ID (instUserID), and the ray mask (instMask). */
+    InstanceLeaf (AffineSpace3f obj2world, uint64_t startNodePtr, uint32_t instID, uint32_t instUserID, uint8_t instMask)
+    {
+      /* only 48 bits are available to store the start node address */
+      assert((startNodePtr >> 48) == 0);
+
+      const AffineSpace3f world2obj = rcp(obj2world);
+
+      /* header of part0 */
+      part0.shaderIndex = 0; //InstShaderRecordID;
+      part0.geomMask = instMask;
+      part0.instanceContributionToHitGroupIndex = 0; //desc.InstanceContributionToHitGroupIndex;
+      part0.pad0 = 0;
+      part0.type = PrimLeafDesc::TYPE_OPACITY_CULLING_ENABLED;
+      part0.geomFlags = static_cast<uint32_t>(GeometryFlags::NONE);
+
+      /* start node and instance flags of part0 */
+      part0.startNodePtr = startNodePtr;
+      part0.instFlags = InstanceFlags::NONE;
+      part0.pad1 = 0;
+
+      /* transformation data of part0 */
+      part0.world2obj_vx = world2obj.l.vx;
+      part0.world2obj_vy = world2obj.l.vy;
+      part0.world2obj_vz = world2obj.l.vz;
+      part0.obj2world_p = obj2world.p;
+
+      /* header of part1 */
+      part1.bvhPtr = static_cast<uint64_t>(0);
+      part1.pad = 0;
+      part1.instanceID = instUserID;
+      part1.instanceIndex = instID;
+
+      /* transformation data of part1 */
+      part1.obj2world_vx = obj2world.l.vx;
+      part1.obj2world_vy = obj2world.l.vy;
+      part1.obj2world_vz = obj2world.l.vz;
+      part1.world2obj_p = world2obj.p;
+    }
+
+    /* Returns the address of the start node pointer, which is located
+     * 8 bytes into part0. We need this address to calculate relocation
+     * tables when dumping the BVH to disk. */
+    const uint64_t startNodePtrAddr() const
+    {
+      const char* const base = reinterpret_cast<const char*>(&part0);
+      return reinterpret_cast<uint64_t>(base + 8);
+    }
+
+    /* Returns the address of the BVH pointer, which is located at the
+     * start of part1. */
+    const uint64_t bvhPtrAddr() const {
+      return reinterpret_cast<uint64_t>(&part1);
+    }
+
+    /* Returns the world to object space transformation matrix. The
+     * linear part is read from part0, the translation from part1. */
+    const AffineSpace3f World2Obj() const {
+      return AffineSpace3f(part0.world2obj_vx,part0.world2obj_vy,part0.world2obj_vz,part1.world2obj_p);
+    }
+
+    /* Returns the object to world space transformation matrix. The
+     * linear part is read from part1, the translation from part0. */
+    const AffineSpace3f Obj2World() const {
+      return AffineSpace3f(part1.obj2world_vx,part1.obj2world_vy,part1.obj2world_vz,part0.obj2world_p);
+    }
+
+    /* Output of the instance leaf using the indentation tab(depth).
+     * Prints nothing when compiling for the SYCL device only. */
+    void print (std::ostream& cout, uint32_t depth) const
+    {
+#if !defined(__SYCL_DEVICE_ONLY__)
+      /* the type bit selects the name printed for this leaf */
+      const char* const title = part0.type ? "ProceduralInstanceLeaf {" : "InstanceLeaf {";
+      cout << tab(depth) << title << std::endl;
+
+      cout << tab(depth) << "  addr = " << this << std::endl;
+      cout << tab(depth) << "  shaderIndex = " << part0.shaderIndex << std::endl;
+      cout << tab(depth) << "  geomMask = " << std::bitset<8>(part0.geomMask) << std::endl;
+      cout << tab(depth) << "  geomIndex = " << part1.instanceIndex << std::endl;
+      cout << tab(depth) << "  instanceID = " << part1.instanceID << std::endl;
+      cout << tab(depth) << "  instFlags = " << InstanceFlags(part0.instFlags) << std::endl;
+      cout << tab(depth) << "  startNodePtr = " << reinterpret_cast<void*>(static_cast<size_t>(part0.startNodePtr)) << std::endl;
+      cout << tab(depth) << "  obj2world.vx = " << part1.obj2world_vx << std::endl;
+      cout << tab(depth) << "  obj2world.vy = " << part1.obj2world_vy << std::endl;
+      cout << tab(depth) << "  obj2world.vz = " << part1.obj2world_vz << std::endl;
+      cout << tab(depth) << "  obj2world.p = " << part0.obj2world_p << std::endl;
+      cout << tab(depth) << "  world2obj.vx = " << part0.world2obj_vx << std::endl;
+      cout << tab(depth) << "  world2obj.vy = " << part0.world2obj_vy << std::endl;
+      cout << tab(depth) << "  world2obj.vz = " << part0.world2obj_vz << std::endl;
+      cout << tab(depth) << "  world2obj.p = " << part1.world2obj_p << std::endl;
+      cout << tab(depth) << "  instanceContributionToHitGroupIndex = " << part0.instanceContributionToHitGroupIndex << std::endl;
+      cout << tab(depth) << "}";
+#endif
+    }
+
+    /* output operator for InstanceLeaf */
+    friend inline std::ostream& operator<<(std::ostream& cout, const InstanceLeaf& leaf)
+    {
+      leaf.print(cout,0);
+      return cout;
+    }
+
+    /* first 64 bytes accessed during traversal by hardware */
+    struct Part0
+    {
+      /* Checks if opaque culling is enabled. */
+      bool opaqueCullingEnabled() const {
+        return type == PrimLeafDesc::TYPE_OPACITY_CULLING_ENABLED;
+      }
+
+    public:
+      uint32_t shaderIndex : 24;  // shader index used to calculate instancing shader in case of software instancing
+      uint32_t geomMask : 8;      // geometry mask used for ray masking
+
+      uint32_t instanceContributionToHitGroupIndex : 24;
+      uint32_t pad0 : 5;
+
+      /* the following two entries are only used for procedural instances */
+      /*PrimLeafDesc::Type*/ uint32_t type : 1; // enables/disables opaque culling
+      /*GeometryFlags*/ uint32_t geomFlags : 2; // unused for instances
+
+      uint64_t startNodePtr : 48;  // start node where to continue traversal of the instanced object
+      uint64_t instFlags : 8;      // flags for the instance (see InstanceFlags)
+      uint64_t pad1 : 8;           // unused bits
+
+      Vec3f world2obj_vx;   // 1st column of World2Obj transform
+      Vec3f world2obj_vy;   // 2nd column of World2Obj transform
+      Vec3f world2obj_vz;   // 3rd column of World2Obj transform
+      Vec3f obj2world_p;    // translation of Obj2World transform (on purpose in first 64 bytes)
+    } part0;
+
+    /* second 64 bytes accessed during shading */
+    struct Part1
+    {
+      uint64_t bvhPtr : 48;   // pointer to BVH the start node belongs to
+      uint64_t pad : 16;      // unused bits
+
+      uint32_t instanceID;    // user defined value per DXR spec
+      uint32_t instanceIndex; // geometry index of the instance (n'th geometry in scene)
+
+      Vec3f obj2world_vx;   // 1st column of Obj2World transform
+      Vec3f obj2world_vy;   // 2nd column of Obj2World transform
+      Vec3f obj2world_vz;   // 3rd column of Obj2World transform
+      Vec3f world2obj_p;    // translation of World2Obj transform
+    } part1;
+  };
+
+  static_assert(sizeof(InstanceLeaf) == 128, "InstanceLeaf must be 128 bytes large");
+
+
+  /*
+    Leaf type for procedural geometry. This leaf only contains the
+    leaf header (which identifies the geometry) and a list of
+    primitive indices.
+
+    The BVH will typically reference only some of the primitives
+    stored inside this leaf. The range is specified by a start
+    primitive and the last primitive is tagged with a bit.
+
+   */
+  
+  struct ProceduralLeaf
+  {
+    /* number of primitive index slots of a leaf */
+    static const uint32_t N = 13;
+
+    /* Creates an empty procedural leaf. All primitive index slots are
+     * set to 0xFFFFFFFF. */
+    ProceduralLeaf ()
+      : leafDesc(PrimLeafDesc::MAX_SHADER_INDEX,PrimLeafDesc::MAX_GEOM_INDEX,GeometryFlags::NONE,0), numPrimitives(0), pad(0), last(0)
+    {
+      for (uint32_t slot = 0; slot < N; slot++)
+        _primIndex[slot] = 0xFFFFFFFF;
+    }
+
+    /* Creates a procedural leaf with one primitive. More primitives
+     * of the same geometry can get added later using the add
+     * function. The last bits of all unused slots start out as set. */
+
+    ProceduralLeaf (PrimLeafDesc leafDesc, uint32_t primIndex, bool last)
+    : leafDesc(leafDesc), numPrimitives(1), pad(0), last(last ? 0xFFFFFFFF : 0xFFFFFFFE)
+    {
+      _primIndex[0] = primIndex;
+      for (uint32_t slot = 1; slot < N; slot++)
+        _primIndex[slot] = 0xFFFFFFFF;
+    }
+
+    /* returns the number of primitives stored inside this leaf */
+    uint32_t size() const  {
+      return numPrimitives;
+    }
+
+    /* Calculates the effectively used bytes: the leaf header, the 4
+     * bytes holding count and last bits, and 4 bytes per stored
+     * primitive index. */
+    size_t usedBytes() const
+    {
+      const size_t headerBytes = sizeof(PrimLeafDesc) + 4;
+      return headerBytes + 4*numPrimitives;
+    }
+
+    /* If possible adds a new primitive to this leaf. Fails if the leaf
+     * is full or if the primitive belongs to a different geometry
+     * than the primitives already stored. */
+    bool add(PrimLeafDesc leafDesc_in, uint32_t primIndex_in, bool last_in)
+    {
+      assert(primIndex_in != 0xFFFFFFFF);
+
+      const uint32_t slot = numPrimitives;
+      if (slot >= N)
+        return false;
+
+      /* the first primitive added defines the leaf header */
+      if (slot == 0)
+        leafDesc = leafDesc_in;
+
+      if (leafDesc != leafDesc_in)
+        return false;
+
+      _primIndex[slot] = primIndex_in;
+
+      /* update the last bit of the new slot */
+      const uint32_t slotBit = 1u << slot;
+      if (last_in) last |= slotBit;
+      else         last &= ~slotBit;
+
+      numPrimitives = slot + 1;
+      return true;
+    }
+
+    /* returns the primitive index of the i'th primitive */
+    uint32_t primIndex(uint32_t i) const
+    {
+      assert(i < N);
+      return _primIndex[i];
+    }
+
+    /* Checks if the i'th primitive is the last in a leaf list. Slots
+     * beyond the leaf always report true (just to make some verify
+     * tests happy). */
+    bool isLast(uint32_t i) const {
+      return i >= N || ((last >> i) & 1) != 0;
+    }
+
+    /* Output of slot i of the procedural leaf using the indentation
+     * tab(depth). Prints nothing when compiling for the SYCL device only. */
+    void print (std::ostream& cout, uint32_t i, uint32_t depth) const
+    {
+#if !defined(__SYCL_DEVICE_ONLY__)
+      cout << tab(depth) << "ProceduralLeaf {" << std::endl;
+      cout << tab(depth) << "  addr = " << this << std::endl;
+      cout << tab(depth) << "  slot = " << i << std::endl;
+      if (i >= N) {
+        cout << tab(depth) << " INVALID" << std::endl;
+      }
+      else {
+        cout << tab(depth) << "  shaderIndex = " << leafDesc.shaderIndex << std::endl;
+        cout << tab(depth) << "  geomMask = " << std::bitset<8>(leafDesc.geomMask) << std::endl;
+        cout << tab(depth) << "  geomFlags = " << leafDesc.getGeomFlags() << std::endl;
+        cout << tab(depth) << "  geomIndex = " << leafDesc.geomIndex << std::endl;
+        cout << tab(depth) << "  primIndex = " << primIndex(i) << std::endl;
+      }
+      cout << tab(depth) << "}";
+#endif
+    }
+
+  public:
+    PrimLeafDesc leafDesc;           // leaf header identifying the geometry
+    uint32_t numPrimitives : 4;      // number of stored primitives
+    uint32_t pad           : 32-4-N;
+    uint32_t last          : N;      // bit vector with a last bit per primitive
+    uint32_t _primIndex[N];          // primitive indices of all primitives stored inside the leaf
+  };
+
+  static_assert(sizeof(ProceduralLeaf) == 64, "ProceduralLeaf must be 64 bytes large");
+}
--- a/Framework/external/embree/kernels/rthwif/rtbuild/node_type.h
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/node_type.h
@ -0,0 +1,56 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+#include <iostream>
+
+namespace embree
+{
+  /* The type of a node. */
+  enum NodeType : uint8_t
+  {
+    /* NOTE: NODE_TYPE_MIXED and NODE_TYPE_INTERNAL share the value 0x0,
+     * so the two names cannot be told apart by value (e.g. in a switch). */
+    NODE_TYPE_MIXED = 0x0,        // identifies a mixed internal node where each child can have a different type
+    NODE_TYPE_INTERNAL = 0x0,     // internal BVH node with 6 children
+    NODE_TYPE_INSTANCE = 0x1,     // instance leaf
+    NODE_TYPE_PROCEDURAL = 0x3,   // procedural leaf
+    NODE_TYPE_QUAD = 0x4,         // quad leaf
+    NODE_TYPE_INVALID = 0x7       // indicates invalid node
+  };
+
+  /* output operator for NodeType */
+  inline std::ostream& operator<<(std::ostream& _cout, const NodeType& _type)
+  {
+    /* Streams a readable name for the node type; nothing is written when
+     * __RTRT_GSIM is defined. NODE_TYPE_MIXED has the same value as
+     * NODE_TYPE_INTERNAL and therefore prints as "INTERNAL". */
+#if !defined(__RTRT_GSIM)
+    const char* label = "INVALID NODE TYPE";
+    switch (_type)
+    {
+    case NODE_TYPE_INTERNAL:   label = "INTERNAL";   break;
+    case NODE_TYPE_INSTANCE:   label = "INSTANCE";   break;
+    case NODE_TYPE_PROCEDURAL: label = "PROCEDURAL"; break;
+    case NODE_TYPE_QUAD:       label = "QUAD";       break;
+    case NODE_TYPE_INVALID:    label = "INVALID";    break;
+    default: break;
+    }
+    _cout << label;
+#endif
+    return _cout;
+  }
+
+  /* 
+     Sub-type definition for each NodeType
+  */
+
+  enum SubType : uint8_t
+  {
+    /* NOTE: every enumerator declared here currently has the value 0 */
+    SUB_TYPE_NONE = 0,
+    
+    /* sub-type for NODE_TYPE_INTERNAL */
+    SUB_TYPE_INTERNAL6 = 0x00,        // Xe+: internal node with 6 children
+
+    /* Sub-type for NODE_TYPE_QUAD */
+    SUB_TYPE_QUAD = 0,                // Xe+: standard quad leaf (64 bytes)
+
+    /* Sub-type for NODE_TYPE_PROCEDURAL */
+    SUB_TYPE_PROCEDURAL = 0,          // Xe+: standard procedural leaf
+  };
+}
--- a/Framework/external/embree/kernels/rthwif/rtbuild/qbvh6.cpp
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/qbvh6.cpp
@ -0,0 +1,265 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "qbvh6.h"
+
+namespace embree
+{
+  template<typename InternalNode>
+  void computeInternalNodeStatistics(BVHStatistics& stats, QBVH6::Node node, const BBox1f time_range, const float node_bounds_area, const float root_bounds_area)
+  {
+    /* Recurses into every valid child of the referenced internal node and
+     * afterwards accounts for the node itself in stats.internalNode. */
+    InternalNode* const innerNode = node.innerNode<InternalNode>();
+
+    size_t validChildren = 0;
+    for (uint32_t slot = 0; slot < InternalNode::NUM_CHILDREN; slot++)
+    {
+      if (!innerNode->valid(slot))
+        continue;
+
+      validChildren++;
+      computeStatistics(stats, innerNode->child(slot), time_range, area(innerNode->bounds(slot)), root_bounds_area, InternalNode::NUM_CHILDREN);
+    }
+
+    /* update BVH statistics for this node */
+    auto& nodeStats = stats.internalNode;
+    nodeStats.numNodes++;
+    nodeStats.numChildrenUsed += validChildren;
+    nodeStats.numChildrenTotal += InternalNode::NUM_CHILDREN;
+    nodeStats.nodeSAH += time_range.size() * node_bounds_area / root_bounds_area;
+    nodeStats.numBytes += sizeof(InternalNode);
+  }
+
+  void computeStatistics(BVHStatistics& stats, QBVH6::Node node, const BBox1f time_range, const float node_bounds_area, const float root_bounds_area, uint32_t numChildren)
+  {
+    /* Accumulates statistics for the subtree or leaf list referenced by
+     * 'node' into 'stats'. The numChildren argument is not used here. */
+    switch (node.type)
+    {
+    case NODE_TYPE_INTERNAL:
+    {
+      computeInternalNodeStatistics<QBVH6::InternalNode6>(stats, node, time_range, node_bounds_area, root_bounds_area);
+      break;
+    }
+    case NODE_TYPE_INSTANCE:
+    {
+      /* an instance leaf always counts as exactly one primitive */
+      auto& instanceStats = stats.instanceLeaf;
+      instanceStats.numLeaves++;
+      instanceStats.numPrimsUsed++;
+      instanceStats.numPrimsTotal++;
+      instanceStats.leafSAH += time_range.size() * node_bounds_area / root_bounds_area;
+      instanceStats.numBytesUsed += sizeof(InstanceLeaf);
+      instanceStats.numBytesTotal += sizeof(InstanceLeaf);
+      break;
+    }
+    case NODE_TYPE_QUAD:
+    {
+      auto& quadStats = stats.quadLeaf;
+      quadStats.numLeaves++;
+
+      /* walk the consecutive QuadLeaf blocks up to the one flagged as last */
+      for (;;)
+      {
+        QuadLeaf* const quad = node.leafNodeQuad();
+        node.node += sizeof(QuadLeaf);
+        const bool isFinal = quad->isLast();
+
+        quadStats.numPrimsUsed += quad->size();
+        quadStats.numPrimsTotal += 2;
+        quadStats.numBytesUsed += quad->usedBytes();
+        quadStats.numBytesTotal += sizeof(QuadLeaf);
+        quadStats.leafSAH += quad->size() * time_range.size() * node_bounds_area / root_bounds_area;
+
+        if (isFinal)
+          break;
+      }
+
+      break;
+    }
+    case NODE_TYPE_PROCEDURAL:
+    {
+      /* FIXME: a dedicated branch for leafDesc.isProceduralInstance() was
+       * disabled here because for some reason it was always taken; every
+       * procedural leaf is therefore handled as a primitive list. */
+      auto& proceduralStats = stats.proceduralLeaf;
+      uint32_t slot = node.cur_prim;
+      proceduralStats.numLeaves++;
+
+      for (;;)
+      {
+        ProceduralLeaf* const leaf = node.leafNodeProcedural();
+        const bool isFinal = leaf->isLast(slot);
+
+        /* a block is only accounted for when its first slot is visited */
+        if (slot == 0) {
+          proceduralStats.numBlocks++;
+          proceduralStats.numBytesUsed += leaf->usedBytes();
+          proceduralStats.numBytesTotal += sizeof(ProceduralLeaf);
+        }
+
+        const uint32_t primsInBlock = leaf->size();
+
+        proceduralStats.numPrimsUsed++;
+        proceduralStats.numPrimsTotal++;
+        proceduralStats.leafSAH += time_range.size() * node_bounds_area / root_bounds_area;
+
+        /* advance to the next slot, moving on to the next block when this one is exhausted */
+        slot++;
+        if (slot >= primsInBlock) {
+          slot = 0;
+          node.node += sizeof(ProceduralLeaf);
+        }
+
+        if (isFinal)
+          break;
+      }
+      break;
+    }
+    default:
+      assert(false);
+    }
+  }
+
+  BVHStatistics QBVH6::computeStatistics() const
+  {
+    /* an empty BVH yields default-constructed statistics; otherwise the
+     * tree is walked from the root over the time range [0,1] */
+    BVHStatistics result;
+    if (!empty())
+    {
+      const float rootArea = area(bounds);
+      embree::computeStatistics(result,root(),BBox1f(0,1),rootArea,rootArea,6);
+    }
+    return result;
+  }
+
+  /* Prints the internal node referenced by 'node' followed by all of its
+   * valid children (one indentation level deeper) to the stream 'cout'.
+   * All output goes to the provided stream; the numChildren argument is
+   * not used by this function. */
+  template<typename QInternalNode>
+  void QBVH6::printInternalNodeStatistics(std::ostream& cout, QBVH6::Node node, uint32_t depth, uint32_t numChildren)
+  {
+    QInternalNode* inner = node.innerNode<QInternalNode>();
+    inner->print(cout, depth, false);
+    /* write to the stream parameter, not the global std::cout */
+    cout << std::endl;
+
+    for (uint32_t i = 0; i < QInternalNode::NUM_CHILDREN; i++)
+    {
+      if (inner->valid(i))
+        print(cout, inner->child(i), depth + 1, QInternalNode::NUM_CHILDREN);
+    }
+
+    cout << tab(depth) << "}" << std::endl;
+  }
+
+  /* Prints the node or leaf list referenced by 'node' to the stream 'cout',
+   * indented by 'depth' levels. Internal nodes are printed recursively.
+   * All output is written to the provided stream (the parameter named
+   * 'cout'), never to the global std::cout. */
+  void QBVH6::print( std::ostream& cout, QBVH6::Node node, uint32_t depth, uint32_t numChildren)
+  {
+    switch (node.type)
+    {
+    case NODE_TYPE_INSTANCE: {
+      node.leafNodeInstance()->print(cout,depth);
+      cout << std::endl;
+      break;
+    }
+    case NODE_TYPE_QUAD:
+    {
+      cout << tab(depth) << "List {" << std::endl;
+      
+      bool last = false;
+      
+      /* quad leaves are stored back to back until one is flagged as last */
+      do
+      {
+        QuadLeaf* quad = node.leafNodeQuad();
+        node.node += sizeof(QuadLeaf);
+        last = quad->isLast();
+
+        quad->print(cout,depth+1);
+        cout << std::endl;
+
+      } while (!last);
+
+      cout << tab(depth) << "}" << std::endl;
+      break;
+    }
+    case NODE_TYPE_PROCEDURAL:
+    {
+      /*if (!node.leafNodeProcedural()->leafDesc.opaqueCullingEnabled())
+      {
+        InstanceLeaf* leaf = (InstanceLeaf*) node.node;
+        leaf->print(cout,depth+1);
+        cout << std::endl;
+      }
+      else*/
+      {
+        cout << tab(depth) << "List {" << std::endl;
+      
+        bool last = false;
+        uint32_t currPrim = node.cur_prim;
+        
+        /* print slot by slot, moving to the next block once all
+         * primitives of the current block have been visited */
+        do
+        {
+          ProceduralLeaf* leaf = node.leafNodeProcedural();     
+          last = leaf->isLast(currPrim);
+          
+          uint32_t primsInBlock = leaf->size();
+
+          leaf->print(cout,currPrim,depth+1);
+          cout << std::endl;
+          
+          if (++currPrim >= primsInBlock) {
+            currPrim = 0;
+            node.node += sizeof(ProceduralLeaf);
+          }
+          
+        } while (!last);
+
+        cout << tab(depth) << "}" << std::endl;
+      }
+      break;
+    }
+    case NODE_TYPE_INTERNAL:
+    {
+      printInternalNodeStatistics<QBVH6::InternalNode6>(cout, node, depth, numChildren);
+      break;
+    }
+    default:
+      cout << "{ INVALID_NODE }" << std::endl;
+      //assert(false);
+    }
+  }
+
+  unsigned* getBackPointersData(const QBVH6* base) { // FIXME: should be member function
+    /* the back pointer array begins backPointerDataStart 64-byte blocks
+     * after the address of the BVH header; constness is cast away */
+    const char* const bvhBytes = (const char*)base;
+    return (unsigned*)(bvhBytes + 64 * base->backPointerDataStart);
+  }
+
+  unsigned getNumBackpointers(const QBVH6* base) { // FIXME: should be member function
+    /* size of the back pointer section (in 64-byte blocks) converted to a
+     * count of 'unsigned' sized entries */
+    const uint32_t sectionBlocks = base->backPointerDataEnd - base->backPointerDataStart;
+    return (sectionBlocks * 64) / sizeof(unsigned);
+  }
+
+  uint64_t getBackpointerChildOffset(const QBVH6* base, unsigned idx) { // FIXME: should be member function
+    /* byte offset of the idx'th 64-byte block counted from nodeDataStart */
+    const uint64_t blockIndex = uint64_t(base->nodeDataStart + idx);
+    return blockIndex * 64;
+  }
+
+  uint64_t getParentFromBackpointerOffset(const QBVH6* base, unsigned idx) { // FIXME: should be member function
+    /* bits 6 and up of the back pointer entry are used as a block index
+     * relative to nodeDataStart; the result is a byte offset */
+    const unsigned entry = getBackPointersData(base)[idx];
+    const uint64_t blockIndex = uint64_t(base->nodeDataStart + (entry >> 6));
+    return blockIndex * 64;
+  }
+
+  /* Prints the BVH header fields, then (unless the BVH is empty) the whole
+   * tree starting at the root, and finally the back pointer table if one
+   * is present. */
+  void QBVH6::print ( std::ostream& cout ) const
+  {
+    
+    cout << "QBVH @ "<< this <<" header: {\n";
+    cout << "  rootNodeOffset = " << rootNodeOffset << std::endl;
+    cout << "  bounds = " << bounds << std::endl;
+    cout << "  nodeDataStart = " << nodeDataStart << std::endl;
+    cout << "  nodeDataCur = " << nodeDataCur << std::endl;
+    /* print the start of the leaf section (not leafDataCur) under this label */
+    cout << "  leafDataStart = " << leafDataStart << std::endl;
+    cout << "  leafDataCur = " << leafDataCur << std::endl;
+    cout << "  proceduralDataStart = " << proceduralDataStart << std::endl;
+    cout << "  proceduralDataCur = " << proceduralDataCur << std::endl;
+    cout << "  backPointerDataStart = " << backPointerDataStart << std::endl;
+    cout << "  backPointerDataEnd = " << backPointerDataEnd << std::endl;
+    cout << "  numPrims = " << numPrims << std::endl;
+    cout << "}" << std::endl;
+
+    if (empty()) return;
+    
+    print(cout,root(),0,6);
+    
+    if (hasBackPointers())
+    {
+      cout << "backpointers: {\n";
+      /* per entry: bits 6 and up select the parent block, bits 3..5 are printed as the child count */
+      for (unsigned bp = 0; bp < getNumBackpointers(this); ++bp) {
+        cout << " node @ offset " << (void*)getBackpointerChildOffset(this, bp) << " parent = " << (void*)getParentFromBackpointerOffset(this, bp) << ", num children = " << ((getBackPointersData(this)[bp] >> 3) & 0x7) << "\n";
+      }
+      cout << "}\n";
+    }
+  }
+}
--- a/Framework/external/embree/kernels/rthwif/rtbuild/qbvh6.h
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/qbvh6.h
@ -0,0 +1,230 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "qnode.h"
+#include "statistics.h"
+#include "rtbuild.h"
+
+namespace embree
+{
+  /*
+    
+    The QBVH6 structure defines the bounding volume hierarchy (BVH)
+    that is used by the hardware. It is a BVH with 6-wide branching
+    factor, and quantized bounding boxes. At the leaf level quads
+    (QuadLeaf type), procedural geometries (ProceduralLeaf
+    type), and instances (InstanceLeaf type) can get referenced.
+
+   */
+
+  inline constexpr size_t roundOffsetTo128(size_t offset) {
+    /* round the byte offset up to whole 128-byte chunks; each chunk is
+     * returned as two units (i.e. the result counts 64-byte blocks) */
+    const size_t chunks128 = (offset + 127) / 128;
+    return chunks128 * 2;
+  }
+
+  struct QBVH6
+  {
+    typedef NodeRef Node;
+    typedef InternalNode<InternalNode6Data> InternalNode6;
+
+    /* byte offset of the root node relative to the start of this structure */
+    static constexpr uint64_t rootNodeOffset = 128;
+    
+    static_assert(sizeof(InternalNode6) == 64, "InternalNode6 must be 64 bytes large");
+
+    /* structure used to initialize the memory allocator inside the BVH */
+    struct SizeEstimate
+    {
+      SizeEstimate ()
+      : nodeBytes(0), leafBytes(0), proceduralBytes(0) {}
+
+      SizeEstimate (size_t nodeBytes, size_t leafBytes, size_t proceduralBytes)
+      : nodeBytes(nodeBytes), leafBytes(leafBytes), proceduralBytes(proceduralBytes) {}
+
+      /* total size: the QBVH6 header plus all three data sections */
+      size_t bytes() const {
+        return sizeof(QBVH6) + nodeBytes + leafBytes + proceduralBytes;
+      }
+
+      /* component-wise comparison: true only if no section of a is larger than in b */
+      friend bool operator<= (SizeEstimate a, SizeEstimate b)
+      {
+        if (a.nodeBytes > b.nodeBytes) return false;
+        if (a.leafBytes > b.leafBytes) return false;
+        if (a.proceduralBytes > b.proceduralBytes) return false;
+        return true;
+      }
+
+      /* component-wise sum of two estimates */
+      friend SizeEstimate operator+ (const SizeEstimate& a, const SizeEstimate& b)
+      {
+        return SizeEstimate(a.nodeBytes + b.nodeBytes,
+                            a.leafBytes + b.leafBytes,
+                            a.proceduralBytes + b.proceduralBytes);
+      }
+
+      /* output operator */
+      friend inline std::ostream& operator<<(std::ostream& cout, const SizeEstimate& estimate)
+      {
+        cout << "SizeEstimate {" << std::endl;
+        cout << "  nodeBytes = " << estimate.nodeBytes << ", " << std::endl;
+        cout << "  leafBytes = " << estimate.leafBytes << ", " << std::endl;
+        cout << "  proceduralBytes = " << estimate.proceduralBytes << ", " << std::endl;
+        return cout << "}";
+      }
+
+    public:
+      size_t nodeBytes;  // bytes required to store internal nodes
+      size_t leafBytes;  // bytes required to store leaf nodes
+      size_t proceduralBytes;  // bytes required to store procedural leaf nodes
+    };
+
+    /* Initializes a QBVH6 node with its provided size. The memory for
+     * the QBVH6 structure is overallocated and the allocation size is
+     * provided to the constructor, such that the allocator of the BVH
+     * can get initialized properly. The sections are laid out back to
+     * back in the order nodes, leaves, procedural leaves, back pointers;
+     * all start/cur members count 64 byte blocks. Note that the
+     * initializers rely on the declaration order of these members. */
+
+  QBVH6(SizeEstimate size)
+      : nodeDataStart((uint32_t)roundOffsetTo128(sizeof(QBVH6))), nodeDataCur(nodeDataStart),
+        leafDataStart(nodeDataCur + (uint32_t)(size.nodeBytes / 64)), leafDataCur(leafDataStart),
+        proceduralDataStart(leafDataCur + (uint32_t)(size.leafBytes / 64)), proceduralDataCur(proceduralDataStart),
+        backPointerDataStart(proceduralDataCur + (uint32_t)(size.proceduralBytes/64)), backPointerDataEnd(backPointerDataStart)
+    {
+      assert(size.nodeBytes % 64 == 0);
+      assert(size.leafBytes % 64 == 0);
+      assert(size.proceduralBytes % 64 == 0);
+      assert(size.bytes() <= (64LL << 32));
+
+      bounds = embree::empty;
+    }
+
+    /* Returns the root node of the BVH */
+    Node root() const {
+      return Node(rootNodeOffset,(uint64_t)this);
+    }
+
+    /* The root node offset is a compile time constant; this function only
+     * verifies (in debug builds) that the specified node is located at it. */
+    void setRootNodeOffset(Node node) {
+      assert(node.cur_prim == 0);
+      uint64_t MAYBE_UNUSED rootNodeOffset1 = (uint64_t)node - (uint64_t)this;
+      assert(rootNodeOffset == rootNodeOffset1);
+    }
+
+    /* check if BVH is empty */
+    bool empty() const {
+      return root().type == NODE_TYPE_INVALID;
+    }
+
+    /* pretty printing */
+    template<typename QInternalNode>
+    static void printInternalNodeStatistics(std::ostream& cout, QBVH6::Node node, uint32_t depth, uint32_t numChildren = 6);
+    static void print(std::ostream& cout, QBVH6::Node node, uint32_t depth, uint32_t numChildren=6);
+    void print(std::ostream& cout = std::cout) const;
+
+    /* output operator */
+    friend inline std::ostream& operator<<(std::ostream& cout, const QBVH6& qbvh) {
+      qbvh.print(cout); return cout;
+    }
+    
+    /* calculates BVH statistics */
+    BVHStatistics computeStatistics() const;
+
+    /*
+       This section implements a simple allocator for BVH data. The
+       BVH data is separated into two sections, a section where nodes
+       and leaves in mixed mode are allocated, and a section where
+       only leaves are allocated in fat-leaf mode.
+
+     */
+  public:
+
+    /* allocate data in the node memory section; 'bytes' has to be a
+     * multiple of 64 and has to fit into the remaining node section */
+    char* allocNode(size_t bytes)
+    {
+      assert(bytes % 64 == 0);
+      /* divide first, then narrow: casting 'bytes' before the division
+       * would truncate requests of 4GB and more */
+      const size_t blocks64 = bytes / 64;
+      assert(blocks64 <= 0xFFFFFFFFull);
+      uint32_t blocks = (uint32_t)blocks64;
+      assert(nodeDataCur + blocks <= leafDataStart);
+      char* ptr = (char*)this + 64 * (size_t)nodeDataCur;
+      nodeDataCur += blocks;
+      return ptr;
+    }
+
+    /* allocate memory in the leaf memory section; 'bytes' has to be a
+     * multiple of 64 and has to fit into the remaining leaf section */
+    char* allocLeaf(size_t bytes)
+    {
+      assert(bytes % 64 == 0);
+      /* divide first, then narrow (see allocNode) */
+      const size_t blocks64 = bytes / 64;
+      assert(blocks64 <= 0xFFFFFFFFull);
+      uint32_t blocks = (uint32_t)blocks64;
+      assert(leafDataCur + blocks <= proceduralDataStart);
+      char* ptr = (char*)this + 64 * (size_t)leafDataCur;
+      leafDataCur += blocks;
+      return ptr;
+    }
+
+    /* allocate memory in procedural leaf memory section; 'bytes' has to
+     * be a multiple of 64 and has to fit into the remaining section */
+    char* allocProceduralLeaf(size_t bytes)
+    {
+      assert(bytes % 64 == 0);
+      /* divide first, then narrow (see allocNode) */
+      const size_t blocks64 = bytes / 64;
+      assert(blocks64 <= 0xFFFFFFFFull);
+      uint32_t blocks = (uint32_t)blocks64;
+      assert(proceduralDataCur + blocks <= backPointerDataStart);
+      char* ptr = (char*)this + 64 * (size_t)proceduralDataCur;
+      proceduralDataCur += blocks;
+      return ptr;
+    }
+
+    /* returns pointer to node address; 'ofs' is a byte offset relative
+     * to the start of the node section */
+    char* nodePtr(size_t ofs) {
+      return (char*)this + 64 * size_t(nodeDataStart) + ofs;
+    }
+    /* returns pointer to address for next leaf allocation */
+    char* leafPtr() {
+      return (char*)this + 64 * (size_t)leafDataCur;
+    }
+
+    /* returns the total number of bytes of the BVH */
+    size_t getTotalBytes() const {
+      return 64 * (size_t)backPointerDataEnd;
+    }
+
+    /* returns number of bytes available for node allocations */
+    size_t getFreeNodeBytes() const {
+      return 64 * (size_t)(leafDataStart - nodeDataCur);
+    }
+
+    /* returns number of bytes available for leaf allocations */
+    size_t getFreeLeafBytes() const {
+      return 64 * (size_t)(proceduralDataStart - leafDataCur);
+    }
+
+    /* returns number of bytes available for procedural leaf allocations */
+    size_t getFreeProceduralLeafBytes() const {
+      return 64 * (size_t)(backPointerDataStart - proceduralDataCur);
+    }
+
+    /* returns the bytes used by allocations */
+    size_t getUsedBytes() const {
+      return getTotalBytes() - getFreeNodeBytes() - getFreeLeafBytes() - getFreeProceduralLeafBytes();
+    }
+
+    /* checks if the back pointer section is non-empty */
+    bool hasBackPointers() const {
+      return backPointerDataStart < backPointerDataEnd;
+    }
+
+  public:
+    ze_raytracing_accel_format_internal_t rtas_format = ZE_RTAS_DEVICE_FORMAT_EXP_VERSION_1;
+    uint32_t reserved1;
+    BBox3f bounds;                  // bounding box of the BVH
+
+    uint32_t nodeDataStart;         // first 64 byte block of node data
+    uint32_t nodeDataCur;           // next free 64 byte block for node allocations
+    uint32_t leafDataStart;         // first 64 byte block of leaf data
+    uint32_t leafDataCur;           // next free 64 byte block for leaf allocations
+    uint32_t proceduralDataStart;   // first 64 byte block for procedural leaf data
+    uint32_t proceduralDataCur;     // next free 64 byte block for procedural leaf allocations
+    uint32_t backPointerDataStart;  // first 64 byte block for back pointers
+    uint32_t backPointerDataEnd;    // end of back pointer array
+    uint32_t numTimeSegments = 1;
+    uint32_t numPrims = 0;              // number of primitives in this BVH
+    uint32_t reserved[12];
+    uint64_t dispatchGlobalsPtr;
+  };
+
+  static_assert(sizeof(QBVH6) == 128, "QBVH6 must be 128 bytes large");
+}
+
--- a/Framework/external/embree/kernels/rthwif/rtbuild/qbvh6_builder_sah.h
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/qbvh6_builder_sah.h
--- a/Framework/external/embree/kernels/rthwif/rtbuild/qnode.h
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/qnode.h
@ -0,0 +1,508 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+#include <iostream>
+
+#include "leaf.h"
+
+#if defined(__INTEL_LLVM_COMPILER) && defined(WIN32)
+/* Replacement for frexp: stores an exponent in *exp (0 for a zero input)
+ * and returns 'value' scaled by 2^-(*exp). Only compiled for the Intel
+ * LLVM compiler on Windows (see the surrounding #if). */
+inline float embree_frexp(float value, int* exp)
+{
+   // using the Intel(R) oneAPI DPC++/C++ Compiler with -no-intel-libs results
+   // in an unresolved external symbol "__imp_frexp" error and therefore we
+   // provide the manual implementation referenced here
+   // https://en.cppreference.com/w/c/numeric/math/frexp in this case
+   static_assert(FLT_RADIX == 2, "custom implementation of frexp only works for base 2 floating point representations");
+   // zero is special-cased so that logb is never evaluated for it
+   *exp = (value == 0) ? 0 : (int)(1 + logb(value));
+   return scalbn(value, -(*exp));
+}
+#endif
+
+namespace embree
+{
+  /* The NodeRef structure references a node of the BVH. It stores the
+     * pointer to that node as well as the node's type. If a leaf node
+     * is referenced the current primitive to intersect is also
+     * stored. */
+
+  struct NodeRef
+  {
+    /* creates an invalid reference: null pointer, NODE_TYPE_INVALID, primitive 0 */
+    NodeRef ()
+    : node(nullptr), type(NODE_TYPE_INVALID), cur_prim(0) {}
+
+    /* references the node at the given address with explicit type and current primitive */
+    NodeRef (void* node, NodeType type, uint8_t cur_prim)
+    : node((char*)node), type(type), cur_prim(cur_prim)
+    {
+      assert(cur_prim < 16);
+    }
+
+    /* decode from 64 bit encoding used in MemRay and Instances: the
+     * lowest 4 bits carry the node type, the remaining bits the address
+     * to which 'offset' gets added */
+    NodeRef (uint64_t nodePtr, uint64_t offset = 0)
+    : node((char*) (nodePtr & ~(uint64_t)0xF) + offset),
+      type((NodeType) (nodePtr & 0xF)),
+      cur_prim(0)
+    {
+    }
+
+    /* 64 bit encoding used in MemRay and Instances: the node type is
+     * added to the 16 byte aligned node address */
+    operator uint64_t() const
+    {
+      const uint64_t address = (uint64_t)node;
+      assert((address & 0xF) == 0);
+      assert(cur_prim == 0);
+      return address + (uint64_t) type;
+    }
+
+    /* returns the internal node that is referenced */
+    template<typename InternalNode>
+    InternalNode* innerNode() const {
+      assert(type == NODE_TYPE_INTERNAL);
+      return (InternalNode*)node;
+    }
+
+    /* returns the instance leaf node that is referenced */
+    InstanceLeaf* leafNodeInstance() const {
+      assert(type == NODE_TYPE_INSTANCE);
+      return (InstanceLeaf*)node;
+    }
+
+    /* returns the quad leaf node that is referenced */
+    QuadLeaf* leafNodeQuad() const {
+      assert(type == NODE_TYPE_QUAD);
+      return (QuadLeaf*)node;
+    }
+
+    /* returns the procedural leaf node that is referenced */
+    ProceduralLeaf* leafNodeProcedural() const {
+      assert(type == NODE_TYPE_PROCEDURAL);
+      return (ProceduralLeaf*)node;
+    }
+
+    /* two references are equal if pointer, type, and current primitive all match */
+    friend bool operator ==(const NodeRef& a, const NodeRef& b) {
+      if (a.node != b.node) return false;
+      if (a.type != b.type) return false;
+      return a.cur_prim == b.cur_prim;
+    }
+
+    friend bool operator !=(const NodeRef& a, const NodeRef& b) {
+      return !(a == b);
+    }
+
+#if !defined(__RTRT_GSIM)
+    /* output operator */
+    friend inline std::ostream& operator<<(std::ostream& _cout, const NodeRef& node) {
+      _cout << "NodeRef { " << (void*)node.node << ", " << node.type << ", " << (int)node.cur_prim << " }";
+      return _cout;
+    }
+#endif
+
+  public:
+    char* node;           // pointer to the referenced node
+    NodeType type;        // type of the node referenced
+    uint8_t cur_prim : 4; // current primitive referenced in the leaf
+  };
+
+   /*
+
+      The internal nodes of the BVH store references to 6 children and
+      quantized bounds for each of these children.
+
+      All children are stored consecutively in memory at a location
+      referred to by the childOffset. To calculate the relative
+      location of the i'th child the size (as encoded in blockIncr) of
+      all the children with index smaller than i has to get added to
+      that childOffset. The calculated offset specifies the signed
+      number of 64 bytes blocks relative to the node address to reach
+      the child.
+
+      If the nodeType is INTERNAL we are in mixed mode and the type of
+      each child is encoded inside the startPrim member. Otherwise we
+      are in fat leaf mode and each child has the same type 'nodeType'
+      and startPrim identifies the primitive where the leaf
+      starts. The leaf spans all primitives from this start primitive
+      to the end primitive which is marked as 'last'.
+
+      The bounding boxes of the children are quantized into a regular
+      3D grid. The world space position of the origin of that grid is
+      stored at full precision in the lower member, while the step
+      size is encoded in the exp_x, exp_y, and exp_z members as power
+      of 2. Thus grid coordinates together with their exponent
+      (xi,exp_x), (yi,exp_y), (zi,exp_z) correspond to the mantissa
+      and exponent of a floating point number representation without
+      leading zero. Thus the world space position of the bounding
+      planes can get calculated as follows:
+
+        x = lower.x + pow(2,exp_x) * 0.xi
+        y = lower.y + pow(2,exp_y) * 0.yi
+        z = lower.z + pow(2,exp_z) * 0.zi
+
+      As the stored grid coordinates for child bounds are only
+      unsigned 8-bit values, ray/box intersections can get performed
+      with reduced precision.
+
+      The node also stores a mask used for ray filtering. Only rays
+      with (node.nodeMask & ray.rayMask) != 0 are traversed, all
+      others are culled.
+
+    */
+  
+  /* Raw data layout of an internal node with 6 children. The static_assert
+   * following this struct requires the layout to be exactly 64 bytes. */
+  struct InternalNode6Data
+  {
+    static constexpr uint32_t NUM_CHILDREN = 6;
+
+    Vec3f lower;          // world space origin of quantization grid
+    int32_t childOffset; // offset to all children in 64B multiples
+
+    NodeType nodeType;    // the type of the node
+    uint8_t pad;          // unused byte
+
+    int8_t exp_x;          // 2^exp_x is the size of the grid in x dimension
+    int8_t exp_y;          // 2^exp_y is the size of the grid in y dimension
+    int8_t exp_z;          // 2^exp_z is the size of the grid in z dimension
+    uint8_t nodeMask;      // mask used for ray filtering
+
+    /* per-child data packed into a single byte (2 + 4 + 2 bits) */
+    struct ChildData
+    {
+      uint8_t blockIncr : 2; // size of child in 64 byte blocks
+      uint8_t startPrim : 4; // start primitive in fat leaf mode or child type in mixed mode
+      uint8_t pad : 2; // unused bits
+    } childData[NUM_CHILDREN];
+
+    uint8_t lower_x[NUM_CHILDREN];  // the quantized lower bounds in x-dimension
+    uint8_t upper_x[NUM_CHILDREN];  // the quantized upper bounds in x-dimension
+    uint8_t lower_y[NUM_CHILDREN];  // the quantized lower bounds in y-dimension
+    uint8_t upper_y[NUM_CHILDREN];  // the quantized upper bounds in y-dimension
+    uint8_t lower_z[NUM_CHILDREN];  // the quantized lower bounds in z-dimension
+    uint8_t upper_z[NUM_CHILDREN];  // the quantized upper bounds in z-dimension
+  };
+
+  static_assert(sizeof(InternalNode6Data) == 64, "InternalNode6Data must be 64 bytes large");
+
+  template<typename InternalNodeData>
+    struct InternalNodeCommon : public InternalNodeData
+  {
+    using InternalNodeData::NUM_CHILDREN;
+    
+    InternalNodeCommon() {
+      /* empty body: no member of the node is assigned here */
+    }
+    
+    InternalNodeCommon(NodeType type)
+    {
+      /* node header: requested type, zero child offset, all mask bits set */
+      this->nodeType = type;
+      this->childOffset = 0;
+      this->nodeMask = 0xFF;
+
+      /* quantization grid located at the origin with exponent 0 in each dimension */
+      this->lower = Vec3f(0.0f);
+      this->exp_x = this->exp_y = this->exp_z = 0;
+
+      /* per child: cleared child data, and bounds with lower > upper
+       * which marks the child as invalid */
+      for (uint32_t slot = 0; slot < InternalNodeData::NUM_CHILDREN; slot++)
+      {
+        this->childData[slot] = { 0, 0, 0 };
+
+        this->lower_x[slot] = 0x80;
+        this->lower_y[slot] = 0x80;
+        this->lower_z[slot] = 0x80;
+        this->upper_x[slot] = 0x00;
+        this->upper_y[slot] = 0x00;
+        this->upper_z[slot] = 0x00;
+      }
+    }
+    
+    /* this function slightly enlarges bounds in order to make traversal watertight */
+    static const BBox3f conservativeBox(const BBox3f box, float ulps = 1.0f) {
+      /* the enlargement is 'ulps' float epsilons scaled by the largest
+       * absolute coordinate of the box */
+      const float maxAbsCoord = std::max(reduce_max(abs(box.lower)), reduce_max(abs(box.upper)));
+      const float err = ulps*std::numeric_limits<float>::epsilon() * maxAbsCoord;
+      return enlarge(box, Vec3f(err));
+    }
+
+    /* this function quantizes the provided bounds */
+    /* Converts fbounds into grid coordinates relative to 'base' using the
+     * node's exponents. Lower bounds are rounded down and upper bounds
+     * rounded up, and all values are clamped to the range [0,255]. The
+     * quantized coordinates are returned as floats inside a BBox3f. */
+    const BBox3f quantize_bounds(BBox3f fbounds, Vec3f base) const
+    {
+      /* bounds relative to the grid origin */
+      const Vec3f lower = fbounds.lower-base;
+      const Vec3f upper = fbounds.upper-base;
+      /* scale by 2^(8-exp) to obtain grid coordinates */
+      float qlower_x = ldexpf(lower.x, -this->exp_x + 8); 
+      float qlower_y = ldexpf(lower.y, -this->exp_y + 8); 
+      float qlower_z = ldexpf(lower.z, -this->exp_z + 8); 
+      float qupper_x = ldexpf(upper.x, -this->exp_x + 8); 
+      float qupper_y = ldexpf(upper.y, -this->exp_y + 8); 
+      float qupper_z = ldexpf(upper.z, -this->exp_z + 8); 
+      /* debug builds expect the scaled values to be within range already */
+      assert(qlower_x >= 0.0f && qlower_x <= 255.0f);
+      assert(qlower_y >= 0.0f && qlower_y <= 255.0f);
+      assert(qlower_z >= 0.0f && qlower_z <= 255.0f);
+      assert(qupper_x >= 0.0f && qupper_x <= 255.0f);
+      assert(qupper_y >= 0.0f && qupper_y <= 255.0f);
+      assert(qupper_z >= 0.0f && qupper_z <= 255.0f); 
+      /* round outwards (floor for lower, ceil for upper) and clamp to [0,255] */
+      qlower_x = min(max(floorf(qlower_x),0.0f),255.0f);
+      qlower_y = min(max(floorf(qlower_y),0.0f),255.0f);
+      qlower_z = min(max(floorf(qlower_z),0.0f),255.0f);
+      qupper_x = min(max(ceilf(qupper_x),0.0f),255.0f);
+      qupper_y = min(max(ceilf(qupper_y),0.0f),255.0f);
+      qupper_z = min(max(ceilf(qupper_z),0.0f),255.0f);
+      BBox3f qbounds(Vec3f(qlower_x, qlower_y, qlower_z), Vec3f(qupper_x, qupper_y, qupper_z));
+
+      /* verify that quantized bounds are conservative: the de-quantized
+       * box, widened by a small tolerance, has to contain fbounds */
+      BBox3f dbounds = dequantize_bounds(qbounds, base);
+      dbounds.lower.x -= 2.0f*float(ulp) * (fabs(base.x) + ldexpf(255.0f,this->exp_x-8));
+      dbounds.lower.y -= 2.0f*float(ulp) * (fabs(base.y) + ldexpf(255.0f,this->exp_y-8));
+      dbounds.lower.z -= 2.0f*float(ulp) * (fabs(base.z) + ldexpf(255.0f,this->exp_z-8));
+      dbounds.upper.x += 2.0f*float(ulp) * (fabs(base.x) + ldexpf(255.0f,this->exp_x-8));
+      dbounds.upper.y += 2.0f*float(ulp) * (fabs(base.y) + ldexpf(255.0f,this->exp_y-8));
+      dbounds.upper.z += 2.0f*float(ulp) * (fabs(base.z) + ldexpf(255.0f,this->exp_z-8));
+      assert(subset(fbounds, dbounds));
+
+      return qbounds;
+    }
+    
+    /* this function de-quantizes the provided bounds */
+    const BBox3f dequantize_bounds(const BBox3f& qbounds, Vec3f base) const
+    {
+      /* world coordinate = grid origin + grid coordinate * 2^(exp-8) */
+      const int shift_x = this->exp_x - 8;
+      const int shift_y = this->exp_y - 8;
+      const int shift_z = this->exp_z - 8;
+
+      const Vec3f worldLower(base.x + ldexpf(qbounds.lower.x, shift_x),
+                             base.y + ldexpf(qbounds.lower.y, shift_y),
+                             base.z + ldexpf(qbounds.lower.z, shift_z));
+      const Vec3f worldUpper(base.x + ldexpf(qbounds.upper.x, shift_x),
+                             base.y + ldexpf(qbounds.upper.y, shift_y),
+                             base.z + ldexpf(qbounds.upper.z, shift_z));
+      return BBox3f(worldLower, worldUpper);
+    }
+    
+    /* Determines if a child is valid. We have only to look at the
+     * topmost bit of lower_x and upper_x to determine if child is
+     * valid */
+    bool valid(int i) const {
+      /* invalid children are encoded as lower_x with the topmost bit set
+       * and upper_x with the topmost bit cleared */
+      const bool lowerTopBit = (this->lower_x[i] & 0x80) != 0;
+      const bool upperTopBit = (this->upper_x[i] & 0x80) != 0;
+      return upperTopBit || !lowerTopBit;
+    }
+    
+    /* Determines if the node is in fat leaf mode. */
+    bool isFatLeaf() const {
+      /* every node type other than MIXED means fat leaf mode */
+      const bool mixedMode = (this->nodeType == NODE_TYPE_MIXED);
+      return !mixedMode;
+    }
+    
+    /* Sets the offset to the child memory. */
+    void setChildOffset(void* childDataPtr)
+    {
+      /* byte distance from this node to the child data; a null pointer
+       * is stored as offset 0 */
+      int64_t byteDelta = 0;
+      if (childDataPtr)
+        byteDelta = (char*)childDataPtr - (char*)this;
+      assert(byteDelta % 64 == 0);
+
+      /* the offset is stored in 64 byte blocks and has to fit into 32 bits */
+      const int64_t blockDelta = byteDelta / 64;
+      assert((int64_t)(int32_t)blockDelta == blockDelta);
+      this->childOffset = (int32_t)blockDelta;
+    }
+    
+    /* Sets the type, size, and current primitive of a child */
+    void setChildType(uint32_t child, NodeType childType, uint32_t block_delta, uint32_t cur_prim)
+    {
+      /* there is no need to store block_delta for last child, 0 is stored instead */
+      const uint32_t storedIncr = (child == NUM_CHILDREN-1) ? 0 : block_delta;
+
+      assert(storedIncr < 4);
+      assert(cur_prim < 16);
+
+      if (isFatLeaf())
+      {
+        /* fat leaf mode: all children share the node type, startPrim holds the start primitive */
+        assert(this->nodeType == childType);
+        this->childData[child].startPrim = cur_prim;
+      }
+      else
+      {
+        /* mixed mode: startPrim holds the type of the child */
+        assert(cur_prim == 0);
+        this->childData[child].startPrim = childType;
+      }
+      this->childData[child].blockIncr = storedIncr;
+    }
+    
+    void invalidateChild(uint32_t childID)
+    {
+      /* topmost bit set in lower_x and cleared in upper_x is what valid() reports as invalid */
+      this->upper_x[childID] = this->upper_y[childID] = this->upper_z[childID] = 0x00;
+      this->lower_x[childID] = this->lower_y[childID] = this->lower_z[childID] = 0x80;
+    }
+
+    /* Quantizes the conservatively enlarged bounds relative to this->lower and stores them for the child */
+    void setChildBounds(uint32_t childID, const BBox3f& fbounds)
+    {
+      assert(fbounds.lower.x <= fbounds.upper.x);
+      assert(fbounds.lower.y <= fbounds.upper.y);
+      assert(fbounds.lower.z <= fbounds.upper.z);
+      const BBox3f grid = quantize_bounds(conservativeBox(fbounds), this->lower);
+      const auto& qlo = grid.lower;
+      const auto& qhi = grid.upper;
+      /* each quantized coordinate is stored as a single byte */
+      this->lower_x[childID] = (uint8_t)qlo.x; this->upper_x[childID] = (uint8_t)qhi.x;
+      this->lower_y[childID] = (uint8_t)qlo.y; this->upper_y[childID] = (uint8_t)qhi.y;
+      this->lower_z[childID] = (uint8_t)qlo.z; this->upper_z[childID] = (uint8_t)qhi.z;
+      assert(valid(childID));
+    }
+    
+    /* Fills a complete child slot: first type, size and referenced primitive, then the bounds. */
+    void setChild(uint32_t childID, const BBox3f& fbounds, NodeType type, uint32_t block_delta, uint32_t cur_prim = 0)
+    {
+      this->setChildType(childID, type, block_delta, cur_prim);
+      this->setChildBounds(childID, fbounds);
+    }
+    
+    /* Calculates the byte offset to the child. The offset is
+     * relative to the address of this node. */
+    int64_t getChildOffset(uint32_t childID) const
+    {
+      int64_t blocks = this->childOffset; // childOffset counts 64 byte blocks
+      for (uint32_t prev = 0; prev != childID; ++prev)
+        blocks += this->childData[prev].blockIncr; // add the block increment of every preceding child
+      return blocks * 64;
+    }
+    
+    /* Returns the type of the child. Fat leaf nodes share a single
+     * type between all children, whereas mixed nodes encode a
+     * per-child type inside the startPrim member of each child. */
+    NodeType getChildType(uint32_t childID) const
+    {
+      /* mixed node: decode the type from the child data */
+      if (!isFatLeaf())
+        return (NodeType)(this->childData[childID].startPrim);
+
+      return this->nodeType;
+    }
+    
+    /* Returns the start primitive of a child. Only fat-leaf nodes
+     * store one: there all children are leaves and the value
+     * selects the primitive inside a leaf block at which the leaf
+     * starts. Children of mixed nodes always report 0. */
+    uint32_t getChildStartPrim(uint32_t childID) const
+    {
+      if (!isFatLeaf())
+        return 0;
+
+      /* fat leaf: the value is kept in the child data */
+      return this->childData[childID].startPrim;
+    }
+    
+    /* Builds the node reference (pointer, type, start primitive) of a child, with offsets applied to the address This. */
+    NodeRef child(void* This, int childID) const {
+      char* const childPtr = (char*)This + getChildOffset(childID);
+      return NodeRef(childPtr, getChildType(childID), getChildStartPrim(childID));
+    }
+    
+    NodeRef child(int i) const { // convenience overload that uses the address of this node as base
+      return this->child((void*)this, i);
+    }
+  };
+
+  template<typename QInternalNode>
+    struct InternalNode : public InternalNodeCommon<QInternalNode>
+  {
+    using InternalNodeCommon<QInternalNode>::valid;
+    using InternalNodeCommon<QInternalNode>::getChildType;
+    using InternalNodeCommon<QInternalNode>::getChildOffset;
+    using InternalNodeCommon<QInternalNode>::getChildStartPrim;
+    using InternalNodeCommon<QInternalNode>::conservativeBox;
+    using InternalNodeCommon<QInternalNode>::dequantize_bounds;
+    using InternalNodeCommon<QInternalNode>::NUM_CHILDREN;
+    /* the using-declarations above make these members of the dependent base class accessible by plain name */
+    InternalNode() {
+    }
+    /* Forwards the node type to the InternalNodeCommon constructor. */
+    InternalNode (NodeType type)
+      : InternalNodeCommon<QInternalNode>(type) {}
+
+    /* Constructs an internal node. The quantization grid gets
+     * initialized from the provided parent bounds. */
+    InternalNode (BBox3f box, NodeType type = NODE_TYPE_MIXED)
+      : InternalNode(type)
+    {
+      setNodeBounds(box);
+    }
+    /* Stores the lower corner of the conservative box and derives one exponent per axis from the box size. */
+    void setNodeBounds(BBox3f box)
+    {
+      /* initialize quantization grid */
+      box = conservativeBox(box);
+      const float _ulp = std::numeric_limits<float>::epsilon();
+      const float up = 1.0f + float(_ulp); // factor slightly above 1 that enlarges the box size
+      Vec3f len = box.size() * up;
+      this->lower = box.lower;
+#if defined(__INTEL_LLVM_COMPILER) && defined(WIN32)
+      int _exp_x; float mant_x = embree_frexp(len.x, &_exp_x); _exp_x += (mant_x > 255.0f / 256.0f); // same computation as the #else branch, but through embree_frexp
+      int _exp_y; float mant_y = embree_frexp(len.y, &_exp_y); _exp_y += (mant_y > 255.0f / 256.0f);
+      int _exp_z; float mant_z = embree_frexp(len.z, &_exp_z); _exp_z += (mant_z > 255.0f / 256.0f);
+#else
+      int _exp_x; float mant_x = frexp(len.x, &_exp_x); _exp_x += (mant_x > 255.0f / 256.0f); // exponent is raised by one when the mantissa exceeds 255/256
+      int _exp_y; float mant_y = frexp(len.y, &_exp_y); _exp_y += (mant_y > 255.0f / 256.0f);
+      int _exp_z; float mant_z = frexp(len.z, &_exp_z); _exp_z += (mant_z > 255.0f / 256.0f);
+#endif
+      _exp_x = max(-128,_exp_x); // enlarge too tight bounds
+      _exp_y = max(-128,_exp_y);
+      _exp_z = max(-128,_exp_z);
+      this->exp_x = _exp_x; assert(_exp_x >= -128 && _exp_x <= 127); // only the lower limit is clamped above, the upper limit is merely asserted
+      this->exp_y = _exp_y; assert(_exp_y >= -128 && _exp_y <= 127);
+      this->exp_z = _exp_z; assert(_exp_z >= -128 && _exp_z <= 127);
+    }
+    
+    /* Dequantizes the stored bounds of the specified child relative to this->lower. */
+    const BBox3f bounds(uint32_t childID) const
+    {
+      return dequantize_bounds(BBox3f(Vec3f(this->lower_x[childID], this->lower_y[childID], this->lower_z[childID]),
+                                      Vec3f(this->upper_x[childID], this->upper_y[childID], this->upper_z[childID])),
+                               this->lower);
+    }
+    /* Returns the merged dequantized bounds of all valid children (empty if there is none). */
+    const BBox3f bounds() const
+    {
+      BBox3f b = empty;
+      for (size_t i=0; i<NUM_CHILDREN; i++) {
+        if (!valid(i)) continue;
+        b.extend(bounds(i));
+      }
+      return b;
+    }
+    /* Copies this node to dst and recomputes the child offset of dst from the address of the first child of this node. */
+    void copy_to( InternalNode* dst ) const
+    {
+      *dst = *this;
+      dst->setChildOffset((char*)this + getChildOffset(0)); // absolute address of child 0 of this node
+    }
+    
+#if !defined(__RTRT_GSIM)
+    
+    /* Prints the node and its children to the stream; depth selects the indentation, close controls whether the closing brace is written. */
+    void print(std::ostream& cout, uint32_t depth, bool close) const
+    {
+      cout << tab(depth) << "InternalNode" << NUM_CHILDREN << " {" << std::endl;
+      cout << tab(depth) << "  addr = " << this << std::endl;
+      cout << tab(depth) << "  childOffset = " << 64 * int64_t(this->childOffset) << std::endl;
+      cout << tab(depth) << "  nodeType = " << NodeType(this->nodeType) << std::endl;
+      cout << tab(depth) << "  nodeMask = " << std::bitset<8>(this->nodeMask) << std::endl;
+      
+      for (uint32_t i = 0; i < NUM_CHILDREN; i++)
+      {
+        cout << tab(depth) << "  child" << i << " = { ";
+        if (valid(i))
+        {
+          cout << "type = " << getChildType(i);
+          cout << ", offset = " << getChildOffset(i);
+          cout << ", prim = " << getChildStartPrim(i);
+          cout << ", bounds = " << bounds(i);
+        }
+        else {
+          cout << "INVALID";
+        }
+        cout << "  }" << std::endl;
+      }
+      
+      if (close)
+        cout << tab(depth) << "}";
+    }
+    
+    /* Output operator for internal nodes: prints at depth 0 including the closing brace. */
+    friend inline std::ostream& operator<<(std::ostream& cout, const InternalNode& node) {
+      node.print(cout, 0, true); return cout;
+    }
+#endif
+  };
+
+  inline size_t GetInternalNodeSize(uint32_t numChildren)
+  {
+    /* only nodes with up to 6 children exist; anything wider
+     * asserts in debug builds and reports a size of 0 */
+    if (numChildren > 6) { assert(false); return 0; }
+
+    return sizeof(InternalNode6Data);
+  }
+    
+  typedef InternalNode<InternalNode6Data> InternalNode6;
+}
--- a/Framework/external/embree/kernels/rthwif/rtbuild/quadifier.h
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/quadifier.h
@ -0,0 +1,151 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(ZE_RAYTRACING)
+#include "sys/sysinfo.h"
+#include "sys/vector.h"
+#include "math/vec2.h"
+#include "math/vec3.h"
+#include "math/bbox.h"
+#include "math/affinespace.h"
+#else
+#include "../../common/default.h"
+#endif
+
+namespace embree
+{
+  enum QuadifierType : uint16_t
+  {
+    QUADIFIER_TRIANGLE = 0,      // this triangle stays unpaired
+    QUADIFIER_QUAD = 1,          // any value > 0 other than 0xFFFF is the offset to the paired triangle
+    QUADIFIER_MAX_DISTANCE = 31, // largest offset that merge_triangle_window accepts for a pair
+    QUADIFIER_PAIRED = 0xFFFF,   // this triangle is paired with a previous triangle
+  };
+
+  template<typename Ty, size_t N>
+  struct static_deque // double-ended queue of at most N elements stored in a fixed array
+  {
+    __forceinline Ty pop_front() { // returns a copy of the front element and removes it; the queue must not be empty
+      assert(size());
+      return operator[](begin++);
+    }
+    /* appends v at the back; the queue must not be full */
+    __forceinline void push_back(const Ty& v) {
+      assert(size() < N);
+      operator[](end++) = v;
+    }
+    /* number of stored elements */
+    __forceinline size_t size() const {
+      assert(end >= begin);
+      return end-begin;
+    }
+    /* true once N elements are stored */
+    __forceinline bool full() const {
+      return size() == N;
+    }
+    /* removes the element at index j, where j is an index from the range [begin,end) */
+    __forceinline void erase( size_t j )
+    {
+      assert(j >= begin && j < end);
+
+      /* fast path as we mostly just merge with the subsequent triangle */
+      if (likely(j == begin))
+        begin++;
+
+      /* fastest when left side is small */
+      else if (j-begin < end-j-1) {
+        for (size_t i=j; i>=begin+1; i--) operator[](i) = operator[](i-1); // moves the elements in front of j one slot towards the back
+        begin++;
+      }
+
+      /* fastest if right side is small */
+      else {
+        for (size_t i=j+1; i<end; i++) operator[](i-1) = operator[](i); // moves the elements behind j one slot towards the front
+        end--;
+      }
+    }
+    /* element access: the index is reduced modulo N before the array is accessed */
+    __forceinline       Ty& operator[] ( const size_t i )       { return array[i%N]; }
+    __forceinline const Ty& operator[] ( const size_t i ) const { return array[i%N]; }
+    /* begin and end are plain counters; wrapping into the array happens only inside operator[] */
+    Ty array[N];
+    size_t begin = 0;
+    size_t end = 0;
+  };
+            
+  __forceinline bool pair_triangles(Vec3<uint32_t> a, Vec3<uint32_t> b, uint8_t& lb0, uint8_t& lb1, uint8_t& lb2) // pairing test for the index triples a and b; lb0, lb1, lb2 receive the results for b.x, b.y, b.z
+  {
+    const vuint<4> va(a.x,a.y,a.z,0); // the three indices of a, with 0 as fourth element
+    const vboolf<4> mb0 = vboolf<4>(0x8) | vuint<4>(b.x) == va; // == binds tighter than |: comparison of b.x against va, OR'ed with vboolf<4>(0x8) (presumably a mask with only the fourth lane set - TODO confirm)
+    const vboolf<4> mb1 = vboolf<4>(0x8) | vuint<4>(b.y) == va; // same for b.y
+    const vboolf<4> mb2 = vboolf<4>(0x8) | vuint<4>(b.z) == va; // same for b.z
+    lb0 = bsf(movemask(mb0)); // presumably the index of the lowest set lane, thus 0..2 for a match inside a and 3 for no match - verify bsf/movemask semantics
+    lb1 = bsf(movemask(mb1));
+    lb2 = bsf(movemask(mb2));
+    return (lb0 == 3) + (lb1 == 3) + (lb2 == 3) <= 1; // true when at most one of lb0, lb1, lb2 equals 3
+  }
+
+  /* Processes the oldest triangle of the window: it gets paired with
+   * the first later window entry that pair_triangles accepts and that
+   * lies at most QUADIFIER_MAX_DISTANCE primitives away, otherwise it
+   * stays a single triangle. The outcome is recorded in quads_o. */
+  template<typename GetTriangleFunc>
+  __forceinline void merge_triangle_window( uint32_t geomID, static_deque<uint32_t,32>& triangleWindow, QuadifierType* quads_o, const GetTriangleFunc& getTriangle )
+  {
+    /* take the oldest triangle out of the window */
+    const uint32_t firstID = triangleWindow.pop_front();
+    const Vec3<uint32_t> firstTri = getTriangle(geomID, firstID);
+
+    /* look through the remaining window entries for a partner */
+    for ( size_t slot = triangleWindow.begin; slot != triangleWindow.end; ++slot )
+    {
+      const uint32_t secondID = triangleWindow[slot];
+      const Vec3<uint32_t> secondTri = getTriangle(geomID, secondID);
+
+      /* the two triangles have to be pairable ... */
+      uint8_t lb0,lb1,lb2;
+      bool pair = pair_triangles(firstTri,secondTri,lb0,lb1,lb2);
+
+      /* ... and close to each other, as hardware limits the bits for the offset encoding */
+      const uint32_t prim_offset = secondID - firstID;
+      pair &= prim_offset <= QUADIFIER_MAX_DISTANCE;
+      if (!pair) continue;
+
+      /* record the pairing and remove the partner from the window */
+      assert(prim_offset > 0 && prim_offset < QUADIFIER_PAIRED);
+      quads_o[firstID] = (QuadifierType) prim_offset;
+      quads_o[secondID] = QUADIFIER_PAIRED;
+      triangleWindow.erase(slot);
+      return;
+    }
+
+    /* make a triangle if we fail to find a candidate to pair with */
+    quads_o[firstID] = QUADIFIER_TRIANGLE;
+  }
+  
+  /* Runs the pairing over the primitives [primID0,primID1) of one geometry
+   * using a window of 32 triangles. The returned count grows by one per merge
+   * step, thus by one for every pair and for every triangle left unpaired. */
+  template<typename GetTriangleFunc>
+  inline size_t pair_triangles( uint32_t geomID, QuadifierType* quads_o, uint32_t primID0, uint32_t primID1, const GetTriangleFunc& getTriangle ) 
+  {
+    size_t numTrianglePairs = 0;
+    static_deque<uint32_t, 32> triangleWindow;
+    auto mergeOldest = [&]() {
+      merge_triangle_window(geomID, triangleWindow,quads_o,getTriangle);
+      numTrianglePairs++;
+    };
+
+    /* feed the window and resolve its oldest entry whenever it is full */
+    for (uint32_t primID=primID0; primID<primID1; primID++) {
+      triangleWindow.push_back(primID);
+      if (triangleWindow.full()) mergeOldest();
+    }
+
+    /* resolve whatever is left in the window */
+    while (triangleWindow.size()) mergeOldest();
+    return numTrianglePairs;
+  }
+}
--- a/Framework/external/embree/kernels/rthwif/rtbuild/rtbuild.cpp
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/rtbuild.cpp
@ -0,0 +1,762 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#define RTHWIF_EXPORT_API
+
+#include "rtbuild.h"
+#include "qbvh6_builder_sah.h"
+
+// get definition of debug extension
+#if defined(EMBREE_SYCL_ALLOC_DISPATCH_GLOBALS)
+#include "../../level_zero/ze_wrapper.h"
+#endif
+
+namespace embree
+{
+  using namespace embree::isa;
+
+  static tbb::task_arena g_arena(tbb::this_task_arena::max_concurrency(),tbb::this_task_arena::max_concurrency()); // file-local arena; both constructor arguments are max_concurrency() - NOTE(review): the second argument presumably is TBB's reserved slot count, confirm against the task_arena documentation
+  
+  inline ze_rtas_triangle_indices_uint32_exp_t getPrimitive(const ze_rtas_builder_triangles_geometry_info_exp_t* geom, uint32_t primID) {
+    assert(primID < geom->triangleCount);
+    return *(const ze_rtas_triangle_indices_uint32_exp_t*)((const char*)geom->pTriangleBuffer + geom->triangleStride*uint64_t(primID)); // entry primID of the byte-strided triangle buffer
+  }
+  
+  inline Vec3f getVertex(const ze_rtas_builder_triangles_geometry_info_exp_t* geom, uint32_t vertexID) {
+    assert(vertexID < geom->vertexCount);
+    return *(const Vec3f*)((const char*)geom->pVertexBuffer + geom->vertexStride*uint64_t(vertexID)); // entry vertexID of the byte-strided vertex buffer
+  }
+  
+  inline ze_rtas_quad_indices_uint32_exp_t getPrimitive(const ze_rtas_builder_quads_geometry_info_exp_t* geom, uint32_t primID) {
+    assert(primID < geom->quadCount);
+    return *(const ze_rtas_quad_indices_uint32_exp_t*)((const char*)geom->pQuadBuffer + geom->quadStride*uint64_t(primID)); // entry primID of the byte-strided quad buffer
+  }
+  
+  inline Vec3f getVertex(const ze_rtas_builder_quads_geometry_info_exp_t* geom, uint32_t vertexID) {
+    assert(vertexID < geom->vertexCount);
+    return *(const Vec3f*)((const char*)geom->pVertexBuffer + geom->vertexStride*uint64_t(vertexID)); // entry vertexID of the byte-strided vertex buffer
+  }
+
+  /* Converts the transformation of an instance geometry into an
+   * AffineSpace3fa. The column-major, aligned column-major and
+   * row-major descriptions are read through equally named members
+   * (vx_*, vy_*, vz_*, p_*), so a single conversion serves all of
+   * them. Any other format raises std::runtime_error. */
+  inline AffineSpace3fa getTransform(const ze_rtas_builder_instance_geometry_info_exp_t* geom)
+  {
+    /* shared conversion: the vx, vy, vz and p members, in that order */
+    auto convert = [](const auto* xfm) -> AffineSpace3fa {
+      return {
+        { xfm->vx_x, xfm->vx_y, xfm->vx_z },
+        { xfm->vy_x, xfm->vy_y, xfm->vy_z },
+        { xfm->vz_x, xfm->vz_y, xfm->vz_z },
+        { xfm-> p_x, xfm-> p_y, xfm-> p_z }
+      };
+    };
+
+    /* the format selects the struct type that pTransform points to */
+    const void* const xfmPtr = geom->pTransform;
+
+    switch (geom->transformFormat)
+    {
+    case ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_COLUMN_MAJOR:
+      return convert((const ze_rtas_transform_float3x4_column_major_exp_t*) xfmPtr);
+
+    case ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ALIGNED_COLUMN_MAJOR:
+      return convert((const ze_rtas_transform_float3x4_aligned_column_major_exp_t*) xfmPtr);
+
+    case ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ROW_MAJOR:
+      return convert((const ze_rtas_transform_float3x4_row_major_exp_t*) xfmPtr);
+
+    default:
+      throw std::runtime_error("invalid transform format");
+    }
+  }
+  
+  inline void verifyGeometryDesc(const ze_rtas_builder_triangles_geometry_info_exp_t* geom)
+  {
+    /* only 32 bit triangle indices and float3 vertices are supported */
+    if (ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_TRIANGLE_INDICES_UINT32 != geom->triangleFormat)
+      throw std::runtime_error("triangle format must be ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_TRIANGLE_INDICES_UINT32");
+    if (ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3 != geom->vertexFormat)
+      throw std::runtime_error("vertex format must be ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3");
+    /* a buffer may only be missing while its element count is zero */
+    if (geom->triangleCount != 0 && !geom->pTriangleBuffer) throw std::runtime_error("no triangle buffer specified");
+    if (geom->vertexCount   != 0 && !geom->pVertexBuffer)   throw std::runtime_error("no vertex buffer specified");
+  }
+
+  inline void verifyGeometryDesc(const ze_rtas_builder_quads_geometry_info_exp_t* geom)
+  {
+    /* only 32 bit quad indices and float3 vertices are supported */
+    if (ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_QUAD_INDICES_UINT32 != geom->quadFormat)
+      throw std::runtime_error("quad format must be ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_QUAD_INDICES_UINT32");
+    if (ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3 != geom->vertexFormat)
+      throw std::runtime_error("vertex format must be ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3");
+    /* a buffer may only be missing while its element count is zero */
+    if (geom->quadCount   != 0 && !geom->pQuadBuffer)   throw std::runtime_error("no quad buffer specified");
+    if (geom->vertexCount != 0 && !geom->pVertexBuffer) throw std::runtime_error("no vertex buffer specified");
+  }
+
+  inline void verifyGeometryDesc(const ze_rtas_builder_procedural_geometry_info_exp_t* geom)
+  {
+    if (geom->primCount != 0 && !geom->pfnGetBoundsCb) throw std::runtime_error("no bounds function specified"); // the callback is required as soon as primitives exist
+    if (geom->reserved) throw std::runtime_error("reserved value must be zero");
+  }
+
+  inline void verifyGeometryDesc(const ze_rtas_builder_instance_geometry_info_exp_t* geom)
+  {
+    if (!geom->pTransform) throw std::runtime_error("no instance transformation specified"); // all three pointers are mandatory
+    if (!geom->pBounds) throw std::runtime_error("no acceleration structure bounds specified");
+    if (!geom->pAccelerationStructure) throw std::runtime_error("no acceleration structure to instanciate specified");
+  }
+
+  inline bool buildBounds(const ze_rtas_builder_triangles_geometry_info_exp_t* geom, uint32_t primID, BBox3fa& bbox, void* buildUserPtr)
+  {
+    /* reject primitives outside of the triangle buffer */
+    if (primID >= geom->triangleCount) return false;
+
+    /* reject triangles referencing vertices outside of the vertex buffer */
+    const ze_rtas_triangle_indices_uint32_exp_t indices = getPrimitive(geom,primID);
+    const auto numVertices = geom->vertexCount;
+    if (unlikely(indices.v0 >= numVertices || indices.v1 >= numVertices || indices.v2 >= numVertices)) return false;
+
+    /* reject triangles with invalid vertex positions */
+    const Vec3f v0 = getVertex(geom,indices.v0);
+    const Vec3f v1 = getVertex(geom,indices.v1);
+    const Vec3f v2 = getVertex(geom,indices.v2);
+    if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2))) return false;
+    bbox = BBox3fa(min(v0,v1,v2),max(v0,v1,v2));
+    return true;
+  }
+
+  inline bool buildBounds(const ze_rtas_builder_quads_geometry_info_exp_t* geom, uint32_t primID, BBox3fa& bbox, void* buildUserPtr)
+  {
+    /* reject primitives outside of the quad buffer */
+    if (primID >= geom->quadCount) return false;
+
+    /* reject quads referencing vertices outside of the vertex buffer */
+    const ze_rtas_quad_indices_uint32_exp_t quad = getPrimitive(geom,primID);
+    const auto numVertices = geom->vertexCount;
+    if (unlikely(quad.v0 >= numVertices || quad.v1 >= numVertices)) return false;
+    if (unlikely(quad.v2 >= numVertices || quad.v3 >= numVertices)) return false;
+
+    /* reject quads with invalid vertex positions */
+    const Vec3f v0 = getVertex(geom,quad.v0);
+    const Vec3f v1 = getVertex(geom,quad.v1);
+    const Vec3f v2 = getVertex(geom,quad.v2);
+    const Vec3f v3 = getVertex(geom,quad.v3);
+    if (unlikely(!isvalid(v0) || !isvalid(v1) || !isvalid(v2) || !isvalid(v3))) return false;
+
+    bbox = BBox3fa(min(v0,v1,v2,v3),max(v0,v1,v2,v3));
+    return true;
+  }
+
+  inline bool buildBounds(const ze_rtas_builder_procedural_geometry_info_exp_t* geom, uint32_t primID, BBox3fa& bbox, void* buildUserPtr)
+  {
+    /* nothing to bound for out of range primitives or without a callback */
+    if (primID >= geom->primCount || geom->pfnGetBoundsCb == nullptr) return false;
+
+    /* request the bounds of this single primitive from the callback */
+    BBox3f bounds;
+    ze_rtas_geometry_aabbs_exp_cb_params_t params = { ZE_STRUCTURE_TYPE_RTAS_GEOMETRY_AABBS_EXP_CB_PARAMS };
+    params.pGeomUserPtr = geom->pGeomUserPtr;
+    params.pBuildUserPtr = buildUserPtr;
+    params.pBoundsOut = (ze_rtas_aabb_exp_t*) &bounds;
+    params.primID = primID;
+    params.primIDCount = 1;
+    (geom->pfnGetBoundsCb)(&params);
+
+    /* invalid or empty bounds disable the primitive */
+    if (unlikely(!isvalid(bounds.lower) || !isvalid(bounds.upper) || bounds.empty())) return false;
+
+    bbox = (BBox3f&) bounds;
+    return true;
+  }
+
+  inline bool buildBounds(const ze_rtas_builder_instance_geometry_info_exp_t* geom, uint32_t primID, BBox3fa& bbox, void* buildUserPtr)
+  {
+    /* an instance provides exactly one primitive; without acceleration structure, transformation or bounds pointer (pBounds gets dereferenced below) it provides none */
+    if (primID >= 1) return false;
+    if (geom->pAccelerationStructure == nullptr) return false;
+    if (geom->pTransform == nullptr || geom->pBounds == nullptr) return false;
+    /* apply the instance transformation to the bounds referenced by pBounds */
+    const AffineSpace3fa local2world = getTransform(geom);
+    const Vec3fa lower(geom->pBounds->lower.x,geom->pBounds->lower.y,geom->pBounds->lower.z);
+    const Vec3fa upper(geom->pBounds->upper.x,geom->pBounds->upper.y,geom->pBounds->upper.z);
+    const BBox3fa bounds = xfmBounds(local2world,BBox3fa(lower,upper));
+    /* invalid or empty results leave bbox untouched and report failure */
+    if (unlikely(!isvalid(bounds.lower))) return false;
+    if (unlikely(!isvalid(bounds.upper))) return false;
+    if (unlikely(bounds.empty())) return false;
+    bbox = bounds;
+    return true;
+  }
+
+  template<typename GeometryType>
+  PrimInfo createGeometryPrimRefArray(const GeometryType* geom, void* buildUserPtr, evector<PrimRef>& prims, const range<size_t>& r, size_t k, unsigned int geomID)
+  {
+    /* writes one PrimRef per primitive with usable bounds to prims, starting at index k, and accumulates them */
+    PrimInfo info(empty);
+    for (uint32_t primID=r.begin(); primID<r.end(); primID++) {
+      BBox3fa primBounds = empty;
+      if (!buildBounds(geom,primID,primBounds,buildUserPtr)) continue; // primitives without bounds are skipped
+      const PrimRef ref(primBounds,geomID,primID);
+      prims[k++] = ref;
+      info.add_center2(ref);
+    }
+    return info;
+  }
+  
+  typedef struct _zet_base_desc_t
+  {
+    /** [in] type of this structure */
+    ze_structure_type_t stype;
+    
+    /** [in,out][optional] must be null or a pointer to an extension-specific structure */
+    const void* pNext;
+    /* descriptors and property structs get cast to this type in order to walk their pNext chain (see checkDescChain) */
+  } zet_base_desc_t_;
+
+  #define VALIDATE(arg) \
+  {\
+  ze_result_t result = validate(arg); /* overload resolution picks the validate() matching the argument type */ \
+  if (result != ZE_RESULT_SUCCESS) return result; /* leaves the calling function with the error code of the failed check */ \
+  }
+
+#define VALIDATE_PTR(arg)                       \
+  {                                                                     \
+    if ((arg) == nullptr) return ZE_RESULT_ERROR_INVALID_NULL_POINTER; /* leaves the calling function when arg is a null pointer */ \
+  }
+
+   ze_result_t validate(ze_driver_handle_t hDriver)
+  {
+    /* a driver handle passes as soon as it is non-null,
+     * nothing else gets inspected */
+    return (hDriver != nullptr) ? ZE_RESULT_SUCCESS
+                                : ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+
+  ze_result_t validate(ze_device_handle_t hDevice)
+  {
+    /* a device handle passes as soon as it is non-null,
+     * nothing else gets inspected */
+    return (hDevice != nullptr) ? ZE_RESULT_SUCCESS
+                                : ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+  }
+  
+  bool checkDescChain(zet_base_desc_t_* desc)
+  {
+    /* follow at most 1024 pNext links; chains that do not end within that limit (cycles included) are rejected */
+    for (size_t remaining = 1024; remaining != 0; --remaining) {
+      if (desc->pNext == nullptr) return true;
+      desc = (zet_base_desc_t_*) desc->pNext;
+    }
+    return false;
+  }
+
+  struct ze_rtas_builder
+  {
+    /* tag value stored in every constructed builder object */
+    enum { MAGICK = 0x45FE67E1 };
+
+    ze_rtas_builder () {}
+
+    /* the tag gets cleared when the object is destroyed */
+    ~ze_rtas_builder() { magick = 0x0; }
+
+    /* true while the object carries the builder tag */
+    bool verify() const { return MAGICK == magick; }
+
+    /* initialized with the tag on construction */
+    uint32_t magick = MAGICK;
+  };
+
+  ze_result_t validate(ze_rtas_builder_exp_handle_t hBuilder)
+  {
+    /* null handles get their own error code */
+    if (!hBuilder) return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+
+    /* the handle has to refer to an object that carries the builder tag */
+    const ze_rtas_builder* const builder = (ze_rtas_builder*)hBuilder;
+    if (builder->verify()) return ZE_RESULT_SUCCESS;
+    return ZE_RESULT_ERROR_INVALID_ARGUMENT;
+  }
+
+  struct ze_rtas_parallel_operation_t
+  {
+    /* tag value stored in every constructed parallel operation object */
+    enum { MAGICK = 0xE84567E1 };
+
+    ze_rtas_parallel_operation_t() {}
+
+    /* the tag gets cleared when the object is destroyed */
+    ~ze_rtas_parallel_operation_t() { magick = 0x0; }
+
+    /* reports ZE_RESULT_ERROR_INVALID_ARGUMENT unless the object carries its tag */
+    ze_result_t verify() const {
+      return (magick == MAGICK) ? ZE_RESULT_SUCCESS : ZE_RESULT_ERROR_INVALID_ARGUMENT;
+    }
+
+    /* data members in their original order; the member functions
+     * above access magick only, the remaining members are used by
+     * code outside of this struct */
+    uint32_t magick = MAGICK;
+    std::atomic<bool> object_in_use = false;
+    ze_result_t errorCode = ZE_RESULT_SUCCESS;
+    tbb::task_group group;
+  };
+
+  ze_result_t validate(ze_rtas_parallel_operation_exp_handle_t hParallelOperation)
+  {
+    /* null handles get their own error code, everything else is decided by the tag check of the object */
+    if (!hParallelOperation) return ZE_RESULT_ERROR_INVALID_NULL_HANDLE;
+    const ze_rtas_parallel_operation_t* const op = (ze_rtas_parallel_operation_t*)hParallelOperation;
+    return op->verify();
+  }
+
+  ze_result_t validate(const ze_rtas_builder_exp_desc_t* pDescriptor)
+  {
+    /* the descriptor has to exist */
+    if (!pDescriptor)
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    /* a wrong structure type, a pNext chain rejected by
+     * checkDescChain and a builder version above the current
+     * one are all reported with the same error code */
+    if (pDescriptor->stype != ZE_STRUCTURE_TYPE_RTAS_BUILDER_EXP_DESC ||
+        !checkDescChain((zet_base_desc_t_*)pDescriptor) ||
+        uint32_t(ZE_RTAS_BUILDER_EXP_VERSION_CURRENT) < uint32_t(pDescriptor->builderVersion))
+      return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    return ZE_RESULT_SUCCESS;
+  }
+
+  ze_result_t validate(ze_rtas_device_exp_properties_t* pProperties)
+  {
+    if (!pProperties)
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    /* a wrong structure type and a pNext chain rejected by
+     * checkDescChain are reported with the same error code */
+    if (pProperties->stype != ZE_STRUCTURE_TYPE_RTAS_DEVICE_EXP_PROPERTIES ||
+        !checkDescChain((zet_base_desc_t_*)pProperties))
+      return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    return ZE_RESULT_SUCCESS;
+  }
+
+  ze_result_t validate(ze_rtas_format_exp_t rtasFormat)
+  {
+    /* the INVALID marker and every value above the highest known
+     * device format version get rejected */
+    const bool isInvalid = (rtasFormat == ZE_RTAS_FORMAT_EXP_INVALID);
+    const bool tooLarge = uint32_t(rtasFormat) > uint32_t(ZE_RTAS_DEVICE_FORMAT_EXP_VERSION_MAX);
+    if (isInvalid || tooLarge) return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    return ZE_RESULT_SUCCESS;
+  }
+  
+  /* Validates a build operation descriptor. The checks run in the order
+   * listed here and the first failing one determines the result: null
+   * descriptor, wrong structure type, rejected pNext chain, unsupported
+   * acceleration structure format, missing geometry array, too many
+   * geometries, unknown build quality, unknown build flags. */
+  ze_result_t validate(const ze_rtas_builder_build_op_exp_desc_t* args)
+  {
+    if (!args)
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    /* structure type and pNext chain share one error code */
+    if (args->stype != ZE_STRUCTURE_TYPE_RTAS_BUILDER_BUILD_OP_EXP_DESC || !checkDescChain((zet_base_desc_t_*)args))
+      return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    /* the acceleration structure format has to pass its own validation */
+    VALIDATE(args->rtasFormat);
+
+    /* a geometry array is required as soon as geometries are announced */
+    if (args->numGeometries > 0 && args->ppGeometries == nullptr)
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    /* at most 0x00FFFFFF geometries are accepted */
+    if (args->numGeometries > 0x00FFFFFF)
+      return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    /* the build quality has to lie between 0 and the HIGH hint */
+    if (args->buildQuality < 0 || ZE_RTAS_BUILDER_BUILD_QUALITY_HINT_EXP_HIGH < args->buildQuality)
+      return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    /* no flag bit above NO_DUPLICATE_ANYHIT_INVOCATION may be set */
+    if (args->buildFlags >= (ZE_RTAS_BUILDER_BUILD_OP_EXP_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION<<1))
+      return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    return ZE_RESULT_SUCCESS;
+  }
+
+  /* Validates the struct that receives the builder properties:
+   * it has to exist, carry the matching structure type and
+   * have a pNext chain that checkDescChain accepts. */
+  ze_result_t validate(ze_rtas_builder_exp_properties_t* pProp)
+  {
+    if (!pProp)
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    /* type and chain problems share one error code */
+    const bool typeOk = (pProp->stype == ZE_STRUCTURE_TYPE_RTAS_BUILDER_EXP_PROPERTIES);
+    if (!typeOk || !checkDescChain((zet_base_desc_t_*)pProp))
+      return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    /* all checks passed */
+    return ZE_RESULT_SUCCESS;
+  }
+
+  /* Validates the struct that receives the parallel operation
+   * properties: it has to exist, carry the matching structure
+   * type and have a pNext chain that checkDescChain accepts. */
+  ze_result_t validate(ze_rtas_parallel_operation_exp_properties_t* pProperties)
+  {
+    if (!pProperties)
+      return ZE_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    /* type and chain problems share one error code */
+    const bool typeOk = (pProperties->stype == ZE_STRUCTURE_TYPE_RTAS_PARALLEL_OPERATION_EXP_PROPERTIES);
+    if (!typeOk || !checkDescChain((zet_base_desc_t_*)pProperties))
+      return ZE_RESULT_ERROR_INVALID_ENUMERATION;
+
+    /* all checks passed */
+    return ZE_RESULT_SUCCESS;
+  }
+  
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASBuilderCreateExpImpl(ze_driver_handle_t hDriver, const ze_rtas_builder_exp_desc_t *pDescriptor, ze_rtas_builder_exp_handle_t *phBuilder)
+  {
+    /* all arguments are checked before the builder object gets allocated */
+    VALIDATE(hDriver);
+    VALIDATE(pDescriptor);
+    VALIDATE_PTR(phBuilder);
+    ze_rtas_builder* const builder = new ze_rtas_builder();
+    *phBuilder = (ze_rtas_builder_exp_handle_t) builder;
+    return ZE_RESULT_SUCCESS;
+  }
+
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASBuilderDestroyExpImpl(ze_rtas_builder_exp_handle_t hBuilder)
+  {
+    VALIDATE(hBuilder); // null handles and objects without builder tag are refused
+    delete reinterpret_cast<ze_rtas_builder*>(hBuilder);
+    return ZE_RESULT_SUCCESS;
+  }
+
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeDriverRTASFormatCompatibilityCheckExpImpl( ze_driver_handle_t hDriver,
+                                                                                        const ze_rtas_format_exp_t accelFormat,
+                                                                                        const ze_rtas_format_exp_t otherAccelFormat )
+  {
+    /* the driver handle and both formats have to be valid */
+    VALIDATE(hDriver);
+    VALIDATE(accelFormat);
+    VALIDATE(otherAccelFormat);
+
+    /* two formats are compatible exactly when they are
+     * identical, anything else is reported as incompatible */
+    const bool compatible = (accelFormat == otherAccelFormat);
+
+    return compatible ? ZE_RESULT_SUCCESS
+                      : ZE_RESULT_EXP_ERROR_OPERANDS_INCOMPATIBLE;
+  }
+
+  uint32_t getNumPrimitives(const ze_rtas_builder_geometry_info_exp_t* geom)
+  {
+    /* the geometry type selects the struct that geom actually points to */
+    const auto type = geom->geometryType;
+    if (type == ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_TRIANGLES)  return ((ze_rtas_builder_triangles_geometry_info_exp_t*) geom)->triangleCount;
+    if (type == ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_QUADS)      return ((ze_rtas_builder_quads_geometry_info_exp_t*) geom)->quadCount;
+    if (type == ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_PROCEDURAL) return ((ze_rtas_builder_procedural_geometry_info_exp_t*) geom)->primCount;
+    if (type == ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_INSTANCE)   return 1; // an instance counts as a single primitive
+    return 0; // any other geometry type contributes nothing
+  }
+  
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASBuilderGetBuildPropertiesExpImpl(ze_rtas_builder_exp_handle_t hBuilder,
+                                                                                  const ze_rtas_builder_build_op_exp_desc_t* args,
+                                                                                  ze_rtas_builder_exp_properties_t* pProp)
+  {
+    /* refuse invalid handles, descriptors and property structs */
+    VALIDATE(hBuilder);
+    VALIDATE(args);
+    VALIDATE(pProp);
+
+    const size_t numGeometries = args->numGeometries;
+    const ze_rtas_builder_geometry_info_exp_t** geometries = args->ppGeometries;
+
+    /* number of primitives of a geometry, missing geometries count as empty */
+    auto getSize = [&](uint32_t geomID) -> size_t {
+      const ze_rtas_builder_geometry_info_exp_t* geom = geometries[geomID];
+      return geom ? getNumPrimitives(geom) : 0;
+    };
+
+    /* builder primitive type of a geometry, which must not be missing */
+    auto getType = [&](unsigned int geomID)
+    {
+      const ze_rtas_builder_geometry_info_exp_t* geom = geometries[geomID];
+      assert(geom);
+      switch (geom->geometryType) {
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_QUADS: return QBVH6BuilderSAH::QUAD;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_TRIANGLES : return QBVH6BuilderSAH::TRIANGLE;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_INSTANCE: return QBVH6BuilderSAH::INSTANCE;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_PROCEDURAL: return QBVH6BuilderSAH::PROCEDURAL;
+      default: throw std::runtime_error("invalid geometry type");
+      };
+    };
+
+    /* the builder fills in the expected and the worst case size of the
+     * acceleration structure as well as the scratch buffer size */
+    size_t scratchBytes = 0, worstCaseBytes = 0, expectedBytes = 0;
+    QBVH6BuilderSAH::estimateSize(numGeometries, getSize, getType, args->rtasFormat, args->buildQuality, args->buildFlags, expectedBytes, worstCaseBytes, scratchBytes);
+
+    /* hand the estimates back to the caller */
+    pProp->flags = 0;
+    pProp->scratchBufferSizeBytes = scratchBytes;
+    pProp->rtasBufferSizeBytesMaxRequired = worstCaseBytes;
+    pProp->rtasBufferSizeBytesExpected = expectedBytes;
+    return ZE_RESULT_SUCCESS;
+  }
+  
+  /* Performs the actual acceleration structure build.
+   *
+   * Verifies all non-null geometry descriptors, sets up the accessor
+   * callbacks the builder needs, and invokes QBVH6BuilderSAH::build.
+   *
+   * Returns ZE_RESULT_SUCCESS when the builder reports success,
+   * ZE_RESULT_EXP_RTAS_BUILD_RETRY when it reports failure, and
+   * ZE_RESULT_ERROR_UNKNOWN when a std::exception is caught by the
+   * function try block (exceptions of other types are not caught here). */
+  ze_result_t zeRTASBuilderBuildExpBody(const ze_rtas_builder_build_op_exp_desc_t* args,
+                                            void *pScratchBuffer, size_t scratchBufferSizeBytes,
+                                            void *pRtasBuffer, size_t rtasBufferSizeBytes,
+                                            void *pBuildUserPtr, ze_rtas_aabb_exp_t *pBounds, size_t *pRtasBufferSizeBytes) try
+  {
+    const ze_rtas_builder_geometry_info_exp_t** geometries = args->ppGeometries;
+    const uint32_t numGeometries = args->numGeometries;
+
+    /* verify input descriptors */
+    /* null entries are skipped; an unknown geometry type throws */
+    parallel_for(numGeometries,[&](uint32_t geomID) {
+      const ze_rtas_builder_geometry_info_exp_t* geom = geometries[geomID];
+      if (geom == nullptr) return;
+      
+      switch (geom->geometryType) {
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_TRIANGLES  : verifyGeometryDesc((ze_rtas_builder_triangles_geometry_info_exp_t*)geom); break;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_QUADS      : verifyGeometryDesc((ze_rtas_builder_quads_geometry_info_exp_t*    )geom); break;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_PROCEDURAL : verifyGeometryDesc((ze_rtas_builder_procedural_geometry_info_exp_t*)geom); break;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_INSTANCE   : verifyGeometryDesc((ze_rtas_builder_instance_geometry_info_exp_t* )geom); break;
+      default: throw std::runtime_error("invalid geometry type");
+      };
+    });
+    
+    /* number of primitives of a geometry; null entries count as empty */
+    auto getSize = [&](uint32_t geomID) -> size_t {
+      const ze_rtas_builder_geometry_info_exp_t* geom = geometries[geomID];
+      if (geom == nullptr) return 0;
+      return getNumPrimitives(geom);
+    };
+    
+    /* builder primitive type of a geometry; the entry must be non-null (asserted only) */
+    auto getType = [&](unsigned int geomID)
+    {
+      const ze_rtas_builder_geometry_info_exp_t* geom = geometries[geomID];
+      assert(geom);
+      switch (geom->geometryType) {
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_TRIANGLES : return QBVH6BuilderSAH::TRIANGLE;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_QUADS: return QBVH6BuilderSAH::QUAD;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_PROCEDURAL: return QBVH6BuilderSAH::PROCEDURAL;
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_INSTANCE: return QBVH6BuilderSAH::INSTANCE;
+      default: throw std::runtime_error("invalid geometry type");
+      };
+    };
+    
+    /* forwards to the createGeometryPrimRefArray overload matching the geometry type;
+       time_range is accepted but not forwarded */
+    auto createPrimRefArray = [&] (evector<PrimRef>& prims, BBox1f time_range, const range<size_t>& r, size_t k, unsigned int geomID) -> PrimInfo
+    {
+      const ze_rtas_builder_geometry_info_exp_t* geom = geometries[geomID];
+      assert(geom);
+
+      switch (geom->geometryType) {
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_TRIANGLES  : return createGeometryPrimRefArray((ze_rtas_builder_triangles_geometry_info_exp_t*)geom,pBuildUserPtr,prims,r,k,geomID);
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_QUADS      : return createGeometryPrimRefArray((ze_rtas_builder_quads_geometry_info_exp_t*    )geom,pBuildUserPtr,prims,r,k,geomID);
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_PROCEDURAL: return createGeometryPrimRefArray((ze_rtas_builder_procedural_geometry_info_exp_t*)geom,pBuildUserPtr,prims,r,k,geomID);
+      case ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_INSTANCE: return createGeometryPrimRefArray((ze_rtas_builder_instance_geometry_info_exp_t* )geom,pBuildUserPtr,prims,r,k,geomID);
+      default: throw std::runtime_error("invalid geometry type");
+      };
+    };
+
+    /* maps the NON_OPAQUE bit to GeometryFlags::NONE, everything else to GeometryFlags::OPAQUE */
+    auto convertGeometryFlags = [&] (ze_rtas_builder_packed_geometry_exp_flags_t flags) -> GeometryFlags {
+      return (flags & ZE_RTAS_BUILDER_GEOMETRY_EXP_FLAG_NON_OPAQUE) ? GeometryFlags::NONE : GeometryFlags::OPAQUE;
+    };
+    
+    /* fetches a triangle; a default-constructed Triangle is returned when an
+       index is out of range or a vertex fails isvalid() */
+    auto getTriangle = [&](unsigned int geomID, unsigned int primID)
+    {
+      const ze_rtas_builder_triangles_geometry_info_exp_t* geom = (const ze_rtas_builder_triangles_geometry_info_exp_t*) geometries[geomID];
+      assert(geom);
+      
+      const ze_rtas_triangle_indices_uint32_exp_t tri = getPrimitive(geom,primID);
+      if (unlikely(tri.v0 >= geom->vertexCount)) return QBVH6BuilderSAH::Triangle();
+      if (unlikely(tri.v1 >= geom->vertexCount)) return QBVH6BuilderSAH::Triangle();
+      if (unlikely(tri.v2 >= geom->vertexCount)) return QBVH6BuilderSAH::Triangle();
+      
+      const Vec3f p0 = getVertex(geom,tri.v0);
+      const Vec3f p1 = getVertex(geom,tri.v1);
+      const Vec3f p2 = getVertex(geom,tri.v2);
+      if (unlikely(!isvalid(p0))) return QBVH6BuilderSAH::Triangle();
+      if (unlikely(!isvalid(p1))) return QBVH6BuilderSAH::Triangle();
+      if (unlikely(!isvalid(p2))) return QBVH6BuilderSAH::Triangle();
+
+      const GeometryFlags gflags = convertGeometryFlags(geom->geometryFlags);
+      return QBVH6BuilderSAH::Triangle(tri.v0,tri.v1,tri.v2,p0,p1,p2,gflags,geom->geometryMask);
+    };
+    
+    /* returns the three vertex indices of a triangle without any validation */
+    auto getTriangleIndices = [&] (uint32_t geomID, uint32_t primID) {
+      const ze_rtas_builder_triangles_geometry_info_exp_t* geom = (const ze_rtas_builder_triangles_geometry_info_exp_t*) geometries[geomID];
+      assert(geom);
+      const ze_rtas_triangle_indices_uint32_exp_t tri = getPrimitive(geom,primID);
+      return Vec3<uint32_t>(tri.v0,tri.v1,tri.v2);
+    };
+    
+    /* fetches a quad */
+    /* NOTE(review): unlike getTriangle, no index range or isvalid() checks are done here;
+       presumably invalid quads are filtered out earlier - confirm */
+    auto getQuad = [&](unsigned int geomID, unsigned int primID)
+    {
+      const ze_rtas_builder_quads_geometry_info_exp_t* geom = (const ze_rtas_builder_quads_geometry_info_exp_t*) geometries[geomID];
+      assert(geom);
+                     
+      const ze_rtas_quad_indices_uint32_exp_t quad = getPrimitive(geom,primID);
+      const Vec3f p0 = getVertex(geom,quad.v0);
+      const Vec3f p1 = getVertex(geom,quad.v1);
+      const Vec3f p2 = getVertex(geom,quad.v2);
+      const Vec3f p3 = getVertex(geom,quad.v3);
+
+      const GeometryFlags gflags = convertGeometryFlags(geom->geometryFlags);
+      return QBVH6BuilderSAH::Quad(p0,p1,p2,p3,gflags,geom->geometryMask);
+    };
+    
+    /* procedural primitives only carry the geometry mask; primID is unused */
+    auto getProcedural = [&](unsigned int geomID, unsigned int primID) {
+      const ze_rtas_builder_procedural_geometry_info_exp_t* geom = (const ze_rtas_builder_procedural_geometry_info_exp_t*) geometries[geomID];
+      assert(geom);
+      return QBVH6BuilderSAH::Procedural(geom->geometryMask); // FIXME: pass gflags
+    };
+    
+    /* builds an instance from the transform, referenced acceleration structure,
+       mask and user ID of the descriptor; primID is unused */
+    auto getInstance = [&](unsigned int geomID, unsigned int primID)
+    {
+      assert(geometries[geomID]);
+      assert(geometries[geomID]->geometryType == ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_INSTANCE);
+      const ze_rtas_builder_instance_geometry_info_exp_t* geom = (const ze_rtas_builder_instance_geometry_info_exp_t*) geometries[geomID];
+      void* accel = geom->pAccelerationStructure;
+      const AffineSpace3fa local2world = getTransform(geom);
+      return QBVH6BuilderSAH::Instance(local2world,accel,geom->geometryMask,geom->instanceUserID); // FIXME: pass instance flags
+    };
+
+    /* dispatch globals ptr for debugging purposes */
+    /* only the first structure in the pNext chain is inspected */
+    void* dispatchGlobalsPtr = nullptr;
+#if defined(EMBREE_SYCL_ALLOC_DISPATCH_GLOBALS)
+    if (args->pNext) {
+      zet_base_desc_t_* next = (zet_base_desc_t_*) args->pNext;
+      if (next->stype == ZE_STRUCTURE_TYPE_RTAS_BUILDER_BUILD_OP_DEBUG_EXP_DESC) {
+        ze_rtas_builder_build_op_debug_exp_desc_t* debug_ext = (ze_rtas_builder_build_op_debug_exp_desc_t*) next;
+        dispatchGlobalsPtr = debug_ext->dispatchGlobalsPtr;
+      }
+    }
+#endif
+
+    /* run the builder; a false result is reported to the caller as "retry" */
+    bool verbose = false;
+    bool success = QBVH6BuilderSAH::build(numGeometries, nullptr, 
+                           getSize, getType, 
+                           createPrimRefArray, getTriangle, getTriangleIndices, getQuad, getProcedural, getInstance,
+                           (char*)pRtasBuffer, rtasBufferSizeBytes,
+                           pScratchBuffer, scratchBufferSizeBytes,
+                           (BBox3f*) pBounds, pRtasBufferSizeBytes,
+                           args->rtasFormat, args->buildQuality, args->buildFlags, verbose, dispatchGlobalsPtr);
+    if (!success) {
+      return ZE_RESULT_EXP_RTAS_BUILD_RETRY;
+    }
+    return ZE_RESULT_SUCCESS;
+  }
+  /* any std::exception raised above is turned into a generic error code */
+  catch (std::exception& e) {
+    //std::cerr << "caught exception during BVH build: " << e.what() << std::endl;
+    return ZE_RESULT_ERROR_UNKNOWN;
+  }
+  
+  /* Builds an acceleration structure, either synchronously or deferred.
+   *
+   * With a null hParallelOperation the build runs inside g_arena on the
+   * calling thread and its result code is returned directly.
+   *
+   * With a non-null hParallelOperation the build is scheduled into the task
+   * group of that parallel operation (inside g_arena) and
+   * ZE_RESULT_EXP_RTAS_BUILD_DEFERRED is returned; the result is stored in
+   * the operation's errorCode. ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE is
+   * returned when the parallel operation is already marked in use.
+   *
+   * hBuilder, args, pScratchBuffer and pRtasBuffer are validated; all other
+   * arguments are forwarded unchanged to zeRTASBuilderBuildExpBody. */
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASBuilderBuildExpImpl(ze_rtas_builder_exp_handle_t hBuilder,
+                                                                     const ze_rtas_builder_build_op_exp_desc_t* args,
+                                                                     void *pScratchBuffer, size_t scratchBufferSizeBytes,
+                                                                     void *pRtasBuffer, size_t rtasBufferSizeBytes,
+                                                                     ze_rtas_parallel_operation_exp_handle_t hParallelOperation,
+                                                                     void *pBuildUserPtr, ze_rtas_aabb_exp_t *pBounds, size_t *pRtasBufferSizeBytes)
+  {
+    /* input validation */
+    VALIDATE(hBuilder);
+    VALIDATE(args);
+    VALIDATE_PTR(pScratchBuffer);
+    VALIDATE_PTR(pRtasBuffer);
+    
+    /* if parallel operation is provided then execute using thread arena inside task group ... */
+    if (hParallelOperation)
+    {
+      VALIDATE(hParallelOperation);
+      
+      ze_rtas_parallel_operation_t* op = (ze_rtas_parallel_operation_t*) hParallelOperation;
+      
+      /* claim the operation atomically: a separate load() followed by store(true)
+         would allow two concurrent callers to both pass the in-use check */
+      if (op->object_in_use.exchange(true))
+        return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE;
+      
+      /* arguments are captured by value as the task outlives this call */
+      g_arena.execute([&](){ op->group.run([=](){
+         op->errorCode = zeRTASBuilderBuildExpBody(args,
+                                                       pScratchBuffer, scratchBufferSizeBytes,
+                                                       pRtasBuffer, rtasBufferSizeBytes,
+                                                       pBuildUserPtr, pBounds, pRtasBufferSizeBytes);
+                                            });
+                       });
+      return ZE_RESULT_EXP_RTAS_BUILD_DEFERRED;
+    }
+    /* ... otherwise we just execute inside task arena to avoid spawning of TBB worker threads */
+    else
+    {
+      ze_result_t errorCode = ZE_RESULT_SUCCESS;
+      g_arena.execute([&](){ errorCode = zeRTASBuilderBuildExpBody(args,
+                                                                        pScratchBuffer, scratchBufferSizeBytes,
+                                                                        pRtasBuffer, rtasBufferSizeBytes,
+                                                                        pBuildUserPtr, pBounds, pRtasBufferSizeBytes);
+                       });
+      return errorCode;
+    }
+  }
+
+  /* Allocates a new parallel operation object and hands it back through
+   * phParallelOperation. Returns ZE_RESULT_SUCCESS once the handle is written. */
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASParallelOperationCreateExpImpl(ze_driver_handle_t hDriver, ze_rtas_parallel_operation_exp_handle_t* phParallelOperation)
+  {
+    /* reject invalid arguments up front */
+    VALIDATE(hDriver);
+    VALIDATE_PTR(phParallelOperation);
+
+    /* the handle is simply the address of the heap allocated object */
+    ze_rtas_parallel_operation_t* const op = new ze_rtas_parallel_operation_t();
+    *phParallelOperation = (ze_rtas_parallel_operation_exp_handle_t) op;
+
+    return ZE_RESULT_SUCCESS;
+  }
+  
+  /* Destroys a parallel operation object previously returned as a handle.
+   * Returns ZE_RESULT_SUCCESS after the object has been deleted. */
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASParallelOperationDestroyExpImpl( ze_rtas_parallel_operation_exp_handle_t hParallelOperation )
+  {
+    /* reject invalid handles up front */
+    VALIDATE(hParallelOperation);
+
+    /* the handle is the address of the object itself */
+    ze_rtas_parallel_operation_t* const op = (ze_rtas_parallel_operation_t*) hParallelOperation;
+    delete op;
+
+    return ZE_RESULT_SUCCESS;
+  }
+  
+  /* Reports the properties of a parallel operation that is currently in use.
+   * Returns ZE_RESULT_ERROR_INVALID_ARGUMENT when the operation is not marked
+   * in use; otherwise fills pProperties and returns ZE_RESULT_SUCCESS. */
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASParallelOperationGetPropertiesExpImpl( ze_rtas_parallel_operation_exp_handle_t hParallelOperation, ze_rtas_parallel_operation_exp_properties_t* pProperties )
+  {
+    /* reject invalid arguments up front */
+    VALIDATE(hParallelOperation);
+    VALIDATE(pProperties);
+
+    ze_rtas_parallel_operation_t* op = (ze_rtas_parallel_operation_t*) hParallelOperation;
+    const bool inUse = op->object_in_use.load();
+
+    if (inUse)
+    {
+      /* flags are cleared, concurrency is taken from the current TBB task arena */
+      pProperties->flags = 0;
+      pProperties->maxConcurrency = tbb::this_task_arena::max_concurrency();
+      return ZE_RESULT_SUCCESS;
+    }
+
+    /* properties can only be queried while a build is attached */
+    return ZE_RESULT_ERROR_INVALID_ARGUMENT;
+  }
+  
+  /* Waits for the build attached to a parallel operation to finish, releases
+   * the operation for reuse, and returns the result code of that build. */
+  RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASParallelOperationJoinExpImpl( ze_rtas_parallel_operation_exp_handle_t hParallelOperation)
+  {
+    /* check for valid handle */
+    VALIDATE(hParallelOperation);
+    
+    ze_rtas_parallel_operation_t* op = (ze_rtas_parallel_operation_t*) hParallelOperation;
+    g_arena.execute([&](){ op->group.wait(); });
+
+    /* fetch the result before releasing the object: once object_in_use is
+       cleared a new build may be attached and overwrite errorCode */
+    const ze_result_t errorCode = op->errorCode;
+    op->object_in_use.store(false); // this is slightly too early
+    return errorCode;
+  }
+}
--- a/Framework/external/embree/kernels/rthwif/rtbuild/rtbuild.h
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/rtbuild.h
@ -0,0 +1,66 @@
+// Copyright 2009-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "../../level_zero/ze_api.h"
+
+#if !defined(ZE_RTAS_BUILDER_EXP_NAME)
+#include "../../level_zero/ze_rtas.h"
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* RTHWIF_API_EXTERN_C expands to extern "C" when compiled as C++ and to nothing otherwise. */
+#if defined(__cplusplus)
+#  define RTHWIF_API_EXTERN_C extern "C"
+#else
+#  define RTHWIF_API_EXTERN_C
+#endif
+
+/* RTHWIF_API_IMPORT / RTHWIF_API_EXPORT:
+ *  - Windows, EMBREE_RTHWIF_STATIC_LIB defined: plain C linkage only
+ *  - Windows otherwise: C linkage plus __declspec(dllimport) / __declspec(dllexport)
+ *  - other platforms: C linkage; exports additionally get default symbol visibility */
+#if defined(_WIN32)
+#if defined(EMBREE_RTHWIF_STATIC_LIB)
+#  define RTHWIF_API_IMPORT RTHWIF_API_EXTERN_C
+#  define RTHWIF_API_EXPORT RTHWIF_API_EXTERN_C
+#else
+#  define RTHWIF_API_IMPORT RTHWIF_API_EXTERN_C __declspec(dllimport)
+#  define RTHWIF_API_EXPORT RTHWIF_API_EXTERN_C __declspec(dllexport)
+#endif
+#else
+#  define RTHWIF_API_IMPORT RTHWIF_API_EXTERN_C
+#  define RTHWIF_API_EXPORT RTHWIF_API_EXTERN_C __attribute__ ((visibility ("default")))
+#endif
+
+/* Internal acceleration structure format identifiers. */
+typedef enum _ze_raytracing_accel_format_internal_t {
+  ZE_RTAS_DEVICE_FORMAT_EXP_INVALID = 0,      // invalid acceleration structure format
+  ZE_RTAS_DEVICE_FORMAT_EXP_VERSION_1 = 1,    // acceleration structure format version 1
+  ZE_RTAS_DEVICE_FORMAT_EXP_VERSION_2 = 2,    // acceleration structure format version 2
+  ZE_RTAS_DEVICE_FORMAT_EXP_VERSION_MAX = 2   // same value as the highest version listed above
+} ze_raytracing_accel_format_internal_t;
+
+/* Builder creation and destruction. */
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASBuilderCreateExpImpl(ze_driver_handle_t hDriver, const ze_rtas_builder_exp_desc_t *pDescriptor, ze_rtas_builder_exp_handle_t *phBuilder);
+
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASBuilderDestroyExpImpl(ze_rtas_builder_exp_handle_t hBuilder);
+
+/* Compatibility check between two acceleration structure formats. */
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeDriverRTASFormatCompatibilityCheckExpImpl( ze_driver_handle_t hDriver,
+                                                                                       const ze_rtas_format_exp_t accelFormat,
+                                                                                       const ze_rtas_format_exp_t otherAccelFormat);
+/* Fills pProp with the expected and worst-case acceleration structure buffer
+   sizes and the scratch buffer size required for the build described by args. */
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASBuilderGetBuildPropertiesExpImpl(ze_rtas_builder_exp_handle_t hBuilder,
+                                                                                const ze_rtas_builder_build_op_exp_desc_t* args,
+                                                                                ze_rtas_builder_exp_properties_t* pProp);
+  
+/* Builds an acceleration structure. With a null hParallelOperation the build
+   runs synchronously; otherwise it is deferred to the parallel operation and
+   ZE_RESULT_EXP_RTAS_BUILD_DEFERRED is returned. */
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASBuilderBuildExpImpl(ze_rtas_builder_exp_handle_t hBuilder,
+                                                                   const ze_rtas_builder_build_op_exp_desc_t* args,
+                                                                   void *pScratchBuffer, size_t scratchBufferSizeBytes,
+                                                                   void *pRtasBuffer, size_t rtasBufferSizeBytes,
+                                                                   ze_rtas_parallel_operation_exp_handle_t hParallelOperation,
+                                                                   void *pBuildUserPtr, ze_rtas_aabb_exp_t *pBounds, size_t *pRtasBufferSizeBytes);
+
+/* Creates a parallel operation object and returns its handle. */
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASParallelOperationCreateExpImpl(ze_driver_handle_t hDriver, ze_rtas_parallel_operation_exp_handle_t* phParallelOperation);
+
+/* Deletes a parallel operation object. */
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASParallelOperationDestroyExpImpl( ze_rtas_parallel_operation_exp_handle_t hParallelOperation );
+
+/* Queries properties of a parallel operation; fails with
+   ZE_RESULT_ERROR_INVALID_ARGUMENT when the operation is not in use. */
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASParallelOperationGetPropertiesExpImpl( ze_rtas_parallel_operation_exp_handle_t hParallelOperation, ze_rtas_parallel_operation_exp_properties_t* pProperties );
+
+/* Waits for the deferred build of a parallel operation and returns its result code. */
+RTHWIF_API_EXPORT ze_result_t ZE_APICALL zeRTASParallelOperationJoinExpImpl( ze_rtas_parallel_operation_exp_handle_t hParallelOperation);
+
--- a/Framework/external/embree/kernels/rthwif/rtbuild/statistics.cpp
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/statistics.cpp
@ -0,0 +1,155 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "statistics.h"
+
+namespace embree
+{
+  /* Scope guard that remembers the format flags and precision of an output
+   * stream on construction and puts both back on destruction. */
+  class RestoreStreamState
+  {
+  public:
+    /* snapshot the current formatting state of the stream */
+    RestoreStreamState(std::ostream& iostream)
+      : stream(iostream)
+      , savedFlags(iostream.flags())
+      , savedPrecision(iostream.precision())
+    {}
+
+    /* undo whatever formatting changes were made in the meantime */
+    ~RestoreStreamState()
+    {
+      stream.precision(savedPrecision);
+      stream.flags(savedFlags);
+    }
+
+  private:
+    std::ostream& stream;            // stream being guarded
+    std::ios::fmtflags savedFlags;   // flags at construction time
+    std::streamsize savedPrecision;  // precision at construction time
+  };
+  
+  /* Quotient a/b, defined as 0 when the divisor is zero. */
+  double ratio(double a, double b) {
+    return (b == 0.0) ? 0.0 : a/b;
+  }
+
+  /* ratio(a,b) expressed in percent. */
+  double percent(double a, double b)
+  {
+    const double fraction = ratio(a,b);
+    return 100.0*fraction;
+  }
+
+  /* Integer convenience overloads: convert to double and forward. */
+  double ratio(size_t a, size_t b)
+  {
+    const double numerator = double(a);
+    const double denominator = double(b);
+    return ratio(numerator, denominator);
+  }
+  double percent(size_t a, size_t b)
+  {
+    const double numerator = double(a);
+    const double denominator = double(b);
+    return percent(numerator, denominator);
+  }
+  
+  /* Prints one table row for inner nodes: node count, SAH, share of the total
+   * SAH, size in MB, byte shares, bytes per node/child/primitive, children
+   * per node and fill rate. The stream format state is restored on return. */
+  void BVHStatistics::NodeStat::print(std::ostream& cout, double totalSAH, size_t totalBytes, size_t numPrimitives) const
+  {
+    RestoreStreamState iostate(cout);
+
+    /* emits one floating point column of the given width/precision followed by its trailer */
+    auto column = [&cout](int width, int digits, double value, const char* trailer) {
+      cout << std::setw(width) << std::setprecision(digits) << value << trailer;
+    };
+
+    cout << std::setw(7) << numNodes << " ";
+    column(7, 3, sah(), "");
+    column(7, 2, percent(sah(),totalSAH), "% ");
+    column(8, 2, bytes()/1E6, " MB ");
+    column(7, 2, percent(numBytes,numBytes), "% ");
+    column(7, 2, percent(bytes(),totalBytes), "% ");
+    column(8, 2, ratio(bytes(),numNodes), " ");
+    column(8, 2, ratio(bytes(),numChildrenUsed), " ");
+    column(8, 2, ratio(bytes(),numPrimitives), " ");
+    column(7, 2, ratio(numChildrenUsed,numNodes), " ");
+    column(7, 2, 100.0*fillRate(), "% ");
+    cout << std::endl;
+  }
+  
+  /* Prints one table row for leaves. The count column and the per-item
+   * averages are based on numBlocks when blocks is true and on numLeaves
+   * otherwise. The stream format state is restored on return. */
+  void BVHStatistics::LeafStat::print(std::ostream& cout, double totalSAH, size_t totalBytes, size_t numPrimitives, bool blocks) const
+  {
+    RestoreStreamState iostate(cout);
+
+    /* emits one floating point column of the given width/precision followed by its trailer */
+    auto column = [&cout](int width, int digits, double value, const char* trailer) {
+      cout << std::setw(width) << std::setprecision(digits) << value << trailer;
+    };
+
+    size_t N = numLeaves;
+    if (blocks) N = numBlocks;
+
+    cout << std::setw(7) << N << " ";
+    column(7, 3, sah(), "");
+    column(7, 2, percent(sah(),totalSAH), "% ");
+    column(8, 2, double(bytes())/1E6, " MB ");
+    column(7, 2, percent(numBytesUsed,numBytesTotal), "% ");
+    column(7, 2, percent(bytes(),totalBytes), "% ");
+    column(8, 2, ratio(bytes(),N), " ");
+    column(8, 2, ratio(bytes(),numPrimsUsed), " ");
+    column(8, 2, ratio(bytes(),numPrimitives), " ");
+    column(7, 2, ratio(numPrimsUsed,N), " ");
+    column(7, 2, 100.0*fillRate(), "% ");
+    cout << std::endl;
+  }
+  
+  /* Prints a human readable report: the primitive counters, followed by a
+   * table with a totals row and one row per node/leaf category. Fixed point
+   * notation is used; the stream format state is restored on return. */
+  void BVHStatistics::print (std::ostream& cout) const
+  {
+    RestoreStreamState iostate(cout);
+    cout.setf(std::ios::fixed, std::ios::floatfield);
+    cout.fill(' ');
+    
+    /* totals over the inner nodes and the three leaf categories */
+    double totalSAH   = internalNode.nodeSAH + quadLeaf.leafSAH + proceduralLeaf.leafSAH + instanceLeaf.leafSAH;
+    size_t totalBytes = internalNode.bytes() + quadLeaf.bytes() + proceduralLeaf.bytes() + instanceLeaf.bytes();
+    size_t totalNodes = internalNode.numNodes + quadLeaf.numLeaves + proceduralLeaf.numLeaves + instanceLeaf.numLeaves;
+    size_t totalPrimitives = quadLeaf.numPrimsUsed + proceduralLeaf.numPrimsUsed + instanceLeaf.numPrimsUsed;
+
+    cout << std::endl;
+    cout << "BVH statistics:" << std::endl;
+    cout << "---------------" << std::endl;
+    cout << "  numScenePrimitives          = " << numScenePrimitives << std::endl;
+    cout << "  numBuildPrimitives          = " << numBuildPrimitives << std::endl;
+    cout << "  numBuildPrimitivesPostSplit = " << numBuildPrimitivesPostSplit << std::endl;
+    cout << "  primRefSplits               = " << std::setprecision(2) << percent(numBuildPrimitivesPostSplit,numBuildPrimitives) << "%" << std::endl;
+    cout << "  numBVHPrimitives            = " << totalPrimitives << std::endl;
+    /* NOTE(review): print_raw relates totalPrimitives to numBuildPrimitives instead of numScenePrimitives - confirm which is intended */
+    cout << "  spatialSplits               = " << std::setprecision(2) << percent(totalPrimitives,numScenePrimitives) << "%" << std::endl;    
+    cout << std::endl;
+     
+    /* table header followed by the totals row; the percentage columns of the totals row are literal text */
+    cout << "                      #nodes     SAH   total       bytes     used    total   b/node  b/child   b/prim  #child     fill" << std::endl;
+    cout << "----------------------------------------------------------------------------------------------------------------------" << std::endl;
+       cout << "  total            : ";
+    cout << std::setw(7) << totalNodes << " ";
+    cout << std::setw(7) << std::setprecision(3) << totalSAH;
+    cout << " 100.00% ";
+    cout << std::setw(8) << std::setprecision(2) << totalBytes/1E6 << " MB ";
+    cout << " 100.00% ";
+    cout << " 100.00% ";
+    cout << "         ";
+    cout << "         ";
+    cout << std::setw(8) << std::setprecision(2) << ratio(totalBytes,totalPrimitives) << std::endl;
+
+    /* per-category rows; "leaves" is the sum of all three leaf categories and
+       "proceduralBlock" prints the procedural leaves counted per block */
+    LeafStat leaf = quadLeaf + proceduralLeaf + instanceLeaf;
+    cout << "  internalNode     : "; internalNode  .print(cout,totalSAH,totalBytes,totalPrimitives);
+    cout << "  leaves           : "; leaf          .print(cout,totalSAH,totalBytes,totalPrimitives);
+    cout << "    quadLeaf       : "; quadLeaf      .print(cout,totalSAH,totalBytes,totalPrimitives);
+    cout << "    proceduralLeaf : "; proceduralLeaf.print(cout,totalSAH,totalBytes,totalPrimitives);
+    cout << "    proceduralBlock: "; proceduralLeaf.print(cout,totalSAH,totalBytes,totalPrimitives,true);
+    cout << "    instanceLeaf   : "; instanceLeaf  .print(cout,totalSAH,totalBytes,totalPrimitives);
+  }
+  
+  /* Prints the raw counters as "name = value" lines, one per line, for the
+   * inner nodes and for each of the three leaf categories. */
+  void BVHStatistics::print_raw(std::ostream& cout) const
+  {
+    RestoreStreamState iostate(cout);
+
+    /* writes a single "label value" line and flushes it */
+    auto emit = [&cout](const char* label, const auto& value) {
+      cout << label << value << std::endl;
+    };
+
+    const size_t totalPrimitives = quadLeaf.numPrimsUsed + proceduralLeaf.numPrimsUsed + instanceLeaf.numPrimsUsed;
+    emit("bvh_spatial_split_factor = ", percent(totalPrimitives,numBuildPrimitives));
+
+    /* inner nodes */
+    emit("bvh_internal_sah = ", internalNode.nodeSAH);
+    emit("bvh_internal_num = ", internalNode.numNodes);
+    emit("bvh_internal_num_children_used = ", internalNode.numChildrenUsed);
+    emit("bvh_internal_num_children_total = ", internalNode.numChildrenTotal);
+    emit("bvh_internal_num_bytes = ", internalNode.bytes());
+
+    /* quad leaves */
+    emit("bvh_quad_leaf_sah = ", quadLeaf.leafSAH);
+    emit("bvh_quad_leaf_num = ", quadLeaf.numLeaves);
+    emit("bvh_quad_leaf_num_prims_used = ", quadLeaf.numPrimsUsed);
+    emit("bvh_quad_leaf_num_prims_total = ", quadLeaf.numPrimsTotal);
+    emit("bvh_quad_leaf_num_bytes_used = ", quadLeaf.numBytesUsed);
+    emit("bvh_quad_leaf_num_bytes_total = ", quadLeaf.numBytesTotal);
+
+    /* procedural leaves */
+    emit("bvh_procedural_leaf_sah = ", proceduralLeaf.leafSAH);
+    emit("bvh_procedural_leaf_num = ", proceduralLeaf.numLeaves);
+    emit("bvh_procedural_leaf_num_prims_used = ", proceduralLeaf.numPrimsUsed);
+    emit("bvh_procedural_leaf_num_prims_total = ", proceduralLeaf.numPrimsTotal);
+    emit("bvh_procedural_leaf_num_bytes_used = ", proceduralLeaf.numBytesUsed);
+    emit("bvh_procedural_leaf_num_bytes_total = ", proceduralLeaf.numBytesTotal);
+
+    /* instance leaves */
+    emit("bvh_instance_leaf_sah = ", instanceLeaf.leafSAH);
+    emit("bvh_instance_leaf_num = ", instanceLeaf.numLeaves);
+    emit("bvh_instance_leaf_num_prims_used = ", instanceLeaf.numPrimsUsed);
+    emit("bvh_instance_leaf_num_prims_total = ", instanceLeaf.numPrimsTotal);
+    emit("bvh_instance_leaf_num_bytes_used = ", instanceLeaf.numBytesUsed);
+    emit("bvh_instance_leaf_num_bytes_total = ", instanceLeaf.numBytesTotal);
+  }
+}
--- a/Framework/external/embree/kernels/rthwif/rtbuild/statistics.h
+++ b/Framework/external/embree/kernels/rthwif/rtbuild/statistics.h
@ -0,0 +1,118 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(ZE_RAYTRACING)
+#include "sys/platform.h"
+#else
+#include "../../../common/sys/platform.h"
+#endif
+
+namespace embree
+{
+  /* Statistics collected for a BVH: primitive counters plus accumulated
+   * figures for the inner nodes and for quad, procedural and instance leaves. */
+  struct BVHStatistics
+  {
+    /* Accumulated figures for inner nodes. */
+    struct NodeStat
+    {
+      NodeStat (double nodeSAH = 0,
+                size_t numNodes = 0,
+                size_t numChildrenUsed = 0,
+                size_t numChildrenTotal = 0,
+                size_t numBytes = 0)
+        : nodeSAH(nodeSAH)
+        , numNodes(numNodes)
+        , numChildrenUsed(numChildrenUsed)
+        , numChildrenTotal(numChildrenTotal)
+        , numBytes(numBytes)
+      {}
+
+      double sah()   const { return nodeSAH; }
+      size_t bytes() const { return numBytes; }
+      size_t size()  const { return numNodes; }
+
+      /* used children divided by total children; 0 when there are no children at all */
+      double fillRateNom () const { return double(numChildrenUsed); }
+      double fillRateDen () const { return double(numChildrenTotal); }
+      double fillRate    () const
+      {
+        const double den = fillRateDen();
+        if (den == 0.0) return 0.0;
+        return fillRateNom()/den;
+      }
+
+      /* member-wise sum of two statistics */
+      friend NodeStat operator+ (const NodeStat& a, const NodeStat& b)
+      {
+        NodeStat sum(a);
+        sum.nodeSAH          += b.nodeSAH;
+        sum.numNodes         += b.numNodes;
+        sum.numChildrenUsed  += b.numChildrenUsed;
+        sum.numChildrenTotal += b.numChildrenTotal;
+        sum.numBytes         += b.numBytes;
+        return sum;
+      }
+
+      void print(std::ostream& cout, double totalSAH, size_t totalBytes, size_t numPrimitives) const;
+
+    public:
+      double nodeSAH;                    //!< SAH of the inner nodes
+      size_t numNodes;                   //!< Number of inner nodes
+      size_t numChildrenUsed;            //!< Number of used child slots
+      size_t numChildrenTotal;           //!< Number of used and unused child slots
+      size_t numBytes;                   //!< Number of bytes of the inner nodes
+    };
+
+    /* Accumulated figures for one category of leaves. */
+    struct LeafStat
+    {
+      LeafStat (double leafSAH = 0.0,
+                size_t numLeaves = 0,
+                size_t numBlocks = 0,
+                size_t numPrimsUsed = 0,
+                size_t numPrimsTotal = 0,
+                size_t numBytesUsed = 0,
+                size_t numBytesTotal = 0)
+        : leafSAH(leafSAH)
+        , numLeaves(numLeaves)
+        , numBlocks(numBlocks)
+        , numPrimsUsed(numPrimsUsed)
+        , numPrimsTotal(numPrimsTotal)
+        , numBytesUsed(numBytesUsed)
+        , numBytesTotal(numBytesTotal)
+      {}
+
+      double sah()   const { return leafSAH; }
+      size_t bytes() const { return numBytesTotal; }
+      size_t size()  const { return numLeaves; }
+
+      /* active primitives divided by all primitive slots; 0 when there are no slots */
+      double fillRateNom () const { return double(numPrimsUsed); }
+      double fillRateDen () const { return double(numPrimsTotal); }
+      double fillRate    () const
+      {
+        const double den = fillRateDen();
+        if (den == 0.0) return 0.0;
+        return fillRateNom()/den;
+      }
+
+      /* member-wise sum of two statistics */
+      friend LeafStat operator+ (const LeafStat& a, const LeafStat& b)
+      {
+        LeafStat sum(a);
+        sum.leafSAH       += b.leafSAH;
+        sum.numLeaves     += b.numLeaves;
+        sum.numBlocks     += b.numBlocks;
+        sum.numPrimsUsed  += b.numPrimsUsed;
+        sum.numPrimsTotal += b.numPrimsTotal;
+        sum.numBytesUsed  += b.numBytesUsed;
+        sum.numBytesTotal += b.numBytesTotal;
+        return sum;
+      }
+
+      void print(std::ostream& cout, double totalSAH, size_t totalBytes, size_t numPrimitives, bool blocks = false) const;
+
+    public:
+      double leafSAH;                    //!< SAH of the leaves only
+      size_t numLeaves;                  //!< Number of leaf nodes.
+      size_t numBlocks;                  //!< Number of blocks referenced
+      size_t numPrimsUsed;               //!< Number of active primitives
+      size_t numPrimsTotal;              //!< Number of active and inactive primitives
+      size_t numBytesUsed;               //!< Number of used bytes
+      size_t numBytesTotal;              //!< Number of total bytes of leaves.
+    };
+
+    /* all counters start at zero; the per-category statistics are default constructed */
+    BVHStatistics ()
+      : numScenePrimitives(0)
+      , numBuildPrimitives(0)
+      , numBuildPrimitivesPostSplit(0)
+    {}
+
+    void print    (std::ostream& cout) const;
+    void print_raw(std::ostream& cout) const;
+
+    size_t numScenePrimitives;
+    size_t numBuildPrimitives;
+    size_t numBuildPrimitivesPostSplit;
+    NodeStat internalNode;
+    LeafStat quadLeaf;
+    LeafStat proceduralLeaf;
+    LeafStat instanceLeaf;
+  };
+}
--- a/Framework/external/embree/kernels/rthwif/rttrace/rttrace.h
+++ b/Framework/external/embree/kernels/rthwif/rttrace/rttrace.h
@ -0,0 +1,266 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#if defined(ZE_RAYTRACING_RT_SIMULATION)
+#include "rtcore.h"
+#endif
+
+#if defined(EMBREE_SYCL_RT_VALIDATION_API)
+#  include "rttrace_validation.h"
+#else
+
+#include <cstdint>
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#pragma clang diagnostic ignored "-W#pragma-messages"
+
+#include <sycl/sycl.hpp>
+
+#pragma clang diagnostic pop
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreturn-type-c-linkage"
+
+enum intel_ray_flags_t
+{
+  intel_ray_flags_none = 0x00,                              // no flag set
+  intel_ray_flags_force_opaque = 0x01,                      // forces geometry to be opaque (no anyhit shader invocation)
+  intel_ray_flags_force_non_opaque = 0x02,                  // forces geometry to be non-opaque (invoke anyhit shader)
+  intel_ray_flags_accept_first_hit_and_end_search = 0x04,   // terminates traversal on the first hit found (shadow rays)
+  intel_ray_flags_skip_closest_hit_shader = 0x08,           // skip execution of the closest hit shader
+  intel_ray_flags_cull_back_facing_triangles = 0x10,        // back facing triangles do not produce a hit
+  intel_ray_flags_cull_front_facing_triangles = 0x20,       // front facing triangles do not produce a hit
+  intel_ray_flags_cull_opaque = 0x40,                       // opaque geometry does not produce a hit
+  intel_ray_flags_cull_non_opaque = 0x80,                   // non-opaque geometry does not produce a hit
+  intel_ray_flags_skip_triangles = 0x100,                   // treat all triangle intersections as misses
+  intel_ray_flags_skip_procedural_primitives = 0x200,       // skip execution of intersection shaders
+};
+
+enum intel_hit_type_t
+{
+  intel_hit_type_committed_hit = 0,   // selects the committed hit of a ray query
+  intel_hit_type_potential_hit = 1,   // selects the potential hit of a ray query
+};
+
+enum intel_raytracing_ext_flag_t
+{
+  intel_raytracing_ext_flag_ray_query   = 1 << 0,        // true if ray queries are supported
+};
+
+// opaque types
+typedef __attribute__((opencl_private)) struct intel_ray_query_opaque_t* intel_ray_query_t;
+typedef __attribute__((opencl_global )) struct intel_raytracing_acceleration_structure_opaque_t* intel_raytracing_acceleration_structure_t;
+
+// Two-component float vector that the ray query functions take and return by value.
+// Converts implicitly from and to sycl::float2.
+struct intel_float2
+{
+  float x, y;
+
+  // leaves x and y uninitialized
+  intel_float2() {}
+
+  intel_float2(float x, float y)
+    : x(x), y(y) {}
+  
+  intel_float2(sycl::float2 v)
+    : x(v.x()), y(v.y()) {}
+
+  // const-qualified: the conversion only reads x and y, and const objects
+  // must be convertible as well
+  operator sycl::float2() const {
+    return sycl::float2(x,y);
+  }
+};
+
+// Three-component float vector that the ray query functions take and return by value.
+// Converts implicitly from and to sycl::float3.
+struct intel_float3
+{
+  float x, y, z;
+
+  // leaves x, y and z uninitialized
+  intel_float3() {}
+
+  intel_float3(float x, float y, float z)
+    : x(x), y(y), z(z) {}
+
+  intel_float3(sycl::float3 v)
+    : x(v.x()), y(v.y()), z(v.z()) {}
+
+  // const-qualified: the conversion only reads x, y and z, and const objects
+  // must be convertible as well
+  operator sycl::float3() const {
+    return sycl::float3(x,y,z);
+  }
+};
+
+struct intel_float4x3 {
+  intel_float3 vx, vy, vz, p;   // three columns of the transform followed by its translation
+};
+
+struct intel_ray_desc_t
+{
+  intel_float3 origin;       // ray origin
+  intel_float3 direction;    // ray direction
+  float tmin;                // start of the ray interval (MemRay::init stores it as tnear)
+  float tmax;                // end of the ray interval (MemRay::init stores it as tfar)
+  unsigned int mask;         // ray mask (MemRay keeps it in an 8-bit field)
+  intel_ray_flags_t flags;   // intel_ray_flags_t bits (MemRay keeps them in a 16-bit field)
+};
+
+// if traversal returns one can test if a triangle or procedural is hit
+enum intel_candidate_type_t
+{
+  intel_candidate_type_triangle,    // the hit refers to a triangle (quad leaf)
+  intel_candidate_type_procedural   // the hit refers to a procedural primitive
+};
+
+#ifdef __SYCL_DEVICE_ONLY__
+
+
+// check supported ray tracing features
+SYCL_EXTERNAL extern "C" intel_raytracing_ext_flag_t intel_get_raytracing_ext_flag();
+
+// initializes a ray query
+SYCL_EXTERNAL extern "C" intel_ray_query_t intel_ray_query_init(
+  intel_ray_desc_t ray,
+  intel_raytracing_acceleration_structure_t accel
+);
+
+// setup for instance traversal using a transformed ray and bottom-level AS
+SYCL_EXTERNAL extern "C" void intel_ray_query_forward_ray(
+  intel_ray_query_t query,
+  intel_ray_desc_t ray,
+  intel_raytracing_acceleration_structure_t accel
+);
+
+// commit the potential hit
+SYCL_EXTERNAL extern "C" void intel_ray_query_commit_potential_hit(
+  intel_ray_query_t query
+);
+
+// commit the potential hit and override hit distance and UVs
+SYCL_EXTERNAL extern "C" void intel_ray_query_commit_potential_hit_override(
+  intel_ray_query_t query,
+  float override_hit_distance,
+  intel_float2 override_uv
+);
+
+// start traversal of a ray query
+SYCL_EXTERNAL extern "C" void intel_ray_query_start_traversal( intel_ray_query_t query );
+
+// synchronize ray query execution. If a ray was dispatched,
+//  this must be called prior to calling any of the accessors below.
+SYCL_EXTERNAL extern "C" void intel_ray_query_sync( intel_ray_query_t query );
+
+// signal that a ray query will not be used further.  This is the moral equivalent of a delete.
+// This function does an implicit sync.
+SYCL_EXTERNAL extern "C" void intel_ray_query_abandon( intel_ray_query_t query );
+
+// read hit information during shader execution
+SYCL_EXTERNAL extern "C" unsigned int intel_get_hit_bvh_level( intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" float intel_get_hit_distance( intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" intel_float2 intel_get_hit_barycentrics( intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" bool intel_get_hit_front_face( intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" unsigned int intel_get_hit_geometry_id(intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" unsigned int intel_get_hit_primitive_id( intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" unsigned int intel_get_hit_triangle_primitive_id( intel_ray_query_t query, intel_hit_type_t hit_type );  // fast path for quad leaves
+SYCL_EXTERNAL extern "C" unsigned int intel_get_hit_procedural_primitive_id( intel_ray_query_t query, intel_hit_type_t hit_type ); // fast path for procedural leaves
+SYCL_EXTERNAL extern "C" unsigned int intel_get_hit_instance_id( intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" unsigned int intel_get_hit_instance_user_id( intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" intel_float4x3 intel_get_hit_world_to_object( intel_ray_query_t query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL extern "C" intel_float4x3 intel_get_hit_object_to_world( intel_ray_query_t query, intel_hit_type_t hit_type );
+
+// fetch triangle vertices for a hit
+SYCL_EXTERNAL extern "C" void intel_get_hit_triangle_vertices( intel_ray_query_t query, intel_float3 vertices_out[3], intel_hit_type_t hit_type );
+
+// Read ray-data. This is used to read transformed rays produced by HW instancing pipeline
+// during any-hit or intersection shader execution.
+SYCL_EXTERNAL extern "C" intel_float3 intel_get_ray_origin( intel_ray_query_t query, unsigned int bvh_level );
+SYCL_EXTERNAL extern "C" intel_float3 intel_get_ray_direction( intel_ray_query_t query, unsigned int bvh_level );
+SYCL_EXTERNAL extern "C" float intel_get_ray_tmin( intel_ray_query_t query, unsigned int bvh_level );
+SYCL_EXTERNAL extern "C" intel_ray_flags_t intel_get_ray_flags( intel_ray_query_t query, unsigned int bvh_level );
+SYCL_EXTERNAL extern "C" unsigned int intel_get_ray_mask( intel_ray_query_t query, unsigned int bvh_level );
+
+SYCL_EXTERNAL extern "C" intel_candidate_type_t intel_get_hit_candidate( intel_ray_query_t query, intel_hit_type_t hit_type );
+
+// test whether traversal has terminated.  If false, the ray has reached
+//  a procedural leaf or a non-opaque triangle leaf, and requires shader processing
+SYCL_EXTERNAL extern "C" bool intel_is_traversal_done( intel_ray_query_t query );
+
+// if traversal is done one can test for the presence of a committed hit to either invoke miss or closest hit shader
+SYCL_EXTERNAL extern "C" bool intel_has_committed_hit( intel_ray_query_t query );
+
+#else
+
+inline intel_raytracing_ext_flag_t intel_get_raytracing_ext_flag() {
+  return intel_raytracing_ext_flag_ray_query;
+}
+
+// stub used when not compiling SYCL device code: ignores its arguments and returns a null query handle
+inline intel_ray_query_t intel_ray_query_init(
+  intel_ray_desc_t ray,
+  intel_raytracing_acceleration_structure_t accel
+  ) { return nullptr; }
+
+// setup for instance traversal using a transformed ray and bottom-level AS
+inline void intel_ray_query_forward_ray(
+  intel_ray_query_t query,
+  intel_ray_desc_t ray,
+  intel_raytracing_acceleration_structure_t accel
+) {}
+
+// commit the potential hit
+inline void intel_ray_query_commit_potential_hit(
+  intel_ray_query_t query
+) {}
+
+// commit the potential hit and override hit distance and UVs
+inline void intel_ray_query_commit_potential_hit_override(
+  intel_ray_query_t query,
+  float override_hit_distance,
+  intel_float2 override_uv
+) {}
+
+// start traversal of a ray query
+inline void intel_ray_query_start_traversal( intel_ray_query_t query ) {}
+
+// synchronize ray query execution. If a ray was dispatched,
+//  this must be called prior to calling any of the accessors below.
+inline void intel_ray_query_sync( intel_ray_query_t query ) {}
+
+// signal that a ray query will not be used further.  This is the moral equivalent of a delete.
+// This function does an implicit sync.
+inline void intel_ray_query_abandon( intel_ray_query_t query ) {}
+
+// read hit information during shader execution
+inline unsigned int intel_get_hit_bvh_level( intel_ray_query_t query, intel_hit_type_t hit_type ) { return 0; }
+inline float intel_get_hit_distance( intel_ray_query_t query, intel_hit_type_t hit_type ) { return 0.0f; }
+inline intel_float2 intel_get_hit_barycentrics( intel_ray_query_t query, intel_hit_type_t hit_type ) { return { 0,0 }; }
+inline bool intel_get_hit_front_face( intel_ray_query_t query, intel_hit_type_t hit_type ) { return false; }
+inline unsigned int intel_get_hit_geometry_id(intel_ray_query_t query, intel_hit_type_t hit_type ) { return 0; }
+inline unsigned int intel_get_hit_primitive_id( intel_ray_query_t query, intel_hit_type_t hit_type ) { return 0; }
+inline unsigned int intel_get_hit_triangle_primitive_id( intel_ray_query_t query, intel_hit_type_t hit_type ) { return 0; }  // fast path for quad leaves
+inline unsigned int intel_get_hit_procedural_primitive_id( intel_ray_query_t query, intel_hit_type_t hit_type ) { return 0; } // fast path for procedural leaves
+inline unsigned int intel_get_hit_instance_id( intel_ray_query_t query, intel_hit_type_t hit_type ) { return 0; }
+inline unsigned int intel_get_hit_instance_user_id( intel_ray_query_t query, intel_hit_type_t hit_type ) { return 0; }
+inline intel_float4x3 intel_get_hit_world_to_object( intel_ray_query_t query, intel_hit_type_t hit_type ) { return { {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0} }; }
+inline intel_float4x3 intel_get_hit_object_to_world( intel_ray_query_t query, intel_hit_type_t hit_type ) { return { {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0} }; }
+
+// fetch triangle vertices for a hit
+inline void intel_get_hit_triangle_vertices( intel_ray_query_t query, intel_float3 vertices_out[3], intel_hit_type_t hit_type ) {}
+
+// Read ray-data. This is used to read transformed rays produced by HW instancing pipeline
+// during any-hit or intersection shader execution.
+inline intel_float3 intel_get_ray_origin( intel_ray_query_t query, unsigned int bvh_level ) { return { 0,0,0 }; }
+inline intel_float3 intel_get_ray_direction( intel_ray_query_t query, unsigned int bvh_level ) { return { 0,0,0 }; }
+inline float intel_get_ray_tmin( intel_ray_query_t query, unsigned int bvh_level ) { return 0.0f; }
+inline intel_ray_flags_t intel_get_ray_flags( intel_ray_query_t query, unsigned int bvh_level ) { return intel_ray_flags_none; }
+inline unsigned int intel_get_ray_mask( intel_ray_query_t query, unsigned int bvh_level ) { return 0; }
+
+inline intel_candidate_type_t intel_get_hit_candidate( intel_ray_query_t query, intel_hit_type_t hit_type ) { return intel_candidate_type_triangle; }
+
+// test whether traversal has terminated.  If false, the ray has reached
+//  a procedural leaf or a non-opaque triangle leaf, and requires shader processing
+inline bool intel_is_traversal_done( intel_ray_query_t query ) { return false; }
+
+// if traversal is done one can test for the presence of a committed hit to either invoke miss or closest hit shader
+inline bool intel_has_committed_hit( intel_ray_query_t query ) { return false; }
+
+#endif
+
+#pragma clang diagnostic pop
+
+#endif
--- a/Framework/external/embree/kernels/rthwif/rttrace/rttrace_internal.h
+++ b/Framework/external/embree/kernels/rthwif/rttrace/rttrace_internal.h
@ -0,0 +1,293 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#  define MemRay MemRayV1
+#  define MemHit MemHitV1
+#  define QuadLeaf QuadLeafV1
+#  define InstanceLeaf InstanceLeafV1
+
+#include <cstdint>
+
+enum TraceRayCtrl
+{
+  TRACE_RAY_INITIAL = 0,              // Initializes hit and initializes traversal state
+  TRACE_RAY_INSTANCE = 1,             // Loads committed hit and initializes traversal state
+  TRACE_RAY_COMMIT = 2,               // Loads potential hit and loads traversal state 
+  TRACE_RAY_CONTINUE = 3,             // Loads committed hit and loads traversal state
+  TRACE_RAY_DONE = 256,               // for internal use only 
+};
+
+typedef __attribute__((opencl_global)) struct rtglobals_opaque_t* rtglobals_t;
+typedef __attribute__((opencl_private)) struct rtfence_opaque_t* rtfence_t;
+
+#if defined(__SYCL_DEVICE_ONLY__) || defined(EMBREE_SYCL_RT_SIMULATION)
+
+SYCL_EXTERNAL extern "C" __attribute__((opencl_global)) void* intel_get_implicit_dispatch_globals();
+SYCL_EXTERNAL extern "C" void* intel_get_rt_stack(rtglobals_t rt_dispatch_globals);
+SYCL_EXTERNAL extern "C" void* intel_get_thread_btd_stack(rtglobals_t rt_dispatch_globals);
+SYCL_EXTERNAL extern "C" void* intel_get_global_btd_stack(rtglobals_t rt_dispatch_globals);
+SYCL_EXTERNAL extern "C" rtfence_t intel_dispatch_trace_ray_query(rtglobals_t rt_dispatch_globals, unsigned int bvh_level, unsigned int traceRayCtrl);
+SYCL_EXTERNAL extern "C" void intel_rt_sync(rtfence_t fence);
+
+#else
+
+inline void* intel_get_implicit_dispatch_globals() { return nullptr; }
+inline void* intel_get_rt_stack(rtglobals_t rt_dispatch_globals) { return nullptr; }
+inline void* intel_get_thread_btd_stack(rtglobals_t rt_dispatch_globals) { return nullptr; }
+inline void* intel_get_global_btd_stack(rtglobals_t rt_dispatch_globals) { return nullptr; }
+inline rtfence_t intel_dispatch_trace_ray_query(rtglobals_t rt_dispatch_globals, unsigned int bvh_level, unsigned int traceRayCtrl) { return nullptr; }
+inline void intel_rt_sync(rtfence_t fence) {}
+
+#endif
+
+enum NodeType
+{
+  NODE_TYPE_MIXED = 0x0,        // identifies a mixed internal node where each child can have a different type
+  NODE_TYPE_INTERNAL = 0x0,     // internal BVH node with 6 children
+  NODE_TYPE_INSTANCE = 0x1,     // instance leaf 
+  NODE_TYPE_PROCEDURAL = 0x3,   // procedural leaf
+  NODE_TYPE_QUAD = 0x4,         // quad leaf
+  NODE_TYPE_INVALID = 0x7       // indicates invalid node
+};
+
+struct __attribute__ ((packed,aligned(32))) MemRayV1
+{
+  void init(intel_ray_desc_t ray, uint64_t rootNodePtr_i)
+  {
+    org[0] = ray.origin.x;
+    org[1] = ray.origin.y;
+    org[2] = ray.origin.z;
+    dir[0] = ray.direction.x;
+    dir[1] = ray.direction.y;
+    dir[2] = ray.direction.z;
+    tnear  = ray.tmin;
+    tfar   = ray.tmax;
+    rootNodePtr = rootNodePtr_i;   // stored in a 48-bit field
+    rayFlags = ray.flags;          // stored in a 16-bit field
+    hitGroupSRBasePtr = 0;         // all shader record fields are zeroed
+    hitGroupSRStride = 0;
+    missSRPtr = 0;
+    pad0 = 0;
+    shaderIndexMultiplier = 0;
+    instLeafPtr = 0;
+    rayMask = ray.mask;            // stored in an 8-bit field
+    pad1 = 0;
+  }
+  
+  // first 32 bytes: origin, direction and ray interval
+  float org[3];
+  float dir[3];
+  float tnear;
+  float tfar;
+
+  // second 32 bytes: four 64-bit words of bit fields
+  struct { // FIXME: removing these anonymous structs triggers IGC bug
+    uint64_t rootNodePtr : 48;  // root node to start traversal at
+    uint64_t rayFlags : 16;     // ray flags (init stores intel_ray_flags_t values here)
+  };
+
+  struct {
+    uint64_t hitGroupSRBasePtr : 48; // base of hit group shader record array (16-bytes alignment)
+    uint64_t hitGroupSRStride : 16;  // stride of hit group shader record array (16-bytes alignment)
+  };
+
+  struct {
+    uint64_t missSRPtr : 48;  // pointer to miss shader record to invoke on a miss (8-bytes alignment)
+    uint64_t pad0 : 8;        // padding byte (has to be zero)
+    uint64_t shaderIndexMultiplier : 8; // shader index multiplier
+  };
+
+  struct {
+    uint64_t instLeafPtr : 48;  // the pointer to instance leaf in case we traverse an instance (64-bytes alignment)
+    uint64_t rayMask : 8;       // ray mask used for ray masking
+    uint64_t pad1 : 8;          // padding byte (has to be zero)
+  };
+};
+
+struct __attribute__ ((packed,aligned(32))) MemHitV1
+{
+  inline float getT() const {
+    return ft;
+  }
+
+  inline void setT(float t) {
+    ft = t;
+  }
+
+  inline float getU() const {
+    return fu;
+  }
+
+  inline void setU(float u) {
+    fu = u;
+  }
+  
+  inline float getV() const {
+    return fv;
+  }
+
+  inline void setV(float v) {
+    fv = v;
+  }
+  
+  inline void* getPrimLeafPtr() {
+    return sycl::global_ptr<void>((void*)(uint64_t(primLeafPtr)*64)).get(); // primLeafPtr counts 64-byte units
+  }
+
+  inline void* getInstanceLeafPtr() {
+    return sycl::global_ptr<void>((void*)(uint64_t(instLeafPtr)*64)).get(); // instLeafPtr counts 64-byte units
+  }
+
+public:
+  float    ft;                   // hit distance of current hit (or initial traversal distance)
+  float    fu,fv;                 // barycentric hit coordinates
+
+  union {
+    struct {
+      uint32_t primIndexDelta  : 16; // prim index delta for compressed meshlets and quads
+      uint32_t valid           : 1; // set if there is a hit
+      uint32_t leafType        : 3; // type of node primLeafPtr is pointing to
+      uint32_t primLeafIndex   : 4; // index of the hit primitive inside the leaf
+      uint32_t bvhLevel        : 3; // the instancing level at which the hit occurred
+      uint32_t frontFace       : 1; // whether we hit the front-facing side of a triangle (also used to pass opaque flag when calling intersection shaders)
+      uint32_t done            : 1; // used in sync mode to indicate that traversal is done
+      uint32_t pad0            : 3; // unused bits
+    };
+    uint32_t data; // all of the bit fields above as one 32-bit word
+  };
+    
+  struct { // FIXME: removing these anonymous structs triggers IGC bug
+    uint64_t primLeafPtr     : 42; // pointer to BVH leaf node (multiple of 64 bytes)
+    uint64_t hitGroupRecPtr0 : 22; // LSB of hit group record of the hit triangle (multiple of 16 bytes)
+  };
+
+  struct {
+    uint64_t instLeafPtr     : 42; // pointer to BVH instance leaf node (in multiple of 64 bytes)
+    uint64_t hitGroupRecPtr1 : 22; // MSB of hit group record of the hit triangle (multiple of 16 bytes)
+  };
+
+  void clear(bool _done, bool _valid) {
+    //*(sycl::int8*) this = sycl::int8(0x7F800000 /* INFINITY */, 0, 0, (_done ? 0x10000000 : 0) | (_valid ? 0x10000), 0, 0, 0, 0);
+    ft = fu = fv = 0.0f; // NOTE(review): the disabled variant above wrote INFINITY to the hit distance - confirm that 0 is intended
+    data = 0;            // zeroes every bit field of the union; the two leaf pointer words are left untouched
+    done = _done ? 1 : 0;
+    valid = _valid ? 1 : 0;    
+  }
+};
+
+struct __attribute__ ((packed,aligned(64))) RTStack
+{
+  union {
+    struct {
+      struct MemHit committedHit;    // stores committed hit
+      struct MemHit potentialHit;    // stores potential hit that is passed to any hit shader
+    };
+    struct MemHit hit[2]; // committedHit, potentialHit
+  };
+  struct MemRay ray[2];
+  char travStack[32*2];
+};
+
+struct __attribute__ ((packed)) HWAccel
+{
+  uint64_t reserved;
+  float bounds[2][3];             // bounding box of the BVH
+  uint32_t reserved0[8];
+  uint32_t numTimeSegments;
+  uint32_t reserved1[13];
+  uint64_t dispatchGlobalsPtr;
+};
+
+struct  __attribute__ ((packed,aligned(8))) PrimLeafDesc 
+{
+  struct {
+    uint32_t shaderIndex : 24;    // shader index used for shader record calculations
+    uint32_t geomMask    : 8;     // geometry mask used for ray masking
+  };
+
+  struct {
+    uint32_t geomIndex   : 29; // the geometry index specifies the n'th geometry of the scene
+    uint32_t type        : 1;  // enable/disable culling for procedurals and instances
+    uint32_t geomFlags   : 2;  // geometry flags of this geometry
+  };
+};
+
+struct __attribute__ ((packed,aligned(64))) QuadLeafV1
+{
+  struct PrimLeafDesc leafDesc;     // leaf header identifying the geometry
+  unsigned int primIndex0;          // primitive index of the first triangle
+  struct {
+    uint32_t primIndex1Delta : 16;  // delta encoded primitive index of second triangle
+    uint32_t j0              : 2;   // specifies first vertex of second triangle
+    uint32_t j1              : 2;   // specifies second vertex of second triangle
+    uint32_t j2              : 2;   // specifies third vertex of second triangle
+    uint32_t last            : 1;   // true if the second triangle is the last triangle in a leaf list
+    uint32_t pad             : 9;   // unused bits
+  };
+  float v[4][3]; // four vertices with three components each
+};
+
+struct __attribute__ ((packed,aligned(64))) ProceduralLeaf
+{
+  static const constexpr uint32_t N = 13;
+  
+  struct PrimLeafDesc leafDesc; // leaf header identifying the geometry
+  struct {
+    uint32_t numPrimitives : 4; // number of stored primitives
+    uint32_t pad           : 32-4-N;
+    uint32_t last          : N; // bit vector with a last bit per primitive
+  };
+  uint32_t _primIndex[N]; // primitive indices of all primitives stored inside the leaf
+};
+
+struct __attribute__ ((packed,aligned(64))) InstanceLeafV1
+{
+  /* first 64 bytes accessed during traversal by hardware */
+  struct Part0
+  {
+  public:
+    struct {
+      uint32_t shaderIndex : 24;  // shader index used to calculate instancing shader in case of software instancing
+      uint32_t geomMask : 8;      // geometry mask used for ray masking
+    };
+
+    struct {
+      uint32_t instanceContributionToHitGroupIndex : 24;
+      uint32_t pad0 : 5;
+      
+      /* the following two entries are only used for procedural instances */
+      uint32_t type : 1; // enables/disables opaque culling
+      uint32_t geomFlags : 2; // unused for instances
+    };
+
+    struct {
+      uint64_t startNodePtr : 48;  // start node where to continue traversal of the instanced object
+      uint64_t instFlags : 8;      // flags for the instance (see InstanceFlags)
+      uint64_t pad1 : 8;           // unused bits
+    };
+    
+    float world2obj_vx[3];   // 1st column of World2Obj transform
+    float world2obj_vy[3];   // 2nd column of World2Obj transform
+    float world2obj_vz[3];   // 3rd column of World2Obj transform
+    float obj2world_p[3];    // translation of Obj2World transform (on purpose in first 64 bytes)
+  } part0;
+  
+  /* second 64 bytes accessed during shading */
+  struct Part1
+  {
+    struct {
+      uint64_t bvhPtr : 48;   // pointer to BVH the start node belongs to
+      uint64_t pad : 16;      // unused bits
+    };
+
+    uint32_t instanceID;    // user defined value per DXR spec
+    uint32_t instanceIndex; // geometry index of the instance (n'th geometry in scene)
+    
+    float obj2world_vx[3];   // 1st column of Obj2World transform
+    float obj2world_vy[3];   // 2nd column of Obj2World transform
+    float obj2world_vz[3];   // 3rd column of Obj2World transform
+    float world2obj_p[3];    // translation of World2Obj transform
+  } part1;
+};
--- a/Framework/external/embree/kernels/rthwif/rttrace/rttrace_validation.cpp
+++ b/Framework/external/embree/kernels/rthwif/rttrace/rttrace_validation.cpp
@ -0,0 +1,287 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "rttrace_validation.h"
+
+#define sizeof_QBVH6_InternalNode6 64
+#define QBVH6_rootNodeOffset 128
+
+ /*struct rayquery_impl_t {
+    rtfence_t fence;
+    rtglobals_t dispatchGlobalsPtr;
+    struct RTStack* rtStack;
+    TraceRayCtrl ctrl;
+    unsigned int bvh_level;
+ };*/
+
+void use_rthwif_production()
+{
+}
+
+SYCL_EXTERNAL intel_raytracing_ext_flag_t intel_get_raytracing_ext_flag()
+{
+  return intel_raytracing_ext_flag_ray_query;
+}
+
+// Creates a ray query: stores the ray at BVH level 0 of the RT stack, resets the
+// committed and the potential hit record and returns a query in TRACE_RAY_INITIAL state.
+SYCL_EXTERNAL intel_ray_query_t intel_ray_query_init(intel_ray_desc_t ray, intel_raytracing_acceleration_structure_t accel_i )
+{
+  HWAccel* hwAccel = (HWAccel*)accel_i;
+
+  /* pick the dispatch globals and fetch the RT stack through them */
+#if defined(EMBREE_SYCL_ALLOC_DISPATCH_GLOBALS)
+  rtglobals_t globals = (rtglobals_t) hwAccel->dispatchGlobalsPtr;
+#else
+  rtglobals_t globals = (rtglobals_t) intel_get_implicit_dispatch_globals();
+#endif
+  struct RTStack* __restrict stack = sycl::global_ptr<RTStack>((struct RTStack*)intel_get_rt_stack( globals )).get();
+
+  /* init ray; the root node sits QBVH6_rootNodeOffset bytes behind the start of the acceleration structure */
+  unsigned int level = 0;
+  stack->ray[level].init(ray,(uint64_t)hwAccel + QBVH6_rootNodeOffset);
+
+  /* committed hit: infinite distance, all flags cleared */
+  stack->committedHit.setT(INFINITY);
+  stack->committedHit.setU(0.0f);
+  stack->committedHit.setV(0.0f);
+  stack->committedHit.data = 0;
+
+  /* potential hit: infinite distance, flagged done and valid */
+  stack->potentialHit.setT(INFINITY);
+  stack->potentialHit.setU(0.0f);
+  stack->potentialHit.setV(0.0f);
+  stack->potentialHit.data = 0;
+  stack->potentialHit.done = 1;
+  stack->potentialHit.valid = 1;
+
+  return { nullptr, (void*) globals, stack, TRACE_RAY_INITIAL, level };
+}
+
+// Sets up instance traversal: stores the given ray at level query.bvh_level+1 of the
+// RT stack and switches the query to TRACE_RAY_INSTANCE at that level.
+SYCL_EXTERNAL void intel_ray_query_forward_ray( intel_ray_query_t& query, intel_ray_desc_t ray, intel_raytracing_acceleration_structure_t accel_i)
+{
+  struct RTStack* __restrict stack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+
+  /* the root node sits QBVH6_rootNodeOffset bytes behind the start of the acceleration structure */
+  const uint64_t rootNode = (uint64_t)(HWAccel*)accel_i + QBVH6_rootNodeOffset;
+
+  /* init ray */
+  unsigned int nextLevel = query.bvh_level+1;
+  stack->ray[nextLevel].init(ray,rootNode);
+  query = { nullptr, query.opaque1, query.opaque2, TRACE_RAY_INSTANCE, nextLevel };
+}
+
+// Commits the potential hit. If the ray carries the accept-first-hit flag, the potential
+// hit is copied into the committed hit and the query becomes TRACE_RAY_DONE; otherwise
+// the query becomes TRACE_RAY_COMMIT.
+SYCL_EXTERNAL void intel_ray_query_commit_potential_hit( intel_ray_query_t& query )
+{
+  struct RTStack* __restrict stack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+
+  unsigned int level = query.bvh_level;
+  const unsigned int flags = stack->ray[level].rayFlags;
+  const bool acceptFirstHit = (flags & intel_ray_flags_accept_first_hit_and_end_search) != 0;
+
+  if (!acceptFirstHit) {
+    stack->potentialHit.valid = 1; // FIXME: is this required?
+    query = { nullptr, query.opaque1, query.opaque2, TRACE_RAY_COMMIT, level };
+    return;
+  }
+
+  stack->committedHit = stack->potentialHit;
+  stack->committedHit.valid = 1;
+  query = { nullptr, query.opaque1, query.opaque2, TRACE_RAY_DONE, level };
+}
+
+SYCL_EXTERNAL void intel_ray_query_commit_potential_hit_override( intel_ray_query_t& query, float override_hit_distance, intel_float2 override_uv )
+{
+  //struct RTStack* rtStack = (struct RTStack*) query.opaque2;  
+  struct RTStack* __restrict rtStack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+  
+  rtStack->potentialHit.setT(override_hit_distance);
+  rtStack->potentialHit.setU(override_uv.x);
+  rtStack->potentialHit.setV(override_uv.y);
+  intel_ray_query_commit_potential_hit(query);
+}
+
+// Starts traversal: flags the potential hit as done and valid, then dispatches the
+// trace unless the query is already in TRACE_RAY_DONE state.
+SYCL_EXTERNAL void intel_ray_query_start_traversal( intel_ray_query_t& query )
+{
+  struct RTStack* __restrict stack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+  stack->potentialHit.done = 1;
+  stack->potentialHit.valid = 1;
+
+  if (query.ctrl != TRACE_RAY_DONE)
+  {
+    rtglobals_t globals = (rtglobals_t) query.opaque1;
+    rtfence_t fence = intel_dispatch_trace_ray_query(globals,query.bvh_level,query.ctrl);
+    query = { (void*) fence, query.opaque1, query.opaque2, TRACE_RAY_INITIAL, 0 };
+  }
+}
+
+// Syncs on the fence stored in the query and switches the query to TRACE_RAY_CONTINUE
+// at the BVH level recorded in the potential hit.
+SYCL_EXTERNAL void intel_ray_query_sync( intel_ray_query_t& query )
+{
+  rtfence_t fence = (rtfence_t)query.opaque0;
+  intel_rt_sync(fence);
+
+  /* continue is default behaviour */
+  struct RTStack* __restrict stack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+  unsigned int hitLevel = stack->potentialHit.bvhLevel;
+  query = { query.opaque0, query.opaque1, query.opaque2, TRACE_RAY_CONTINUE, hitLevel };
+}
+
+// Second name for intel_ray_query_sync. Forwards to it instead of repeating its body,
+// so the two entry points cannot drift apart.
+SYCL_EXTERNAL void intel_sync_ray_query( intel_ray_query_t& query )
+{
+  intel_ray_query_sync(query);
+}
+
+SYCL_EXTERNAL void intel_ray_query_abandon( intel_ray_query_t& query )
+{
+  intel_ray_query_sync(query);
+  query = { nullptr, nullptr, nullptr, TRACE_RAY_INITIAL, 0 };
+}
+
+SYCL_EXTERNAL unsigned int intel_get_hit_bvh_level( intel_ray_query_t& query, intel_hit_type_t hit_type ) {
+  return query.hit(hit_type).bvhLevel;
+}
+
+SYCL_EXTERNAL float intel_get_hit_distance( intel_ray_query_t& query, intel_hit_type_t hit_type ) {
+  return query.hit(hit_type).getT();
+}
+
+// Returns the (u,v) coordinates stored in the selected hit record.
+SYCL_EXTERNAL intel_float2 intel_get_hit_barycentrics( intel_ray_query_t& query, intel_hit_type_t hit_type ) {
+  MemHit& hit = query.hit(hit_type);
+  const float u = hit.getU();
+  const float v = hit.getV();
+  return { u, v };
+}
+
+SYCL_EXTERNAL bool intel_get_hit_front_face( intel_ray_query_t& query, intel_hit_type_t hit_type ) {
+  return query.hit(hit_type).frontFace;
+}
+
+SYCL_EXTERNAL unsigned int intel_get_hit_geometry_id(intel_ray_query_t& query, intel_hit_type_t hit_type )
+{
+  struct PrimLeafDesc* __restrict leaf = (struct PrimLeafDesc*)query.hit(hit_type).getPrimLeafPtr();
+  return leaf->geomIndex;
+}
+
+// Returns the primitive index of the selected hit. Quad leaves store a base index plus
+// a per-hit delta; any other leaf is read as a procedural leaf with an index table.
+SYCL_EXTERNAL unsigned int intel_get_hit_primitive_id( intel_ray_query_t& query, intel_hit_type_t hit_type )
+{
+  MemHit& hit = query.hit(hit_type);
+  void* __restrict leaf = hit.getPrimLeafPtr();
+
+  if (hit.leafType != NODE_TYPE_QUAD)
+    return ((ProceduralLeaf*)leaf)->_primIndex[hit.primLeafIndex];
+
+  return ((QuadLeaf*)leaf)->primIndex0 + hit.primIndexDelta;
+}
+
+SYCL_EXTERNAL unsigned int intel_get_hit_triangle_primitive_id( intel_ray_query_t& query, intel_hit_type_t hit_type )
+{
+  MemHit& hit = query.hit(hit_type);
+  QuadLeaf* __restrict leaf = (QuadLeaf*) hit.getPrimLeafPtr();
+  
+  return leaf->primIndex0 + hit.primIndexDelta;
+}
+
+SYCL_EXTERNAL unsigned int intel_get_hit_procedural_primitive_id( intel_ray_query_t& query, intel_hit_type_t hit_type )
+{
+  MemHit& hit = query.hit(hit_type);
+  ProceduralLeaf* __restrict leaf = (ProceduralLeaf*) hit.getPrimLeafPtr();
+  return leaf->_primIndex[hit.primLeafIndex];
+}
+
+// Returns the instanceIndex of the instance leaf referenced by the selected hit, or
+// the all-ones value (unsigned -1) when the hit references no instance leaf.
+SYCL_EXTERNAL unsigned int intel_get_hit_instance_id( intel_ray_query_t& query, intel_hit_type_t hit_type )
+{
+  InstanceLeaf* __restrict leaf = (InstanceLeaf*) query.hit(hit_type).getInstanceLeafPtr();
+  if (!leaf)
+    return static_cast<unsigned int>(-1);
+
+  return leaf->part1.instanceIndex;
+}
+
+// Returns the user defined instanceID of the instance leaf referenced by the selected
+// hit, or the all-ones value (unsigned -1) when the hit references no instance leaf.
+SYCL_EXTERNAL unsigned int intel_get_hit_instance_user_id( intel_ray_query_t& query, intel_hit_type_t hit_type )
+{
+  InstanceLeaf* __restrict leaf = (InstanceLeaf*) query.hit(hit_type).getInstanceLeafPtr();
+  if (!leaf)
+    return static_cast<unsigned int>(-1);
+
+  return leaf->part1.instanceID;
+}
+
+SYCL_EXTERNAL intel_float4x3 intel_get_hit_world_to_object( intel_ray_query_t& query, intel_hit_type_t hit_type )
+{
+  MemHit& hit = query.hit(hit_type);
+  InstanceLeaf* __restrict leaf = (InstanceLeaf*) hit.getInstanceLeafPtr();
+  if (leaf == nullptr) return { { 1,0,0 }, { 0,1,0 }, { 0,0,1 }, { 0,0,0 } };
+  return {
+    { leaf->part0.world2obj_vx[0], leaf->part0.world2obj_vx[1], leaf->part0.world2obj_vx[2] },
+    { leaf->part0.world2obj_vy[0], leaf->part0.world2obj_vy[1], leaf->part0.world2obj_vy[2] },
+    { leaf->part0.world2obj_vz[0], leaf->part0.world2obj_vz[1], leaf->part0.world2obj_vz[2] },
+    { leaf->part1.world2obj_p [0], leaf->part1.world2obj_p [1], leaf->part1.world2obj_p [2] }
+  };
+}
+
+SYCL_EXTERNAL intel_float4x3 intel_get_hit_object_to_world( intel_ray_query_t& query, intel_hit_type_t hit_type )
+{
+  MemHit& hit = query.hit(hit_type);
+  InstanceLeaf* __restrict leaf = (InstanceLeaf*) hit.getInstanceLeafPtr();
+  if (leaf == nullptr) return { { 1,0,0 }, { 0,1,0 }, { 0,0,1 }, { 0,0,0 } };
+  return {
+    { leaf->part1.obj2world_vx[0], leaf->part1.obj2world_vx[1], leaf->part1.obj2world_vx[2] },
+    { leaf->part1.obj2world_vy[0], leaf->part1.obj2world_vy[1], leaf->part1.obj2world_vy[2] },
+    { leaf->part1.obj2world_vz[0], leaf->part1.obj2world_vz[1], leaf->part1.obj2world_vz[2] },
+    { leaf->part0.obj2world_p [0], leaf->part0.obj2world_p [1], leaf->part0.obj2world_p [2] }
+  };
+}
+
+// Writes the three vertices of the hit triangle to verts_out. The leaf is read as a
+// quad leaf: primLeafIndex 0 selects vertices 0,1,2, any other value selects the
+// vertices named by the leaf's j0,j1,j2 fields.
+SYCL_EXTERNAL void intel_get_hit_triangle_vertices( intel_ray_query_t& query, intel_float3 verts_out[3], intel_hit_type_t hit_type )
+{
+  MemHit& hit = query.hit(hit_type);
+  const QuadLeaf* __restrict quad = (const QuadLeaf*) hit.getPrimLeafPtr();
+
+  const bool secondTriangle = hit.primLeafIndex != 0;
+  const unsigned int i0 = secondTriangle ? (unsigned int) quad->j0 : 0u;
+  const unsigned int i1 = secondTriangle ? (unsigned int) quad->j1 : 1u;
+  const unsigned int i2 = secondTriangle ? (unsigned int) quad->j2 : 2u;
+
+  verts_out[0] = { quad->v[i0][0], quad->v[i0][1], quad->v[i0][2] };
+  verts_out[1] = { quad->v[i1][0], quad->v[i1][1], quad->v[i1][2] };
+  verts_out[2] = { quad->v[i2][0], quad->v[i2][1], quad->v[i2][2] };
+}
+
+SYCL_EXTERNAL intel_float3 intel_get_ray_origin( intel_ray_query_t& query, unsigned int bvh_level)
+{
+  struct RTStack* __restrict rtStack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+  
+  MemRay& ray = rtStack->ray[bvh_level];
+  return { ray.org[0], ray.org[1], ray.org[2] };
+}
+
+SYCL_EXTERNAL intel_float3 intel_get_ray_direction( intel_ray_query_t& query, unsigned int bvh_level)
+{
+  struct RTStack* __restrict rtStack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+  MemRay& ray = rtStack->ray[bvh_level];
+  return { ray.dir[0], ray.dir[1], ray.dir[2] };
+}
+
+SYCL_EXTERNAL float intel_get_ray_tmin( intel_ray_query_t& query, unsigned int bvh_level)
+{
+  struct RTStack* __restrict rtStack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+  return rtStack->ray[bvh_level].tnear;
+}
+
+SYCL_EXTERNAL intel_ray_flags_t intel_get_ray_flags( intel_ray_query_t& query, unsigned int bvh_level)
+{
+  struct RTStack* __restrict rtStack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+  return (intel_ray_flags_t) rtStack->ray[bvh_level].rayFlags;
+}
+
+SYCL_EXTERNAL unsigned int intel_get_ray_mask( intel_ray_query_t& query, unsigned int bvh_level)
+{
+  struct RTStack* __restrict rtStack = sycl::global_ptr<RTStack>((struct RTStack*)query.opaque2).get();
+  return rtStack->ray[bvh_level].rayMask;
+}
+
+SYCL_EXTERNAL bool intel_is_traversal_done( intel_ray_query_t& query ) {
+  return query.hit(intel_hit_type_potential_hit).done;
+}
+
+SYCL_EXTERNAL intel_candidate_type_t intel_get_hit_candidate( intel_ray_query_t& query, intel_hit_type_t hit_type) {
+  return query.hit(hit_type).leafType == NODE_TYPE_QUAD ? intel_candidate_type_triangle : intel_candidate_type_procedural;
+}
+
+SYCL_EXTERNAL bool intel_has_committed_hit( intel_ray_query_t& query ) {
+  return query.hit(intel_hit_type_committed_hit).valid;
+}
+
--- a/Framework/external/embree/kernels/rthwif/rttrace/rttrace_validation.h
+++ b/Framework/external/embree/kernels/rthwif/rttrace/rttrace_validation.h
@ -0,0 +1,180 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <cstdint>
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#pragma clang diagnostic ignored "-W#pragma-messages"
+
+#include <sycl/sycl.hpp>
+
+#pragma clang diagnostic pop
+
+// Ray flags stored in intel_ray_desc_t::flags. Every non-zero enumerator
+// occupies a distinct bit.
+enum intel_ray_flags_t
+{
+  intel_ray_flags_none = 0x00,
+  intel_ray_flags_force_opaque = 0x01,                      // forces geometry to be opaque (no anyhit shader invocation)
+  intel_ray_flags_force_non_opaque = 0x02,                  // forces geometry to be non-opaque (invoke anyhit shader)
+  intel_ray_flags_accept_first_hit_and_end_search = 0x04,   // terminates traversal on the first hit found (shadow rays)
+  intel_ray_flags_skip_closest_hit_shader = 0x08,           // skip execution of the closest hit shader
+  intel_ray_flags_cull_back_facing_triangles = 0x10,        // back facing triangles do not produce a hit
+  intel_ray_flags_cull_front_facing_triangles = 0x20,       // front facing triangles do not produce a hit
+  intel_ray_flags_cull_opaque = 0x40,                       // opaque geometry does not produce a hit
+  intel_ray_flags_cull_non_opaque = 0x80,                   // non-opaque geometry does not produce a hit
+  intel_ray_flags_skip_triangles = 0x100,                   // treat all triangle intersections as misses.
+  intel_ray_flags_skip_procedural_primitives = 0x200,       // skip execution of intersection shaders
+};
+
+// Selects which hit record of a ray query is accessed. The value is used
+// directly as an index into RTStack::hit by intel_ray_query_t::hit().
+enum intel_hit_type_t
+{
+  intel_hit_type_committed_hit = 0,
+  intel_hit_type_potential_hit = 1,
+};
+
+// Feature bits returned by intel_get_raytracing_ext_flag().
+enum intel_raytracing_ext_flag_t
+{
+  intel_raytracing_ext_flag_ray_query   = 1 << 0,        // true if ray queries are supported
+};
+
+// Two-component float vector that converts to and from sycl::float2.
+struct intel_float2
+{
+  float x, y;
+
+  // default construction leaves x and y uninitialized
+  intel_float2() {}
+
+  intel_float2(float x, float y)
+    : x(x), y(y) {}
+
+  // implicit conversion from the SYCL vector type
+  intel_float2(sycl::float2 v)
+    : x(v.x()), y(v.y()) {}
+
+  // implicit conversion to the SYCL vector type; const so that const objects
+  // and const references can be converted as well
+  operator sycl::float2() const {
+    return sycl::float2(x,y);
+  }
+};
+
+// Three-component float vector that converts to and from sycl::float3.
+struct intel_float3
+{
+  float x, y, z;
+
+  // default construction leaves x, y and z uninitialized
+  intel_float3() {}
+
+  intel_float3(float x, float y, float z)
+    : x(x), y(y), z(z) {}
+
+  // implicit conversion from the SYCL vector type
+  intel_float3(sycl::float3 v)
+    : x(v.x()), y(v.y()), z(v.z()) {}
+
+  // implicit conversion to the SYCL vector type; const so that const objects
+  // and const references can be converted as well
+  operator sycl::float3() const {
+    return sycl::float3(x,y,z);
+  }
+};
+
+// Four intel_float3 columns; return type of the world_to_object / object_to_world
+// accessors. NOTE(review): presumably vx, vy, vz are the axes and p the
+// translation of an affine transform - confirm against the implementation.
+struct intel_float4x3 {
+  intel_float3 vx, vy, vz, p;
+};
+
+// Ray description passed to intel_ray_query_init() and
+// intel_ray_query_forward_ray().
+struct intel_ray_desc_t
+{
+  intel_float3 origin;
+  intel_float3 direction;
+  float tmin;
+  float tmax;
+  unsigned int mask;
+  intel_ray_flags_t flags;
+};
+
+#include "rttrace_internal.h"
+
+// opaque types
+struct intel_ray_query_t
+{
+  void* opaque0;
+  void* opaque1;
+  void* opaque2;
+  uint32_t ctrl;
+  uint32_t bvh_level;
+
+  // returns the hit record selected by `ty` from the RTStack that opaque2 points to
+  MemHit& hit(intel_hit_type_t ty) {
+    return static_cast<struct RTStack*>(opaque2)->hit[ty];
+  }
+};
+// handle of an acceleration structure: pointer to an opaque struct that is
+// declared with the opencl_global address space attribute
+typedef __attribute__((opencl_global )) struct intel_raytracing_acceleration_structure_opaque_t* intel_raytracing_acceleration_structure_t;
+
+// check supported ray tracing features
+SYCL_EXTERNAL intel_raytracing_ext_flag_t intel_get_raytracing_ext_flag();
+
+// initializes a ray query
+SYCL_EXTERNAL intel_ray_query_t intel_ray_query_init(
+  intel_ray_desc_t ray,
+  intel_raytracing_acceleration_structure_t accel
+);
+
+// setup for instance traversal using a transformed ray and bottom-level AS
+SYCL_EXTERNAL void intel_ray_query_forward_ray(
+  intel_ray_query_t& query,
+  intel_ray_desc_t ray,
+  intel_raytracing_acceleration_structure_t accel
+);
+
+// commit the potential hit
+SYCL_EXTERNAL void intel_ray_query_commit_potential_hit(
+  intel_ray_query_t& query
+);
+
+// commit the potential hit and override hit distance and UVs
+SYCL_EXTERNAL void intel_ray_query_commit_potential_hit_override(
+  intel_ray_query_t& query,
+  float override_hit_distance,
+  intel_float2 override_uv
+);
+
+// start traversal of a ray query
+SYCL_EXTERNAL void intel_ray_query_start_traversal( intel_ray_query_t& query );
+
+// synchronize ray query execution.  If a ray was dispatched,
+//  this must be called prior to calling any of the accessors below.
+SYCL_EXTERNAL void intel_ray_query_sync( intel_ray_query_t& query );
+
+// signal that a ray query will not be used further.  This is the moral equivalent of a delete.
+// This function does an implicit sync.
+SYCL_EXTERNAL void intel_ray_query_abandon( intel_ray_query_t& query );
+
+// read hit information during shader execution
+SYCL_EXTERNAL unsigned int intel_get_hit_bvh_level( intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL float intel_get_hit_distance( intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL intel_float2 intel_get_hit_barycentrics( intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL bool intel_get_hit_front_face( intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL unsigned int intel_get_hit_geometry_id(intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL unsigned int intel_get_hit_primitive_id( intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL unsigned int intel_get_hit_triangle_primitive_id( intel_ray_query_t& query, intel_hit_type_t hit_type );  // fast path for quad leaves
+SYCL_EXTERNAL unsigned int intel_get_hit_procedural_primitive_id( intel_ray_query_t& query, intel_hit_type_t hit_type ); // fast path for procedural leaves
+SYCL_EXTERNAL unsigned int intel_get_hit_instance_id( intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL unsigned int intel_get_hit_instance_user_id( intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL intel_float4x3 intel_get_hit_world_to_object( intel_ray_query_t& query, intel_hit_type_t hit_type );
+SYCL_EXTERNAL intel_float4x3 intel_get_hit_object_to_world( intel_ray_query_t& query, intel_hit_type_t hit_type );
+
+// fetch triangle vertices for a hit
+SYCL_EXTERNAL void intel_get_hit_triangle_vertices( intel_ray_query_t& query, intel_float3 vertices_out[3], intel_hit_type_t hit_type );
+
+// Read ray-data. This is used to read transformed rays produced by HW instancing pipeline
+// during any-hit or intersection shader execution.
+SYCL_EXTERNAL intel_float3 intel_get_ray_origin( intel_ray_query_t& query, unsigned int bvh_level );
+SYCL_EXTERNAL intel_float3 intel_get_ray_direction( intel_ray_query_t& query, unsigned int bvh_level );
+SYCL_EXTERNAL float intel_get_ray_tmin( intel_ray_query_t& query, unsigned int bvh_level );
+SYCL_EXTERNAL intel_ray_flags_t intel_get_ray_flags( intel_ray_query_t& query, unsigned int bvh_level );
+SYCL_EXTERNAL unsigned int intel_get_ray_mask( intel_ray_query_t& query, unsigned int bvh_level );
+
+// once traversal returns, intel_get_hit_candidate() reports with this type
+// whether a triangle or a procedural primitive was hit
+enum intel_candidate_type_t
+{
+  intel_candidate_type_triangle,
+  intel_candidate_type_procedural
+};
+
+SYCL_EXTERNAL intel_candidate_type_t intel_get_hit_candidate( intel_ray_query_t& query, intel_hit_type_t hit_type );
+
+// test whether traversal has terminated.  If false, the ray has reached
+//  a procedural leaf or a non-opaque triangle leaf, and requires shader processing
+SYCL_EXTERNAL bool intel_is_traversal_done( intel_ray_query_t& query );
+
+// if traversal is done one can test for the presence of a committed hit to either invoke miss or closest hit shader
+SYCL_EXTERNAL bool intel_has_committed_hit( intel_ray_query_t& query );
--- a/Framework/external/embree/kernels/rthwif/testing/CMakeLists.txt
+++ b/Framework/external/embree/kernels/rthwif/testing/CMakeLists.txt
@ -0,0 +1,89 @@
+## Copyright 2009-2022 Intel Corporation
+## SPDX-License-Identifier: Apache-2.0
+
+#PROJECT(rthwif_testing)
+#CMAKE_MINIMUM_REQUIRED(VERSION 3.1.0)
+
+# Targets created below are built as C++17.
+SET(CMAKE_CXX_STANDARD 17)
+
+# generate position independent code suitable for shared libraries
+IF (NOT WIN32)
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+ENDIF()
+
+# Declare the options here only if an enclosing project has not defined them yet.
+IF (NOT DEFINED EMBREE_SYCL_RT_VALIDATION_API)
+  OPTION(EMBREE_SYCL_RT_VALIDATION_API "Use rt_validation API instead of IGC provided rt_production API" OFF)
+ENDIF()
+IF (NOT DEFINED EMBREE_SYCL_IMPLICIT_DISPATCH_GLOBALS)
+  OPTION(EMBREE_SYCL_IMPLICIT_DISPATCH_GLOBALS "Using L0 allocated Dispatch Globals" ON)
+ENDIF()
+
+IF (EMBREE_SYCL_RT_VALIDATION_API)
+  ADD_DEFINITIONS("-DEMBREE_SYCL_RT_VALIDATION_API")
+  # validation API without L0 allocated dispatch globals: request explicit allocation
+  IF (NOT EMBREE_SYCL_IMPLICIT_DISPATCH_GLOBALS)
+    ADD_DEFINITIONS("-DEMBREE_SYCL_ALLOC_DISPATCH_GLOBALS")
+  ENDIF()
+ENDIF()
+
+# link the simulation library only when simulation is enabled
+IF (EMBREE_SYCL_RT_SIMULATION)
+  SET(RT_SIM_LIBRARY rtcore)
+ENDIF()
+
+# Cornell box example: compiled and linked as SYCL code for the spir64 target,
+# installed with the "examples" component and signed via SIGN_TARGET.
+ADD_EXECUTABLE(embree_rthwif_cornell_box rthwif_cornell_box.cpp)
+TARGET_LINK_LIBRARIES(embree_rthwif_cornell_box sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper)
+SET_PROPERTY(TARGET embree_rthwif_cornell_box APPEND PROPERTY COMPILE_FLAGS "-fsycl -fsycl-targets=spir64 -DEMBREE_SYCL_SUPPORT")
+SET_PROPERTY(TARGET embree_rthwif_cornell_box APPEND PROPERTY LINK_FLAGS "-fsycl -fsycl-targets=spir64 -Xsycl-target-backend=spir64 \" -cl-intel-greater-than-4GB-buffer-required \"")
+INSTALL(TARGETS embree_rthwif_cornell_box DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT examples)
+SIGN_TARGET(embree_rthwif_cornell_box)
+
+# Test executable used by the ADD_EMBREE_TEST_ECS registrations below; built
+# with the same libraries and SYCL flags as the cornell box example.
+ADD_EXECUTABLE(embree_rthwif_test rthwif_test.cpp)
+TARGET_LINK_LIBRARIES(embree_rthwif_test sys simd ${TBB_TARGET} ${RT_SIM_LIBRARY} ze_wrapper)
+SET_PROPERTY(TARGET embree_rthwif_test APPEND PROPERTY COMPILE_FLAGS "-fsycl -fsycl-targets=spir64 -DEMBREE_SYCL_SUPPORT")
+SET_PROPERTY(TARGET embree_rthwif_test APPEND PROPERTY LINK_FLAGS "-fsycl -fsycl-targets=spir64 -Xsycl-target-backend=spir64 \" -cl-intel-greater-than-4GB-buffer-required \"")
+INSTALL(TARGETS embree_rthwif_test DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT examples)
+SIGN_TARGET(embree_rthwif_test)
+
+# Register the cornell box image comparison test. It is skipped only when the
+# validation API is combined with explicitly allocated dispatch globals.
+# EMBREE_SYCL_RT_VALIDATION_API is always defined at this point (it is declared
+# as an option earlier in this file when not defined yet), so its value has to
+# be tested; a NOT DEFINED check could never be true here.
+IF (NOT EMBREE_SYCL_RT_VALIDATION_API OR EMBREE_SYCL_IMPLICIT_DISPATCH_GLOBALS)
+
+  # prefer the reference image from the model directory when one is configured
+  IF (DEFINED EMBREE_MODEL_DIR)
+    SET(CORNELL_BOX_REFERENCE "${EMBREE_MODEL_DIR}/reference/cornell_box_reference.tga")
+  ELSE()
+    SET(CORNELL_BOX_REFERENCE "${CMAKE_CURRENT_SOURCE_DIR}/cornell_box_reference.tga")
+  ENDIF()
+
+  ADD_TEST(NAME rthwif_cornell_box
+           COMMAND embree_rthwif_cornell_box --compare "${CORNELL_BOX_REFERENCE}"
+           WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
+ENDIF()
+
+# All registrations below run embree_rthwif_test with the listed ARGS and are
+# conditioned on EMBREE_SYCL_SUPPORT being ON.
+
+# builder tests using --build_mode_expected
+ADD_EMBREE_TEST_ECS(rthwif_test_builder_triangles_expected    embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --build_test_triangles    --build_mode_expected)
+ADD_EMBREE_TEST_ECS(rthwif_test_builder_procedurals_expected  embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --build_test_procedurals  --build_mode_expected)
+ADD_EMBREE_TEST_ECS(rthwif_test_builder_instances_expected    embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --build_test_instances    --build_mode_expected)
+ADD_EMBREE_TEST_ECS(rthwif_test_builder_mixed_expected        embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --build_test_mixed        --build_mode_expected)
+
+# benchmarks
+ADD_EMBREE_TEST_ECS(rthwif_test_benchmark_triangles           embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --benchmark_triangles)
+ADD_EMBREE_TEST_ECS(rthwif_test_benchmark_procedurals         embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --benchmark_procedurals)
+
+# builder tests using --build_mode_worst_case
+ADD_EMBREE_TEST_ECS(rthwif_test_builder_triangles_worst_case    embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --build_test_triangles   --build_mode_worst_case)
+ADD_EMBREE_TEST_ECS(rthwif_test_builder_procedurals_worst_case  embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --build_test_procedurals --build_mode_worst_case)
+ADD_EMBREE_TEST_ECS(rthwif_test_builder_instances_worst_case    embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --build_test_instances   --build_mode_worst_case)
+ADD_EMBREE_TEST_ECS(rthwif_test_builder_mixed_worst_case        embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --build_test_mixed       --build_mode_worst_case)
+
+# hit tests with --no-instancing
+ADD_EMBREE_TEST_ECS(rthwif_test_triangles_committed_hit         embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --no-instancing --triangles-committed-hit)
+ADD_EMBREE_TEST_ECS(rthwif_test_triangles_potential_hit         embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --no-instancing --triangles-potential-hit)
+ADD_EMBREE_TEST_ECS(rthwif_test_triangles_anyhit_shader_commit  embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --no-instancing --triangles-anyhit-shader-commit)
+ADD_EMBREE_TEST_ECS(rthwif_test_triangles_anyhit_shader_reject  embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --no-instancing --triangles-anyhit-shader-reject)
+ADD_EMBREE_TEST_ECS(rthwif_test_procedurals_committed_hit       embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --no-instancing --procedurals-committed-hit)
+
+# hit tests with --hw-instancing
+ADD_EMBREE_TEST_ECS(rthwif_test_hwinstancing_triangles_committed_hit          embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --hw-instancing --triangles-committed-hit)
+ADD_EMBREE_TEST_ECS(rthwif_test_hwinstancing_triangles_potential_hit          embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --hw-instancing --triangles-potential-hit)
+ADD_EMBREE_TEST_ECS(rthwif_test_hwinstancing_triangles_anyhit_shader_commit   embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --hw-instancing --triangles-anyhit-shader-commit)
+ADD_EMBREE_TEST_ECS(rthwif_test_hwinstancing_triangles_anyhit_shader_reject   embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --hw-instancing --triangles-anyhit-shader-reject)
+ADD_EMBREE_TEST_ECS(rthwif_test_hwinstancing_procedurals_committed_hit        embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --hw-instancing --procedurals-committed-hit)
+
+# hit tests with --sw-instancing
+ADD_EMBREE_TEST_ECS(rthwif_test_swinstancing_triangles_committed_hit          embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --sw-instancing --triangles-committed-hit)
+ADD_EMBREE_TEST_ECS(rthwif_test_swinstancing_triangles_potential_hit          embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --sw-instancing --triangles-potential-hit)
+ADD_EMBREE_TEST_ECS(rthwif_test_swinstancing_triangles_anyhit_shader_commit   embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --sw-instancing --triangles-anyhit-shader-commit)
+ADD_EMBREE_TEST_ECS(rthwif_test_swinstancing_triangles_anyhit_shader_reject   embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --sw-instancing --triangles-anyhit-shader-reject)
+ADD_EMBREE_TEST_ECS(rthwif_test_swinstancing_procedurals_committed_hit        embree_rthwif_test NO_REFERENCE NO_POSTFIX INTENSITY 1 CONDITION "EMBREE_SYCL_SUPPORT == ON" ARGS --sw-instancing --procedurals-committed-hit)
--- a/Framework/external/embree/kernels/rthwif/testing/cornell_box_reference.tga
+++ b/Framework/external/embree/kernels/rthwif/testing/cornell_box_reference.tga
--- a/Framework/external/embree/kernels/rthwif/testing/rthwif_cornell_box.cpp
+++ b/Framework/external/embree/kernels/rthwif/testing/rthwif_cornell_box.cpp
@ -0,0 +1,630 @@
+// Copyright 2009-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <CL/sycl.hpp>
+#include "tbb/tbb.h"
+
+#include "../rttrace/rttrace.h"
+
+#include <level_zero/ze_wrapper.h>
+
+#include <vector>
+#include <iostream>
+#include <fstream>
+
+void* dispatchGlobalsPtr = nullptr;
+
+static uint32_t global_width = 512;
+static uint32_t global_height = 512;
+
+/* Handler for asynchronous SYCL errors: rethrows every exception of the list
+ * and prints the message of those that are of type sycl::exception. */
+void exception_handler(sycl::exception_list exceptions)
+{
+  for (std::exception_ptr const& pending : exceptions)
+  {
+    try {
+      std::rethrow_exception(pending);
+    }
+    catch (sycl::exception const& err) {
+      std::cout << "Caught asynchronous SYCL exception: " << err.what() << std::endl;
+    }
+  }
+}
+
+/* write the raw in-memory bytes of an 8-bit / 16-bit value to the stream */
+inline void fwrite_uchar (unsigned char  v, std::fstream& file)
+{
+  file.write(reinterpret_cast<const char*>(&v), sizeof(unsigned char));
+}
+inline void fwrite_ushort(unsigned short v, std::fstream& file)
+{
+  file.write(reinterpret_cast<const char*>(&v), sizeof(unsigned short));
+}
+
+/* Writes `pixels` (width*height packed 32-bit values, row by row) as a TGA
+ * file with 24 bits per pixel. Only the three low bytes of each pixel are
+ * stored, lowest byte first. On a stream error a message is printed and the
+ * exception is rethrown. */
+void storeTga(uint32_t* pixels, uint32_t width, uint32_t height, const std::string& fileName) try
+{
+  std::fstream out;
+  out.exceptions (std::fstream::failbit | std::fstream::badbit);
+  out.open (fileName.c_str(), std::fstream::out | std::fstream::binary);
+
+  /* 18 byte header; field names follow the TGA file layout */
+  fwrite_uchar(0x00, out);                      // id length
+  fwrite_uchar(0x00, out);                      // color map type
+  fwrite_uchar(0x02, out);                      // image type
+  fwrite_ushort(0x0000, out);                   // color map: first entry
+  fwrite_ushort(0x0000, out);                   // color map: length
+  fwrite_uchar(0x00, out);                      // color map: entry size
+  fwrite_ushort(0x0000, out);                   // x origin
+  fwrite_ushort(0x0000, out);                   // y origin
+  fwrite_ushort((unsigned short)width , out);   // image width
+  fwrite_ushort((unsigned short)height, out);   // image height
+  fwrite_uchar(0x18, out);                      // bits per pixel (24)
+  fwrite_uchar(0x20, out);                      // image descriptor
+
+  /* pixel data in row-major order */
+  const size_t numPixels = (size_t)height * (size_t)width;
+  for (size_t i=0; i<numPixels; i++)
+  {
+    const uint32_t rgb = pixels[i];
+    for (unsigned int shift=0; shift<=16; shift+=8)
+      fwrite_uchar((unsigned char)((rgb>>shift)&0xFF), out);
+  }
+}
+catch (std::exception const&) {
+  std::cout << "Error: Cannot write file " << fileName << std::endl;
+  throw;
+}
+
+/* Reads the whole file `fileName` in binary mode and returns its bytes. On a
+ * stream error a message is printed and the exception is rethrown. */
+std::vector<unsigned char> readFile(const std::string& fileName) try
+{
+  std::fstream in;
+  in.exceptions (std::fstream::failbit | std::fstream::badbit);
+  in.open (fileName.c_str(), std::fstream::in | std::fstream::binary);
+
+  /* the file size is the stream position after seeking to the end */
+  in.seekg (0, std::ios::end);
+  const std::streampos numBytes = in.tellg();
+  in.seekg (0, std::ios::beg);
+
+  std::vector<unsigned char> bytes(numBytes);
+  in.read (reinterpret_cast<char*>(bytes.data()), numBytes);
+  in.close();
+
+  return bytes;
+}
+catch (std::exception const&) {
+  std::cout << "Error: Cannot read file " << fileName << std::endl;
+  throw;
+}
+
+size_t compareTga(const std::string& fileNameA, const std::string& fileNameB)
+{
+  const std::vector<unsigned char> dataA = readFile(fileNameA);
+  const std::vector<unsigned char> dataB = readFile(fileNameB);
+  if (dataA.size() != dataB.size())
+    return false;
+
+  size_t diff = 0;
+  for (int i=0; i<dataA.size(); i++)
+  {
+    if (std::abs((int)dataA[i] - (int)dataB[i]) == 1) diff++;
+    if (std::abs((int)dataA[i] - (int)dataB[i]) == 2) diff+=4;
+    if (std::abs((int)dataA[i] - (int)dataB[i]) >= 3) diff+=100;
+  }
+  return diff;
+}
+
+/* Allocates `bytes` of shared memory for an acceleration structure. A
+ * ze_raytracing_mem_alloc_ext_desc_t is chained into the device allocation
+ * descriptor and the alignment is taken from the RTAS properties queried from
+ * the device. Throws std::runtime_error if a Level Zero call fails. */
+void* alloc_accel_buffer(size_t bytes, sycl::device device, sycl::context context)
+{
+  ze_device_handle_t  zeDevice  = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device);
+  ze_context_handle_t zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(context);
+
+  /* query the RTAS properties, chained behind the device properties */
+  ze_rtas_device_exp_properties_t rtasProperties = { ZE_STRUCTURE_TYPE_RTAS_DEVICE_EXP_PROPERTIES };
+  ze_device_properties_t deviceProperties = { ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &rtasProperties };
+  if (ZeWrapper::zeDeviceGetProperties(zeDevice, &deviceProperties) != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("zeDeviceGetProperties failed");
+
+  /* host side allocation descriptor */
+  ze_host_mem_alloc_desc_t hostDesc;
+  hostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
+  hostDesc.pNext = nullptr;
+  hostDesc.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED;
+
+  /* ray tracing extension descriptor, referenced by the device descriptor */
+  ze_raytracing_mem_alloc_ext_desc_t raytracingDesc;
+  raytracingDesc.stype = ZE_STRUCTURE_TYPE_RAYTRACING_MEM_ALLOC_EXT_DESC;
+  raytracingDesc.pNext = nullptr;
+  raytracingDesc.flags = 0;
+
+  /* device side allocation descriptor */
+  ze_device_mem_alloc_desc_t deviceDesc;
+  deviceDesc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
+  deviceDesc.pNext = &raytracingDesc;
+  deviceDesc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED;
+  deviceDesc.ordinal = 0;
+
+  void* buffer = nullptr;
+  const ze_result_t status = ZeWrapper::zeMemAllocShared(zeContext,&deviceDesc,&hostDesc,bytes,rtasProperties.rtasBufferAlignment,zeDevice,&buffer);
+  if (status != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("acceleration buffer allocation failed");
+
+  return buffer;
+}
+
+/* Frees `ptr` through zeMemFree on the Level Zero context behind `context`.
+ * Throws std::runtime_error if the call fails. */
+void free_accel_buffer(void* ptr, sycl::context context)
+{
+  ze_context_handle_t zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(context);
+  if (ZeWrapper::zeMemFree(zeContext,ptr) != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("acceleration buffer free failed");
+}
+
+
+/* dispatch globals allocation is for debugging only */
+
+/* control flags stored in DispatchGlobals::flags */
+enum Flags : uint32_t {
+  FLAGS_NONE,
+  DEPTH_TEST_LESS_EQUAL = 1 << 0  // when set we use <= for depth test, otherwise <
+};
+
+/* Header that allocDispatchGlobals() writes at the start of the dispatch
+ * globals buffer. The member order and bitfield widths define the memory
+ * layout and must not be changed. */
+struct DispatchGlobals
+{
+  uint64_t rtMemBasePtr;               // base address of the allocated stack memory
+  uint64_t callStackHandlerKSP;             // this is the KSP of the continuation handler that is invoked by BTD when the read KSP is 0
+  uint32_t asyncStackSize;             // async-RT stack size in 64 byte blocks
+  uint32_t numDSSRTStacks : 16;        // number of stacks per DSS
+  uint32_t syncRayQueryCount : 4;      // number of ray queries in the sync-RT stack: 0-15 mapped to: 1-16
+  unsigned _reserved_mbz : 12;
+  uint32_t maxBVHLevels;               // the maximal number of supported instancing levels (0->8, 1->1, 2->2, ...)
+  Flags flags;                         // per context control flags
+};
+
+/* Allocates and zero-initializes one buffer that holds 128 bytes followed by
+ * the ray tracing stacks, and fills the DispatchGlobals header at its start.
+ * rtMemBasePtr is set to the address just past the end of the buffer.
+ * Returns the buffer, which was obtained from alloc_accel_buffer(). */
+void* allocDispatchGlobals(sycl::device device, sycl::context context)
+{
+  const size_t maxBVHLevels = 2; //RTC_MAX_INSTANCE_LEVEL_COUNT+1;
+
+  /* size of one stack, rounded up to a multiple of 64 bytes */
+  const size_t bytesPerStack = (64+maxBVHLevels*(64+32)+63)&-64;
+  const size_t numStacks = 1<<17; // this is sufficiently large also for PVC
+  const size_t totalBytes = 128+numStacks*bytesPerStack;
+
+  void* buffer = alloc_accel_buffer(totalBytes,device,context);
+  memset(buffer, 0, totalBytes);
+
+  DispatchGlobals* globals = static_cast<DispatchGlobals*>(buffer);
+  globals->rtMemBasePtr = reinterpret_cast<uint64_t>(buffer) + totalBytes;
+  globals->callStackHandlerKSP = 0;
+  globals->asyncStackSize = 0;
+  globals->numDSSRTStacks = 0;
+  globals->syncRayQueryCount = 0;
+  globals->_reserved_mbz = 0;
+  globals->maxBVHLevels = maxBVHLevels;
+  globals->flags = DEPTH_TEST_LESS_EQUAL;
+
+  return buffer;
+}
+
+/* Vertex indices for the cornell_box model: 34 triangles. Each consecutive
+ * pair { k, k+1, k+2 } and { k, k+2, k+3 } covers one quad over four
+ * consecutive entries of `vertices`; the highest index used is 67. */
+ze_rtas_triangle_indices_uint32_exp_t indices[] = {
+  { 0, 1, 2 },
+  { 0, 2, 3 },
+  { 4, 5, 6 },
+  { 4, 6, 7 },
+  { 8, 9, 10 },
+  { 8, 10, 11 },
+  { 12, 13, 14 },
+  { 12, 14, 15 },
+  { 16, 17, 18 },
+  { 16, 18, 19 },
+  { 20, 21, 22 },
+  { 20, 22, 23 },
+  { 24, 25, 26 },
+  { 24, 26, 27 },
+  { 28, 29, 30 },
+  { 28, 30, 31 },
+  { 32, 33, 34 },
+  { 32, 34, 35 },
+  { 36, 37, 38 },
+  { 36, 38, 39 },
+  { 40, 41, 42 },
+  { 40, 42, 43 },
+  { 44, 45, 46 },
+  { 44, 46, 47 },
+  { 48, 49, 50 },
+  { 48, 50, 51 },
+  { 52, 53, 54 },
+  { 52, 54, 55 },
+  { 56, 57, 58 },
+  { 56, 58, 59 },
+  { 60, 61, 62 },
+  { 60, 62, 63 },
+  { 64, 65, 66 },
+  { 64, 66, 67 }
+};
+
+/* Vertex positions for the cornell_box model: 68 entries, four per quad
+ * referenced by `indices`. Positions shared between quads are stored once per
+ * quad rather than being reused. */
+ze_rtas_float3_exp_t vertices[] = {
+  { 552.8, 0, 0 },
+  { 0, 0, 0 },
+  { 0, 0, 559.2 },
+  { 549.6, 0, 559.2 },
+  { 290, 0, 114 },
+  { 240, 0, 272 },
+  { 82, 0, 225 },
+  { 130, 0, 65 },
+  { 472, 0, 406 },
+  { 314, 0, 456 },
+  { 265, 0, 296 },
+  { 423, 0, 247 },
+  { 556, 548.8, 0 },
+  { 556, 548.8, 559.2 },
+  { 0, 548.8, 559.2 },
+  { 0, 548.8, 0 },
+  { 549.6, 0, 559.2 },
+  { 0, 0, 559.2 },
+  { 0, 548.8, 559.2 },
+  { 556, 548.8, 559.2 },
+  { 0, 0, 559.2 },
+  { 0, 0, 0 },
+  { 0, 548.8, 0 },
+  { 0, 548.8, 559.2 },
+  { 552.8, 0, 0 },
+  { 549.6, 0, 559.2 },
+  { 556, 548.8, 559.2 },
+  { 556, 548.8, 0 },
+  { 130, 165, 65 },
+  { 82, 165, 225 },
+  { 240, 165, 272 },
+  { 290, 165, 114 },
+  { 290, 0, 114 },
+  { 290, 165, 114 },
+  { 240, 165, 272 },
+  { 240, 0, 272 },
+  { 130, 0, 65 },
+  { 130, 165, 65 },
+  { 290, 165, 114 },
+  { 290, 0, 114 },
+  { 82, 0, 225 },
+  { 82, 165, 225 },
+  { 130, 165, 65 },
+  { 130, 0, 65 },
+  { 240, 0, 272 },
+  { 240, 165, 272 },
+  { 82, 165, 225 },
+  { 82, 0, 225 },
+  { 423, 330, 247 },
+  { 265, 330, 296 },
+  { 314, 330, 456 },
+  { 472, 330, 406 },
+  { 423, 0, 247 },
+  { 423, 330, 247 },
+  { 472, 330, 406 },
+  { 472, 0, 406 },
+  { 472, 0, 406 },
+  { 472, 330, 406 },
+  { 314, 330, 456 },
+  { 314, 0, 456 },
+  { 314, 0, 456 },
+  { 314, 330, 456 },
+  { 265, 330, 296 },
+  { 265, 0, 296 },
+  { 265, 0, 296 },
+  { 265, 330, 296 },
+  { 423, 330, 247 },
+  { 423, 0, 247 },
+};
+
+/* Builds the acceleration structure for the cornell_box mesh (`indices` and
+ * `vertices`) with the Level Zero RTAS builder and returns the buffer that
+ * holds it. The buffer comes from alloc_accel_buffer(). The build is started
+ * as a deferred parallel operation and then joined from a TBB parallel loop.
+ * Throws std::runtime_error if any Level Zero call fails. */
+void* build_rtas(sycl::device device, sycl::context context)
+{
+  /* get L0 handles */
+  ze_driver_handle_t hDriver = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device.get_platform());
+  ze_device_handle_t hDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device);
+
+  /* create rtas builder object */
+  ze_rtas_builder_exp_desc_t builderDesc = { ZE_STRUCTURE_TYPE_RTAS_BUILDER_EXP_DESC };
+  ze_rtas_builder_exp_handle_t hBuilder = nullptr;
+  ze_result_t err = ZeWrapper::zeRTASBuilderCreateExp(hDriver, &builderDesc, &hBuilder);
+  if (err != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("ze_rtas_builder creation failed");
+
+  /* create geometry descriptor for single triangle mesh */
+  ze_rtas_builder_triangles_geometry_info_exp_t mesh = {};
+  mesh.geometryType = ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_TRIANGLES;
+  mesh.geometryFlags = 0;
+  mesh.geometryMask = 0xFF;
+
+  mesh.triangleFormat = ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_TRIANGLE_INDICES_UINT32;
+  mesh.triangleCount = sizeof(indices)/sizeof(ze_rtas_triangle_indices_uint32_exp_t);
+  mesh.triangleStride = sizeof(ze_rtas_triangle_indices_uint32_exp_t);
+  mesh.pTriangleBuffer = indices;
+
+  mesh.vertexFormat = ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3;
+  mesh.vertexCount = sizeof(vertices)/sizeof(ze_rtas_float3_exp_t);
+  mesh.vertexStride = sizeof(ze_rtas_float3_exp_t);
+  mesh.pVertexBuffer = vertices;
+
+  /* fill geometry descriptor array with pointer to single geometry descriptor */
+  std::vector<ze_rtas_builder_geometry_info_exp_t*> descs;
+  descs.push_back((ze_rtas_builder_geometry_info_exp_t*)&mesh);
+
+  /* get acceleration structure format for this device */
+  ze_rtas_device_exp_properties_t rtasProp = { ZE_STRUCTURE_TYPE_RTAS_DEVICE_EXP_PROPERTIES };
+  ze_device_properties_t devProp = { ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, &rtasProp };
+  err = ZeWrapper::zeDeviceGetProperties(hDevice, &devProp );
+  if (err != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("zeDeviceGetProperties failed");
+
+  /* create parallel operation for parallel build */
+  ze_rtas_parallel_operation_exp_handle_t hParallelOperation = nullptr;
+  err = ZeWrapper::zeRTASParallelOperationCreateExp(hDriver, &hParallelOperation);
+  if (err != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("zeRTASParallelOperationCreateExp failed");
+
+  /* create descriptor of build operation */
+  size_t accelBufferBytesOut = 0;
+  ze_rtas_aabb_exp_t bounds;
+  ze_rtas_builder_build_op_exp_desc_t buildOp = {};
+  buildOp.stype = ZE_STRUCTURE_TYPE_RTAS_BUILDER_BUILD_OP_EXP_DESC;
+  buildOp.pNext = nullptr;
+  buildOp.rtasFormat = rtasProp.rtasFormat;
+  buildOp.buildQuality = ZE_RTAS_BUILDER_BUILD_QUALITY_HINT_EXP_MEDIUM;
+  buildOp.buildFlags = 0;
+  buildOp.ppGeometries = (const ze_rtas_builder_geometry_info_exp_t **) descs.data();
+  buildOp.numGeometries = descs.size();
+
+  /* just for debugging purposes */
+#if defined(EMBREE_SYCL_ALLOC_DISPATCH_GLOBALS)
+  ze_rtas_builder_build_op_debug_exp_desc_t buildOpDebug = { ZE_STRUCTURE_TYPE_RTAS_BUILDER_BUILD_OP_DEBUG_EXP_DESC };
+  buildOpDebug.dispatchGlobalsPtr = dispatchGlobalsPtr;
+  buildOp.pNext = &buildOpDebug;
+#endif
+
+  /* query required buffer sizes */
+  ze_rtas_builder_exp_properties_t buildProps = { ZE_STRUCTURE_TYPE_RTAS_BUILDER_EXP_PROPERTIES };
+  err = ZeWrapper::zeRTASBuilderGetBuildPropertiesExp(hBuilder,&buildOp,&buildProps);
+  if (err != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("zeRTASBuilderGetBuildPropertiesExp failed");
+
+  /* allocate scratch buffer */
+  std::vector<char> scratchBuffer(buildProps.scratchBufferSizeBytes);
+  memset(scratchBuffer.data(),0,scratchBuffer.size());
+
+  /* allocate acceleration structure buffer */
+  size_t accelBytes = buildProps.rtasBufferSizeBytesMaxRequired;
+  void* accel = alloc_accel_buffer(accelBytes,device,context);
+  memset(accel,0,accelBytes); // optional
+
+  /* start the multi threaded build of the acceleration structure; with a
+   * parallel operation attached the call is expected to report "deferred" */
+  err = ZeWrapper::zeRTASBuilderBuildExp(hBuilder,&buildOp,
+                                  scratchBuffer.data(),scratchBuffer.size(),
+                                  accel, accelBytes,
+                                  hParallelOperation,
+                                  nullptr, &bounds, &accelBufferBytesOut);
+
+  if (err != ZE_RESULT_EXP_RTAS_BUILD_DEFERRED)
+    throw std::runtime_error("zeRTASBuilderBuildExp failed");
+
+  /* after the build is started one can query number of threads to use for the build */
+  ze_rtas_parallel_operation_exp_properties_t prop = { ZE_STRUCTURE_TYPE_RTAS_PARALLEL_OPERATION_EXP_PROPERTIES };
+  err = ZeWrapper::zeRTASParallelOperationGetPropertiesExp(hParallelOperation,&prop);
+
+  if (err != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("zeRTASParallelOperationGetPropertiesExp failed");
+
+  /* build in parallel using maximal number of build threads; every iteration
+   * stores its result in its own slot, so the workers never write to a shared
+   * variable and a failed join cannot be overwritten by a later success */
+  std::vector<ze_result_t> joinResults(prop.maxConcurrency, ZE_RESULT_SUCCESS);
+  tbb::parallel_for(0u, prop.maxConcurrency, 1u, [&](uint32_t i) {
+    joinResults[i] = ZeWrapper::zeRTASParallelOperationJoinExp(hParallelOperation);
+  });
+
+  for (ze_result_t joinResult : joinResults)
+    if (joinResult != ZE_RESULT_SUCCESS)
+      throw std::runtime_error("zeRTASParallelOperationJoinExp failed");
+
+  /* destroy parallel operation again */
+  err = ZeWrapper::zeRTASParallelOperationDestroyExp(hParallelOperation);
+  if (err != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("zeRTASParallelOperationDestroyExp failed");
+
+  /* destroy rtas builder again */
+  err = ZeWrapper::zeRTASBuilderDestroyExp(hBuilder);
+  if (err != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("zeRTASBuilderDestroyExp failed");
+
+  return accel;
+}
+
+/* Shades pixel (x,y): traces one primary ray from a fixed camera through the
+ * acceleration structure `bvh` and stores the barycentrics (u, v, 1-u-v) of
+ * the committed hit as an 8-bit-per-channel color in `pixels`. Without a
+ * committed hit u and v stay 0. */
+void render(unsigned int x, unsigned int y, void* bvh, unsigned int* pixels, unsigned int width, unsigned int height)
+{
+  const unsigned int pixelIndex = y*width+x;
+
+  /* write zero image if ray tracing extension is not supported */
+  const intel_raytracing_ext_flag_t support = intel_get_raytracing_ext_flag();
+  if (!(support & intel_raytracing_ext_flag_ray_query)) {
+    pixels[pixelIndex] = 0;
+    return;
+  }
+
+  /* fixed camera */
+  sycl::float3 vx(-1.f, -0.f, -0.f);
+  sycl::float3 vy(-0.f, -1.f, -0.f);
+  sycl::float3 vz(32.f, 32.f, 95.6379f);
+  sycl::float3 p(278.f, 273.f, -800.f);
+
+  /* compute primary ray */
+  intel_ray_desc_t ray;
+  ray.origin = p;
+  ray.direction = float(x)*vx*64.0f/float(width) + float(y)*vy*64/float(height) + vz;
+  ray.tmin = 0.0f;
+  ray.tmax = INFINITY;
+  ray.mask = 0xFF;
+  ray.flags = intel_ray_flags_none;
+
+  /* trace ray */
+  intel_ray_query_t query = intel_ray_query_init(ray,(intel_raytracing_acceleration_structure_t)bvh);
+  intel_ray_query_start_traversal(query);
+  intel_ray_query_sync(query);
+
+  /* get UVs of hit point */
+  float u = 0, v = 0;
+  if (intel_has_committed_hit(query))
+  {
+    const sycl::float2 barycentrics = intel_get_hit_barycentrics( query, intel_hit_type_committed_hit );
+    u = barycentrics.x();
+    v = barycentrics.y();
+  }
+
+  /* write color to framebuffer: u in bits 0-7, v in bits 8-15, 1-u-v in bits 16-23 */
+  const float w = 1.0f-u-v;
+  const unsigned int r = (unsigned int) (255.0f * u);
+  const unsigned int g = (unsigned int) (255.0f * v);
+  const unsigned int b = (unsigned int) (255.0f * w);
+  pixels[pixelIndex] = (b << 16) + (g << 8) + r;
+}
+
+/* Entry point of the cornell box test.
+ *
+ * Supported command line arguments:
+ *   --compare <file>            compare the rendered image against <file>
+ *   --internal-rtas-builder     force the internal RTAS builder
+ *   --level-zero-rtas-builder   force the Level Zero RTAS builder
+ *   --default-rtas-builder      select the builder automatically (AUTO mode)
+ *   --size <width> <height>     image size, each in the range 1..4096
+ *
+ * The image is always written to "cornell_box.tga". Returns 0 when no
+ * reference image was requested or when the difference reported by
+ * compareTga is below 32, and 1 otherwise. A std::runtime_error (bad
+ * arguments, failing Level Zero calls, failed framebuffer allocation) is
+ * reported on stderr and also yields 1. */
+int main(int argc, char* argv[]) try
+{
+  /* compile-time default for the RTAS build mode, may be overridden below */
+#if defined(EMBREE_SYCL_L0_RTAS_BUILDER)
+  ZeWrapper::RTAS_BUILD_MODE rtas_build_mode = ZeWrapper::RTAS_BUILD_MODE::LEVEL_ZERO;
+#else
+  ZeWrapper::RTAS_BUILD_MODE rtas_build_mode = ZeWrapper::RTAS_BUILD_MODE::INTERNAL;
+#endif
+
+  /* user can specify reference image to compare against */
+  char* reference_img = NULL;
+  for (int i=1; i<argc; i++)
+  {
+    if (strcmp(argv[i], "--compare") == 0) {
+      if (++i >= argc) throw std::runtime_error("--compare: filename expected");
+      reference_img = argv[i];
+    }
+    else if (strcmp(argv[i], "--internal-rtas-builder") == 0) {
+      rtas_build_mode = ZeWrapper::RTAS_BUILD_MODE::INTERNAL;
+    }
+    else if (strcmp(argv[i], "--level-zero-rtas-builder") == 0) {
+      rtas_build_mode = ZeWrapper::RTAS_BUILD_MODE::LEVEL_ZERO;
+    }
+    else if (strcmp(argv[i], "--default-rtas-builder") == 0) {
+      rtas_build_mode = ZeWrapper::RTAS_BUILD_MODE::AUTO;
+    }
+    else if (strcmp(argv[i], "--size") == 0) {
+      if (++i >= argc) throw std::runtime_error("--size: width expected");
+      global_width = atoi(argv[i]);
+      if (++i >= argc) throw std::runtime_error("--size: height expected");
+      global_height = atoi(argv[i]);
+      if (global_width == 0) throw std::runtime_error("--size: width is zero");
+      if (global_height == 0) throw std::runtime_error("--size: height is zero");
+      if (global_width > 4096) throw std::runtime_error("--size: width too large");
+      if (global_height > 4096) throw std::runtime_error("--size: height too large");
+    }
+    else {
+      throw std::runtime_error("unknown command line argument");
+    }
+  }
+
+  /* create SYCL objects */
+  sycl::device device = sycl::device(sycl::gpu_selector_v);
+  sycl::queue queue = sycl::queue(device,exception_handler);
+  sycl::context context = queue.get_context();
+
+  if (ZeWrapper::init() != ZE_RESULT_SUCCESS) {
+    std::cerr << "ZeWrapper not successfully initialized" << std::endl;
+    return 1;
+  }
+
+  ze_result_t result = ZE_RESULT_SUCCESS;
+  sycl::platform platform = device.get_platform();
+  ze_driver_handle_t hDriver = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(platform);
+
+  /* in AUTO mode use the RTAS builder extension only when the driver reports it */
+  if (rtas_build_mode == ZeWrapper::RTAS_BUILD_MODE::AUTO)
+  {
+    /* first call with an empty vector only queries the number of extensions */
+    uint32_t count = 0;
+    std::vector<ze_driver_extension_properties_t> extensions;
+    result = ZeWrapper::zeDriverGetExtensionProperties(hDriver,&count,extensions.data());
+    if (result != ZE_RESULT_SUCCESS)
+      throw std::runtime_error("zeDriverGetExtensionProperties failed");
+
+    /* second call fills in the extension properties */
+    extensions.resize(count);
+    result = ZeWrapper::zeDriverGetExtensionProperties(hDriver,&count,extensions.data());
+    if (result != ZE_RESULT_SUCCESS)
+      throw std::runtime_error("zeDriverGetExtensionProperties failed");
+
+    bool ze_rtas_builder = false;
+    for (uint32_t i=0; i<extensions.size(); i++)
+    {
+      if (strncmp("ZE_experimental_rtas_builder",extensions[i].name,sizeof(extensions[i].name)) == 0)
+        ze_rtas_builder = true;
+    }
+
+    if (ze_rtas_builder)
+      result = ZeWrapper::initRTASBuilder(hDriver,ZeWrapper::RTAS_BUILD_MODE::AUTO);
+    else
+      result = ZeWrapper::initRTASBuilder(hDriver,ZeWrapper::RTAS_BUILD_MODE::INTERNAL);
+  }
+  else
+    result = ZeWrapper::initRTASBuilder(hDriver,rtas_build_mode);
+
+  if (result == ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE)
+    throw std::runtime_error("cannot load ZE_experimental_rtas_builder extension");
+
+  if (result != ZE_RESULT_SUCCESS)
+    throw std::runtime_error("cannot initialize ZE_experimental_rtas_builder extension");
+
+  if (ZeWrapper::rtas_builder == ZeWrapper::INTERNAL)
+    std::cout << "using internal RTAS builder" << std::endl;
+  else
+    std::cout << "using Level Zero RTAS builder" << std::endl;
+
+#if defined(ZE_RAYTRACING_RT_SIMULATION)
+  RTCore::Init();
+  RTCore::SetXeVersion((RTCore::XeVersion)ZE_RAYTRACING_DEVICE);
+#endif
+
+#if defined(EMBREE_SYCL_ALLOC_DISPATCH_GLOBALS)
+  dispatchGlobalsPtr = allocDispatchGlobals(device,context);
+#endif
+
+  /* build acceleration structure */
+  void* bvh = build_rtas(device,context);
+
+  /* creates framebuffer in shared USM and clears it */
+  const uint32_t width = global_width;
+  const uint32_t height = global_height;
+  const size_t framebuffer_bytes = size_t(width)*size_t(height)*sizeof(unsigned int);
+  unsigned int* pixels = (unsigned int*) sycl::aligned_alloc(64,framebuffer_bytes,device,context,sycl::usm::alloc::shared);
+  if (pixels == nullptr)
+    throw std::runtime_error("cannot allocate framebuffer");
+  memset(pixels, 0, framebuffer_bytes);
+
+  /* renders image, on the host when simulating and on the device otherwise */
+#if defined(ZE_RAYTRACING_RT_SIMULATION)
+  tbb::parallel_for(tbb::blocked_range2d<uint32_t>(0,height,0,width),
+     [&](const tbb::blocked_range2d<uint32_t>& r) {
+        for (uint32_t y=r.rows().begin(); y<r.rows().end(); y++) {
+          for (uint32_t x=r.cols().begin(); x<r.cols().end(); x++) {
+            render(x,y,bvh,pixels,width,height);
+          }
+        }
+     });
+#else
+  queue.submit([&](sycl::handler& cgh) {
+                 const sycl::range<2> range(width,height);
+                 cgh.parallel_for(range, [=](sycl::item<2> item) {
+                                              const uint32_t x = item.get_id(0);
+                                              const uint32_t y = item.get_id(1);
+                                              render(x,y,bvh,pixels,width,height);
+                                            });
+               });
+  queue.wait_and_throw();
+#endif
+
+  /* free acceleration structure again */
+  free_accel_buffer(bvh,context);
+
+#if defined(EMBREE_SYCL_ALLOC_DISPATCH_GLOBALS)
+  free_accel_buffer(dispatchGlobalsPtr, context);
+#endif
+
+#if defined(ZE_RAYTRACING_RT_SIMULATION)
+  RTCore::Cleanup();
+#endif
+
+  /* store image to disk, the framebuffer is not needed afterwards */
+  storeTga(pixels,width,height,"cornell_box.tga");
+  sycl::free(pixels,context);
+  if (!reference_img) return 0;
+
+  /* compare to the reference image given via --compare */
+  const size_t err = compareTga("cornell_box.tga", reference_img);
+  std::cout << "difference to reference image is " << err << std::endl;
+  const bool ok = err < 32;
+  std::cout << "cornell_box ";
+  if (ok) std::cout << "[PASSED]" << std::endl;
+  else    std::cout << "[FAILED]" << std::endl;
+
+  return ok ? 0 : 1;
+}
+catch (const std::runtime_error& e) {
+  /* catch by const reference to avoid copying/slicing the exception */
+  std::cerr << "std::runtime_error: " << e.what() << std::endl;
+  return 1;
+}
--- a/Framework/external/embree/kernels/rthwif/testing/rthwif_test.cpp
+++ b/Framework/external/embree/kernels/rthwif/testing/rthwif_test.cpp