Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 1 addition & 26 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -343,31 +343,6 @@ jobs:
OPENIMAGEIO_CMAKE_FLAGS="-DUSE_PYTHON=0"
CMAKE_BUILD_TYPE=RelWithDebInfo

- desc: icc/C++17 llvm14 py3.9 oiio-2.5 avx2
nametag: linux-icc
runner: ubuntu-latest
container: aswf/ci-osl:2023-clang15
cc_compiler: icc
cxx_compiler: icpc
cxx_std: 17
fmt_ver: 7.1.3
opencolorio_ver: v2.3.2
openimageio_ver: v2.5.17.0
# Changes to OIIO's simd.h starting in commit 68666db9 (from PR
# #4187) seem to trigger compiler bugs in icc and generate wrong
# SIMD code. It's probably not worth tracking down for just this
# obsolete compiler. Just lock down to OIIO 2.5 for icc builds to
# avoid the problem.
# openimageio_ver: e41ac03c0b21 # works
# openimageio_ver: 68666db994d5 # broken
python_ver: "3.10"
pybind11_ver: v2.10.0
# simd: avx2,f16c
batched: b8_AVX2_noFMA
setenvs: export OSL_CMAKE_FLAGS="-DSTOP_ON_WARNING=OFF -DEXTRA_CPP_ARGS=-fp-model=consistent"
OPENIMAGEIO_CMAKE_FLAGS=-DBUILD_FMT_VERSION=7.1.3
USE_OPENVDB=0
OPENCOLORIO_CMAKE_FLAGS="-DCMAKE_CXX_COMPILER=g++"
- desc: icx/C++17 llvm14 py3.10 oiio-3.0 avx2
nametag: linux-icx
runner: ubuntu-latest
Expand All @@ -380,7 +355,7 @@ jobs:
cxx_std: 17
fmt_ver: 7.1.3
opencolorio_ver: v2.3.2
openimageio_ver: v3.0.11.0
openimageio_ver: v3.0.15.0
python_ver: "3.10"
pybind11_ver: v2.10.0
simd: avx2,f16c
Expand Down
2 changes: 0 additions & 2 deletions src/include/OSL/Imathx/Imathx.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,9 +248,7 @@ affineInverse(const Matrix44 &m)
// differently than the LLVM IR version.
// NOTE: only using "inline" to get ODR (One Definition Rule) behavior
static inline OSL_HOSTDEVICE Matrix44
#if !OSL_INTEL_CLASSIC_COMPILER_VERSION
OSL_GNUC_ATTRIBUTE(optimize("fp-contract=off"))
#endif
nonAffineInverse(const Matrix44 &source);

Matrix44 OSL_HOSTDEVICE nonAffineInverse(const Matrix44 &source)
Expand Down
13 changes: 0 additions & 13 deletions src/include/OSL/mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,6 @@ OSL_NAMESPACE_BEGIN
using std::popcount;
using std::countr_zero;

#elif OSL_INTEL_CLASSIC_COMPILER_VERSION

#include <immintrin.h>

// Number of set bits in a 32-bit value, forwarded to _mm_popcnt_u32.
OSL_FORCEINLINE int popcount(uint32_t x) noexcept { return _mm_popcnt_u32(x);}
// 64-bit overload, forwarded to _mm_popcnt_u64.
OSL_FORCEINLINE int popcount(uint64_t x) noexcept { return _mm_popcnt_u64(x); }
// Index of the lowest set bit of a 32-bit value, via _bit_scan_forward.
// NOTE(review): bit-scan intrinsics presumably leave the result undefined
// for x == 0, whereas std::countr_zero returns the bit width -- confirm
// that callers never pass 0.
OSL_FORCEINLINE int countr_zero(uint32_t x) noexcept { return _bit_scan_forward(x); }
// 64-bit overload: _BitScanForward64 writes the bit index through its first
// argument; its own return value is ignored here.
// NOTE(review): `index` is not initialized before the call, so the result
// for x == 0 depends on whether the intrinsic writes it -- confirm.
OSL_FORCEINLINE int countr_zero(uint64_t x) noexcept {
unsigned __int32 index;
_BitScanForward64(&index, x);
return static_cast<int>(index);
}

#elif defined(__GNUC__) || defined(__clang__)

OSL_FORCEINLINE int popcount(uint32_t x) noexcept { return __builtin_popcount(x); }
Expand Down
37 changes: 1 addition & 36 deletions src/include/OSL/oslnoise.h
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ OSL_FORCEINLINE OSL_HOSTDEVICE Dual2<float> select(const bool b, const Dual2<flo
// versus requiring a stack location.
// Without this work per component, gathers & scatters were being emitted
// when used inside SIMD loops.
#if OSL_ANY_CLANG && !OSL_INTEL_CLASSIC_COMPILER_VERSION && !OSL_INTEL_LLVM_COMPILER_VERSION
#if OSL_ANY_CLANG && !OSL_INTEL_LLVM_COMPILER_VERSION
// Clang's vectorizer was really insistent that a select operation could not be replaced
// with control flow, so we had to re-introduce the ? operator to make it happy
return Dual2<float> (
Expand Down Expand Up @@ -2254,7 +2254,6 @@ OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Dual2<Vec3> &result, const H &hash,

// With Dual2<Vec3> data types, a lot of code is generated below
// which caused some runaway compiler memory consumption when vectorizing
#if !OSL_INTEL_CLASSIC_COMPILER_VERSION
auto l_result = OIIO::lerp (
OIIO::trilerp (grad (hash (X , Y , Z , W ), fx , fy , fz , fw ),
grad (hash (X+1, Y , Z , W ), fx-1.0f, fy , fz , fw ),
Expand All @@ -2275,40 +2274,6 @@ OSL_FORCEINLINE OSL_HOSTDEVICE void perlin (Dual2<Vec3> &result, const H &hash,
grad (hash (X+1, Y+1, Z+1, W+1), fx-1.0f, fy-1.0f, fz-1.0f, fw-1.0f),
u, v, t),
s);
#else
// Use a loop to avoid repeating code gen twice
Dual2<Vec3> v0, v1;
// GCC emits -Wmaybe-uninitialized errors for v0,v1.
// To avoid, GCC uses reference version above

// Clang doesn't want to vectorize with the vIndex loop
// To enable vectorization, Clang uses reference version above
OSL_INTEL_PRAGMA(nounroll_and_jam)
for(int vIndex=0; vIndex < 2;++vIndex) {
int vW = W + vIndex;
Dual2<float> vfw = fw - float(vIndex);

Dual2<Vec3> vResult = OIIO::trilerp (
grad (hash (X , Y , Z , vW ), fx , fy , fz , vfw ),
grad (hash (X+1, Y , Z , vW ), fx-1.0f, fy , fz , vfw ),
grad (hash (X , Y+1, Z , vW ), fx , fy-1.0f, fz , vfw ),
grad (hash (X+1, Y+1, Z , vW ), fx-1.0f, fy-1.0f, fz , vfw ),
grad (hash (X , Y , Z+1, vW ), fx , fy , fz-1.0f, vfw ),
grad (hash (X+1, Y , Z+1, vW ), fx-1.0f, fy , fz-1.0f, vfw ),
grad (hash (X , Y+1, Z+1, vW ), fx , fy-1.0f, fz-1.0f, vfw ),
grad (hash (X+1, Y+1, Z+1, vW ), fx-1.0f, fy-1.0f, fz-1.0f, vfw ),
u, v, t);
// Rather than dynamic indexing array,
// use masking to store outputs,
// to better enable SROA (Scalar Replacement of Aggregates) optimizations
if (vIndex == 0) {
v0 = vResult;
} else {
v1 = vResult;
}
}
auto l_result = OIIO::lerp (v0, v1, s);
#endif

result = scale4 (l_result);
}
Expand Down
18 changes: 1 addition & 17 deletions src/include/OSL/sfmath.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,23 +68,7 @@ namespace sfm
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#if OSL_INTEL_CLASSIC_COMPILER_VERSION
// std::isinf wasn't vectorizing and was branchy. This slightly
// perturbed version fares better and is branch-free when vectorized
// with the Intel compiler.
// Returns 0 when x is finite or NaN; otherwise returns the result of
// copysignf(1.0f, x) converted to int, i.e. +1 for +infinity and -1 for
// -infinity.
OSL_FORCEINLINE OSL_HOSTDEVICE int isinf (float x) {
// Default result covers the finite and NaN cases.
int r = 0;
// NOTE: using bitwise | to avoid branches
// The test is true only when x is neither finite nor NaN.
if (!(std::isfinite(x)|std::isnan(x))) {
r = static_cast<int>(copysignf(1.0f,x));
}
return r;
}
#else
// Other compilers don't seem to vectorize well no matter what, so just
// use the standard version.
using std::isinf;
#endif

template<typename T>
OSL_FORCEINLINE OSL_HOSTDEVICE T
Expand Down Expand Up @@ -191,7 +175,7 @@ namespace sfm
}
}

#if OSL_ANY_CLANG && !OSL_INTEL_CLASSIC_COMPILER_VERSION && !OSL_INTEL_LLVM_COMPILER_VERSION
#if OSL_ANY_CLANG && !OSL_INTEL_LLVM_COMPILER_VERSION

// To make clang's loop vectorizer happy
// we need to make sure result of min and max
Expand Down
5 changes: 2 additions & 3 deletions src/include/OSL/wide.h
Original file line number Diff line number Diff line change
Expand Up @@ -1904,7 +1904,7 @@ struct WideImpl<const Dual2<ElementT>[], WidthT, true /*IsConstT */> {
} // namespace pvt


#if OSL_INTEL_CLASSIC_COMPILER_VERSION || OSL_GNUC_VERSION
#if OSL_GNUC_VERSION
// Workaround for error #3466: inheriting constructors must be inherited from a direct base class
# define __OSL_INHERIT_BASE_CTORS(DERIVED, BASE) \
using Base = typename DERIVED::BASE; \
Expand Down Expand Up @@ -3210,8 +3210,7 @@ template<typename DataT, int WidthT>
OSL_FORCEINLINE bool
testIfAnyLaneIsNonZero(const Wide<DataT, WidthT>& wvalues)
{
#if OSL_ANY_CLANG && !OSL_INTEL_CLASSIC_COMPILER_VERSION \
&& !OSL_INTEL_LLVM_COMPILER_VERSION
#if OSL_ANY_CLANG && !OSL_INTEL_LLVM_COMPILER_VERSION
int anyLaneIsOn = 0;
OSL_OMP_PRAGMA(omp simd simdlen(WidthT) reduction(max : anyLaneIsOn))
for (int i = 0; i < WidthT; ++i) {
Expand Down
3 changes: 1 addition & 2 deletions src/liboslexec/opcolor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,7 @@ hsv_to_rgb(const COLOR3& hsv)
// Avoid a switch statement, which the vectorizer doesn't like.
// Also avoid an if/else nest, which some optimizers might
// convert back into a switch statement.
# if OSL_ANY_CLANG && !OSL_INTEL_CLASSIC_COMPILER_VERSION \
&& !OSL_INTEL_LLVM_COMPILER_VERSION
# if OSL_ANY_CLANG && !OSL_INTEL_LLVM_COMPILER_VERSION
// Clang was still transforming series of if's back into a switch.
// Alternate between == and <= comparisons to avoid
# define __OSL_ASC_EQ <=
Expand Down
2 changes: 1 addition & 1 deletion src/liboslexec/wide/wide_opcolor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ namespace {

// Note: Clang 14 seems to no longer allow vectorizing these loops
#if ((OSL_CLANG_VERSION && OSL_CLANG_VERSION < 140000) \
|| OSL_INTEL_CLASSIC_COMPILER_VERSION || OSL_INTEL_LLVM_COMPILER_VERSION)
|| OSL_INTEL_LLVM_COMPILER_VERSION)
# define WIDE_TRANSFORMC_OMP_SIMD_LOOP(...) OSL_OMP_SIMD_LOOP(__VA_ARGS__)
#else
# define WIDE_TRANSFORMC_OMP_SIMD_LOOP(...)
Expand Down
10 changes: 0 additions & 10 deletions src/liboslexec/wide/wide_opnoise_periodic_perlin_deriv_Vec3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,7 @@ template<> struct BatchedCGPolicy<Param::WDV, Param::WDV, Param::WV> {
};
// Policy specialization selecting simd_threshold for the
// (WDV, WDV, WDF, WV, WF) parameter combination.
// NOTE(review): simd_threshold is presumably the lane count at which the
// SIMD code path is chosen -- confirm against the primary BatchedCGPolicy.
template<>
struct BatchedCGPolicy<Param::WDV, Param::WDV, Param::WDF, Param::WV, Param::WF> {
#if ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
// Avoid stack overflow on windows build because compiler has hard coded stack limit.
// By not forcing everything to recursively inline and not explicitly vectorizing,
// the compiler uses less stack space at the cost of NOT creating a properly SIMD optimized function.
// Linux & OSX can increase stacksize before building

static constexpr int simd_threshold
= __OSL_WIDTH + 1; // Make SIMD code path unreachable
#else
// Every other platform/compiler combination uses this fixed threshold.
static constexpr int simd_threshold = 6;
#endif
};
} // namespace

Expand Down
10 changes: 0 additions & 10 deletions src/liboslexec/wide/wide_opnoise_periodic_uperlin_deriv_Vec3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,7 @@ template<> struct BatchedCGPolicy<Param::WDV, Param::WDV, Param::WV> {
};
// Policy specialization selecting simd_threshold for the
// (WDV, WDV, WDF, WV, WF) parameter combination.
// NOTE(review): simd_threshold is presumably the lane count at which the
// SIMD code path is chosen -- confirm against the primary BatchedCGPolicy.
template<>
struct BatchedCGPolicy<Param::WDV, Param::WDV, Param::WDF, Param::WV, Param::WF> {
#if ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
// Avoid stack overflow on windows build because compiler has hard coded stack limit.
// By not forcing everything to recursively inline and not explicitly vectorizing,
// the compiler uses less stack space at the cost of NOT creating a properly SIMD optimized function.
// Linux & OSX can increase stacksize before building

static constexpr int simd_threshold
= __OSL_WIDTH + 1; // Make SIMD code path unreachable
#else
// Every other platform/compiler combination uses this fixed threshold.
static constexpr int simd_threshold = 6;
#endif
};
} // namespace

Expand Down
10 changes: 0 additions & 10 deletions src/liboslexec/wide/wide_opnoise_perlin_deriv_Vec3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,7 @@ template<> struct BatchedCGPolicy<Param::WDV, Param::WDV> {
static constexpr int simd_threshold = 5;
};
// Policy specialization selecting simd_threshold for the (WDV, WDV, WDF)
// parameter combination.
// NOTE(review): simd_threshold is presumably the lane count at which the
// SIMD code path is chosen -- confirm against the primary BatchedCGPolicy.
template<> struct BatchedCGPolicy<Param::WDV, Param::WDV, Param::WDF> {
#if ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
// Avoid stack overflow on windows build because compiler has hard coded stack limit.
// By not forcing everything to recursively inline and not explicitly vectorizing,
// the compiler uses less stack space at the cost of NOT creating a properly SIMD optimized function.
// Linux & OSX can increase stacksize before building

static constexpr int simd_threshold
= __OSL_WIDTH + 1; // Make SIMD code path unreachable
#else
// Every other platform/compiler combination uses this fixed threshold.
static constexpr int simd_threshold = 6;
#endif
};
} // namespace

Expand Down
10 changes: 0 additions & 10 deletions src/liboslexec/wide/wide_opnoise_uperlin_deriv_Vec3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,7 @@ template<> struct BatchedCGPolicy<Param::WDV, Param::WDV> {
static constexpr int simd_threshold = 5;
};
// Policy specialization selecting simd_threshold for the (WDV, WDV, WDF)
// parameter combination.
// NOTE(review): simd_threshold is presumably the lane count at which the
// SIMD code path is chosen -- confirm against the primary BatchedCGPolicy.
template<> struct BatchedCGPolicy<Param::WDV, Param::WDV, Param::WDF> {
#if ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
// Avoid stack overflow on windows build because compiler has hard coded stack limit.
// By not forcing everything to recursively inline and not explicitly vectorizing,
// the compiler uses less stack space at the cost of NOT creating a properly SIMD optimized function.
// Linux & OSX can increase stacksize before building

static constexpr int simd_threshold
= __OSL_WIDTH + 1; // Make SIMD code path unreachable
#else
// Every other platform/compiler combination uses this fixed threshold.
static constexpr int simd_threshold = 6;
#endif
};
} // namespace

Expand Down
2 changes: 1 addition & 1 deletion src/liboslexec/wide/wide_opstring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ __OSL_MASKED_OP2(strlen, Wi, Ws)(void* wr_, void* ws_, unsigned int mask_value)

OSL_FORCEINLINE_BLOCK
{
#if (!OSL_CLANG_VERSION || OSL_INTEL_CLASSIC_COMPILER_VERSION)
#if !OSL_CLANG_VERSION
// Clang 11 generated SIMD crashes at runtime
// TODO: investigate clang crash when vectorizing
OSL_OMP_PRAGMA(omp simd simdlen(__OSL_WIDTH))
Expand Down
14 changes: 1 addition & 13 deletions src/liboslnoise/sfm_gabornoise.h
Original file line number Diff line number Diff line change
Expand Up @@ -356,19 +356,7 @@ gabor_cell(const sfm::GaborUniformParams& gup, const sfm::GaborParams& gp,
Dual2<float> gk = gabor_kernel(w_i_t_s_f, omega_i_t_s_f,
phi_i_t_s_f, a_i_t_s_f,
x_k_i_t); // 2D
#if defined(__AVX512F__) && defined(__INTEL_COMPILER) \
&& (__INTEL_COMPILER < 1800)
// icc17 with AVX512 had some incorrect results
// due to the not_finite code path executing even
// when the value was finite. Workaround: using isnan | isinf
// instead of isfinite avoided the issue.
// icc18u3 doesn't exhibit the problem
// NOTE: tried using bitwise | to avoid branches and got internal compiler error
//bool not_finite = std::isnan(gk.val()) | std::isinf(gk.val());
bool not_finite = std::isnan(gk.val()) || std::isinf(gk.val());
#else
bool not_finite = !std::isfinite(gk.val());
#endif
bool not_finite = !std::isfinite(gk.val());
if (OSL_UNLIKELY(not_finite)) {
// Numeric failure of the filtered version. Fall
// back on the unfiltered.
Expand Down
10 changes: 0 additions & 10 deletions src/testshade/batched_simplerend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,17 +180,7 @@ BatchedSimpleRenderer<WidthT>::get_matrix(BatchedShaderGlobals* bsg,

OSL_OMP_PRAGMA(omp simd simdlen(WidthT))
for (int lane = 0; lane < WidthT; ++lane) {
# if __INTEL_COMPILER >= 1900
// Used load + blend + store instead of masked store to temporarily work around
// an icc19u5 issue when automatic ISA dispatch is used causing scatters to be generated
Matrix44 m = result[lane];
if (result.mask()[lane]) {
m = uniformTransform;
}
result[ActiveLane(lane)] = m;
# else
result[lane] = uniformTransform;
# endif
}
#endif
}
Expand Down
Loading
Loading