Skip to content
This repository has been archived by the owner on Sep 23, 2024. It is now read-only.

Commit

Permalink
kernel*.ispc modify use of rcp & rsqrt
Browse files Browse the repository at this point in the history
Consolidate rcp and rsqrt into inline functions (RCP, RSQRT) so that the level of approximation is picked explicitly in one place. Note: the .ispc files are still compiled with --opt=fast-math, so rcp and rsqrt are still used.
Also fixes color scaling for ASTC solid-color blocks (issue #28).
  • Loading branch information
DaveBookout-Intel committed Mar 5, 2021
1 parent c02540b commit 1587ff5
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 8 deletions.
28 changes: 23 additions & 5 deletions ispc_texcomp/kernel.ispc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@
// SOFTWARE.
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// These types are defined by ISPC itself in version 1.13.0 and later, so they
// are only declared here when the compiler has not already provided them.
// NOTE(review): the matching #endif is not visible in this excerpt - confirm
// it still follows the typedefs.
#ifndef ISPC_UINT_IS_DEFINED
typedef unsigned int8 uint8;
typedef unsigned int32 uint32;
typedef unsigned int64 uint64;
Expand All @@ -22,6 +26,20 @@ typedef unsigned int64 uint64;
///////////////////////////
// generic helpers

// Reciprocal helper: returns 1/x.
// Single place to choose how the reciprocal is computed. Alternatives that
// can be swapped in for the division below:
//     rcp(x)
//     rcp_fast(x)
inline float RCP(float x)
{
    // Plain division; uses rcp when compiled with --opt=fast-math.
    const float reciprocal = 1.0f / x;
    return reciprocal;
}

// Reciprocal square root helper: returns 1/sqrt(x).
// Single place to choose how the reciprocal square root is computed.
// Alternatives that can be swapped in for the expression below:
//     rsqrt(x)
//     rsqrt_fast(x)
inline float RSQRT(float x)
{
    // Division by sqrt; uses rsqrt when compiled with --opt=fast-math.
    const float inv_root = 1.0f / sqrt(x);
    return inv_root;
}

inline void swap_ints(int u[], int v[], uniform int n)
{
for (uniform int i=0; i<n; i++)
Expand Down Expand Up @@ -197,7 +215,7 @@ inline void compute_axis3(float axis[3], float covar[6], uniform const int power
for (uniform int p=0; p<3; p++)
norm_sq += axis[p]*axis[p];

float rnorm = rsqrt(norm_sq);
float rnorm = RSQRT(norm_sq);
for (uniform int p=0; p<3; p++) vec[p] *= rnorm;
}
}
Expand All @@ -221,7 +239,7 @@ inline void compute_axis(float axis[4], float covar[10], uniform const int power
for (uniform int p=0; p<channels; p++)
norm_sq += axis[p]*axis[p];

float rnorm = rsqrt(norm_sq);
float rnorm = RSQRT(norm_sq);
for (uniform int p=0; p<channels; p++) vec[p] *= rnorm;
}
}
Expand Down Expand Up @@ -298,7 +316,7 @@ inline void pick_endpoints(float c0[3], float c1[3], float block[48], float axis
for (uniform int p=0; p<3; p++)
norm_sq += axis[p]*axis[p];

float rnorm_sq = rcp(norm_sq);
float rnorm_sq = RCP(norm_sq);
for (uniform int p=0; p<3; p++)
{
c0[p] = clamp(dc[p]+min_dot*rnorm_sq*axis[p], 0, 255);
Expand All @@ -319,7 +337,7 @@ inline uint32 fast_quant(float block[48], int p0, int p1)
float sq_norm = 0;
for (uniform int p=0; p<3; p++) sq_norm += sq(dir[p]);

float rsq_norm = rcp(sq_norm);
float rsq_norm = RCP(sq_norm);

for (uniform int p=0; p<3; p++) dir[p] *= rsq_norm*3;

Expand Down Expand Up @@ -464,7 +482,7 @@ inline void bc1_refine(int pe[2], float block[48], unsigned int32 bits, float dc
float Cxx = 16*sq(3)-2*3*sum_q+sum_qq;
float Cyy = sum_qq;
float Cxy = 3*sum_q-sum_qq;
float scale = 3f * rcp(Cxx*Cyy - Cxy*Cxy);
float scale = 3f * RCP(Cxx*Cyy - Cxy*Cxy);

for (uniform int p=0; p<3; p++)
{
Expand Down
19 changes: 16 additions & 3 deletions ispc_texcomp/kernel_astc.ispc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,20 @@ typedef unsigned int64 uint64_t;
///////////////////////////
// generic helpers

// Reciprocal helper: returns 1/x.
// Single place to choose how the reciprocal is computed. Alternatives that
// can be swapped in for the division below:
//     rcp(x)
//     rcp_fast(x)
inline float RCP(float x)
{
    // Plain division; uses rcp when compiled with --opt=fast-math.
    const float reciprocal = 1.0f / x;
    return reciprocal;
}

// Reciprocal square root helper: returns 1/sqrt(x).
// Single place to choose how the reciprocal square root is computed.
// Alternatives that can be swapped in for the expression below:
//     rsqrt(x)
//     rsqrt_fast(x)
inline float RSQRT(float x)
{
    // Division by sqrt; uses rsqrt when compiled with --opt=fast-math.
    const float inv_root = 1.0f / sqrt(x);
    return inv_root;
}

void swap(float& a, float& b)
{
int t = a;
Expand Down Expand Up @@ -295,7 +309,7 @@ inline void compute_axis(float axis[4], float covar[10], uniform const int power
for (uniform int p = 0; p < channels; p++)
norm_sq += axis[p] * axis[p];

float rnorm = rsqrt(norm_sq);
float rnorm = RSQRT(norm_sq);
for (uniform int p = 0; p < channels; p++) vec[p] *= rnorm;
}
}
Expand Down Expand Up @@ -1721,8 +1735,7 @@ void ls_refine_scale(float endpoints[4], float scaled_pixels[], astc_block block
float scale = xx[0] / (xx[1] + xx[0]);

if (xx[1] + xx[0] < 1) scale = 1;
if (scale > 0.9999) scale = 0.9999;
if (scale < 0) scale = 0;
scale = clamp(scale, 0.0f, 0.9999f); // note: clamp also takes care of possible NaNs

float sum_zz = 0;
float sum_zp[3] = { 0, 0, 0 };
Expand Down

0 comments on commit 1587ff5

Please sign in to comment.