diff --git a/scene/xml/whiskey.xml b/scene/xml/whiskey.xml
index c2c134d..def7a14 100644
--- a/scene/xml/whiskey.xml
+++ b/scene/xml/whiskey.xml
@@ -84,11 +84,11 @@
-
+
diff --git a/src/core/aabb.cuh b/src/core/aabb.cuh
index 1fe4d84..c6f903f 100644
--- a/src/core/aabb.cuh
+++ b/src/core/aabb.cuh
@@ -44,16 +44,15 @@ public:
CPT_CPU_GPU Vec3 centroid() const noexcept {return (maxi + mini) * 0.5f;}
CPT_CPU_GPU Vec3 range() const noexcept {return maxi - mini;}
- CPT_CPU_GPU bool intersect(const Ray& ray, float& t_near) const {
- auto invDir = 1.0f / ray.d;
- // long scoreboard
- auto t1s = (mini - ray.o) * invDir;
+ CPT_GPU bool intersect(const Ray& ray, float& t_near) const {
+ auto invDir = ray.d.rcp();
+ auto t1s = (mini - ray.o) * invDir; // profiler note: long-scoreboard stall reported on this line (presumably waiting on the loads of mini / ray.o; confirm in the profiler)
auto t2s = (maxi - ray.o) * invDir;
float tmin = t1s.minimize(t2s).max_elem();
float tmax = t1s.maximize(t2s).min_elem();
t_near = tmin;
- return tmax > tmin && tmax > 0;
+ return (tmax > tmin) && (tmax > 0); // NOTE(review): profiler flagged a local-memory access around this return; the cause is not evident from this code, confirm
}
CONDITION_TEMPLATE(AABBType, AABB)
@@ -104,5 +103,5 @@ struct AABBWrapper {
float4 _padding; // padding is here to avoid bank conflict
};
-using ConstAABBPtr = const AABB* const;
-using ConstAABBWPtr = const AABBWrapper* const;
\ No newline at end of file
+using ConstAABBPtr = const AABB* const __restrict__;
+using ConstAABBWPtr = const AABBWrapper* const __restrict__;
\ No newline at end of file
diff --git a/src/core/aos.cuh b/src/core/aos.cuh
index d4c7ce2..3d1b6d3 100644
--- a/src/core/aos.cuh
+++ b/src/core/aos.cuh
@@ -212,7 +212,7 @@ public:
#undef INDEX_Y
#undef INDEX_Z
-using ConstF4Ptr = const float4* const;
-using ConstVertPtr = const PrecomputedArray* const;
-using ConstNormPtr = const ArrayType* const;
-using ConstUVPtr = const ArrayType* const;
+using ConstF4Ptr = const float4* const __restrict__;
+using ConstVertPtr = const PrecomputedArray* const __restrict__;
+using ConstNormPtr = const ArrayType* const __restrict__;
+using ConstUVPtr = const ArrayType* const __restrict__;
diff --git a/src/core/bsdf.cuh b/src/core/bsdf.cuh
index 0bc308c..29aadab 100644
--- a/src/core/bsdf.cuh
+++ b/src/core/bsdf.cuh
@@ -43,13 +43,13 @@ public:
CPT_GPU void set_ex_id(int v) noexcept { this->ex_tex_id = v; }
CPT_GPU void set_lobe(int v) noexcept { this->bsdf_flag = v; }
- CPT_CPU_GPU virtual float pdf(const Interaction& it, const Vec3& out, const Vec3& in) const = 0;
+ CPT_GPU virtual float pdf(const Interaction& it, const Vec3& out, const Vec3& in) const = 0;
- CPT_CPU_GPU virtual Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const = 0;
+ CPT_GPU virtual Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const = 0;
- CPT_CPU_GPU virtual Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float&, Vec2&& uv) const = 0;
+ CPT_GPU virtual Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float&, Vec2&& uv) const = 0;
- CPT_CPU_GPU_INLINE bool require_lobe(BSDFFlag flags) const noexcept {
+ CPT_GPU_INLINE bool require_lobe(BSDFFlag flags) const noexcept {
return (bsdf_flag & (int)flags) > 0;
}
};
@@ -64,14 +64,14 @@ public:
CPT_CPU_GPU LambertianBSDF(): BSDF() {}
- CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
+ CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
// printf("it.norm: %f, %f, %f\n", it.shading_norm.x(), it.shading_norm.y(), it.shading_norm.z());
// printf("out: %f, %f, %f\n", out.x(), out.y(), out.z());
float dot_val = it.shading_norm.dot(out);
return max(it.shading_norm.dot(out), 0.f) * M_1_Pi;
}
- CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
+ CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
float cos_term = it.shading_norm.dot(out);
float dot_in = it.shading_norm.dot(in);
float same_side = (dot_in > 0) ^ (cos_term > 0); // should be positive or negative at the same time
@@ -79,7 +79,7 @@ public:
return k_d * max(0.f, cos_term) * M_1_Pi * same_side;
}
- CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
+ CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
auto local_ray = sample_cosine_hemisphere(std::move(uv), pdf);
auto out_ray = delocalize_rotate(it.shading_norm, local_ray);
// throughput *= f / pdf --> k_d * cos / pi / (pdf = cos / pi) == k_d
@@ -98,16 +98,16 @@ public:
CPT_CPU_GPU SpecularBSDF(): BSDF() {}
- CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
+ CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
return 0.f;
}
- CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
+ CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
auto ref_dir = in.advance(it.shading_norm, -2.f * in.dot(it.shading_norm)).normalized();
return k_s * (out.dot(ref_dir) > 0.99999f);
}
- CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
+ CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
// throughput *= f / pdf
pdf = 1.f;
throughput *= k_s * (indir.dot(it.shading_norm) < 0);
@@ -125,11 +125,11 @@ public:
CPT_CPU_GPU TranslucentBSDF(): BSDF() {}
- CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& incid) const override {
+ CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& incid) const override {
return 0.f;
}
- CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
+ CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
float dot_normal = in.dot(it.shading_norm);
// at least according to pbrt-v3, ni / nr is computed as the following (using shading normal)
// see https://computergraphics.stackexchange.com/questions/13540/shading-normal-and-geometric-normal-for-refractive-surface-rendering
@@ -143,7 +143,7 @@ public:
return k_s * (reflc_dot | refra_dot);
}
- CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
+ CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
float dot_normal = indir.dot(it.shading_norm);
// at least according to pbrt-v3, ni / nr is computed as the following (using shading normal)
// see https://computergraphics.stackexchange.com/questions/13540/shading-normal-and-geometric-normal-for-refractive-surface-rendering
@@ -160,11 +160,11 @@ public:
return ret_dir;
}
- CPT_CPU_GPU_INLINE static bool is_total_reflection(float dot_normal, float ni, float nr) {
+ CPT_GPU_INLINE static bool is_total_reflection(float dot_normal, float ni, float nr) {
return (1.f - (ni * ni) / (nr * nr) * (1.f - dot_normal * dot_normal)) < 0.f;
}
- CPT_CPU_GPU static Vec3 snell_refraction(const Vec3& incid, const Vec3& normal, float& cos_r2, float dot_n, float ni, float nr) {
+ CPT_GPU static Vec3 snell_refraction(const Vec3& incid, const Vec3& normal, float& cos_r2, float dot_n, float ni, float nr) {
/* Refraction vector by Snell's Law, note that an extra flag will be returned */
float ratio = ni / nr;
cos_r2 = 1.f - (ratio * ratio) * (1. - dot_n * dot_n); // refraction angle cosine
@@ -173,7 +173,7 @@ public:
return (ratio * incid - ratio * dot_n * normal + sgn(dot_n) * sqrtf(fabsf(cos_r2)) * normal).normalized() * (cos_r2 > 0.f);
}
- CPT_CPU_GPU static float fresnel_equation(float n_in, float n_out, float cos_inc, float cos_ref) {
+ CPT_GPU static float fresnel_equation(float n_in, float n_out, float cos_inc, float cos_ref) {
/**
Fresnel Equation for calculating specular ratio
Since Schlick's Approximation is not clear about n1->n2, n2->n1 (different) effects
diff --git a/src/core/object.cuh b/src/core/object.cuh
index 2a1519e..d09229a 100644
--- a/src/core/object.cuh
+++ b/src/core/object.cuh
@@ -18,7 +18,7 @@ public:
float inv_area; // inverse area
uint8_t emitter_id; // index to the emitter, 0xff means not an emitter
public:
- CPT_CPU_GPU_INLINE bool intersect(const Ray& ray, float& t_near) const noexcept {
+ CPT_GPU_INLINE bool intersect(const Ray& ray, float& t_near) const noexcept {
return _aabb.intersect(ray, t_near);
}
diff --git a/src/core/primitives.cuh b/src/core/primitives.cuh
index 8d07030..cc18c29 100644
--- a/src/core/primitives.cuh
+++ b/src/core/primitives.cuh
@@ -12,14 +12,8 @@
#include "core/ray.cuh"
#include "core/interaction.cuh"
-using SharedVec3Ptr = Vec3 (*)[32];
-using SharedVec2Ptr = Vec2 (*)[32];
-using ConstSharedVec3Ptr = const Vec3 (*)[32];
-using ConstSharedVec2Ptr = const Vec2 (*)[32];
-
// #define TRIANGLE_ONLY
-// All static
class Primitive {
private:
CPT_GPU_INLINE static float intersect_sphere(
diff --git a/src/core/so3.cuh b/src/core/so3.cuh
index 5ee0e78..5df78f2 100644
--- a/src/core/so3.cuh
+++ b/src/core/so3.cuh
@@ -192,8 +192,8 @@ public:
return output;
}
- friend CPT_CPU_GPU SO3 rotation_between(Vec3&& from, const Vec3& to);
- friend CPT_CPU_GPU SO3 rotation_local_to_world(const Vec3& to);
+ friend CPT_GPU SO3 rotation_between(Vec3&& from, const Vec3& to);
+ friend CPT_GPU SO3 rotation_local_to_world(const Vec3& to);
};
CPT_CPU_GPU_INLINE SO3 skew_symmetry(const Vec3& v) {
@@ -213,7 +213,7 @@ CPT_CPU_GPU_INLINE SO3 vec_mul(const Vec3& v1, const Vec3& v2) {
}
// This can be improved (maybe not, Rodrigues tranformation is already good enough)
-CPT_CPU_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
+CPT_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
auto axis = from.cross(to);
float cos_theta = from.dot(to);
SO3 R = SO3::diag(cos_theta);
@@ -225,7 +225,7 @@ CPT_CPU_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
return R;
}
-CPT_CPU_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
+CPT_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
auto axis = Vec3(-to.y(), to.x(), 0);
SO3 R = SO3::diag(to.z());
if (abs(to.z()) < 1.f - 1e-5f) {
@@ -237,12 +237,12 @@ CPT_CPU_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
}
CONDITION_TEMPLATE(VecType, Vec3)
-CPT_CPU_GPU_INLINE Vec3 delocalize_rotate(VecType&& anchor, const Vec3& to, const Vec3& input) {
+CPT_GPU_INLINE Vec3 delocalize_rotate(VecType&& anchor, const Vec3& to, const Vec3& input) {
SO3 R = rotation_between(std::move(anchor), to);
return R.rotate(input);
}
// Specialized, when the anchor is (0, 0, 1)
-CPT_CPU_GPU_INLINE Vec3 delocalize_rotate(const Vec3& to, const Vec3& input) {
+CPT_GPU_INLINE Vec3 delocalize_rotate(const Vec3& to, const Vec3& input) {
return rotation_local_to_world(to).rotate(input);
}
\ No newline at end of file
diff --git a/src/core/vec2.cuh b/src/core/vec2.cuh
index 8c37ad2..8be968a 100644
--- a/src/core/vec2.cuh
+++ b/src/core/vec2.cuh
@@ -95,17 +95,17 @@ public:
return *this;
}
- CPT_CPU_GPU_INLINE
- Vec2 normalized() const { return *this * rsqrtf(length2()); }
+ CPT_GPU_INLINE
+ Vec2 normalized() const { return *this * rhypotf(_data.x, _data.y); } // rhypotf(x, y): CUDA math function, 1 / sqrt(x^2 + y^2)
- CPT_CPU_GPU_INLINE
- void normalize() { this->operator*=(rsqrtf(length2())); }
+ CPT_GPU_INLINE
+ void normalize() { this->operator*=(rhypotf(_data.x, _data.y)); } // in-place variant of normalized()
CPT_CPU_GPU_INLINE
float length2() const { return _data.x * _data.x + _data.y * _data.y; }
- CPT_CPU_GPU_INLINE
- float length() const { return sqrt(length2()); }
+ CPT_GPU_INLINE
+ float length() const { return hypotf(_data.x, _data.y); } // hypotf(x, y): sqrt(x^2 + y^2)
CONDITION_TEMPLATE(VecType, Vec2)
CPT_CPU_GPU_INLINE
diff --git a/src/core/vec3.cuh b/src/core/vec3.cuh
index 6f7325b..5bb00fb 100644
--- a/src/core/vec3.cuh
+++ b/src/core/vec3.cuh
@@ -139,17 +139,28 @@ public:
return Vec3(fmaf(d.x(), t, _data.x), fmaf(d.y(), t, _data.y), fmaf(d.z(), t, _data.z));
}
- CPT_CPU_GPU_INLINE
- Vec3 normalized() const { return *this * rsqrtf(length2()); }
+ // ===== Device-only (CPT_GPU) versions built on the CUDA math functions rnorm3df / norm3df =====
+ CPT_GPU_INLINE
+ Vec3 normalized() const { return *this * rnorm3df(_data.x, _data.y, _data.z); }
- CPT_CPU_GPU_INLINE
- void normalize() { this->operator*=(rsqrtf(length2())); }
+ CPT_GPU_INLINE
+ void normalize() { this->operator*=(rnorm3df(_data.x, _data.y, _data.z)); }
+
+ CPT_GPU_INLINE
+ float length() const { return norm3df(_data.x, _data.y, _data.z); }
+ // ===== End of device-only versions; the *_h variants below are marked CPT_CPU_GPU =====
CPT_CPU_GPU_INLINE
float length2() const { return fmaf(_data.x, _data.x, fmaf(_data.y, _data.y, _data.z * _data.z)); }
CPT_CPU_GPU_INLINE
- float length() const { return sqrt(length2()); }
+ Vec3 normalized_h() const { return *this * rsqrtf(length2()); }
+
+ CPT_CPU_GPU_INLINE
+ void normalize_h() { this->operator*=(rsqrtf(length2())); }
+
+ CPT_CPU_GPU_INLINE
+ float length_h() const { return sqrtf(length2()); }
CONDITION_TEMPLATE(VecType, Vec3)
CPT_CPU_GPU_INLINE
@@ -195,6 +206,8 @@ public:
CPT_CPU_GPU_INLINE
float min_elem() const noexcept { return fminf(_data.x, fminf(_data.y, _data.z)); }
+ CPT_GPU_INLINE Vec3 rcp() const noexcept { return Vec3(__frcp_rn(_data.x), __frcp_rn(_data.y), __frcp_rn(_data.z)); } // component-wise reciprocal, one __frcp_rn per element
+
CPT_CPU_GPU_INLINE operator float3() const {return _data;}
};
@@ -207,5 +220,5 @@ CPT_CPU_GPU_INLINE
Vec3 operator*(float b, VecType&& v) noexcept { return Vec3(v.x() * b, v.y() * b, v.z() * b); }
CONDITION_TEMPLATE(VecType, Vec3)
-CPT_CPU_GPU_INLINE
-Vec3 operator/(float b, VecType&& v) noexcept { return Vec3(b / v.x(), b / v.y(), b / v.z()); }
\ No newline at end of file
+CPT_GPU_INLINE
+Vec3 operator/(float b, VecType&& v) noexcept { return Vec3(b * __frcp_rn(v.x()), b * __frcp_rn(v.y()), b * __frcp_rn(v.z())); } // scalar / vector per component, computed as b * (1 / v_i) rather than a true division, so the last bit may differ from b / v_i
\ No newline at end of file
diff --git a/src/impl/camera_model.cu b/src/impl/camera_model.cu
index 73fad33..91215ae 100644
--- a/src/impl/camera_model.cu
+++ b/src/impl/camera_model.cu
@@ -38,9 +38,9 @@ CPT_CPU_GPU DeviceCamera::DeviceCamera(
const Vec3& from, const Vec3& lookat, float fov,
float w, float h, float hsign, float vsign, Vec3 up
): t(from), inv_focal(1.f / fov2focal(fov, w)), _hw(w * 0.5f), _hh(h * 0.5f), signs(hsign, vsign), use_orthogonal(false) {
- Vec3 forward = (lookat - from).normalized();
- up.normalize();
- Vec3 right = up.cross(forward).normalized();
+ Vec3 forward = (lookat - from).normalized_h();
+ up.normalize_h();
+ Vec3 right = up.cross(forward).normalized_h();
R = SO3(right, up, forward, false);
}
@@ -49,11 +49,11 @@ CPT_CPU void DeviceCamera::rotate(float yaw, float pitch) {
quat_pit = Quaternion::angleAxis(pitch, Vec3(-signs.y(), 0, 0));
SO3 rot = SO3::from_quat(quat_yaw * quat_pit);
R = R * rot;
- Vec3 forward = R.col(2).normalized(),
+ Vec3 forward = R.col(2).normalized_h(),
right = R.col(0);
right.y() = 0;
right *= 1.f / sqrtf(right.x() * right.x() + right.z() * right.z());
- Vec3 up = -right.cross(forward).normalized();
+ Vec3 up = -right.cross(forward).normalized_h();
R = SO3(right, up, forward, false);
}
diff --git a/src/impl/object.cu b/src/impl/object.cu
index 89fb04d..cf47115 100644
--- a/src/impl/object.cu
+++ b/src/impl/object.cu
@@ -23,7 +23,7 @@ CPT_CPU_GPU void ObjInfo::setup(const ArrayType& prims, bool is_polygon) {
_aabb.maxi.maximized(prims.z(i));
_aabb.mini -= AABB_EPS;
_aabb.maxi += AABB_EPS;
- inv_area += (prims.y(i) - prims.x(i)).cross(prims.z(i) - prims.x(i)).length();
+ inv_area += (prims.y(i) - prims.x(i)).cross(prims.z(i) - prims.x(i)).length_h();
} else {
_aabb.mini = prims.x(i) - prims.y(i).x();
_aabb.maxi = prims.x(i) + prims.y(i).x();
@@ -48,7 +48,7 @@ CPT_CPU void ObjInfo::setup(const std::array, 3>& prims, bool
_aabb.maxi.maximized(prims[2][i]);
_aabb.mini -= AABB_EPS;
_aabb.maxi += AABB_EPS;
- inv_area += (prims[1][i] - prims[0][i]).cross(prims[2][i] - prims[0][i]).length();
+ inv_area += (prims[1][i] - prims[0][i]).cross(prims[2][i] - prims[0][i]).length_h();
} else {
_aabb.mini = prims[0][i] - prims[1][i].x();
_aabb.maxi = prims[0][i] + prims[1][i].x();
diff --git a/src/impl/scene.cu b/src/impl/scene.cu
index d2d7c10..aee5d64 100644
--- a/src/impl/scene.cu
+++ b/src/impl/scene.cu
@@ -338,7 +338,7 @@ void parseObjShape(
if (!has_normal) { // compute normals ourselves
printf("Normal vector not found in '%s' primitive %llu, computing yet normal direction is not guaranteed.\n", name.c_str(), i);
Vec3 diff = verts_list[1][i] - verts_list[0][i];
- Vec3 normal = diff.cross(verts_list[2][i] - verts_list[0][i]).normalized();
+ Vec3 normal = diff.cross(verts_list[2][i] - verts_list[0][i]).normalized_h();
for (int j = 0; j < 3; j++) {
norms_list[j].push_back(normal);
}
diff --git a/src/pt_impl/megakernel_lt.cu b/src/pt_impl/megakernel_lt.cu
index 70c3d7d..d0d1c1c 100644
--- a/src/pt_impl/megakernel_lt.cu
+++ b/src/pt_impl/megakernel_lt.cu
@@ -43,7 +43,7 @@ CPT_KERNEL void render_lt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
- float* output_buffer,
+ float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
@@ -195,7 +195,7 @@ template CPT_KERNEL void render_lt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
- float* output_buffer,
+ float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
@@ -220,7 +220,7 @@ template CPT_KERNEL void render_lt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
- float* output_buffer,
+ float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
diff --git a/src/pt_impl/megakernel_pt.cu b/src/pt_impl/megakernel_pt.cu
index 5685d77..fe26692 100644
--- a/src/pt_impl/megakernel_pt.cu
+++ b/src/pt_impl/megakernel_pt.cu
@@ -228,7 +228,7 @@ CPT_KERNEL void render_pt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
- float* output_buffer,
+ float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
@@ -384,7 +384,7 @@ template CPT_KERNEL void render_pt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
- float* output_buffer,
+ float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
@@ -407,7 +407,7 @@ template CPT_KERNEL void render_pt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
- float* output_buffer,
+ float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
diff --git a/src/pt_impl/wavefront_pt.cu b/src/pt_impl/wavefront_pt.cu
index 4682a9d..c9df632 100644
--- a/src/pt_impl/wavefront_pt.cu
+++ b/src/pt_impl/wavefront_pt.cu
@@ -7,7 +7,7 @@
#include "renderer/wavefront_pt.cuh"
namespace {
- using PayLoadBuffer = PayLoadBufferSoA* const;
+ using PayLoadBuffer = PayLoadBufferSoA* const __restrict__;
using ConstPayLoadBuffer = const PayLoadBuffer;
}
diff --git a/src/renderer/megakernel_pt.cuh b/src/renderer/megakernel_pt.cuh
index 1abe590..0aa029e 100644
--- a/src/renderer/megakernel_pt.cuh
+++ b/src/renderer/megakernel_pt.cuh
@@ -16,10 +16,10 @@
extern __constant__ Emitter* c_emitter[9]; // c_emitter[8] is a dummy emitter
extern __constant__ BSDF* c_material[32];
-using ConstNodePtr = const LinearNode* const;
-using ConstObjPtr = const ObjInfo* const;
-using ConstBSDFPtr = const BSDF* const;
-using ConstIndexPtr = const int* const;
+using ConstNodePtr = const LinearNode* const __restrict__;
+using ConstObjPtr = const ObjInfo* const __restrict__;
+using ConstBSDFPtr = const BSDF* const __restrict__;
+using ConstIndexPtr = const int* const __restrict__;
/**
* Occlusion test, computation is done on global memory
@@ -105,7 +105,7 @@ CPT_KERNEL void render_pt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
- float* output_buffer,
+ float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
@@ -137,7 +137,7 @@ CPT_KERNEL void render_lt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
- float* output_buffer,
+ float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
diff --git a/src/renderer/wavefront_pt.cuh b/src/renderer/wavefront_pt.cuh
index 4dfa75a..23d885a 100644
--- a/src/renderer/wavefront_pt.cuh
+++ b/src/renderer/wavefront_pt.cuh
@@ -53,7 +53,7 @@ namespace {
constexpr int PATCH_Y = BLOCK_Y * THREAD_Y;
constexpr int TOTAL_RAY = PATCH_X * PATCH_Y;
- using IndexBuffer = uint32_t* const;
+ using IndexBuffer = uint32_t* const __restrict__;
}
union PDFInteraction {