diff --git a/scene/xml/whiskey.xml b/scene/xml/whiskey.xml index c2c134d..def7a14 100644 --- a/scene/xml/whiskey.xml +++ b/scene/xml/whiskey.xml @@ -84,11 +84,11 @@ - + diff --git a/src/core/aabb.cuh b/src/core/aabb.cuh index 1fe4d84..c6f903f 100644 --- a/src/core/aabb.cuh +++ b/src/core/aabb.cuh @@ -44,16 +44,15 @@ public: CPT_CPU_GPU Vec3 centroid() const noexcept {return (maxi + mini) * 0.5f;} CPT_CPU_GPU Vec3 range() const noexcept {return maxi - mini;} - CPT_CPU_GPU bool intersect(const Ray& ray, float& t_near) const { - auto invDir = 1.0f / ray.d; - // long scoreboard - auto t1s = (mini - ray.o) * invDir; + CPT_GPU bool intersect(const Ray& ray, float& t_near) const { + auto invDir = ray.d.rcp(); + auto t1s = (mini - ray.o) * invDir; // long scoreboard auto t2s = (maxi - ray.o) * invDir; float tmin = t1s.minimize(t2s).max_elem(); float tmax = t1s.maximize(t2s).min_elem(); t_near = tmin; - return tmax > tmin && tmax > 0; + return (tmax > tmin) && (tmax > 0); // local memory access problem } CONDITION_TEMPLATE(AABBType, AABB) @@ -104,5 +103,5 @@ struct AABBWrapper { float4 _padding; // padding is here to avoid bank conflict }; -using ConstAABBPtr = const AABB* const; -using ConstAABBWPtr = const AABBWrapper* const; \ No newline at end of file +using ConstAABBPtr = const AABB* const __restrict__; +using ConstAABBWPtr = const AABBWrapper* const __restrict__; \ No newline at end of file diff --git a/src/core/aos.cuh b/src/core/aos.cuh index d4c7ce2..3d1b6d3 100644 --- a/src/core/aos.cuh +++ b/src/core/aos.cuh @@ -212,7 +212,7 @@ public: #undef INDEX_Y #undef INDEX_Z -using ConstF4Ptr = const float4* const; -using ConstVertPtr = const PrecomputedArray* const; -using ConstNormPtr = const ArrayType* const; -using ConstUVPtr = const ArrayType* const; +using ConstF4Ptr = const float4* const __restrict__; +using ConstVertPtr = const PrecomputedArray* const __restrict__; +using ConstNormPtr = const ArrayType* const __restrict__; +using ConstUVPtr = const ArrayType* const __restrict__; diff --git a/src/core/bsdf.cuh b/src/core/bsdf.cuh index 0bc308c..29aadab 100644 --- a/src/core/bsdf.cuh +++ b/src/core/bsdf.cuh @@ -43,13 +43,13 @@ public: CPT_GPU void set_ex_id(int v) noexcept { this->ex_tex_id = v; } CPT_GPU void set_lobe(int v) noexcept { this->bsdf_flag = v; } - CPT_CPU_GPU virtual float pdf(const Interaction& it, const Vec3& out, const Vec3& in) const = 0; + CPT_GPU virtual float pdf(const Interaction& it, const Vec3& out, const Vec3& in) const = 0; - CPT_CPU_GPU virtual Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const = 0; + CPT_GPU virtual Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const = 0; - CPT_CPU_GPU virtual Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float&, Vec2&& uv) const = 0; + CPT_GPU virtual Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float&, Vec2&& uv) const = 0; - CPT_CPU_GPU_INLINE bool require_lobe(BSDFFlag flags) const noexcept { + CPT_GPU_INLINE bool require_lobe(BSDFFlag flags) const noexcept { return (bsdf_flag & (int)flags) > 0; } }; @@ -64,14 +64,14 @@ public: CPT_CPU_GPU LambertianBSDF(): BSDF() {} - CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override { + CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override { // printf("it.norm: %f, %f, %f\n", it.shading_norm.x(), it.shading_norm.y(), it.shading_norm.z()); // printf("out: %f, %f, %f\n", out.x(), out.y(), out.z()); float dot_val = it.shading_norm.dot(out); return max(it.shading_norm.dot(out), 0.f) * M_1_Pi; } - CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override { + CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override { float cos_term = it.shading_norm.dot(out); float dot_in = it.shading_norm.dot(in); float same_side = (dot_in > 0) ^ (cos_term > 0); // should be positive or negative at the same time @@ -79,7 +79,7 @@ public: return k_d * max(0.f, cos_term) * M_1_Pi * same_side; } - CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override { + CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override { auto local_ray = sample_cosine_hemisphere(std::move(uv), pdf); auto out_ray = delocalize_rotate(it.shading_norm, local_ray); // throughput *= f / pdf --> k_d * cos / pi / (pdf = cos / pi) == k_d @@ -98,16 +98,16 @@ public: CPT_CPU_GPU SpecularBSDF(): BSDF() {} - CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override { + CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override { return 0.f; } - CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override { + CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override { auto ref_dir = in.advance(it.shading_norm, -2.f * in.dot(it.shading_norm)).normalized(); return k_s * (out.dot(ref_dir) > 0.99999f); } - CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override { + CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override { // throughput *= f / pdf pdf = 1.f; throughput *= k_s * (indir.dot(it.shading_norm) < 0); @@ -125,11 +125,11 @@ public: CPT_CPU_GPU TranslucentBSDF(): BSDF() {} - CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& incid) const override { + CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& incid) const override { return 0.f; } - CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override { + CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override { float dot_normal = in.dot(it.shading_norm); // at least according to pbrt-v3, ni / nr is computed as the following (using shading normal) // see https://computergraphics.stackexchange.com/questions/13540/shading-normal-and-geometric-normal-for-refractive-surface-rendering @@ -143,7 +143,7 @@ public: return k_s * (reflc_dot | refra_dot); } - CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override { + CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override { float dot_normal = indir.dot(it.shading_norm); // at least according to pbrt-v3, ni / nr is computed as the following (using shading normal) // see https://computergraphics.stackexchange.com/questions/13540/shading-normal-and-geometric-normal-for-refractive-surface-rendering @@ -160,11 +160,11 @@ public: return ret_dir; } - CPT_CPU_GPU_INLINE static bool is_total_reflection(float dot_normal, float ni, float nr) { + CPT_GPU_INLINE static bool is_total_reflection(float dot_normal, float ni, float nr) { return (1.f - (ni * ni) / (nr * nr) * (1.f - dot_normal * dot_normal)) < 0.f; } - CPT_CPU_GPU static Vec3 snell_refraction(const Vec3& incid, const Vec3& normal, float& cos_r2, float dot_n, float ni, float nr) { + CPT_GPU static Vec3 snell_refraction(const Vec3& incid, const Vec3& normal, float& cos_r2, float dot_n, float ni, float nr) { /* Refraction vector by Snell's Law, note that an extra flag will be returned */ float ratio = ni / nr; cos_r2 = 1.f - (ratio * ratio) * (1. - dot_n * dot_n); // refraction angle cosine @@ -173,7 +173,7 @@ public: return (ratio * incid - ratio * dot_n * normal + sgn(dot_n) * sqrtf(fabsf(cos_r2)) * normal).normalized() * (cos_r2 > 0.f); } - CPT_CPU_GPU static float fresnel_equation(float n_in, float n_out, float cos_inc, float cos_ref) { + CPT_GPU static float fresnel_equation(float n_in, float n_out, float cos_inc, float cos_ref) { /** Fresnel Equation for calculating specular ratio Since Schlick's Approximation is not clear about n1->n2, n2->n1 (different) effects diff --git a/src/core/object.cuh b/src/core/object.cuh index 2a1519e..d09229a 100644 --- a/src/core/object.cuh +++ b/src/core/object.cuh @@ -18,7 +18,7 @@ public: float inv_area; // inverse area uint8_t emitter_id; // index to the emitter, 0xff means not an emitter public: - CPT_CPU_GPU_INLINE bool intersect(const Ray& ray, float& t_near) const noexcept { + CPT_GPU_INLINE bool intersect(const Ray& ray, float& t_near) const noexcept { return _aabb.intersect(ray, t_near); } diff --git a/src/core/primitives.cuh b/src/core/primitives.cuh index 8d07030..cc18c29 100644 --- a/src/core/primitives.cuh +++ b/src/core/primitives.cuh @@ -12,14 +12,8 @@ #include "core/ray.cuh" #include "core/interaction.cuh" -using SharedVec3Ptr = Vec3 (*)[32]; -using SharedVec2Ptr = Vec2 (*)[32]; -using ConstSharedVec3Ptr = const Vec3 (*)[32]; -using ConstSharedVec2Ptr = const Vec2 (*)[32]; - // #define TRIANGLE_ONLY -// All static class Primitive { private: CPT_GPU_INLINE static float intersect_sphere( diff --git a/src/core/so3.cuh b/src/core/so3.cuh index 5ee0e78..5df78f2 100644 --- a/src/core/so3.cuh +++ b/src/core/so3.cuh @@ -192,8 +192,8 @@ public: return output; } - friend CPT_CPU_GPU SO3 rotation_between(Vec3&& from, const Vec3& to); - friend CPT_CPU_GPU SO3 rotation_local_to_world(const Vec3& to); + friend CPT_GPU SO3 rotation_between(Vec3&& from, const Vec3& to); + friend CPT_GPU SO3 rotation_local_to_world(const Vec3& to); }; CPT_CPU_GPU_INLINE SO3 skew_symmetry(const Vec3& v) { @@ -213,7 +213,7 @@ CPT_CPU_GPU_INLINE SO3 vec_mul(const Vec3& v1, const Vec3& v2) { } // This can be improved (maybe not, Rodrigues tranformation is already good enough) -CPT_CPU_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) { +CPT_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) { auto axis = from.cross(to); float cos_theta = from.dot(to); SO3 R = SO3::diag(cos_theta); @@ -225,7 +225,7 @@ CPT_CPU_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) { return R; } -CPT_CPU_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) { +CPT_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) { auto axis = Vec3(-to.y(), to.x(), 0); SO3 R = SO3::diag(to.z()); if (abs(to.z()) < 1.f - 1e-5f) { @@ -237,12 +237,12 @@ CPT_CPU_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) { } CONDITION_TEMPLATE(VecType, Vec3) -CPT_CPU_GPU_INLINE Vec3 delocalize_rotate(VecType&& anchor, const Vec3& to, const Vec3& input) { +CPT_GPU_INLINE Vec3 delocalize_rotate(VecType&& anchor, const Vec3& to, const Vec3& input) { SO3 R = rotation_between(std::move(anchor), to); return R.rotate(input); } // Specialized, when the anchor is (0, 0, 1) -CPT_CPU_GPU_INLINE Vec3 delocalize_rotate(const Vec3& to, const Vec3& input) { +CPT_GPU_INLINE Vec3 delocalize_rotate(const Vec3& to, const Vec3& input) { return rotation_local_to_world(to).rotate(input); } \ No newline at end of file diff --git a/src/core/vec2.cuh b/src/core/vec2.cuh index 8c37ad2..8be968a 100644 --- a/src/core/vec2.cuh +++ b/src/core/vec2.cuh @@ -95,17 +95,17 @@ public: return *this; } - CPT_CPU_GPU_INLINE - Vec2 normalized() const { return *this * rsqrtf(length2()); } + CPT_GPU_INLINE + Vec2 normalized() const { return *this * rhypotf(_data.x, _data.y); } - CPT_CPU_GPU_INLINE - void normalize() { this->operator*=(rsqrtf(length2())); } + CPT_GPU_INLINE + void normalize() { this->operator*=(rhypotf(_data.x, _data.y)); } CPT_CPU_GPU_INLINE float length2() const { return _data.x * _data.x + _data.y * _data.y; } - CPT_CPU_GPU_INLINE - float length() const { return sqrt(length2()); } + CPT_GPU_INLINE + float length() const { return hypotf(_data.x, _data.y); } CONDITION_TEMPLATE(VecType, Vec2) CPT_CPU_GPU_INLINE diff --git a/src/core/vec3.cuh b/src/core/vec3.cuh index 6f7325b..5bb00fb 100644 --- a/src/core/vec3.cuh +++ b/src/core/vec3.cuh @@ -139,17 +139,28 @@ public: return Vec3(fmaf(d.x(), t, _data.x), fmaf(d.y(), t, _data.y), fmaf(d.z(), t, _data.z)); } - CPT_CPU_GPU_INLINE - Vec3 normalized() const { return *this * rsqrtf(length2()); } + // ============== Specialized version using CUDA math function =============== + CPT_GPU_INLINE + Vec3 normalized() const { return *this * rnorm3df(_data.x, _data.y, _data.z); } - CPT_CPU_GPU_INLINE - void normalize() { this->operator*=(rsqrtf(length2())); } + CPT_GPU_INLINE + void normalize() { this->operator*=(rnorm3df(_data.x, _data.y, _data.z)); } + + CPT_GPU_INLINE + float length() const { return norm3df(_data.x, _data.y, _data.z); } + // ============== Specialized version using CUDA math function =============== CPT_CPU_GPU_INLINE float length2() const { return fmaf(_data.x, _data.x, fmaf(_data.y, _data.y, _data.z * _data.z)); } CPT_CPU_GPU_INLINE - float length() const { return sqrt(length2()); } + Vec3 normalized_h() const { return *this * rsqrtf(length2()); } + + CPT_CPU_GPU_INLINE + void normalize_h() { this->operator*=(rsqrtf(length2())); } + + CPT_CPU_GPU_INLINE + float length_h() const { return sqrtf(length2()); } CONDITION_TEMPLATE(VecType, Vec3) CPT_CPU_GPU_INLINE @@ -195,6 +206,8 @@ public: CPT_CPU_GPU_INLINE float min_elem() const noexcept { return fminf(_data.x, fminf(_data.y, _data.z)); } + CPT_GPU_INLINE Vec3 rcp() const noexcept { return Vec3(__frcp_rn(_data.x), __frcp_rn(_data.y), __frcp_rn(_data.z)); } + CPT_CPU_GPU_INLINE operator float3() const {return _data;} }; @@ -207,5 +220,5 @@ CPT_CPU_GPU_INLINE Vec3 operator*(float b, VecType&& v) noexcept { return Vec3(v.x() * b, v.y() * b, v.z() * b); } CONDITION_TEMPLATE(VecType, Vec3) -CPT_CPU_GPU_INLINE -Vec3 operator/(float b, VecType&& v) noexcept { return Vec3(b / v.x(), b / v.y(), b / v.z()); } \ No newline at end of file +CPT_GPU_INLINE +Vec3 operator/(float b, VecType&& v) noexcept { return Vec3(b * __frcp_rn(v.x()), b * __frcp_rn(v.y()), b * __frcp_rn(v.z())); } \ No newline at end of file diff --git a/src/impl/camera_model.cu b/src/impl/camera_model.cu index 73fad33..91215ae 100644 --- a/src/impl/camera_model.cu +++ b/src/impl/camera_model.cu @@ -38,9 +38,9 @@ CPT_CPU_GPU DeviceCamera::DeviceCamera( const Vec3& from, const Vec3& lookat, float fov, float w, float h, float hsign, float vsign, Vec3 up ): t(from), inv_focal(1.f / fov2focal(fov, w)), _hw(w * 0.5f), _hh(h * 0.5f), signs(hsign, vsign), use_orthogonal(false) { - Vec3 forward = (lookat - from).normalized(); - up.normalize(); - Vec3 right = up.cross(forward).normalized(); + Vec3 forward = (lookat - from).normalized_h(); + up.normalize_h(); + Vec3 right = up.cross(forward).normalized_h(); R = SO3(right, up, forward, false); } @@ -49,11 +49,11 @@ CPT_CPU void DeviceCamera::rotate(float yaw, float pitch) { quat_pit = Quaternion::angleAxis(pitch, Vec3(-signs.y(), 0, 0)); SO3 rot = SO3::from_quat(quat_yaw * quat_pit); R = R * rot; - Vec3 forward = R.col(2).normalized(), + Vec3 forward = R.col(2).normalized_h(), right = R.col(0); right.y() = 0; right *= 1.f / sqrtf(right.x() * right.x() + right.z() * right.z()); - Vec3 up = -right.cross(forward).normalized(); + Vec3 up = -right.cross(forward).normalized_h(); R = SO3(right, up, forward, false); } diff --git a/src/impl/object.cu b/src/impl/object.cu index 89fb04d..cf47115 100644 --- a/src/impl/object.cu +++ b/src/impl/object.cu @@ -23,7 +23,7 @@ CPT_CPU_GPU void ObjInfo::setup(const ArrayType& prims, bool is_polygon) { _aabb.maxi.maximized(prims.z(i)); _aabb.mini -= AABB_EPS; _aabb.maxi += AABB_EPS; - inv_area += (prims.y(i) - prims.x(i)).cross(prims.z(i) - prims.x(i)).length(); + inv_area += (prims.y(i) - prims.x(i)).cross(prims.z(i) - prims.x(i)).length_h(); } else { _aabb.mini = prims.x(i) - prims.y(i).x(); _aabb.maxi = prims.x(i) + prims.y(i).x(); @@ -48,7 +48,7 @@ CPT_CPU void ObjInfo::setup(const std::array, 3>& prims, bool _aabb.maxi.maximized(prims[2][i]); _aabb.mini -= AABB_EPS; _aabb.maxi += AABB_EPS; - inv_area += (prims[1][i] - prims[0][i]).cross(prims[2][i] - prims[0][i]).length(); + inv_area += (prims[1][i] - prims[0][i]).cross(prims[2][i] - prims[0][i]).length_h(); } else { _aabb.mini = prims[0][i] - prims[1][i].x(); _aabb.maxi = prims[0][i] + prims[1][i].x(); diff --git a/src/impl/scene.cu b/src/impl/scene.cu index d2d7c10..aee5d64 100644 --- a/src/impl/scene.cu +++ b/src/impl/scene.cu @@ -338,7 +338,7 @@ void parseObjShape( if (!has_normal) { // compute normals ourselves printf("Normal vector not found in '%s' primitive %llu, computing yet normal direction is not guaranteed.\n", name.c_str(), i); Vec3 diff = verts_list[1][i] - verts_list[0][i]; - Vec3 normal = diff.cross(verts_list[2][i] - verts_list[0][i]).normalized(); + Vec3 normal = diff.cross(verts_list[2][i] - verts_list[0][i]).normalized_h(); for (int j = 0; j < 3; j++) { norms_list[j].push_back(normal); } diff --git a/src/pt_impl/megakernel_lt.cu b/src/pt_impl/megakernel_lt.cu index 70c3d7d..d0d1c1c 100644 --- a/src/pt_impl/megakernel_lt.cu +++ b/src/pt_impl/megakernel_lt.cu @@ -43,7 +43,7 @@ CPT_KERNEL void render_lt_kernel( const cudaTextureObject_t node_backs, ConstF4Ptr cached_nodes, DeviceImage image, - float* output_buffer, + float* __restrict__ output_buffer, int num_prims, int num_objects, int num_emitter, @@ -195,7 +195,7 @@ template CPT_KERNEL void render_lt_kernel( const cudaTextureObject_t node_backs, ConstF4Ptr cached_nodes, DeviceImage image, - float* output_buffer, + float* __restrict__ output_buffer, int num_prims, int num_objects, int num_emitter, @@ -220,7 +220,7 @@ template CPT_KERNEL void render_lt_kernel( const cudaTextureObject_t node_backs, ConstF4Ptr cached_nodes, DeviceImage image, - float* output_buffer, + float* __restrict__ output_buffer, int num_prims, int num_objects, int num_emitter, diff --git a/src/pt_impl/megakernel_pt.cu b/src/pt_impl/megakernel_pt.cu index 5685d77..fe26692 100644 --- a/src/pt_impl/megakernel_pt.cu +++ b/src/pt_impl/megakernel_pt.cu @@ -228,7 +228,7 @@ CPT_KERNEL void render_pt_kernel( const cudaTextureObject_t node_backs, ConstF4Ptr cached_nodes, DeviceImage image, - float* output_buffer, + float* __restrict__ output_buffer, int num_prims, int num_objects, int num_emitter, @@ -384,7 +384,7 @@ template CPT_KERNEL void render_pt_kernel( const cudaTextureObject_t node_backs, ConstF4Ptr cached_nodes, DeviceImage image, - float* output_buffer, + float* __restrict__ output_buffer, int num_prims, int num_objects, int num_emitter, @@ -407,7 +407,7 @@ template CPT_KERNEL void render_pt_kernel( const cudaTextureObject_t node_backs, ConstF4Ptr cached_nodes, DeviceImage image, - float* output_buffer, + float* __restrict__ output_buffer, int num_prims, int num_objects, int num_emitter, diff --git a/src/pt_impl/wavefront_pt.cu b/src/pt_impl/wavefront_pt.cu index 4682a9d..c9df632 100644 --- a/src/pt_impl/wavefront_pt.cu +++ b/src/pt_impl/wavefront_pt.cu @@ -7,7 +7,7 @@ #include "renderer/wavefront_pt.cuh" namespace { - using PayLoadBuffer = PayLoadBufferSoA* const; + using PayLoadBuffer = PayLoadBufferSoA* const __restrict__; using ConstPayLoadBuffer = const PayLoadBuffer; } diff --git a/src/renderer/megakernel_pt.cuh b/src/renderer/megakernel_pt.cuh index 1abe590..0aa029e 100644 --- a/src/renderer/megakernel_pt.cuh +++ b/src/renderer/megakernel_pt.cuh @@ -16,10 +16,10 @@ extern __constant__ Emitter* c_emitter[9]; // c_emitter[8] is a dummy emitter extern __constant__ BSDF* c_material[32]; -using ConstNodePtr = const LinearNode* const; -using ConstObjPtr = const ObjInfo* const; -using ConstBSDFPtr = const BSDF* const; -using ConstIndexPtr = const int* const; +using ConstNodePtr = const LinearNode* const __restrict__; +using ConstObjPtr = const ObjInfo* const __restrict__; +using ConstBSDFPtr = const BSDF* const __restrict__; +using ConstIndexPtr = const int* const __restrict__; /** * Occlusion test, computation is done on global memory @@ -105,7 +105,7 @@ CPT_KERNEL void render_pt_kernel( const cudaTextureObject_t node_backs, ConstF4Ptr cached_nodes, DeviceImage image, - float* output_buffer, + float* __restrict__ output_buffer, int num_prims, int num_objects, int num_emitter, @@ -137,7 +137,7 @@ CPT_KERNEL void render_lt_kernel( const cudaTextureObject_t node_backs, ConstF4Ptr cached_nodes, DeviceImage image, - float* output_buffer, + float* __restrict__ output_buffer, int num_prims, int num_objects, int num_emitter, diff --git a/src/renderer/wavefront_pt.cuh b/src/renderer/wavefront_pt.cuh index 4dfa75a..23d885a 100644 --- a/src/renderer/wavefront_pt.cuh +++ b/src/renderer/wavefront_pt.cuh @@ -53,7 +53,7 @@ namespace { constexpr int PATCH_Y = BLOCK_Y * THREAD_Y; constexpr int TOTAL_RAY = PATCH_X * PATCH_Y; - using IndexBuffer = uint32_t* const; + using IndexBuffer = uint32_t* const __restrict__; } union PDFInteraction {