Slightly optimized; the current performance is stable.

The remaining problems are hard to resolve: thread divergence when accessing the BVH tree, local memory access problems, and long-scoreboard stalls in triangle intersection.
Enigmatisms · Oct 12, 2024 · 1fa91a6 · 1fa91a6
1 parent c0a5798
commit 1fa91a6
Show file tree

Hide file tree

Showing 17 changed files with 83 additions and 77 deletions.
diff --git a/scene/xml/whiskey.xml b/scene/xml/whiskey.xml
@@ -84,11 +84,11 @@
 		<ref type="material" id="glass"/>
 	</shape>
 
-	<!-- <shape type="sphere">
+	<shape type="sphere">
 		<point name="center" x="-0.35" y="0.02" z="0.05"/>
 		<float name="radius" value="0.02"/>
 		<ref type="material" id="glass"/>
-	</shape> -->
+	</shape>
 
 	<shape type="obj">
 		<string name="filename" value="../meshes/whiskey/wine.obj"/>

diff --git a/src/core/aabb.cuh b/src/core/aabb.cuh
@@ -44,16 +44,15 @@ public:
     CPT_CPU_GPU Vec3 centroid() const noexcept {return (maxi + mini) * 0.5f;}
     CPT_CPU_GPU Vec3 range()    const noexcept {return maxi - mini;}
 
-    CPT_CPU_GPU bool intersect(const Ray& ray, float& t_near) const {
-        auto invDir = 1.0f / ray.d;
-        // long scoreboard
-        auto t1s = (mini - ray.o) * invDir;
+    CPT_GPU bool intersect(const Ray& ray, float& t_near) const {
+        auto invDir = ray.d.rcp();
+        auto t1s = (mini - ray.o) * invDir;             // long scoreboard
         auto t2s = (maxi - ray.o) * invDir;
 
         float tmin = t1s.minimize(t2s).max_elem();
         float tmax = t1s.maximize(t2s).min_elem();
         t_near = tmin;
-        return tmax > tmin && tmax > 0;
+        return (tmax > tmin) && (tmax > 0);             // local memory access problem
     }
 
     CONDITION_TEMPLATE(AABBType, AABB)
@@ -104,5 +103,5 @@ struct AABBWrapper {
     float4 _padding;            // padding is here to avoid bank conflict
 };
 
-using ConstAABBPtr = const AABB* const;
-using ConstAABBWPtr = const AABBWrapper* const;
+using ConstAABBPtr = const AABB* const __restrict__;
+using ConstAABBWPtr = const AABBWrapper* const __restrict__;
diff --git a/src/core/aos.cuh b/src/core/aos.cuh
@@ -212,7 +212,7 @@ public:
 #undef INDEX_Y
 #undef INDEX_Z
 
-using ConstF4Ptr   = const float4* const;
-using ConstVertPtr = const PrecomputedArray* const;
-using ConstNormPtr = const ArrayType<Vec3>* const;
-using ConstUVPtr   = const ArrayType<Vec2>* const;
+using ConstF4Ptr   = const float4* const __restrict__;
+using ConstVertPtr = const PrecomputedArray* const __restrict__;
+using ConstNormPtr = const ArrayType<Vec3>* const __restrict__;
+using ConstUVPtr   = const ArrayType<Vec2>* const __restrict__;
diff --git a/src/core/bsdf.cuh b/src/core/bsdf.cuh
@@ -43,13 +43,13 @@ public:
     CPT_GPU void set_ex_id(int v) noexcept { this->ex_tex_id = v; }
     CPT_GPU void set_lobe(int v) noexcept { this->bsdf_flag = v; }
 
-    CPT_CPU_GPU virtual float pdf(const Interaction& it, const Vec3& out, const Vec3& in) const = 0;
+    CPT_GPU virtual float pdf(const Interaction& it, const Vec3& out, const Vec3& in) const = 0;
 
-    CPT_CPU_GPU virtual Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const = 0;
+    CPT_GPU virtual Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const = 0;
 
-    CPT_CPU_GPU virtual Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float&, Vec2&& uv) const = 0;
+    CPT_GPU virtual Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float&, Vec2&& uv) const = 0;
 
-    CPT_CPU_GPU_INLINE bool require_lobe(BSDFFlag flags) const noexcept {
+    CPT_GPU_INLINE bool require_lobe(BSDFFlag flags) const noexcept {
         return (bsdf_flag & (int)flags) > 0;
     }
 };
@@ -64,22 +64,22 @@ public:
 
     CPT_CPU_GPU LambertianBSDF(): BSDF() {}
 
-    CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
+    CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
         // printf("it.norm: %f, %f, %f\n", it.shading_norm.x(), it.shading_norm.y(), it.shading_norm.z());
         // printf("out: %f, %f, %f\n", out.x(), out.y(), out.z());
         float dot_val = it.shading_norm.dot(out);
         return max(it.shading_norm.dot(out), 0.f) * M_1_Pi;
     }
 
-    CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
+    CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
         float cos_term = it.shading_norm.dot(out);
         float dot_in  = it.shading_norm.dot(in);
         float same_side = (dot_in > 0) ^ (cos_term > 0);     // should be positive or negative at the same time
         // printf("%f, k_d: %f, %f, %f\n", cosine_term, k_d.x(), k_d.y(), k_d.z());
         return k_d * max(0.f, cos_term) * M_1_Pi * same_side;
     }
 
-    CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
+    CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
         auto local_ray = sample_cosine_hemisphere(std::move(uv), pdf);
         auto out_ray = delocalize_rotate(it.shading_norm, local_ray);
         // throughput *= f / pdf --> k_d * cos / pi / (pdf = cos / pi) == k_d
@@ -98,16 +98,16 @@ public:
 
     CPT_CPU_GPU SpecularBSDF(): BSDF() {}
 
-    CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
+    CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
         return 0.f;
     }
 
-    CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
+    CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
         auto ref_dir = in.advance(it.shading_norm, -2.f * in.dot(it.shading_norm)).normalized();
         return k_s * (out.dot(ref_dir) > 0.99999f);
     }
 
-    CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
+    CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
         // throughput *= f / pdf
         pdf = 1.f;
         throughput *= k_s * (indir.dot(it.shading_norm) < 0);
@@ -125,11 +125,11 @@ public:
 
     CPT_CPU_GPU TranslucentBSDF(): BSDF() {}
 
-    CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& incid) const override {
+    CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& incid) const override {
         return 0.f;
     }
 
-    CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
+    CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
         float dot_normal = in.dot(it.shading_norm);
         // at least according to pbrt-v3, ni / nr is computed as the following (using shading normal)
         // see https://computergraphics.stackexchange.com/questions/13540/shading-normal-and-geometric-normal-for-refractive-surface-rendering
@@ -143,7 +143,7 @@ public:
         return k_s * (reflc_dot | refra_dot);
     }
 
-    CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
+    CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
         float dot_normal = indir.dot(it.shading_norm);
         // at least according to pbrt-v3, ni / nr is computed as the following (using shading normal)
         // see https://computergraphics.stackexchange.com/questions/13540/shading-normal-and-geometric-normal-for-refractive-surface-rendering
@@ -160,11 +160,11 @@ public:
         return ret_dir;
     }
 
-    CPT_CPU_GPU_INLINE static bool is_total_reflection(float dot_normal, float ni, float nr) {
+    CPT_GPU_INLINE static bool is_total_reflection(float dot_normal, float ni, float nr) {
         return (1.f - (ni * ni) / (nr * nr) * (1.f - dot_normal * dot_normal)) < 0.f;
     }
 
-    CPT_CPU_GPU static Vec3 snell_refraction(const Vec3& incid, const Vec3& normal, float& cos_r2, float dot_n, float ni, float nr) {
+    CPT_GPU static Vec3 snell_refraction(const Vec3& incid, const Vec3& normal, float& cos_r2, float dot_n, float ni, float nr) {
         /* Refraction vector by Snell's Law, note that an extra flag will be returned */
         float ratio = ni / nr;
         cos_r2 = 1.f - (ratio * ratio) * (1. - dot_n * dot_n);        // refraction angle cosine
@@ -173,7 +173,7 @@ public:
         return (ratio * incid - ratio * dot_n * normal + sgn(dot_n) * sqrtf(fabsf(cos_r2)) * normal).normalized() * (cos_r2 > 0.f);
     }
 
-    CPT_CPU_GPU static float fresnel_equation(float n_in, float n_out, float cos_inc, float cos_ref) {
+    CPT_GPU static float fresnel_equation(float n_in, float n_out, float cos_inc, float cos_ref) {
         /**
             Fresnel Equation for calculating specular ratio
             Since Schlick's Approximation is not clear about n1->n2, n2->n1 (different) effects

diff --git a/src/core/object.cuh b/src/core/object.cuh
@@ -18,7 +18,7 @@ public:
     float inv_area;         // inverse area
     uint8_t emitter_id;     // index to the emitter, 0xff means not an emitter
 public:
-    CPT_CPU_GPU_INLINE bool intersect(const Ray& ray, float& t_near) const noexcept {
+    CPT_GPU_INLINE bool intersect(const Ray& ray, float& t_near) const noexcept {
         return _aabb.intersect(ray, t_near);
     }
 

diff --git a/src/core/primitives.cuh b/src/core/primitives.cuh
@@ -12,14 +12,8 @@
 #include "core/ray.cuh"
 #include "core/interaction.cuh"
 
-using SharedVec3Ptr = Vec3 (*)[32];
-using SharedVec2Ptr = Vec2 (*)[32];
-using ConstSharedVec3Ptr = const Vec3 (*)[32];
-using ConstSharedVec2Ptr = const Vec2 (*)[32];
-
 // #define TRIANGLE_ONLY
 
-// All static
 class Primitive {
 private:
     CPT_GPU_INLINE static float intersect_sphere(

diff --git a/src/core/so3.cuh b/src/core/so3.cuh
@@ -192,8 +192,8 @@ public:
         return output;
     }
 
-    friend CPT_CPU_GPU SO3 rotation_between(Vec3&& from, const Vec3& to);
-    friend CPT_CPU_GPU SO3 rotation_local_to_world(const Vec3& to);
+    friend CPT_GPU SO3 rotation_between(Vec3&& from, const Vec3& to);
+    friend CPT_GPU SO3 rotation_local_to_world(const Vec3& to);
 };
 
 CPT_CPU_GPU_INLINE SO3 skew_symmetry(const Vec3& v) {
@@ -213,7 +213,7 @@ CPT_CPU_GPU_INLINE SO3 vec_mul(const Vec3& v1, const Vec3& v2) {
 }
 
 // This can be improved (maybe not, Rodrigues tranformation is already good enough)
-CPT_CPU_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
+CPT_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
     auto axis = from.cross(to);
     float cos_theta = from.dot(to);
     SO3 R = SO3::diag(cos_theta);
@@ -225,7 +225,7 @@ CPT_CPU_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
     return R;
 }
 
-CPT_CPU_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
+CPT_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
     auto axis = Vec3(-to.y(), to.x(), 0);
     SO3 R = SO3::diag(to.z());
     if (abs(to.z()) < 1.f - 1e-5f) {
@@ -237,12 +237,12 @@ CPT_CPU_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
 }
 
 CONDITION_TEMPLATE(VecType, Vec3)
-CPT_CPU_GPU_INLINE Vec3 delocalize_rotate(VecType&& anchor, const Vec3& to, const Vec3& input) {
+CPT_GPU_INLINE Vec3 delocalize_rotate(VecType&& anchor, const Vec3& to, const Vec3& input) {
     SO3 R = rotation_between(std::move(anchor), to);
     return R.rotate(input);
 }   
 
 // Specialized, when the anchor is (0, 0, 1)
-CPT_CPU_GPU_INLINE Vec3 delocalize_rotate(const Vec3& to, const Vec3& input) {
+CPT_GPU_INLINE Vec3 delocalize_rotate(const Vec3& to, const Vec3& input) {
     return rotation_local_to_world(to).rotate(input);
 }   
diff --git a/src/core/vec2.cuh b/src/core/vec2.cuh
@@ -95,17 +95,17 @@ public:
         return *this;
     }
 
-    CPT_CPU_GPU_INLINE
-    Vec2 normalized() const { return *this * rsqrtf(length2()); }
+    CPT_GPU_INLINE
+    Vec2 normalized() const { return *this * rhypotf(_data.x, _data.y); }
 
-    CPT_CPU_GPU_INLINE
-    void normalize() { this->operator*=(rsqrtf(length2())); }
+    CPT_GPU_INLINE
+    void normalize() { this->operator*=(rhypotf(_data.x, _data.y)); }
 
     CPT_CPU_GPU_INLINE
     float length2() const { return _data.x * _data.x + _data.y * _data.y; }
 
-    CPT_CPU_GPU_INLINE
-    float length() const { return sqrt(length2()); }
+    CPT_GPU_INLINE
+    float length() const { return hypotf(_data.x, _data.y); }
 
     CONDITION_TEMPLATE(VecType, Vec2)
     CPT_CPU_GPU_INLINE

diff --git a/src/core/vec3.cuh b/src/core/vec3.cuh
@@ -139,17 +139,28 @@ public:
         return Vec3(fmaf(d.x(), t, _data.x), fmaf(d.y(), t, _data.y), fmaf(d.z(), t, _data.z));
     } 
 
-    CPT_CPU_GPU_INLINE
-    Vec3 normalized() const { return *this * rsqrtf(length2()); }
+    // ============== Specialized version using CUDA math function ===============
+    CPT_GPU_INLINE
+    Vec3 normalized() const { return *this * rnorm3df(_data.x, _data.y, _data.z); }
 
-    CPT_CPU_GPU_INLINE
-    void normalize() { this->operator*=(rsqrtf(length2())); }
+    CPT_GPU_INLINE
+    void normalize() { this->operator*=(rnorm3df(_data.x, _data.y, _data.z)); }
+
+    CPT_GPU_INLINE
+    float length() const { return norm3df(_data.x, _data.y, _data.z); }
+    // ============== Specialized version using CUDA math function ===============
 
     CPT_CPU_GPU_INLINE
     float length2() const { return fmaf(_data.x, _data.x, fmaf(_data.y, _data.y, _data.z * _data.z)); }
 
     CPT_CPU_GPU_INLINE
-    float length() const { return sqrt(length2()); }
+    Vec3 normalized_h() const { return *this * rsqrtf(length2()); }
+
+    CPT_CPU_GPU_INLINE
+    void normalize_h() { this->operator*=(rsqrtf(length2())); }
+
+    CPT_CPU_GPU_INLINE
+    float length_h() const { return sqrtf(length2()); }
 
     CONDITION_TEMPLATE(VecType, Vec3)
     CPT_CPU_GPU_INLINE
@@ -195,6 +206,8 @@ public:
     CPT_CPU_GPU_INLINE
     float min_elem() const noexcept { return fminf(_data.x, fminf(_data.y, _data.z)); }
 
+    CPT_GPU_INLINE Vec3 rcp() const noexcept { return Vec3(__frcp_rn(_data.x), __frcp_rn(_data.y), __frcp_rn(_data.z)); }
+
     CPT_CPU_GPU_INLINE operator float3() const {return _data;}
 };
 
@@ -207,5 +220,5 @@ CPT_CPU_GPU_INLINE
 Vec3 operator*(float b, VecType&& v) noexcept { return Vec3(v.x() * b, v.y() * b, v.z() * b); }
 
 CONDITION_TEMPLATE(VecType, Vec3)
-CPT_CPU_GPU_INLINE
-Vec3 operator/(float b, VecType&& v) noexcept { return Vec3(b / v.x(), b / v.y(), b / v.z()); }
+CPT_GPU_INLINE
+Vec3 operator/(float b, VecType&& v) noexcept { return Vec3(b * __frcp_rn(v.x()), b * __frcp_rn(v.y()), b * __frcp_rn(v.z())); }
diff --git a/src/impl/camera_model.cu b/src/impl/camera_model.cu
@@ -38,9 +38,9 @@ CPT_CPU_GPU DeviceCamera::DeviceCamera(
     const Vec3& from, const Vec3& lookat, float fov, 
     float w, float h, float hsign, float vsign, Vec3 up
 ): t(from), inv_focal(1.f / fov2focal(fov, w)), _hw(w * 0.5f), _hh(h * 0.5f), signs(hsign, vsign), use_orthogonal(false) {
-    Vec3 forward = (lookat - from).normalized();
-    up.normalize();
-    Vec3 right = up.cross(forward).normalized();
+    Vec3 forward = (lookat - from).normalized_h();
+    up.normalize_h();
+    Vec3 right = up.cross(forward).normalized_h();
     R = SO3(right, up, forward, false);
 }
 
@@ -49,11 +49,11 @@ CPT_CPU void DeviceCamera::rotate(float yaw, float pitch) {
             quat_pit = Quaternion::angleAxis(pitch, Vec3(-signs.y(), 0, 0));
     SO3 rot = SO3::from_quat(quat_yaw * quat_pit);
     R = R * rot;
-    Vec3 forward = R.col(2).normalized(),
+    Vec3 forward = R.col(2).normalized_h(),
          right   = R.col(0);
     right.y() = 0;
     right *= 1.f / sqrtf(right.x() * right.x() + right.z() * right.z());
-    Vec3 up = -right.cross(forward).normalized();
+    Vec3 up = -right.cross(forward).normalized_h();
     R = SO3(right, up, forward, false);
 }
 

diff --git a/src/impl/object.cu b/src/impl/object.cu
@@ -23,7 +23,7 @@ CPT_CPU_GPU void ObjInfo::setup(const ArrayType<Vec3>& prims, bool is_polygon) {
             _aabb.maxi.maximized(prims.z(i));
             _aabb.mini -= AABB_EPS;
             _aabb.maxi += AABB_EPS;
-            inv_area += (prims.y(i) - prims.x(i)).cross(prims.z(i) - prims.x(i)).length();
+            inv_area += (prims.y(i) - prims.x(i)).cross(prims.z(i) - prims.x(i)).length_h();
         } else {
             _aabb.mini = prims.x(i) - prims.y(i).x();
             _aabb.maxi = prims.x(i) + prims.y(i).x();
@@ -48,7 +48,7 @@ CPT_CPU void ObjInfo::setup(const std::array<std::vector<Vec3>, 3>& prims, bool
             _aabb.maxi.maximized(prims[2][i]);
             _aabb.mini -= AABB_EPS;
             _aabb.maxi += AABB_EPS;
-            inv_area += (prims[1][i] - prims[0][i]).cross(prims[2][i] - prims[0][i]).length();
+            inv_area += (prims[1][i] - prims[0][i]).cross(prims[2][i] - prims[0][i]).length_h();
         } else {
             _aabb.mini = prims[0][i] - prims[1][i].x();
             _aabb.maxi = prims[0][i] + prims[1][i].x();

diff --git a/src/impl/scene.cu b/src/impl/scene.cu
@@ -338,7 +338,7 @@ void parseObjShape(
             if (!has_normal) {      // compute normals ourselves
                 printf("Normal vector not found in '%s' primitive %llu, computing yet normal direction is not guaranteed.\n", name.c_str(), i);
                 Vec3 diff = verts_list[1][i] - verts_list[0][i];
-                Vec3 normal = diff.cross(verts_list[2][i] - verts_list[0][i]).normalized();
+                Vec3 normal = diff.cross(verts_list[2][i] - verts_list[0][i]).normalized_h();
                 for (int j = 0; j < 3; j++) {
                     norms_list[j].push_back(normal);
                 }

diff --git a/src/pt_impl/megakernel_lt.cu b/src/pt_impl/megakernel_lt.cu
@@ -43,7 +43,7 @@ CPT_KERNEL void render_lt_kernel(
     const cudaTextureObject_t node_backs,
     ConstF4Ptr cached_nodes,
     DeviceImage image,
-    float* output_buffer,
+    float* __restrict__ output_buffer,
     int num_prims,
     int num_objects,
     int num_emitter,
@@ -195,7 +195,7 @@ template CPT_KERNEL void render_lt_kernel<true>(
     const cudaTextureObject_t node_backs,
     ConstF4Ptr cached_nodes,
     DeviceImage image,
-    float* output_buffer,
+    float* __restrict__ output_buffer,
     int num_prims,
     int num_objects,
     int num_emitter,
@@ -220,7 +220,7 @@ template CPT_KERNEL void render_lt_kernel<false>(
     const cudaTextureObject_t node_backs,
     ConstF4Ptr cached_nodes,
     DeviceImage image,
-    float* output_buffer,
+    float* __restrict__ output_buffer,
     int num_prims,
     int num_objects,
     int num_emitter,