Skip to content

Commit

Permalink
Slight optimizations; performance is now stable.
Browse files Browse the repository at this point in the history
The remaining problems are hard to resolve: thread divergence during BVH traversal, local memory accesses, and long-scoreboard stalls in triangle intersection.
  • Loading branch information
Enigmatisms committed Oct 12, 2024
1 parent c0a5798 commit 1fa91a6
Show file tree
Hide file tree
Showing 17 changed files with 83 additions and 77 deletions.
4 changes: 2 additions & 2 deletions scene/xml/whiskey.xml
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,11 @@
<ref type="material" id="glass"/>
</shape>

<!-- <shape type="sphere">
<shape type="sphere">
<point name="center" x="-0.35" y="0.02" z="0.05"/>
<float name="radius" value="0.02"/>
<ref type="material" id="glass"/>
</shape> -->
</shape>

<shape type="obj">
<string name="filename" value="../meshes/whiskey/wine.obj"/>
Expand Down
13 changes: 6 additions & 7 deletions src/core/aabb.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,15 @@ public:
// Center of the box: the midpoint between the `mini` and `maxi` corners.
CPT_CPU_GPU Vec3 centroid() const noexcept {
    Vec3 corner_sum = maxi + mini;
    return corner_sum * 0.5f;
}
// Per-axis extent of the box, i.e. `maxi - mini`.
CPT_CPU_GPU Vec3 range() const noexcept {
    Vec3 extent = maxi - mini;
    return extent;
}

CPT_CPU_GPU bool intersect(const Ray& ray, float& t_near) const {
auto invDir = 1.0f / ray.d;
// long scoreboard
auto t1s = (mini - ray.o) * invDir;
CPT_GPU bool intersect(const Ray& ray, float& t_near) const {
auto invDir = ray.d.rcp();
auto t1s = (mini - ray.o) * invDir; // long scoreboard
auto t2s = (maxi - ray.o) * invDir;

float tmin = t1s.minimize(t2s).max_elem();
float tmax = t1s.maximize(t2s).min_elem();
t_near = tmin;
return tmax > tmin && tmax > 0;
return (tmax > tmin) && (tmax > 0); // local memory access problem
}

CONDITION_TEMPLATE(AABBType, AABB)
Expand Down Expand Up @@ -104,5 +103,5 @@ struct AABBWrapper {
float4 _padding; // padding is here to avoid bank conflict
};

using ConstAABBPtr = const AABB* const;
using ConstAABBWPtr = const AABBWrapper* const;
using ConstAABBPtr = const AABB* const __restrict__;
using ConstAABBWPtr = const AABBWrapper* const __restrict__;
8 changes: 4 additions & 4 deletions src/core/aos.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ public:
#undef INDEX_Y
#undef INDEX_Z

using ConstF4Ptr = const float4* const;
using ConstVertPtr = const PrecomputedArray* const;
using ConstNormPtr = const ArrayType<Vec3>* const;
using ConstUVPtr = const ArrayType<Vec2>* const;
using ConstF4Ptr = const float4* const __restrict__;
using ConstVertPtr = const PrecomputedArray* const __restrict__;
using ConstNormPtr = const ArrayType<Vec3>* const __restrict__;
using ConstUVPtr = const ArrayType<Vec2>* const __restrict__;
32 changes: 16 additions & 16 deletions src/core/bsdf.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ public:
// Stores `v` in this BSDF's `ex_tex_id` member.
CPT_GPU void set_ex_id(int v) noexcept {
    ex_tex_id = v;
}
// Stores `v` in `bsdf_flag`, the bit mask that require_lobe() tests against.
CPT_GPU void set_lobe(int v) noexcept {
    bsdf_flag = v;
}

CPT_CPU_GPU virtual float pdf(const Interaction& it, const Vec3& out, const Vec3& in) const = 0;
CPT_GPU virtual float pdf(const Interaction& it, const Vec3& out, const Vec3& in) const = 0;

CPT_CPU_GPU virtual Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const = 0;
CPT_GPU virtual Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const = 0;

CPT_CPU_GPU virtual Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float&, Vec2&& uv) const = 0;
CPT_GPU virtual Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float&, Vec2&& uv) const = 0;

CPT_CPU_GPU_INLINE bool require_lobe(BSDFFlag flags) const noexcept {
// Returns true when the lobe mask `bsdf_flag` shares at least one set bit with `flags`.
// The masked value is compared with `!= 0` rather than `> 0`: `bsdf_flag` is a signed
// int, so a flag occupying the sign bit would produce a negative value and be missed
// by a `> 0` test. For every mask that does not use the sign bit the result is unchanged.
CPT_GPU_INLINE bool require_lobe(BSDFFlag flags) const noexcept {
    return (bsdf_flag & static_cast<int>(flags)) != 0;
}
};
Expand All @@ -64,22 +64,22 @@ public:

CPT_CPU_GPU LambertianBSDF(): BSDF() {}

CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
// Density used for the Lambertian lobe: max(n . out, 0) * M_1_Pi, where n is the
// shading normal stored in `it`. The incident direction is not used.
// NOTE(review): M_1_Pi is presumably 1/pi, which makes this the cosine-weighted
// hemisphere density -- confirm against its definition.
CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
    // Evaluate the cosine once and reuse it; the previous version computed the same
    // dot product twice and left one copy in an unused local.
    const float cos_term = it.shading_norm.dot(out);
    return max(cos_term, 0.f) * M_1_Pi;
}

CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
// Evaluates the Lambertian lobe: k_d * max(0, n . out) * M_1_Pi, multiplied by a 0/1
// side test. `is_mi` is accepted for interface compatibility but not read here.
CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
float cos_term = it.shading_norm.dot(out);
float dot_in = it.shading_norm.dot(in);
// XOR of the two sign tests: 1 when exactly one of (n . in), (n . out) is positive, 0 otherwise.
// NOTE(review): presumably `in` points toward the surface and `out` away from it, so
// opposite signs mean both directions lie on the same side of the surface -- confirm.
float same_side = (dot_in > 0) ^ (cos_term > 0);
// printf("%f, k_d: %f, %f, %f\n", cos_term, k_d.x(), k_d.y(), k_d.z());
return k_d * max(0.f, cos_term) * M_1_Pi * same_side;
}

CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
auto local_ray = sample_cosine_hemisphere(std::move(uv), pdf);
auto out_ray = delocalize_rotate(it.shading_norm, local_ray);
// throughput *= f / pdf --> k_d * cos / pi / (pdf = cos / pi) == k_d
Expand All @@ -98,16 +98,16 @@ public:

CPT_CPU_GPU SpecularBSDF(): BSDF() {}

CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
// Always returns 0, regardless of the arguments.
// NOTE(review): presumably because a perfect mirror has no finite density over
// directions (delta lobe) -- confirm with the integrator's handling of specular BSDFs.
CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& /* in */) const override {
return 0.f;
}

CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
// Mirror response: yields k_s when `out` coincides (within a tight threshold) with the
// reflection of `in` about the shading normal, and zero otherwise. `is_mi` is not read.
CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
    // Reflect `in` about the shading normal: in - 2 (in . n) n, then renormalize.
    const float reflect_scale = -2.f * in.dot(it.shading_norm);
    auto mirrored = in.advance(it.shading_norm, reflect_scale).normalized();
    // The comparison evaluates to 0 or 1 and gates k_s.
    return k_s * (out.dot(mirrored) > 0.99999f);
}

CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
// throughput *= f / pdf
pdf = 1.f;
throughput *= k_s * (indir.dot(it.shading_norm) < 0);
Expand All @@ -125,11 +125,11 @@ public:

CPT_CPU_GPU TranslucentBSDF(): BSDF() {}

CPT_CPU_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& incid) const override {
// Always returns 0; none of the arguments are read.
// NOTE(review): presumably because the translucent lobe is treated as a delta
// distribution with no finite density -- confirm against the integrator.
CPT_GPU float pdf(const Interaction& it, const Vec3& out, const Vec3& incid) const override {
return 0.f;
}

CPT_CPU_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
CPT_GPU Vec4 eval(const Interaction& it, const Vec3& out, const Vec3& in, bool is_mi = false) const override {
float dot_normal = in.dot(it.shading_norm);
// at least according to pbrt-v3, ni / nr is computed as the following (using shading normal)
// see https://computergraphics.stackexchange.com/questions/13540/shading-normal-and-geometric-normal-for-refractive-surface-rendering
Expand All @@ -143,7 +143,7 @@ public:
return k_s * (reflc_dot | refra_dot);
}

CPT_CPU_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
CPT_GPU Vec3 sample_dir(const Vec3& indir, const Interaction& it, Vec4& throughput, float& pdf, Vec2&& uv) const override {
float dot_normal = indir.dot(it.shading_norm);
// at least according to pbrt-v3, ni / nr is computed as the following (using shading normal)
// see https://computergraphics.stackexchange.com/questions/13540/shading-normal-and-geometric-normal-for-refractive-surface-rendering
Expand All @@ -160,11 +160,11 @@ public:
return ret_dir;
}

CPT_CPU_GPU_INLINE static bool is_total_reflection(float dot_normal, float ni, float nr) {
// Total-internal-reflection test. Evaluates 1 - (ni/nr)^2 * (1 - dot_normal^2), the same
// quantity snell_refraction() stores as the squared refraction cosine, and returns
// true when it is negative (no real refraction angle exists).
//   dot_normal: cosine between the incident direction and the normal
//   ni, nr:     refractive indices on the incident and refracted side
CPT_GPU_INLINE static bool is_total_reflection(float dot_normal, float ni, float nr) {
return (1.f - (ni * ni) / (nr * nr) * (1.f - dot_normal * dot_normal)) < 0.f;
}

CPT_CPU_GPU static Vec3 snell_refraction(const Vec3& incid, const Vec3& normal, float& cos_r2, float dot_n, float ni, float nr) {
CPT_GPU static Vec3 snell_refraction(const Vec3& incid, const Vec3& normal, float& cos_r2, float dot_n, float ni, float nr) {
/* Refraction vector by Snell's Law, note that an extra flag will be returned */
float ratio = ni / nr;
cos_r2 = 1.f - (ratio * ratio) * (1. - dot_n * dot_n); // refraction angle cosine
Expand All @@ -173,7 +173,7 @@ public:
return (ratio * incid - ratio * dot_n * normal + sgn(dot_n) * sqrtf(fabsf(cos_r2)) * normal).normalized() * (cos_r2 > 0.f);
}

CPT_CPU_GPU static float fresnel_equation(float n_in, float n_out, float cos_inc, float cos_ref) {
CPT_GPU static float fresnel_equation(float n_in, float n_out, float cos_inc, float cos_ref) {
/**
Fresnel Equation for calculating specular ratio
Since Schlick's Approximation is not clear about n1->n2, n2->n1 (different) effects
Expand Down
2 changes: 1 addition & 1 deletion src/core/object.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public:
float inv_area; // inverse area
uint8_t emitter_id; // index to the emitter, 0xff means not an emitter
public:
CPT_CPU_GPU_INLINE bool intersect(const Ray& ray, float& t_near) const noexcept {
// Ray test against this object's bounding box: forwards to `_aabb.intersect`, which
// writes its near distance into `t_near` and reports whether the box is hit.
CPT_GPU_INLINE bool intersect(const Ray& ray, float& t_near) const noexcept {
    const bool box_hit = _aabb.intersect(ray, t_near);
    return box_hit;
}

Expand Down
6 changes: 0 additions & 6 deletions src/core/primitives.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,8 @@
#include "core/ray.cuh"
#include "core/interaction.cuh"

using SharedVec3Ptr = Vec3 (*)[32];
using SharedVec2Ptr = Vec2 (*)[32];
using ConstSharedVec3Ptr = const Vec3 (*)[32];
using ConstSharedVec2Ptr = const Vec2 (*)[32];

// #define TRIANGLE_ONLY

// All static
class Primitive {
private:
CPT_GPU_INLINE static float intersect_sphere(
Expand Down
12 changes: 6 additions & 6 deletions src/core/so3.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ public:
return output;
}

friend CPT_CPU_GPU SO3 rotation_between(Vec3&& from, const Vec3& to);
friend CPT_CPU_GPU SO3 rotation_local_to_world(const Vec3& to);
friend CPT_GPU SO3 rotation_between(Vec3&& from, const Vec3& to);
friend CPT_GPU SO3 rotation_local_to_world(const Vec3& to);
};

CPT_CPU_GPU_INLINE SO3 skew_symmetry(const Vec3& v) {
Expand All @@ -213,7 +213,7 @@ CPT_CPU_GPU_INLINE SO3 vec_mul(const Vec3& v1, const Vec3& v2) {
}

// This could be improved (though maybe not: the Rodrigues transformation is already good enough)
CPT_CPU_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
CPT_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
auto axis = from.cross(to);
float cos_theta = from.dot(to);
SO3 R = SO3::diag(cos_theta);
Expand All @@ -225,7 +225,7 @@ CPT_CPU_GPU_INLINE SO3 rotation_between(Vec3&& from, const Vec3& to) {
return R;
}

CPT_CPU_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
CPT_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
auto axis = Vec3(-to.y(), to.x(), 0);
SO3 R = SO3::diag(to.z());
if (abs(to.z()) < 1.f - 1e-5f) {
Expand All @@ -237,12 +237,12 @@ CPT_CPU_GPU_INLINE SO3 rotation_local_to_world(const Vec3& to) {
}

CONDITION_TEMPLATE(VecType, Vec3)
CPT_CPU_GPU_INLINE Vec3 delocalize_rotate(VecType&& anchor, const Vec3& to, const Vec3& input) {
// Rotates `input` by the rotation that rotation_between() builds from `anchor` to `to`.
CPT_GPU_INLINE Vec3 delocalize_rotate(VecType&& anchor, const Vec3& to, const Vec3& input) {
    // `anchor` is passed on as an rvalue because rotation_between() takes its first
    // argument by Vec3&&.
    return rotation_between(std::move(anchor), to).rotate(input);
}

// Specialized, when the anchor is (0, 0, 1)
CPT_CPU_GPU_INLINE Vec3 delocalize_rotate(const Vec3& to, const Vec3& input) {
// Rotates `input` by the rotation returned from rotation_local_to_world(to).
CPT_GPU_INLINE Vec3 delocalize_rotate(const Vec3& to, const Vec3& input) {
    SO3 local_to_world = rotation_local_to_world(to);
    return local_to_world.rotate(input);
}
12 changes: 6 additions & 6 deletions src/core/vec2.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,17 @@ public:
return *this;
}

CPT_CPU_GPU_INLINE
Vec2 normalized() const { return *this * rsqrtf(length2()); }
// Returns this vector scaled by rhypotf(x, y), the CUDA reciprocal-hypotenuse function,
// i.e. a unit-length copy. The vector itself is not modified.
CPT_GPU_INLINE
Vec2 normalized() const {
    const float inv_len = rhypotf(_data.x, _data.y);
    return *this * inv_len;
}

CPT_CPU_GPU_INLINE
void normalize() { this->operator*=(rsqrtf(length2())); }
// Normalizes this vector in place by multiplying it with rhypotf(x, y).
CPT_GPU_INLINE
void normalize() {
    const float inv_len = rhypotf(_data.x, _data.y);
    *this *= inv_len;
}

// Squared Euclidean length, x*x + y*y; no square root is taken.
CPT_CPU_GPU_INLINE
float length2() const { return _data.x * _data.x + _data.y * _data.y; }

CPT_CPU_GPU_INLINE
float length() const { return sqrt(length2()); }
// Euclidean length of the vector, computed with hypotf(x, y).
CPT_GPU_INLINE
float length() const {
    const float len = hypotf(_data.x, _data.y);
    return len;
}

CONDITION_TEMPLATE(VecType, Vec2)
CPT_CPU_GPU_INLINE
Expand Down
27 changes: 20 additions & 7 deletions src/core/vec3.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -139,17 +139,28 @@ public:
return Vec3(fmaf(d.x(), t, _data.x), fmaf(d.y(), t, _data.y), fmaf(d.z(), t, _data.z));
}

CPT_CPU_GPU_INLINE
Vec3 normalized() const { return *this * rsqrtf(length2()); }
// ============== Specialized version using CUDA math function ===============
// Returns this vector scaled by rnorm3df(x, y, z), the CUDA reciprocal 3D norm,
// i.e. a unit-length copy. The vector itself is not modified.
CPT_GPU_INLINE
Vec3 normalized() const {
    const float inv_len = rnorm3df(_data.x, _data.y, _data.z);
    return *this * inv_len;
}

CPT_CPU_GPU_INLINE
void normalize() { this->operator*=(rsqrtf(length2())); }
// Normalizes this vector in place by multiplying it with rnorm3df(x, y, z).
CPT_GPU_INLINE
void normalize() {
    const float inv_len = rnorm3df(_data.x, _data.y, _data.z);
    *this *= inv_len;
}

// Euclidean length of the vector, computed with the CUDA function norm3df(x, y, z).
CPT_GPU_INLINE
float length() const {
    const float len = norm3df(_data.x, _data.y, _data.z);
    return len;
}
// ============== Specialized version using CUDA math function ===============

// Squared Euclidean length x^2 + y^2 + z^2, accumulated with fused multiply-adds.
CPT_CPU_GPU_INLINE
float length2() const {
    const float zz = _data.z * _data.z;
    const float yy_zz = fmaf(_data.y, _data.y, zz);
    return fmaf(_data.x, _data.x, yy_zz);
}

CPT_CPU_GPU_INLINE
float length() const { return sqrt(length2()); }
Vec3 normalized_h() const { return *this * rsqrtf(length2()); }

// In-place normalization for code paths marked host+device: scales this vector by
// rsqrtf(length2()).
CPT_CPU_GPU_INLINE
void normalize_h() {
    const float inv_len = rsqrtf(length2());
    *this *= inv_len;
}

// Euclidean length for code paths marked host+device: sqrtf of length2().
CPT_CPU_GPU_INLINE
float length_h() const {
    const float squared = length2();
    return sqrtf(squared);
}

CONDITION_TEMPLATE(VecType, Vec3)
CPT_CPU_GPU_INLINE
Expand Down Expand Up @@ -195,6 +206,8 @@ public:
// Smallest of the three components, found with two fminf calls.
CPT_CPU_GPU_INLINE
float min_elem() const noexcept {
    const float yz_min = fminf(_data.y, _data.z);
    return fminf(_data.x, yz_min);
}

// Component-wise reciprocal, computed with the CUDA intrinsic __frcp_rn on each of x, y, z.
CPT_GPU_INLINE Vec3 rcp() const noexcept {
    const float inv_x = __frcp_rn(_data.x);
    const float inv_y = __frcp_rn(_data.y);
    const float inv_z = __frcp_rn(_data.z);
    return Vec3(inv_x, inv_y, inv_z);
}

// Implicit conversion to float3: returns the underlying `_data` by value.
CPT_CPU_GPU_INLINE operator float3() const {return _data;}
};

Expand All @@ -207,5 +220,5 @@ CPT_CPU_GPU_INLINE
Vec3 operator*(float b, VecType&& v) noexcept { return Vec3(v.x() * b, v.y() * b, v.z() * b); }

CONDITION_TEMPLATE(VecType, Vec3)
CPT_CPU_GPU_INLINE
Vec3 operator/(float b, VecType&& v) noexcept { return Vec3(b / v.x(), b / v.y(), b / v.z()); }
// Scalar-over-vector division, component-wise: each result component is
// b * __frcp_rn(component), i.e. a multiply by the intrinsic reciprocal rather than a divide.
CPT_GPU_INLINE
Vec3 operator/(float b, VecType&& v) noexcept {
    const float qx = b * __frcp_rn(v.x());
    const float qy = b * __frcp_rn(v.y());
    const float qz = b * __frcp_rn(v.z());
    return Vec3(qx, qy, qz);
}
10 changes: 5 additions & 5 deletions src/impl/camera_model.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ CPT_CPU_GPU DeviceCamera::DeviceCamera(
const Vec3& from, const Vec3& lookat, float fov,
float w, float h, float hsign, float vsign, Vec3 up
): t(from), inv_focal(1.f / fov2focal(fov, w)), _hw(w * 0.5f), _hh(h * 0.5f), signs(hsign, vsign), use_orthogonal(false) {
Vec3 forward = (lookat - from).normalized();
up.normalize();
Vec3 right = up.cross(forward).normalized();
Vec3 forward = (lookat - from).normalized_h();
up.normalize_h();
Vec3 right = up.cross(forward).normalized_h();
R = SO3(right, up, forward, false);
}

Expand All @@ -49,11 +49,11 @@ CPT_CPU void DeviceCamera::rotate(float yaw, float pitch) {
quat_pit = Quaternion::angleAxis(pitch, Vec3(-signs.y(), 0, 0));
SO3 rot = SO3::from_quat(quat_yaw * quat_pit);
R = R * rot;
Vec3 forward = R.col(2).normalized(),
Vec3 forward = R.col(2).normalized_h(),
right = R.col(0);
right.y() = 0;
right *= 1.f / sqrtf(right.x() * right.x() + right.z() * right.z());
Vec3 up = -right.cross(forward).normalized();
Vec3 up = -right.cross(forward).normalized_h();
R = SO3(right, up, forward, false);
}

Expand Down
4 changes: 2 additions & 2 deletions src/impl/object.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ CPT_CPU_GPU void ObjInfo::setup(const ArrayType<Vec3>& prims, bool is_polygon) {
_aabb.maxi.maximized(prims.z(i));
_aabb.mini -= AABB_EPS;
_aabb.maxi += AABB_EPS;
inv_area += (prims.y(i) - prims.x(i)).cross(prims.z(i) - prims.x(i)).length();
inv_area += (prims.y(i) - prims.x(i)).cross(prims.z(i) - prims.x(i)).length_h();
} else {
_aabb.mini = prims.x(i) - prims.y(i).x();
_aabb.maxi = prims.x(i) + prims.y(i).x();
Expand All @@ -48,7 +48,7 @@ CPT_CPU void ObjInfo::setup(const std::array<std::vector<Vec3>, 3>& prims, bool
_aabb.maxi.maximized(prims[2][i]);
_aabb.mini -= AABB_EPS;
_aabb.maxi += AABB_EPS;
inv_area += (prims[1][i] - prims[0][i]).cross(prims[2][i] - prims[0][i]).length();
inv_area += (prims[1][i] - prims[0][i]).cross(prims[2][i] - prims[0][i]).length_h();
} else {
_aabb.mini = prims[0][i] - prims[1][i].x();
_aabb.maxi = prims[0][i] + prims[1][i].x();
Expand Down
2 changes: 1 addition & 1 deletion src/impl/scene.cu
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ void parseObjShape(
if (!has_normal) { // compute normals ourselves
printf("Normal vector not found in '%s' primitive %llu, computing yet normal direction is not guaranteed.\n", name.c_str(), i);
Vec3 diff = verts_list[1][i] - verts_list[0][i];
Vec3 normal = diff.cross(verts_list[2][i] - verts_list[0][i]).normalized();
Vec3 normal = diff.cross(verts_list[2][i] - verts_list[0][i]).normalized_h();
for (int j = 0; j < 3; j++) {
norms_list[j].push_back(normal);
}
Expand Down
6 changes: 3 additions & 3 deletions src/pt_impl/megakernel_lt.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ CPT_KERNEL void render_lt_kernel(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
float* output_buffer,
float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
Expand Down Expand Up @@ -195,7 +195,7 @@ template CPT_KERNEL void render_lt_kernel<true>(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
float* output_buffer,
float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
Expand All @@ -220,7 +220,7 @@ template CPT_KERNEL void render_lt_kernel<false>(
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
DeviceImage image,
float* output_buffer,
float* __restrict__ output_buffer,
int num_prims,
int num_objects,
int num_emitter,
Expand Down
Loading

0 comments on commit 1fa91a6

Please sign in to comment.