From ad31b9d2289582bafc0e07f3465f070de8dd1134 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20M=C3=BCller?= <thomas94@gmx.net>
Date: Mon, 14 Mar 2022 16:29:56 +0100
Subject: [PATCH] Avoid redundant computation due to auto+Eigen

---
 .../common_device.cuh                         | 20 +++++++---------
 src/testbed.cu                                |  4 ++--
 src/testbed_image.cu                          | 10 ++++----
 src/testbed_nerf.cu                           | 24 +++++++------------
 src/testbed_volume.cu                         |  2 +-
 5 files changed, 26 insertions(+), 34 deletions(-)
diff --git a/include/neural-graphics-primitives/common_device.cuh b/include/neural-graphics-primitives/common_device.cuh
index 0577880c5..458294453 100644
--- a/include/neural-graphics-primitives/common_device.cuh
+++ b/include/neural-graphics-primitives/common_device.cuh
@@ -77,10 +77,10 @@ inline __host__ __device__ Eigen::Array3f linear_to_srgb_derivative(const Eigen:
 
 template <uint32_t N_DIMS, typename T>
 __host__ __device__ Eigen::Matrix<float, N_DIMS, 1> read_image(const T* __restrict__ data, const Eigen::Vector2i& resolution, const Eigen::Vector2f& pos) {
-	auto pos_float = Eigen::Vector2f{pos.x() * (float)(resolution.x()-1), pos.y() * (float)(resolution.y()-1)};
-	Eigen::Vector2i texel = pos_float.cast<int>();
+	const Eigen::Vector2f pos_float = Eigen::Vector2f{pos.x() * (float)(resolution.x()-1), pos.y() * (float)(resolution.y()-1)};
+	const Eigen::Vector2i texel = pos_float.cast<int>();
 
-	auto weight = pos_float - texel.cast<float>();
+	const Eigen::Vector2f weight = pos_float - texel.cast<float>();
 
 	auto read_val = [&](Eigen::Vector2i pos) {
 		pos.x() = std::max(std::min(pos.x(), resolution.x()-1), 0);
@@ -100,22 +100,20 @@ __host__ __device__ Eigen::Matrix<float, N_DIMS, 1> read_image(const T* __restri
 		return result;
 	};
 
-	auto result = (
+	return (
 		(1 - weight.x()) * (1 - weight.y()) * read_val({texel.x(), texel.y()}) +
 		(weight.x()) * (1 - weight.y()) * read_val({texel.x()+1, texel.y()}) +
 		(1 - weight.x()) * (weight.y()) * read_val({texel.x(), texel.y()+1}) +
 		(weight.x()) * (weight.y()) * read_val({texel.x()+1, texel.y()+1})
 	);
-
-	return result;
 }
 
 template <uint32_t N_DIMS, typename T>
 __device__ void deposit_image_gradient(const Eigen::Matrix<float, N_DIMS, 1>& value, T* __restrict__ gradient, T* __restrict__ gradient_weight, const Eigen::Vector2i& resolution, const Eigen::Vector2f& pos) {
-	auto pos_float = Eigen::Vector2f{pos.x() * (resolution.x()-1), pos.y() * (resolution.y()-1)};
-	Eigen::Vector2i texel = pos_float.cast<int>();
+	const Eigen::Vector2f pos_float = Eigen::Vector2f{pos.x() * (resolution.x()-1), pos.y() * (resolution.y()-1)};
+	const Eigen::Vector2i texel = pos_float.cast<int>();
 
-	auto weight = pos_float - texel.cast<float>();
+	const Eigen::Vector2f weight = pos_float - texel.cast<float>();
 
 	auto deposit_val = [&](const Eigen::Matrix<float, N_DIMS, 1>& value, T weight, Eigen::Vector2i pos) {
 		pos.x() = std::max(std::min(pos.x(), resolution.x()-1), 0);
@@ -210,7 +208,7 @@ inline __host__ __device__ Ray pixel_to_ray_pinhole(
 	float focus_z = 1.0f,
 	float dof = 0.0f
 ) {
-	auto uv = pixel.cast<float>().cwiseQuotient(resolution.cast<float>());
+	const Eigen::Vector2f uv = pixel.cast<float>().cwiseQuotient(resolution.cast<float>());
 
 	Eigen::Vector3f dir = {
 		(uv.x() - screen_center.x()) * (float)resolution.x() / focal_length.x(),
@@ -258,7 +256,7 @@ inline __host__ __device__ Ray pixel_to_ray(
 	const Eigen::Vector2i distortion_resolution = Eigen::Vector2i::Zero()
 ) {
 	Eigen::Vector2f offset = ld_random_pixel_offset(snap_to_pixel_centers ? 0 : spp, pixel.x(), pixel.y());
-	auto uv = (pixel.cast<float>() + offset).cwiseQuotient(resolution.cast<float>());
+	Eigen::Vector2f uv = (pixel.cast<float>() + offset).cwiseQuotient(resolution.cast<float>());
 
 	Eigen::Vector3f dir;
 	if (camera_distortion.mode == ECameraDistortionMode::FTheta) {
diff --git a/src/testbed.cu b/src/testbed.cu
index f12d8df53..5383ad7e8 100644
--- a/src/testbed.cu
+++ b/src/testbed.cu
@@ -478,7 +478,7 @@ void Testbed::imgui() {
 		if (ImGui::SliderFloat("Crop size", &render_diam, 0.1f, max_diam, "%.3f", ImGuiSliderFlags_Logarithmic | ImGuiSliderFlags_NoRoundToFormat)) {
 			accum_reset = true;
 			if (old_render_diam > 0.f && render_diam > 0.f) {
-				auto center = (m_render_aabb.max + m_render_aabb.min) * 0.5f;
+				const Vector3f center = (m_render_aabb.max + m_render_aabb.min) * 0.5f;
 				float scale = render_diam / old_render_diam;
 				m_render_aabb.max = ((m_render_aabb.max-center) * scale + center).cwiseMin(m_aabb.max);
 				m_render_aabb.min = ((m_render_aabb.min-center) * scale + center).cwiseMax(m_aabb.min);
@@ -859,7 +859,7 @@ void Testbed::draw_visualizations(const Matrix<float, 3, 4>& camera_matrix) {
 
 		float zscale = 1.0f / focal[m_fov_axis];
 		float xyscale = (float)m_window_res[m_fov_axis];
-		auto screen_center = render_screen_center();
+		Vector2f screen_center = render_screen_center();
 		view2proj <<
 			xyscale, 0,       (float)m_window_res.x()*screen_center.x()*zscale, 0,
 			0,       xyscale, (float)m_window_res.y()*screen_center.y()*zscale, 0,
diff --git a/src/testbed_image.cu b/src/testbed_image.cu
index ec71f0cb2..09e729075 100644
--- a/src/testbed_image.cu
+++ b/src/testbed_image.cu
@@ -152,7 +152,7 @@ __global__ void shade_kernel_image(Vector2i resolution, const Vector2f* __restri
 
 	uint32_t idx = x + resolution.x() * y;
 
-	auto uv = positions[idx];
+	const Vector2f uv = positions[idx];
 	if (uv.x() < 0.0f || uv.x() > 1.0f || uv.y() < 0.0f || uv.y() > 1.0f) {
 		frame_buffer[idx] = Array4f::Zero();
 		return;
@@ -186,7 +186,7 @@ __global__ void eval_image_kernel_and_snap(uint32_t n_elements, const T* __restr
 
 	auto read_val = [&](int x, int y) {
 		auto val = ((tcnn::vector_t<T, 4>*)texture)[y * resolution.x() + x];
-		auto result = Array4f(val[0], val[1], val[2], val[3]);
+		Array4f result{val[0], val[1], val[2], val[3]};
 		if (!linear_colors) {
 			result.head<3>() = linear_to_srgb(result.head<3>());
 		}
@@ -202,10 +202,10 @@ __global__ void eval_image_kernel_and_snap(uint32_t n_elements, const T* __restr
 	} else {
 		pos = (pos.cwiseProduct(resolution.cast<float>()) - Vector2f::Constant(0.5f)).cwiseMax(0.0f).cwiseMin(resolution.cast<float>() - Vector2f::Constant(1.0f + 1e-4f));
 
-		Vector2i pos_int = pos.cast<int>();
-		auto weight = pos - pos_int.cast<float>();
+		const Vector2i pos_int = pos.cast<int>();
+		const Vector2f weight = pos - pos_int.cast<float>();
 
-		Vector2i idx = pos_int.cwiseMin(resolution - Vector2i::Constant(2)).cwiseMax(0);
+		const Vector2i idx = pos_int.cwiseMin(resolution - Vector2i::Constant(2)).cwiseMax(0);
 
 		val =
 			(1 - weight.x()) * (1 - weight.y()) * read_val(idx.x(), idx.y()) +
diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu
index 79ec5ac24..7d51aee7f 100644
--- a/src/testbed_nerf.cu
+++ b/src/testbed_nerf.cu
@@ -1142,10 +1142,10 @@ __device__ LossAndGradient loss_and_gradient(const Vector3f& target, const Vecto
 inline __device__ Array3f composit_and_lerp(Vector2f pos, const Vector2i& resolution, uint32_t img, const __half* training_images, const Array3f& background_color, const Array3f& exposure_scale = Array3f::Ones()) {
 	pos = (pos.cwiseProduct(resolution.cast<float>()) - Vector2f::Constant(0.5f)).cwiseMax(0.0f).cwiseMin(resolution.cast<float>() - Vector2f::Constant(1.0f + 1e-4f));
 
-	Vector2i pos_int = pos.cast<int>();
-	auto weight = pos - pos_int.cast<float>();
+	const Vector2i pos_int = pos.cast<int>();
+	const Vector2f weight = pos - pos_int.cast<float>();
 
-	Vector2i idx = pos_int.cwiseMin(resolution - Vector2i::Constant(2)).cwiseMax(0);
+	const Vector2i idx = pos_int.cwiseMin(resolution - Vector2i::Constant(2)).cwiseMax(0);
 
 	auto read_val = [&](const Vector2i& p) {
 		__half val[4];
@@ -1153,38 +1153,32 @@ inline __device__ Array3f composit_and_lerp(Vector2f pos, const Vector2i& resolu
 		return Array3f{val[0], val[1], val[2]} * exposure_scale + background_color * (1.0f - (float)val[3]);
 	};
 
-	Array3f result = (
+	return (
 		(1 - weight.x()) * (1 - weight.y()) * read_val({idx.x(), idx.y()}) +
 		(weight.x()) * (1 - weight.y()) * read_val({idx.x()+1, idx.y()}) +
 		(1 - weight.x()) * (weight.y()) * read_val({idx.x(), idx.y()+1}) +
 		(weight.x()) * (weight.y()) * read_val({idx.x()+1, idx.y()+1})
 	);
-
-	return result;
 }
 
 inline __device__ Array3f composit(Vector2f pos, const Vector2i& resolution, uint32_t img, const __half* training_images, const Array3f& background_color, const Array3f& exposure_scale = Array3f::Ones()) {
-	Vector2i idx = image_pos(pos, resolution);
-
 	auto read_val = [&](const Vector2i& p) {
 		__half val[4];
 		*(uint64_t*)&val[0] = ((uint64_t*)training_images)[pixel_idx(p, resolution, img)];
 		return Array3f{val[0], val[1], val[2]} * exposure_scale + background_color * (1.0f - (float)val[3]);
 	};
 
-	return read_val(idx);
+	return read_val(image_pos(pos, resolution));
 }
 
 inline __device__ Array4f read_rgba(Vector2f pos, const Vector2i& resolution, uint32_t img, const __half* training_images) {
-	Vector2i idx = image_pos(pos, resolution);
-
 	auto read_val = [&](const Vector2i& p) {
 		__half val[4];
 		*(uint64_t*)&val[0] = ((uint64_t*)training_images)[pixel_idx(p, resolution, img)];
 		return Array4f{val[0], val[1], val[2], val[3]};
 	};
 
-	return read_val(idx);
+	return read_val(image_pos(pos, resolution));
 }
 
 __global__ void compute_loss_kernel_train_nerf(
@@ -1363,9 +1357,9 @@ __global__ void compute_loss_kernel_train_nerf(
 	}
 
 	if (error_map) {
-		Vector2f pos = (xy.cwiseProduct(error_map_res.cast<float>()) - Vector2f::Constant(0.5f)).cwiseMax(0.0f).cwiseMin(error_map_res.cast<float>() - Vector2f::Constant(1.0f + 1e-4f));
-		Vector2i pos_int = pos.cast<int>();
-		auto weight = pos - pos_int.cast<float>();
+		const Vector2f pos = (xy.cwiseProduct(error_map_res.cast<float>()) - Vector2f::Constant(0.5f)).cwiseMax(0.0f).cwiseMin(error_map_res.cast<float>() - Vector2f::Constant(1.0f + 1e-4f));
+		const Vector2i pos_int = pos.cast<int>();
+		const Vector2f weight = pos - pos_int.cast<float>();
 
 		Vector2i idx = pos_int.cwiseMin(resolution - Vector2i::Constant(2)).cwiseMax(0);
 
diff --git a/src/testbed_volume.cu b/src/testbed_volume.cu
index d74b26883..23faa984f 100644
--- a/src/testbed_volume.cu
+++ b/src/testbed_volume.cu
@@ -419,7 +419,7 @@ void Testbed::render_volume(CudaRenderBuffer& render_buffer,
 	m_volume.hit_counter.enlarge(2);
 	m_volume.hit_counter.memset(0);
 
-	auto sky_col = m_background_color.head<3>();
+	Array3f sky_col = m_background_color.head<3>();
 
 	const dim3 threads = { 16, 8, 1 };
 	const dim3 blocks = { div_round_up((uint32_t)res.x(), threads.x), div_round_up((uint32_t)res.y(), threads.y), 1 };