
Allow arena_matrix to use move semantics #2928

Merged
merged 73 commits into from
May 31, 2024
Changes from 30 commits
Commits (73 total)
00a9f43
Adds move semantics for arena matrix types
SteveBronder Aug 1, 2023
056e297
use forwarding in sum
SteveBronder Aug 2, 2023
3a6c11e
add docs related to auto dangers with the math library
SteveBronder Aug 2, 2023
fa8a464
add docs related to auto dangers with the math library
SteveBronder Aug 2, 2023
3f07884
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Aug 2, 2023
e5e226b
newline
SteveBronder Aug 2, 2023
d66673b
Merge remote-tracking branch 'refs/remotes/origin/feature/reverse-mod…
SteveBronder Aug 2, 2023
53dc399
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Aug 2, 2023
fcca84d
fix constructor alias bug
SteveBronder Aug 2, 2023
c13855f
Merge remote-tracking branch 'refs/remotes/origin/feature/reverse-mod…
SteveBronder Aug 2, 2023
2f42a0a
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Aug 2, 2023
4189367
fix normal_lpdf templates
SteveBronder Aug 3, 2023
b297df5
Merge remote-tracking branch 'refs/remotes/origin/feature/reverse-mod…
SteveBronder Aug 3, 2023
5b76fc8
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Aug 3, 2023
5fbaf55
fix transpose issues with arena matrix
SteveBronder Aug 4, 2023
1bfd431
Merge remote-tracking branch 'origin/develop' into feature/reverse-mo…
SteveBronder Aug 4, 2023
119099d
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Aug 4, 2023
90b23ab
cleanup after reduce_sum is called in tests
SteveBronder Aug 7, 2023
9ce5c50
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Aug 7, 2023
1871c64
remove tmp sundials files
SteveBronder Aug 7, 2023
42cef50
Merge branch 'feature/reverse-mode-move-semantics' of github.com:stan…
SteveBronder Aug 7, 2023
0ec5253
use agradrev in mix/probs test
SteveBronder Aug 8, 2023
d6b892d
use forwarding in normal_lpdf functions
SteveBronder Aug 17, 2023
984bdf8
Merge commit 'd4eab2773347ca6fbe03d49f70828c08ff248269' into HEAD
yashikno Aug 17, 2023
a85e786
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Aug 17, 2023
1b0504a
fix typo
SteveBronder Aug 17, 2023
4f926cf
Merge remote-tracking branch 'origin/develop' into feature/reverse-mo…
SteveBronder Jan 3, 2024
0d34e03
update docs
SteveBronder Jan 3, 2024
3193ad4
merge from develop
SteveBronder Feb 29, 2024
b37d163
remove double include for hypergeo2f1
SteveBronder Feb 29, 2024
36e0bd3
Merge remote-tracking branch 'origin/develop' into feature/reverse-mo…
SteveBronder Mar 22, 2024
eb6276c
update constructors and assignment operators for arena_matrix
SteveBronder Mar 22, 2024
8acdb6d
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Mar 22, 2024
11da0dd
update get_rows and get_cols for arena_matrix
SteveBronder Mar 22, 2024
8a1f9f0
update docs
SteveBronder Mar 22, 2024
c29860e
update to develop
SteveBronder Apr 2, 2024
0707438
only allow the move operator for arena matrix types if the input type…
SteveBronder Apr 2, 2024
8b96c45
fixes aos csr matrix bug, still debugging soa matrix bug
Apr 12, 2024
0a168c3
Merge remote-tracking branch 'origin/develop' into fix/csr-matrix-tim…
SteveBronder Apr 15, 2024
fec3689
update csr matrix multiply to avoid linker error for windows. Adds to…
SteveBronder Apr 15, 2024
6b8ae15
small fixes
SteveBronder Apr 15, 2024
c83cbfc
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Apr 15, 2024
f594b2c
fix header error
SteveBronder Apr 15, 2024
d1feb19
use static_cast for bool conversion in sparse matrix loops
SteveBronder Apr 15, 2024
33f0825
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Apr 15, 2024
5c5dfc6
update csr_matrix_times_vector w adjoint update. Uncomment tests for …
SteveBronder Apr 18, 2024
b2af1cd
Merge commit '11663a2e79e6dc4286ebf1399573a7048667b1c5' into HEAD
yashikno Apr 18, 2024
b0815c4
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Apr 18, 2024
515d621
Merge remote-tracking branch 'origin/develop' into feature/reverse-mo…
SteveBronder Apr 18, 2024
a604fa4
update header includes for var_test
SteveBronder Apr 18, 2024
6a71cfb
ad require_not_arena_matrix_t
SteveBronder Apr 18, 2024
a2124c1
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Apr 18, 2024
1e906d9
fix definition of require_not_arena_matrix_t
SteveBronder Apr 18, 2024
de7d11e
fix definition of require_not_arena_matrix_t
SteveBronder Apr 18, 2024
d89610e
Merge remote-tracking branch 'origin/feature/reverse-mode-move-semant…
SteveBronder Apr 18, 2024
eee02d8
merge
SteveBronder Apr 18, 2024
c5f983a
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Apr 18, 2024
86a3e83
Merge pull request #3048 from stan-dev/fix/csr-matrix-times-vector
SteveBronder Apr 19, 2024
1f25ef7
Initial updates for winarm64
andrjohns Apr 21, 2024
c9f76db
Fix path error
andrjohns Apr 21, 2024
7a5a009
Update comments
andrjohns Apr 21, 2024
df305d6
Document TBB changes
andrjohns Apr 21, 2024
08d8a22
Merge pull request #3051 from stan-dev/tbb-winarm64
WardBrian Apr 22, 2024
34cf554
use a seperate class for csr_matrix adjoint
SteveBronder Apr 24, 2024
a3a88a5
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Apr 24, 2024
f748825
update docs for new vari for csr_matrix_times_vector
SteveBronder Apr 25, 2024
04124da
[Jenkins] auto-formatting by clang-format version 10.0.0-4ubuntu1
stan-buildbot Apr 25, 2024
9f759e1
Merge pull request #3053 from stan-dev/fix/csr-matrix-seperate-vari
SteveBronder Apr 26, 2024
045073f
Don't set build and clean rules for sundials if external libs used
andrjohns Apr 26, 2024
e73651b
Merge pull request #3054 from stan-dev/sundials-targets
WardBrian Apr 26, 2024
7a9601d
Merge remote-tracking branch 'origin/develop' into feature/reverse-mo…
SteveBronder Apr 26, 2024
d45dff2
update to 5.0
SteveBronder Apr 26, 2024
91ea4c1
fix docs
SteveBronder Apr 29, 2024
31 changes: 31 additions & 0 deletions doxygen/contributor_help_pages/common_pitfalls.md
@@ -190,6 +190,37 @@ The general rules to follow for passing values to a function are:
2. If you are writing a function for reverse mode, pass values by `const&`
3. In prim, if you are confident and working with larger types, use perfect forwarding to pass values that can be moved from. Otherwise simply pass values by `const&`.
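These rules can be sketched in plain C++; the function names below are hypothetical illustrations of the two calling conventions, not Stan Math API:

```cpp
#include <cstddef>
#include <string>
#include <utility>

// Rule 2 style: a reverse-mode-like function takes `const&`; it only reads
// its argument, so there is nothing to gain from forwarding.
inline std::size_t length_of(const std::string& s) { return s.size(); }

// Rule 3 style: a prim-like function perfectly forwards, so an rvalue
// argument is moved into the result instead of being copied.
template <typename T>
inline std::string into_owned(T&& s) {
  return std::string(std::forward<T>(s));  // moves when `s` binds an rvalue
}
```

Calling `into_owned(std::move(big_string))` hands the buffer over by move, which is the same saving this PR targets for large matrix arguments.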

### Using auto is Dangerous With Eigen Matrix Functions in Reverse Mode

`auto` should be used with care with the Stan Math library, just as in [Eigen](https://eigen.tuxfamily.org/dox/TopicPitfalls.html). Along with the cautions mentioned in the Eigen docs, there are also memory considerations when using reverse mode automatic differentiation. When a Stan Math function returns an Eigen matrix with a scalar `var` type, the actual returned type will often be an `arena_matrix<Eigen::Matrix<...>>`. The `arena_matrix` class is an Eigen matrix whose underlying array of memory is located in Stan's memory arena. The `arena_matrix` returned by a Stan function is normally the same one held by the callback used to calculate gradients in the reverse pass. Directly changing the elements of this matrix would also change the memory the reverse pass callback sees, which would result in incorrect calculations.

The simple solution is that when you call a math library function that returns a matrix and then want to assign to any of the individual elements of the result, assign to an actual Eigen matrix type instead of using `auto`. In the example below, the first case uses `auto` and will change the memory of the `arena_matrix` returned in the callback for multiply's reverse mode. Directly below it is the safe version, which assigns to an Eigen matrix type and is therefore safe for element insertion.

```c++
Eigen::Matrix<var, -1, 1> y;
Eigen::Matrix<var, -1, -1> X;
// Bad!! Will change memory used by reverse pass callback within multiply!
auto mu = multiply(X, y);
mu(4) = 1.0;
// Good! Will not change memory used by reverse pass callback within multiply
Eigen::Matrix<var, -1, 1> mu_good = multiply(X, y);
mu_good(4) = 1.0;
```

The reason we do this is for cases where functions returns are passe to other functions. An `arena_matrix` will always make a shallow copy when being constructed from another `arena_matrix`, which let's the functions avoid unnecessary copies.
Collaborator:
Suggested change
The reason we do this is for cases where functions returns are passe to other functions. An `arena_matrix` will always make a shallow copy when being constructed from another `arena_matrix`, which let's the functions avoid unnecessary copies.
The reason we do this is for cases where function returns are passed to other functions. An `arena_matrix` will always make a shallow copy when being constructed from another `arena_matrix`, which lets the functions avoid unnecessary copies.


```c++
Eigen::Matrix<var, -1, 1> y1;
Eigen::Matrix<var, -1, -1> X1;
Eigen::Matrix<var, -1, 1> y2;
Eigen::Matrix<var, -1, -1> X2;
auto mu1 = multiply(X1, y1);
auto mu2 = multiply(X2, y2);
// Inputs not copied in this case!
auto z = add(mu1, mu2);
```
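The shallow-copy behavior can be illustrated with a toy view type: copying the view copies only a pointer and a size, so both copies alias the same buffer, which is exactly the property that makes passing `arena_matrix` results between functions cheap. This is a simplified stand-in, not the real `arena_matrix`:

```cpp
#include <cstddef>

// Toy stand-in for arena_matrix: a non-owning view over arena-owned memory.
struct ArenaView {
  double* data;
  std::size_t size;
};

// "Copying" the view is O(1): only the pointer and size are copied,
// so the copy aliases the original buffer rather than duplicating it.
inline ArenaView shallow_copy(const ArenaView& v) { return v; }
```

Because the copies alias, writing through one view is visible through the other; this is also why the previous section warns against mutating elements of an `arena_matrix` result.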


### Passing variables that need destructors called after the reverse pass (`make_chainable_ptr`)

When possible, non-arena variables should be copied to the arena to be used in the reverse pass.
16 changes: 8 additions & 8 deletions stan/math/prim/prob/normal_log.hpp
@@ -29,20 +29,20 @@ namespace math {
* @tparam T_loc Type of location parameter.
*/
template <bool propto, typename T_y, typename T_loc, typename T_scale>
inline return_type_t<T_y, T_loc, T_scale> normal_log(const T_y& y,
const T_loc& mu,
const T_scale& sigma) {
return normal_lpdf<propto, T_y, T_loc, T_scale>(y, mu, sigma);
inline return_type_t<T_y, T_loc, T_scale> normal_log(T_y&& y, T_loc&& mu,
T_scale&& sigma) {
return normal_lpdf<propto>(std::forward<T_y>(y), std::forward<T_loc>(mu),
std::forward<T_scale>(sigma));
}

/** \ingroup prob_dists
* @deprecated use <code>normal_lpdf</code>
*/
template <typename T_y, typename T_loc, typename T_scale>
inline return_type_t<T_y, T_loc, T_scale> normal_log(const T_y& y,
const T_loc& mu,
const T_scale& sigma) {
return normal_lpdf<T_y, T_loc, T_scale>(y, mu, sigma);
inline return_type_t<T_y, T_loc, T_scale> normal_log(T_y&& y, T_loc&& mu,
T_scale&& sigma) {
return normal_lpdf(std::forward<T_y>(y), std::forward<T_loc>(mu),
std::forward<T_scale>(sigma));
}

} // namespace math
23 changes: 11 additions & 12 deletions stan/math/prim/prob/normal_lpdf.hpp
@@ -41,19 +41,18 @@ namespace math {
template <bool propto, typename T_y, typename T_loc, typename T_scale,
require_all_not_nonscalar_prim_or_rev_kernel_expression_t<
T_y, T_loc, T_scale>* = nullptr>
inline return_type_t<T_y, T_loc, T_scale> normal_lpdf(const T_y& y,
const T_loc& mu,
const T_scale& sigma) {
inline return_type_t<T_y, T_loc, T_scale> normal_lpdf(T_y&& y, T_loc&& mu,
T_scale&& sigma) {
using T_partials_return = partials_return_t<T_y, T_loc, T_scale>;
using T_y_ref = ref_type_if_not_constant_t<T_y>;
using T_mu_ref = ref_type_if_not_constant_t<T_loc>;
using T_sigma_ref = ref_type_if_not_constant_t<T_scale>;
static constexpr const char* function = "normal_lpdf";
check_consistent_sizes(function, "Random variable", y, "Location parameter",
mu, "Scale parameter", sigma);
T_y_ref y_ref = y;
T_mu_ref mu_ref = mu;
T_sigma_ref sigma_ref = sigma;
T_y_ref y_ref = std::forward<T_y>(y);
T_mu_ref mu_ref = std::forward<T_loc>(mu);
T_sigma_ref sigma_ref = std::forward<T_scale>(sigma);

decltype(auto) y_val = to_ref(as_value_column_array_or_scalar(y_ref));
decltype(auto) mu_val = to_ref(as_value_column_array_or_scalar(mu_ref));
@@ -63,7 +62,7 @@ inline return_type_t<T_y, T_loc, T_scale> normal_lpdf(const T_y& y,
check_finite(function, "Location parameter", mu_val);
check_positive(function, "Scale parameter", sigma_val);

if (size_zero(y, mu, sigma)) {
if (size_zero(y_ref, mu_ref, sigma_ref)) {
return 0.0;
}
if (!include_summand<propto, T_y, T_loc, T_scale>::value) {
@@ -78,7 +77,7 @@ inline return_type_t<T_y, T_loc, T_scale> normal_lpdf(const T_y& y,
const auto& y_scaled_sq
= to_ref_if<!is_constant_all<T_scale>::value>(y_scaled * y_scaled);

size_t N = max_size(y, mu, sigma);
size_t N = max_size(y_ref, mu_ref, sigma_ref);
T_partials_return logp = -0.5 * sum(y_scaled_sq);
if (include_summand<propto>::value) {
logp += NEG_LOG_SQRT_TWO_PI * N;
@@ -106,10 +105,10 @@ inline return_type_t<T_y, T_loc, T_scale> normal_lpdf(const T_y& y,
}

template <typename T_y, typename T_loc, typename T_scale>
inline return_type_t<T_y, T_loc, T_scale> normal_lpdf(const T_y& y,
const T_loc& mu,
const T_scale& sigma) {
return normal_lpdf<false>(y, mu, sigma);
inline return_type_t<T_y, T_loc, T_scale> normal_lpdf(T_y&& y, T_loc&& mu,
T_scale&& sigma) {
return normal_lpdf<false>(std::forward<T_y>(y), std::forward<T_loc>(mu),
std::forward<T_scale>(sigma));
}

} // namespace math
87 changes: 70 additions & 17 deletions stan/math/rev/core/arena_matrix.hpp
@@ -4,7 +4,7 @@
#include <stan/math/prim/fun/Eigen.hpp>
#include <stan/math/rev/core/chainable_alloc.hpp>
#include <stan/math/rev/core/chainablestack.hpp>

#include <stan/math/rev/core/chainable_object.hpp>
namespace stan {
namespace math {

@@ -54,7 +54,7 @@ class arena_matrix : public Eigen::Map<MatrixType> {
size) {}

/**
* Constructs `arena_matrix` from an expression.
* Constructs `arena_matrix` from an expression
* @param other expression
*/
template <typename T, require_eigen_t<T>* = nullptr>
@@ -73,6 +73,50 @@ class arena_matrix : public Eigen::Map<MatrixType> {
*this = other;
}

/**
* Constructs `arena_matrix` from an expression, then sends it to either the
* object stack or memory arena.
* @tparam T A type that inherits from Eigen::DenseBase that is not an
* `arena_matrix`.
* @param other expression
* @note When T is both an rvalue and a plain type, the expression is moved to
* the object stack. However when T is an lvalue, or an rvalue that is not a
* plain type, the expression is copied to the memory arena.
*/
template <typename T, require_eigen_t<T>* = nullptr,
require_not_arena_matrix_t<T>* = nullptr>
arena_matrix(T&& other) // NOLINT
Collaborator:
Would it be possible to split this to two constructors using require_t<std::is_rvalue_reference<T>::value> rather than having the constructor instantiate and call a lambda? Just feels a little unnecessarily complex

Collaborator Author:

Yes that's much cleaner. Though I still kept the immediately evaluated lambda as that keeps the constructor empty which I find kind of nice

: Base::Map([](auto&& x) {
using base_map_t =
typename stan::math::arena_matrix<MatrixType>::Base;
using T_t = std::decay_t<T>;
if (std::is_rvalue_reference<decltype(x)>::value
&& is_plain_type<T_t>::value) {
// Note: plain_type_t here does nothing since T_t is plain type
auto other
= make_chainable_ptr(plain_type_t<MatrixType>(std::move(x)));
// other already has its rows and cols swapped if that was needed
return base_map_t(&(other->coeffRef(0)), other->rows(),
other->cols());
} else {
base_map_t map(
ChainableStack::instance_->memalloc_.alloc_array<Scalar>(
x.size()),
(RowsAtCompileTime == 1 && T_t::ColsAtCompileTime == 1)
|| (ColsAtCompileTime == 1
&& T_t::RowsAtCompileTime == 1)
? x.cols()
: x.rows(),
(RowsAtCompileTime == 1 && T_t::ColsAtCompileTime == 1)
|| (ColsAtCompileTime == 1
&& T_t::RowsAtCompileTime == 1)
? x.rows()
: x.cols());
map = x;
return map;
}
}(std::forward<T>(other))) {}
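The split the reviewer suggests can be sketched on a toy type in standard C++: one constructor constrained with `enable_if` to rvalues of the plain (decayed) type, which takes ownership by move, and a fallback that copies. This illustrates only the dispatch idea, using `std::string` in place of a matrix; it is not the actual `arena_matrix` code:

```cpp
#include <string>
#include <type_traits>
#include <utility>

struct Holder {
  std::string value;
  bool moved_in;

  // Selected only when T&& is an rvalue reference to the plain stored type:
  // take ownership by moving, analogous to sending a plain-type rvalue
  // matrix to the object stack.
  template <typename T,
            std::enable_if_t<std::is_rvalue_reference<T&&>::value
                                 && std::is_same<std::decay_t<T>,
                                                 std::string>::value,
                             int> = 0>
  explicit Holder(T&& s) : value(std::move(s)), moved_in(true) {}

  // Fallback for lvalues and non-plain expressions: copy/convert, analogous
  // to evaluating the expression into arena memory.
  template <typename T,
            std::enable_if_t<!(std::is_rvalue_reference<T&&>::value
                               && std::is_same<std::decay_t<T>,
                                               std::string>::value),
                             int> = 0>
  explicit Holder(T&& s) : value(s), moved_in(false) {}
};
```

In the real class the same condition appears as `std::is_rvalue_reference` combined with `is_plain_type` on the deduced template parameter.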

/**
* Constructs `arena_matrix` from an expression. This makes an assumption that
* any other `Eigen::Map` also contains memory allocated in the arena.
@@ -110,23 +154,32 @@ class arena_matrix : public Eigen::Map<MatrixType> {
* @param a expression to evaluate into this
* @return `*this`
*/
template <typename T>
arena_matrix& operator=(const T& a) {
// do we need to transpose?
if ((RowsAtCompileTime == 1 && T::ColsAtCompileTime == 1)
|| (ColsAtCompileTime == 1 && T::RowsAtCompileTime == 1)) {
// placement new changes what data map points to - there is no allocation
new (this) Base(
ChainableStack::instance_->memalloc_.alloc_array<Scalar>(a.size()),
a.cols(), a.rows());

template <typename T, require_not_arena_matrix_t<T>* = nullptr>
arena_matrix& operator=(T&& a) {
using T_t = std::decay_t<T>;
Collaborator:

Should this be plain_type_t instead of just decaying? I don't think all expressions will have a ColsAtCompileTime/RowsAtCompileTime that can be called

if (std::is_rvalue_reference<T&&>::value && is_plain_type<T_t>::value) {
// Note: plain_type_t here does nothing since T_t is plain type
auto other = make_chainable_ptr(plain_type_t<MatrixType>(std::move(a)));
Collaborator:

If the plain_type_t is needed for some reason even though it does nothing, can you change the comment to explain why? Otherwise better to just remove the plain_type_t

new (this) Base(&(other->coeffRef(0)), other->rows(), other->cols());
return *this;
} else {
new (this) Base(
ChainableStack::instance_->memalloc_.alloc_array<Scalar>(a.size()),
a.rows(), a.cols());
// do we need to transpose?
if ((RowsAtCompileTime == 1 && T_t::ColsAtCompileTime == 1)
|| (ColsAtCompileTime == 1 && T_t::RowsAtCompileTime == 1)) {
// placement new changes what data map points to - there is no
// allocation
new (this) Base(
ChainableStack::instance_->memalloc_.alloc_array<Scalar>(a.size()),
a.cols(), a.rows());

} else {
new (this) Base(
ChainableStack::instance_->memalloc_.alloc_array<Scalar>(a.size()),
a.rows(), a.cols());
}
Base::operator=(a);
return *this;
}
Base::operator=(a);
return *this;
}
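The placement-new pattern in the assignment operator above — re-seating the `Eigen::Map` base onto freshly allocated arena memory without allocating the map object itself — can be mimicked with a toy view type. This is a simplified sketch: the fixed static buffer stands in for `ChainableStack`'s arena allocator:

```cpp
#include <algorithm>
#include <cstddef>
#include <new>
#include <vector>

// Toy non-owning view; assignment re-seats it onto a fresh slab via
// placement new, mirroring the `new (this) Base(...)` idiom above.
struct View {
  double* data;
  std::size_t size;

  View(double* d, std::size_t n) : data(d), size(n) {}

  View& operator=(const std::vector<double>& v) {
    static double arena[64];  // stand-in for Stan's memory arena
    // placement new changes what the view points to - the view object
    // itself is not reallocated
    new (this) View(arena, v.size());
    std::copy(v.begin(), v.end(), data);
    return *this;
  }
};
```

As in the real code, assignment never frees or reallocates the map/view object; it only rebinds it to new arena storage and then copies the expression's values in.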
/**
* Forces hard copying matrices into an arena matrix
6 changes: 2 additions & 4 deletions stan/math/rev/core/chainable_object.hpp
@@ -1,11 +1,9 @@
#ifndef STAN_MATH_REV_CORE_CHAINABLE_OBJECT_HPP
#define STAN_MATH_REV_CORE_CHAINABLE_OBJECT_HPP

#include <stan/math/rev/meta.hpp>
#include <stan/math/rev/core/chainable_alloc.hpp>
#include <stan/math/rev/core/typedefs.hpp>
#include <stan/math/prim/fun/Eigen.hpp>
#include <stan/math/prim/fun/typedefs.hpp>
#include <stan/math/prim/meta.hpp>
#include <stan/math/rev/core/chainable_alloc.hpp>
#include <vector>

namespace stan {
34 changes: 17 additions & 17 deletions stan/math/rev/core/operator_addition.hpp
@@ -108,12 +108,12 @@ inline var operator+(Arith a, const var& b) {
*/
template <typename VarMat1, typename VarMat2,
require_all_rev_matrix_t<VarMat1, VarMat2>* = nullptr>
inline auto add(const VarMat1& a, const VarMat2& b) {
inline auto add(VarMat1&& a, VarMat2&& b) {
check_matching_dims("add", "a", a, "b", b);
using op_ret_type = decltype(a.val() + b.val());
using ret_type = return_var_matrix_t<op_ret_type, VarMat1, VarMat2>;
arena_t<VarMat1> arena_a(a);
arena_t<VarMat2> arena_b(b);
arena_t<VarMat1> arena_a(std::forward<VarMat1>(a));
arena_t<VarMat2> arena_b(std::forward<VarMat2>(b));
arena_t<ret_type> ret(arena_a.val() + arena_b.val());
reverse_pass_callback([ret, arena_a, arena_b]() mutable {
for (Eigen::Index j = 0; j < ret.cols(); ++j) {
@@ -124,7 +124,7 @@ inline auto add(const VarMat1& a, const VarMat2& b) {
}
}
});
return ret_type(ret);
return ret;
}

/**
@@ -139,18 +139,18 @@ inline auto add(const VarMat1& a, const VarMat2& b) {
template <typename Arith, typename VarMat,
require_st_arithmetic<Arith>* = nullptr,
require_rev_matrix_t<VarMat>* = nullptr>
inline auto add(const VarMat& a, const Arith& b) {
inline auto add(VarMat&& a, const Arith& b) {
if (is_eigen<Arith>::value) {
check_matching_dims("add", "a", a, "b", b);
}
using op_ret_type
= decltype((a.val().array() + as_array_or_scalar(b)).matrix());
using ret_type = return_var_matrix_t<op_ret_type, VarMat>;
arena_t<VarMat> arena_a(a);
arena_t<VarMat> arena_a(std::forward<VarMat>(a));
arena_t<ret_type> ret(arena_a.val().array() + as_array_or_scalar(b));
reverse_pass_callback(
[ret, arena_a]() mutable { arena_a.adj() += ret.adj_op(); });
return ret_type(ret);
return ret;
}

/**
@@ -165,8 +165,8 @@ inline auto add(const VarMat& a, const Arith& b) {
template <typename Arith, typename VarMat,
require_st_arithmetic<Arith>* = nullptr,
require_rev_matrix_t<VarMat>* = nullptr>
inline auto add(const Arith& a, const VarMat& b) {
return add(b, a);
inline auto add(const Arith& a, VarMat&& b) {
return add(std::forward<VarMat>(b), a);
}

/**
@@ -185,7 +185,7 @@ inline auto add(const Arith& a, const VarMat& b) {
using ret_type = return_var_matrix_t<EigMat>;
arena_t<ret_type> ret(a.val() + b.array());
reverse_pass_callback([ret, a]() mutable { a.adj() += ret.adj().sum(); });
return ret_type(ret);
return ret;
}

/**
@@ -217,9 +217,9 @@ inline auto add(const EigMat& a, const Var& b) {
template <typename Var, typename VarMat,
require_var_vt<std::is_arithmetic, Var>* = nullptr,
require_rev_matrix_t<VarMat>* = nullptr>
inline auto add(const Var& a, const VarMat& b) {
inline auto add(const Var& a, VarMat&& b) {
using ret_type = return_var_matrix_t<VarMat>;
arena_t<VarMat> arena_b(b);
arena_t<VarMat> arena_b(std::forward<VarMat>(b));
arena_t<ret_type> ret(a.val() + arena_b.val().array());
reverse_pass_callback([ret, a, arena_b]() mutable {
for (Eigen::Index j = 0; j < ret.cols(); ++j) {
Expand All @@ -230,7 +230,7 @@ inline auto add(const Var& a, const VarMat& b) {
}
}
});
return ret_type(ret);
return ret;
}

/**
@@ -246,8 +246,8 @@ inline auto add(const Var& a, const VarMat& b) {
template <typename Var, typename VarMat,
require_var_vt<std::is_arithmetic, Var>* = nullptr,
require_rev_matrix_t<VarMat>* = nullptr>
inline auto add(const VarMat& a, const Var& b) {
return add(b, a);
inline auto add(VarMat&& a, const Var& b) {
return add(b, std::forward<VarMat>(a));
}

template <typename T1, typename T2,
@@ -274,8 +274,8 @@ inline auto add(const T1& a, const T2& b) {
*/
template <typename VarMat1, typename VarMat2,
require_any_var_matrix_t<VarMat1, VarMat2>* = nullptr>
inline auto operator+(const VarMat1& a, const VarMat2& b) {
return add(a, b);
inline auto operator+(VarMat1&& a, VarMat2&& b) {
return add(std::forward<VarMat1>(a), std::forward<VarMat2>(b));
}

} // namespace math
4 changes: 2 additions & 2 deletions stan/math/rev/fun/fill.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace math {
template <typename VarMat, typename S, require_var_matrix_t<VarMat>* = nullptr,
require_var_t<S>* = nullptr>
inline void fill(VarMat& x, const S& y) {
arena_t<plain_type_t<value_type_t<VarMat>>> prev_vals = x.val().eval();
arena_t<plain_type_t<value_type_t<VarMat>>> prev_vals(x.val().eval());
x.vi_->val_.fill(y.val());
reverse_pass_callback([x, y, prev_vals]() mutable {
x.vi_->val_ = prev_vals;
@@ -46,7 +46,7 @@ inline void fill(VarMat& x, const S& y) {
template <typename VarMat, typename S, require_var_matrix_t<VarMat>* = nullptr,
require_arithmetic_t<S>* = nullptr>
inline void fill(VarMat& x, const S& y) {
arena_t<plain_type_t<value_type_t<VarMat>>> prev_vals = x.val().eval();
arena_t<plain_type_t<value_type_t<VarMat>>> prev_vals(x.val().eval());
x.vi_->val_.fill(y);
reverse_pass_callback([x, prev_vals]() mutable {
x.vi_->val_ = prev_vals;