From a9f033e9562e40d4d0cbed1264a03e7d5df5a8a4 Mon Sep 17 00:00:00 2001 From: Alexis DUBURCQ Date: Sun, 31 Jan 2021 23:08:41 +0100 Subject: [PATCH] [core] Speedup simulation by avoiding memory allocations in stepper. (#276) * [gym] Use float32 action space instead of float64 because it is the usual dtype for neural networks. * [core] Avoid memory allocation during Lie algebra computations without relying on compiler optimizations. * [misc] Update Cmake to support RelWithDebInfo. Enable more Cmake warnings. Co-authored-by: Alexis Duburcq --- CMakeLists.txt | 7 +- build_tools/cmake/base.cmake | 6 +- core/include/jiminy/core/stepper/LieGroup.h | 206 ++++++++++++------ .../core/stepper/RungeKuttaDOPRIStepper.h | 7 +- core/src/engine/EngineMultiRobot.cc | 1 - core/src/stepper/AbstractRungeKuttaStepper.cc | 16 +- core/src/stepper/ExplicitEulerStepper.cc | 2 +- core/src/stepper/RungeKuttaDOPRIStepper.cc | 14 +- examples/double_pendulum/double_pendulum.cc | 2 +- .../gym_jiminy/common/envs/env_generic.py | 12 +- 10 files changed, 181 insertions(+), 92 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e724e1ac0..564bdb8c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10) # Set the build version -set(BUILD_VERSION 1.5.4) +set(BUILD_VERSION 1.5.5) # Add definition of Jiminy version for C++ headers add_definitions("-DJIMINY_VERSION=\"${BUILD_VERSION}\"") @@ -21,9 +21,9 @@ include(${CMAKE_SOURCE_DIR}/build_tools/cmake/buildPythonWheel.cmake) # Set the compilation flags if(NOT WIN32) - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -ftemplate-backtrace-limit=0 ${WARN_FULL}") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0 -g -ftemplate-backtrace-limit=0 ${WARN_FULL}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -Wfatal-errors -Werror ${WARN_FULL} \ - -Wno-non-virtual-dtor -Wno-deprecated-declarations -Wno-unused-parameter") + -Wno-non-virtual-dtor -Wno-deprecated-declarations -Wno-unused-parameter") else() set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /EHsc /bigobj -g /Wall") # It would be great to have the same quality standard for Windows but @@ -32,6 +32,7 @@ else() set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /EHsc /bigobj /permissive- -DNDEBUG -DNOMINMAX /O2 /W2 \ /wd4068 /wd4715 /wd4820 /wd4244 /wd4005 /WX") endif() +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE} -g") # Sub-projects add_subdirectory(soup) diff --git a/build_tools/cmake/base.cmake b/build_tools/cmake/base.cmake index d6ea1bcbf..31993ede3 100644 --- a/build_tools/cmake/base.cmake +++ b/build_tools/cmake/base.cmake @@ -25,8 +25,10 @@ set(WARN_FULL "-Wall -Wextra -Weffc++ -pedantic -pedantic-errors \ -Wmissing-noreturn -Wpacked -Wpointer-arith \ -Wredundant-decls -Wshadow -Wstack-protector \ -Wstrict-aliasing=2 -Wswitch-default -Wswitch-enum \ - -Wunreachable-code -Wunused" -) + -Wunreachable-code -Wunused -Wundef -Wlogical-op \ + -Wdisabled-optimization -Wmissing-braces -Wtrigraphs \ + -Wparentheses -Wwrite-strings -Werror=return-type \ + -Wsequence-point -Wdeprecated") # -Wconversion # Shared libraries need PIC set(CMAKE_POSITION_INDEPENDENT_CODE ON) diff --git a/core/include/jiminy/core/stepper/LieGroup.h b/core/include/jiminy/core/stepper/LieGroup.h index 7b99857e0..cf349d268 100644 --- a/core/include/jiminy/core/stepper/LieGroup.h +++ b/core/include/jiminy/core/stepper/LieGroup.h @@ -85,7 +85,20 @@ namespace Eigen RealScalar norm(void) const { return lpNorm<2>(); }; RealScalar normInf(void) const { return lpNorm(); }; + void 
setZero(void) + { + derived().v().setZero(); + derived().a().setZero(); + } + #define GENERATE_OPERATOR_MULT(OP,NAME) \ + StateDerivativeBase & (operator EIGEN_CAT(OP,=))(Scalar const & scalar) \ + { \ + v().array() EIGEN_CAT(OP,=) scalar; \ + a().array() EIGEN_CAT(OP,=) scalar; \ + return *this; \ + } \ + \ StateDerivativeWrapper const \ (operator OP)(Scalar const & scalar) const \ { \ @@ -420,24 +433,37 @@ namespace Eigen RealScalar norm(void) const { return lpNorm<2>(); }; RealScalar normInf(void) const { return lpNorm(); }; - template - StateBase & sumInPlace(StateDerivativeBase const & velocity) + void setZero(void) + { + derived().q().setZero(); + derived().v().setZero(); + } + + template + StateBase & sum(StateDerivativeBase const & velocity, + StateBase & out) const { // 'Sum' q = q + v, remember q is part of a Lie group (dim(q) != dim(v)) - assert(robot() == velocity.robot()); - pinocchio::integrate(robot()->pncModel_, q(), velocity.v(), q()); - v() += velocity.a(); - return *this; + assert(robot() == velocity.robot() == out.robot()); + pinocchio::integrate(robot()->pncModel_, q(), velocity.v(), out.q()); + out.v() = v() + velocity.a(); + return out; } template - StateDerivative difference(StateBase const & other) const + StateBase & sumInPlace(StateDerivativeBase const & velocity) { - assert(robot() == std::forward const &>(other).robot()); - StateDerivative dx(robot()); - pinocchio::difference(robot()->pncModel_, q(), other.q(), dx.v()); - dx.a() = v() - other.v(); - return dx; + return sum(velocity, *this); + } + + template + StateDerivativeBase & difference(StateBase const & position, + StateDerivativeBase & out) const + { + assert(robot() == position.robot() == out.robot()); + pinocchio::difference(robot()->pncModel_, q(), position.q(), out.v()); + out.a() = v() - position.v(); + return out; } }; @@ -693,6 +719,14 @@ namespace Eigen RealScalar norm(void) const { return lpNorm<2>(); }; RealScalar normInf(void) const { return lpNorm(); }; + void setZero(void) + { + for (ValueType & element : vector()) + { + element.setZero(); + } + } + #define GENERATE_OPERATOR_ARITHEMTIC(OP,NAME) \ auto const (operator OP)(Scalar const & scalar) const \ { \ @@ -729,6 +763,15 @@ namespace Eigen result.emplace_back(vector_[i] OP vectorIn[i]); \ } \ return VectorContainerWrapper(std::move(result)); \ + } \ + \ + VectorContainerBase & (operator EIGEN_CAT(OP,=))(Scalar const & scalar) \ + { \ + for (ValueType & element : vector()) \ + { \ + element EIGEN_CAT(OP,=) scalar; \ + } \ + return *this; \ } GENERATE_OPERATOR_ARITHEMTIC(*,product) @@ -752,15 +795,6 @@ namespace Eigen vector_[i] EIGEN_CAT(OP,=) vectorIn[i]; \ } \ return *this; \ - } \ - \ - VectorContainerBase & (operator EIGEN_CAT(OP,=))(Scalar const & scalar) \ - { \ - for (ValueType & element : vector()) \ - { \ - element EIGEN_CAT(OP,=) scalar; \ - } \ - return *this; \ } GENERATE_OPERATOR_COMPOUND(+,sum) @@ -777,12 +811,12 @@ namespace Eigen static inline RealScalar run(VectorContainerBase const & container) { EIGEN_USING_STD_MATH(pow) - RealScalar cumsum = 0.0; + RealScalar total = 0.0; for (typename internal::traits::ValueType const & element : container.vector()) { - cumsum += pow(element.template lpNorm
<p>(), p);
+                total += pow(element.template lpNorm<p>
(), p); } - return pow(cumsum, RealScalar(1)/p); + return pow(total, RealScalar(1)/p); } }; @@ -792,11 +826,13 @@ namespace Eigen typedef typename VectorContainerBase::RealScalar RealScalar; static inline RealScalar run(VectorContainerBase const & container) { - return std::accumulate(container.vector().begin(), container.vector().end(), RealScalar(0.0), - [](RealScalar & cumsum, typename internal::traits::ValueType const & element) - { - return element.template lpNorm<1>(); - }); + return std::accumulate( + container.vector().begin(), container.vector().end(), RealScalar(0.0), + [](RealScalar & cumsum, typename internal::traits::ValueType const & element) + { + return element.template lpNorm<1>(); + } + ); } }; @@ -806,13 +842,16 @@ namespace Eigen typedef typename VectorContainerBase::RealScalar RealScalar; static inline RealScalar run(VectorContainerBase const & container) { - std::vector normInfVector; - normInfVector.reserve(container.vector().size()); + RealScalar maxValue = 0.0; for (typename internal::traits::ValueType const & element : container.vector()) { - normInfVector.emplace_back(element.template lpNorm()); + RealScalar value = element.template lpNorm(); + if (value > maxValue) + { + maxValue = value; + } } - return *std::max_element(normInfVector.begin(), normInfVector.end()); + return maxValue; } }; } @@ -924,9 +963,11 @@ namespace Eigen VectorContainer & operator=(VectorContainerBase const & other) { std::vector::ValueType> const & vectorIn = other.vector(); - vector_.clear(); - vector_.reserve(vectorIn.size()); - std::copy(vectorIn.begin(), vectorIn.end(), std::back_inserter(vector())); + vector_.resize(vectorIn.size()); + for (uint32_t i = 0; i < vector_.size(); ++i) + { + vector_[i] = vectorIn[i]; + } return *this; } @@ -1141,8 +1182,8 @@ namespace Eigen VAR2.reserve(vectorIn.size()); \ for (uint32_t i = 0; i < vectorIn.size(); ++i) \ { \ - VAR1.push_back(std::move(vectorIn[i]. VAR1 ())); \ - VAR2.push_back(std::move(vectorIn[i]. 
VAR2 ())); \ + VAR1.push_back(std::move(vectorIn[i].VAR1())); \ + VAR2.push_back(std::move(vectorIn[i].VAR2())); \ vector_.emplace_back(vectorIn[i].robot(), VAR1[i], VAR2[i]); \ } \ } \ @@ -1177,8 +1218,6 @@ namespace Eigen for (uint32_t i = 0; i < vector_.size(); ++i) \ { \ assert(vectorIn[i].robot() == vector_[i].robot()); \ - assert(vectorIn[i].VAR1().size() == VAR1[i].size()); \ - assert(vectorIn[i].VAR2().size() == VAR2[i].size()); \ VAR1[i] = vectorIn[i].VAR1(); \ VAR2[i] = vectorIn[i].VAR2(); \ } \ @@ -1188,13 +1227,12 @@ namespace Eigen EIGEN_CAT(BASE,Vector) & operator=(EIGEN_CAT(BASE,Vector) const & other) \ { \ std::vector const & vectorIn = other.vector(); \ - vector_.clear(); \ - vector_.reserve(vectorIn.size()); \ + assert(vectorIn.size() == vector_.size()); \ for (uint32_t i = 0; i < vectorIn.size(); ++i) \ { \ + assert(vectorIn[i].robot() == vector_[i].robot()); \ VAR1[i] = other.VAR1[i]; \ VAR2[i] = other.VAR2[i]; \ - vector_.emplace_back(vectorIn[i].robot(), VAR1[i], VAR2[i]); \ } \ return *this; \ } \ @@ -1213,52 +1251,94 @@ namespace Eigen std::vector VAR2; \ }; - #define StateDerivative_SHARED_ADDON - - #define State_SHARED_ADDON \ + #define StateDerivative_SHARED_ADDON \ template::ValueType>::value, void> > \ - StateVector & sumInPlace(VectorContainerBase const & other) \ + StateDerivativeVector & sumInPlace(VectorContainerBase const & other, \ + Scalar const & scale) \ { \ std::vector::ValueType> const & \ vectorIn = other.vector(); \ - assert(vector_.size() == vectorIn.size()); \ + assert(vector_.size() == vectorOut.size()); \ for (uint32_t i = 0; i < vector_.size(); ++i) \ { \ - vector_[i].sumInPlace(vectorIn[i]); \ + vector_[i] += scale * vectorIn[i]; \ } \ return *this; \ + } + + #define State_SHARED_ADDON \ + template::ValueType>::value && \ + is_base_of_template::ValueType>::value, \ + void> > \ + VectorContainerBase & sum(VectorContainerBase const & other, \ + VectorContainerBase & out) const \ + { \ + std::vector::ValueType> const & \ + vectorIn = other.vector(); \ + std::vector::ValueType> & \ + vectorOut = out.vector(); \ + assert(vectorIn.size() == vectorOut.size()); \ + for (uint32_t i = 0; i < vector_.size(); ++i) \ + { \ + vector_[i].sum(vectorIn[i], vectorOut[i]); \ + } \ + return out; \ } \ \ template::ValueType>::value, void> > \ - StateVector sum(VectorContainerBase const & other) const \ + StateVector & sumInPlace(VectorContainerBase const & other) \ { \ - StateVector result = *this; \ - result.sumInPlace(other); \ - return result; \ + sum(other, *this); \ + return *this; \ } \ \ template::ValueType>::value, void> > \ - StateDerivativeVector difference(VectorContainerBase const & other) const \ + StateVector & sumInPlace(VectorContainerBase const & other, \ + Scalar const & scale) \ + { \ + std::vector::ValueType> const & \ + vectorIn = other.vector(); \ + assert(vector_.size() == vectorOut.size()); \ + for (uint32_t i = 0; i < vector_.size(); ++i) \ + { \ + vector_[i].sumInPlace(scale * vectorIn[i]); \ + } \ + return *this; \ + } \ + \ + template::ValueType>::value && \ + is_base_of_template::ValueType>::value, \ + void> > \ + VectorContainerBase & difference(VectorContainerBase const & other, \ + VectorContainerBase & out) const \ { \ std::vector::ValueType> const & \ vectorIn = other.vector(); \ - assert(vector_.size() == vectorIn.size()); \ - std::vector > velocities; \ - velocities.reserve(vector_.size()); \ + std::vector::ValueType> & \ + vectorOut = out.vector(); \ + assert(vectorIn.size() == vectorOut.size()); \ for (uint32_t i = 0; i < 
vector_.size(); ++i) \ { \ - velocities.push_back(vector_[i].difference(vectorIn[i])); \ + vector_[i].difference(vectorIn[i], vectorOut[i]); \ } \ - return StateDerivativeVector(std::move(velocities)); \ + return out; \ } - // Disable "-Weffc++" flag while generting this code because it is is buggy... + // Disable "-Weffc++" flag while generting this code because it is buggy... #pragma GCC diagnostic ignored "-Weffc++" GENERATE_SHARED_IMPL(StateDerivative,v,nv,a,nv) GENERATE_SHARED_IMPL(State,q,nq,v,nv) diff --git a/core/include/jiminy/core/stepper/RungeKuttaDOPRIStepper.h b/core/include/jiminy/core/stepper/RungeKuttaDOPRIStepper.h index 0827f8a80..f7d53c17c 100644 --- a/core/include/jiminy/core/stepper/RungeKuttaDOPRIStepper.h +++ b/core/include/jiminy/core/stepper/RungeKuttaDOPRIStepper.h @@ -86,9 +86,10 @@ namespace jiminy float64_t & dt); private: - float64_t tolRel_; ///< Relative tolerance - float64_t tolAbs_; ///< Absolute tolerance - state_t alternativeSolution_; ///< Internal buffer for alternative solution during error computation + float64_t tolRel_; ///< Relative tolerance + float64_t tolAbs_; ///< Absolute tolerance + state_t alternativeSolution_; ///< Internal buffer for alternative solution during error computation + stateDerivative_t errorSolution_; ///< Internal buffer for difference between solutions during error computation }; } diff --git a/core/src/engine/EngineMultiRobot.cc b/core/src/engine/EngineMultiRobot.cc index 13f567576..d46203cf1 100644 --- a/core/src/engine/EngineMultiRobot.cc +++ b/core/src/engine/EngineMultiRobot.cc @@ -2855,7 +2855,6 @@ namespace jiminy matrixN_t jointJacobian = matrixN_t::Zero(6, model.nv); for (int32_t i = 1; i < model.njoints; ++i) { - jointJacobian.setZero(); pinocchio::getJointJacobian(model, data, i, diff --git a/core/src/stepper/AbstractRungeKuttaStepper.cc b/core/src/stepper/AbstractRungeKuttaStepper.cc index 5a2fb6c92..309553f6c 100644 --- a/core/src/stepper/AbstractRungeKuttaStepper.cc +++ b/core/src/stepper/AbstractRungeKuttaStepper.cc @@ -36,24 +36,24 @@ namespace jiminy for (uint32_t i = 1; i < c_.size(); ++i) { - stateIncrement_ = dt * A_(i, 0) * ki_[0]; - for (uint32_t j = 1; j < i; ++j) + stateIncrement_.setZero(); + for (uint32_t j = 0; j < i; ++j) { - stateIncrement_ += dt * A_(i, j) * ki_[j]; + stateIncrement_.sumInPlace(ki_[j], dt * A_(i, j)); // Equivalent to `stateIncrement_ += dt * A_(i, j) * ki_[j]` but more efficient because it avoid temporaries } - stateBuffer_ = state.sum(stateIncrement_); + state.sum(stateIncrement_, stateBuffer_); ki_[i] = f(t + c_[i] * dt, stateBuffer_); } /* Now we have all the ki's: compute the solution. Sum the velocities before summing into position the accuracy is greater for summing vectors than for summing velocities into lie groups. 
            */
-        stateIncrement_ = dt * b_[0] * ki_[0];
-        for (uint32_t i = 1; i < ki_.size(); ++i)
+        stateIncrement_.setZero();
+        for (uint32_t i = 0; i < ki_.size(); ++i)
         {
-            stateIncrement_ += dt * b_[i] * ki_[i];
+            stateIncrement_.sumInPlace(ki_[i], dt * b_[i]);
         }
-        candidateSolution_ = state.sum(stateIncrement_);
+        state.sum(stateIncrement_, candidateSolution_);

         // Evaluate the solution's error for step adjustment
         bool_t const hasSucceeded = adjustStep(state, candidateSolution_, dt);
diff --git a/core/src/stepper/ExplicitEulerStepper.cc b/core/src/stepper/ExplicitEulerStepper.cc
index 081effc6f..f8f88bbb2 100644
--- a/core/src/stepper/ExplicitEulerStepper.cc
+++ b/core/src/stepper/ExplicitEulerStepper.cc
@@ -17,7 +17,7 @@ namespace jiminy
     {
         // Simple explicit Euler: x(t + dt) = x(t) + dt dx(t)
         stateDerivative = f(t, state);
-        state.sumInPlace(dt * stateDerivative);
+        state.sumInPlace(stateDerivative, dt);

         // Scheme never considers failure.
         return true;
diff --git a/core/src/stepper/RungeKuttaDOPRIStepper.cc b/core/src/stepper/RungeKuttaDOPRIStepper.cc
index f4597eb2b..4e741c381 100644
--- a/core/src/stepper/RungeKuttaDOPRIStepper.cc
+++ b/core/src/stepper/RungeKuttaDOPRIStepper.cc
@@ -9,7 +9,8 @@ namespace jiminy
         AbstractRungeKuttaStepper(f, robots, DOPRI::A, DOPRI::b, DOPRI::c, true),
         tolRel_(tolRel),
         tolAbs_(tolAbs),
-        alternativeSolution_(robots)
+        alternativeSolution_(robots),
+        errorSolution_(robots)
     {
         // Empty on purpose
     }
@@ -27,15 +28,16 @@ float64_t const & dt)
     {
         // Compute alternative solution.
-        stateIncrement_ = dt * DOPRI::e[0] * ki_[0];
-        for (uint32_t i = 1; i < ki_.size(); ++i)
+        stateIncrement_.setZero();
+        for (uint32_t i = 0; i < ki_.size(); ++i)
         {
-            stateIncrement_ += dt * DOPRI::e[i] * ki_[i];
+            stateIncrement_.sumInPlace(ki_[i], dt * DOPRI::e[i]);
         }
-        alternativeSolution_ = initialState.sum(stateIncrement_);
+        initialState.sum(stateIncrement_, alternativeSolution_);

         // Evaluate error between both states to adjust step
-        float64_t const errorNorm = solution.difference(alternativeSolution_).norm();
+        solution.difference(alternativeSolution_, errorSolution_);
+        float64_t const errorNorm = errorSolution_.norm();

         // Compute error scale
         float64_t const scale = tolAbs_ + tolRel_ * initialState.normInf();
diff --git a/examples/double_pendulum/double_pendulum.cc b/examples/double_pendulum/double_pendulum.cc
index 67c59db92..6caf924a8 100644
--- a/examples/double_pendulum/double_pendulum.cc
+++ b/examples/double_pendulum/double_pendulum.cc
@@ -151,7 +151,7 @@ int main(int argc, char_t * argv[])
     timer.toc();
     std::cout << "Write log CSV: " << (timer.dt * 1.0e3) << "ms" << std::endl;
     timer.tic();
-    engine->writeLog((outputDirPath / "log.h5").string(), "hdf5");
+    engine->writeLog((outputDirPath / "log.hdf5").string(), "hdf5");
     timer.toc();
     std::cout << "Write log HDF5: " << (timer.dt * 1.0e3) << "ms" << std::endl;
diff --git a/python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py b/python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py
index 0c05aa892..0be51e442 100644
--- a/python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py
+++ b/python/gym_jiminy/common/gym_jiminy/common/envs/env_generic.py
@@ -391,12 +391,16 @@ def _refresh_action_space(self) -> None:
             effort_limit[motor.joint_velocity_idx] = \
                 MOTOR_EFFORT_MAX

-        # Set the action space
+        # Set the action space.
+        # Note that float32 is used instead of float64 because otherwise it
+        # would require the neural network to perform float64 computations,
+        # or to cast its output, for no real advantage: the action is
+        # forwarded directly to the motors without any intermediate
+        # computation.
         motors_velocity_idx = self.robot.motors_velocity_idx
         self.action_space = gym.spaces.Box(
-            low=-effort_limit[motors_velocity_idx],
-            high=effort_limit[motors_velocity_idx],
-            dtype=np.float64)
+            low=-effort_limit[motors_velocity_idx].astype(np.float32),
+            high=effort_limit[motors_velocity_idx].astype(np.float32),
+            dtype=np.float32)

     def reset(self,
               controller_hook: Optional[Callable[[], Optional[Tuple[
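The LieGroup.h changes above all follow one pattern: value-returning operations such as `sum` and `difference`, which had to allocate a fresh `State` or `StateDerivative` for their result, gain out-parameter variants that write into a caller-owned buffer. Below is a minimal self-contained sketch of the idea using plain Eigen vectors as stand-ins; the `State`, `sum` and `difference` names mirror the patch, but the types and dimensions are simplified assumptions (in Jiminy the configuration lives on a Lie group with `dim(q) != dim(v)`, and the additions and subtractions are delegated to `pinocchio::integrate` and `pinocchio::difference`).

```cpp
#include <Eigen/Dense>

using vectorN_t = Eigen::VectorXd;

// Simplified stand-ins for Jiminy's State / StateDerivative containers.
struct StateDerivative
{
    vectorN_t v;  // velocity
    vectorN_t a;  // acceleration
};

struct State
{
    vectorN_t q;  // configuration (a Lie group element in the real code)
    vectorN_t v;  // velocity

    // Out-parameter variant: the result is written into a caller-owned
    // buffer, so no temporary State is created. The real implementation
    // calls pinocchio::integrate(model, q, velocity.v, out.q) here.
    State & sum(StateDerivative const & velocity, State & out) const
    {
        out.q = q + velocity.v;
        out.v = v + velocity.a;
        return out;
    }

    // Same pattern for the inverse operation: the difference between two
    // states is written directly into a pre-allocated derivative, via
    // pinocchio::difference(model, q, position.q, out.v) in the real code.
    StateDerivative & difference(State const & position, StateDerivative & out) const
    {
        out.v = position.q - q;
        out.a = v - position.v;
        return out;
    }
};

int main()
{
    int const n = 6;
    State state{vectorN_t::Zero(n), vectorN_t::Zero(n)};
    StateDerivative increment{vectorN_t::Ones(n), vectorN_t::Ones(n)};

    // Buffers owned by the stepper, allocated once and reused every step,
    // exactly like `alternativeSolution_` and the new `errorSolution_`.
    State candidate{vectorN_t(n), vectorN_t(n)};
    StateDerivative error{vectorN_t(n), vectorN_t(n)};

    state.sum(increment, candidate);      // candidate written in place
    candidate.difference(state, error);   // error written in place
    return 0;
}
```

Because both results land in persistent members, the per-step cost is a handful of vector copies rather than container allocations, which is where the speedup in the commit title comes from.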
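The stepper-side changes are the consumers of that API: instead of evaluating `dt * A_(i, j) * ki_[j]` expressions, whose overloaded operators each materialize a temporary container, the increment buffer is zeroed once and every stage is folded in with `sumInPlace(ki_[j], dt * A_(i, j))`. A rough sketch of why that matters, with a hand-rolled container standing in for Jiminy's `StateDerivativeVector` (names, stage count and dimensions are illustrative only):

```cpp
#include <Eigen/Dense>

#include <cstddef>
#include <vector>

using vectorN_t = Eigen::VectorXd;

// Stand-in for Jiminy's vector-of-derivatives container. Unlike a single
// Eigen vector, operations on such a container cannot be lazily fused, so
// an expression like `dt * b[i] * ki[i]` would allocate a whole container.
struct DerivativeVector
{
    std::vector<vectorN_t> elements;

    void setZero()
    {
        for (vectorN_t & element : elements)
        {
            element.setZero();
        }
    }

    // In-place scaled accumulation: `*this += scale * other`, element-wise.
    // Each element update is a pure Eigen expression, hence allocation-free.
    DerivativeVector & sumInPlace(DerivativeVector const & other, double scale)
    {
        for (std::size_t i = 0; i < elements.size(); ++i)
        {
            elements[i] += scale * other.elements[i];
        }
        return *this;
    }
};

int main()
{
    double const dt = 1.0e-3;
    std::vector<double> const b(7, 1.0 / 7.0);  // dummy quadrature weights

    // Seven RK stages for two robots of dummy dimensions 12 and 6.
    std::vector<DerivativeVector> ki(
        7, DerivativeVector{{vectorN_t::Random(12), vectorN_t::Random(6)}});

    // Persistent buffer: zeroed and refilled each step, never reallocated.
    DerivativeVector stateIncrement = ki[0];
    stateIncrement.setZero();
    for (std::size_t i = 0; i < ki.size(); ++i)
    {
        stateIncrement.sumInPlace(ki[i], dt * b[i]);
    }
    return 0;
}
```

The same zero-then-accumulate structure appears three times in the patch: the intermediate stage sums, the final candidate solution, and the DOPRI error estimate.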